## Toolkit
library(gtools) # combinations() and permutations() below are assumed to come from the gtools package
###########################################################################
# Part 1 Basic Operations
###########################################################################
# descriptive statistics
summary(houses)
# combinations
# order doesn't matter, nCr=n!/[(n-r)!r!]
# Suppose 4 essays are randomly chosen to appear on the class bulletin board. How many
# different groups of 4 are possible?
comb <- combinations(30, 4) # no. combs for selecting 4 from 30
nrow(comb)
rm(comb)
# permutations
# order matters, nPr=n!/[(n-r)!]
# Suppose 4 essays are randomly chosen for awards of $10, $7, $5, and $3. How many
# different groups of 4 are possible?
perm <- permutations(30, 4) # no. perms for selecting 4 from 30 (order matters)
nrow(perm)
rm(perm)
# or: permutations = combinations * 4! (the 4 chosen essays can be ordered in 4! ways)
comb <- nrow(combinations(30, 4))
perm <- comb * factorial(4)
print(perm)
rm(comb, perm)
##########################################################################
# Part 2 Bayes Theorem
# https://districtdatalabs.silvrback.com/conditional-probability-with-r
##########################################################################
# P( A | B ) = P( AB ) / P( B )
# P( B | A ) = P( AB ) / P( A )
# P( A | B ) = (P( B | A ) * P( A )) / P( B )
# P( B ) = (P( B | A ) * P( A )) + (P( B | ~A ) * P( ~A ))
p_neg_nocancer=0.99 # P(~B|~A) P(negative|no cancer): Outcome 1 (True Negative): probability of a negative result given the patient does not have cancer
print(p_neg_nocancer) #0.99
p_pos_nocancer=0.01 # P(B|~A) P(positive|no cancer): Outcome 2 (Type I error): probability of a positive result given the patient does not have cancer
print(p_pos_nocancer) #0.01
p_neg_cancer=0.05 # P(~B|A) P(negative|cancer): Outcome 3 (Type II error): probability of a negative result given the patient has cancer
print(p_neg_cancer) #0.05
p_pos_cancer=0.95 # P(B|A) P(positive|cancer): Outcome 4 (True Positive): probability of a positive result given the patient has cancer
print(p_pos_cancer) #0.95
p_cancer=0.005 # P(A) P(cancer): prior probability that the patient has cancer (consistent with the printed results below)
p_nocancer=0.995 # P(~A) P(no cancer): prior probability that the patient does not have cancer
# P( B ) = (P( B | A ) * P( A )) + (P( B | ~A ) * P( ~A ))
p_pos = (p_cancer * p_pos_cancer) + (p_nocancer * p_pos_nocancer) # P(B) P(positive)
print(p_pos) #0.0147
# P( A | B ) = (P( B | A ) * P( A )) / P( B )
p_cancer_pos = (p_pos_cancer * p_cancer) / p_pos # P(A|B) P(cancer|positive)
print(p_cancer_pos) #0.3231293
##########################################################################
# Part 3 Sampling and Probability
##########################################################################
# ------------------------------------------------------------------------
# Basic Sampling / Probability
# ------------------------------------------------------------------------
# select a systematic sample, starting with the seventh obs and pick
# every 10th obs thereafter (i.e. 7, 17, 27,..)
PRICE <- houses$PRICE
PRICE[seq(from=7, to=117, by=10)]
rm(PRICE)
# birthday problem: simulated probability that at least two of 22 people share a birthday
set.seed(1235)
mean(replicate(100,any(duplicated(sample(1:365, 22, replace=TRUE)))))
# probability of matched draws (with replacement)
# four different witnesses pick a man from a line-up of five men
# find the probability that all four witnesses pick the same person
possible_selections = 5 * 5 * 5 * 5 # 625
same_person_outcomes = 5 # a,a,a,a b,b,b,b c,c,c,c etc.
same_person_outcomes / possible_selections
rm(possible_selections, same_person_outcomes)
# or: the first witness can pick anyone; each of the remaining three must match
(1/5)^3
# ------------------------------------------------------------------------
# Custom Distribution Problems
# ------------------------------------------------------------------------
# ------------------------------------------------------------------------
# Uniform Distribution Problems
# ------------------------------------------------------------------------
# ------------------------------------------------------------------------
# Normal Distribution Problems
# ------------------------------------------------------------------------
# 68-95-99.7 rule
# 68.27% of data fall within 1 standard deviation of the mean.
sd1 <- 0.6827
print(1-sd1)
# 95.45% of data fall within 2 standard deviations of the mean.
sd2 <- 0.9545
print(1-sd2)
# 99.73% of data fall within 3 standard deviations of the mean.
sd3 <- 0.9973
print(1-sd3)
rm(sd1, sd2, sd3)
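# the same proportions can be checked directly with pnorm, e.g. for 1 and 2 sds:
pnorm(1) - pnorm(-1) # about 0.6827
pnorm(2) - pnorm(-2) # about 0.9545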
# Chebyshev's rule
# Chebyshev's inequality guarantees that at least 1 - 1/k^2 of values lie within
# k standard deviations of the mean; for k = 2 that is at least 75%
mean <- 67.1
sd <- 3.5
mean + 2*sd
rm(mean, sd)
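# a minimal sketch of the Chebyshev bound for general k (same mean 67.1 and sd 3.5 as above)
k <- 2
1 - 1/k^2                      # at least 75% of values lie within k = 2 sds
c(67.1 - k*3.5, 67.1 + k*3.5)  # the corresponding interval
rm(k)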
# ------------------------------------------------------------------------
# Binomial Distribution Problems
# ------------------------------------------------------------------------
# suppose that 60% of marbles in a bag are black and 40% are white
# find the probability distribution of the number of black marbles in 20 draws
trials <- c(0:20)
probabilities <- dbinom(trials, size=20, prob=0.6)
successes <- trials[5:20] # keep counts 4 through 19 (the extreme tails are omitted)
binomial_probabilities <- probabilities[5:20]
successes <- factor(successes)
barplot(binomial_probabilities, names.arg=successes, xlab='successes',
ylab='binomial probabilities')
# suppose a gambler goes to the race track to bet on four races, there are
# six horses in each race, what is the probability of winning xxx races?
dbinom(x=0, size=4, prob=1/6) # prob gambler loses all four
dbinom(x=1, size=4, prob=1/6) # prob gambler wins one
1 - dbinom(x=0, size=4, prob=1/6) # prob gambler wins at least one
# a series of cups of tea are prepared with m having cream added prior to
# the tea bag and m of them with the cream added after the tea bag i.e.
# follows a binomial distribution with prob equal to 0.5 and 2m trials
# what is the probability that a woman is able to guess at least/at most
# xxx amount of cups with cream?
pbinom(q=0, size=1, prob=0.5) # prob woman correct at most 0 out of 1 times
pbinom(q=3, size=4, prob=0.5) # prob woman correct at most 3 out of 4 times
pbinom(q=1, size=10, prob=0.5, lower.tail=FALSE) # prob woman correct more than 1 out of 10 times
pbinom(q=5, size=10, prob=0.5, lower.tail=FALSE) # prob woman correct more than 5 out of 10 times
# ------------------------------------------------------------------------
# Poisson Distribution Problems
# ------------------------------------------------------------------------
# emergency room has 4.6 serious accidents to handle on average each night
# find the prob of each possible number of accidents occurring over any given night
for (x in 0:20)
cat('\n x:', x, 'prob:', dpois(x, lambda=4.6))
x <- 0:20
prob_x <- dpois(x, lambda=4.6)
plot(x, prob_x, las=1, type='h')
rm(x)
# there are twelve cars crossing a bridge per minute on average, what is
# the probability of at least/at most xxx cars crossing the bridge for a
# given minute?
ppois(5, lambda=12) # prob 5 or less cars
ppois(10, lambda=12) # prob 10 or less cars
ppois(4, lambda=12, lower=FALSE) # prob 5 or more cars
ppois(9, lambda=12, lower=FALSE) # prob 10 or more cars
##########################################################################
# Part 4 Hypothesis Testing
##########################################################################
# ------------------------------------------------------------------------
# Notes
# ------------------------------------------------------------------------
# Power = 1 - Beta
# Alpha = 1 - specificity
# Note: alpha and beta are not complements; in general alpha + beta != 1
#                    | reject H0        | fail to reject H0 |
# -------------------|------------------|-------------------|
# H0 valid/true      | Type I error     | Correct           |
#                    | False Positive   | True Negative     |
#                    | prob = alpha     | prob = 1 - alpha  |
# -------------------|------------------|-------------------|
# H0 invalid/false   | Correct          | Type II error     |
#                    | True Positive    | False Negative    |
#                    | prob = 1 - beta  | prob = beta       |
# generally,
# h0: x = x0
# h1: x < x0 for a lower one-tailed test
# x > x0 for an upper one-tailed test
# x <> x0 for a two-tailed test
# where x0 is a hypothesized value of the true value x.
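# a small illustrative sketch of the alpha/beta/power relationship for an upper
# one-tailed z test (assumed values: alpha = 0.05, true shift of 1 standard error)
alpha <- 0.05
shift <- 1
beta <- pnorm(qnorm(1 - alpha) - shift) # P(fail to reject H0 | H0 false)
power <- 1 - beta
c(alpha=alpha, beta=beta, power=power)
rm(alpha, shift, beta, power)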
# ------------------------------------------------------------------------
# Basic test statistics
# ------------------------------------------------------------------------
# derive z-score
x0 <- 115.8 # hypothesized/population value/mean
x <- 138 # sample value/mean
sd <- 13.5 # sample standard deviation
n <- 100 # sample size
z <- (x-x0) / sd
print(z)
rm(x0, x, sd, n, z)
# or
# z <- qnorm(area)
z <- qnorm(0.05) # z for 0.05 of left tail
z <- qnorm(1 - 0.05 / 2) # z for 0.95 confidence
z <- qnorm(1 - 0.01 / 2) # z for 0.99 confidence
rm(z)
# derive t-score
x0 <- 115.8 # hypothesized/population value/mean
x <- 138 # sample value/mean
sd <- 13.5 # sample standard deviation
n <- 100 # sample size
se <- sd/sqrt(n) # sample std error
t <- (x-x0) / se
print(t)
rm(x0, x, sd, n, se, t)
# ------------------------------------------------------------------------
# Confidence Intervals
# ------------------------------------------------------------------------
# margin of error for a sample proportion (illustrative counts: x successes out of n trials,
# using the same x = 49, n = 85 as the proportion example later in this file)
x <- 49; n <- 85
p <- x/n
q <- 1-p
sd <- sqrt((q*p)/n) # standard error of the sample proportion
moe <- qnorm(0.95) * sd # margin of error for 0.90 confidence (0.05 in the right tail)
# margin of error for a sample mean with unknown population variance (t-based)
sd <- 13.5; n <- 100 # assumed sample sd and size (same values as the z/t examples above)
se <- sd/sqrt(n)
moe <- qt(0.975, df = n-1) * se # margin of error for 0.95 confidence (0.025 in the right tail)
# chi-square confidence interval for a population standard deviation
X2.left <- qchisq(0.05, df = n-1) # chi for 0.05 left tail
X2.right <- qchisq(0.95, df = n-1) # chi for 0.05 right tail
conf.lower <- sqrt(((n - 1) * sd**2)/X2.right)
conf.upper <- sqrt(((n - 1) * sd**2)/X2.left)
# confidence interval for a difference of two proportions
# (counts taken from Example 4 later in this file: x1=15, n1=50, x2=23, n2=60; 0.90 confidence)
n1 <- 50; x1 <- 15; p1 <- x1/n1
n2 <- 60; x2 <- 23; p2 <- x2/n2
alpha <- 0.10
z <- qnorm(1 - alpha/2)
se <- sqrt((p1*(1-p1)/n1)+(p2*(1-p2)/n2))
moe <- z*se
conf.lower <- (p1-p2) - moe
conf.upper <- (p1-p2) + moe
print(c(conf.lower, conf.upper))
rm(n1, x1, p1, n2, x2, p2, alpha, z, se, moe, conf.lower, conf.upper)
# ------------------------------------------------------------------------
# Minimum Sample Size
# ------------------------------------------------------------------------
#z <- (pbar-p0)/sqrt(p0*(1-p0)/n)
#n <- (z / (pbar-p0))**2 * p0*(1-p0) # z <- qnorm(1 - 0.05 / 2)
#n <- (z / moe)**2 * p0*(1-p0)
#n <- (z**2 * sd**2) / moe**2
# minimum sample size for estimating a mean (illustrative: sd assumed 10, desired moe 2, 0.95 confidence)
sd <- 10; moe <- 2
z <- qnorm(1 - 0.05/2)
n <- (z**2 * sd**2) / moe**2
print(ceiling(n))
rm(sd, moe, z, n)
# minimum sample size for estimating a proportion (illustrative: p0 = 0.5 worst case, desired moe 0.03)
p0 <- 0.5; moe <- 0.03
z <- qnorm(1 - 0.05/2)
n <- (z / moe)**2 * p0*(1-p0)
print(ceiling(n))
rm(p0, moe, z, n)
# ------------------------------------------------------------------------
# Test of Population Mean with Unknown Variance
# http://www.r-tutor.com/elementary-statistics/hypothesis-testing
# ------------------------------------------------------------------------
# Test statistic t in terms of the sample mean, sample size and sample
# standard deviation:
# t=(xbar-mu0)/(s/sqrt(n))
# The null hypothesis of the lower tail test of the population mean can be
# expressed as mu >= mu_0, where mu_0 is a hypothesized value of the true
# population mean mu.
# manufacturer claims that the mean lifetime of a light bulb is more than
# 10,000 hours
# in a sample of 30 light bulbs, it was found that they only last 9,900
# hours on average with a standard deviation of 125 hours
# can we reject the claim by the manufacturer?
# null hypothesis is that mu >= 10000
# alternative hypothesis is that mu < 10000
xbar <- 9900 # sample mean
mu0 <- 10000 # hypothesized value
s <- 125 # sample standard deviation
n <- 30 # sample size
t <- (xbar-mu0)/(s/sqrt(n))
t # test statistic
# test statistic -4.38178 is less than the critical value -1.699127 and
# hence, at 0.05 significance level, we reject the null hypothesis
# that the mean lifetime of a light bulb is at least 10,000 hours
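# the critical value and p-value referenced above can be computed directly:
alpha <- 0.05
t.alpha <- qt(alpha, df=n-1) # critical value, about -1.699
pt(t, df=n-1)                # lower-tail p-value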
#or
# The null hypothesis of the upper tail test of the population mean can be
# expressed as mu <= mu_0, where mu_0 is a hypothesized value of the true
# population mean mu.
#or
# The null hypothesis of the two-tailed test of the population mean can be
# expressed as mu=mu_0, where mu_0 is a hypothesized value of the true
# population mean mu.
# test statistic -30 lies outside of the critical values -1.9600 and
# 1.9600 hence, at 0.05 significance level, we reject the null hypothesis
# that the sample mean is equal to 56
#or
# ------------------------------------------------------------------------
# Test of Population Proportion
# http://www.r-tutor.com/elementary-statistics/hypothesis-testing
# ------------------------------------------------------------------------
# Test statistic z in terms of the sample proportion and the sample size:
# z=(pbar-p0)/sqrt(p0*(1-p0)/n)
# The null hypothesis of the lower tailed test about population proportion
# can be expressed as p >= p_0, where p_0 is a hypothesized value of the
# true population proportion p.
# test statistic -0.6376 is not less than the critical value of -1.6449
# hence, at 0.05 significance level, we do not reject the null hypothesis
# that the proportion of voters in the population is above 60% this year
#or
prop.test(85, 148, p=0.6, alt='less', correct=FALSE, conf.level=0.95) # p-value 0.2619 > 0.05, do not reject null hypothesis
rm(pbar, p0, n, z, alpha, z.alpha, cv)
# The null hypothesis of the upper tailed test about population proportion
# can be expressed as p <= p_0, where p_0 is a hypothesized value of the
# true population proportion p.
# can we reject the null hypothesis that the proportion of ceos aged 45
# years or older is less than 50%?
# null hypothesis is that p <= 50%
# alternative hypothesis is that p > 50%
age <- salaries$AGE >= 45
pbar <- sum(age)/length(age) # sample proportion
p0 <- 0.5 # hypothesized value
n <- length(age) # sample size
z <- (pbar-p0)/sqrt(p0*(1-p0)/n) # z <- (stu-np)/sd
z # test statistic
p <- pnorm(z, lower.tail=FALSE) # upper-tail p-value P(Z > z): probability of a sample proportion of pbar or greater under H0
p # p-value
#or
prop.test(sum(age), length(age), p=0.5, alt='greater', correct=FALSE, conf.level=0.95) # p-value 4.653e-07 < 0.05, reject null hypothesis
rm(pbar, p0, n, z, alpha, z.alpha, cv)
# a coin is flipped 100 times at the 0.95 confidence level, test the null
# hypothesis the coin is unbiased versus the alternative that it is biased
# if 43 heads are obtained
# null hypothesis is that p=50%
# alternative hypothesis is that p <> 50%
pbar <- 43/100 # sample proportion
p0 <- 0.5 # hypothesized value
n <- 100 # sample size
z <- (pbar-p0)/sqrt(p0*(1-p0)/n)
z # test statistic
p <- 2 * pnorm(abs(z), lower.tail=FALSE) # two-tailed p-value
p # p-value 0.1615
# one-sided alternatives: pnorm(z) is the lower-tail p-value P(Z <= z);
# pnorm(z, lower.tail=FALSE) is the upper-tail p-value P(Z >= z)
#or
prop.test(43, 100, p=0.5, alt='two.sided', correct=FALSE, conf.level=0.95) # p-value 0.1615 > 0.05, do not reject null hypothesis
# ------------------------------------------------------------------------
# Comparison of Population Mean Between Two Independent Samples
# http://www.r-tutor.com/elementary-statistics/inference-about-two-populations
# ------------------------------------------------------------------------
# pooled two-proportion z statistic (x1, x2 are success counts; n1, n2 are sample sizes)
z.prop=function(x1,x2,n1,n2){
numerator=(x1/n1) - (x2/n2)
p.common=(x1+x2) / (n1+n2)
denominator=sqrt(p.common * (1-p.common) * (1/n1 + 1/n2))
z.prop.ris=numerator / denominator
return(z.prop.ris)
}
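# x1, x2, n1, n2 are not defined in this fragment; illustrative counts
# (the new drug / control numbers used in Example 2 further below):
x1 <- 85; n1 <- 100
x2 <- 65; n2 <- 100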
z <- z.prop(x1,x2,n1,n2)
print(z) # test statistic
# two-sample t statistic with a pooled standard deviation s (helper name assumed)
t.mean=function(x1,x2,n1,n2,s){
  numerator=(x1-x2)
  denominator=s*sqrt((1/n1)+(1/n2))
  t=numerator / denominator
  return(t)
}
rm(x1, x2, n1, n2, s1, s2, df, t, alpha, t.alpha, cv)
# compare mean salaries between rural and non-rural areas at the 0.95
# confidence level, is there a significant difference between the two?
RURAL_SALARY <- subset(nsalary, subset=(RURAL == 'YES'))$NSAL
NON_RURAL_SALARY <- subset(nsalary, subset=(RURAL == 'NO'))$NSAL
t.test(RURAL_SALARY, NON_RURAL_SALARY, alternative='two.sided', conf.level=0.95) # p-value 8.504e-06 < 0.05, reject null hypothesis
# ------------------------------------------------------------------------
# Comparison of Population Proportions
# http://www.r-tutor.com/elementary-statistics/inference-about-two-populations
# ------------------------------------------------------------------------
# Example 1:
# find the 0.95 confidence interval estimate of the difference between the
# female proportion of Aboriginal students and the female proportion of
# Non-Aboriginal students, each within their own ethnic group
x <- table(quine$Eth, quine$Sex)
print(x)
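# a sketch (assumes the quine data from the MASS package is loaded): prop.test on
# the 2x2 table gives the 0.95 confidence interval for the difference in the
# female proportion between the two ethnic groups
prop.test(x, correct=FALSE, conf.level=0.95)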
# Example 2:
# when 100 volunteers in each group had been treated and evaluated, the
# results revealed an 85% success rate for the new drug and a 65% success
# rate for the control group at the 0.95 confidence level, is there a
# significant difference between the two?
x <- matrix(c(85,65,15,35), nrow=2, ncol=2, byrow=FALSE,
dimnames=list(c('new_drug', 'control'), c('success', 'fail')))
print(x)
prop.test(x, correct=FALSE, conf.level=0.95) # p-value 0.0009589 < 0.05, reject null hypothesis
# Example 3:
# for 267 bats, one player hit 85 home runs, for 248 bats, the other
# player hit 89 home runs assume the number of home runs follows a
# binomial distribution at the 0.95 confidence level, is there a
# significant difference between the two?
x <- matrix(c(85,89,(267-85),(248-89)), nrow=2, ncol=2, byrow=FALSE,
dimnames=list(c('Player A', 'Player B'), c('HR', 'Other')))
print(x)
prop.test(x, correct=FALSE, conf.level=0.95) # p-value 0.001091 < 0.05, reject null hypothesis
# Example 4:
# find the 0.90 confidence interval for the difference between population
# proportions, p1 - p2. x1=15, n1=50, x2=23, n2=60.
prop.test(x=c(15,23), n=c(50,60), correct=FALSE, conf.level=0.90) # p-value 0.36 > 0.10, do not reject null hypothesis
# ------------------------------------------------------------------------
# Test of Variance
# http://www.itl.nist.gov/div898/handbook/eda/section3/eda359.htm
# ------------------------------------------------------------------------
# The null hypothesis of the upper tailed F test for two variances can be
# expressed as sigma1^2 <= sigma2^2; the test statistic is the ratio of the
# two sample variances.
# The null hypothesis of the upper tail test is to be rejected if
# f >= f_alpha, where f_alpha is the critical value of the F distribution.
f <- s1 / s2 # ratio of the sample variances s1 and s2
f # test statistic
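# a minimal sketch with var.test, assuming the beef and meat hot dog data used elsewhere in this file
var.test(beef$Calories, meat$Calories, alternative='greater', conf.level=0.95)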
# ------------------------------------------------------------------------
# Test of occurrence
# http://www.itl.nist.gov/div898/handbook/eda/section3/eda358.htm
# ------------------------------------------------------------------------
# Example 1:
# test the claim that the characteristics occur with the same frequency
# characteristic A B C D E F
# frequency 28 30 45 48 38 39
freq <- c(28, 30, 45, 48, 38, 39)
x <- mean(freq) # expected frequency under the null of equal frequencies
freq_err <- freq - x
chi <- sum(freq_err^2 / x)
chi # test statistic
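# equivalently, the built-in goodness-of-fit test with equal expected frequencies:
chisq.test(freq)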
# Example 2:
# use a chi-square test to find a 0.95 confidence interval for the variance
# in the amount of calories
var.conf.int=function(x, conf.level=0.95) {
df <- length(x) - 1
chilower <- qchisq((1 - conf.level)/2, df, lower.tail=TRUE)
chiupper <- qchisq((1 - conf.level)/2, df, lower.tail=FALSE)
v <- var(x)
c(df * v/chiupper, df * v/chilower)
}
var.conf.int(beef$Calories)
# Example 3:
# use a chi-square test to see if the variance in sodium for each hot dog
# type is different from 6000 with 0.95 confidence.
# reuse var.conf.int as defined in Example 2 above
(6000 < var.conf.int(beef$Sodium)[1]) || (6000 > var.conf.int(beef$Sodium)[2]) # true, reject the null hypothesis
##########################################################################
# Part 5 ANOVA / Regression
##########################################################################
# ------------------------------------------------------------------------
# Notes
# ------------------------------------------------------------------------
# ------------------------------------------------------------------------
# Various questions on ANOVA / Regression
# ------------------------------------------------------------------------
# compare the mean rate for the five levels of TYPE using ANOVA
# at the 0.95 confidence level, are these rates equal?
RATE_anova <- aov(RATE ~ TYPE - 1, data=tableware) # -1 fits a separate mean for each TYPE level
RATE_lm <- lm(RATE ~ TYPE - 1, data=tableware)
summary(RATE_anova)
summary(RATE_lm)
# compare the mean price for five data type levels using regression
# at the 0.95 confidence level, are these prices equal?
my_price_model <- {PRICE ~ TYPE}
my_price_model_fit <- lm(my_price_model, data=tableware)
print(confint(my_price_model_fit, level=0.95))
TukeyHSD(calories.anova, conf.level=0.95) # pairwise comparisons (calories.anova: an aov fit, presumably on the hot dog calorie data used elsewhere in this file)
#or
cor.test(schools[,1],schools[,2])
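# critical.r is not a base R function; if it has not been defined elsewhere, a
# common reconstruction (two-sided, alpha assumed 0.05) is:
critical.r <- function(n, alpha=0.05) {
  df <- n - 2
  t.crit <- qt(1 - alpha/2, df)
  sqrt(t.crit^2 / (t.crit^2 + df))
}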
critical.r(28) # if observed r is less than crit.r, it is not significantly different from random
fit <- lm(mpg ~ hp, data=mtcars) # model inferred from the output below (mpg on hp, 30 residual df)
anova(fit)
#Response: mpg
# Df Sum Sq Mean Sq F value Pr(>F)
#hp 1 678.37 678.37 45.46 1.788e-07 ***
#Residuals 30 447.67 14.92
# - Mean Squared Error (MSE) is the mean of the square of the residuals.
mse <- mean(fit$residuals^2)
mse
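# the sums of squares used below can be computed from the same fit
sst <- sum((mtcars$mpg - mean(mtcars$mpg))^2) # total sum of squares
sse <- sum(fit$residuals^2)                   # error (residual) sum of squares
ssr <- sst - sse                              # regression sum of squares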
# - R^2
r2 <- ssr / sst
r2
#or
r2 <- 1 - sse / sst
r2
# - Adjusted R^2
# R_adj^2 = 1 - (SSE/(n-k-1)) / (SST/(n-1))
adj_r2 <- 1 - (sse / (nrow(mtcars)-1-1)) / (sst / (nrow(mtcars)-1))
adj_r2
# - F-test
# F_0 = (SSR/k) / (SSE/(n-p))
f_0 <- (ssr/1) / (sse/(nrow(mtcars)-2)) # F-test
f_0
##########################################################################
# Part 6 Visualization
##########################################################################
# box-and-whisker plot
boxplot(SRS, SS)
#or
# histogram
hist(houses$PRICE)
#negative skew: left: mean is less than the mode/median.
#positive skew: right skew: mean is greater than the mode/median.
#or
# qq plot
qqnorm(salaries$AGE, datax=TRUE)
qqline(salaries$AGE, datax=TRUE, distribution=qnorm, probs=c(0.25,0.75), qtype=7)
# scatterplot
plot(houses$PRICE, houses$TAX)
#or
# dual plots
par(mfrow=c(1,2))
hist(houses$TAX)
hist(houses$PRICE)
par(mfrow=c(1,1))
# -----------------------------------------------------------------------------
# Bayes' theorem: P(A|B) = P(B|A)*P(A) / (P(B|A)*P(A) + P(B|~A)*P(~A))
p <- (0.86*0.14)/((0.86*0.14)+(0.65*0.35))
# -----------------------------------------------------------------------------
# mean, variance and standard deviation of a discrete probability distribution
x <- c(0, 1, 2, 3, 4)
p <- c(0.37, 0.13, 0.06, 0.15, 0.29)
mu <- sum(x*p)
var <- sum(((x-mu)**2)*(p))
sd <- sqrt(var)
round(sd, 2)
# -----------------------------------------------------------------------------
# mean and standard deviation of a continuous uniform distribution on [min, max]
min <- -3
max <- 14
mu <- (min + max) / 2
sd <- (max-min)/sqrt(12)
# -----------------------------------------------------------------------------
# confidence interval for a proportion (x successes in n trials, 0.98 confidence)
n <- 85
x <- 49
alpha <- 0.02
p <- x/n
q <- 1-p
z <- qnorm(1-alpha/2)
oi <- sqrt((q*p)/n)
p - (z*oi)
p + (z*oi)
# -----------------------------------------------------------------------------
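# t test for the significance of a correlation coefficient r from n pairs;
# r, n and alpha are not defined in this fragment, so illustrative values are
# used (n chosen so that df = n - 2 = 45 matches the qt call below)
r <- 0.3
n <- 47
alpha <- 0.05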
t <- (r)*(sqrt((n-2)/(1-(r**2)))); t
qt <- qt(1-alpha/2, n-2); qt
qt(1-0.05/2, 45)
# -----------------------------------------------------------------------------
x.n <- 16
x.mu <- 73
x.sd <- 10.9
y.n <- 12
y.mu <- 68.4
y.sd <- 8.2
t <- (x.mu - y.mu) / sqrt(x.sd**2/x.n + y.sd**2/y.n); t # Welch two-sample t statistic from the summary stats
##########################################################################
# Part 7 Notes
##########################################################################
# ------------------------------------------------------------------------
# Basics
# ------------------------------------------------------------------------
#https://en.wikipedia.org/wiki/Coefficient_of_variation
#The coefficient of variation (CV), also known as relative standard deviation
#(RSD), is a standardized measure of dispersion of a probability distribution or
#frequency distribution.
#The coefficient of variation is useful because the standard deviation of data
#must always be understood in the context of the mean of the data.
#When the mean value is close to zero, the coefficient of variation will approach
#infinity and is therefore sensitive to small changes in the mean.
#https://en.wikipedia.org/wiki/Truncated_mean
#A truncated mean or trimmed mean is a statistical measure of central tendency,
#much like the mean and median. It involves the calculation of the mean after
#discarding given parts of a probability distribution or sample at the high and
#low end, and typically discarding an equal amount of both.
#The truncated mean is a useful estimator because it is less sensitive to outliers
#than the mean but will still give a reasonable estimate of central tendency or
#mean.
#unless the underlying distribution is symmetric, the truncated mean of a sample
#is unlikely to produce an unbiased estimator for either the mean or the median.
#https://en.wikipedia.org/wiki/Normal_distribution
#skewness=0, excess kurtosis=0 (i.e. kurtosis=3)
#kurtosis values produced by R should be close to 3 for a normal distribution
#If the skewness is greater than 1.0 (or less than -1.0), the skewness is
#substantial and the distribution is far from symmetrical.
#negative skew: left: mean is less than the mode/median.
#positive skew: right skew: mean is greater than the mode/median.
#https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot
#a graphical method for comparing two probability distributions by plotting their
#quantiles against each other.
#If the two distributions being compared are similar, the points in the Q-Q plot
#will approximately lie on the line y = x.
#If the distributions are linearly related, the points in the Q-Q plot will
#approximately lie on a line, but not necessarily on the line y = x.
# By default qqline draws a line through the first and third quartiles
#https://en.wikipedia.org/wiki/Box_plot
#Black, 6th ed, pg 79: A box is drawn around the median with the lower and upper
#quartiles (Q1 and Q3) as the box endpoints. These box endpoints (Q1 and Q3) are
#referred to as the hinges of the box. The value of the interquartile range (IQR)
#is computed by Q3 - Q1.
#Black, 6th ed, pg 79: Values in the data distribution that are outside the inner
#fences but within the outer fences are referred to as mild outliers. Values that
#are outside the outer fences are called extreme outliers.
# ------------------------------------------------------------------------
# Major Types of Analysis
# ------------------------------------------------------------------------
# 2 Inferential Analysis
# Goal is to test theories or beliefs so as to say something about the
# nature of a population or phenomenon. Analysis is based on random
# samples that represent the population or phenomenon.
# 3 Predictive Analysis
# Goal is to use the data on some objects to predict values for another
# object. Various methods are employed to analyze current and historical
# facts to make predictions about future events.
# 4 Causal/Mechanistic Analysis
# Goal is to determine what happens to an outcome variable or object when
# independent variables are changed. This can entail estimating the exact
# degree of change that results from changing one or more independent
# variables. The data results from a carefully designed and measured study
# or experiment. Randomization may be necessary.
# ------------------------------------------------------------------------
# T-tests Pro and Con
# ------------------------------------------------------------------------
# Pro-
# Con-
# ------------------------------------------------------------------------
# Bootstrapping
# ------------------------------------------------------------------------
# Basics -
# Advantages -
# find 0.95 confidence intervals for the mean amount of calories in each
# type of hot dog i.e. Ha: true mean is not equal to 0
t.test(beef$Calories)$conf.int
t.test(meat$Calories)$conf.int
t.test(poultry$Calories)$conf.int
# construct 99% one-sided lower confidence intervals for the mean amount
# of calories in each type of hot dog i.e. Ha: true mean is less than 0
t.test(beef$Calories, alt='less', conf.level=0.99)$conf.int
t.test(meat$Calories, alt='less', conf.level=0.99)$conf.int
t.test(poultry$Calories, alt='less', conf.level=0.99)$conf.int
# find the lower bound for the 0.95 confidence intervals for the mean
# amount of calories in each type of hot dog
t.test(beef$Calories)$conf.int[1]
t.test(meat$Calories)$conf.int[1]
t.test(poultry$Calories)$conf.int[1]
# determine which type of hotdog has average calories less than 140 with
# 0.95 confidence
as.numeric(t.test(beef$Calories)$conf.int)[1] < 140
as.numeric(t.test(meat$Calories)$conf.int)[1] < 140
as.numeric(t.test(poultry$Calories)$conf.int)[1] < 140
# determine which type of hotdog has average calories not equal to 140
# with 0.95 confidence
with(beef, t.test(Calories, alternative='two.sided', mu=140)) # p-value 0.003534, therefore reject null
with(meat, t.test(Calories, alternative='two.sided', mu=140))
with(poultry, t.test(Calories, alternative='two.sided', mu=140))
# consumers are presented with two beverages in random order and asked
# which they prefer
# the first beverage was preferred 85% of the time
# how large a sample of consumers would be needed for the second beverage
# to generate a 0.95 confidence interval with an overall width just less
# than 2% (i.e. from 84% to 86%)?
p <- 0.85
z_score <- qnorm(0.025, mean=0, sd=1, lower.tail=FALSE)
sample_size <- (z_score**2)*p*(1-p)/(0.01)**2
round(sample_size)
# -----------------------------------------------------------------------------
# `Q1`
# -----------------------------------------------------------------------------
# Suppose that a class of 30 students is assigned to write an essay.
# Suppose 4 essays are randomly chosen to appear on the class bulletin board.
# How many different groups of 4 are possible?
# Suppose 4 essays are randomly chosen for awards of $10, $7, $5, and $3. How
# many different groups of 4 are possible?
# Permuting the 4 chosen essays gives factorial(4) = 24 orderings. Multiply 24 by
# the number of groups from Q1a (here Q1a and Q1b1 are assumed to hold the
# combinations(30, 4) and permutations(4, 4) results from Part 1):
Q1b2 <- nrow(Q1a) * nrow(Q1b1)
Q1b2
# -----------------------------------------------------------------------------
# `Q2`
# -----------------------------------------------------------------------------
# Use Bayes' theorem to find the indicated probability. Use the results
# summarized in the table.
# Create a table:
Q2table <- matrix(c(8, 17, 18, 13, 7, 37), ncol = 2, byrow = T)
colnames(Q2table) <- c("Approve of mayor", "Do not approve of mayor")
rownames(Q2table) <- c("Republican", "Democrat", "Independent")
Q2table
# -----------------------------------------------------------------------------
# `Q3`
# -----------------------------------------------------------------------------
e <- c(0, 1, 2, 3)
p <- c(0.46, 0.41, 0.09, 0.04)
mu <- sum(e*p)
# -----------------------------------------------------------------------------
# `Q4`
# -----------------------------------------------------------------------------
# Assume that the weight loss for the first month of a diet program varies
# between 6 pounds and 12 pounds, and is spread evenly over the range of
# possibilities, so that there is a uniform distribution. Find the probability
# of the given range of pounds lost:
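# the specific range is not given here; as an illustration, for an assumed range
# of 8 to 11 pounds lost:
punif(11, min=6, max=12) - punif(8, min=6, max=12) # (11-8)/(12-6) = 0.5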
# -----------------------------------------------------------------------------
# `Q5`
# -----------------------------------------------------------------------------
# Find the indicated z score. The graph depicts the standard normal distribution
# with mean 0 and standard deviation 1. Shaded area is 0.0901.
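# assuming the shaded area of 0.0901 lies to the left of the indicated z score:
qnorm(0.0901)                   # about -1.34
# if the shaded area is the right tail instead:
qnorm(0.0901, lower.tail=FALSE) # about 1.34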
# -----------------------------------------------------------------------------
# `Q6`
# -----------------------------------------------------------------------------
# Example (requires the pwr package):
library(pwr)
Q6a <- pwr.t.test(d = (0-5)/10,
                  n = 35,
                  sig.level = 0.01,
                  type = "paired",
                  alternative = "two.sided")
Q6a
# -----------------------------------------------------------------------------
# `Q7`
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# `Q8`
# -----------------------------------------------------------------------------
# A cereal company claims that the mean weight of the cereal in its packets is
# 14 oz. Identify the Type I error for the test.
# A Type I error is rejecting the null hypothesis (the claim that the mean weight
# is 14 oz) when it is actually true. Among the answer choices, the only one that
# involves rejecting this claim is (C): Reject the claim that the mean weight is
# 14 oz when it is actually greater than 14 oz.
# -----------------------------------------------------------------------------
# `Q9`
# -----------------------------------------------------------------------------
# Suppose that you perform a hypothesis test regarding a population mean, and
# the evidence does not warrant rejection of the null hypothesis. When
# formulating the conclusion to the test, why is the phrase "fail to reject the
# null hypothesis" more accurate than the phrase "accept the null hypothesis"?
# We use the phrase "fail to reject the null hypothesis" because there is still
# a chance the null hypothesis is false. The size of that chance depends on the
# value of alpha that we set for the test. Our failure to reject the null
# hypothesis holds only for the assumptions and parameters we specify during
# the test, and really means that, based on those, we do not find sufficient
# evidence to reject the null hypothesis. That does not entail that the null
# hypothesis is true, just that we do not have sufficient evidence to reject it.
# -----------------------------------------------------------------------------
# `Q10`
# -----------------------------------------------------------------------------
# Scores on a test are normally distributed with a mean of 68.2 and a standard
# deviation of 10.4. Estimate the probability that among 75 randomly selected
# students, at least 20 of them score greater than 78.
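# a sketch: per-student probability of scoring above 78, then the chance that at
# least 20 of 75 independent students do so
p <- pnorm(78, mean=68.2, sd=10.4, lower.tail=FALSE) # P(score > 78) for one student
pbinom(19, size=75, prob=p, lower.tail=FALSE)        # P(at least 20 of 75 score > 78)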
# -----------------------------------------------------------------------------
# `Q11`
# -----------------------------------------------------------------------------
# According to a recent poll, 53% of Americans would vote for the incumbent
# president. If a random sample of 100 people results in 45% who would vote
# for the incumbent, test the claim that the actual percentage is 53%. Use a
# 0.10 significance level.
# H0: p = 0.53
# H1: p != 0.53
# Two sided test since it's !=
p <- 0.53
phat <- 45/100
z <- ((phat - p)/(sqrt(p*(1-p)/100))); z
# z is about -1.60; the two-sided p-value is 2*pnorm(-1.60), roughly 0.11, which
# is greater than 0.10, so we fail to reject the null hypothesis that p = 0.53
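# the same p-value computed directly
p.value <- 2 * pnorm(z); p.value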
# -----------------------------------------------------------------------------
# `Q12`
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# `Q13`
# -----------------------------------------------------------------------------
# What is the relationship between the linear correlation coefficient and the
# usefulness of the regression equation for making predictions?
# The linear regression equation is appropriate for prediction only when there
# is a significant linear correlation between two variables. The strength of
# the linear relationship (as measured by the linear correlation coefficient)
# indicates the usefulness of the regression equation for making predictions.
# -----------------------------------------------------------------------------
# `Q14`
# -----------------------------------------------------------------------------
# The standard error of estimate, se, is a measure of the distances between the
# observed sample y values and the predicted values yhat. Smaller values of se
# indicate that the actual values of y will be closer to the regression line,
# whereas larger values of se indicate a greater dispersion of the y values
# about the regression line. When the standard error of estimate is 0, the y
# values lie on the regression line.
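# for a fitted simple regression (an lm object assumed to be named fit), the
# standard error of estimate could be computed as
# se <- sqrt(sum(fit$residuals^2) / (length(fit$residuals) - 2))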
# -----------------------------------------------------------------------------
# `Q15`
# -----------------------------------------------------------------------------
# Use the given sample data to test the claim that p1 > p2. Use a significance
# level of 0.01.
# H0: p1 <= p2;  Ha: p1 > p2 (the claim)
# Sample 1
n1 <- 85
x1 <- 38
# Sample 2
n2 <- 90
x2 <- 23
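# the z statistic referenced below is not computed above; a standard pooled-
# proportion sketch:
alpha <- 0.01
p1 <- x1/n1; p2 <- x2/n2
p.pool <- (x1+x2)/(n1+n2)
zScore <- (p1-p2)/sqrt(p.pool*(1-p.pool)*(1/n1+1/n2)); zScore # about 2.66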
# Critical value:
cv <- qnorm(1-alpha)
# Since our zScore is greater than our critical value, we reject the null.
# -----------------------------------------------------------------------------
# `Q16`
# -----------------------------------------------------------------------------
# Two types of flares are tested and their burning times (in minutes) are
# recorded. The summary statistics are given below.
# H0: u1 = u2
# Ha: u1 != u2
# Brand X
x.n <- 35
x.mu <- 19.4
x.sd <- 1.4
# Brand Y
y.n <- 40
y.mu <- 15.1
y.sd <- 0.8
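# a Welch two-sample t sketch from the summary statistics above
t <- (x.mu - y.mu)/sqrt(x.sd^2/x.n + y.sd^2/y.n); t # test statistic
2 * pt(-abs(t), df=min(x.n, y.n) - 1)               # conservative two-sided p-value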
# -----------------------------------------------------------------------------
# `Q17`
# -----------------------------------------------------------------------------
n1 <- 50
x1 <- 15
p1 <- x1/n1
n2 <- 60
x2 <- 23
p2 <- x2/n2
se <- sqrt((p1*(1-p1)/n1)+(p2*(1-p2)/n2))
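# z is not defined in this fragment; assuming the 0.90 confidence level of Example 4 earlier
z <- qnorm(1 - 0.10/2)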
moe <- z*se
ciLower <- (p1-p2) - moe
ciUpper <- (p1-p2) + moe
print(c(ciLower, ciUpper))
# -----------------------------------------------------------------------------
# `Q18`
# -----------------------------------------------------------------------------
# Use the given data to find the equation of the regression line. Round the
# final values to three significant digits, if necessary.
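# the data itself is not reproduced here; with predictor x and response y, the
# fitted least-squares line would be obtained with
# fit <- lm(y ~ x); round(coef(fit), 3)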