# Академические документы
# Профессиональные документы
# Культурные документы
# txt
############################################################
###### Math 445 - Correlation and Regression Analysis ######
###### (Dr. Toribio - Fall 2012) ######
############################################################
###################################
### Day 2 (9/7/12): Correlation ###
###################################
# Importing data
# Change the R working directory to the correct folder
data=read.csv("marathon.csv",header=T)
attach(data)
x=Distance
y=Time
plot(x,y)
plot(x,y,xlab="Distance",ylab="Time")
# Corrected sum of cross-products: sum((x - mean(x)) * (y - mean(y))),
# evaluated with the computational shortcut formula. With y = x this
# gives the corrected sum of squares SS_xx.
ss <- function(x, y) {
  n <- length(x)
  sum(x * y) - (sum(x) * sum(y)) / n
}
ss.xx=ss(x,x)
ss.yy=ss(y,y)
ss.xy=ss(x,y)
r=ss.xy/sqrt(ss.xx*ss.yy)
#################################################
### Day 3 (9/12/12): Simple Linear Regression ###
#################################################
# NOTE(review): the scrape dropped the opening line of this function
# (sourced later as corre.R, see the Day 5 section); reconstructed from
# the surviving body, the call corre(circ,height) below, and the printed
# $ss.xx output further down. Written self-contained so it no longer
# depends on the separate ss() helper.
# Computes the corrected sums of squares/cross-products and the Pearson
# correlation coefficient r for two numeric vectors of equal length.
corre <- function(x, y) {
  n <- length(x)
  ss.xx <- sum(x * x) - sum(x)^2 / n
  ss.yy <- sum(y * y) - sum(y)^2 / n
  ss.xy <- sum(x * y) - sum(x) * sum(y) / n
  r <- ss.xy / sqrt(ss.xx * ss.yy)
  list(r = r, ss.xx = ss.xx, ss.yy = ss.yy, ss.xy = ss.xy) # This will be the output of the function.
}
corre(circ,height)
####################################################
### Day 4 (9/12/12): Fitted values and Residuals ###
####################################################
b1.hat=ss.xy/ss.xx; b0.hat=mean(y)-b1.hat*mean(x)
cbind(age,price)
price[age>5] # This will list all prices of cars that are older than 5 years.
#source("E:\\Math445\\lin.reg.R")
lin.reg(age,price)
plot(age,price)
abline(197.1991,-20.376) # Drawing a line with y-intercept=197.1991 and slope=-20.376.
##################################################
### Day 5 (9/14/12): Inference about the slope ###
##################################################
data=read.csv("production.csv",header=T)
source("E:\\Math445\\lin.reg.R")
data=read.csv("production.csv",header=T)
data
Size Hours
1 30 73
2 20 50
3 60 128
4 80 170
5 40 87
6 50 108
7 60 135
8 30 69
9 70 148
10 60 132
attach(data)
results=lin.reg(Size,Hours)
results
$y.intercept
[1] 10
$slope
[1] 2
fitted=10+2*Size
Residuals=Hours-fitted
cbind(data,fitted,Residuals)
sigma2=sum(Residuals^2)/(length(Size)-2) # This is the MSE
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 2/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
sig.hat=sqrt(sigma2)
sig.hat
[1] 2.738613
std.res=Residuals/sig.hat
cbind(data,fitted,Residuals,std.res)
Size Hours fitted Residuals std.res
1 30 73 70 3 1.0954451
2 20 50 50 0 0.0000000
3 60 128 130 -2 -0.7302967
4 80 170 170 0 0.0000000
5 40 87 90 -3 -1.0954451
6 50 108 110 -2 -0.7302967
7 60 135 130 5 1.8257419
8 30 69 70 -1 -0.3651484
9 70 148 150 -2 -0.7302967
10 60 132 130 2 0.7302967
sum(Residuals)
[1] 0 #Verifying property #1
sum(Size*Residuals)
[1] 0 #Verifying property #4
sum(fitted*Residuals)
[1] 0 #Verifying property #5
# Testing if slope=0.
qt(.975,8) #Obtaining the t-critical value for alpha=0.05
[1] 2.306004 #So we should reject the H0 if t-observed is > 2.306 or < -2.306.
source("E:\\Math445\\corre.R")
corre(Size,Hours)
$ss.xx
[1] 3400
sigma2
[1] 7.5
sqrt(7.5/3400)
[1] 0.04696682 # This is SE_b1
2/.047
[1] 42.55319 # This is the t-observed value (It is > 2.306, so reject H0)
Call:
lm(formula = Hours ~ Size)
Coefficients:
(Intercept) Size
10 2
summary(results)
Call:
lm(formula = Hours ~ Size)
Residuals:
Min 1Q Median 3Q Max
-3.0 -2.0 -0.5 1.5 5.0
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.00000 2.50294 3.995 0.00398 **
Size 2.00000 0.04697 42.583 1.02e-10 *** # Note that the p-value is extremely small, so reject H0
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 3/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
########################################################
### Day 6 (9/17/12): Inference about the y-intercept ###
########################################################
# Review homework
# Tree circumference and height problem
circ=c(1.8,1.9,1.8,2.4,5.1,3.1,5.5,5.1,8.3,13.7,5.3,4.9,3.7,3.8)
height=c(21,33.5,24.6,40.7,73.2,24.9,40.4,45.3,53.5,93.8,64,62.7,47.2,44.3)
b1.hat=ss.xy/ss.xx; b0.hat=mean(y)-b1.hat*mean(x)
fitted=b0.hat+b1.hat*x
residuals=fitted-y
SSE=sum(residuals^2)
n=length(x)
MSE=SSE/(n-2)
sigma.hat=sqrt(MSE)
SE.b1=sqrt(MSE/ss.xx)
results=lin.reg(circ,height)
results2=lm(height~circ)
summary(results2)
confint(results2,level=.95) # Will give the C.I. for the slope and y-intercept.
#############################################
### Day 8 (9/21/12): Prediction intervals ###
#############################################
data=read.csv("production.csv",header=T)
attach(data)
results=lm(Hours~Size)
summary(results)
confint(results,level=.95)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 4/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
predict(results,newdata=new2,interval="prediction",level=.99) # The default level is 0.95
########################################
### Day 9 (9/24/12): Confidence Band ###
########################################
results=lm(dias~sys)
abline(results)
new=data.frame(sys=122)
predict(results,new) # equals 79.46697
predict(results,new,interval="confidence") # [71.28516, 87.64878]
predict(results,new,interval="confidence",level=.99) # [67.99666, 90.93728]
predict(results,new,interval="prediction",level=.99) # [51.67416, 107.2598]
SSE=sum(results$res^2)
s=sqrt(SSE/12) # sqrt(MSE) = can also be obtained from summary(results)
ss.xx=13*var(sys)
#SE.mean=8.288*sqrt(1/14+(122-mean(sys))^2/ss.xx)
ci=predict(results, se.fit=TRUE) # This will give you the fitted values and their corresponding SE.mean
SE.mean=ci$se.fit
ll=ci$fit-W*SE.mean
ul=ci$fit+W*SE.mean
# Correlation (the variable is 'dias' — the transcript's 'dia' would error
# with "object 'dia' not found"; see results=lm(dias~sys) above)
cor(sys,dias) # Computes the Pearson correlation coefficient, r
cor.test(sys,dias) # Tests Ho:rho=0 and also constructs C.I. for rho
cor(sys,dias,method="spearman") # Computes the Spearman's correlation coefficient
cor(sys,dias,method="kendall") # Computes Kendall's Tau
################################
### (10/1/12) Residual Plots ###
################################
data=read.csv("data_Toluca.csv",header=T)
attach(data)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 5/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
table(size)
boxplot(size) # Creates a boxplot (good for outlier detection)
size.out=c(size,200)
boxplot(size.out) # Note the presence of the outlier
# Load 'epicalc' ## You might need to download and install this package first.
dotplot(size)
plot(size, hours)
results=lm(hours~size)
abline(results,col="blue")
## A non-linear example
z=rnorm(length(size),0,100)
x=size
y=x^2+z
plot(x,y) # Note that the relationship is curvilinear
cor(x,y) # Note that r is pretty high!
temp=lm(y~x)
plot(temp$fitted,temp$res) # Note the pattern in this residual plot
x2=x^2
temp2=lm(y~x2)
plot(temp2$fitted,temp2$res)
qqnorm(temp$residuals)
qqline(temp$residuals)
shapiro.test(temp$residuals)
###################################################
### (10/3/12) Diagnostics and Remedial Measures ###
###################################################
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 6/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
z=rnorm(length(size),0,100*sqrt(size))
x=size
y=x+z
plot(x,y) # Note the increasing variation
temp3=lm(y~x)
plot(temp3$fitted,temp3$res) # Note the increasing variation
# 6. One or several important predictor variables have been omitted from the model.
########################################################
### (10/10/12) Tests for Constancy of Error Variance ###
########################################################
## Brown-Forsythe Test
# Brown-Forsythe test for constancy of error variance in simple linear
# regression. The residuals from lm(y ~ x) are split into two groups at
# x.med, and a two-sample t-test is performed on the absolute deviations
# of each group's residuals from that group's median.
#
# Args:
#   x     : numeric predictor vector.
#   y     : numeric response vector, same length as x.
#   x.med : split point between the two groups (default: median(x)).
# Returns: list with t.bf (t statistic) and p.val (two-sided p-value).
bf <- function(x, y, x.med = median(x)) {
  e <- lm(y ~ x)$residuals
  e1 <- e[x <= x.med]; n1 <- length(e1)
  e2 <- e[x > x.med];  n2 <- length(e2)
  d1 <- abs(e1 - median(e1))
  d2 <- abs(e2 - median(e2))
  # Pooled variance of the absolute deviations
  s2 <- (sum((d1 - mean(d1))^2) + sum((d2 - mean(d2))^2)) / (n1 + n2 - 2)
  t.bf <- (mean(d1) - mean(d2)) / (sqrt(s2) * sqrt(1 / n1 + 1 / n2))
  # FALSE spelled out: T and F are ordinary variables and can be reassigned
  p.val <- 2 * pt(abs(t.bf), df = (n1 + n2 - 2), lower.tail = FALSE)
  list(t.bf = t.bf, p.val = p.val)
}
data=read.csv("data_Toluca.csv",header=T)
attach(data)
x=sample(seq(20,120,by=10),100,replace=T)
z=rnorm(length(x),0,200*sqrt(x))
y=(62.37+3.57*x)+z
plot(x,y) # Note the increasing variation
bf(x,y)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 7/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
# load the package 'lmtest'
library(lmtest)
bptest(hours~size,studentize=FALSE)
# Example 2:
x=1:50
y=(1:50)*rnorm(50)
bf(x,y)
bptest(y~x) # bptest() takes a model formula, not two separate vectors
#########################################
### (10/12/12) F Test for Lack of Fit ###
#########################################
x=c(125,100,200,75,150,175,75,175,125,200,100)
y=c(160,112,124,28,152,156,42,124,150,104,136)
plot(x,y)
results=lm(y~x)
anova(results)
reduced.mod=lm(y~x)
full.mod=lm(y~factor(x))
anova(reduced.mod,full.mod)
##################################
### (10/15/12) Transformations ###
##################################
log10plasma=log(Plasma,base=10)
plot(Age,log10plasma)
results2=lm(log10plasma~Age)
plot(Age,results2$res)
qqnorm(results2$res)
# Box-Cox Transformations
y=Plasma;x=Age
#k2=(prod(y))^(1/length(y))
#k1=1/(lambda*k2^(lambda-1))
#w=k1*(y^lambda-1)
#results=lm(w~x)
#anova(results)
#anova(results)$Sum[2] # This will give you the value of the SSE
# Box-Cox: grid search for the lambda that minimizes the SSE of the
# transformed regression (the pagination junk that the scrape inserted
# inside the loop body has been removed so the loop parses again).
lambda=seq(-2,2,by=.1)
lambda[21]=0.01 # Just to avoid the special case, lambda=0
iter=length(lambda)
SSEs=array(0,iter) # Storage for the SSEs
k2=(prod(y))^(1/length(y)) # Geometric mean of y
for(i in seq_len(iter)){
  k1=1/(lambda[i]*k2^(lambda[i]-1))
  w=k1*(y^lambda[i]-1) # Standardized Box-Cox transform of y
  results=lm(w~x)
  SSEs[i]=anova(results)$Sum[2] # SSE of the fitted model
}
lambda[21]=0
cbind(lambda,SSEs) # Choose lambda that minimizes the SSE
plot(lambda,SSEs)
xp=sqrt(x)
results=lm(y~xp)
summary(results) # R^2=0.9545 - note the improvement
plot(xp,results$res)
qqnorm(results$res)
##########################################
### (10/17/12) Simultaneous Inferences ###
##########################################
data=read.csv("data_Toluca.csv",header=T)
attach(data)
results=lm(hours~size)
confint(results,level=.90)
coefficients(summary(results))
new=data.frame(size=c(30,65,100))
ci=predict(results,new,interval="confidence",se.fit=T)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 9/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
########################################################################
### (10/19/12) Simultaneous Inferences and Regression through Origin ###
########################################################################
data=read.csv("data_Toluca.csv",header=T)
attach(data)
results=lm(hours~size)
MSE=anova(results)$Mean[2]
new=data.frame(size=c(80,100))
ci=predict(results,new,interval="prediction",se.fit=T)
se.fit.pred=sqrt(MSE*(1+(ci$se.fit^2/MSE)))
# Scheffe Procedure
S=sqrt(2*qf(1-.05,2,23)) # In general, use S=sqrt(k*qf(1-.05,k,n-2))
# If k=2:10, look at
plot(k,sqrt(k*qf(1-.05,k,n-2)))
lwr.scheffe=ci$fit[,1]-S*se.fit.pred
upr.scheffe=ci$fit[,1]+S*se.fit.pred
cbind(ci$fit,lwr.scheffe,upr.scheffe)
# Bonferroni Procedure
ci.bon=predict(results,new,interval="prediction",level=(1-.05/2))
n=length(cost)
b1=sum(units*cost)/sum(units^2)
fitted=b1*units
residuals=cost-fitted
SSE=sum(residuals^2)
MSE=SSE/(n-1)
se.b1=sqrt(MSE/sum(units^2))
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 10/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
##################################
### (10/22/12) Matrix Approach ###
##################################
A=matrix(1:6,nrow=2)
B=matrix(c(-1,0,2,1,4,3),nrow=2)
A
B
A+B # Matrix addition = addition of corresponding entries
A-B
t(A) # transpose of A
D=t(A)%*%A # To do matrix multiplication, use %*%
E=A%*%t(A) # Note that E is not equal to D.
E.inv=solve(E) # Computing the inverse of a square matrix
I=E.inv%*%E # The product of E and it's inverse is the Identity matrix
data=read.csv("data_Toluca.csv",header=T)
attach(data)
x=size; y=hours
n=length(x)
t(y)%*%y # Sum of squares of yi's
X=matrix(c(rep(1,n),x),ncol=2) # The design matrix X
t(X)%*%X
t(X)%*%y
solve(t(X)%*%X)
B=solve(t(X)%*%X)%*%(t(X)%*%y) # Estimates of the regression coefficients
coefficients(lm(y~x)) # Note that you get these results
fitted=X%*%B
######################################
### (10/24/12) Multiple Regression ###
######################################
data=read.csv("data_dwaine.csv",header=T)
attach(data)
x1=young
x2=income
y=sales
n=length(sales)
pairs(cbind(x1,x2,y)) # Creates a scatter plot matrix
pairs(data)
cor(data) # Creates a correlation matrix
# Diagnostics
par(mfrow=c(2,2))
plot(results$fitted,results$residuals)
plot(x1,results$residuals)
plot(x2,results$residuals)
plot(x1*x2,results$residuals) # Checking for interaction effects
anova(results)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 11/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
# F Test for Regression Relation
# Coefficient of multiple determination
# Adjusted coefficient of multiple determination
# Coefficient of multiple correlation
#Prediction
new=data.frame(x1=65.4,x2=17.6)
predict(results,new)
predict(results,new,interval="confidence")
predict(results,new,interval="prediction")
new2=data.frame(x1=c(65.4,53.1),x2=c(17.6,17.7))
predict(results,new2,interval="prediction",level=(1-.10/2)) # Bonferroni Intervals
###########################################
### (10/29/12) More Multiple Regression ###
###########################################
body=read.csv("data_body_fat.csv",header=T)
attach(body)
x1=triceps
x2=thigh
x3=midarm
y=fat
results.x1x2x3=lm(y~x1+x2+x3)
results.x1x2=lm(y~x1+x2)
anova(results.x1x2)
results.x1=lm(y~x1)
anova(results.x1)
# ssr(x2|x1)=sse(x1)-sse(x1,x2)=ssr(x1,x2)-ssr(x1)
# f_obs=(ssr(x2|x1)/1)/(sse(x1,x2)/(n-3)) # Ho: beta_2=0
results.x1x2x3=lm(y~x1+x2+x3)
# ssr(x3|x1,x2)=sse(x1,x2)-sse(x1,x2,x3)=ssr(x1,x2,x3)-ssr(x1,x2)
# f_obs=(ssr(x3|x1,x2)/1)/(sse(x1,x2,x3)/(n-4)) # Ho: beta_3=0
# ssr(x2,x3|x1)=sse(x1)-sse(x1,x2,x3)=ssr(x1,x2,x3)-ssr(x1)
# f_obs=(ssr(x2,x3|x1)/2)/(sse(x1,x2,x3)/(n-4)) # Ho: beta_2=0 and beta_3=0
######################################################
### (11/5/12) Coefficient of Partial Determination ###
######################################################
anova(results.x1x2)
anova(results.x1)
# R^2(2|1)=ssr(x2|x1)/sse(x1)=33.17/143.12=0.232
# Hence, the SSE(x1) is reduced by 23.2 percent.
anova(results.x1x2x3)
anova(results.x1x2)
# R^2(3|12)=ssr(x3|x1,x2)/sse(x1,x2)=11.55/109.95=0.105
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 12/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
# when explanatory variables differ substantially in order of magnitude
data=read.csv("data_dwaine.csv",header=T)
attach(data)
x1=young
x2=income
y=sales
n=length(sales)
x1.star=((x1-mean(x1))/sd(x1))/sqrt(n-1)
x2.star=((x2-mean(x2))/sd(x2))/sqrt(n-1)
y.star=((y-mean(y))/sd(y))/sqrt(n-1)
results.star=lm(y.star~0+x1.star+x2.star)
b1=(sd(y)/sd(x1))*0.7484
b2=(sd(y)/sd(x2))*0.2511
b0=mean(y)-b1*mean(x1)-b2*mean(x2)
#check
lm(y~x1+x2)
###################################
### (11/7/12) Multicollinearity ###
###################################
# Uncorrelated predictors
x1=c(4,4,4,4,6,6,6,6)
x2=c(2,2,3,3,2,2,3,3)
cor(x1,x2)
y=c(42,39,48,51,49,53,61,60)
anova(lm(y~x1))
anova(lm(y~x2))
anova(lm(y~x1+x2)) # Note that ssr(x2)=ssr(x2|x1)
# y2_hat1=-87+x3+18*x4
# y2_hat2=-7+9*x3+2*x4
# y2_hat3=-17+8*x3+4*x4 # Note that all 3 models give perfect fit.
data=read.csv("data_body_fat.csv",header=T)
attach(data)
pairs(data)
cor(data)
summary(lm(midarm~triceps+thigh))
x1=triceps
x2=thigh
x3=midarm
y=fat
coefficients(lm(y~x1))
coefficients(lm(y~x2))
coefficients(lm(y~x1+x2)) # Note how the beta estimates drastically change as you
coefficients(lm(y~x1+x2+x3)) # include highly correlated predictors
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 13/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
results2=lm(y~x1+x2)
predict(results2,new=data.frame(x1=25,x2=50),interval="confidence",se.fit=T)
results3=lm(y~x1+x2+x3)
predict(results3,new=data.frame(x1=25,x2=50,x3=29),interval="confidence",se.fit=T)
########################################################
### (11/9/12) Regression with Qualitative Predictors ###
########################################################
plot(size,months,pch=as.numeric(type))
abline(33.874,-0.102,lwd=2) # For mutual firms
abline(33.874+8.055,-0.102,lwd=2,col="darkred") # For stock firms
# Another example
data=read.csv("ex12-38.csv",header=T)
# attach() is called only for its side effect; assigning its return value
# (as the transcript's data=attach(data) did) would overwrite the data
# frame with the attached environment.
attach(data)
# Another Example
yield=c(12.2,12.4,11.9,11.3,11.8,12.1,13.1,12.7,12.4,11.4,16.6,15.8,16.5,15.0,15.4,15.6,15.8,
15.8,16.0,15.8,9.5,9.5,9.6,8.8,9.5,9.8,9.1,10.3,9.5,8.5)
height=c(45,52,42,35,40,48,60,61,50,33,63,50,63,33,38,45,50,48,50,49,52,54,58,45,57,62,52,67,
55,40)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 14/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
treatment=factor(c(rep("C",10),rep("s",10),rep("f",10)))
plot(height,yield,pch=as.numeric(treatment))
summary(lm(yield~treatment+height))
tapply(yield,treatment,mean)
tapply(height,treatment,mean)
########################################
### (11/12/12) Polynomial Regression ###
########################################
# polynomial regression
data2=read.csv("data_power_cells.csv",header=T)
attach(data)
cor(charge,charge^2) # Note how correlated they are.
cor(temperature,temperature^2)
y=cycles
x1=(charge-mean(charge))/.4 # Transforming will reduce the correlation
x2=(temperature-mean(temperature))/.4
x1.2=x1^2 # Check cor(x1,x1.2)
x2.2=x2^2 # Check cor(x2,x2.2)
x1x2=x1*x2
summary(lm(y~x1+x1.2+x2+x2.2+x1x2)) # Note that the interaction effect is not significant.
summary(lm(y~x1+x1.2+x2+x2.2)) # Note that x2^2 is not significant.
summary(lm(y~x1+x1.2+x2)) # Note that x1^2 is not significant.
summary(lm(y~x1+x2)) # Note how much R^2 changed as you remove variables from the model.
##################################
### (11/14/12) Model Selection ###
##################################
data=read.csv("data_surgical_unit.csv",header=T)
attach(data)
y=survival
x1=clotting
x2=prognostic
x3=enzyme
x4=liver
cor(cbind(y,x1,x2,x3,x4))
pairs(cbind(y,x1,x2,x3,x4)) # Note that some relationships look more exponential
pairs(cbind(log(y),x1,x2,x3,x4)) # Note that the relationships are more linear
temp=lm(y~x1+x2+x3+x4)
plot(temp$fit,temp$res) # Note that the residual plot exhibits nonconstancy of variance.
log.y=log(y)
temp=lm(log.y~x1+x2+x3+x4)
plot(temp$fit,temp$res) # Note that the residual plot looks much better.
dataset=data.frame(log.y=log(y),x1=x1,x2=x2,x3=x3,x4=x4)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 15/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
anova(lm(log.y~x1+x2+x3))$Sum[4] # SSE = 3.109
## You can get several of these criteria using the R package "leaps".
library(leaps)
log.y=log(y)
dataset=data.frame(log.y=log(y),x1=x1,x2=x2,x3=x3,x4=x4)
results=regsubsets(log.y~.,data=dataset,nbest=6,nvmax=4)
attributes(summary(results))
outmat=summary(results)$outmat
SSE=round(summary(results)$rss,3)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 16/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
R2=round(summary(results)$rsq,3)
R2.adj=round(summary(results)$adjr2,3)
Cp=round(summary(results)$cp,3)
BIC=round(summary(results)$bic,3) # BIC is equivalent to SBC
data.frame(outmat,SSE,R2,R2.adj,Cp,BIC)
# Another example
library(faraway) # will load the 'faraway' package
library(help=faraway)
data(seatpos)
seatpos
help(seatpos)
reg=lm(hipcenter~.,data=seatpos)
summary(reg)
results2=regsubsets(hipcenter~.,data=seatpos,nbest=2)
summary(results2)
##############################################
### (11/16/12) Stepwise Regression Methods ###
##############################################
data=read.csv("data_surgical_unit.csv",header=T)
attach(data)
y=survival
x1=clotting
x2=prognostic
x3=enzyme
x4=liver
x5=age
x6=gender
x7=moderate
x8=heavy
log.y=log(y)
dataset=data.frame(log.y,x1,x2,x3,x4,x5,x6,x7,x8)
library(leaps)
results=regsubsets(log.y~.,data=dataset,nbest=4,nvmax=8)
attributes(summary(results))
outmat=summary(results)$outmat
SSE=round(summary(results)$rss,3)
R2=round(summary(results)$rsq,3)
R2.adj=round(summary(results)$adjr2,3)
Cp=round(summary(results)$cp,3)
BIC=round(summary(results)$bic,3) # BIC is equivalent to SBC
data.frame(outmat,SSE,R2,R2.adj,Cp,BIC)
# Using SBC or BIC, the best model includes x1,x2,x3, and x8.
# Using Cp, the best model includes x1,x2,x3,x5, and x8.
# Using R2.adj, the best model includes x1,x2,x3,x5,x6, and x8.
## Forward Selection ##
# Add one variable at a time.
# Insert the variable that yields the smallest (t-statistic) p-value.
# stage 1: x3 enters the model
coefficients(summary(lm(log.y~x1)))
pval.x1=coefficients(summary(lm(log.y~x1)))[2,4]
pval.x2=coefficients(summary(lm(log.y~x2)))[2,4]
pval.x3=coefficients(summary(lm(log.y~x3)))[2,4]
pval.x4=coefficients(summary(lm(log.y~x4)))[2,4]
pval.x5=coefficients(summary(lm(log.y~x5)))[2,4]
pval.x6=coefficients(summary(lm(log.y~x6)))[2,4]
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 17/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
pval.x7=coefficients(summary(lm(log.y~x7)))[2,4]
pval.x8=coefficients(summary(lm(log.y~x8)))[2,4]
pvals=c(pval.x1,pval.x2,pval.x3,pval.x4,pval.x5,pval.x6,pval.x7,pval.x8)
which(pvals==min(pvals)) # x3 has the smallest p-value
(8.38e-08)
# stage 5: None of the remaining 4 variables can get into the model (all p-values > 0.05)
pval.x3x2x8x1x4=coefficients(summary(lm(log.y~x3+x2+x8+x1+x4)))[6,4]
pval.x3x2x8x1x5=coefficients(summary(lm(log.y~x3+x2+x8+x1+x5)))[6,4]
pval.x3x2x8x1x6=coefficients(summary(lm(log.y~x3+x2+x8+x1+x6)))[6,4]
pval.x3x2x8x1x7=coefficients(summary(lm(log.y~x3+x2+x8+x1+x7)))[6,4]
pvalsx3x2x8x1=c(pval.x3x2x8x1x4,pval.x3x2x8x1x5,pval.x3x2x8x1x6,pval.x3x2x8x1x7)
pvalsx3x2x8x1 # All p-values are > 0.05 --> stop
## Backward Selection ##
# Start with all the variables in the model, then remove one at a time.
coefficients(summary(lm(log.y~x1+x2+x3+x4+x5+x6+x7+x8))) # Remove x4.
coefficients(summary(lm(log.y~x1+x2+x3+x5+x6+x7+x8))) # Remove x7.
coefficients(summary(lm(log.y~x1+x2+x3+x5+x6+x8))) # Remove x5.
coefficients(summary(lm(log.y~x1+x2+x3+x6+x8))) # Remove x6.
coefficients(summary(lm(log.y~x1+x2+x3+x8))) # All p-values are now < alpha
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 18/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
# Therefore, the best model obtained using the forward selection is
# log.y~x1+x2+x3+x8
## Validation
temp=sample(1:54,20)
data.validation=data[temp,]
data.training=data[-temp,]
data.valid=read.csv("data_surgical_validation.csv",header=T)
attach(data.valid)
model.1=lm(log(survival)~clotting+prognostic+enzyme+heavy) # Based on BIC
coefficients(summary(lm(log.y~x1+x2+x3+x8)))
###############################################
### (11/19/12) Model Building - Diagnostics ###
###############################################
# Small worked example: computing the hat matrix by hand (n = 4, p = 3)
x1=c(14,19,12,11)
x2=c(25,32,22,15)
y=c(301,327,246,187)
X=cbind(array(1,4),x1,x2) # Design matrix: intercept column plus x1 and x2
H=X%*%solve(t(X)%*%X)%*%t(X) # Computing the hat matrix
y.hat=H%*%y # Will give the predicted values (see table on page 393)
e=(diag(4)-H)%*%y # Will give the residuals
h=diag(H) # Extracting the main diagonal from the hat matrix
n=length(x1)
MSE=sum(e^2)/(n-3) # n - p = 1 error degree of freedom here
s2.e=MSE*(diag(4)-H) # Will give the variance and co-variance matrix
s.e=sqrt(diag(s2.e))
r.stud=e/s.e # Will give the studentized residuals
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 19/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
# Studentized deleted residuals: t_i = e_i * sqrt((n-p-1)/(SSE*(1-h_i)-e_i^2)).
# NOTE(review): 'sse' is never defined in this transcript — presumably it is
# sum(e^2) (the SSE) of the regression being diagnosed; confirm against the
# textbook example. Also note Table 10.3 uses a larger data set than the
# n = 4 example above, for which n-3-1 = 0 leaves no degrees of freedom.
t=round(e*sqrt((n-3-1)/(sse*(1-h)-e^2)),3)
data.frame(residuals=e,h=h,t=t) # Table 10.3 on page 397
# Bonferroni-adjusted critical value for the outlier test at family alpha = .10
t.crit=qt(1-.10/(2*n),df=n-3-1)
which(abs(t)>t.crit)
# Leverage
which(h>2*mean(h)) # Note that sum(h)=p ==> mean(h)=p/n
# Deletion statistics
dff=dffits(results.fat) # Compare abs(dffits) with 1 for small to medium size data and 2*sqrt(p/n) for large data sets.
dfb=dfbeta(results.fat) # Compare abs(dfbeta) with 1 (if small or medium data) or 2/sqrt(n) (for large data sets)
cooks=cooks.distance(results.fat) # Compare with qf(.10,p,n-p) or qf(.20,p,n-p)
round(data.frame(dffits=dff,cooks=cooks,dfbetas=dfb),3)
# Variance Inflation Factor (If VIF > 10, there is a strong collinearity)
library(car) # Install and load R package 'car'
results3=lm(fat~.,data=data)
vif(results3) # Note how large the VIFs are, indicating strong collinearity
summary(lm(midarm~triceps+thigh))$r.sq # Note how big R^2 is, indicating the strong linear
# correlation between midarm and the other 2 predictors.
data2=read.csv("data_surgical_unit.csv",header=T)
attach(data2)
y=survival
x1=clotting
x2=prognostic
x3=enzyme
x4=liver
x5=age
x6=gender
x7=moderate
x8=heavy
log.y=log(y)
results2=lm(log.y~x1+x2+x3+x8)
vif(results2) # All VIFs are small. Therefore, we don't have collinearity problem.
#####################################################
### (11/26/12) Model Building - Remedial Measures ###
#####################################################
data=read.csv("data_body_fat.csv",header=T)
attach(data)
results.fat=lm(fat~triceps+thigh,data=data)
library(quantreg)
rq(fat~triceps+thigh,data=data)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 20/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
# 2. Least Median of Squares (LMS) Regression
# min(median(residuals^2))
# 1. The linear regression is weighted to give cases further from the middle X level
# in each neighborhood smaller weights
# 2. To make the procedure robust to outlying observations, the linear regression fitting
# is repeated, with the weights revised so that cases that had large residuals in the
# fitting receive smaller weights in the second fitting
# 3. To improve the robustness of the procedure further, step 2 is repeated one or more
# times by revising the weights according to the size of the residuals in the latest
# fitting
lowess(triceps,fat)
plot(lowess(triceps,fat),type='l')
points(triceps,fat)
par(mfrow=c(2,2))
plot(lm(fat~triceps))
data2=read.csv("data_surgical_unit.csv",header=T)
attach(data2)
# The transcript's results(survival~clotting) calls 'results' as a function;
# the intent (see plot(results) below) is clearly to fit the model:
results=lm(survival~clotting)
par(mfrow=c(2,2))
plot(results) # The four default lm diagnostic plots
########################################
### (11/28/12) Nonlinear Regression ###
########################################
curve(5*exp(x/2),0,10)
curve(100-50*exp(-2*x),0,3)
x=sample(1:10,100,replace=T)
y=5*exp(x/2)+rnorm(100,mean=0,sd=20)
plot(x,y)
nls(y~b*exp(a*x),start=list(a=1,b=3))
curve(4.8003*exp(x*0.5049),0,10,add=T,col="darkred",lwd=2)
data=read.csv("data_injured_patients.csv",header=T)
attach(data)
Y=index; X=days
results=nls(Y~b*exp(a*X),start=list(a=0,b=50))
Y.fitted=58.60653*exp(-0.03959*X)
residuals=Y-Y.fitted
plot(Y.fitted,residuals,main="Residual Plot")
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 21/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
curve(10/(1+20*exp(-2*x)),0,4,lwd=2)
x=sample(0:5,100,replace=T)
y=10/(1+20*exp(-2*x))+rnorm(100,mean=0,sd=.5)
plot(x,y)
nls(y~c/(1+b*exp(a*x)),start=list(a=-2,b=20,c=10))
curve(10.131/(1+16.623*exp(-1.855*x)),0,5,col="darkred",lwd=2,add=T)
data.kinetics=read.csv("data_kinetics.csv",header=T)
attach(data.kinetics)
y=velocity; x=concentration
y.p=1/y
x.p=1/x
results=lm(y.p~x.p) #bo=0.03376 and b1=0.45401
nls(y~a*x/(b+x),start=list(a=1/0.03376,b=0.454/0.03376))
plot(concentration,velocity)
curve(28.14*x/(12.57+x),0,40,add=T,lwd=2)
y.fitted=28.14*x/(12.57+x)
residuals=y-y.fitted
plot(y.fitted,residuals)
############################################
### (12/3/12) Generalized Linear Models ###
############################################
curve(pnorm(0+1*x),-3,3,lwd=2)
curve(pnorm(0+2*x),-3,3,lwd=2,col="red",add=T)
curve(dnorm(x,0,1),-3,3)
curve(exp(x)/(1+exp(x))^2,-3,3,add=T,col="red")
curve(dnorm(x,0,sd=pi/sqrt(3)),-3,3,add=T,col="blue")
data=read.csv("data_task.csv",header=T)
attach(data)
x=experience
y=success
predicted=exp(-3.0597+0.1615*x)/(1+exp(-3.0597+0.1615*x))
prediction=as.numeric(results$fitted>.7)
error=success-prediction
data.frame(success,predicted=results$fitted,prediction,error)
percent.error=sum(abs(error))/length(error)
percent.error
dystrophy=read.csv("data_dystrophy.csv",header=T)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 22/23
10/30/2018 websites.uwlax.edu/storibio/math445_fall12/r_commands.txt
attach(dystrophy)
http://websites.uwlax.edu/storibio/math445_fall12/r_commands.txt 23/23