
############################################################
###### Math 445 - Correlation and Regression Analysis ######
###### (Dr. Toribio - Fall 2012) ######
############################################################

###################################
### Day 2 (9/7/12): Correlation ###
###################################

# Importing data
# Change the R working directory to the correct folder

data=read.csv("marathon.csv",header=T)
attach(data)

x=Distance
y=Time

plot(x,y)
plot(x,y,xlab="Distance",ylab="Time")

ss=function(x,y) {sum(x*y)-sum(x)*sum(y)/length(x)}
ss.xx=ss(x,x)
ss.yy=ss(y,y)
ss.xy=ss(x,y)

r=ss.xy/sqrt(ss.xx*ss.yy)

cor(x,y) # Will compute the linear correlation coefficient

#################################################
### Day 3 (9/12/12): Simple Linear Regression ###
#################################################

### Saving a function in R

corre=function(x,y) # This function will compute the correlation between vectors x and y.
{
ss=function(x,y) {sum(x*y)-sum(x)*sum(y)/length(x)}
ss.xx=ss(x,x)
ss.yy=ss(y,y)
ss.xy=ss(x,y)

r=ss.xy/sqrt(ss.xx*ss.yy)
list(r=r,ss.xx=ss.xx,ss.yy=ss.yy,ss.xy=ss.xy) # This will be the output of the function.
}

# Tree circumference and height problem


circ=c(1.8,1.9,1.8,2.4,5.1,3.1,5.5,5.1,8.3,13.7,5.3,4.9,3.7,3.8)
height=c(21,33.5,24.6,40.7,73.2,24.9,40.4,45.3,53.5,93.8,64,62.7,47.2,44.3)

corre(circ,height)

####################################################
### Day 4 (9/12/12): Fitted values and Residuals ###
####################################################

lin.reg=function(x,y) # This function will give you the least-squares regression line.
{
ss=function(x,y) {sum(x*y)-sum(x)*sum(y)/length(x)}
ss.xx=ss(x,x); ss.yy=ss(y,y); ss.xy=ss(x,y)

b1.hat=ss.xy/ss.xx; b0.hat=mean(y)-b1.hat*mean(x)

list(y.intercept=b0.hat,slope=b1.hat) # This will be the output of the function.


}

# Car price problem


age=c(5,4,6,6,5,5,6,6,2,7,7)
price=c(85,102,70,80,89,98,66,90,169,68,50)

cbind(age,price)
price[age>5] # This will list all prices of cars that are older than 5 years.

#source("E:\\Math445\\lin.reg.R")
lin.reg(age,price)

plot(age,price)
abline(197.1991,-20.376) # Drawing a line with y-intercept=197.1991 and slope=-20.376.

predicted=197.1991-20.376*age # predicted values are also known as fitted values


residuals=price-predicted
cbind(age,price,predicted,residuals)

##################################################
### Day 5 (9/14/12): Inference about the slope ###
##################################################

data=read.csv("production.csv",header=T)

source("E:\\Math445\\lin.reg.R")
data=read.csv("production.csv",header=T)
data
Size Hours
1 30 73
2 20 50
3 60 128
4 80 170
5 40 87
6 50 108
7 60 135
8 30 69
9 70 148
10 60 132
attach(data)
results=lin.reg(Size,Hours)
results
$y.intercept
[1] 10

$slope
[1] 2

fitted=10+2*Size
Residuals=Hours-fitted
cbind(data,fitted,Residuals)
sigma2=sum(Residuals^2)/(length(Size)-2) # This is the MSE

sig.hat=sqrt(sigma2)
sig.hat
[1] 2.738613
std.res=Residuals/sig.hat
cbind(data,fitted,Residuals,std.res)
Size Hours fitted Residuals std.res
1 30 73 70 3 1.0954451
2 20 50 50 0 0.0000000
3 60 128 130 -2 -0.7302967
4 80 170 170 0 0.0000000
5 40 87 90 -3 -1.0954451
6 50 108 110 -2 -0.7302967
7 60 135 130 5 1.8257419
8 30 69 70 -1 -0.3651484
9 70 148 150 -2 -0.7302967
10 60 132 130 2 0.7302967

sum(Residuals)
[1] 0 #Verifying property #1
sum(Size*Residuals)
[1] 0 #Verifying property #4
sum(fitted*Residuals)
[1] 0 #Verifying property #5

# Testing if slope=0.
qt(.975,8) #Obtaining the t-critical value for alpha=0.05
[1] 2.306004 #So we should reject the H0 if t-observed is > 2.306 or < -2.306.
source("E:\\Math445\\corre.R")
corre(Size,Hours)
$ss.xx
[1] 3400

sigma2
[1] 7.5
sqrt(7.5/3400)
[1] 0.04696682 # This is SE_b1
2/.047
[1] 42.55319 # This is the t-observed value (It is > 2.306, so reject H0)

results=lm(Hours~Size) # This is the built-in function for regression in R.


results

Call:
lm(formula = Hours ~ Size)

Coefficients:
(Intercept) Size
10 2

summary(results)

Call:
lm(formula = Hours ~ Size)

Residuals:
Min 1Q Median 3Q Max
-3.0 -2.0 -0.5 1.5 5.0

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.00000 2.50294 3.995 0.00398 **
Size 2.00000 0.04697 42.583 1.02e-10 *** # Note that the p-value is extremely small, so reject H0

---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.739 on 8 degrees of freedom


Multiple R-squared: 0.9956, Adjusted R-squared: 0.9951
F-statistic: 1813 on 1 and 8 DF, p-value: 1.02e-10

########################################################
### Day 6 (9/17/12): Inference about the y-intercept ###
########################################################

# Review homework
# Tree circumference and height problem
circ=c(1.8,1.9,1.8,2.4,5.1,3.1,5.5,5.1,8.3,13.7,5.3,4.9,3.7,3.8)
height=c(21,33.5,24.6,40.7,73.2,24.9,40.4,45.3,53.5,93.8,64,62.7,47.2,44.3)

lin.reg=function(x,y) # This function will give you the least-squares regression line.
{
ss=function(x,y) {sum(x*y)-sum(x)*sum(y)/length(x)}
ss.xx=ss(x,x); ss.yy=ss(y,y); ss.xy=ss(x,y)

b1.hat=ss.xy/ss.xx; b0.hat=mean(y)-b1.hat*mean(x)
fitted=b0.hat+b1.hat*x
residuals=y-fitted
SSE=sum(residuals^2)
n=length(x)
MSE=SSE/(n-2)
sigma.hat=sqrt(MSE)
SE.b1=sqrt(MSE/ss.xx)

list(y.intercept=b0.hat,slope=b1.hat,SE.b1=SE.b1) # This will be the output of the function.
}

results=lin.reg(circ,height)

results2=lm(height~circ)
summary(results2)
confint(results2,level=.95) # Will give the C.I. for the slope and y-intercept.

#############################################
### Day 8 (9/21/12): Prediction intervals ###
#############################################

data=read.csv("production.csv",header=T)
attach(data)

results=lm(Hours~Size)
summary(results)

confint(results,level=.95)

# Confidence intervals for the mean response


new=data.frame(Size=c(Size,100))
predict(results,newdata=new,interval="confidence")

new2=data.frame(Size=c(90,100)) # new2 contains only the 2 new observations
predict(results,newdata=new2,interval="confidence") # This will give the mean C.I. for the 2 new data

# Prediction interval for future values

predict(results,newdata=new2,interval="prediction",level=.99) # The default level is 0.95

########################################
### Day 9 (9/24/12): Confidence Band ###
########################################

## Review: Homework problem


sys=c(138,130,135,140,120,125,120,130,130,144,143,140,130,150)
dias=c(82,91,100,100,80,90,80,80,80,98,105,85,70,100)
data=data.frame(systolic=sys,diastolic=dias)
plot(sys,dias)

results=lm(dias~sys)
abline(results)

new=data.frame(sys=122)
predict(results,new) # equals 79.46697
predict(results,new,interval="confidence") # [71.28516, 87.64878]
predict(results,new,interval="confidence",level=.99) # [67.99666, 90.93728]
predict(results,new,interval="prediction",level=.99) # [51.67416, 107.2598]

SSE=sum(results$res^2)
s=sqrt(SSE/12) # sqrt(MSE) = can also be obtained from summary(results)
ss.xx=13*var(sys)

SE.predmean=8.288*sqrt(1/3+1/14+(122-mean(sys))^2/ss.xx) # about 6.08


79.47-qt(.975,df=12)*SE.predmean # equals 66.22
79.47+qt(.975,df=12)*SE.predmean # equals 92.72

## Working-Hotelling Confidence Band

# mu.hat +/- W*SE.mu.hat, where W^2=2*F(1-alpha;2,n-2)

W=sqrt(qf(.95,2,12)*2) # equals 2.787577

#SE.mean=8.288*sqrt(1/14+(122-mean(sys))^2/ss.xx)
ci=predict(results, se.fit=TRUE) # This will give you the fitted values and their corresponding SE.mean
SE.mean=ci$se.fit

ll=ci$fit-W*SE.mean
ul=ci$fit+W*SE.mean

plot(sys, dias, xlab="Systolic BP", ylab="Diastolic BP", main="Confidence Band for BP problem")
abline(results$coefficients[1], results$coefficients[2],col="darkred",lwd=2)

points(sort(sys), sort(ll), type="l", lty=2, col="blue") # Superimposes the ll


points(sort(sys), sort(ul), type="l", lty=2, col="blue") # Superimposes the ul

# Correlation
cor(sys,dias) # Computes the Pearson correlation coefficient, r
cor.test(sys,dias) # Tests Ho:rho=0 and also constructs C.I. for rho
cor(sys,dias,method="spearman") # Computes Spearman's correlation coefficient
cor(sys,dias,method="kendall") # Computes Kendall's Tau

################################
### (10/1/12) Residual Plots ###
################################

data=read.csv("data_Toluca.csv",header=T)
attach(data)

table(size)
boxplot(size) # Creates a boxplot (good for outlier detection)
size.out=c(size,200)
boxplot(size.out) # Note the presence of the outlier

plot(size,type="b") # Creates a time plot

hist(size) # Creates a histogram


bin=seq(15,125,by=10)
hist(size,breaks=bin,col="gray")

stem(size) # Creates a stem-and-leaf-plot


stem(size,scale=3)

# Load 'epicalc' ## You might need to download and install this package first.
dotplot(size)

plot(size, hours)
results=lm(hours~size)
abline(results,col="blue")

##Departures from the model##

# 1. The regression function is not linear.


# Plot residuals against predictor OR residuals against fitted values
plot(size,results$residuals) # Note that the residual plot doesn't show any systematic pattern
plot(results$fitted,results$residuals) # Residuals against the fitted values

anova(results) # Just to get MSE


e.star=results$residuals/sqrt(2384) # MSE = 2384
plot(results$fitted,e.star)

## A non-linear example
z=rnorm(length(size),0,100)
x=size
y=x^2+z
plot(x,y) # Note that the relationship is curvilinear
cor(x,y) # Note that r is pretty high!
temp=lm(y~x)
plot(temp$fitted,temp$res) # Note the pattern in this residual plot

x2=x^2
temp2=lm(y~x2)
plot(temp2$fitted,temp2$res)

# 2. The error terms are not normally distributed.


# Plots of residuals against omitted predictor variables
# Box plot of residuals
# Normal probability plots of residuals
qqnorm(results$residuals)
qqline(results$residuals)
shapiro.test(results$residuals)

qqnorm(temp$residuals)
qqline(temp$residuals)
shapiro.test(temp$residuals)

###################################################
### (10/3/12) Diagnostics and Remedial Measures ###
###################################################

# 3. The error terms do not have constant variance.


# Plot of absolute or squared residuals against predictor variable


z=rnorm(length(size),0,100*sqrt(size))
x=size
y=x+z
plot(x,y) # Note the increasing variation

temp3=lm(y~x)
plot(temp3$fitted,temp3$res) # Note the increasing variation

# 4. The error terms are not independent.


# Sequence plot of the residuals
# Plot of residuals against time or other sequence
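
# A minimal sketch of such a sequence plot (assumes the Toluca fit 'results' from above
# and that the rows are already in time order):
plot(results$residuals,type="b",xlab="Run order",ylab="Residual")
abline(h=0,lty=2)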

# 5. The model fits all but one or a few outlier observations.


# Plot residuals against predictor OR residuals against fitted values
# Box plots, stemplots, dot plots of residuals
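
# A quick sketch of the outlier screens listed above, applied to the residuals of the
# Toluca fit 'results' from earlier:
boxplot(results$residuals)
stem(results$residuals)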

# 6. One or several important predictor variables have been omitted from the model.
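
# A hedged sketch for this case: plot the residuals against a variable that is not in the
# model; 'z' below is only a simulated stand-in for such a candidate predictor (hypothetical).
z=rnorm(length(size))
plot(z,results$residuals,xlab="Omitted predictor",ylab="Residual")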

########################################################
### (10/10/12) Tests for Constancy of Error Variance ###
########################################################

## Brown-Forsythe Test

bf=function(x,y,x.med=median(x)){
results=lm(y~x)
e=results$residuals
# x.med=median(x)

e1=e[x<=x.med]; n1=length(e1)
e2=e[x>x.med]; n2=length(e2)

d1=abs(e1-median(e1))
d2=abs(e2-median(e2))

s2=(sum((d1-mean(d1))^2)+sum((d2-mean(d2))^2))/(n1+n2-2)

t.bf=(mean(d1)-mean(d2))/(sqrt(s2)*sqrt(1/n1+1/n2))
p.val=2*pt(abs(t.bf),df=(n1+n2-2),lower.tail=F)

list(t.bf=t.bf,p.val=p.val)}

# Example: Toluca data

data=read.csv("data_Toluca.csv",header=T)
attach(data)

# Source in the function bp.R

bf(size,hours) # t_obs=1.3165 and p-value=0.201

# Example: Simulated data with increasing variance

x=sample(seq(20,120,by=10),100,replace=T)
z=rnorm(length(x),0,200*sqrt(x))
y=(62.37+3.57*x)+z
plot(x,y) # Note the increasing variation

bf(x,y)

## Breusch-Pagan Test (Also known as Cook-Weisberg test)

# load the package 'lmtest'

library(lmtest)
bptest(hours~size,studentize=F)

# Example 2:
x=1:50
y=(1:50)*rnorm(50)
bf(x,y)
bptest(y~x)

#########################################
### (10/12/12) F Test for Lack of Fit ###
#########################################

x=c(125,100,200,75,150,175,75,175,125,200,100)
y=c(160,112,124,28,152,156,42,124,150,104,136)
plot(x,y)

results=lm(y~x)
anova(results)

reduced.mod=lm(y~x)
full.mod=lm(y~factor(x))
anova(reduced.mod,full.mod)

##################################
### (10/15/12) Transformations ###
##################################

# Transforming the Y variable


# Example: Plasma (page 132)
data=read.csv("data_plasma.csv",header=T)
attach(data)
plot(Age,Plasma)
results=lm(Plasma~Age)
plot(Age,results$res)
qqnorm(results$res)

log10plasma=log(Plasma,base=10)
plot(Age,log10plasma)
results2=lm(log10plasma~Age)
plot(Age,results2$res)
qqnorm(results2$res)

# Box-Cox Transformations
y=Plasma;x=Age
#k2=(prod(y))^(1/length(y))
#k1=1/(lambda*k2^(lambda-1))
#w=k1*(y^lambda-1)
#results=lm(w~x)
#anova(results)
#anova(results)$Sum[2] # This will give you the value of the SSE

lambda=seq(-2,2,by=.1)
lambda[21]=0.01 # Just to avoid the special case, lambda=0
iter=length(lambda)
SSEs=array(0,iter) # Storage for the SSEs
k2=(prod(y))^(1/length(y))

for(i in 1:iter){
k1=1/(lambda[i]*k2^(lambda[i]-1))
w=k1*(y^lambda[i]-1)

results=lm(w~x)
SSEs[i]=anova(results)$Sum[2]
}

w=k2*(log(y)) # This is for the special case when lambda=0


results=lm(w~x)
SSEs[21]=anova(results)$Sum[2]

lambda[21]=0
cbind(lambda,SSEs) # Choose lambda that minimizes the SSE
plot(lambda,SSEs)

# load package 'MASS'


library(MASS) # boxcox() is in the MASS package
boxcox(y~x) # Check the resulting plot

# Transforming the X variable


# Example (page 129)
x=c(.5,.5,1,1,1.5,1.5,2,2,2.5,2.5)
y=c(42.5,50.6,68.5,80.7,89,99.6,105.3,111.8,112.3,125.7)
results=lm(y~x)
summary(results) # R^2=0.9256
plot(x,results$res) # note the curve pattern
qqnorm(results$res) # no evidence of non-normality of residuals
bf(x,y) # no evidence of nonconstant variance

xp=sqrt(x)
results=lm(y~xp)
summary(results) # R^2=0.9545 - note the improvement
plot(xp,results$res)
qqnorm(results$res)

##########################################
### (10/17/12) Simultaneous Inferences ###
##########################################

## Bonferroni Joint Estimation of beta_0 and beta_1

data=read.csv("data_Toluca.csv",header=T)
attach(data)

results=lm(hours~size)
confint(results,level=.90)
coefficients(summary(results))

b0=coefficients(summary(results))[1,1] # Extracting the value of b0


se.b0=coefficients(summary(results))[1,2] # The standard error of b0

b1=coefficients(summary(results))[2,1] # Extracting the value of b1


se.b1=coefficients(summary(results))[2,2] # The standard error of b1

# The 90% joint confidence intervals for beta_0 and beta_1:


B=qt(1-.10/(2*2),df=23) # In general, B=qt(1-.10/(2*k),df), k=no. of intervals
lower.b0=b0-B*se.b0;upper.b0=b0+B*se.b0
lower.b1=b1-B*se.b1;upper.b1=b1+B*se.b1

# Or you can make the Bonferroni adjustment here


confint(results,level=(1-.10/2)) # In general, confint(results,level=(1-alpha/k))

## Simultaneous Estimation of Mean Responses

new=data.frame(size=c(30,65,100))
ci=predict(results,new,interval="confidence",se.fit=T)


# Using Working-Hotelling Procedure to construct 90% joint confidence intervals for the mean response at the 3 x levels.
W2=2*qf(1-.10,2,23)

ci$fit[1]-sqrt(W2)*ci$se[1]; ci$fit[1]+sqrt(W2)*ci$se[1] # at size=30


ci$fit[2]-sqrt(W2)*ci$se[2]; ci$fit[2]+sqrt(W2)*ci$se[2] # at size=65
ci$fit[3]-sqrt(W2)*ci$se[3]; ci$fit[3]+sqrt(W2)*ci$se[3] # at size=100

# Using Bonferroni Procedure


B=qt(1-.10/(2*3),23)

ci$fit[1]-B*ci$se[1]; ci$fit[1]+B*ci$se[1] # at size=30


ci$fit[2]-B*ci$se[2]; ci$fit[2]+B*ci$se[2] # at size=65
ci$fit[3]-B*ci$se[3]; ci$fit[3]+B*ci$se[3] # at size=100

# Or you can make the Bonferroni adjustment here


predict(results,new,interval="confidence",level=(1-.10/3))

########################################################################
### (10/19/12) Simultaneous Inferences and Regression through Origin ###
########################################################################

## Simultaneous Estimation of Mean Responses

data=read.csv("data_Toluca.csv",header=T)
attach(data)

results=lm(hours~size)
MSE=anova(results)$Mean[2]

new=data.frame(size=c(80,100))
ci=predict(results,new,interval="prediction",se.fit=T)

se.fit.pred=sqrt(MSE*(1+(ci$se.fit^2/MSE)))

# Scheffe Procedure
S=sqrt(2*qf(1-.05,2,23)) # In general, use S=sqrt(k*qf(1-.05,k,n-2))
# If k=2:10, look at how S grows with k:
k=2:10; n=length(hours) # n=25 for the Toluca data
plot(k,sqrt(k*qf(1-.05,k,n-2)))
lwr.scheffe=ci$fit[,1]-S*se.fit.pred
upr.scheffe=ci$fit[,1]+S*se.fit.pred
cbind(ci$fit,lwr.scheffe,upr.scheffe)

# Bonferroni Procedure
ci.bon=predict(results,new,interval="prediction",level=(1-.05/2))

## Regression through Origin

data=read.csv("data_warehouse.csv",header=T) # Example in the book (page 162)


attach(data)

n=length(cost)
b1=sum(units*cost)/sum(units^2)
fitted=b1*units
residuals=cost-fitted
SSE=sum(residuals^2)
MSE=SSE/(n-1)
se.b1=sqrt(MSE/sum(units^2))

# The 95% C.I. for beta_1 is


lwr=b1-qt(.975,n-1)*se.b1
upr=b1+qt(.975,n-1)*se.b1


##################################
### (10/22/12) Matrix Approach ###
##################################

A=matrix(1:6,nrow=2)
B=matrix(c(-1,0,2,1,4,3),nrow=2)
A
B
A+B # Matrix addition = addition of corresponding entries
A-B
t(A) # transpose of A
D=t(A)%*%A # To do matrix multiplication, use %*%
E=A%*%t(A) # Note that E is not equal to D.
E.inv=solve(E) # Computing the inverse of a square matrix
I=E.inv%*%E # The product of E and its inverse is the identity matrix

data=read.csv("data_Toluca.csv",header=T)
attach(data)
x=size; y=hours
n=length(x)
t(y)%*%y # Sum of squares of yi's
X=matrix(c(rep(1,n),x),ncol=2) # The design matrix X
t(X)%*%X
t(X)%*%y
solve(t(X)%*%X)
B=solve(t(X)%*%X)%*%(t(X)%*%y) # Estimates of the regression coefficients
coefficients(lm(y~x)) # Note that you get the same results

fitted=X%*%B

######################################
### (10/24/12) Multiple Regression ###
######################################

data=read.csv("data_dwaine.csv",header=T)
attach(data)
x1=young
x2=income
y=sales
n=length(sales)
pairs(cbind(x1,x2,y)) # Creates a scatter plot matrix
pairs(data)
cor(data) # Creates a correlation matrix

X=matrix(c(rep(1,n),x1,x2),ncol=3) # The design matrix X


t(X)%*%X
B=solve(t(X)%*%X)%*%(t(X)%*%y) # Estimates of the regression coefficients

#Or we can use


results=lm(y~x1+x2)
summary(results)

# Diagnostics
par(mfrow=c(2,2))
plot(results$fitted,results$residuals)
plot(x1,results$residuals)
plot(x2,results$residuals)
plot(x1*x2,results$residuals) # Checking for interaction effects

plot(results$fitted,abs(results$residuals)) # Checking constant variance


qqnorm(results$residuals) # Checking normality
shapiro.test(results$residuals)

anova(results)

# F Test for Regression Relation
# Coefficient of multiple determination
# Adjusted coefficient of multiple determination
# Coefficient of multiple correlation
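
# A short sketch pulling these quantities out of the fit above (results=lm(y~x1+x2)):
sm=summary(results)
sm$fstatistic # F test for the regression relation (F statistic and its df)
sm$r.squared # coefficient of multiple determination, R^2
sm$adj.r.squared # adjusted coefficient of multiple determination
sqrt(sm$r.squared) # coefficient of multiple correlation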

#Prediction
new=data.frame(x1=65.4,x2=17.6)
predict(results,new)
predict(results,new,interval="confidence")
predict(results,new,interval="prediction")

new2=data.frame(x1=c(65.4,53.1),x2=c(17.6,17.7))
predict(results,new2,interval="prediction",level=(1-.10/2)) # Bonferroni Intervals

###########################################
### (10/29/12) More Multiple Regression ###
###########################################

body=read.csv("data_body_fat.csv",header=T)
attach(body)
x1=triceps
x2=thigh
x3=midarm
y=fat

results.x1x2x3=lm(y~x1+x2+x3)
results.x1x2=lm(y~x1+x2)
anova(results.x1x2)
results.x1=lm(y~x1)
anova(results.x1)
# ssr(x2|x1)=sse(x1)-sse(x1,x2)=ssr(x1,x2)-ssr(x1)
# f_obs=(ssr(x2|x1)/1)/(sse(x1,x2)/(n-3)) # Ho: beta_2=0

results.x1x2x3=lm(y~x1+x2+x3)
# ssr(x3|x1,x2)=sse(x1,x2)-sse(x1,x2,x3)=ssr(x1,x2,x3)-ssr(x1,x2)
# f_obs=(ssr(x3|x1,x2)/1)/(sse(x1,x2,x3)/(n-4)) # Ho: beta_3=0

# ssr(x2,x3|x1)=sse(x1)-sse(x1,x2,x3)=ssr(x1,x2,x3)-ssr(x1)
# f_obs=(ssr(x2,x3|x1)/2)/(sse(x1,x2,x3)/(n-4)) # Ho: beta_2=0 and beta_3=0
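
# A small numeric sketch of the last pair of formulas (Ho: beta_2=0 and beta_3=0),
# pulling the sums of squares out of the anova tables above:
sse.x1=anova(results.x1)$Sum[2] # SSE(x1)
sse.x1x2x3=anova(results.x1x2x3)$Sum[4] # SSE(x1,x2,x3)
ssr.x2x3.given.x1=sse.x1-sse.x1x2x3 # SSR(x2,x3|x1)
n=length(y)
f.obs=(ssr.x2x3.given.x1/2)/(sse.x1x2x3/(n-4))
f.obs; qf(.95,2,n-4) # reject Ho if f.obs exceeds the critical value (alpha=.05)
# This f.obs matches the F statistic reported by anova(red,full) below.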

#Or you can use these commands:


red=lm(y~x1)
full=lm(y~x1+x2+x3)
anova(red,full) # Partial F-Test for Ho: beta_2=0 and beta_3=0

######################################################
### (11/5/12) Coefficient of Partial Determination ###
######################################################

anova(results.x1x2)
anova(results.x1)

# R^2(2|1)=ssr(x2|x1)/sse(x1)=33.17/143.12=0.232
# Hence, the SSE(x1) is reduced by 23.2 percent.

anova(results.x1x2x3)
anova(results.x1x2)

# R^2(3|12)=ssr(x3|x1,x2)/sse(x1,x2)=11.55/109.95=0.105
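
# A small sketch reproducing these ratios directly from the anova tables (same fits as above):
sse.x1=anova(results.x1)$Sum[2]
sse.x1x2=anova(results.x1x2)$Sum[3]
sse.x1x2x3=anova(results.x1x2x3)$Sum[4]
(sse.x1-sse.x1x2)/sse.x1 # R^2(2|1), about 0.232
(sse.x1x2-sse.x1x2x3)/sse.x1x2 # R^2(3|12), about 0.105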

## Standardized Regression Model


# We use this method when det(X'X) is close to zero OR

# when explanatory variables differ substantially in order of magnitude

data=read.csv("data_dwaine.csv",header=T)
attach(data)
x1=young
x2=income
y=sales
n=length(sales)

x1.star=((x1-mean(x1))/sd(x1))/sqrt(n-1)
x2.star=((x2-mean(x2))/sd(x2))/sqrt(n-1)
y.star=((y-mean(y))/sd(y))/sqrt(n-1)

results.star=lm(y.star~0+x1.star+x2.star)

b1=(sd(y)/sd(x1))*0.7484
b2=(sd(y)/sd(x2))*0.2511
b0=mean(y)-b1*mean(x1)-b2*mean(x2)

#check
lm(y~x1+x2)

###################################
### (11/7/12) Multicollinearity ###
###################################

# Uncorrelated predictors
x1=c(4,4,4,4,6,6,6,6)
x2=c(2,2,3,3,2,2,3,3)
cor(x1,x2)
y=c(42,39,48,51,49,53,61,60)
anova(lm(y~x1))
anova(lm(y~x2))
anova(lm(y~x1+x2)) # Note that ssr(x2)=ssr(x2|x1)

# Perfectly correlated predictors


x3=c(2,8,6,10)
x4=c(6,9,8,10)
cor(x3,x4)
coefficients(lm(x4~x3))
y2=c(23,83,63,103)

# y2_hat1=-87+x3+18*x4
# y2_hat2=-7+9*x3+2*x4
# y2_hat3=-17+8*x3+4*x4 # Note that all 3 models give perfect fit.
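
# A quick check of the claim above: all three equations reproduce y2 exactly,
# because x4 is an exact linear function of x3 (x4=5+0.5*x3):
cbind(y2, -87+x3+18*x4, -7+9*x3+2*x4, -17+8*x3+4*x4)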

data=read.csv("data_body_fat.csv",header=T)
attach(data)
pairs(data)
cor(data)
summary(lm(midarm~triceps+thigh))
x1=triceps
x2=thigh
x3=midarm
y=fat

coefficients(lm(y~x1))
coefficients(lm(y~x2))
coefficients(lm(y~x1+x2)) # Note how the beta estimates change drastically as you
coefficients(lm(y~x1+x2+x3)) # include highly correlated predictors

# SSR(x1|x2) and R^2(1|2)


anova(lm(y~x2+x1)) # Note how small the marginal contribution of x1 is when x2 is already in the model.


# SSR(x1|x2) can sometimes be bigger than SSR(x1)


x1=c(5,10,5,10)
x2=c(25,30,5,10)
y=c(20,20,0,1)
temp=data.frame(y=y,x1=x1,x2=x2)
cor(temp)
anova(lm(y~x2))
anova(lm(y~x1+x2)) # x1 is a suppressor variable

# Predictions are still good even with multicollinearity of predictors


x1=triceps
x2=thigh
x3=midarm
y=fat
results=lm(y~x1)
predict(results,new=data.frame(x1=25),interval="confidence",se.fit=T)

results2=lm(y~x1+x2)
predict(results2,new=data.frame(x1=25,x2=50),interval="confidence",se.fit=T)

results3=lm(y~x1+x2+x3)
predict(results3,new=data.frame(x1=25,x2=50,x3=29),interval="confidence",se.fit=T)

########################################################
### (11/9/12) Regression with Qualitative Predictors ###
########################################################

data=read.csv("data_insurance.csv",header=T) # Example on page 316


attach(data)
type=factor(type)
results=lm(months~size+type)
confint(results)
anova(results)

plot(size,months,pch=as.numeric(type))
abline(33.874,-0.102,lwd=2) # For mutual firms
abline(33.874+8.055,-0.102,lwd=2,col="darkred") # For stock firms

# Interactions between quantitative and qualitative predictors


results2=lm(months~size+type+size*type)
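
# A hedged follow-up: the partial F test below compares the no-interaction and interaction
# fits and shows whether the two firm types need different slopes:
anova(results,results2)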

# Another example
data=read.csv("ex12-38.csv",header=T)
attach(data)

mc=SoilMerCon # Mercury concentration in the soil (x1)


cr=Crop # Type of crop (barley, corn, or wheat) (x2)
pc=PlantMerCon # Mercury concentration in the plant (y)

plot(mc,pc,pch=as.numeric(cr)) # Will create scatterplots for the 3 different crops


legend(1.5, 135, c("Barley","Corn","Wheat"), pch=1:3)

tapply(pc[mc==6],cr[mc==6],max) # To determine which values are above

mc.fac=factor(mc) # Declaring that mc is a qualitative variable


results=lm(pc~mc.fac)
summary(results)

# Another Example
yield=c(12.2,12.4,11.9,11.3,11.8,12.1,13.1,12.7,12.4,11.4,16.6,15.8,16.5,15.0,15.4,15.6,15.8,
15.8,16.0,15.8,9.5,9.5,9.6,8.8,9.5,9.8,9.1,10.3,9.5,8.5)
height=c(45,52,42,35,40,48,60,61,50,33,63,50,63,33,38,45,50,48,50,49,52,54,58,45,57,62,52,67,
55,40)

treatment=factor(c(rep("C",10),rep("s",10),rep("f",10)))

plot(height,yield,pch=as.numeric(treatment))

summary(lm(yield~treatment+height))

tapply(yield,treatment,mean)
tapply(height,treatment,mean)

########################################
### (11/12/12) Polynomial Regression ###
########################################

# Interactions between quantitative and qualitative predictors


results2=lm(months~size+type+size*type)

# polynomial regression
data2=read.csv("data_power_cells.csv",header=T)
attach(data2)
cor(charge,charge^2) # Note how correlated they are.
cor(temperature,temperature^2)

y=cycles
x1=(charge-mean(charge))/.4 # Transforming will reduce the correlation
x2=(temperature-mean(temperature))/.4
x1.2=x1^2 # Check cor(x1,x1.2)
x2.2=x2^2 # Check cor(x2,x2.2)
x1x2=x1*x2
summary(lm(y~x1+x1.2+x2+x2.2+x1x2)) # Note that the interaction effect is not significant.
summary(lm(y~x1+x1.2+x2+x2.2)) # Note that x2^2 is not significant.
summary(lm(y~x1+x1.2+x2)) # Note that x1^2 is not significant.
summary(lm(y~x1+x2)) # Note how much R^2 changed as you remove variables from the model.

##################################
### (11/14/12) Model Selection ###
##################################

data=read.csv("data_surgical_unit.csv",header=T)
attach(data)

y=survival
x1=clotting
x2=prognostic
x3=enzyme
x4=liver
cor(cbind(y,x1,x2,x3,x4))
pairs(cbind(y,x1,x2,x3,x4)) # Note that some relationships look more exponential
pairs(cbind(log(y),x1,x2,x3,x4)) # Note that the relationships are more linear

temp=lm(y~x1+x2+x3+x4)
plot(temp$fit,temp$res) # Note that the residual plot exhibits nonconstancy of variance.
log.y=log(y)
temp=lm(log.y~x1+x2+x3+x4)
plot(temp$fit,temp$res) # Note that the residual plot looks much better.

dataset=data.frame(log.y=log(y),x1=x1,x2=x2,x3=x3,x4=x4)

### Criteria for Model Selection


# 1. R^2 or SSE Criterion
# Consider the regression model log.y~x1+x2+x3
summary(lm(log.y~x1+x2+x3))$r.sq # R^2 = 0.757

anova(lm(log.y~x1+x2+x3))$Sum[4] # SSE = 3.109

# 2. R^2_adj or MSE Criterion


# (Choose model with the largest R^2_adj value or smallest MSE value)
# Consider the regression model log.y~x1+x2+x3
summary(lm(log.y~x1+x2+x3))$adj.r.sq # Adjusted R^2 = 0.743
anova(lm(log.y~x1+x2+x3))$Mean[4] # MSE = 0.0622

# 3. Cp (Mallows' Cp statistic) Criterion: Cp=SSE.p/MSE.full - (n-2p)


# (A good model should give a small Cp that is approximately equal to p, where p=no. of parameters.)
# Consider the regression model log.y~x1+x2+x3
SSE.p=anova(lm(log.y~x1+x2+x3))$Sum[4]
MSE.full=anova(lm(log.y~x1+x2+x3+x4))$Mean[5]
n=length(log.y); p=4 # p is the no. of parameters (B0, B1, B2, B3)
SSE.p/MSE.full - (n-2*p) # Cp = 3.388

#Or alternatively, you can use:


extractAIC(lm(log.y~x1+x2+x3),scale=MSE.full)[2]

# 4. AIC (Akaike's information criterion): AIC = n*ln(SSE.p/n)+2p


# (Choose model with the smallest AIC value)
# Consider the regression model log.y~x1+x2+x3
SSE.p=anova(lm(log.y~x1+x2+x3))$Sum[4]
n=length(log.y); p=4 # p is the no. of parameters (B0, B1, B2, B3)
n*log(SSE.p/n)+2*p # AIC = -146.161

#Or alternatively, you can use:


extractAIC(lm(log.y~x1+x2+x3))[2]

# 5. SBC (Schwarz's Bayesian criterion): SBC = n*ln(SSE.p/n)+ln(n)*p


# (Choose model with the smallest SBC value)
SSE.p=anova(lm(log.y~x1+x2+x3))$Sum[4]
n=length(log.y); p=4 # p is the no. of parameters (B0, B1, B2, B3)
n*log(SSE.p/n)+log(n)*p # SBC = -138.205

#Or alternatively, you can use:


extractAIC(lm(log.y~x1+x2+x3),k=log(n))[2]

# 6. PRESS criterion: PRESS.p = sum(y_i - y_i_hat(when ith observation was deleted))^2


# (Choose model with the smallest value)
# Consider the regression model log.y~x1+x2+x3
n=length(log.y)
e2=array(0,n)
for(i in 1:n){
w1=x1[-i];w2=x2[-i];w3=x3[-i]
temp=lm(log.y[-i]~w1+w2+w3)
new=data.frame(w1=x1[i],w2=x2[i],w3=x3[i])
pred=predict(temp,new)
e2[i]=(log.y[i]-pred)^2
}
PRESS.p=sum(e2) # PRESS.p = 3.914

#Or alternatively, you can use:


fit=lm(log.y~x1+x2+x3)
sum((fit$resid/(1-hatvalues(fit)))^2)

## You can get several of these criteria using the R package "leaps".
library(leaps)
log.y=log(y)
dataset=data.frame(log.y=log(y),x1=x1,x2=x2,x3=x3,x4=x4)
results=regsubsets(log.y~.,data=dataset,nbest=6,nvmax=4)
attributes(summary(results))
outmat=summary(results)$outmat
SSE=round(summary(results)$rss,3)

R2=round(summary(results)$rsq,3)
R2.adj=round(summary(results)$adjr2,3)
Cp=round(summary(results)$cp,3)
BIC=round(summary(results)$bic,3) # BIC is equivalent to SBC
data.frame(outmat,SSE,R2,R2.adj,Cp,BIC)

# Another example
library(faraway) # will load the 'faraway' package
library(help=faraway)
data(seatpos)
seatpos
help(seatpos)

reg=lm(hipcenter~.,data=seatpos)
summary(reg)

results2=regsubsets(hipcenter~.,data=seatpos,nbest=2)
summary(results2)

##############################################
### (11/16/12) Stepwise Regression Methods ###
##############################################

data=read.csv("data_surgical_unit.csv",header=T)
attach(data)

y=survival
x1=clotting
x2=prognostic
x3=enzyme
x4=liver
x5=age
x6=gender
x7=moderate
x8=heavy
log.y=log(y)
dataset=data.frame(log.y,x1,x2,x3,x4,x5,x6,x7,x8)

library(leaps)
results=regsubsets(log.y~.,data=dataset,nbest=4,nvmax=8)
attributes(summary(results))
outmat=summary(results)$outmat
SSE=round(summary(results)$rss,3)
R2=round(summary(results)$rsq,3)
R2.adj=round(summary(results)$adjr2,3)
Cp=round(summary(results)$cp,3)
BIC=round(summary(results)$bic,3) # BIC is equivalent to SBC
data.frame(outmat,SSE,R2,R2.adj,Cp,BIC)

# Using SBC or BIC, the best model includes x1,x2,x3, and x8.
# Using Cp, the best model includes x1,x2,x3,x5, and x8.
# Using R2.adj, the best model includes x1,x2,x3,x5,x6, and x8.

## Forward Selection ##
# Add one variable at a time.
# Insert the variable that yields the smallest (t-statistic) p-value.
# stage 1: x3 enters the model
coefficients(summary(lm(log.y~x1)))
pval.x1=coefficients(summary(lm(log.y~x1)))[2,4]
pval.x2=coefficients(summary(lm(log.y~x2)))[2,4]
pval.x3=coefficients(summary(lm(log.y~x3)))[2,4]
pval.x4=coefficients(summary(lm(log.y~x4)))[2,4]
pval.x5=coefficients(summary(lm(log.y~x5)))[2,4]
pval.x6=coefficients(summary(lm(log.y~x6)))[2,4]

pval.x7=coefficients(summary(lm(log.y~x7)))[2,4]
pval.x8=coefficients(summary(lm(log.y~x8)))[2,4]
pvals=c(pval.x1,pval.x2,pval.x3,pval.x4,pval.x5,pval.x6,pval.x7,pval.x8)
which(pvals==min(pvals)) # x3 has the smallest p-value (8.38e-08)

# stage 2: x2 joins the model with x3


pval.x3x1=coefficients(summary(lm(log.y~x3+x1)))[3,4]
pval.x3x2=coefficients(summary(lm(log.y~x3+x2)))[3,4]
pval.x3x4=coefficients(summary(lm(log.y~x3+x4)))[3,4]
pval.x3x5=coefficients(summary(lm(log.y~x3+x5)))[3,4]
pval.x3x6=coefficients(summary(lm(log.y~x3+x6)))[3,4]
pval.x3x7=coefficients(summary(lm(log.y~x3+x7)))[3,4]
pval.x3x8=coefficients(summary(lm(log.y~x3+x8)))[3,4]
pvalsx3=c(pval.x3x1,pval.x3x2,pval.x3x4,pval.x3x5,pval.x3x6,pval.x3x7,pval.x3x8)
which(pvalsx3==min(pvalsx3)) # x2 has the smallest p-value (2.23e-07)

# stage 3: x8 joins the model with x3 and x2


pval.x3x2x1=coefficients(summary(lm(log.y~x3+x2+x1)))[4,4]
pval.x3x2x4=coefficients(summary(lm(log.y~x3+x2+x4)))[4,4]
pval.x3x2x5=coefficients(summary(lm(log.y~x3+x2+x5)))[4,4]
pval.x3x2x6=coefficients(summary(lm(log.y~x3+x2+x6)))[4,4]
pval.x3x2x7=coefficients(summary(lm(log.y~x3+x2+x7)))[4,4]
pval.x3x2x8=coefficients(summary(lm(log.y~x3+x2+x8)))[4,4]
pvalsx3x2=c(pval.x3x2x1,pval.x3x2x4,pval.x3x2x5,pval.x3x2x6,pval.x3x2x7,pval.x3x2x8)
which(pvalsx3x2==min(pvalsx3x2)) # x8 has the smallest p-value (5.5e-06)

# stage 4: x1 joins the model with x3, x2, and x8


pval.x3x2x8x1=coefficients(summary(lm(log.y~x3+x2+x8+x1)))[5,4]
pval.x3x2x8x4=coefficients(summary(lm(log.y~x3+x2+x8+x4)))[5,4]
pval.x3x2x8x5=coefficients(summary(lm(log.y~x3+x2+x8+x5)))[5,4]
pval.x3x2x8x6=coefficients(summary(lm(log.y~x3+x2+x8+x6)))[5,4]
pval.x3x2x8x7=coefficients(summary(lm(log.y~x3+x2+x8+x7)))[5,4]
pvalsx3x2x8=c(pval.x3x2x8x1,pval.x3x2x8x4,pval.x3x2x8x5,pval.x3x2x8x6,pval.x3x2x8x7)
which(pvalsx3x2x8==min(pvalsx3x2x8)) # x1 has the smallest p-value (0.00033)

# stage 5: None of the remaining 4 variables can get into the model (all p-values > 0.05)
pval.x3x2x8x1x4=coefficients(summary(lm(log.y~x3+x2+x8+x1+x4)))[6,4]
pval.x3x2x8x1x5=coefficients(summary(lm(log.y~x3+x2+x8+x1+x5)))[6,4]
pval.x3x2x8x1x6=coefficients(summary(lm(log.y~x3+x2+x8+x1+x6)))[6,4]
pval.x3x2x8x1x7=coefficients(summary(lm(log.y~x3+x2+x8+x1+x7)))[6,4]
pvalsx3x2x8x1=c(pval.x3x2x8x1x4,pval.x3x2x8x1x5,pval.x3x2x8x1x6,pval.x3x2x8x1x7)
pvalsx3x2x8x1 # All p-values are > 0.05 --> stop

# Therefore, the best model obtained using the forward selection is


# log.y~x1+x2+x3+x8

# Or you can use:


reg.null=lm(log.y~1,data=dataset)
reg.full=lm(log.y~.,data=dataset)
step(reg.null,scope=list(lower=reg.null,upper=reg.full),direction="forward") # Criterion used is AIC, not t-statistic.

## Backward Selection ##
# Start with all the variables in the model, then remove one at a time.
coefficients(summary(lm(log.y~x1+x2+x3+x4+x5+x6+x7+x8))) # Remove x4.
coefficients(summary(lm(log.y~x1+x2+x3+x5+x6+x7+x8))) # Remove x7.
coefficients(summary(lm(log.y~x1+x2+x3+x5+x6+x8))) # Remove x5.
coefficients(summary(lm(log.y~x1+x2+x3+x6+x8))) # Remove x6.
coefficients(summary(lm(log.y~x1+x2+x3+x8))) # All p-values are now < alpha

# Therefore, the best model obtained using the backward selection is
# log.y~x1+x2+x3+x8

# Or you can use:


reg.full=lm(log.y~.,data=dataset)
step(reg.full,direction="backward") # Criterion used is AIC, not t-statistic.

## (Forward) Stepwise Selection ##


# This procedure is just like in the 'forward' selection. The only difference is,
# every time you include a new variable into the model, you need to evaluate whether
# any of the variables already in the model should now be removed (based on their t-statistic
# p-value). See page 365 of our textbook for more information.
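
# A minimal sketch of an automated stepwise search using step() with direction="both"
# (note: step() uses AIC as its criterion, not the t-statistic p-values described above):
reg.null=lm(log.y~1,data=dataset)
reg.full=lm(log.y~.,data=dataset)
step(reg.null,scope=list(lower=reg.null,upper=reg.full),direction="both")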

## Validation
temp=sample(1:54,20)
data.validation=data[temp,]
data.training=data[-temp,]

data.valid=read.csv("data_surgical_validation.csv",header=T)
attach(data.valid)
model.1=lm(log(survival)~clotting+prognostic+enzyme+heavy) # Based on BIC

MSPR=sum(model.1$residuals^2)/length(survival) # MSPR = 0.07026

coefficients(summary(lm(log.y~x1+x2+x3+x8)))
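
# A hedged sketch of MSPR computed the usual way: fit the candidate model on the training
# half of the split created above, then predict the hold-out cases:
fit.train=lm(log(survival)~clotting+prognostic+enzyme+heavy,data=data.training)
pred.valid=predict(fit.train,newdata=data.validation)
mean((log(data.validation$survival)-pred.valid)^2)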

###############################################
### (11/19/12) Model Building - Diagnostics ###
###############################################

x1=c(14,19,12,11)
x2=c(25,32,22,15)
y=c(301,327,246,187)

X=cbind(array(1,4),x1,x2)
H=X%*%solve(t(X)%*%X)%*%t(X) # Computing the hat matrix
y.hat=H%*%y # Will give the predicted values (see table on page 393)
e=(diag(4)-H)%*%y # Will give the residuals
h=diag(H) # Extracting the main diagonal from the hat matrix
n=length(x1)
MSE=sum(e^2)/(n-3)
s2.e=MSE*(diag(4)-H) # Will give the variance and co-variance matrix
s.e=sqrt(diag(s2.e))
r.stud=e/s.e # Will give the studentized residuals

d=e/(1-h) # Will give the deleted residuals

# Or you can use these built-in R functions:


results=lm(y~x1+x2)
hatvalues(results) # Will give the main diagonal of the H matrix.
rstandard(results) # Will give the studentized residuals.
rstudent(results) # Will give the studentized deleted residuals.

# body fat example


data=read.csv("data_body_fat.csv",header=T)
attach(data)
results.fat=lm(fat~triceps+thigh,data=data)
e=round(results.fat$residuals,3)
sse=sum(e^2)
n=length(e)
h=round(hatvalues(results.fat),3)

t=round(e*sqrt((n-3-1)/(sse*(1-h)-e^2)),3)
data.frame(residuals=e,h=h,t=t) # Table 10.3 on page 397

t.crit=qt(1-.10/(2*n),df=n-3-1)
which(abs(t)>t.crit)

# Leverage
which(h>2*mean(h)) # Note that sum(h)=p ==> mean(h)=p/n

# Deletion statistics
dff=dffits(results.fat) # Compare abs(dffits) with 1 for small to medium size data and 2*sqrt(p/n) for large data sets.
dfb=dfbetas(results.fat) # Compare abs(dfbetas) with 1 (if small or medium data) or 2/sqrt(n) (for large data sets)
cooks=cooks.distance(results.fat) # Compare with qf(.10,p,n-p) or qf(.20,p,n-p)
round(data.frame(dffits=dff,cooks=cooks,dfbetas=dfb),3)

summary(influence.measures(results.fat)) # This will give a summary of influential cases.

# Variance Inflation Factor (If VIF > 10, there is a strong collinearity)
library(car) # Install and load R package 'car'

results3=lm(fat~.,data=data)
vif(results3) # Note how large the VIFs are, indicating strong collinearity
summary(lm(midarm~triceps+thigh))$r.sq # Note how big R^2 is, indicating the strong linear correlation
# between midarm and the other 2 predictors.

data2=read.csv("data_surgical_unit.csv",header=T)
attach(data2)

y=survival
x1=clotting
x2=prognostic
x3=enzyme
x4=liver
x5=age
x6=gender
x7=moderate
x8=heavy
log.y=log(y)

results2=lm(log.y~x1+x2+x3+x8)
vif(results2) # All VIFs are small. Therefore, we don't have a collinearity problem.

#####################################################
### (11/26/12) Model Building - Remedial Measures ###
#####################################################

# Using Robust Regression to dampen the effect of influential cases

# 1. Least Absolute Residuals (LAR) or Least Absolute Deviations (LAD) Regression


# min(sum(abs(residuals)))

data=read.csv("data_body_fat.csv",header=T)
attach(data)
results.fat=lm(fat~triceps+thigh,data=data)

library(quantreg)
rq(fat~triceps+thigh,data=data)

# 2. Least Median of Squares (LMS) Regression
# min(median(residuals^2))
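
# A hedged sketch using lqs() from the MASS package with method="lms", which minimizes the
# median squared residual (one possible implementation of LMS regression):
library(MASS)
lqs(fat~triceps+thigh,data=data,method="lms")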

# Nonparametric Regression: Lowess (locally weighted regression scatter plot smoothing) Method

# 1. The linear regression is weighted to give cases further from the middle X level
# in each neighborhood smaller weights
# 2. To make the procedure robust to outlying observations, the linear regression fitting
# is repeated, with the weights revised so that cases that had large residuals in the
# fitting receive smaller weights in the second fitting
# 3. To improve the robustness of the procedure further, step 2 is repeated one or more
# times by revising the weights according to the size of the residuals in the latest
# fitting

lowess(triceps,fat)

plot(lowess(triceps,fat),type='l')
points(triceps,fat)

par(mfrow=c(2,2))
plot(lm(fat~triceps))

data2=read.csv("data_surgical_unit.csv",header=T)
attach(data2)
results=lm(survival~clotting)

par(mfrow=c(2,2))
plot(results)

########################################
### (11/28/12) Nonlinear Regression ###
########################################

# 1. Exponential Regression Models

curve(5*exp(x/2),0,10)
curve(100-50*exp(-2*x),0,3)

x=sample(1:10,100,replace=T)
y=5*exp(x/2)+rnorm(100,mean=0,sd=20)
plot(x,y)
nls(y~b*exp(a*x),start=list(a=1,b=3))

curve(4.8003*exp(x*0.5049),0,10,add=T,col="darkred",lwd=2)

data=read.csv("data_injured_patients.csv",header=T)
attach(data)
Y=index; X=days
results=nls(Y~b*exp(a*X),start=list(a=0,b=50))

plot(X,Y,xlab="Days Hospitalized",ylab="Prognostic Index")


curve(58.60653*exp(-0.03959*x),0,70,add=T,col="darkred",lwd=2)

Y.fitted=58.60653*exp(-0.03959*X)
residuals=Y-Y.fitted
plot(Y.fitted,residuals,main="Residual Plot")

# Or you can also use the 'glm' function


log.lin <- glm(Y~X, family=quasi(link="log", variance="constant"))


# 2. Logistic Regression Models

curve(10/(1+20*exp(-2*x)),0,4,lwd=2)

x=sample(0:5,100,replace=T)
y=10/(1+20*exp(-2*x))+rnorm(100,mean=0,sd=.5)
plot(x,y)

nls(y~c/(1+b*exp(a*x)),start=list(a=-2,b=20,c=10))
curve(10.131/(1+16.623*exp(-1.855*x)),0,5,col="darkred",lwd=2,add=T)

# (Homework) 13.10: Enzyme kinetics

data.kinetics=read.csv("data_kinetics.csv",header=T)
attach(data.kinetics)
y=velocity; x=concentration
y.p=1/y
x.p=1/x
results=lm(y.p~x.p) # b0=0.03376 and b1=0.45401

nls(y~a*x/(b+x),start=list(a=1/0.03376,b=0.454/0.03376))
plot(concentration,velocity)
curve(28.14*x/(12.57+x),0,40,add=T,lwd=2)

y.fitted=28.14*x/(12.57+x)
residuals=y-y.fitted
plot(y.fitted,residuals)

############################################
### (12/3/12) Generalized Linear Models ###
############################################

# 1. Probit Mean Response Function

curve(pnorm(0+1*x),-3,3,lwd=2)
curve(pnorm(0+2*x),-3,3,lwd=2,col="red",add=T)

curve(dnorm(x,0,1),-3,3)
curve(exp(x)/(1+exp(x))^2,-3,3,add=T,col="red")
curve(dnorm(x,0,sd=pi/sqrt(3)),-3,3,add=T,col="blue")

data=read.csv("data_task.csv",header=T)
attach(data)
x=experience
y=success

results <- glm(success ~ experience, family=binomial(link="probit"))


results <- glm(success ~ experience, family=binomial(link="logit"))

predicted=exp(-3.0597+0.1615*x)/(1+exp(-3.0597+0.1615*x))

prediction=as.numeric(results$fitted>.7)
error=success-prediction
data.frame(success,predicted=results$fitted,prediction,error)

percent.error=sum(abs(error))/length(error)
percent.error

# Multiple logistic regression

dystrophy=read.csv("data_dystrophy.csv",header=T)

attach(dystrophy)

dystrophy[!complete.cases(dystrophy),] # lists rows with missing values


dys.naomit=na.omit(dystrophy) # creates new data set without missing data
temp=which(is.na(PK))

results <- glm(carrier ~ AGE + M + CK + H + PK + LD, data = dystrophy, family=binomial(link="logit"))
results <- glm(carrier ~ AGE + M + CK + H + PK + LD, data = dys.naomit, family=binomial(link="logit"))

