c:ms:regression_lecture_note_for_r
This is an old revision of the document!
################
# Draw n pseudo-random normal values, then recenter/rescale them so the
# SAMPLE mean and SAMPLE sd are exactly `mean` and `sd` (scale() centers
# to mean 0, sd 1). Note: returns an n x 1 matrix, because scale() does.
rnorm2 <- function(n, mean, sd) {
  z <- scale(rnorm(n))  # standardized draws: mean exactly 0, sd exactly 1
  mean + sd * z
}
# Simulate n students: x = shoe size (mm), y = height (cm), with a known
# linear relationship plus noise. Seed fixed so the transcript below is
# reproducible; the rnorm2() call order must not change.
n <- 400
set.seed(101)
x <- rnorm2(n, 265, 15)        # shoe size: mean 265, sd 15
rand.00 <- rnorm2(n, 0, 12)    # noise actually added to y (sd 12)
rand.01 <- rnorm2(n, 0, 240)   # much wider noise, for visual comparison only
plot(rand.01)
points(rand.00, col="red")     # red points: the small-sd noise, for contrast
# true slope chosen so that a 265 mm shoe size maps to a 170 cm height
b <- 170 / 265
b
y <- b * x + rand.00           # y = true line + noise (no intercept)
df <- data.frame(x, y)
head(df)
# plot method 0
# quick default scatterplot of y against x
plot(x,y)
# method 1
# same scatterplot with labels, then two overlaid reference lines
plot(x, y, pch = 1, cex = 1, col = "black", main = "HEIGHT against SHOE SIZE", xlab = "SHOE SIZE", ylab = "HEIGHT (cm)")
# red: the "no-predictor" model, a horizontal line at the mean of y
abline(h = mean(y), lwd=1.5, col="red")
# blue: the fitted least-squares regression line
abline(lm(y ~ x), lwd=1.5, col="blue")
# method 2
# fit the simple regression y ~ x and inspect the fitted object;
# lm.y.x is reused throughout the rest of the script
lm.y.x <- lm(y ~ x, data = df)
summary(lm.y.x)
# str(lm.y.x)
# coefficients: element 1 is the intercept, element 2 the slope on x
lm.y.x$coefficients
intercept <- lm.y.x$coefficients[1]
slope <- lm.y.x$coefficients[2]
intercept
slope
# equivalent ggplot2 version of the plot above (left disabled on purpose)
# library(ggplot2)
# ggplot(data=df, aes(x,y)) +
# geom_point(color="blue", size=1.5, pch=1.5) +
# geom_hline(aes(yintercept=mean(y))) +
# geom_abline(intercept=intercept, slope=slope)
#
#####
# Build variance, covariance, and correlation "by hand" and check each
# against the corresponding built-in function.
# SS(y): total sum of squared deviations of y from its mean
ss.y <- sum((y-mean(y))^2)
ss.y
# degrees of freedom for y (sample variance divisor)
df.y <- length(y)-1
df.y
# SS/df should equal the sample variance of y
ss.y/df.y
var(y)
m.x <- mean(x)
m.y <- mean(y)
# SP(x,y): sum of cross-products of deviations
sp <- sum((x-m.x)*(y-m.y))
df.tot <- n-1
# SP/(n-1) should equal the sample covariance of x and y
sp/df.tot
cov.xy <- cov(x,y)
sd.x <- sd(x)
sd.y <- sd(y)
# Pearson correlation = covariance standardized by both sds;
# should match cor(x,y) and the estimate reported by cor.test
r.xy <- cov.xy/(sd.x*sd.y)
r.xy
cor(x,y)
cor.test(x,y)
# Least-squares slope by hand: b = cov(x,y) / var(x)
b <- cov(x,y) / var(x)
# note that
# (sp(x,y)/(n-1)) / (ss(x)/(n-1))
# = sp(x,y) / ss(x)
# and since the fitted line passes through the means: m.y = a + b * m.x,
# so the intercept is
a <- m.y - (b * m.x)
b
a
# b and a should reproduce the lm() coefficients below
summary(lm.y.x)
lm.y.x$coefficients
# str(lm.y.x)
# Decompose the total variation of y into regression + residual parts
# and rebuild the ANOVA table by hand; anova(lm.y.x) checks the result.
y.pred <- lm.y.x$fitted.values
# no-op self-assignment; kept only to list the three pieces side by side
y <- y
m.y
# residual: what the line fails to explain; regression: what it explains
res <- y - y.pred
reg <- y.pred - m.y
ss.y
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.y
ss.res
ss.reg
# the decomposition identity: SS.res + SS.reg should equal SS.y (total)
ss.res + ss.reg
# R-squared = proportion of total SS explained by the regression
r.square <- ss.reg / ss.y
r.square
df.tot <- df.y
# df for regression = (number of parameters) - 1 = 2 - 1
df.reg <- 2 - 1
df.res <- df.tot - df.reg
df.reg
df.res
df.tot
ss.tot <- ss.y
ss.reg
ss.res
ss.tot
# mean squares = SS / df
ms.reg <- ss.reg / df.reg
ms.res <- ss.res / df.res
ms.reg
ms.res
# F statistic and its right-tail p-value under F(df.reg, df.res)
f.cal <- ms.reg/ms.res
f.cal
p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
p.val
# built-in ANOVA table; should match the hand-computed values above
anova(lm.y.x)
########################################
# also make sure that you understand
########################################
res <- lm.y.x$residuals
reg <- lm.y.x$fitted.values-m.y
# This is essentially a perfect linear
# relationship between the regression
# line and x data (fitted values are an exact linear function of x,
# so R-squared here should be 1 up to floating-point error)
lm.reg.x <- lm(reg~x, data = df)
summary(lm.reg.x)
# The relationship between residuals and
# x should be zero (least-squares residuals are orthogonal to x,
# so the slope here should be 0 up to floating-point error)
lm.res.x <- lm(res~x, data = df)
summary(lm.res.x)
summary(lm.y.x)
> rnorm2 <- function(n,mean,sd) { mean+sd*scale(rnorm(n)) }
> n <- 400
>
> set.seed(101)
> x <- rnorm2(n, 265, 15)
> rand.00 <- rnorm2(n, 0, 12)
>
> rand.01 <- rnorm2(n, 0, 240)
> plot(rand.01)
> points(rand.00, col="red")
>
> b = 170 / 265
> b
[1] 0.6415094
> y <- b * x + rand.00
>
> df <- data.frame(x, y)
> head(df)
x y
1 260.3289 164.1701
2 273.9897 149.7065
3 254.9034 140.6106
4 268.7321 179.5289
5 270.2313 176.5863
6 283.6541 172.5179
>
> # plot method 0
> plot(x,y)
>
> # method 1
> plot(x, y, pch = 1, cex = 1, col = "black", main = "HEIGHT against SHOE SIZE", xlab = "SHOE SIZE", ylab = "HEIGHT (cm)")
> abline(h = mean(y), lwd=1.5, col="red")
> abline(lm(y ~ x), lwd=1.5, col="blue")
>
> # method 2
> lm.y.x <- lm(y ~ x, data = df)
> summary(lm.y.x)
Call:
lm(formula = y ~ x, data = df)
Residuals:
Min 1Q Median 3Q Max
-39.653 -8.878 -0.226 7.322 30.630
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.8022 10.6435 0.075 0.94
x 0.6385 0.0401 15.922 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared: 0.3891, Adjusted R-squared: 0.3876
F-statistic: 253.5 on 1 and 398 DF, p-value: < 2.2e-16
> # str(lm.y.x)
> lm.y.x$coefficients
(Intercept) x
0.8021833 0.6384823
> intercept <- lm.y.x$coefficients[1]
> slope <- lm.y.x$coefficients[2]
> intercept
(Intercept)
0.8021833
> slope
x
0.6384823
>
> # library(ggplot2)
> # ggplot(data=df, aes(x,y)) +
> # geom_point(color="blue", size=1.5, pch=1.5) +
> # geom_hline(aes(yintercept=mean(y))) +
> # geom_abline(intercept=intercept, slope=slope)
> #
> #####
> ss.y <- sum((y-mean(y))^2)
> ss.y
[1] 94052.83
> df.y <- length(y)-1
> df.y
[1] 399
> ss.y/df.y
[1] 235.7214
> var(y)
[,1]
[1,] 235.7214
>
> m.x <- mean(x)
> m.y <- mean(y)
> sp <- sum((x-m.x)*(y-m.y))
> df.tot <- n-1
> sp/df.tot
[1] 143.6585
> cov.xy <- cov(x,y)
>
> sd.x <- sd(x)
> sd.y <- sd(y)
>
> r.xy <- cov.xy/(sd.x*sd.y)
> r.xy
[,1]
[1,] 0.6237932
> cor(x,y)
[,1]
[1,] 0.6237932
> cor.test(x,y)
Pearson's product-moment correlation
data: x and y
t = 15.922, df = 398, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5599929 0.6802388
sample estimates:
cor
0.6237932
>
> b <- cov(x,y) / var(x)
> # note that
> # (sp(x,y)/n-1) / (ss(x)/n-1)
> # = sp(x,y) / ss(x)
> # m.y = a + b * mean.x
> a <- m.y - (b * m.x)
> b
[,1]
[1,] 0.6384823
> a
[,1]
[1,] 0.8021833
>
>
> summary(lm.y.x)
Call:
lm(formula = y ~ x, data = df)
Residuals:
Min 1Q Median 3Q Max
-39.653 -8.878 -0.226 7.322 30.630
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.8022 10.6435 0.075 0.94
x 0.6385 0.0401 15.922 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared: 0.3891, Adjusted R-squared: 0.3876
F-statistic: 253.5 on 1 and 398 DF, p-value: < 2.2e-16
> lm.y.x$coefficients
(Intercept) x
0.8021833 0.6384823
>
> # str(lm.y.x)
>
> y.pred <- lm.y.x$fitted.values
> y <- y
> m.y
[1] 170
>
> res <- y - y.pred
> reg <- y.pred - m.y
> ss.y
[1] 94052.83
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> ss.y
[1] 94052.83
> ss.res
[1] 57455.18
> ss.reg
[1] 36597.65
> ss.res + ss.reg
[1] 94052.83
>
> r.square <- ss.reg / ss.y
> r.square
[1] 0.389118
>
>
> df.tot <- df.y
> df.reg <- 2 - 1
> df.res <- df.tot - df.reg
>
> df.reg
[1] 1
> df.res
[1] 398
> df.tot
[1] 399
>
> ss.tot <- ss.y
> ss.reg
[1] 36597.65
> ss.res
[1] 57455.18
> ss.tot
[1] 94052.83
>
> ms.reg <- ss.reg / df.reg
> ms.res <- ss.res / df.res
> ms.reg
[1] 36597.65
> ms.res
[1] 144.3597
>
> f.cal <- ms.reg/ms.res
> f.cal
[1] 253.517
>
> p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
> p.val
[1] 1.623694e-44
> anova(lm.y.x)
Analysis of Variance Table
Response: y
Df Sum Sq Mean Sq F value Pr(>F)
x 1 36598 36598 253.52 < 2.2e-16 ***
Residuals 398 57455 144
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> ########################################
> # also make it sure that you understand
> ########################################
>
> res <- lm.y.x$residuals
> reg <- lm.y.x$fitted.values-m.y
>
> # This is essentially a perfect linear
> # relationship between the regression
> # line and x data
> lm.reg.x <- lm(reg~x, data = df)
> summary(lm.reg.x)
Call:
lm(formula = reg ~ x, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.828e-14 -7.900e-15 -5.500e-16 6.410e-15 3.979e-13
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.692e+02 1.937e-14 -8.733e+15 <2e-16 ***
x 6.385e-01 7.299e-17 8.747e+15 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.187e-14 on 398 degrees of freedom
Multiple R-squared: 1, Adjusted R-squared: 1
F-statistic: 7.651e+31 on 1 and 398 DF, p-value: < 2.2e-16
>
> # The relationship between residuals and
> # x should be zero
> lm.res.x <- lm(res~x, data = df)
> summary(lm.res.x)
Call:
lm(formula = res ~ x, data = df)
Residuals:
Min 1Q Median 3Q Max
-39.653 -8.878 -0.226 7.322 30.630
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.586e-15 1.064e+01 0 1
x 1.186e-17 4.010e-02 0 1
Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared: 6.744e-33, Adjusted R-squared: -0.002513
F-statistic: 2.684e-30 on 1 and 398 DF, p-value: 1
> summary(lm.y.x)
Call:
lm(formula = y ~ x, data = df)
Residuals:
Min 1Q Median 3Q Max
-39.653 -8.878 -0.226 7.322 30.630
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.8022 10.6435 0.075 0.94
x 0.6385 0.0401 15.922 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared: 0.3891, Adjusted R-squared: 0.3876
F-statistic: 253.5 on 1 and 398 DF, p-value: < 2.2e-16
>
>
>
c/ms/regression_lecture_note_for_r.1716935598.txt.gz · Last modified: by hkimscil
