User Tools

Site Tools


c:ms:regression_lecture_note_for_r

This is an old revision of the document!


################
# rnorm2: draw n normal deviates, then rescale them so the SAMPLE mean
# and sd are exactly `mean` and `sd` (scale() centers to mean 0, sd 1).
# Note scale() returns an n x 1 matrix, so the result is a matrix too.
rnorm2 <- function(n, mean, sd) { mean + sd * scale(rnorm(n)) }
n <- 400

set.seed(101)                  # reproducible draws
x <- rnorm2(n, 265, 15)        # predictor: shoe size, mean 265, sd 15
rand.00 <- rnorm2(n, 0, 12)    # noise term actually added to y

rand.01 <- rnorm2(n, 0, 240)   # much wider noise, for visual comparison only
plot(rand.01)
points(rand.00, col = "red")

# use <- for assignment (not =); b is the true slope used to generate y
b <- 170 / 265
b
y <- b * x + rand.00           # response: height (cm) = b * shoe size + noise

df <- data.frame(x, y)
head(df)

# plot method 0: quick scatterplot of the raw data
plot(x, y)

# method 1: annotated scatterplot with two reference lines
plot(x, y, pch = 1, cex = 1, col = "black",
     main = "HEIGHT against SHOE SIZE",
     xlab = "SHOE SIZE", ylab = "HEIGHT (cm)")
abline(h = mean(y), lwd = 1.5, col = "red")   # horizontal line at mean(y)
abline(lm(y ~ x), lwd = 1.5, col = "blue")    # fitted regression line

# method 2: fit the model once, keep the object, and inspect it
lm.y.x <- lm(y ~ x, data = df)
summary(lm.y.x)
# str(lm.y.x)
lm.y.x$coefficients
# prefer the coef() extractor over indexing $coefficients directly
intercept <- coef(lm.y.x)[1]
slope <- coef(lm.y.x)[2]
intercept
slope

# Optional ggplot2 version of the same figure:
# library(ggplot2)
# ggplot(data=df, aes(x,y)) +
#  geom_point(color="blue", size=1.5, pch=1.5) +
#  geom_hline(aes(yintercept=mean(y))) +
#  geom_abline(intercept=intercept, slope=slope)
#
##### Variance of y computed by hand, checked against var()
dev.y <- y - mean(y)     # deviations of y from its mean
ss.y <- sum(dev.y^2)     # sum of squared deviations (SS)
ss.y
df.y <- length(y) - 1    # degrees of freedom
df.y
ss.y / df.y              # hand-computed variance: SS / df
var(y)                   # should agree with the line above

# Covariance of x and y by hand: sum of cross-products / (n - 1)
m.x <- mean(x)
m.y <- mean(y)
sp <- sum((x-m.x)*(y-m.y))   # sum of cross-products (SP)
df.tot <- n-1
sp/df.tot                    # hand-computed covariance
cov.xy <- cov(x,y)           # should agree with the line above

sd.x <- sd(x)
sd.y <- sd(y)

# Pearson correlation: covariance standardized by the two sds
r.xy <- cov.xy/(sd.x*sd.y)
r.xy
cor(x,y)                     # should agree
cor.test(x,y)                # adds a t test and confidence interval

# Regression slope: b = cov(x, y) / var(x)
b <- cov(x,y) / var(x)
# note that 
# (sp(x,y)/(n-1)) / (ss(x)/(n-1))
# = sp(x,y) / ss(x)
# i.e. the (n-1) terms cancel. The fitted line passes through the
# point of means, so the intercept follows from m.y = a + b * m.x
a <- m.y - (b * m.x)
b
a


summary(lm.y.x)
lm.y.x$coefficients

# str(lm.y.x)

y.pred <- lm.y.x$fitted.values
y <- y
m.y

res <- y - y.pred
reg <- y.pred - m.y
ss.y 
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.y
ss.res
ss.reg
ss.res + ss.reg

r.square <- ss.reg / ss.y 
r.square 


df.tot <- df.y
df.reg <- 2 - 1
df.res <- df.tot - df.reg

df.reg
df.res
df.tot

ss.tot <- ss.y
ss.reg
ss.res
ss.tot

ms.reg <- ss.reg / df.reg
ms.res <- ss.res / df.res
ms.reg
ms.res

f.cal <- ms.reg/ms.res
f.cal

p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
p.val
anova(lm.y.x)

########################################
# Sanity checks on the decomposition
########################################

res <- residuals(lm.y.x)       # extractor form of lm.y.x$residuals
reg <- fitted(lm.y.x) - m.y    # extractor form of lm.y.x$fitted.values - m.y

# Regressing the explained part on x gives an (essentially)
# perfect linear fit: reg is an exact linear function of x.
lm.reg.x <- lm(reg~x, data = df)
summary(lm.reg.x)

# Regressing the residuals on x shows no relationship:
# slope and intercept are numerically zero.
lm.res.x <- lm(res~x, data = df)
summary(lm.res.x)
summary(lm.y.x)
> rnorm2 <- function(n,mean,sd) { mean+sd*scale(rnorm(n)) } 
> n <- 400
> 
> set.seed(101)
> x <- rnorm2(n, 265, 15) 
> rand.00 <- rnorm2(n, 0, 12)
> 
> rand.01 <- rnorm2(n, 0, 240)
> plot(rand.01)
> points(rand.00, col="red")
> 
> b = 170 / 265
> b
[1] 0.6415094
> y <- b * x + rand.00
> 
> df <- data.frame(x, y)
> head(df)
         x        y
1 260.3289 164.1701
2 273.9897 149.7065
3 254.9034 140.6106
4 268.7321 179.5289
5 270.2313 176.5863
6 283.6541 172.5179
> 
> # plot method 0
> plot(x,y)
> 
> # method 1
> plot(x, y, pch = 1, cex = 1, col = "black", main = "HEIGHT against SHOE SIZE", xlab = "SHOE SIZE", ylab = "HEIGHT (cm)")
> abline(h = mean(y), lwd=1.5, col="red")
> abline(lm(y ~ x), lwd=1.5, col="blue")
> 
> # method 2
> lm.y.x <- lm(y ~ x, data = df)
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-39.653  -8.878  -0.226   7.322  30.630 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.8022    10.6435   0.075     0.94    
x             0.6385     0.0401  15.922   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared:  0.3891,	Adjusted R-squared:  0.3876 
F-statistic: 253.5 on 1 and 398 DF,  p-value: < 2.2e-16

> # str(lm.y.x)
> lm.y.x$coefficients
(Intercept)           x 
  0.8021833   0.6384823 
> intercept <- lm.y.x$coefficients[1]
> slope <- lm.y.x$coefficients[2]
> intercept
(Intercept) 
  0.8021833 
> slope
        x 
0.6384823 
> 
> # library(ggplot2)
> # ggplot(data=df, aes(x,y)) +
> #  geom_point(color="blue", size=1.5, pch=1.5) +
> #  geom_hline(aes(yintercept=mean(y))) +
> #  geom_abline(intercept=intercept, slope=slope)
> #
> #####
> ss.y <- sum((y-mean(y))^2)
> ss.y
[1] 94052.83
> df.y <- length(y)-1
> df.y
[1] 399
> ss.y/df.y
[1] 235.7214
> var(y)
         [,1]
[1,] 235.7214
> 
> m.x <- mean(x)
> m.y <- mean(y)
> sp <- sum((x-m.x)*(y-m.y))
> df.tot <- n-1
> sp/df.tot
[1] 143.6585
> cov.xy <- cov(x,y)
> 
> sd.x <- sd(x)
> sd.y <- sd(y)
> 
> r.xy <- cov.xy/(sd.x*sd.y)
> r.xy
          [,1]
[1,] 0.6237932
> cor(x,y)
          [,1]
[1,] 0.6237932
> cor.test(x,y)

	Pearson's product-moment correlation

data:  x and y
t = 15.922, df = 398, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5599929 0.6802388
sample estimates:
      cor 
0.6237932 

> 
> b <- cov(x,y) / var(x)
> # note that 
> # (sp(x,y)/n-1) / (ss(x)/n-1)
> # = sp(x,y) / ss(x)
> # m.y = a + b * mean.x
> a <- m.y - (b * m.x)
> b
          [,1]
[1,] 0.6384823
> a
          [,1]
[1,] 0.8021833
> 
> 
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-39.653  -8.878  -0.226   7.322  30.630 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.8022    10.6435   0.075     0.94    
x             0.6385     0.0401  15.922   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared:  0.3891,	Adjusted R-squared:  0.3876 
F-statistic: 253.5 on 1 and 398 DF,  p-value: < 2.2e-16

> lm.y.x$coefficients
(Intercept)           x 
  0.8021833   0.6384823 
> 
> # str(lm.y.x)
> 
> y.pred <- lm.y.x$fitted.values
> y <- y
> m.y
[1] 170
> 
> res <- y - y.pred
> reg <- y.pred - m.y
> ss.y 
[1] 94052.83
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> ss.y
[1] 94052.83
> ss.res
[1] 57455.18
> ss.reg
[1] 36597.65
> ss.res + ss.reg
[1] 94052.83
> 
> r.square <- ss.reg / ss.y 
> r.square 
[1] 0.389118
> 
> 
> df.tot <- df.y
> df.reg <- 2 - 1
> df.res <- df.tot - df.reg
> 
> df.reg
[1] 1
> df.res
[1] 398
> df.tot
[1] 399
> 
> ss.tot <- ss.y
> ss.reg
[1] 36597.65
> ss.res
[1] 57455.18
> ss.tot
[1] 94052.83
> 
> ms.reg <- ss.reg / df.reg
> ms.res <- ss.res / df.res
> ms.reg
[1] 36597.65
> ms.res
[1] 144.3597
> 
> f.cal <- ms.reg/ms.res
> f.cal
[1] 253.517
> 
> p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
> p.val
[1] 1.623694e-44
> anova(lm.y.x)
Analysis of Variance Table

Response: y
           Df Sum Sq Mean Sq F value    Pr(>F)    
x           1  36598   36598  253.52 < 2.2e-16 ***
Residuals 398  57455     144                      
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 
> ########################################
> # also make it sure that you understand
> ########################################
> 
> res <- lm.y.x$residuals
> reg <- lm.y.x$fitted.values-m.y
> 
> # This is essentially a perfect linear 
> # relationship between the regression 
> # line and x data
> lm.reg.x <- lm(reg~x, data = df)
> summary(lm.reg.x)

Call:
lm(formula = reg ~ x, data = df)

Residuals:
       Min         1Q     Median         3Q        Max 
-6.828e-14 -7.900e-15 -5.500e-16  6.410e-15  3.979e-13 

Coefficients:
              Estimate Std. Error    t value Pr(>|t|)    
(Intercept) -1.692e+02  1.937e-14 -8.733e+15   <2e-16 ***
x            6.385e-01  7.299e-17  8.747e+15   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.187e-14 on 398 degrees of freedom
Multiple R-squared:      1,	Adjusted R-squared:      1 
F-statistic: 7.651e+31 on 1 and 398 DF,  p-value: < 2.2e-16

> 
> # The relationship between residuals and 
> # x should be zero
> lm.res.x <- lm(res~x, data = df)
> summary(lm.res.x)

Call:
lm(formula = res ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-39.653  -8.878  -0.226   7.322  30.630 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.586e-15  1.064e+01       0        1
x            1.186e-17  4.010e-02       0        1

Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared:  6.744e-33,	Adjusted R-squared:  -0.002513 
F-statistic: 2.684e-30 on 1 and 398 DF,  p-value: 1

> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-39.653  -8.878  -0.226   7.322  30.630 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.8022    10.6435   0.075     0.94    
x             0.6385     0.0401  15.922   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared:  0.3891,	Adjusted R-squared:  0.3876 
F-statistic: 253.5 on 1 and 398 DF,  p-value: < 2.2e-16

> 
> 
> 
c/ms/regression_lecture_note_for_r.1716935598.txt.gz · Last modified: 2024/05/29 07:33 by hkimscil

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki