====== Multiple regression lecture note for R ======

<code>
colnames(d) <- c("y", "x1", "x2")
d
attach(d)
lm.y.x1 <- lm(y ~ x1, data=d)
summary(lm.y.x1)

beta2

install.packages("lm.beta")
library(lm.beta)
lm.beta(lm.y.x1x2)

lm.tmp.2 <- lm(y~x1, data=d)
res.y.x1 <- lm.tmp.2$residuals

lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
summary(lm.tmp.3)

install.packages("ppcor")
library(ppcor)
pcor(d)
spcor(d)
partial.r <- pcor.test(y, x2, x1)
partial.r
str(partial.r)
partial.r$estimate^2


lm.tmp.5 <- lm(y~x2, data=d)
res.y.x2 <- lm.tmp.5$residuals

lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
summary(lm.tmp.6)

partial.r2 <- pcor.test(y, x1, x2)
str(partial.r2)
partial.r2$estimate^2

spr.2$estimate^2

lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
summary(lm.tmp.7)
#######################################################

cma.2
cma.3
# Note that sorting out unique and common
# explanation areas is only possible with
# semi-partial correlation determinants,
# NOT partial correlation determinants,
# because only the semi-partial correlations
# share the same denominator (the total
# variance of y).
#############################################

</code>
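The closing comment can be checked numerically. Below is a minimal, self-contained sketch (the data frame is re-entered from the output further down; spcor.test() comes from the ppcor package used above): the two squared semi-partial correlations plus the common area reconstruct the full-model R squared.

<code>
# Sketch: squared semi-partial (part) correlations partition R-squared,
# because each is computed against the total variance of y.
library(ppcor)
d <- data.frame(
  y  = c(6, 5, 7, 7, 8, 10, 8, 11, 9, 9),
  x1 = c(220, 190, 260, 200, 330, 490, 210, 380, 320, 270),
  x2 = c(5, 6, 3, 4, 2, 4, 3, 2, 1, 3)
)
r2.full <- summary(lm(y ~ x1 + x2, data = d))$r.squared
sr1 <- spcor.test(d$y, d$x1, d$x2)$estimate  # unique contribution of x1
sr2 <- spcor.test(d$y, d$x2, d$x1)$estimate  # unique contribution of x2
common <- r2.full - sr1^2 - sr2^2            # common explanation area
c(r2.full = r2.full, unique.x1 = sr1^2, unique.x2 = sr2^2, common = common)
</code>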
  
====== Output ======
===== Multiple regression =====
<code>
  
> lm.y.x1x2 <- lm(y ~ x1+x2, data=d)

> y.pred <- a + (b1 * x1) + (b2 * x2)
> y.pred
 [1]  6.280586  5.380616  7.843699  6.588485  9.217328 10.022506
 [7]  7.251626  9.809401  9.643641  7.962113
> y.real <- y
> y.real

-0.4458785

install.packages("lm.beta")
> library(lm.beta)
> lm.beta(lm.y.x1x2)

> lm.tmp.2 <- lm(y~x1, data=d)
> res.y.x1 <- lm.tmp.2$residuals

> lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = d)

Residuals:


install.packages("ppcor")
> library(ppcor)
> pcor(d)

[1] "pearson"

> partial.r <- pcor.test(y, x2, x1)
> partial.r
   estimate    p.value statistic  n gp  Method
1 -0.672856 0.04702022 -2.406425 10  1 pearson
> str(partial.r)
'data.frame': 1 obs. of  6 variables:

 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate^2
[1] 0.4527352

> lm.tmp.5 <- lm(y~x2, data=d)
> res.y.x2 <- lm.tmp.5$residuals

> lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
> summary(lm.tmp.6)

Call:
lm(formula = res.y.x2 ~ res.x1.x2, data = d)

Residuals:


> partial.r2 <- pcor.test(y, x1, x2)
> str(partial.r2)
'data.frame': 1 obs. of  6 variables:

[1] 0.3188552

> lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
> summary(lm.tmp.7)


> cma.3
[1] 0.3122658
> # Note that sorting out unique and common
> # explanation areas is only possible with
> # semi-partial correlation determinants,
> # NOT partial correlation determinants,
> # because only the semi-partial correlations
> # share the same denominator (the total
> # variance of y).
> #############################################


>
</code>
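As a cross-check on the hand-built predictions above, predict() returns the same fitted values directly from the model. A short sketch, assuming d and lm.y.x1x2 from the session above:

<code>
# The hand-computed a + (b1 * x1) + (b2 * x2) values are simply the
# model's fitted values.
y.pred.check <- predict(lm.y.x1x2)   # same as fitted(lm.y.x1x2)
round(y.pred.check, 6)               # should match y.pred above

# Squaring the correlation between observed and predicted y
# reproduces the model's Multiple R-squared.
cor(d$y, y.pred.check)^2
</code>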
====== Simple regression ======
<code>
> # multiple regression: a simple e.g.
> #
> #
> rm(list=ls())
> d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
> d
   bankaccount income famnum
1            6    220      5
2            5    190      6
3            7    260      3
4            7    200      4
5            8    330      2
6           10    490      4
7            8    210      3
8           11    380      2
9            9    320      1
10           9    270      3

> colnames(d) <- c("y", "x1", "x2")
> d
    y  x1 x2
1   6 220  5
2   5 190  6
3   7 260  3
4   7 200  4
5   8 330  2
6  10 490  4
7   8 210  3
8  11 380  2
9   9 320  1
10  9 270  3
> # attach(d)
> lm.y.x1 <- lm(y ~ x1, data=d)
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311, Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046
</code>

In simple regression, the F-test and the t-test (the slope test) are essentially the same test: only one independent variable contributes to the F-test, and that same variable is the one examined in the slope test. This also shows in the relationship between the t value of the t-test and the F value of the F-test:

$$ t^2 = F $$

<code>
> t.cal <- 3.7
> t.cal^2
[1] 13.69
> F.cal <- 13.69
> F.cal
[1] 13.69
</code>
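Rather than typing the numbers in, both values can be pulled from the fitted object. A small sketch, assuming lm.y.x1 from the session above:

<code>
# t value for the x1 slope, from the coefficient table
t.val <- summary(lm.y.x1)$coefficients["x1", "t value"]
# F value for the whole model
f.val <- summary(lm.y.x1)$fstatistic["value"]
t.val^2   # 13.68733
f.val     # 13.68733: t^2 = F when there is a single predictor
</code>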

As explained in the simple regression section, the t-test on the slope tests whether the slope contributes to explaining the variability of y (its fluctuation around the mean). If the slope is 0, using the regression line is the same as using the mean (the mean line has a slope of 0), so the slope has no effect. The t-test therefore asks whether the difference between b and zero is statistically significant:

$$ \text{t calculated value} = \frac {b - 0}{se} $$

As mentioned above, $se$ is obtained as follows:

\begin{eqnarray*}
se & = & \sqrt{\frac{1}{n-2} * \frac{\text{SSE}}{\text{SSx}}} \\
& = & \sqrt{\frac {\text{MSE}} {\text{SSx}}} \\
\text{note that MSE } & = & \text{mean square error } \\
& = & \text{ms.res }
\end{eqnarray*}

The p value for this t value can be obtained in R:
<code>
summary(lm.y.x1)
n <- length(d$y)
k <- 1 # number of predictor variables
sse <- sum(lm.y.x1$residuals^2) # ss.res
ssx1 <- sum((d$x1 - mean(d$x1))^2)
b <- lm.y.x1$coefficient[2]
se <- sqrt((1/(n-2)) * (sse/ssx1))
t.b.cal <- (b - 0) / se
t.b.cal
p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)
p.value
# check
t.b.cal
f.cal <- t.b.cal^2
f.cal
p.value
</code>

<code>
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311, Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046

> n <- length(d$y)
> k <- 1 # number of predictor variables
> sse <- sum(lm.y.x1$residuals^2)
> ssx1 <- sum((d$x1 - mean(d$x1))^2)
> b <- lm.y.x1$coefficient[2]
> se <- sqrt((1/(n-2))*(sse/ssx1))
> t.b.cal <- (b - 0) / se
> t.b.cal
      x1 
3.699639 
> p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)

> # check
> t.b.cal
      x1 
3.699639 
> t.b.cal^2
      x1 
13.68733 
> p.value 
         x1 
0.006045749 

</code>
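One more way to see the identity (a sketch assuming lm.y.x1 from above): the single-predictor ANOVA table carries the same F and p value that were just computed by hand.

<code>
# The x1 row of the ANOVA table reports F = 13.687 (= t.b.cal^2)
# and Pr(>F) = 0.006046, matching p.value above.
anova(lm.y.x1)
</code>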

<code>

> lm.y.x2 <- lm(y ~ x2, data=d)
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2537 -0.8881 -0.4851  0.4963  2.5920 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  10.7910     1.1195   9.639 1.12e-05 ***
x2           -0.8458     0.3117  -2.713   0.0265 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.397 on 8 degrees of freedom
Multiple R-squared:  0.4793, Adjusted R-squared:  0.4142 
F-statistic: 7.363 on 1 and 8 DF,  p-value: 0.02651
>
>
</code>
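The identity t^2 = F holds for this model as well. A quick sketch, assuming lm.y.x2 from above:

<code>
t.x2 <- summary(lm.y.x2)$coefficients["x2", "t value"]
t.x2^2                                # approx. 7.363, the F-statistic above
summary(lm.y.x2)$fstatistic["value"]  # same value
</code>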
 +