====== Multiple regression with pr, spr, zero-order r ======
<code>
# multiple regression: a simple e.g.
rm(list=ls())
d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
d
colnames(d) <- c("y", "x1", "x2")
d
attach(d)

lm.y.x1 <- lm(y ~ x1, data=d)
summary(lm.y.x1)

lm.y.x2 <- lm(y ~ x2, data=d)
summary(lm.y.x2)

lm.y.x1x2 <- lm(y ~ x1+x2, data=d)
summary(lm.y.x1x2)

lm.y.x1x2$coefficient
# y.hat = a + (b1 * x1) + (b2 * x2)
a <- lm.y.x1x2$coefficient[1]
b1 <- lm.y.x1x2$coefficient[2]
b2 <- lm.y.x1x2$coefficient[3]

y.pred <- a + (b1 * x1) + (b2 * x2)
y.pred
y.real <- y
y.real
y.mean <- mean(y)
y.mean

res <- y.real - y.pred
reg <- y.pred - y.mean
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.tot <- var(y) * (length(y)-1)
ss.tot
ss.res
ss.reg
ss.res + ss.reg

# slope test
summary(lm.y.x1x2)

# beta coefficient (standardized b)
# beta <- b * (sd(x)/sd(y))
beta1 <- b1 * (sd(x1)/sd(y))
beta2 <- b2 * (sd(x2)/sd(y))
beta1
beta2

# install.packages("lm.beta")
library(lm.beta)
lm.beta(lm.y.x1x2)

#######################################################
# partial correlation coefficient and pr2
# x2's explanation?
lm.tmp.1 <- lm(x2~x1, data=d)
res.x2.x1 <- lm.tmp.1$residuals

lm.tmp.2 <- lm(y~x1, data=d)
res.y.x1 <- lm.tmp.2$residuals

lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
summary(lm.tmp.3)

# install.packages("ppcor")
library(ppcor)
pcor(d)
spcor(d)
partial.r <- pcor.test(y, x2, x1)
partial.r
str(partial.r)
partial.r$estimate^2

# x1's own explanation?
lm.tmp.4 <- lm(x1~x2, data=d)
res.x1.x2 <- lm.tmp.4$residuals

lm.tmp.5 <- lm(y~x2, data=d)
res.y.x2 <- lm.tmp.5$residuals

lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
summary(lm.tmp.6)

partial.r2 <- pcor.test(y, x1, x2)
str(partial.r2)
partial.r2$estimate^2
#######################################################

# semipartial correlation coefficient and spr2
spr.1 <- spcor.test(y, x2, x1)
spr.2 <- spcor.test(y, x1, x2)
spr.1
spr.2
spr.1$estimate^2
spr.2$estimate^2

lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
summary(lm.tmp.7)
#######################################################

# get the common area that explains the y variable
# 1.
summary(lm.y.x2)
all.x2 <- summary(lm.y.x2)$r.squared
sp.x2 <- spr.1$estimate^2
all.x2
sp.x2
cma.1 <- all.x2 - sp.x2
cma.1

# 2.
summary(lm.y.x1)
all.x1 <- summary(lm.y.x1)$r.squared
sp.x1 <- spr.2$estimate^2
all.x1
sp.x1
cma.2 <- all.x1 - sp.x1
cma.2

# OR 3.
summary(lm.y.x1x2)
r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
r2.y.x1x2
sp.x1
sp.x2
cma.3 <- r2.y.x1x2 - (sp.x1 + sp.x2)
cma.3

cma.1
cma.2
cma.3
# Note that sorting out unique and common
# explained areas is possible only with the
# squared semi-partial correlation,
# NOT the squared partial correlation,
# because only the semi-partial correlation
# shares the same denominator (the total
# variance of y).
#############################################
</code>
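
As a cross-check, the partial and semi-partial correlations computed above with pcor.test() and spcor.test() can be recovered with plain cor() on the residual vectors. A minimal sketch, assuming the script above has just been run (so y is attached and the res.* vectors exist):

<code>
# cross-check pr and spr with plain cor() on the residuals (sketch)
cor(res.y.x1, res.x2.x1)  # partial r of y and x2, controlling for x1 (~ -0.673)
cor(y, res.x2.x1)         # semi-partial r: x1 removed from x2 only (~ -0.409)
cor(res.y.x2, res.x1.x2)  # partial r of y and x1, controlling for x2 (~ 0.783)
cor(y, res.x1.x2)         # semi-partial r: x2 removed from x1 only (~ 0.565)
</code>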

====== Output ======
===== Multiple regression =====
<code>

> lm.y.x1x2 <- lm(y ~ x1+x2, data=d)
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.399103   1.516539   4.220  0.00394 **
x1           0.011841   0.003561   3.325  0.01268 * 
x2          -0.544727   0.226364  -2.406  0.04702 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9301 on 7 degrees of freedom
Multiple R-squared:  0.7981, Adjusted R-squared:  0.7404 
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696



> lm.y.x1x2$coefficient
(Intercept)          x1          x2 
 6.39910298  0.01184145 -0.54472725 
> # y.hat = 6.399103 + (0.011841)*x1 + (-0.544727)*x2 
> a <- lm.y.x1x2$coefficient[1]
> b1 <- lm.y.x1x2$coefficient[2]
> b2 <- lm.y.x1x2$coefficient[3]

> y.pred <- a + (b1 * x1) + (b2 * x2)
> y.pred
 [1]  6.280586  5.380616  7.843699  6.588485  9.217328 10.022506
 [7]  7.251626  9.809401  9.643641  7.962113
> y.real <- y
> y.real
 [1]  6  5  7  7  8 10  8 11  9  9
> y.mean <- mean(y)
> y.mean 
[1] 8

> res <- y.real - y.pred
> reg <- y.pred - y.mean
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)

> ss.tot <- var(y) * (length(y)-1)
> ss.tot
[1] 30
> ss.res
[1] 6.056235
> ss.reg
[1] 23.94376
> ss.res+ss.reg
[1] 30

> # slope test
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.399103   1.516539   4.220  0.00394 **
x1           0.011841   0.003561   3.325  0.01268 * 
x2          -0.544727   0.226364  -2.406  0.04702 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9301 on 7 degrees of freedom
Multiple R-squared:  0.7981, Adjusted R-squared:  0.7404 
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> # note on the 2 t-tests (one for each slope)

> # beta coefficient (standardized b)
> # beta <- b * (sd(x)/sd(y))
> beta1 <- b1 * (sd(x1)/sd(y))
> beta2 <- b2 * (sd(x2)/sd(y))
> beta1
      x1 
0.616097 
> beta2
        x2 
-0.4458785 

> # install.packages("lm.beta")
> library(lm.beta)
> lm.beta(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Standardized Coefficients::
(Intercept)          x1          x2 
         NA   0.6160970  -0.4458785 

> #######################################################
> # partial correlation coefficient and pr2
> # x2's explanation? 
> lm.tmp.1 <- lm(x2~x1, data=d)
> res.x2.x1 <- lm.tmp.1$residuals

> lm.tmp.2 <- lm(y~x1, data=d)
> res.y.x1 <- lm.tmp.2$residuals

> lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)  
(Intercept)  6.281e-18  2.751e-01   0.000    1.000  
res.x2.x1   -5.447e-01  2.117e-01  -2.573    0.033 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8701 on 8 degrees of freedom
Multiple R-squared:  0.4527, Adjusted R-squared:  0.3843 
F-statistic: 6.618 on 1 and 8 DF,  p-value: 0.033


> # install.packages("ppcor")
> library(ppcor)
> pcor(d)
$estimate
            y        x1         x2
y   1.0000000 0.7825112 -0.6728560
x1  0.7825112 1.0000000  0.3422911
x2 -0.6728560 0.3422911  1.0000000

$p.value
            y         x1         x2
y  0.00000000 0.01267595 0.04702022
x1 0.01267595 0.00000000 0.36723388
x2 0.04702022 0.36723388 0.00000000

$statistic
           y        x1         x2
y   0.000000 3.3251023 -2.4064253
x1  3.325102 0.0000000  0.9638389
x2 -2.406425 0.9638389  0.0000000

$n
[1] 10

$gp
[1] 1

$method
[1] "pearson"

> spcor(d)
$estimate
            y        x1         x2
y   1.0000000 0.5646726 -0.4086619
x1  0.7171965 1.0000000  0.2078919
x2 -0.6166940 0.2470028  1.0000000

$p.value
            y       x1        x2
y  0.00000000 0.113182 0.2748117
x1 0.02964029 0.000000 0.5914441
x2 0.07691195 0.521696 0.0000000

$statistic
           y        x1         x2
y   0.000000 1.8101977 -1.1846548
x1  2.722920 0.0000000  0.5623159
x2 -2.072679 0.6744045  0.0000000

$n
[1] 10

$gp
[1] 1

$method
[1] "pearson"

> partial.r <- pcor.test(y, x2, x1)
> partial.r
   estimate    p.value statistic  n gp  Method
1 -0.672856 0.04702022 -2.406425 10  1 pearson
> str(partial.r)
'data.frame': 1 obs. of  6 variables:
 $ estimate : num -0.673
 $ p.value  : num 0.047
 $ statistic: num -2.41
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate^2
[1] 0.4527352

> # x1's own explanation?
> lm.tmp.4 <- lm(x1~x2, data=d)
> res.x1.x2 <- lm.tmp.4$residuals

> lm.tmp.5 <- lm(y~x2, data=d)
> res.y.x2 <- lm.tmp.5$residuals

> lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
> summary(lm.tmp.6)

Call:
lm(formula = res.y.x2 ~ res.x1.x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) 1.330e-17  2.751e-01   0.000  1.00000   
res.x1.x2   1.184e-02  3.331e-03   3.555  0.00746 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8701 on 8 degrees of freedom
Multiple R-squared:  0.6123, Adjusted R-squared:  0.5639 
F-statistic: 12.64 on 1 and 8 DF,  p-value: 0.007458


> partial.r2 <- pcor.test(y, x1, x2)
> str(partial.r2)
'data.frame': 1 obs. of  6 variables:
 $ estimate : num 0.783
 $ p.value  : num 0.0127
 $ statistic: num 3.33
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r2$estimate^2
[1] 0.6123238
> #######################################################

> # semipartial correlation coefficient and spr2
> #
> spr.1 <- spcor.test(y, x2, x1)
> spr.2 <- spcor.test(y, x1, x2)
> spr.1
    estimate   p.value statistic  n gp  Method
1 -0.4086619 0.2748117 -1.184655 10  1 pearson
> spr.2
   estimate  p.value statistic  n gp  Method
1 0.5646726 0.113182  1.810198 10  1 pearson
> spr.1$estimate^2
[1] 0.1670045
> spr.2$estimate^2
[1] 0.3188552

> lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
> summary(lm.tmp.7)

Call:
lm(formula = y ~ res.x2.x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8617 -1.1712 -0.4940  0.5488  3.0771 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   8.0000     0.5589  14.314 5.54e-07 ***
res.x2.x1    -0.5447     0.4301  -1.266    0.241    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.767 on 8 degrees of freedom
Multiple R-squared:  0.167, Adjusted R-squared:  0.06288 
F-statistic: 1.604 on 1 and 8 DF,  p-value: 0.241

> #######################################################

> # get the common area that explains the y variable
> # 1.
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2537 -0.8881 -0.4851  0.4963  2.5920 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  10.7910     1.1195   9.639 1.12e-05 ***
x2           -0.8458     0.3117  -2.713   0.0265 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.397 on 8 degrees of freedom
Multiple R-squared:  0.4793, Adjusted R-squared:  0.4142 
F-statistic: 7.363 on 1 and 8 DF,  p-value: 0.02651

> all.x2 <- summary(lm.y.x2)$r.squared
> sp.x2 <- spr.1$estimate^2
> all.x2
[1] 0.4792703
> sp.x2
[1] 0.1670045
> cma.1 <- all.x2 - sp.x2
> cma.1
[1] 0.3122658

> # 2.
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311, Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046

> all.x1 <- summary(lm.y.x1)$r.squared
> sp.x1 <- spr.2$estimate^2
> all.x1
[1] 0.631121
> sp.x1
[1] 0.3188552
> cma.2 <- all.x1 - sp.x1
> cma.2
[1] 0.3122658

> # OR 3.
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.399103   1.516539   4.220  0.00394 **
x1           0.011841   0.003561   3.325  0.01268 * 
x2          -0.544727   0.226364  -2.406  0.04702 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9301 on 7 degrees of freedom
Multiple R-squared:  0.7981, Adjusted R-squared:  0.7404 
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
> r2.y.x1x2
[1] 0.7981255
> sp.x1
[1] 0.3188552
> sp.x2
[1] 0.1670045
> cma.3 <- r2.y.x1x2 - (sp.x1 + sp.x2)
> cma.3
[1] 0.3122658

> cma.1
[1] 0.3122658
> cma.2
[1] 0.3122658
> cma.3
[1] 0.3122658
> # Note that sorting out unique and common
> # explained areas is possible only with the
> # squared semi-partial correlation,
> # NOT the squared partial correlation,
> # because only the semi-partial correlation
> # shares the same denominator (the total
> # variance of y).
> #############################################

</code>
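
The standardized coefficients that lm.beta() reports can also be recovered by refitting the model on z-scored variables. A minimal sketch (the name lm.std is ours, for illustration only):

<code>
# refit on standardized variables; slopes should match lm.beta(lm.y.x1x2)
lm.std <- lm(scale(y) ~ scale(x1) + scale(x2), data=d)
lm.std$coefficient   # intercept ~ 0, slopes ~ 0.616 and -0.446
</code>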
====== Simple regression ======
<code>
> # multiple regression: a simple e.g.
> #
> #
> rm(list=ls())
> d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
> d
   bankaccount income famnum
1            6    220      5
2            5    190      6
3            7    260      3
4            7    200      4
5            8    330      2
6           10    490      4
7            8    210      3
8           11    380      2
9            9    320      1
10           9    270      3

> colnames(d) <- c("y", "x1", "x2")
> d
    y  x1 x2
1   6 220  5
2   5 190  6
3   7 260  3
4   7 200  4
5   8 330  2
6  10 490  4
7   8 210  3
8  11 380  2
9   9 320  1
10  9 270  3
> attach(d)
> lm.y.x1 <- lm(y ~ x1, data=d)
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311, Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046
</code>

In simple regression, the F-test and the t-test (the slope test) are essentially the same test, because only one independent variable contributes to the F-test, and that same variable is the one examined by the slope test (t-test). This also shows in the relationship between the t value of the t-test and the F value of the F-test:

$$ t^2 = F $$

<code>
> t.cal <- 3.7 
> t.cal^2 
[1] 13.69
> F.cal <- 13.69
> F.cal
[1] 13.69
</code>

As explained in Simple regression, the t-test on the slope checks whether the slope contributes to explaining the variability of y (its fluctuation around the mean). If the slope were 0, the regression line would be identical to the mean line (whose slope is 0), meaning the slope has no effect. The t-test therefore asks whether the difference between b and zero is statistically significant:
$$ \text{t calculated value} = \frac {b - 0}{se} $$
where, as mentioned earlier, $se$ is obtained as follows:

\begin{eqnarray*}
se & = & \sqrt{\frac{1}{n-2} * \frac{\text{SSE}}{\text{SSx}}} \\
& = & \sqrt{\frac {\text{MSE}} {\text{SSx}}} \\
\text{note that MSE } & = & \text{mean square error } \\
& = & \text{ms.res }
\end{eqnarray*}

The p value for the t value obtained above can be computed in R:
<code>
summary(lm.y.x1)
n <- length(y)
k <- 1 # num of predictor variables
sse <- sum(lm.y.x1$residuals^2) # ss.res
ssx1 <- sum((x1-mean(x1))^2)
b <- lm.y.x1$coefficient[2]
se <- sqrt((1/(n-2))*(sse/ssx1))
t.b.cal <- (b - 0) / se
t.b.cal
p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)
p.value 
# check
t.b.cal
f.cal <- t.b.cal^2
f.cal
p.value 
</code>

<code>
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311, Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046

> n <- length(y)
> k <- 1 # num of predictor variables
> sse <- sum(lm.y.x1$residuals^2)
> ssx1 <- sum((x1-mean(x1))^2)
> b <- lm.y.x1$coefficient[2]
> se <- sqrt((1/(n-2))*(sse/ssx1))
> mse <- sse/(n-2)
> se <- sqrt(mse/ssx1) # identical, since MSE = SSE/(n-2)
> t.b.cal <- (b - 0) / se
> t.b.cal
      x1 
3.699639 
> p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)

> # check
> t.b.cal
      x1 
3.699639 
> t.b.cal^2
      x1 
13.68733 
> p.value 
         x1 
0.006045749 
</code>
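
Since $t^2 = F$ in simple regression, the same p value can also be taken from the F distribution with 1 and n-k-1 degrees of freedom. A short check, reusing t.b.cal, n, and k from the script above:

<code>
# p value via the F distribution (sketch); should match p.value above
f.cal <- t.b.cal^2
pf(f.cal, 1, n-k-1, lower.tail=F)  # ~ 0.006046
</code>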

<code>
> lm.y.x2 <- lm(y ~ x2, data=d)
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2537 -0.8881 -0.4851  0.4963  2.5920 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  10.7910     1.1195   9.639 1.12e-05 ***
x2           -0.8458     0.3117  -2.713   0.0265 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.397 on 8 degrees of freedom
Multiple R-squared:  0.4793, Adjusted R-squared:  0.4142 
F-statistic: 7.363 on 1 and 8 DF,  p-value: 0.02651
</code>
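
The same $t^2 = F$ identity holds for this x2-only model as well; a quick sketch, pulling both values from the summary object:

<code>
# verify t^2 = F for the y ~ x2 model (sketch)
t.x2 <- summary(lm.y.x2)$coefficients["x2", "t value"]
f.x2 <- summary(lm.y.x2)$fstatistic[1]
t.x2^2  # ~ 7.363
f.x2    # ~ 7.363
</code>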