====== Multiple regression lecture note for R ======

<code>
colnames(d) <- c("y", "x1", "x2")
d
attach(d)
lm.y.x1 <- lm(y ~ x1, data=d)
summary(lm.y.x1)

beta2

install.packages("lm.beta")
library(lm.beta)
lm.beta(lm.y.x1x2)

lm.tmp.2 <- lm(y~x1, data=d)
res.y.x1 <- lm.tmp.2$residuals

lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
summary(lm.tmp.3)

install.packages("ppcor")
library(ppcor)
pcor(d)
spcor(d)
partial.r <- pcor.test(y, x2, x1)
partial.r
str(partial.r)
partial.r$estimate^2


lm.tmp.5 <- lm(y~x2, data=d)
res.y.x2 <- lm.tmp.5$residuals

lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
summary(lm.tmp.6)

partial.r2 <- pcor.test(y, x1, x2)
str(partial.r2)
partial.r2$estimate^2

spr.2$estimate^2

lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
summary(lm.tmp.7)
#######################################################

cma.2
cma.3
# Note that sorting out unique and common
# explanation areas is only possible with
# semi-partial correlation determinants,
# NOT partial correlation determinants,
# because only the semi-partial correlations
# share the same denominator (the total
# variance of y).
#############################################

</code>
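The closing comment can be checked numerically. Below is a minimal, self-contained sketch (the data frame is re-entered from the output further down; spcor.test() comes from the ppcor package used above): the two squared semi-partial correlations plus the common area reconstruct the full-model R squared.

<code>
# Sketch: squared semi-partial (part) correlations partition R-squared,
# because each is computed against the total variance of y.
library(ppcor)
d <- data.frame(
  y  = c(6, 5, 7, 7, 8, 10, 8, 11, 9, 9),
  x1 = c(220, 190, 260, 200, 330, 490, 210, 380, 320, 270),
  x2 = c(5, 6, 3, 4, 2, 4, 3, 2, 1, 3)
)
r2.full <- summary(lm(y ~ x1 + x2, data = d))$r.squared
sr1 <- spcor.test(d$y, d$x1, d$x2)$estimate  # unique contribution of x1
sr2 <- spcor.test(d$y, d$x2, d$x1)$estimate  # unique contribution of x2
common <- r2.full - sr1^2 - sr2^2            # common explanation area
c(r2.full = r2.full, unique.x1 = sr1^2, unique.x2 = sr2^2, common = common)
</code>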
  
====== Output ======
===== Multiple regression =====
<code>
  
> lm.y.x1x2 <- lm(y ~ x1+x2, data=d)

> y.pred <- a + (b1 * x1) + (b2 * x2)
> y.pred
 [1]  6.280586  5.380616  7.843699  6.588485  9.217328 10.022506
 [7]  7.251626  9.809401  9.643641  7.962113
> y.real <- y
> y.real

-0.4458785

install.packages("lm.beta")
> library(lm.beta)
> lm.beta(lm.y.x1x2)

> lm.tmp.2 <- lm(y~x1, data=d)
> res.y.x1 <- lm.tmp.2$residuals

> lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = d)

Residuals:


install.packages("ppcor")
> library(ppcor)
> pcor(d)

[1] "pearson"

> partial.r <- pcor.test(y, x2, x1)
> partial.r
   estimate    p.value statistic  n gp  Method
1 -0.672856 0.04702022 -2.406425 10  1 pearson
> str(partial.r)
'data.frame': 1 obs. of  6 variables:

 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate^2
[1] 0.4527352

> lm.tmp.5 <- lm(y~x2, data=d)
> res.y.x2 <- lm.tmp.5$residuals

> lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
> summary(lm.tmp.6)

Call:
lm(formula = res.y.x2 ~ res.x1.x2, data = d)

Residuals:


> partial.r2 <- pcor.test(y, x1, x2)
> str(partial.r2)
'data.frame': 1 obs. of  6 variables:

[1] 0.3188552

> lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
> summary(lm.tmp.7)


> cma.3
[1] 0.3122658
> # Note that sorting out unique and common
> # explanation areas is only possible with
> # semi-partial correlation determinants,
> # NOT partial correlation determinants,
> # because only the semi-partial correlations
> # share the same denominator (the total
> # variance of y).
> #############################################


>
</code>
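As a cross-check on the hand-built predictions above, predict() returns the same fitted values directly from the model. A short sketch, assuming d and lm.y.x1x2 from the session above:

<code>
# The hand-computed a + (b1 * x1) + (b2 * x2) values are simply the
# model's fitted values.
y.pred.check <- predict(lm.y.x1x2)   # same as fitted(lm.y.x1x2)
round(y.pred.check, 6)               # should match y.pred above

# Squaring the correlation between observed and predicted y
# reproduces the model's Multiple R-squared.
cor(d$y, y.pred.check)^2
</code>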
====== Simple regression ======
<code>
> # multiple regression: a simple e.g.
> #
> #
> rm(list=ls())
> d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv")
> d
   bankaccount income famnum
1            6    220      5
2            5    190      6
3            7    260      3
4            7    200      4
5            8    330      2
6           10    490      4
7            8    210      3
8           11    380      2
9            9    320      1
10           9    270      3

> colnames(d) <- c("y", "x1", "x2")
> d
    y  x1 x2
1   6 220  5
2   5 190  6
3   7 260  3
4   7 200  4
5   8 330  2
6  10 490  4
7   8 210  3
8  11 380  2
9   9 320  1
10  9 270  3
> # attach(d)
> lm.y.x1 <- lm(y ~ x1, data=d)
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311, Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046
</code>

In simple regression, the F-test and the t-test (the slope test) are essentially the same test: only one independent variable contributes to the F-test, and that same variable is the one examined in the slope test. This also shows in the relationship between the t value of the t-test and the F value of the F-test:

$$ t^2 = F $$

<code>
> t.cal <- 3.7
> t.cal^2
[1] 13.69
> F.cal <- 13.69
> F.cal
[1] 13.69
</code>
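Rather than typing the numbers in, both values can be pulled from the fitted object. A small sketch, assuming lm.y.x1 from the session above:

<code>
# t value for the x1 slope, from the coefficient table
t.val <- summary(lm.y.x1)$coefficients["x1", "t value"]
# F value for the whole model
f.val <- summary(lm.y.x1)$fstatistic["value"]
t.val^2   # 13.68733
f.val     # 13.68733: t^2 = F when there is a single predictor
</code>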

As explained in the simple regression section, the t-test on the slope tests whether the slope contributes to explaining the variability of y (its fluctuation around the mean). If the slope is 0, using the regression line is the same as using the mean (the mean line has a slope of 0), so the slope has no effect. The t-test therefore asks whether the difference between b and zero is statistically significant:

$$ \text{t calculated value} = \frac {b - 0}{se} $$

As mentioned above, $se$ is obtained as follows:

\begin{eqnarray*}
se & = & \sqrt{\frac{1}{n-2} * \frac{\text{SSE}}{\text{SSx}}} \\
& = & \sqrt{\frac {\text{MSE}} {\text{SSx}}} \\
\text{note that MSE } & = & \text{mean square error } \\
& = & \text{ms.res }
\end{eqnarray*}

The p value for this t value can be obtained in R:
<code>
summary(lm.y.x1)
n <- length(d$y)
k <- 1 # number of predictor variables
sse <- sum(lm.y.x1$residuals^2) # ss.res
ssx1 <- sum((d$x1 - mean(d$x1))^2)
b <- lm.y.x1$coefficient[2]
se <- sqrt((1/(n-2)) * (sse/ssx1))
t.b.cal <- (b - 0) / se
t.b.cal
p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)
p.value
# check
t.b.cal
f.cal <- t.b.cal^2
f.cal
p.value
</code>

<code>
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311, Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046

> n <- length(d$y)
> k <- 1 # number of predictor variables
> sse <- sum(lm.y.x1$residuals^2)
> ssx1 <- sum((d$x1 - mean(d$x1))^2)
> b <- lm.y.x1$coefficient[2]
> se <- sqrt((1/(n-2))*(sse/ssx1))
> t.b.cal <- (b - 0) / se
> t.b.cal
      x1 
3.699639 
> p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)

> # check
> t.b.cal
      x1 
3.699639 
> t.b.cal^2
      x1 
13.68733 
> p.value 
         x1 
0.006045749 

</code>
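One more way to see the identity (a sketch assuming lm.y.x1 from above): the single-predictor ANOVA table carries the same F and p value that were just computed by hand.

<code>
# The x1 row of the ANOVA table reports F = 13.687 (= t.b.cal^2)
# and Pr(>F) = 0.006046, matching p.value above.
anova(lm.y.x1)
</code>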

<code>

> lm.y.x2 <- lm(y ~ x2, data=d)
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2537 -0.8881 -0.4851  0.4963  2.5920 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  10.7910     1.1195   9.639 1.12e-05 ***
x2           -0.8458     0.3117  -2.713   0.0265 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.397 on 8 degrees of freedom
Multiple R-squared:  0.4793, Adjusted R-squared:  0.4142 
F-statistic: 7.363 on 1 and 8 DF,  p-value: 0.02651
>
>
</code>
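The identity t^2 = F holds for this model as well. A quick sketch, assuming lm.y.x2 from above:

<code>
t.x2 <- summary(lm.y.x2)$coefficients["x2", "t value"]
t.x2^2                                # approx. 7.363, the F-statistic above
summary(lm.y.x2)$fstatistic["value"]  # same value
</code>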
 +