Differences

This shows you the differences between two versions of the page.

--- partial_and_semipartial_correlation [2024/06/12 08:01] – [IV 간의 correlation이 심할 때 Regression 결과의 오류] hkimscil
+++ partial_and_semipartial_correlation [2025/06/04 08:37] (current) – [X1과 X2 간의 상관관계가 심할 때 Regression 결과의 오류] hkimscil
@@ Line 414: / Line 414: @@
 ====== e.g. Using ppcor.test with 4 var ======
 <code>
-options(digits = 4)
+rm(list=ls())
-HSGPA <- c(3.0, 3.2, 2.8, 2.5, 3.2, 3.8, 3.9, 3.8, 3.5, 3.1)
+library(ggplot2)
-FGPA <-  c(2.8, 3.0, 2.8, 2.2, 3.3, 3.3, 3.5, 3.7, 3.4, 2.9)
+library(dplyr)
-SATV <-  c(500, 550, 450, 400, 600, 650, 700, 550, 650, 550)
+library(tidyr)
-GREV <-  c(600, 670, 540, 800, 750, 820, 830, 670, 690, 600)
+library(faux)
-##GREV <- c(510, 670, 440, 800, 750, 420, 830, 470, 690, 600)
+set.seed(101)
+scholar <- rnorm_multi(n = 50,
+                       mu = c(3.12, 3.3, 540, 650),
+                       sd = c(.25, .34, 12, 13),
+                       r = c(0.15, 0.44, 0.47, 0.55, 0.45, 0.88),
+                       varnames = c("HSGPA", "FGPA", "SATV", "GREV"),
+                       empirical = FALSE)
+attach(scholar)
-scholar <- data.frame(HSGPA, FGPA, SATV, GREV) # collect into a data frame
+# library(psych)
-# install.packages("psych")
-library(psych)
 describe(scholar) # provides descrptive information about each variable
@@ Line 437: / Line 441: @@
 # install.packages("ppcor")
 library(ppcor)
-pcor.test(scholar$GREV, scholar$FGPA, scholar[,c("SATV", "HSGPA")]) # working
-reg3 <- lm(GREV ~ SATV + HSGPA)   # run linear regression
+reg.g.sh <- lm(GREV ~ SATV + HSGPA)
-resid3 <- resid(reg3)     # find the residuals - GREV free of SATV and HSGPA
+res.g.sh <- resid(reg.g.sh)
-reg4 <- lm(FGPA ~ SATV + HSGPA)   # second regression
+reg.g.fh <- lm(GREV ~ FGPA + HSGPA)
-resid4 <- resid(reg4)     # second set of residuals - FGPA free of SATV and HSGPA
+res.g.fh <- resid(reg.g.fh)
+reg.g.sf <- lm(GREV ~ SATV + FGPA)
+res.g.sf <- resid(reg.g.sf)
+reg.f.sh <- lm(FGPA ~ SATV + HSGPA)   # second regression
+res.f <- resid(reg.f.sh)     # second set of residuals - FGPA free of SATV and HSGPA
+reg.s.fh <- lm(SATV ~ FGPA + HSGPA)
+res.s <- resid(reg.s.fh)
+reg.h.sf <- lm(HSGPA ~ FGPA + SATV)
+res.h <- resid(reg.h.sf)
+reg.all <- lm(GREV ~ HSGPA + FGPA + SATV)
+reg.1 <- lm(GREV ~ res.f)
+reg.2 <- lm(GREV ~ res.s)
+reg.3 <- lm(GREV ~ res.h)
+summary(reg.all)
+summary(reg.1)
+summary(reg.2)
+summary(reg.3)
+reg.1a <- lm(res.g.sh~res.f)
+reg.2a <- lm(res.g.fh~res.s)
+reg.3a <- lm(res.g.sf~res.h)
+reg.1$coefficient[2]
+reg.2$coefficient[2]
+reg.3$coefficient[2]
+reg.1a$coefficient[2]
+reg.2a$coefficient[2]
+reg.3a$coefficient[2]
+spr.y.f <- spcor.test(GREV, FGPA, scholar[,c("SATV", "HSGPA")])
+spr.y.s <- spcor.test(GREV, SATV, scholar[,c("HSGPA", "FGPA")])
+spr.y.h <- spcor.test(GREV, HSGPA, scholar[,c("SATV", "FGPA")])
+spr.y.f$estimate
+spr.y.s$estimate
+spr.y.h$estimate
+spr.y.f$estimate^2
+spr.y.s$estimate^2
+spr.y.h$estimate^2
+summary(reg.1)$r.square
+summary(reg.2)$r.square
+summary(reg.3)$r.square
+ca <- summary(reg.1)$r.square +
+  summary(reg.2)$r.square +
+  summary(reg.3)$r.square
+# so common explanation area should be
+summary(reg.all)$r.square - ca
-cor(resid3, resid4)       # correlation of residuals - partial correlation
 </code>
 <code>
-> pcor.test(scholar$GREV, scholar$FGPA, scholar[,c("SATV", "HSGPA")]) # working
-  estimate p.value statistic  n gp  Method
-   -0.535  0.1719    -1.551 10  2 pearson
 >
-> reg3 <- lm(GREV ~ SATV + HSGPA)   # run linear regression
+> rm(list=ls())
-> resid3 <- resid(reg3)     # find the residuals - GREV free of SATV and HSGPA
 >
-> reg4 <- lm(FGPA ~ SATV + HSGPA)   # second regression
+> library(ggplot2)
-> resid4 <- resid(reg4)     # second set of residuals - FGPA free of SATV and HSGPA
+> library(dplyr)
+> library(tidyr)
+> library(faux)
 >
-> cor(resid3, resid4)       # correlation of residuals - partial correlation
+> set.seed(101)
-[1] -0.535
+> scholar <- rnorm_multi(n = 50,
++                        mu = c(3.12, 3.3, 540, 650),
++                        sd = c(.25, .34, 12, 13),
++                        r = c(0.15, 0.44, 0.47, 0.55, 0.45, 0.88),
++                        varnames = c("HSGPA", "FGPA", "SATV", "GREV"),
++                        empirical = FALSE)
+> attach(scholar)
+The following objects are masked from scholar (pos = 3):
+    FGPA, GREV, HSGPA, SATV
-</code>
+>
-----
+> # library(psych)
-----
+> describe(scholar) # provides descrptive information about each variable
-학자인 A는 GRE점수는 (GREV) 학점에 신경을 쓰는 활동보다는 지능지수와 관련된다고 믿는 SATV의 영향력이 더 클것으로 생각된다. 그래서 SATV만의 영향력을 다른 변인을 콘트롤하여 살펴보고 싶다.
+      vars  n   mean    sd median trimmed   mad    min    max range  skew
-<code>
+HSGPA    1 50   3.13  0.24   3.11    3.13  0.16   2.35   3.62  1.26 -0.42
-pcor.test(scholar$GREV, scholar$SATV, scholar[, c("HSGPA", "FGPA")])
+FGPA     2 50   3.34  0.35   3.32    3.33  0.33   2.50   4.19  1.68  0.27
+SATV     3 50 541.28 11.43 538.45  540.50 10.85 523.74 567.97 44.24  0.58
+GREV     4 50 651.72 11.90 649.70  651.29 10.55 629.89 678.33 48.45  0.35
+      kurtosis   se
+HSGPA     1.21 0.03
+FGPA     -0.01 0.05
+SATV     -0.60 1.62
+GREV     -0.54 1.68
+>
+> corrs <- cor(scholar) # find the correlations and set them into an object called 'corrs'
+> corrs                 # print corrs
+       HSGPA   FGPA   SATV   GREV
+HSGPA 1.0000 0.3404 0.4627 0.5406
+FGPA  0.3404 1.0000 0.5266 0.5096
+SATV  0.4627 0.5266 1.0000 0.8802
+GREV  0.5406 0.5096 0.8802 1.0000
+>
+> pairs(scholar)        # pairwise scatterplots
+>
+> # install.packages("ppcor")
+> library(ppcor)
+>
+> reg.f.sh <- lm(FGPA ~ SATV + HSGPA)   # second regression
+> res.f <- resid(reg.f.sh)     # second set of residuals - FGPA free of SATV and HSGPA
+>
+> reg.s.fh <- lm(SATV ~ FGPA + HSGPA)
+> res.s <- resid(reg.s.fh)
+>
+> reg.h.sf <- lm(HSGPA ~ FGPA + SATV)
+> res.h <- resid(reg.h.sf)
+>
+> reg.all <- lm(GREV ~ HSGPA + FGPA + SATV)
+> reg.1 <- lm(GREV ~ res.f)
+> reg.2 <- lm(GREV ~ res.s)
+> reg.3 <- lm(GREV ~ res.h)
+>
+> summary(reg.all)
-reg7 <- lm(GREV ~ HSGPA + FGPA)   # run linear regression
+Call:
-resid7 <- resid(reg7)     # find the residuals - HSGPA free of SATV
+lm(formula = GREV ~ HSGPA + FGPA + SATV)
-reg8 <- lm(SATV ~ HSGPA+ FGPA)   # second regression
+Residuals:
-resid8 <- resid(reg8)     # second set of residuals - FGPA free of SATV
+    Min      1Q  Median      3Q     Max
+-13.541  -3.441   0.148   4.823   7.796
-cor(resid7, resid8)       # correlation of residuals - partial correlation
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)
+(Intercept) 180.2560    40.3988    4.46  5.2e-05 ***
+HSGPA         8.3214     3.8050    2.19    0.034 *
+FGPA          1.3994     2.6311    0.53    0.597
+SATV          0.8143     0.0867    9.40  2.8e-12 ***
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
-</code>
+Residual standard error: 5.51 on 46 degrees of freedom
+Multiple R-squared:  0.799,	Adjusted R-squared:  0.786
+F-statistic: 60.8 on 3 and 46 DF,  p-value: 4.84e-16
+> summary(reg.1)
+Call:
+lm(formula = GREV ~ res.f)
+Residuals:
+   Min     1Q Median     3Q    Max
+-21.76  -8.65  -2.08   7.83  26.10
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)
+(Intercept)   651.72       1.70  383.59   <2e-16 ***
+res.f           1.40       5.74    0.24     0.81
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+Residual standard error: 12 on 48 degrees of freedom
+Multiple R-squared:  0.00124,	Adjusted R-squared:  -0.0196
+F-statistic: 0.0595 on 1 and 48 DF,  p-value: 0.808
+> summary(reg.2)
+Call:
+lm(formula = GREV ~ res.s)
+Residuals:
+   Min     1Q Median     3Q    Max
+-22.54  -4.94  -1.24   6.08  20.35
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)
+(Intercept)  651.715      1.332   489.4  < 2e-16 ***
+res.s          0.814      0.148     5.5  1.4e-06 ***
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+Residual standard error: 9.42 on 48 degrees of freedom
+Multiple R-squared:  0.386,	Adjusted R-squared:  0.374
+F-statistic: 30.2 on 1 and 48 DF,  p-value: 1.45e-06
+> summary(reg.3)
+Call:
+lm(formula = GREV ~ res.h)
+Residuals:
+   Min     1Q Median     3Q    Max
+-22.71  -9.32  -1.30   7.92  26.43
+Coefficients:
+            Estimate Std. Error t value Pr(>|t|)
+(Intercept)   651.72       1.68  387.43   <2e-16 ***
+res.h           8.32       8.21    1.01     0.32
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+Residual standard error: 11.9 on 48 degrees of freedom
+Multiple R-squared:  0.0209,	Adjusted R-squared:  0.000538
+F-statistic: 1.03 on 1 and 48 DF,  p-value: 0.316
-<code>
-> pcor.test(scholar$GREV, scholar$SATV, scholar[, c("HSGPA", "FGPA")])
-  estimate p.value statistic  n gp  Method
-   0.3179  0.4429    0.8213 10  2 pearson
 >
-> reg7 <- lm(GREV ~ HSGPA + FGPA)   # run linear regression
+> reg.1$coefficient[2]
-> resid7 <- resid(reg7)     # find the residuals - HSGPA free of SATV
+res.f
+.399
+> reg.2$coefficient[2]
+ res.s
+.8143
+> reg.3$coefficient[2]
+res.h
+.321
 >
-> reg8 <- lm(SATV ~ HSGPA+ FGPA)   # second regression
+> spr.y.f <- spcor.test(GREV, FGPA, scholar[,c("SATV", "HSGPA")])
-> resid8 <- resid(reg8)     # second set of residuals - FGPA free of SATV
+> spr.y.s <- spcor.test(GREV, SATV, scholar[,c("HSGPA", "FGPA")])
+> spr.y.h <- spcor.test(GREV, HSGPA, scholar[,c("SATV", "FGPA")])
 >
-> cor(resid7, resid8)       # correlation of residuals - partial correlation
+> spr.y.f$estimate
-[1] 0.3179
+[1] 0.03519
+> spr.y.s$estimate
+[1] 0.6217
+> spr.y.h$estimate
+[1] 0.1447
 >
+> spr.y.f$estimate^2
+[1] 0.001238
+> spr.y.s$estimate^2
+[1] 0.3865
+> spr.y.h$estimate^2
+[1] 0.02094
+>
+> summary(reg.1)$r.square
+[1] 0.001238
+> summary(reg.2)$r.square
+[1] 0.3865
+> summary(reg.3)$r.square
+[1] 0.02094
+>
+> ca <- summary(reg.1)$r.square +
++   summary(reg.2)$r.square +
++   summary(reg.3)$r.square
+> # so common explanation area should be
+> summary(reg.all)$r.square - ca
+[1] 0.39
 >
 </code>
+----
+{{:pasted:20241016-080226.png}}
+multiple regression 분석을 보면 독립변인의 coefficient 값은 각각
+  * HSGPA         8.3214
+  * FGPA          1.3994
+  * SATV          0.8143
+이 기울기에 대해서 t-test를 각각 하여 HSGPA와 FGPA의 설명력이 significant 한지를 확인하였다. 그리고 이 때의 R<sup>2</sup> 값은
+  * 0.799 이었다.
+그런데 이 coefficient값은 독립변인 각각의 고유의 설명력을 가지고 (spcor.test(GREV, x1, 나머지제어)로 얻은 부분) 종속변인에 대해서 regression을 하여 얻은 coefficient값과 같음을 알 수 있다. 즉, <fc #ff0000>multiple regression의 독립변인의 b coefficient 값들은 고유의 설명부분을 (spr) 추출해서 y에 (GREV) regression한 결과와 같음을</fc> 알 수 있다.
+reg.all
+{{:pasted:20250604-082250.png?600}}
+reg.1
+{{:pasted:20250604-082519.png?600}}
+reg.2
+{{:pasted:20250604-082635.png?600}}
+reg.3
+{{:pasted:20250604-082740.png?600}}
+또한 세 독립변인이 공통적으로 설명하는 부분은
+  * 0.39
+임을 알 수 있다.
 ====== e.g., 독립변인 들이 서로 독립적일 때의 각각의 설명력 ======
 In this example, the two IVs are orthogonal to each other (not correlated with each other). Hence, regress res.y.x2 against x1 would not result in any problem.
@@ Line 658: / Line 878: @@
 m <- lm(weights ~ LSS + RSS)
-## F-value is very small, but neither LSS or RSS are significant
+## F-value is very large, and significant.
+# but neither LSS or RSS are significant with t-test
 summary(m)