====== Multiple regression with pr, spr, zero-order r ======


# multiple regression: a simple e.g.
#
#
rm(list=ls())
d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv") 
d

colnames(d) <- c("y", "x1", "x2")
d
# attach(d)
lm.y.x1 <- lm(y ~ x1, data=d)
summary(lm.y.x1)

lm.y.x2 <- lm(y ~ x2, data=d)
summary(lm.y.x2)

lm.y.x1x2 <- lm(y ~ x1+x2, data=d)
summary(lm.y.x1x2)


lm.y.x1x2$coefficient
# y.hat = 6.399103 + (0.011841)*x1 + (−0.544727)*x2 
a <- lm.y.x1x2$coefficient[1]
b1 <- lm.y.x1x2$coefficient[2]
b2 <- lm.y.x1x2$coefficient[3]

y.pred <- a + (b1 * x1) + (b2 * x2)
y.pred
y.real <- y
y.real
y.mean <- mean(y)
y.mean 

res <- y.real - y.pred
reg <- y.pred - y.mean
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)

ss.tot <- var(y) * (length(y)-1)
ss.tot
ss.res
ss.reg
ss.res+ss.reg

# slope test
summary(lm.y.x1x2)
# note on 2 t-tests 

# beta coefficient (standardized b)
# beta <- b * (sd(x)/sd(y))
beta1 <- b1 * (sd(x1)/sd(y))
beta2 <- b2 * (sd(x2)/sd(y))
beta1
beta2

# install.packages("lm.beta")
library(lm.beta)
lm.beta(lm.y.x1x2)

#######################################################
# partial correlation coefficient and pr2
# x2's explanation? 
lm.tmp.1 <- lm(x2~x1, data=d)
res.x2.x1 <- lm.tmp.1$residuals

lm.tmp.2 <- lm(y~x1, data=d)
res.y.x1 <- lm.tmp.2$residuals

lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
summary(lm.tmp.3)

# install.packages("ppcor")
library(ppcor)
pcor(d)
spcor(d)
partial.r <- pcor.test(y, x2, x1)
partial.r
str(partial.r)
partial.r$estimate^2

# x1's own explanation?
lm.tmp.4 <- lm(x1~x2, data=d)
res.x1.x2 <- lm.tmp.4$residuals

lm.tmp.5 <- lm(y~x2, data=d)
res.y.x2 <- lm.tmp.5$residuals

lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
summary(lm.tmp.6)

partial.r2 <- pcor.test(y, x1, x2)
str(partial.r2)
partial.r2$estimate^2
#######################################################

# semipartial correlation coefficient and spr2
#
spr.1 <- spcor.test(y,x2,x1)
spr.2 <- spcor.test(y,x1,x2)
spr.1
spr.2
spr.1$estimate^2
spr.2$estimate^2

lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
summary(lm.tmp.7)
#######################################################

# get the common area that explain the y variable
# 1.
summary(lm.y.x2)
all.x2 <- summary(lm.y.x2)$r.squared
sp.x2 <- spr.1$estimate^2
all.x2
sp.x2
cma.1 <- all.x2 - sp.x2
cma.1

# 2.
summary(lm.y.x1)
all.x1 <- summary(lm.y.x1)$r.squared
sp.x1 <- spr.2$estimate^2
all.x1
sp.x1
cma.2 <- all.x1 - sp.x1
cma.2

# OR 3.
summary(lm.y.x1x2)
r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
r2.y.x1x2
sp.x1
sp.x2
cma.3 <- r2.y.x1x2 - (sp.x1 + sp.x2)
cma.3

cma.1
cma.2
cma.3
# Note that sorting out unique and common
# explanation area is only possible with 
# semi-partial correlation determinant
# NOT partial correlation determinant
# because only semi-partial correlation
# shares the same denominator (as total 
# y).
#############################################

====== Output ====== ===== Multiple regression =====


> 
> lm.y.x1x2 <- lm(y ~ x1+x2, data=d)
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.399103   1.516539   4.220  0.00394 **
x1           0.011841   0.003561   3.325  0.01268 * 
x2          -0.544727   0.226364  -2.406  0.04702 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9301 on 7 degrees of freedom
Multiple R-squared:  0.7981,	Adjusted R-squared:  0.7404 
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> 
> 
> lm.y.x1x2$coefficient
(Intercept)          x1          x2 
 6.39910298  0.01184145 -0.54472725 
> # y.hat = 6.399103 + (0.011841)*x1 + (−0.544727)*x2 
> a <- lm.y.x1x2$coefficient[1]
> b1 <- lm.y.x1x2$coefficient[2]
> b2 <- lm.y.x1x2$coefficient[3]
> 
> y.pred <- a + (b1 * x1) + (b2 * x2)
> y.pred
 [1]  6.280586  5.380616  7.843699  6.588485  9.217328 10.022506
 [7]  7.251626  9.809401  9.643641  7.962113
> y.real <- y
> y.real
 [1]  6  5  7  7  8 10  8 11  9  9
> y.mean <- mean(y)
> y.mean 
[1] 8
> 
> res <- y.real - y.pred
> reg <- y.pred - y.mean
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> 
> ss.tot <- var(y) * (length(y)-1)
> ss.tot
[1] 30
> ss.res
[1] 6.056235
> ss.reg
[1] 23.94376
> ss.res+ss.reg
[1] 30
> 
> # slope test
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.399103   1.516539   4.220  0.00394 **
x1           0.011841   0.003561   3.325  0.01268 * 
x2          -0.544727   0.226364  -2.406  0.04702 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9301 on 7 degrees of freedom
Multiple R-squared:  0.7981,	Adjusted R-squared:  0.7404 
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> # note on 2 t-tests 
> 
> # beta coefficient (standardized b)
> # beta <- b * (sd(x)/sd(y))
> beta1 <- b1 * (sd(x1)/sd(y))
> beta2 <- b2 * (sd(x2)/sd(y))
> beta1
      x1 
0.616097 
> beta2
        x2 
-0.4458785 
> 
> # install.packages("lm.beta")
> library(lm.beta)
> lm.beta(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Standardized Coefficients::
(Intercept)          x1          x2 
         NA   0.6160970  -0.4458785 

> 
> #######################################################
> # partial correlation coefficient and pr2
> # x2's explanation? 
> lm.tmp.1 <- lm(x2~x1, data=d)
> res.x2.x1 <- lm.tmp.1$residuals
> 
> lm.tmp.2 <- lm(y~x1, data=d)
> res.y.x1 <- lm.tmp.2$residuals
> 
> lm.tmp.3 <- lm(res.y.x1 ~ res.x2.x1, data=d)
> summary(lm.tmp.3)

Call:
lm(formula = res.y.x1 ~ res.x2.x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)  
(Intercept)  6.281e-18  2.751e-01   0.000    1.000  
res.x2.x1   -5.447e-01  2.117e-01  -2.573    0.033 *
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8701 on 8 degrees of freedom
Multiple R-squared:  0.4527,	Adjusted R-squared:  0.3843 
F-statistic: 6.618 on 1 and 8 DF,  p-value: 0.033

> 
> # install.packages("ppcor")
> library(ppcor)
> pcor(d)
$estimate
            y        x1         x2
y   1.0000000 0.7825112 -0.6728560
x1  0.7825112 1.0000000  0.3422911
x2 -0.6728560 0.3422911  1.0000000

$p.value
            y         x1         x2
y  0.00000000 0.01267595 0.04702022
x1 0.01267595 0.00000000 0.36723388
x2 0.04702022 0.36723388 0.00000000

$statistic
           y        x1         x2
y   0.000000 3.3251023 -2.4064253
x1  3.325102 0.0000000  0.9638389
x2 -2.406425 0.9638389  0.0000000

$n
[1] 10

$gp
[1] 1

$method
[1] "pearson"

> spcor(d)
$estimate
            y        x1         x2
y   1.0000000 0.5646726 -0.4086619
x1  0.7171965 1.0000000  0.2078919
x2 -0.6166940 0.2470028  1.0000000

$p.value
            y       x1        x2
y  0.00000000 0.113182 0.2748117
x1 0.02964029 0.000000 0.5914441
x2 0.07691195 0.521696 0.0000000

$statistic
           y        x1         x2
y   0.000000 1.8101977 -1.1846548
x1  2.722920 0.0000000  0.5623159
x2 -2.072679 0.6744045  0.0000000

$n
[1] 10

$gp
[1] 1

$method
[1] "pearson"

> partial.r <- pcor.test(y, x2, x1)
> partial.r
   estimate    p.value statistic  n gp  Method
1 -0.672856 0.04702022 -2.406425 10  1 pearson
> str(partial.r)
'data.frame':	1 obs. of  6 variables:
 $ estimate : num -0.673
 $ p.value  : num 0.047
 $ statistic: num -2.41
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r$estimate^2
[1] 0.4527352
> 
> # x1's own explanation?
> lm.tmp.4 <- lm(x1~x2, data=d)
> res.x1.x2 <- lm.tmp.4$residuals
> 
> lm.tmp.5 <- lm(y~x2, data=d)
> res.y.x2 <- lm.tmp.5$residuals
> 
> lm.tmp.6 <- lm(res.y.x2 ~ res.x1.x2, data=d)
> summary(lm.tmp.6)

Call:
lm(formula = res.y.x2 ~ res.x1.x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) 1.330e-17  2.751e-01   0.000  1.00000   
res.x1.x2   1.184e-02  3.331e-03   3.555  0.00746 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8701 on 8 degrees of freedom
Multiple R-squared:  0.6123,	Adjusted R-squared:  0.5639 
F-statistic: 12.64 on 1 and 8 DF,  p-value: 0.007458

> 
> partial.r2 <- pcor.test(y, x1, x2)
> str(partial.r2)
'data.frame':	1 obs. of  6 variables:
 $ estimate : num 0.783
 $ p.value  : num 0.0127
 $ statistic: num 3.33
 $ n        : int 10
 $ gp       : num 1
 $ Method   : chr "pearson"
> partial.r2$estimate^2
[1] 0.6123238
> #######################################################
> 
> # semipartial correlation coefficient and spr2
> #
> spr.1 <- spcor.test(y,x2,x1)
> spr.2 <- spcor.test(y,x1,x2)
> spr.1
    estimate   p.value statistic  n gp  Method
1 -0.4086619 0.2748117 -1.184655 10  1 pearson
> spr.2
   estimate  p.value statistic  n gp  Method
1 0.5646726 0.113182  1.810198 10  1 pearson
> spr.1$estimate^2
[1] 0.1670045
> spr.2$estimate^2
[1] 0.3188552
> 
> lm.tmp.7 <- lm(y ~ res.x2.x1, data = d)
> summary(lm.tmp.7)

Call:
lm(formula = y ~ res.x2.x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8617 -1.1712 -0.4940  0.5488  3.0771 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   8.0000     0.5589  14.314 5.54e-07 ***
res.x2.x1    -0.5447     0.4301  -1.266    0.241    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.767 on 8 degrees of freedom
Multiple R-squared:  0.167,	Adjusted R-squared:  0.06288 
F-statistic: 1.604 on 1 and 8 DF,  p-value: 0.241

> #######################################################
> 
> # get the common area that explain the y variable
> # 1.
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2537 -0.8881 -0.4851  0.4963  2.5920 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  10.7910     1.1195   9.639 1.12e-05 ***
x2           -0.8458     0.3117  -2.713   0.0265 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.397 on 8 degrees of freedom
Multiple R-squared:  0.4793,	Adjusted R-squared:  0.4142 
F-statistic: 7.363 on 1 and 8 DF,  p-value: 0.02651

> all.x2 <- summary(lm.y.x2)$r.squared
> sp.x2 <- spr.1$estimate^2
> all.x2
[1] 0.4792703
> sp.x2
[1] 0.1670045
> cma.1 <- all.x2 - sp.x2
> cma.1
[1] 0.3122658
> 
> # 2.
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311,	Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046

> all.x1 <- summary(lm.y.x1)$r.squared
> sp.x1 <- spr.2$estimate^2
> all.x1
[1] 0.631121
> sp.x1
[1] 0.3188552
> cma.2 <- all.x1 - sp.x1
> cma.2
[1] 0.3122658
> 
> # OR 3.
> summary(lm.y.x1x2)

Call:
lm(formula = y ~ x1 + x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2173 -0.5779 -0.1515  0.6642  1.1906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  6.399103   1.516539   4.220  0.00394 **
x1           0.011841   0.003561   3.325  0.01268 * 
x2          -0.544727   0.226364  -2.406  0.04702 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9301 on 7 degrees of freedom
Multiple R-squared:  0.7981,	Adjusted R-squared:  0.7404 
F-statistic: 13.84 on 2 and 7 DF,  p-value: 0.003696

> r2.y.x1x2 <- summary(lm.y.x1x2)$r.square
> r2.y.x1x2
[1] 0.7981255
> sp.x1
[1] 0.3188552
> sp.x2
[1] 0.1670045
> cma.3 <- r2.y.x1x2 - (sp.x1 + sp.x2)
> cma.3
[1] 0.3122658
> 
> cma.1
[1] 0.3122658
> cma.2
[1] 0.3122658
> cma.3
[1] 0.3122658
> # Note that sorting out unique and common
> # explanation area is only possible with 
> # semi-partial correlation determinant
> # NOT partial correlation determinant
> # because only semi-partial correlation
> # shares the same denominator (as total 
> # y).
> #############################################
> 
> 
>

====== Simple regression ======


> # multiple regression: a simple e.g.
> #
> #
> rm(list=ls())
> d <- read.csv("http://commres.net/wiki/_media/regression01-bankaccount.csv") 
> d
   bankaccount income famnum
1            6    220      5
2            5    190      6
3            7    260      3
4            7    200      4
5            8    330      2
6           10    490      4
7            8    210      3
8           11    380      2
9            9    320      1
10           9    270      3
> 
> colnames(d) <- c("y", "x1", "x2")
> d
    y  x1 x2
1   6 220  5
2   5 190  6
3   7 260  3
4   7 200  4
5   8 330  2
6  10 490  4
7   8 210  3
8  11 380  2
9   9 320  1
10  9 270  3
> # attach(d)
> lm.y.x1 <- lm(y ~ x1, data=d)
> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311,	Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046

단순회귀분석에서 (simple regression) F-test와 t-test는 (slope test) 기본적으로 똑 같은 테스트를 말한다. 왜냐하면 F-test에 기여하는 독립변인이 오직하나이고 그 하나가 slope test에 (t-test) 사용되기 때문이다. 이것은 t-test의 t값과 F-test의 F값의 관계에서도 나타난다. $$ t^2 = F $$


> t.cal <- 3.7 
> t.cal^2 
[1] 13.69
> F.cal <- 13.69
> F.cal
[1] 13.69

Simple regression에서 설명한 것처럼 기울기에 (slope) 대한 t-test는 기울기가 y 변인의 variability를 (평균을 중심으로 흔들림을) 설명하는 데 기여했는가를 테스트 하기 위한 것이다. 기울기가 0 이라면 이는 평균을 (평균선이 기울기가 0이다) 사용하는 것과 같으므로 기울기의 효과가 없음을 의미한다. 따라서 b와 b zero의 차이가 통계학적으로 의미있었는가를 t-test한다. $$ \text{t calculated value} = \frac {b - 0}{se} $$ 위에서 $se$는 아래처럼 구한다고 언급하였다. \begin{eqnarray*} se & = & \sqrt{\frac{1}{n-2} * \frac{\text{SSE}}{\text{SSx}}} \\ & = & \sqrt{\frac {\text{MSE}} {\text{SSx}}} \\ \text{note that MSE } & = & \text{mean square error } \\ & = & \text{ms.res } \end{eqnarray*} 위에서 구한 t값의 p value는 R에서


summary(lm.y.x1)
n <- length(y)
k <- 1 # num of predictor variables
sse <- sum(lm.y.x1$residuals^2) # ss.res
ssx1 <- sum((x1-mean(x1))^2)
b <- lm.y.x1$coefficient[2]
se <- sqrt((1/(n-2))*(sse/ssx1))
t.b.cal <- (b - 0) / se
t.b.cal
p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)
p.value 
# checck
t.b.cal
f.cal <- t.b.cal^2
f.cal
p.value


> summary(lm.y.x1)

Call:
lm(formula = y ~ x1, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.5189 -0.8969 -0.1297  1.0058  1.5800 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) 3.617781   1.241518   2.914  0.01947 * 
x1          0.015269   0.004127   3.700  0.00605 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.176 on 8 degrees of freedom
Multiple R-squared:  0.6311,	Adjusted R-squared:  0.585 
F-statistic: 13.69 on 1 and 8 DF,  p-value: 0.006046

> n <- length(y)
> k <- 1 # num of predictor variables
> sse <- sum(lm.y.x1$residuals^2)
> ssx1 <- sum((x1-mean(x1))^2)
> b <- lm.y.x1$coefficient[2]
> se <-sqrt((1/(n-2))*(sse/ssx1))
> se <-sqrt(mse/ssx1)
> t.b.cal <- (b - 0) / se
> t.b.cal
      x1 
3.699639 
> p.value <- 2 * pt(t.b.cal, n-k-1, lower.tail=F)
> 
> # checck
> t.b.cal
      x1 
3.699639 
> t.b.cal^2
      x1 
13.68733 
> p.value 
         x1 
0.006045749 
> 
>


> 
> lm.y.x2 <- lm(y ~ x2, data=d)
> summary(lm.y.x2)

Call:
lm(formula = y ~ x2, data = d)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2537 -0.8881 -0.4851  0.4963  2.5920 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  10.7910     1.1195   9.639 1.12e-05 ***
x2           -0.8458     0.3117  -2.713   0.0265 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.397 on 8 degrees of freedom
Multiple R-squared:  0.4793,	Adjusted R-squared:  0.4142 
F-statistic: 7.363 on 1 and 8 DF,  p-value: 0.02651
>
>