c:ms:2023:schedule:w11.lecture.note
Differences
This shows you the differences between two versions of the page.
Next revision | Previous revision | ||
c:ms:2023:schedule:w11.lecture.note [2023/05/15 15:08] – created hkimscil | c:ms:2023:schedule:w11.lecture.note [2023/05/24 08:37] (current) – hkimscil | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== 하나의 독립변인이 갖는 고유의 영향력 혹은 설명력 파악하기 ====== | ||
+ | [[:multiple regression]] 참조 | ||
+ | [[:partial and semipartial correlation]] 참조 | ||
< | < | ||
datavar <- read.csv(" | datavar <- read.csv(" | ||
Line 6: | Line 9: | ||
attach(datavar) | attach(datavar) | ||
+ | library(ggplot2) # ggplot 사용을 위해서 | ||
+ | # install.packages(" | ||
lm.y.x1 <- lm(y~x1, data=datavar) | lm.y.x1 <- lm(y~x1, data=datavar) | ||
summary(lm.y.x1) | summary(lm.y.x1) | ||
+ | str(lm.y.x1) | ||
intercept.y <- lm.y.x1$coefficients[1] | intercept.y <- lm.y.x1$coefficients[1] | ||
slope.x <- lm.y.x1$coefficients[2] | slope.x <- lm.y.x1$coefficients[2] | ||
intercept.y | intercept.y | ||
slope.x | slope.x | ||
- | ggplot(data=datavar, | + | ggplot(data=datavar, |
geom_point(color=" | geom_point(color=" | ||
- | geom_hline(aes(yintercept=mean(bankaccount)), | + | geom_hline(aes(yintercept=mean(y)), size=1.5, color=" |
geom_abline(intercept=intercept.y, | geom_abline(intercept=intercept.y, | ||
+ | # recapping | ||
+ | ss.tot <- var(y)*(length(y)-1) | ||
+ | ss.tot | ||
+ | ss.res <- sum(lm.y.x1$residuals^2) | ||
+ | ss.res | ||
+ | ss.reg <- sum((lm.y.x1$fitted.values-mean(y))^2) | ||
+ | ss.reg | ||
+ | ss.tot | ||
+ | ss.res + ss.reg | ||
+ | |||
+ | summary(lm(lm.y.x1$residuals~x1)) | ||
+ | summary(lm((lm.y.x1$fitted.values~x1))) | ||
+ | summary(lm((lm.y.x1$fitted.values-mean(y))~x1)) | ||
+ | # recapping ends | ||
lm.y.x2 <- lm(y~x2, data=datavar) | lm.y.x2 <- lm(y~x2, data=datavar) | ||
Line 27: | Line 46: | ||
intercept.y | intercept.y | ||
slope.x | slope.x | ||
- | ggplot(data=datavar, | + | ggplot(data=datavar, |
geom_point(color=" | geom_point(color=" | ||
- | geom_hline(aes(yintercept=mean(bankaccount)), | + | geom_hline(aes(yintercept=mean(y)), |
geom_abline(intercept=intercept.y, | geom_abline(intercept=intercept.y, | ||
+ | |||
lm.y.x1x2 <- lm(y~x1+x2, data=datavar) | lm.y.x1x2 <- lm(y~x1+x2, data=datavar) | ||
summary(lm.y.x1x2) | summary(lm.y.x1x2) | ||
- | res.y.minus.x2 <- resid(lm.y.x2) # y - x2 | + | ################################## |
+ | res.lm.y.x2 <- resid(lm.y.x2) # y - x2 | ||
+ | # x2를 제외하기 위한 계산 | ||
lm.x1.x2 <- lm(x1~x2, data=datavar) | lm.x1.x2 <- lm(x1~x2, data=datavar) | ||
- | res.x1.minus.x2 <- resid(lm.x1.x2) # x1 - x2 | + | res.lm.x1.x2 <- resid(lm.x1.x2) # x1 - x2 |
- | lm(res.y.minus.x2~res.x1.minus.x2) | + | lm(res.lm.y.x2~res.lm.x1.x2) |
- | summary(lm(res.y.minus.x2~res.x1.minus.x2)) # x2를 y, x1에서 모두 제외한 분석 | + | summary(lm(res.lm.y.x2~res.lm.x1.x2)) # x2를 y, x1에서 모두 제외한 분석 |
# or | # or | ||
- | cor(res.y.minus.x2, res.x1.minus.x2) | + | cor(res.lm.y.x2, res.lm.x1.x2) |
- | cor(res.y.minus.x2, res.x1.minus.x2)^2 | + | # cor제곱 값은 x2의 설명력을 제어 혹은 제외한 후에 보는 |
- | summary(lm(res.y.minus.x2~res.x1.minus.x2))$r.squared | + | # y와 x1간의 r 제곱값이다 |
+ | cor(res.lm.y.x2, res.lm.x1.x2)^2 | ||
+ | summary(lm(res.lm.y.x2~res.lm.x1.x2))$r.squared | ||
- | ################## | + | ############################ |
- | install.packages(" | + | # http:// |
+ | # install.packages(" | ||
library(ppcor) | library(ppcor) | ||
pcor(datavar) | pcor(datavar) | ||
- | y | + | datavar |
- | x1 | + | pcor.test(y, |
- | x2 | + | |
- | datavar | + | |
- | pcor.test(y, | + | |
summary(lm(res.y.minus.x2~res.x1.minus.x2)) | summary(lm(res.y.minus.x2~res.x1.minus.x2)) | ||
- | r.sq <- (summary(lm(res.lm.ba.fn~res.lm.inc.fn)))$r.squared | + | r.sq <- (summary(lm(res.lm.y.x2~res.lm.x1.x2)))$r.squared |
+ | r.sq | ||
sqrt(r.sq) | sqrt(r.sq) | ||
Line 77: | Line 100: | ||
common.area | common.area | ||
- | y.x1 <- lm(y~x1, data=datavar) | + | # common.area가 맞는지 확인 |
- | y.x2 <- lm(y~x2, data=datavar) | + | # b + common.area는 |
- | bd <- summary(y.x1)$r.squared | + | summary(lm.y.x1)$r.squared |
- | cd <- summary(y.x2)$r.squared | + | b + common.area |
- | bd | + | # 마찬가지로 c + common.area should be the same as |
- | cd | + | # summary(lm.y.x2)$r.squared |
- | bd+cd | + | summary(lm.y.x2)$r.squared |
- | bd+cd-common.area | + | c + common.area |
- | summary(lm.y.x1x2)$r.squared | + | </code> |
+ | |||
+ | ====== 기울기 b에 대한 통계학적인 테스트 ====== | ||
+ | [[:multiple regression]]에서 앞 쪽에 기울기에 대한 테스트 부분 | ||
+ | regression 문서에서 [[: | ||
+ | |||
+ | < | ||
+ | ################################################ | ||
+ | # standard error of coefficient의 의미 | ||
+ | # b의 사선을 중심으로 에러의 분포가 정규분포를 | ||
+ | # 이룬다고 할 때, 그 때의 standard error 값. | ||
+ | # 따라서, fitted.value +- 2 * se 값이 에러의 | ||
+ | # 범위를 (range) 알려주는 방법. | ||
+ | # 단, 2 = t critical value로 대체해야 | ||
+ | ################################################ | ||
+ | |||
+ | summary(lm.y.x1x2) | ||
+ | mean(y) | ||
+ | mean(x1) | ||
+ | mean(x2) | ||
+ | intercept <- lm.y.x1x2$coefficients[1] | ||
+ | b1 <- lm.y.x1x2$coefficients[2] | ||
+ | b2 <- lm.y.x1x2$coefficients[3] | ||
+ | intercept | ||
+ | b1 | ||
+ | b2 | ||
+ | |||
+ | ggplot(datavar,aes(x = x1, y = y)) + geom_point() + geom_smooth(method = " | ||
+ | ggplot(datavar,aes(x = x2, y = y)) + geom_point() + geom_smooth(method = " | ||
+ | |||
+ | summary(lm.y.x1) | ||
+ | a <- lm.y.x1$coefficients[1] | ||
+ | b <- lm.y.x1$coefficients[2] | ||
+ | # y hat = a + b x1 | ||
+ | # x1 = 200 ? | ||
+ | p <- as.data.frame(200) | ||
+ | p | ||
+ | colnames(p) <- " | ||
+ | p | ||
+ | |||
+ | predict(lm.y.x1, | ||
+ | |||
+ | |||
+ | lm.y.x1$coefficients | ||
+ | pred <- predict(lm.y.x1, | ||
+ | summary(lm.y.x1) | ||
+ | t.critical <- qt(.975, | ||
+ | t.critical | ||
+ | |||
+ | (summary(lm.y.x1))$coefficients | ||
+ | # coefficient 값 중 독립변인의 se값은 $coefficient[4] | ||
+ | # 절편의 se는 [3] | ||
+ | |||
+ | se.x1 <- (summary(lm.y.x1))$coefficients[4] | ||
+ | se.x1 | ||
+ | pred.range.down | ||
+ | pred.range.up <- pred + (t.critical*se.x1) | ||
+ | pred.range.down | ||
+ | pred.range.up | ||
+ | |||
+ | summary(lm.y.x2) | ||
+ | a <- lm.y.x2$coefficients[1] | ||
+ | b <- lm.y.x2$coefficients[2] | ||
+ | a | ||
+ | b | ||
+ | |||
+ | # y hat = a + b x2 | ||
+ | # x2 = 5 명일때? | ||
+ | p <- as.data.frame(5) | ||
+ | colnames(p) <- " | ||
+ | |||
+ | lm.y.x2$coefficients | ||
+ | pred <- predict(lm.y.x2, | ||
+ | pred | ||
+ | |||
+ | t.critical <- qt(.975, | ||
+ | t.critical | ||
+ | se.x2 <- (summary(lm.y.x2))$coefficients[4] | ||
+ | se.x2 | ||
+ | |||
+ | lim <- t.critical*se.x2 | ||
+ | |||
+ | pred.range.down <- pred-lim | ||
+ | pred.range.up <- pred+lim | ||
+ | pred.range.down | ||
+ | pred.range.up | ||
+ | # or | ||
+ | pred.range <- pred + c(-lim, lim) | ||
+ | pred.range | ||
+ | |||
+ | |||
+ | # Note that the range depends upon the se value | ||
+ | summary(lm.y.x1) | ||
+ | summary(lm.y.x2) | ||
+ | summary(lm.y.x1x2) | ||
+ | |||
+ | |||
+ | dvar <- read.csv(" | ||
+ | mod1 <- lm(api00 ~ ell + acs_k3 + avg_ed + meals, data=dvar) | ||
+ | mod2 <- lm(api00 ~ ell + avg_ed + meals + acs_k3, data=dvar) | ||
+ | mod3 <- lm(api00 ~ meals + acs_k3 + avg_ed + ell, data=dvar) | ||
+ | summary(mod1) | ||
+ | summary(mod2) | ||
+ | summary(mod3) | ||
+ | anova(mod1) | ||
+ | anova(mod2) | ||
+ | anova(mod3) | ||
</ | </ |
c/ms/2023/schedule/w11.lecture.note.1684130887.txt.gz · Last modified: 2023/05/15 15:08 by hkimscil