====== Code ======
You need to install some packages first.
<code>
install.packages("ggplot2")
</code>
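Installing is a one-time step; loading happens once per R session. A minimal reminder (ggplot2 is the only package the code below visibly loads):

<code>
# install once ...
# install.packages("ggplot2")
# ... then load in each new R session
library(ggplot2)
</code>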
<code>
################
rnorm2 <- function(n, mean, sd) { mean + sd * scale(rnorm(n)) }
n <- 16
set.seed(101)

points(rand.00,
b <- 170 / 265
b <- round(b,1)
b
y <- b * x + rand.00
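
# Since y is built as b*x + noise, the least-squares slope should
# land near b; a quick preview of the target value (covariance
# over variance, the same quantities computed later in this note):
cov(x, y) / var(x)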
lm.y.x <- lm(y ~ x, data = df)
summary(lm.y.x)

###########################
# double check with this
###########################
str(lm.y.x)
y.res <- lm.y.x$residuals
y.pre <- lm.y.x$fitted.values
y.exp <- y.pre - m.y
plot(x, y.res)
plot(x, y.pre)
plot(x, y.exp)
###########################
lm.y.x$coefficients
intercept <- lm.y.x$coefficients[1]
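
# Optional check: mean + explained + residual reassembles y exactly
# (all.equal() tolerates floating-point noise)
all.equal(as.numeric(y), as.numeric(m.y + y.exp + y.res))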
b
a
# we got these from the above
slope
intercept
summary(lm.y.x)
lm.y.x$coefficients
# str(lm.y.x)
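
# Optional: coef() is the idiomatic accessor for the same two numbers
coef(lm.y.x)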
r.square <- ss.reg / ss.y
r.square
sqrt(r.square)
cor(x,y)
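
# Optional check: R^2 straight from the fitted model matches
# ss.reg/ss.y, and its square root matches cor(x,y)
summary(lm.y.x)$r.squared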

df.tot <- df.y
res <- lm.y.x$residuals
reg <- lm.y.x$fitted.values-m.y

# The above can be derived as follows
y.hat <- intercept + (slope * x)
y.hat
head(y.hat)
head(lm.y.x$fitted.values)
y.pre <- y.hat
y.res <- y - y.pre
y.res
head(res)
head(y.res)

y.reg <- y.pre - m.y
head(reg)
head(y.reg)
########################################
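
# A minimal sketch of the two auxiliary models summarized below
# (regress the explained part, then the residuals, on x):
lm.reg.x <- lm(reg ~ x)
lm.res.x <- lm(res ~ x)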
# This is essentially a perfect linear
# relationship, so R warns about the fit
summary(lm.reg.x)
summary(lm.res.x)
summary(lm.y.x)
########################################
# Now let's get back to the lm output
# this is to evaluate the significance of
# the slope of x (b)
summary(lm.y.x)

# the slope, we already got this
b <- slope
# intercept
a <- intercept

# the real data look like this
plot(x,y)
abline(h = mean(y), lwd=1.5, col="red")
abline(lm(y ~ x), lwd=1.5, col="blue")

# the blue line is the regression line
# its slope is an estimate of a real
# (population) value rather than of the
# sample we are using right now
library(ggplot2)

ggplot(data = df, aes(x,y)) +
  geom_point() +
  geom_smooth(method="lm")

# the real slope should reside within
# the violet area; the area is estimated
# with the standard error of the slope
# (the regression coefficient)

# in words:
# the se for b is read
# from summary(lm.y.x)

summary(lm.y.x)
# NB: 0.0401 here is a leftover from an earlier run with n = 400;
# the summary above reports 0.2071 for this fit
se.b <- 0.0401
ci.se.b <- 1.96 * se.b
# this is the lowest estimate
b - ci.se.b
# the highest estimate
b + ci.se.b
# calculated estimate
b
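
# Optional check: confint() computes the same kind of interval
# directly (using the exact t quantile rather than 1.96)
confint(lm.y.x, level = 0.95)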

# how to get se for b?
# https://
# see also http://

ss.res
ss.x
# we didn't compute ss.x above; get it now
ss.x <- sum((x-m.x)^2)
ss.x

sqrt( (ss.res/(n-2)) / ss.x )
# the above is the same as the last line below; note that without
# parentheses around (n-2)*ss.x, R multiplies by ss.x instead
sqrt( ss.res / (n-2)*ss.x )
sqrt( 1/(n-2) * (ss.res/ss.x) )
se.b <- sqrt( 1/(n-2) * (ss.res/ss.x) )
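
# Optional check: the same se can be read off the coefficient table
summary(lm.y.x)$coefficients["x", "Std. Error"]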

# now
summary(lm.y.x)
# if b is 0, b does not do anything (the null hypothesis)
# To test if b is working, we do a t-test:
# (b (current coefficient) - 0 (not working)) / se
# = t.calculated. We test if this is significant with
# 2 * pt(t.b.cal, df = n-2, lower.tail = F)

t.b.cal <- (b - 0)/se.b
t.b.cal
# k = num of predictor variables
k <- 1
df.t <- n-k-1

2*pt(t.b.cal, df.t, lower.tail = F)
pf(t.b.cal^2, 1, df.t, lower.tail = F)
</code>
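The last two lines agree because, with a single predictor, the squared t statistic for the slope equals the model F statistic. A quick check (t.b.cal from above; f.cal is the F ratio computed in the ANOVA part of the code):

<code>
t.b.cal^2   # squared t for the slope
f.cal       # the F value; same number, hence the same p-value
</code>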
====== Output ======
<code>
> ################
> rnorm2 <- function(n, mean, sd) { mean + sd * scale(rnorm(n)) }
> n <- 16
> 
> set.seed(101)
> points(rand.00,
> 
> b <- 170 / 265
> b <- round(b,1)
> b
[1] 0.6
> y <- b * x + rand.00
> 
> head(df)
         x        y
1 256.4615 144.9723
2 273.7812 167.6050
3 249.5828 141.2775
4 267.1155 135.1836
5 269.0162 161.7509
6 286.0342 183.7182
> 
> # plot method 0
Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962  0.35207   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,  Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727
> 
> ###########################
> # double check with this
> ###########################
> str(lm.y.x)
List of 12
 $ coefficients : Named num [1:2] -52.9 0.8
  ..- attr(*, "names")= chr [1:2] "(Intercept)" "x"
 $ residuals    : Named num [1:16] -7.2 1.58 -5.4 -25.51 -0.46 ...
  ..- attr(*, "names")= chr [1:16] "1" "2" "3" "4" ...
 $ effects      : Named num [1:16] -636 46.4 ...
  ..- attr(*, "names")= chr [1:16] "(Intercept)" "x" "" "" ...
 $ rank         : int 2
 $ fitted.values: Named num [1:16] 152 166 147 161 162 ...
  ..- attr(*, "names")= chr [1:16] "1" "2" "3" "4" ...
 $ assign       : int [1:2] 0 1
 $ qr           :List of 5
  ..$ qr   : num [1:16, 1:2] -4 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 ...
  .. ..- attr(*, "dimnames")=List of 2
  .. .. ..$ : chr [1:16] "1" "2" "3" "4" ...
  .. .. ..$ : chr [1:2] "(Intercept)" "x"
  .. ..- attr(*, "assign")= int [1:2] 0 1
  ..$ qraux: num [1:2] 1.25 1.18
  ..$ pivot: int [1:2] 1 2
  ..$ tol  : num 1e-07
  ..$ rank : int 2
  ..- attr(*, "class")= chr "qr"
 $ df.residual  : int 14
 $ xlevels      : Named list()
 $ call         : language lm(formula = y ~ x, data = df)
 $ terms        :Classes 'terms', 'formula'  language y ~ x
  .. ..- attr(*, "variables")= language list(y, x)
  .. ..- attr(*, "factors")= int [1:2, 1:1] 0 1
  .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. ..$ : chr [1:2] "y" "x"
  .. .. .. ..$ : chr "x"
  .. ..- attr(*, "term.labels")= chr "x"
  .. ..- attr(*, "order")= int 1
  .. ..- attr(*, "intercept")= int 1
  .. ..- attr(*, "response")= int 1
  .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
  .. ..- attr(*, "predvars")= language list(y, x)
  .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
  .. .. ..- attr(*, "names")= chr [1:2] "y" "x"
 $ model        :'data.frame':	16 obs. of  2 variables:
  ..$ y: num [1:16] 145 168 141 135 162 ...
  ..$ x: num [1:16] 256 274 250 267 269 ...
  ..- attr(*, "terms")=Classes 'terms', 'formula'  language y ~ x
  .. .. ..- attr(*, "variables")= language list(y, x)
  .. .. ..- attr(*, "factors")= int [1:2, 1:1] 0 1
  .. .. .. ..- attr(*, "dimnames")=List of 2
  .. .. .. .. ..$ : chr [1:2] "y" "x"
  .. .. .. .. ..$ : chr "x"
  .. .. ..- attr(*, "term.labels")= chr "x"
  .. .. ..- attr(*, "order")= int 1
  .. .. ..- attr(*, "intercept")= int 1
  .. .. ..- attr(*, "response")= int 1
  .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
  .. .. ..- attr(*, "predvars")= language list(y, x)
  .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
  .. .. .. ..- attr(*, "names")= chr [1:2] "y" "x"
 - attr(*, "class")= chr "lm"
> y.res <- lm.y.x$residuals
> y.pre <- lm.y.x$fitted.values
> y.exp <- y.pre - m.y
> plot(x, y.res)
> plot(x, y.pre)
> plot(x, y.exp)
> ###########################
> lm.y.x$coefficients
(Intercept)           x 
-52.8817788   0.7995539 
> intercept <- lm.y.x$coefficients[1]
> slope <- lm.y.x$coefficients[2]
> intercept
(Intercept) 
  -52.88178 
> slope
        x 
0.7995539 
> 
> # library(ggplot2)
> ss.y <- sum((y-mean(y))^2)
> ss.y
[1] 4183.193
> df.y <- length(y)-1
> df.y
[1] 15
> ss.y/df.y
[1] 278.8795
> var(y)
         [,1]
[1,] 278.8795
> 
> m.x <- mean(x)
> df.tot <- n-1
> sp/df.tot
[1] 179.8996
> cov.xy <- cov(x,y)
> 
> r.xy
          [,1]
[1,] 0.7181756
> cor(x,y)
          [,1]
[1,] 0.7181756
> cor.test(x,y)
data:  x and y
t = 3.8616, df = 14, p-value = 0.001727
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.3454526 0.8951915
sample estimates:
      cor 
0.7181756 
> 
> b
          [,1]
[1,] 0.7995539
> a
          [,1]
[1,] -52.88178
> # we got these from the above
> slope
        x 
0.7995539 
> intercept
(Intercept) 
  -52.88178 
> 
> 
Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962  0.35207   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,  Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727
> lm.y.x$coefficients
(Intercept)           x 
-52.8817788   0.7995539 
> # str(lm.y.x)
> 
> y <- y
> m.y
[1] 159
> 
> res <- y - y.pred
> reg <- y.pred - m.y
> ss.y
[1] 4183.193
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> ss.y
[1] 4183.193
> ss.res
[1] 2025.602
> ss.reg
[1] 2157.592
> ss.res + ss.reg
[1] 4183.193
> 
> r.square <- ss.reg / ss.y
> r.square
[1] 0.5157762
> sqrt(r.square)
[1] 0.7181756
> cor(x,y)
          [,1]
[1,] 0.7181756
> 
> df.tot <- df.y
[1] 1
> df.res
[1] 14
> df.tot
[1] 15
> 
> ss.tot <- ss.y
> ss.reg
[1] 2157.592
> ss.res
[1] 2025.602
> ss.tot
[1] 4183.193
> 
> ms.reg <- ss.reg / df.reg
> ms.res <- ss.res / df.res
> ms.reg
[1] 2157.592
> ms.res
[1] 144.6858
> 
> f.cal <- ms.reg/ms.res
> f.cal
[1] 14.91225
> 
> p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
> p.val
[1] 0.001727459
> anova(lm.y.x)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value   Pr(>F)   
x          1 2157.6 2157.59  14.912 0.001727 **
Residuals 14 2025.6  144.69                    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> res <- lm.y.x$residuals
> reg <- lm.y.x$fitted.values-m.y
> 
> # The above can be derived as follows
> y.hat <- intercept + (slope * x)
> y.hat
          [,1]
 [1,] 152.1730
 [2,] 166.0210
 [3,] 146.6731
 [4,] 160.6914
 [5,] 162.2112
 [6,] 175.8179
 [7,] 167.0666
 [8,] 155.5354
 [9,] 171.7678
[10,] 153.7931
[11,] 165.6110
[12,] 144.7831
[13,] 179.8185
[14,] 134.1906
[15,] 153.5815
[16,] 154.2648
attr(,"scaled:center")
[1] 0.1070574
attr(,"scaled:scale")
[1] 0.7608404
> head(y.hat)
         [,1]
[1,] 152.1730
[2,] 166.0210
[3,] 146.6731
[4,] 160.6914
[5,] 162.2112
[6,] 175.8179
> head(lm.y.x$fitted.values)
       1        2        3        4        5        6 
152.1730 166.0210 146.6731 160.6914 162.2112 175.8179 
> y.pre <- y.hat
> y.res <- y - y.pre
> y.res
             [,1]
 [1,]  -7.2007773
 [2,]   1.5840427
 [3,]  -5.3956693
 [4,] -25.5078178
 [5,]  -0.4602376
 [6,]   7.9003233
 [7,]         ...
 [8,] -16.3176727
 [9,]         ...
[10,] -15.1613471
[11,]         ...
[12,]         ...
[13,]         ...
[14,]  15.4541201
[15,]  15.9625789
[16,]         ...
attr(,"scaled:center")
[1] 0.1070574
attr(,"scaled:scale")
[1] 0.7608404
> head(res)
         1          2          3           4          5          6 
-7.2007773  1.5840427 -5.3956693 -25.5078178 -0.4602376  7.9003233 
> head(y.res)
            [,1]
[1,]  -7.2007773
[2,]   1.5840427
[3,]  -5.3956693
[4,] -25.5078178
[5,]  -0.4602376
[6,]   7.9003233
> 
> y.reg <- y.pre - m.y
> head(reg)
         1          2          3          4          5          6 
 -6.826963   7.021027 -12.326872   1.691437   3.211209  16.817938 
> head(y.reg)
           [,1]
[1,]  -6.826963
[2,]   7.021027
[3,] -12.326872
[4,]   1.691437
[5,]   3.211209
[6,]  16.817938
> ########################################
> 
> # This is essentially a perfect linear
> # relationship, so R warns about the fit
> summary(lm.reg.x)

Residuals:
       Min         1Q     Median         3Q        Max 
-1.216e-14 -9.993e-15 -2.381e-15  7.912e-15        ...

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.119e+02       ...     ...   <2e-16 ***
x            7.996e-01       ...     ...   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: ... on 14 degrees of freedom
Multiple R-squared:      1,  Adjusted R-squared:      1 
F-statistic: ... on 1 and 14 DF,  p-value: < 2.2e-16

Warning message:
In summary.lm(lm.reg.x) :
  essentially perfect fit: summary may be unreliable
> 
> # The relationship between residuals and
Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.956e-14  5.495e+01       0        1
x                  ...  2.071e-01       0        1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  ...,  Adjusted R-squared:  -0.07143 
F-statistic: ... on 1 and 14 DF,  p-value: 1
> summary(lm.y.x)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962  0.35207   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,  Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727
> 
> ########################################
> # Now let's get back to the lm output
> # this is to evaluate the significance of
> # the slope of x (b)
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962  0.35207   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,  Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> 
> # the slope, we already got this
> b <- slope
> # intercept
> a <- intercept
> 
> # the real data look like this
> plot(x,y)
> abline(h = mean(y), lwd=1.5, col="red")
> abline(lm(y ~ x), lwd=1.5, col="blue")
> 
> # the blue line is the regression line
> # its slope is an estimate of a real
> # (population) value rather than of the
> # sample we are using right now
> library(ggplot2)
> 
> ggplot(data = df, aes(x,y)) +
+   geom_point() +
+   geom_smooth(method="lm")
`geom_smooth()` using formula = 'y ~ x'
> 
> # the real slope should reside within
> # the violet area; the area is estimated
> # with the standard error of the slope
> # (the regression coefficient)
> 
> # in words:
> # the se for b is read
> # from summary(lm.y.x)
> 
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962  0.35207   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,  Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> # NB: 0.0401 here is a leftover from an earlier run with n = 400;
> # the summary above reports 0.2071 for this fit
> se.b <- 0.0401
> ci.se.b <- 1.96 * se.b
> # this is the lowest estimate
> b - ci.se.b
        x 
0.7209579 
> # the highest estimate
> b + ci.se.b
        x 
0.8781499 
> # calculated estimate
> b
        x 
0.7995539 
> 
> # how to get se for b?
> # https://
> # see also http://
> 
> ss.res
[1] 2025.602
> ss.x
[1] 3375
> # we didn't compute ss.x above; get it now
> ss.x <- sum((x-m.x)^2)
> ss.x
[1] 3375
> 
> sqrt( (ss.res/(n-2)) / ss.x )
[1] 0.2070504
> # the above is the same as the last line below; note that without
> # parentheses around (n-2)*ss.x, R multiplies by ss.x instead
> sqrt( ss.res / (n-2)*ss.x )
[1] 698.7952
> sqrt( 1/(n-2) * (ss.res/ss.x) )
[1] 0.2070504
> se.b <- sqrt( 1/(n-2) * (ss.res/ss.x) )
> 
> # now
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-25.508  -5.847   2.617   7.595  15.963 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962  0.35207   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,  Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> # if b is 0, b does not do anything (the null hypothesis)
> # To test if b is working, we do a t-test:
> # (b (current coefficient) - 0 (not working)) / se
> # = t.calculated. We test if this is significant with
> # 2 * pt(t.b.cal, df = n-2, lower.tail = F)
> 
> t.b.cal <- (b - 0)/se.b
> t.b.cal
       x 
3.861639 
> # k = num of predictor variables
> k <- 1
> df.t <- n-k-1
> 
> 2*pt(t.b.cal, df.t, lower.tail = F)
          x 
0.001727459 
> pf(t.b.cal^2, 1, df.t, lower.tail = F)
          x 
0.001727459 
> 
</code>
====== Graphics output ======
(The plot images embedded here show the output of the plotting calls above: the simulated data, the residual, fitted, and explained components against x, the scatter of x and y with the mean and regression lines, and the ggplot2 fit with its confidence band.)