c:ms:regression_lecture_note_for_r
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
c:ms:regression_lecture_note_for_r [2024/05/29 07:31] – hkimscil | c:ms:regression_lecture_note_for_r [2024/06/06 17:07] (current) – [Graphics output] hkimscil | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | ====== Code ====== | ||
+ | You need to install some packages | ||
+ | < | ||
+ | install.packages(" | ||
+ | install.packages(" | ||
+ | install.packages(" | ||
+ | |||
+ | </ | ||
< | < | ||
################ | ################ | ||
rnorm2 <- function(n, | rnorm2 <- function(n, | ||
- | n <- 400 | + | n <- 16 |
set.seed(101) | set.seed(101) | ||
Line 12: | Line 20: | ||
points(rand.00, | points(rand.00, | ||
- | b = 170 / 265 | + | b <- 170 / 265 |
+ | b <- round(b,1) | ||
b | b | ||
y <- b * x + rand.00 | y <- b * x + rand.00 | ||
Line 30: | Line 39: | ||
lm.y.x <- lm(y ~ x, data = df) | lm.y.x <- lm(y ~ x, data = df) | ||
summary(lm.y.x) | summary(lm.y.x) | ||
- | # str(lm.y.x) | + | |
+ | ########################### | ||
+ | # double check with this | ||
+ | ########################### | ||
+ | str(lm.y.x) | ||
+ | y.res <- lm.y.x$residuals | ||
+ | y.pre <- lm.y.x$fitted.values | ||
+ | y.exp <- y.pre - m.y | ||
+ | plot(x, | ||
+ | plot(x, | ||
+ | plot(x, | ||
+ | ########################### | ||
lm.y.x$coefficients | lm.y.x$coefficients | ||
intercept <- lm.y.x$coefficients[1] | intercept <- lm.y.x$coefficients[1] | ||
Line 74: | Line 94: | ||
b | b | ||
a | a | ||
+ | # we got these from the above | ||
+ | slope | ||
+ | intercept | ||
summary(lm.y.x) | summary(lm.y.x) | ||
lm.y.x$coefficients | lm.y.x$coefficients | ||
- | |||
# str(lm.y.x) | # str(lm.y.x) | ||
Line 97: | Line 119: | ||
r.square <- ss.reg / ss.y | r.square <- ss.reg / ss.y | ||
r.square | r.square | ||
+ | sqrt(r.square) | ||
+ | cor(x,y) | ||
df.tot <- df.y | df.tot <- df.y | ||
Line 130: | Line 153: | ||
res <- lm.y.x$residuals | res <- lm.y.x$residuals | ||
reg <- lm.y.x$fitted.values-m.y | reg <- lm.y.x$fitted.values-m.y | ||
+ | |||
+ | # The above can be derived as follows | ||
+ | y.hat <- intercept + (slope * x) | ||
+ | y.hat | ||
+ | head(y.hat) | ||
+ | head(lm.y.x$fitted.values) | ||
+ | y.pre <- y.hat | ||
+ | y.res <- y - y.pre | ||
+ | y.res | ||
+ | head(res) | ||
+ | head(y.res) | ||
+ | |||
+ | y.reg <- y.pre - m.y | ||
+ | head(reg) | ||
+ | head(y.reg) | ||
+ | ######################################## | ||
# This is essentially a perfect linear | # This is essentially a perfect linear | ||
Line 142: | Line 181: | ||
summary(lm.res.x) | summary(lm.res.x) | ||
summary(lm.y.x) | summary(lm.y.x) | ||
+ | |||
+ | ######################################## | ||
+ | # Now let's get back to the lm output | ||
+ | # this is to evalue the significance of | ||
+ | # the slope of x (b) | ||
+ | summary(lm.y.x) | ||
+ | |||
+ | # the slope, we already got this | ||
+ | b <- slope | ||
+ | # intercept | ||
+ | a <- intercept | ||
+ | |||
+ | # the real data look like this | ||
+ | plot(x,y) | ||
+ | abline(h = mean(y), lwd=1.5, col=" | ||
+ | abline(lm(y ~ x), lwd=1.5, col=" | ||
+ | |||
+ | # blue line is regression line | ||
+ | # this slope is an estimate of a real | ||
+ | # thing (population) rather than the | ||
+ | # sample we are using right now) | ||
+ | library(ggplot2) | ||
+ | |||
+ | ggplot(data = df, aes(x,y)) + | ||
+ | geom_point() + | ||
+ | geom_smooth(method=" | ||
+ | |||
+ | # the real slope should reside in between | ||
+ | # the violet area. The area is estimated | ||
+ | # with standard error of slop (regression | ||
+ | # coefficient) | ||
+ | |||
+ | # in word | ||
+ | # se for b is | ||
+ | # from summary(lm.y.x) | ||
+ | |||
+ | summary(lm.y.x) | ||
+ | se.b <- 0.0401 | ||
+ | ci.se.b <- 1.96 * se.b | ||
+ | # this is the lowest estimate | ||
+ | b - ci.se.b | ||
+ | # the highest estimate | ||
+ | b + ci.se.b | ||
+ | # calculated estimate | ||
+ | b | ||
+ | |||
+ | # how to get se for b? | ||
+ | # https:// | ||
+ | # see also http:// | ||
+ | |||
+ | ss.res | ||
+ | ss.x | ||
+ | # we didn't get ss.x | ||
+ | ss.x <- sum((x-m.x)^2) | ||
+ | ss.x | ||
+ | |||
+ | sqrt( (ss.res/ | ||
+ | # the above is the same as the below | ||
+ | sqrt( ss.res / (n-2)*ss.x ) | ||
+ | sqrt( 1/(n-2) * (ss.res/ | ||
+ | se.b <- sqrt( 1/(n-2) * (ss.res/ | ||
+ | |||
+ | # now | ||
+ | summary(lm.y.x) | ||
+ | # if b is 0, b does not do anything (null hypothesis) | ||
+ | # To test if b is working, we do t-test by | ||
+ | # b (current coefficient) - 0 (not working) / se | ||
+ | # = t.calculated. We test if this is significant | ||
+ | # tp(t.b.cal, df=n-2) | ||
+ | |||
+ | t.b.cal <- (b - 0)/ | ||
+ | t.b.cal | ||
+ | # k = num of predictor variables | ||
+ | k <- 1 | ||
+ | df.t <- n-k-1 | ||
+ | |||
+ | 2*pt(t.b.cal, | ||
+ | pf(t.b.cal^2, | ||
+ | </ | ||
+ | |||
+ | ====== Output ====== | ||
+ | < | ||
+ | > ################ | ||
+ | > rnorm2 <- function(n, | ||
+ | > n <- 16 | ||
+ | > | ||
+ | > set.seed(101) | ||
+ | > x <- rnorm2(n, 265, 15) | ||
+ | > rand.00 <- rnorm2(n, 0, 12) | ||
+ | > | ||
+ | > rand.01 <- rnorm2(n, 0, 240) | ||
+ | > plot(rand.01) | ||
+ | > points(rand.00, | ||
+ | > | ||
+ | > b <- 170 / 265 | ||
+ | > b <- round(b,1) | ||
+ | > b | ||
+ | [1] 0.6 | ||
+ | > y <- b * x + rand.00 | ||
+ | > | ||
+ | > df <- data.frame(x, | ||
+ | > head(df) | ||
+ | | ||
+ | 1 256.4615 144.9723 | ||
+ | 2 273.7812 167.6050 | ||
+ | 3 249.5828 141.2775 | ||
+ | 4 267.1155 135.1836 | ||
+ | 5 269.0162 161.7509 | ||
+ | 6 286.0342 183.7182 | ||
+ | > | ||
+ | > # plot method 0 | ||
+ | > plot(x,y) | ||
+ | > | ||
+ | > # method 1 | ||
+ | > plot(x, y, pch = 1, cex = 1, col = " | ||
+ | > abline(h = mean(y), lwd=1.5, col=" | ||
+ | > abline(lm(y ~ x), lwd=1.5, col=" | ||
+ | > | ||
+ | > # method 2 | ||
+ | > lm.y.x <- lm(y ~ x, data = df) | ||
+ | > summary(lm.y.x) | ||
+ | |||
+ | Call: | ||
+ | lm(formula = y ~ x, data = df) | ||
+ | |||
+ | Residuals: | ||
+ | Min 1Q Median | ||
+ | -25.508 | ||
+ | |||
+ | Coefficients: | ||
+ | Estimate Std. Error t value Pr(> | ||
+ | (Intercept) -52.8818 | ||
+ | x | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | |||
+ | Residual standard error: 12.03 on 14 degrees of freedom | ||
+ | Multiple R-squared: | ||
+ | F-statistic: | ||
+ | |||
+ | > | ||
+ | > ########################### | ||
+ | > # double check with this | ||
+ | > ########################### | ||
+ | > str(lm.y.x) | ||
+ | List of 12 | ||
+ | $ coefficients : Named num [1:2] -52.9 0.8 | ||
+ | ..- attr(*, " | ||
+ | $ residuals | ||
+ | ..- attr(*, " | ||
+ | $ effects | ||
+ | ..- attr(*, " | ||
+ | $ rank : int 2 | ||
+ | $ fitted.values: | ||
+ | ..- attr(*, " | ||
+ | $ assign | ||
+ | $ qr :List of 5 | ||
+ | ..$ qr : num [1:16, 1:2] -4 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 ... | ||
+ | .. ..- attr(*, " | ||
+ | .. .. ..$ : chr [1:16] " | ||
+ | .. .. ..$ : chr [1:2] " | ||
+ | .. ..- attr(*, " | ||
+ | ..$ qraux: num [1:2] 1.25 1.18 | ||
+ | ..$ pivot: int [1:2] 1 2 | ||
+ | ..$ tol : num 1e-07 | ||
+ | ..$ rank : int 2 | ||
+ | ..- attr(*, " | ||
+ | $ df.residual | ||
+ | $ xlevels | ||
+ | $ call : language lm(formula = y ~ x, data = df) | ||
+ | $ terms :Classes ' | ||
+ | .. ..- attr(*, " | ||
+ | .. ..- attr(*, " | ||
+ | .. .. ..- attr(*, " | ||
+ | .. .. .. ..$ : chr [1:2] " | ||
+ | .. .. .. ..$ : chr " | ||
+ | .. ..- attr(*, " | ||
+ | .. ..- attr(*, " | ||
+ | .. ..- attr(*, " | ||
+ | .. ..- attr(*, " | ||
+ | .. ..- attr(*, " | ||
+ | .. ..- attr(*, " | ||
+ | .. ..- attr(*, " | ||
+ | .. .. ..- attr(*, " | ||
+ | $ model :' | ||
+ | ..$ y: num [1:16] 145 168 141 135 162 ... | ||
+ | ..$ x: num [1:16] 256 274 250 267 269 ... | ||
+ | ..- attr(*, " | ||
+ | .. .. ..- attr(*, " | ||
+ | .. .. ..- attr(*, " | ||
+ | .. .. .. ..- attr(*, " | ||
+ | .. .. .. .. ..$ : chr [1:2] " | ||
+ | .. .. .. .. ..$ : chr " | ||
+ | .. .. ..- attr(*, " | ||
+ | .. .. ..- attr(*, " | ||
+ | .. .. ..- attr(*, " | ||
+ | .. .. ..- attr(*, " | ||
+ | .. .. ..- attr(*, " | ||
+ | .. .. ..- attr(*, " | ||
+ | .. .. ..- attr(*, " | ||
+ | .. .. .. ..- attr(*, " | ||
+ | - attr(*, " | ||
+ | > y.res <- lm.y.x$residuals | ||
+ | > y.pre <- lm.y.x$fitted.values | ||
+ | > y.exp <- y.pre - m.y | ||
+ | > plot(x, | ||
+ | > plot(x, | ||
+ | > plot(x, | ||
+ | > ########################### | ||
+ | > lm.y.x$coefficients | ||
+ | (Intercept) | ||
+ | -52.8817788 | ||
+ | > intercept <- lm.y.x$coefficients[1] | ||
+ | > slope <- lm.y.x$coefficients[2] | ||
+ | > intercept | ||
+ | (Intercept) | ||
+ | -52.88178 | ||
+ | > slope | ||
+ | x | ||
+ | 0.7995539 | ||
+ | > | ||
+ | > # library(ggplot2) | ||
+ | > # ggplot(data=df, | ||
+ | > # geom_point(color=" | ||
+ | > # geom_hline(aes(yintercept=mean(y))) + | ||
+ | > # geom_abline(intercept=intercept, | ||
+ | > # | ||
+ | > ##### | ||
+ | > ss.y <- sum((y-mean(y))^2) | ||
+ | > ss.y | ||
+ | [1] 4183.193 | ||
+ | > df.y <- length(y)-1 | ||
+ | > df.y | ||
+ | [1] 15 | ||
+ | > ss.y/df.y | ||
+ | [1] 278.8795 | ||
+ | > var(y) | ||
+ | [,1] | ||
+ | [1,] 278.8795 | ||
+ | > | ||
+ | > m.x <- mean(x) | ||
+ | > m.y <- mean(y) | ||
+ | > sp <- sum((x-m.x)*(y-m.y)) | ||
+ | > df.tot <- n-1 | ||
+ | > sp/df.tot | ||
+ | [1] 179.8996 | ||
+ | > cov.xy <- cov(x,y) | ||
+ | > | ||
+ | > sd.x <- sd(x) | ||
+ | > sd.y <- sd(y) | ||
+ | > | ||
+ | > r.xy <- cov.xy/ | ||
+ | > r.xy | ||
+ | [,1] | ||
+ | [1,] 0.7181756 | ||
+ | > cor(x,y) | ||
+ | [,1] | ||
+ | [1,] 0.7181756 | ||
+ | > cor.test(x, | ||
+ | |||
+ | Pearson' | ||
+ | |||
+ | data: x and y | ||
+ | t = 3.8616, df = 14, p-value = 0.001727 | ||
+ | alternative hypothesis: true correlation is not equal to 0 | ||
+ | 95 percent confidence interval: | ||
+ | | ||
+ | sample estimates: | ||
+ | cor | ||
+ | 0.7181756 | ||
+ | |||
+ | > | ||
+ | > b <- cov(x,y) / var(x) | ||
+ | > # note that | ||
+ | > # (sp(x, | ||
+ | > # = sp(x,y) / ss(x) | ||
+ | > # m.y = a + b * mean.x | ||
+ | > a <- m.y - (b * m.x) | ||
+ | > b | ||
+ | [,1] | ||
+ | [1,] 0.7995539 | ||
+ | > a | ||
+ | [,1] | ||
+ | [1,] -52.88178 | ||
+ | > # we got these from the above | ||
+ | > slope | ||
+ | x | ||
+ | 0.7995539 | ||
+ | > intercept | ||
+ | (Intercept) | ||
+ | -52.88178 | ||
+ | > | ||
+ | > | ||
+ | > summary(lm.y.x) | ||
+ | |||
+ | Call: | ||
+ | lm(formula = y ~ x, data = df) | ||
+ | |||
+ | Residuals: | ||
+ | Min 1Q Median | ||
+ | -25.508 | ||
+ | |||
+ | Coefficients: | ||
+ | Estimate Std. Error t value Pr(> | ||
+ | (Intercept) -52.8818 | ||
+ | x | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | |||
+ | Residual standard error: 12.03 on 14 degrees of freedom | ||
+ | Multiple R-squared: | ||
+ | F-statistic: | ||
+ | |||
+ | > lm.y.x$coefficients | ||
+ | (Intercept) | ||
+ | -52.8817788 | ||
+ | > # str(lm.y.x) | ||
+ | > | ||
+ | > y.pred <- lm.y.x$fitted.values | ||
+ | > y <- y | ||
+ | > m.y | ||
+ | [1] 159 | ||
+ | > | ||
+ | > res <- y - y.pred | ||
+ | > reg <- y.pred - m.y | ||
+ | > ss.y | ||
+ | [1] 4183.193 | ||
+ | > ss.res <- sum(res^2) | ||
+ | > ss.reg <- sum(reg^2) | ||
+ | > ss.y | ||
+ | [1] 4183.193 | ||
+ | > ss.res | ||
+ | [1] 2025.602 | ||
+ | > ss.reg | ||
+ | [1] 2157.592 | ||
+ | > ss.res + ss.reg | ||
+ | [1] 4183.193 | ||
+ | > | ||
+ | > r.square <- ss.reg / ss.y | ||
+ | > r.square | ||
+ | [1] 0.5157762 | ||
+ | > sqrt(r.square) | ||
+ | [1] 0.7181756 | ||
+ | > cor(x,y) | ||
+ | [,1] | ||
+ | [1,] 0.7181756 | ||
+ | > | ||
+ | > df.tot <- df.y | ||
+ | > df.reg <- 2 - 1 | ||
+ | > df.res <- df.tot - df.reg | ||
+ | > | ||
+ | > df.reg | ||
+ | [1] 1 | ||
+ | > df.res | ||
+ | [1] 14 | ||
+ | > df.tot | ||
+ | [1] 15 | ||
+ | > | ||
+ | > ss.tot <- ss.y | ||
+ | > ss.reg | ||
+ | [1] 2157.592 | ||
+ | > ss.res | ||
+ | [1] 2025.602 | ||
+ | > ss.tot | ||
+ | [1] 4183.193 | ||
+ | > | ||
+ | > ms.reg <- ss.reg / df.reg | ||
+ | > ms.res <- ss.res / df.res | ||
+ | > ms.reg | ||
+ | [1] 2157.592 | ||
+ | > ms.res | ||
+ | [1] 144.6858 | ||
+ | > | ||
+ | > f.cal <- ms.reg/ | ||
+ | > f.cal | ||
+ | [1] 14.91225 | ||
+ | > | ||
+ | > p.val <- pf(f.cal, df.reg, df.res, lower.tail = F) | ||
+ | > p.val | ||
+ | [1] 0.001727459 | ||
+ | > anova(lm.y.x) | ||
+ | Analysis of Variance Table | ||
+ | |||
+ | Response: y | ||
+ | Df Sum Sq Mean Sq F value | ||
+ | x 1 2157.6 2157.59 | ||
+ | Residuals 14 2025.6 | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | > | ||
+ | > ######################################## | ||
+ | > # also make it sure that you understand | ||
+ | > ######################################## | ||
+ | > | ||
+ | > res <- lm.y.x$residuals | ||
+ | > reg <- lm.y.x$fitted.values-m.y | ||
+ | > | ||
+ | > # The above can be derived as follows | ||
+ | > y.hat <- intercept + (slope * x) | ||
+ | > y.hat | ||
+ | [,1] | ||
+ | [1,] 152.1730 | ||
+ | [2,] 166.0210 | ||
+ | [3,] 146.6731 | ||
+ | [4,] 160.6914 | ||
+ | [5,] 162.2112 | ||
+ | [6,] 175.8179 | ||
+ | [7,] 167.0666 | ||
+ | [8,] 155.5354 | ||
+ | [9,] 171.7678 | ||
+ | [10,] 153.7931 | ||
+ | [11,] 165.6110 | ||
+ | [12,] 144.7831 | ||
+ | [13,] 179.8185 | ||
+ | [14,] 134.1906 | ||
+ | [15,] 153.5815 | ||
+ | [16,] 154.2648 | ||
+ | attr(," | ||
+ | [1] 0.1070574 | ||
+ | attr(," | ||
+ | [1] 0.7608404 | ||
+ | > head(y.hat) | ||
+ | [,1] | ||
+ | [1,] 152.1730 | ||
+ | [2,] 166.0210 | ||
+ | [3,] 146.6731 | ||
+ | [4,] 160.6914 | ||
+ | [5,] 162.2112 | ||
+ | [6,] 175.8179 | ||
+ | > head(lm.y.x$fitted.values) | ||
+ | | ||
+ | 152.1730 166.0210 146.6731 160.6914 162.2112 175.8179 | ||
+ | > y.pre <- y.hat | ||
+ | > y.res <- y - y.pre | ||
+ | > y.res | ||
+ | [,1] | ||
+ | | ||
+ | | ||
+ | | ||
+ | [4,] -25.5078178 | ||
+ | | ||
+ | | ||
+ | | ||
+ | [8,] -16.3176727 | ||
+ | | ||
+ | [10,] -15.1613471 | ||
+ | [11,] | ||
+ | [12,] | ||
+ | [13,] | ||
+ | [14,] 15.4541201 | ||
+ | [15,] 15.9625789 | ||
+ | [16,] | ||
+ | attr(," | ||
+ | [1] 0.1070574 | ||
+ | attr(," | ||
+ | [1] 0.7608404 | ||
+ | > head(res) | ||
+ | 1 | ||
+ | | ||
+ | > head(y.res) | ||
+ | [,1] | ||
+ | [1,] -7.2007773 | ||
+ | [2,] | ||
+ | [3,] -5.3956693 | ||
+ | [4,] -25.5078178 | ||
+ | [5,] -0.4602376 | ||
+ | [6,] | ||
+ | > | ||
+ | > y.reg <- y.pre - m.y | ||
+ | > head(reg) | ||
+ | | ||
+ | | ||
+ | > head(y.reg) | ||
+ | [,1] | ||
+ | [1,] -6.826963 | ||
+ | [2,] | ||
+ | [3,] -12.326872 | ||
+ | [4,] | ||
+ | [5,] | ||
+ | [6,] 16.817938 | ||
+ | > ######################################## | ||
+ | > | ||
+ | > # This is essentially a perfect linear | ||
+ | > # relationship between the regression | ||
+ | > # line and x data | ||
+ | > lm.reg.x <- lm(reg~x, data = df) | ||
+ | > summary(lm.reg.x) | ||
+ | |||
+ | Call: | ||
+ | lm(formula = reg ~ x, data = df) | ||
+ | |||
+ | Residuals: | ||
+ | | ||
+ | -1.216e-14 -9.993e-15 -2.381e-15 | ||
+ | |||
+ | Coefficients: | ||
+ | Estimate Std. Error t value Pr(> | ||
+ | (Intercept) -2.119e+02 | ||
+ | x 7.996e-01 | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | |||
+ | Residual standard error: 1.165e-14 on 14 degrees of freedom | ||
+ | Multiple R-squared: | ||
+ | F-statistic: | ||
+ | |||
+ | 경고메시지(들): | ||
+ | summary.lm(lm.reg.x)에서: | ||
+ | essentially perfect fit: summary may be unreliable | ||
+ | > | ||
+ | > # The relationship between residuals and | ||
+ | > # x should be zero | ||
+ | > lm.res.x <- lm(res~x, data = df) | ||
+ | > summary(lm.res.x) | ||
+ | |||
+ | Call: | ||
+ | lm(formula = res ~ x, data = df) | ||
+ | |||
+ | Residuals: | ||
+ | Min 1Q Median | ||
+ | -25.508 | ||
+ | |||
+ | Coefficients: | ||
+ | Estimate Std. Error t value Pr(>|t|) | ||
+ | (Intercept) -1.956e-14 | ||
+ | x 6.880e-17 | ||
+ | |||
+ | Residual standard error: 12.03 on 14 degrees of freedom | ||
+ | Multiple R-squared: | ||
+ | F-statistic: | ||
+ | |||
+ | > summary(lm.y.x) | ||
+ | |||
+ | Call: | ||
+ | lm(formula = y ~ x, data = df) | ||
+ | |||
+ | Residuals: | ||
+ | Min 1Q Median | ||
+ | -25.508 | ||
+ | |||
+ | Coefficients: | ||
+ | Estimate Std. Error t value Pr(> | ||
+ | (Intercept) -52.8818 | ||
+ | x | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | |||
+ | Residual standard error: 12.03 on 14 degrees of freedom | ||
+ | Multiple R-squared: | ||
+ | F-statistic: | ||
+ | |||
+ | > | ||
+ | > ######################################## | ||
+ | > # Now let's get back to the lm output | ||
+ | > # this is to evalue the significance of | ||
+ | > # the slope of x (b) | ||
+ | > summary(lm.y.x) | ||
+ | |||
+ | Call: | ||
+ | lm(formula = y ~ x, data = df) | ||
+ | |||
+ | Residuals: | ||
+ | Min 1Q Median | ||
+ | -25.508 | ||
+ | |||
+ | Coefficients: | ||
+ | Estimate Std. Error t value Pr(> | ||
+ | (Intercept) -52.8818 | ||
+ | x | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | |||
+ | Residual standard error: 12.03 on 14 degrees of freedom | ||
+ | Multiple R-squared: | ||
+ | F-statistic: | ||
+ | |||
+ | > | ||
+ | > # the slope, we already got this | ||
+ | > b <- slope | ||
+ | > # intercept | ||
+ | > a <- intercept | ||
+ | > | ||
+ | > # the real data look like this | ||
+ | > plot(x,y) | ||
+ | > abline(h = mean(y), lwd=1.5, col=" | ||
+ | > abline(lm(y ~ x), lwd=1.5, col=" | ||
+ | > | ||
+ | > # blue line is regression line | ||
+ | > # this slope is an estimate of a real | ||
+ | > # thing (population) rather than the | ||
+ | > # sample we are using right now) | ||
+ | > library(ggplot2) | ||
+ | > | ||
+ | > ggplot(data = df, aes(x,y)) + | ||
+ | + | ||
+ | + | ||
+ | `geom_smooth()` using formula = 'y ~ x' | ||
+ | > | ||
+ | > # the real slope should reside in between | ||
+ | > # the violet area. The area is estimated | ||
+ | > # with standard error of slop (regression | ||
+ | > # coefficient) | ||
+ | > | ||
+ | > # in word | ||
+ | > # se for b is | ||
+ | > # from summary(lm.y.x) | ||
+ | > | ||
+ | > summary(lm.y.x) | ||
+ | |||
+ | Call: | ||
+ | lm(formula = y ~ x, data = df) | ||
+ | |||
+ | Residuals: | ||
+ | Min 1Q Median | ||
+ | -25.508 | ||
+ | |||
+ | Coefficients: | ||
+ | Estimate Std. Error t value Pr(> | ||
+ | (Intercept) -52.8818 | ||
+ | x | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | |||
+ | Residual standard error: 12.03 on 14 degrees of freedom | ||
+ | Multiple R-squared: | ||
+ | F-statistic: | ||
+ | |||
+ | > se.b <- 0.0401 | ||
+ | > ci.se.b <- 1.96 * se.b | ||
+ | > # this is the lowest estimate | ||
+ | > b - ci.se.b | ||
+ | x | ||
+ | 0.7209579 | ||
+ | > # the highest estimate | ||
+ | > b + ci.se.b | ||
+ | x | ||
+ | 0.8781499 | ||
+ | > # calculated estimate | ||
+ | > b | ||
+ | x | ||
+ | 0.7995539 | ||
+ | > | ||
+ | > # how to get se for b? | ||
+ | > # https:// | ||
+ | > # see also http:// | ||
+ | > | ||
+ | > ss.res | ||
+ | [1] 2025.602 | ||
+ | > ss.x | ||
+ | [1] 3375 | ||
+ | > # we didn't get ss.x | ||
+ | > ss.x <- sum((x-m.x)^2) | ||
+ | > ss.x | ||
+ | [1] 3375 | ||
+ | > | ||
+ | > sqrt( (ss.res/ | ||
+ | [1] 0.2070504 | ||
+ | > # the above is the same as the below | ||
+ | > sqrt( ss.res / (n-2)*ss.x ) | ||
+ | [1] 698.7952 | ||
+ | > sqrt( 1/(n-2) * (ss.res/ | ||
+ | [1] 0.2070504 | ||
+ | > se.b <- sqrt( 1/(n-2) * (ss.res/ | ||
+ | > | ||
+ | > # now | ||
+ | > summary(lm.y.x) | ||
+ | |||
+ | Call: | ||
+ | lm(formula = y ~ x, data = df) | ||
+ | |||
+ | Residuals: | ||
+ | Min 1Q Median | ||
+ | -25.508 | ||
+ | |||
+ | Coefficients: | ||
+ | Estimate Std. Error t value Pr(> | ||
+ | (Intercept) -52.8818 | ||
+ | x | ||
+ | --- | ||
+ | Signif. codes: | ||
+ | |||
+ | Residual standard error: 12.03 on 14 degrees of freedom | ||
+ | Multiple R-squared: | ||
+ | F-statistic: | ||
+ | |||
+ | > # if b is 0, b does not do anything (null hypothesis) | ||
+ | > # To test if b is working, we do t-test by | ||
+ | > # b (current coefficient) - 0 (not working) / se | ||
+ | > # = t.calculated. We test if this is significant | ||
+ | > # tp(t.b.cal, df=n-2) | ||
+ | > | ||
+ | > t.b.cal <- (b - 0)/ | ||
+ | > t.b.cal | ||
+ | | ||
+ | 3.861639 | ||
+ | > # k = num of predictor variables | ||
+ | > k <- 1 | ||
+ | > df.t <- n-k-1 | ||
+ | > | ||
+ | > 2*pt(t.b.cal, | ||
+ | x | ||
+ | 0.001727459 | ||
+ | > pf(t.b.cal^2, | ||
+ | x | ||
+ | 0.001727459 | ||
+ | > | ||
</ | </ | ||
+ | ====== Graphics output ====== | ||
+ | {{: | ||
+ | {{: | ||
+ | {{: | ||
+ | {{: | ||
+ | {{: | ||
+ | {{: | ||
+ | {{: | ||
c/ms/regression_lecture_note_for_r.1716935498.txt.gz · Last modified: 2024/05/29 07:31 by hkimscil