====== Code ======
You need to install the ggplot2 package first (the only package the script below uses).
<code>
install.packages("ggplot2")
</code>
<code>
################
rnorm2 <- function(n, mean, sd) { mean + sd * scale(rnorm(n)) }
n <- 16

set.seed(101)
x <- rnorm2(n, 265, 15)
rand.00 <- rnorm2(n, 0, 12)

rand.01 <- rnorm2(n, 0, 240)
plot(rand.01)
points(rand.00, col = "red")

b <- 170 / 265
b <- round(b,1)
b
y <- b * x + rand.00
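
# extra check (added; not in the Output section below): rnorm2()
# rescales the rnorm() draws with scale(), so the sample statistics
# are exact rather than approximate
mean(x)   # exactly 265
sd(x)     # exactly 15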

df <- data.frame(x, y)
head(df)

# plot method 0
plot(x,y)

# method 1
plot(x, y, pch = 1, cex = 1, col = "black")
abline(h = mean(y), lwd=1.5, col="red")
abline(lm(y ~ x), lwd=1.5, col="blue")

# method 2
lm.y.x <- lm(y ~ x, data = df)
summary(lm.y.x)
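
# extra check (added): the coefficient table printed by summary()
# can be extracted as a plain matrix
coef(summary(lm.y.x))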

###########################
# double check with this
###########################
str(lm.y.x)
y.res <- lm.y.x$residuals
y.pre <- lm.y.x$fitted.values
y.exp <- y.pre - m.y   # (m.y is assigned further below; this relies on it already being in the workspace)
plot(x, y.res)
plot(x, y.pre)
plot(x, y.exp)
###########################
lm.y.x$coefficients
intercept <- lm.y.x$coefficients[1]
slope <- lm.y.x$coefficients[2]
intercept
slope

# library(ggplot2)
# ggplot(data=df, aes(x, y)) +
#   geom_point(color="red") +
#   geom_hline(aes(yintercept=mean(y))) +
#   geom_abline(intercept=intercept, slope=slope)
#
#####
ss.y <- sum((y-mean(y))^2)
ss.y
df.y <- length(y)-1
df.y
ss.y/df.y
var(y)

m.x <- mean(x)
m.y <- mean(y)
sp <- sum((x-m.x)*(y-m.y))
df.tot <- n-1
sp/df.tot
cov.xy <- cov(x,y)

sd.x <- sd(x)
sd.y <- sd(y)

r.xy <- cov.xy/(sd.x*sd.y)
r.xy
cor(x,y)
cor.test(x,y)

b <- cov(x,y) / var(x)
# note that
# (sp(x,y)/(n-1)) / (ss(x)/(n-1))
# = sp(x,y) / ss(x)
# m.y = a + b * mean.x
a <- m.y - (b * m.x)
b
a
# we got these from the above
slope
intercept

summary(lm.y.x)
lm.y.x$coefficients
# str(lm.y.x)
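
# extra check (added): the slope can also be written as
# r * (sd of y / sd of x)
r.xy * (sd.y / sd.x)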

y.pred <- lm.y.x$fitted.values
y <- y
m.y

res <- y - y.pred
reg <- y.pred - m.y
ss.y
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.y
ss.res
ss.reg
ss.res + ss.reg

r.square <- ss.reg / ss.y
r.square
sqrt(r.square)
cor(x,y)
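
# extra check (added): lm() keeps R-squared in its summary object
summary(lm.y.x)$r.squared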

df.tot <- df.y
df.reg <- 2 - 1
df.res <- df.tot - df.reg

df.reg
df.res
df.tot

ss.tot <- ss.y
ss.reg
ss.res
ss.tot

ms.reg <- ss.reg / df.reg
ms.res <- ss.res / df.res
ms.reg
ms.res

f.cal <- ms.reg/ms.res
f.cal

p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
p.val
anova(lm.y.x)

########################################
# also make sure that you understand the following
########################################

res <- lm.y.x$residuals
reg <- lm.y.x$fitted.values-m.y

# The above can be derived as follows
y.hat <- intercept + (slope * x)
y.hat
head(y.hat)
head(lm.y.x$fitted.values)
y.pre <- y.hat
y.res <- y - y.pre
y.res
head(res)
head(y.res)

y.reg <- y.pre - m.y
head(reg)
head(y.reg)
########################################
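
# extra check (added): the hand-computed fitted values match lm()'s
# (as.numeric() drops the names and scale() attributes first)
all.equal(as.numeric(y.hat), as.numeric(lm.y.x$fitted.values))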

# This is essentially a perfect linear
# relationship between the regression
# line and x data
lm.reg.x <- lm(reg~x, data = df)
summary(lm.reg.x)

# The relationship between residuals and
# x should be zero
lm.res.x <- lm(res~x, data = df)
summary(lm.res.x)
summary(lm.y.x)
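
# extra check (added): the residuals are uncorrelated with x
# by construction
cor(x, res)   # effectively zero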

########################################
# Now let's get back to the lm output.
# This is to evaluate the significance of
# the slope of x (b)
summary(lm.y.x)

# the slope, we already got this
b <- slope
# intercept
a <- intercept

# the real data look like this
plot(x,y)
abline(h = mean(y), lwd=1.5, col="red")
abline(lm(y ~ x), lwd=1.5, col="blue")

# blue line is regression line
# this slope is an estimate of a real
# thing (the population) rather than of the
# sample we are using right now
library(ggplot2)

ggplot(data = df, aes(x,y)) +
  geom_point() +
  geom_smooth(method="lm")

# the real slope should reside within
# the violet area. The area is estimated
# with the standard error of the slope
# (the regression coefficient)

# in words:
# the se for b is taken
# from summary(lm.y.x)

summary(lm.y.x)
se.b <- 0.2071
ci.se.b <- 1.96 * se.b
# this is the lowest estimate
b - ci.se.b
# the highest estimate
b + ci.se.b
# calculated estimate
b

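# extra check (added): confint() gives the exact t-based interval;
# the 1.96 multiplier above is the normal approximation
confint(lm.y.x)
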
# how to get se for b?
# https://
# see also http://

ss.res
ss.x
# we didn't get ss.x yet
ss.x <- sum((x-m.x)^2)
ss.x

sqrt( (ss.res/(n-2)) / ss.x )
# the above is the same as the below
sqrt( ss.res / ((n-2)*ss.x) )
sqrt( 1/(n-2) * (ss.res/ss.x) )
se.b <- sqrt( 1/(n-2) * (ss.res/ss.x) )

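# extra check (added): the same standard errors can be read off the
# model's variance-covariance matrix
sqrt(diag(vcov(lm.y.x)))
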
# now
summary(lm.y.x)
# if b is 0, b does not do anything (null hypothesis)
# To test whether b is working, we do a t-test:
# (b (current coefficient) - 0 (not working)) / se = t.calculated
# We then test whether this value is significant with
# 2*pt(t.b.cal, df=n-2, lower.tail=F)

t.b.cal <- (b - 0)/se.b
t.b.cal
# k = num of predictor variables
k <- 1
df.t <- n-k-1

2*pt(t.b.cal, df.t, lower.tail = F)
pf(t.b.cal^2, df.reg, df.res, lower.tail = F)
</code>

====== Output ======
<code>
> ################
> rnorm2 <- function(n, mean, sd) { mean + sd * scale(rnorm(n)) }
> n <- 16
> 
> set.seed(101)
> x <- rnorm2(n, 265, 15)
> rand.00 <- rnorm2(n, 0, 12)
> 
> rand.01 <- rnorm2(n, 0, 240)
> plot(rand.01)
> points(rand.00, col = "red")
> 
> b <- 170 / 265
> b <- round(b,1)
> b
[1] 0.6
> y <- b * x + rand.00
> 
> df <- data.frame(x, y)
> head(df)
         x        y
1 256.4615 144.9723
2 273.7812 167.6050
3 249.5828 141.2775
4 267.1155 135.1836
5 269.0162 161.7509
6 286.0342 183.7182
> 
> # plot method 0
> plot(x,y)
> 
> # method 1
> plot(x, y, pch = 1, cex = 1, col = "black")
> abline(h = mean(y), lwd=1.5, col="red")
> abline(lm(y ~ x), lwd=1.5, col="blue")
> 
> # method 2
> lm.y.x <- lm(y ~ x, data = df)
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median 
-25.508 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,    Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> 
> ###########################
> # double check with this
> ###########################
> str(lm.y.x)
List of 12
 $ coefficients : Named num [1:2] -52.9 0.8
  ..- attr(*, "
 $ residuals    
  ..- attr(*, "
 $ effects      
  ..- attr(*, "
 $ rank         : int 2
 $ fitted.values: 
  ..- attr(*, "
 $ assign       
 $ qr           :List of 5
  ..$ qr   : num [1:16, 1:2] -4 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 0.25 ...
  .. ..- attr(*, "
  .. .. ..$ : chr [1:16] "
  .. .. ..$ : chr [1:2] "
  .. ..- attr(*, "
  ..$ qraux: num [1:2] 1.25 1.18
  ..$ pivot: int [1:2] 1 2
  ..$ tol  : num 1e-07
  ..$ rank : int 2
  ..- attr(*, "
 $ df.residual  
 $ xlevels      
 $ call         : language lm(formula = y ~ x, data = df)
 $ terms        :Classes '
 .. ..- attr(*, "
 .. ..- attr(*, "
 .. .. ..- attr(*, "
 .. .. .. ..$ : chr [1:2] "
 .. .. .. ..$ : chr "
 .. ..- attr(*, "
 .. ..- attr(*, "
 .. ..- attr(*, "
 .. ..- attr(*, "
 .. ..- attr(*, "
 .. ..- attr(*, "
 .. ..- attr(*, "
 .. .. ..- attr(*, "
 $ model        :'
  ..$ y: num [1:16] 145 168 141 135 162 ...
  ..$ x: num [1:16] 256 274 250 267 269 ...
  ..- attr(*, "
 .. .. ..- attr(*, "
 .. .. ..- attr(*, "
 .. .. .. ..- attr(*, "
 .. .. .. .. ..$ : chr [1:2] "
 .. .. .. .. ..$ : chr "
 .. .. ..- attr(*, "
 .. .. ..- attr(*, "
 .. .. ..- attr(*, "
 .. .. ..- attr(*, "
 .. .. ..- attr(*, "
 .. .. ..- attr(*, "
 .. .. ..- attr(*, "
 .. .. .. ..- attr(*, "
 - attr(*, "
> y.res <- lm.y.x$residuals
> y.pre <- lm.y.x$fitted.values
> y.exp <- y.pre - m.y
> plot(x, y.res)
> plot(x, y.pre)
> plot(x, y.exp)
> ###########################
> lm.y.x$coefficients
(Intercept)           x 
-52.8817788   0.7995539 
> intercept <- lm.y.x$coefficients[1]
> slope <- lm.y.x$coefficients[2]
> intercept
(Intercept) 
  -52.88178 
> slope
        x 
0.7995539 
> 
> # library(ggplot2)
> # ggplot(data=df, aes(x, y)) +
> #   geom_point(color="red") +
> #   geom_hline(aes(yintercept=mean(y))) +
> #   geom_abline(intercept=intercept, slope=slope)
> #
> #####
> ss.y <- sum((y-mean(y))^2)
> ss.y
[1] 4183.193
> df.y <- length(y)-1
> df.y
[1] 15
> ss.y/df.y
[1] 278.8795
> var(y)
         [,1]
[1,] 278.8795
> 
> m.x <- mean(x)
> m.y <- mean(y)
> sp <- sum((x-m.x)*(y-m.y))
> df.tot <- n-1
> sp/df.tot
[1] 179.8996
> cov.xy <- cov(x,y)
> 
> sd.x <- sd(x)
> sd.y <- sd(y)
> 
> r.xy <- cov.xy/(sd.x*sd.y)
> r.xy
          [,1]
[1,] 0.7181756
> cor(x,y)
          [,1]
[1,] 0.7181756
> cor.test(x,y)

	Pearson's product-moment correlation

data:  x and y
t = 3.8616, df = 14, p-value = 0.001727
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 
sample estimates:
      cor 
0.7181756 

> 
> b <- cov(x,y) / var(x)
> # note that
> # (sp(x,y)/(n-1)) / (ss(x)/(n-1))
> # = sp(x,y) / ss(x)
> # m.y = a + b * mean.x
> a <- m.y - (b * m.x)
> b
          [,1]
[1,] 0.7995539
> a
          [,1]
[1,] -52.88178
> # we got these from the above
> slope
        x 
0.7995539 
> intercept
(Intercept) 
  -52.88178 
> 
> 
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median 
-25.508 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,    Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> lm.y.x$coefficients
(Intercept)           x 
-52.8817788   0.7995539 
> # str(lm.y.x)
> 
> y.pred <- lm.y.x$fitted.values
> y <- y
> m.y
[1] 159
> 
> res <- y - y.pred
> reg <- y.pred - m.y
> ss.y
[1] 4183.193
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> ss.y
[1] 4183.193
> ss.res
[1] 2025.602
> ss.reg
[1] 2157.592
> ss.res + ss.reg
[1] 4183.193
> 
> r.square <- ss.reg / ss.y
> r.square
[1] 0.5157762
> sqrt(r.square)
[1] 0.7181756
> cor(x,y)
          [,1]
[1,] 0.7181756
> 
> df.tot <- df.y
> df.reg <- 2 - 1
> df.res <- df.tot - df.reg
> 
> df.reg
[1] 1
> df.res
[1] 14
> df.tot
[1] 15
> 
> ss.tot <- ss.y
> ss.reg
[1] 2157.592
> ss.res
[1] 2025.602
> ss.tot
[1] 4183.193
> 
> ms.reg <- ss.reg / df.reg
> ms.res <- ss.res / df.res
> ms.reg
[1] 2157.592
> ms.res
[1] 144.6858
> 
> f.cal <- ms.reg/ms.res
> f.cal
[1] 14.91225
> 
> p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
> p.val
[1] 0.001727459
> anova(lm.y.x)
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value   Pr(>F)   
x          1 2157.6 2157.59  14.912 0.001727 **
Residuals 14 2025.6  144.69                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> 
> ########################################
> # also make sure that you understand the following
> ########################################
> 
> res <- lm.y.x$residuals
> reg <- lm.y.x$fitted.values-m.y
> 
> # The above can be derived as follows
> y.hat <- intercept + (slope * x)
> y.hat
          [,1]
 [1,] 152.1730
 [2,] 166.0210
 [3,] 146.6731
 [4,] 160.6914
 [5,] 162.2112
 [6,] 175.8179
 [7,] 167.0666
 [8,] 155.5354
 [9,] 171.7678
[10,] 153.7931
[11,] 165.6110
[12,] 144.7831
[13,] 179.8185
[14,] 134.1906
[15,] 153.5815
[16,] 154.2648
attr(,"scaled:center")
[1] 0.1070574
attr(,"scaled:scale")
[1] 0.7608404
> head(y.hat)
          [,1]
[1,] 152.1730
[2,] 166.0210
[3,] 146.6731
[4,] 160.6914
[5,] 162.2112
[6,] 175.8179
> head(lm.y.x$fitted.values)
       1        2        3        4        5        6 
152.1730 166.0210 146.6731 160.6914 162.2112 175.8179 
> y.pre <- y.hat
> y.res <- y - y.pre
> y.res
             [,1]
 [1,]  -7.2007773
 [2,] 
 [3,]  -5.3956693
 [4,] -25.5078178
 [5,]  -0.4602376
 [6,] 
 [7,] 
 [8,] -16.3176727
 [9,] 
[10,] -15.1613471
[11,] 
[12,] 
[13,] 
[14,]  15.4541201
[15,]  15.9625789
[16,] 
attr(,"scaled:center")
[1] 0.1070574
attr(,"scaled:scale")
[1] 0.7608404
> head(res)
          1           2           3           4           5           6 
 -7.2007773 
> head(y.res)
            [,1]
[1,]  -7.2007773
[2,] 
[3,]  -5.3956693
[4,] -25.5078178
[5,]  -0.4602376
[6,] 
> 
> y.reg <- y.pre - m.y
> head(reg)
         1          2          3          4          5          6 
 -6.826963 
> head(y.reg)
           [,1]
[1,]  -6.826963
[2,] 
[3,] -12.326872
[4,] 
[5,] 
[6,]  16.817938
> ########################################
> 
> # This is essentially a perfect linear
> # relationship between the regression
> # line and x data
> lm.reg.x <- lm(reg~x, data = df)
> summary(lm.reg.x)

Call:
lm(formula = reg ~ x, data = df)

Residuals:
 
-1.216e-14 -9.993e-15 -2.381e-15 

Coefficients:
              Estimate Std. Error    t value Pr(>|t|)    
(Intercept) -2.119e+02 
x            7.996e-01 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.165e-14 on 14 degrees of freedom
Multiple R-squared: 
F-statistic: 

Warning message:
In summary.lm(lm.reg.x) :
  essentially perfect fit: summary may be unreliable
> 
> # The relationship between residuals and
> # x should be zero
> lm.res.x <- lm(res~x, data = df)
> summary(lm.res.x)

Call:
lm(formula = res ~ x, data = df)

Residuals:
    Min      1Q  Median 
-25.508 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.956e-14 
x            6.880e-17 

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared: 
F-statistic: 
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median 
-25.508 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,    Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> 
> ########################################
> # Now let's get back to the lm output.
> # This is to evaluate the significance of
> # the slope of x (b)
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median 
-25.508 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,    Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> 
> # the slope, we already got this
> b <- slope
> # intercept
> a <- intercept
> 
> # the real data look like this
> plot(x,y)
> abline(h = mean(y), lwd=1.5, col="red")
> abline(lm(y ~ x), lwd=1.5, col="blue")
> 
> # blue line is regression line
> # this slope is an estimate of a real
> # thing (the population) rather than of the
> # sample we are using right now
> library(ggplot2)
> 
> ggplot(data = df, aes(x,y)) +
+   geom_point() +
+   geom_smooth(method="lm")
`geom_smooth()` using formula = 'y ~ x'
> 
> # the real slope should reside within
> # the violet area. The area is estimated
> # with the standard error of the slope
> # (the regression coefficient)
> 
> # in words:
> # the se for b is taken
> # from summary(lm.y.x)
> 
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median 
-25.508 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,    Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> se.b <- 0.2071
> ci.se.b <- 1.96 * se.b
> # this is the lowest estimate
> b - ci.se.b
        x 
0.3936379 
> # the highest estimate
> b + ci.se.b
        x 
1.2054699 
> # calculated estimate
> b
        x 
0.7995539 
> 
> # how to get se for b?
> # https://
> # see also http://
> 
> ss.res
[1] 2025.602
> ss.x
[1] 3375
> # we didn't get ss.x yet
> ss.x <- sum((x-m.x)^2)
> ss.x
[1] 3375
> 
> sqrt( (ss.res/(n-2)) / ss.x )
[1] 0.2070504
> # the above is the same as the below
> sqrt( ss.res / ((n-2)*ss.x) )
[1] 0.2070504
> sqrt( 1/(n-2) * (ss.res/ss.x) )
[1] 0.2070504
> se.b <- sqrt( 1/(n-2) * (ss.res/ss.x) )
> 
> # now
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median 
-25.508 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -52.8818    54.9507  -0.962    0.352   
x             0.7996     0.2071   3.862  0.00173 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.03 on 14 degrees of freedom
Multiple R-squared:  0.5158,    Adjusted R-squared:  0.4812 
F-statistic: 14.91 on 1 and 14 DF,  p-value: 0.001727

> # if b is 0, b does not do anything (null hypothesis)
> # To test whether b is working, we do a t-test:
> # (b (current coefficient) - 0 (not working)) / se = t.calculated
> # We then test whether this value is significant with
> # 2*pt(t.b.cal, df=n-2, lower.tail=F)
> 
> t.b.cal <- (b - 0)/se.b
> t.b.cal
       x 
3.861639 
> # k = num of predictor variables
> k <- 1
> df.t <- n-k-1
> 
> 2*pt(t.b.cal, df.t, lower.tail = F)
          x 
0.001727459 
> pf(t.b.cal^2, df.reg, df.res, lower.tail = F)
          x 
0.001727459 
> 
</code>
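
The significance test done step by step above can also be read directly off a few built-in helpers; a minimal sketch, assuming the objects from the Code section are still in the workspace:
<code>
coef(summary(lm.y.x))   # estimates, standard errors, t values, p values
confint(lm.y.x)         # t-based confidence intervals for a and b
anova(lm.y.x)           # F test; F equals t^2 for the single predictor
</code>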
====== Graphics output ======
(plots produced by the code above: the rand.01/rand.00 scatter, the x-y scatterplots with mean and regression lines, the y.res/y.pre/y.exp plots against x, and the ggplot2 regression plot with its confidence band)