Regression Lecture Note for R
################
# simulate shoe size (x, in mm) and height (y, in cm) data
rnorm2 <- function(n, mean, sd) { mean + sd * scale(rnorm(n)) }
n <- 400

set.seed(101)
x <- rnorm2(n, 265, 15)
rand.00 <- rnorm2(n, 0, 12)

rand.01 <- rnorm2(n, 0, 240)
plot(rand.01)
points(rand.00, col="red")

b = 170 / 265
b
y <- b * x + rand.00

df <- data.frame(x, y)
head(df)

# plot method 0
plot(x, y)

# method 1
plot(x, y, pch = 1, cex = 1, col = "black",
     main = "HEIGHT against SHOE SIZE",
     xlab = "SHOE SIZE", ylab = "HEIGHT (cm)")
abline(h = mean(y), lwd = 1.5, col = "red")
abline(lm(y ~ x), lwd = 1.5, col = "blue")

# method 2
lm.y.x <- lm(y ~ x, data = df)
summary(lm.y.x)
# str(lm.y.x)
lm.y.x$coefficients
intercept <- lm.y.x$coefficients[1]
slope <- lm.y.x$coefficients[2]
intercept
slope

# library(ggplot2)
# ggplot(data=df, aes(x,y)) +
#   geom_point(color="blue", size=1.5, pch=1.5) +
#   geom_hline(aes(yintercept=mean(y))) +
#   geom_abline(intercept=intercept, slope=slope)
#
#####
ss.y <- sum((y - mean(y))^2)
ss.y
df.y <- length(y) - 1
df.y
ss.y / df.y
var(y)

m.x <- mean(x)
m.y <- mean(y)
sp <- sum((x - m.x) * (y - m.y))
df.tot <- n - 1
sp / df.tot
cov.xy <- cov(x, y)

sd.x <- sd(x)
sd.y <- sd(y)

r.xy <- cov.xy / (sd.x * sd.y)
r.xy
cor(x, y)
cor.test(x, y)

b <- cov(x, y) / var(x)
# note that
# (sp(x,y)/(n-1)) / (ss(x)/(n-1))
# = sp(x,y) / ss(x)
# and m.y = a + b * m.x
a <- m.y - (b * m.x)
b
a

summary(lm.y.x)
lm.y.x$coefficients
# str(lm.y.x)

y.pred <- lm.y.x$fitted.values
y <- y
m.y

res <- y - y.pred
reg <- y.pred - m.y
ss.y
ss.res <- sum(res^2)
ss.reg <- sum(reg^2)
ss.y
ss.res
ss.reg
ss.res + ss.reg

r.square <- ss.reg / ss.y
r.square

df.tot <- df.y
df.reg <- 2 - 1
df.res <- df.tot - df.reg

df.reg
df.res
df.tot

ss.tot <- ss.y
ss.reg
ss.res
ss.tot

ms.reg <- ss.reg / df.reg
ms.res <- ss.res / df.res
ms.reg
ms.res

f.cal <- ms.reg / ms.res
f.cal

p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
p.val
anova(lm.y.x)

########################################
# also make sure that you understand
########################################

res <- lm.y.x$residuals
reg <- lm.y.x$fitted.values - m.y

# This is essentially a perfect linear
# relationship between the regression
# line and x data
lm.reg.x <- lm(reg ~ x, data = df)
summary(lm.reg.x)

# The relationship between residuals and
# x should be zero
lm.res.x <- lm(res ~ x, data = df)
summary(lm.res.x)
summary(lm.y.x)
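Before reading the console output below, one extra check may help connect the two hand formulas for the slope: b = SP(x,y)/SS(x) = cov(x,y)/var(x), and equivalently b = r * sd(y)/sd(x). The short sketch below is an added illustration, not part of the original script, and assumes the objects created above (r.xy, sd.x, sd.y, lm.y.x) are still in the workspace.

# added sketch: the slope written through the correlation coefficient,
# b = r * sd(y)/sd(x), should match cov(x,y)/var(x) and lm()'s estimate
b.alt <- r.xy * sd.y / sd.x   # r.xy, sd.x, sd.y come from the script above
b.alt
coef(lm.y.x)                  # compare with the coefficient for x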
> rnorm2 <- function(n,mean,sd) { mean+sd*scale(rnorm(n)) }
> n <- 400
>
> set.seed(101)
> x <- rnorm2(n, 265, 15)
> rand.00 <- rnorm2(n, 0, 12)
>
> rand.01 <- rnorm2(n, 0, 240)
> plot(rand.01)
> points(rand.00, col="red")
>
> b = 170 / 265
> b
[1] 0.6415094
> y <- b * x + rand.00
>
> df <- data.frame(x, y)
> head(df)
         x        y
1 260.3289 164.1701
2 273.9897 149.7065
3 254.9034 140.6106
4 268.7321 179.5289
5 270.2313 176.5863
6 283.6541 172.5179
>
> # plot method 0
> plot(x,y)
>
> # method 1
> plot(x, y, pch = 1, cex = 1, col = "black", main = "HEIGHT against SHOE SIZE", xlab = "SHOE SIZE", ylab = "HEIGHT (cm)")
> abline(h = mean(y), lwd=1.5, col="red")
> abline(lm(y ~ x), lwd=1.5, col="blue")
>
> # method 2
> lm.y.x <- lm(y ~ x, data = df)
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max
-39.653  -8.878  -0.226   7.322  30.630

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   0.8022    10.6435   0.075     0.94
x             0.6385     0.0401  15.922   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared:  0.3891,	Adjusted R-squared:  0.3876
F-statistic: 253.5 on 1 and 398 DF,  p-value: < 2.2e-16

> # str(lm.y.x)
> lm.y.x$coefficients
(Intercept)           x
  0.8021833   0.6384823
> intercept <- lm.y.x$coefficients[1]
> slope <- lm.y.x$coefficients[2]
> intercept
(Intercept)
  0.8021833
> slope
        x
0.6384823
>
> # library(ggplot2)
> # ggplot(data=df, aes(x,y)) +
> #   geom_point(color="blue", size=1.5, pch=1.5) +
> #   geom_hline(aes(yintercept=mean(y))) +
> #   geom_abline(intercept=intercept, slope=slope)
> #
> #####
> ss.y <- sum((y-mean(y))^2)
> ss.y
[1] 94052.83
> df.y <- length(y)-1
> df.y
[1] 399
> ss.y/df.y
[1] 235.7214
> var(y)
         [,1]
[1,] 235.7214
>
> m.x <- mean(x)
> m.y <- mean(y)
> sp <- sum((x-m.x)*(y-m.y))
> df.tot <- n-1
> sp/df.tot
[1] 143.6585
> cov.xy <- cov(x,y)
>
> sd.x <- sd(x)
> sd.y <- sd(y)
>
> r.xy <- cov.xy/(sd.x*sd.y)
> r.xy
          [,1]
[1,] 0.6237932
> cor(x,y)
          [,1]
[1,] 0.6237932
> cor.test(x,y)

	Pearson's product-moment correlation

data:  x and y
t = 15.922, df = 398, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5599929 0.6802388
sample estimates:
      cor
0.6237932

>
> b <- cov(x,y) / var(x)
> # note that
> # (sp(x,y)/(n-1)) / (ss(x)/(n-1))
> # = sp(x,y) / ss(x)
> # and m.y = a + b * m.x
> a <- m.y - (b * m.x)
> b
          [,1]
[1,] 0.6384823
> a
          [,1]
[1,] 0.8021833
>
>
> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max
-39.653  -8.878  -0.226   7.322  30.630

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   0.8022    10.6435   0.075     0.94
x             0.6385     0.0401  15.922   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared:  0.3891,	Adjusted R-squared:  0.3876
F-statistic: 253.5 on 1 and 398 DF,  p-value: < 2.2e-16

> lm.y.x$coefficients
(Intercept)           x
  0.8021833   0.6384823
>
> # str(lm.y.x)
>
> y.pred <- lm.y.x$fitted.values
> y <- y
> m.y
[1] 170
>
> res <- y - y.pred
> reg <- y.pred - m.y
> ss.y
[1] 94052.83
> ss.res <- sum(res^2)
> ss.reg <- sum(reg^2)
> ss.y
[1] 94052.83
> ss.res
[1] 57455.18
> ss.reg
[1] 36597.65
> ss.res + ss.reg
[1] 94052.83
>
> r.square <- ss.reg / ss.y
> r.square
[1] 0.389118
>
>
> df.tot <- df.y
> df.reg <- 2 - 1
> df.res <- df.tot - df.reg
>
> df.reg
[1] 1
> df.res
[1] 398
> df.tot
[1] 399
>
> ss.tot <- ss.y
> ss.reg
[1] 36597.65
> ss.res
[1] 57455.18
> ss.tot
[1] 94052.83
>
> ms.reg <- ss.reg / df.reg
> ms.res <- ss.res / df.res
> ms.reg
[1] 36597.65
> ms.res
[1] 144.3597
>
> f.cal <- ms.reg/ms.res
> f.cal
[1] 253.517
>
> p.val <- pf(f.cal, df.reg, df.res, lower.tail = F)
> p.val
[1] 1.623694e-44
> anova(lm.y.x)
Analysis of Variance Table

Response: y
           Df Sum Sq Mean Sq F value    Pr(>F)
x           1  36598   36598  253.52 < 2.2e-16 ***
Residuals 398  57455     144
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> ########################################
> # also make sure that you understand
> ########################################
>
> res <- lm.y.x$residuals
> reg <- lm.y.x$fitted.values-m.y
>
> # This is essentially a perfect linear
> # relationship between the regression
> # line and x data
> lm.reg.x <- lm(reg~x, data = df)
> summary(lm.reg.x)

Call:
lm(formula = reg ~ x, data = df)

Residuals:
       Min         1Q     Median         3Q        Max
-6.828e-14 -7.900e-15 -5.500e-16  6.410e-15  3.979e-13

Coefficients:
              Estimate Std. Error    t value Pr(>|t|)
(Intercept) -1.692e+02  1.937e-14 -8.733e+15   <2e-16 ***
x            6.385e-01  7.299e-17  8.747e+15   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.187e-14 on 398 degrees of freedom
Multiple R-squared:      1,	Adjusted R-squared:      1
F-statistic: 7.651e+31 on 1 and 398 DF,  p-value: < 2.2e-16

>
> # The relationship between residuals and
> # x should be zero
> lm.res.x <- lm(res~x, data = df)
> summary(lm.res.x)

Call:
lm(formula = res ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max
-39.653  -8.878  -0.226   7.322  30.630

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.586e-15  1.064e+01       0        1
x            1.186e-17  4.010e-02       0        1

Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared:  6.744e-33,	Adjusted R-squared:  -0.002513
F-statistic: 2.684e-30 on 1 and 398 DF,  p-value: 1

> summary(lm.y.x)

Call:
lm(formula = y ~ x, data = df)

Residuals:
    Min      1Q  Median      3Q     Max
-39.653  -8.878  -0.226   7.322  30.630

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   0.8022    10.6435   0.075     0.94
x             0.6385     0.0401  15.922   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 12.01 on 398 degrees of freedom
Multiple R-squared:  0.3891,	Adjusted R-squared:  0.3876
F-statistic: 253.5 on 1 and 398 DF,  p-value: < 2.2e-16
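As a final cross-check (added here, not part of the original session), recall that in simple regression the ANOVA F statistic equals the squared t statistic of the slope, and R-squared equals the squared Pearson correlation. The short sketch below assumes lm.y.x, f.cal, x, and y from the session above are still in the workspace.

# added sketch: F = t^2 and R^2 = r^2 in simple regression
t.cal <- summary(lm.y.x)$coefficients["x", "t value"]
t.cal^2                       # about 253.5, the F value reported above
f.cal                         # F computed by hand earlier
cor(x, y)^2                   # about 0.3891, the Multiple R-squared
summary(lm.y.x)$r.squared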