Differences

This shows you the differences between two versions of the page.

--- c:ms:2023:schedule:week06_t-test_and_anova_note [2023/04/12 08:01] – created hkimscil
+++ c:ms:2023:schedule:week06_t-test_and_anova_note [2024/04/08 08:42] (current) – [output] hkimscil
@@ Line 1: / Line 1: @@
 ====== R ======
+<code>
+# t-test 이해 확인
+pre <- c(3,0,6,7,4,3,2,8,4)
+post <- c(5,2,5,7,10,9,7,11,8)
+mean(pre)
+mean(post)
+sd(pre)
+sd(post)
+diff.prepost <- pre-post
+mean.diff <- mean(diff.prepost)
+mean.diff
+sd.diff <- sd(diff.prepost)
+sd.diff
+#
+# remember t test = diff / rand error
+#
+se <- sd.diff/sqrt(length(diff.prepost))
+se
+t.value <- mean.diff/se
+t.value
+df.value <- length(diff.prepost)-1
+df.value
+# pt function is for getting percentage of
+# t score with df value
+pt(t.value, df=df.value)
+*pt(t.value, df=df.value)
+qt(.975, df=df.value)
+qt(.025, df=df.value)
+test.output <- t.test(pre,post, paired=T)
+test.output
+# for the reference
+# test.output$
+str(test.output)
+# from the quiz questions
+# stu should understand the logic of the ttest
+set.seed(101)
+rnorm2 <- function(n,mean,sd){ mean+sd*scale(rnorm(n)) }
+A <- rnorm2(16, 26, sqrt(1160/15))
+B <- rnorm2(16, 19, sqrt(1000/15))
+A <- c(A)
+B <- c(B)
+# we know sqrt(1160/15) is A's sdev
+# hence, A's var is sqrt(1160/15)^2
+# hence, A's SS is sqrt(1160/15)^2 * 15
+# this is 1160
+# from the above,
+# the difference between the A and B means
+# remember we try to find
+# difference due to the treatment /
+# / random chance of error
+diff <- 26 - 19
+# for se
+# we know that the situation refers to
+# #2 se two independent samples t-test
+# which is sqrt(pooled.var/na + pooled.var/nb)
+SSa <- 1160
+SSb <- 1000
+n.a <- 16
+n.b <- 16
+df.a <- n.a - 1
+df.b <- n.b - 1
+# we know that we are testing the difference
+# between two independent sample means.
+# Hence, we need to use poole variance between
+# the two group. See
+# http://commres.net/wiki/t-test#t-test_%EB%B9%84%EA%B5%90
+pooled.var <- (SSa + SSb) / (df.a + df.b)
+se <- sqrt(pooled.var/n.a + pooled.var/n.b)
+# Remember t test calculation is based on
+# diff / random error
+t.calculated <- diff / se
+pooled.var
+diff
+se
+t.calculated
+# Now use t.test function for two group
+# (independent sample) t-test
+# with an assumption that variances of
+# the two gorup are the same.
+t.result <- t.test(A, B, var.equal = T)
+t.result
+# t.result$statistic = t.calculated
+# t.result$p.value = probability level of
+# wrong decision with the t calculated value
+str(t.result)
+t.result$statistic
+t.result$p.value
+# the above p.value can be obtained with
+# pt function
+p.value <- 2*pt(-t.result$statistic, df = df.a + df.b)
+p.value
+t.result$p.value
+##
+## different approach
+#
+# A combined group with group A and B
+# We call it group total
+# we can obtain its mean, variance, ss, df, etc.
+#
+A
+B
+dat <- c(A,B)
+dat
+mean.total <- mean(dat)
+var.total <- var(dat)
+# variance를 ms라고 부르기도 한다
+ms.total <- var.total
+df.total <- length(dat)-1
+ss.total <- var.total*df.total
+ss.total.check <- sum((dat-mean(dat))^2)
+mean.total
+var.total
+ms.total
+df.total
+ss.total
+ss.total.check
+# Now for each group
+mean.a <- mean(A)
+mean.b <- mean(B)
+mean.a
+mean.b
+# 그룹 간의 차이에서 나타나는 분산
+# 수업시간에 설명을 잘 들을 것
+# mean.total 에서 그룹a의 평균까지의 차이를 구한 후
+# 이를 제곱하여 그룹 A 멤버의 숫자만큼 더한다 =
+# 즉, SS를 구하는 방법.
+# 전체평균에서 그룹평균을 뺀 것의 제곱을
+# 그룹 구성원 숫자만큼 더하는 것
+# 그리고 이들을 다시 모두 더하여
+# ss.between에 저장
+length(A) * ((mean.total - mean.a)^2)
+length(B) * ((mean.total - mean.b)^2)
+ss.between <-
+  length(A)*((mean.total - mean.a)^2) +
+  length(B)*((mean.total - mean.b)^2)
+ss.between
+# df between group은 연구에 사용된
+# 그룹의 숫자에서 1을 뺀 숫자
+df.between <- 2 - 1
+# 이 그룹 간 차이에 기인하는 분산 값은
+ms.between <- ss.between / df.between
+# 한편 ss.a 와 ss.b는 각 그룹 내의
+# 분산을 알아보기 위한 방법
+ss.a <- var(A) * df.a
+ss.b <- var(B) * df.b
+ss.within <- ss.a + ss.b
+df.a <- length(A)-1
+df.b <- length(B)-1
+df.within <- df.a + df.b
+ms.within <- ss.within / df.within
+# 여기까지 우리는
+# 전체분산
+# 그룹간분산
+# 그룹내분산 값을
+# 구한 것
+# ms.between은 그룹의 차이때문에 생긴
+# 분산으로 IV 혹은 treatment 때문에 생기는
+# 차이에 기인하는 분산이고
+# ms.within은 각 그룹 내부에서 일어나는 분산이므로
+# (variation이므로) 연구자의 관심사와는 상관이 없이
+# 나타나는 random한 분산이라고 하면
+# t test 때와 마찬가지로
+# 그룹의 차이 / 랜덤 차이를 (에러 -> 분산은 에러라고도 했다)
+# 구해볼 수 있다.
+# 즉, 그룹갑분산은 사실 = diff (between groups)
+# 그리고 그룹내 분산은 사실 = re
+# 따라서 우리는 위 둘 간의 비율을 t test와 같이
+# 살펴볼 수 있다
+# 이것을 f.calculated 이라고 하고
+f.calculated <- ms.between / ms.within
+# 이 값을 출력해 본다
+f.calculated
+# 이 계산은 차이와 랜덤에러의 비율이
+# df에 따라서 얼마나 되어야 그 차이가
+# 충분히 큰 것인지를 판단하기 위해서
+# 쓰인다. 여기서 df에는 두 가지 종류가
+# 있다. df.between 그리고 df.within
+# percentage of f distribution with
+# df1 and df2 option
+# 이는 그림의 왼쪽을 나타내므로
+# 차이가 점점 커지게 되는 오른쪽을
+# 계산하기 위해서는 1-x를 취한다
+f.calculated.pvalue <- 1-pf(f.calculated, df1=df.between, df2=df.within)
+f.calculated.pvalue
+# 한편,  t test를 했었을 때 (A, B 그룹을 가지고 independent
+# samples t-test를) 아웃 풋은
+t.result
+# 그리고 f 계산에서의 p value는 t test에서의 p.value와 같다
+f.calculated.pvalue
+t.result$p.value
+# 또한
+# 여기엣 t 값은 t.result$statistic 으로 프린트아웃할 수 있다
+# 이 값이 2.33333 이었다
+t.result$statistic
+# 혹은 우리가 계산한 값이었던
+# t.calculated
+t.calculated
+# 그런데 위의 값을 제곱한 값이 바로 f.calculated 값
+f.calculated
+t.calculated^2
+# 혹은 f.calculated 값을 제곱근한 값이 t.calculated
+sqrt(f.calculated)
+t.calculated
+# Now check this
+ss.total
+ss.between
+ss.within
+ss.total
+ss.between + ss.within
+# 한편 df는
+# df.total  30 - 1
+df.total
+df.between
+df.within
+df.total
+df.between + df.within
+# 한 편
+A
+B
+comb <- stack(list(a=A, b=B))
+comb
+colnames(comb)[1] <- "values"
+colnames(comb)[2] <- "group"
+comb
+a.res <- aov(values ~ group, data=comb)
+a.res.sum <- summary(a.res)
+a.res.sum
+# 위에서 F value는 5.444
+# 그리고 전체적인 아웃풋을 보면
+# Df group 과 Df Residuals
+# Sum Sq group 과 Residuals
+# Mean Sq (MS) group 과 MS residuals
+# MS.group =
+ms.between
+# MS.within =
+ms.within
+# F value
+ms.between / ms.within
+f.calculated
+# 아래는 기존의 아웃풋에서 확인하는 것
+str(a.res.sum)
+a.res.sum[[1]][1,4]
+sqrt(a.res.sum[[1]][1,4])
+t.result$statistic
+</code>
 ====== output ======
 <code>
+> # from the quiz questions
+> # stu should understand the logic of the ttest
+>
+> set.seed(101)
+> rnorm2 <- function(n,mean,sd){ mean+sd*scale(rnorm(n)) }
+> A <- rnorm2(16, 26, sqrt(1160/15))
+> B <- rnorm2(16, 19, sqrt(1000/15))
+> A <- c(A)
+> B <- c(B)
+> # we know sqrt(1160/15) is A's sdev
+> # hence, A's var is sqrt(1160/15)^2
+> # hence, A's SS is sqrt(1160/15)^2 * 15
+> # this is 1160
+>
+> # from the above,
+> # the difference between the A and B means
+> # remember we try to find
+> # difference due to the treatment /
+> # / random chance of error
+> diff <- 26 - 19
+>
+> # for se
+> # we know that the situation refers to
+> # #2 se two independent samples t-test
+> # which is sqrt(pooled.var/na + pooled.var/nb)
+>
+> SSa <- 1160
+> SSb <- 1000
+> n.a <- 16
+> n.b <- 16
+> df.a <- n.a - 1
+> df.b <- n.b - 1
+>
+> # we know that we are testing the difference
+> # between two independent sample means.
+> # Hence, we need to use poole variance between
+> # the two group. See
+> # http://commres.net/wiki/t-test#t-test_%EB%B9%84%EA%B5%90
+> pooled.var <- (SSa + SSb) / (df.a + df.b)
+> se <- sqrt(pooled.var/n.a + pooled.var/n.b)
+> # Remember t test calculation is based on
+> # diff / random error
+> t.calculated <- diff / se
+> pooled.var
+[1] 72
+> diff
+[1] 7
+> se
+[1] 3
+> t.calculated
+[1] 2.333333
+>
+> # Now use t.test function for two group
+> # (independent sample) t-test
+> # with an assumption that variances of
+> # the two gorup are the same.
+> t.result <- t.test(A, B, var.equal = T)
+> t.result
+	Two Sample t-test
+data:  A and B
+t = 2.3333, df = 30, p-value = 0.02652
+alternative hypothesis: true difference in means is not equal to 0
+percent confidence interval:
+.8731826 13.1268174
+sample estimates:
+mean of x mean of y
+        19
+>
+> # t.result$statistic = t.calculated
+> # t.result$p.value = probability level of
+> # wrong decision with the t calculated value
+> str(t.result)
+List of 10
+ $ statistic  : Named num 2.33
+  ..- attr(*, "names")= chr "t"
+ $ parameter  : Named num 30
+  ..- attr(*, "names")= chr "df"
+ $ p.value    : num 0.0265
+ $ conf.int   : num [1:2] 0.873 13.127
+  ..- attr(*, "conf.level")= num 0.95
+ $ estimate   : Named num [1:2] 26 19
+  ..- attr(*, "names")= chr [1:2] "mean of x" "mean of y"
+ $ null.value : Named num 0
+  ..- attr(*, "names")= chr "difference in means"
+ $ stderr     : num 3
+ $ alternative: chr "two.sided"
+ $ method     : chr " Two Sample t-test"
+ $ data.name  : chr "A and B"
+ - attr(*, "class")= chr "htest"
+> t.result$statistic
+       t
+.333333
+> t.result$p.value
+[1] 0.02652366
+>
+> # the above p.value can be obtained with
+> # pt function
+> p.value <- 2*pt(-t.result$statistic, df = df.a + df.b)
+> p.value
+         t
+.02652366
+> t.result$p.value
+[1] 0.02652366
+>
 > ##
 >
@@ Line 8: / Line 411: @@
 > #
 >
+> # A combined group with group A and B
+> # We call it group total
+> # we can obtain its mean, variance, ss, df, etc.
+> #
 > A
- [1] 20.994218 31.148068 16.961481 27.240217 28.354539 38.331534
+ [1] 20.994218 31.148068 16.961481 27.240217 28.354539
- [7] 31.914700 23.459605 35.361796 22.182136 30.847396 15.575648
+ [6] 38.331534 31.914700 23.459605 35.361796 22.182136
-[13] 41.264878  7.808831 22.026979 22.527973
+[11] 30.847396 15.575648 41.264878  7.808831 22.026979
+[16] 22.527973
 > B
- [1] 12.941146 21.270062 13.235378  1.931364 19.232163 27.231465
+ [1] 12.941146 21.270062 13.235378  1.931364 19.232163
- [7] 18.276359  7.308871 27.560815  7.799787 25.017185 19.639663
+ [6] 27.231465 18.276359  7.308871 27.560815  7.799787
-[13] 25.018756 25.302096 28.941002 23.293888
+[11] 25.017185 19.639663 25.018756 25.302096 28.941002
+[16] 23.293888
 > dat <- c(A,B)
 > dat
- [1] 20.994218 31.148068 16.961481 27.240217 28.354539 38.331534
+ [1] 20.994218 31.148068 16.961481 27.240217 28.354539
- [7] 31.914700 23.459605 35.361796 22.182136 30.847396 15.575648
+ [6] 38.331534 31.914700 23.459605 35.361796 22.182136
-[13] 41.264878  7.808831 22.026979 22.527973 12.941146 21.270062
+[11] 30.847396 15.575648 41.264878  7.808831 22.026979
-[19] 13.235378  1.931364 19.232163 27.231465 18.276359  7.308871
+[16] 22.527973 12.941146 21.270062 13.235378  1.931364
-[25] 27.560815  7.799787 25.017185 19.639663 25.018756 25.302096
+[21] 19.232163 27.231465 18.276359  7.308871 27.560815
+[26]  7.799787 25.017185 19.639663 25.018756 25.302096
 [31] 28.941002 23.293888
 >
+> mean.total <- mean(dat)
 > var.total <- var(dat)
+> # variance를 ms라고 부르기도 한다
+> ms.total <- var.total
+>
 > df.total <- length(dat)-1
 > ss.total <- var.total*df.total
 > ss.total.check <- sum((dat-mean(dat))^2)
+>
+> mean.total
+[1] 22.5
+> var.total
+[1] 82.32258
+> ms.total
+[1] 82.32258
+> df.total
+[1] 31
 > ss.total
 [1] 2552
 > ss.total.check
 [1] 2552
-> mean.total <- mean(dat)
-> mean.total
-[1] 22.5
 >
+> # Now for each group
 > mean.a <- mean(A)
 > mean.b <- mean(B)
+> mean.a
+[1] 26
+> mean.b
+[1] 19
+>
+> # 그룹 간의 차이에서 나타나는 분산
+> # 수업시간에 설명을 잘 들을 것
 >
 > # mean.total 에서 그룹a의 평균까지의 차이를 구한 후
-> # 이를 제곱하여 A의 숫자만큼 더한다 =
+> # 이를 제곱하여 그룹 A 멤버의 숫자만큼 더한다 =
 > # 즉, SS를 구하는 방법.
 > # 전체평균에서 그룹평균을 뺀 것의 제곱을
 > # 그룹 구성원 숫자만큼 더하는 것
+> # 그리고 이들을 다시 모두 더하여
+> # ss.between에 저장
 >
->
+> length(A) * ((mean.total - mean.a)^2)
-> length(A)*((mean.total - mean.a)^2)
 [1] 196
-> length(B)*((mean.total - mean.b)^2)
+> length(B) * ((mean.total - mean.b)^2)
 [1] 196
+>
 > ss.between <-
 +   length(A)*((mean.total - mean.a)^2) +
 +   length(B)*((mean.total - mean.b)^2)
->
 > ss.between
+[1] 392
+> # df between group은 연구에 사용된
+> # 그룹의 숫자에서 1을 뺀 숫자
+> df.between <- 2 - 1
+> # 이 그룹 간 차이에 기인하는 분산 값은
+> ms.between <- ss.between / df.between
+> ms.between
 [1] 392
 >
@@ Line 63: / Line 499: @@
 > ss.b <- var(B) * df.b
 > ss.within <- ss.a + ss.b
->
+> df.a <- length(A)-1
-> # Now check this
+> df.b <- length(B)-1
-> ss.total
-[1] 2552
-> ss.between
-[1] 392
-> ss.within
-[1] 2160
-> ss.total == ss.between + ss.within
-[1] FALSE
->
-> # 한편 df는
-> # df.total  30 - 1
-> df.between <- 2-1  # 그룹숫자 - 1
-> df.a <- length(A)-1 # a 구성원 - 1
-> df.b <- length(B)-1 # b 구성원 - 1
 > df.within <- df.a + df.b
+> ms.within <- ss.within / df.within
 >
-> df.total
+> ms.within
-[1] 31
+[1] 72
-> df.between
-[1] 1
-> df.within
-[1] 30
-> df.total == df.between + df.within
-[1] TRUE
 >
-> # 분산을 구하는 방법은 SS/df 이므로
+> # 여기까지 우리는
-> # 분산을 ms 라고 표기하면 우리는
+> # 전체분산
-> # ms.total, ms.between, ms.within을 구할 수 있다
+> # 그룹간분산
+> # 그룹내분산 값을
+> # 구한 것
 >
-> ms.total <- ss.total / df.total
+> # ms.between은 그룹의 차이때문에 생긴
-> ms.between <- ss.between / df.between
-> ms.within <- ss.within / df.within
->
-> # 위에서 ms.between은 그룹의 차이때문에 생긴
 > # 분산으로 IV 혹은 treatment 때문에 생기는
 > # 차이에 기인하는 분산이고
@@ Line 108: / Line 523: @@
 > # t test 때와 마찬가지로
 > # 그룹의 차이 / 랜덤 차이를 (에러 -> 분산은 에러라고도 했다)
-> # 구해볼 수 있다. 이것을 f.calculated 이라고 하고
+> # 구해볼 수 있다.
-> # 이를 프린트아웃 한다
+>
+> # 즉, 그룹갑분산은 사실 = diff (between groups)
+> # 그리고 그룹내 분산은 사실 = re
+> # 따라서 우리는 위 둘 간의 비율을 t test와 같이
+> # 살펴볼 수 있다
+>
 >
+> # 이것을 f.calculated 이라고 하고
 > f.calculated <- ms.between / ms.within
+> # 이 값을 출력해 본다
 > f.calculated
 [1] 5.444444
+> # 이 계산은 차이와 랜덤에러의 비율이
+> # df에 따라서 얼마나 되어야 그 차이가
+> # 충분히 큰 것인지를 판단하기 위해서
+> # 쓰인다. 여기서 df에는 두 가지 종류가
+> # 있다. df.between 그리고 df.within
 >
+> # percentage of f distribution with
+> # df1 and df2 option
+> # 이는 그림의 왼쪽을 나타내므로
+> # 차이가 점점 커지게 되는 오른쪽을
+> # 계산하기 위해서는 1-x를 취한다
+> f.calculated.pvalue <- 1-pf(f.calculated, df1=df.between, df2=df.within)
+> f.calculated.pvalue
+[1] 0.02652366
 > # 한편,  t test를 했었을 때 (A, B 그룹을 가지고 independent
 > # samples t-test를) 아웃 풋은
@@ Line 131: / Line 566: @@
 >
+> # 그리고 f 계산에서의 p value는 t test에서의 p.value와 같다
+> f.calculated.pvalue
+[1] 0.02652366
+> t.result$p.value
+[1] 0.02652366
+>
+> # 또한
 > # 여기엣 t 값은 t.result$statistic 으로 프린트아웃할 수 있다
 > # 이 값이 2.33333 이었다
@@ Line 153: / Line 595: @@
 [1] 2.333333
 >
+> # Now check this
+> ss.total
+[1] 2552
+> ss.between
+[1] 392
+> ss.within
+[1] 2160
+> ss.total
+[1] 2552
+> ss.between + ss.within
+[1] 2552
+>
+> # 한편 df는
+> # df.total  30 - 1
+> df.total
+[1] 31
+> df.between
+[1] 1
+> df.within
+[1] 30
+> df.total
+[1] 31
+> df.between + df.within
+[1] 31
 >
 > # 한 편
 >
 > A
- [1] 20.994218 31.148068 16.961481 27.240217 28.354539 38.331534
+ [1] 20.994218 31.148068 16.961481 27.240217 28.354539
- [7] 31.914700 23.459605 35.361796 22.182136 30.847396 15.575648
+ [6] 38.331534 31.914700 23.459605 35.361796 22.182136
-[13] 41.264878  7.808831 22.026979 22.527973
+[11] 30.847396 15.575648 41.264878  7.808831 22.026979
+[16] 22.527973
 > B
- [1] 12.941146 21.270062 13.235378  1.931364 19.232163 27.231465
+ [1] 12.941146 21.270062 13.235378  1.931364 19.232163
- [7] 18.276359  7.308871 27.560815  7.799787 25.017185 19.639663
+ [6] 27.231465 18.276359  7.308871 27.560815  7.799787
-[13] 25.018756 25.302096 28.941002 23.293888
+[11] 25.017185 19.639663 25.018756 25.302096 28.941002
+[16] 23.293888
 > comb <- stack(list(a=A, b=B))
 > comb
@@ Line 243: / Line 711: @@
 Residuals   30   2160      72
 ---
-Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+Signif. codes:
+‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 > # 위에서 F value는 5.444
 > # 그리고 전체적인 아웃풋을 보면
@@ Line 281: / Line 750: @@
        t
 .333333
->
 </code>