Differences

This shows you the differences between two versions of the page.

--- c:ps1-1:note_on_statistical_test_2 [2024/12/11 08:42] – hkimscil
+++ c:ps1-1:note_on_statistical_test_2 [2024/12/11 09:01] (current) – [The meaning of statistical testing] hkimscil
@@ Line 1: / Line 1: @@
+====== The meaning of statistical testing  ======
+difference / random error
 <code>
 ##
@@ Line 55: / Line 57: @@
 mean(s1) + c(-c2, c2)
+##############################
+# two sample t-test
+# independent two sample t-test
+##############################
 set.seed(101)
 A <- rnorm2(16, 26, sqrt(1160/15))
@@ Line 105: / Line 112: @@
 se
 t.calculated
+pt(t.calculated, df=(n.a-1+n.b-1), lower.tail = F)*2
 # Now use t.test function for two group
@@ Line 138: / Line 146: @@
 dat <- c(A,B)
 dat
+# just to remind
+mean(A)
+mean(B)
+sd(A)
+sd(B)
 mean.total <- mean(dat)
 var.total <- var(dat)
+mean.total
+var.total
 # variance를 ms라고 부르기도 한다
 # ms = mean square (square 값을 n 으로 나눈 값이 분산이므로)
@@ Line 191: / Line 207: @@
 ss.a <- var(A) * df.a
 ss.b <- var(B) * df.b
+ss.a
+ss.b
 ss.within <- ss.a + ss.b
@@ Line 196: / Line 215: @@
 df.b <- length(B)-1
 df.within <- df.a + df.b
+df.within
 ms.within <- ss.within / df.within
+ms.within
 # 여기까지 우리는
@@ Line 241: / Line 262: @@
 # 차이가 점점 커지게 되는 오른쪽을
 # 계산하기 위해서는 1-x를 취한다
-f.calculated.pvalue <- 1-pf(f.calculated, df1=df.between, df2=df.within)
+f.calculated.pvalue <- pf(f.calculated,
+                          df1=df.between, df2=df.within,
+                          lower.tail = F)
 f.calculated.pvalue
+# 위가 의미하는 것은?
 # 한편,  t test를 했었을 때 (A, B 그룹을 가지고 independent
 # samples t-test를) 아웃 풋은
@@ Line 324: / Line 350: @@
 t.result$statistic
+</code>
+====== Output ======
+<code>
+>
+>
+> ##
+> ## t-test 이해 확인
+> ## remember t test = diff / rand error
+> ##
+>
+> #
+> iter <- 100000 # # of sampling
+>
+> set.seed(201)
+> n <- 25
+> means <- rep (NA, iter)
+> for(i in 1:iter){
++   means[i] = mean(rnorm(n, mean=100, sd=10))
++ }
+>
+> # from data
+> mean(means)
+[1] 100.0073
+> sd(means)
+[1] 2.006801
+>
+> # from CLT
+> m.means <- 100
+> sd.means <- 10/sqrt(n)
+> m.means
+[1] 100
+> sd.means
+[1] 2
+>
+> # sample n의 평균과 표준편차가 정확히
+> # mean, sd값을 갖도록 하는 변형된 펑션
+>
+> rnorm2 <- function(n, mean, sd) {
++   mean + sd * scale(rnorm(n))
++ }
+>
+> mu <- 100
+> sig <- 10
+> sigsq <- sig^2
+>
+> n <- 100
+> s1 <- rnorm2(n, 105, 10)
+> s1
+            [,1]
+  [1,]  94.82673
+  [2,] 100.67062
+  [3,]  86.56655
+  [4,] 102.50826
+  [5,] 115.06784
+  [6,] 112.31035
+  [7,] 120.18990
+  [8,] 110.75772
+  [9,]  98.72139
+ [10,] 115.53427
+ [11,] 113.08491
+ [12,] 115.15631
+ [13,] 109.03973
+ [14,] 109.80571
+ [15,] 112.59637
+ [16,] 110.05880
+ [17,]  98.12066
+ [18,] 102.55201
+ [19,] 107.48263
+ [20,] 121.72817
+ [21,]  93.28985
+ [22,] 119.32581
+ [23,] 100.53356
+ [24,] 102.47885
+ [25,]  83.84757
+ [26,] 114.45743
+ [27,] 104.89082
+ [28,]  88.02556
+ [29,] 117.62567
+ [30,]  99.43814
+ [31,] 101.09237
+ [32,] 118.65687
+ [33,]  93.25172
+ [34,] 105.57357
+ [35,] 118.83922
+ [36,]  96.54377
+ [37,]  82.22041
+ [38,]  96.65677
+ [39,] 111.01699
+ [40,] 110.31354
+ [41,] 111.67834
+ [42,]  97.06930
+ [43,] 106.48119
+ [44,] 112.62763
+ [45,] 109.52175
+ [46,] 117.83589
+ [47,] 106.59963
+ [48,]  95.41232
+ [49,] 118.29291
+ [50,] 102.73511
+ [51,] 102.07018
+ [52,] 103.52060
+ [53,]  97.40788
+ [54,]  97.62230
+ [55,] 114.44623
+ [56,] 101.93851
+ [57,] 121.84801
+ [58,]  80.19037
+ [59,]  98.76738
+ [60,]  86.28351
+ [61,] 109.94310
+ [62,] 113.33271
+ [63,] 107.60982
+ [64,] 111.13329
+ [65,] 123.62779
+ [66,]  89.83115
+ [67,] 120.45166
+ [68,]  97.78465
+ [69,] 114.37189
+ [70,] 104.57338
+ [71,] 111.63632
+ [72,] 110.01555
+ [73,] 101.47403
+ [74,] 108.06043
+ [75,] 107.34460
+ [76,] 100.78840
+ [77,] 102.86933
+ [78,] 112.86528
+ [79,] 107.89393
+ [80,] 111.71369
+ [81,] 102.47335
+ [82,] 111.80243
+ [83,] 101.35135
+ [84,]  88.35982
+ [85,] 104.51148
+ [86,] 112.76071
+ [87,]  95.75548
+ [88,] 107.23106
+ [89,]  87.08389
+ [90,] 110.88797
+ [91,]  96.50021
+ [92,]  87.04557
+ [93,] 108.36235
+ [94,]  82.16950
+ [95,] 112.75471
+ [96,]  98.58600
+ [97,] 103.24843
+ [98,]  97.48461
+ [99,] 118.06208
+[100,] 109.04152
+attr(,"scaled:center")
+[1] -0.1008262
+attr(,"scaled:scale")
+[1] 1.043405
+>
+> diff <- mean(s1) - mu
+> se <- sig/sqrt(n)
+> diff
+[1] 5
+> se
+[1] 1
+> t.cal <- diff/se
+> t.cal
+[1] 5
+> pt(t.cal, n-1, lower.tail = F)
+[1] 0.000001240698
+>
+> t.test(s1, mu=100)
+	One Sample t-test
+data:  s1
+t = 5, df = 99, p-value = 0.000002481
+alternative hypothesis: true mean is not equal to 100
+percent confidence interval:
+.0158 106.9842
+sample estimates:
+mean of x
+
+>
+> se <- sig/sqrt(n)
+> c2 <- qt(.975, df=99)
+> c2
+[1] 1.984217
+> mean(s1) + c(-c2, c2)
+[1] 103.0158 106.9842
+>
+>
+> ##############################
+> # two sample t-test
+> # independent two sample t-test
+> ##############################
+> set.seed(101)
+> A <- rnorm2(16, 26, sqrt(1160/15))
+> B <- rnorm2(16, 19, sqrt(1000/15))
+> A <- c(A)
+> B <- c(B)
+>
+> # we know sqrt(1160/15) is A's sdev
+> # hence, A's var is sqrt(1160/15)^2
+> # hence, A's SS is sqrt(1160/15)^2 * 15
+> # this is 1160
+>
+> # from the above,
+> # the difference between the A and B means
+> # remember we try to find
+> # difference due to the treatment /
+> # / random chance of error
+> diff <- mean(A) - mean(B)
+>
+> # for se
+> # we know that the situation refers to
+> # #2 se two independent samples t-test
+> # which is sqrt(pooled.var/na + pooled.var/nb)
+>
+> SSa <- 1160
+> SSb <- 1000
+> n.a <- 16
+> n.b <- 16
+> df.a <- n.a - 1
+> df.b <- n.b - 1
+>
+> # 확률과통계 수업 학생은 아래에서
+> # se값을 구한 결과만 보고 이것이
+> # se = sqrt(sigma^2/n)에
+> # 대응하는 값으로 생각하세요
+>
+> # we know that we are testing the difference
+> # between two independent sample means.
+> # Hence, we need to use poole variance between
+> # the two group. See
+> # http://commres.net/wiki/t-test#t-test_%EB%B9%84%EA%B5%90
+> pooled.var <- (SSa + SSb) / (df.a + df.b)
+> se <- sqrt( (pooled.var/n.a) + (pooled.var/n.b) )
+>
+> # Remember t test calculation is based on
+> # diff / random error
+> t.calculated <- diff / se
+> pooled.var
+[1] 72
+> diff
+[1] 7
+> se
+[1] 3
+> t.calculated
+[1] 2.333333
+> pt(t.calculated, df=(n.a-1+n.b-1), lower.tail = F)*2
+[1] 0.02652366
+>
+> # Now use t.test function for two group
+> # (independent sample) t-test
+> # with an assumption that variances of
+> # the two gorup are the same.
+> t.result <- t.test(A, B, var.equal = T)
+> t.result
+	Two Sample t-test
+data:  A and B
+t = 2.3333, df = 30, p-value = 0.02652
+alternative hypothesis: true difference in means is not equal to 0
+percent confidence interval:
+.8731826 13.1268174
+sample estimates:
+mean of x mean of y
+        19
+>
+> # t.result$statistic = t.calculated
+> # t.result$p.value = probability level of
+> # wrong decision with the t calculated value
+> str(t.result)
+List of 10
+ $ statistic  : Named num 2.33
+  ..- attr(*, "names")= chr "t"
+ $ parameter  : Named num 30
+  ..- attr(*, "names")= chr "df"
+ $ p.value    : num 0.0265
+ $ conf.int   : num [1:2] 0.873 13.127
+  ..- attr(*, "conf.level")= num 0.95
+ $ estimate   : Named num [1:2] 26 19
+  ..- attr(*, "names")= chr [1:2] "mean of x" "mean of y"
+ $ null.value : Named num 0
+  ..- attr(*, "names")= chr "difference in means"
+ $ stderr     : num 3
+ $ alternative: chr "two.sided"
+ $ method     : chr " Two Sample t-test"
+ $ data.name  : chr "A and B"
+ - attr(*, "class")= chr "htest"
+> t.result$statistic
+       t
+.333333
+> t.result$p.value
+[1] 0.02652366
+>
+> # the above p.value can be obtained with
+> # pt function
+> p.value.calculated <- 2*pt(t.calculated, df = df.a + df.b, lower.tail=FALSE)
+> p.value.calculated
+[1] 0.02652366
+> t.result$p.value
+[1] 0.02652366
+>
+> ##
+> ## different approach (분산으로 분석)
+> ##
+>
+> # A combined group with group A and B
+> # We call it group total
+> # we can obtain its mean, variance, ss, df, etc.
+> #
+> A
+ [1] 20.994218 31.148068 16.961481 27.240217 28.354539 38.331534
+ [7] 31.914700 23.459605 35.361796 22.182136 30.847396 15.575648
+[13] 41.264878  7.808831 22.026979 22.527973
+> B
+ [1] 12.941146 21.270062 13.235378  1.931364 19.232163 27.231465
+ [7] 18.276359  7.308871 27.560815  7.799787 25.017185 19.639663
+[13] 25.018756 25.302096 28.941002 23.293888
+> dat <- c(A,B)
+> dat
+ [1] 20.994218 31.148068 16.961481 27.240217 28.354539 38.331534
+ [7] 31.914700 23.459605 35.361796 22.182136 30.847396 15.575648
+[13] 41.264878  7.808831 22.026979 22.527973 12.941146 21.270062
+[19] 13.235378  1.931364 19.232163 27.231465 18.276359  7.308871
+[25] 27.560815  7.799787 25.017185 19.639663 25.018756 25.302096
+[31] 28.941002 23.293888
+>
+> # just to remind
+> mean(A)
+[1] 26
+> mean(B)
+[1] 19
+> sd(A)
+[1] 8.793937
+> sd(B)
+[1] 8.164966
+>
+> mean.total <- mean(dat)
+> var.total <- var(dat)
+> mean.total
+[1] 22.5
+> var.total
+[1] 82.32258
+> # variance를 ms라고 부르기도 한다
+> # ms = mean square (square 값을 n 으로 나눈 값이 분산이므로)
+> ms.total <- var.total
+>
+> df.total <- length(dat) - 1
+> ss.total <- var.total * df.total
+> # ss.total 은 아래처럼 구해도 된다
+> ss.total.check <- sum((dat-mean(dat))^2)
+>
+> mean.total
+[1] 22.5
+> var.total
+[1] 82.32258
+> ms.total
+[1] 82.32258
+> df.total
+[1] 31
+> ss.total
+[1] 2552
+> ss.total.check
+[1] 2552
+>
+> # Now for each group
+> mean.a <- mean(A)
+> mean.b <- mean(B)
+> mean.a
+[1] 26
+> mean.b
+[1] 19
+>
+> # 그룹 간의 차이에서 나타나는 분산
+> # 수업시간에 설명을 잘 들을 것
+>
+> # mean.total 에서 그룹a의 평균까지의 차이를 구한 후
+> # 이를 제곱하여 그룹 A 멤버의 숫자만큼 곱한다
+> # 이를 그룹b의 평균과도 다시 계산하고
+> # 이 둘을 더하여
+> # ss.between에 저장
+>
+> length(A) * ((mean.total - mean.a)^2)
+[1] 196
+> length(B) * ((mean.total - mean.b)^2)
+[1] 196
+>
+> ss.between <-
++   length(A)*((mean.total - mean.a)^2) +
++   length(B)*((mean.total - mean.b)^2)
+> ss.between
+[1] 392
+>
+> # df between group은 연구에 사용된
+> # 그룹의 숫자에서 1을 뺀 숫자
+> df.between <- 2 - 1
+>
+> # 이 그룹 간 차이에 기인하는 분산 값은
+> ms.between <- ss.between / df.between
+>
+> # 한편 아래 ss.a 와 ss.b는 각 그룹 내의
+> # 분산을 알아보기 위한 방법
+> ss.a <- var(A) * df.a
+> ss.b <- var(B) * df.b
+> ss.a
+[1] 1160
+> ss.b
+[1] 1000
+>
+> ss.within <- ss.a + ss.b
+>
+> df.a <- length(A)-1
+> df.b <- length(B)-1
+> df.within <- df.a + df.b
+> df.within
+[1] 30
+>
+> ms.within <- ss.within / df.within
+> ms.within
+[1] 72
+>
+> # 여기까지 우리는
+> # 전체분산
+> # 그룹간분산
+> # 그룹내분산 값을
+> # 구한 것
+>
+> # ms.between은 그룹의 차이때문에 생긴
+> # 분산으로 IV 혹은 treatment 때문에 생기는
+> # 차이에 기인하는 분산이다.
+>
+> # ms.within은 각 그룹 내부에서 일어나는 분산이므로
+> # (variation이므로) 연구자의 관심사와는 상관이 없이
+> # 나타나는 random한 차이를 알려주는 분산이라고 하면
+>
+> # t test 때와 마찬가지로
+> # 그룹의 차이 / 랜덤 차이를 (에러 -> 분산은 에러라고도 했다)
+> # 구해볼 수 있다.
+>
+> # 즉, 그룹갑분산은 사실 = diff (between groups)
+> # 그리고 그룹내 분산은 사실 = re
+> # 따라서 우리는 위 둘 간의 비율을 t test와 같이
+> # 살펴볼 수 있다
+>
+> # 이것을 f.calculated 이라고 하고
+> f.calculated <- ms.between / ms.within
+>
+> # 이 값을 출력해 본다
+> f.calculated
+[1] 5.444444
+>
+> # 이 계산은 차이와 랜덤에러의 비율이
+> # df에 따라서 얼마나 되어야 그 차이가
+> # 충분히 큰 것인지를 판단하기 위해서
+> # 쓰인다. 여기서 df에는 두 가지 종류가
+> # 있다. df.between 그리고 df.within
+> # 이 정보를 이용하여 f.calculated 값에 대한
+> # probability를 구할 수 있다 (t-test때 처럼)
+>
+> # percentage of f distribution with
+> # df1 and df2 option
+> # 이는 그림의 왼쪽을 나타내므로
+> # 차이가 점점 커지게 되는 오른쪽을
+> # 계산하기 위해서는 1-x를 취한다
+> f.calculated.pvalue <- pf(f.calculated,
++                           df1=df.between, df2=df.within,
++                           lower.tail = F)
+> f.calculated.pvalue
+[1] 0.02652366
+> # 위가 의미하는 것은?
+>
+>
+> # 한편,  t test를 했었을 때 (A, B 그룹을 가지고 independent
+> # samples t-test를) 아웃 풋은
+> t.result
+	Two Sample t-test
+data:  A and B
+t = 2.3333, df = 30, p-value = 0.02652
+alternative hypothesis: true difference in means is not equal to 0
+percent confidence interval:
+.8731826 13.1268174
+sample estimates:
+mean of x mean of y
+        19
+>
+> # 그리고 f 계산에서의 p value는 t test에서의 p.value와 같다
+> f.calculated.pvalue
+[1] 0.02652366
+> t.result$p.value
+[1] 0.02652366
+>
+> # 또한
+> # 여기엣 t 값은 t.result$statistic 으로 프린트아웃할 수 있다
+> # 이 값이 2.33333 이었다
+> t.result$statistic
+       t
+.333333
+> # 혹은 우리가 계산한 값이었던
+> # t.calculated
+> t.calculated
+[1] 2.333333
+>
+> # 그런데 위의 값을 제곱한 값이 바로 f.calculated 값
+> f.calculated
+[1] 5.444444
+> t.calculated
+[1] 2.333333
+> t.calculated^2
+[1] 5.444444
+>
+> # 혹은 f.calculated 값을 제곱근한 값이 t.calculated
+> f.calculated
+[1] 5.444444
+> sqrt(f.calculated)
+[1] 2.333333
+> t.calculated
+[1] 2.333333
+>
+> # 또한 우리는 계산한 ms.total의 분자 부분인 ss.total은
+> # 독립변인이 영향을 주는 ss.between 값과
+> # 독립변인이 있음에도 불구하고 여전히 남는 ss.within
+> # 으로 이루어짐을 아래처럼 확인한다.
+> ss.total
+[1] 2552
+> ss.between
+[1] 392
+> ss.within
+[1] 2160
+> ss.total
+[1] 2552
+> ss.between + ss.within
+[1] 2552
+>
+> # 한편 df는
+> # df.total  30 - 1
+> df.total
+[1] 31
+> df.between
+[1] 1
+> df.within
+[1] 30
+> df.total
+[1] 31
+> df.between + df.within
+[1] 31
+>
+> # 한 편
+> # 아래는 위의 분산을 쪼개서 분석하는 펑션인
+> # aov를 이용해서 통계분석을 한 결과이다.
+>
+> A
+ [1] 20.994218 31.148068 16.961481 27.240217 28.354539 38.331534
+ [7] 31.914700 23.459605 35.361796 22.182136 30.847396 15.575648
+[13] 41.264878  7.808831 22.026979 22.527973
+> B
+ [1] 12.941146 21.270062 13.235378  1.931364 19.232163 27.231465
+ [7] 18.276359  7.308871 27.560815  7.799787 25.017185 19.639663
+[13] 25.018756 25.302096 28.941002 23.293888
+> comb <- stack(list(a=A, b=B))
+> comb
+      values ind
+  20.994218   a
+  31.148068   a
+  16.961481   a
+  27.240217   a
+  28.354539   a
+  38.331534   a
+  31.914700   a
+  23.459605   a
+  35.361796   a
+22.182136   a
+30.847396   a
+15.575648   a
+41.264878   a
+  7.808831   a
+22.026979   a
+22.527973   a
+12.941146   b
+21.270062   b
+13.235378   b
+  1.931364   b
+19.232163   b
+27.231465   b
+18.276359   b
+  7.308871   b
+27.560815   b
+  7.799787   b
+25.017185   b
+19.639663   b
+25.018756   b
+25.302096   b
+28.941002   b
+23.293888   b
+> colnames(comb)[1] <- "values"
+> colnames(comb)[2] <- "group"
+> comb
+      values group
+  20.994218     a
+  31.148068     a
+  16.961481     a
+  27.240217     a
+  28.354539     a
+  38.331534     a
+  31.914700     a
+  23.459605     a
+  35.361796     a
+22.182136     a
+30.847396     a
+15.575648     a
+41.264878     a
+  7.808831     a
+22.026979     a
+22.527973     a
+12.941146     b
+21.270062     b
+13.235378     b
+  1.931364     b
+19.232163     b
+27.231465     b
+18.276359     b
+  7.308871     b
+27.560815     b
+  7.799787     b
+25.017185     b
+19.639663     b
+25.018756     b
+25.302096     b
+28.941002     b
+23.293888     b
+>
+> a.res <- aov(values ~ group, data=comb)
+> a.res.sum <- summary(a.res)
+> a.res.sum
+            Df Sum Sq Mean Sq F value Pr(>F)
+group        1    392     392   5.444 0.0265 *
+Residuals   30   2160      72
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+> # 위에서 F value는 5.444
+> # 그리고 전체적인 아웃풋을 보면
+> # Df group 과 Df Residuals
+> # Sum Sq group 과 Residuals
+> # Mean Sq (MS) group 과 MS residuals
+>
+> # MS.group =
+> ms.between
+[1] 392
+>
+> # MS.within =
+> ms.within
+[1] 72
+>
+> # F value
+> ms.between / ms.within
+[1] 5.444444
+> f.calculated
+[1] 5.444444
+>
+> # 아래는 기존의 아웃풋에서 확인하는 것
+> str(a.res.sum)
+List of 1
+ $ :Classes ‘anova’ and 'data.frame':	2 obs. of  5 variables:
+  ..$ Df     : num [1:2] 1 30
+  ..$ Sum Sq : num [1:2] 392 2160
+  ..$ Mean Sq: num [1:2] 392 72
+  ..$ F value: num [1:2] 5.44 NA
+  ..$ Pr(>F) : num [1:2] 0.0265 NA
+ - attr(*, "class")= chr [1:2] "summary.aov" "listof"
+> a.res.sum[[1]][1,4]
+[1] 5.444444
+> sqrt(a.res.sum[[1]][1,4])
+[1] 2.333333
+> t.result$statistic
+       t
+.333333
+>
 </code>