c:ms:2026:lecture_note_week_05
Differences
This shows you the differences between two versions of the page.
| Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
| c:ms:2026:lecture_note_week_05 [2026/04/05 23:20] – hkimscil | c:ms:2026:lecture_note_week_05 [2026/04/07 23:00] (current) – hkimscil | ||
|---|---|---|---|
| Line 1: | Line 1: | ||
| - | ====== | + | ====== |
| + | 독립변인의 효과를 알고 싶을 때, 혹은 모집단의 성격이 참인지 거짓인지 알고 싶을 때 | ||
| + | One sample z-test | ||
| [[:t-test]] 꼭 읽을 것 | [[:t-test]] 꼭 읽을 것 | ||
| Distribution of Sample Means -- mu = 40, sigma = 4 (hence var = 16) 인 모집단에서 n = n 사이즈의 샘플링을 무한 반복할 때 그 샘플평균들이 모인 집합 | Distribution of Sample Means -- mu = 40, sigma = 4 (hence var = 16) 인 모집단에서 n = n 사이즈의 샘플링을 무한 반복할 때 그 샘플평균들이 모인 집합 | ||
| Line 276: | Line 278: | ||
| </ | </ | ||
| </ | </ | ||
| - | ====== One sample t-test with population variance unknown | + | ====== |
| - | one sample t-test라고 부른다 | + | One sample t-test with population variance unknown |
| <tabbox rs.one.sample.t-test.02> | <tabbox rs.one.sample.t-test.02> | ||
| < | < | ||
| Line 335: | Line 337: | ||
| ====== Two sample t-test ====== | ====== Two sample t-test ====== | ||
| Independent group (sample) t-test | Independent group (sample) t-test | ||
| + | [[:two sample t-test]] 문서. 설명문서 볼 것 | ||
| + | [[:r:two sample t-test|two sample t-test in R]] 문서. r script 예제 볼 것 | ||
| <tabbox rscript02> | <tabbox rscript02> | ||
| < | < | ||
| Line 575: | Line 579: | ||
| t & = & \frac{ \overline{X}-\mu}{s_{\overline{X}} }, \quad \text{where } \;\; s_{\overline{X}} = \frac{s}{\sqrt{n}} \\ | t & = & \frac{ \overline{X}-\mu}{s_{\overline{X}} }, \quad \text{where } \;\; s_{\overline{X}} = \frac{s}{\sqrt{n}} \\ | ||
| + | \ | ||
| t & = & \frac{(\overline{X_a}-\overline{X_b})-(\mu_a-\mu_b)}{\sigma_{\text{diff} }} , \;\;\; \\ | t & = & \frac{(\overline{X_a}-\overline{X_b})-(\mu_a-\mu_b)}{\sigma_{\text{diff} }} , \;\;\; \\ | ||
| + | & & \qquad \qquad \;\;\; \mu_a = \mu_b \text{ presumed} \nonumber \\ | ||
| & & \qquad \qquad \;\;\; \text{where } \;\; \sigma_{\text{diff} } = \sqrt{ \frac{s^2_{\text{pooled}}}{n_a} | & & \qquad \qquad \;\;\; \text{where } \;\; \sigma_{\text{diff} } = \sqrt{ \frac{s^2_{\text{pooled}}}{n_a} | ||
| & & \qquad \qquad \;\;\; s^2_{\text{pooled}} = \frac {\text{SS}_a + \text{SS}_b} {df_a + df_b} \nonumber \\ | & & \qquad \qquad \;\;\; s^2_{\text{pooled}} = \frac {\text{SS}_a + \text{SS}_b} {df_a + df_b} \nonumber \\ | ||
| Line 587: | Line 592: | ||
| \end{eqnarray*} | \end{eqnarray*} | ||
| - | ====== Two sample t-test ====== | ||
| - | Independent sample t-test 의 standard error 값을 구하는 방법이 얼른 머리에 들어오지 않으면, | ||
| - | see [[:mean and variance of the sample mean]]: 샘플평균들의 집합이 가지는 평균과 분산 (CLT) | ||
| - | sa [[: | ||
| - | 두 샘플을 취해서 평균을 구한 후 (mean of A, mean of B), 그 차이를 기록하는 것을 무한히 하여 그 분포를 구하는 것은 아래와 같이 정리, 이해할 수 있다. | ||
| - | \begin{eqnarray} | ||
| - | E \left[ \overline{X} - \overline{Y} \right] & = & E \left[ \overline{X} \right] - E \left[ \overline{Y} \right] | ||
| - | & = & {\mu_{X}} - {\mu_{Y}} \\ | ||
| - | V \left[ \overline{X} - \overline{Y} \right] & = & V \left[ \overline{X} \right] + V \left[ \overline{Y} \right] | ||
- | & = & \displaystyle \dfrac {\sigma^{2}_{X}} {n_{X}} + \dfrac {\sigma^{2}_{Y}} {n_{Y}} \\ | ||
| - | \end{eqnarray} | ||
| - | * 위가 말하는 것은 X 모집단과 Y 모집단의 평균이 같다면 거기서 뽑은 두 샘플평균의 차이를 모아 놓은 분포는 | ||
| - | * 1. Normal distribution이고 | ||
| - | * 2. 평균은 두 모집단 평균의 차이가 되고 | ||
| - | * 3. 분산은 위의 식처럼 될 것이다. | ||
| - | |||
| - | |||
| - | <tabbox rs.two.sample.t.test> | ||
| - | < | ||
| - | rm(list=ls()) | ||
| - | rnorm2 <- function(n, | ||
| - | mean+sd*scale(rnorm(n)) | ||
| - | } | ||
| - | |||
| - | ss <- function(x) { | ||
| - | sum((x-mean(x))^2) | ||
| - | } | ||
| - | |||
| - | N.p <- 1000000 | ||
| - | m.p <- 100 | ||
| - | sd.p <- 10 | ||
| - | |||
| - | set.seed(101) | ||
| - | p1 <- rnorm2(N.p, m.p, sd.p) | ||
| - | mean(p1) | ||
| - | sd(p1) | ||
| - | |||
| - | p2 <- rnorm2(N.p, m.p+10, sd.p) | ||
| - | mean(p2) | ||
| - | sd(p2) | ||
| - | |||
| - | s.size <- 50 | ||
| - | |||
| - | iter <- 100000 | ||
| - | # means <- c() | ||
| - | mdiffs <- rep(NA, iter) | ||
| - | means.s1 <- rep(NA, iter) | ||
| - | means.s2 <- rep(NA, iter) | ||
| - | tail(mdiffs) | ||
| - | |||
| - | for (i in 1:iter) { | ||
| - | # means <- append(means, | ||
| - | s1 <- sample(p1, s.size, replace = T) | ||
| - | s2 <- sample(p2, s.size, replace = T) | ||
| - | means.s1[i] <- mean(s1) | ||
| - | means.s2[i] <- mean(s2) | ||
| - | mdiffs[i] <- mean(s1-s2) | ||
| - | } | ||
| - | |||
| - | mu <- mean(p1) - mean(p2) | ||
| - | ms <- var(p1)/ | ||
| - | se <- sqrt(ms) | ||
| - | |||
| - | mu | ||
| - | ms | ||
| - | se | ||
| - | |||
| - | m.diff <- mean(mdiffs) | ||
| - | var.diff <- var(mdiffs) | ||
| - | sd.diff <- sd(mdiffs) | ||
| - | m.diff | ||
| - | var.diff | ||
| - | sd.diff | ||
| - | |||
| - | var(means.s1) | ||
| - | var(p1)/ | ||
| - | var(means.s2) | ||
| - | var(p2)/ | ||
| - | var(means.s1-means.s2) | ||
| - | var(means.s1) + var(means.s2) | ||
| - | var(p1)/ | ||
| - | |||
| - | var.diff <- (var(p1)/ | ||
| - | var.diff | ||
| - | sqrt(var.diff) | ||
| - | se.diff <- sqrt(var.diff) | ||
| - | se.diff | ||
| - | |||
| - | hist(mdiffs, | ||
| - | abline(v=mean(mdiffs), | ||
| - | | ||
| - | ci95 <- se.diff*2 | ||
| - | ci95 | ||
| - | abline(v=c(m.diff-ci95, | ||
| - | | ||
| - | text(x=m.diff, | ||
| - | | ||
| - | pos = 1 | ||
| - | ) | ||
| - | |||
| - | s1 <- sample(p1, s.size, replace = T) | ||
| - | s2 <- sample(p2, s.size, replace = T) | ||
| - | |||
| - | df <- s.size - 1 | ||
| - | pv <- (ss(s1)+ss(s2))/ | ||
| - | pv | ||
| - | ms1 <- ss(s1)/df | ||
| - | ms2 <- ss(s2)/df | ||
| - | ms1 | ||
| - | ms2 | ||
| - | |||
| - | se <- sqrt(ms1/ | ||
| - | se | ||
| - | se.z <- sqrt(pv/ | ||
| - | se.z | ||
| - | |||
| - | diff <- mean(s1)-mean(s2) | ||
| - | t.cal <- diff / se.z | ||
| - | |||
| - | t.test(s1, | ||
| - | |||
| - | t.cal | ||
| - | print(df.tot <- df+df) | ||
| - | print(p.val <- pt(abs(t.cal), | ||
| - | print(mean.diff <- mean(s1)-mean(s2)) | ||
| - | two <- qt(.05/2, df.tot) | ||
| - | two | ||
| - | # two <- -2 | ||
| - | lo2 <- se.z * two | ||
| - | lo2 | ||
| - | mean.diff+c(lo2, | ||
| - | |||
| - | zdiffs <- scale(mdiffs) | ||
| - | se.diff <- sd.diff | ||
| - | hist(zdiffs, | ||
| - | two | ||
| - | abline(v=c(0, | ||
| - | text(x=two, | ||
| - | text(x=two, | ||
| - | abline(v=c(t.cal, | ||
| - | text(x=t.cal, | ||
| - | text(x=t.cal, | ||
| - | p.val | ||
| - | |||
| - | </ | ||
| - | |||
| - | <tabbox ro.two.sample.t.test> | ||
| - | < | ||
| - | > | ||
| - | > rm(list=ls()) | ||
| - | > rnorm2 <- function(n, | ||
| - | + | ||
| - | + } | ||
| - | > | ||
| - | > ss <- function(x) { | ||
| - | + | ||
| - | + } | ||
| - | > | ||
| - | > N.p <- 1000000 | ||
| - | > m.p <- 100 | ||
| - | > sd.p <- 10 | ||
| - | > | ||
| - | > set.seed(101) | ||
| - | > p1 <- rnorm2(N.p, m.p, sd.p) | ||
| - | > mean(p1) | ||
| - | [1] 100 | ||
| - | > sd(p1) | ||
| - | [1] 10 | ||
| - | > | ||
| - | > p2 <- rnorm2(N.p, m.p+10, sd.p) | ||
| - | > mean(p2) | ||
| - | [1] 110 | ||
| - | > sd(p2) | ||
| - | [1] 10 | ||
| - | > | ||
| - | > s.size <- 50 | ||
| - | > | ||
| - | > iter <- 100000 | ||
| - | > # means <- c() | ||
| - | > mdiffs <- rep(NA, iter) | ||
| - | > means.s1 <- rep(NA, iter) | ||
| - | > means.s2 <- rep(NA, iter) | ||
| - | > tail(mdiffs) | ||
| - | [1] NA NA NA NA NA NA | ||
| - | > | ||
| - | > for (i in 1:iter) { | ||
| - | + # means <- append(means, | ||
| - | + s1 <- sample(p1, s.size, replace = T) | ||
| - | + s2 <- sample(p2, s.size, replace = T) | ||
| - | + | ||
| - | + | ||
| - | + | ||
| - | + } | ||
| - | > | ||
| - | > mu <- mean(p1) - mean(p2) | ||
| - | > ms <- var(p1)/ | ||
| - | > se <- sqrt(ms) | ||
| - | > | ||
| - | > mu | ||
| - | [1] -10 | ||
| - | > ms | ||
| - | [,1] | ||
| - | [1,] 4 | ||
| - | > se | ||
| - | [,1] | ||
| - | [1,] 2 | ||
| - | > | ||
| - | > m.diff <- mean(mdiffs) | ||
| - | > var.diff <- var(mdiffs) | ||
| - | > sd.diff <- sd(mdiffs) | ||
| - | > m.diff | ||
| - | [1] -9.988058 | ||
| - | > var.diff | ||
| - | [1] 4.023723 | ||
| - | > sd.diff | ||
| - | [1] 2.005922 | ||
| - | > | ||
| - | > var(means.s1) | ||
| - | [1] 2.002125 | ||
| - | > var(p1)/ | ||
| - | [,1] | ||
| - | [1,] 2 | ||
| - | > var(means.s2) | ||
| - | [1] 2.014368 | ||
| - | > var(p2)/ | ||
| - | [,1] | ||
| - | [1,] 2 | ||
| - | > var(means.s1-means.s2) | ||
| - | [1] 4.023723 | ||
| - | > var(means.s1) + var(means.s2) | ||
| - | [1] 4.016493 | ||
| - | > var(p1)/ | ||
| - | [,1] | ||
| - | [1,] 4 | ||
| - | > | ||
| - | > var.diff <- (var(p1)/ | ||
| - | > var.diff | ||
| - | [,1] | ||
| - | [1,] 4 | ||
| - | > sqrt(var.diff) | ||
| - | [,1] | ||
| - | [1,] 2 | ||
| - | > se.diff <- sqrt(var.diff) | ||
| - | > se.diff | ||
| - | [,1] | ||
| - | [1,] 2 | ||
| - | > | ||
| - | > hist(mdiffs, | ||
| - | > abline(v=mean(mdiffs), | ||
| - | + col=" | ||
| - | > ci95 <- se.diff*2 | ||
| - | > ci95 | ||
| - | [,1] | ||
| - | [1,] 4 | ||
| - | > abline(v=c(m.diff-ci95, | ||
| - | + col=" | ||
| - | > text(x=m.diff, | ||
| - | + labels=paste(round(m.diff-ci95, | ||
| - | + pos = 1 | ||
| - | + ) | ||
| - | > | ||
| - | > s1 <- sample(p1, s.size, replace = T) | ||
| - | > s2 <- sample(p2, s.size, replace = T) | ||
| - | > | ||
| - | > df <- s.size - 1 | ||
| - | > pv <- (ss(s1)+ss(s2))/ | ||
| - | > pv | ||
| - | [1] 106.6359 | ||
| - | > ms1 <- ss(s1)/df | ||
| - | > ms2 <- ss(s2)/df | ||
| - | > ms1 | ||
| - | [1] 105.3544 | ||
| - | > ms2 | ||
| - | [1] 107.9175 | ||
| - | > | ||
| - | > se <- sqrt(ms1/ | ||
| - | > se | ||
| - | [1] 2.065293 | ||
| - | > se.z <- sqrt(pv/ | ||
| - | > se.z | ||
| - | [1] 2.065293 | ||
| - | > | ||
| - | > diff <- mean(s1)-mean(s2) | ||
| - | > t.cal <- diff / se.z | ||
| - | > | ||
| - | > t.test(s1, | ||
| - | |||
| - | Two Sample t-test | ||
| - | |||
| - | data: s1 and s2 | ||
| - | t = -3.3699, df = 98, p-value = 0.001077 | ||
| - | alternative hypothesis: true difference in means is not equal to 0 | ||
| - | 95 percent confidence interval: | ||
| - | | ||
| - | sample estimates: | ||
| - | mean of x mean of y | ||
| - | | ||
| - | |||
| - | > | ||
| - | > t.cal | ||
| - | [1] -3.369909 | ||
| - | > print(df.tot <- df+df) | ||
| - | [1] 98 | ||
| - | > print(p.val <- pt(abs(t.cal), | ||
| - | [1] 0.001076634 | ||
| - | > print(mean.diff <- mean(s1)-mean(s2)) | ||
| - | [1] -6.959851 | ||
| - | > two <- qt(.05/2, df.tot) | ||
| - | > two | ||
| - | [1] -1.984467 | ||
| - | > # two <- -2 | ||
| - | > lo2 <- se.z * two | ||
| - | > lo2 | ||
| - | [1] -4.098508 | ||
| - | > mean.diff+c(lo2, | ||
| - | [1] -11.058359 | ||
| - | > | ||
| - | > zdiffs <- scale(mdiffs) | ||
| - | > se.diff <- sd.diff | ||
| - | > hist(zdiffs, | ||
| - | > two | ||
| - | [1] -1.984467 | ||
| - | > abline(v=c(0, | ||
| - | > text(x=two, | ||
| - | > text(x=two, | ||
| - | > abline(v=c(t.cal, | ||
| - | > text(x=t.cal, | ||
| - | > text(x=t.cal, | ||
| - | > p.val | ||
| - | [1] 0.001076634 | ||
| - | > | ||
| - | </ | ||
| - | </ | ||
| - | {{.: | ||
| - | {{.: | ||
| - | |||
| - | What if s1, s2 are from the same pop? | ||
| - | <tabbox rs.02> | ||
| - | < | ||
| - | ### | ||
| - | # what if s1 and s2 are from | ||
| - | # the same pop? | ||
| - | # the distribution of sample mean | ||
| - | # difference should be | ||
| - | # normal, mu = 0, var = var.p1/ | ||
| - | |||
| - | iter <- 100000 | ||
| - | # means <- c() | ||
| - | mdiffs <- rep(NA, iter) | ||
| - | means.s3 <- rep(NA, iter) | ||
| - | means.s4 <- rep(NA, iter) | ||
| - | tail(mdiffs) | ||
| - | |||
| - | for (i in 1:iter) { | ||
| - | # means <- append(means, | ||
| - | s3 <- sample(p1, s.size, replace = T) | ||
| - | s4 <- sample(p1, s.size, replace = T) | ||
| - | means.s3[i] <- mean(s3) | ||
| - | means.s4[i] <- mean(s4) | ||
| - | mdiffs[i] <- mean(s3-s4) | ||
| - | } | ||
| - | |||
| - | mu <- mean(p1) - mean(p1) | ||
| - | ms <- var(p1)/ | ||
| - | se <- sqrt(ms) | ||
| - | |||
| - | mu | ||
| - | ms | ||
| - | se | ||
| - | |||
| - | m.diff <- mean(mdiffs) | ||
| - | var.diff <- var(mdiffs) | ||
| - | sd.diff <- sd(mdiffs) | ||
| - | m.diff | ||
| - | var.diff | ||
| - | sd.diff | ||
| - | |||
| - | s3 <- sample(p1, s.size, replace=T) | ||
| - | s4 <- sample(p1, s.size, replace=T) | ||
| - | t.test(s3, s4, var.equal=T) | ||
| - | print(m.diff <- mean(s3)-mean(s4)) | ||
| - | # 위의 value는 0을 중심으로 -4 +4 사이에 | ||
| - | # 있을 확률이 95퍼센트이다. | ||
| - | |||
| - | |||
| - | |||
| - | |||
| - | </ | ||
| - | |||
| - | <tabbox ro.02> | ||
| - | < | ||
| - | > ### | ||
| - | > # what if s1 and s2 are from | ||
| - | > # the same pop? | ||
| - | > # the distribution of sample mean | ||
| - | > # difference should be | ||
| - | > # normal, mu = 0, var = var.p1/ | ||
| - | > | ||
| - | > iter <- 100000 | ||
| - | > # means <- c() | ||
| - | > mdiffs <- rep(NA, iter) | ||
| - | > means.s3 <- rep(NA, iter) | ||
| - | > means.s4 <- rep(NA, iter) | ||
| - | > tail(mdiffs) | ||
| - | [1] NA NA NA NA NA NA | ||
| - | > | ||
| - | > for (i in 1:iter) { | ||
| - | + # means <- append(means, | ||
| - | + s3 <- sample(p1, s.size, replace = T) | ||
| - | + s4 <- sample(p1, s.size, replace = T) | ||
| - | + | ||
| - | + | ||
| - | + | ||
| - | + } | ||
| - | > | ||
| - | > mu <- mean(p1) - mean(p1) | ||
| - | > ms <- var(p1)/ | ||
| - | > se <- sqrt(ms) | ||
| - | > | ||
| - | > mu | ||
| - | [1] 0 | ||
| - | > ms | ||
| - | [,1] | ||
| - | [1,] 4 | ||
| - | > se | ||
| - | [,1] | ||
| - | [1,] 2 | ||
| - | > | ||
| - | > m.diff <- mean(mdiffs) | ||
| - | > var.diff <- var(mdiffs) | ||
| - | > sd.diff <- sd(mdiffs) | ||
| - | > m.diff | ||
| - | [1] -0.00273072 | ||
| - | > var.diff | ||
| - | [1] 3.997207 | ||
| - | > sd.diff | ||
| - | [1] 1.999302 | ||
| - | > | ||
| - | > s3 <- sample(p1, s.size, replace=T) | ||
| - | > s4 <- sample(p1, s.size, replace=T) | ||
| - | > t.test(s3, s4, var.equal=T) | ||
| - | |||
| - | Two Sample t-test | ||
| - | |||
| - | data: s3 and s4 | ||
| - | t = 1.7165, df = 98, p-value = 0.08924 | ||
| - | alternative hypothesis: true difference in means is not equal to 0 | ||
| - | 95 percent confidence interval: | ||
| - | | ||
| - | sample estimates: | ||
| - | mean of x mean of y | ||
| - | 103.04431 | ||
| - | |||
| - | > print(m.diff <- mean(s3)-mean(s4)) | ||
| - | [1] 3.544918 | ||
| - | > # 위의 value는 0을 중심으로 -4 +4 사이에 | ||
| - | > # 있을 확률이 95퍼센트이다. | ||
| - | > | ||
| - | </ | ||
| - | </ | ||
c/ms/2026/lecture_note_week_05.1775431210.txt.gz · Last modified: by hkimscil
