Differences

This shows you the differences between two versions of the page.

--- r:sampling_distribution [2025/09/10 07:34] – [output] hkimscil
+++ r:sampling_distribution [2025/09/10 20:41] (current) – [PS1. week02] hkimscil
@@ Line 1: / Line 1: @@
 ====== PS1. week02 ======
 <code>
 rm(list=ls())
@@ Line 32: / Line 31: @@
 var(p1)
-hist(p1, breaks=100, col=rgb(1,1,1,1))
+hist(p1, breaks=50, col = rgb(1, 1, 1, 0.5),
+     main = "histogram of p1 and p2",)
+abline(v=mean(p1), col="black", lwd=3)
+hist(p2, add=T, breaks=50, col=rgb(1,1,.5,.5))
+abline(v=mean(p2), col="red", lwd=3)
+hist(p1, breaks=50, col=rgb(0,.5,.5,.5))
 abline(v=mean(p1),lwd=2)
 abline(v=mean(p1)-sd(p1), lwd=2)
@@ Line 90: / Line 97: @@
 pnorm(1.8)-pnorm(-1.8)
-hist(z.p1, breaks=100, col=rgb(0,0,0,0))
+hist(z.p1, breaks=50, col=rgb(1,0,0,0))
 abline(v=c(m.p1, -1.8, 1.8), col='red')
 -(pnorm(1.8)-pnorm(-1.8))
@@ Line 104: / Line 111: @@
 #
-hist(p1, breaks=100, col=rgb(1,1,1,1))
+hist(p1, breaks=50, col=rgb(.9,.9,.9,.9))
 abline(v=mean(p1),lwd=2)
 abline(v=mean(p1)-sd(p1), lwd=2)
@@ Line 116: / Line 123: @@
 c(a, b)
 c(-1, 1)
+# note that
+.32/2
+pnorm(-1)
+qnorm(.32/2)
+qnorm(pnorm(-1))
 # 95%
@@ Line 122: / Line 134: @@
 c(c, d)
 c(-2,2)
 # 99%
 e <- qnorm(.01/2)
@@ Line 127: / Line 140: @@
 c(e,f)
 c(-3,3)
 pnorm(b)-pnorm(a)
@@ Line 140: / Line 154: @@
 ################################
-hist(p1, breaks=50, col = rgb(1, 0, 0, 0.5),
-     main = "histogram of p1 and p2",)
-abline(v=mean(p1), col="black", lwd=3)
-hist(p2, add=T, breaks=50, col=rgb(0,0,1,.5))
-abline(v=mean(p2), col="violet", lwd=3)
 s.size <- 10
@@ Line 170: / Line 178: @@
 se.s <- sd(means)
-hist(means, breaks=100, col=rgb(.1, 0, 0, .5))
+hist(means, breaks=50,
-abline(v=mean(means), col="red", lwd=2)
+     xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)),
+     col=rgb(1, 1, 1, .5))
+abline(v=mean(means), col="black", lwd=3)
 # now we want to get sd of this distribution
 lo1 <- mean(means)-se.s
@@ Line 180: / Line 189: @@
 lo3 <- mean(means)-3*se.s
 hi3 <- mean(means)+3*se.s
-hist(means,
-     xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)),
-     col = rgb(1, 0, 0, .5))
 abline(v=mean(means), col="black", lwd=2)
-# abline(v=mean(p2), colo='darkgreen', lwd=3)
+# abline(v=mean(p2), colo='darkgreen', lwd=2)
 abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3),
-       col=c("green","green", "blue", "blue", "orange", "orange"),
+       col=c("red","red", "blue", "blue", "orange", "orange"),
        lwd=2)
@@ Line 198: / Line 203: @@
 # sd of sample means (sd(means))
-# is sqrt(var(s1)/s.size) or
-# sd(s1) / sqrt(s.size)
 # = se.s
 # when iter value goes to
-# unlimited value:
+# infinite value:
 # mean(means) = mean(p1)
 # and
 # sd(means) = sd(p1) / sqrt(s.size)
-# that is, sd(means) = se.z
+# that is, se.s = se.z
 # This is called CLT (Central Limit Theorem)
+# see http://commres.net/wiki/cetral_limit_theorem
 mean(means)
 mean(p1)
 sd(means)
 var(p1)
+# remember we started talking sample size 10
 sqrt(var(p1)/s.size)
 se.z
@@ Line 237: / Line 243: @@
-hist(means,
+hist(means, breaks=50,
      xlim = c(mean(means)-5*sd(means), mean(means)+10*sd(means)),
-     col = rgb(1, 0, 0, .5))
+     col = rgb(1, 1, 1, .5))
-abline(v=mean(means), col="black", lwd=2)
+abline(v=mean(means), col="black", lwd=3)
 # abline(v=mean(p2), colo='darkgreen', lwd=3)
 abline(v=c(lo1, hi1, lo2, hi2, lo3, hi3),
-       col=c("green","green", "blue", "blue", "orange", "orange"),
+       col=c("darkgreen","darkgreen", "blue", "blue", "orange", "orange"),
        lwd=2)
@@ Line 257: / Line 263: @@
 m.sample.i.got
-hist(means,
+hist(means, breaks=30,
-     xlim = c(mean(means)-10*sd(means), mean(means)+10*sd(means)),
+     xlim = c(mean(means)-7*sd(means), mean(means)+10*sd(means)),
-     col = rgb(1, 0, 0, .5))
+     col = rgb(1, 1, 1, .5))
 abline(v=mean(means), col="black", lwd=3)
 abline(v=m.sample.i.got, col='darkgreen', lwd=3)
@@ Line 276: / Line 282: @@
 # (green line)
 tmp <- mean(means) - (m.sample.i.got - mean(means))
-abline(v=tmp, col='green', lwd=3)
+abline(v=tmp, col='red', lwd=3)
 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)
 m.sample.i.got
 ### one more time
+# this time, with a story
 mean(p2)
 sd(p2)
@@ Line 287: / Line 294: @@
 m.sample.i.got
-hist(means,
+tmp <- mean(means) - (m.sample.i.got-mean(means))
-     xlim = c(mean(means)-15*sd(means), mean(means)+15*sd(means)),
+tmp
-     col = rgb(1, 0, 0, .5))
-abline(v=mean(means), col="black", lwd=2)
+hist(means, breaks=30,
-abline(v=m.sample.i.got, col='darkgreen', lwd=2)
+     xlim = c(tmp-4*sd(means), m.sample.i.got+4*sd(means)),
+     col = rgb(1, 1, 1, .5))
+abline(v=mean(means), col="black", lwd=3)
+abline(v=m.sample.i.got, col='blue', lwd=3)
 # what is the probablity of getting
@@ Line 304: / Line 314: @@
 # mean(means) - m.sample.i.got - mean(means)
 # (green line)
-tmp <- mean(means) - (m.sample.i.got - mean(means))
+abline(v=tmp, col='red', lwd=3)
-abline(v=tmp, col='green', lwd=2)
 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)
 </code>
 ====== output ======
@@ Line 373: / Line 378: @@
 </WRAP>
 </WRAP>
+===== pnorm =====
 모집단 P1의 히스토그램 (histgram of P1)
 {{:r:pasted:20250910-065721.png}}
@@ Line 494: / Line 499: @@
 </WRAP>
 </WRAP>
+===== z score, 표준점수 =====
 <WRAP group>
@@ Line 554: / Line 560: @@
 p1 모집단의 모든 원소를 표준점수화 하기
   * z.p1 <- (p1-mean(p1))/sd(p1)
-  * 평균과 표주편차는 각각 0, 1이 된다 (mean(z.p1), sd(z.p1))
+  * 평균과 표준편차는 각각 0, 1이 된다 (mean(z.p1), sd(z.p1))
   * 그렇다면 이 표준점수화한 분포에서 -1.8과 1.8 사이를 제외한 바깥 쪽 부분의 면적은 (probability는)
   * 1-(p(1.8)-p(-1.8))과 같이 구할 수 있다.
@@ Line 585: / Line 591: @@
 </WRAP>
+===== qnorm =====
+{{:r:pasted:20250910-075030.png}}
 <WRAP group>
 <WRAP column half>
@@ Line 642: / Line 649: @@
 </WRAP>
 <WRAP column half>
-....
+qnorm
+  * qnorm는 pnorm의 반대값을 구하는 명령어
+  * 히스토그램에서 검정 색 부분의 바깥 쪽 부분은 32%이고 왼 쪽의 것은 이것의 반인 16% 이다.
+  * 이 16% 에 해당하는 표준점수가 무엇인가를 묻는 질문이 ''qnorm(.32/2)'' 혹은 ''qnorm(.32/2, 0, 1)''
+  * 원점수일 경우에 대입해서 물어본다면 ''qnorm(.32/2, 100, 10)''과 같이 하면 된다 (혹은 qnorm(.32/2, m.p1, sd.p1) ).
+  * 위의 명령어에 대한 답은 우리가 가장 쉽게 이해한 방법에 의하면
+  * -1 혹은 90 이렇게 될 것이다.
+  * 하지만 위에서 봤던 것처럼 pnorm(1)-pnorm(-1)이 정확히
+  * 0.6826895 인 것처럼, 우리가 이해한 방법은 정확한 값을 구해주지는 않는다.
+  * 마찬가지로 qnorm(0.05/2)와 qnorm(1-(0.05/2))는
+  * 표준편차 값인 1이 왼쪽으로 두개 내려가고 오른 쪽으로 두개 올라간 값의 quotient값을 구하는 것이므로
+  * 즉, 위의 histogram에서 붉은 색 부분의 바깥 쪽 면적에 해당하는 점수를 물어보는 것이므로
+  * -2와 2라고 대답해도 되지만 정확한 답은 아래와 같다 (1.96).
+<code>
+> qnorm(0.05/2)
+[1] -1.959964
+> qnorm(1-(0.05/2))
+[1] 1.959964
+>
+</code>
+  * 이에 해당하는 원점수 (p1 원소에 해당하는) 값은 80과 120이 아니라 아래와 같을 것이다.
+<code>
+> qnorm(0.05/2, 100, 10)
+[1] 80.40036
+> qnorm(1-(0.05/2), 100, 10)
+[1] 119.5996
+>
+</code>
 </WRAP>
 </WRAP>
 <WRAP group>
@@ Line 658: / Line 691: @@
 >
 </code>
+</WRAP>
+<WRAP column half>
+</WRAP>
+</WRAP>
+===== distribution of sample means =====
+<WRAP group>
+<WRAP column half>
 <code>
 > s.size <- 10
@@ Line 678: / Line 718: @@
 <WRAP column half>
 ....
+  * s.size는 10으로 우선 고정하고
+  * 한 샘플을 취하여 그 평균값을 means.temp 메모리에 add한다 (저장하는 것이 아니라 means.temp에 붙힌다).
+  * 그 다음 샘플의 평균을 다시 means.temp에 저장한다
+  * 그 다음 . . . 모두 5번을 하고 출력을 해본다
 </WRAP>
 </WRAP>
 <WRAP group>
@@ Line 709: / Line 753: @@
 </WRAP>
 </WRAP>
+{{:r:pasted:20250910-080946.png}}
@@ Line 735: / Line 780: @@
 <WRAP column half>
 ....
+  * 위 백만개의 샘플평균이 모인 집합의 히스토그램을 그리고
+  * 그 집합의 표준편차값을 수직선으로 표시하기 위해서
+  * mean(means) +- se.s 와 같은 방법을 쓴 후 그래프로 그린다.
+  * 아래에서 선 하나씩의 길이는 means 집합의 (distribution of sample means)
+  * 표준편차값이다.
+  * 이 표준편차 값을 위에서 sd(means)로 구한 후에 se.s로 저장한 적이 있다.
+  * 그리고 그 값은 3.161886 이었다.
+<code>
+> se.s
+[1] 3.161886
+</code>
 </WRAP>
 </WRAP>
+{{:r:pasted:20250910-081358.png}}
 <WRAP group>
@@ Line 810: / Line 868: @@
 <WRAP column half>
 ....
+  * 그런데 이 값은 (se.s = 3.161886)
+  * se.z 를 구하는 방법과 거의 같은 값을 갖는다 3.162278
+<code>
+> se.z <- sqrt(var(p1)/s.size)
+> se.z <- c(se.z)
+> se.z
+[1] 3.162278
+</code>
+  * 사실, 우리가 백만 번의 샘플을 취해서 구한 means 집합의 평균과 표준편차 값은
+    * 이것을 정확하게는 mean of the distribution of sample means,
+    * standard deviation of the distribution of sample means 라고 부른다
+    * 그리고 standard deviation of the distribution of sample means를 흔히
+    * standard error라고 부른다.
+    * script에서 se.s 그리고 se.z 와 같은 se를 쓴 이유도 standard error 값을 구한 것이기 때문이다.
+  * 만약에 백만 번이 아니라 무한 대로 더 큰 숫자를 사용한다고 하면
+  * 위의 se.z 값을 구하는 식의 값을 갖게 된다. 이것을 말로 풀어서 설명하면
+  * 샘플평균들의 집합에서 표준편차 값은
+  * 원래 모집단의 분산값을 샘플사이즈로 나누어준 값에 제곱근을 씌워서 구할 수 있다이다 (아래 Latex 정리에서 3번)
+\begin{eqnarray*}
+\overline{X} \sim \displaystyle \text{N} \left(\mu, \dfrac{\sigma^{2}}{n} \right)
+\end{eqnarray*}
+\begin{eqnarray}
+& & \text{Normal distribution of sample means.} \\
+& & \mu_{\overline{X}} = \mu \\
+& & (\sigma_{\overline{X}})^2 = \frac{\sigma^2}{n} \;\; \text{or} \;\; \sigma_{\overline{X}} = \frac{\sigma}{\sqrt{n}}
+\end{eqnarray}
+see [[:Central Limit Theorem]] and [[:Sampling Distribution]]
+  * 즉, 샘플평균을 모은 집합의 분산값은 그 샘플이 추출된 원래 population의 분산값을 샘플크기로 (sample size) 나누어 준 값이다.
+  * 즉, '' var(means) = var(p1) / s.size ''
+  * 따라서, ''std(means) = sqrt(var(p1) / s.size) ''
+  * 더하여 그 샘플평균 집합의 평균 값은 population의 평균값이 된다
+  * 즉, '' mean(means) = mean(p1) ''
+  * 따라서 lo, hi에 해당하는 means분포의 값을 mean(means) +- sd(means)로 구했었는데,
+  * 샘플평균의 분포를 무한대 번을 했다고 하면 사실 이 값은
+  * mean(p1) +- se.z 로 구하는 것이 정확할 것이다.
+  * 여기서 ''se.z = sqrt(var(p1)/s.size))''
+  * loz1 - hiz1, loz2 - hiz2 값들은 이렇게 구한 값들이다.
+참고
+<code>
+> lo1 <- mean(means)-se.s
+> hi1 <- mean(means)+se.s
+> lo2 <- mean(means)-2*se.s
+> hi2 <- mean(means)+2*se.s
+> lo3 <- mean(means)-3*se.s
+> hi3 <- mean(means)+3*se.s
+</code>
 </WRAP>
 </WRAP>
+{{:r:pasted:20250910-083710.png}}
 <WRAP group>
 <WRAP column half>
@@ Line 847: / Line 959: @@
 </WRAP>
+{{:r:pasted:20250910-084125.png}}
 <WRAP group>
 <WRAP column half>
@@ Line 877: / Line 989: @@
 > # (green line)
 > tmp <- mean(means) - (m.sample.i.got - mean(means))
+> tmp
+[1] 95.26505
 > abline(v=tmp, col='green', lwd=3)
 > 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)
@@ Line 887: / Line 1001: @@
 <WRAP column half>
 ....
+  * 만약에 내가 한 샘플을 취해서 평균값을 살펴보니
+  * m.sample.i.got 값이었다고 하자 (104.7383).
+  * 이 값보다 큰 값이거나 아니면
+  * 이 값에 해당하는 평균 반대편 값보다 작은 값이 값이
+  * 나올 확률은 무엇인가?
+  * 즉, 녹색선과 연두색 선 바깥 쪽 부분의 probability 값은?
+  * 아래처럼 구해서 13.4% 정도가 된다
+  * 즉, 모집단 p1에서
+  * 무작위로 샘플링을 하여 (see [[:sampling]] and probability sampling)
+  * s.size의 (10) 샘플을 취했는데 그 샘플의 평균이
+  * 104.7383 점보다 크거나 혹은
+  * 95.26505 점보다 작을 확률은 13.4% 이다.
+<code>
+> 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)
+[1] 0.1339882
+</code>
 </WRAP>
 </WRAP>
+===== Last one . . . Important =====
 <WRAP group>
 <WRAP column half>
@@ Line 900: / Line 1031: @@
 [1] 10
 > sample(p2, s.size)
- [1] 120.9639 119.8341 134.0344 122.5446 121.4557 113.6820 114.7603 105.2138 122.7027 125.4131
+ [1] 105.57868  94.91021 117.73624 118.05456 118.65741 126.66806 136.65185 117.02986
+ [9] 116.41683 105.91887
 > m.sample.i.got <- mean(sample(p2, s.size))
 > m.sample.i.got
-[1] 120.2492
+[1] 114.155
->
+> temp <- mean(means) - (m.sample.i.got-mean(means))
-> hist(means,
+> temp
-+      xlim = c(mean(means)-15*sd(means), mean(means)+15*sd(means)),
+[1] 85.84835
+> hist(means, breaks=30,
++      xlim = c(temp-4*sd(means), m.sample.i.got+4*sd(means)),
 +      col = rgb(1, 0, 0, .5))
 > abline(v=mean(means), col="black", lwd=2)
-> abline(v=m.sample.i.got, col='darkgreen', lwd=2)
+> abline(v=m.sample.i.got, col='green', lwd=2)
->
+> abline(v=temp, col='blue', lwd=2)
 > # what is the probablity of getting
 > # greater than
 > # m.sample.i.got?
 > m.sample.i.got
-[1] 120.2492
+[1] 114.155
 > pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)
-[1] 7.560352e-11
+[1] 3.820806e-06
->
 > # then, what is the probabilty of getting
 > # greater than m.sample.i.got and
@@ Line 927: / Line 1060: @@
 > abline(v=tmp, col='green', lwd=2)
 > 2 * pnorm(m.sample.i.got, mean(p1), sd(means), lower.tail = F)
-[1] 1.51207e-10
+[1] 7.641611e-06
 >
 </code>
@@ Line 933: / Line 1066: @@
 <WRAP column half>
 ....
+다른 모집단에서 온 샘플 (sample from other population, p2)
+이전 처럼 10명을 샘플로 뽑는데, p2에서 뽑는다. 따라서 이 샘플의 평균은 114.155 이다. 그런데 연구자는 이 샘플이 나온 모집단은 전혀 모르는 상태이다. 모집단의 평균도 모르고 분산값도 모른다. 그냥 10명을 샘플로 뽑았고, 이들이 p1에서 나왔을 확률을 알아보려고 한다. 이를 알아보기 위해서 스크립트를 돌려 보니, blue와 green라인 밖의 범위가 나올 확률은 7.641611e-06로 0에 가깝다. 따라서, 이 샘플은 10명은 p1에 속한 원소가 아닌 다른 성격을 가진 원소이다.
+위에서 p2의 parameter에 대해서 잘 모른다는 점에 주목하라. 그리고 아래 시나리오를 상상하라.
+어느 한 모집단의 IQ 평균이 100 이고 표준편차가 10 임을 알고 있다. 확률과통계 교수는 머리가 좋아지는 약을 개발하여 이를 팔아보려고 하고 있다. 이를 위해서 확통교수는 10을 뽑아서 머리가 좋아지는 약을 복용하도록 한 후에 IQ를 측정하였다. 그런데, 그 IQ 평균이 114.155 이다. 이 점수를 가지고 약의 효과가 있는지 검증을 해보력고 한다.
 </WRAP>
 </WRAP>
+{{:r:pasted:20250910-135442.png}}