c:ms:2026:lecture_note_week_04
This is an old revision of the document!
퀴즈 1 문제 중
모집단 Mean = 180; SD = 20, 정규분포일때
49. N=16의 샘플을 추출할 때 샘플들의 평균 분포가 갖는 표준편차 값은?
standard error 값을 묻는 질문이므로
se = sigma / sqrt(n) = 20 / 4 = 5
50. n=400일 때 샘플평균들의 분포가 갖는 표준편차 값은?
se = sigma / sqrt(n) = 20 / sqrt(400) = 20 / 20 = 1
51. n=100 의 크기의 샘플을 취한다고 할 때 이 샘플의 평균값이 나올 구간을 99퍼센트의 확신성을 가지고 구하시오.
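52. 위와 합산
n=100 이므로 se = 20 / sqrt(100) = 2 이고, 99퍼센트의 구간은 평균에서 위로 3 se, 아래로 3 se 를 포함하는 구간으로 보므로 180-6, 180+6, 즉 174 ~ 186 이 정답 (±3 se 는 어림값이며, 정확히는 ±3 se 가 약 99.7%, 99% 는 qnorm(.995) ≈ 2.58 se 에 해당한다)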
위의 문제는 모두 샘플평균들을 모아 놓은 집합의 평균과 표준편차(표준오차)를 구하는 문제이다.
즉, 뮤(μ) = 180, 시그마(σ) = 20 일 때,
샘플평균들의 표준편차는 아래처럼 각각 5, 1, 2가 된다.
> rm(list=ls())
> rnorm2 <- function(n,mean,sd){
+ mean+sd*scale(rnorm(n))
+ }
> ss <- function(x) {
+ sum((x-mean(x))^2)
+ }
>
> mu = 180
> sigma = 20
> var = 400
>
> n.a <- 16
> n.b <- 400
> n.c <- 100
>
> se.a <- sigma/sqrt(n.a)
> se.b <- sigma/sqrt(n.b)
> se.c <- sigma/sqrt(n.c)
> se.a
[1] 5
> se.b
[1] 1
> se.c
[1] 2
Recap
Distribution of Sample Means – mu = 40, sigma = 4 (hence var = 16) 인 모집단에서 n = n 사이즈의 샘플링을 무한 반복할 때 그 샘플평균들이 모인 집합
rscript01
# Start from a clean workspace: rm() removes every object that ls() lists
# in the current environment.
# NOTE(review): in an interactive session this also deletes unrelated
# objects the user may still need.
rm(list=ls())
# Draw n normal random values and rescale them so that the sample mean
# equals `mean` and the sample standard deviation equals `sd` (up to
# floating-point error).
#
# n    : number of values to draw
# mean : mean the returned sample should have
# sd   : standard deviation the returned sample should have
#
# Returns an n x 1 matrix; it still carries the "scaled:center" and
# "scaled:scale" attributes that scale() attaches.
rnorm2 <- function(n, mean, sd) {
  standardized <- scale(rnorm(n))
  mean + sd * standardized
}
# Sum of squares: the sum of squared deviations of x from its own mean.
#
# x : numeric vector (or matrix) of observations
#
# Returns a single number.
ss <- function(x) {
  deviations <- x - mean(x)
  sum(deviations^2)
}
# Population parameters and simulation settings.
mu <- 40          # population mean
sigma <- 4        # population standard deviation
iter <- 1000000   # number of sample means to simulate
sz <- 16          # size of each sample
# Standard error of the mean. It is derived from sz (not a literal 16) so
# that changing the sample size above is automatically reflected here.
se <- sigma/sqrt(sz)
################################
# Simulate `iter` sample means centred on mu with standard deviation se,
# and draw their histogram.
means <- rnorm2(iter, mu, se)
hist(
  means,
  breaks = 50,
  xlim = mu + c(-6, 6) * se,
  main = paste("sampling distribution")
)
abline(v = mu, col = "black", lwd = 2)

# Boundaries 1, 2 and 3 standard errors below (lo) and above (hi) mu.
lo1 <- mu - 1 * se
lo2 <- mu - 2 * se
lo3 <- mu - 3 * se
hi1 <- mu + 1 * se
hi2 <- mu + 2 * se
hi3 <- mu + 3 * se

# One colour per distance (1 se green, 2 se blue, 3 se black); the colour
# vector is repeated so the lower and upper boundaries match.
abline(
  v = c(lo1, lo2, lo3, hi1, hi2, hi3),
  col = rep(c("green", "blue", "black"), times = 2),
  lwd = 2
)

# Interval of mu +- 2 standard errors.
print(c(lo2, hi2))
# z-test of an observed sample mean against the sampling distribution.
m.samp <- 37
# Two-tailed p-value on the raw scale: twice the lower-tail area below the
# point that lies as far under mu as m.samp is away from mu. Using abs()
# keeps the result a valid probability whether m.samp is below or above mu
# (pnorm(m.samp, mu, se)*2 would exceed 1 for m.samp > mu).
p.val <- pnorm(mu - abs(m.samp - mu), mu, se)*2
p.val
# Standardized distance of the sample mean from mu, in standard errors.
z.cal <- (m.samp-mu)/se
z.cal
# The same two-tailed p-value computed from the z score.
p.val <- pnorm(-abs(z.cal))*2
p.val
# Standardize the simulated sample means and plot them on the z scale.
zmeans <- scale(means)
hist(
  zmeans,
  breaks = 50,
  xlim = c(-10, 10),
  main = "normalized distribution\nof sample means"
)
abline(v = 0, col = "black", lwd = 2)
# Observed z score (blue) and its mirror image on the other side (green).
abline(v = z.cal, col = "blue", lwd = 2)
abline(v = -z.cal, col = "green", lwd = 2)
# Annotate the plot with the z score, its mirror value and the p-value.
text(
  x = -6, y = 50000,
  labels = paste("z.cal =", z.cal),
  pos = 1, col = "blue", cex = 1
)
text(
  x = 4, y = 50000,
  labels = paste(-z.cal),
  pos = 1, col = "green", cex = 1
)
text(
  x = -6, y = 30000,
  labels = paste("pnorm(z.cal)*2 =", "\n", round(p.val, 3)),
  pos = 1, col = "red", cex = .8
)

# Same histogram again, this time with reference lines at +-1, +-2, +-3.
hist(
  zmeans,
  breaks = 50,
  xlim = c(-10, 10),
  main = "normalized distribution\nof sample means"
)
abline(v = 0, col = "black", lwd = 2)
abline(
  v = c(-1, -2, -3, 1, 2, 3),
  col = rep(c("green", "blue", "black"), times = 2),
  lwd = 2
)

z.cal
p.val
#####
# By the idea above, we only need to
# check whether z.cal lies outside +-2,
# i.e. whether its probability is
# smaller than 0.05.
#####
# +-2 is not the exact value; the
# exact cutoff is the number given by
# qnorm(.05/2).
two.minus.exact <- qnorm(0.05 / 2)
two.plus.exact <- qnorm(1 - 0.05 / 2)
c(two.minus.exact, two.plus.exact)
#####
# When using R, however, judge by the
# directly computed probability rather
# than by the z score.
# -abs() makes this a valid two-tailed p-value for a z score of either
# sign (pnorm(z.cal)*2 exceeds 1 when z.cal is positive).
pnorm(-abs(z.cal))*2
p.val
#####
# Note that above, the difference
# between the groups is divided by
# the standard error.
#
################
# One-sample t-test computed by hand, then checked against t.test().
m.samp <- 43
sd.samp <- 4
sz <- 16
# Sample of size sz whose mean is m.samp and whose SD is sd.samp.
samp <- rnorm2(sz, m.samp, sd.samp)
diff <- m.samp - mu
# Standard error estimated from the sample SD (sigma treated as unknown).
se <- sd.samp / sqrt(sz)
t.cal <- diff/se
df <- sz-1
# Two-tailed p-value. abs() keeps it valid when the sample mean is below mu
# (negative t.cal); FALSE is spelled out because F can be reassigned.
p.val <- pt(abs(t.cal), df=df, lower.tail = FALSE)*2
t.cal
df
p.val
t.test(samp, mu=mu)
out01
> rm(list=ls())
> rnorm2 <- function(n,mean,sd){
+ mean+sd*scale(rnorm(n))
+ }
> ss <- function(x) {
+ sum((x-mean(x))^2)
+ }
>
> mu <- 40
> sigma <- 4
> iter <- 1000000
> sz <- 16
> se <- sigma/sqrt(16)
> ################################
> means <- rnorm2(iter, mu, se)
> hist(means, breaks=50,
+ xlim = c(mu-6*se, mu+6*se),
+ main = paste("sampling distribution"))
> abline(v=mu, col='black', lwd=2)
> lo1 <- mu - se*1
> hi1 <- mu + se*1
> lo2 <- mu - se*2
> hi2 <- mu + se*2
> lo3 <- mu - se*3
> hi3 <- mu + se*3
>
> abline(v=c(lo1, lo2, lo3, hi1, hi2, hi3),
+ col=c("green","blue", "black"),
+ lwd=2)
>
> print(c(lo2, hi2))
[1] 38 42
>
> m.samp <- 37
> p.val <- pnorm(m.samp, mu, se)*2
> p.val
[1] 0.002699796
> z.cal <- (m.samp-mu)/se
> z.cal
[1] -3
> p.val <- pnorm(z.cal)*2
> p.val
[1] 0.002699796
>
> zmeans <- scale(means)
> hist(zmeans, breaks=50,
+ xlim = c(0-10*1, 0+10*1),
+ main=("normalized distribution\nof sample means"))
> abline(v=0, col="black", lwd=2)
> abline(v=z.cal, col='blue', lwd=2)
> abline(v=-z.cal, col="green", lwd=2)
> text(x=-6, y=50000,
+ label=paste("z.cal =", z.cal),
+ pos = 1,
+ col="blue", cex=1)
> text(x=4, y=50000,
+ label=paste(-z.cal),
+ pos=1,
+ col="green", cex=1)
> text(x=-6, y=30000,
+ label=paste("pnorm(z.cal)*2 =", "\n",
+ round(p.val,3)),
+ pos = 1,
+ col="red", cex=.8)
>
> hist(zmeans, breaks=50,
+ xlim = c(0-10*1, 0+10*1),
+ main=("normalized distribution\nof sample means"))
> abline(v=0, col="black", lwd=2)
> abline(v=c(-1,-2,-3,1,2,3),
+ col=c("green", "blue", "black"), lwd=2)
>
> z.cal
[1] -3
> p.val
[1] 0.002699796
> #####
> # 위의 아이디어로는 z.cal 점수가
> # +-2 밖에 있는지 보면 된다. 즉,
> # 이는 prob가 0.05보다 작은지
> # 보면 되는 것이다.
> #####
> # +-2 는 정확한 숫자가 아니고
> # qnorm(.05/2) 에 해당하는 숫자
> # 가 정확한 숫자
> two.minus.exact <- qnorm(.05/2)
> two.plus.exact <- qnorm(1-(.05/2))
> c(two.minus.exact, two.plus.exact)
[1] -1.959964 1.959964
> #####
> # 그러나 R 사용시에는 z 점수로
> # 판단하기 보다는
> # 직접 구하는 prob.로 판단
> pnorm(z.cal)*2
[1] 0.002699796
> p.val
[1] 0.002699796
> #####
> # 위에서 그룹 간의 차이를
> # standard error로 나누는 것에 주의
> #
>
>
> ################
> m.samp <- 43
> sd.samp <- 4
> sz <- 16
> samp <- rnorm2(sz, m.samp, sd.samp)
> diff <- m.samp - mu
> se <- sd.samp / sqrt(sz)
> t.cal <- diff/se
> df <- sz-1
> p.val <- pt(t.cal, df=df, lower.tail = F)*2
> t.cal
[1] 3
> df
[1] 15
> p.val
[1] 0.008972737
> t.test(samp, mu=mu)
One Sample t-test
data: samp
t = 3, df = 15, p-value = 0.008973
alternative hypothesis: true mean is not equal to 40
95 percent confidence interval:
40.86855 45.13145
sample estimates:
mean of x
43
>
rscript02
#####
# Independent two-sample t-test (equal variances), computed by hand and
# then checked against t.test(). Uses rnorm2() and ss() defined earlier.
#####
m.a <- 5.8
m.b <- 6.3
sd.a <- .5
sd.b <- .5
sz.a <- 16
sz.b <- 16
df.a <- sz.a-1
df.b <- sz.b-1
df <- df.a + df.b
a <- rnorm2(sz.a, m.a, sd.a)
b <- rnorm2(sz.b, m.b, sd.b)
diff <- m.a - m.b
# Pooled variance: combined sum of squares over combined degrees of freedom.
pv <- (ss(a)+ss(b))/(df.a+df.b)
# Standard error of the difference between the two means.
se <- sqrt(pv/sz.a+pv/sz.b)
t.cal <- diff / se
# Two-tailed p-value; -abs() keeps it valid for a t value of either sign.
p.val <- pt(-abs(t.cal), df=df)*2

diff
se
t.cal
df
p.val
t.test(a,b, var.equal = TRUE)
# Rough 95% confidence interval using +-2 standard errors.
diff - se*2
diff + se*2
# Exact 95% confidence interval using the t critical value for df.
lo <- qt(.05/2,df)
lo
hi <- -lo
diff + se*lo
diff + se*hi

#####
# t-test repeated measure
#####
m.t1 <- 103
m.t2 <- 111
sd.t1 <- 10
sd.t2 <- 10
sz <- 16
t1 <- rnorm2(sz, m.t1, sd.t1)
t2 <- rnorm2(sz, m.t2, sd.t2)
t1
t2
mdiff <- m.t1-m.t2
# Per-subject differences; the test is carried out on these.
diff <- t1-t2
sd.diff <- sd(diff)
se <- sd.diff/sqrt(sz)
t.cal <- mdiff/se
# Two-tailed p-value; -abs() keeps it valid for a t value of either sign.
p.val <- pt(-abs(t.cal), df=sz-1)*2
t.cal
sz-1
p.val
t.test(t1,t2, paired=TRUE)
# 95% confidence interval of the mean difference from the t critical value.
two <- qt(.05/2, df=sz-1)
two
lo <- se*two
hi <- -lo
c(lo, hi)
c(mdiff+lo, mdiff+hi)
rout02
> #####
> #
> m.a <- 5.8
> m.b <- 6.3
> sd.a <- .5
> sd.b <- .5
> sz.a <- 16
> sz.b <- 16
> df.a <- sz.a-1
> df.b <- sz.b-1
> df <- df.a + df.b
> a <- rnorm2(sz.a, m.a, sd.a)
> b <- rnorm2(sz.b, m.b, sd.b)
> diff <- m.a - m.b
> pv <- (ss(a)+ss(b))/(df.a+df.b)
> se <- sqrt(pv/sz.a+pv/sz.b)
> t.cal <- diff / se
> p.val <- pt(t.cal, df=df)*2
>
> diff
[1] -0.5
> se
[1] 0.1767767
> t.cal
[1] -2.828427
> df
[1] 30
> p.val
[1] 0.008257336
> t.test(a,b, var.equal = T)
Two Sample t-test
data: a and b
t = -2.8284, df = 30, p-value = 0.008257
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.8610262 -0.1389738
sample estimates:
mean of x mean of y
5.8 6.3
> diff - se*2
[1] -0.8535534
> diff + se*2
[1] -0.1464466
> lo <- qt(.05/2,df)
> lo
[1] -2.042272
> hi <- -lo
> diff + se*lo
[1] -0.8610262
> diff + se*hi
[1] -0.1389738
>
> #####
> # t-test repeated measure
> #####
> m.t1 <- 103
> m.t2 <- 111
> sd.t1 <- 10
> sd.t2 <- 10
> sz <- 16
> t1 <- rnorm2(sz, m.t1, sd.t1)
> t2 <- rnorm2(sz, m.t2, sd.t2)
> t1
[,1]
[1,] 89.58295
[2,] 97.20986
[3,] 100.82700
[4,] 120.11867
[5,] 103.06410
[6,] 117.36762
[7,] 98.82191
[8,] 111.72472
[9,] 100.06093
[10,] 114.58757
[11,] 105.99472
[12,] 84.34803
[13,] 94.63867
[14,] 94.49667
[15,] 106.03514
[16,] 109.12144
attr(,"scaled:center")
[1] 0.08912759
attr(,"scaled:scale")
[1] 0.9759765
> t2
[,1]
[1,] 114.76609
[2,] 111.81937
[3,] 102.93248
[4,] 122.85959
[5,] 105.68180
[6,] 110.43890
[7,] 115.34844
[8,] 97.39180
[9,] 117.00475
[10,] 98.63924
[11,] 118.87807
[12,] 107.55519
[13,] 128.46569
[14,] 93.50094
[15,] 107.15280
[16,] 123.56487
attr(,"scaled:center")
[1] 0.2000755
attr(,"scaled:scale")
[1] 0.8946962
> mdiff <- m.t1-m.t2
> diff <- t1-t2
> sd.diff <- sd(diff)
> se <- sd.diff/sqrt(sz)
> t.cal <- mdiff/se
> p.val <- pt(t.cal, df=sz-1)*2
> t.cal
[1] -2.2741
> sz-1
[1] 15
> p.val
[1] 0.03808083
> t.test(t1,t2, paired=T)
Paired t-test
data: t1 and t2
t = -2.2741, df = 15, p-value = 0.03808
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-15.4981736 -0.5018264
sample estimates:
mean difference
-8
> two <- qt(.05/2, df=sz-1)
> two
[1] -2.13145
> lo <- se*two
> hi <- -lo
> c(lo, hi)
[1] -7.498174 7.498174
> c(mdiff+lo, mdiff+hi)
[1] -15.4981736 -0.5018264
>
>
c/ms/2026/lecture_note_week_04.1774975116.txt.gz · Last modified: by hkimscil



