====== ANOVA ======
[[:ANOVA|설명문서]]
====== prev ======
set.seed(10)
sa <- rnorm2(100, 7.6, 2)
sb <- rnorm2(100, 8.3, 2)
mean.sa <- mean(sa)
mean.sb <- mean(sb)
na <- length(sa)
nb <- length(sb)
dfa <- na-1
dfb <- nb-1
var(sa)
var(sb)
ssa <- var(sa)*dfa
ssb <- var(sb)*dfb
ssa
ssb
ssa+ssb
dfa+dfb
var.pooled <- (ssa+ssb)/(dfa+dfb)
vp <- var.pooled
vp
se <- sqrt((vp/na)+(vp/nb))
diff <- mean.sa - mean.sb
t.cal <- diff/se
se
diff
t.cal
t.crit <- qt(.975, 198)
t.crit
pt(t.cal, 198)*2
t.test(sa, sb)
sab <- c(sa, sb)
sab
group <- c(rep("ga",na), rep("gb",nb))
group
sall <- data.frame(sab,group)
sall
# attach(sall)
df.tot <- (na+nb)-1
ss.tot <- var(sab)*df.tot
var.tot <- var(sab)
df.tot
ss.tot
var.tot
var.tot*df.tot
# for uniqueN function data.table required
# install.packages("data.table")
# library(data.table)
uniqueN(sall, by = c("group"))
nofg <- uniqueN(sall, by = c("group"))
df.bet <- nofg - 1
df.sa <- na-1
df.sb <- nb-1
df.within <- df.sa+df.sb
df.within
ss.sa <- var(sa)*df.sa
ss.sb <- var(sb)*df.sb
ss.sa
ss.sb
ss.within <- ss.sa + ss.sb
ss.within
mean.grand <- mean(sab)
# mean.sa <- mean(sa)
# mean.sb <- mean(sb)
mean.grand
mean.sa
mean.sb
error.ga <- na * (mean.grand - mean.sa)^2
error.gb <- nb * (mean.grand - mean.sb)^2
error.ga
error.gb
ss.bet <- error.ga + error.gb
ss.bet
# sumup
ss.tot
ss.bet
ss.within
ss.bet+ss.within
df.tot
df.bet
df.within
df.bet + df.within
ms.bet <- ss.bet / df.bet
ms.wit <- ss.within / df.within
ms.bet
ms.wit
fvalue <- ms.bet/ms.wit
fvalue
1-pf(fvalue, df.bet, df.within)
f.res <- (aov(sall$sab~sall$group))
summary(f.res)
t.test(sa,sb)
# str(t.test(sa,sb))
t.test(sa,sb)$statistic
t.test(sa,sb)$statistic^2
t.test(sa,sb)$p.value
====== ANOVA e.g.1 ======
set.seed(1024)
na <-50
nb <- 50
nc <- 50
s1 <- round(rnorm(na, 11.6, 2),0)
s2 <- round(rnorm(nb, 12.9, 2),0)
s3 <- round(rnorm(nc, 13.4, 2),0)
s1
s2
s3
sabc <- c(s1,s2,s3)
sabc
group <- c(rep("g1",na), rep("g2",nb), rep("g3",nc))
group
sall <- data.frame(sabc, group)
sall
attach(sall)
aov.sall <- aov(sabc ~ group, data=sall)
summary(aov.sall)
df.total <- length(sabc) - 1
ss.total <- var(sabc)*df.total
var.total <- var(sabc)
df.total
ss.total
var.total
df.s1 <- na-1
df.s2 <- nb-1
df.s3 <- nc-1
df.within <- df.s1+df.s2+df.s3
df.within
ss.s1 <- var(s1)*df.s1
ss.s2 <- var(s2)*df.s2
ss.s3 <- var(s3)*df.s3
ss.s1
ss.s2
ss.s3
ss.within <- ss.s1+ss.s2+ss.s3
ss.within
# for uniqueN function data.table required
# install.packages("data.table")
library(data.table)
uniqueN(sall, by = c("group"))
nofg <- uniqueN(sall, by = c("group"))
nofg
df.between <- nofg - 1
df.between
mean.grand <- mean(sabc)
mean.s1 <- mean(s1)
mean.s2 <- mean(s2)
mean.s3 <- mean(s3)
mean.grand
mean.s1
mean.s2
mean.s3
error.g1 <- na * (mean.grand - mean.s1)^2
error.g2 <- nb * (mean.grand - mean.s2)^2
error.g3 <- nc * (mean.grand - mean.s3)^2
error.g1
error.g2
error.g3
ss.between <- error.g1 + error.g2 + error.g3
ss.between
# sumup
ss.total
ss.between
ss.within
ss.between + ss.within
ss.total
df.total
df.between
df.within
df.between + df.within
ms.between <- ss.between / df.between
ms.within <- ss.within / df.within
ms.between
ms.within
fvalue <- ms.between/ms.within
fvalue
# fvalue에서 판단했을 때 그 판단이 잘못일 확률
1 - pf(fvalue, df.between, df.within)
f.res <- aov(sabc ~ group, data=sall)
summary(f.res)
# for regression
r.res <- lm(sabc ~ group, data=sall)
summary(r.res)
anova(r.res)
summary(r.res)$r.square
ss.total
ss.between
ss.within
# this is r.square value
ss.between/ss.total
> set.seed(1024)
> na <-50
> nb <- 50
> nc <- 50
>
> s1 <- round(rnorm(na, 11.6, 2),0)
> s2 <- round(rnorm(nb, 12.9, 2),0)
> s3 <- round(rnorm(nc, 13.4, 2),0)
> s1
[1] 10 11 8 10 12 7 11 16 14 13 8 9 14 12 11 15 9 11 10 9 11 13 15 14 11 11 12 13 10 13 10 12
[33] 14 11 14 11 14 11 10 11 11 14 10 9 14 13 10 10 7 13
> s2
[1] 12 12 16 14 12 12 12 12 11 12 13 12 10 13 15 12 12 18 14 13 13 13 17 13 13 14 11 13 13 11 14 15
[33] 11 12 10 14 13 12 14 15 13 10 10 17 12 14 14 16 13 12
> s3
[1] 13 12 12 16 14 14 9 12 11 14 15 13 7 17 16 12 12 12 9 11 12 16 17 13 18 14 14 12 15 11 13 12
[33] 13 10 14 10 15 10 11 14 10 14 14 13 13 13 11 11 12 11
>
> sabc <- c(s1,s2,s3)
> sabc
[1] 10 11 8 10 12 7 11 16 14 13 8 9 14 12 11 15 9 11 10 9 11 13 15 14 11 11 12 13 10 13 10 12
[33] 14 11 14 11 14 11 10 11 11 14 10 9 14 13 10 10 7 13 12 12 16 14 12 12 12 12 11 12 13 12 10 13
[65] 15 12 12 18 14 13 13 13 17 13 13 14 11 13 13 11 14 15 11 12 10 14 13 12 14 15 13 10 10 17 12 14
[97] 14 16 13 12 13 12 12 16 14 14 9 12 11 14 15 13 7 17 16 12 12 12 9 11 12 16 17 13 18 14 14 12
[129] 15 11 13 12 13 10 14 10 15 10 11 14 10 14 14 13 13 13 11 11 12 11
> group <- c(rep("g1",na), rep("g2",nb), rep("g3",nc))
> group
[1] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
[20] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1"
[39] "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g1" "g2" "g2" "g2" "g2" "g2" "g2" "g2"
[58] "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2"
[77] "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2" "g2"
[96] "g2" "g2" "g2" "g2" "g2" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3"
[115] "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3"
[134] "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3" "g3"
>
> sall <- data.frame(sabc, group)
> sall
sabc group
1 10 g1
2 11 g1
3 8 g1
4 10 g1
5 12 g1
6 7 g1
7 11 g1
8 16 g1
9 14 g1
10 13 g1
11 8 g1
12 9 g1
13 14 g1
14 12 g1
15 11 g1
16 15 g1
17 9 g1
18 11 g1
19 10 g1
20 9 g1
21 11 g1
22 13 g1
23 15 g1
24 14 g1
25 11 g1
26 11 g1
27 12 g1
28 13 g1
29 10 g1
30 13 g1
31 10 g1
32 12 g1
33 14 g1
34 11 g1
35 14 g1
36 11 g1
37 14 g1
38 11 g1
39 10 g1
40 11 g1
41 11 g1
42 14 g1
43 10 g1
44 9 g1
45 14 g1
46 13 g1
47 10 g1
48 10 g1
49 7 g1
50 13 g1
51 12 g2
52 12 g2
53 16 g2
54 14 g2
55 12 g2
56 12 g2
57 12 g2
58 12 g2
59 11 g2
60 12 g2
61 13 g2
62 12 g2
63 10 g2
64 13 g2
65 15 g2
66 12 g2
67 12 g2
68 18 g2
69 14 g2
70 13 g2
71 13 g2
72 13 g2
73 17 g2
74 13 g2
75 13 g2
76 14 g2
77 11 g2
78 13 g2
79 13 g2
80 11 g2
81 14 g2
82 15 g2
83 11 g2
84 12 g2
85 10 g2
86 14 g2
87 13 g2
88 12 g2
89 14 g2
90 15 g2
91 13 g2
92 10 g2
93 10 g2
94 17 g2
95 12 g2
96 14 g2
97 14 g2
98 16 g2
99 13 g2
100 12 g2
101 13 g3
102 12 g3
103 12 g3
104 16 g3
105 14 g3
106 14 g3
107 9 g3
108 12 g3
109 11 g3
110 14 g3
111 15 g3
112 13 g3
113 7 g3
114 17 g3
115 16 g3
116 12 g3
117 12 g3
118 12 g3
119 9 g3
120 11 g3
121 12 g3
122 16 g3
123 17 g3
124 13 g3
125 18 g3
126 14 g3
127 14 g3
128 12 g3
129 15 g3
130 11 g3
131 13 g3
132 12 g3
133 13 g3
134 10 g3
135 14 g3
136 10 g3
137 15 g3
138 10 g3
139 11 g3
140 14 g3
141 10 g3
142 14 g3
143 14 g3
144 13 g3
145 13 g3
146 13 g3
147 11 g3
148 11 g3
149 12 g3
150 11 g3
> attach(sall)
The following objects are masked _by_ .GlobalEnv:
group, sabc
> aov.sall <- aov(sabc ~ group, data=sall)
> summary(aov.sall)
Df Sum Sq Mean Sq F value Pr(>F)
group 2 68.7 34.33 8.049 0.000482 ***
Residuals 147 626.9 4.26
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> df.total <- length(sabc) - 1
> ss.total <- var(sabc)*df.total
> var.total <- var(sabc)
> df.total
[1] 149
> ss.total
[1] 695.5733
> var.total
[1] 4.668277
>
> df.s1 <- na-1
> df.s2 <- nb-1
> df.s3 <- nc-1
>
> df.within <- df.s1+df.s2+df.s3
> df.within
[1] 147
>
> ss.s1 <- var(s1)*df.s1
> ss.s2 <- var(s2)*df.s2
> ss.s3 <- var(s3)*df.s3
> ss.s1
[1] 222.32
> ss.s2
[1] 160.98
> ss.s3
[1] 243.62
> ss.within <- ss.s1+ss.s2+ss.s3
> ss.within
[1] 626.92
>
>
> # for uniqueN function data.table required
> # install.packages("data.table")
> library(data.table)
data.table 1.14.8 using 8 threads (see ?getDTthreads). Latest news: r-datatable.com
> uniqueN(sall, by = c("group"))
[1] 3
> nofg <- uniqueN(sall, by = c("group"))
> nofg
[1] 3
> df.between <- nofg - 1
> df.between
[1] 2
>
> mean.grand <- mean(sabc)
> mean.s1 <- mean(s1)
> mean.s2 <- mean(s2)
> mean.s3 <- mean(s3)
> mean.grand
[1] 12.38667
> mean.s1
[1] 11.44
> mean.s2
[1] 12.98
> mean.s3
[1] 12.74
>
> error.g1 <- na * (mean.grand - mean.s1)^2
> error.g2 <- nb * (mean.grand - mean.s2)^2
> error.g3 <- nc * (mean.grand - mean.s3)^2
>
> error.g1
[1] 44.80889
> error.g2
[1] 17.60222
> error.g3
[1] 6.242222
>
> ss.between <- error.g1 + error.g2 + error.g3
> ss.between
[1] 68.65333
>
> # sumup
> ss.total
[1] 695.5733
> ss.between
[1] 68.65333
> ss.within
[1] 626.92
> ss.between + ss.within
[1] 695.5733
> ss.total
[1] 695.5733
>
> df.total
[1] 149
> df.between
[1] 2
> df.within
[1] 147
> df.between + df.within
[1] 149
>
> ms.between <- ss.between / df.between
> ms.within <- ss.within / df.within
> ms.between
[1] 34.32667
> ms.within
[1] 4.264762
>
> fvalue <- ms.between/ms.within
> fvalue
[1] 8.048906
>
> # fvalue에서 판단했을 때 그 판단이 잘못일 확률
> 1 - pf(fvalue, df.between, df.within)
[1] 0.0004818216
>
>
> f.res <- aov(sabc ~ group, data=sall)
> summary(f.res)
Df Sum Sq Mean Sq F value Pr(>F)
group 2 68.7 34.33 8.049 0.000482 ***
Residuals 147 626.9 4.26
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
>
> # for regression
> r.res <- lm(sabc ~ group, data=sall)
> summary(r.res)
Call:
lm(formula = sabc ~ group, data = sall)
Residuals:
Min 1Q Median 3Q Max
-5.74 -1.44 -0.21 1.26 5.26
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.4400 0.2921 39.171 < 2e-16 ***
groupg2 1.5400 0.4130 3.729 0.000274 ***
groupg3 1.3000 0.4130 3.148 0.001994 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.065 on 147 degrees of freedom
Multiple R-squared: 0.0987, Adjusted R-squared: 0.08644
F-statistic: 8.049 on 2 and 147 DF, p-value: 0.0004818
> anova(r.res)
Analysis of Variance Table
Response: sabc
Df Sum Sq Mean Sq F value Pr(>F)
group 2 68.65 34.327 8.0489 0.0004818 ***
Residuals 147 626.92 4.265
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> summary(r.res)$r.square
[1] 0.09870035
>
> ss.total
[1] 695.5733
> ss.between
[1] 68.65333
> ss.within
[1] 626.92
>
> # this is r.square value
> ss.between/ss.total
[1] 0.09870035
>
====== ANOVA e.g. 2 ======
##########################
# http://commres.net/wiki/r/oneway_anova
#
##########################
x1 <- c(50.5, 52.1, 51.9, 52.4, 50.6, 51.4, 51.2, 52.2, 51.5, 50.8)
x2 <- c(47.5, 47.7, 46.6, 47.1, 47.2, 47.8, 45.2, 47.4, 45.0, 47.9)
x3 <- c(46.0, 47.1, 45.6, 47.1, 47.2, 46.4, 45.9, 47.1, 44.9, 46.2)
xc <- data.frame(x1,x2,x3)
xs <- stack(xc)
xs
colnames(xs) <- c("score", "temp")
xs
levels(xs$temp) <- c("low", "mid", "hi")
xs
str(xs) # 데이터 구조 체크
attach(xs)
aov.xs <- aov(score ~ temp, data = xs)
summary(aov.xs)
mean.tot <- mean(score)
mean.tot
df.tot <- length(score)-1
ss.tot <- var(score) * df.tot
ss.tot
df.tot
var.tot <- var(score)
var.tot2 <- ss.tot/df.tot
var.tot == var.tot2
mean.each <- tapply(score, list(temp), mean)
mean.each
var.each <- tapply(score, list(temp), var)
var.each
df.within <- 3*(10-1)
ss.each <- var.each * 9
ss.within <- sum(ss.each)
ss.within
ss.tot - ss.within
ss.bet <- sum(10*(mean.tot - mean.each)^2)
ss.bet
df.bet <- 3-1
ss.tot
ss.bet
ss.within
ss.bet + ss.within
df.tot
df.bet
df.within
df.bet + df.within
ms.bet <- ss.bet / df.bet
ms.within <- ss.within / df.within
f.value <- ms.bet / ms.within
ss.bet
ss.within
df.bet
df.within
ms.bet
ms.within
f.value
alpha <- .05
qf(alpha, df.bet, df.within, lower.tail = F)
pf(f.value, df.bet, df.within, lower.tail = F)
aov.xs <- aov(score ~ temp, data = xs)
summary(aov.xs)
====== Two way ANOVA (Factorial ANOVA) ======
[[:Factorial ANOVA]]
[[:r:Factorial ANOVA|Factorial ANOVA in R]]
#################################################
# two-way anova
# subject = factor(paste('sub', 1:30, sep=''))
#################################################
n.a.group <- 3 # a treatment 숫자
n.b.group <- 2 # b 그룹 숫자
n.sub <- 30 # 총 샘플 숫자
# 데이터 생성
set.seed(9)
a <- gl(3, 10, n.sub, labels=c('a1', 'a2', 'a3'))
b <- gl(2, 5, n.sub, labels=c('b1', 'b2'))
a
b
y <- rnorm(30, mean=10) + 3.14 * (a=='a1' & b=='b2') + 1.43 * (a=='a3' & b=='b2')
y
dat <- data.frame(a, b, y)
aov.dat <- aov(y~a*b) # anova test
summary(aov.dat) # summary of the test output
# hand calculation
table(a,b)
tapply(y, list(a,b), mean) # 각 셀에서의 평균
df.within.each <- tapply(y, list(a,b), length) -1 # 각 셀에서의 샘플숫자
n.within.each <- df.within.each + 1
df.within <- sum(df.within.each) # df within
var.within <- tapply(y, list(a,b), var) # var.within
ss.within.each <- tapply(y, list(a,b), var) * df.within.each
ss.within.each
ss.within <- sum(ss.within.each) # ss.within
ss.within
interaction.plot(a,b,y)
mean.a <- tapply(y, list(a), mean)
mean.b <- tapply(y, list(b), mean)
mean.a
mean.b
var.a <- tapply(y, list(a), var)
var.b <- tapply(y, list(b), var)
mean.tot <- mean(dat$y)
var.tot <- var(dat$y)
df.tot <- n.sub - 1
ss.tot <- var.tot * df.tot
## between
mean.each <- tapply(y, list(a,b), mean)
mean.each
mean.tot <- mean(y)
mean.tot
n.each <- tapply(y, list(a,b), length)
n.each
n.a.each <- tapply(y, list(a), length)
n.b.each <- tapply(y, list(b), length)
ss.bet <- sum(n.each*(mean.each-mean.tot)^2)
ss.bet
ss.tot
ss.within
ss.bet
ss.bet + ss.within
ss.a <- sum(n.a.each * ((mean.tot - mean.a)^2))
ss.b <- sum(n.b.each * ((mean.tot - mean.b)^2))
ss.a
ss.b
ss.ab <- ss.bet - (ss.a + ss.b)
ss.ab
ss.tot
ss.bet
ss.within
ss.a
ss.b
ss.ab
df.tot <- n.sub - 1
df.bet <- (n.a.group * n.b.group) - 1
df.a <- n.a.group - 1
df.b <- n.b.group - 1
df.ab <- df.bet - (df.a + df.b)
df.within <- sum(df.within.each)
df.tot
df.bet
df.a
df.b
df.ab
df.within
ms.a <- ss.a / df.a
ms.b <- ss.b / df.b
ms.ab <- ss.ab / df.ab
ms.within <- ss.within / df.within
ms.a
ms.b
ms.ab
ms.within
f.a <- ms.a / ms.within
f.b <- ms.b / ms.within
f.ab <- ms.ab / ms.within
alpha <- .05
f.a
# 봐야할 F분포표에서의 F-value
# qt 처럼 qf 사용
# qf(alpha, df.between, df.within, lower.tail=F) 처럼 사용
qf(alpha, df.a, df.within, lower.tail = F)
pf(f.a, df.a, df.within, lower.tail = F)
f.b
qf(alpha, df.b, df.within, lower.tail = F)
pf(f.b, df.b, df.within, lower.tail = F)
f.ab
qf(alpha, df.ab, df.within, lower.tail = F)
pf(f.ab, df.ab, df.within, lower.tail = F)
# aov result
summary(aov.dat)