Differences

This shows you the differences between two versions of the page.

--- logistic_regression [2023/12/14 07:50] – [coefficient (계수) 해석] hkimscil
+++ logistic_regression [2024/12/11 11:57] (current) – [exercise: binary IV] hkimscil
@@ Line 2: / Line 2: @@
 https://www.bookdown.org/rwnahhas/RMPH/blr-orlr.html
 data: https://www.bookdown.org/rwnahhas/RMPH/appendix-nsduh.html#appendix-nsduh
+[[:Logistic Regression/examples R]]
 ====== Data preparation ======
   * [[https://www.datafiles.samhsa.gov/sites/default/files/field-uploads-protected/studies/NSDUH-2019/NSDUH-2019-datasets/NSDUH-2019-DS0001/NSDUH-2019-DS0001-bundles-with-study-info/NSDUH-2019-DS0001-bndl-data-r.zip|NSDUH-2019-DS0001-bndl-data-r.zip 파일]] 다운로드
@@ Line 73: / Line 75: @@
   * wald test
 <code>
-n <- 350
+########## simulate a gene (mutant/normal) by cancer (no/yes) data set
-p.cancer <- 0.08
+# see youtube
-p.mutant <- 0.39
+# https://youtu.be/8nm0G-1uJzA
+n.mut <- 23+117
+n.norm <- 6+210
+p.cancer.mut <- 23/(23+117)
+p.cancer.norm <- 6/(6+210)
-c <- runif(n, 0, 1)
+set.seed(1011)
-canc <- ifelse(c>=p.cancer, "nocancer", "cancer")
+c <- runif(n.mut, 0, 1)
-c <- runif(n, 0, 1)
+# mutant-gene group: 1 = cancer (uniform draw < p.cancer.mut), 0 = not cancer
-gene <- ifelse(c>=p.mutant, "norm", "mutated")
+mutant <- ifelse(c>=p.cancer.mut, 0, 1)
-da <- data.frame(gene, canc)
+c <- runif(n.norm, 0, 1)
-da
+# normal-gene group: 1 = cancer (uniform draw < p.cancer.norm), 0 = not cancer
-tab <- table(da)
+normal <- ifelse(c>=p.cancer.norm, 0, 1)
+# gene code: 0 = mutant; 1 = normal (mutant rows first, then normal rows)
+gene <- c(rep(0, length(mutant)), rep(1, length(normal)))
+# cancer code: 0 = not cancer; 1 = cancer (same row order as gene)
+cancer <- c(mutant, normal)
+df <- as.data.frame(cbind(gene, cancer))
+df
+df$gene <- factor(df$gene, levels = c(0,1), labels = c("mutant", "norm"))
+df$cancer <- factor(df$cancer, levels = c(0,1), labels = c("nocancer", "cancer"))
+df
+tab <- table(df)
 tab
+# tab rows = gene (1 = mutant, 2 = norm); columns = cancer (1 = nocancer, 2 = cancer)
+tab[1,2]
+tab[1,1]
+# p.cancer.mutant: sample proportion from the mutant row of tab
+# (the simulated counterpart of p.cancer.mut defined above)
+p.cancer.mutant <- tab[1,2]/(tab[1,1]+tab[1,2])
+p.nocancer.mutant <- tab[1,1]/(tab[1,1]+tab[1,2])
+p.cancer.mutant
+# complement of p.cancer.mutant; should equal p.nocancer.mutant below
+1-p.cancer.mutant
+p.nocancer.mutant
+# same proportions for the norm row of tab
+p.cancer.norm <-  tab[2,2]/(tab[2,1]+tab[2,2])
+p.nocancer.norm <- 1-p.cancer.norm
+p.cancer.norm
+p.nocancer.norm
+# odds() and odds.ratio() are not defined in this snippet; they are used as defined elsewhere
+odds(p.cancer.mutant)
+odds(p.cancer.norm)
+odds.ratio(p.cancer.mutant, p.cancer.norm)
 </code>
+<code>
+> ##########
+> # see youtube
+> # https://youtu.be/8nm0G-1uJzA
+> n.mut <- 23+117
+> n.norm <- 6+210
+> p.cancer.mut <- 23/(23+117)
+> p.cancer.norm <- 6/(6+210)
+>
+> set.seed(1011)
+> c <- runif(n.mut, 0, 1)
+> # 0 = not cancer, 1 = cancer among mutant gene
+> mutant <- ifelse(c>=p.cancer.mut, 0, 1)
+>
+> c <- runif(n.norm, 0, 1)
+> # 0 = not cancer, 1 = cancer among normal gene
+> normal <- ifelse(c>=p.cancer.norm, 0, 1)
+>
+> # 0 = mutant; 1 = normal
+> gene <- c(rep(0, length(mutant)), rep(1, length(normal)))
+> # 0 = not cancer; 1 = cancer
+> cancer <- c(mutant, normal)
+>
+> df <- as.data.frame(cbind(gene, cancer))
+> df
+    gene cancer
+      0      0
+      0      1
+      0      0
+      0      0
+      0      0
+      0      0
+>
+> df$gene <- factor(df$gene, levels = c(0,1), labels = c("mutant", "norm"))
+> df$cancer <- factor(df$cancer, levels = c(0,1), labels = c("nocancer", "cancer"))
+> df
+      gene   cancer
+   mutant nocancer
+   mutant   cancer
+   mutant nocancer
+   mutant nocancer
+   mutant nocancer
+   mutant nocancer
+>
+> tab <- table(df)
+> tab
+        cancer
+gene     nocancer cancer
+  mutant      121     19
+  norm        210      6
+> tab[1,2]
+[1] 19
+> tab[1,1]
+[1] 121
+>
+> # p.c.m = p.cancer.mut the above
+> p.cancer.mutant <- tab[1,2]/(tab[1,1]+tab[1,2])
+> p.nocancer.mutant <- tab[1,1]/(tab[1,1]+tab[1,2])
+> p.cancer.mutant
+[1] 0.1357143
+> 1-p.cancer.mutant
+[1] 0.8642857
+> p.nocancer.mutant
+[1] 0.8642857
+>
+> p.cancer.norm <-  tab[2,2]/(tab[2,1]+tab[2,2])
+> p.nocancer.norm <- 1-p.cancer.norm
+> p.cancer.norm
+[1] 0.02777778
+> p.nocancer.norm
+[1] 0.9722222
+>
+> odds(p.cancer.mutant)
+[1] 0.1570248
+> odds(p.cancer.norm)
+[1] 0.02857143
+> odds.ratio(p.cancer.mutant, p.cancer.norm)
+[1] 5.495868
+>
+</code>
 ====== Logit 성질 ======
 여기서
 \begin{align*}
-y & = ln(x) \\
+ln(x) & = y  \\
-& = log_e {x} \\
+log_e {x} & = y  \\
 x & = e^{y} \\
 \end{align*}
@@ Line 196: / Line 311: @@
 >
 </code>
-====== Odds ratio in logistic ======
-\begin{align*}
-ln(\frac{p}{1-p}) = & y \\
-\frac {p}{1-p} = & e^{y} \;\;\; \text{where } \;\; y = a + bX \\
-\text {odds} = & e^{y} = e^{a + bX} \\
-\text{then} \;\;\; \text{odds ratio} (y_{2}/y_{1}) = & \text {odds ratio between  } \\
-& \text{odds of y at one point, } y_1 \text { and } \\
-& \text{odds of y at another point, } y_2 \\
-\text{and  }  y_1 = & a + b (X) \\
-              y_2 = & a + b (X+1) \\
-\text{then  } & \;\; \\
-\text {odds of } y_1 = & e^{(a+b(X))} \\
-\text {odds of } y_2 = & e^{(a+b(X+1))} \\
-\text {odds ratio for } y_1 = & \frac {e^{(a+bX+b)} } {e^{(a+bX)}} \\
-= & \frac {e^{(a+bX)} * e^{b}} {e^{(a+bX)} } \\
-= & e^b
-\end{align*}
-  * 위의 $e^b$ 가 의미하는 것은 $X$가 한 유닛만큼 증가하면 $Y$는 $b$만큼 증가하는 것이 되는데 이 $b$는
-  * $y2$와 $y1$ 간의 $\text{log of odds ratio}$ 로 이해되어야 한다. 따라서
-  * y2와 y1 간의 $\text{odds ratio} = e^b $ 이 된다.
 ====== Logistic Regression Analysis ======
@@ Line 310: / Line 405: @@
 </code>
+===== Odds ratio in logistic =====
+\begin{align*}
+ln(\frac{p}{1-p}) = & y \\
+\frac {p}{1-p} = & e^{y} \;\;\; \text{where } \;\; y = a + bX \\
+\text {odds} = & e^{y} = e^{a + bX} \\
+\text{then} \;\;\; \text{odds ratio} (y_{2}/y_{1}) = & \text {odds ratio between  } \\
+& \text{odds of y at one point, } y_1 \text { and } \\
+& \text{odds of y at another point, } y_2 \\
+\text{and  }  y_1 = & a + b (X) \\
+              y_2 = & a + b (X+1) \\
+\text{then  } & \;\; \\
+\text {odds of } y_1 = & e^{(a+b(X))} \\
+\text {odds of } y_2 = & e^{(a+b(X+1))} \\
+\text {odds ratio } (y_2 / y_1) = & \frac {e^{(a+bX+b)} } {e^{(a+bX)}} \\
+= & \frac {e^{(a+bX)} * e^{b}} {e^{(a+bX)} } \\
+= & e^b
+\end{align*}
+  * 위의 $e^b$ 가 의미하는 것은 $X$가 한 유닛만큼 증가하면 $Y$는 $b$만큼 증가하는 것이 되는데 이 $b$는
+  * $y2$와 $y1$ 간의 $\text{log of odds ratio}$ 로 이해되어야 한다. 따라서
+  * y2와 y1 간의 $\text{odds ratio} = e^b $ 이 된다.
 ===== coefficient (계수) 해석 =====
@@ Line 318: / Line 434: @@
   * 따라서 $a + b = -0.13504 + 0.36784 = 0.2328 $
   * 즉, $ln(odds) = 0.2328 $ 이고
-  * $ odds = e^{0.2328} = 1.262129$  이것은 X가 1일 경우이다.
+  * $ odds = \displaystyle \frac {p_{\text{ of male yes}}}{1-p_{\text{ of male yes}}} = e^{0.2328} = 1.262129$  이것은 X가 1일 경우이다.
   * $ p = e^{0.2328} / (1 + e^{0.2328}) = 0.5579386 $ 그리고 X는 1일 경우의 prob = 0.558 정도이다.
   * or ''ilogit(0.2328) = 0.5579386''
@@ Line 330: / Line 446: @@
     * 즉, $log(om/of) = b$
     * $log(1.444613) = b$
+    * $ 1.444613 = e^b$
 <code>
 > log(1.444613)
@@ Line 440: / Line 557: @@
 </code>
 마리화나의 사용경험에서 남성이 여성보다 큰 승산이 있다고 판단되었다 (Odds ratio (OR) = 1.44; 95% CI = 1.13, 1.86; p = .004). 남성은 여성보다 약 44% 더 사용경험을 할 승산을 보였다 (OR = 1.44).
+====== exercise: binary IV ======
+<code>
+########################################
+# exercise: logistic regression of cancer on a binary IV (gene)
+# uses df, p.cancer.mutant and p.cancer.norm built from the table earlier on this page
+head(df)
+table(df)
+# change the reference (base) level of gene
+df.norm <- df %>% mutate(gene = relevel(gene, ref = "norm"))
+df.mut <- df %>% mutate(gene = relevel(gene, ref = "mutant"))
+# model 1: reference level = norm, so the gene dummy X is 0 for norm and 1 for mutant
+logm.cancer.gene.1 <- glm(cancer ~ gene, family = binomial, data = df.norm)
+summary(logm.cancer.gene.1)
+a <- logm.cancer.gene.1$coefficients[1]
+b <- logm.cancer.gene.1$coefficients[2]
+a
+b
+a+b
+# when X = 0, i.e. gene = norm (the reference level):
+# log(odds.norm) = a, so
+# odds.norm = e^a
+exp(a)
+# check against the odds computed from the table
+odds(p.cancer.norm)
+# odds.mut = e^(a+b)
+exp(a+b)
+odds(p.cancer.mutant)
+# odds.ratio (mutant relative to norm) = e^(b)
+exp(b)
+odds.ratio(p.cancer.mutant, p.cancer.norm)
+# model 2: reference level = mutant, so the gene dummy X is 0 for mutant and 1 for norm
+logm.cancer.gene.2 <- glm(cancer ~ gene, family = binomial, data = df.mut)
+summary(logm.cancer.gene.2)
+a <- logm.cancer.gene.2$coefficients[1]
+b <- logm.cancer.gene.2$coefficients[2]
+a
+b
+a+b
+# when X = 0, i.e. gene = mutant (the reference level):
+# log(odds.mut) = a, so
+# odds.mut = e^a
+exp(a)
+# check against the odds computed from the table
+odds(p.cancer.mutant)
+# odds.norm = e^(a+b)
+exp(a+b)
+odds(p.cancer.norm)
+# odds.ratio (norm relative to mutant) = e^(b)
+exp(b)
+odds.ratio(p.cancer.norm, p.cancer.mutant)
+</code>
 ====== X: numeric variable ======
 <code>