====== Gradient Descent ======
====== Explanation ======
Gradient descent: shave away little by little until we find the slope (derivative value) we want.

prerequisite:
[[estimated_standard_deviation#
[[:

The documents above obtain the values of a and b directly, using differentiation. Doing this on a computer is not easy. Is there a way to extract the same values by repeated (iterative) calculation instead? That is gradient descent.
First, the documents above (the second one) explained that we look for the value at which SS is at its minimum; here we work with the mean of SS, the MS:

\begin{eqnarray*}
\text{MS} & = & \frac {\text{SS}}{n}
\end{eqnarray*}

<WRAP box>
\begin{eqnarray*}
\text{for a (the constant)} \\
\\
\dfrac{\partial}{\partial a} \text{MSE (Mean Square Error)} & = & \dfrac{\partial}{\partial a} \frac {\sum{(Y_i - (a + bX_i))^2}} {N} \\
& = & \sum \dfrac{\partial}{\partial a} \frac{(Y_i - (a + bX_i))^2} {N} \\
& = & \sum{2 \frac{1}{N} (Y_i - (a + bX_i))} \cdot (-1) \\
& \because & \dfrac{\partial}{\partial a} (Y_i - (a+bX_i)) = -1 \\
& = & -2 \; \frac{\sum{(Y_i - (a + bX_i))}}{N} \\
& = & -2 \;\cdot\; \text{mean of residuals} \\
\end{eqnarray*}
See the gradient function in the R code below.
</WRAP>
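To sanity-check the algebra above, the following small R sketch (an addition, with arbitrary toy data and arbitrary a, b) compares the analytic derivative with respect to a, namely -2 * mean(residuals), against a finite-difference approximation of the MSE:

<code>
# finite-difference check of d(MSE)/da
x <- c(0.5, 2.3, 2.9)
y <- c(1.4, 1.9, 3.2)
a <- 0.3; b <- 0.8                               # arbitrary starting values
mse <- function(a, b) mean((y - (a + b * x))^2)

analytic <- -2 * mean(y - (a + b * x))           # -2 * mean of residuals
h <- 1e-6
numeric  <- (mse(a + h, b) - mse(a - h, b)) / (2 * h)
c(analytic = analytic, numeric = numeric)        # the two should agree closely
</code>
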
<WRAP box>
\begin{eqnarray*}
\text{for b (the coefficient)} \\
\\
\dfrac{\partial}{\partial b} \frac{\sum{(Y_i - (a + bX_i))^2}}{N}
& = & \sum{2 \frac{1}{N} (Y_i - (a + bX_i))} \cdot (-X_i) \\
& \because & \dfrac{\partial}{\partial b} (Y_i - (a+bX_i)) = -X_i \\
& = & -2 \; \frac{\sum{X_i (Y_i - (a + bX_i))}}{N} \\
& = & -2 \;\cdot\; \text{mean of } X_i \cdot \text{residuals} \\
\end{eqnarray*}
(Assuming the reader understands differentiation) the expression above tells us how the MSR (mean square residual) changes as the value of b changes: it is the sum of X_i times the residuals, multiplied by (-2/N).
</WRAP>
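The same derivative drives the update step: moving b a little in the direction opposite to it lowers the MSE. A minimal added illustration (toy data, starting values and learning rate are all arbitrary):

<code>
# one gradient-descent step for b, holding a fixed
x <- c(0.5, 2.3, 2.9)
y <- c(1.4, 1.9, 3.2)
a <- 0; b <- 0
lr <- 0.1                                     # learning rate (arbitrary)

mse <- function(a, b) mean((y - (a + b * x))^2)
db  <- -2 * mean(x * (y - (a + b * x)))       # d(MSE)/db from the box above
b_new <- b - lr * db                          # step against the gradient

c(before = mse(a, b), after = mse(a, b_new))  # MSE drops after the step
</code>
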
====== R code ======
<code>
# statquest explanation toy data (for reference)
# x <- c(0.5, 2.3, 2.9)
# y <- c(1.4, 1.9, 3.2)

rm(list=ls())
library(tidyverse)   # tibble, %>%, slice_head/slice_tail, ggplot2

# set.seed(191)
n <- 300
x <- rnorm(n, 5, 1.2)
y <- 2.14 * x + rnorm(n, 0, 4)

# data <- data.frame(x, y)
data <- tibble(x = x, y = y)

mo <- lm(y~x)
summary(mo)

# set.seed(191)
# Initialize random betas
b1 = rnorm(1)
b0 = rnorm(1)

b1.init <- b1
b0.init <- b0

# Predict function:
predict <- function(x, b0, b1){
  return (b0 + b1 * x)
}

# And loss function is:
residuals <- function(predictions, y){
  return(y - predictions)
}

loss_mse <- function(predictions, y){
  residuals = y - predictions
  return(mean(residuals ^ 2))
}

predictions <- predict(x, b0, b1)
residuals <- residuals(predictions, y)
loss = loss_mse(predictions, y)

data <- tibble(data.frame(x, y, predictions, residuals))

print(paste0("Loss is: ", round(loss)))

# gradient: partial derivatives of the MSE with respect to b0 and b1
# (see the two boxes above: -2 * mean(residuals) and -2 * mean(x * residuals))
gradient <- function(x, y, predictions){
  dinputs = y - predictions
  db1 = -2 * mean(x * dinputs)
  db0 = -2 * mean(dinputs)
  
  return(list("db1" = db1, "db0" = db0))
}

gradients <- gradient(x, y, predictions)
print(gradients)

# Train the model with scaled features
x_scaled <- (x - mean(x)) / sd(x)

learning_rate = 1e-1

# Record Loss for each epoch:
# logs = list()
# bs=list()
b0s = c()
b1s = c()
mse = c()

nlen <- 80
for (epoch in 1:nlen){
  # Predict all y values:
  predictions = predict(x_scaled, b0, b1)
  loss = loss_mse(predictions, y)
  mse = append(mse, loss)
  # logs = append(logs, loss)
  
  if (epoch %% 10 == 0){
    print(paste0("Epoch: ", epoch, ", Loss: ", loss))
  }
  
  gradients <- gradient(x_scaled, y, predictions)
  db1 <- gradients$db1
  db0 <- gradients$db0
  
  b1 <- b1 - db1 * learning_rate
  b0 <- b0 - db0 * learning_rate
  b0s <- append(b0s, b0)
  b1s <- append(b1s, b1)
}

# unscale coefficients to make them comprehensible
b0 = b0 - (mean(x) / sd(x)) * b1
b1 = b1 / sd(x)

# changes of estimators
b0s <- b0s - (mean(x) / sd(x)) * b1s
b1s <- b1s / sd(x)

parameters <- tibble(data.frame(b0s, b1s, mse))

cat(paste0("Slope: ", b1, ",\nIntercept: ", b0, "\n"))
summary(lm(y~x))$coefficients

# scatter plot with one fitted line per epoch:
# all epochs in gray, the first epoch in red, the last epoch in blue
ggplot(data, aes(x = x, y = y)) +
  geom_point(size = 2) +
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters, linewidth = 0.5,
              color = 'gray') +
  theme_classic() +
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters %>% slice_head(),
              linewidth = 1, color = 'red') +
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters %>% slice_tail(),
              linewidth = 1, color = 'blue') +
  labs(title = 'Gradient descent: fitted line per epoch')

b0.init
b1.init

data
parameters

</code>
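The mse vector recorded in the loop is not visualized in the script above. As an optional follow-up (assuming the script has just been run in the same session), the loss curve can be plotted like this:

<code>
# loss per epoch: should drop steeply at first and then flatten out
plot(seq_along(mse), mse, type = "b",
     xlab = "epoch", ylab = "MSE (loss)",
     main = "Loss over 80 epochs of gradient descent")
</code>
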
====== R output ======
<code>
> rm(list=ls())
> # set.seed(191)
> n <- 300
> x <- rnorm(n, 5, 1.2)
> y <- 2.14 * x + rnorm(n, 0, 4)
> 
> # data <- data.frame(x, y)
> data <- tibble(x = x, y = y)
> 
> mo <- lm(y~x)
> summary(mo)

Call:
lm(formula = y ~ x)

Residuals:
    Min      1Q  Median      3Q     Max 
 -9.754  -2.729  -0.135

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.951 on 298 degrees of freedom
Multiple R-squared:
F-statistic:

> 
> # set.seed(191)
> # Initialize random betas
> b1 = rnorm(1)
> b0 = rnorm(1)
> 
> b1.init <- b1
> b0.init <- b0
> 
> # Predict function:
> predict <- function(x, b0, b1){
+   return (b0 + b1 * x)
+ }
> 
> # And loss function is:
> residuals <- function(predictions, y){
+   return(y - predictions)
+ }
> 
> loss_mse <- function(predictions, y){
+   residuals = y - predictions
+   return(mean(residuals ^ 2))
+ }
> 
> predictions <- predict(x, b0, b1)
> residuals <- residuals(predictions, y)
> loss = loss_mse(predictions, y)
> 
> data <- tibble(data.frame(x, y, predictions, residuals))
> 
> print(paste0("Loss is: ", round(loss)))
[1] "Loss is: 393"
> 
> gradient <- function(x, y, predictions){
+   dinputs = y - predictions
+   db1 = -2 * mean(x * dinputs)
+   db0 = -2 * mean(dinputs)
+   
+   return(list("db1" = db1, "db0" = db0))
+ }
> 
> gradients <- gradient(x, y, predictions)
> print(gradients)
$db1
[1] -200.6834

$db0
[1] -37.76994

> 
> # Train the model with scaled features
> x_scaled <- (x - mean(x)) / sd(x)
> 
> learning_rate = 1e-1
> 
> # Record Loss for each epoch:
> # logs = list()
> # bs=list()
> b0s = c()
> b1s = c()
> mse = c()
> 
> nlen <- 80
> for (epoch in 1:nlen){
+   # Predict all y values:
+   predictions = predict(x_scaled, b0, b1)
+   loss = loss_mse(predictions, y)
+   mse = append(mse, loss)
+   # logs = append(logs, loss)
+   
+   if (epoch %% 10 == 0){
+     print(paste0("Epoch: ", epoch, ", Loss: ", loss))
+   }
+   
+   gradients <- gradient(x_scaled, y, predictions)
+   db1 <- gradients$db1
+   db0 <- gradients$db0
+   
+   b1 <- b1 - db1 * learning_rate
+   b0 <- b0 - db0 * learning_rate
+   b0s <- append(b0s, b0)
+   b1s <- append(b1s, b1)
+ }
[1] "
[1] "
[1] "
[1] "
[1] "
[1] "
[1] "
[1] "
> 
> # unscale coefficients to make them comprehensible
> b0 = b0 - (mean(x) / sd(x)) * b1
> b1 = b1 / sd(x)
> 
> # changes of estimators
> b0s <- b0s - (mean(x) / sd(x)) * b1s
> b1s <- b1s / sd(x)
> 
> parameters <- tibble(data.frame(b0s, b1s, mse))
> 
> cat(paste0("Slope: ", b1, ",\nIntercept: ", b0, "\n"))
Slope: 2.26922511738252,
Intercept: -0.779435058320381
> summary(lm(y~x))$coefficients
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.7794352
x            2.2692252
> 
> ggplot(data, aes(x = x, y = y)) +
+   geom_point(size = 2) +
+   geom_abline(aes(intercept = b0s, slope = b1s),
+               data = parameters, linewidth = 0.5,
+               color = 'gray') +
+   theme_classic() +
+   geom_abline(aes(intercept = b0s, slope = b1s),
+               data = parameters %>% slice_head(),
+               linewidth = 1, color = 'red') +
+   geom_abline(aes(intercept = b0s, slope = b1s),
+               data = parameters %>% slice_tail(),
+               linewidth = 1, color = 'blue') +
+   labs(title = 'Gradient descent: fitted line per epoch')
> 
> b0.init
[1] -1.67967
> b1.init
[1] -1.323992
> 
> data
# A tibble: 300 × 4
       x     y predictions residuals
   <dbl> <dbl>       <dbl>     <dbl>
10  3.33  3.80
# ℹ 290 more rows
# ℹ Use `print(n = ...)` to see more rows
> parameters
# A tibble: 80 × 3
      b0s    b1s   mse
    <dbl>  <dbl> <dbl>
 8 -0.0397
 9 -0.186
10 -0.303
# ℹ 70 more rows
# ℹ Use `print(n = ...)` to see more rows

</code>
+ | |||
+ | {{: | ||