====== Gradient Descent ======
====== Explanation ======
Gradient descent = finding the desired slope (the derivative value) by shaving away at it little by little.
prerequisites: 
[[estimated_standard_deviation#실험적_수학적_이해|Why the mean is used when estimating the standard deviation: an experimental and mathematical understanding]]
[[:deriviation of a and b in a simple regression]]

The document above obtains the values of a and b directly by differentiation. That is not easy to do on a computer. Is there a way to arrive at these values by iterative calculation instead? That is what gradient descent does.

The document above described finding the SS (sum of squares) value that is a minimum; the same idea works with the MS (mean square) value, since dividing by n does not change where the minimum lies.

\begin{eqnarray*}
\text{MS} & = & \frac {\text{SS}}{n}
\end{eqnarray*}

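With MSE as the quantity to minimize, gradient descent starts from arbitrary values of a and b and repeatedly nudges them against the gradient. As a reminder, the standard update rule is the one below, where $\eta$ is the learning rate (''learning_rate'' in the R code); this formulation is added here only for orientation, and the two partial derivatives are worked out in the boxes that follow.

\begin{eqnarray*}
a_{new} & = & a_{old} - \eta \, \dfrac{\partial \text{MSE}}{\partial a} \\
b_{new} & = & b_{old} - \eta \, \dfrac{\partial \text{MSE}}{\partial b} \\
\end{eqnarray*}
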
<WRAP box>
\begin{eqnarray*}
\text{for a (the constant):} \\ 
\\
\dfrac{\partial}{\partial a} \text{MSE (Mean Square Error)} & = & \dfrac{\partial}{\partial a} \frac {\sum{(Y_i - (a + bX_i))^2}} {N} \\ 
& = & \sum \dfrac{\partial}{\partial a} \frac{{(Y_i - (a + bX_i))^2}} {N} \\ 
& = & \sum{2 \frac{1}{N} (Y_i - (a + bX_i))} * (-1) \\
& \because & \dfrac{\partial}{\partial a} (Y_i - (a+bX_i)) = -1 \\
& = & -2 \frac{\sum{(Y_i - (a + bX_i))}}{N} \\
& = & -2 * \text{mean of residuals} \\ 
\end{eqnarray*}
See the gradient function in the R code below.

</WRAP>
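As a quick worked example (the numbers are only for illustration; they are the small statquest-style data set that appears, commented out, at the top of the R code): with X = (0.5, 2.3, 2.9), Y = (1.4, 1.9, 3.2) and starting values a = 0, b = 0, every residual is just Y_i, so

\begin{eqnarray*}
\dfrac{\partial \text{MSE}}{\partial a} & = & -2 * \frac{1.4 + 1.9 + 3.2}{3} \;\; \approx \;\; -4.33
\end{eqnarray*}

The gradient is negative, so the update rule above moves a upward, toward the data.
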
<WRAP box>
\begin{eqnarray*}
\text{for b (the coefficient):} \\ 
\\
\dfrac{\partial}{\partial b} \frac{\sum{(Y_i - (a + bX_i))^2}}{N}  & = & \sum \dfrac{\partial}{\partial b} \frac{{(Y_i - (a + bX_i))^2}} {N} \\ 
& = & \sum{2 \frac{1}{N} (Y_i - (a + bX_i))} * (-X_i) \\
& \because & \dfrac{\partial}{\partial b} (Y_i - (a+bX_i)) = -X_i \\
& = & -2 \frac{\sum{X_i (Y_i - (a + bX_i))}}{N} \\
& = & -2 * \text{mean of } (X_i * \text{residuals}) \\
\end{eqnarray*}
(Assuming familiarity with differentiation,) the expression above tells us how the MSE (mean square error) changes as b changes: it is the sum of each X_i multiplied by its residual, times -2/N. Note that X_i stays inside the sum, which is why the R code computes ''-2 * mean(x * dinputs)''.
</WRAP>
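The two results above can also be checked numerically. The sketch below is not part of the original page; it uses the small statquest-style data set and base R to compare the analytic gradients with central finite-difference approximations of the MSE.

<code>
# toy data and starting guesses for a (intercept) and b (slope)
x <- c(0.5, 2.3, 2.9)
y <- c(1.4, 1.9, 3.2)
a <- 0
b <- 0

mse <- function(a, b) mean((y - (a + b * x))^2)

# analytic gradients from the two boxes above
grad_a <- -2 * mean(y - (a + b * x))        # -2 * mean of residuals
grad_b <- -2 * mean(x * (y - (a + b * x)))  # -2 * mean of (x * residuals)

# central finite-difference approximations of the same derivatives
h <- 1e-6
c(grad_a, (mse(a + h, b) - mse(a - h, b)) / (2 * h))
c(grad_b, (mse(a, b + h) - mse(a, b - h)) / (2 * h))
# each printed pair should agree to several decimal places
</code>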
====== R code ======
<code>
library(tidyverse)

# a simple example (statquest explanation)
# x <- c(0.5, 2.3, 2.9)
# y <- c(1.4, 1.9, 3.2)

rm(list=ls())
# set.seed(191)
n <- 300
x <- rnorm(n, 5, 1.2)
y <- 2.14 * x + rnorm(n, 0, 4)

# data <- data.frame(x, y)
data <- tibble(x = x, y = y)

mo <- lm(y~x)
summary(mo)

# set.seed(191)
# Initialize random betas
b1 = rnorm(1)
b0 = rnorm(1)

b1.init <- b1
b0.init <- b0

# Predict function:
predict <- function(x, b0, b1){
  return (b0 + b1 * x)
}

# And loss function is:
residuals <- function(predictions, y) {
  return(y - predictions)
}

loss_mse <- function(predictions, y){
  residuals = y - predictions
  return(mean(residuals ^ 2))
}

predictions <- predict(x, b0, b1)
residuals <- residuals(predictions, y)
loss = loss_mse(predictions, y)

data <- tibble(data.frame(x, y, predictions, residuals))

print(paste0("Loss is: ", round(loss)))

# gradient function: derivatives of MSE with respect to b0 and b1
gradient <- function(x, y, predictions){
  dinputs = y - predictions
  db1 = -2 * mean(x * dinputs)
  db0 = -2 * mean(dinputs)
  
  return(list("db1" = db1, "db0" = db0))
}

gradients <- gradient(x, y, predictions)
print(gradients)

# Train the model with scaled features
x_scaled <- (x - mean(x)) / sd(x)

learning_rate = 1e-1

# Record loss for each epoch:
# logs = list()
# bs=list()
b0s = c()
b1s = c()
mse = c()

nlen <- 80
for (epoch in 1:nlen){
  # Predict all y values:
  predictions = predict(x_scaled, b0, b1)
  loss = loss_mse(predictions, y)
  mse = append(mse, loss)
  # logs = append(logs, loss)
  
  if (epoch %% 10 == 0){
    print(paste0("Epoch: ",epoch, ", Loss: ", round(loss, 5)))
  }
  
  gradients <- gradient(x_scaled, y, predictions)
  db1 <- gradients$db1
  db0 <- gradients$db0
  
  b1 <- b1 - db1 * learning_rate
  b0 <- b0 - db0 * learning_rate
  b0s <- append(b0s, b0)
  b1s <- append(b1s, b1)
}

# unscale coefficients to make them comprehensible
b0 =  b0 - (mean(x) / sd(x)) * b1
b1 = b1 / sd(x)

# changes of estimators
b0s <- b0s - (mean(x) /sd(x)) * b1s
b1s <- b1s / sd(x)

parameters <- tibble(data.frame(b0s, b1s, mse))

cat(paste0("Slope: ", b1, ", \n", "Intercept: ", b0, "\n"))
summary(lm(y~x))$coefficients

ggplot(data, aes(x = x, y = y)) + 
  geom_point(size = 2) + 
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters, linewidth = 0.5, 
              color = 'green') + 
  theme_classic() +
  geom_abline(aes(intercept = b0s, slope = b1s), 
              data = parameters %>% slice_head(), 
              linewidth = 1, color = 'blue') + 
  geom_abline(aes(intercept = b0s, slope = b1s), 
              data = parameters %>% slice_tail(), 
              linewidth = 1, color = 'red') +
  labs(title = 'Gradient descent. blue: start, red: end, green: gradients')

b0.init
b1.init

data
parameters

</code>
====== R output ======
<code>
> rm(list=ls())
> # set.seed(191)
> n <- 300
> x <- rnorm(n, 5, 1.2)
> y <- 2.14 * x + rnorm(n, 0, 4)

> # data <- data.frame(x, y)
> data <- tibble(x = x, y = y)

> mo <- lm(y~x)
> summary(mo)

Call:
lm(formula = y ~ x)

Residuals:
   Min     1Q Median     3Q    Max 
-9.754 -2.729 -0.135  2.415 10.750 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.7794     0.9258  -0.842    0.401    
x             2.2692     0.1793  12.658   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.951 on 298 degrees of freedom
Multiple R-squared:  0.3497, Adjusted R-squared:  0.3475 
F-statistic: 160.2 on 1 and 298 DF,  p-value: < 2.2e-16


> # set.seed(191)
> # Initialize random betas
> b1 = rnorm(1)
> b0 = rnorm(1)

> b1.init <- b1
> b0.init <- b0

> # Predict function:
> predict <- function(x, b0, b1){
+   return (b0 + b1 * x)
+ }

> # And loss function is:
> residuals <- function(predictions, y) {
+   return(y - predictions)
+ }

> loss_mse <- function(predictions, y){
+   residuals = y - predictions
+   return(mean(residuals ^ 2))
+ }

> predictions <- predict(x, b0, b1)
> residuals <- residuals(predictions, y)
> loss = loss_mse(predictions, y)

> data <- tibble(data.frame(x, y, predictions, residuals))

> print(paste0("Loss is: ", round(loss)))
[1] "Loss is: 393"

> gradient <- function(x, y, predictions){
+   dinputs = y - predictions
+   db1 = -2 * mean(x * dinputs)
+   db0 = -2 * mean(dinputs)
+   
+   return(list("db1" = db1, "db0" = db0))
+ }

> gradients <- gradient(x, y, predictions)
> print(gradients)
$db1
[1] -200.6834

$db0
[1] -37.76994


> # Train the model with scaled features
> x_scaled <- (x - mean(x)) / sd(x)

> learning_rate = 1e-1

> # Record Loss for each epoch:
> # logs = list()
> # bs=list()
> b0s = c()
> b1s = c()
> mse = c()

> nlen <- 80
> for (epoch in 1:nlen){
+   # Predict all y values:
+   predictions = predict(x_scaled, b0, b1)
+   loss = loss_mse(predictions, y)
+   mse = append(mse, loss)
+   # logs = append(logs, loss)
+   
+   if (epoch %% 10 == 0){
+     print(paste0("Epoch: ",epoch, ", Loss: ", round(loss, 5)))
+   }
+   
+   gradients <- gradient(x_scaled, y, predictions)
+   db1 <- gradients$db1
+   db0 <- gradients$db0
+   
+   b1 <- b1 - db1 * learning_rate
+   b0 <- b0 - db0 * learning_rate
+   b0s <- append(b0s, b0)
+   b1s <- append(b1s, b1)
+ }
[1] "Epoch: 10, Loss: 18.5393"
[1] "Epoch: 20, Loss: 15.54339"
[1] "Epoch: 30, Loss: 15.50879"
[1] "Epoch: 40, Loss: 15.50839"
[1] "Epoch: 50, Loss: 15.50839"
[1] "Epoch: 60, Loss: 15.50839"
[1] "Epoch: 70, Loss: 15.50839"
[1] "Epoch: 80, Loss: 15.50839"

> # unscale coefficients to make them comprehensible
> b0 =  b0 - (mean(x) / sd(x)) * b1
> b1 = b1 / sd(x)

> # changes of estimators
> b0s <- b0s - (mean(x) /sd(x)) * b1s
> b1s <- b1s / sd(x)

> parameters <- tibble(data.frame(b0s, b1s, mse))

> cat(paste0("Slope: ", b1, ", \n", "Intercept: ", b0, "\n"))
Slope: 2.26922511738252, 
Intercept: -0.779435058320381
> summary(lm(y~x))$coefficients
              Estimate Std. Error    t value     Pr(>|t|)
(Intercept) -0.7794352  0.9258064 -0.8418986 4.005198e-01
x            2.2692252  0.1792660 12.6584242 1.111614e-29

> ggplot(data, aes(x = x, y = y)) + 
+   geom_point(size = 2) + 
+   geom_abline(aes(intercept = b0s, slope = b1s),
+               data = parameters, linewidth = 0.5, 
+               color = 'green') + 
+   theme_classic() +
+   geom_abline(aes(intercept = b0s, slope = b1s), 
+               data = parameters %>% slice_head(), 
+               linewidth = 1, color = 'blue') + 
+   geom_abline(aes(intercept = b0s, slope = b1s), 
+               data = parameters %>% slice_tail(), 
+               linewidth = 1, color = 'red') +
+   labs(title = 'Gradient descent. blue: start, red: end, green: gradients')

> b0.init
[1] -1.67967
> b1.init
[1] -1.323992

> data
# A tibble: 300 × 4
       x     y predictions residuals
   <dbl> <dbl>       <dbl>     <dbl>
 1  4.13  6.74       -7.14     13.9 
 2  7.25 14.0       -11.3      25.3 
 3  6.09 13.5        -9.74     23.3 
 4  6.29 15.1       -10.0      25.1 
 5  4.40  3.81       -7.51     11.3 
 6  6.03 13.9        -9.67     23.5 
 7  6.97 12.1       -10.9      23.0 
 8  4.84 12.8        -8.09     20.9 
 9  6.85 17.2       -10.7      28.0 
10  3.33  3.80       -6.08      9.88
# ℹ 290 more rows
# ℹ Use `print(n = ...)` to see more rows
> parameters
# A tibble: 80 × 3
       b0s    b1s   mse
     <dbl>  <dbl> <dbl>
 1  2.67   -0.379 183. 
 2  1.99    0.149 123. 
 3  1.44    0.571  84.3
 4  1.00    0.910  59.6
 5  0.652   1.18   43.7
 6  0.369   1.40   33.6
 7  0.142   1.57   27.1
 8 -0.0397  1.71   22.9
 9 -0.186   1.82   20.2
10 -0.303   1.91   18.5
# ℹ 70 more rows
# ℹ Use `print(n = ...)` to see more rows
  
</code>
  
{{:pasted:20250801-185727.png}}
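
The plot above shows the fitted line moving from the blue (random start) to the red (final) position. To see how quickly the loss itself settles, one can also plot the recorded MSE per epoch. The sketch below is an optional addition, not part of the run above; it assumes the ''parameters'' tibble built in the R code section.

<code>
# convergence check: MSE recorded at each epoch
parameters %>%
  mutate(epoch = row_number()) %>%
  ggplot(aes(x = epoch, y = mse)) +
  geom_line() +
  geom_point(size = 1) +
  theme_classic() +
  labs(title = "MSE by epoch", x = "epoch", y = "MSE")
</code>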
  