====== Gradient Descent ======
====== Explanation ======
Gradient descent: shave away little by little until we find the slope (derivative value) we want.

prerequisite:
[[estimated_standard_deviation#
[[:

The documents above obtain the values of a and b directly, using differentiation. Doing this on a computer is not easy. Is there a way to extract the same values by repeated (iterative) calculation instead? That is gradient descent.
First, the documents above (the second one) explained that we look for the value at which SS is at its minimum; here we work with the mean of SS, the MS:

\begin{eqnarray*}
\text{MS} & = & \frac {\text{SS}}{n}
\end{eqnarray*}

<WRAP box>
\begin{eqnarray*}
\text{for a (the constant)} \\
\\
\dfrac{\partial}{\partial a} \text{MSE (Mean Square Error)} & = & \dfrac{\partial}{\partial a} \frac {\sum{(Y_i - (a + bX_i))^2}} {N} \\
& = & \sum \dfrac{\partial}{\partial a} \frac{(Y_i - (a + bX_i))^2} {N} \\
& = & \sum{2 \frac{1}{N} (Y_i - (a + bX_i))} \cdot (-1) \\
& \because & \dfrac{\partial}{\partial a} (Y_i - (a+bX_i)) = -1 \\
& = & -2 \; \frac{\sum{(Y_i - (a + bX_i))}}{N} \\
& = & -2 \;\cdot\; \text{mean of residuals} \\
\end{eqnarray*}
See the gradient function in the R code below.
</WRAP>
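To sanity-check the algebra above, the following small R sketch (an addition, with arbitrary toy data and arbitrary a, b) compares the analytic derivative with respect to a, namely -2 * mean(residuals), against a finite-difference approximation of the MSE:

<code>
# finite-difference check of d(MSE)/da
x <- c(0.5, 2.3, 2.9)
y <- c(1.4, 1.9, 3.2)
a <- 0.3; b <- 0.8                               # arbitrary starting values
mse <- function(a, b) mean((y - (a + b * x))^2)

analytic <- -2 * mean(y - (a + b * x))           # -2 * mean of residuals
h <- 1e-6
numeric  <- (mse(a + h, b) - mse(a - h, b)) / (2 * h)
c(analytic = analytic, numeric = numeric)        # the two should agree closely
</code>
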
<WRAP box>
\begin{eqnarray*}
\text{for b (the coefficient)} \\
\\
\dfrac{\partial}{\partial b} \frac{\sum{(Y_i - (a + bX_i))^2}}{N}
& = & \sum{2 \frac{1}{N} (Y_i - (a + bX_i))} \cdot (-X_i) \\
& \because & \dfrac{\partial}{\partial b} (Y_i - (a+bX_i)) = -X_i \\
& = & -2 \; \frac{\sum{X_i (Y_i - (a + bX_i))}}{N} \\
& = & -2 \;\cdot\; \text{mean of } X_i \cdot \text{residuals} \\
\end{eqnarray*}
(Assuming the reader understands differentiation) the expression above tells us how the MSR (mean square residual) changes as the value of b changes: it is the sum of X_i times the residuals, multiplied by (-2/N).
</WRAP>
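The same derivative drives the update step: moving b a little in the direction opposite to it lowers the MSE. A minimal added illustration (toy data, starting values and learning rate are all arbitrary):

<code>
# one gradient-descent step for b, holding a fixed
x <- c(0.5, 2.3, 2.9)
y <- c(1.4, 1.9, 3.2)
a <- 0; b <- 0
lr <- 0.1                                     # learning rate (arbitrary)

mse <- function(a, b) mean((y - (a + b * x))^2)
db  <- -2 * mean(x * (y - (a + b * x)))       # d(MSE)/db from the box above
b_new <- b - lr * db                          # step against the gradient

c(before = mse(a, b), after = mse(a, b_new))  # MSE drops after the step
</code>
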
====== R code ======
<code>
# statquest explanation toy data (for reference)
# x <- c(0.5, 2.3, 2.9)
# y <- c(1.4, 1.9, 3.2)

rm(list=ls())
library(tidyverse)   # tibble, %>%, slice_head/slice_tail, ggplot2

# set.seed(191)
n <- 300
x <- rnorm(n, 5, 1.2)
y <- 2.14 * x + rnorm(n, 0, 4)

# data <- data.frame(x, y)
data <- tibble(x = x, y = y)

mo <- lm(y~x)
summary(mo)

# set.seed(191)
# Initialize random betas
b1 = rnorm(1)
b0 = rnorm(1)

b1.init <- b1
b0.init <- b0

# Predict function:
predict <- function(x, b0, b1){
  return (b0 + b1 * x)
}

# And loss function is:
residuals <- function(predictions, y){
  return(y - predictions)
}

loss_mse <- function(predictions, y){
  residuals = y - predictions
  return(mean(residuals ^ 2))
}

predictions <- predict(x, b0, b1)
residuals <- residuals(predictions, y)
loss = loss_mse(predictions, y)

data <- tibble(data.frame(x, y, predictions, residuals))

print(paste0("Loss is: ", round(loss)))

# gradient: partial derivatives of the MSE with respect to b0 and b1
# (see the two boxes above: -2 * mean(residuals) and -2 * mean(x * residuals))
gradient <- function(x, y, predictions){
  dinputs = y - predictions
  db1 = -2 * mean(x * dinputs)
  db0 = -2 * mean(dinputs)
  
  return(list("db1" = db1, "db0" = db0))
}

gradients <- gradient(x, y, predictions)
print(gradients)

# Train the model with scaled features
x_scaled <- (x - mean(x)) / sd(x)

learning_rate = 1e-1

# Record Loss for each epoch:
# logs = list()
# bs=list()
b0s = c()
b1s = c()
mse = c()

nlen <- 80
for (epoch in 1:nlen){
  # Predict all y values:
  predictions = predict(x_scaled, b0, b1)
  loss = loss_mse(predictions, y)
  mse = append(mse, loss)
  # logs = append(logs, loss)
  
  if (epoch %% 10 == 0){
    print(paste0("Epoch: ", epoch, ", Loss: ", loss))
  }
  
  gradients <- gradient(x_scaled, y, predictions)
  db1 <- gradients$db1
  db0 <- gradients$db0
  
  b1 <- b1 - db1 * learning_rate
  b0 <- b0 - db0 * learning_rate
  b0s <- append(b0s, b0)
  b1s <- append(b1s, b1)
}

# unscale coefficients to make them comprehensible
b0 = b0 - (mean(x) / sd(x)) * b1
b1 = b1 / sd(x)

# changes of estimators
b0s <- b0s - (mean(x) / sd(x)) * b1s
b1s <- b1s / sd(x)

parameters <- tibble(data.frame(b0s, b1s, mse))

cat(paste0("Slope: ", b1, ",\nIntercept: ", b0, "\n"))
summary(lm(y~x))$coefficients

# scatter plot with one fitted line per epoch:
# all epochs in gray, the first epoch in red, the last epoch in blue
ggplot(data, aes(x = x, y = y)) +
  geom_point(size = 2) +
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters, linewidth = 0.5,
              color = 'gray') +
  theme_classic() +
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters %>% slice_head(),
              linewidth = 1, color = 'red') +
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters %>% slice_tail(),
              linewidth = 1, color = 'blue') +
  labs(title = 'Gradient descent: fitted line per epoch')

b0.init
b1.init

data
parameters

</code>
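The mse vector recorded in the loop is not visualized in the script above. As an optional follow-up (assuming the script has just been run in the same session), the loss curve can be plotted like this:

<code>
# loss per epoch: should drop steeply at first and then flatten out
plot(seq_along(mse), mse, type = "b",
     xlab = "epoch", ylab = "MSE (loss)",
     main = "Loss over 80 epochs of gradient descent")
</code>
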
====== R output ======
<code>
> rm(list=ls())
> # set.seed(191)
> n <- 300
> x <- rnorm(n, 5, 1.2)
> y <- 2.14 * x + rnorm(n, 0, 4)
> 
> # data <- data.frame(x, y)
> data <- tibble(x = x, y = y)
> 
> mo <- lm(y~x)
> summary(mo)

Call:
lm(formula = y ~ x)

Residuals:
    Min      1Q  Median      3Q     Max 
 -9.754  -2.729  -0.135

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.951 on 298 degrees of freedom
Multiple R-squared:
F-statistic:

> 
> # set.seed(191)
> # Initialize random betas
> b1 = rnorm(1)
> b0 = rnorm(1)
> 
> b1.init <- b1
> b0.init <- b0
> 
> # Predict function:
> predict <- function(x, b0, b1){
+   return (b0 + b1 * x)
+ }
> 
> # And loss function is:
> residuals <- function(predictions, y){
+   return(y - predictions)
+ }
> 
> loss_mse <- function(predictions, y){
+   residuals = y - predictions
+   return(mean(residuals ^ 2))
+ }
> 
> predictions <- predict(x, b0, b1)
> residuals <- residuals(predictions, y)
> loss = loss_mse(predictions, y)
> 
> data <- tibble(data.frame(x, y, predictions, residuals))
> 
> print(paste0("Loss is: ", round(loss)))
[1] "Loss is: 393"
> 
> gradient <- function(x, y, predictions){
+   dinputs = y - predictions
+   db1 = -2 * mean(x * dinputs)
+   db0 = -2 * mean(dinputs)
+   
+   return(list("db1" = db1, "db0" = db0))
+ }
> 
> gradients <- gradient(x, y, predictions)
> print(gradients)
$db1
[1] -200.6834

$db0
[1] -37.76994

> 
> # Train the model with scaled features
> x_scaled <- (x - mean(x)) / sd(x)
> 
> learning_rate = 1e-1
> 
> # Record Loss for each epoch:
> # logs = list()
> # bs=list()
> b0s = c()
> b1s = c()
> mse = c()
> 
> nlen <- 80
> for (epoch in 1:nlen){
+   # Predict all y values:
+   predictions = predict(x_scaled, b0, b1)
+   loss = loss_mse(predictions, y)
+   mse = append(mse, loss)
+   # logs = append(logs, loss)
+   
+   if (epoch %% 10 == 0){
+     print(paste0("Epoch: ", epoch, ", Loss: ", loss))
+   }
+   
+   gradients <- gradient(x_scaled, y, predictions)
+   db1 <- gradients$db1
+   db0 <- gradients$db0
+   
+   b1 <- b1 - db1 * learning_rate
+   b0 <- b0 - db0 * learning_rate
+   b0s <- append(b0s, b0)
+   b1s <- append(b1s, b1)
+ }
[1] "
[1] "
[1] "
[1] "
[1] "
[1] "
[1] "
[1] "
> 
> # unscale coefficients to make them comprehensible
> b0 = b0 - (mean(x) / sd(x)) * b1
> b1 = b1 / sd(x)
> 
> # changes of estimators
> b0s <- b0s - (mean(x) / sd(x)) * b1s
> b1s <- b1s / sd(x)
> 
> parameters <- tibble(data.frame(b0s, b1s, mse))
> 
> cat(paste0("Slope: ", b1, ",\nIntercept: ", b0, "\n"))
Slope: 2.26922511738252,
Intercept: -0.779435058320381
> summary(lm(y~x))$coefficients
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.7794352
x            2.2692252
> 
> ggplot(data, aes(x = x, y = y)) +
+   geom_point(size = 2) +
+   geom_abline(aes(intercept = b0s, slope = b1s),
+               data = parameters, linewidth = 0.5,
+               color = 'gray') +
+   theme_classic() +
+   geom_abline(aes(intercept = b0s, slope = b1s),
+               data = parameters %>% slice_head(),
+               linewidth = 1, color = 'red') +
+   geom_abline(aes(intercept = b0s, slope = b1s),
+               data = parameters %>% slice_tail(),
+               linewidth = 1, color = 'blue') +
+   labs(title = 'Gradient descent: fitted line per epoch')
> 
> b0.init
[1] -1.67967
> b1.init
[1] -1.323992
> 
> data
# A tibble: 300 × 4
       x     y predictions residuals
   <dbl> <dbl>       <dbl>     <dbl>
10  3.33  3.80
# ℹ 290 more rows
# ℹ Use `print(n = ...)` to see more rows
> parameters
# A tibble: 80 × 3
      b0s    b1s   mse
    <dbl>  <dbl> <dbl>
 8 -0.0397
 9 -0.186
10 -0.303
# ℹ 70 more rows
# ℹ Use `print(n = ...)` to see more rows

</code>
+ | |||
+ | {{: | ||