====== Gradient Descent ======
====== Explanation ======
Gradient descent: finding the desired slope (the derivative value) by shaving the estimate down little by little.
prerequisite:
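The core idea, before any regression: start from an arbitrary parameter value, compute the derivative of the loss there, and take a small step against it. A minimal one-parameter sketch (the function f(b) = (b - 3)^2, the step size 0.1, and the names below are illustrative choices, not part of the original code):
<code r>
# gradient descent on f(b) = (b - 3)^2, whose minimum is at b = 3
grad_f <- function(b) 2 * (b - 3)  # derivative of (b - 3)^2

b <- 0                             # arbitrary starting value
for (i in 1:50) {
  b <- b - 0.1 * grad_f(b)         # step downhill a little at a time
}
b  # close to 3
</code>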
<WRAP box>
\begin{eqnarray*}
\text{for a, (intercept)} \\
\\
\dfrac{\text{d}}{\text{da}} \frac{\sum{(Y_i - (a + bX_i))^2}}{N}
& = & \sum{2 \frac{1}{N} (Y_i - (a + bX_i))} * (-1) \\
& \because & \dfrac{\text{d}}{\text{da}} (Y_i - (a+bX_i)) = -1 \\
& = & -2 \frac{\sum{(Y_i - (a + bX_i))}}{N} \\
& = & -2 * \text{mean of residuals} \\
\end{eqnarray*}
See the gradient function in the R code below.
</WRAP>
<WRAP box>
\begin{eqnarray*}
\text{for b, (coefficient)} \\
\\
\dfrac{\text{d}}{\text{db}} \frac{\sum{(Y_i - (a + bX_i))^2}}{N}
& = & \sum{2 \frac{1}{N} (Y_i - (a + bX_i))} * (-X_i) \\
& \because & \dfrac{\text{d}}{\text{db}} (Y_i - (a+bX_i)) = -X_i \\
& = & -2 \frac{\sum{X_i (Y_i - (a + bX_i))}}{N} \\
& = & -2 * \text{mean of } (X_i * \text{residuals}) \\
\end{eqnarray*}
(Assuming familiarity with differentiation) the expression above tells how the MSR (mean squared residual) value changes as b changes: it is the sum of the X_i-weighted residuals multiplied by -2/N, that is, -2 times the mean of x * residuals. See the gradient function in the R code below.
</WRAP>
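Both closed-form gradients can be checked numerically. Below is a small sketch (not from the original page; msr_at, eps, and the simulated data are illustrative) comparing the analytic gradients -2 * mean(residuals) and -2 * mean(x * residuals) with central finite differences of the MSR:
<code r>
set.seed(1)
x <- rnorm(50, 5, 1.2)
y <- 2.14 * x + rnorm(50, 0, 4)
a <- 0.3; b <- 1.1                  # arbitrary current estimates
eps <- 1e-6

msr_at <- function(a, b) mean((y - (a + b * x))^2)

res <- y - (a + b * x)
c(-2 * mean(res),                   # analytic d(MSR)/da
  (msr_at(a + eps, b) - msr_at(a - eps, b)) / (2 * eps))
c(-2 * mean(x * res),               # analytic d(MSR)/db
  (msr_at(a, b + eps) - msr_at(a, b - eps)) / (2 * eps))
</code>
The two numbers in each pair should agree to several decimal places.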
====== R code ======
<code r>
# statquest explanation example:
# x <- c(0.5, 2.3, 2.9)
# y <- c(1.4, 1.9, 3.2)

rm(list=ls())
library(tidyverse)  # tibble() and ggplot() below need this
# set.seed(191)
n <- 300
x <- rnorm(n, 5, 1.2)
y <- 2.14 * x + rnorm(n, 0, 4)

# data <- data.frame(x, y)
data <- tibble(x = x, y = y)

mo <- lm(y~x)
summary(mo)

# set.seed(191)
# Initialize random betas
b1 = rnorm(1)
b0 = rnorm(1)

b1.init <- b1
b0.init <- b0

# Predict function:
predict <- function(x, b0, b1){
  return(b0 + b1 * x)
}

# And loss function is:
residuals <- function(predictions, y){
  return(y - predictions)
}

loss_mse <- function(predictions, y){
  return(mean((y - predictions)^2))
}

predictions <- predict(x, b0, b1)
residuals <- residuals(predictions, y)
loss = loss_mse(predictions, y)

data <- tibble(data.frame(x, y, predictions, residuals))

print(paste0("Loss is: ", round(loss)))

gradient <- function(x, y, predictions){
  dinputs = y - predictions       # residuals
  db1 = -2 * mean(x * dinputs)    # d(MSE)/db1
  db0 = -2 * mean(dinputs)        # d(MSE)/db0
  return(list(db1 = db1, db0 = db0))
}

gradients <- gradient(x, y, predictions)
print(gradients)

# Train the model with scaled features
x_scaled <- (x - mean(x)) / sd(x)

learning_rate = 1e-1

# Record Loss for each epoch:
# logs = list()
# bs=list()
b0s = c()
b1s = c()
mse = c()

nlen <- 80
for (epoch in 1:nlen){
  # Predict all y values:
  predictions = predict(x_scaled, b0, b1)
  loss = loss_mse(predictions, y)
  mse = append(mse, loss)
  # logs = append(logs, loss)

  # log progress every 10 epochs
  if (epoch %% 10 == 0){
    print(paste0("MSE: ", round(loss, 3), " for epoch: ", epoch))
  }

  gradients <- gradient(x_scaled, y, predictions)
  db1 <- gradients$db1
  db0 <- gradients$db0

  b1 <- b1 - db1 * learning_rate
  b0 <- b0 - db0 * learning_rate
  b0s <- append(b0s, b0)
  b1s <- append(b1s, b1)
}

# unscale coefficients to make them comprehensible
b0 = b0 - (mean(x) / sd(x)) * b1
b1 = b1 / sd(x)

# changes of estimators
b0s <- b0s - (mean(x) / sd(x)) * b1s
b1s <- b1s / sd(x)

parameters <- tibble(data.frame(b0s, b1s, mse))
cat(paste0("Slope: ", b1, ",\nIntercept: ", b0))
summary(lm(y~x))$coefficients

ggplot(data, aes(x = x, y = y)) +
  geom_point(size = 2) +
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters, linewidth = 0.5,
              color = 'grey') +
  theme_classic() +
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters %>% slice_head(),
              linewidth = 1, color = 'blue') +
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters %>% slice_tail(),
              linewidth = 1, color = 'red') +
  labs(title = 'Regression lines across epochs')

b0.init
b1.init
data
parameters
</code>
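The unscaling step near the end of the code ("unscale coefficients to make them comprehensible") follows from substituting the standardized predictor back into the fitted line. Written out in the notation above, with \(b_{0s}, b_{1s}\) the coefficients learned on the scaled x:
<WRAP box>
\begin{eqnarray*}
\hat{Y} & = & b_{0s} + b_{1s} \frac{X - \bar{X}}{s_X} \\
& = & \left( b_{0s} - \frac{\bar{X}}{s_X} b_{1s} \right) + \frac{b_{1s}}{s_X} X \\
& \therefore & b_0 = b_{0s} - \frac{\bar{X}}{s_X} b_{1s}, \;\;\;\; b_1 = \frac{b_{1s}}{s_X}
\end{eqnarray*}
</WRAP>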
====== R output ======
<code>
> rm(list=ls())
> # set.seed(191)
> n <- 300
> x <- rnorm(n, 5, 1.2)
> y <- 2.14 * x + rnorm(n, 0, 4)
>
> # data <- data.frame(x, y)
> data <- tibble(x = x, y = y)
>
> mo <- lm(y~x)
> summary(mo)

Call:
lm(formula = y ~ x)

Residuals:
   Min     1Q Median     3Q    Max
-9.754 -2.729 -0.135

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)
x
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.951 on 298 degrees of freedom
Multiple R-squared:
F-statistic:

>
> # set.seed(191)
> # Initialize random betas
> b1 = rnorm(1)
> b0 = rnorm(1)
>
> b1.init <- b1
> b0.init <- b0
>
> # Predict function:
> predict <- function(x, b0, b1){
+   return(b0 + b1 * x)
+ }
>
> # And loss function is:
> residuals <- function(predictions, y){
+   return(y - predictions)
+ }
>
> loss_mse <- function(predictions, y){
+   return(mean((y - predictions)^2))
+ }
>
> predictions <- predict(x, b0, b1)
> residuals <- residuals(predictions, y)
> loss = loss_mse(predictions, y)
>
> data <- tibble(data.frame(x, y, predictions, residuals))
>
> print(paste0("Loss is: ", round(loss)))
[1] "Loss is: 393"
>
> gradient <- function(x, y, predictions){
+   dinputs = y - predictions
+   db1 = -2 * mean(x * dinputs)
+   db0 = -2 * mean(dinputs)
+   return(list(db1 = db1, db0 = db0))
+ }
>
> gradients <- gradient(x, y, predictions)
> print(gradients)
$db1
[1] -200.6834

$db0
[1] -37.76994

>
> # Train the model with scaled features
> x_scaled <- (x - mean(x)) / sd(x)
>
> learning_rate = 1e-1
>
> # Record Loss for each epoch:
> # logs = list()
> # bs=list()
> b0s = c()
> b1s = c()
> mse = c()
>
> nlen <- 80
> for (epoch in 1:nlen){
+   # Predict all y values:
+   predictions = predict(x_scaled, b0, b1)
+   loss = loss_mse(predictions, y)
+   mse = append(mse, loss)
+   # logs = append(logs, loss)
+
+   if (epoch %% 10 == 0){
+     print(paste0("MSE: ", round(loss, 3), " for epoch: ", epoch))
+   }
+
+   gradients <- gradient(x_scaled, y, predictions)
+   db1 <- gradients$db1
+   db0 <- gradients$db0
+
+   b1 <- b1 - db1 * learning_rate
+   b0 <- b0 - db0 * learning_rate
+   b0s <- append(b0s, b0)
+   b1s <- append(b1s, b1)
+ }
[1] "
[1] "
[1] "
[1] "
[1] "
[1] "
[1] "
[1] "
>
> # unscale coefficients to make them comprehensible
> b0 = b0 - (mean(x) / sd(x)) * b1
> b1 = b1 / sd(x)
>
> # changes of estimators
> b0s <- b0s - (mean(x) / sd(x)) * b1s
> b1s <- b1s / sd(x)
>
> parameters <- tibble(data.frame(b0s, b1s, mse))
>
> cat(paste0("Slope: ", b1, ",\nIntercept: ", b0))
Slope: 2.26922511738252,
Intercept: -0.779435058320381
> summary(lm(y~x))$coefficients
              Estimate Std. Error t value
(Intercept) -0.7794352
x            2.2692252
>
> ggplot(data, aes(x = x, y = y)) +
+   geom_point(size = 2) +
+   geom_abline(aes(intercept = b0s, slope = b1s),
+               data = parameters, linewidth = 0.5,
+               color = 'grey') +
+   theme_classic() +
+   geom_abline(aes(intercept = b0s, slope = b1s),
+               data = parameters %>% slice_head(),
+               linewidth = 1, color = 'blue') +
+   geom_abline(aes(intercept = b0s, slope = b1s),
+               data = parameters %>% slice_tail(),
+               linewidth = 1, color = 'red') +
+   labs(title = 'Regression lines across epochs')
>
> b0.init
[1] -1.67967
> b1.init
[1] -1.323992
>
> data
# A tibble: 300 × 4
       x     y predictions residuals
   <dbl> <dbl>       <dbl>     <dbl>
 1
 2
 3
 4
 5
 6
 7
 8
 9
10  3.33  3.80
# ℹ 290 more rows
# ℹ Use `print(n = ...)` to see more rows
> parameters
# A tibble: 80 × 3
       b0s   b1s   mse
     <dbl> <dbl> <dbl>
 1
 2
 3
 4
 5
 6
 7
 8 -0.0397
 9 -0.186
10 -0.303
# ℹ 70 more rows
# ℹ Use `print(n = ...)` to see more rows
</code>
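Why train on x_scaled instead of raw x? With raw x the two gradients live on very different scales (mean(x^2) is much larger than 1), so a learning rate of 0.1 overshoots and the loss explodes; after standardizing, the same rate converges. A quick sketch (the simulated data and the helper run below are illustrative, not from the page):
<code r>
set.seed(2)
x <- rnorm(300, 5, 1.2)
y <- 2.14 * x + rnorm(300, 0, 4)

run <- function(x, y, lr = 0.1, epochs = 80) {
  b0 <- 0; b1 <- 0
  for (i in 1:epochs) {
    res <- y - (b0 + b1 * x)
    b1 <- b1 - lr * (-2 * mean(x * res))   # same update rule as above
    b0 <- b0 - lr * (-2 * mean(res))
  }
  mean((y - (b0 + b1 * x))^2)              # final MSE
}

run(x, y)                         # raw x: diverges (huge / NaN loss)
run((x - mean(x)) / sd(x), y)     # scaled x: settles near the noise variance
</code>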
+ | {{: | ||