====== Gradient Descent ======
| - | ====== explanation ====== | ||
| - | |||
| - | ====== Why normalize (scale or make z-score) xi ====== | ||
| - | x 변인의 측정단위로 인해서 b 값이 결정되게 되는데 이 때의 b값은 상당하고 다양한 범위를 가질 수 있다. 가령 월 수입이 (인컴) X 라고 한다면 우리가 추정해야 (추적해야) 할 b값은 수백만이 될 수도 있다.이 값을 gradient로 추적하게 된다면 너무도 많은 iteration을 거쳐야 할 수 있다. 변인이 바뀌면 이 b의 추적범위도 드라마틱하게 바뀌게 된다. 이를 표준화한 x 점수를 사용하게 된다면 일정한 learning rate와 iteration만으로도 정확한 a와 b를 추적할 수 있게 된다. | ||
| - | |||
| - | ====== How to unnormalize (unscale) a and b ====== | ||
| - | \begin{eqnarray*} | ||
| - | y & = & a + b * x \\ | ||
| - | & & \text{we use z instead of x} \\ | ||
| - | & & \text{and } \\ | ||
| - | & & z = \frac{(x - \mu)}{\sigma} \\ | ||
| - | & & \text{suppose that the result after calculation be } \\ | ||
| - | y & = & k + m * z \\ | ||
| - | & = & k + m * \frac{(x - \mu)}{\sigma} \\ | ||
| - | & = & k + \frac{m * x}{\sigma} - \frac{m * \mu}{\sigma} | ||
| - | & = & k - \frac{m * \mu}{\sigma} + \frac{m * x}{\sigma} | ||
| - | & = & k - \frac{\mu}{\sigma} * m + \frac{m}{\sigma} * x \\ | ||
| - | & & \text{therefore, | ||
| - | a & = & k - \frac{\mu}{\sigma} * m \\ | ||
| - | b & = & \frac{m}{\sigma} \\ | ||
| - | \end{eqnarray*} | ||
| - | |||
| - | |||
====== R code: Idea ======
| < | < | ||
| + | library(tidyverse) | ||
| + | library(data.table) | ||
| library(ggplot2) | library(ggplot2) | ||
| library(ggpmisc) | library(ggpmisc) | ||
...
>
</code>
Instead of tracking the estimates one at a time like this, a and b can be estimated simultaneously with gradient descent, as in the next section.
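In update-rule form: at every iteration, each estimate moves a small step against its own gradient, with the learning rate scaling the step size.

\begin{eqnarray*}
a_{new} & = & a - \text{learning rate} * \frac{\partial \text{MSE}}{\partial a} \\
b_{new} & = & b - \text{learning rate} * \frac{\partial \text{MSE}}{\partial b} \\
\end{eqnarray*}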
====== Gradient descent ======
\begin{eqnarray*}
\frac{\partial \text{SSE}}{\partial b} & = & \frac{\partial}{\partial b} \sum{(Y_i - (a + bX_i))^2} \\
& = & -2 \sum{X_i (Y_i - (a + bX_i))} \\
& = & -2 * \sum{X_i * \text{residual}_i} \\
& \rightarrow & -2 * \frac{\sum{X_i * \text{residual}_i}}{n} \qquad \text{(take the mean rather than the sum)} \\
& = & -2 * \overline{X_i * \text{residual}} \\
\end{eqnarray*}
The derivation above differentiates the Sum of Squares (SSE); the code below works with the mean of the squared residuals (MSE) instead, because the raw SSE value grows too large as n grows.
| < | < | ||
| Line 630: | Line 609: | ||
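The analytic gradients used below, da = -2 * mean(residual) and db = -2 * mean(x * residual), can be sanity-checked against a finite-difference approximation of the MSE. A minimal sketch with made-up toy vectors (not the data used elsewhere on this page):

<code>
# numerical check of da = -2*mean(residual) and db = -2*mean(x*residual)
x <- c(1, 2, 3, 4)            # toy data, made up for this check
y <- c(2.3, 4.1, 6.2, 7.9)
a <- 0.5; b <- 1.5            # arbitrary current estimates
mse <- function(a, b) mean((y - (a + b * x))^2)

eps <- 1e-6
da.num <- (mse(a + eps, b) - mse(a - eps, b)) / (2 * eps)
db.num <- (mse(a, b + eps) - mse(a, b - eps)) / (2 * eps)

residual <- y - (a + b * x)
da.ana <- -2 * mean(residual)
db.ana <- -2 * mean(x * residual)

c(da.num, da.ana)             # the two numbers should agree
c(db.num, db.ana)
</code>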
====== R code ======
| < | < | ||
| - | # d statquest explanation | + | # the above no gradient |
| - | # x <- c(0.5, 2.3, 2.9) | + | # mse 값으로 계산 rather than sse |
| - | # y <- c(1.4, 1.9, 3.2) | + | # 후자는 값이 너무 커짐 |
| - | rm(list=ls()) | + | a <- rnorm(1) |
| - | # set.seed(191) | + | b <- rnorm(1) |
| - | n <- 300 | + | a.start |
| - | x <- rnorm(n, 5, 1.2) | + | b.start <- b |
| - | y <- 2.14 * x + rnorm(n, 0, 4) | + | |
| - | # data <- data.frame(x, y) | + | gradient |
| - | data <- tibble(x | + | |
| - | + | db = -2 * mean(x * error) | |
| - | mo <- lm(y~x) | + | da = -2 * mean(error) |
| - | summary(mo) | + | |
| - | + | ||
| - | # set.seed(191) | + | |
| - | # Initialize random betas | + | |
| - | b1 = rnorm(1) | + | |
| - | b0 = rnorm(1) | + | |
| - | + | ||
| - | b1.init <- b1 | + | |
| - | b0.init <- b0 | + | |
| - | + | ||
| - | # Predict function: | + | |
| - | predict <- function(x, b0, b1){ | + | |
| - | return (b0 + b1 * x) | + | |
| } | } | ||
| - | # And loss function is: | + | mseloss |
| - | residuals | + | |
| - | | + | return(mean(residuals^2)) |
| } | } | ||
| - | |||
| - | loss_mse <- function(predictions, | ||
| - | residuals = y - predictions | ||
| - | return(mean(residuals ^ 2)) | ||
| - | } | ||
| - | |||
| - | predictions <- predict(x, b0, b1) | ||
| - | residuals <- residuals(predictions, | ||
| - | loss = loss_mse(predictions, | ||
| - | |||
| - | data <- tibble(data.frame(x, | ||
| - | |||
| - | print(paste0(" | ||
| - | |||
| - | gradient <- function(x, y, predictions){ | ||
| - | dinputs = y - predictions | ||
| - | db1 = -2 * mean(x * dinputs) | ||
| - | db0 = -2 * mean(dinputs) | ||
| - | | ||
| - | return(list(" | ||
| - | } | ||
| - | |||
| - | gradients <- gradient(x, y, predictions) | ||
| - | print(gradients) | ||
| # Train the model with scaled features | # Train the model with scaled features | ||
| - | x_scaled <- (x - mean(x)) / sd(x) | + | learning.rate |
| - | + | ||
| - | learning_rate | + | |
| # Record Loss for each epoch: | # Record Loss for each epoch: | ||
| - | # logs = list() | + | as = c() |
| - | # bs=list() | + | bs = c() |
| - | b0s = c() | + | mses = c() |
| - | b1s = c() | + | sses = c() |
| - | mse = c() | + | mres = c() |
| + | zx <- (x-mean(x))/ | ||
| - | nlen <- 80 | + | nlen <- 50 |
| - | for (epoch in 1:nlen){ | + | for (epoch in 1:nlen) { |
| - | # Predict all y values: | + | predictions |
| - | predictions | + | residual <- residuals(predictions, |
| - | loss = loss_mse(predictions, | + | loss <- mseloss(predictions, |
| - | | + | |
| - | | + | |
| | | ||
| - | | + | |
| - | print(paste0(" | + | |
| - | } | + | |
| | | ||
| - | | + | |
| - | | + | |
| - | | + | |
| + | a <- a-step.a | ||
| | | ||
| - | | + | |
| - | b0 <- b0 - db0 * learning_rate | + | |
| - | b0s <- append(b0s, b0) | + | |
| - | | + | |
| } | } | ||
| + | mses | ||
| + | mres | ||
| + | as | ||
| + | bs | ||
| + | |||
| + | # scaled | ||
| + | a | ||
| + | b | ||
| # unscale coefficients to make them comprehensible | # unscale coefficients to make them comprehensible | ||
| - | b0 = | + | # see http:// |
| - | b1 = b1 / sd(x) | + | # and |
| + | # http:// | ||
| + | # | ||
| + | a = | ||
| + | b = | ||
| + | a | ||
| + | b | ||
| # changes of estimators | # changes of estimators | ||
| - | b0s <- b0s - (mean(x) /sd(x)) * b1s | + | as <- as - (mean(x) /sd(x)) * bs |
| - | b1s <- b1s / sd(x) | + | bs <- bs / sd(x) |
| - | parameters | + | as |
| + | bs | ||
| + | mres | ||
| + | mse.x <- mses | ||
| - | cat(paste0(" | + | parameters <- data.frame(as, |
| + | |||
| + | cat(paste0(" | ||
| summary(lm(y~x))$coefficients | summary(lm(y~x))$coefficients | ||
| + | mses <- data.frame(mses) | ||
| + | mses.log <- data.table(epoch = 1:nlen, mses) | ||
| + | ggplot(mses.log, | ||
| + | geom_line(color=" | ||
| + | theme_classic() | ||
| + | |||
| + | # mres <- data.frame(mres) | ||
| + | mres.log <- data.table(epoch = 1:nlen, mres) | ||
| + | ggplot(mres.log, | ||
| + | geom_line(color=" | ||
| + | theme_classic() | ||
| + | |||
| + | ch <- data.frame(mres, | ||
| + | ch | ||
| + | max(y) | ||
| ggplot(data, | ggplot(data, | ||
| geom_point(size = 2) + | geom_point(size = 2) + | ||
| - | geom_abline(aes(intercept = b0s, slope = b1s), | + | geom_abline(aes(intercept = as, slope = bs), |
| data = parameters, linewidth = 0.5, | data = parameters, linewidth = 0.5, | ||
| color = ' | color = ' | ||
| + | stat_poly_line() + | ||
| + | stat_poly_eq(use_label(c(" | ||
| theme_classic() + | theme_classic() + | ||
| - | geom_abline(aes(intercept = b0s, slope = b1s), | + | geom_abline(aes(intercept = as, slope = bs), |
| data = parameters %>% slice_head(), | data = parameters %>% slice_head(), | ||
| linewidth = 1, color = ' | linewidth = 1, color = ' | ||
| - | geom_abline(aes(intercept = b0s, slope = b1s), | + | geom_abline(aes(intercept = as, slope = bs), |
| data = parameters %>% slice_tail(), | data = parameters %>% slice_tail(), | ||
| linewidth = 1, color = ' | linewidth = 1, color = ' | ||
| labs(title = ' | labs(title = ' | ||
| - | + | summary(lm(y~x)) | |
| - | b0.init | + | a.start |
| - | b1.init | + | b.start |
| - | + | a | |
| - | data | + | b |
| - | parameters | + | |
| </ | </ | ||
====== R output ======
<code>
>
> # the above: no gradient used
> # below, compute the loss as MSE rather than SSE
> # (the latter grows too large)
>
> a <- rnorm(1)
> b <- rnorm(1)
> a.start <- a
> b.start <- b
>
> gradient <- function(x, y, predictions){
+ error = y - predictions
+ db = -2 * mean(x * error)
+ da = -2 * mean(error)
+ return(list("da" = da, "db" = db))
+ }
>
> mseloss <- function(predictions, y){
+ residuals <- (y - predictions)
+ return(mean(residuals^2))
+ }
>
> # Train the model with scaled features
> learning.rate = 0.1
>
> # Record Loss for each epoch:
> as = c()
> bs = c()
> mses = c()
> sses = c()
> mres = c()
> zx <- (x-mean(x))/sd(x)
>
> nlen <- 50
> for (epoch in 1:nlen) {
+ predictions <- a + b * zx
+ residual <- residuals(predictions, y)
+ loss <- mseloss(predictions, y)
+ mres <- append(mres, mean(residual))
+ mses <- append(mses, loss)
+ grad <- gradient(zx, y, predictions)
+ step.b <- grad$db * learning.rate
+ step.a <- grad$da * learning.rate
+ b <- b - step.b
+ a <- a - step.a
+ as <- append(as, a)
+ bs <- append(bs, b)
+ }
> mses
 [1] 12376.887 10718.824       ...
 [9]       ...
[17]  7770.364       ...
[25]  7766.783       ...
[33]  7766.682       ...
[41]  7766.679       ...
[49]  7766.679       ...
> mres
 [1] 60.026423686 48.021138949 38.416911159 30.733528927 24.586823142 19.669458513
 [7] 15.735566811 12.588453449 10.070762759  8.056610207  6.445288166  5.156230533
[13]  4.124984426  3.299987541  2.639990033  2.111992026  1.689593621  1.351674897
[19]  1.081339917  0.865071934  0.692057547  0.553646038  0.442916830  0.354333464
[25]  0.283466771  0.226773417  0.181418734  0.145134987  0.116107990  0.092886392
[31]  0.074309113  0.059447291  0.047557833  0.038046266  0.030437013  0.024349610
[37]  0.019479688  0.015583751  0.012467000  0.009973600  0.007978880  0.006383104
[43]  0.005106483  0.004085187  0.003268149  0.002614519  0.002091616  0.001673292
[49]  0.001338634  0.001070907
> as
 [1] 13.36987 22.97409 30.65748 36.80418 41.72155 45.65544 48.80255 51.32024
 [9] 53.33440 54.94572 56.23478 57.26602 58.09102 58.75102 59.27901 59.70141
[17] 60.03933 60.30967 60.52593 60.69895 60.83736 60.94809 61.03667 61.10754
[25] 61.16423 61.20959 61.24587 61.27490 61.29812 61.31670 61.33156 61.34345
[33] 61.35296 61.36057 61.36666 61.37153 61.37542 61.37854 61.38103 61.38303
[41] 61.38462 61.38590 61.38692 61.38774 61.38839 61.38891 61.38933 61.38967
[49] 61.38993 61.39015
> bs
 [1]       ...
 [9] 26.365585 27.224909 27.913227 28.464570 28.906196 29.259938 29.543285 29.770247
[17] 29.952043 30.097661 30.214302 30.307731 30.382568 30.442512 30.490527 30.528987
[25] 30.559794 30.584470 30.604236 30.620068 30.632750 30.642908 30.651044 30.657562
[33] 30.662782 30.666964 30.670313 30.672996 30.675145 30.676866 30.678245 30.679349
[41] 30.680234 30.680943 30.681510 30.681965 30.682329 30.682621 30.682854 30.683041
[49] 30.683191 30.683311
>
> # scaled
> a
[1] 61.39015
> b
[1] 30.68331
>
> # unscale coefficients to make them comprehensible
> # see http://...
> # and
> # http://...
> a = a - (mean(x)/sd(x)) * b
> b = b / sd(x)
> a
[1] 8.266303
> b
[1] 11.88797
>
> # changes of estimators
> as <- as - (mean(x) /sd(x)) * bs
> bs <- bs / sd(x)
>
> as
 [1] 4.364717 5.189158 5.839931 6.353516 6.758752 7.078428 7.330555 7.529361
 [9] 7.686087 7.809611 7.906942 7.983615 8.043999 8.091541 8.128963 8.158410
[17] 8.181574 8.199791 8.214112 8.225367 8.234209 8.241154 8.246605 8.250884
[25] 8.254239 8.256871 8.258933 8.260549 8.261814 8.262804 8.263579 8.264184
[33] 8.264658 8.265027 8.265315 8.265540 8.265716 8.265852 8.265958 8.266041
[41] 8.266105 8.266155 8.266193 8.266223 8.266246 8.266264 8.266278 8.266289
[49] 8.266297 8.266303
> bs
 [1]       ...
 [9] 10.215107 10.548045 10.814727 11.028340 11.199444 11.336498 11.446279 11.534213
[17] 11.604648 11.661067 11.706258 11.742456 11.771451 11.794676 11.813279 11.828180
[25] 11.840116 11.849676 11.857334 11.863469 11.868382 11.872317 11.875470 11.877995
[33] 11.880018 11.881638 11.882935 11.883975 11.884807 11.885474 11.886009 11.886437
[41] 11.886779 11.887054 11.887274 11.887450 11.887591 11.887704 11.887794 11.887867
[49] 11.887925 11.887972
> mres
 [1] 60.026423686 48.021138949 38.416911159 30.733528927 24.586823142 19.669458513
 [7] 15.735566811 12.588453449 10.070762759  8.056610207  6.445288166  5.156230533
[13]  4.124984426  3.299987541  2.639990033  2.111992026  1.689593621  1.351674897
[19]  1.081339917  0.865071934  0.692057547  0.553646038  0.442916830  0.354333464
[25]  0.283466771  0.226773417  0.181418734  0.145134987  0.116107990  0.092886392
[31]  0.074309113  0.059447291  0.047557833  0.038046266  0.030437013  0.024349610
[37]  0.019479688  0.015583751  0.012467000  0.009973600  0.007978880  0.006383104
[43]  0.005106483  0.004085187  0.003268149  0.002614519  0.002091616  0.001673292
[49]  0.001338634  0.001070907
> mse.x <- mses
>
> parameters <- data.frame(as, bs)
>
> cat(paste0("Intercept: ", a, "\nSlope: ", b))
Intercept: 8.26630323816515
Slope: 11.8879715830899
> summary(lm(y~x))$coefficients
             Estimate Std. Error t value Pr(>|t|)
(Intercept)       ...        ...     ...      ...
x           11.888159        ...     ...      ...
>
> mses <- data.frame(mses)
> mses.log <- data.table(epoch = 1:nlen, mses)
> ggplot(mses.log, aes(x = epoch, y = mses)) +
+   geom_line(color="blue") +
+   theme_classic()
>
> # mres <- data.frame(mres)
> mres.log <- data.table(epoch = 1:nlen, mres)
> ggplot(mres.log, aes(x = epoch, y = mres)) +
+   geom_line(color="red") +
+   theme_classic()
>
> ch <- data.frame(mres, mses)
> ch
           mres      mses
1  60.026423686 12376.887
2  48.021138949 10718.824
3  38.416911159       ...
4  30.733528927       ...
5  24.586823142       ...
6  19.669458513       ...
7  15.735566811       ...
8  12.588453449       ...
9  10.070762759       ...
10  8.056610207       ...
11  6.445288166       ...
12  5.156230533       ...
13  4.124984426       ...
14  3.299987541       ...
15  2.639990033       ...
16  2.111992026       ...
17  1.689593621       ...
18  1.351674897       ...
19  1.081339917       ...
20  0.865071934       ...
21  0.692057547       ...
22  0.553646038       ...
23  0.442916830       ...
24  0.354333464       ...
25  0.283466771       ...
26  0.226773417       ...
27  0.181418734       ...
28  0.145134987       ...
29  0.116107990       ...
30  0.092886392       ...
31  0.074309113       ...
32  0.059447291       ...
33  0.047557833       ...
34  0.038046266       ...
35  0.030437013       ...
36  0.024349610       ...
37  0.019479688       ...
38  0.015583751       ...
39  0.012467000       ...
40  0.009973600       ...
41  0.007978880       ...
42  0.006383104       ...
43  0.005106483       ...
44  0.004085187       ...
45  0.003268149       ...
46  0.002614519       ...
47  0.002091616       ...
48  0.001673292       ...
49  0.001338634       ...
50  0.001070907       ...
> max(y)
[1] 383.1671
> ggplot(data, aes(x = x, y = y)) +
+   geom_point(size = 2) +
+   geom_abline(aes(intercept = as, slope = bs),
+               data = parameters, linewidth = 0.5,
+               color = 'gray') +
+   stat_poly_line() +
+   stat_poly_eq(use_label(c("eq"))) +
+   theme_classic() +
+   geom_abline(aes(intercept = as, slope = bs),
+               data = parameters %>% slice_head(),
+               linewidth = 1, color = 'blue') +
+   geom_abline(aes(intercept = as, slope = bs),
+               data = parameters %>% slice_tail(),
+               linewidth = 1, color = 'red') +
+   labs(title = 'Gradient descent')
> summary(lm(y~x))

Call:
lm(formula = y ~ x)

Residuals:
     Min       1Q   Median       3Q      Max
-259.314      ...      ...      ...      ...

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)      ...        ...     ...      ...
x             11.888        ...     ...      ...
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 88.57 on 198 degrees of freedom
Multiple R-squared:  ...,    Adjusted R-squared:  ...
F-statistic:  ... on 1 and 198 DF,  p-value: ...

> a.start
[1] 1.364582
> b.start
[1] -1.12968
> a
[1] 8.266303
> b
[1] 11.88797
>
</code>
| + | {{: | ||
| + | {{: | ||
| + | {{: | ||
| + | |||
====== Why normalize (scale or make z-score) xi ======
The measurement unit of x determines the magnitude of b, and that magnitude can fall anywhere in a very wide range. If X is, say, monthly income, the b we need to estimate (track) could be in the millions; tracking such a value with gradient descent may take far too many iterations, and the range to search changes dramatically whenever the variable changes. With standardized x scores, a fixed learning rate and a modest number of iterations are enough to track a and b accurately.

====== How to unnormalize (unscale) a and b ======
\begin{eqnarray*}
y & = & a + b * x \\
& & \text{we use z instead of x, where} \\
& & z = \frac{(x - \mu)}{\sigma} \\
& & \text{suppose that the result after calculation is } \\
y & = & k + m * z \\
& = & k + m * \frac{(x - \mu)}{\sigma} \\
& = & k + \frac{m * x}{\sigma} - \frac{m * \mu}{\sigma} \\
& = & k - \frac{m * \mu}{\sigma} + \frac{m * x}{\sigma} \\
& = & \underbrace{k - \frac{\mu}{\sigma} * m}_\text{ 1 } + \underbrace{\frac{m}{\sigma}}_\text{ 2 } * x \\
& & \text{therefore, } \\
a & = & k - \frac{\mu}{\sigma} * m \\
b & = & \frac{m}{\sigma} \\
\end{eqnarray*}

| - | {{: | ||
gradient_descent.1755746039.txt.gz · Last modified: by hkimscil
