====== Gradient Descent ======
====== R code: Idea ======
<code>
...
</code>
Rather than the approach above, the next section estimates a and b simultaneously with gradient descent.
====== Gradient descent ======
\begin{eqnarray*}
\frac{\partial \text{SSE}}{\partial b} & = & -2 \sum{X_i (Y_i - (a + bX_i))} \\
& = & -2 * \sum{(X_i * \text{residual}_i)} \\
& & \text{dividing by n, i.e., differentiating the MSE instead of the SSE:} \\
\frac{\partial \text{MSE}}{\partial b} & = & -2 * \frac{\sum{(X_i * \text{residual}_i)}}{n} \\
& = & -2 * \overline{X_i * \text{residual}} \\
\end{eqnarray*}
The derivation above differentiates the sum of squares (SSE); the R code below works with the MSE (the SSE divided by n) instead, because the raw SSE values become too large.
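Either version can be checked numerically: the analytic gradients -2 * mean(residual) and -2 * mean(x * residual) should agree with finite-difference slopes of the MSE. A minimal sketch, with made-up data and variable names used only for illustration:
<code>
# minimal numerical check of the MSE gradient formulas (illustrative data)
set.seed(1)
x <- rnorm(20); y <- 3 + 2 * x + rnorm(20)
a <- 0; b <- 0                         # current guesses for intercept and slope

mse <- function(a, b) mean((y - (a + b * x))^2)

res <- y - (a + b * x)                 # residuals at (a, b)
c(analytic = -2 * mean(res),           # d MSE / d a
  numeric  = (mse(a + 1e-6, b) - mse(a - 1e-6, b)) / 2e-6)
c(analytic = -2 * mean(x * res),       # d MSE / d b
  numeric  = (mse(a, b + 1e-6) - mse(a, b - 1e-6)) / 2e-6)
</code>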
<code>
...
</code>
====== R code ======
<code>
# the code above did not use gradient descent
# here the loss is computed as MSE rather than SSE
# (the SSE values get too large)
# x, y, data, and the residuals() helper are defined in the preceding block;
# packages used: ggplot2, data.table, dplyr, ggpmisc
a <- rnorm(1)
b <- rnorm(1)
a.start <- a
b.start <- b

gradient <- function(x, y, predictions){
  error = y - predictions
  db = -2 * mean(x * error)
  da = -2 * mean(error)
  return(list("db" = db, "da" = da))
}
mseloss <- function(predictions, y){
  residuals <- (y - predictions)
  return(mean(residuals^2))
}

# Train the model with scaled features
learning.rate <- 0.1

# Record Loss for each epoch:
as = c()
bs = c()
mses = c()
sses = c()
mres = c()
zx <- (x-mean(x))/sd(x)

nlen <- 50
for (epoch in 1:nlen) {
  predictions <- a + b * zx
  residual <- residuals(predictions, y)
  loss <- mseloss(predictions, y)
  mres <- append(mres, mean(residual))
  mses <- append(mses, loss)

  grad <- gradient(zx, y, predictions)
  step.b <- grad$db * learning.rate
  step.a <- grad$da * learning.rate
  b <- b - step.b
  a <- a - step.a

  as <- append(as, a)
  bs <- append(bs, b)
}
mses
mres
as
bs

# scaled
a
b
# unscale coefficients to make them comprehensible
# see http://...
# and
# http://...
a = a - (mean(x) / sd(x)) * b
b = b / sd(x)
a
b
# changes of estimators
as <- as - (mean(x) / sd(x)) * bs
bs <- bs / sd(x)
as
bs
mres
mse.x <- mses

parameters <- data.frame(as, bs, mses)

cat(paste0("Intercept: ", a, "\nSlope: ", b))
summary(lm(y~x))$coefficients

mses <- data.frame(mses)
mses.log <- data.table(epoch = 1:nlen, mses)
ggplot(mses.log, aes(x = epoch, y = mses)) +
  geom_line() +
  theme_classic()

# mres <- data.frame(mres)
mres.log <- data.table(epoch = 1:nlen, mres)
ggplot(mres.log, aes(x = epoch, y = mres)) +
  geom_line() +
  theme_classic()

ch <- data.frame(mres, mses)
ch
max(y)
ggplot(data, aes(x = x, y = y)) +
  geom_point(size = 2) +
  geom_abline(aes(intercept = as, slope = bs),
              data = parameters, linewidth = 0.5,
              color = 'gray') +
  stat_poly_line() +
  stat_poly_eq(use_label(c("eq"))) +
  theme_classic() +
  geom_abline(aes(intercept = as, slope = bs),
              data = parameters %>% slice_head(),
              linewidth = 1, color = 'red') +
  geom_abline(aes(intercept = as, slope = bs),
              data = parameters %>% slice_tail(),
              linewidth = 1, color = 'blue') +
  labs(title = '...')
summary(lm(y~x))
a.start
b.start
a
b
</code>
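One practical note on the two constants above: because the standardized predictor zx has mean zero, each epoch shrinks the mean residual by the factor (1 - 2 * learning.rate), which is the 0.8 ratio visible in the mres trace in the output below and the reason learning.rate = 0.1 with nlen = 50 is enough here. The self-contained sketch below, whose gd.fit helper and toy data are made up purely for illustration, shows how the same update behaves when the rate is too small or too large:
<code>
# illustrative sketch only: how the learning rate affects convergence
gd.fit <- function(lr, nlen = 50) {
  set.seed(7)
  x <- rnorm(100); y <- 5 + 3 * x + rnorm(100)
  zx <- (x - mean(x)) / sd(x)
  a <- 0; b <- 0
  for (i in 1:nlen) {
    res <- y - (a + b * zx)           # residuals at the current a, b
    a <- a + lr * 2 * mean(res)       # a <- a - lr * (-2 * mean(res))
    b <- b + lr * 2 * mean(zx * res)  # b <- b - lr * (-2 * mean(zx * res))
  }
  c(a = a, b = b)
}
gd.fit(0.1)    # reaches the least-squares estimates within 50 epochs
gd.fit(0.001)  # same formulas, but still far from them after 50 epochs
gd.fit(1.1)    # step too large: the estimates oscillate and blow up
</code>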
====== R output ======
<code>
> # the code above did not use gradient descent
> # here the loss is computed as MSE rather than SSE
> # (the SSE values get too large)
>
> a <- rnorm(1)
> b <- rnorm(1)
> a.start <- a
> b.start <- b
>
> gradient <- function(x, y, predictions){
+   error = y - predictions
+   db = -2 * mean(x * error)
+   da = -2 * mean(error)
+   return(list("db" = db, "da" = da))
+ }
>
> mseloss <- function(predictions, y){
+   residuals <- (y - predictions)
+   return(mean(residuals^2))
+ }
>
> # Train the model with scaled features
> learning.rate <- 0.1
>
> # Record Loss for each epoch:
> as = c()
> bs = c()
> mses = c()
> sses = c()
> mres = c()
> zx <- (x-mean(x))/sd(x)
>
> nlen <- 50
> for (epoch in 1:nlen) {
+   predictions <- a + b * zx
+   residual <- residuals(predictions, y)
+   loss <- mseloss(predictions, y)
+   mres <- append(mres, mean(residual))
+   mses <- append(mses, loss)
+
+   grad <- gradient(zx, y, predictions)
+   step.b <- grad$db * learning.rate
+   step.a <- grad$da * learning.rate
+   b <- b - step.b
+   a <- a - step.a
+
+   as <- append(as, a)
+   bs <- append(bs, b)
+ }
> mses
 [1] 12376.887 10718.824 ...
 [9] ...
[17]  7770.364 ...
[25]  7766.783 ...
[33]  7766.682 ...
[41]  7766.679 ...
[49]  7766.679 ...
> mres
 [1] 60.026423686 48.021138949 38.416911159 30.733528927 24.586823142 19.669458513
 [7] 15.735566811 12.588453449 10.070762759 ...
[13]  4.124984426 ...
[19]  1.081339917 ...
[25]  0.283466771 ...
[31]  0.074309113 ...
[37]  0.019479688 ...
[43]  0.005106483 ...
[49]  0.001338634 ...
> as
 [1] ...
 [9] 53.33440 54.94572 56.23478 57.26602 58.09102 58.75102 59.27901 59.70141
[17] 60.03933 60.30967 60.52593 60.69895 60.83736 60.94809 61.03667 61.10754
[25] 61.16423 61.20959 61.24587 61.27490 61.29812 61.31670 61.33156 61.34345
[33] 61.35296 61.36057 61.36666 61.37153 61.37542 61.37854 61.38103 61.38303
[41] 61.38462 61.38590 61.38692 61.38774 61.38839 61.38891 61.38933 61.38967
[49] 61.38993 61.39015
> bs
 [1] ...
 [9] 26.365585 27.224909 27.913227 28.464570 28.906196 29.259938 29.543285 29.770247
[17] 29.952043 30.097661 30.214302 30.307731 30.382568 30.442512 30.490527 30.528987
[25] 30.559794 30.584470 30.604236 30.620068 30.632750 30.642908 30.651044 30.657562
[33] 30.662782 30.666964 30.670313 30.672996 30.675145 30.676866 30.678245 30.679349
[41] 30.680234 30.680943 30.681510 30.681965 30.682329 30.682621 30.682854 30.683041
[49] 30.683191 30.683311
>
> # scaled
> a
[1] 61.39015
> b
[1] 30.68331
>
> # unscale coefficients to make them comprehensible
> # see http://...
> # and
> # http://...
> a = a - (mean(x) / sd(x)) * b
> b = b / sd(x)
> a
[1] 8.266303
> b
[1] 11.88797
>
> # changes of estimators
> as <- as - (mean(x) /sd(x)) * bs
> bs <- bs / sd(x)
>
> as
 [1] 4.364717 5.189158 5.839931 6.353516 6.758752 7.078428 7.330555 7.529361
 [9] 7.686087 7.809611 7.906942 7.983615 8.043999 8.091541 8.128963 8.158410
[17] 8.181574 8.199791 8.214112 8.225367 8.234209 8.241154 8.246605 8.250884
[25] 8.254239 8.256871 8.258933 8.260549 8.261814 8.262804 8.263579 8.264184
[33] 8.264658 8.265027 8.265315 8.265540 8.265716 8.265852 8.265958 8.266041
[41] 8.266105 8.266155 8.266193 8.266223 8.266246 8.266264 8.266278 8.266289
[49] 8.266297 8.266303
> bs
 [1] ...
 [9] 10.215107 10.548045 10.814727 11.028340 11.199444 11.336498 11.446279 11.534213
[17] 11.604648 11.661067 11.706258 11.742456 11.771451 11.794676 11.813279 11.828180
[25] 11.840116 11.849676 11.857334 11.863469 11.868382 11.872317 11.875470 11.877995
[33] 11.880018 11.881638 11.882935 11.883975 11.884807 11.885474 11.886009 11.886437
[41] 11.886779 11.887054 11.887274 11.887450 11.887591 11.887704 11.887794 11.887867
[49] 11.887925 11.887972
> mres
 [1] 60.026423686 48.021138949 38.416911159 30.733528927 24.586823142 19.669458513
 [7] 15.735566811 12.588453449 10.070762759 ...
[13]  4.124984426 ...
[19]  1.081339917 ...
[25]  0.283466771 ...
[31]  0.074309113 ...
[37]  0.019479688 ...
[43]  0.005106483 ...
[49]  0.001338634 ...
> mse.x <- mses
>
> parameters <- data.frame(as, bs, mses)
>
> cat(paste0("Intercept: ", a, "\nSlope: ", b))
Intercept: 8.26630323816515
Slope: 11.8879715830899
> summary(lm(y~x))$coefficients
             Estimate Std. Error ...
(Intercept)       ...
x           11.888159 ...
>
> mses <- data.frame(mses)
> mses.log <- data.table(epoch = 1:nlen, mses)
> ggplot(mses.log, aes(x = epoch, y = mses)) +
+   geom_line() +
+   theme_classic()
>
> # mres <- data.frame(mres)
> mres.log <- data.table(epoch = 1:nlen, mres)
> ggplot(mres.log, aes(x = epoch, y = mres)) +
+   geom_line() +
+   theme_classic()
>
> ch <- data.frame(mres, mses)
> ch
           mres      mses
1  60.026423686 12376.887
2  48.021138949 10718.824
3  38.416911159       ...
4  30.733528927       ...
5  24.586823142       ...
6  19.669458513       ...
7  15.735566811       ...
8  12.588453449       ...
9  10.070762759       ...
10  8.056610207       ...
11  6.445288166       ...
12  5.156230533       ...
13  4.124984426       ...
14  3.299987541       ...
15  2.639990033       ...
16  2.111992026       ...
17  1.689593621       ...
18  1.351674897       ...
19  1.081339917       ...
20  0.865071934       ...
21  0.692057547       ...
22  0.553646038       ...
23  0.442916830       ...
24  0.354333464       ...
25  0.283466771       ...
26  0.226773417       ...
27  0.181418734       ...
28  0.145134987       ...
29  0.116107990       ...
30  0.092886392       ...
31  0.074309113       ...
32  0.059447291       ...
33  0.047557833       ...
34  0.038046266       ...
35  0.030437013       ...
36  0.024349610       ...
37  0.019479688       ...
38  0.015583751       ...
39  0.012467000       ...
40  0.009973600       ...
41  0.007978880       ...
42  0.006383104       ...
43  0.005106483       ...
44  0.004085187       ...
45  0.003268149       ...
46  0.002614519       ...
47  0.002091616       ...
48  0.001673292       ...
49  0.001338634       ...
50  0.001070907       ...
> max(y)
[1] 383.1671
> ggplot(data, aes(x = x, y = y)) +
+   geom_point(size = 2) +
+   geom_abline(aes(intercept = as, slope = bs),
+               data = parameters, linewidth = 0.5,
+               color = 'gray') +
+   stat_poly_line() +
+   stat_poly_eq(use_label(c("eq"))) +
+   theme_classic() +
+   geom_abline(aes(intercept = as, slope = bs),
+               data = parameters %>% slice_head(),
+               linewidth = 1, color = 'red') +
+   geom_abline(aes(intercept = as, slope = bs),
+               data = parameters %>% slice_tail(),
+               linewidth = 1, color = 'blue') +
+   labs(title = '...')
> summary(lm(y~x))

Call:
lm(formula = y ~ x)

Residuals:
     Min       1Q   Median       3Q      Max 
-259.314      ...

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)      ...
x                ...
---
Signif. codes:  ...

Residual standard error: 88.57 on 198 degrees of freedom
Multiple R-squared:  ...
F-statistic: ...

> a.start
[1] 1.364582
> b.start
[1] -1.12968
> a
[1] 8.266303
> b
[1] 11.88797
>
</code>
{{: ... |MSE (loss) by epoch}}
{{: ... |Mean residual by epoch}}
{{: ... |Data with the regression line estimated at each epoch}}
====== Why normalize (scale or make z-score) xi ======
The measurement unit of x determines the size of b, so b can fall in a very wide range. If X is, say, monthly income, the b we have to estimate (track) could be in the millions; tracking such a value by gradient descent could take an enormous number of iterations, and the search range changes dramatically whenever the variable changes. Using the standardized (z-score) version of x instead lets us recover the exact a and b with a fixed learning rate and a modest number of iterations.
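As an illustration (the income figures below are invented), compare the size of the two gradients when x is left on its raw scale versus standardized; with raw x, no single learning rate suits both the intercept and the slope:
<code>
# illustrative only: gradient sizes on raw vs. standardized x
set.seed(3)
income <- rnorm(200, mean = 3e6, sd = 5e5)   # x: monthly income, large units
y      <- 0.002 * income + rnorm(200, 0, 300)

a <- 0; b <- 0
res <- y - (a + b * income)
c(da = -2 * mean(res), db = -2 * mean(income * res))
# da is roughly 10^4 while db is roughly 10^10: one learning rate cannot
# fit both, so the updates either diverge (for b) or crawl (for a)

z   <- (income - mean(income)) / sd(income)
res <- y - (a + b * z)
c(da = -2 * mean(res), db = -2 * mean(z * res))
# after standardizing, both gradients are of comparable size
</code>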

====== How to unnormalize (unscale) a and b ======
\begin{eqnarray*}
y & = & a + b * x \\
& & \text{we use z instead of x, where} \\
& & z = \frac{(x - \mu)}{\sigma} \\
& & \text{suppose that the result after the calculation is} \\
y & = & k + m * z \\
& = & k + m * \frac{(x - \mu)}{\sigma} \\
& = & k + \frac{m * x}{\sigma} - \frac{m * \mu}{\sigma} \\
& = & k - \frac{m * \mu}{\sigma} + \frac{m * x}{\sigma} \\
& = & \underbrace{k - \frac{\mu}{\sigma} * m}_\text{ 1 } + \underbrace{\frac{m}{\sigma}}_\text{ 2 } * x \\
& & \text{therefore, } \\
a & = & k - \frac{\mu}{\sigma} * m \\
b & = & \frac{m}{\sigma} \\
\end{eqnarray*}
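A quick way to confirm the two underbraced terms: fit the regression on the z score, convert k and m back with the formulas above, and compare with the coefficients from the raw-x regression. The sketch below uses made-up data purely for illustration:
<code>
# illustrative check of the unscaling formulas
set.seed(5)
x <- rnorm(100, 50, 10)
y <- 4 + 0.7 * x + rnorm(100)
z <- (x - mean(x)) / sd(x)

km <- coef(lm(y ~ z))                      # k = km[1], m = km[2] on the z scale
a  <- km[1] - (mean(x) / sd(x)) * km[2]    # a = k - (mu / sigma) * m
b  <- km[2] / sd(x)                        # b = m / sigma
rbind(unscaled = c(a, b), direct = coef(lm(y ~ x)))   # the two rows match
</code>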