Gradient Descent

Gradient descent = finding the desired slope (the derivative) by adjusting it a little at a time.
prerequisites:
Why the mean is used when inferring the standard deviation: 실험적_수학적_이해 (an experimental and mathematical understanding)
derivation of a and b in a simple regression

The document above obtained the values of a and b directly by differentiation. That approach is not easy to carry out on a computer. Is there a way to extract these values through repeated, iterative calculation instead? That is gradient descent.
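Gradient descent repeats one simple update: move each parameter a small step, scaled by a learning rate, in the direction opposite its gradient:

\begin{eqnarray*}
a_{\text{new}} & = & a_{\text{old}} - \text{learning rate} \times \dfrac{\partial \text{MSE}}{\partial a} \\
b_{\text{new}} & = & b_{\text{old}} - \text{learning rate} \times \dfrac{\partial \text{MSE}}{\partial b}
\end{eqnarray*}

This is the standard update rule; it is exactly what the b0 <- b0 - db0 * learning_rate and b1 <- b1 - db1 * learning_rate lines in the R loop below implement.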

The second document above explained that we look for the point where the SS value reaches its minimum; the same reasoning holds if SS is replaced by the MS value.

\begin{eqnarray*} \text{MS} & = & \frac {\text{SS}}{n} \end{eqnarray*}

Taking the derivative of the MSE (Mean Square Error) with respect to the intercept a:

\begin{eqnarray*}
\dfrac{\partial}{\partial a} \text{MSE} & = & \dfrac{\partial}{\partial a} \frac {\sum{(Y_i - (a + bX_i))^2}} {N} \\
& = & \sum \dfrac{\partial}{\partial a} \frac{(Y_i - (a + bX_i))^2} {N} \\
& = & \frac{1}{N} \sum{ 2 (Y_i - (a + bX_i)) \cdot (-1)} \\
& & \because \dfrac{\partial}{\partial a} (Y_i - (a+bX_i)) = -1 \\
& = & - \frac{2}{N} \sum{(Y_i - (a + bX_i))}
\end{eqnarray*}
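The same steps applied to the slope b (added here for completeness; this is the quantity computed as db1 in the R gradient() function further below):

\begin{eqnarray*}
\dfrac{\partial}{\partial b} \text{MSE} & = & \dfrac{\partial}{\partial b} \frac {\sum{(Y_i - (a + bX_i))^2}} {N} \\
& = & \frac{1}{N} \sum{ 2 (Y_i - (a + bX_i)) \cdot (-X_i)} \\
& & \because \dfrac{\partial}{\partial b} (Y_i - (a+bX_i)) = -X_i \\
& = & - \frac{2}{N} \sum{ X_i (Y_i - (a + bX_i))}
\end{eqnarray*}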

library(tidyverse)

# A tiny three-point example (from the StatQuest explanation), shown for
# reference only; it is cleared just below before the simulation runs.
x <- c(0.5, 2.3, 2.9)
y <- c(1.4, 1.9, 3.2)

rm(list=ls())       # clear the toy example
# set.seed(191)     # uncomment for reproducible random numbers
n <- 500
x <- rnorm(n, 5, 1.2)             # predictor: mean 5, sd 1.2
y <- 2.14 * x + rnorm(n, 0, 4)    # true slope 2.14, intercept 0, plus noise

# data <- data.frame(x, y)
data <- tibble(x = x, y = y)
data

# Ordinary least squares fit, for comparison with the gradient-descent result
mo <- lm(y~x)
summary(mo)

# set.seed(191)
# Initialize random betas
b1 = rnorm(1)
b0 = rnorm(1)

# Predict function:
predict <- function(x, b0, b1){
  return (b0 + b1 * x)
}

# Residuals:
residuals <- function(predictions, y) {
  return(y - predictions)
}

# Loss function (mean squared error):
loss_mse <- function(predictions, y){
  residuals = y - predictions
  return(mean(residuals ^ 2))
}

predictions <- predict(x, b0, b1)
resids <- residuals(predictions, y)   # stored under a new name so the
                                      # residuals() helper is not overwritten
loss <- loss_mse(predictions, y)

temp.sum <- data.frame(x, y, b0, b1, predictions, resids)
temp.sum

print(paste0("Loss is: ", round(loss)))

gradient <- function(x, y, predictions){
  dinputs = y - predictions              # residuals
  db1 = -2 * mean(x * dinputs)           # dMSE/db, see the derivation above
  db0 = -2 * mean(dinputs)               # dMSE/da, see the derivation above
  
  return(list("db1" = db1, "db0" = db0))
}

gradients <- gradient(x, y, predictions)
print(gradients)
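As a quick sanity check (an addition, not part of the original write-up), the analytic gradients can be compared with a centered finite-difference approximation of the MSE. The step size eps below is an arbitrary small value chosen for illustration.

# Numerical check of the analytic gradients via finite differences
eps <- 1e-6
num_db0 <- (loss_mse(predict(x, b0 + eps, b1), y) -
            loss_mse(predict(x, b0 - eps, b1), y)) / (2 * eps)
num_db1 <- (loss_mse(predict(x, b0, b1 + eps), y) -
            loss_mse(predict(x, b0, b1 - eps), y)) / (2 * eps)
c(analytic_db0 = gradients$db0, numeric_db0 = num_db0)
c(analytic_db1 = gradients$db1, numeric_db1 = num_db1)

The two pairs should agree to several decimal places, confirming that gradient() matches the calculus worked out earlier.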

# Train the model with scaled (standardized) features; centering and scaling x
# keeps the two gradients on a similar scale, so one learning rate works for both
x_scaled <- (x - mean(x)) / sd(x)

learning_rate = 1e-1

# Record the loss and the coefficients at each epoch:
logs = list()
b0s = c()
b1s = c()
msr = c()

nlen <- 80
for (epoch in 1:nlen){
  # Predict all y values:
  predictions = predict(x_scaled, b0, b1)
  loss = loss_mse(predictions, y)
  msr = append(msr, loss)
  
  logs = append(logs, loss)
  
  if (epoch %% 10 == 0){
    print(paste0("Epoch: ",epoch, ", Loss: ", round(loss, 5)))
  }
  
  gradients <- gradient(x_scaled, y, predictions)
  db1 <- gradients$db1
  db0 <- gradients$db0
  
  # Gradient-descent update: step each coefficient against its gradient
  b1 <- b1 - db1 * learning_rate
  b0 <- b0 - db0 * learning_rate
  b0s <- append(b0s, b0)
  b1s <- append(b1s, b1)
}
# Un-scale the coefficients so they apply to the original (unstandardized) x.
# Note the order: b0 is corrected while b1 is still on the scaled scale.
b0 <- b0 - (mean(x) / sd(x)) * b1
b1 <- b1 / sd(x)

b0s <- b0s - (mean(x) / sd(x)) * b1s
b1s <- b1s / sd(x)
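The un-scaling step follows from substituting the definition of the standardized predictor back into the fitted line:

\begin{eqnarray*}
\hat{Y} & = & b_0 + b_1 \frac{X - \bar{X}}{s_X} \\
& = & \left( b_0 - b_1 \frac{\bar{X}}{s_X} \right) + \frac{b_1}{s_X} X
\end{eqnarray*}

so the intercept on the original scale is the term in parentheses and the slope is b1 / sd(x), which is what the code above computes.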

# One row per epoch: intercept, slope, and MSE
parameters <- tibble(b0s = b0s, b1s = b1s, msr = msr)

cat(paste0("Inclination: ", b1, ", \n", "Intercept: ", b0, "\n"))
summary(lm(y~x))$coefficients

ggplot(data, aes(x = x, y = y)) + 
  geom_point(size = 2) + 
  geom_abline(aes(intercept = b0s, slope = b1s),
              data = parameters, linewidth = 0.5, color = 'red') + 
  theme_classic() +
  geom_abline(aes(intercept = b0s, slope = b1s), 
              data = parameters %>% slice_head(), 
              linewidth = 0.5, color = 'blue') + 
  geom_abline(aes(intercept = b0s, slope = b1s), 
              data = parameters %>% slice_tail(), 
              linewidth = 1, color = 'green') +
  labs(title = 'Gradient descent: blue: start, green: end')
data
parameters
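Because msr stores the MSE at every epoch, a small loss-curve plot (an addition, not in the original wiki code; it only uses the parameters tibble built above) makes the descent itself visible:

ggplot(parameters, aes(x = seq_along(msr), y = msr)) +
  geom_line(color = 'red') +
  theme_classic() +
  labs(x = 'epoch', y = 'MSE', title = 'Loss per epoch')

The curve should drop steeply over the first epochs and then flatten as the coefficients approach the least-squares solution.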