df <- read.csv(file="https://stats.idre.ucla.edu/stat/data/binary.csv")
X <- as.matrix(data.frame(intercept=rep(x=1, times=nrow(df)), gpa=df$gpa, gre=df$gre))
y <- df$admit
In this document, I demonstrate how the Newton-Raphson Method can be used to approximate the Maximum Likelihood Estimates of the parameters of a Logistic Regression model.
1 Logistic Regression
Let \(Y=(Y_{1},Y_{2},…,Y_{n})^{T}\) be a random sample of size \(n\) where \(Y_{i}\sim B(1,\pi_{i})\), \(i=1,2,…,n\), are \(n\) independent random scalars which follow a Bernoulli distribution with parameter \(\pi_{i}\). Let \(y=(y_{1},y_{2},…,y_{n})^{T}\) be an observed random sample in which \(y_{i}\), \(i=1,2,…,n\), are \(n\) realisations of the random scalars \(Y_{i}\), \(i=1,2,…,n\). The probability mass function of \(Y_{i}\) is
\[ \begin{align*} f(y_{i};\pi_{i})=P(y_{i};\pi_{i})=P(Y_{i}=y_{i})=\pi_{i}^{y_{i}}(1-\pi_{i})^{1-y_{i}} \end{align*} \]
Let \(X=(x_{1},x_{2},…,x_{n})^{T}\) be an \(n\times p\) data matrix in which each row \(x_{i}^{T}\), \(i=1,2,…,n\), represents an observation and each column represents a covariate. Let \(\beta=(\beta_{1},\beta_{2},…,\beta_{p})^{T}\) be the parameters of the Multiple Logistic Regression model. The model is
\[ \begin{align*} \log\left(\frac{\pi_{i}}{1-\pi_{i}}\right)=x_{i}^{T}\beta=\beta_{1}x_{i1}+\beta_{2}x_{i2}+…+\beta_{p}x_{ip} \end{align*} \] or
\[ \begin{align*} \pi_{i}=\frac{e^{x_{i}^{T}\beta}}{1+e^{x_{i}^{T}\beta}}=\frac{e^{\beta_{1}x_{i1}+\beta_{2}x_{i2}+…+\beta_{p}x_{ip}}}{1+e^{\beta_{1}x_{i1}+\beta_{2}x_{i2}+…+\beta_{p}x_{ip}}} \end{align*} \]
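For a quick illustration of this transformation in \(\verb|R|\): given a data matrix and a coefficient vector, the probabilities \(\pi_{i}\) follow directly from the linear predictors. The snippet below is a minimal sketch, assuming the data matrix \(\verb|X|\) prepared at the top of this document; the coefficient vector \(\verb|beta|\) is only an illustrative placeholder.
beta <- rep(x=0, times=ncol(X)) # Placeholder coefficients (illustrative)
eta <- as.vector(X%*%beta) # Linear predictors x_i^T beta
p <- exp(eta)/(1+exp(eta)) # Probabilities pi_i = P(Y_i = 1)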
2 Maximum Likelihood Estimation
In Maximum Likelihood Estimation, the problem is to find the value of \(\beta=(\beta_{1},\beta_{2},…,\beta_{p})^{T}\) which solves
\[ \begin{align*} \max_{\beta} L(\beta) \end{align*} \]
where \(L(\beta)\) is the likelihood function.
2.1 Likelihood Function
The likelihood function for \(\beta\) is
\[ \begin{align*} L(\beta) &=\prod_{i=1}^{n}f(y_{i};\pi_{i}) =\prod_{i=1}^{n}P(y_{i};\pi_{i}) =\prod_{i=1}^{n}P(Y_{i}=y_{i})\\ &=\prod_{i=1}^{n}\pi_{i}^{y_{i}}(1-\pi_{i})^{1-y_{i}}\\ &=\prod_{i=1}^{n}\left(\frac{e^{x_{i}^{T}\beta}}{1+e^{x_{i}^{T}\beta}}\right)^{y_{i}}\left(\frac{1}{1+e^{x_{i}^{T}\beta}}\right)^{1-y_{i}} \end{align*} \]
Therefore, the log-likelihood function for \(\beta\) is
\[ \begin{align*} l(\beta) &=\sum_{i=1}^{n}\left[y_{i}\log{\left(\frac{e^{x_{i}^{T}\beta}}{1+e^{x_{i}^{T}\beta}}\right)}+(1-y_{i})\log{\left(\frac{1}{1+e^{x_{i}^{T}\beta}}\right)} \right] \\ &=\sum_{i=1}^{n}\left[y_{i}x_{i}^{T}\beta-y_{i}\log{\left(1+e^{x_{i}^{T}\beta}\right)}-\log{\left(1+e^{x_{i}^{T}\beta}\right)}+y_{i}\log{\left(1+e^{x_{i}^{T}\beta}\right)}\right] \\ &=\sum_{i=1}^{n}\left[y_{i}x_{i}^{T}\beta-\log{\left(1+e^{x_{i}^{T}\beta}\right)}\right] \end{align*} \]
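For reference, this final expression is easy to evaluate in \(\verb|R|\). The function below is a minimal sketch, assuming the variables \(\verb|X|\) and \(\verb|y|\) prepared at the top of this document; the name \(\verb|logLikLogReg|\) is illustrative and not used elsewhere.
logLikLogReg <- function(b, X, y) {
  eta <- as.vector(X%*%b) # Linear predictors x_i^T beta
  sum(y*eta-log(1+exp(eta))) # Log-likelihood l(beta)
}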
2.2 First Derivative
The first derivative of the log-likelihood function with respect to the \(r\)-th parameter \(\beta_{r}\), \(r=1,2,…,p\), is
\[ \begin{align*} \frac{dl}{d\beta_{r}} &=\frac{d}{d\beta_{r}}\sum_{i=1}^{n}\left[y_{i}x_{i}^{T}\beta-\log{\left(1+e^{x_{i}^{T}\beta}\right)}\right]\\ &=\frac{d}{d\beta_{r}}\sum_{i=1}^{n}\left[y_{i}x_{i1}\beta_{1}+y_{i}x_{i2}\beta_{2}+…+y_{i}x_{ip}\beta_{p}-\log{\left(1+e^{x_{i1}\beta_{1}+x_{i2}\beta_{2}+…+x_{ip}\beta_{p}}\right)}\right]\\ &=\sum_{i=1}^{n}\left(y_{i}x_{ir}-\frac{e^{x_{i1}\beta_{1}+x_{i2}\beta_{2}+…+x_{ip}\beta_{p}}}{1+e^{x_{i1}\beta_{1}+x_{i2}\beta_{2}+…+x_{ip}\beta_{p}}}x_{ir}\right)\\ &=\sum_{i=1}^{n}\left[\left(y_{i}-\frac{e^{x_{i}^{T}\beta}}{1+e^{x_{i}^{T}\beta}}\right)x_{ir}\right]\\ &=\sum_{i=1}^{n}(y_{i}-\pi_{i})x_{ir} \end{align*} \]
Therefore,
\[ \begin{align*} l'(\beta) &=\frac{dl}{d\beta}\\ &=\left[\frac{dl}{d\beta_{1}}\;\;\frac{dl}{d\beta_{2}}\;\;…\;\;\frac{dl}{d\beta_{p}}\right]^{T}\\ &= \begin{bmatrix}(y_{1}-\pi_{1})x_{11}+(y_{2}-\pi_{2})x_{21}+…+(y_{n}-\pi_{n})x_{n1}\\(y_{1}-\pi_{1})x_{12}+(y_{2}-\pi_{2})x_{22}+…+(y_{n}-\pi_{n})x_{n2}\\\vdots\\(y_{1}-\pi_{1})x_{1p}+(y_{2}-\pi_{2})x_{2p}+…+(y_{n}-\pi_{n})x_{np}\\ \end{bmatrix}\\ &= \begin{bmatrix}x_{11}&x_{21}&…&x_{n1}\\\vdots&\vdots&\ddots&\vdots\\x_{1p}&x_{2p}&…&x_{np}\\ \end{bmatrix} \begin{bmatrix}(y_{1}-\pi_{1})\\\vdots\\(y_{n}-\pi_{n}) \end{bmatrix}\\ &=X^{T}(y-\pi) \end{align*} \]
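The matrix form translates directly into \(\verb|R|\). A minimal sketch, again assuming the variables \(\verb|X|\) and \(\verb|y|\) from the top of this document (the name \(\verb|scoreLogReg|\) is illustrative):
scoreLogReg <- function(b, X, y) {
  p <- as.vector(exp(X%*%b)/(1+exp(X%*%b))) # Probabilities pi_i
  t(X)%*%(y-p) # Score vector X^T(y-pi)
}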
2.3 Second Derivative
The second derivative of the log-likelihood function with respect to the \(r\)-th and \(s\)-th parameters \(\beta_{r}\) and \(\beta_{s}\), \(r,s=1,2,…,p\), is
\[ \begin{align*} \frac{d^{2}l}{d\beta_{r}d\beta_{s}} &=\frac{d}{d\beta_{s}}\sum_{i=1}^{n}\left[\left(y_{i}-\frac{e^{x_{i}^{T}\beta}}{1+e^{x_{i}^{T}\beta}}\right)x_{ir}\right]\\ &=\frac{d}{d\beta_{s}}\sum_{i=1}^{n}\left[y_{i}x_{ir}-\left(\frac{e^{x_{i1}\beta_{1}+x_{i2}\beta_{2}+…+x_{ip}\beta_{p}}}{1+e^{x_{i1}\beta_{1}+x_{i2}\beta_{2}+…+x_{ip}\beta_{p}}}\right)x_{ir}\right]\\ &=-\sum_{i=1}^{n}\left[\frac{e^{x_{i1}\beta_{1}+x_{i2}\beta_{2}+…+x_{ip}\beta_{p}}}{1+e^{x_{i1}\beta_{1}+x_{i2}\beta_{2}+…+x_{ip}\beta_{p}}}x_{ir}x_{is}-\frac{\left(e^{x_{i1}\beta_{1}+x_{i2}\beta_{2}+…+x_{ip}\beta_{p}}\right)^{2}}{\left(1+e^{x_{i1}\beta_{1}+x_{i2}\beta_{2}+…+x_{ip}\beta_{p}}\right)^{2}}x_{ir}x_{is}\right]\\ &=-\sum_{i=1}^{n}\left[\frac{e^{x_{i}^{T}\beta}}{1+e^{x_{i}^{T}\beta}}x_{ir}x_{is}-\frac{\left(e^{x_{i}^{T}\beta}\right)^{2}}{\left(1+e^{x_{i}^{T}\beta}\right)^{2}}x_{ir}x_{is}\right]\\ &=-\sum_{i=1}^{n}\left[\frac{e^{x_{i}^{T}\beta}+\left(e^{x_{i}^{T}\beta}\right)^{2}}{\left(1+e^{x_{i}^{T}\beta}\right)^{2}}x_{ir}x_{is}-\frac{\left(e^{x_{i}^{T}\beta}\right)^{2}}{\left(1+e^{x_{i}^{T}\beta}\right)^{2}}x_{ir}x_{is}\right]\\ &=-\sum_{i=1}^{n}\frac{e^{x_{i}^{T}\beta}}{\left(1+e^{x_{i}^{T}\beta}\right)^{2}}x_{ir}x_{is}\\ &=-\sum_{i=1}^{n}\pi_{i}(1-\pi_{i})x_{ir}x_{is} \end{align*} \]
Therefore,
\[ \begin{align*} l''(\beta) &=\begin{bmatrix} \frac{d^{2}l}{d\beta_{1}^{2}}&\frac{d^{2}l}{d\beta_{1}d\beta_{2}}&…&\frac{d^{2}l}{d\beta_{1}d\beta_{p}}\\ \vdots&\vdots&\ddots&\vdots\\ \frac{d^{2}l}{d\beta_{p}d\beta_{1}}&\frac{d^{2}l}{d\beta_{p}d\beta_{2}}&…&\frac{d^{2}l}{d\beta_{p}^{2}} \end{bmatrix}\\ &=-\begin{bmatrix} \sum_{i=1}^{n}\pi_{i}(1-\pi_{i})x_{i1}x_{i1}&…&\sum_{i=1}^{n}\pi_{i}(1-\pi_{i})x_{i1}x_{ip}\\ \vdots&\ddots&\vdots\\ \sum_{i=1}^{n}\pi_{i}(1-\pi_{i})x_{ip}x_{i1}&…&\sum_{i=1}^{n}\pi_{i}(1-\pi_{i})x_{ip}x_{ip} \end{bmatrix}\\ &=-\begin{bmatrix} x_{11}&x_{21}&…&x_{n1}\\ \vdots&\vdots&\ddots&\vdots\\ x_{1p}&x_{2p}&…&x_{np} \end{bmatrix} \begin{bmatrix} \pi_{1}(1-\pi_{1})&0&…&0\\ 0&\pi_{2}(1-\pi_{2})&…&0\\ \vdots&\vdots&\ddots&\vdots\\ 0&0&…&\pi_{n}(1-\pi_{n}) \end{bmatrix} \begin{bmatrix} x_{11}&x_{12}&…&x_{1p}\\ \vdots&\vdots&\ddots&\vdots\\ x_{n1}&x_{n2}&…&x_{np} \end{bmatrix}\\ &=-X^{T}WX \end{align*} \]
where \(W=\text{diag}(\pi_{i}(1-\pi_{i})),\quad i=1,2,…,n\).
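As with the first derivative, the matrix form can be computed directly in \(\verb|R|\). A minimal sketch (the name \(\verb|hessianLogReg|\) is illustrative):
hessianLogReg <- function(b, X) {
  p <- as.vector(exp(X%*%b)/(1+exp(X%*%b))) # Probabilities pi_i
  W <- diag(x=p*(1-p), nrow=length(p), ncol=length(p)) # Weight Matrix W
  -t(X)%*%W%*%X # Hessian -X^T W X
}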
3 Newton-Raphson Method
Let \(\theta\) be a parameter and \(f\) be a function of \(\theta\). Suppose that we are interested in finding a root of \(f\) (i.e., a value of \(\theta\) at which \(f(\theta)=0\)). The Newton-Raphson Method is a numerical method for approximating such a root (Thomas et al., 2005).
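Concretely, starting from an initial guess \(\theta^{(0)}\), the method iterates
\[ \begin{align*} \theta^{(k+1)}=\theta^{(k)}-\frac{f(\theta^{(k)})}{f'(\theta^{(k)})} \end{align*} \]
until successive guesses change by less than some threshold. To maximise the log-likelihood, we apply this update to the score function \(l'(\beta)\), whose root is the Maximum Likelihood Estimate, with the Hessian \(l''(\beta)\) playing the role of the derivative:
\[ \begin{align*} \beta^{(k+1)} &=\beta^{(k)}-\left[l''(\beta^{(k)})\right]^{-1}l'(\beta^{(k)})\\ &=\beta^{(k)}+\left(X^{T}W^{(k)}X\right)^{-1}X^{T}(y-\pi^{(k)})\\ &=\left(X^{T}W^{(k)}X\right)^{-1}X^{T}W^{(k)}z^{(k)} \end{align*} \]
where \(\pi^{(k)}\) and \(W^{(k)}\) are evaluated at \(\beta^{(k)}\), and \(z^{(k)}=X\beta^{(k)}+\left(W^{(k)}\right)^{-1}(y-\pi^{(k)})\) is the adjusted response vector used in the code below.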
3.1 R
In this section, I demonstrate how the Newton-Raphson Method can be used to approximate the Maximum Likelihood Estimates of the parameters of a Logistic Regression model in \(\verb|R|\).
First, I load and prepare the data (see the code at the top of this document). The observed random sample \(y\) is assigned to the variable \(\verb|y|\), and the data matrix \(X\) is assigned to the variable \(\verb|X|\).
Second, I define the function \(\verb|newtonRaphsonLogReg|\), which uses the Newton-Raphson Method to approximate the Maximum Likelihood Estimates of the parameters of a Logistic Regression model.
#' newtonRaphsonLogReg
#' @param X: A data matrix
#' @param y: A response vector
#' @param maxit: (Optional) maximum number of iterations
#' @param thr: (Optional) convergence threshold
#' @return Maximum likelihood estimates of regression parameters
newtonRaphsonLogReg <- function(X, y, maxit=10, thr=0.001) {
  b <- solve(t(X)%*%X)%*%t(X)%*%y # Initial Guess
  k <- 1 # Initial Iteration Count
  d <- max(abs(b)) # Initial Delta
  while(k<=maxit & d>thr) {
    p <- as.vector(exp(X%*%b)/(1+exp(X%*%b))) # Probabilities
    W <- diag(x=p*(1-p), nrow=length(p), ncol=length(p)) # Weight Matrix
    z <- X%*%b+solve(W)%*%(y-p) # Adjusted Response Vector
    b_new <- solve(t(X)%*%W%*%X)%*%t(X)%*%W%*%z # New Guess
    d <- max(abs(b_new-b)) # Update Delta
    b <- b_new # Update Guess
    cat(paste0("[Iteration ", k, "]: ", d, "\n"))
    k <- k+1
  }
  return(b)
}
Last, I use the function.
newtonRaphsonLogReg(X=X, y=y)
[Iteration 1]: 3.56198602366629
[Iteration 2]: 0.824206953925452
[Iteration 3]: 0.0351788519326073
[Iteration 4]: 7.20576835240294e-05
[,1]
intercept -4.949378062
gpa 0.754686856
gre 0.002690684
The results are similar to those obtained using the \(\verb|glm|\) function.
summary(glm(admit~gpa+gre, family="binomial", data=df))
Call:
glm(formula = admit ~ gpa + gre, family = "binomial", data = df)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.949378 1.075093 -4.604 4.15e-06 ***
gpa 0.754687 0.319586 2.361 0.0182 *
gre 0.002691 0.001057 2.544 0.0109 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 499.98 on 399 degrees of freedom
Residual deviance: 480.34 on 397 degrees of freedom
AIC: 486.34
Number of Fisher Scoring iterations: 4
4 References
Thomas, G. B., Weir, M. D., Hass, J., & Giordano, F. R. (2005). Thomas’ calculus. Addison-Wesley.
Copyright © 2024 Lam Fu Yuan, Kevin. All rights reserved.