\[\newcommand{\E}{\mathrm{E}} \newcommand{\Var}{\mathrm{Var}} \newcommand{\Cov}{\mathrm{Cov}} \newcommand{\se}{\text{se}} \newcommand{\Lagr}{\mathcal{L}} \newcommand{\lagr}{\mathcal{l}}\]
A parametric model is a set of distributions indexed by a parameter \(\theta\):
\[\xi \equiv \{ f(x; \theta) : \theta \in \Theta \}\]
where \(\Theta\) is the parameter space.
Suppose \(X \sim N(\mu, \sigma^2)\) and the quantity of interest is \(\tau = \Pr(X > 50)\). Let \(Z\) denote a standard Normal random variable. Then
\[ \begin{align} \tau &= \Pr (X > 50) = 1 - \Pr (X < 50) = 1 - \Pr \left( \frac{X - \mu}{\sigma} < \frac{50 - \mu}{\sigma} \right) \\ &= 1 - \Pr \left(Z < \frac{50 - \mu}{\sigma} \right) = 1 - \Phi \left( \frac{50 - \mu}{\sigma} \right) \end{align} \]
The parameter of interest is \(\tau = T(\mu, \sigma) = 1 - \Phi \left( \frac{50 - \mu}{\sigma} \right)\), a function of \(\mu\) and \(\sigma\).
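In R, \(\Phi\) is `pnorm()`, so \(\tau\) can be computed by plugging in estimates of \(\mu\) and \(\sigma\). A minimal sketch, using hypothetical values for those estimates:

# Sketch: plug-in estimate of tau = Pr(X > 50)
mu_hat <- 45     # hypothetical estimate of mu
sigma_hat <- 10  # hypothetical estimate of sigma
1 - pnorm((50 - mu_hat) / sigma_hat)
# equivalently, without standardizing by hand
pnorm(50, mean = mu_hat, sd = sigma_hat, lower.tail = FALSE)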
Suppose that \(X_1, \ldots, X_n \sim \text{Bernoulli} (\pi)\). The probability function is
\[f(x; \pi) = \pi^x (1 - \pi)^{1 - x}\]
for \(x = 0, 1\), where the parameter \(\pi\) is unknown. The likelihood function is
\[ \begin{align} \Lagr_n(\pi) &= \prod_{i=1}^n f(X_i; \pi) \\ &= \prod_{i=1}^n \pi^{X_i} (1 - \pi)^{1 - X_i} \\ &= \pi^S (1 - \pi)^{n - S} \end{align} \]
where \(S = \sum_{i} X_i\). The log-likelihood function is
\[\lagr_n (\pi) = S \log(\pi) + (n - S) \log(1 - \pi)\]
Setting the derivative \(\lagr_n'(\pi) = S/\pi - (n - S)/(1 - \pi)\) to zero gives the analytical solution \(\hat{\pi}_n = S / n = \bar{X}_n\).
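The same solution can be verified numerically. A minimal sketch, assuming simulated Bernoulli data with true \(\pi = 0.3\):

# Sketch: numerically maximize the Bernoulli log-likelihood
set.seed(123)
x <- rbinom(100, size = 1, prob = 0.3)
S <- sum(x)
n <- length(x)
log_lik_bern <- function(pi) S * log(pi) + (n - S) * log(1 - pi)
# numerical maximum matches the analytical solution S / n
optimize(log_lik_bern, interval = c(0.001, 0.999), maximum = TRUE)$maximum
S / n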
Suppose instead that \(X_1, \ldots, X_n \sim N(\mu, \sigma^2)\). The (simplified) likelihood function, dropping multiplicative constants, is
\[ \begin{align} \Lagr_n (\mu, \sigma) &= \prod_i \frac{1}{\sigma} \exp \left[ - \frac{1}{2\sigma^2} (X_i - \mu)^2 \right] \\ &= \frac{1}{\sigma^n} \exp \left[ - \frac{1}{2\sigma^2} \sum_i (X_i - \mu)^2 \right] \\ &= \frac{1}{\sigma^n} \exp \left[ - \frac{n S^2}{2 \sigma^2} \right] \exp \left[ - \frac{n (\bar{X} - \mu)^2}{2 \sigma^2} \right] \end{align} \]
where \(S^2 = n^{-1} \sum_i (X_i - \bar{X})^2\). The last step uses \(\sum_i (X_i - \mu)^2 = n S^2 + n (\bar{X} - \mu)^2\).
Log-likelihood
\[\lagr_n (\mu, \sigma) = -n \log \sigma - \frac{nS^2}{2\sigma^2} - \frac{n(\bar{X} - \mu)^2}{2\sigma^2}\]
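Maximizing this log-likelihood gives the standard results \(\hat{\mu} = \bar{X}\) and \(\hat{\sigma} = S\), which a numerical check confirms. A minimal sketch with simulated data:

# Sketch: numerically maximize the Normal log-likelihood
set.seed(123)
x <- rnorm(100, mean = 5, sd = 2)
log_lik_norm <- function(par, x) {
  mu <- par[1]
  sigma <- par[2]
  if (sigma <= 0) return(-Inf)  # guard against invalid proposals
  sum(dnorm(x, mean = mu, sd = sigma, log = TRUE))
}
optim(par = c(0, 1), fn = log_lik_norm, x = x, control = list(fnscale = -1))$par
# compare to the analytical MLEs
c(mean(x), sqrt(mean((x - mean(x))^2)))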
The standard error of the MLE is \(\se = \sqrt{\Var (\hat{\theta}_n)}\). Under appropriate regularity conditions,
\[\frac{\hat{\theta}_n - \theta_*}{\se} \leadsto N(0,1)\]
\[\frac{\hat{\theta}_n - \theta_*}{\widehat{\se}} \leadsto N(0,1)\]
where \(\widehat{\se}\) is the estimated standard error. This approximation requires a reasonably large sample size.
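This yields approximate (Wald) confidence intervals. A minimal sketch for the Bernoulli case, where \(\widehat{\se} = \sqrt{\hat{\pi}(1 - \hat{\pi}) / n}\):

# Sketch: approximate 95% confidence interval for pi
set.seed(123)
x <- rbinom(100, size = 1, prob = 0.3)
pi_hat <- mean(x)                                  # MLE
se_hat <- sqrt(pi_hat * (1 - pi_hat) / length(x))  # estimated standard error
pi_hat + c(-1, 1) * qnorm(0.975) * se_hat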
Suppose \(X_1, \ldots, X_n \sim N(\theta, \sigma^2)\). The MLE is the sample mean, \(\hat{\theta}_n = \bar{X}_n\), and
\[\sqrt{n} (\hat{\theta}_n - \theta) \leadsto N(0, \sigma^2)\]
Another reasonable estimator of \(\theta\) is the sample median \(\tilde{\theta}_n\), for which
\[\sqrt{n} (\tilde{\theta}_n - \theta) \leadsto N \left(0, \sigma^2 \frac{\pi}{2} \right)\]
Consider two estimators \(T_n\) and \(U_n\) of \(\theta\) with
\[ \begin{align} \sqrt{n} (T_n - \theta) &\leadsto N(0, t^2) \\ \sqrt{n} (U_n - \theta) &\leadsto N(0, u^2) \end{align} \]
The asymptotic relative efficiency of \(U_n\) to \(T_n\) is
\[\text{ARE}(U, T) = \frac{t^2}{u^2}\]
For the Normal distribution, the relative efficiency of the median to the mean is
\[\text{ARE}(\tilde{\theta}_n, \hat{\theta}_n) = \frac{2}{\pi} \approx .63\]
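This ratio is easy to verify by simulation. A minimal sketch comparing the sampling variances of the mean and the median for standard Normal data:

# Sketch: Monte Carlo estimate of ARE(median, mean) for Normal data
set.seed(123)
sims <- replicate(5000, {
  x <- rnorm(1000)
  c(mean = mean(x), median = median(x))
})
# variance ratio should be close to 2 / pi ~ 0.64
var(sims["mean", ]) / var(sims["median", ])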
If \(\hat{\theta}_n\) is the MLE and \(\tilde{\theta}_n\) is any other estimator, then
\[\text{ARE} (\tilde{\theta}_n, \hat{\theta}_n) \leq 1\]
In this sense the MLE is efficient, or asymptotically optimal.
The Normal density is
\[f(x_i; \mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp \left[- \frac{(x_i - \mu)^2}{2\sigma^2}\right]\]
\[ \begin{align} \lagr_n(\mu, \sigma^2 \mid X) &= \log \prod_{i = 1}^{N} \frac{1}{\sqrt{2\pi\sigma^2}} \exp \left[- \frac{(X_i - \mu)^2}{2\sigma^2}\right] \\ &= \sum_{i=1}^{N} \log \left( \frac{1}{\sqrt{2\pi\sigma^2}} \exp \left[- \frac{(X_i - \mu)^2}{2\sigma^2}\right] \right) \\ &= -\frac{N}{2} \log(2\pi) - \frac{N}{2} \log \sigma^2 - \frac{1}{2\sigma^2} \sum_{i = 1}^{N} (X_i - \mu)^2 \end{align} \]
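As a sanity check, the closed form matches R's built-in Normal density. A minimal sketch with simulated data:

# Sketch: closed-form log-likelihood vs. dnorm()
set.seed(123)
x <- rnorm(20, mean = 5, sd = 2)
N <- length(x)
mu <- 5
sigma2 <- 4
closed_form <- -N / 2 * log(2 * pi) - N / 2 * log(sigma2) -
  sum((x - mu)^2) / (2 * sigma2)
all.equal(closed_form, sum(dnorm(x, mean = mu, sd = sqrt(sigma2), log = TRUE)))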
Example data (`prof`), where the `years` column is inferred from the exact linear fit below:

| id | years | salary |
|----|-------|--------|
| 1 | 3 | 60 |
| 2 | 2 | 55 |
| 3 | 4 | 65 |
| 4 | 1 | 50 |
| 5 | 5 | 70 |
Model salary (\(Y_i\)) as a linear function of years of experience (\(X_i\)):
\[\E(Y_i) \equiv \mu_i = \beta_0 + \beta_{1}X_{i}\]
\[\Var (Y_i) = \sigma^2\]
\[ \begin{align} \lagr_n(\beta_0, \beta_1, \sigma^2 \mid Y, X) &= \log \prod_{i = 1}^{N} \frac{1}{\sqrt{2\pi\sigma^2}} \exp \left[- \frac{(Y_i - \beta_0 - \beta_{1}X_{i})^2}{2\sigma^2}\right] \\ &= -\frac{N}{2} \log(2\pi) - \frac{N}{2} \log \sigma^2 - \frac{1}{2\sigma^2} \sum_{i = 1}^{N} (Y_i - \beta_0 - \beta_{1}X_{i})^2 \\ &\propto -\frac{N}{2} \log \sigma^2 - \frac{1}{2\sigma^2} \sum_{i = 1}^{N} (Y_i - \beta_0 - \beta_{1}X_{i})^2 \end{align} \]
Holding \(\sigma^2\) fixed, maximizing over \((\beta_0, \beta_1)\) is equivalent to maximizing
\[\lagr_n(\beta_0, \beta_1 | Y, X) = - \sum_{i = 1}^{N} (Y_i - \beta_0 - \beta_{1}X_{i})^2\]
Setting the partial derivative with respect to \(\beta_0\) to zero:
\[ \begin{align} \dfrac{\partial \lagr_n(\beta_0, \beta_1 \mid Y, X)}{\partial \beta_0} &= 2 \sum_{i=1}^n (Y_i - \beta_0 - \beta_1 X_i) \\ 0 &= \sum_{i=1}^n 2Y_i - 2\beta_0 - 2\beta_1 X_i \\ 0 &= 2 \sum_{i=1}^n Y_i - 2 n \beta_0 - 2\beta_1 \sum_{i=1}^n X_i \\ 2 n \beta_0 &= 2 \sum_{i=1}^n Y_i - 2\beta_1 \sum_{i=1}^n X_i \\ \hat{\beta}_0 &= \dfrac{\sum_{i=1}^n Y_i}{n} - \beta_1 \dfrac{\sum_{i=1}^n X_i}{n} \\ \hat{\beta}_0 &= \bar{Y} - \beta_1 \bar{X} \end{align} \]
Substituting \(\hat{\beta}_0 = \bar{Y} - \beta_1 \bar{X}\) into the first-order condition for \(\beta_1\):
\[ \begin{align} \dfrac{\partial \lagr_n(\beta_0, \beta_1 \mid Y, X)}{\partial \beta_1} &= 2 \sum_{i=1}^n X_i (Y_i - \beta_0 - \beta_1 X_i) \\ 0 &= \sum_{i=1}^n Y_i X_i - \beta_0 \sum_{i=1}^n X_i - \beta_1 \sum_{i=1}^n X_i^2 \\ 0 &= \sum_{i=1}^n Y_i X_i - (\bar{Y} - \beta_1 \bar{X}) \sum_{i=1}^n X_i - \beta_1 \sum_{i=1}^n X_i^2 \\ \beta_1 \left( \sum_{i=1}^n X_i^2 - \bar{X} \sum_{i=1}^n X_i \right) &= \sum_{i=1}^n Y_i X_i - \bar{Y} \sum_{i=1}^n X_i \\ \hat{\beta}_1 &= \dfrac{\sum_{i=1}^n Y_i X_i - \bar{Y} \sum_{i=1}^n X_i}{\sum_{i=1}^n X_i^2 - \bar{X} \sum_{i=1}^n X_i} \\ \hat{\beta}_0 &= \bar{Y} - \hat{\beta}_1 \bar{X} \end{align} \]
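These closed-form estimates can be computed directly for the professor data. A minimal sketch, assuming the `years` values shown in the table above:

# Sketch: closed-form least-squares estimates for the professor data
years  <- c(3, 2, 4, 1, 5)    # assumed years of experience (see table)
salary <- c(60, 55, 65, 50, 70)
b1 <- (sum(salary * years) - mean(salary) * sum(years)) /
  (sum(years^2) - mean(years) * sum(years))
b0 <- mean(salary) - b1 * mean(years)
c(b0 = b0, b1 = b1)  # 45 and 5, matching optim() and lm() below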
log_lik_lm <- function(par, data.y, data.x) {
  a.par <- par[1]      # the current slope
  b.par <- par[2]      # the current intercept
  err.sigma <- par[3]  # the current error standard deviation
  # an invalid (non-positive) standard deviation has zero likelihood
  if (err.sigma <= 0) return(-Inf)
  # log-likelihood of each data point under the Normal error model
  log.likelihoods <- dnorm(data.y, mean = data.x * a.par + b.par,
                           sd = err.sigma, log = TRUE)
  # return the sum of the log-likelihoods
  sum(log.likelihoods)
}
# optimize for professor salary (fnscale = -1 tells optim() to maximize)
optim(par = c(1, 5, 20), fn = log_lik_lm, data.y = prof$salary, data.x = prof$years,
control = list(fnscale = -1))
## $par
## [1] 5.00e+00 4.50e+01 2.46e-10
##
## $value
## [1] 106
##
## $counts
## function gradient
## 502 NA
##
## $convergence
## [1] 1
##
## $message
## NULL
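Note the `convergence` code of 1: `optim()` hit its iteration limit. Because the five data points fall exactly on a line, the likelihood grows without bound as the error standard deviation shrinks toward zero (here \(2.46 \times 10^{-10}\)), so there is no finite maximum in \(\sigma\). The slope (5) and intercept (45) nonetheless match the least-squares solution: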
# compare to lm()
summary(lm(salary ~ years, data = prof))
##
## Call:
## lm(formula = salary ~ years, data = prof)
##
## Residuals:
## 1 2 3 4 5
## -1.28e-14 3.34e-15 3.17e-15 3.10e-15 3.19e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.50e+01 8.67e-15 5.19e+15 <2e-16 ***
## years 5.00e+00 2.61e-15 1.91e+15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.27e-15 on 3 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.66e+30 on 1 and 3 DF, p-value: <2e-16