%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
# True error distribution parameters
TRUE_ERROR_MEAN = 0  # mean of the epsilon normal distribution
TRUE_ERROR_VAR = 4  # variance (not sd) of the epsilon normal distribution
# True parameter/beta values
# column vector [n x 1]: intercept = 2, slope = 1.5
TRUE_WEIGHTS = np.array([2, 1.5])[np.newaxis].T
# notebook display cell: confirms the weights are shaped (2, 1)
TRUE_WEIGHTS.shape
def generate_errors(N, error_mean, error_sd):
    """Draw N iid errors from the true (normal) error distribution.

    Args:
        N (int): number of errors to sample
        error_mean: mean of the epsilon normal distribution
        error_sd: standard deviation of the epsilon normal distribution

    Returns:
        np.array: sampled errors as a column vector of shape [N x 1]
    """
    draws = np.random.normal(loc=error_mean, scale=error_sd, size=N)
    # reshape the flat sample into an [N x 1] column vector
    return draws.reshape(-1, 1)
def generate_sample(errors, weights, xbounds=(-10, 10)):
    """Generates sample X matrix and associated Y vector from sampled errors and true weights.

    Args:
        errors: sampled errors from true population, shape [m x 1]
        weights: parameters/beta, must be shape [n x 1] (number of predictors x 1)
        xbounds (tuple): (low, high) bounds of the uniform distribution the
            predictor values are drawn from. Defaults to (-10, 10).

    Returns:
        tuple: (
            np.array: design matrix of dimensions [m x n] (number samples x number of predictors),
            np.array: y vector of dimensions [m x 1] (number of samples x 1)
        )
    """
    # initialize X - note we leave first column of X as 1 for intercept
    m = len(errors)
    n = len(weights)
    X = np.ones((m, n))
    # fill all predictor columns in one vectorized draw instead of a per-column loop
    if n > 1:
        X[:, 1:] = np.random.uniform(low=xbounds[0], high=xbounds[1], size=(m, n - 1))
    # linear model: Y = X @ beta + epsilon
    Y = X @ weights + errors
    return X, Y
# Plot 2d case to see if samples are working
# draw 100 samples using the true error sd = sqrt(TRUE_ERROR_VAR)
errors = generate_errors(100, TRUE_ERROR_MEAN, np.sqrt(TRUE_ERROR_VAR))
X, Y = generate_sample(errors, TRUE_WEIGHTS)
x_true = np.linspace(-10, 10, 100)
# scatter the sampled points; column 0 of X is the all-ones intercept,
# so column 1 holds the actual predictor values
plt.plot(X[:,1], Y, marker='o', ls='', markersize=2)
# overlay the noiseless population line y = beta0 + beta1 * x in red
plt.plot(x_true, x_true*TRUE_WEIGHTS[1] + TRUE_WEIGHTS[0], color='red', lw=3)
plt.title("Sampled Values and True Population");
def estimate_parameters(X, Y):
    """Estimates regression parameters and error variance from a sample.

    Args:
        X: design matrix of dimensions [m x n] (number of samples x number of predictors)
        Y: response vector of dimensions [m x 1] (number of samples x 1)

    Returns:
        tuple: (beta_hat [n x 1] np.array, var_mle float, var_ols float)
    """
    m = Y.shape[0]
    n = X.shape[1]
    # beta hat estimated the same for both OLS and MLE.
    # Solve the normal equations directly instead of forming an explicit
    # inverse: np.linalg.solve is numerically better conditioned than inv().
    beta_hat = np.linalg.solve(X.T @ X, X.T @ Y)
    # both mle and ols use RSS (residual sum of squares)
    residuals = Y - X @ beta_hat
    RSS = (residuals.T @ residuals)[0][0]
    # var_mle divides by m -> biased (low) estimate of TRUE_ERROR_VAR
    var_mle = RSS / m
    # var_ols divides by the (m - n) degrees of freedom -> unbiased estimate
    var_ols = RSS / (m - n)
    return beta_hat, var_mle, var_ols
# Test one estimate
# fit the single sample generated above and compare the point estimates
# against the known true values
beta_hat, var_mle, var_ols = estimate_parameters(X, Y)
print("True Beta:\n", TRUE_WEIGHTS)
print("Estimated Beta:\n", beta_hat)
print("\nTrue Error Variance:", TRUE_ERROR_VAR)
print("OLS Estimated Error Variance:", var_ols)
print("MLE Estimated Error Variance:", var_mle)
# True error distribution parameters
TRUE_ERROR_MEAN = 0
TRUE_ERROR_VAR = 4
# True parameter/beta values: intercept plus five predictors, shape [6 x 1]
TRUE_WEIGHTS = np.array([2, 1.5, 10, -2, -5, 3])[np.newaxis].T
N_SAMPLES_PER_SIM = 100
N_SIMS = 100000
var_mles, var_olss = [], []
# accumulate beta_hat columns in a Python list and stack once afterwards;
# calling np.append(..., axis=1) every iteration copies the whole array
# each time, which is O(N_SIMS^2) overall
beta_hats = []
for ii in range(N_SIMS):
    # progress report on the first sim and at every 10% mark
    if ii+1 == 1 or not((ii+1) % (N_SIMS//10)): print('Running simulation', ii+1, 'of', N_SIMS)
    # setup sample
    errors = generate_errors(N_SAMPLES_PER_SIM, TRUE_ERROR_MEAN, np.sqrt(TRUE_ERROR_VAR))
    X, Y = generate_sample(errors, TRUE_WEIGHTS)
    # estimate params
    beta_hat, var_mle, var_ols = estimate_parameters(X, Y)
    # store results of each sim
    beta_hats.append(beta_hat)
    var_mles.append(var_mle)
    var_olss.append(var_ols)
# betas is [n x N_SIMS]: one column of coefficient estimates per simulation
betas = np.hstack(beta_hats)
# Plot betas and confirm they are unbiased
# one histogram panel per coefficient, laid out in a 2-column grid
ncols = 2
nrows = int(np.ceil(len(TRUE_WEIGHTS) / ncols))
fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(16, 2*nrows))
axs = np.ravel(axs)  # flatten the axes grid so panels index with one counter
for ii, beta in enumerate(TRUE_WEIGHTS):
    ax = axs[ii]
    # red vertical line marks the true coefficient value
    ax.axvline(beta, color='red', lw=3, label="Truth")
    # histogram of this coefficient's estimates across all simulations
    ax.hist(betas[ii], bins=30, label="Estimates")
    ax.set_title("Beta{}".format(str(ii)), fontsize=20)
    ax.set_ylabel("# Sims", fontsize=16)
    ax.grid(True)
axs[0].legend(fontsize=12)
plt.tight_layout()
# Exciting part - check bias on betas
# side-by-side histograms of the MLE (biased) and OLS (unbiased)
# error-variance estimates across all simulations
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16,4))
# Mark true variance on both panels
for ax in axs:
    ax.axvline(TRUE_ERROR_VAR, color="red", lw=3, label="Truth")
    ax.grid(True)
# MLE left
axs[0].hist(var_mles, bins=80, label="Estimates")
# dotted black line: mean of the estimates (should sit below the red truth line)
axs[0].axvline(np.mean(var_mles), color="k", ls=':', lw=3, label="Mean")
axs[0].set_title("MLE Estimation of Error Variance", fontsize=20)
axs[0].set_ylabel("# Sims", fontsize=16)
axs[0].legend(fontsize=12)
# OLS right
axs[1].hist(var_olss, bins=80, label="Estimates")
# dotted black line: mean of the estimates (should line up with the truth)
axs[1].axvline(np.mean(var_olss), color="k", ls=':', lw=3, label="Mean")
axs[1].set_title("OLS Estimation of Error Variance", fontsize=20)
axs[1].set_ylabel("# Sims", fontsize=16)
axs[1].legend(fontsize=12)
plt.tight_layout()