In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

Setup

In [2]:
# True error distribution parameters
TRUE_ERROR_MEAN = 0
TRUE_ERROR_VAR = 4
In [3]:
# True parameter/beta values
TRUE_WEIGHTS = np.array([2, 1.5])[np.newaxis].T
TRUE_WEIGHTS.shape
Out[3]:
(2, 1)
In [4]:
def generate_errors(N, error_mean, error_sd):
    """Generates a sample of N errors from the true error distribution
    
    Args:
        N (int): number of errors to sample
        error_mean: mean of epsilon normal distribution
        error_sd: standard deviation of epsilon normal distribution
        
    Returns:
        np.array: sampled errors vector of dimensions [N x 1] (number of samples x 1)
    """
    return np.random.normal(
        loc=error_mean,
        scale=error_sd,
        size=N
    )[np.newaxis].T
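A quick shape and moment sanity check (a minimal sketch, not part of the original run):

In [ ]:
# Hedged sanity check: errors should come back as a column vector with
# mean near TRUE_ERROR_MEAN and variance near TRUE_ERROR_VAR.
eps = generate_errors(10000, TRUE_ERROR_MEAN, np.sqrt(TRUE_ERROR_VAR))
assert eps.shape == (10000, 1)
print(eps.mean(), eps.var())  # expect roughly 0 and 4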
In [5]:
def generate_sample(errors, weights, xbounds=(-10, 10)):
    """Generates sample X matrix and associated Y vector from sampled errors and true weights
    
    Args:
        errors: sampled errors from true population, shape [m x 1]
        weights: parameters/beta, must be shape [n x 1] (number of predictors x 1)
        xbounds: (low, high) bounds for uniformly sampled predictor values
        
    Returns:
        tuple: (
            np.array: design matrix of dimensions [m x n] (number of samples x number of predictors),
            np.array: y vector of dimensions [m x 1] (number of samples x 1)
        )
    """
    # initialize X - note we leave first column of X as 1 for intercept
    m = len(errors)
    n = len(weights)
    X = np.ones((m, n))
    
    # randomly sample m predictor values for each non-intercept column
    for colnum in range(1, n):
        X[:, colnum] = np.random.uniform(low=xbounds[0], high=xbounds[1], size=m)
    
    Y = X @ weights + errors
    return X, Y
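The per-column loop can also be vectorized with a single uniform draw; a behavior-equivalent sketch (same sampling, just no loop):

In [ ]:
# Vectorized alternative to the per-column loop: one uniform draw
# fills every non-intercept column at once.
def generate_sample_vectorized(errors, weights, xbounds=(-10, 10)):
    m, n = len(errors), len(weights)
    X = np.ones((m, n))
    X[:, 1:] = np.random.uniform(low=xbounds[0], high=xbounds[1], size=(m, n - 1))
    return X, X @ weights + errors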
In [6]:
# Plot the 2D case to check that sampling works

errors = generate_errors(100, TRUE_ERROR_MEAN, np.sqrt(TRUE_ERROR_VAR))
X, Y = generate_sample(errors, TRUE_WEIGHTS)

x_true = np.linspace(-10, 10, 100)
plt.plot(X[:,1], Y, marker='o', ls='', markersize=2)
plt.plot(x_true, x_true*TRUE_WEIGHTS[1] + TRUE_WEIGHTS[0], color='red', lw=3)
plt.title("Sampled Values and True Population");
In [7]:
def estimate_parameters(X, Y):
    """Estimates population parameters from sample
    
    Args:
        X: design matrix, shape [m x n] (number of samples x number of predictors)
        Y: response vector, shape [m x 1]
        
    Returns:
        tuple: (beta_hat [n x 1] np.array, var_mle, var_ols)
    """
    
    m = Y.shape[0]
    n = X.shape[1]
    
    # beta_hat is identical under OLS and MLE (with normal errors)
    beta_hat = np.linalg.inv(X.T @ X) @ X.T @ Y
    
    # both MLE and OLS use the residual sum of squares
    RSS = (Y - X @ beta_hat).T @ (Y - X @ beta_hat)
    
    # var_mle is the biased estimate of TRUE_ERROR_VAR (divides by m)
    var_mle = (1 / m * RSS)[0, 0]
    
    # var_ols is the unbiased estimate of TRUE_ERROR_VAR (divides by m - n)
    var_ols = (1 / (m - n) * RSS)[0, 0]
    
    return beta_hat, var_mle, var_ols
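As a cross-check (a sketch, not the notebook's method), NumPy's least-squares solver minimizes the same objective and should agree with the normal-equations solution; np.linalg.lstsq also avoids explicitly inverting X.T @ X, which is the more numerically stable approach:

In [ ]:
# Hedged cross-check: lstsq minimizes ||Y - X b||^2 directly and should
# match the normal-equations beta_hat up to floating point error.
beta_lstsq, *_ = np.linalg.lstsq(X, Y, rcond=None)
beta_ne = np.linalg.inv(X.T @ X) @ X.T @ Y
assert np.allclose(beta_lstsq, beta_ne)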
In [8]:
# Test one estimate
beta_hat, var_mle, var_ols = estimate_parameters(X, Y)
print("True Beta:\n", TRUE_WEIGHTS)
print("Estimated Beta:\n", beta_hat)
print("\nTrue Error Variance:", TRUE_ERROR_VAR)
print("OLS Estimated Error Variance:", var_ols)
print("MLE Estimated Error Variance:", var_mle)
True Beta:
 [[2. ]
 [1.5]]
Estimated Beta:
 [[1.96692275]
 [1.45174119]]

True Error Variance: 4
OLS Estimated Error Variance: 3.881119256690805
MLE Estimated Error Variance: 3.8034968715569892
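The gap between the two estimates is exactly the m vs. m - n denominator: since E[RSS] = (m - n) * sigma^2, the MLE estimator has expectation (m - n)/m * sigma^2 while the OLS estimator is unbiased. A quick computation for this test case (m = 100, n = 2):

In [ ]:
# Theoretical expectation of the MLE estimate: E[var_mle] = (m - n)/m * sigma^2.
m, n = X.shape
print("Expected MLE variance:", (m - n) / m * TRUE_ERROR_VAR)  # 0.98 * 4 = 3.92 here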

Perform Simulations

In [9]:
# True error distribution parameters
TRUE_ERROR_MEAN = 0
TRUE_ERROR_VAR = 4
TRUE_WEIGHTS = np.array([2, 1.5, 10, -2, -5, 3])[np.newaxis].T
N_SAMPLES_PER_SIM = 100
N_SIMS = 100000
In [10]:
var_mles, var_olss = [], []
betas = np.array([])
for ii in range(N_SIMS):
    if (ii + 1) == 1 or (ii + 1) % (N_SIMS // 10) == 0:
        print('Running simulation', ii + 1, 'of', N_SIMS)
        
    # setup sample
    errors = generate_errors(N_SAMPLES_PER_SIM, TRUE_ERROR_MEAN, np.sqrt(TRUE_ERROR_VAR))
    X, Y = generate_sample(errors, TRUE_WEIGHTS)
    
    # estimate params
    beta_hat, var_mle, var_ols = estimate_parameters(X, Y)
    
    # store results of each sim
    if not len(betas):
        betas = beta_hat
    else:
        # grow betas to shape [n x N_SIMS]; one column per simulation
        betas = np.append(betas, beta_hat, axis=1)
    var_mles.append(var_mle)
    var_olss.append(var_ols)
Running simulation 1 of 100000
Running simulation 10000 of 100000
Running simulation 20000 of 100000
Running simulation 30000 of 100000
Running simulation 40000 of 100000
Running simulation 50000 of 100000
Running simulation 60000 of 100000
Running simulation 70000 of 100000
Running simulation 80000 of 100000
Running simulation 90000 of 100000
Running simulation 100000 of 100000
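Before plotting, the simulated means can be checked numerically against theory (a sketch; with m = 100 and n = 6 the MLE mean should sit near (94/100) * 4 = 3.76, while the OLS mean should sit near 4):

In [ ]:
# Compare simulated means of the variance estimates to their expectations.
m, n = N_SAMPLES_PER_SIM, len(TRUE_WEIGHTS)
print("MLE mean:", np.mean(var_mles), "theory:", (m - n) / m * TRUE_ERROR_VAR)
print("OLS mean:", np.mean(var_olss), "theory:", TRUE_ERROR_VAR)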
In [11]:
# Plot betas and confirm they are unbiased
ncols = 2
nrows = int(np.ceil(len(TRUE_WEIGHTS) / ncols))
fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(16, 2*nrows))
axs = np.ravel(axs)
for ii, beta in enumerate(TRUE_WEIGHTS):
    ax = axs[ii]
    ax.axvline(beta, color='red', lw=3, label="Truth")
    ax.hist(betas[ii], bins=30, label="Estimates")
    ax.set_title("Beta{}".format(str(ii)), fontsize=20)
    ax.set_ylabel("# Sims", fontsize=16)
    ax.grid(True)
axs[0].legend(fontsize=12)
plt.tight_layout()
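The visual check can be backed by a one-line numeric one (a sketch): the per-coefficient mean across simulations should land very close to TRUE_WEIGHTS.

In [ ]:
# Numeric unbiasedness check: mean of each beta_hat across sims next to truth.
print(np.hstack([betas.mean(axis=1, keepdims=True), TRUE_WEIGHTS]))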
In [12]:
# Exciting part - check bias of the error variance estimates
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(16,4))

# Mark true variance
for ax in axs:
    ax.axvline(TRUE_ERROR_VAR, color="red", lw=3, label="Truth")
    ax.grid(True)

# MLE left
axs[0].hist(var_mles, bins=80, label="Estimates")
axs[0].axvline(np.mean(var_mles), color="k", ls=':', lw=3, label="Mean")
axs[0].set_title("MLE Estimation of Error Variance", fontsize=20)
axs[0].set_ylabel("# Sims", fontsize=16)
axs[0].legend(fontsize=12)

# OLS right
axs[1].hist(var_olss, bins=80, label="Estimates")
axs[1].axvline(np.mean(var_olss), color="k", ls=':', lw=3, label="Mean")
axs[1].set_title("OLS Estimation of Error Variance", fontsize=20)
axs[1].set_ylabel("# Sims", fontsize=16)
axs[1].legend(fontsize=12)

plt.tight_layout()