Contextualized Models and Outlier Robustness#

Contextualized models allow model parameters, and hence estimated effects, to change as context changes. As a result, context-dependent noise sources have only local effects: they disrupt the estimated models in a neighborhood of context, rather than disrupting the single model that a population model shares across all samples.
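
To make this concrete, here is a minimal sketch of the varying-coefficient idea behind contextualized regression. It is illustrative only: the helper coef_fn is a hypothetical stand-in for a learned mapping from a context vector to regression coefficients.

import numpy as np

def contextualized_predict(C, X, coef_fn):
    # Per-sample prediction y_i = beta(c_i) . x_i, where beta depends on context
    betas = np.stack([coef_fn(c) for c in C])  # (n_samples, n_features)
    return np.sum(betas * X, axis=1)

# Example: coefficients that vary smoothly with a 1-D context
coef_fn = lambda c: np.array([1.0 + c[0], -0.5 * c[0]])
C_demo = np.random.uniform(-1, 1, size=(5, 1))
X_demo = np.random.randn(5, 2)
print(contextualized_predict(C_demo, X_demo, coef_fn))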

In this notebook, we assess the robustness of contextualized models to outliers. We first train the models on data without outliers, then inject outliers into the training data and observe how well the models maintain their performance.

1. Data Preparation#

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from contextualized.easy import ContextualizedRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
%matplotlib inline

import logging
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

1A. Preparing non-outlier data#

We will use the sklearn diabetes dataset for a realistic toy regression problem, treating age, sex, and BMI as context variables.

X_normal, Y_normal = load_diabetes(return_X_y=True, as_frame=True)
Y_normal = np.expand_dims(Y_normal.values, axis=-1)
C_normal = X_normal[['age', 'sex', 'bmi']]
X_normal.drop(['age', 'sex', 'bmi'], axis=1, inplace=True)

# center and scale data
X_normal = (X_normal - X_normal.mean()) / X_normal.std()
Y_normal = (Y_normal - Y_normal.mean()) / Y_normal.std()
C_normal = (C_normal - C_normal.mean()) / C_normal.std()

seed = 1
C_train_normal, C_test_normal, X_train_normal, X_test_normal, Y_train_normal, Y_test_normal = train_test_split(C_normal, X_normal, Y_normal, test_size=0.20, random_state=seed)

1B. Preparing outlier data#

To simulate a localized outlier effect, we corrupt the data in a region of context.

X_outlier, Y_outlier, C_outlier = X_normal.copy(), Y_normal.copy(), C_normal.copy()
seed = 1
C_train_outlier, C_test_outlier, X_train_outlier, X_test_outlier, Y_train_outlier, Y_test_outlier = train_test_split(C_outlier, X_outlier, Y_outlier, test_size=0.20, random_state=seed)

# Rank all training samples by their context distance from a query point;
# we will corrupt the outputs of the samples nearest to that context.
knn_model = NearestNeighbors(n_neighbors=len(C_train_outlier))
knn_model.fit(C_train_outlier)
query_value = C_train_outlier.iloc[0:1]  # the context around which we corrupt the data
distances, indices = knn_model.kneighbors(query_value)
distances = distances.flatten()
neighbor_indices = indices.flatten()

# Corrupt the 10 samples nearest to the query context with a perturbation
# that decays with distance from the query
for dist, idx in zip(distances[:10], neighbor_indices[:10]):
    Y_train_outlier[idx, 0] += 1 + np.exp(-10 * dist / np.max(distances))
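
As a quick sanity check (an optional addition; both splits above used the same seed, so rows align between the normal and outlier copies), we can confirm that exactly 10 training targets were perturbed.

n_changed = int(np.sum(Y_train_outlier[:, 0] != Y_train_normal[:, 0]))
print(f"Corrupted training targets: {n_changed}")  # expect 10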

plt.scatter(distances, Y_train_normal[neighbor_indices], marker='o', color='blue', label='Original Values')
plt.scatter(distances, Y_train_outlier[neighbor_indices], marker='x', color='red', label='Corrupted Values')
plt.xlabel('Distance from corrupted context')
plt.ylabel("Y-value")
plt.legend()
plt.show()
../_images/robust-outliers_6_0.png

2. How well do the models perform on the non-outlier data?#

2A. Fit a population model to non-outlier data.#

%%capture
# Initialize and train a linear regression model
sklearn_regressor = LinearRegression()
sklearn_regressor.fit(X_train_normal, Y_train_normal)

# Make predictions on the test set
sklearn_pred_normal = sklearn_regressor.predict(X_test_normal)
sklearn_pred_normal = sklearn_pred_normal.reshape(-1)

2B. Fit a contextualized regressor to non-outlier data.#

%%capture
context_regressor = ContextualizedRegressor(n_bootstraps=10)
context_regressor.fit(C_train_normal.values, X_train_normal.values, Y_train_normal,
          encoder_type="mlp", max_epochs=3,
          learning_rate=1e-2)

context_pred_normal = context_regressor.predict(C_test_normal.values, X_test_normal.values)[:, 0]

2C. Evaluate model performances#

plt.rcParams.update({'font.size': 18})

# Use the same axis limits for both subplots, spanning the full range of true values
# (the targets are standardized, so they include negative values)
min_true_value = min(Y_test_normal[:, 0])
max_true_value = max(Y_test_normal[:, 0])

fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Plot the population model's predictions in the first subplot
axes[0].scatter(Y_test_normal[:, 0], sklearn_pred_normal)
axes[0].set_xlabel("True Value")
axes[0].set_ylabel("Predicted Value")
axes[0].set_title("Sklearn Regressor")
axes[0].set_xlim(min_true_value, max_true_value)
axes[0].set_ylim(min_true_value, max_true_value)
axes[0].plot([min_true_value, max_true_value], [min_true_value, max_true_value], color='red', linestyle='--', label='Ideal Line')
axes[0].legend()

# Plot the contextualized model's predictions in the second subplot
axes[1].scatter(Y_test_normal[:, 0], context_pred_normal)
axes[1].set_xlabel("True Value")
axes[1].set_ylabel("Predicted Value")
axes[1].set_title("Contextualized Regressor")
axes[1].set_xlim(min_true_value, max_true_value)
axes[1].set_ylim(min_true_value, max_true_value)
axes[1].plot([min_true_value, max_true_value], [min_true_value, max_true_value], color='red', linestyle='--', label='Ideal Line')
axes[1].legend()

plt.tight_layout()
plt.show()

mse_sklearn_normal = mean_squared_error(Y_test_normal[:, 0], sklearn_pred_normal)
mse_contextualized_normal = mean_squared_error(Y_test_normal[:, 0], context_pred_normal)

print(f"Mean Squared Error (Population model): {mse_sklearn_normal:.2f}")
print(f"Mean Squared Error (contextualized regressor): {mse_contextualized_normal:.2f}")
../_images/robust-outliers_12_0.png
Mean Squared Error (Population model): 0.56
Mean Squared Error (Contextualized regressor): 0.55

The models perform very similarly on the non-outlier data (if anything, the contextualized regressor is marginally better). Now let's introduce outliers into the dataset and see how the models hold up.
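
We can also compare what the two models learned. The snippet below is an optional sketch that assumes the easy API's predict_params method, which returns per-sample coefficients (betas) and offsets (mus); averaging the contextualized coefficients over the test contexts gives a rough population-level summary to compare against the sklearn coefficients.

# Sketch: compare population coefficients to the average contextualized coefficients
betas, mus = context_regressor.predict_params(C_test_normal.values)
print("Population coefficients:", sklearn_regressor.coef_.flatten())
print("Mean contextualized coefficients:", betas.mean(axis=0).flatten())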

3. How well do the models perform on the outlier data?#

3A. Fit a population model to outlier data.#

%%capture
# Initialize and train a linear regression model
sklearn_regressor = LinearRegression()
sklearn_regressor.fit(X_train_outlier, Y_train_outlier)

# Make predictions on the test set
sklearn_pred_outlier = sklearn_regressor.predict(X_test_outlier)
sklearn_pred_outlier = sklearn_pred_outlier.reshape(-1)

3B. Fit a contextualized regressor to data with outliers.#

%%capture
context_regressor = ContextualizedRegressor(n_bootstraps=10)
context_regressor.fit(C_train_outlier.values, X_train_outlier.values, Y_train_outlier,
          encoder_type="mlp", max_epochs=3,
          learning_rate=1e-2)

context_pred_outlier = context_regressor.predict(C_test_outlier.values, X_test_outlier.values)[:, 0]

3C. Evaluate Performance on data with outliers.#

plt.rcParams.update({'font.size': 18})

# Use the same axis limits for both subplots, spanning the full range of true values
min_true_value = min(Y_test_outlier[:, 0])
max_true_value = max(Y_test_outlier[:, 0])
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Plot the population model's predictions in the first subplot
axes[0].scatter(Y_test_outlier[:, 0], sklearn_pred_outlier)
axes[0].set_xlabel("True Value")
axes[0].set_ylabel("Predicted Value")
axes[0].set_title("Sklearn Regressor")
axes[0].set_xlim(min_true_value, max_true_value)
axes[0].set_ylim(min_true_value, max_true_value)
axes[0].plot([min_true_value, max_true_value], [min_true_value, max_true_value], color='red', linestyle='--', label='Ideal Line')
axes[0].legend()

# Plot the contextualized model's predictions in the second subplot
axes[1].scatter(Y_test_outlier[:, 0], context_pred_outlier)
axes[1].set_xlabel("True Value")
axes[1].set_ylabel("Predicted Value")
axes[1].set_title("Contextualized Regressor")
axes[1].set_xlim(min_true_value, max_true_value)
axes[1].set_ylim(min_true_value, max_true_value)
axes[1].plot([min_true_value, max_true_value], [min_true_value, max_true_value], color='red', linestyle='--', label='Ideal Line')
axes[1].legend()

plt.tight_layout()
plt.show()

mse_sklearn = mean_squared_error(Y_test_outlier[:, 0], sklearn_pred_outlier)
mse_contextualized = mean_squared_error(Y_test_outlier[:, 0], context_pred_outlier)

print(f"Mean Squared Error (Population model): {mse_sklearn:.2f}")
print(f"Mean Squared Error (Contextualized regressor): {mse_contextualized:.2f}")
../_images/robust-outliers_20_0.png
Mean Squared Error (Population model): 0.56
Mean Squared Error (Contextualized regressor): 0.55

With only a handful of corrupted training points, the aggregate test error is nearly unchanged for either model. Aggregate MSE can hide damage that is concentrated in one region of context, though, so next we examine how the prediction error varies with distance from the corrupted context.

Let's check how the error changes with context (distance from the corrupted context)#

# Compute the absolute prediction error for each test sample
errors_sklearn_outlier = np.abs(Y_test_outlier[:, 0] - sklearn_pred_outlier)
errors_context_outlier = np.abs(Y_test_outlier[:, 0] - context_pred_outlier)

# Normalize the contextualized errors by the population model's errors;
# values below 1 mean the contextualized model is more accurate on that sample.
normalized_errors_context_outlier = errors_context_outlier / errors_sklearn_outlier

# Order the test samples by context distance from the corrupted region
# (kneighbors returns neighbors sorted by increasing distance)
knn_model = NearestNeighbors(n_neighbors=len(C_test_outlier))
knn_model.fit(C_test_outlier)
distances, indices = knn_model.kneighbors(query_value)
distances = distances.flatten()

normalized_errors_context_outlier = normalized_errors_context_outlier[indices.flatten()]
plt.figure(figsize=(10, 6))
plt.scatter(distances, normalized_errors_context_outlier, color='blue')
plt.xlabel('Distance from corrupted context')
plt.ylabel('Prediction error, normalized by population model error')
plt.title('Error vs Distance from corrupted region')
plt.axhline(y=1, color='gray', linestyle='--')  # parity with the population model

# Overlay a rolling average to highlight the trend with distance
window_size = 10
rolling_average = np.convolve(normalized_errors_context_outlier, np.ones(window_size), 'valid') / window_size
plt.plot(distances[window_size - 1:], rolling_average, color='black', label='Rolling Average')
plt.legend()

plt.show()
../_images/robust-outliers_23_0.png