import json
import os
import warnings
from datetime import datetime, timedelta
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.gridspec import GridSpec
import numpy as np
import pandas as pd
import seaborn as sns
import optuna
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline
from scipy.stats import boxcox
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
warnings.filterwarnings('ignore')

# Load the raw registration records and index them chronologically.
data_dir = Path('data')
with open(data_dir / 'eventupp_students_private.json', 'r') as f:
    data = json.load(f)

df = pd.DataFrame(data)
# registerDate is stored as a Unix timestamp in milliseconds.
df['registerDate_dt'] = pd.to_datetime(df['registerDate'], unit='ms')
df = df.sort_values('registerDate_dt')

print(f"Total students: {len(df):,}")
print(f"Academic years: {sorted(df['academic_year'].unique())}")
print(f"Date range: {df['registerDate_dt'].min()} to {df['registerDate_dt'].max()}")
df.head(2)
| gender | birthdate | country | organization | university | registerDate | program | programDuration | academic_year | registerDate_dt | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | female | 980347260000 | Poland | 5f1ebfbc2fd6333295e8e9af | Universidade do Porto (UP) - Faculdade de Letr... | 1627992642923 | Erasmus + Programme: study exchange | 4-6 months (1 Semester) | 2021_2022 | 2021-08-03 12:10:42.923 |
| 1 | female | 895764420000 | Germany | 5f1ebfbc2fd6333295e8e9af | Instituto Politécnico do Porto - Instituto Sup... | 1627992763121 | Erasmus + Programme: study exchange | 4-6 months (1 Semester) | 2021_2022 | 2021-08-03 12:12:43.121 |
# Aggregate registrations at three granularities and show them stacked.
daily_counts = df.groupby(df['registerDate_dt'].dt.date).size()
daily_ts = pd.Series(daily_counts.values, index=pd.to_datetime(daily_counts.index))
daily_ts = daily_ts.asfreq('D', fill_value=0)  # reinsert zero-registration days
weekly_ts = daily_ts.resample('W').sum()
monthly_ts = daily_ts.resample('ME').sum()

fig = plt.figure(figsize=(14, 8))

# One stacked panel per granularity: (series, title, line styling).
panels = [
    (daily_ts, 'Daily Student Registrations',
     dict(linewidth=0.5, alpha=0.7)),
    (weekly_ts, 'Weekly Student Registrations',
     dict(linewidth=1.5, marker='o', markersize=3)),
    (monthly_ts, 'Monthly Student Registrations',
     dict(linewidth=2, marker='o', markersize=5, color='darkblue')),
]
for position, (series, title, line_kwargs) in enumerate(panels, start=1):
    panel_ax = plt.subplot(3, 1, position)
    series.plot(ax=panel_ax, **line_kwargs)
    panel_ax.set_title(title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Time', fontsize=11)
    panel_ax.set_ylabel('Number of Registrations', fontsize=11)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Convert registerDate from milliseconds to datetime
# Convert registerDate from milliseconds to datetime and index by it.
df['registerDate'] = pd.to_datetime(df['registerDate'], unit='ms')
df_ts = df.set_index('registerDate').sort_index()

# Aggregate weekly - count registrations per week.
weekly_registrations = df_ts.resample('W').size()

# Tabular form ('week_ending', 'registrations') used later for modelling.
weekly_df = pd.DataFrame({
    'week_ending': weekly_registrations.index,
    'registrations': weekly_registrations.values
})

weekly_ts.head()
registerDate_dt 2021-08-08 17 2021-08-15 22 2021-08-22 15 2021-08-29 18 2021-09-05 196 Freq: W-SUN, dtype: int64
# ACF / PACF of the weekly series.  PACF lags are capped at len(y)//2 - 1,
# the maximum statsmodels allows for the Yule-Walker ('ywm') estimate.
y = weekly_ts.fillna(0)
target_lags = 110
max_pacf_lags = max(1, min(target_lags, len(y) // 2 - 1))
max_acf_lags = target_lags

fig, axes = plt.subplots(2, 1, figsize=(16, 10))
plot_acf(y, lags=max_acf_lags, ax=axes[0])
axes[0].set_title('ACF')
axes[0].grid(alpha=0.3, axis='both')
plot_pacf(y, lags=max_pacf_lags, ax=axes[1], method='ywm')
axes[1].set_title('PACF')
axes[1].grid(alpha=0.3, axis='both')
plt.tight_layout()
plt.show()
fig, axes = plt.subplots(3, 4, figsize=(20, 12))
axes = axes.flatten()
# Lags of interest: short-term (1-8 weeks), near-annual (39/51/52/53),
# and multi-year lags (78 = 1.5y, 104 = 2y, 156 = 3y).
lags_to_plot = [1, 2, 3, 4, 8, 39, 51, 52, 53, 78, 52*2, 52*3]
for i, lag in enumerate(lags_to_plot):
    ax = axes[i]
    y = weekly_ts.values[lag:]    # series at time t
    x = weekly_ts.values[:-lag]   # series at time t - lag
    ax.scatter(x, y, alpha=0.6, s=50, edgecolors='black', linewidths=0.5)
    if len(x) > 1:
        z = np.polyfit(x, y, 1)
        p = np.poly1d(z)
        # Create a sorted range of X values for a clean line plot
        x_line = np.linspace(x.min(), x.max(), 100)
        ax.plot(x_line, p(x_line), "r--", linewidth=2, alpha=0.8, label=f'Fit: y={z[0]:.2f}x+{z[1]:.1f}')
    corr = weekly_ts.autocorr(lag=lag)
    lims = [min(x.min(), y.min()), max(x.max(), y.max())]
    ax.plot(lims, lims, 'k-', alpha=0.3, linewidth=1, label='y=x')
    ax.set_title(f'Lag {lag} Weeks (ρ={corr:.3f})', fontsize=12, fontweight='bold')
    # FIX: x holds the lagged (earlier) values and y the current ones;
    # the two axis labels were swapped in the original version.
    ax.set_xlabel(f'Registrations at time t-{lag} Weeks')
    ax.set_ylabel('Registrations at time t')
    ax.legend(fontsize=8, loc='upper left')
    ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Decompose the series (Additive model assumed initially)
# period=52 for weekly data with annual cycle
# Classical decomposition into trend + seasonal + residual components.
decomposition = seasonal_decompose(weekly_ts, model='additive', period=52)
fig = decomposition.plot()
fig.set_size_inches(10, 6)
# y / rect offsets keep the suptitle clear of the top subplot.
plt.suptitle('Decomposition of Weekly Registrations', fontsize=16, y=0.995)
plt.tight_layout(rect=[0, 0, 1, 0.985])
plt.show()# Prepare data for seasonal plot
# NOTE(review): registerDate was already converted to datetime above;
# re-running to_datetime on it is a no-op kept for cell independence.
df['date'] = pd.to_datetime(df['registerDate'], unit='ms')
weekly_ts = df.set_index('date').resample('W').size()

# One row per week with its calendar position, so years can be overlaid.
df_seasonal = pd.DataFrame({'counts': weekly_ts})
df_seasonal['week'] = df_seasonal.index.isocalendar().week
df_seasonal['year'] = df_seasonal.index.year

# Plot overlapping years
plt.figure(figsize=(12, 6))
sns.lineplot(data=df_seasonal, x='week', y='counts', hue='year', palette='tab10')
plt.title('Seasonal Plot: Registrations by Week of Year', fontsize=16)
plt.xlabel('Week of the Year (1-52)')
plt.ylabel('Registrations')
plt.legend(title='Year')
plt.show()

y_raw = weekly_ts.copy()
# Variance-stabilising transforms of the weekly series.
offset = 1                       # shift so every value is strictly positive
y_positive = y_raw + offset
y_log = np.log(y_positive)       # log transform
y_boxcox, best_lambda = boxcox(y_positive)  # Box-Cox with fitted lambda

fig = plt.figure(figsize=(15, 10))
gs = GridSpec(3, 2, figure=fig, width_ratios=[3, 1])

# One GridSpec row per transform: (values, colour, series title, hist title).
transform_rows = [
    (y_raw, 'tab:blue', 'Original Data (Raw)', 'Distribution (Skewed)'),
    (y_log, 'tab:green', 'Log Transformation', 'Distribution'),
    (y_boxcox, 'tab:red', f'Box-Cox Transformation (λ = {best_lambda:.2f})', 'Distribution'),
]
for row_idx, (values, colour, ts_title, dist_title) in enumerate(transform_rows):
    ax_series = fig.add_subplot(gs[row_idx, 0])
    ax_hist = fig.add_subplot(gs[row_idx, 1])
    ax_series.plot(values, color=colour)
    ax_series.set_title(ts_title, fontweight='bold')
    ax_series.grid(True, alpha=0.3)
    sns.histplot(values, ax=ax_hist, color=colour, kde=True)
    ax_hist.set_title(dist_title, fontweight='bold')

plt.tight_layout()
plt.show()

def create_features(df):
    """Add calendar, seasonal and lag features to the weekly frame.

    Expects columns 'week_ending' (datetime64) and 'registrations'.
    Operates on a copy; the input frame is not modified.
    """
    df = df.copy()
    df['month'] = df['week_ending'].dt.month
    df['year'] = df['week_ending'].dt.year
    df['week_of_year'] = df['week_ending'].dt.isocalendar().week.astype(int)
    # Calculate week of month (0-5)
    df['day_of_month'] = df['week_ending'].dt.day
    df['week_of_month'] = ((df['day_of_month'] - 1) // 7)
    # Welcome months: February (spring intake) and September (fall intake).
    df['is_welcome_month'] = df['week_ending'].dt.month.isin([2, 9]).astype(int)
    # Semester: 0 for fall (Aug-Jan), 1 for spring (Feb-Jul)
    df['semester'] = df['week_ending'].dt.month.between(2, 7).astype(int)

    def calculate_week_of_semester(row):
        # Weeks elapsed since the start of the current academic semester.
        year = row['year']
        month = row['month']
        week_ending = row['week_ending']
        if month >= 8:            # Fall semester (Aug-Dec)
            semester_start = pd.Timestamp(year=year, month=8, day=1)
        elif 2 <= month <= 7:     # Spring semester (Feb-Jul)
            semester_start = pd.Timestamp(year=year, month=2, day=1)
        else:                     # January: still the previous fall semester
            semester_start = pd.Timestamp(year=year - 1, month=8, day=1)
        return max(0, (week_ending - semester_start).days // 7)

    df['week_of_semester'] = df.apply(calculate_week_of_semester, axis=1)

    # Cyclical encoding of the annual (52-week) cycle.
    df['week_sin'] = np.sin(2 * np.pi * df['week_of_year'] / 52)
    df['week_cos'] = np.cos(2 * np.pi * df['week_of_year'] / 52)

    # Lag features; shift(1) before rolling so the window never sees
    # the current week's own value (no target leakage).
    for lag in [1, 2, 52]:
        df[f'lag_abs_{lag}'] = df['registrations'].shift(lag)
    df['roll_mean_4_abs'] = df['registrations'].shift(1).rolling(window=4).mean()
    df['roll_std_4_abs'] = df['registrations'].shift(1).rolling(window=4).std()
    return df
df_features = create_features(weekly_df)
print(f"Features created: {df_features.shape[1]}")
df_features.head()| week_ending | registrations | month | year | week_of_year | day_of_month | week_of_month | is_welcome_month | semester | week_of_semester | week_sin | week_cos | lag_abs_1 | lag_abs_2 | lag_abs_52 | roll_mean_4_abs | roll_std_4_abs | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2021-08-08 | 17 | 8 | 2021 | 31 | 8 | 1 | 0 | 0 | 1 | -0.568065 | -0.822984 | NaN | NaN | NaN | NaN | NaN |
| 1 | 2021-08-15 | 22 | 8 | 2021 | 32 | 15 | 2 | 0 | 0 | 2 | -0.663123 | -0.748511 | 17.0 | NaN | NaN | NaN | NaN |
| 2 | 2021-08-22 | 15 | 8 | 2021 | 33 | 22 | 3 | 0 | 0 | 3 | -0.748511 | -0.663123 | 22.0 | 17.0 | NaN | NaN | NaN |
| 3 | 2021-08-29 | 18 | 8 | 2021 | 34 | 29 | 4 | 0 | 0 | 4 | -0.822984 | -0.568065 | 15.0 | 22.0 | NaN | NaN | NaN |
| 4 | 2021-09-05 | 196 | 9 | 2021 | 35 | 5 | 0 | 1 | 0 | 5 | -0.885456 | -0.464723 | 18.0 | 15.0 | NaN | 18.0 | 2.94392 |
# Drop warm-up rows whose lag/rolling features are still NaN.
df_model = df_features.dropna()

# Split Train (2021-2024) and Test (2025)
train = df_model[df_model['year'] < 2025].copy()
test = df_model[df_model['year'] == 2025].copy()

target = 'registrations'
drop_cols = [target, 'week_ending']
X_train = train.drop(columns=drop_cols)
y_train = train[target]
print(f"Training shapes: X={X_train.shape}, y={y_train.shape}")

# Silence noisy sklearn validation / deprecation chatter during RFECV.
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn.utils.validation')
warnings.filterwarnings("ignore", category=FutureWarning)
models_to_test = {
'RandomForest': RandomForestRegressor(n_jobs=-1, random_state=42),
'XGBoost': XGBRegressor(objective='reg:absoluteerror', n_jobs=-1, random_state=42),
'LightGBM': LGBMRegressor(n_jobs=-1, random_state=42, verbose=-1),
'ElasticNet': ElasticNet(random_state=42, max_iter=10000)
}
final_selections = {}
tscv = TimeSeriesSplit(n_splits=5)
fig, axes = plt.subplots(1, 4, figsize=(24, 5))
for i, (name, model) in enumerate(models_to_test.items()):
if name == 'ElasticNet':
scaler = StandardScaler()
X_used = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
model = ElasticNet(random_state=42, max_iter=10000)
else:
X_used = X_train
selector = RFECV(
estimator=model,
step=1,
cv=tscv,
scoring='neg_root_mean_squared_error',
min_features_to_select=3
)
selector = selector.fit(X_used, y_train)
selected_features = X_train.columns[selector.support_].tolist()
final_selections[name] = selected_features
ax = axes[i]
x_points = range(3, len(selector.cv_results_['mean_test_score']) + 3)
y_points = selector.cv_results_['mean_test_score']
ax.plot(x_points, y_points, linewidth=2, color='tab:blue')
ax.set_title(f'{name}\n({len(selected_features)} features)', fontsize=14, fontweight='bold')
ax.set_xlabel("# Features")
if i == 0: ax.set_ylabel("CV Score (Neg MAE)")
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
all_features = sorted(list(set(x for l in final_selections.values() for x in l)))
selection_df = pd.DataFrame(index=all_features)
for name, features in final_selections.items():
selection_df[name] = selection_df.index.isin(features).astype(int)
selection_df['Count'] = selection_df.sum(axis=1)
selection_df = selection_df.sort_values('Count', ascending=False)
display(selection_df.style.background_gradient(cmap='Blues', subset=['Count']))| RandomForest | XGBoost | LightGBM | ElasticNet | Count | |
|---|---|---|---|---|---|
| day_of_month | 1 | 1 | 1 | 1 | 4 |
| is_welcome_month | 1 | 1 | 1 | 1 | 4 |
| lag_abs_1 | 1 | 1 | 1 | 1 | 4 |
| lag_abs_2 | 1 | 1 | 1 | 1 | 4 |
| lag_abs_52 | 1 | 1 | 1 | 1 | 4 |
| roll_std_4_abs | 1 | 1 | 1 | 1 | 4 |
| week_sin | 1 | 1 | 1 | 1 | 4 |
| roll_mean_4_abs | 1 | 1 | 1 | 0 | 3 |
| week_of_semester | 0 | 1 | 1 | 1 | 3 |
| week_cos | 0 | 1 | 1 | 0 | 2 |
| week_of_year | 0 | 1 | 1 | 0 | 2 |
| month | 0 | 0 | 1 | 0 | 1 |
| semester | 0 | 1 | 0 | 0 | 1 |
def walk_forward_validation(train_df, test_df, model, feature_cols, target_col='registrations', forecast_horizon=1):
    """Expanding-window walk-forward evaluation over the test period.

    The model is refit before every window of ``forecast_horizon`` weeks;
    after predicting, the actual test rows are appended to the training
    window (expanding window). Predictions are clipped at zero and rounded
    to integer counts. Returns a tidy DataFrame with one row per test week
    (columns: week_ending, actual, predicted, model, fold).
    """
    results = []
    relevant_cols = feature_cols + [target_col, 'week_ending']
    train_subset = train_df[relevant_cols].copy()
    test_subset = test_df[relevant_cols].copy()
    test_weeks = test_subset['week_ending'].values
    n_test = len(test_weeks)
    current_train = train_subset.copy()
    i = 0
    fold = 0
    while i < n_test:
        fold += 1
        end_idx = min(i + forecast_horizon, n_test)
        forecast_weeks = test_weeks[i:end_idx]
        X_train = current_train[feature_cols]
        y_train = current_train[target_col]
        current_test_window = test_subset[test_subset['week_ending'].isin(forecast_weeks)]
        X_test = current_test_window[feature_cols]
        y_test = current_test_window[target_col]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Registrations are non-negative integer counts.
        y_pred = np.maximum(0, np.round(y_pred).astype(int))
        for j, week in enumerate(forecast_weeks):
            results.append({
                'week_ending': pd.Timestamp(week),
                'actual': y_test.iloc[j],
                'predicted': y_pred[j],
                'model': type(model).__name__,
                'fold': fold
            })
        # Add the data we just predicted (expanding window)
        current_train = pd.concat([current_train, current_test_window], ignore_index=True)
        i = end_idx
    return pd.DataFrame(results)


def objective(trial, model_name, X, y):
    """Optuna objective: mean MAE over 5 expanding time-series CV folds."""
    if model_name == 'RandomForest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'random_state': 42,
            'n_jobs': -1
        }
        model = RandomForestRegressor(**params)
    elif model_name == 'XGBoost':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'objective': 'reg:squarederror',
            'random_state': 42,
            'n_jobs': -1
        }
        model = XGBRegressor(**params)
    elif model_name == 'LightGBM':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 300),
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1
        }
        model = LGBMRegressor(**params)
    elif model_name == 'ElasticNet':
        params = {
            'alpha': trial.suggest_float('alpha', 0.001, 10.0, log=True),
            'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0),
            'random_state': 42,
            'max_iter': 10000
        }
        model = ElasticNet(**params)
    else:
        # Fail fast instead of raising NameError on an undefined `model`.
        raise ValueError(f"Unknown model name: {model_name}")

    tscv = TimeSeriesSplit(n_splits=5)
    scores = []
    for train_index, val_index in tscv.split(X):
        X_t, X_v = X.iloc[train_index], X.iloc[val_index]
        y_t, y_v = y.iloc[train_index], y.iloc[val_index]
        if model_name == 'ElasticNet':
            # Scale inside the fold so validation rows never leak into the fit.
            scaler = StandardScaler()
            X_t = scaler.fit_transform(X_t)
            X_v = scaler.transform(X_v)
        model.fit(X_t, y_t)
        preds = model.predict(X_v)
        mae = np.mean(np.abs(y_v - preds))
        scores.append(mae)
    return np.mean(scores)
optuna.logging.set_verbosity(optuna.logging.WARNING)
print("This cell may take some time to run...\n")

all_results = []
best_params_store = {}
for name in models_to_test.keys():
    features = final_selections[name]
    X_train_fs = X_train[features]

    # Tune hyperparameters on the training window only.
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, name, X_train_fs, y_train), n_trials=20)
    best_params = study.best_params
    best_params_store[name] = best_params

    # Rebuild each model with its best parameters for the final evaluation.
    if name == 'RandomForest':
        final_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
    elif name == 'XGBoost':
        final_model = XGBRegressor(**best_params, objective='reg:squarederror', random_state=42, n_jobs=-1)
    elif name == 'LightGBM':
        final_model = LGBMRegressor(**best_params, random_state=42, n_jobs=-1, verbose=-1)
    elif name == 'ElasticNet':
        final_model = ElasticNet(**best_params, random_state=42, max_iter=10000)
        final_model = make_pipeline(StandardScaler(), final_model)

    # Walk-forward evaluation on the held-out 2025 weeks.
    wf_results = walk_forward_validation(
        train_df=train,
        test_df=test,
        model=final_model,
        feature_cols=features,
        forecast_horizon=1
    )
    all_results.append(wf_results)

final_results_df = pd.concat(all_results, ignore_index=True)


def calculate_wmape(y_true, y_pred):
    """Weighted MAPE: total absolute error divided by total actual volume."""
    return np.sum(np.abs(y_true - y_pred)) / np.sum(np.abs(y_true))
final_results_df['model'] = final_results_df['model'].replace({
'Pipeline': 'ElasticNet',
})
summary_metrics = final_results_df.groupby('model').apply(
lambda x: pd.Series({
'MAE': np.mean(np.abs(x['actual'] - x['predicted'])),
'RMSE': np.sqrt(np.mean((x['actual'] - x['predicted'])**2)),
'WMAPE': calculate_wmape(x['actual'], x['predicted'])
})
).sort_values('RMSE')
styler = summary_metrics.style.format("{:.4f}")
styler.background_gradient(cmap='Greens', subset=['MAE', 'RMSE', 'WMAPE'])
display(styler)| MAE | RMSE | WMAPE | |
|---|---|---|---|
| model | |||
| XGBRegressor | 21.4694 | 35.6227 | 0.3219 |
| ElasticNet | 21.4490 | 39.4045 | 0.3216 |
| RandomForestRegressor | 27.7755 | 53.5760 | 0.4165 |
| LGBMRegressor | 31.2857 | 56.2438 | 0.4691 |
# Radar chart comparing models on MAE / RMSE / WMAPE, each axis normalised
# by the worst (largest) value of that metric so every axis spans (0, 1].
metrics = ['MAE', 'RMSE', 'WMAPE']
num_vars = len(metrics)
labels = summary_metrics.index.tolist()
stats = summary_metrics[metrics].values
max_vals = stats.max(axis=0)
stats_norm = stats / max_vals
# Close each polygon by repeating its first vertex.
stats_norm = np.concatenate((stats_norm, stats_norm[:, [0]]), axis=1)
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]

fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
plt.xticks(angles[:-1], metrics, color='grey', size=12)
ax.set_rlabel_position(0)
plt.yticks([0.25, 0.5, 0.75, 1.0], ["25%", "50%", "75%", "100%"], color="grey", size=8)
plt.ylim(0, 1.1)
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
for i, model_name in enumerate(labels):
    ax.plot(angles, stats_norm[i], linewidth=2, linestyle='solid', label=model_name, color=colors[i])
    ax.fill(angles, stats_norm[i], color=colors[i], alpha=0.1)
plt.title('Model Performance Comparison (Normalized)', size=16, fontweight='bold', y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.show()

# Focus the remaining analysis on the best model's walk-forward results.
best_model_name = 'XGBRegressor'
results_df = final_results_df[final_results_df['model'] == best_model_name].copy()
results_df['week_ending'] = pd.to_datetime(results_df['week_ending'])
results_df['residual'] = results_df['predicted'] - results_df['actual']
history_df = train[['week_ending', 'registrations']].copy()
history_df['week_ending'] = pd.to_datetime(history_df['week_ending'])
# Full timeline: training history plus the walk-forward test period.
fig, ax = plt.subplots(figsize=(15, 8))
ax.plot(history_df['week_ending'], history_df['registrations'],
        label='Training data (history)', color='grey', alpha=0.5, linewidth=1)
ax.plot(results_df['week_ending'], results_df['actual'],
        label='Actual (test period)', color='#E63946', linewidth=2, marker='o', markersize=6)
ax.plot(results_df['week_ending'], results_df['predicted'],
        label='Walk-Forward predictions', color='#2A9D8F', linewidth=2,
        linestyle='--', marker='o', markersize=6)
# Shade the gap between actual and predicted as a visible error band.
ax.fill_between(results_df['week_ending'],
                results_df['actual'], results_df['predicted'],
                alpha=0.2, color='gray', label='Prediction error')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Registrations', fontsize=12)
ax.set_title(f'Full Timeline: History vs Walk-Forward Forecast ({best_model_name})', fontsize=14, fontweight='bold')
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

fig, ax = plt.subplots(figsize=(14, 6))
# Test-period-only view with each week's signed error annotated.
ax.plot(results_df['week_ending'], results_df['actual'],
        label='Actual', color='#E63946', marker='o', linewidth=2, markersize=8)
ax.plot(results_df['week_ending'], results_df['predicted'],
        label='Predicted', color='#2A9D8F', linestyle='--', marker='o', linewidth=2, markersize=8)
for _, row in results_df.iterrows():
    # Signed residual placed just above the higher of the two points.
    ax.annotate(f"{row['residual']:+.0f}",
                xy=(row['week_ending'], max(row['actual'], row['predicted']) + 10),
                ha='center', fontsize=9, color='black')
ax.fill_between(results_df['week_ending'],
                results_df['actual'], results_df['predicted'],
                alpha=0.15, color='gray')
ax.set_xlabel('Week ending', fontsize=12)
ax.set_ylabel('Registrations', fontsize=12)
ax.set_title('Zoomed view: forecast performance', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.5)
plt.tight_layout()
plt.show()

sigma = results_df['residual'].std()
# Gaussian prediction intervals around the point forecast
# (z = 1.96 for 95%, z = 0.675 for 50%).
results_df['upper_95'] = results_df['predicted'] + (1.96 * sigma)
results_df['lower_95'] = results_df['predicted'] - (1.96 * sigma)
results_df['upper_50'] = results_df['predicted'] + (0.675 * sigma)
results_df['lower_50'] = results_df['predicted'] - (0.675 * sigma)
# Actuals outside the 95% band are flagged as anomalies.
results_df['is_outlier'] = (results_df['actual'] > results_df['upper_95']) | \
                           (results_df['actual'] < results_df['lower_95'])
outliers = results_df[results_df['is_outlier']]
safe_points = results_df[~results_df['is_outlier']]

fig, ax = plt.subplots(figsize=(16, 7))
ax.plot(results_df['week_ending'], results_df['predicted'],
        color='#2A9D8F', linewidth=2, label='Forecast (center)')
ax.fill_between(results_df['week_ending'],
                results_df['lower_95'], results_df['upper_95'],
                color='#2A9D8F', alpha=0.10, label='95% Confidence Interval')
ax.fill_between(results_df['week_ending'],
                results_df['lower_50'], results_df['upper_50'],
                color='#2A9D8F', alpha=0.25, label='50% Confidence Interval')
ax.scatter(safe_points['week_ending'], safe_points['actual'],
           color='#264653', s=50, alpha=0.7, label='Actual (within expected range)')
ax.scatter(outliers['week_ending'], outliers['actual'],
           color='red', s=100, zorder=5, edgecolor='white', linewidth=1.5,
           label=f'Anomalies (> {1.96*sigma:.0f} error)')
for _, row in outliers.iterrows():
    ax.annotate(f"{row['actual']}",
                xy=(row['week_ending'], row['actual']),
                xytext=(0, 10), textcoords='offset points',
                ha='center', fontsize=9, fontweight='bold', color='red')
ax.set_title(f'Forecast uncertainty analysis (Std. Dev.: {sigma:.1f})', fontsize=16, fontweight='bold')
ax.set_ylabel('Registrations', fontsize=12)
ax.set_xlabel('Date', fontsize=12)
ax.legend(loc='upper left', fontsize=11)
ax.grid(True, alpha=0.3)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
ax.xaxis.set_major_locator(mdates.MonthLocator())
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Empirical coverage of the 95% interval on the test period.
coverage_95 = (len(results_df) - len(outliers)) / len(results_df) * 100
print(f"Interval Coverage: {coverage_95:.1f}% of actuals fell within the 95% prediction interval.")

mean_error = results_df['residual'].mean()
# Residual diagnostics: error density and actual-vs-predicted scatter.
fig, axes = plt.subplots(1, 2, figsize=(22, 8))
# 1. Density Plot
sns.kdeplot(results_df['residual'], ax=axes[0],
            fill=True, color='red', alpha=0.3, linewidth=2)
axes[0].axvline(mean_error, color='#2A9D8F', linestyle='-', linewidth=3,
                label=f'Mean: {mean_error:.2f}')
axes[0].set_title('Density of forecast errors', fontsize=20, fontweight='bold')
axes[0].set_xlabel('Forecast error', fontsize=18)
axes[0].legend(fontsize=18)
axes[0].grid(True, alpha=0.2)
# 2. Scatter Plot
sns.scatterplot(data=results_df, x='actual', y='predicted', ax=axes[1],
                color='#2A9D8F', s=150, edgecolor='white', linewidth=1.5)
limit = max(results_df['actual'].max(), results_df['predicted'].max()) * 1.05
axes[1].plot([0, limit], [0, limit], color='#e63946', linestyle='--', linewidth=2, label='Perfect Prediction')
axes[1].set_title('Actual vs Predicted values', fontsize=20, fontweight='bold')
axes[1].set_xlabel('Actual registrations', fontsize=18)
axes[1].set_ylabel('Predicted registrations', fontsize=18)
axes[1].set_xlim(-5, limit)
axes[1].set_ylim(-5, limit)
axes[1].legend(loc='upper left', fontsize=18)
axes[1].grid(True, alpha=0.2)
plt.tight_layout()
plt.show()

# Monthly view: tag each test week with its calendar month.
results_df['month_period'] = results_df['week_ending'].dt.to_period('M')
# Cumulative weekly totals that reset at every month boundary.
results_df['month_cum_actual'] = results_df.groupby('month_period')['actual'].cumsum()
results_df['month_cum_pred'] = results_df.groupby('month_period')['predicted'].cumsum()
# Per-month summary: the last week of each month carries the month totals.
monthly_stats = results_df.groupby('month_period').agg({
'week_ending': 'max',
'month_cum_actual': 'max',
'month_cum_pred': 'max',
'actual': 'sum',
'predicted': 'sum'
}).reset_index()
monthly_stats['diff'] = monthly_stats['predicted'] - monthly_stats['actual']
# Errors recomputed on monthly aggregates; weekly errors partially cancel.
monthly_mae = np.mean(np.abs(monthly_stats['actual'] - monthly_stats['predicted']))
monthly_rmse = np.sqrt(np.mean((monthly_stats['actual'] - monthly_stats['predicted'])**2))
monthly_wmape = np.sum(np.abs(monthly_stats['actual'] - monthly_stats['predicted'])) / np.sum(monthly_stats['actual'])
print("Monthly Aggregated Performance:")
print(f"MAE: {monthly_mae:.2f}")
print(f"RMSE: {monthly_rmse:.2f}")
print(f"WMAPE: {monthly_wmape:.2%}")
fig, ax = plt.subplots(figsize=(16, 7))
ax.plot(results_df['week_ending'], results_df['month_cum_actual'],
label='Cumulative actual (monthly)', color='#E63946', linewidth=2.5, marker='o', markersize=4)
ax.plot(results_df['week_ending'], results_df['month_cum_pred'],
label='Cumulative predicted (monthly)', color='#2A9D8F',
linestyle='--', linewidth=2.5, marker='o', markersize=4)
# Annotate each month-end with predicted-minus-actual for that month.
for _, row in monthly_stats.iterrows():
diff = row['diff']
color = 'black'
# Negative values already carry '-' from the format spec.
sign = '+' if diff > 0 else ''
max_height = max(row['month_cum_actual'], row['month_cum_pred'])
ax.annotate(f"{sign}{diff:.0f}",
xy=(row['week_ending'], max_height),
xytext=(0, 10), textcoords='offset points',
ha='center', va='bottom', fontsize=11, fontweight='bold', color=color,
bbox=dict(boxstyle="round,pad=0.2", fc="white", ec=color, alpha=0.8))
ax.set_title('Monthly performance: cumulative progress', fontsize=16, fontweight='bold')
ax.set_ylabel('Cumulative Registrations (Resets Monthly)', fontsize=12)
ax.set_xlabel('Date', fontsize=12)
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
ax.grid(True, which='major', axis='x', linestyle='--', alpha=0.7)
ax.grid(True, which='major', axis='y', alpha=0.3)
plt.legend(loc='upper left', fontsize=12)
plt.tight_layout()
plt.show()