01 Serialize RF Bloom Classifier and RobustScaler
Replicates exact training from ml.ipynb. Run once to populate models/.
- Trains `RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)` on 1993–2018 data
- Saves `models/rf_bloom_classifier.joblib` and `models/robust_scaler.joblib`
- Writes `models/model_card.md` with sklearn version, balanced accuracy, training date, and feature list
Do not modify machineLearning/machineLearning/ml.ipynb — this notebook is the standalone serialization entry point.
import joblib
import sklearn
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import date
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import balanced_accuracy_score
# Location of the input CSV and of the directory that receives the
# serialized artifacts. The output directory is created if it is missing.
DATA_PATH = Path("machineLearning/machineLearning/input/data_weekly_intepolated.csv")
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True)

# Read the CSV with "time" parsed as datetimes and put the rows in
# chronological order with a fresh 0..n-1 index.
df = (
    pd.read_csv(DATA_PATH, parse_dates=["time"])
    .sort_values("time")
    .reset_index(drop=True)
)
# Binary bloom label: 1 when the "kb" value (K. brevis, cells/L) reaches the
# threshold, 0 otherwise.
threshold = 100000
df["bloom"] = df["kb"].ge(threshold).astype(int)

# The prediction target is the bloom label of the following row (one week
# ahead). Rows whose target is NaN after the shift are dropped before the
# column is cast back to int.
df = df.assign(target_next_week=df["bloom"].shift(-1))
df = df.dropna(subset=["target_next_week"])
df = df.astype({"target_next_week": int})

# Lagged predictors, declared as: new column -> (source column, rows of lag).
_LAGGED_COLUMNS = {
    "kb_prev1": ("kb", 1),
    "kb_prev2": ("kb", 2),
    "peace_discharge_prev1": ("peace_discharge", 1),
    "peace_TN_prev1": ("peace_TN", 1),
    "peace_TP_prev1": ("peace_TP", 1),
}
for _lagged_name, (_source_name, _n_rows) in _LAGGED_COLUMNS.items():
    df[_lagged_name] = df[_source_name].shift(_n_rows)

# Trailing mean of "peace_discharge" over the current and three previous rows.
df["discharge_4w_avg"] = df["peace_discharge"].rolling(window=4).mean()

# Shifting and the rolling window leave NaNs behind; discard every row that
# has a missing value in any column and renumber the index.
df = df.dropna().reset_index(drop=True)
# Model input columns, grouped for readability. The flattened order is the
# column order of the arrays handed to the scaler and the classifier, so it
# must not be rearranged.
_FEATURE_GROUPS = (
    ("kb", "kb_prev1", "kb_prev2"),
    ("zos", "salinity", "water_temp"),
    ("wind_speed", "wind_direction"),
    ("peace_discharge", "peace_TN", "peace_TP"),
    ("peace_discharge_prev1", "peace_TN_prev1", "peace_TP_prev1"),
    ("discharge_4w_avg",),
)
FEATURE_COLS = [column for group in _FEATURE_GROUPS for column in group]

# Column holding the label the classifier is trained to predict.
TARGET_COL = "target_next_week"

print(f"Loaded {len(df)} rows. Features: {len(FEATURE_COLS)}")
Loaded 1613 rows. Features: 15
# Chronological hold-out: rows dated before the cutoff are used for fitting,
# rows dated on or after it for evaluation.
_SPLIT_DATE = "2019-01-01"
train = df.loc[df["time"] < _SPLIT_DATE]
test = df.loc[df["time"] >= _SPLIT_DATE]


def _features_and_target(frame):
    """Return the ``(X, y)`` arrays of *frame* for FEATURE_COLS and TARGET_COL."""
    return frame[FEATURE_COLS].values, frame[TARGET_COL].values


X_train, y_train = _features_and_target(train)
X_test, y_test = _features_and_target(test)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")
Train: (1354, 15), Test: (259, 15)
# Fit the scaler on the training rows only, then apply the same fitted
# transform to both splits.
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest hyperparameters; the fixed seed keeps repeated runs identical.
_rf_params = {
    "n_estimators": 100,
    "class_weight": "balanced",
    "random_state": 42,
}
rf_clf = RandomForestClassifier(**_rf_params)
rf_clf.fit(X_train_scaled, y_train)

# Score the held-out rows with balanced accuracy.
y_pred = rf_clf.predict(X_test_scaled)
bal_acc = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced accuracy: {bal_acc:.3f}")
Balanced accuracy: 0.887
# Persist the fitted classifier and scaler side by side in MODELS_DIR
# (file name -> fitted object, written in this order).
_artifacts = {
    "rf_bloom_classifier.joblib": rf_clf,
    "robust_scaler.joblib": scaler,
}
for _file_name, _fitted_object in _artifacts.items():
    joblib.dump(_fitted_object, MODELS_DIR / _file_name)
print(f"Saved model and scaler to {MODELS_DIR.resolve()}")
Saved model and scaler to total_nitrogen\models
# Write models/model_card.md: a Markdown summary of the library version, the
# evaluation score, the training date, the hyperparameters and the ordered
# feature list of the model that was just fitted.

# Numbered Markdown list of the feature columns, in model input order.
feature_list = "\n".join(
    f"{position}. `{column}`"
    for position, column in enumerate(FEATURE_COLS, start=1)
)

# Hyperparameters are read back from the fitted estimator rather than typed
# in again, so the card cannot drift from the model that is actually saved.
# Blank lines separate the heading, table and list so the card renders the
# same way across Markdown parsers.
card = f"""# Model Card -- Red Tide RF Bloom Classifier

| Field | Value |
|-------|-------|
| sklearn_version | {sklearn.__version__} |
| balanced_accuracy | {bal_acc:.3f} |
| training_date | {date.today().isoformat()} |
| n_estimators | {rf_clf.n_estimators} |
| class_weight | {rf_clf.class_weight} |
| random_state | {rf_clf.random_state} |
| n_features | {len(FEATURE_COLS)} |
| train_period | 1993-2018 |
| test_period | 2019+ |

## Feature Columns (in order)

{feature_list}
"""

# Explicit encoding: without it write_text uses the platform default, which
# differs between operating systems.
(MODELS_DIR / "model_card.md").write_text(card, encoding="utf-8")
print("Wrote model_card.md")
Wrote model_card.md
# Verify round-trip: reload both artifacts from disk and confirm that they
# reproduce what the in-memory scaler and classifier compute on the test rows.
m = joblib.load(MODELS_DIR / "rf_bloom_classifier.joblib")
s = joblib.load(MODELS_DIR / "robust_scaler.joblib")

reloaded_scaled = s.transform(X_test)
proba = m.predict_proba(reloaded_scaled)[:, 1]

# Basic sanity checks: one probability per test row, all within [0, 1].
assert proba.shape == (len(X_test),), "unexpected number of predictions"
assert 0.0 <= proba.min() and proba.max() <= 1.0, "probabilities outside [0, 1]"

# Shape and range alone would hold for any classifier, so also compare
# against the objects that were serialized.
assert np.allclose(reloaded_scaled, X_test_scaled), (
    "reloaded scaler does not reproduce the fitted scaler's output"
)
assert np.allclose(proba, rf_clf.predict_proba(X_test_scaled)[:, 1]), (
    "reloaded classifier does not reproduce the fitted classifier's output"
)
print(f"Verification passed: {proba.shape[0]} predictions, range [{proba.min():.3f}, {proba.max():.3f}]")
Verification passed: 259 predictions, range [0.000, 0.970]