Hyperparameter grid search with scikit-learn

Upload a synthetic dataset once with `send_object`, then fan out a grid of hyperparameter combinations. Each worker fits a `GradientBoostingClassifier` and returns its mean 5-fold cross-validation accuracy; the client picks the best.

[ ]:

# One-time setup: install the Scaler client
%pip install opengris-scaler

[ ]:

# Connection settings -- edit these to point at your running scheduler.
# Both values are passed to `Client(...)` in the grid-search cell further down:
# SCHEDULER_ADDRESS as `address`, OBJECT_STORAGE_ADDRESS as `object_storage_address`.
SCHEDULER_ADDRESS = "ws://127.0.0.1:2345"  # supports tcp:// or ws://; only ws:// works from JupyterLite (browser)
OBJECT_STORAGE_ADDRESS = None  # leave None to use whatever the scheduler advertises

[ ]:

import time

from itertools import product

from sklearn.datasets import make_classification

from scaler import Client


# Synthetic classification data, generated with a fixed random_state so reruns
# produce the same arrays. The size is chosen so that each individual fit takes
# real wall-clock time.
X, y = make_classification(n_samples=800, n_features=40, n_informative=15, n_redundant=8, random_state=0)

# Quick sanity check: array shapes plus the label sum over the number of samples.
print(f"dataset: X={X.shape}, y={y.shape}, class balance={int(y.sum())}/{len(y)}")


def score_params(dataset: tuple, params: dict, seed: int) -> float:
    """Cross-validate one GradientBoostingClassifier configuration.

    Meant to run on a worker, so the scikit-learn imports live inside the
    function body instead of relying on module-level imports.

    Args:
        dataset: Two-element ``(features, labels)`` pair; unpacked positionally.
        params: Keyword arguments forwarded to ``GradientBoostingClassifier``.
        seed: Value used as the classifier's ``random_state``.

    Returns:
        The mean accuracy across the 5 cross-validation folds, as a plain
        ``float``.
    """
    from sklearn.model_selection import cross_val_score
    from sklearn.ensemble import GradientBoostingClassifier

    features, labels = dataset
    classifier = GradientBoostingClassifier(**params, random_state=seed)
    fold_scores = cross_val_score(classifier, features, labels, cv=5, scoring="accuracy")
    return float(fold_scores.mean())


# Search space: 4 n_estimators * 3 max_depth * 3 learning_rate * 4 seeds = 144 tasks.
# Each task runs a 5-fold cross-validation in score_params, so that is 144 * 5 = 720
# model fits overall; on 16 workers expect roughly a minute.
# Entries are (n_estimators, max_depth, learning_rate, seed) tuples, with the seed
# varying fastest.
grid = [
    (n_estimators, max_depth, learning_rate, seed)
    for n_estimators in [100, 200, 400, 600]
    for max_depth in [3, 4, 5]
    for learning_rate in [0.03, 0.05, 0.1]
    for seed in [0, 1, 2, 3]
]

# All scheduler interaction happens inside the client context.
with Client(address=SCHEDULER_ADDRESS, object_storage_address=OBJECT_STORAGE_ADDRESS) as client:
    # Send the (X, y) pair a single time; every task below is given the returned
    # reference rather than the arrays themselves.
    dataset_ref = client.send_object((X, y), name="grid-search-dataset")

    # The clock starts after the upload, so `elapsed` covers submit + collect only.
    started = time.perf_counter()

    # Phase 1: submit every combination before waiting on any of them.
    futures = {}
    for n, depth, lr, seed in grid:
        hyperparams = {"n_estimators": n, "max_depth": depth, "learning_rate": lr}
        futures[(n, depth, lr, seed)] = client.submit(score_params, dataset_ref, hyperparams, seed)

    # Phase 2: gather the scores, keyed by the same (n, depth, lr, seed) tuple.
    scores = {}
    for combo, pending in futures.items():
        scores[combo] = pending.result()

    elapsed = time.perf_counter() - started

print(f"fit {len(grid)} models in {elapsed:.2f}s")

# Highest score wins; on ties the combination submitted first is kept.
best_key = max(scores, key=scores.get)
best_score = scores[best_key]
n, depth, lr, seed = best_key
print(f"best: n_estimators={n}, max_depth={depth}, learning_rate={lr}, seed={seed} -> accuracy={best_score:.4f}")