Hyperparameter grid search with scikit-learn
Upload a synthetic dataset once with `send_object`, then fan out a grid of hyperparameter combinations, each repeated over several random seeds. Each worker fits a `GradientBoostingClassifier` and returns its mean cross-validation accuracy; the client then picks the best-scoring run.
[ ]:
# Connection settings -- edit these so they point at your running scheduler.

# Scheduler endpoint. Both tcp:// and ws:// are supported, but only ws://
# works from JupyterLite (i.e. when running in the browser).
SCHEDULER_ADDRESS = "ws://127.0.0.1:2345"

# Object storage endpoint. Leave as None to use whatever the scheduler advertises.
OBJECT_STORAGE_ADDRESS = None
[ ]:
import time
from itertools import product
from sklearn.datasets import make_classification
from scaler import Client
# Synthetic classification dataset, sized so that each individual model fit
# takes real wall-clock time on a worker.
X, y = make_classification(
    random_state=0,  # fixed seed, so the same dataset is generated on every run
    n_samples=800,
    n_features=40,
    n_informative=15,
    n_redundant=8,
)

# One-line summary of what was generated: array shapes and the label sum over the sample count.
print(f"dataset: X={X.shape}, y={y.shape}, class balance={int(y.sum())}/{len(y)}")
def score_params(dataset: tuple, params: dict, seed: int) -> float:
    """Fit one gradient-boosting model on a worker and report its CV accuracy.

    Args:
        dataset: ``(X, y)`` pair; it is unpacked into features and labels.
        params: Keyword arguments forwarded to ``GradientBoostingClassifier``.
        seed: Value passed as the classifier's ``random_state``.

    Returns:
        The mean of the ``cross_val_score`` results (``cv=5``,
        ``scoring="accuracy"``), converted to a plain ``float``.
    """
    # Imported inside the function body, so scikit-learn is only looked up
    # when the call actually runs.
    from sklearn.model_selection import cross_val_score
    from sklearn.ensemble import GradientBoostingClassifier

    features, labels = dataset
    classifier = GradientBoostingClassifier(**params, random_state=seed)
    fold_scores = cross_val_score(classifier, features, labels, cv=5, scoring="accuracy")
    # Convert to a built-in float so the returned value is a plain Python number.
    return float(fold_scores.mean())
# Cartesian product of the four axes below: 4 * 3 * 3 * 4 = 144 model fits.
# Rough expectation: about a minute of wall-clock time on 16 workers.
grid = list(
    product(
        (100, 200, 400, 600),  # n_estimators
        (3, 4, 5),  # max_depth
        (0.03, 0.05, 0.1),  # learning_rate
        range(4),  # seeds 0..3
    )
)
# One client connection is held open for the whole search; every result is
# collected before the connection is closed.
with Client(address=SCHEDULER_ADDRESS, object_storage_address=OBJECT_STORAGE_ADDRESS) as client:
    # Upload the dataset a single time. Each task is handed the returned
    # reference instead of the (X, y) arrays themselves.
    dataset_ref = client.send_object((X, y), name="grid-search-dataset")

    started = time.perf_counter()

    # Submit every grid point first, keyed by its (n_estimators, max_depth,
    # learning_rate, seed) tuple; results are only awaited afterwards.
    futures = {}
    for n, depth, lr, seed in grid:
        futures[(n, depth, lr, seed)] = client.submit(
            score_params,
            dataset_ref,
            {"n_estimators": n, "max_depth": depth, "learning_rate": lr},
            seed,
        )

    # Wait on each future in submission order and record its score.
    scores = {combo: pending.result() for combo, pending in futures.items()}
    elapsed = time.perf_counter() - started

print(f"fit {len(grid)} models in {elapsed:.2f}s")

# Highest score wins; on ties the earliest-submitted grid point is kept.
best_key = max(scores, key=scores.get)
best_score = scores[best_key]
n, depth, lr, seed = best_key
print(f"best: n_estimators={n}, max_depth={depth}, learning_rate={lr}, seed={seed} -> accuracy={best_score:.4f}")