# Source code for mds_2025_helper_functions.scores

from sklearn.model_selection import cross_validate
from sklearn.base import BaseEstimator
import numpy as np
import pandas as pd

def compare_model_scores(*args, X, y=None, scoring=None, return_train_scores=False, **kwargs):
    """
    Create a table comparing mean cross-validation scores of multiple models.

    Each model is evaluated with `sklearn.model_selection.cross_validate` and
    the per-fold values of every returned entry are averaged into one row.

    Parameters
    ----------
    *args : sklearn.base.BaseEstimator
        Model objects implementing the `fit` method. At least two models are
        required.
    X : array-like of shape (n_samples, n_features)
        Training data.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs), optional
        Target values for supervised learning tasks.
    scoring : str, callable, list, tuple, or dict, optional
        Metrics to evaluate models. Refer to `scikit-learn` scoring
        documentation:
        https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter.
    return_train_scores : bool, default=False
        Whether to include training scores in addition to test scores.
        Forwarded to `cross_validate` as `return_train_score`.
    **kwargs : dict
        Additional arguments passed to
        `sklearn.model_selection.cross_validate`.

    Returns
    -------
    pd.DataFrame
        A DataFrame comparing model performance:
        - Rows represent different models.
        - Columns hold the mean of each numeric entry returned by
          `cross_validate`.
        - Index (named ``model``) contains model class names; repeated
          classes are suffixed ``_2``, ``_3``, ... in argument order.

    Raises
    ------
    TypeError
        If fewer than two models are given, or if any positional argument is
        not an instance of `sklearn.base.BaseEstimator`.
    ValueError
        If the models do not all share the same estimator type.

    Examples
    --------
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> compare_model_scores(LogisticRegression(), RandomForestClassifier(), X=X_train, y=y_train, scoring="accuracy")
    """
    # Check for at least 2 models to compare
    if len(args) <= 1:
        raise TypeError(
            "compare_model_scores() requires at least 2 models. "
            f"You provided {len(args)}."
        )

    # Check that objects passed as arguments are models (not e.g. lists, strings, etc)
    for model in args:
        if not isinstance(model, BaseEstimator):
            raise TypeError(
                "All models must be sklearn models. "
                f"The following argument is not an sklearn model: {model}"
            )

    # Check that all models are of the same kind (e.g. all classifiers or all
    # regressors). Not every estimator defines `_estimator_type`, so read it
    # defensively: a missing/empty value is labelled "unknown" rather than
    # surfacing as an unrelated AttributeError.
    model_types = {
        str(getattr(model, "_estimator_type", None) or "unknown") for model in args
    }
    if len(model_types) > 1:
        raise ValueError(
            "All models must be of the same type. \n"
            f"Found multiple types: {', '.join(sorted(model_types))}"
        )

    # Entries that `cross_validate` can return (via `return_estimator=True` /
    # `return_indices=True` in **kwargs) which are not numeric per-fold arrays
    # and therefore cannot be averaged.
    non_numeric_keys = {"estimator", "indices"}

    results = []
    model_counts = {}
    for model in args:
        # Get CV scores
        cv_results = cross_validate(
            model,
            X=X,
            y=y,
            scoring=scoring,
            return_train_score=return_train_scores,
            **kwargs,
        )

        # Calculate mean of scores (and timings) across folds
        mean_scores = {
            key: np.mean(val)
            for key, val in cv_results.items()
            if key not in non_numeric_keys
        }

        # Give model a unique name: the first occurrence keeps the bare class
        # name, later occurrences of the same class get a numeric suffix.
        model_name = model.__class__.__name__
        if model_name in model_counts:
            model_counts[model_name] += 1
            model_name = f"{model_name}_{model_counts[model_name]}"
        else:
            model_counts[model_name] = 1
        mean_scores['model'] = model_name

        # Add model scores and model name to list
        results.append(mean_scores)

    # Return model list as DataFrame
    return pd.DataFrame(results).set_index('model')