Source code for Hive_ML.evaluation.model_evaluation

import matplotlib.pyplot as plt
import numpy as np
from yellowbrick.base import Visualizer
from yellowbrick.classifier import (
    ClassificationReport,
    ROCAUC,
    PrecisionRecallCurve,
    ClassPredictionError,
    DiscriminationThreshold,
)
from yellowbrick.style import set_palette

from Hive_ML.training.models import (
    adab_tree,
    random_forest,
    knn,
    decicion_tree,
    lda,
    qda,
    naive,
    svm_kernel,
    logistic_regression,
    ridge,
    mlp,
)

from sklearn.decomposition import PCA
from os import PathLike
from typing import List, Dict, Union
from pandas import DataFrame
from typing import Tuple
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from Hive_ML.training.model_trainer import model_fit_and_predict
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from Hive_ML.utilities.feature_utils import feature_normalization, prepare_features
from sklearn.base import ClassifierMixin

# Select yellowbrick's "sns_pastel" palette once, at import time.
set_palette("sns_pastel")

# Classifier short name -> factory imported from Hive_ML.training.models.
# evaluate_classifiers looks factories up here using the `Classifier` column
# of the ensemble configuration and calls them with the classifier kwargs.
MODELS = {
    "rf": random_forest,
    "adab": adab_tree,
    "lda": lda,
    "qda": qda,
    "logistic_regression": logistic_regression,
    "knn": knn,
    "naive": naive,
    "decision_tree": decicion_tree,  # (sic) spelling matches the name imported from Hive_ML.training.models
    "svm": svm_kernel,
    "ridge": ridge,
    "mlp": mlp,
}

# Reduction name -> NumPy reduction function.
# Not referenced by the functions visible in this part of the module.
AGGR_NUMPY = {"median": np.median, "mean": np.mean}

# Visualizer short name -> yellowbrick visualizer class.
# YB_Visualizer instantiates the class registered under the requested name.
YB_VISUALIZERS = {
    "Report": ClassificationReport,
    "ROCAUC": ROCAUC,
    "PR": PrecisionRecallCurve,
    "CPE": ClassPredictionError,
    "DT": DiscriminationThreshold,
}


def select_best_classifiers(
    df_summary: DataFrame, metric: str, reduction: str, k: int = 1
) -> Tuple[List[Tuple[str, str]], List[float]]:
    """
    Select the ``k`` best classifiers from a validation summary and, for each of
    them, the number of features that gives its best reduced score.

    The selection is done in two steps:

    1. the ``Value`` entries of the requested ``metric`` are reduced per
       ``Classifier`` (over every split and every ``N_Features``) and the ``k``
       classifiers with the largest reduced value are kept, best first;
    2. for each kept classifier, the values are reduced per ``N_Features`` and the
       ``N_Features`` with the largest reduced value is selected.

    The configuration ranked first is printed to stdout.

    Parameters
    ----------
    df_summary :
        Validation DataFrame Summary. Must provide the ``Metric``, ``Classifier``,
        ``N_Features`` and ``Value`` columns.
    metric :
        Metric to consider to select the best performance.
    reduction :
        Reduction applied to the validation splits (any name accepted by the pandas
        group-by ``agg``, e.g. ``"mean"`` or ``"median"``).
    k :
        Number of classifiers to select.

    Returns
    -------
    Selected combinations ``[(N_Features, Classifier), ...]``, ordered by the
    classifier ranking of step 1, and the reduced validation score of each
    combination. Both lists are empty when no row matches ``metric`` or ``k`` is 0.
    """
    # Filter on the metric once; every group-by below works on this subset.
    metric_rows = df_summary[df_summary["Metric"] == metric]

    per_classifier = metric_rows[["Value", "Classifier"]].groupby(["Classifier"]).agg(reduction)
    # nlargest keeps the classifiers ordered from best to worst reduced score.
    classifiers = per_classifier["Value"].nlargest(k).index.values

    n_features = []
    best_val_scores = []
    for classifier in classifiers:
        per_n_features = (
            metric_rows[metric_rows["Classifier"] == classifier][["Value", "N_Features"]]
            .groupby(["N_Features"])
            .agg(reduction)
        )
        best = per_n_features["Value"].nlargest(1)
        n_features.append(best.index.values[0])
        best_val_scores.append(best.values[0])

    n_features_selected_classifier = list(zip(n_features, classifiers))

    # Nothing was selected (unknown metric, empty summary or k == 0): there is no
    # "best configuration" to report, so skip the summary instead of failing on [0].
    if n_features_selected_classifier:
        best_n_features, best_classifier = n_features_selected_classifier[0]
        print(f"Best Configuration: {best_classifier}-{best_n_features}, {metric}: {best_val_scores[0]}")

    return n_features_selected_classifier, best_val_scores
def evaluate_classifiers(
    ensemble_configuration_df: DataFrame,
    classifier_kwargs_list: List[Dict],
    train_feature_set: np.ndarray,
    train_label_set: np.ndarray,
    test_feature_set: np.ndarray,
    test_label_set: np.ndarray,
    aggregation: str,
    feature_selection: str,
    visualizers: Union[Dict[str, Dict], None] = None,
    output_file: Union[str, PathLike] = None,
    plot_title: str = "",
    random_state=None,
) -> Dict:
    """
    Evaluate ensemble Classification performance of provided classifiers, weighting and
    combining the single classifier predictions. If YellowBrick Visualizers are provided,
    generates a single multi-plot report (one row per visualizer, one column per classifier).

    Parameters
    ----------
    ensemble_configuration_df :
        Dataframe containing the ensemble configuration. Each row should include
        `Classifier` (a key of ``MODELS``), `N_Features` (number of features to select,
        or ``"All"`` to skip feature selection) and `weight` (weighting of the classifier
        prediction in the ensemble; weights are normalized to sum to 1).
    classifier_kwargs_list :
        List of classifiers kwargs Dict, used to configure the classifiers. Paired
        positionally with the rows of ``ensemble_configuration_df``.
    train_feature_set :
        Train Feature set used for the classifiers fitting.
    train_label_set :
        Train Label set used for the classifiers fitting.
    test_feature_set :
        Test Feature set used for the classifiers evaluations.
    test_label_set :
        Test Label set used for the classifiers evaluations.
    aggregation :
        Type of Feature Aggregation, forwarded to ``prepare_features``.
    feature_selection :
        Type of Feature Selection to perform ( ``SFFS`` or ``PCA`` ).
    visualizers :
        Mapping from YellowBrick visualizer name (a key of ``YB_VISUALIZERS``) to the
        kwargs used to build it. ``None`` or an empty mapping disables plotting.
        The mapping is not modified.
    output_file :
        File location where to save the YellowBrick Plot Report. Ignored when no
        visualizer is requested, since there is no figure to save.
    plot_title :
        String used in the YellowBrick plots as title.
    random_state :
        Seed forwarded to every classifier factory, both for the feature-selection
        estimator and for the classifier that is finally fitted.

    Returns
    -------
    Dictionary with the ensemble classifier report ( the ``classification_report``
    metrics plus a ``roc_auc`` entry ).
    """
    # The signature defaults `visualizers` to None: treat that as "no plots"
    # instead of failing on len(None).
    visualizers = {} if visualizers is None else visualizers

    fig, axs = None, None
    if visualizers:
        n_rows = int(len(visualizers))
        n_cols = int(ensemble_configuration_df.shape[0])
        fig, axs = plt.subplots(
            n_rows,
            n_cols,
            figsize=(n_cols * 10 * 1.5, n_rows * 10 * 1),
            squeeze=False,  # always a 2-D grid, even with one row or one column
        )

    # Prepared once up front so the accumulator can be sized and so that `y_test`
    # is defined for the final scoring.
    x_train, y_train, x_test, y_test = prepare_features(
        train_feature_set, train_label_set, None, aggregation, None, test_feature_set, test_label_set
    )
    x_train, x_test, _ = feature_normalization(x_train, x_test)

    # Two columns: the ensemble is scored as a binary problem on column 1.
    # NOTE(review): assumes model_fit_and_predict returns (n_samples, 2) class scores — confirm.
    ensemble_y_test_pred = np.zeros((x_test.shape[0], 2))

    ensemble_weights = ensemble_configuration_df["weight"].values
    ensemble_weights = ensemble_weights / np.sum(ensemble_weights)

    for ensemble_idx, ((_, configuration), classifier_kwargs, weight) in enumerate(
        zip(ensemble_configuration_df.iterrows(), classifier_kwargs_list, ensemble_weights)
    ):
        classifier = configuration["Classifier"]
        n_features = configuration["N_Features"]
        model_factory = MODELS[classifier]

        # Start again from the full feature set: the previous iteration replaced
        # x_train / x_test with its own selected or projected features.
        x_train, y_train, x_test, y_test = prepare_features(
            train_feature_set, train_label_set, None, aggregation, None, test_feature_set, test_label_set
        )
        x_train, x_test, _ = feature_normalization(x_train, x_test)

        if n_features != "All":
            if feature_selection == "SFFS":
                sffs = SFS(
                    model_factory(**classifier_kwargs, random_state=random_state),
                    k_features=int(n_features),
                    forward=True,
                    floating=True,
                    scoring="roc_auc",
                    verbose=0,
                    n_jobs=-1,
                    cv=5,
                ).fit(x_train, y_train)
                # Same int conversion as `k_features`, so the lookup also works when
                # the configuration stores the feature count as a non-int value.
                feature_idx = sffs.subsets_[int(n_features)]["feature_idx"]
                x_train = x_train[:, feature_idx]
                x_test = x_test[:, feature_idx]
            elif feature_selection == "PCA":
                pca = PCA(n_components=n_features)
                x_train = pca.fit_transform(x_train)
                x_test = pca.transform(x_test)

        # Fresh estimator for the final fit, seeded like the one used for feature
        # selection so that the evaluation honours `random_state`.
        clf = model_factory(**classifier_kwargs, random_state=random_state)
        y_test_pred = model_fit_and_predict(clf, x_train, y_train, x_test)

        for idx_visualizer, visualizer in enumerate(visualizers):
            # Copy the user kwargs so the caller's configuration is left untouched.
            visualizer_kwargs = {
                **visualizers[visualizer],
                "ax": axs[idx_visualizer, ensemble_idx],
                "title": f"{plot_title} {visualizer}, {classifier}-{n_features}",
            }
            YB_Visualizer(clf, visualizer, x_train, y_train, x_test, y_test, visualizer_kwargs)

        ensemble_y_test_pred += y_test_pred * weight

    roc_auc_val = roc_auc_score(y_test, ensemble_y_test_pred[:, 1])
    # Hard labels: positive class when its weighted score exceeds 0.5.
    report = classification_report(y_test, np.where(ensemble_y_test_pred[:, 1] > 0.5, 1, 0), output_dict=True)
    report["roc_auc"] = roc_auc_val

    if output_file is not None and fig is not None:
        # Save the report figure explicitly rather than whichever figure is current.
        fig.savefig(output_file)

    return report
def YB_Visualizer(
    clf: ClassifierMixin,
    visualizer: str,
    x_train: np.ndarray,
    y_train: np.ndarray,
    x_test: np.ndarray,
    y_test: np.ndarray,
    kwargs: Dict,
) -> Visualizer:
    """
    Creates and Finalize a YellowBrick visualizer, given the classifier and the train/test
    features and corresponding labels to use for fitting and scoring.

    The ``"DT"`` visualizer is fitted on the train and test sets stacked together
    and is not scored; every other visualizer is fitted on the train set and scored
    on the test set.

    Parameters
    ----------
    clf :
        Classifier used by the Visualizer.
    visualizer :
        Name of the visualizer to create. Must match a key in YB_VISUALIZERS.
    x_train :
        Train Feature set used for the classifiers fitting.
    y_train :
        Train Label set used for the classifiers fitting.
    x_test :
        Test Feature set used for the classifiers scoring.
    y_test :
        Test Label set used for the classifiers scoring.
    kwargs :
        Dictionary of kwargs for the YellowBrick Visualizer.

    Returns
    -------
    YellowBrick Visualizer finalized.
    """
    # Keep the requested name apart from the instance: rebinding `visualizer` to
    # the instance made the name comparisons below compare an object to a string,
    # so the "DT" branch could never run.
    yb_visualizer = YB_VISUALIZERS[visualizer](clf, **kwargs)

    if visualizer == "DT":
        x = np.vstack((x_train, x_test))
        # Labels are joined along the sample axis; unlike vstack this also works
        # for 1-D label vectors of different lengths.
        y = np.concatenate((y_train, y_test))
        yb_visualizer.fit(x, y)
    else:
        yb_visualizer.fit(x_train, y_train)
        yb_visualizer.score(x_test, y_test)

    # NOTE(review): the original had an extra `draw()` for "Report" guarded by the
    # same broken comparison, so it never executed. It is left out to keep the
    # rendering callers get today; score() presumably already draws the report —
    # confirm against yellowbrick before re-adding.
    yb_visualizer.finalize()
    return yb_visualizer