Import des outils / jeu de données#
1import matplotlib.pyplot as plt
2import numpy as np
3import pandas as pd
4import seaborn as sns
5import xgboost
6from catboost import CatBoostClassifier
7from imblearn.over_sampling import SMOTENC
8from keras import layers
9from lightgbm import LGBMClassifier
10from scipy.stats import boxcox
11from sklearn.compose import ColumnTransformer
12from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
13from sklearn.dummy import DummyClassifier
14from sklearn.ensemble import IsolationForest, RandomForestClassifier, VotingClassifier
15from sklearn.feature_selection import mutual_info_regression
16from sklearn.gaussian_process import GaussianProcessClassifier
17from sklearn.inspection import permutation_importance
18from sklearn.linear_model import LogisticRegression
19from sklearn.metrics import (
20 ConfusionMatrixDisplay,
21 accuracy_score,
22 classification_report,
23 confusion_matrix,
24 f1_score,
25 make_scorer,
27from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
28from sklearn.naive_bayes import BernoulliNB, ComplementNB
29from sklearn.neighbors import KNeighborsClassifier
30from sklearn.pipeline import Pipeline
31from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
32from sklearn.svm import LinearSVC
33from sklearn.tree import DecisionTreeClassifier
34from tensorflow import keras
36from src.classification import create_classification_models
37from src.config import data_folder, seed
38from src.constants import var_cat_non_ohe, var_cat_ohe, var_categoriques, var_numeriques
39from src.utils import init_notebook
# Load the feature-engineered dataset, using the `ID` column as the index.
# NOTE(review): `parse_dates=True` only asks pandas to try parsing the index
# as dates; pass an explicit column list if a date column (e.g. `Dt_Customer`)
# is meant to be parsed — confirm intent.
df = pd.read_csv(
    f"{data_folder}/data-cleaned-feature-engineering.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)
Variables globales#
1LABELS = (0, 1)
Fonctions et variables utiles#
1score_modeles = []
Tableau. Informations sur notre classification

| Élément | Description |
|---|---|
| Objectif métier | Prédire l’acceptation d’une campagne marketing |
| Problème technique | Classification binaire supervisée |
| Métrique | Score F1 sur la classe 1 (clients qui acceptent).<br>À score F1 égal, on choisit la meilleure précision sur la classe 1. |
| Méthode d’entraînement | Validation croisée en 5 blocs |
| Pré-traitement | Variables quantitatives : centrer/réduire (mise à l’échelle robuste avec RobustScaler).<br>Variables qualitatives : OneHot Encoding (Tableau Disjonctif Complet). |
| Équilibrage des classes | 1) Aucun<br>2) Sous-échantillonnage aléatoire manuel<br>3) Sur-échantillonnage avec SMOTE |
def evaluate_models(models, prefix, X_train, X_test, y_train, y_test):
    """Fit and score every model in `models`, recording results under `prefix`.

    Each model is fitted on the training split, its accuracy is measured on
    the test split, and its F1 score is estimated with 5-fold
    cross-validation on the training split.

    Args:
        models: Sequence of ``(model, model_name)`` pairs; each model must
            provide ``fit`` and ``predict``.
        prefix: Label prepended to every model name as
            ``"<prefix>/<model_name>"``; useful to tell the different
            preprocessing strategies apart.
        X_train: Features passed to ``fit`` and to the cross-validation.
        X_test: Features passed to ``predict``.
        y_train: Targets matching ``X_train``.
        y_test: Targets matching ``X_test``.

    Returns:
        A list with one ``[name, mean_cv_f1]`` entry per model.

    Side effects:
        Appends ``[name, "score_f1_classe1", mean_cv_f1, test_accuracy]`` to
        the module-level ``score_modeles`` list for every model.
    """
    # The scorer does not depend on the model, so build it once.
    f1_scorer = make_scorer(f1_score, labels=[LABELS[1]])
    results = []

    for model, model_name in models:
        name = f"{prefix}/{model_name}"

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # This is the accuracy on the test split (the original local was
        # named `precision`); it is the fourth field stored in
        # `score_modeles`.
        accuracy = accuracy_score(y_test, y_pred)

        cv_scores = cross_val_score(
            model,
            X_train,
            y_train,
            cv=5,
            scoring=f1_scorer,
        )
        mean_f1 = cv_scores.mean()

        results.append([name, mean_f1])
        score_modeles.append([name, "score_f1_classe1", mean_f1, accuracy])

    return results
Liste des modèles#
Tableau. Liste des modèles de notre étude

| Famille | Modèles |
|---|---|
| Modèles de référence | Classificateur Idiot Uniforme (50% de oui et 50% de non)<br>Classificateur Idiot Constant 1 (100% de oui) |
| Modèles linéaires | Régression logistique<br>Analyse Discriminante Linéaire |
| Arbres de décision | Arbre de décision<br>Forêt d’arbres de décision (Random Forest) |
| Gradient Boosting | XGBoost<br>CatBoost<br>LightGBM |
| Machine à vecteurs de support (SVM) | Classificateur SVM linéaire |
| k plus proches voisins (k-NN) | Classificateur k-NN (5 voisins)<br>Classificateur k-NN (15 voisins) |
| Modèle de vote | Modèle de “Vote à la majorité” sur 5 modèles :<br>- Régression logistique<br>- Analyse discriminante linéaire<br>- Random Forest<br>- XGBoost<br>- CatBoost |
| Réseau de neurones | Réseau de neurones à 5 couches et 1 600 neurones |
1models = create_classification_models(seed)
Traitement des données#
Pipeline Scaler & OneHotEncoding#
1df["HasAcceptedCampaigns"] = df["HasAcceptedCampaigns"].astype(int)
1## ohe = OneHotEncoder()
1## ohe.fit_transform(df[var_cat_ohe])
Education | Marital_Status | Kidhome | Teenhome | AcceptedCmp1 | AcceptedCmp2 | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | Response | |
ID | ||||||||||
5524 | Graduation | Single | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2174 | Graduation | Single | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4141 | Graduation | Together | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6182 | Graduation | Together | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5324 | PhD | Married | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# One-hot encode the columns listed in `var_cat_ohe`, robust-scale the ones
# listed in `var_numeriques`, and pass every other column through unchanged.
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ("ohe", OneHotEncoder(), var_cat_ohe),
        ("scaler", RobustScaler(), var_numeriques),
    ],
)
1X = df.drop(columns=["Response", "Dt_Customer"])
ColumnTransformer(remainder='passthrough', transformers=[('ohe', OneHotEncoder(), ['Education', 'Marital_Status', 'Kidhome', 'Teenhome', 'NbAcceptedCampaigns', 'NbChildren']), ('scaler', RobustScaler(), ['Year_Birth', 'Income', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth'])])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ColumnTransformer(remainder='passthrough', transformers=[('ohe', OneHotEncoder(), ['Education', 'Marital_Status', 'Kidhome', 'Teenhome', 'NbAcceptedCampaigns', 'NbChildren']), ('scaler', RobustScaler(), ['Year_Birth', 'Income', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth'])])
['Education', 'Marital_Status', 'Kidhome', 'Teenhome', 'NbAcceptedCampaigns', 'NbChildren']
['Year_Birth', 'Income', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'HasAcceptedCampaigns']
array([[0., 0., 1., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 1.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
1nouveau_df = pd.DataFrame(preprocessor.transform(X), index=df.index)
(2052, 45)
1## pipeline = Pipeline(steps=[("scaler", RobustScaler())])
3## preprocessor = ColumnTransformer(
4# remainder="passthrough",
5# transformers=[
6# ("std", standard_transformer, [3]),
7# ],
8## )
1## under_sampling_manuel = Pipeline(steps=[()])
Par défaut#
1y = df[["Response"]].astype(int)
1X_train, X_test, y_train, y_test = train_test_split(
2 nouveau_df, y, test_size=0.2, random_state=seed
(2052, 45)
1prefix = "défaut"
2results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)
1sorted(results, key=lambda x: x[1], reverse=True)
[['défaut/LinearDiscriminantAnalysis', 0.5457873305066958],
['défaut/LogisticRegression', 0.5300741717327082],
['défaut/LinearSVC', 0.5266264112213479],
['défaut/VotingClassifier', 0.5048525204014583],
['défaut/CatBoostClassifier', 0.48986074706622657],
['défaut/XGBClassifier', 0.4612210194041914],
['défaut/LGBMClassifier', 0.4491622877623547],
['défaut/DecisionTreeClassifier', 0.4145184590700617],
['défaut/RandomForestClassifier', 0.3955075959382346],
['défaut/DummyClassifier_Constant1', 0.2551837011118009],
['défaut/KNeighborsClassifier5', 0.2450767474496288],
['défaut/DummyClassifier_Uniform', 0.2171772519717725],
['défaut/KNeighborsClassifier15', 0.13868175622739518]]
Équilibrage des classes#
Under-sampling (manuel)#
1## samples0 = df[df["Response"] == 0].sample(350, random_state=seed)
1## X_eq = pd.concat((samples0, df[df["Response"] == 1]))
1## X_eq["Response"].hist()
1## y_eq = X_eq.pop("Response").astype(int)
1## X_eq = pd.get_dummies(X_eq.drop(columns=["Dt_Customer"]))
1## df["Response"].value_counts()
# Under-sample class 0: drop NB_A_SUPPRIMER randomly chosen rows whose
# `Response` is 0. The draw uses a generator seeded with `seed` so the subset
# (and every score computed from it) is reproducible from run to run.
# NOTE(review): NB_A_SUPPRIMER is not defined in the visible part of this
# file — confirm where it is set.
rng = np.random.default_rng(seed)
drop_indices = rng.choice(
    nouveau_df[y["Response"] == 0].index, NB_A_SUPPRIMER, replace=False
)
df_subset = nouveau_df.drop(drop_indices)
y_eq = y.drop(index=drop_indices)

X_train, X_test, y_train, y_test = train_test_split(
    df_subset, y_eq, test_size=0.2, random_state=seed
)
1prefix = "éq_classes"
2results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)
1sorted(results, key=lambda x: x[1], reverse=True)
[['éq_classes/LogisticRegression', 0.7941343818734119],
['éq_classes/CatBoostClassifier', 0.7938368158804571],
['éq_classes/LinearDiscriminantAnalysis', 0.7938211762491106],
['éq_classes/VotingClassifier', 0.7920346320346321],
['éq_classes/LinearSVC', 0.7908569588129786],
['éq_classes/LGBMClassifier', 0.7834421677461864],
['éq_classes/XGBClassifier', 0.7642432318617356],
['éq_classes/RandomForestClassifier', 0.7484611585366764],
['éq_classes/DecisionTreeClassifier', 0.7168816591954977],
['éq_classes/DummyClassifier_Constant1', 0.6589356893363467],
['éq_classes/KNeighborsClassifier5', 0.6585194388262133],
['éq_classes/KNeighborsClassifier15', 0.6425526464663587],
['éq_classes/DummyClassifier_Uniform', 0.553094245204337]]
Over-sampling (SMOTE)#
# SMOTENC is fitted on rows of `nouveau_df`, i.e. on the output of
# `preprocessor.transform(X)`, so the categorical positions must be expressed
# in that transformed column order — not in the column order of the raw `X`.
# Every output column that does not come from the "scaler" transformer
# (one-hot dummies and passed-through columns) is treated as categorical.
transformed_names = list(preprocessor.get_feature_names_out(X.columns))
cat_cols_index = [
    position
    for position, name in enumerate(transformed_names)
    if not name.startswith("scaler__")
]
cat_cols = [transformed_names[position] for position in cat_cols_index]
cat_cols  # display the columns handled as categorical

sm = SMOTENC(
    categorical_features=cat_cols_index,
    random_state=seed,
)

X_train, X_test, y_train, y_test = train_test_split(
    nouveau_df, y, test_size=0.2, random_state=seed
)

# Only the training split is resampled; the test split keeps its original
# class balance.
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
0 0.5
1 0.5
dtype: float64
1prefix = "SMOTE"
2results = evaluate_models(models, prefix, X_train_sm, X_test, y_train_sm, y_test)
1sorted(results, key=lambda x: x[1], reverse=True)
Réseau de neurones#
1X_train, X_test, y_train, y_test = train_test_split(
2 df_subset, y_eq, test_size=0.2, random_state=seed
1X_train = np.asarray(X_train).astype("float32")
2y_train = np.asarray(y_train).astype("float32")
3X_test = np.asarray(X_test).astype("float32")
4y_test = np.asarray(y_test).astype("float32")
# Fully connected binary classifier: four hidden layers of 400 units and a
# single sigmoid output unit.
# NOTE(review): the third hidden layer uses a sigmoid activation while the
# others use ReLU — confirm this is intentional.
model = keras.Sequential(
    [
        layers.Dense(400, activation="relu", input_shape=[X_train.shape[1]]),
        layers.Dense(400, activation="relu"),
        layers.Dense(400, activation="sigmoid"),
        layers.Dense(400, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
)
# Stop once the monitored validation loss has not improved by at least
# `min_delta` for `patience` epochs, and roll back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

# Early stopping is driven by a slice held out from the training data rather
# than by (X_test, y_test): choosing the stopping epoch on the test split
# would bias the scores later reported on that same split.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=512,
    epochs=1000,
    callbacks=[early_stopping],
    verbose=0,  # hide the per-epoch output: there can be many epochs
)
# One row per epoch, one column per tracked metric.
history_df = pd.DataFrame(history.history)

# Start the plots at epoch 5.
history_df.loc[5:, ["loss", "val_loss"]].plot()
history_df.loc[5:, ["binary_accuracy", "val_binary_accuracy"]].plot()

print(
    ("Best Validation Loss: {:0.4f}" + "\nBest Validation Accuracy: {:0.4f}").format(
        history_df["val_loss"].min(), history_df["val_binary_accuracy"].max()
    )
)
1y_pred = model.predict(X_test)
1sns.histplot(y_pred > 0.5, discrete=True)
1y_pred_old = y_pred
1y_pred = y_pred > 0.5
1print(classification_report(y_test, y_pred, labels=LABELS))
1cm = confusion_matrix(y_test, y_pred, labels=LABELS)
2disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=LABELS)
1nom_modele = "Réseau de Neurones"
2## ajout_score(model, nom_modele, y_test, y_pred)
Diagnostic du modèle#
RandomForest feature importance#
# Output column names of the preprocessor, without the "<transformer>__"
# prefix. Splitting only on the first "__" keeps intact any column name that
# itself contains a double underscore.
nom_colonnes = [
    name.split("__", 1)[1] for name in preprocessor.get_feature_names_out(X.columns)
]

# Select the random forest by its name instead of a hard-coded position, so
# the lookup keeps working if the order of `models` changes.
rf = next(
    model for model, model_name in models if model_name == "RandomForestClassifier"
)

# Single-row frame (one column per feature), columns sorted by decreasing
# importance, i.e. by the values of row 0.
fi = pd.DataFrame(rf.feature_importances_.reshape(1, -1), columns=nom_colonnes)
fi = fi.sort_values(by=0, axis=1, ascending=False)

plt.figure(figsize=(5, 12))
plt.title("Importance donnée par le modèle RandomForest")
sns.barplot(fi, orient="h", color="gray")
Permutation importance#
# NOTE(review): `X_test` / `y_test` are whatever the most recently executed
# split left in scope, while `rf` keeps the fit from its last evaluation run —
# confirm both refer to the same experiment before interpreting the plot.
result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=seed)
# sns.histplot(result.importances_std < 0.015)

# Single-row frame (one column per feature), columns sorted by decreasing
# mean importance, i.e. by the values of row 0.
pi_results = pd.DataFrame(
    result.importances_mean.reshape(1, -1), columns=nom_colonnes
)
pi_results = pi_results.sort_values(by=0, axis=1, ascending=False)

plt.figure(figsize=(5, 12))
plt.title("Importance de Permutation du modèle RandomForest")
sns.barplot(pi_results, orient="h", color="gray")
Pour aller plus loin#
- Afficher les intervalles de confiance des scores de validation croisée.
- Optimiser les hyper-paramètres des modèles (avec une recherche en grille ou une recherche bayésienne).
- Tester les modèles sur différents sous-ensembles de variables pour les comparer.
Sauvegarde des données#
from pathlib import Path

# One row per evaluated model, in the order the rows were collected in
# `score_modeles`.
score_modeles_df = pd.DataFrame(
    score_modeles, columns=["Modèle", "Métrique", "Valeur", "Précision"]
)

# Create the output directory first so the export does not fail when the
# `results` folder is missing.
results_path = Path(f"{data_folder}/results/classifications.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
score_modeles_df.to_csv(results_path, index=False)