Regression rework#

Importing tools / dataset#

import statistics

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats.api as sms
import xgboost
from keras import layers
from scipy import stats
from scipy.stats import boxcox, kstest, pearsonr, poisson
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, PoissonRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OrdinalEncoder, PolynomialFeatures, RobustScaler
from statsmodels.compat import lzip
from statsmodels.graphics.regressionplots import *
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tensorflow import keras

from functions import affiche_score
from src.config import data_folder
from src.constants import var_categoriques, var_numeriques
from src.utils import init_notebook
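Note that affiche_score comes from the local functions module, which is not included on this page. If it is unavailable, a minimal stand-in consistent with how it is called below (assumption: it prints the usual regression metrics for a fitted model) is:

# Hypothetical stand-in for the local `functions` module (assumed behaviour)
def affiche_score(modele, y_test, y_pred):
    """Print MSE, RMSE and MAE for a fitted model (assumed behaviour)."""
    mse = mean_squared_error(y_test, y_pred)
    print(type(modele).__name__)
    print(f"MSE : {mse:.3f}")
    print(f"RMSE: {mse**0.5:.3f}")
    print(f"MAE : {mean_absolute_error(y_test, y_pred):.3f}")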
init_notebook()
df = pd.read_csv(
    # f"{data_folder}/data-cleaned-feature-engineering.csv",
    f"{data_folder}/data-cleaned.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)
df_transforme = pd.read_csv(
    f"{data_folder}/data-transformed.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

Global variables#

df[var_categoriques] = df[var_categoriques].astype(str).astype("category")
# parse the enrollment date, then store it as epoch nanoseconds so it is numeric
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], format="%Y-%m-%d").astype(int)

Useful functions and variables#

score_modeles = []
def ajout_score(modele, nom_modele, y_test, y_pred):
    """Append the MSE, RMSE and MAE of a model to the score_modeles list."""
    score_modeles.extend(
        (
            [nom_modele, "mse", mean_squared_error(y_test, y_pred)],
            [nom_modele, "rmse", mean_squared_error(y_test, y_pred, squared=False)],
            [nom_modele, "mae", mean_absolute_error(y_test, y_pred)],
        )
    )
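ajout_score only accumulates rows; to compare the models at the end, the list can be pivoted into a table. A minimal sketch (assuming score_modeles has been filled by the calls that follow):

# Pivot the accumulated [model, metric, score] rows into a comparison table
df_scores = pd.DataFrame(score_modeles, columns=["modele", "metrique", "score"])
df_scores.pivot(index="modele", columns="metrique", values="score")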

Dataset preparation#

Normalization#

Explanation#

To normalize the data, we use the Box-Cox transformation, defined $\forall x > 0$ as follows: $B(x, \lambda) = \begin{cases} \frac{x^{\lambda} - 1}{\lambda} & \text{ if } \lambda \neq 0 \\ \log(x) & \text{ if } \lambda = 0 \end{cases}$

This transformation is applied to a (strictly positive) variable, tuning $\lambda$ to maximize normality.

We will use the scipy.stats.boxcox function, which estimates the best $\lambda$ parameter itself.
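Since we keep each fitted lambda for mapping values back to the original scale later, note the inverse transformation, obtained by solving the definition for $x$: $B^{-1}(y, \lambda) = \begin{cases} (\lambda y + 1)^{1/\lambda} & \text{ if } \lambda \neq 0 \\ e^{y} & \text{ if } \lambda = 0 \end{cases}$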

Variable selection#

df[var_numeriques].hist(figsize=(12, 12), bins=30)
plt.show()
var_a_normaliser = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
]

Transformation#

sns.histplot(df["MntWines"], kde=True)
boxcox_lambdas = {}  # keep the lambdas for the inverse transformation
for var in var_a_normaliser:
    # shift so that every value is strictly positive (Box-Cox requires x > 0)
    var_strict_positif = df[var] - df[var].min() + 1

    var_apres_boxcox, lam = boxcox(var_strict_positif)

    df_transforme[var] = var_apres_boxcox
    boxcox_lambdas[var] = lam
# How to recover the original variable
# (requires the fitted lambda parameter "lam")

# from scipy.special import inv_boxcox
# initial = inv_boxcox(var_apres_boxcox, lam)
# initial = pd.DataFrame(initial)
# sns.histplot(initial, bins=50, kde=True)
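To check that the transformation actually helped, the transformed variables can be screened with the Kolmogorov-Smirnov test imported above. A minimal sketch (a rough check against a standardized normal, not a formal goodness-of-fit procedure):

# Rough normality check: KS distance of each standardized transformed
# variable from N(0, 1); a lower statistic means closer to normal
for var in var_a_normaliser:
    z = (df_transforme[var] - df_transforme[var].mean()) / df_transforme[var].std()
    stat, p = kstest(z, "norm")
    print(f"{var}: KS={stat:.3f}, p={p:.3g}")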

End of normalization (todo)#

X = df.drop(columns=["NumStorePurchases"])

Categorical variables#

marital_status = pd.get_dummies(X["Marital_Status"], prefix="Marital_Status")

marital_status.head()

encoder_education = OrdinalEncoder(
    categories=[["Basic", "2n Cycle", "Graduation", "Master", "PhD"]]
)
education = pd.DataFrame(
    encoder_education.fit_transform(X[["Education"]]),
    index=marital_status.index,
    columns=["Education"],
)

cat_col = pd.concat((marital_status, education), axis=1)

X.drop(columns=["Marital_Status", "Education"], inplace=True)

X = pd.concat((X, cat_col), axis=1)

Scaling#

scaler = RobustScaler()
X_scale = scaler.fit_transform(X)

y = df[["NumStorePurchases"]].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X_scale, y, test_size=0.2, random_state=0
)

XGBoost#

tuned_xgb = xgboost.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
    eval_metric="mae",
    random_state=0,
)

tuned_xgb.fit(X_train, y_train)

y_pred = tuned_xgb.predict(X_test)

affiche_score(tuned_xgb, y_test, y_pred)
tuned_xgb.feature_importances_

fi = tuned_xgb.feature_importances_

fi = pd.DataFrame(fi.reshape((1, fi.shape[0])), columns=X.columns)

fi = fi.sort_values(
    by=0, axis=1, ascending=False
)  # sort the columns by the values in row 0

fi.T

plt.figure(figsize=(5, 12))
sns.barplot(fi, orient="h", color="gray")

result = permutation_importance(tuned_xgb, X_test, y_test, n_repeats=3, random_state=0)

result.importances_mean, result.importances_std

pi_results = result.importances_mean

pi_results = pd.DataFrame(
    pi_results.reshape((1, pi_results.shape[0])), columns=X.columns
)
pi_results = pi_results.sort_values(by=0, axis=1, ascending=False)

pi_results.T

plt.figure(figsize=(5, 12))
sns.barplot(pi_results, orient="h", color="gray")

Test X2#

X2 = df[
    [
        "MntWines",
        "MntMeatProducts",
        "Income",
        "MntSweetProducts",
        "NumCatalogPurchases",
        "MntFruits",
        "Response",
    ]
]

scaler = RobustScaler()
X_scale = scaler.fit_transform(X2)

y = df[["NumStorePurchases"]].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X_scale, y, test_size=0.2, random_state=0
)
tuned_xgb = xgboost.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
    random_state=0,
)

tuned_xgb.fit(X_train, y_train)

y_pred = tuned_xgb.predict(X_test)

affiche_score(tuned_xgb, y_test, y_pred)
# Use grid search to find the best parameters for a random forest regressor
# (NumStorePurchases is the regression target used throughout this section)
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_features": ["sqrt", 1.0],
    "max_depth": [None, 3, 5, 8],
    "min_samples_split": [2, 3, 4],
}

# Train the forests with 5-fold cross-validation
rf_models = GridSearchCV(
    RandomForestRegressor(random_state=5), param_grid=param_grid, cv=5, verbose=1
)
rf_models.fit(X_train, y_train.values.ravel())
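Once the search has finished, the winning configuration and its cross-validated score are available through the standard GridSearchCV attributes, and the refitted best estimator can be scored on the held-out split:

# Best hyperparameters found and the corresponding mean CV score
print(rf_models.best_params_)
print(rf_models.best_score_)

# Evaluate the refitted best estimator on the held-out test set
y_pred_rf = rf_models.best_estimator_.predict(X_test)
print(mean_absolute_error(y_test, y_pred_rf))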

Refactor#

models = [
    [LinearRegression(), "Linear regression"],
    [PoissonRegressor(), "Poisson GLM"],
    [PLSRegression(), "PLS regression"],
    [xgboost.XGBRegressor(), "XGBoost"],
    [
        xgboost.XGBRegressor(
            n_estimators=1000,
            learning_rate=0.05,
            n_jobs=4,
            eval_metric="mae",
            early_stopping_rounds=20,
            random_state=0,
        ),
        "Tuned XGBoost",
    ],
]
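The section stops at the model list, so here is a minimal sketch of the evaluation loop it implies, reusing ajout_score from above (assumption: the tuned XGBoost entry sets early_stopping_rounds, so it needs an eval_set at fit time; the test split is reused as validation purely for brevity):

# Fit and score every model in the list with the shared helpers above
for modele, nom_modele in models:
    if getattr(modele, "early_stopping_rounds", None):
        # XGBoost with early stopping needs a validation set during fit
        modele.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    else:
        modele.fit(X_train, y_train.values.ravel())
    y_pred = modele.predict(X_test)
    ajout_score(modele, nom_modele, y_test, y_pred)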