Regression rework#

Importing tools / dataset#

import statistics

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.stats.api as sms
import xgboost
from keras import layers
from scipy import stats
from scipy.stats import boxcox, kstest, pearsonr, poisson
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression, PoissonRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OrdinalEncoder, PolynomialFeatures, RobustScaler
from statsmodels.compat import lzip
from statsmodels.graphics.regressionplots import *
from statsmodels.stats.outliers_influence import variance_inflation_factor
from tensorflow import keras

from functions import affiche_score
from src.config import data_folder
from src.constants import var_categoriques, var_numeriques
from src.utils import init_notebook
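Note that affiche_score comes from the local functions module, which is not included on this page. If it is unavailable, a minimal stand-in consistent with how it is called below (assumption: it prints the usual regression metrics for a fitted model) is:

# Hypothetical stand-in for the local `functions` module (assumed behaviour)
def affiche_score(modele, y_test, y_pred):
    """Print MSE, RMSE and MAE for a fitted model (assumed behaviour)."""
    mse = mean_squared_error(y_test, y_pred)
    print(type(modele).__name__)
    print(f"MSE : {mse:.3f}")
    print(f"RMSE: {mse**0.5:.3f}")
    print(f"MAE : {mean_absolute_error(y_test, y_pred):.3f}")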
init_notebook()
df = pd.read_csv(
    # f"{data_folder}/data-cleaned-feature-engineering.csv",
    f"{data_folder}/data-cleaned.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)
df_transforme = pd.read_csv(
    f"{data_folder}/data-transformed.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

Global variables#

df[var_categoriques] = df[var_categoriques].astype(str).astype("category")
# parse the enrollment date, then store it as epoch nanoseconds so it is numeric
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], format="%Y-%m-%d").astype(int)

Useful functions and variables#

score_modeles = []
def ajout_score(modele, nom_modele, y_test, y_pred):
    """Append the MSE, RMSE and MAE of a model to the score_modeles list."""
    score_modeles.extend(
        (
            [nom_modele, "mse", mean_squared_error(y_test, y_pred)],
            [nom_modele, "rmse", mean_squared_error(y_test, y_pred, squared=False)],
            [nom_modele, "mae", mean_absolute_error(y_test, y_pred)],
        )
    )
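ajout_score only accumulates rows; to compare the models at the end, the list can be pivoted into a table. A minimal sketch (assuming score_modeles has been filled by the calls that follow):

# Pivot the accumulated [model, metric, score] rows into a comparison table
df_scores = pd.DataFrame(score_modeles, columns=["modele", "metrique", "score"])
df_scores.pivot(index="modele", columns="metrique", values="score")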

Dataset preparation#

Normalization#

Explanation#

To normalize the data, we use the Box-Cox transformation, defined $\forall x > 0$ as follows: $B(x, \lambda) = \begin{cases} \frac{x^{\lambda} - 1}{\lambda} & \text{ if } \lambda \neq 0 \\ \log(x) & \text{ if } \lambda = 0 \end{cases}$

This transformation is applied to a (strictly positive) variable, tuning $\lambda$ to maximize normality.

We will use the scipy.stats.boxcox function, which estimates the best $\lambda$ parameter itself.
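Since we keep each fitted lambda for mapping values back to the original scale later, note the inverse transformation, obtained by solving the definition for $x$: $B^{-1}(y, \lambda) = \begin{cases} (\lambda y + 1)^{1/\lambda} & \text{ if } \lambda \neq 0 \\ e^{y} & \text{ if } \lambda = 0 \end{cases}$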

Variable selection#

df[var_numeriques].hist(figsize=(12, 12), bins=30)
plt.show()
var_a_normaliser = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
]

Transformation#

sns.histplot(df["MntWines"], kde=True)
boxcox_lambdas = {}  # keep the lambdas for the inverse transformation
for var in var_a_normaliser:
    # shift so that every value is strictly positive (Box-Cox requires x > 0)
    var_strict_positif = df[var] - df[var].min() + 1

    var_apres_boxcox, lam = boxcox(var_strict_positif)

    df_transforme[var] = var_apres_boxcox
    boxcox_lambdas[var] = lam
# How to recover the original variable
# (requires the fitted lambda parameter "lam")

# from scipy.special import inv_boxcox
# initial = inv_boxcox(var_apres_boxcox, lam)
# initial = pd.DataFrame(initial)
# sns.histplot(initial, bins=50, kde=True)
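To check that the transformation actually helped, the transformed variables can be screened with the Kolmogorov-Smirnov test imported above. A minimal sketch (a rough check against a standardized normal, not a formal goodness-of-fit procedure):

# Rough normality check: KS distance of each standardized transformed
# variable from N(0, 1); a lower statistic means closer to normal
for var in var_a_normaliser:
    z = (df_transforme[var] - df_transforme[var].mean()) / df_transforme[var].std()
    stat, p = kstest(z, "norm")
    print(f"{var}: KS={stat:.3f}, p={p:.3g}")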

End of normalization (todo)#

X = df.drop(columns=["NumStorePurchases"])

Categorical variables#

marital_status = pd.get_dummies(X["Marital_Status"], prefix="Marital_Status")

marital_status.head()

encoder_education = OrdinalEncoder(
    categories=[["Basic", "2n Cycle", "Graduation", "Master", "PhD"]]
)
education = pd.DataFrame(
    encoder_education.fit_transform(X[["Education"]]),
    index=marital_status.index,
    columns=["Education"],
)

cat_col = pd.concat((marital_status, education), axis=1)

X.drop(columns=["Marital_Status", "Education"], inplace=True)

X = pd.concat((X, cat_col), axis=1)

Scaling#

scaler = RobustScaler()
X_scale = scaler.fit_transform(X)

y = df[["NumStorePurchases"]].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X_scale, y, test_size=0.2, random_state=0
)

XGBoost#

tuned_xgb = xgboost.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
    eval_metric="mae",
    random_state=0,
)

tuned_xgb.fit(X_train, y_train)

y_pred = tuned_xgb.predict(X_test)

affiche_score(tuned_xgb, y_test, y_pred)
tuned_xgb.feature_importances_

fi = tuned_xgb.feature_importances_

fi = pd.DataFrame(fi.reshape((1, fi.shape[0])), columns=X.columns)

fi = fi.sort_values(
    by=0, axis=1, ascending=False
)  # sort the columns by the values in row 0

fi.T

plt.figure(figsize=(5, 12))
sns.barplot(fi, orient="h", color="gray")

result = permutation_importance(tuned_xgb, X_test, y_test, n_repeats=3, random_state=0)

result.importances_mean, result.importances_std

pi_results = result.importances_mean

pi_results = pd.DataFrame(
    pi_results.reshape((1, pi_results.shape[0])), columns=X.columns
)
pi_results = pi_results.sort_values(by=0, axis=1, ascending=False)

pi_results.T

plt.figure(figsize=(5, 12))
sns.barplot(pi_results, orient="h", color="gray")

Test X2#

X2 = df[
    [
        "MntWines",
        "MntMeatProducts",
        "Income",
        "MntSweetProducts",
        "NumCatalogPurchases",
        "MntFruits",
        "Response",
    ]
]

scaler = RobustScaler()
X_scale = scaler.fit_transform(X2)

y = df[["NumStorePurchases"]].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X_scale, y, test_size=0.2, random_state=0
)
tuned_xgb = xgboost.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    n_jobs=4,
    random_state=0,
)

tuned_xgb.fit(X_train, y_train)

y_pred = tuned_xgb.predict(X_test)

affiche_score(tuned_xgb, y_test, y_pred)
# Use grid search to find the best parameters for a random forest regressor
# (NumStorePurchases is the regression target used throughout this section)
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_features": ["sqrt", 1.0],
    "max_depth": [None, 3, 5, 8],
    "min_samples_split": [2, 3, 4],
}

# Train the forests with 5-fold cross-validation
rf_models = GridSearchCV(
    RandomForestRegressor(random_state=5), param_grid=param_grid, cv=5, verbose=1
)
rf_models.fit(X_train, y_train.values.ravel())
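Once the search has finished, the winning configuration and its cross-validated score are available through the standard GridSearchCV attributes, and the refitted best estimator can be scored on the held-out split:

# Best hyperparameters found and the corresponding mean CV score
print(rf_models.best_params_)
print(rf_models.best_score_)

# Evaluate the refitted best estimator on the held-out test set
y_pred_rf = rf_models.best_estimator_.predict(X_test)
print(mean_absolute_error(y_test, y_pred_rf))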

Refactor#

models = [
    [LinearRegression(), "Linear regression"],
    [PoissonRegressor(), "Poisson GLM"],
    [PLSRegression(), "PLS regression"],
    [xgboost.XGBRegressor(), "XGBoost"],
    [
        xgboost.XGBRegressor(
            n_estimators=1000,
            learning_rate=0.05,
            n_jobs=4,
            eval_metric="mae",
            early_stopping_rounds=20,
            random_state=0,
        ),
        "Tuned XGBoost",
    ],
]
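The section stops at the model list, so here is a minimal sketch of the evaluation loop it implies, reusing ajout_score from above (assumption: the tuned XGBoost entry sets early_stopping_rounds, so it needs an eval_set at fit time; the test split is reused as validation purely for brevity):

# Fit and score every model in the list with the shared helpers above
for modele, nom_modele in models:
    if getattr(modele, "early_stopping_rounds", None):
        # XGBoost with early stopping needs a validation set during fit
        modele.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    else:
        modele.fit(X_train, y_train.values.ravel())
    y_pred = modele.predict(X_test)
    ajout_score(modele, nom_modele, y_test, y_pred)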