Analyse factorielle#

Import des outils / jeu de données#

 1import matplotlib.pyplot as plt
 2import pandas as pd
 3import prince
 4import seaborn as sns
 5from mlxtend.plotting import plot_pca_correlation_graph
 6from sklearn.compose import ColumnTransformer
 7from sklearn.decomposition import PCA
 8from sklearn.preprocessing import StandardScaler
 9
10from src.config import data_folder, seed
11from src.constants import var_categoriques, var_numeriques
12from src.utils import init_notebook
# Project-specific notebook setup (helper imported from src.utils; what it
# configures is not visible from this notebook).
init_notebook()

# Load the cleaned, feature-engineered dataset, indexed by the "ID" column.
# `parse_dates=True` was removed: as a boolean it only asks pandas to parse
# the *index* as dates, and the index here is an integer identifier, so the
# option could only succeed by accident. Date columns, if any are needed,
# must be listed explicitly (parse_dates=[...]).
df = pd.read_csv(
    f"{data_folder}/data-cleaned-feature-engineering.csv",
    sep=",",
    index_col="ID",
)

Variables globales#

# Engineered variables treated as categorical, in addition to the
# project-wide list imported from src.constants.
var_categoriques_extra = [
    "NbAcceptedCampaigns",
    "HasAcceptedCampaigns",
    "NbChildren",
]

# Full list of categorical variables: project list first, extras last.
var_categoriques_fe = [*var_categoriques, *var_categoriques_extra]

Analyse multivariée#

Analyse en Composantes Principales (ACP)#

# Standardise the numeric variables only: the transformer is given just the
# `var_numeriques` columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), var_numeriques),
    ],
)

# Wrap the transformed array in a labelled DataFrame.
# The index of `df` is kept so that rows stay aligned with `df` and with any
# table derived from it; without it the frame silently gets a RangeIndex.
df_centre_reduit = pd.DataFrame(
    preprocessor.fit_transform(df),
    index=df.index,
    columns=df[var_numeriques].columns,
)

# PCA keeping every component (no n_components given); seeded for
# reproducibility.
acp = PCA(random_state=seed)
acp.fit(df_centre_reduit)
PCA(random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Share of the total variance explained by each principal component,
# in component order.
variance_expliquee = pd.Series(data=acp.explained_variance_ratio_)
variance_expliquee
0     0.412215
1     0.112204
2     0.078024
3     0.071339
4     0.057670
5     0.048930
6     0.044279
7     0.037022
8     0.033142
9     0.028891
10    0.025957
11    0.018987
12    0.018062
13    0.013277
dtype: float64
# Horizontal bar chart of the explained-variance ratios (one bar per
# component).
variance_expliquee.plot(kind="barh")
<Axes: >
../_images/1d5a8214a2844303571a1040f96516f336482c8923edefab7aae6eebbc93fd04.png
# Coordinates of the individuals on the principal axes.
# `acp` has already been fitted on `df_centre_reduit`, so the data are only
# projected here (transform) instead of fitting the PCA a second time.
coordonnees_acp = acp.transform(df_centre_reduit)

# One column per component, named ACP1..ACPk, indexed like `df`.
composantes_principales = pd.DataFrame(
    coordonnees_acp,
    index=df.index,
    columns=[f"ACP{i+1}" for i in range(coordonnees_acp.shape[1])],
)
composantes_principales.head()
ACP1 ACP2 ACP3 ACP4 ACP5 ACP6 ACP7 ACP8 ACP9 ACP10 ACP11 ACP12 ACP13 ACP14
ID
5524 3.677574 0.709630 0.726633 0.528046 -1.182996 1.306460 0.193055 2.504792 -0.047630 0.556204 0.381602 -0.026330 -0.831932 -0.118034
2174 -2.024213 -0.621400 -1.365232 -0.600591 -0.685700 0.573463 0.329938 0.081125 0.179518 -0.151067 0.019145 -0.320555 0.072530 0.096130
4141 1.746875 0.152159 -0.131371 -0.971172 -0.509112 -0.986865 -0.950470 -0.534338 -0.504519 0.580976 1.127926 -0.019197 -0.006230 0.247519
6182 -2.384421 -0.746451 0.977139 -0.616917 0.506919 -0.077598 -0.192994 -0.336974 -0.035834 0.130339 0.049029 0.148129 -0.085955 -0.295272
5324 -0.000948 0.499014 0.578226 1.686209 0.749782 0.831949 -0.595007 -0.619262 0.027489 -0.355356 0.727407 -0.405869 -0.187491 0.173714
# Two side-by-side projections of the individuals, coloured by the
# "Response" column of `df`: axes 1-2 on the left, axes 3-4 on the right.
_, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Only the left panel carries the title.
ax[0].set_title("Projection des individus sur les composantes principales")

options_communes = {"data": composantes_principales, "hue": df["Response"]}
sns.scatterplot(x="ACP1", y="ACP2", ax=ax[0], **options_communes)
sns.scatterplot(x="ACP3", y="ACP4", ax=ax[1], **options_communes)
<Axes: xlabel='ACP3', ylabel='ACP4'>
../_images/c3b9c44e95d1d0f95501caf3f19a78c096c08e8b7291e5d77be7384d4af887b5.png

Cercle de corrélations#

# Correlation circle for the first two principal axes. Only the first
# `nb_axes` components and their variances are passed, to match `dimensions`.
nb_axes = 2
plot_pca_correlation_graph(
    df_centre_reduit,
    df_centre_reduit.columns,
    dimensions=(1, 2),
    X_pca=composantes_principales.iloc[:, :nb_axes],
    explained_variance=acp.explained_variance_[:nb_axes],
)
(<Figure size 600x600 with 1 Axes>,
                         Dim 1     Dim 2
 Year_Birth          -0.165705 -0.261059
 Income               0.854372 -0.032168
 Recency              0.010637  0.004087
 MntWines             0.751883  0.285810
 MntFruits            0.708341 -0.175398
 MntMeatProducts      0.798662 -0.175350
 MntFishProducts      0.732041 -0.180784
 MntSweetProducts     0.706228 -0.140825
 MntGoldProds         0.575925  0.238082
 NumDealsPurchases   -0.095773  0.772308
 NumWebPurchases      0.549861  0.602148
 NumCatalogPurchases  0.815305  0.018713
 NumStorePurchases    0.746004  0.225954
 NumWebVisitsMonth   -0.639886  0.488683)
../_images/156b4a69db0c2a778be1371b097509aa3660042f2b31dc2a4f01db18b246ec7f.png
# Correlation circle for axes 3 and 4. The first four components are
# supplied; the returned figure is discarded and the correlation table
# is kept.
nb_axes = 4
_, correlation_matrix = plot_pca_correlation_graph(
    df_centre_reduit,
    df_centre_reduit.columns,
    dimensions=(3, 4),
    X_pca=composantes_principales.iloc[:, :nb_axes],
    explained_variance=acp.explained_variance_[:nb_axes],
)
../_images/fce45b39eaafe461e89135950a3bd9f322b40002247f8e100c2f9000729e37b0.png
# Annotated heatmap of `correlation_matrix`, with the colour scale pinned
# to [-1, 1].
plt.figure(figsize=(8, 6))
options_heatmap = {
    "annot": True,
    "cmap": "BrBG",
    "linewidths": 0.5,
    "vmin": -1,
    "vmax": 1,
}
sns.heatmap(correlation_matrix, **options_heatmap)
<Axes: >
../_images/5e9b808bf095d2b522b255dde1ed04741a93064331e496c27386dbb7255dd824.png

Tableau. Interprétation des 4 premiers axes de l’ACP

Axe

Interprétation

1

Richesse (revenu)
Achats dans toutes les catégories
Achats en catalogue et en magasin physique
Peu de visites sur le site Internet

2

Achats en promotion

3

Année de naissance

4

Nombre de jours depuis le dernier achat (Recency)

Analyse Factorielle des Correspondances (AFC)#

Kidhome vs Teenhome#

# Contingency table: Kidhome in rows, Teenhome in columns.
table_contingence = pd.crosstab(index=df["Kidhome"], columns=df["Teenhome"])
table_contingence
Teenhome 0 1 2
Kidhome
0 575 577 28
1 464 343 22
2 17 26 0
# Correspondence analysis of the contingency table (seeded), followed by
# the eigenvalue / explained-variance summary.
ca = prince.CA(random_state=seed).fit(table_contingence)
ca.eigenvalues_summary
eigenvalue % of variance % of variance (cumulative)
component
0 0.007 96.68% 96.68%
1 0.000 3.32% 100.00%
# Map of the contingency table's rows and columns on the fitted CA axes.
carte_afc = ca.plot(table_contingence)
carte_afc

Statut marital et niveau d’éducation#

# Contingency table: Marital_Status in rows, Education in columns.
table_contingence = pd.crosstab(
    index=df["Marital_Status"],
    columns=df["Education"],
)
table_contingence
Education 2n Cycle Basic Graduation Master PhD
Marital_Status
Divorced 21 1 113 33 47
Married 71 18 396 131 179
Single 37 18 234 69 93
Together 53 11 254 95 107
Widow 5 1 31 12 22
# Correspondence analysis of the marital-status / education table (seeded),
# followed by the eigenvalue / explained-variance summary.
ca = prince.CA(random_state=seed).fit(table_contingence)
ca.eigenvalues_summary
eigenvalue % of variance % of variance (cumulative)
component
0 0.005 60.05% 60.05%
1 0.002 26.34% 86.39%
# Map of the contingency table's rows and columns on the fitted CA axes.
# TODO: interpret this plot.
carte_afc = ca.plot(table_contingence)
carte_afc

Analyse des Correspondances Multiples (ACM)#

Variables qualitatives uniquement#

# Store the HasAcceptedCampaigns flag as integers before the MCA
# (presumably it is read back from the CSV as booleans; to be confirmed).
df["HasAcceptedCampaigns"] = df["HasAcceptedCampaigns"].astype(int)

# Categorical variables (project list + engineered extras) fed to the MCA.
donnees_qualitatives = df[var_categoriques_fe]

# Multiple correspondence analysis with as many components as input
# variables, seeded for reproducibility.
mca = prince.MCA(
    n_components=donnees_qualitatives.shape[1],
    random_state=seed,
).fit(donnees_qualitatives)

# Row coordinates of the individuals, with columns renamed ACM1..ACMk.
composantes_acm = mca.row_coordinates(donnees_qualitatives)
composantes_acm.columns = [
    f"ACM{i+1}" for i, _ in enumerate(composantes_acm.columns)
]

mca.plot(donnees_qualitatives)

Tableau. Interprétation des 2 premiers axes de l’ACM

Axe

Interprétation

1

Nombre de campagnes acceptées
Peu d’enfants

2

Niveau d’éducation faible

# Same MCA map on components 2 and 3. prince seems to number components
# from 0 (see the eigenvalue summaries), so these are presumably the third
# and fourth axes; to be confirmed.
donnees_qualitatives = df[var_categoriques_fe]
mca.plot(donnees_qualitatives, x_component=2, y_component=3)

Avec variables quantitatives en supplémentaire#

# MCA map drawn from the categorical and the numeric columns together.
# NOTE(review): `mca` was fitted on the categorical columns only; confirm
# that prince really treats the extra numeric columns as supplementary
# variables when they are passed to `plot`.
# TODO: interpret this plot.
colonnes_acm_et_numeriques = var_categoriques_fe + var_numeriques
mca.plot(df[colonnes_acm_et_numeriques])

Essai sans Education#

# Trial run: same categorical variables minus "Education".
# Note that this rebinds `mca` to the new model.
donnees_sans_education = df[var_categoriques_fe].drop(columns=["Education"])

mca = prince.MCA(random_state=seed).fit(donnees_sans_education)

mca.plot(donnees_sans_education)

Sauvegarde des données#

# Persist the PCA and MCA coordinates of the individuals as CSV files in
# the project data folder.
composantes_principales.to_csv(path_or_buf=f"{data_folder}/composantes_acp.csv")
composantes_acm.to_csv(path_or_buf=f"{data_folder}/composantes_acm.csv")