# Analyse factorielle

## Import des outils / jeu de données

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import prince
import seaborn as sns
from mlxtend.plotting import plot_pca_correlation_graph
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from src.config import data_folder, seed
from src.constants import var_categoriques, var_numeriques
from src.utils import init_notebook

In [None]:
# Project helper imported from src.utils; presumably applies notebook-wide
# display/plot settings — TODO confirm against its definition.
init_notebook()

In [None]:
# Load the cleaned, feature-engineered dataset, using the "ID" column as index.
# NOTE(review): with `index_col` set, `parse_dates=True` asks pandas to try
# parsing the index itself as dates — confirm this is intended for "ID".
chemin_donnees = f"{data_folder}/data-cleaned-feature-engineering.csv"
options_lecture = {"sep": ",", "index_col": "ID", "parse_dates": True}
df = pd.read_csv(chemin_donnees, **options_lecture)

## Variables globales

In [None]:
# Engineered features treated as categorical, on top of the project-wide
# list imported from src.constants.
var_categoriques_extra = [
    "NbAcceptedCampaigns",
    "HasAcceptedCampaigns",
    "NbChildren",
]

# Complete list of categorical columns (base list followed by the extras).
var_categoriques_fe = [*var_categoriques, *var_categoriques_extra]

## Analyse multivariée

## Analyse en Composantes Principales (ACP)

In [None]:
# Standardise (centre and scale) the numeric columns; no other transformer is
# registered, so only `var_numeriques` is handled explicitly.
etape_standardisation = ("scaler", StandardScaler(), var_numeriques)
preprocessor = ColumnTransformer(transformers=[etape_standardisation])

In [None]:
# Standardised numeric variables as a DataFrame.
# The original index of `df` is carried over explicitly: without it the frame
# gets a fresh 0..n-1 RangeIndex and no longer lines up (by label) with `df`
# or with the principal components, which are indexed by `df.index`.
df_centre_reduit = pd.DataFrame(
    preprocessor.fit_transform(df),
    columns=var_numeriques,
    index=df.index,
)

In [None]:
# PCA estimator with `n_components` left at its default; seeded with the
# project-wide `seed` for reproducibility.
acp = PCA(random_state=seed)

In [None]:
# Fit the PCA on the standardised numeric variables.
acp.fit(df_centre_reduit)

In [None]:
# Explained-variance ratio of each fitted component, as a Series with the
# default positional index (0 = first component).
variance_expliquee = pd.Series(acp.explained_variance_ratio_)

In [None]:
# Display the explained-variance ratios.
variance_expliquee

In [None]:
# Horizontal bar chart of the explained-variance ratio per component.
# Title and axis labels are set so the figure can be read on its own; the
# trailing `;` suppresses the textual repr in the cell output.
ax_variance = variance_expliquee.plot.barh()
ax_variance.set(
    title="Variance expliquée par composante principale",
    xlabel="Part de variance expliquée",
    ylabel="Composante (indice)",
);

In [None]:
# Project the individuals onto the principal axes.
# `acp` has already been fitted on `df_centre_reduit`, so only `transform` is
# needed here; calling `fit_transform` again would refit the model for no
# benefit and make this cell silently overwrite the fitted state.
composantes_principales = pd.DataFrame(
    acp.transform(df_centre_reduit),
    index=df.index,
)

In [None]:
# Label the components "ACP1", "ACP2", ... in column order.
nb_composantes = composantes_principales.shape[1]
composantes_principales.columns = [
    f"ACP{rang}" for rang in range(1, nb_composantes + 1)
]

In [None]:
# Preview the first rows of the component scores.
composantes_principales.head()

In [None]:
# Scatter plots of the individuals on the first two factorial planes,
# coloured by the "Response" column of `df`.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# The general title describes both panels, so it belongs to the figure; each
# panel is then labelled with the plane it shows.
fig.suptitle("Projection des individus sur les composantes principales")

plans_factoriels = [("ACP1", "ACP2"), ("ACP3", "ACP4")]
for axe, (abscisse, ordonnee) in zip(ax, plans_factoriels):
    sns.scatterplot(
        data=composantes_principales,
        x=abscisse,
        y=ordonnee,
        hue=df["Response"],
        ax=axe,
    )
    axe.set_title(f"{abscisse} / {ordonnee}")

### Cercle de corrélations

In [None]:
# Correlation circle for components 1 and 2; the scores and explained
# variances are truncated to the components needed by that plane.
plan_1_2 = (1, 2)
plot_pca_correlation_graph(
    df_centre_reduit,
    df_centre_reduit.columns,
    dimensions=plan_1_2,
    X_pca=composantes_principales.iloc[:, : max(plan_1_2)],
    explained_variance=acp.explained_variance_[: max(plan_1_2)],
)

In [None]:
# Correlation circle for components 3 and 4. The second returned value (kept
# as `correlation_matrix`) is reused by the heatmap below; the first is
# discarded.
plan_3_4 = (3, 4)
_, correlation_matrix = plot_pca_correlation_graph(
    df_centre_reduit,
    df_centre_reduit.columns,
    dimensions=plan_3_4,
    X_pca=composantes_principales.iloc[:, : max(plan_3_4)],
    explained_variance=acp.explained_variance_[: max(plan_3_4)],
)

In [None]:
# Annotated heatmap of `correlation_matrix`, on a symmetric [-1, 1] colour
# scale with a diverging palette.
fig_correlations, ax_correlations = plt.subplots(figsize=(8, 6))
options_heatmap = {
    "annot": True,
    "cmap": "BrBG",
    "linewidths": 0.5,
    "vmin": -1,
    "vmax": 1,
}
sns.heatmap(correlation_matrix, ax=ax_correlations, **options_heatmap)

**Tableau.** Interprétation des 4 premiers axes de l'ACP

| Axe | Interprétation                                                                                                                                 |
|:----|:-----------------------------------------------------------------------------------------------------------------------------------------------|
| 1   | Richesse (revenu)<br/>Achats dans toutes les catégories<br/>Achats en catalogue et en magasin physique<br/>Peu de visites sur le site Internet |
| 2   | Achats en promotion                                                                                                                            |
| 3   | Année de naissance                                                                                                                             |
| 4   | Nombre de jours depuis le dernier achat (Recency)                                                                                              |

## Analyse Factorielle des Correspondances (AFC)

### Kidhome vs Teenhome

In [None]:
# Contingency table: "Kidhome" values as rows, "Teenhome" values as columns.
table_contingence = pd.crosstab(index=df["Kidhome"], columns=df["Teenhome"])

In [None]:
# Display the Kidhome x Teenhome contingency table.
table_contingence

In [None]:
# Correspondence analysis of the contingency table; `ca` is bound to the
# object returned by `fit`.
ca = prince.CA(random_state=seed).fit(table_contingence)

In [None]:
# Eigenvalue summary of the fitted correspondence analysis.
ca.eigenvalues_summary

In [None]:
# Plot the fitted correspondence analysis for the contingency table.
ca.plot(table_contingence)

### Statut marital et niveau d'éducation

In [None]:
# Contingency table: "Marital_Status" values as rows, "Education" values as
# columns. This rebinds `table_contingence` from the previous section.
table_contingence = pd.crosstab(
    index=df["Marital_Status"],
    columns=df["Education"],
)

In [None]:
# Display the Marital_Status x Education contingency table.
table_contingence

In [None]:
# Fresh correspondence analysis for the new contingency table; `ca` is bound
# to the object returned by `fit` and replaces the previous section's model.
ca = prince.CA(random_state=seed).fit(table_contingence)

In [None]:
# Eigenvalue summary of the fitted correspondence analysis.
ca.eigenvalues_summary

In [None]:
# Plot the fitted correspondence analysis for the contingency table.
ca.plot(table_contingence)

In [None]:
# TODO: interpret the Marital_Status x Education correspondence analysis.

## Analyse des Correspondances Multiples (ACM)

### Variables qualitatives uniquement

In [None]:
# Cast the "HasAcceptedCampaigns" column to integers.
# NOTE(review): this overwrites the column of `df` in place, so cells above
# that displayed `df` now show a stale dtype; the cast itself is idempotent.
df["HasAcceptedCampaigns"] = df["HasAcceptedCampaigns"].astype(int)

In [None]:
# Multiple correspondence analysis on the categorical columns, requesting as
# many components as there are input columns.
donnees_qualitatives = df[var_categoriques_fe]
mca = prince.MCA(
    random_state=seed,
    n_components=donnees_qualitatives.shape[1],
).fit(donnees_qualitatives)

In [None]:
# Row (individual) coordinates on the MCA axes, labelled "ACM1", "ACM2", ...
composantes_acm = mca.row_coordinates(df[var_categoriques_fe])
nb_axes_acm = composantes_acm.shape[1]
composantes_acm.columns = [f"ACM{rang}" for rang in range(1, nb_axes_acm + 1)]

In [None]:
# Plot the fitted MCA for the categorical columns (default components).
mca.plot(df[var_categoriques_fe])

**Tableau.** Interprétation des 2 premiers axes de l'ACM

| Axe | Interprétation                                  |
|:----|:------------------------------------------------|
| 1   | Nombre de campagnes acceptées<br/>Peu d'enfants |
| 2   | Niveau d'éducation faible                       |

In [None]:
# Same MCA plot, with components 2 and 3 on the x and y axes.
mca.plot(df[var_categoriques_fe], x_component=2, y_component=3)

### Avec variables quantitatives en supplémentaire

In [None]:
# Plot the MCA with the numeric columns appended to the categorical ones.
# NOTE(review): `mca` was fitted on `var_categoriques_fe` only; confirm that
# `plot` treats the extra numeric columns as supplementary variables rather
# than as new categorical inputs.
colonnes_avec_quantitatives = var_categoriques_fe + var_numeriques
mca.plot(df[colonnes_avec_quantitatives])

In [None]:
# TODO: interpret the MCA plot that includes the quantitative variables.

### Essai sans Education

In [None]:
# Trial MCA without the "Education" column.
# The model is stored under its own name instead of rebinding `mca`: the cells
# above (row coordinates, plots) rely on `mca` being the full model, and would
# silently use this reduced one if re-run after this cell. The reduced frame
# is also built once and reused for both fit and plot.
df_sans_education = df[var_categoriques_fe].drop(columns=["Education"])
mca_sans_education = prince.MCA(random_state=seed).fit(df_sans_education)

mca_sans_education.plot(df_sans_education)

## Sauvegarde des données

In [None]:
# Write the PCA and MCA component scores to CSV files in the data folder.
exports = (
    ("composantes_acp.csv", composantes_principales),
    ("composantes_acm.csv", composantes_acm),
)
for nom_fichier, composantes in exports:
    composantes.to_csv(f"{data_folder}/{nom_fichier}")