Visualisation#

Import des outils / jeu de données#

1import matplotlib.pyplot as plt
2import pandas as pd
3import seaborn as sns
4
5from src.config import data_folder
6from src.constants import var_categoriques, var_numeriques
7from src.utils import init_notebook
# Project helper imported from src.utils; presumably applies notebook-wide
# settings -- NOTE(review): confirm against its definition.
init_notebook()

# Load the cleaned, feature-engineered dataset, using the "ID" column as index.
#
# `parse_dates=True` only asks pandas to try parsing the *index* as dates,
# which is the "ID" column here, so "Dt_Customer" was never parsed as a date.
# Naming the column explicitly makes it a proper datetime column.
df = pd.read_csv(
    f"{data_folder}/data-cleaned-feature-engineering.csv",
    sep=",",
    index_col="ID",
    parse_dates=["Dt_Customer"],
)

# Preview the first rows of the dataset.
df.head()
Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines MntFruits ... NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Response NbAcceptedCampaigns HasAcceptedCampaigns NbChildren
ID
5524 1957 Graduation Single 58138.0 0 0 2012-09-04 58 635 88 ... 7 0 0 0 0 0 1 0 False 0
2174 1954 Graduation Single 46344.0 1 1 2014-03-08 38 11 1 ... 5 0 0 0 0 0 0 0 False 2
4141 1965 Graduation Together 71613.0 0 0 2013-08-21 26 426 49 ... 4 0 0 0 0 0 0 0 False 0
6182 1984 Graduation Together 26646.0 1 0 2014-02-10 26 11 4 ... 6 0 0 0 0 0 0 0 False 1
5324 1981 PhD Married 58293.0 1 0 2014-01-19 94 173 43 ... 5 0 0 0 0 0 0 0 False 1

5 rows × 28 columns

Variables globales#

# Columns added during feature engineering that are handled as categorical.
var_categoriques_extra = [
    "NbAcceptedCampaigns",
    "HasAcceptedCampaigns",
    "NbChildren",
]

# Full list of categorical columns: the project-level list plus the extras.
# TODO: store all of these lists in a pd.Series and load them from there.
var_categoriques_fe = [*var_categoriques, *var_categoriques_extra]

# Convert every categorical column to its string form first, then to the
# pandas "category" dtype.
df[var_categoriques_fe] = (
    df[var_categoriques_fe]
    .astype(str)
    .astype("category")
)

Visualisation#

Relation avec la variable cible (Response)#

 1for var in var_numeriques:
 2    _, ax = plt.subplots(1, 2, figsize=(10, 3))
 3    sns.boxplot(df, x=df[var], y=df["Response"], width=0.25, ax=ax[0])
 4    sns.histplot(
 5        df,
 6        x=df[var],
 7        kde=True,
 8        ax=ax[1],
 9        hue=df["Response"],
10        stat="probability",
11        common_norm=False,
12    )
13    plt.show()
../_images/c7043d0a1b483fd7199014c2aaca2446018de3178671e53b6b52d4c5a61e7309.png ../_images/8c452f489391df813e5c28636e7af16471f0cd9497b2f2bad4dde4363a98373c.png ../_images/07f010ae7a24a07e4d5badb16c6eee90c7e4b1a46da76bf11494773a16a5e86e.png ../_images/e760943a07ce65ccda7c56ab7b59a12c8d4455230da908df7fe896b1665d22b8.png ../_images/f3d6f0b8901d5f9561379a9a414619709bf2b1c9b7958f36e3f22660c4764398.png ../_images/0bf3d986b8ba4715d9f95fcd60ab108fd76ae9ccba99bcda74b4116119d2fd7b.png ../_images/febe8043ef5e583431d819902738f8101004bd00c7d295d2b68f81b894713567.png ../_images/f3c550c2bc0fbf750f50e66ac9bd68a948b7baf251517368c3bb6e0e918c0cdb.png ../_images/581ff4f6ed9dbc8541d0e0bf139f2eb408b4da1d15406f5ad4c55a919ad6c0df.png ../_images/99e6569e7d34f0e669a7a35b4f47144047471d183bebfe4da8e1fc4db0e7830a.png ../_images/120c0726c5a9f98659987885d3fa0c99208209c68b718574450486ca1a2c90dc.png ../_images/344e94c1f0be66909305fbfcc52072c48ad035ca5547cfff77ce7df7df6ba33a.png ../_images/21b9d4ab1f166466809973962a1b36f0fe17015f6c152d526fab9b8dbf905cc0.png ../_images/f51e46e29bbac8f780a2976e4eac7affc877647236da2059aabf07676304bb2f.png
 1for var in var_categoriques_fe:
 2    _, ax = plt.subplots(1, 2, figsize=(10, 3))
 3
 4    sns.histplot(
 5        df,
 6        x=df[var],
 7        hue=df["Response"],
 8        multiple="dodge",
 9        shrink=0.5,
10        ax=ax[0],
11    )
12    sns.histplot(
13        df,
14        hue=df[var],
15        x=df["Response"],
16        multiple="dodge",
17        shrink=0.5,
18        ax=ax[1],
19    )
20
21    plt.show()
../_images/13a18c2eedcb93adc4f4a1ac4b324fcf35301bb03f7293593de8c4edd7c6a698.png ../_images/407b6a36f7c0067f8e8e7ed3ad7980ed3f178b3e83cd4e704e53dd582eeefd2c.png ../_images/996d8052856da0c847e56345aaf3f471c18704a1bc5e0362f58ec8bd8b7b6bf4.png ../_images/79b1a9b09b63e6a2b93bc22c6a6d11c074444c704506a449bd39d0c388800b1c.png ../_images/ad25ee246f9f1fdc9d90945c7effaa17030ba463cc2f93084778490862948a03.png ../_images/fe1a308275a5bcbc5e6ab9462b7e18ff2097171ec16ba2736e600191bae39d25.png ../_images/4f63054074da4dfc20d45b46926e3c2deb9027d273c3da3227d217a08e638b38.png ../_images/81a395229e0b81670eb59b185ed65066f8b30939810804fd2d55833ae1ce9fb9.png ../_images/9fa8d165ffd05b58cfbd9b933e33615b524eddec471aaf5a70573e0ae984c3d0.png ../_images/043d7446743bd79f776a7ca02d1c9e3db62f3d643d507196d91f8e200374ae5c.png ../_images/37ed07aee6dfdd6f27f8d77a46b5ade2a2f043471899c8e40268be9f8acdd231.png ../_images/e6df8888ebaa127319df70df9dfc315e619a5aabb0b029f5cdf51695059444a2.png ../_images/ef04807d853de21a736d6e517bda047a4d514b3f56d1a3e6fff5d9752b5062a3.png

Graphiques supplémentaires#

 1_, ax = plt.subplots(1, 2, figsize=(12, 4))
 2
 3ax[0].set_title("Revenu en fonction du nombre de campagnes acceptées")
 4
 5sns.boxplot(y=df["Income"], x=df["NbAcceptedCampaigns"], ax=ax[0])
 6sns.histplot(
 7    df,
 8    x="Income",
 9    hue="NbAcceptedCampaigns",
10    kde=True,
11    stat="probability",
12    common_norm=False,
13    ax=ax[1],
14)
<Axes: xlabel='Income', ylabel='Probability'>
../_images/d5e402b6da9f83aa451f1048ac461d52bf97da4ca95a46d79a816512198fdad2.png
 1_, ax = plt.subplots(1, 2, figsize=(12, 4))
 2
 3ax[0].set_title("Revenu en fonction du nombre de campagnes acceptées")
 4
 5sns.boxplot(y=df["Income"], x=df["HasAcceptedCampaigns"], ax=ax[0])
 6sns.histplot(
 7    df,
 8    x="Income",
 9    hue="HasAcceptedCampaigns",
10    kde=True,
11    stat="probability",
12    common_norm=False,
13    ax=ax[1],
14)
<Axes: xlabel='Income', ylabel='Probability'>
../_images/c0d81f5baf662ff7bf9e0b206354a346d2e0e363e1d1582f71ddface2a7fd624.png

todo#

# Row counts per "Education" value, with side-by-side bars for each
# "HasAcceptedCampaigns" value.
sns.histplot(
    df,
    x="Education",
    hue="HasAcceptedCampaigns",
    shrink=0.8,
    multiple="dodge",
)
<Axes: xlabel='Education', ylabel='Count'>
../_images/8172ebe26820bd32ebe734e1e69dd311dffead7f0b2f7af57bcb2fc5b615a630.png
# Row counts per "NbChildren" value, with side-by-side bars for each
# "Response" value.
children_response_options = {
    "x": "NbChildren",
    "hue": "Response",
    "discrete": True,
    "multiple": "dodge",
    "shrink": 0.4,
}
sns.histplot(df, **children_response_options)
<Axes: xlabel='NbChildren', ylabel='Count'>
../_images/143047e806b74906a7c629ff2f02a004bf5461aeaa0ca31ff8695c56065b920e.png
# Row counts per "NbChildren" value, with side-by-side bars for each
# "HasAcceptedCampaigns" value.
children_campaign_options = {
    "x": "NbChildren",
    "hue": "HasAcceptedCampaigns",
    "discrete": True,
    "multiple": "dodge",
    "shrink": 0.4,
}
sns.histplot(df, **children_campaign_options)
<Axes: xlabel='NbChildren', ylabel='Count'>
../_images/1a73095471288f7de2ee3ae62054e8dae1f2763d458142015d719f21db10d341.png

Matrice de corrélation#

# Correlation matrix, computed once and restricted to numeric columns.
# Passing `numeric_only=True` explicitly avoids relying on the deprecated
# implicit default (which emits a FutureWarning and is announced to change
# to False).
corr_matrix = df.corr(numeric_only=True)

# Keep only the strong correlations (|r| > 0.5); boolean-frame indexing
# turns every other cell into NaN, which the heatmap leaves blank.
strong_corr = corr_matrix[corr_matrix.abs() > 0.5]

plt.figure(figsize=(12, 12))
sns.heatmap(
    strong_corr,
    annot=True,
    cmap="BrBG",
    linewidths=0.5,
    # Fix the colour scale to the full correlation range.
    vmax=1,
    vmin=-1,
)
/tmp/ipykernel_2065/1235220098.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.corr()[df.corr().abs() > 0.5],
<Axes: >
../_images/5d4d83b1d4e69d76ae1677bcf01247d72c01c017439f7d909442c45aba4cdd11.png

todo#

# Example of a plot faceted over several columns
# sns.displot(
#     data=df,
#     x="Marital_Status",
#     hue="Response",
#     col="Kidhome",
#     multiple="dodge",
#     shrink=0.8,
# )