Visualisation

Contents

Visualisation#

Import des outils / jeu de données#

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.config import data_folder
from src.constants import var_categoriques, var_numeriques
from src.utils import init_notebook

# Project helper imported from src.utils; presumably configures the notebook
# session (see its definition for details).
init_notebook()

# Load the cleaned / feature-engineered dataset, using the "ID" column as index.
df = pd.read_csv(
    f"{data_folder}/data-cleaned-feature-engineering.csv",
    sep=",",
    index_col="ID",
    # NOTE(review): parse_dates=True only asks pandas to try parsing the index
    # ("ID"); "Dt_Customer" is not parsed as a date by this call. Confirm
    # whether parse_dates=["Dt_Customer"] was intended.
    parse_dates=True,
)

# Preview the first rows.
df.head()

	Year_Birth	Education	Marital_Status	Income	Kidhome	Teenhome	Dt_Customer	Recency	MntWines	MntFruits	...	NumWebVisitsMonth	AcceptedCmp3	AcceptedCmp4	AcceptedCmp5	AcceptedCmp1	AcceptedCmp2	Response	NbAcceptedCampaigns	HasAcceptedCampaigns	NbChildren
ID
5524	1957	Graduation	Single	58138.0	0	0	2012-09-04	58	635	88	...	7	0	0	0	0	0	1	0	False	0
2174	1954	Graduation	Single	46344.0	1	1	2014-03-08	38	11	1	...	5	0	0	0	0	0	0	0	False	2
4141	1965	Graduation	Together	71613.0	0	0	2013-08-21	26	426	49	...	4	0	0	0	0	0	0	0	False	0
6182	1984	Graduation	Together	26646.0	1	0	2014-02-10	26	11	4	...	6	0	0	0	0	0	0	0	False	1
5324	1981	PhD	Married	58293.0	1	0	2014-01-19	94	173	43	...	5	0	0	0	0	0	0	0	False	1

5 rows × 28 columns

Variables globales#

# Extra columns treated as categorical, on top of the list from src.constants.
var_categoriques_extra = [
    "NbAcceptedCampaigns",
    "HasAcceptedCampaigns",
    "NbChildren",
]

# Full list of categorical columns used by the plots below.
# TODO: persist all these lists in a pd.Series and read them back.
var_categoriques_fe = var_categoriques + var_categoriques_extra

# Cast through str first, then to "category", so every listed column ends up
# with string-valued categories.
categorical_as_text = df[var_categoriques_fe].astype(str)
df[var_categoriques_fe] = categorical_as_text.astype("category")

Visualisation#

Relation avec la variable cible (Response)#

# For each numeric variable, draw two panels side by side:
#   - left: box plot of the variable grouped by "Response";
#   - right: histogram (with KDE) coloured by "Response", each group being
#     normalised independently (common_norm=False) so the shapes are comparable.
for var in var_numeriques:
    _, (box_ax, hist_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))

    sns.boxplot(data=df, x=var, y="Response", width=0.25, ax=box_ax)
    sns.histplot(
        data=df,
        x=var,
        hue="Response",
        stat="probability",
        common_norm=False,
        kde=True,
        ax=hist_ax,
    )

    plt.show()

../_images/c7043d0a1b483fd7199014c2aaca2446018de3178671e53b6b52d4c5a61e7309.png

../_images/8c452f489391df813e5c28636e7af16471f0cd9497b2f2bad4dde4363a98373c.png

../_images/07f010ae7a24a07e4d5badb16c6eee90c7e4b1a46da76bf11494773a16a5e86e.png

../_images/e760943a07ce65ccda7c56ab7b59a12c8d4455230da908df7fe896b1665d22b8.png

../_images/f3d6f0b8901d5f9561379a9a414619709bf2b1c9b7958f36e3f22660c4764398.png

../_images/0bf3d986b8ba4715d9f95fcd60ab108fd76ae9ccba99bcda74b4116119d2fd7b.png

../_images/febe8043ef5e583431d819902738f8101004bd00c7d295d2b68f81b894713567.png

../_images/f3c550c2bc0fbf750f50e66ac9bd68a948b7baf251517368c3bb6e0e918c0cdb.png

../_images/581ff4f6ed9dbc8541d0e0bf139f2eb408b4da1d15406f5ad4c55a919ad6c0df.png

../_images/99e6569e7d34f0e669a7a35b4f47144047471d183bebfe4da8e1fc4db0e7830a.png

../_images/120c0726c5a9f98659987885d3fa0c99208209c68b718574450486ca1a2c90dc.png

../_images/344e94c1f0be66909305fbfcc52072c48ad035ca5547cfff77ce7df7df6ba33a.png

../_images/21b9d4ab1f166466809973962a1b36f0fe17015f6c152d526fab9b8dbf905cc0.png

../_images/f51e46e29bbac8f780a2976e4eac7affc877647236da2059aabf07676304bb2f.png

# For each categorical variable, draw two dodged count histograms:
#   - left: the variable on the x axis, bars coloured by "Response";
#   - right: "Response" on the x axis, bars coloured by the variable.
for var in var_categoriques_fe:
    _, (by_var_ax, by_response_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))

    sns.histplot(
        data=df, x=var, hue="Response", multiple="dodge", shrink=0.5, ax=by_var_ax
    )
    sns.histplot(
        data=df, x="Response", hue=var, multiple="dodge", shrink=0.5, ax=by_response_ax
    )

    plt.show()

../_images/13a18c2eedcb93adc4f4a1ac4b324fcf35301bb03f7293593de8c4edd7c6a698.png

../_images/407b6a36f7c0067f8e8e7ed3ad7980ed3f178b3e83cd4e704e53dd582eeefd2c.png

../_images/996d8052856da0c847e56345aaf3f471c18704a1bc5e0362f58ec8bd8b7b6bf4.png

../_images/79b1a9b09b63e6a2b93bc22c6a6d11c074444c704506a449bd39d0c388800b1c.png

../_images/ad25ee246f9f1fdc9d90945c7effaa17030ba463cc2f93084778490862948a03.png

../_images/fe1a308275a5bcbc5e6ab9462b7e18ff2097171ec16ba2736e600191bae39d25.png

../_images/4f63054074da4dfc20d45b46926e3c2deb9027d273c3da3227d217a08e638b38.png

../_images/81a395229e0b81670eb59b185ed65066f8b30939810804fd2d55833ae1ce9fb9.png

../_images/9fa8d165ffd05b58cfbd9b933e33615b524eddec471aaf5a70573e0ae984c3d0.png

../_images/043d7446743bd79f776a7ca02d1c9e3db62f3d643d507196d91f8e200374ae5c.png

../_images/37ed07aee6dfdd6f27f8d77a46b5ade2a2f043471899c8e40268be9f8acdd231.png

../_images/e6df8888ebaa127319df70df9dfc315e619a5aabb0b029f5cdf51695059444a2.png

../_images/ef04807d853de21a736d6e517bda047a4d514b3f56d1a3e6fff5d9752b5062a3.png

Graphiques supplémentaires#

# Income against the number of accepted campaigns:
# box plot on the left, per-group normalised histogram with KDE on the right.
_, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

sns.boxplot(data=df, x="NbAcceptedCampaigns", y="Income", ax=ax[0])
ax[0].set_title("Revenu en fonction du nombre de campagnes acceptées")

# Kept as the last expression so the cell still displays the returned Axes.
sns.histplot(
    data=df,
    x="Income",
    hue="NbAcceptedCampaigns",
    stat="probability",
    common_norm=False,
    kde=True,
    ax=ax[1],
)

<Axes: xlabel='Income', ylabel='Probability'>

../_images/d5e402b6da9f83aa451f1048ac461d52bf97da4ca95a46d79a816512198fdad2.png

# Income against the "HasAcceptedCampaigns" column:
# box plot on the left, per-group normalised histogram with KDE on the right.
_, ax = plt.subplots(1, 2, figsize=(12, 4))

# The previous title was copied from the "NbAcceptedCampaigns" figure; this
# plot groups by "HasAcceptedCampaigns", so the title now names that column.
ax[0].set_title("Revenu en fonction de HasAcceptedCampaigns")

sns.boxplot(y=df["Income"], x=df["HasAcceptedCampaigns"], ax=ax[0])
sns.histplot(
    df,
    x="Income",
    hue="HasAcceptedCampaigns",
    kde=True,
    stat="probability",
    common_norm=False,  # normalise each hue group on its own
    ax=ax[1],
)

<Axes: xlabel='Income', ylabel='Probability'>

../_images/c0d81f5baf662ff7bf9e0b206354a346d2e0e363e1d1582f71ddface2a7fd624.png

todo#

# Count of rows per "Education" level, with side-by-side bars for each
# "HasAcceptedCampaigns" value.
sns.histplot(
    df,
    x="Education",
    hue="HasAcceptedCampaigns",
    shrink=0.8,
    multiple="dodge",
)

<Axes: xlabel='Education', ylabel='Count'>

../_images/8172ebe26820bd32ebe734e1e69dd311dffead7f0b2f7af57bcb2fc5b615a630.png

# Count of rows per "NbChildren" value, with side-by-side bars for each
# "Response" value; discrete=True gives one bin per distinct value.
sns.histplot(
    df,
    x="NbChildren",
    hue="Response",
    discrete=True,
    shrink=0.4,
    multiple="dodge",
)

<Axes: xlabel='NbChildren', ylabel='Count'>

../_images/143047e806b74906a7c629ff2f02a004bf5461aeaa0ca31ff8695c56065b920e.png

# Count of rows per "NbChildren" value, with side-by-side bars for each
# "HasAcceptedCampaigns" value; discrete=True gives one bin per distinct value.
sns.histplot(
    df,
    x="NbChildren",
    hue="HasAcceptedCampaigns",
    discrete=True,
    shrink=0.4,
    multiple="dodge",
)

<Axes: xlabel='NbChildren', ylabel='Count'>

../_images/1a73095471288f7de2ee3ae62054e8dae1f2763d458142015d719f21db10d341.png

Matrice de corrélation#

# Correlation heatmap showing only strong relationships (|r| > 0.5).
# The matrix is computed once and explicitly restricted to numeric columns:
# relying on the implicit default emits a FutureWarning on pandas 1.5 and
# raises on later versions when non-numeric columns are present.
corr = df.corr(numeric_only=True)

plt.figure(figsize=(12, 12))
sns.heatmap(
    # Cells with |r| <= 0.5 become NaN and are left blank by the heatmap.
    corr.where(corr.abs() > 0.5),
    annot=True,
    cmap="BrBG",
    linewidths=0.5,
    vmax=1,
    vmin=-1,
)

/tmp/ipykernel_2065/1235220098.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  df.corr()[df.corr().abs() > 0.5],

<Axes: >

../_images/5d4d83b1d4e69d76ae1677bcf01247d72c01c017439f7d909442c45aba4cdd11.png

todo#

# TODO: example of a plot spread over several columns (one panel per "Kidhome" value)
# sns.displot(
#     data=df,
#     x="Marital_Status",
#     hue="Response",
#     col="Kidhome",
#     multiple="dodge",
#     shrink=0.8,
# )