Experiments#

Importing the tools / dataset#

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import mutual_info_regression  # used by make_mi_scores below
from sklearn.model_selection import train_test_split  # used in the final modelling cell

from src.config import data_folder
from src.constants import var_categoriques, var_numeriques
np.random.seed(0)
sns.set_theme()
df = pd.read_csv(
    f"{data_folder}/data-cleaned-feature-engineering.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)
df_transforme = pd.read_csv(
    f"{data_folder}/data-transformed.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)
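A quick sanity check, not in the original notebook, to confirm both files were parsed with the ID index and the expected columns:

print(df.shape, df_transforme.shape)  # number of rows / columns in each frame
df.head()  # confirm the ID index and the parsed columns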

Global variables#

LABELS = (0, 1)
seed = 0  # assumed value for the random state reused below, consistent with np.random.seed(0) above

Isolation Forest (outlier detection)#

# One-hot encode the features; drop the target and the raw date column
X = pd.get_dummies(df.drop(columns=["Response", "Dt_Customer"]))
y = df[["Response"]].astype(int)
iforest = IsolationForest(random_state=0)
iforest.fit(X)
X.head()
# Distribution of the predictions: -1 = outlier, 1 = inlier
sns.histplot(iforest.predict(X))
X["outlier"] = iforest.predict(X)
plt.title("Outliers (-1) vs Normal (1)")
sns.histplot(data=X, hue="outlier", x="Income", bins=30, kde=True)
# Income distribution restricted to the inliers
sns.histplot(data=X[X["outlier"] == 1], x="Income", bins=30, kde=True)
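If the detected outliers are to be discarded before modelling, a minimal sketch reusing the outlier column stored above could look like this (the names X_clean / y_clean exist only for this sketch):

# Keep only the observations flagged as inliers (prediction == 1)
inlier_mask = X["outlier"] == 1
X_clean = X[inlier_mask].drop(columns=["outlier"])
y_clean = y[inlier_mask]
print(f"{(~inlier_mask).sum()} outliers removed out of {len(X)} rows")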

Hyperparameter optimization#

# TODO
# params = {
#     "max_depth": [3, 6, 10],
#     "learning_rate": [0.01, 0.05, 0.1],
#     "n_estimators": [100, 500, 1000],
#     "colsample_bytree": [0.3, 0.7],
# }
#
# clf = GridSearchCV(
#     estimator=model,
#     param_grid=params,
#     scoring="precision",
#     verbose=1,
# )
#
# clf.fit(X_train, y_train)
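To turn this TODO into something runnable, a minimal sketch of the same grid wired to an XGBoost classifier could look like the following. The choice of classifier, the 80/20 split and the dropped outlier helper column are assumptions for illustration, not the project's actual setup.

from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier

# Assumed setup: reuse the one-hot features, minus the helper "outlier" column added above
X_grid = X.drop(columns=["outlier"], errors="ignore")
X_train, X_test, y_train, y_test = train_test_split(
    X_grid, y.values.ravel(), test_size=0.2, random_state=0
)

params = {
    "max_depth": [3, 6, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [100, 500, 1000],
    "colsample_bytree": [0.3, 0.7],
}

clf = GridSearchCV(
    estimator=XGBClassifier(random_state=0),
    param_grid=params,
    scoring="precision",
    verbose=1,
)
clf.fit(X_train, y_train)
print(clf.best_params_, clf.best_score_)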

Mutual Information#

Without OneHotEncoding#

# Label encoding for categoricals
for colname in df.select_dtypes(["object", "category", "bool"]):
    df[colname], _ = df[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = df.dtypes == int
discrete_features.drop("Response", axis=0, inplace=True)
def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(
        X, y, discrete_features=discrete_features, random_state=seed
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores
mi_scores = make_mi_scores(df.drop(columns=["Response"]), y, discrete_features)
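Before plotting, a quick look at both ends of the ranking (not in the original notebook) helps sanity-check the scores:

mi_scores.head(10)               # features most informative about Response
mi_scores[mi_scores == 0].index  # features carrying no mutual information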
def plot_mi_scores(scores):
    scores = scores.sort_values(ascending=True)
    width = np.arange(len(scores))
    ticks = list(scores.index)
    plt.barh(width, scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")


plt.figure(figsize=(5, 12))
plot_mi_scores(mi_scores)

With OneHotEncoding#

# Label encoding for categoricals
# Note: X still contains the helper "outlier" column added in the Isolation Forest section
for colname in X.select_dtypes(["object", "category", "bool"]):
    X[colname], _ = X[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = X.dtypes == int
mi_scores = make_mi_scores(X, y, discrete_features)
# Reuse the plot_mi_scores helper defined above
plt.figure(figsize=(5, 12))
plot_mi_scores(mi_scores)

Models after MI (with OneHotEncoding)#

# Keep only the features with strictly positive mutual information
positive_mi = mi_scores > 0
cols_to_drop = positive_mi[positive_mi == 0].index
# X_eq / y_eq, models and evaluate_models are assumed to be defined in earlier cells not shown here
X_positive_mi = X_eq.drop(columns=cols_to_drop)
X_train, X_test, y_train, y_test = train_test_split(
    X_positive_mi, y_eq, test_size=0.2, random_state=seed
)
prefix = "positive_mi"
results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)
sorted(results, key=lambda x: x[1], reverse=True)
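For readability, the sorted results can be wrapped in a small table. This assumes every element of results is a (name, score) pair, as suggested by key=lambda x: x[1] above; adjust the columns if evaluate_models returns richer tuples.

results_df = pd.DataFrame(
    sorted(results, key=lambda x: x[1], reverse=True),
    columns=["model", "score"],  # assumed structure of each result tuple
)
results_df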