# load in packages
from itertools import combinations

import matplotlib.pyplot as plt
import seaborn as sns

from test_results import test_results, score
import numpy as np
import pandas as pd
import scipy as sp
import sklearn as sk

from imblearn.over_sampling import SMOTENC, SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


# loading the data:
train_data = pd.read_csv('training.csv')
train_data.head()


# number of clients designated to each group in training set:
train_data.Promotion.value_counts()

Yes    42364
No     42170
Name: Promotion, dtype: int64


# expected probability when randomly designating clients to promotion and non-promotion groups:
mean = 0.5

# creating experiment list (1 = client received the promotion, 0 = control):
exp_list = [1 if x == 'Yes' else 0 for x in train_data.Promotion]

# computing sample mean: the observed share of clients in the promotion group.
# No constant is added to the count: padding it by +0.5 moved the estimate away
# from the null value whenever the promotion group is the larger one, which
# inflated the z score instead of correcting it.
s_mean = np.mean(exp_list)

# computing standard deviation:
s_std = np.std(exp_list)

# computing z score (distance of the observed share from 0.5 in standard errors):
z_im = (s_mean - mean) / (s_std / np.sqrt(len(exp_list)))
print('Z score for the invariant metric is {:.2f}'.format(z_im))

Z score for the invariant metric is 0.67


# computing p-value (two-sided): an imbalance in either direction would mean the
# random assignment is broken, so both tails of the normal distribution count.
# A one-tailed value would approach 1 for a strong imbalance towards control.
# NOTE(review): the conclusion in the message below is fixed text; re-read it
# against the printed p-value whenever the data changes.
p_value_im = 2 * sp.stats.norm.cdf(-abs(z_im))
print('Invariant metric: p-value is {:.2f}, which means we failed to reject null hypothesis.'.format(p_value_im))

Invariant metric: p-value is 0.25, which means we failed to reject null hypothesis.


# group sizes: n1 = experimental (promotion sent), n0 = control
n1 = np.sum(exp_list)
n0 = len(exp_list) - n1

# purchase rates: pooled over everybody, then per group
p_null = train_data['purchase'].mean()
p0 = train_data.loc[train_data['Promotion'] == 'No', 'purchase'].mean()
p1 = train_data.loc[train_data['Promotion'] == 'Yes', 'purchase'].mean()

# standard error of the difference between the two rates under the pooled rate
se = np.sqrt(p_null * (1-p_null) * (1/n0 + 1/n1))

# z score of the observed difference (experimental minus control)
z_em = (p1 - p0) / se
print(f'Z score for the experimental metric is {z_em:.2f}')

Z score for the experimental metric is 12.47


# computing p-value (one-sided, upper tail): norm.cdf(-z_em) is the probability,
# under the null hypothesis, of a z score at least as large as z_em, i.e. of the
# experimental group out-purchasing the control group by this much or more.
p_value_em = sp.stats.norm.cdf(-z_em)
print('Experimental metric: p-value is {:.2f}, which means that we can reject null hypthesis with over 95% of confidence.'.format(p_value_em))

Experimental metric: p-value is 0.00, which means that we can reject null hypthesis with over 95% of confidence.


# general statistical overview:
train_data.describe()


# checking for null values:
train_data.isnull().mean()

ID           0.0
Promotion    0.0
purchase     0.0
V1           0.0
V2           0.0
V3           0.0
V4           0.0
V5           0.0
V6           0.0
V7           0.0
dtype: float64


# purchase count in control group:
train_data[train_data.Promotion == 'No'].purchase.value_counts()

0    41851
1      319
Name: purchase, dtype: int64


# purchase count in experimental group:
train_data[train_data.Promotion == 'Yes'].purchase.value_counts()

0    41643
1      721
Name: purchase, dtype: int64


# Control group: clients that did not receive the promotion.
control = train_data[train_data.Promotion == 'No']
sel_features = ['V%d' % k for k in range(1, 8)] + ['purchase']

# Pairwise correlation of V1-V7 and purchase inside the control group:
contr_corr = control[sel_features].corr()

# Drawing the correlation matrix as an annotated heatmap on a single axes:
fig, ax = plt.subplots(figsize = (10, 10))

sns.heatmap(contr_corr, square = True, cmap = 'RdYlBu', center = 0,
            vmin = -.3, vmax = .3, fmt = '.3g', annot = True)
ax.set_title('Correlation Matrix - Control Group')

fig.show()


# Experimental group: clients that received the promotion.
experimental = train_data[train_data.Promotion == 'Yes']

# Pairwise correlation of the selected columns inside the experimental group:
exper_corr = experimental[sel_features].corr()

# Drawing the correlation matrix as an annotated heatmap on a single axes:
fig, ax = plt.subplots(figsize = (10, 10))

sns.heatmap(exper_corr, square = True, cmap = 'RdYlBu', center = 0,
            vmin = -.3, vmax = .3, fmt = '.3g', annot = True)
ax.set_title('Correlation Matrix - Experimental Group')

fig.show()


# Correlation of every column with `purchase` in each group, side by side, plus
# the relative change (in percent) from the control to the experimental value:
purch_corr = pd.DataFrame({
    'control': contr_corr['purchase'],
    'experimental': exper_corr['purchase'],
})
relative_gap = (purch_corr['control'] - purch_corr['experimental']) / purch_corr['control']
purch_corr['perc_var'] = -(relative_gap)*100
purch_corr


# Defining features (the seven predictor columns used by every model below):
features = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7']

# Features histograms for control group.
# Four figures are drawn: V1-V4 and V5-V7, each once for non-buyers (tomato)
# and once for buyers (darkturquoise). density = True normalises every
# histogram, so the two very differently sized groups can be compared by shape.
print('Histograms for Control Group V1-V7 \nred: no-purchase | blue: purchase')
control[control.purchase == 0].hist(column = features[:4], figsize=(15,2.5), layout = (1, 4), 
                                    density = True, color = 'tomato')

control[control.purchase == 1].hist(column = features[:4], figsize=(15,2.5), layout = (1, 4), 
                                              density = True, color = 'darkturquoise')

control[control.purchase == 0].hist(column = features[4:], figsize=(15,2.5), layout = (1, 3), 
                                    density = True, color = 'tomato')

# Last expression of the cell: its return value (the axes array) is what the
# notebook echoes below the figures.
control[control.purchase == 1].hist(column = features[4:], figsize=(15,2.5), layout = (1, 3), 
                                              density = True, color = 'darkturquoise')

Histograms for Control Group V1-V7 
red: no-purchase | blue: purchase

array([[<AxesSubplot:title={'center':'V5'}>,
        <AxesSubplot:title={'center':'V6'}>,
        <AxesSubplot:title={'center':'V7'}>]], dtype=object)


# Features histograms for experimental group.
# Same layout as for the control group: V1-V4 and V5-V7, each drawn once for
# non-buyers (tomato) and once for buyers (darkturquoise), normalised with
# density = True so the shapes are comparable despite the class imbalance.
print('Histograms for Experimental Group V1-V7 \nred: no-purchase | blue: purchase')
experimental[experimental.purchase == 0].hist(column = features[:4], figsize=(15,2.5), layout = (1, 4), 
                                    density = True, color = 'tomato')

experimental[experimental.purchase == 1].hist(column = features[:4], figsize=(15,2.5), layout = (1, 4), 
                                              density = True, color = 'darkturquoise')

experimental[experimental.purchase == 0].hist(column = features[4:], figsize=(15,2.5), layout = (1, 3), 
                                    density = True, color = 'tomato')

# Last expression of the cell: its return value (the axes array) is what the
# notebook echoes below the figures.
experimental[experimental.purchase == 1].hist(column = features[4:], figsize=(15,2.5), layout = (1, 3), 
                                              density = True, color = 'darkturquoise')

Histograms for Experimental Group V1-V7 
red: no-purchase | blue: purchase

array([[<AxesSubplot:title={'center':'V5'}>,
        <AxesSubplot:title={'center':'V6'}>,
        <AxesSubplot:title={'center':'V7'}>]], dtype=object)


# Defining function to test promotion to all the clients:
def promotion_strategy_all(df):
    '''
    Baseline strategy: send the promotion to every client.

    INPUT
    df - a dataframe with *only* the columns V1 - V7 (same as train_data);
         only its number of rows is used

    OUTPUT
    promotion - np.array holding one 'Yes' per row of df, i.e. every
                individual should receive the promotion; its length is
                df.shape[0]

    Ex:
    INPUT: df

    V1   V2    V3    V4   V5   V6   V7
    2    30   -1.1   1    1    3    2
    3    32   -0.6   2    3    2    2
    2    30    0.13  1    1    4    2

    OUTPUT: promotion

    array(['Yes', 'Yes', 'Yes'])
    '''
    n_clients = df.shape[0]
    return np.array(['Yes'] * n_clients)

# Testing IRR and NIR for this strategy:
test_results(promotion_strategy_all)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0096.

Your nir with this strategy is -1132.20.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?

(0.009593158278250108, -1132.1999999999998)


# Defining function using Correlation for client segmentation:
def promotion_strategy_corr(df):
    '''
    Rule-based strategy: send the promotion to clients with V4 == 2 or
    V5 >= 3.

    INPUT
    df - a dataframe with *only* the columns V1 - V7 (same as train_data);
         only V4 and V5 are read

    OUTPUT
    promotion - np.array with the values 'Yes' or 'No' related to whether
                or not an individual should receive a promotion; its length
                is df.shape[0]

    Ex:
    INPUT: df

    V1   V2    V3    V4   V5   V6   V7
    2    30   -1.1   1    1    3    2
    3    32   -0.6   2    3    2    2
    2    30    0.13  1    4    4    2

    OUTPUT: promotion

    array(['No', 'Yes', 'Yes'])
    indicating the last two users would receive the promotion and
    the first should not.
    '''
    # Evaluate the rule on whole columns at once instead of one .iloc lookup
    # per row and per column; the selection is unchanged.
    selected = (df['V4'].to_numpy() == 2) | (df['V5'].to_numpy() >= 3)

    promotion = np.where(selected, 'Yes', 'No')

    return promotion

# Testing IRR and NIR for this strategy:
test_results(promotion_strategy_corr)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0119.

Your nir with this strategy is -531.40.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?

(0.01191694716124083, -531.4000000000001)


# Selecting training set (experimental group only):
X = experimental[features]
y = experimental['purchase']

# Balancing train set with SMOTE-NC; positions 0, 3, 4, 5, 6 of `features`
# (V1, V4, V5, V6, V7) are declared categorical.
# fit_resample is the current resampling entry point; the older fit_sample
# alias is not available in newer imbalanced-learn releases.
smotenc = SMOTENC(categorical_features = [0, 3, 4, 5, 6], random_state = 101)
X_bal, y_bal = smotenc.fit_resample(X, y)
print(y_bal.value_counts())

# Training Logistic Regression model, using GridSearchCV for best parameters.
# The liblinear solver accepts both 'l1' and 'l2'. The default solver rejects
# 'l1', so with warnings silenced half of the grid failed to fit and was
# dropped from the search without any visible message.
lr = LogisticRegression(solver = 'liblinear')
param = {'penalty': ['l1', 'l2'], 
         'C': [1, 10, 100]}
log_r_cv = GridSearchCV(lr, param_grid = param)
log_r_cv.fit(X_bal, y_bal)

0    41643
1    41643
Name: purchase, dtype: int64

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [1, 10, 100], 'penalty': ['l1', 'l2']})


# Defining function using Logistic Regression for client segmentation:
def promotion_strategy_log(df):
    '''
    Model-based strategy: send the promotion to the clients for which the
    fitted logistic-regression grid search (log_r_cv) predicts class 1.

    INPUT
    df - a dataframe with *only* the columns V1 - V7 (same as train_data)

    OUTPUT
    promotion - np.array with the values 'Yes' or 'No' related to whether
                or not an individual should receive a promotion; one entry
                per row of df

    Ex:
    INPUT: df

    V1   V2    V3    V4   V5   V6   V7
    2    30   -1.1   1    1    3    2
    3    32   -0.6   2    3    2    2
    2    30    0.13  1    1    4    2

    OUTPUT: promotion

    array(['Yes', 'Yes', 'No'])
    indicating the first two users would receive the promotion and
    the last should not.
    '''
    predicted = log_r_cv.predict(df[features])

    # Map the predicted class labels onto the 'Yes' / 'No' answer format.
    return np.array(['Yes' if label == 1 else 'No' for label in predicted])

# Testing IRR and NIR for this strategy:
test_results(promotion_strategy_log)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0145.

Your nir with this strategy is -72.55.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?

(0.014468322486757366, -72.54999999999995)


# Fitting a decision tree on the balanced data, only to rank the features:
tree = DecisionTreeClassifier(random_state = 101)
tree.fit(X_bal, y_bal)

# Printing the features from most to least important, rounded to two decimals:
feature_imp = tree.feature_importances_
for position in np.argsort(-feature_imp):
    print(features[position], round(feature_imp[position], 2))

V3 0.52
V2 0.2
V4 0.09
V5 0.07
V6 0.05
V1 0.04
V7 0.03


# Selecting training set: experimental group, restricted to the four features
# ranked highest by the decision tree.
features_imp = ['V2', 'V3', 'V4', 'V5']
X_imp = experimental[features_imp]
y_imp = experimental['purchase']

# Balancing train set with SMOTE-NC; positions 2 and 3 of `features_imp`
# (V4, V5) are declared categorical.
# fit_resample is the current resampling entry point; the older fit_sample
# alias is not available in newer imbalanced-learn releases.
smotenc_imp = SMOTENC(categorical_features = [2, 3], random_state = 201)
X_ibal, y_ibal = smotenc_imp.fit_resample(X_imp, y_imp)
print(y_ibal.value_counts())

# Training Logistic Regression model, using GridSearchCV for best parameters
# (re-uses the estimator `lr` and the grid `param` defined for the first model):
log_r_cv_imp = GridSearchCV(lr, param_grid = param)
log_r_cv_imp.fit(X_ibal, y_ibal)

0    41643
1    41643
Name: purchase, dtype: int64

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [1, 10, 100], 'penalty': ['l1', 'l2']})


def promotion_strategy_log_imp(df):
    '''
    Model-based strategy: send the promotion to the clients for which the
    logistic regression trained on the reduced feature set (log_r_cv_imp,
    columns listed in features_imp) predicts class 1.

    INPUT
    df - a dataframe with *only* the columns V1 - V7 (same as train_data)

    OUTPUT
    promotion - np.array with the values 'Yes' or 'No' related to whether
                or not an individual should receive a promotion; one entry
                per row of df

    Ex:
    INPUT: df

    V1   V2    V3    V4   V5   V6   V7
    2    30   -1.1   1    1    3    2
    3    32   -0.6   2    3    2    2
    2    30    0.13  1    1    4    2

    OUTPUT: promotion

    array(['Yes', 'Yes', 'No'])
    indicating the first two users would receive the promotion and
    the last should not.
    '''
    predicted = log_r_cv_imp.predict(df[features_imp])

    # Map the predicted class labels onto the 'Yes' / 'No' answer format.
    return np.array(['Yes' if label == 1 else 'No' for label in predicted])

# Testing IRR and NIR for this strategy:
test_results(promotion_strategy_log_imp)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0147.

Your nir with this strategy is -39.60.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?

(0.0147094528569421, -39.59999999999991)


# Selecting training set. An explicit copy is taken because a column is added
# below; assigning into a bare slice of train_data triggers pandas'
# SettingWithCopyWarning and leaves it unclear which object is modified.
data_xgb = train_data[features].copy()

# Transforming labels: 1 only for clients that received the promotion AND
# purchased, 0 for everybody else. Computed on whole columns (index-aligned
# with data_xgb) instead of one .iloc lookup per row.
data_xgb['send_prom'] = ((train_data.Promotion == 'Yes') & (train_data.purchase == 1)).astype(int)
data_xgb.head()


# Applying SMOTE-NC; positions 0, 3, 4, 5, 6 of `features` (V1, V4, V5, V6, V7)
# are declared categorical.
# fit_resample is the current resampling entry point; the older fit_sample
# alias is not available in newer imbalanced-learn releases.
smotenc_xgb = SMOTENC(categorical_features = [0, 3, 4, 5, 6], random_state = 301)
X_xgb, y_xgb = smotenc_xgb.fit_resample(data_xgb[features], data_xgb['send_prom'])
print(y_xgb.value_counts())

# Training XGBClassifier model; the grid search picks the parameter
# combination with the best cross-validated ROC AUC:
xgb = XGBClassifier(objective = 'binary:logistic', seed = 301, verbosity = 0)
param_xgb = {'learning_rate': [0.1, 0.3], 'max_depth': [5, 7], 
             'min_child_weight': [1, 5], 'reg_lambda': [1, 5], 'gamma': [.1, 3, 5]}

xgb_cl = GridSearchCV(xgb, param_grid = param_xgb, scoring = 'roc_auc')
xgb_cl.fit(X_xgb, y_xgb)

0    83813
1    83813
Name: send_prom, dtype: int64

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, seed=301,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=0),
             param_grid={'gamma': [0.1, 3, 5], 'learning_rate': [0.1, 0.3],
                         'max_depth': [5, 7], 'min_child_weight': [1, 5],
                         'reg_lambda': [1, 5]},
             scoring='roc_auc')


# Defining function using XGBClassifier for client segmentation:
def promotion_strategy_xgb(df):
    '''
    Model-based strategy: send the promotion to the clients for which the
    fitted XGBoost grid search (xgb_cl) predicts class 1.

    INPUT
    df - a dataframe with *only* the columns V1 - V7 (same as train_data)

    OUTPUT
    promotion - np.array with the values 'Yes' or 'No' related to whether
                or not an individual should receive a promotion; one entry
                per row of df

    Ex:
    INPUT: df

    V1   V2    V3    V4   V5   V6   V7
    2    30   -1.1   1    1    3    2
    3    32   -0.6   2    3    2    2
    2    30    0.13  1    1    4    2

    OUTPUT: promotion

    array(['Yes', 'Yes', 'No'])
    indicating the first two users would receive the promotion and
    the last should not.
    '''
    predicted = xgb_cl.predict(df[features])

    # Map the predicted class labels onto the 'Yes' / 'No' answer format.
    return np.array(['Yes' if label == 1 else 'No' for label in predicted])

# Testing IRR and NIR for this strategy:
test_results(promotion_strategy_xgb)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0164.

Your nir with this strategy is -1.10.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?

(0.016444154998371867, -1.1000000000000014)


# Selecting data and applying transformations. An explicit copy is taken
# because V2 and V3 are overwritten below; writing into a bare slice of
# `experimental` triggers pandas' SettingWithCopyWarning.
data_xgb2 = experimental[sel_features].copy()

# Transforming V2 and V3 into ordinal bins, evaluated on whole columns.
# np.select returns the choice of the FIRST matching condition, so each
# condition only needs its upper bound.
# V2: 0 outside [20, 40]; 1 = [20, 25); 2 = [25, 30); 3 = [30, 35); 4 = [35, 40]
v2_raw = data_xgb2['V2'].to_numpy()
V2_cat = np.select([(v2_raw < 20) | (v2_raw > 40), v2_raw < 25, v2_raw < 30, v2_raw < 35],
                   [0, 1, 2, 3], default = 4)
data_xgb2['V2'] = V2_cat
# V3: 5 = (.., -1]; 4 = (-1, -0.5]; 3 = (-0.5, 0]; 2 = (0, 0.5]; 1 = (0.5, 1]; 0 = above 1
v3_raw = data_xgb2['V3'].to_numpy()
V3_cat = np.select([v3_raw <= -1, v3_raw <= -0.5, v3_raw <= 0, v3_raw <= 0.5, v3_raw <= 1],
                   [5, 4, 3, 2, 1], default = 0)
data_xgb2['V3'] = V3_cat
data_xgb2.head()


# Applying SMOTE to balance the purchase classes of the binned data.
# fit_resample is the current resampling entry point; the older fit_sample
# alias is not available in newer imbalanced-learn releases.
smote_xgb2 = SMOTE(random_state = 1001)
X_xgb2, y_xgb2 = smote_xgb2.fit_resample(data_xgb2[features], data_xgb2['purchase'])
print(y_xgb2.value_counts())

# Training XGBClassifier model; the grid search picks the parameter
# combination with the best cross-validated ROC AUC.
# Note: `xgb` and `param_xgb` are re-bound here and replace the earlier ones.
xgb = XGBClassifier(objective = 'binary:logistic', seed = 1001, verbosity = 0)
param_xgb = {'learning_rate': [0.1, 0.3], 'max_depth': [3, 5], 
             'min_child_weight': [1, 5], 'reg_lambda': [1, 5], 'gamma': [.1, 3, 5]}

xgb_cl2 = GridSearchCV(xgb, param_grid = param_xgb, scoring = 'roc_auc')
xgb_cl2.fit(X_xgb2, y_xgb2)

0    41643
1    41643
Name: purchase, dtype: int64

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, seed=1001,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=0),
             param_grid={'gamma': [0.1, 3, 5], 'learning_rate': [0.1, 0.3],
                         'max_depth': [3, 5], 'min_child_weight': [1, 5],
                         'reg_lambda': [1, 5]},
             scoring='roc_auc')


# Defining function using XGBClassifier for client segmentation:
def promotion_strategy_xgb2(df):
    '''
    Model-based strategy: bin V2 and V3 the same way as the training data of
    xgb_cl2, then send the promotion to the clients for which xgb_cl2
    predicts class 1.

    INPUT
    df - a dataframe with *only* the columns V1 - V7 (same as train_data);
         it is not modified

    OUTPUT
    promotion - np.array with the values 'Yes' or 'No' related to whether
                or not an individual should receive a promotion; one entry
                per row of df

    Ex:
    INPUT: df

    V1   V2    V3    V4   V5   V6   V7
    2    30   -1.1   1    1    3    2
    3    32   -0.6   2    3    2    2
    2    30    0.13  1    1    4    2

    OUTPUT: promotion

    array(['Yes', 'Yes', 'No'])
    indicating the first two users would receive the promotion and
    the last should not.
    '''
    # Copy first: V2 and V3 are overwritten below, and writing into a bare
    # slice could leak into the caller's dataframe (SettingWithCopyWarning).
    X_test = df[features].copy()

    # np.select returns the choice of the first matching condition, so each
    # condition only needs its upper bound.
    # V2: 0 outside [20, 40]; 1 = [20, 25); 2 = [25, 30); 3 = [30, 35); 4 = [35, 40]
    v2 = X_test['V2'].to_numpy()
    X_test['V2'] = np.select([(v2 < 20) | (v2 > 40), v2 < 25, v2 < 30, v2 < 35],
                             [0, 1, 2, 3], default=4)

    # V3: 5 = (.., -1]; 4 = (-1, -0.5]; 3 = (-0.5, 0]; 2 = (0, 0.5]; 1 = (0.5, 1]; 0 = above 1
    v3 = X_test['V3'].to_numpy()
    X_test['V3'] = np.select([v3 <= -1, v3 <= -0.5, v3 <= 0, v3 <= 0.5, v3 <= 1],
                             [5, 4, 3, 2, 1], default=0)

    y_pred = xgb_cl2.predict(X_test)

    # Map the predicted class labels onto the 'Yes' / 'No' answer format.
    promotion = np.where(np.asarray(y_pred) == 1, 'Yes', 'No')

    return promotion

# Testing IRR and NIR for this strategy:
test_results(promotion_strategy_xgb2)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0178.

Your nir with this strategy is 153.70.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?

(0.017830221770241402, 153.70000000000005)


# Selecting data and applying transformations. An explicit copy is taken
# because V2 and V3 are overwritten below; writing into a bare slice of
# `experimental` triggers pandas' SettingWithCopyWarning.
data_xgb3 = experimental[sel_features].copy()

# V2: 0 when outside [15, 45], 1 otherwise (evaluated on the whole column).
data_xgb3['V2'] = np.where((data_xgb3.V2 < 15) | (data_xgb3.V2 > 45), 0, 1)

# V3: -1 = above 1; 0 = [0, 1]; 1 = [-1, 0); 2 = below -1.
# np.select returns the choice of the first matching condition.
v3_raw = data_xgb3['V3'].to_numpy()
data_xgb3['V3'] = np.select([v3_raw > 1, v3_raw >= 0, v3_raw >= -1],
                            [-1, 0, 1], default = 2)
data_xgb3.head()


# Applying SMOTE to balance the purchase classes of the binned data.
# fit_resample is the current resampling entry point; the older fit_sample
# alias is not available in newer imbalanced-learn releases.
smotenc_xgb3 = SMOTE(random_state = 401)
X_xgb3, y_xgb3 = smotenc_xgb3.fit_resample(data_xgb3[features], data_xgb3['purchase'])
print(y_xgb3.value_counts())

# Training XGBClassifier model; the grid search picks the parameter
# combination with the best cross-validated ROC AUC.
# Note: `xgb` and `param_xgb` are re-bound here and replace the earlier ones.
xgb = XGBClassifier(objective = 'binary:logistic', seed = 401, verbosity = 0)
param_xgb = {'learning_rate': [0.1, 0.3], 'max_depth': [3, 5], 
             'min_child_weight': [1, 5], 'reg_lambda': [1, 5], 'gamma': [.1, 3, 5]}

xgb_cl3 = GridSearchCV(xgb, param_grid = param_xgb, scoring = 'roc_auc')
xgb_cl3.fit(X_xgb3, y_xgb3)

0    41643
1    41643
Name: purchase, dtype: int64

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, seed=401,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=0),
             param_grid={'gamma': [0.1, 3, 5], 'learning_rate': [0.1, 0.3],
                         'max_depth': [3, 5], 'min_child_weight': [1, 5],
                         'reg_lambda': [1, 5]},
             scoring='roc_auc')


# Defining function using XGBClassifier for client segmentation:
def promotion_strategy_xgb3(df):
    '''
    Model-based strategy: bin V2 and V3 the same way as the training data of
    xgb_cl3, then send the promotion to the clients for which xgb_cl3
    predicts class 1.

    INPUT
    df - a dataframe with *only* the columns V1 - V7 (same as train_data);
         it is not modified

    OUTPUT
    promotion - np.array with the values 'Yes' or 'No' related to whether
                or not an individual should receive a promotion; one entry
                per row of df

    Ex:
    INPUT: df

    V1   V2    V3    V4   V5   V6   V7
    2    30   -1.1   1    1    3    2
    3    32   -0.6   2    3    2    2
    2    30    0.13  1    1    4    2

    OUTPUT: promotion

    array(['Yes', 'Yes', 'No'])
    indicating the first two users would receive the promotion and
    the last should not.
    '''
    # Copy first: V2 and V3 are overwritten below, and writing into a bare
    # slice could leak into the caller's dataframe (SettingWithCopyWarning).
    X_test = df[features].copy()

    # V2: 0 when outside [15, 45], 1 otherwise. The in-range code must be 1,
    # the value the model was trained on; this function previously emitted 2,
    # a value that never occurs in the training encoding.
    v2 = X_test['V2'].to_numpy()
    X_test['V2'] = np.where((v2 < 15) | (v2 > 45), 0, 1)

    # V3: -1 = above 1; 0 = [0, 1]; 1 = [-1, 0); 2 = below -1.
    # np.select returns the choice of the first matching condition.
    v3 = X_test['V3'].to_numpy()
    X_test['V3'] = np.select([v3 > 1, v3 >= 0, v3 >= -1], [-1, 0, 1], default=2)

    y_pred = xgb_cl3.predict(X_test)

    # Map the predicted class labels onto the 'Yes' / 'No' answer format.
    promotion = np.where(np.asarray(y_pred) == 1, 'Yes', 'No')

    return promotion

# Testing IRR and NIR for this strategy:
test_results(promotion_strategy_xgb3)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0173.

Your nir with this strategy is 154.00.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?

(0.017338303174019454, 154.0)


# Selecting data and applying transformations. An explicit copy is taken
# because V2 and V3 are overwritten below; writing into a bare slice of
# `experimental` triggers pandas' SettingWithCopyWarning.
data_xgb4 = experimental[sel_features].copy()

# Transforming V2 and V3 into ordinal bins, evaluated on whole columns.
# np.select returns the choice of the FIRST matching condition.
# V2: 0 outside [15, 45]; 1 = [15, 25) or (35, 45]; 2 = [25, 35]
v2_raw = data_xgb4['V2'].to_numpy()
data_xgb4['V2'] = np.select([(v2_raw < 15) | (v2_raw > 45), (v2_raw < 25) | (v2_raw > 35)],
                            [0, 1], default = 2)

# V3: 3 = (.., -1]; 2 = (-1, -0.5]; 1 = (-0.5, 0]; -1 = (0, 0.5]; -2 = (0.5, 1]; -3 = above 1
v3_raw = data_xgb4['V3'].to_numpy()
data_xgb4['V3'] = np.select([v3_raw <= -1, v3_raw <= -0.5, v3_raw <= 0, v3_raw <= 0.5, v3_raw <= 1],
                            [3, 2, 1, -1, -2], default = -3)
data_xgb4.head()


# Applying SMOTE to balance the purchase classes of the binned data.
# fit_resample is the current resampling entry point; the older fit_sample
# alias is not available in newer imbalanced-learn releases.
smote_xgb4 = SMOTE(random_state = 1001)
X_xgb4, y_xgb4 = smote_xgb4.fit_resample(data_xgb4[features], data_xgb4['purchase'])
print(y_xgb4.value_counts())

# Training XGBClassifier model; the grid search picks the parameter
# combination with the best cross-validated ROC AUC.
# Note: `xgb` and `param_xgb` are re-bound here and replace the earlier ones.
xgb = XGBClassifier(objective = 'binary:logistic', seed = 1001, verbosity = 0)
param_xgb = {'learning_rate': [0.1, 0.3], 'max_depth': [3, 5], 
             'min_child_weight': [1, 5], 'reg_lambda': [1, 5], 'gamma': [.1, 3, 5]}

xgb_cl4 = GridSearchCV(xgb, param_grid = param_xgb, scoring = 'roc_auc')
xgb_cl4.fit(X_xgb4, y_xgb4)

0    41643
1    41643
Name: purchase, dtype: int64

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, seed=1001,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, verbosity=0),
             param_grid={'gamma': [0.1, 3, 5], 'learning_rate': [0.1, 0.3],
                         'max_depth': [3, 5], 'min_child_weight': [1, 5],
                         'reg_lambda': [1, 5]},
             scoring='roc_auc')


# Defining function using XGBClassifier for client segmentation:
def promotion_strategy_xgb4(df):
    '''
    Model-based strategy: bin V2 and V3 the same way as the training data of
    xgb_cl4, then send the promotion to the clients for which xgb_cl4
    predicts class 1.

    INPUT
    df - a dataframe with *only* the columns V1 - V7 (same as train_data);
         it is not modified

    OUTPUT
    promotion - np.array with the values 'Yes' or 'No' related to whether
                or not an individual should receive a promotion; one entry
                per row of df

    Ex:
    INPUT: df

    V1   V2    V3    V4   V5   V6   V7
    2    30   -1.1   1    1    3    2
    3    32   -0.6   2    3    2    2
    2    30    0.13  1    1    4    2

    OUTPUT: promotion

    array(['Yes', 'Yes', 'No'])
    indicating the first two users would receive the promotion and
    the last should not.
    '''
    # Copy first: V2 and V3 are overwritten below, and writing into a bare
    # slice could leak into the caller's dataframe (SettingWithCopyWarning).
    X_test = df[features].copy()

    # np.select returns the choice of the first matching condition.
    # V2: 0 outside [15, 45]; 1 = [15, 25) or (35, 45]; 2 = [25, 35]
    v2 = X_test['V2'].to_numpy()
    X_test['V2'] = np.select([(v2 < 15) | (v2 > 45), (v2 < 25) | (v2 > 35)],
                             [0, 1], default=2)

    # V3: 3 = (.., -1]; 2 = (-1, -0.5]; 1 = (-0.5, 0]; -1 = (0, 0.5]; -2 = (0.5, 1]; -3 = above 1
    v3 = X_test['V3'].to_numpy()
    X_test['V3'] = np.select([v3 <= -1, v3 <= -0.5, v3 <= 0, v3 <= 0.5, v3 <= 1],
                             [3, 2, 1, -1, -2], default=-3)

    y_pred = xgb_cl4.predict(X_test)

    # Map the predicted class labels onto the 'Yes' / 'No' answer format.
    promotion = np.where(np.asarray(y_pred) == 1, 'Yes', 'No')

    return promotion

# Testing IRR and NIR for this strategy:
test_results(promotion_strategy_xgb4)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0185.

Your nir with this strategy is 214.70.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?

(0.01849462654235275, 214.70000000000005)

	Actual
Predicted	Yes	No
Yes	I	II
No	III	IV

	ID	purchase	V1	V2	V3	V4	V5	V6	V7
count	84534.000000	84534.000000	84534.000000	84534.000000	84534.000000	84534.000000	84534.000000	84534.000000	84534.000000
mean	62970.972413	0.012303	1.500662	29.973600	0.000190	1.679608	2.327643	2.502898	1.701694
std	36418.440539	0.110234	0.868234	5.010626	1.000485	0.466630	0.841167	1.117349	0.457517
min	1.000000	0.000000	0.000000	7.104007	-1.684550	1.000000	1.000000	1.000000	1.000000
25%	31467.250000	0.000000	1.000000	26.591501	-0.905350	1.000000	2.000000	2.000000	1.000000
50%	62827.500000	0.000000	2.000000	29.979744	-0.039572	2.000000	2.000000	3.000000	2.000000
75%	94438.750000	0.000000	2.000000	33.344593	0.826206	2.000000	3.000000	4.000000	2.000000
max	126184.000000	1.000000	3.000000	50.375913	1.691984	2.000000	4.000000	4.000000	2.000000

	control	experimental	perc_var
V1	-0.006088	-0.004651	-23.605724
V2	0.000090	-0.002885	-3306.003045
V3	0.008325	-0.017012	-304.357526
V4	0.002344	0.053348	2176.227435
V5	0.002771	0.011711	322.600958
V6	-0.000980	-0.002049	109.222660
V7	0.004786	-0.005005	-204.587034
purchase	1.000000	1.000000	-0.000000

	V1	V2	V3	V4	V5	V6	V7
0	2	30.443518	-1.165083	1	1	3	2
1	3	32.159350	-0.645617	2	3	2	2
2	2	30.431659	0.133583	1	1	4	2
3	0	26.588914	-0.212728	2	1	4	2
4	3	28.044331	-0.385883	1	1	2	2

Starbucks - Advertising Promotion Optimization

Background Information

Optimization Strategy

How To Test The Strategy?

Table of Contents

Checking A/B Test

Invariant Metric

Evaluation Metric

Exploratory Analysis

Correlation

Considerations on Correlation Analysis

Histogram

Considerations on Histogram Analysis

Starting Parameters

Approach 1 - All the Clients

Approach 2 - Client Segmentation Using Individual Correlation

Testing Different Approaches

Approach 3 - Logistic Regression

Approach 4 - XGBoost Classifier

Basic Approach

Manipulating Features - Approach 1

Manipulating Features - Approach 2

Manipulating Features - Approach 3

Conclusion