Introduction

Data with imbalanced target class occurs frequently in several domians such as credit card Fraud Detection ,insurance claim prediction, email spam detection, anomaly detection, outlier detection etc. Financial instituions loose millions of dollars every year to fraudulent financial transactions. It is important that these institutions are able to identify fraud to protect their customers and also reduce the financial losses that comes from fraudsters. The goal here is to predict fraudulent transactions to minimize loss to financial companies. For machine learning data with imbalanced target clases, the model evaluation metric is the AUC, the area under the ROC curve and the area under the precision-recall curve. The accuaracy metric is not useful in these situations since usually the proportion of the positive class in these situations is so low that even a naive classifier that predicts all transactions as fraudulent would result in a high accuracy. For example the dataset considered here, the proportion of negative examples is over 99% this a naive classifier can predict all transactions as legitimate and would be over 99% accuarate.

The following packages that is been installed here will be neccessary for some of the analysis later on this project.

!pip uninstall scikit-learn # until no more scikit-learn is present
!pip install scikit-learn
!pip install scikit-optimize
!pip install skll
!pip install imbalanced-learn
!pip install eli5
!pip install scipy
!pip install scikit-optimize

# activate R magic to run R in google colab notebook
import rpy2
%load_ext rpy2.ipython

#%%R

#install.packages("MLmetrics")
#install.packages("yardstick")
#install.packages("mltools")
#install.packages("glue")

%tensorflow_version 2.x
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set(style="ticks", color_codes=True)
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn import feature_selection
#from sklearn.preprocessing import Imputer
from sklearn.model_selection import  cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from skopt.space import Real, Categorical, Integer
#from skll.metrics import spearman
from scipy.stats import kendalltau, spearmanr, pearsonr
from skopt import BayesSearchCV
from sklearn.model_selection import  cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import classification_report
from sklearn.base import TransformerMixin
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import warnings
import pandas_profiling
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder,  StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_curve, roc_auc_score,balanced_accuracy_score
from sklearn.svm import SVC
import random
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import *
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
#import smote
import os
from sklearn.tree import DecisionTreeClassifier
from imblearn.metrics import geometric_mean_score as gmean
from imblearn.metrics import make_index_balanced_accuracy as iba
from imblearn.metrics import *
from eli5.sklearn import PermutationImportance
from eli5.sklearn import *
import eli5
from eli5.permutation_importance import get_score_importances
#import rus
# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt.callbacks import DeadlineStopper # Stop the optimization before running out of a fixed budget of time.
from skopt.callbacks import VerboseCallback # Callback to control the verbosity
from skopt.callbacks import DeltaXStopper # Stop the optimization If the last two positions at which the objective has been evaluated are less than delta
from joblib import dump, load
from prettytable import PrettyTable
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
import tensorflow as tf
warnings.filterwarnings("ignore")
%matplotlib inline
#specify tensorflow version to use
%tensorflow_version 2.x
#load tensorboard
#%load_ext tensorboard
 #%tensorboard --logdir logs

%autosave 5

Autosaving every 5 seconds

np.version.version

'1.18.2'

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

Description of Data.

The datasets can be found on kaggle.The link to it is here. The datasets contains transactions made by credit cards in September 2013 by european cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly imbalanced, the positive class (frauds) account for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation. This was done to preserve the identity and privacy of the people whose transaction this data was gathered from. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are ‘Time’ and ‘Amount’. Feature ‘Time’ contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature ‘Amount’ is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. Feature ‘Class’ is the response variable and it takes value 1 in case of fraud and 0 otherwise.

file = tf.keras.utils
df = pd.read_csv('https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv')
df.head()

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	0.090794	-0.551600	-0.617801	-0.991390	-0.311169	1.468177	-0.470401	0.207971	0.025791	0.403993	0.251412	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	-0.166974	1.612727	1.065235	0.489095	-0.143772	0.635558	0.463917	-0.114805	-0.183361	-0.145783	-0.069083	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	0.207643	0.624501	0.066084	0.717293	-0.165946	2.345865	-2.890083	1.109969	-0.121359	-2.261857	0.524980	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	-0.054952	-0.226487	0.178228	0.507757	-0.287924	-0.631418	-1.059647	-0.684093	1.965775	-1.232622	-0.208038	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	0.753074	-0.822843	0.538196	1.345852	-1.119670	0.175121	-0.451449	-0.237033	-0.038195	0.803487	0.408542	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

df[['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V26', 'V27', 'V28', 'Amount', 'Class']].describe().transpose()

	count	mean	std	min	25%	50%	75%	max
Time	284807.0	9.481386e+04	47488.145955	0.000000	54201.500000	84692.000000	139320.500000	172792.000000
V1	284807.0	3.919560e-15	1.958696	-56.407510	-0.920373	0.018109	1.315642	2.454930
V2	284807.0	5.688174e-16	1.651309	-72.715728	-0.598550	0.065486	0.803724	22.057729
V3	284807.0	-8.769071e-15	1.516255	-48.325589	-0.890365	0.179846	1.027196	9.382558
V4	284807.0	2.782312e-15	1.415869	-5.683171	-0.848640	-0.019847	0.743341	16.875344
V5	284807.0	-1.552563e-15	1.380247	-113.743307	-0.691597	-0.054336	0.611926	34.801666
V26	284807.0	1.699104e-15	0.482227	-2.604551	-0.326984	-0.052139	0.240952	3.517346
V27	284807.0	-3.660161e-16	0.403632	-22.565679	-0.070840	0.001342	0.091045	31.612198
V28	284807.0	-1.206049e-16	0.330083	-15.430084	-0.052960	0.011244	0.078280	33.847808
Amount	284807.0	8.834962e+01	250.120109	0.000000	5.600000	22.000000	77.165000	25691.160000
Class	284807.0	1.727486e-03	0.041527	0.000000	0.000000	0.000000	0.000000	1.000000

We can see the target class is highly imbalanced. The minority classis about 0.17% of the target exampes.

df['Class'].value_counts(normalize=True)*100

0    99.827251
1     0.172749
Name: Class, dtype: float64

neg, pos = df.Class.value_counts()
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n  '.format(
    total, pos, 100 * pos / total,100 * neg / total))

print('Total: {}\n    Negative: {} ({:.2f}% of total)\n  '.format(
    total, neg, 100 * neg / total))

Examples:
    Total: 284807
    Positive: 492 (0.17% of total)
  
Total: 284807
    Negative: 284315 (99.83% of total)

#x = raw_df.drop(['Time'],axis=1)

# Use a utility from sklearn to split and shuffle our dataset.
train_df, test_df = train_test_split(df, test_size=0.2)
#train_df, val_df = train_test_split(train_df, test_size=0.2)

train_x =train_df.drop(['Time','Class'],axis=1)
test_x = test_df.drop(['Time','Class'],axis=1)
#val_x  =  val_df.drop(['Time'],axis=1)

train_y=  train_df.Class
test_y = test_df.Class
#val_y  = val_df.Class

print('Traing dataset size:{}'.format(train_x.shape))
print('Test dataset size:{}'.format(test_x.shape))
#print('Validation dataset size: {}'.format(val_df.shape))

Traing dataset size:(227845, 29)
Test dataset size:(56962, 29)

#train_x.columns
#test_x.columns
test_y.isna().sum()

The first model considered here is the extreme gradient boosting algorithm. It is popular with modeling tabular data. The hyperparameters of the model would be set to default except the scale_pos_weight which would be tuned in the case of cost-sensitive xgboost to find the best weight that optimizes the model.The hyperparameter values is left to the default values to allow for a fair comparison among machine learning algorithms used in this analysis. The hyperparameter tuning is done by bayesian optimization using the scikit-optimize package.

# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)


clf = xgb.XGBClassifier(
        n_jobs = -1,
        objective = 'binary:logistic',
        silent=1,
        tree_method='approx')

search_spaces = {
    #'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    #             'min_child_weight': Integer(0, 10),
    #             'max_depth': Integer(1, 50),
    #            'max_delta_step': Integer(0, 20), # Maximum delta step we allow each leaf output
    #             'subsample': Real(0.01, 1.0, 'uniform'),
    #             'colsample_bytree': Real(0.01, 1.0, 'uniform'), # subsample ratio of columns by tree
    #             'colsample_bylevel': Real(0.01, 1.0, 'uniform'), # subsample ratio by level in trees
                 #'reg_lambda': Real(1e-9, 1000, 'log-uniform'), # L2 regularization
                 #'reg_alpha': Real(1e-9, 1.0, 'log-uniform'), # L1 regularization
     #            'gamma': Real(1e-9, 0.5, 'log-uniform'), # Minimum loss reduction for partition/pruning parameter
     #            'n_estimators': Integer(50, 100),
                 'scale_pos_weight': Real(1e-6, 2000, 'log-uniform')
                 }


bayessearch = BayesSearchCV(clf,
                    search_spaces,
                    scoring='roc_auc', #f1
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    return_train_score=False,
                    #refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)
    

#xgbm_model = bayessearch.fit(X=train_x,  y=train_y)

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

#share_link="https://drive.google.com/file/d/1mGzO4-vaTKVgH5zzzXCVcXNbj-Bt8u8g/view?usp=sharing"
import os 
os.getcwd()
#!files.os.listdir()

'/content'

Build Pandas -Profiling Report

The exploratory analysis of the features in the dataset can be automated with the Pandas -ProfilingReport package. It generates exploratory plots of the features in a dataset that is passed to it.

#Inline report without saving object
pandas_profiling.ProfileReport(df)
#Save report to file¶
pfr = pandas_profiling.ProfileReport(df)

pfr.to_file("/content/drive/My Drive/profilingReport2.html")
pfr

Overview

Dataset info

Number of variables	31
Number of observations	284807
Total Missing (%)	0.0%
Total size in memory	67.4 MiB
Average record size in memory	248.0 B

Variables types

Numeric	30
Categorical	0
Boolean	1
Date	0
Text (Unique)	0
Rejected	0
Unsupported	0

Warnings

Dataset has 1081 duplicate rows Warning

Variables

Time
Numeric

Distinct count	124592
Unique (%)	43.7%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	94814
Minimum	0
Maximum	172790
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	0
5-th percentile	25298
Q1	54202
Median	84692
Q3	139320
95-th percentile	164140
Maximum	172790
Range	172790
Interquartile range	85119

Descriptive statistics

Standard deviation	47488
Coef of variation	0.50086
Kurtosis	-1.2935
Mean	94814
MAD	42796
Skewness	-0.035568
Sum	27004000000
Variance	2255100000
Memory size	2.2 MiB

Value	Count	Frequency (%)
163152.0	36	0.0%
64947.0	26	0.0%
68780.0	25	0.0%
3767.0	21	0.0%
3770.0	20	0.0%
128860.0	19	0.0%
19912.0	19	0.0%
3750.0	19	0.0%
140347.0	19	0.0%
143083.0	18	0.0%
Other values (124582)	284585	99.9%

Minimum 5 values

Value	Count	Frequency (%)
0.0	2	0.0%
1.0	2	0.0%
2.0	2	0.0%
4.0	1	0.0%
7.0	2	0.0%

Maximum 5 values

Value	Count	Frequency (%)
172785.0	1	0.0%
172786.0	1	0.0%
172787.0	1	0.0%
172788.0	2	0.0%
172792.0	1	0.0%

V1
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	3.9196e-15
Minimum	-56.408
Maximum	2.4549
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-56.408
5-th percentile	-2.8991
Q1	-0.92037
Median	0.018109
Q3	1.3156
95-th percentile	2.0812
Maximum	2.4549
Range	58.862
Interquartile range	2.236

Descriptive statistics

Standard deviation	1.9587
Coef of variation	499720000000000
Kurtosis	32.487
Mean	3.9196e-15
MAD	1.4116
Skewness	-3.2807
Sum	3.3208e-1
Variance	3.8365
Memory size	2.2 MiB

Value	Count	Frequency (%)
2.0557970063003896	77	0.0%
1.24567381944824	77	0.0%
2.0533112135278504	62	0.0%
1.30237796508637	60	0.0%
2.04021105776632	53	0.0%
2.08517487552541	48	0.0%
1.33284931179458	45	0.0%
1.01841181981555	40	0.0%
1.33505315377059	39	0.0%
1.3154041716379299	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-56.407509631329	1	0.0%
-46.85504720078179	1	0.0%
-41.9287375244141	1	0.0%
-40.4701418378475	1	0.0%
-40.0425374953845	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
2.4305067805687406	1	0.0%
2.43920748106102	1	0.0%
2.44650498499596	1	0.0%
2.4518884899535895	1	0.0%
2.45492999121121	1	0.0%

V2
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	5.6882e-16
Minimum	-72.716
Maximum	22.058
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-72.716
5-th percentile	-1.972
Q1	-0.59855
Median	0.065486
Q3	0.80372
95-th percentile	1.8086
Maximum	22.058
Range	94.773
Interquartile range	1.4023

Descriptive statistics

Standard deviation	1.6513
Coef of variation	2903100000000000
Kurtosis	95.773
Mean	5.6882e-16
MAD	0.97384
Skewness	-4.6249
Sum	9.7316e-11
Variance	2.7268
Memory size	2.2 MiB

Value	Count	Frequency (%)
0.166975019545401	77	0.0%
-0.32666777306077005	77	0.0%
0.08973464781763099	62	0.0%
-0.606529308236609	60	0.0%
-0.146974974784838	53	0.0%
0.39305057772255	48	0.0%
0.38919824918427603	45	0.0%
1.03666300867632	40	0.0%
0.331464026372479	39	0.0%
0.44747360617094895	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-72.7157275629303	1	0.0%
-63.3446983175027	1	0.0%
-60.4646176556493	1	0.0%
-50.3832691251379	1	0.0%
-48.060856024869395	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
18.1836264596211	1	0.0%
18.902452840124898	1	0.0%
19.167239010306197	1	0.0%
21.4672029942752	1	0.0%
22.0577289904909	1	0.0%

V3
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	-8.7691e-15
Minimum	-48.326
Maximum	9.3826
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-48.326
5-th percentile	-2.3897
Q1	-0.89036
Median	0.17985
Q3	1.0272
95-th percentile	2.0626
Maximum	9.3826
Range	57.708
Interquartile range	1.9176

Descriptive statistics

Standard deviation	1.5163
Coef of variation	-172910000000000
Kurtosis	26.62
Mean	-8.7691e-15
MAD	1.1337
Skewness	-2.2402
Sum	-3.9108e-1
Variance	2.299
Memory size	2.2 MiB

Value	Count	Frequency (%)
-2.75204095570008	77	0.0%
0.488305742562781	77	0.0%
-1.68183566862495	62	0.0%
-0.681986192919261	60	0.0%
-2.95593366483195	53	0.0%
-4.50820053235418	48	0.0%
-2.16559660467804	45	0.0%
-1.6898137072248403	40	0.0%
-2.05776277666682	39	0.0%
-0.495757487926775	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-48.3255893623954	1	0.0%
-33.6809840183525	1	0.0%
-32.9653457595238	1	0.0%
-32.45419818625469	1	0.0%
-31.8135859546007	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
4.07916781154883	1	0.0%
4.10171617761651	1	0.0%
4.18781059904763	1	0.0%
4.22610848028397	1	0.0%
9.38255843282114	1	0.0%

V4
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	2.7823e-15
Minimum	-5.6832
Maximum	16.875
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-5.6832
5-th percentile	-2.1957
Q1	-0.84864
Median	-0.019847
Q3	0.74334
95-th percentile	2.5665
Maximum	16.875
Range	22.559
Interquartile range	1.592

Descriptive statistics

Standard deviation	1.4159
Coef of variation	508880000000000
Kurtosis	2.6355
Mean	2.7823e-15
MAD	1.0603
Skewness	0.67629
Sum	5.9435e-1
Variance	2.0047
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.842316033286871	77	0.0%
0.6353219207244001	77	0.0%
0.45421196023303295	62	0.0%
-1.9046033962221203	60	0.0%
-0.5783559788671391	53	0.0%
-0.311770683288625	48	0.0%
-0.306872623831362	45	0.0%
1.31547583332268	40	0.0%
-0.346175355279224	39	0.0%
-0.557087388354872	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-5.68317119816995	1	0.0%
-5.600607141215099	1	0.0%
-5.56011758115594	1	0.0%
-5.519697123284151	1	0.0%
-5.416315392339291	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
13.1436680982574	1	0.0%
15.3041839851875	1	0.0%
16.4912171736623	1	0.0%
16.7155373723131	1	0.0%
16.8753440335975	1	0.0%

V5
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	-1.5526e-15
Minimum	-113.74
Maximum	34.802
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-113.74
5-th percentile	-1.702
Q1	-0.6916
Median	-0.054336
Q3	0.61193
95-th percentile	2.099
Maximum	34.802
Range	148.54
Interquartile range	1.3035

Descriptive statistics

Standard deviation	1.3802
Coef of variation	-889010000000000
Kurtosis	206.9
Mean	-1.5526e-15
MAD	0.89707
Skewness	-2.4259
Sum	2.7353e-1
Variance	1.9051
Memory size	2.2 MiB

Value	Count	Frequency (%)
2.46307225982454	77	0.0%
-0.5627766807738629	77	0.0%
0.298310371498215	62	0.0%
1.3266231068468501	60	0.0%
2.60935827084169	53	0.0%
3.51011694221752	48	0.0%
2.6413512514436	45	0.0%
1.69843605562986	40	0.0%
2.58323382235421	39	0.0%
2.70504105264306	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-113.74330671114599	1	0.0%
-42.1478983728015	1	0.0%
-40.4277263001722	1	0.0%
-35.1821203113785	1	0.0%
-32.0921290046357	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
29.1621720203733	1	0.0%
31.457046054914304	1	0.0%
32.9114617007293	1	0.0%
34.0993093435765	1	0.0%
34.8016658766686	1	0.0%

V6
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	2.0107e-15
Minimum	-26.161
Maximum	73.302
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-26.161
5-th percentile	-1.4068
Q1	-0.7683
Median	-0.27419
Q3	0.39856
95-th percentile	3.1604
Maximum	73.302
Range	99.462
Interquartile range	1.1669

Descriptive statistics

Standard deviation	1.3323
Coef of variation	662600000000000
Kurtosis	42.642
Mean	2.0107e-15
MAD	0.90901
Skewness	1.8266
Sum	4.2439e-1
Variance	1.7749
Memory size	2.2 MiB

Value	Count	Frequency (%)
-1.01107261632698	77	0.0%
3.17385642307029	77	0.0%
-0.953526086363083	62	0.0%
3.43631244725031	60	0.0%
3.1426415310887905	53	0.0%
2.45329922016311	48	0.0%
2.80808376427436	45	0.0%
0.528806548957574	40	0.0%
2.8541019971666097	39	0.0%
2.7624395847487797	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-26.1605059358433	1	0.0%
-23.496713929871397	1	0.0%
-21.9293122885031	1	0.0%
-21.2487516200394	1	0.0%
-20.8696261884133	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
21.3930687572539	1	0.0%
21.550496192579605	1	0.0%
22.5292984665587	1	0.0%
23.9178371266367	1	0.0%
73.3016255459646	1	0.0%

V7
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	-1.6942e-15
Minimum	-43.557
Maximum	120.59
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-43.557
5-th percentile	-1.4344
Q1	-0.55408
Median	0.040103
Q3	0.57044
95-th percentile	1.4076
Maximum	120.59
Range	164.15
Interquartile range	1.1245

Descriptive statistics

Standard deviation	1.2371
Coef of variation	-730170000000000
Kurtosis	405.61
Mean	-1.6942e-15
MAD	0.73785
Skewness	2.5539
Sum	-1.5825e-1
Variance	1.5304
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.43212592398782396	77	0.0%
0.0149526614685896	77	0.0%
0.152002545314135	62	0.0%
-1.14512682747431	60	0.0%
-0.41688284124123	53	0.0%
0.220468581007954	48	0.0%
-0.171626636099457	45	0.0%
0.33171450239883	40	0.0%
-0.18754733727697498	39	0.0%
-0.5349938273164451	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-43.5572415712451	1	0.0%
-41.5067960832574	1	0.0%
-37.0603114554112	1	0.0%
-33.2393281671892	1	0.0%
-31.76494649021	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
34.3031768568354	1	0.0%
36.6772679454031	1	0.0%
36.877368268259794	1	0.0%
44.054461363163796	1	0.0%
120.589493945238	1	0.0%

V8
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	-1.927e-16
Minimum	-73.217
Maximum	20.007
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-73.217
5-th percentile	-0.84215
Q1	-0.20863
Median	0.022358
Q3	0.32735
95-th percentile	1.05
Maximum	20.007
Range	93.224
Interquartile range	0.53598

Descriptive statistics

Standard deviation	1.1944
Coef of variation	-6197900000000000
Kurtosis	220.59
Mean	-1.927e-16
MAD	0.50574
Skewness	-8.5219
Sum	3.3538e-11
Variance	1.4265
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.16021086330181197	77	0.0%
0.7277062007278241	77	0.0%
-0.207071379659966	62	0.0%
0.9591472620923409	60	0.0%
0.7843929483197328	53	0.0%
0.543376800596399	48	0.0%
0.683351733616692	45	0.0%
0.364538761567697	40	0.0%
0.6851537704418591	39	0.0%
0.8082500983641501	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-73.21671845526741	1	0.0%
-50.94336886770229	1	0.0%
-50.688419356750295	1	0.0%
-50.420090064434206	1	0.0%
-41.484822506637705	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
18.709254543323397	1	0.0%
18.7488719520883	1	0.0%
19.168327389730102	1	0.0%
19.5877726234404	1	0.0%
20.0072083651213	1	0.0%

V9
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	-3.137e-15
Minimum	-13.434
Maximum	15.595
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-13.434
5-th percentile	-1.7584
Q1	-0.6431
Median	-0.051429
Q3	0.59714
95-th percentile	1.7808
Maximum	15.595
Range	29.029
Interquartile range	1.2402

Descriptive statistics

Standard deviation	1.0986
Coef of variation	-350210000000000
Kurtosis	3.7313
Mean	-3.137e-15
MAD	0.81439
Skewness	0.55468
Sum	-6.8538e-1
Variance	1.207
Memory size	2.2 MiB

Value	Count	Frequency (%)
0.17036185217373	77	0.0%
0.608605870267216	77	0.0%
0.587335266422761	62	0.0%
1.67130156362918	60	0.0%
0.359902378888007	53	0.0%
-0.10043390489717	48	0.0%
-0.29796200128389	45	0.0%
-0.7117979387642629	40	0.0%
-0.28661406862562394	39	0.0%
0.6977195955056469	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-13.4340663182301	1	0.0%
-13.3201546920984	1	0.0%
-11.1266235224579	1	0.0%
-10.8425258685569	1	0.0%
-9.48145633401495	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
10.3261330490616	1	0.0%
10.348406697766801	1	0.0%
10.370657984046	1	0.0%
10.392888824678499	1	0.0%
15.5949946071278	1	0.0%

V10
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	1.7686e-15
Minimum	-24.588
Maximum	23.745
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-24.588
5-th percentile	-1.3386
Q1	-0.53543
Median	-0.092917
Q3	0.45392
95-th percentile	1.5486
Maximum	23.745
Range	48.333
Interquartile range	0.98935

Descriptive statistics

Standard deviation	1.0888
Coef of variation	615650000000000
Kurtosis	31.988
Mean	1.7686e-15
MAD	0.69512
Skewness	1.1871
Sum	6.379e-1
Variance	1.1856
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.0445745893804268	77	0.0%
-0.0751861699398929	77	0.0%
-0.362047348389396	62	0.0%
-1.02294602983554	60	0.0%
-0.351075101407957	53	0.0%
-1.01862219976658	48	0.0%
-0.652096600406493	45	0.0%
-1.57028828006989	40	0.0%
-0.5359027354525039	39	0.0%
-1.09018090617913	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-24.5882624372475	1	0.0%
-24.403184969972802	1	0.0%
-23.2282548357516	1	0.0%
-22.1870885620007	4	0.0%
-20.949191554361104	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
13.8117577662908	1	0.0%
15.236028204007098	1	0.0%
15.2456856915255	1	0.0%
15.3317415557881	1	0.0%
23.7451361206545	1	0.0%

V11
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	9.1703e-16
Minimum	-4.7975
Maximum	12.019
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-4.7975
5-th percentile	-1.5719
Q1	-0.76249
Median	-0.032757
Q3	0.73959
95-th percentile	1.614
Maximum	12.019
Range	16.816
Interquartile range	1.5021

Descriptive statistics

Standard deviation	1.0207
Coef of variation	1113100000000000
Kurtosis	1.6339
Mean	9.1703e-16
MAD	0.83126
Skewness	0.35651
Sum	4.7658e-1
Variance	1.0419
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.35674901847752005	77	0.0%
0.0635044576008839	77	0.0%
-0.589598040395407	62	0.0%
-0.19142297265161498	60	0.0%
0.329650883701029	53	0.0%
0.8070381066842709	48	0.0%
0.418002664896219	45	0.0%
3.46301782070354	40	0.0%
0.332848417624034	39	0.0%
-0.0286089299546822	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-4.79747346479757	1	0.0%
-4.682930547652759	1	0.0%
-4.568390246460219	1	0.0%
-4.45385284150054	1	0.0%
-4.3393186545773705	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
11.228470279576001	1	0.0%
11.277920727806698	1	0.0%
11.6197234753825	1	0.0%
11.6692047358121	1	0.0%
12.018913181619899	1	0.0%

V12
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	-1.8107e-15
Minimum	-18.684
Maximum	7.8484
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-18.684
5-th percentile	-1.9672
Q1	-0.40557
Median	0.14003
Q3	0.61824
95-th percentile	1.2431
Maximum	7.8484
Range	26.532
Interquartile range	1.0238

Descriptive statistics

Standard deviation	0.9992
Coef of variation	-551840000000000
Kurtosis	20.242
Mean	-1.8107e-15
MAD	0.70536
Skewness	-2.2784
Sum	-3.5743e-1
Variance	0.9984
Memory size	2.2 MiB

Value	Count	Frequency (%)
0.350563573253678	77	0.0%
-0.0734595173503765	77	0.0%
-0.17471205308176502	62	0.0%
0.6310273414871078	60	0.0%
0.18350812062465602	53	0.0%
-0.330547627789277	48	0.0%
-0.32243692372967503	45	0.0%
0.5384113631159171	40	0.0%
-0.26831873850147697	39	0.0%
0.0736565150203547	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-18.683714633344298	1	0.0%
-18.553697009645802	1	0.0%
-18.4311310279993	1	0.0%
-18.047596570821604	1	0.0%
-17.7691434633638	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
4.4063382205176	1	0.0%
4.4729205841361	1	0.0%
4.57408224145334	1	0.0%
4.84645240859009	1	0.0%
7.8483920756445995	1	0.0%

V13
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	1.6934e-15
Minimum	-5.7919
Maximum	7.1269
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-5.7919
5-th percentile	-1.6397
Q1	-0.64854
Median	-0.013568
Q3	0.6625
95-th percentile	1.6079
Maximum	7.1269
Range	12.919
Interquartile range	1.311

Descriptive statistics

Standard deviation	0.99527
Coef of variation	587720000000000
Kurtosis	0.1953
Mean	1.6934e-15
MAD	0.7846
Skewness	0.065233
Sum	2.3286e-1
Variance	0.99057
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.141238322200309	77	0.0%
-0.517759694198053	77	0.0%
-0.6211270614210049	62	0.0%
0.0319072703534055	60	0.0%
-0.27291854500254503	53	0.0%
-0.5314186516713479	48	0.0%
-0.143469154599387	45	0.0%
-0.37809538452842295	40	0.0%
-0.12761419581231198	39	0.0%
-0.23845703149556197	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-5.7918812063208405	1	0.0%
-4.00863979207158	1	0.0%
-3.9617575357502504	1	0.0%
-3.8886062856691	1	0.0%
-3.8811062494802897	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
4.36999837897829	1	0.0%
4.465413177090861	1	0.0%
4.46956619153499	1	0.0%
4.56900895856606	1	0.0%
7.126882958593759	1	0.0%

V14
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	1.479e-15
Minimum	-19.214
Maximum	10.527
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-19.214
5-th percentile	-1.4394
Q1	-0.42557
Median	0.050601
Q3	0.49315
95-th percentile	1.3937
Maximum	10.527
Range	29.741
Interquartile range	0.91872

Descriptive statistics

Standard deviation	0.9586
Coef of variation	648120000000000
Kurtosis	23.879
Mean	1.479e-15
MAD	0.64865
Skewness	-1.9952
Sum	3.4356e-1
Variance	0.91891
Memory size	2.2 MiB

Value	Count	Frequency (%)
0.40696893438373105	77	0.0%
0.690971618395625	77	0.0%
-0.7035127839833039	62	0.0%
-0.0314253812628428	60	0.0%
-0.597436665174528	53	0.0%
-2.1814488246367403	48	0.0%
-1.1545242958661899	45	0.0%
-3.0454951796322502	40	0.0%
-0.868299960850499	39	0.0%
0.215738138536011	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-19.2143254902614	1	0.0%
-18.8220867423816	1	0.0%
-18.4937733551053	1	0.0%
-18.392091495673	1	0.0%
-18.049997689859396	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
7.518402781245941	1	0.0%
7.667725750558191	1	0.0%
7.692208543567821	1	0.0%
7.754598748054839	1	0.0%
10.5267660517847	1	0.0%

V15
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	3.4823e-15
Minimum	-4.4989
Maximum	8.8777
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-4.4989
5-th percentile	-1.5932
Q1	-0.58288
Median	0.048072
Q3	0.64882
95-th percentile	1.3731
Maximum	8.8777
Range	13.377
Interquartile range	1.2317

Descriptive statistics

Standard deviation	0.91532
Coef of variation	262850000000000
Kurtosis	0.28477
Mean	3.4823e-15
MAD	0.72734
Skewness	-0.30842
Sum	1.3993e-09
Variance	0.8378
Memory size	2.2 MiB

Value	Count	Frequency (%)
1.2752570390934999	77	0.0%
1.1241469228868501	77	0.0%
0.271956610213985	62	0.0%
1.44662697638966	60	0.0%
0.5838968102925071	53	0.0%
0.38872408312047796	48	0.0%
1.157633713505	45	0.0%
1.46891114338139	40	0.0%
1.1285389817093798	39	0.0%
1.2452765300023998	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-4.49894467676621	1	0.0%
-4.39130706780494	1	0.0%
-4.19932124976578	1	0.0%
-4.19661969463528	1	0.0%
-4.15253175950472	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
5.685899051594321	1	0.0%
5.720478632456981	1	0.0%
5.7845138896294594	1	0.0%
5.82565431863365	1	0.0%
8.87774159774277	1	0.0%

V16
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	1.392e-15
Minimum	-14.13
Maximum	17.315
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-14.13
5-th percentile	-1.4917
Q1	-0.46804
Median	0.066413
Q3	0.5233
95-th percentile	1.3253
Maximum	17.315
Range	31.445
Interquartile range	0.99133

Descriptive statistics

Standard deviation	0.87625
Coef of variation	629490000000000
Kurtosis	10.419
Mean	1.392e-15
MAD	0.64782
Skewness	-1.101
Sum	4.0946e-1
Variance	0.76782
Memory size	2.2 MiB

Value	Count	Frequency (%)
0.34246975411076896	77	0.0%
-0.37196212502841897	77	0.0%
0.318688063430157	62	0.0%
-0.12182037858308699	60	0.0%
0.17867583647653199	53	0.0%
0.23207137768386	48	0.0%
0.878174917750572	45	0.0%
-0.0297415143257285	40	0.0%
0.7865060536879019	39	0.0%
-0.255230524748655	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-14.1298545174931	1	0.0%
-13.5632729563133	1	0.0%
-13.30388757707	1	0.0%
-13.2568330912778	1	0.0%
-13.2515419788937	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
6.35185349844491	1	0.0%
6.44279790144451	1	0.0%
7.05913181057395	1	0.0%
8.289889559546191	1	0.0%
17.315111517627802	1	0.0%

V17
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	-7.5285e-16
Minimum	-25.163
Maximum	9.2535
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-25.163
5-th percentile	-0.983
Q1	-0.48375
Median	-0.065676
Q3	0.39967
95-th percentile	1.2746
Maximum	9.2535
Range	34.416
Interquartile range	0.88342

Descriptive statistics

Standard deviation	0.84934
Coef of variation	-1128200000000000
Kurtosis	94.8
Mean	-7.5285e-16
MAD	0.56387
Skewness	-3.8449
Sum	-1.0823e-1
Variance	0.72137
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.6019568028284449	77	0.0%
-0.37465644005137605	77	0.0%
0.549365128729473	62	0.0%
-0.651405237009102	60	0.0%
0.47389827829767206	53	0.0%
2.12502188299054	48	0.0%
0.536917519702814	45	0.0%
3.6645884808692504	40	0.0%
0.31643526103505604	39	0.0%
-1.07208498526811	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-25.162799369324798	1	0.0%
-24.019098547590197	1	0.0%
-23.8156358284126	1	0.0%
-23.2415971479491	1	0.0%
-22.8839985767803	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
7.766636362866991	1	0.0%
7.89339253241379	1	0.0%
8.538195138626161	1	0.0%
9.20705853529557	1	0.0%
9.25352625047285	1	0.0%

V18
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	4.3288e-16
Minimum	-9.4987
Maximum	5.0411
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-9.4987
5-th percentile	-1.3581
Q1	-0.49885
Median	-0.0036363
Q3	0.50081
95-th percentile	1.3944
Maximum	5.0411
Range	14.54
Interquartile range	0.99966

Descriptive statistics

Standard deviation	0.83818
Coef of variation	1936300000000000
Kurtosis	2.5783
Mean	4.3288e-16
MAD	0.63582
Skewness	-0.25988
Sum	2.7262e-1
Variance	0.70254
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.43899243243668296	77	0.0%
-0.0526401462570187	77	0.0%
-0.25778585794493303	62	0.0%
0.6179704765287819	60	0.0%
-0.49884979866504103	53	0.0%
0.40554867355562896	48	0.0%
0.712873012618197	45	0.0%
-0.105189588790714	40	0.0%
0.587856253020328	39	0.0%
-0.0686980996025901	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-9.498745921046769	1	0.0%
-9.33519307905321	1	0.0%
-9.287832213974019	1	0.0%
-9.264608732956551	1	0.0%
-9.17055721888169	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
4.19959110679305	1	0.0%
4.24384121345385	1	0.0%
4.2956482344645	1	0.0%
4.71239756635225	1	0.0%
5.04106918541184	1	0.0%

V19
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	9.0497e-16
Minimum	-7.2135
Maximum	5.592
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-7.2135
5-th percentile	-1.3563
Q1	-0.4563
Median	0.0037348
Q3	0.45895
95-th percentile	1.2862
Maximum	5.592
Range	12.805
Interquartile range	0.91525

Descriptive statistics

Standard deviation	0.81404
Coef of variation	899520000000000
Kurtosis	1.725
Mean	9.0497e-16
MAD	0.60579
Skewness	0.10919
Sum	2.9615e-1
Variance	0.66266
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.116090785002835	77	0.0%
-0.33059044844294394	77	0.0%
0.0162561279842771	62	0.0%
0.927600044556072	60	0.0%
-0.14009868476221	53	0.0%
-0.440929511947803	48	0.0%
0.00677355522536129	45	0.0%
-2.0979443214639	40	0.0%
0.0493500831769145	39	0.0%
0.255267674459398	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-7.21352743017759	1	0.0%
-6.93829731768481	1	0.0%
-4.93273305547833	1	0.0%
-4.676092279153361	1	0.0%
-4.619034341772441	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
4.8910624409520995	1	0.0%
5.2283417900513	1	0.0%
5.5017472139665	1	0.0%
5.572113326879691	1	0.0%
5.59197142733558	1	0.0%

V20
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	5.0855e-16
Minimum	-54.498
Maximum	39.421
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-54.498
5-th percentile	-0.55843
Q1	-0.21172
Median	-0.062481
Q3	0.13304
95-th percentile	0.83614
Maximum	39.421
Range	93.919
Interquartile range	0.34476

Descriptive statistics

Standard deviation	0.77093
Coef of variation	1515900000000000
Kurtosis	271.02
Mean	5.0855e-16
MAD	0.34191
Skewness	-2.0372
Sum	1.8247e-1
Variance	0.59433
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.18037011855969298	77	0.0%
-0.132079724302295	77	0.0%
-0.187420788431655	62	0.0%
0.0057566554189328704	60	0.0%
-0.12071403428047302	53	0.0%
-0.0869893297425326	48	0.0%
0.0536071193018422	45	0.0%
-0.167555416292594	40	0.0%
0.0452174411898587	39	0.0%
0.0169521541786674	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-54.497720494566	1	0.0%
-28.009635333749	1	0.0%
-25.222345240529698	1	0.0%
-23.646890332167303	1	0.0%
-23.4201725720228	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
23.649094568125502	1	0.0%
24.1338941917421	1	0.0%
26.237390789565897	1	0.0%
38.1172091261285	1	0.0%
39.4209042482199	1	0.0%

V21
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	1.5373e-16
Minimum	-34.83
Maximum	27.203
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-34.83
5-th percentile	-0.50467
Q1	-0.22839
Median	-0.02945
Q3	0.18638
95-th percentile	0.53787
Maximum	27.203
Range	62.033
Interquartile range	0.41477

Descriptive statistics

Standard deviation	0.73452
Coef of variation	4778000000000000
Kurtosis	207.29
Mean	1.5373e-16
MAD	0.31907
Skewness	3.593
Sum	4.718e-11
Variance	0.53953
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.26258084604117604	77	0.0%
0.26976495136135703	77	0.0%
-0.36115803659984497	62	0.0%
-0.0642082814806287	60	0.0%
-0.35233380052375	53	0.0%
-0.0672166613423604	48	0.0%
-0.20743240447289701	45	0.0%
-0.0402375927503545	40	0.0%
-0.191819982814025	39	0.0%
0.0073428026657956095	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-34.8303821448146	1	0.0%
-22.889347040939	1	0.0%
-22.797603905551895	1	0.0%
-22.7575398590576	1	0.0%
-22.665684604861497	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
22.5806752741477	1	0.0%
22.5889894712903	1	0.0%
22.5995433627945	1	0.0%
22.614889367616897	1	0.0%
27.2028391573154	6	0.0%

V22
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	7.9599e-16
Minimum	-10.933
Maximum	10.503
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-10.933
5-th percentile	-1.0819
Q1	-0.54235
Median	0.0067819
Q3	0.52855
95-th percentile	1.129
Maximum	10.503
Range	21.436
Interquartile range	1.0709

Descriptive statistics

Standard deviation	0.7257
Coef of variation	911700000000000
Kurtosis	2.833
Mean	7.9599e-16
MAD	0.58421
Skewness	-0.21326
Sum	-9.8112e-11
Variance	0.52664
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.8162637631578471	77	0.0%
0.8446266467757121	77	0.0%
-0.984261949244254	62	0.0%
-0.0805870774450856	60	0.0%
-0.9969367748280931	53	0.0%
-0.0726415994946915	48	0.0%
-0.6924166841818179	45	0.0%
0.0961715739635631	40	0.0%
-0.650117795537897	39	0.0%
0.250885695089417	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-10.933143697655	1	0.0%
-9.49942296430251	1	0.0%
-8.88701714094871	6	0.0%
-8.59364156538624	1	0.0%
-8.555807930456341	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
7.357255161770509	1	0.0%
8.27223298396612	1	0.0%
8.316275438913571	1	0.0%
8.361985191684349	1	0.0%
10.5030900899454	1	0.0%

V23
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	5.3676e-16
Minimum	-44.808
Maximum	22.528
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-44.808
5-th percentile	-0.47225
Q1	-0.16185
Median	-0.011193
Q3	0.14764
95-th percentile	0.48802
Maximum	22.528
Range	67.336
Interquartile range	0.30949

Descriptive statistics

Standard deviation	0.62446
Coef of variation	1163400000000000
Kurtosis	440.09
Mean	5.3676e-16
MAD	0.26194
Skewness	-5.8751
Sum	7.3442e-11
Variance	0.38995
Memory size	2.2 MiB

Value	Count	Frequency (%)
0.14030430201432598	77	0.0%
0.0206746676928111	77	0.0%
0.354198094344309	62	0.0%
-0.0729910792746877	60	0.0%
0.36348490597863004	53	0.0%
-0.0365842826760227	48	0.0%
-0.11859751164434901	45	0.0%
-0.0925489695836041	40	0.0%
-0.11406946378171699	39	0.0%
0.10379660601100901	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-44.807735203791296	1	0.0%
-36.666000066027	1	0.0%
-32.828994997462004	1	0.0%
-30.269720014317002	1	0.0%
-27.533643285000302	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
19.002941823292698	1	0.0%
19.228169082574198	1	0.0%
20.8033440994696	1	0.0%
22.0835448685737	1	0.0%
22.5284116897749	1	0.0%

V24
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	4.4581e-15
Minimum	-2.8366
Maximum	4.5845
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-2.8366
5-th percentile	-1.1437
Q1	-0.35459
Median	0.040976
Q3	0.43953
95-th percentile	0.86636
Maximum	4.5845
Range	7.4212
Interquartile range	0.79411

Descriptive statistics

Standard deviation	0.60565
Coef of variation	135850000000000
Kurtosis	0.61887
Mean	4.4581e-15
MAD	0.46844
Skewness	-0.5525
Sum	1.2736e-09
Variance	0.36681
Memory size	2.2 MiB

Value	Count	Frequency (%)
0.726211883811499	77	0.0%
0.3578272492998061	77	0.0%
0.6207093385008	62	0.0%
1.01813597043583	60	0.0%
0.6048265697520401	53	0.0%
0.529692767553245	48	0.0%
0.8914796678605641	45	0.0%
-1.34566370602836	40	0.0%
0.9159355995388021	39	0.0%
1.00995227965779	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-2.83662691870341	1	0.0%
-2.82484890293617	1	0.0%
-2.82268359235889	1	0.0%
-2.82238396858124	1	0.0%
-2.81489763570598	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
3.9982936780756897	1	0.0%
4.014444384730609	1	0.0%
4.01634181669268	1	0.0%
4.02286589044732	1	0.0%
4.58454913689817	1	0.0%

V25
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	1.453e-15
Minimum	-10.295
Maximum	7.5196
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-10.295
5-th percentile	-0.82503
Q1	-0.31715
Median	0.016594
Q3	0.35072
95-th percentile	0.7607
Maximum	7.5196
Range	17.815
Interquartile range	0.66786

Descriptive statistics

Standard deviation	0.52128
Coef of variation	358760000000000
Kurtosis	4.2904
Mean	1.453e-15
MAD	0.40326
Skewness	-0.41579
Sum	1.5211e-1
Variance	0.27173
Memory size	2.2 MiB

Value	Count	Frequency (%)
0.18642294572338897	77	0.0%
0.36662430700491294	77	0.0%
-0.29713787612847997	62	0.0%
0.6635747724804879	60	0.0%
-0.26455958625151	53	0.0%
0.41468514197751	48	0.0%
0.7302403643301479	45	0.0%
0.510304957067478	40	0.0%
0.7300730401400359	39	0.0%
0.369398160051982	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-10.2953970749851	1	0.0%
-8.69662677026752	1	0.0%
-7.495741104057091	1	0.0%
-7.0813253463773895	1	0.0%
-7.02578318190186	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
5.54159759459217	1	0.0%
5.8261590349735	1	0.0%
5.8524835709145595	1	0.0%
6.07085038407798	1	0.0%
7.51958867870916	1	0.0%

V26
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	1.6991e-15
Minimum	-2.6046
Maximum	3.5173
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-2.6046
5-th percentile	-0.69735
Q1	-0.32698
Median	-0.052139
Q3	0.24095
95-th percentile	0.92092
Maximum	3.5173
Range	6.1219
Interquartile range	0.56794

Descriptive statistics

Standard deviation	0.48223
Coef of variation	283810000000000
Kurtosis	0.91901
Mean	1.6991e-15
MAD	0.37663
Skewness	0.57669
Sum	4.805e-1
Variance	0.23254
Memory size	2.2 MiB

Value	Count	Frequency (%)
-0.39882751495946295	77	0.0%
0.0965444707905616	77	0.0%
0.16673563433513197	62	0.0%
-0.671322844293718	60	0.0%
0.21967064744260398	53	0.0%
0.73586965178554	48	0.0%
0.384013445762976	45	0.0%
-0.18267352250172897	40	0.0%
0.38387941757557303	39	0.0%
0.110373548574965	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-2.60455055280817	1	0.0%
-2.53432972105675	1	0.0%
-2.2416202900029503	1	0.0%
-2.06856086855144	1	0.0%
-1.8553553377608	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
3.15532747327538	1	0.0%
3.22017837466898	1	0.0%
3.41563624349633	1	0.0%
3.4632456536447997	1	0.0%
3.5173456116237998	1	0.0%

V27
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	-3.6602e-16
Minimum	-22.566
Maximum	31.612
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-22.566
5-th percentile	-0.41525
Q1	-0.07084
Median	0.0013421
Q3	0.091045
95-th percentile	0.38775
Maximum	31.612
Range	54.178
Interquartile range	0.16188

Descriptive statistics

Standard deviation	0.40363
Coef of variation	-1102800000000000
Kurtosis	244.99
Mean	-3.6602e-16
MAD	0.18147
Skewness	-1.1702
Sum	-1.0442e-1
Variance	0.16292
Memory size	2.2 MiB

Value	Count	Frequency (%)
0.0277351215052822	77	0.0%
-0.035866315294695	77	0.0%
-0.0682990099344722	62	0.0%
0.0968009452278396	60	0.0%
-0.0392094515896982	53	0.0%
-0.0582327676516994	48	0.0%
-0.0284652724675567	45	0.0%
0.107058302469808	40	0.0%
-0.0319023040878611	39	0.0%
-0.0283020664307734	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-22.5656793207827	1	0.0%
-9.89524404755692	1	0.0%
-9.845807692778981	1	0.0%
-9.793567905137511	1	0.0%
-9.544855375391482	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
10.135597346295699	1	0.0%
10.507884353083499	1	0.0%
11.135739844574198	1	0.0%
12.1524011068287	1	0.0%
31.612198106136304	1	0.0%

V28
Numeric

Distinct count	275663
Unique (%)	96.8%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	-1.206e-16
Minimum	-15.43
Maximum	33.848
Zeros (%)	0.0%

Toggle details

Quantile statistics

Minimum	-15.43
5-th percentile	-0.31784
Q1	-0.05296
Median	0.011244
Q3	0.07828
95-th percentile	0.25609
Maximum	33.848
Range	49.278
Interquartile range	0.13124

Descriptive statistics

Standard deviation	0.33008
Coef of variation	-2736900000000000
Kurtosis	933.4
Mean	-1.206e-16
MAD	0.12933
Skewness	11.192
Sum	-3.4758e-11
Variance	0.10895
Memory size	2.2 MiB

Value	Count	Frequency (%)
0.0184945729704665	77	0.0%
-0.0602821510762213	77	0.0%
-0.0295847028396107	62	0.0%
0.0286968782920849	60	0.0%
-0.0427869554036275	53	0.0%
-0.0266576195050194	48	0.0%
0.0361233094119335	45	0.0%
0.0718180029478637	40	0.0%
0.0298493414012052	39	0.0%
-0.0203586378568534	36	0.0%
Other values (275653)	284270	99.8%

Minimum 5 values

Value	Count	Frequency (%)
-15.430083905534898	1	0.0%
-11.710895639451499	1	0.0%
-9.617915452382391	1	0.0%
-8.65656990038166	1	0.0%
-8.47868564330279	1	0.0%

Maximum 5 values

Value	Count	Frequency (%)
15.870474054688598	1	0.0%
15.942150981273501	1	0.0%
16.1296091387323	1	0.0%
22.620072218580297	1	0.0%
33.8478078188831	1	0.0%

Amount
Numeric

Distinct count	32767
Unique (%)	11.5%
Missing (%)	0.0%
Missing (n)	0
Infinite (%)	0.0%
Infinite (n)	0

Mean	88.35
Minimum	0
Maximum	25691
Zeros (%)	0.6%

Toggle details

Quantile statistics

Minimum	0
5-th percentile	0.92
Q1	5.6
Median	22
Q3	77.165
95-th percentile	365
Maximum	25691
Range	25691
Interquartile range	71.565

Descriptive statistics

Standard deviation	250.12
Coef of variation	2.831
Kurtosis	845.09
Mean	88.35
MAD	103.53
Skewness	16.978
Sum	25163000
Variance	62560
Memory size	2.2 MiB

Value	Count	Frequency (%)
1.0	13688	4.8%
1.98	6044	2.1%
0.89	4872	1.7%
9.99	4747	1.7%
15.0	3280	1.2%
0.76	2998	1.1%
10.0	2950	1.0%
1.29	2892	1.0%
1.79	2623	0.9%
0.99	2304	0.8%
Other values (32757)	238409	83.7%

Minimum 5 values

Value	Count	Frequency (%)
0.0	1825	0.6%
0.01	718	0.3%
0.02	85	0.0%
0.03	3	0.0%
0.04	11	0.0%

Maximum 5 values

Value	Count	Frequency (%)
11898.09	1	0.0%
12910.93	1	0.0%
18910.0	1	0.0%
19656.53	1	0.0%
25691.16	1	0.0%

Class
Boolean

Distinct count	2
Unique (%)	0.0%
Missing (%)	0.0%
Missing (n)	0

Mean	0.0017275

0	284315
1	492

Toggle details

Value	Count	Frequency (%)
0	284315	99.8%
1	492	0.2%

Correlations

Sample

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	V11	V12	V13	V14	V15	V16	V17	V18	V19	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	0.090794	-0.551600	-0.617801	-0.991390	-0.311169	1.468177	-0.470401	0.207971	0.025791	0.403993	0.251412	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	-0.166974	1.612727	1.065235	0.489095	-0.143772	0.635558	0.463917	-0.114805	-0.183361	-0.145783	-0.069083	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	0.207643	0.624501	0.066084	0.717293	-0.165946	2.345865	-2.890083	1.109969	-0.121359	-2.261857	0.524980	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	-0.054952	-0.226487	0.178228	0.507757	-0.287924	-0.631418	-1.059647	-0.684093	1.965775	-1.232622	-0.208038	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	0.753074	-0.822843	0.538196	1.345852	-1.119670	0.175121	-0.451449	-0.237033	-0.038195	0.803487	0.408542	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

The function below is used to calculate the various evaluation metrics including area under the ROC curve, the area under the precision-recall curve,-f1-score etc.

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support
from numpy import trapz
from scipy.integrate import simps
from sklearn.metrics import f1_score

def Evaluate(labels, predictions, p=0.5):
    CM= confusion_matrix(labels, predictions > p)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    print('Legitimate Transactions Detected (True Negatives): {}'.format(TN))
    print('Fraudulent Transactions Missed (False Negatives):  {}'.format(FN))
    print('Fraudulent Transactions Detected (True Positives): {}'.format(TP))
    print('Legitimate Transactions Incorrectly Detected (False Positives):{}'.format(FP))
    print('Total Fraudulent Transactions: ', np.sum(CM[1]))
    auc = roc_auc_score(labels, predictions)
    prec=precision_score(labels, predictions>0.5)
    rec=recall_score(labels, predictions>0.5)
     # calculate F1 score
    f1 = f1_score(labels, predictions>p)
    print('auc :{}'.format(auc))
    print('precision :{}'.format(prec))
    print('recall :{}'.format(rec))
    print('f1 :{}'.format(f1))
    # Compute Precision-Recall and plot curve
    precision, recall, thresholds = precision_recall_curve(labels, predictions >0.5)
    #use the trapezoidal rule to calculate the area under the precion-recall curve
    area =  trapz(recall, precision)
   
    #area =  simps(recall, precision)
    print("Area Under Precision Recall  Curve(AP): %0.4f" % area)   #should be same as AP?    

from sklearn.metrics import auc

We will attempt to investigate the performance of several ML algorithms on imbalanced target class data classification. The xgboost algorithm will be used to model data that is undersampled, no sample at all, Synthetic Minority Oversampling Technique and also modified to perform a cost sensitive learning. The other ML algorithms which will be tested include Forest of Randomized Trees, RusBoost, EasyEnsemble and Bagging classifier. The cost-sensitive xgboost method will invlove experimental determining the optimal weight on the minority target class using bayesian optimization.

XGBoost No Weights

The first model considered here is an xgboost with default hyperparameter values with no sampling of data.

xgb_no_weights = clf.fit(X=train_x,  y=train_y)
#xgb_no_weights_pred = xgb_no_weights.predict_proba(test_x)
xgb_no_weights_pred = xgb_no_weights.predict_proba(test_x)[:,1]
dump(xgb_no_weights, '/content/drive/My Drive/ImbalancedData/xgb_no_weights.joblib')
Evaluate(labels=test_y, predictions=xgb_no_weights_pred, p=0.5)

Legitimate Transactions Detected (True Negatives): 56854
Fraudulent Transactions Missed (False Negatives):  26
Fraudulent Transactions Detected (True Positives): 79
Legitimate Transactions Incorrectly Detected (False Positives):3
Total Fraudulent Transactions:  105
auc :0.9777749860343032
precision :0.9634146341463414
recall :0.7523809523809524
f1 :0.8449197860962566
Area Under Precision Recall  Curve(AP): 0.8563

xgb_no_weights  =  load( '/content/drive/My Drive/ImbalancedData/xgb_no_weights.joblib')
xgb_no_weights_pred = xgb_no_weights.predict_proba(test_x)[:,1]
#Evaluate(labels=test_y, predictions=xgb_no_weights_pred, p=0.5)

Convert these vectors from python to R vectors. This will alow to use the R library for evaluating ML models /MLmetrics to be used in finding area under the precision-recall curve.

 #%R -i  test_y

#%%R -i xgb_no_weights_pred

#library( MLmetrics)
#library(yardstick)
#library(mltools)
#library("glue")
#MLmetrics::AUC(xgb_no_weights_pred,test_y)
#d= data.frame(pred=xgb_no_weights_pred[,2],truth=as.factor(test_y))
#glue("Test Set : Area Under Precision-Recall Curve: {yardstick::pr_auc(d, truth, pred)}")
#glue("Test Set : Area Under Precision-Recall Curve: {MLmetrics::PRAUC(as.vector(xgb_no_weights_pred),test_y)}")
#head(d)

dump(xgb_no_weights, '/content/drive/My Drive/ImbalancedData/xgb_no_weights.joblib') 

['/content/drive/My Drive/ImbalancedData/xgb_no_weights.joblib']

Model one : XGBoost with Weights on Label/ No Sampling

ns_model = bayessearch.fit(X=train_x,  y=train_y)

os.getcwd()
#ns_model
dump(ns_model, '/content/drive/My Drive/ImbalancedData/ns_model.joblib') 

['/content/drive/My Drive/ImbalancedData/ns_model.joblib']

#dump(ns_model, '/content/drive/My Drive/ImbalancedData/ns_model.joblib') 

ns_model = load('/content/drive/My Drive/ImbalancedData/ns_model.joblib') 

ns_model_pred = ns_model.predict_proba(test_x)[:,1]

Evaluate(labels=test_y, predictions=ns_model_pred, p=0.5)

Legitimate Transactions Detected (True Negatives): 56839
Fraudulent Transactions Missed (False Negatives):  15
Fraudulent Transactions Detected (True Positives): 90
Legitimate Transactions Incorrectly Detected (False Positives):18
Total Fraudulent Transactions:  105
auc :0.9961524191434317
precision :0.8333333333333334
recall :0.8571428571428571
f1 :0.8450704225352113
Area Under PR Curve(AP): 0.8435

XGBoost with Undersampling

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE

rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X=train_x,  y=train_y)
print('Resampled dataset shape %s' % Counter(y_rus))

xgb_model_rus = clf.fit(X=X_rus, y=y_rus)                             

dump(xgb_model_rus, '/content/drive/My Drive/ImbalancedData/xgb_model_rus.joblib')

Resampled dataset shape Counter({0: 402, 1: 402})

['/content/drive/My Drive/ImbalancedData/xgb_model_rus.joblib']

xgb_model_rus = load('/content/drive/My Drive/ImbalancedData/xgb_model_rus.joblib') 

p=0.5
xgb_model_rus_pred  =  xgb_model_rus.predict_proba(test_x.values)[:,1]
#predict_proba(test_x)[:,1]

print(" balanced_accuracy_score {}".format(balanced_accuracy_score(test_y, xgb_model_rus_pred>p) ))

cm=confusion_matrix(test_y, xgb_model_rus_pred>0.5)

print("confusion matrix : {}".format(cm))
#test_x.columns
 #xgb_model_rus.

Evaluate(labels=test_y, predictions=xgb_model_rus_pred, p=0.5)

import collections

counter=collections.Counter(xgb_model_rus_pred>0.5)
print(counter)

 balanced_accuracy_score 0.9634347489985318
confusion matrix : [[54865  1992]
 [    4   101]]
Legitimate Transactions Detected (True Negatives): 54865
Fraudulent Transactions Missed (False Negatives):  4
Fraudulent Transactions Detected (True Positives): 101
Legitimate Transactions Incorrectly Detected (False Positives):1992
Total Fraudulent Transactions:  105
auc :0.9926581055061278
precision :0.0482560917343526
recall :0.9619047619047619
f1 :0.09190172884440401
Area Under PR Curve(AP): 0.5033
Counter({False: 54869, True: 2093})

print("Classification Report")
print(classification_report(test_y, xgb_model_rus_pred > p))

# ROC curve and Area-Under-Curve (AUC)
#calculating accuracy
accuracy_xgbm_sm= accuracy_score(test_y, xgb_model_rus_pred>p)

print('accuracy score : {:0.3f}'.format( accuracy_xgbm_sm))

roc_auc_sm = roc_auc_score(test_y, xgb_model_rus_pred)

print('roc score : {:0.3f}'.format( roc_auc_sm))

Classification Report
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56857
           1       0.05      0.96      0.09       105

    accuracy                           0.96     56962
   macro avg       0.52      0.96      0.54     56962
weighted avg       1.00      0.96      0.98     56962

accuracy score : 0.965
roc score : 0.993

SMOTE

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE # doctest: +NORMALIZE_WHITESPACE

print('Original dataset shape %s' % Counter(train_y))

sm = SMOTE(random_state=42)

X_res, y_res = sm.fit_resample(X=train_x,  y=train_y)
print('Resampled dataset shape %s' % Counter(y_res))

xgb_model_sm= clf.fit(X=X_res, y=y_res)
xgb_model_sm_pred = xgb_model_sm.predict(test_x.values)
#iba(y_test, y_pred)
#balanced_accuracy_score(y_test, y_pred) 

cm=confusion_matrix(test_y, xgb_model_sm_pred)

print("confusion matrix")
print(cm)
print("Classification Report")
print(classification_report(test_y, xgb_model_sm_pred))

# ROC curve and Area-Under-Curve (AUC)
#calculating accuracy
accuracy_xgbm_sm= accuracy_score(test_y, xgb_model_sm_pred)
print('accuracy score : {:0.3f}'.format( accuracy_xgbm_sm))
roc_auc_sm = roc_auc_score(test_y, xgb_model_sm_pred)
print('roc score : {:0.3f}'.format( roc_auc_sm))

dump(xgb_model_sm, '/content/drive/My Drive/ImbalancedData/xgb_model_sm.joblib') 

xgb_model_sm = load('/content/drive/My Drive/ImbalancedData/xgb_model_sm.joblib') 

xgb_model_sm_pred  =  xgb_model_sm.predict_proba(test_x.values)[:,1]
Evaluate(labels=test_y, predictions=xgb_model_sm_pred, p=0.5)

Legitimate Transactions Detected (True Negatives): 56306
Fraudulent Transactions Missed (False Negatives):  6
Fraudulent Transactions Detected (True Positives): 99
Legitimate Transactions Incorrectly Detected (False Positives):551
Total Fraudulent Transactions:  105
auc :0.9948591160614306
precision :0.1523076923076923
recall :0.9428571428571428
f1 :0.2622516556291391
Area Under PR Curve(AP): 0.5458

Forest of randomized trees

BalancedRandomForestClassifier is another ensemble method in which each tree of the forest will be provided a balanced bootstrap sample. This class provides all functionality of the sklearn.ensemble.RandomForestClassifier and notably the feature_importances_ attributes:

from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
brf.fit(train_x,  train_y )

brf_pred = brf.predict(test_x)
balanced_accuracy_score(test_y, brf_pred)  

#brf.feature_importances

dump(brf, '/content/drive/My Drive/ImbalancedData/brf.joblib')

brf = load('/content/drive/My Drive/ImbalancedData/brf.joblib') 

brf_pred  =  brf.predict_proba(test_x.values)[:,1]
Evaluate(labels=test_y, predictions= brf_pred, p=0.5)

Legitimate Transactions Detected (True Negatives): 55509
Fraudulent Transactions Missed (False Negatives):  2
Fraudulent Transactions Detected (True Positives): 103
Legitimate Transactions Incorrectly Detected (False Positives):1348
Total Fraudulent Transactions:  105
auc :0.9878704887868227
precision :0.0709855272226051
recall :0.9809523809523809
f1 :0.13239074550128535
Area Under PR Curve(AP): 0.5241

RusBoost

from imblearn.ensemble import RUSBoostClassifier
from sklearn.datasets import make_classification


rbt = RUSBoostClassifier(random_state=0,
                               base_estimator=DecisionTreeClassifier(),
                             sampling_strategy='auto')
  
# Fit the grid search to the data
rbt.fit(X=train_x,  y=train_y)
  
dump(rbt, '/content/drive/My Drive/ImbalancedData/rbt.joblib')

rbt = load('/content/drive/My Drive/ImbalancedData/rbt.joblib') 

rbt_pred  =  rbt.predict_proba(test_x.values)[:,1]
Evaluate(labels=test_y, predictions= rbt_pred, p=0.5)

Legitimate Transactions Detected (True Negatives): 55970
Fraudulent Transactions Missed (False Negatives):  5
Fraudulent Transactions Detected (True Positives): 100
Legitimate Transactions Incorrectly Detected (False Positives):887
Total Fraudulent Transactions:  105
auc :0.9918764452507
precision :0.10131712259371833
recall :0.9523809523809523
f1 :0.18315018315018314
Area Under PR Curve(AP): 0.5250

EasyEnsembleClassifier

A specific method which uses AdaBoost as learners in the bagging classifier is called EasyEnsemble. The EasyEnsembleClassifier allows to bag AdaBoost learners which are trained on balanced bootstrap sample. Similarly to the BalancedBaggingClassifier API, one can construct the ensemble as:

from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=0,
                             base_estimator=AdaBoostClassifier(),
                            sampling_strategy='auto')

eec.fit(X=train_x,  y=train_y)

dump(eec, '/content/drive/My Drive/ImbalancedData/eec.joblib')

eec = load('/content/drive/My Drive/ImbalancedData/eec.joblib') 

eec_pred  =  eec.predict_proba(test_x.values)[:,1]
Evaluate(labels=test_y, predictions= eec_pred, p=0.5)

Legitimate Transactions Detected (True Negatives): 54565
Fraudulent Transactions Missed (False Negatives):  3
Fraudulent Transactions Detected (True Positives): 102
Legitimate Transactions Incorrectly Detected (False Positives):2292
Total Fraudulent Transactions:  105
auc :0.9912644671636528
precision :0.042606516290726815
recall :0.9714285714285714
f1 :0.08163265306122448
Area Under PR Curve(AP): 0.5052

Bagging classifier

BalancedBaggingClassifier allows to resample each subset of data before to train each estimator of the ensemble. In short, it combines the output of an EasyEnsemble sampler with an ensemble of classifiers (i.e. BaggingClassifier). Therefore, BalancedBaggingClassifier takes the same parameters than the scikit-learn BaggingClassifier. Additionally, there is two additional parameters, sampling_strategy and replacement to control the behaviour of the random under-sampler:

from imblearn.ensemble import BalancedBaggingClassifier

from imblearn.ensemble import BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                 replacement=False,
                                n_jobs=-1,
                               random_state=0)


bbc.fit(X=train_x,  y=train_y)
dump(bbc, '/content/drive/My Drive/ImbalancedData/bbc.joblib')

bbc_model = load('/content/drive/My Drive/ImbalancedData/bbc.joblib') 
bbc_pred  =  bbc_model.predict_proba(test_x.values)[:,1]
Evaluate(labels=test_y, predictions= bbc_pred, p=0.5)

Legitimate Transactions Detected (True Negatives): 55701
Fraudulent Transactions Missed (False Negatives):  5
Fraudulent Transactions Detected (True Positives): 100
Legitimate Transactions Incorrectly Detected (False Positives):1156
Total Fraudulent Transactions:  105
auc :0.9845647853386567
precision :0.07961783439490445
recall :0.9523809523809523
f1 :0.1469507714915503
Area Under PR Curve(AP): 0.5142

#pred_df=pd.DataFrame()
pred_df = pd.DataFrame(test_y,index=None)
pred_df['baggingclassifier_pred'] =bbc_pred
pred_df['easyensemble_pred'] = eec_pred
pred_df['RusBoost_pred']  =  rbt_pred
pred_df['forest_r_t']  =  brf_pred 
pred_df['xgb_rus_pred'] = xgb_model_rus_pred
pred_df['xgb_smote_pred']  =  xgb_model_sm_pred
pred_df['xgboost_weights'] =  ns_model_pred
#pred_df["test_y"]    = test_y
pred_df["xgb_no_weights_pred"] = xgb_no_weights_pred

pred_df.to_csv('/content/drive/My Drive/ImbalancedData/pred_df.csv')

#
#pred_df.head()
#pred_df.test_y.isna().sum()
#test_y
#test_y.isna().sum()

Plot the AUC ROC

mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

from sklearn.metrics import roc_curve

def plot_roc(name, labels, predictions, p=0.5, **kwargs):
  fp, tp, _ = sklearn.metrics.roc_curve(labels, predictions)

  plt.plot(100*fp, 100*tp, label=name, linewidth=2, **kwargs)
  plt.xlabel('False positives [%]')
  plt.ylabel('True positives [%]')
  plt.xlim([-0.5,80])
  plt.title('Area Under ROC Curve @{:.2f}'.format(p))
  plt.ylim([20,100.5])
  plt.grid(True)
  ax = plt.gca()
  ax.set_aspect('equal')

#%matplotlib inline
sns.set_style("whitegrid")
plot_roc("xgboost No Weight", test_y, xgb_no_weights_pred, color=colors[0],linestyle='--')
plot_roc("Xgboost Weight", test_y ,ns_model_pred, color=colors[1])
plot_roc("Xgboost Under-Sampling", test_y, xgb_model_rus_pred, color=colors[2])
plot_roc("Xgboost Smote", test_y ,xgb_model_sm_pred, color=colors[3])
plot_roc("Forest of Randomized Trees", test_y ,brf_pred, color=colors[4])
plot_roc("RusBoost", test_y ,rbt_pred, color=colors[5])
plot_roc("EasyEnsemble Classifier", test_y ,eec_pred, color=colors[6])
plot_roc("Bagging Classifier", test_y ,bbc_pred, color=colors[7])
plt.legend(loc='lower right')
plt.savefig('/content/drive/My Drive/ImbalancedData/all_rocauc.png')

png

Plot the Area Under Orecision Recall Curve

from sklearn.metrics import precision_recall_curve

def plot_auc_pr(name, labels, predictions,n=0.5, **kwargs):
  p, r, _ = sklearn.metrics.precision_recall_curve(labels, predictions)

  plt.plot(100*r, 100*p, label=name, linewidth=2, **kwargs)
  plt.xlabel('Recall [%]')
  plt.ylabel('Precision [%]')
  plt.xlim([-0.5,100])
  plt.title('Area Under Precision-Recall Curve @{:.2f}'.format(n))
  #plt.title('Area Under Precision-Recall Curve: {}' .format(p))
  plt.ylim([20,100.5])
  plt.grid(True)
  ax = plt.gca()
  ax.set_aspect('equal')

#plot_auc_pr("Train  Weight", train_labels, train_predictions_weight, color=colors[1])
sns.set_style("whitegrid")
plot_auc_pr("xgboost No Weight", test_y, xgb_no_weights_pred, color=colors[0],linestyle='--')
plot_auc_pr("Xgboost Weight", test_y ,ns_model_pred, color=colors[1])
plot_auc_pr("Xgboost Under-Sampling", test_y, xgb_model_rus_pred, color=colors[2])
plot_auc_pr("Xgboost Smote", test_y ,xgb_model_sm_pred, color=colors[3])
plot_auc_pr("Forest of Randomized Trees", test_y ,brf_pred, color=colors[4])
plot_auc_pr("RusBoost", test_y ,rbt_pred, color=colors[5])
plot_auc_pr("EasyEnsemble Classifier", test_y ,eec_pred, color=colors[6])
plot_auc_pr("Bagging Classifier", test_y ,bbc_pred, color=colors[7])
plt.legend(loc='lower left')
plt.savefig('/content/drive/My Drive/ImbalancedData/auc_pr2.png')

png

Cost-Sensitive Logistic Regression

Logistic Regression is a well known statistical model for modeling binary target values that is often overlooked. It will be interesting to see how it performs when presented with an imbalanced target class. It can be modified to perform a Cost-sensitive learning with imbaalanced data.

from sklearn.linear_model import ElasticNet
from sklearn import linear_model

elasticreg = linear_model.SGDClassifier( tol=1e-3,
                                class_weight='balanced',
                                 max_iter = int(1e4), 
                                 warm_start = True, 
                                 n_jobs = -1)

plt.style.use('ggplot')
sns.set_style("whitegrid")

threshhold=0.5
fig = plt.figure(figsize=(15,8))
ax1 = fig.add_subplot(1,2,1)
ax1.set_xlim([0,100])
ax1.set_ylim([0,100])
ax1.set_xlabel('Recall')
ax1.set_ylabel('Precision')
ax1.set_title('Area Under Precision-Recall Curve @{:.2f}'.format(threshhold))

ax2 = fig.add_subplot(1,2,2)
ax2.set_xlim([-0.5,30])
ax2.set_ylim([80,100])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Area Under ROC Curve @{:.2f}'.format(threshhold))



rocauc_vector= []
f1_vector= []
prec_vector= []
rec_vector= []
#cfn_matrix_  =  np.zeros((8, 4))
cfn_matrix_ =[]
pr_auc_vector =[]

for w,k in zip([1,5,10,20,50,100,500,10000],'bgrcmykw'):
    lr_model = LogisticRegression(class_weight={0:1,1:w})
    lr_model.fit(train_x,train_y) 
    
    threshhold=0.5
    pred_prob = lr_model.predict_proba(test_x)[:,1]

    p,r,_ = precision_recall_curve(test_y,pred_prob)
    tpr,fpr,_ = roc_curve(test_y,pred_prob)
    auc=     roc_auc_score(test_y,pred_prob)
    f1 =     f1_score(test_y,pred_prob >threshhold )
    #f1 = f1_score(labels, predictions> threshhold)
    prec=precision_score(test_y,pred_prob>threshhold)
    rec=recall_score(test_y,pred_prob > threshhold)
    cfn_matrix = confusion_matrix(test_y,pred_prob > threshhold)
    rocauc_vector.append(auc)
    f1_vector.append(f1)
    prec_vector.append(prec)
    rec_vector.append(rec)
    #cfn_matrix_[w,:] = cfn_matrix.flatten()
    cfn_matrix_.append(cfn_matrix)
    precision, recall, thresholds = precision_recall_curve(test_y, pred_prob > threshhold)
    #use the trapezoidal rule to calculate the area under the precion-recall curve
    area =  trapz(recall, precision)
    pr_auc_vector.append(area)
    ax1.plot(r*100,p*100,c=k,label=w)
    ax2.plot(tpr*100,fpr*100,c=k,label=w)
    #plt.xlim([-0.5,30])
    #plt.title('Area Under ROC Curve @{:.2f}'.format(p))
    #plt.ylim([80,100.5])
ax1.legend(loc='lower left')    
ax2.legend(loc='lower right')
plt.savefig('/content/drive/My Drive/ImbalancedData/logistic.png')
plt.show()

png

results=pd.DataFrame(list(zip([1,5,10,20,50,100,500,10000],rocauc_vector,f1_vector,prec_vector,rec_vector)))
results.columns = ['Weight','ROC_AUC','F-Score','Precision','Recall']
results['TP'] = [66,83,89,92,95,96,101,108]
results['TN'] = [56846,56831,56829,56805,56750,56613,55668,40576]
results['FP'] = [6,21,23,47,102,239,1184,16276]
results['FN'] = [46,27,21,18,15,14,9,2]
results['PR_AUC'] = pr_auc_vector
results

	Weight	ROC_AUC	F-Score	Precision	Recall	TP	TN	FP	FN	PR_AUC
0	1	0.959148	0.725275	0.916667	0.600000	66	56846	6	46	0.756788
1	5	0.976090	0.775701	0.798077	0.754545	83	56831	21	27	0.774617
2	10	0.978801	0.801802	0.794643	0.809091	89	56829	23	21	0.800120
3	20	0.982562	0.738956	0.661871	0.836364	92	56805	47	18	0.747344
4	50	0.985512	0.618893	0.482234	0.863636	95	56750	102	15	0.671135
5	100	0.986236	0.431461	0.286567	0.872727	96	56613	239	14	0.577839
6	500	0.988871	0.144803	0.078599	0.918182	101	55668	1184	9	0.496538
7	10000	0.986018	0.013096	0.006592	0.981818	108	40576	16276	2	0.492291

Cost-sensitive logistic regression with a weight of 10 on the minority class performs well in comparison to other weights. It has a high ROC-AUC and area under the precision-recall curve.

Tags: xgboost imbalanced-learn scikit-optimize Forest of Randomized Trees Under-Sampling Bagging Classifier

Imbalanced Machine Learning For Fraud Detection

cost-sensitive xgboost,RusBoost,Smote,EasyEnsemble,cost-sensitive logistic regression

Introduction

Description of Data.

Build Pandas -Profiling Report

Overview

Variables

Correlations

Sample

XGBoost No Weights

Model one : XGBoost with Weights on Label/ No Sampling

XGBoost with Undersampling

SMOTE

Forest of randomized trees

RusBoost

EasyEnsembleClassifier

Bagging classifier

Plot the AUC ROC

Plot the Area Under Orecision Recall Curve

Cost-Sensitive Logistic Regression