Introduction

Data with an imbalanced target class occurs frequently in several domains, such as credit card fraud detection, insurance claim prediction, email spam detection, anomaly detection and outlier detection. Financial institutions lose millions of dollars every year to fraudulent financial transactions. It is important that these institutions are able to identify fraud, both to protect their customers and to reduce the financial losses caused by fraudsters. The goal here is to predict fraudulent transactions in order to minimize losses to financial companies. For machine learning on data with imbalanced target classes, the appropriate model evaluation metrics are the AUC (the area under the ROC curve) and the area under the precision-recall curve. The accuracy metric is not useful in these situations, since the proportion of the positive class is usually so low that even a naive classifier that predicts every transaction as legitimate achieves a high accuracy. For example, in the dataset considered here the proportion of negative examples is over 99%, so a naive classifier that predicts all transactions as legitimate would be over 99% accurate.

The packages installed below will be necessary for some of the analysis later in this project.

!pip uninstall scikit-learn # until no more scikit-learn is present
!pip install scikit-learn
!pip install scikit-optimize
!pip install skll
!pip install imbalanced-learn
!pip install eli5
!pip install scipy
!pip install scikit-optimize
# activate R magic to run R in google colab notebook
import rpy2
%load_ext rpy2.ipython
#%%R

#install.packages("MLmetrics")
#install.packages("yardstick")
#install.packages("mltools")
#install.packages("glue")
%tensorflow_version 2.x
import numpy as np
import pandas as pd
import io
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns; sns.set(style="ticks", color_codes=True)
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn import feature_selection
#from sklearn.preprocessing import Imputer
from sklearn.model_selection import  cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from skopt.space import Real, Categorical, Integer
#from skll.metrics import spearman
from scipy.stats import kendalltau, spearmanr, pearsonr
from skopt import BayesSearchCV
from sklearn.model_selection import  cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import classification_report
from sklearn.base import TransformerMixin
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
# Raise the pandas display limits (rows, columns and line width).
for option_name, option_value in (
    ('display.max_rows', 600),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)
import warnings
import pandas_profiling
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder,  StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_curve, roc_auc_score,balanced_accuracy_score
from sklearn.svm import SVC
import random
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import *
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
#import smote
import os
from sklearn.tree import DecisionTreeClassifier
from imblearn.metrics import geometric_mean_score as gmean
from imblearn.metrics import make_index_balanced_accuracy as iba
from imblearn.metrics import *
from eli5.sklearn import PermutationImportance
from eli5.sklearn import *
import eli5
from eli5.permutation_importance import get_score_importances
#import rus
# Skopt functions
from skopt import BayesSearchCV
from skopt import gp_minimize # Bayesian optimization using Gaussian Processes
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt.callbacks import DeadlineStopper # Stop the optimization before running out of a fixed budget of time.
from skopt.callbacks import VerboseCallback # Callback to control the verbosity
from skopt.callbacks import DeltaXStopper # Stop the optimization If the last two positions at which the objective has been evaluated are less than delta
from joblib import dump, load
from prettytable import PrettyTable
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
import tensorflow as tf
warnings.filterwarnings("ignore")
%matplotlib inline
#specify tensorflow version to use
%tensorflow_version 2.x
#load tensorboard
#%load_ext tensorboard
 #%tensorboard --logdir logs

%autosave 5
Autosaving every 5 seconds
# Show the installed NumPy version string.
np.__version__
'1.18.2'

from sklearn.impute import SimpleImputer

# Imputer configured to fill NaN entries using the 'mean' strategy.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

Description of Data.

The dataset can be found on Kaggle. The link to it is here. The dataset contains transactions made by credit cards in September 2013 by European cardholders. It presents transactions that occurred over two days, in which there are 492 frauds out of 284,807 transactions. The dataset is highly imbalanced: the positive class (frauds) accounts for 0.172% of all transactions.

It contains only numerical input variables, which are the result of a PCA transformation. This was done to preserve the identity and privacy of the people whose transactions this data was gathered from. Features V1, V2, … V28 are the principal components obtained with PCA; the only features which have not been transformed with PCA are ‘Time’ and ‘Amount’. Feature ‘Time’ contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature ‘Amount’ is the transaction amount; this feature can be used, for example, for example-dependent cost-sensitive learning. Feature ‘Class’ is the response variable, and it takes value 1 in case of fraud and 0 otherwise.

# NOTE(review): `file` is not used in the visible part of the notebook --
# confirm against later cells before removing it.
file = tf.keras.utils

# Read the hosted credit-card CSV straight into a DataFrame and preview it.
CREDITCARD_CSV_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv'
df = pd.read_csv(CREDITCARD_CSV_URL)
df.head()
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0
# Summary statistics for a subset of the columns, one row per column.
summary_columns = [
    'Time', 'V1', 'V2', 'V3', 'V4', 'V5',
    'V26', 'V27', 'V28', 'Amount', 'Class',
]
df[summary_columns].describe().T
count mean std min 25% 50% 75% max
Time 284807.0 9.481386e+04 47488.145955 0.000000 54201.500000 84692.000000 139320.500000 172792.000000
V1 284807.0 3.919560e-15 1.958696 -56.407510 -0.920373 0.018109 1.315642 2.454930
V2 284807.0 5.688174e-16 1.651309 -72.715728 -0.598550 0.065486 0.803724 22.057729
V3 284807.0 -8.769071e-15 1.516255 -48.325589 -0.890365 0.179846 1.027196 9.382558
V4 284807.0 2.782312e-15 1.415869 -5.683171 -0.848640 -0.019847 0.743341 16.875344
V5 284807.0 -1.552563e-15 1.380247 -113.743307 -0.691597 -0.054336 0.611926 34.801666
V26 284807.0 1.699104e-15 0.482227 -2.604551 -0.326984 -0.052139 0.240952 3.517346
V27 284807.0 -3.660161e-16 0.403632 -22.565679 -0.070840 0.001342 0.091045 31.612198
V28 284807.0 -1.206049e-16 0.330083 -15.430084 -0.052960 0.011244 0.078280 33.847808
Amount 284807.0 8.834962e+01 250.120109 0.000000 5.600000 22.000000 77.165000 25691.160000
Class 284807.0 1.727486e-03 0.041527 0.000000 0.000000 0.000000 0.000000 1.000000

We can see that the target class is highly imbalanced. The minority class is about 0.17% of the target examples.

# Percentage of examples carrying each class label.
class_percentages = df['Class'].value_counts(normalize=True).mul(100)
class_percentages
0    99.827251
1     0.172749
Name: Class, dtype: float64
# Look the counts up by class label (0 = legitimate, 1 = fraud) so the result
# does not depend on the frequency ordering returned by value_counts().
class_counts = df['Class'].value_counts()
neg = class_counts.loc[0]
pos = class_counts.loc[1]
total = neg + pos

# Each template receives exactly the values its placeholders consume.
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n  '.format(
    total, pos, 100 * pos / total))

print('Total: {}\n    Negative: {} ({:.2f}% of total)\n  '.format(
    total, neg, 100 * neg / total))
Examples:
    Total: 284807
    Positive: 492 (0.17% of total)
  
Total: 284807
    Negative: 284315 (99.83% of total)
#x = raw_df.drop(['Time'],axis=1)

# Use a utility from sklearn to split and shuffle our dataset.
# `stratify` keeps the fraud rate the same in both partitions (the positive
# class is only ~0.17% of the rows), and `random_state` makes the split
# reproducible between runs.
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['Class'], random_state=0)
#train_df, val_df = train_test_split(train_df, test_size=0.2)

# Features: every column except the elapsed-time column and the label.
non_feature_columns = ['Time', 'Class']
train_x = train_df.drop(non_feature_columns, axis=1)
test_x = test_df.drop(non_feature_columns, axis=1)
#val_x  =  val_df.drop(['Time'],axis=1)

# Labels: the binary 'Class' column.
train_y = train_df.Class
test_y = test_df.Class
#val_y  = val_df.Class

print('Traing dataset size:{}'.format(train_x.shape))
print('Test dataset size:{}'.format(test_x.shape))
#print('Validation dataset size: {}'.format(val_df.shape))
Traing dataset size:(227845, 29)
Test dataset size:(56962, 29)
#train_x.columns
#test_x.columns
test_y.isna().sum()
0

The first model considered here is the extreme gradient boosting (XGBoost) algorithm, which is popular for modeling tabular data. The hyperparameters of the model are set to their defaults, except scale_pos_weight, which is tuned in the case of cost-sensitive XGBoost to find the weight that optimizes the model. The hyperparameter values are left at their defaults to allow for a fair comparison among the machine learning algorithms used in this analysis. The hyperparameter tuning is done by Bayesian optimization using the scikit-optimize package.

# 5-fold stratified cross-validation; shuffle=True with a fixed seed keeps the
# fold assignment reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# XGBoost classifier; hyperparameters not listed here keep their defaults.
# `verbosity=0` is the supported replacement for the deprecated `silent=1`.
clf = xgb.XGBClassifier(
        n_jobs=-1,
        objective='binary:logistic',
        verbosity=0,
        tree_method='approx')

# Only the positive-class weight is searched. The remaining candidates are
# kept commented out for reference.
search_spaces = {
    # 'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    # 'min_child_weight': Integer(0, 10),
    # 'max_depth': Integer(1, 50),
    # 'max_delta_step': Integer(0, 20), # Maximum delta step we allow each leaf output
    # 'subsample': Real(0.01, 1.0, 'uniform'),
    # 'colsample_bytree': Real(0.01, 1.0, 'uniform'), # subsample ratio of columns by tree
    # 'colsample_bylevel': Real(0.01, 1.0, 'uniform'), # subsample ratio by level in trees
    # 'reg_lambda': Real(1e-9, 1000, 'log-uniform'), # L2 regularization
    # 'reg_alpha': Real(1e-9, 1.0, 'log-uniform'), # L1 regularization
    # 'gamma': Real(1e-9, 0.5, 'log-uniform'), # Minimum loss reduction for partition/pruning parameter
    # 'n_estimators': Integer(50, 100),
    'scale_pos_weight': Real(1e-6, 2000, 'log-uniform'),
}

# Bayesian search over `search_spaces`, scored by ROC AUC on the stratified
# folds, using a Gaussian-process ('GP') base estimator for the optimizer.
bayessearch = BayesSearchCV(clf,
                    search_spaces,
                    scoring='roc_auc', #f1
                    cv=skf,
                    n_iter=40,
                    n_jobs=-1,
                    return_train_score=False,
                    #refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=22)
    

#xgbm_model = bayessearch.fit(X=train_x,  y=train_y)

from google.colab import drive

# Mount Google Drive at the path used by later cells.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
#share_link="https://drive.google.com/file/d/1mGzO4-vaTKVgH5zzzXCVcXNbj-Bt8u8g/view?usp=sharing"
# Show the notebook's current working directory.
import os 
os.getcwd()
#!files.os.listdir()
'/content'

Build pandas-profiling Report

The exploratory analysis of the features in the dataset can be automated with the ProfileReport class of the pandas-profiling package. It generates exploratory plots of the features in the dataset that is passed to it.

# Build the profiling report a single time, save it to Drive as HTML, then
# leave it as the cell's last expression so it is also rendered inline.
pfr = pandas_profiling.ProfileReport(df)

pfr.to_file("/content/drive/My Drive/profilingReport2.html")
pfr

Overview

Dataset info

Number of variables 31
Number of observations 284807
Total Missing (%) 0.0%
Total size in memory 67.4 MiB
Average record size in memory 248.0 B

Variables types

Numeric 30
Categorical 0
Boolean 1
Date 0
Text (Unique) 0
Rejected 0
Unsupported 0

Warnings

  • Dataset has 1081 duplicate rows Warning

Variables

Time
Numeric

Distinct count 124592
Unique (%) 43.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 94814
Minimum 0
Maximum 172790
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 25298
Q1 54202
Median 84692
Q3 139320
95-th percentile 164140
Maximum 172790
Range 172790
Interquartile range 85119

Descriptive statistics

Standard deviation 47488
Coef of variation 0.50086
Kurtosis -1.2935
Mean 94814
MAD 42796
Skewness -0.035568
Sum 27004000000
Variance 2255100000
Memory size 2.2 MiB
Value Count Frequency (%)  
163152.0 36 0.0%
 
64947.0 26 0.0%
 
68780.0 25 0.0%
 
3767.0 21 0.0%
 
3770.0 20 0.0%
 
128860.0 19 0.0%
 
19912.0 19 0.0%
 
3750.0 19 0.0%
 
140347.0 19 0.0%
 
143083.0 18 0.0%
 
Other values (124582) 284585 99.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 2 0.0%
 
1.0 2 0.0%
 
2.0 2 0.0%
 
4.0 1 0.0%
 
7.0 2 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
172785.0 1 0.0%
 
172786.0 1 0.0%
 
172787.0 1 0.0%
 
172788.0 2 0.0%
 
172792.0 1 0.0%
 

V1
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.9196e-15
Minimum -56.408
Maximum 2.4549
Zeros (%) 0.0%

Quantile statistics

Minimum -56.408
5-th percentile -2.8991
Q1 -0.92037
Median 0.018109
Q3 1.3156
95-th percentile 2.0812
Maximum 2.4549
Range 58.862
Interquartile range 2.236

Descriptive statistics

Standard deviation 1.9587
Coef of variation 499720000000000
Kurtosis 32.487
Mean 3.9196e-15
MAD 1.4116
Skewness -3.2807
Sum 3.3208e-1
Variance 3.8365
Memory size 2.2 MiB
Value Count Frequency (%)  
2.0557970063003896 77 0.0%
 
1.24567381944824 77 0.0%
 
2.0533112135278504 62 0.0%
 
1.30237796508637 60 0.0%
 
2.04021105776632 53 0.0%
 
2.08517487552541 48 0.0%
 
1.33284931179458 45 0.0%
 
1.01841181981555 40 0.0%
 
1.33505315377059 39 0.0%
 
1.3154041716379299 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-56.407509631329 1 0.0%
 
-46.85504720078179 1 0.0%
 
-41.9287375244141 1 0.0%
 
-40.4701418378475 1 0.0%
 
-40.0425374953845 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2.4305067805687406 1 0.0%
 
2.43920748106102 1 0.0%
 
2.44650498499596 1 0.0%
 
2.4518884899535895 1 0.0%
 
2.45492999121121 1 0.0%
 

V2
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 5.6882e-16
Minimum -72.716
Maximum 22.058
Zeros (%) 0.0%

Quantile statistics

Minimum -72.716
5-th percentile -1.972
Q1 -0.59855
Median 0.065486
Q3 0.80372
95-th percentile 1.8086
Maximum 22.058
Range 94.773
Interquartile range 1.4023

Descriptive statistics

Standard deviation 1.6513
Coef of variation 2903100000000000
Kurtosis 95.773
Mean 5.6882e-16
MAD 0.97384
Skewness -4.6249
Sum 9.7316e-11
Variance 2.7268
Memory size 2.2 MiB
Value Count Frequency (%)  
0.166975019545401 77 0.0%
 
-0.32666777306077005 77 0.0%
 
0.08973464781763099 62 0.0%
 
-0.606529308236609 60 0.0%
 
-0.146974974784838 53 0.0%
 
0.39305057772255 48 0.0%
 
0.38919824918427603 45 0.0%
 
1.03666300867632 40 0.0%
 
0.331464026372479 39 0.0%
 
0.44747360617094895 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-72.7157275629303 1 0.0%
 
-63.3446983175027 1 0.0%
 
-60.4646176556493 1 0.0%
 
-50.3832691251379 1 0.0%
 
-48.060856024869395 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
18.1836264596211 1 0.0%
 
18.902452840124898 1 0.0%
 
19.167239010306197 1 0.0%
 
21.4672029942752 1 0.0%
 
22.0577289904909 1 0.0%
 

V3
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -8.7691e-15
Minimum -48.326
Maximum 9.3826
Zeros (%) 0.0%

Quantile statistics

Minimum -48.326
5-th percentile -2.3897
Q1 -0.89036
Median 0.17985
Q3 1.0272
95-th percentile 2.0626
Maximum 9.3826
Range 57.708
Interquartile range 1.9176

Descriptive statistics

Standard deviation 1.5163
Coef of variation -172910000000000
Kurtosis 26.62
Mean -8.7691e-15
MAD 1.1337
Skewness -2.2402
Sum -3.9108e-1
Variance 2.299
Memory size 2.2 MiB
Value Count Frequency (%)  
-2.75204095570008 77 0.0%
 
0.488305742562781 77 0.0%
 
-1.68183566862495 62 0.0%
 
-0.681986192919261 60 0.0%
 
-2.95593366483195 53 0.0%
 
-4.50820053235418 48 0.0%
 
-2.16559660467804 45 0.0%
 
-1.6898137072248403 40 0.0%
 
-2.05776277666682 39 0.0%
 
-0.495757487926775 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-48.3255893623954 1 0.0%
 
-33.6809840183525 1 0.0%
 
-32.9653457595238 1 0.0%
 
-32.45419818625469 1 0.0%
 
-31.8135859546007 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.07916781154883 1 0.0%
 
4.10171617761651 1 0.0%
 
4.18781059904763 1 0.0%
 
4.22610848028397 1 0.0%
 
9.38255843282114 1 0.0%
 

V4
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.7823e-15
Minimum -5.6832
Maximum 16.875
Zeros (%) 0.0%

Quantile statistics

Minimum -5.6832
5-th percentile -2.1957
Q1 -0.84864
Median -0.019847
Q3 0.74334
95-th percentile 2.5665
Maximum 16.875
Range 22.559
Interquartile range 1.592

Descriptive statistics

Standard deviation 1.4159
Coef of variation 508880000000000
Kurtosis 2.6355
Mean 2.7823e-15
MAD 1.0603
Skewness 0.67629
Sum 5.9435e-1
Variance 2.0047
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.842316033286871 77 0.0%
 
0.6353219207244001 77 0.0%
 
0.45421196023303295 62 0.0%
 
-1.9046033962221203 60 0.0%
 
-0.5783559788671391 53 0.0%
 
-0.311770683288625 48 0.0%
 
-0.306872623831362 45 0.0%
 
1.31547583332268 40 0.0%
 
-0.346175355279224 39 0.0%
 
-0.557087388354872 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-5.68317119816995 1 0.0%
 
-5.600607141215099 1 0.0%
 
-5.56011758115594 1 0.0%
 
-5.519697123284151 1 0.0%
 
-5.416315392339291 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
13.1436680982574 1 0.0%
 
15.3041839851875 1 0.0%
 
16.4912171736623 1 0.0%
 
16.7155373723131 1 0.0%
 
16.8753440335975 1 0.0%
 

V5
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -1.5526e-15
Minimum -113.74
Maximum 34.802
Zeros (%) 0.0%

Quantile statistics

Minimum -113.74
5-th percentile -1.702
Q1 -0.6916
Median -0.054336
Q3 0.61193
95-th percentile 2.099
Maximum 34.802
Range 148.54
Interquartile range 1.3035

Descriptive statistics

Standard deviation 1.3802
Coef of variation -889010000000000
Kurtosis 206.9
Mean -1.5526e-15
MAD 0.89707
Skewness -2.4259
Sum 2.7353e-1
Variance 1.9051
Memory size 2.2 MiB
Value Count Frequency (%)  
2.46307225982454 77 0.0%
 
-0.5627766807738629 77 0.0%
 
0.298310371498215 62 0.0%
 
1.3266231068468501 60 0.0%
 
2.60935827084169 53 0.0%
 
3.51011694221752 48 0.0%
 
2.6413512514436 45 0.0%
 
1.69843605562986 40 0.0%
 
2.58323382235421 39 0.0%
 
2.70504105264306 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-113.74330671114599 1 0.0%
 
-42.1478983728015 1 0.0%
 
-40.4277263001722 1 0.0%
 
-35.1821203113785 1 0.0%
 
-32.0921290046357 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
29.1621720203733 1 0.0%
 
31.457046054914304 1 0.0%
 
32.9114617007293 1 0.0%
 
34.0993093435765 1 0.0%
 
34.8016658766686 1 0.0%
 

V6
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.0107e-15
Minimum -26.161
Maximum 73.302
Zeros (%) 0.0%

Quantile statistics

Minimum -26.161
5-th percentile -1.4068
Q1 -0.7683
Median -0.27419
Q3 0.39856
95-th percentile 3.1604
Maximum 73.302
Range 99.462
Interquartile range 1.1669

Descriptive statistics

Standard deviation 1.3323
Coef of variation 662600000000000
Kurtosis 42.642
Mean 2.0107e-15
MAD 0.90901
Skewness 1.8266
Sum 4.2439e-1
Variance 1.7749
Memory size 2.2 MiB
Value Count Frequency (%)  
-1.01107261632698 77 0.0%
 
3.17385642307029 77 0.0%
 
-0.953526086363083 62 0.0%
 
3.43631244725031 60 0.0%
 
3.1426415310887905 53 0.0%
 
2.45329922016311 48 0.0%
 
2.80808376427436 45 0.0%
 
0.528806548957574 40 0.0%
 
2.8541019971666097 39 0.0%
 
2.7624395847487797 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-26.1605059358433 1 0.0%
 
-23.496713929871397 1 0.0%
 
-21.9293122885031 1 0.0%
 
-21.2487516200394 1 0.0%
 
-20.8696261884133 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
21.3930687572539 1 0.0%
 
21.550496192579605 1 0.0%
 
22.5292984665587 1 0.0%
 
23.9178371266367 1 0.0%
 
73.3016255459646 1 0.0%
 

V7
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -1.6942e-15
Minimum -43.557
Maximum 120.59
Zeros (%) 0.0%

Quantile statistics

Minimum -43.557
5-th percentile -1.4344
Q1 -0.55408
Median 0.040103
Q3 0.57044
95-th percentile 1.4076
Maximum 120.59
Range 164.15
Interquartile range 1.1245

Descriptive statistics

Standard deviation 1.2371
Coef of variation -730170000000000
Kurtosis 405.61
Mean -1.6942e-15
MAD 0.73785
Skewness 2.5539
Sum -1.5825e-1
Variance 1.5304
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.43212592398782396 77 0.0%
 
0.0149526614685896 77 0.0%
 
0.152002545314135 62 0.0%
 
-1.14512682747431 60 0.0%
 
-0.41688284124123 53 0.0%
 
0.220468581007954 48 0.0%
 
-0.171626636099457 45 0.0%
 
0.33171450239883 40 0.0%
 
-0.18754733727697498 39 0.0%
 
-0.5349938273164451 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-43.5572415712451 1 0.0%
 
-41.5067960832574 1 0.0%
 
-37.0603114554112 1 0.0%
 
-33.2393281671892 1 0.0%
 
-31.76494649021 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
34.3031768568354 1 0.0%
 
36.6772679454031 1 0.0%
 
36.877368268259794 1 0.0%
 
44.054461363163796 1 0.0%
 
120.589493945238 1 0.0%
 

V8
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -1.927e-16
Minimum -73.217
Maximum 20.007
Zeros (%) 0.0%

Quantile statistics

Minimum -73.217
5-th percentile -0.84215
Q1 -0.20863
Median 0.022358
Q3 0.32735
95-th percentile 1.05
Maximum 20.007
Range 93.224
Interquartile range 0.53598

Descriptive statistics

Standard deviation 1.1944
Coef of variation -6197900000000000
Kurtosis 220.59
Mean -1.927e-16
MAD 0.50574
Skewness -8.5219
Sum 3.3538e-11
Variance 1.4265
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.16021086330181197 77 0.0%
 
0.7277062007278241 77 0.0%
 
-0.207071379659966 62 0.0%
 
0.9591472620923409 60 0.0%
 
0.7843929483197328 53 0.0%
 
0.543376800596399 48 0.0%
 
0.683351733616692 45 0.0%
 
0.364538761567697 40 0.0%
 
0.6851537704418591 39 0.0%
 
0.8082500983641501 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-73.21671845526741 1 0.0%
 
-50.94336886770229 1 0.0%
 
-50.688419356750295 1 0.0%
 
-50.420090064434206 1 0.0%
 
-41.484822506637705 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
18.709254543323397 1 0.0%
 
18.7488719520883 1 0.0%
 
19.168327389730102 1 0.0%
 
19.5877726234404 1 0.0%
 
20.0072083651213 1 0.0%
 

V9
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -3.137e-15
Minimum -13.434
Maximum 15.595
Zeros (%) 0.0%

Quantile statistics

Minimum -13.434
5-th percentile -1.7584
Q1 -0.6431
Median -0.051429
Q3 0.59714
95-th percentile 1.7808
Maximum 15.595
Range 29.029
Interquartile range 1.2402

Descriptive statistics

Standard deviation 1.0986
Coef of variation -350210000000000
Kurtosis 3.7313
Mean -3.137e-15
MAD 0.81439
Skewness 0.55468
Sum -6.8538e-1
Variance 1.207
Memory size 2.2 MiB
Value Count Frequency (%)  
0.17036185217373 77 0.0%
 
0.608605870267216 77 0.0%
 
0.587335266422761 62 0.0%
 
1.67130156362918 60 0.0%
 
0.359902378888007 53 0.0%
 
-0.10043390489717 48 0.0%
 
-0.29796200128389 45 0.0%
 
-0.7117979387642629 40 0.0%
 
-0.28661406862562394 39 0.0%
 
0.6977195955056469 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-13.4340663182301 1 0.0%
 
-13.3201546920984 1 0.0%
 
-11.1266235224579 1 0.0%
 
-10.8425258685569 1 0.0%
 
-9.48145633401495 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
10.3261330490616 1 0.0%
 
10.348406697766801 1 0.0%
 
10.370657984046 1 0.0%
 
10.392888824678499 1 0.0%
 
15.5949946071278 1 0.0%
 

V10
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.7686e-15
Minimum -24.588
Maximum 23.745
Zeros (%) 0.0%

Quantile statistics

Minimum -24.588
5-th percentile -1.3386
Q1 -0.53543
Median -0.092917
Q3 0.45392
95-th percentile 1.5486
Maximum 23.745
Range 48.333
Interquartile range 0.98935

Descriptive statistics

Standard deviation 1.0888
Coef of variation 615650000000000
Kurtosis 31.988
Mean 1.7686e-15
MAD 0.69512
Skewness 1.1871
Sum 6.379e-1
Variance 1.1856
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.0445745893804268 77 0.0%
 
-0.0751861699398929 77 0.0%
 
-0.362047348389396 62 0.0%
 
-1.02294602983554 60 0.0%
 
-0.351075101407957 53 0.0%
 
-1.01862219976658 48 0.0%
 
-0.652096600406493 45 0.0%
 
-1.57028828006989 40 0.0%
 
-0.5359027354525039 39 0.0%
 
-1.09018090617913 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-24.5882624372475 1 0.0%
 
-24.403184969972802 1 0.0%
 
-23.2282548357516 1 0.0%
 
-22.1870885620007 4 0.0%
 
-20.949191554361104 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
13.8117577662908 1 0.0%
 
15.236028204007098 1 0.0%
 
15.2456856915255 1 0.0%
 
15.3317415557881 1 0.0%
 
23.7451361206545 1 0.0%
 

V11
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 9.1703e-16
Minimum -4.7975
Maximum 12.019
Zeros (%) 0.0%

Quantile statistics

Minimum -4.7975
5-th percentile -1.5719
Q1 -0.76249
Median -0.032757
Q3 0.73959
95-th percentile 1.614
Maximum 12.019
Range 16.816
Interquartile range 1.5021

Descriptive statistics

Standard deviation 1.0207
Coef of variation 1113100000000000
Kurtosis 1.6339
Mean 9.1703e-16
MAD 0.83126
Skewness 0.35651
Sum 4.7658e-1
Variance 1.0419
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.35674901847752005 77 0.0%
 
0.0635044576008839 77 0.0%
 
-0.589598040395407 62 0.0%
 
-0.19142297265161498 60 0.0%
 
0.329650883701029 53 0.0%
 
0.8070381066842709 48 0.0%
 
0.418002664896219 45 0.0%
 
3.46301782070354 40 0.0%
 
0.332848417624034 39 0.0%
 
-0.0286089299546822 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-4.79747346479757 1 0.0%
 
-4.682930547652759 1 0.0%
 
-4.568390246460219 1 0.0%
 
-4.45385284150054 1 0.0%
 
-4.3393186545773705 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
11.228470279576001 1 0.0%
 
11.277920727806698 1 0.0%
 
11.6197234753825 1 0.0%
 
11.6692047358121 1 0.0%
 
12.018913181619899 1 0.0%
 

V12
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -1.8107e-15
Minimum -18.684
Maximum 7.8484
Zeros (%) 0.0%

Quantile statistics

Minimum -18.684
5-th percentile -1.9672
Q1 -0.40557
Median 0.14003
Q3 0.61824
95-th percentile 1.2431
Maximum 7.8484
Range 26.532
Interquartile range 1.0238

Descriptive statistics

Standard deviation 0.9992
Coef of variation -551840000000000
Kurtosis 20.242
Mean -1.8107e-15
MAD 0.70536
Skewness -2.2784
Sum -3.5743e-1
Variance 0.9984
Memory size 2.2 MiB
Value Count Frequency (%)  
0.350563573253678 77 0.0%
 
-0.0734595173503765 77 0.0%
 
-0.17471205308176502 62 0.0%
 
0.6310273414871078 60 0.0%
 
0.18350812062465602 53 0.0%
 
-0.330547627789277 48 0.0%
 
-0.32243692372967503 45 0.0%
 
0.5384113631159171 40 0.0%
 
-0.26831873850147697 39 0.0%
 
0.0736565150203547 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-18.683714633344298 1 0.0%
 
-18.553697009645802 1 0.0%
 
-18.4311310279993 1 0.0%
 
-18.047596570821604 1 0.0%
 
-17.7691434633638 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.4063382205176 1 0.0%
 
4.4729205841361 1 0.0%
 
4.57408224145334 1 0.0%
 
4.84645240859009 1 0.0%
 
7.8483920756445995 1 0.0%
 

V13
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.6934e-15
Minimum -5.7919
Maximum 7.1269
Zeros (%) 0.0%

Quantile statistics

Minimum -5.7919
5-th percentile -1.6397
Q1 -0.64854
Median -0.013568
Q3 0.6625
95-th percentile 1.6079
Maximum 7.1269
Range 12.919
Interquartile range 1.311

Descriptive statistics

Standard deviation 0.99527
Coef of variation 587720000000000
Kurtosis 0.1953
Mean 1.6934e-15
MAD 0.7846
Skewness 0.065233
Sum 2.3286e-1
Variance 0.99057
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.141238322200309 77 0.0%
 
-0.517759694198053 77 0.0%
 
-0.6211270614210049 62 0.0%
 
0.0319072703534055 60 0.0%
 
-0.27291854500254503 53 0.0%
 
-0.5314186516713479 48 0.0%
 
-0.143469154599387 45 0.0%
 
-0.37809538452842295 40 0.0%
 
-0.12761419581231198 39 0.0%
 
-0.23845703149556197 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-5.7918812063208405 1 0.0%
 
-4.00863979207158 1 0.0%
 
-3.9617575357502504 1 0.0%
 
-3.8886062856691 1 0.0%
 
-3.8811062494802897 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.36999837897829 1 0.0%
 
4.465413177090861 1 0.0%
 
4.46956619153499 1 0.0%
 
4.56900895856606 1 0.0%
 
7.126882958593759 1 0.0%
 

V14
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.479e-15
Minimum -19.214
Maximum 10.527
Zeros (%) 0.0%

Quantile statistics

Minimum -19.214
5-th percentile -1.4394
Q1 -0.42557
Median 0.050601
Q3 0.49315
95-th percentile 1.3937
Maximum 10.527
Range 29.741
Interquartile range 0.91872

Descriptive statistics

Standard deviation 0.9586
Coef of variation 648120000000000
Kurtosis 23.879
Mean 1.479e-15
MAD 0.64865
Skewness -1.9952
Sum 3.4356e-1
Variance 0.91891
Memory size 2.2 MiB
Value Count Frequency (%)  
0.40696893438373105 77 0.0%
 
0.690971618395625 77 0.0%
 
-0.7035127839833039 62 0.0%
 
-0.0314253812628428 60 0.0%
 
-0.597436665174528 53 0.0%
 
-2.1814488246367403 48 0.0%
 
-1.1545242958661899 45 0.0%
 
-3.0454951796322502 40 0.0%
 
-0.868299960850499 39 0.0%
 
0.215738138536011 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-19.2143254902614 1 0.0%
 
-18.8220867423816 1 0.0%
 
-18.4937733551053 1 0.0%
 
-18.392091495673 1 0.0%
 
-18.049997689859396 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
7.518402781245941 1 0.0%
 
7.667725750558191 1 0.0%
 
7.692208543567821 1 0.0%
 
7.754598748054839 1 0.0%
 
10.5267660517847 1 0.0%
 

V15
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.4823e-15
Minimum -4.4989
Maximum 8.8777
Zeros (%) 0.0%

Quantile statistics

Minimum -4.4989
5-th percentile -1.5932
Q1 -0.58288
Median 0.048072
Q3 0.64882
95-th percentile 1.3731
Maximum 8.8777
Range 13.377
Interquartile range 1.2317

Descriptive statistics

Standard deviation 0.91532
Coef of variation 262850000000000
Kurtosis 0.28477
Mean 3.4823e-15
MAD 0.72734
Skewness -0.30842
Sum 1.3993e-09
Variance 0.8378
Memory size 2.2 MiB
Value Count Frequency (%)  
1.2752570390934999 77 0.0%
 
1.1241469228868501 77 0.0%
 
0.271956610213985 62 0.0%
 
1.44662697638966 60 0.0%
 
0.5838968102925071 53 0.0%
 
0.38872408312047796 48 0.0%
 
1.157633713505 45 0.0%
 
1.46891114338139 40 0.0%
 
1.1285389817093798 39 0.0%
 
1.2452765300023998 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-4.49894467676621 1 0.0%
 
-4.39130706780494 1 0.0%
 
-4.19932124976578 1 0.0%
 
-4.19661969463528 1 0.0%
 
-4.15253175950472 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
5.685899051594321 1 0.0%
 
5.720478632456981 1 0.0%
 
5.7845138896294594 1 0.0%
 
5.82565431863365 1 0.0%
 
8.87774159774277 1 0.0%
 

V16
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.392e-15
Minimum -14.13
Maximum 17.315
Zeros (%) 0.0%

Quantile statistics

Minimum -14.13
5-th percentile -1.4917
Q1 -0.46804
Median 0.066413
Q3 0.5233
95-th percentile 1.3253
Maximum 17.315
Range 31.445
Interquartile range 0.99133

Descriptive statistics

Standard deviation 0.87625
Coef of variation 629490000000000
Kurtosis 10.419
Mean 1.392e-15
MAD 0.64782
Skewness -1.101
Sum 4.0946e-1
Variance 0.76782
Memory size 2.2 MiB
Value Count Frequency (%)  
0.34246975411076896 77 0.0%
 
-0.37196212502841897 77 0.0%
 
0.318688063430157 62 0.0%
 
-0.12182037858308699 60 0.0%
 
0.17867583647653199 53 0.0%
 
0.23207137768386 48 0.0%
 
0.878174917750572 45 0.0%
 
-0.0297415143257285 40 0.0%
 
0.7865060536879019 39 0.0%
 
-0.255230524748655 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-14.1298545174931 1 0.0%
 
-13.5632729563133 1 0.0%
 
-13.30388757707 1 0.0%
 
-13.2568330912778 1 0.0%
 
-13.2515419788937 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
6.35185349844491 1 0.0%
 
6.44279790144451 1 0.0%
 
7.05913181057395 1 0.0%
 
8.289889559546191 1 0.0%
 
17.315111517627802 1 0.0%
 

V17
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -7.5285e-16
Minimum -25.163
Maximum 9.2535
Zeros (%) 0.0%

Quantile statistics

Minimum -25.163
5-th percentile -0.983
Q1 -0.48375
Median -0.065676
Q3 0.39967
95-th percentile 1.2746
Maximum 9.2535
Range 34.416
Interquartile range 0.88342

Descriptive statistics

Standard deviation 0.84934
Coef of variation -1128200000000000
Kurtosis 94.8
Mean -7.5285e-16
MAD 0.56387
Skewness -3.8449
Sum -1.0823e-1
Variance 0.72137
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.6019568028284449 77 0.0%
 
-0.37465644005137605 77 0.0%
 
0.549365128729473 62 0.0%
 
-0.651405237009102 60 0.0%
 
0.47389827829767206 53 0.0%
 
2.12502188299054 48 0.0%
 
0.536917519702814 45 0.0%
 
3.6645884808692504 40 0.0%
 
0.31643526103505604 39 0.0%
 
-1.07208498526811 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-25.162799369324798 1 0.0%
 
-24.019098547590197 1 0.0%
 
-23.8156358284126 1 0.0%
 
-23.2415971479491 1 0.0%
 
-22.8839985767803 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
7.766636362866991 1 0.0%
 
7.89339253241379 1 0.0%
 
8.538195138626161 1 0.0%
 
9.20705853529557 1 0.0%
 
9.25352625047285 1 0.0%
 

V18
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.3288e-16
Minimum -9.4987
Maximum 5.0411
Zeros (%) 0.0%

Quantile statistics

Minimum -9.4987
5-th percentile -1.3581
Q1 -0.49885
Median -0.0036363
Q3 0.50081
95-th percentile 1.3944
Maximum 5.0411
Range 14.54
Interquartile range 0.99966

Descriptive statistics

Standard deviation 0.83818
Coef of variation 1936300000000000
Kurtosis 2.5783
Mean 4.3288e-16
MAD 0.63582
Skewness -0.25988
Sum 2.7262e-1
Variance 0.70254
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.43899243243668296 77 0.0%
 
-0.0526401462570187 77 0.0%
 
-0.25778585794493303 62 0.0%
 
0.6179704765287819 60 0.0%
 
-0.49884979866504103 53 0.0%
 
0.40554867355562896 48 0.0%
 
0.712873012618197 45 0.0%
 
-0.105189588790714 40 0.0%
 
0.587856253020328 39 0.0%
 
-0.0686980996025901 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-9.498745921046769 1 0.0%
 
-9.33519307905321 1 0.0%
 
-9.287832213974019 1 0.0%
 
-9.264608732956551 1 0.0%
 
-9.17055721888169 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.19959110679305 1 0.0%
 
4.24384121345385 1 0.0%
 
4.2956482344645 1 0.0%
 
4.71239756635225 1 0.0%
 
5.04106918541184 1 0.0%
 

V19
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 9.0497e-16
Minimum -7.2135
Maximum 5.592
Zeros (%) 0.0%

Quantile statistics

Minimum -7.2135
5-th percentile -1.3563
Q1 -0.4563
Median 0.0037348
Q3 0.45895
95-th percentile 1.2862
Maximum 5.592
Range 12.805
Interquartile range 0.91525

Descriptive statistics

Standard deviation 0.81404
Coef of variation 899520000000000
Kurtosis 1.725
Mean 9.0497e-16
MAD 0.60579
Skewness 0.10919
Sum 2.9615e-1
Variance 0.66266
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.116090785002835 77 0.0%
 
-0.33059044844294394 77 0.0%
 
0.0162561279842771 62 0.0%
 
0.927600044556072 60 0.0%
 
-0.14009868476221 53 0.0%
 
-0.440929511947803 48 0.0%
 
0.00677355522536129 45 0.0%
 
-2.0979443214639 40 0.0%
 
0.0493500831769145 39 0.0%
 
0.255267674459398 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-7.21352743017759 1 0.0%
 
-6.93829731768481 1 0.0%
 
-4.93273305547833 1 0.0%
 
-4.676092279153361 1 0.0%
 
-4.619034341772441 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.8910624409520995 1 0.0%
 
5.2283417900513 1 0.0%
 
5.5017472139665 1 0.0%
 
5.572113326879691 1 0.0%
 
5.59197142733558 1 0.0%
 

V20
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 5.0855e-16
Minimum -54.498
Maximum 39.421
Zeros (%) 0.0%

Quantile statistics

Minimum -54.498
5-th percentile -0.55843
Q1 -0.21172
Median -0.062481
Q3 0.13304
95-th percentile 0.83614
Maximum 39.421
Range 93.919
Interquartile range 0.34476

Descriptive statistics

Standard deviation 0.77093
Coef of variation 1515900000000000
Kurtosis 271.02
Mean 5.0855e-16
MAD 0.34191
Skewness -2.0372
Sum 1.8247e-1
Variance 0.59433
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.18037011855969298 77 0.0%
 
-0.132079724302295 77 0.0%
 
-0.187420788431655 62 0.0%
 
0.0057566554189328704 60 0.0%
 
-0.12071403428047302 53 0.0%
 
-0.0869893297425326 48 0.0%
 
0.0536071193018422 45 0.0%
 
-0.167555416292594 40 0.0%
 
0.0452174411898587 39 0.0%
 
0.0169521541786674 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-54.497720494566 1 0.0%
 
-28.009635333749 1 0.0%
 
-25.222345240529698 1 0.0%
 
-23.646890332167303 1 0.0%
 
-23.4201725720228 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
23.649094568125502 1 0.0%
 
24.1338941917421 1 0.0%
 
26.237390789565897 1 0.0%
 
38.1172091261285 1 0.0%
 
39.4209042482199 1 0.0%
 

V21
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.5373e-16
Minimum -34.83
Maximum 27.203
Zeros (%) 0.0%

Quantile statistics

Minimum -34.83
5-th percentile -0.50467
Q1 -0.22839
Median -0.02945
Q3 0.18638
95-th percentile 0.53787
Maximum 27.203
Range 62.033
Interquartile range 0.41477

Descriptive statistics

Standard deviation 0.73452
Coef of variation 4778000000000000
Kurtosis 207.29
Mean 1.5373e-16
MAD 0.31907
Skewness 3.593
Sum 4.718e-11
Variance 0.53953
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.26258084604117604 77 0.0%
 
0.26976495136135703 77 0.0%
 
-0.36115803659984497 62 0.0%
 
-0.0642082814806287 60 0.0%
 
-0.35233380052375 53 0.0%
 
-0.0672166613423604 48 0.0%
 
-0.20743240447289701 45 0.0%
 
-0.0402375927503545 40 0.0%
 
-0.191819982814025 39 0.0%
 
0.0073428026657956095 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-34.8303821448146 1 0.0%
 
-22.889347040939 1 0.0%
 
-22.797603905551895 1 0.0%
 
-22.7575398590576 1 0.0%
 
-22.665684604861497 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
22.5806752741477 1 0.0%
 
22.5889894712903 1 0.0%
 
22.5995433627945 1 0.0%
 
22.614889367616897 1 0.0%
 
27.2028391573154 6 0.0%
 

V22
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 7.9599e-16
Minimum -10.933
Maximum 10.503
Zeros (%) 0.0%

Quantile statistics

Minimum -10.933
5-th percentile -1.0819
Q1 -0.54235
Median 0.0067819
Q3 0.52855
95-th percentile 1.129
Maximum 10.503
Range 21.436
Interquartile range 1.0709

Descriptive statistics

Standard deviation 0.7257
Coef of variation 911700000000000
Kurtosis 2.833
Mean 7.9599e-16
MAD 0.58421
Skewness -0.21326
Sum -9.8112e-11
Variance 0.52664
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.8162637631578471 77 0.0%
 
0.8446266467757121 77 0.0%
 
-0.984261949244254 62 0.0%
 
-0.0805870774450856 60 0.0%
 
-0.9969367748280931 53 0.0%
 
-0.0726415994946915 48 0.0%
 
-0.6924166841818179 45 0.0%
 
0.0961715739635631 40 0.0%
 
-0.650117795537897 39 0.0%
 
0.250885695089417 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-10.933143697655 1 0.0%
 
-9.49942296430251 1 0.0%
 
-8.88701714094871 6 0.0%
 
-8.59364156538624 1 0.0%
 
-8.555807930456341 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
7.357255161770509 1 0.0%
 
8.27223298396612 1 0.0%
 
8.316275438913571 1 0.0%
 
8.361985191684349 1 0.0%
 
10.5030900899454 1 0.0%
 

V23
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 5.3676e-16
Minimum -44.808
Maximum 22.528
Zeros (%) 0.0%

Quantile statistics

Minimum -44.808
5-th percentile -0.47225
Q1 -0.16185
Median -0.011193
Q3 0.14764
95-th percentile 0.48802
Maximum 22.528
Range 67.336
Interquartile range 0.30949

Descriptive statistics

Standard deviation 0.62446
Coef of variation 1163400000000000
Kurtosis 440.09
Mean 5.3676e-16
MAD 0.26194
Skewness -5.8751
Sum 7.3442e-11
Variance 0.38995
Memory size 2.2 MiB
Value Count Frequency (%)  
0.14030430201432598 77 0.0%
 
0.0206746676928111 77 0.0%
 
0.354198094344309 62 0.0%
 
-0.0729910792746877 60 0.0%
 
0.36348490597863004 53 0.0%
 
-0.0365842826760227 48 0.0%
 
-0.11859751164434901 45 0.0%
 
-0.0925489695836041 40 0.0%
 
-0.11406946378171699 39 0.0%
 
0.10379660601100901 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-44.807735203791296 1 0.0%
 
-36.666000066027 1 0.0%
 
-32.828994997462004 1 0.0%
 
-30.269720014317002 1 0.0%
 
-27.533643285000302 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
19.002941823292698 1 0.0%
 
19.228169082574198 1 0.0%
 
20.8033440994696 1 0.0%
 
22.0835448685737 1 0.0%
 
22.5284116897749 1 0.0%
 

V24
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.4581e-15
Minimum -2.8366
Maximum 4.5845
Zeros (%) 0.0%

Quantile statistics

Minimum -2.8366
5-th percentile -1.1437
Q1 -0.35459
Median 0.040976
Q3 0.43953
95-th percentile 0.86636
Maximum 4.5845
Range 7.4212
Interquartile range 0.79411

Descriptive statistics

Standard deviation 0.60565
Coef of variation 135850000000000
Kurtosis 0.61887
Mean 4.4581e-15
MAD 0.46844
Skewness -0.5525
Sum 1.2736e-09
Variance 0.36681
Memory size 2.2 MiB
Value Count Frequency (%)  
0.726211883811499 77 0.0%
 
0.3578272492998061 77 0.0%
 
0.6207093385008 62 0.0%
 
1.01813597043583 60 0.0%
 
0.6048265697520401 53 0.0%
 
0.529692767553245 48 0.0%
 
0.8914796678605641 45 0.0%
 
-1.34566370602836 40 0.0%
 
0.9159355995388021 39 0.0%
 
1.00995227965779 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-2.83662691870341 1 0.0%
 
-2.82484890293617 1 0.0%
 
-2.82268359235889 1 0.0%
 
-2.82238396858124 1 0.0%
 
-2.81489763570598 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3.9982936780756897 1 0.0%
 
4.014444384730609 1 0.0%
 
4.01634181669268 1 0.0%
 
4.02286589044732 1 0.0%
 
4.58454913689817 1 0.0%
 

V25
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.453e-15
Minimum -10.295
Maximum 7.5196
Zeros (%) 0.0%

Quantile statistics

Minimum -10.295
5-th percentile -0.82503
Q1 -0.31715
Median 0.016594
Q3 0.35072
95-th percentile 0.7607
Maximum 7.5196
Range 17.815
Interquartile range 0.66786

Descriptive statistics

Standard deviation 0.52128
Coef of variation 358760000000000
Kurtosis 4.2904
Mean 1.453e-15
MAD 0.40326
Skewness -0.41579
Sum 1.5211e-1
Variance 0.27173
Memory size 2.2 MiB
Value Count Frequency (%)  
0.18642294572338897 77 0.0%
 
0.36662430700491294 77 0.0%
 
-0.29713787612847997 62 0.0%
 
0.6635747724804879 60 0.0%
 
-0.26455958625151 53 0.0%
 
0.41468514197751 48 0.0%
 
0.7302403643301479 45 0.0%
 
0.510304957067478 40 0.0%
 
0.7300730401400359 39 0.0%
 
0.369398160051982 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-10.2953970749851 1 0.0%
 
-8.69662677026752 1 0.0%
 
-7.495741104057091 1 0.0%
 
-7.0813253463773895 1 0.0%
 
-7.02578318190186 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
5.54159759459217 1 0.0%
 
5.8261590349735 1 0.0%
 
5.8524835709145595 1 0.0%
 
6.07085038407798 1 0.0%
 
7.51958867870916 1 0.0%
 

V26
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.6991e-15
Minimum -2.6046
Maximum 3.5173
Zeros (%) 0.0%

Quantile statistics

Minimum -2.6046
5-th percentile -0.69735
Q1 -0.32698
Median -0.052139
Q3 0.24095
95-th percentile 0.92092
Maximum 3.5173
Range 6.1219
Interquartile range 0.56794

Descriptive statistics

Standard deviation 0.48223
Coef of variation 283810000000000
Kurtosis 0.91901
Mean 1.6991e-15
MAD 0.37663
Skewness 0.57669
Sum 4.805e-1
Variance 0.23254
Memory size 2.2 MiB
Value Count Frequency (%)  
-0.39882751495946295 77 0.0%
 
0.0965444707905616 77 0.0%
 
0.16673563433513197 62 0.0%
 
-0.671322844293718 60 0.0%
 
0.21967064744260398 53 0.0%
 
0.73586965178554 48 0.0%
 
0.384013445762976 45 0.0%
 
-0.18267352250172897 40 0.0%
 
0.38387941757557303 39 0.0%
 
0.110373548574965 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-2.60455055280817 1 0.0%
 
-2.53432972105675 1 0.0%
 
-2.2416202900029503 1 0.0%
 
-2.06856086855144 1 0.0%
 
-1.8553553377608 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3.15532747327538 1 0.0%
 
3.22017837466898 1 0.0%
 
3.41563624349633 1 0.0%
 
3.4632456536447997 1 0.0%
 
3.5173456116237998 1 0.0%
 

V27
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -3.6602e-16
Minimum -22.566
Maximum 31.612
Zeros (%) 0.0%

Quantile statistics

Minimum -22.566
5-th percentile -0.41525
Q1 -0.07084
Median 0.0013421
Q3 0.091045
95-th percentile 0.38775
Maximum 31.612
Range 54.178
Interquartile range 0.16188

Descriptive statistics

Standard deviation 0.40363
Coef of variation -1102800000000000
Kurtosis 244.99
Mean -3.6602e-16
MAD 0.18147
Skewness -1.1702
Sum -1.0442e-1
Variance 0.16292
Memory size 2.2 MiB
Value Count Frequency (%)  
0.0277351215052822 77 0.0%
 
-0.035866315294695 77 0.0%
 
-0.0682990099344722 62 0.0%
 
0.0968009452278396 60 0.0%
 
-0.0392094515896982 53 0.0%
 
-0.0582327676516994 48 0.0%
 
-0.0284652724675567 45 0.0%
 
0.107058302469808 40 0.0%
 
-0.0319023040878611 39 0.0%
 
-0.0283020664307734 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-22.5656793207827 1 0.0%
 
-9.89524404755692 1 0.0%
 
-9.845807692778981 1 0.0%
 
-9.793567905137511 1 0.0%
 
-9.544855375391482 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
10.135597346295699 1 0.0%
 
10.507884353083499 1 0.0%
 
11.135739844574198 1 0.0%
 
12.1524011068287 1 0.0%
 
31.612198106136304 1 0.0%
 

V28
Numeric

Distinct count 275663
Unique (%) 96.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean -1.206e-16
Minimum -15.43
Maximum 33.848
Zeros (%) 0.0%

Quantile statistics

Minimum -15.43
5-th percentile -0.31784
Q1 -0.05296
Median 0.011244
Q3 0.07828
95-th percentile 0.25609
Maximum 33.848
Range 49.278
Interquartile range 0.13124

Descriptive statistics

Standard deviation 0.33008
Coef of variation -2736900000000000
Kurtosis 933.4
Mean -1.206e-16
MAD 0.12933
Skewness 11.192
Sum -3.4758e-11
Variance 0.10895
Memory size 2.2 MiB
Value Count Frequency (%)  
0.0184945729704665 77 0.0%
 
-0.0602821510762213 77 0.0%
 
-0.0295847028396107 62 0.0%
 
0.0286968782920849 60 0.0%
 
-0.0427869554036275 53 0.0%
 
-0.0266576195050194 48 0.0%
 
0.0361233094119335 45 0.0%
 
0.0718180029478637 40 0.0%
 
0.0298493414012052 39 0.0%
 
-0.0203586378568534 36 0.0%
 
Other values (275653) 284270 99.8%
 

Minimum 5 values

Value Count Frequency (%)  
-15.430083905534898 1 0.0%
 
-11.710895639451499 1 0.0%
 
-9.617915452382391 1 0.0%
 
-8.65656990038166 1 0.0%
 
-8.47868564330279 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
15.870474054688598 1 0.0%
 
15.942150981273501 1 0.0%
 
16.1296091387323 1 0.0%
 
22.620072218580297 1 0.0%
 
33.8478078188831 1 0.0%
 

Amount
Numeric

Distinct count 32767
Unique (%) 11.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 88.35
Minimum 0
Maximum 25691
Zeros (%) 0.6%

Quantile statistics

Minimum 0
5-th percentile 0.92
Q1 5.6
Median 22
Q3 77.165
95-th percentile 365
Maximum 25691
Range 25691
Interquartile range 71.565

Descriptive statistics

Standard deviation 250.12
Coef of variation 2.831
Kurtosis 845.09
Mean 88.35
MAD 103.53
Skewness 16.978
Sum 25163000
Variance 62560
Memory size 2.2 MiB
Value Count Frequency (%)  
1.0 13688 4.8%
 
1.98 6044 2.1%
 
0.89 4872 1.7%
 
9.99 4747 1.7%
 
15.0 3280 1.2%
 
0.76 2998 1.1%
 
10.0 2950 1.0%
 
1.29 2892 1.0%
 
1.79 2623 0.9%
 
0.99 2304 0.8%
 
Other values (32757) 238409 83.7%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 1825 0.6%
 
0.01 718 0.3%
 
0.02 85 0.0%
 
0.03 3 0.0%
 
0.04 11 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
11898.09 1 0.0%
 
12910.93 1 0.0%
 
18910.0 1 0.0%
 
19656.53 1 0.0%
 
25691.16 1 0.0%
 

Class
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.0017275
0
284315
1
 
492
Value Count Frequency (%)  
0 284315 99.8%
 
1 492 0.2%
 

Correlations

Sample

Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0

The function below calculates the various evaluation metrics, including the area under the ROC curve, the area under the precision-recall curve, the F1 score, etc.

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support
from numpy import trapz
from scipy.integrate import simps
from sklearn.metrics import f1_score

def Evaluate(labels, predictions, p=0.5):
    """Print confusion-matrix counts and summary metrics for a binary classifier.

    Parameters
    ----------
    labels : array-like
        Ground-truth binary labels. Row/column 1 of the confusion matrix is
        reported as the fraudulent class, row/column 0 as the legitimate class.
    predictions : array-like supporting ``>`` comparison (e.g. numpy array)
        Continuous scores for the positive class.
    p : float, default 0.5
        Decision threshold; a score strictly greater than ``p`` is counted as
        a positive (fraud) prediction.

    Returns
    -------
    None
        All results are printed.
    """
    # Binarize once so that every thresholded metric uses the same cut-off p
    # (precision and recall previously ignored p and always used 0.5).
    predicted_labels = predictions > p

    CM = confusion_matrix(labels, predicted_labels)
    TN = CM[0][0]
    FN = CM[1][0]
    TP = CM[1][1]
    FP = CM[0][1]
    print('Legitimate Transactions Detected (True Negatives): {}'.format(TN))
    print('Fraudulent Transactions Missed (False Negatives):  {}'.format(FN))
    print('Fraudulent Transactions Detected (True Positives): {}'.format(TP))
    print('Legitimate Transactions Incorrectly Detected (False Positives):{}'.format(FP))
    print('Total Fraudulent Transactions: ', np.sum(CM[1]))

    # Ranking metric: computed from the raw scores, not the thresholded labels.
    # Named roc_auc so it does not shadow sklearn.metrics.auc.
    roc_auc = roc_auc_score(labels, predictions)
    prec = precision_score(labels, predicted_labels)
    rec = recall_score(labels, predicted_labels)
    f1 = f1_score(labels, predicted_labels)
    print('auc :{}'.format(roc_auc))
    print('precision :{}'.format(prec))
    print('recall :{}'.format(rec))
    print('f1 :{}'.format(f1))

    # The precision-recall curve must be traced over the continuous scores;
    # passing thresholded booleans collapses it to three points.
    precision, recall, thresholds = precision_recall_curve(labels, predictions)
    # Trapezoidal rule with recall on the x-axis and precision on the y-axis.
    # precision_recall_curve returns recall in decreasing order, so both
    # arrays are reversed to integrate over increasing recall.
    area = trapz(precision[::-1], recall[::-1])
    print("Area Under Precision Recall  Curve(AP): %0.4f" % area)   #should be same as AP?    
from sklearn.metrics import auc

We will investigate the performance of several ML algorithms on classification of data with an imbalanced target class. The XGBoost algorithm will be used to model the data with no resampling at all, with random undersampling, with the Synthetic Minority Oversampling Technique (SMOTE), and with class weights so that it performs cost-sensitive learning. The other ML algorithms which will be tested include a forest of randomized trees, RUSBoost, EasyEnsemble and a bagging classifier. The cost-sensitive XGBoost method will involve experimentally determining the optimal weight on the minority target class using Bayesian optimization.

XGBoost No Weights

The first model considered here is an xgboost with default hyperparameter values with no sampling of data.

# Baseline: fit the classifier on the training split without any resampling.
# NOTE(review): if clf follows the scikit-learn convention, fit() returns clf itself,
# so xgb_no_weights would be another name for clf rather than an independent copy — confirm.
xgb_no_weights = clf.fit(X=train_x,  y=train_y)
#xgb_no_weights_pred = xgb_no_weights.predict_proba(test_x)
# Keep only column 1 of predict_proba (presumably the fraud-class probability — confirm class order).
xgb_no_weights_pred = xgb_no_weights.predict_proba(test_x)[:,1]
# Persist the fitted model so it can be reloaded without refitting.
dump(xgb_no_weights, '/content/drive/My Drive/ImbalancedData/xgb_no_weights.joblib')
# Print the evaluation metrics on the test split at a 0.5 decision threshold.
Evaluate(labels=test_y, predictions=xgb_no_weights_pred, p=0.5)
Legitimate Transactions Detected (True Negatives): 56854
Fraudulent Transactions Missed (False Negatives):  26
Fraudulent Transactions Detected (True Positives): 79
Legitimate Transactions Incorrectly Detected (False Positives):3
Total Fraudulent Transactions:  105
auc :0.9777749860343032
precision :0.9634146341463414
recall :0.7523809523809524
f1 :0.8449197860962566
Area Under Precision Recall  Curve(AP): 0.8563
# Reload the saved baseline model and recompute its column-1 test-set probabilities.
xgb_no_weights  =  load( '/content/drive/My Drive/ImbalancedData/xgb_no_weights.joblib')
xgb_no_weights_pred = xgb_no_weights.predict_proba(test_x)[:,1]
# Evaluation is disabled here; the same call is made right after the model is first fitted.
#Evaluate(labels=test_y, predictions=xgb_no_weights_pred, p=0.5)

Convert these vectors from Python to R vectors. This allows R model-evaluation libraries such as MLmetrics to be used to find the area under the precision-recall curve.

 #%R -i  test_y 
#%%R -i xgb_no_weights_pred

#library( MLmetrics)
#library(yardstick)
#library(mltools)
#library("glue")
#MLmetrics::AUC(xgb_no_weights_pred,test_y)
#d= data.frame(pred=xgb_no_weights_pred[,2],truth=as.factor(test_y))
#glue("Test Set : Area Under Precision-Recall Curve: {yardstick::pr_auc(d, truth, pred)}")
#glue("Test Set : Area Under Precision-Recall Curve: {MLmetrics::PRAUC(as.vector(xgb_no_weights_pred),test_y)}")
#head(d)
# Writes the model back to the same path it was saved to and reloaded from earlier.
dump(xgb_no_weights, '/content/drive/My Drive/ImbalancedData/xgb_no_weights.joblib') 
['/content/drive/My Drive/ImbalancedData/xgb_no_weights.joblib']

Model one : XGBoost with Weights on Label/ No Sampling


# Fit the search object on the unsampled training split and keep whatever fit() returns.
# NOTE(review): bayessearch is defined elsewhere; the section title suggests it tunes
# the weight placed on the minority label — confirm against its definition.
ns_model = bayessearch.fit(X=train_x,  y=train_y)
# The return value is discarded, so this call has no effect on the rest of the cell.
os.getcwd()
#ns_model
# Persist the fitted search result so it can be reloaded without refitting.
dump(ns_model, '/content/drive/My Drive/ImbalancedData/ns_model.joblib') 
['/content/drive/My Drive/ImbalancedData/ns_model.joblib']
#dump(ns_model, '/content/drive/My Drive/ImbalancedData/ns_model.joblib') 

# Reload the saved model instead of refitting it.
ns_model = load('/content/drive/My Drive/ImbalancedData/ns_model.joblib') 


# Column 1 of predict_proba (presumably the fraud-class probability — confirm class order).
ns_model_pred = ns_model.predict_proba(test_x)[:,1]

# Print the evaluation metrics on the test split at a 0.5 decision threshold.
Evaluate(labels=test_y, predictions=ns_model_pred, p=0.5)
Legitimate Transactions Detected (True Negatives): 56839
Fraudulent Transactions Missed (False Negatives):  15
Fraudulent Transactions Detected (True Positives): 90
Legitimate Transactions Incorrectly Detected (False Positives):18
Total Fraudulent Transactions:  105
auc :0.9961524191434317
precision :0.8333333333333334
recall :0.8571428571428571
f1 :0.8450704225352113
Area Under PR Curve(AP): 0.8435

XGBoost with Undersampling

from collections import Counter
# NOTE: make_classification is imported but not used in this cell.
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler


# Resample the training split only; the test split is left untouched.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X=train_x,  y=train_y)
# Report the class counts after resampling (402 per class in the recorded run).
print('Resampled dataset shape %s' % Counter(y_rus))


# NOTE(review): this re-fits the same clf object used by the other XGBoost cells;
# if fit() returns clf itself, xgb_model_rus is another name for clf — confirm.
xgb_model_rus = clf.fit(X=X_rus, y=y_rus)                             



# Persist the model fitted on the undersampled data.
dump(xgb_model_rus, '/content/drive/My Drive/ImbalancedData/xgb_model_rus.joblib') 
Resampled dataset shape Counter({0: 402, 1: 402})





['/content/drive/My Drive/ImbalancedData/xgb_model_rus.joblib']
import collections

# Reload the model that was fitted on the undersampled training data.
xgb_model_rus = load('/content/drive/My Drive/ImbalancedData/xgb_model_rus.joblib')

# Single decision threshold shared by every thresholded metric in this cell.
# Previously 0.5 was hard-coded in several places, so changing p would have
# left the confusion matrix, Evaluate() and the counter out of sync.
p = 0.5

# Column 1 of predict_proba (presumably the fraud-class probability — confirm class order).
xgb_model_rus_pred = xgb_model_rus.predict_proba(test_x.values)[:, 1]

print(" balanced_accuracy_score {}".format(balanced_accuracy_score(test_y, xgb_model_rus_pred > p)))

cm = confusion_matrix(test_y, xgb_model_rus_pred > p)
print("confusion matrix : {}".format(cm))

Evaluate(labels=test_y, predictions=xgb_model_rus_pred, p=p)

# Count how many test transactions fall on each side of the threshold.
counter = collections.Counter(xgb_model_rus_pred > p)
print(counter)
 balanced_accuracy_score 0.9634347489985318
confusion matrix : [[54865  1992]
 [    4   101]]
Legitimate Transactions Detected (True Negatives): 54865
Fraudulent Transactions Missed (False Negatives):  4
Fraudulent Transactions Detected (True Positives): 101
Legitimate Transactions Incorrectly Detected (False Positives):1992
Total Fraudulent Transactions:  105
auc :0.9926581055061278
precision :0.0482560917343526
recall :0.9619047619047619
f1 :0.09190172884440401
Area Under PR Curve(AP): 0.5033
Counter({False: 54869, True: 2093})
# Per-class precision/recall/F1 for the undersampled model at threshold p.
print("Classification Report")
print(classification_report(test_y, xgb_model_rus_pred > p))

# ROC curve and Area-Under-Curve (AUC)
#calculating accuracy
# NOTE(review): the "_sm" names below hold results for the undersampled model,
# not SMOTE; the SMOTE cell assigns the same names again.
accuracy_xgbm_sm= accuracy_score(test_y, xgb_model_rus_pred>p)

print('accuracy score : {:0.3f}'.format( accuracy_xgbm_sm))

# ROC AUC is computed from the continuous scores, not the thresholded labels.
roc_auc_sm = roc_auc_score(test_y, xgb_model_rus_pred)

print('roc score : {:0.3f}'.format( roc_auc_sm))

Classification Report
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56857
           1       0.05      0.96      0.09       105

    accuracy                           0.96     56962
   macro avg       0.52      0.96      0.54     56962
weighted avg       1.00      0.96      0.98     56962

accuracy score : 0.965
roc score : 0.993

SMOTE

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE

print('Original dataset shape %s' % Counter(train_y))

# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=42)

X_res, y_res = sm.fit_resample(X=train_x,  y=train_y)
print('Resampled dataset shape %s' % Counter(y_res))

# NOTE(review): this re-fits the same clf object used by the other XGBoost cells;
# if fit() returns clf itself, xgb_model_sm is another name for clf — confirm.
xgb_model_sm = clf.fit(X=X_res, y=y_res)
# Hard class labels, used for the confusion matrix, report and accuracy.
xgb_model_sm_pred = xgb_model_sm.predict(test_x.values)

cm = confusion_matrix(test_y, xgb_model_sm_pred)

print("confusion matrix")
print(cm)
print("Classification Report")
print(classification_report(test_y, xgb_model_sm_pred))

# Accuracy is a thresholded metric, so the hard labels are the right input.
accuracy_xgbm_sm = accuracy_score(test_y, xgb_model_sm_pred)
print('accuracy score : {:0.3f}'.format( accuracy_xgbm_sm))

# ROC AUC is a ranking metric and must be computed from continuous scores.
# Feeding it the hard labels from predict() (as before) reduces the ROC curve
# to a single operating point and understates the model's AUC.
roc_auc_sm = roc_auc_score(test_y, xgb_model_sm.predict_proba(test_x.values)[:, 1])
print('roc score : {:0.3f}'.format( roc_auc_sm))

# Persist the SMOTE-trained model, then reload it from the saved file.
dump(xgb_model_sm, '/content/drive/My Drive/ImbalancedData/xgb_model_sm.joblib') 
xgb_model_sm = load('/content/drive/My Drive/ImbalancedData/xgb_model_sm.joblib') 

# Rebinds xgb_model_sm_pred to column 1 of predict_proba
# (presumably the fraud-class probability — confirm class order).
xgb_model_sm_pred  =  xgb_model_sm.predict_proba(test_x.values)[:,1]
# Print the evaluation metrics on the test split at a 0.5 decision threshold.
Evaluate(labels=test_y, predictions=xgb_model_sm_pred, p=0.5)

Legitimate Transactions Detected (True Negatives): 56306
Fraudulent Transactions Missed (False Negatives):  6
Fraudulent Transactions Detected (True Positives): 99
Legitimate Transactions Incorrectly Detected (False Positives):551
Total Fraudulent Transactions:  105
auc :0.9948591160614306
precision :0.1523076923076923
recall :0.9428571428571428
f1 :0.2622516556291391
Area Under PR Curve(AP): 0.5458

Forest of randomized trees

BalancedRandomForestClassifier is another ensemble method in which each tree of the forest will be provided a balanced bootstrap sample. This class provides all functionality of the sklearn.ensemble.RandomForestClassifier and notably the feature_importances_ attributes:

from imblearn.ensemble import BalancedRandomForestClassifier
# Balanced random forest with 100 trees and a fixed seed, fitted on
# train_x / train_y.
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
brf.fit(train_x,  train_y )

# Hard class labels for the test split.
brf_pred = brf.predict(test_x)
# NOTE(review): this value is neither assigned nor printed, so it is only
# visible if the expression happens to be the last line of a notebook cell.
balanced_accuracy_score(test_y, brf_pred)  

#brf.feature_importances

# Persist the fitted model, then reload it from the saved file.
dump(brf, '/content/drive/My Drive/ImbalancedData/brf.joblib')
brf = load('/content/drive/My Drive/ImbalancedData/brf.joblib') 

# Rebind brf_pred from hard labels to column 1 of predict_proba
# (presumably the fraud-class probability -- confirm the class order).
# p=0.5 is presumably the decision threshold used by Evaluate -- confirm.
brf_pred  =  brf.predict_proba(test_x.values)[:,1]
Evaluate(labels=test_y, predictions= brf_pred, p=0.5)
Legitimate Transactions Detected (True Negatives): 55509
Fraudulent Transactions Missed (False Negatives):  2
Fraudulent Transactions Detected (True Positives): 103
Legitimate Transactions Incorrectly Detected (False Positives):1348
Total Fraudulent Transactions:  105
auc :0.9878704887868227
precision :0.0709855272226051
recall :0.9809523809523809
f1 :0.13239074550128535
Area Under PR Curve(AP): 0.5241

RusBoost

from imblearn.ensemble import RUSBoostClassifier
from sklearn.datasets import make_classification


# RUSBoost ensemble built on a default DecisionTreeClassifier, fixed seed.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator`, and the install in this notebook is unpinned -- confirm the
# installed version still accepts this keyword.
rbt = RUSBoostClassifier(random_state=0,
                               base_estimator=DecisionTreeClassifier(),
                             sampling_strategy='auto')
  
# Fit the RUSBoost ensemble on the training split (no grid search is involved).
rbt.fit(X=train_x,  y=train_y)
  
# Persist the fitted model, then reload it from the saved file.
dump(rbt, '/content/drive/My Drive/ImbalancedData/rbt.joblib')
rbt = load('/content/drive/My Drive/ImbalancedData/rbt.joblib') 

# Column 1 of predict_proba is kept as the score (presumably the fraud-class
# probability -- confirm the class order) and passed to Evaluate.
rbt_pred  =  rbt.predict_proba(test_x.values)[:,1]
Evaluate(labels=test_y, predictions= rbt_pred, p=0.5)
Legitimate Transactions Detected (True Negatives): 55970
Fraudulent Transactions Missed (False Negatives):  5
Fraudulent Transactions Detected (True Positives): 100
Legitimate Transactions Incorrectly Detected (False Positives):887
Total Fraudulent Transactions:  105
auc :0.9918764452507
precision :0.10131712259371833
recall :0.9523809523809523
f1 :0.18315018315018314
Area Under PR Curve(AP): 0.5250

EasyEnsembleClassifier

EasyEnsemble is a specific method that uses AdaBoost learners inside a bagging classifier. The EasyEnsembleClassifier bags AdaBoost learners, each of which is trained on a balanced bootstrap sample. Similarly to the BalancedBaggingClassifier API, the ensemble can be constructed as:

from imblearn.ensemble import EasyEnsembleClassifier
# EasyEnsemble built on a default AdaBoostClassifier, fixed seed.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator`, and the install in this notebook is unpinned -- confirm the
# installed version still accepts this keyword.
eec = EasyEnsembleClassifier(random_state=0,
                             base_estimator=AdaBoostClassifier(),
                            sampling_strategy='auto')





# Fit the ensemble on the training split.
eec.fit(X=train_x,  y=train_y)

# Persist the fitted model, then reload it from the saved file.
dump(eec, '/content/drive/My Drive/ImbalancedData/eec.joblib')
eec = load('/content/drive/My Drive/ImbalancedData/eec.joblib') 

# Column 1 of predict_proba is kept as the score (presumably the fraud-class
# probability -- confirm the class order) and passed to Evaluate.
eec_pred  =  eec.predict_proba(test_x.values)[:,1]
Evaluate(labels=test_y, predictions= eec_pred, p=0.5)
Legitimate Transactions Detected (True Negatives): 54565
Fraudulent Transactions Missed (False Negatives):  3
Fraudulent Transactions Detected (True Positives): 102
Legitimate Transactions Incorrectly Detected (False Positives):2292
Total Fraudulent Transactions:  105
auc :0.9912644671636528
precision :0.042606516290726815
recall :0.9714285714285714
f1 :0.08163265306122448
Area Under PR Curve(AP): 0.5052

Bagging classifier

BalancedBaggingClassifier resamples each subset of the data before training each estimator of the ensemble. In short, it combines the output of an EasyEnsemble sampler with an ensemble of classifiers (i.e. BaggingClassifier). Therefore, BalancedBaggingClassifier takes the same parameters as the scikit-learn BaggingClassifier. In addition, there are two extra parameters, sampling_strategy and replacement, which control the behaviour of the random under-sampler:

from imblearn.ensemble import BalancedBaggingClassifier

from imblearn.ensemble import BalancedBaggingClassifier
# Balanced bagging ensemble of default decision trees; under-sampling is done
# without replacement, all available cores are requested, and the seed is fixed.
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator`, and the install in this notebook is unpinned -- confirm the
# installed version still accepts this keyword.
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                 replacement=False,
                                n_jobs=-1,
                               random_state=0)


# Fit on the training split and persist the fitted model.
bbc.fit(X=train_x,  y=train_y)
dump(bbc, '/content/drive/My Drive/ImbalancedData/bbc.joblib')

# Unlike the other model cells, the reloaded model gets its own name
# (bbc_model) instead of rebinding bbc.
bbc_model = load('/content/drive/My Drive/ImbalancedData/bbc.joblib') 
# Column 1 of predict_proba is kept as the score (presumably the fraud-class
# probability -- confirm the class order) and passed to Evaluate.
bbc_pred  =  bbc_model.predict_proba(test_x.values)[:,1]
Evaluate(labels=test_y, predictions= bbc_pred, p=0.5)
Legitimate Transactions Detected (True Negatives): 55701
Fraudulent Transactions Missed (False Negatives):  5
Fraudulent Transactions Detected (True Positives): 100
Legitimate Transactions Incorrectly Detected (False Positives):1156
Total Fraudulent Transactions:  105
auc :0.9845647853386567
precision :0.07961783439490445
recall :0.9523809523809523
f1 :0.1469507714915503
Area Under PR Curve(AP): 0.5142
# Collect the test labels and every model's test-set scores in one frame and
# save it to Drive. Columns are added in the order listed below.
pred_df = pd.DataFrame(test_y, index=None)

model_score_columns = {
    'baggingclassifier_pred': bbc_pred,
    'easyensemble_pred': eec_pred,
    'RusBoost_pred': rbt_pred,
    'forest_r_t': brf_pred,
    'xgb_rus_pred': xgb_model_rus_pred,
    'xgb_smote_pred': xgb_model_sm_pred,
    'xgboost_weights': ns_model_pred,
    "xgb_no_weights_pred": xgb_no_weights_pred,
}
for column_name, column_scores in model_score_columns.items():
    pred_df[column_name] = column_scores

pred_df.to_csv('/content/drive/My Drive/ImbalancedData/pred_df.csv')

#
#pred_df.head()
#pred_df.test_y.isna().sum()
#test_y
#test_y.isna().sum()

Plot the AUC ROC

from sklearn.metrics import roc_curve

# Default figure size and the colour cycle shared by the comparison plots.
mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']


def plot_roc(name, labels, predictions, p=0.5, **kwargs):
    """Draw one ROC curve on the current axes.

    Args:
        name: Legend label for the curve.
        labels: True binary labels.
        predictions: Scores passed to ``roc_curve``.
        p: Value shown in the plot title only; it does not affect the curve.
        **kwargs: Forwarded to the line plot (e.g. ``color``, ``linestyle``).
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(labels, predictions)

    axes = plt.gca()
    # Rates are plotted as percentages.
    axes.plot(100 * false_pos_rate, 100 * true_pos_rate,
              label=name, linewidth=2, **kwargs)
    axes.set_xlabel('False positives [%]')
    axes.set_ylabel('True positives [%]')
    # Zoom in on the region where the curves differ.
    axes.set_xlim([-0.5, 80])
    axes.set_ylim([20, 100.5])
    axes.set_title('Area Under ROC Curve @{:.2f}'.format(p))
    axes.grid(True)
    axes.set_aspect('equal')
#%matplotlib inline
# Overlay the ROC curve of every model on one figure and save it to Drive.
sns.set_style("whitegrid")

# (legend label, test-set scores, extra line styling); the position in this
# list selects the colour from `colors`.
roc_series = [
    ("xgboost No Weight", xgb_no_weights_pred, {"linestyle": '--'}),
    ("Xgboost Weight", ns_model_pred, {}),
    ("Xgboost Under-Sampling", xgb_model_rus_pred, {}),
    ("Xgboost Smote", xgb_model_sm_pred, {}),
    ("Forest of Randomized Trees", brf_pred, {}),
    ("RusBoost", rbt_pred, {}),
    ("EasyEnsemble Classifier", eec_pred, {}),
    ("Bagging Classifier", bbc_pred, {}),
]
for roc_idx, (roc_label, roc_scores, roc_style) in enumerate(roc_series):
    plot_roc(roc_label, test_y, roc_scores, color=colors[roc_idx], **roc_style)

plt.legend(loc='lower right')
plt.savefig('/content/drive/My Drive/ImbalancedData/all_rocauc.png')

png

Plot the Area Under Precision Recall Curve

from sklearn.metrics import precision_recall_curve


def plot_auc_pr(name, labels, predictions, n=0.5, **kwargs):
    """Draw one precision-recall curve on the current axes.

    Args:
        name: Legend label for the curve.
        labels: True binary labels.
        predictions: Scores passed to ``precision_recall_curve``.
        n: Value shown in the plot title only; it does not affect the curve.
        **kwargs: Forwarded to the line plot (e.g. ``color``, ``linestyle``).
    """
    precision_vals, recall_vals, _ = precision_recall_curve(labels, predictions)

    axes = plt.gca()
    # Recall on x, precision on y, both as percentages.
    axes.plot(100 * recall_vals, 100 * precision_vals,
              label=name, linewidth=2, **kwargs)
    axes.set_xlabel('Recall [%]')
    axes.set_ylabel('Precision [%]')
    axes.set_xlim([-0.5, 100])
    # Precision values below 20% fall outside the visible range.
    axes.set_ylim([20, 100.5])
    axes.set_title('Area Under Precision-Recall Curve @{:.2f}'.format(n))
    axes.grid(True)
    axes.set_aspect('equal')
#plot_auc_pr("Train  Weight", train_labels, train_predictions_weight, color=colors[1])
# Overlay the precision-recall curve of every model on one figure and save it
# to Drive.
sns.set_style("whitegrid")

# (legend label, test-set scores, extra line styling); the position in this
# list selects the colour from `colors`.
pr_series = [
    ("xgboost No Weight", xgb_no_weights_pred, {"linestyle": '--'}),
    ("Xgboost Weight", ns_model_pred, {}),
    ("Xgboost Under-Sampling", xgb_model_rus_pred, {}),
    ("Xgboost Smote", xgb_model_sm_pred, {}),
    ("Forest of Randomized Trees", brf_pred, {}),
    ("RusBoost", rbt_pred, {}),
    ("EasyEnsemble Classifier", eec_pred, {}),
    ("Bagging Classifier", bbc_pred, {}),
]
for pr_idx, (pr_label, pr_scores, pr_style) in enumerate(pr_series):
    plot_auc_pr(pr_label, test_y, pr_scores, color=colors[pr_idx], **pr_style)

plt.legend(loc='lower left')
plt.savefig('/content/drive/My Drive/ImbalancedData/auc_pr2.png')

png

Cost-Sensitive Logistic Regression

Logistic Regression is a well-known statistical model for binary targets that is often overlooked. It will be interesting to see how it performs when presented with an imbalanced target class. It can be adapted to cost-sensitive learning on imbalanced data by giving the minority class a larger weight in the loss.


from sklearn.linear_model import ElasticNet
from sklearn import linear_model

# SGD-trained linear classifier with balanced class weights.
# NOTE(review): neither `loss` nor `penalty` is passed, so the library
# defaults apply -- confirm they match the logistic / elastic-net model that
# the variable name and section title suggest.
elasticreg = linear_model.SGDClassifier(
    class_weight='balanced',
    max_iter=10000,
    tol=1e-3,
    warm_start=True,
    n_jobs=-1,
)

# Two-panel figure for the class-weight sweep: precision-recall curves on the
# left (ax1), zoomed ROC curves on the right (ax2).
plt.style.use('ggplot')
sns.set_style("whitegrid")

threshhold = 0.5
fig = plt.figure(figsize=(15, 8))

# Left panel: full 0-100 range on both axes.
ax1 = fig.add_subplot(1, 2, 1)
ax1.set(
    xlim=[0, 100],
    ylim=[0, 100],
    xlabel='Recall',
    ylabel='Precision',
    title='Area Under Precision-Recall Curve @{:.2f}'.format(threshhold),
)

# Right panel: zoomed into the top-left corner of the ROC plot.
ax2 = fig.add_subplot(1, 2, 2)
ax2.set(
    xlim=[-0.5, 30],
    ylim=[80, 100],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='Area Under ROC Curve @{:.2f}'.format(threshhold),
)



# Sweep the minority-class weight of a logistic regression. For each weight
# the model is refitted and scored on the test split; its metrics are appended
# to the accumulators below and its PR / ROC curves are drawn on ax1 / ax2.
rocauc_vector = []
f1_vector = []
prec_vector = []
rec_vector = []
cfn_matrix_ = []
pr_auc_vector = []

# Decision threshold for the hard-label metrics; constant across all weights.
threshhold = 0.5

# One line colour per weight. The last entry must not be white ('w'), which
# is invisible on the white "whitegrid" background.
sweep_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'tab:orange']

for w, k in zip([1, 5, 10, 20, 50, 100, 500, 10000], sweep_colors):
    lr_model = LogisticRegression(class_weight={0: 1, 1: w})
    lr_model.fit(train_x, train_y)

    # Column 1 of predict_proba is used as the positive-class score.
    pred_prob = lr_model.predict_proba(test_x)[:, 1]
    pred_label = pred_prob > threshhold

    # Curves are built from the continuous scores.
    p, r, _ = precision_recall_curve(test_y, pred_prob)
    # roc_curve returns (fpr, tpr, thresholds), in that order.
    fpr, tpr, _ = roc_curve(test_y, pred_prob)

    auc = roc_auc_score(test_y, pred_prob)
    f1 = f1_score(test_y, pred_label)
    prec = precision_score(test_y, pred_label)
    rec = recall_score(test_y, pred_label)
    cfn_matrix = confusion_matrix(test_y, pred_label)

    rocauc_vector.append(auc)
    f1_vector.append(f1)
    prec_vector.append(prec)
    rec_vector.append(rec)
    cfn_matrix_.append(cfn_matrix)

    # Area under the precision-recall curve by the trapezoidal rule:
    # precision is integrated over recall. precision_recall_curve returns
    # recall in decreasing order, so both arrays are reversed to integrate
    # over increasing recall. The previous version integrated a three-point
    # curve built from thresholded labels, with the axes swapped.
    area = trapz(p[::-1], r[::-1])
    pr_auc_vector.append(area)

    ax1.plot(r * 100, p * 100, c=k, label=w)
    ax2.plot(fpr * 100, tpr * 100, c=k, label=w)

ax1.legend(loc='lower left')
ax2.legend(loc='lower right')
plt.savefig('/content/drive/My Drive/ImbalancedData/logistic.png')
plt.show()

png


# Summary table of the class-weight sweep: one row per minority-class weight.
# The weights must match the ones used in the sweep loop.
class_weights = [1, 5, 10, 20, 50, 100, 500, 10000]
results = pd.DataFrame({
    'Weight': class_weights,
    'ROC_AUC': rocauc_vector,
    'F-Score': f1_vector,
    'Precision': prec_vector,
    'Recall': rec_vector,
})

# Read the confusion counts from the matrices collected during the sweep
# instead of hand-copied numbers, which go stale on every re-run. A binary
# confusion_matrix is laid out as [[TN, FP], [FN, TP]].
results['TP'] = [int(cm[1, 1]) for cm in cfn_matrix_]
results['TN'] = [int(cm[0, 0]) for cm in cfn_matrix_]
results['FP'] = [int(cm[0, 1]) for cm in cfn_matrix_]
results['FN'] = [int(cm[1, 0]) for cm in cfn_matrix_]
results['PR_AUC'] = pr_auc_vector
results
Weight ROC_AUC F-Score Precision Recall TP TN FP FN PR_AUC
0 1 0.959148 0.725275 0.916667 0.600000 66 56846 6 46 0.756788
1 5 0.976090 0.775701 0.798077 0.754545 83 56831 21 27 0.774617
2 10 0.978801 0.801802 0.794643 0.809091 89 56829 23 21 0.800120
3 20 0.982562 0.738956 0.661871 0.836364 92 56805 47 18 0.747344
4 50 0.985512 0.618893 0.482234 0.863636 95 56750 102 15 0.671135
5 100 0.986236 0.431461 0.286567 0.872727 96 56613 239 14 0.577839
6 500 0.988871 0.144803 0.078599 0.918182 101 55668 1184 9 0.496538
7 10000 0.986018 0.013096 0.006592 0.981818 108 40576 16276 2 0.492291

Cost-sensitive logistic regression with a weight of 10 on the minority class performs well in comparison to the other weights. It achieves the highest F-score of the weights tried (about 0.80) while keeping a high ROC-AUC and a high area under the precision-recall curve.