In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [2]:
pre_data = pd.read_csv("./preprocessed_data.csv")
In [3]:
data = pre_data.drop(columns=['Company_Name'])
In [4]:
data.head()
Out[4]:
   Organization_Number  Registration_Month  Bransje  Fylke  Kommune  Stiftet  Share_Capital  Organization_Form  Ansatte  Class
0            811108952                   8       41      7      704     1988         120000                 10        0      1
1            811549452                   4       74      1      101     2013          30000                 10        0      1
2            811594962                   2       43     50     5001     2013          30000                 10        0      1
3            811618462                  11       41     18     1860     2013              0                  2        1      1
4            811678252                  10       43     50     5024     2013          30000                 10        0      1
In [5]:
data.shape
Out[5]:
(26531, 10)
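The Class column is what all of the resampling below is built around, so it is worth confirming the imbalance up front. A minimal check (not part of the original run, so no output is shown; it should agree with the fraud count computed further down):

data['Class'].value_counts()                 # absolute counts of normal (0) vs. fraud (1) rows
data['Class'].value_counts(normalize=True)   # same counts expressed as proportions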
In [6]:
### Get the train and test datasets, with and without resampling: first, a train-test split without resampling
In [7]:
X = data.iloc[:, data.columns != 'Class'].values
y = data.iloc[:, data.columns == 'Class'].values
In [8]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)

print("Original number transactions train dataset: ", len(X_train))
print("Original number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))
Original number of transactions in the train dataset:  18571
Original number of transactions in the test dataset:  7960
Total number of transactions:  26531
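Because the classes are imbalanced, the plain split above can leave the two halves with slightly different fraud ratios. train_test_split accepts a stratify argument that preserves the class proportions in both halves; a possible variant (a sketch only, not the split used for the results below):

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size = 0.3, random_state = 0, stratify = y)  # keep the 0/1 ratio equal in train and test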
In [9]:
# Number of data points in the minority class
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
print(number_records_fraud)
#print(data[data.Class == 1])
6286
In [10]:
# Picking the indices of the normal classes
normal_indices = data[data.Class == 0].index

# Out of the normal indices, randomly select as many as there are fraud records (number_records_fraud)
# using np.random.choice, so that the numbers of fraud and non-fraud indices become equal.
random_normal_indices = np.random.choice(a = normal_indices, size = number_records_fraud, replace = False)
random_normal_indices = np.array(random_normal_indices)
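np.random.choice is not seeded here, so the undersampled subset changes from run to run. A seeded variant makes the experiment reproducible (the seed value 0 is an arbitrary assumption):

rng = np.random.RandomState(0)  # fixed seed so the same normal rows are drawn every run
random_normal_indices = rng.choice(a = normal_indices, size = number_records_fraud, replace = False)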
In [11]:
# Concatenating the fraud indices and the sampled normal indices
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])
print(under_sample_indices)
[    0     1     2 ... 10007  7523 22501]
In [12]:
# Under sample dataset
under_sample_data = data.iloc[under_sample_indices,:]


X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']

# Showing ratio
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
Percentage of normal transactions:  0.5
Percentage of fraud transactions:  0.5
Total number of transactions in resampled data:  12572
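The same 50/50 subset can also be built in one step with pandas alone; a sketch of an equivalent construction (random_state=0 is an assumption added for reproducibility):

fraud = data[data.Class == 1]
normal_sampled = data[data.Class == 0].sample(n = len(fraud), random_state = 0)  # draw as many normal rows as there are fraud rows
under_sample_alt = pd.concat([fraud, normal_sampled])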
In [13]:
# Undersampled dataset
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample
                                                                                                   ,y_undersample
                                                                                                   ,test_size = 0.3
                                                                                                   ,random_state = 0)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))
Number of transactions in the train dataset:  8800
Number of transactions in the test dataset:  3772
Total number of transactions:  12572
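Since train_test_split shuffles before splitting, both halves should stay close to the 50/50 ratio of the undersampled data; this can be verified directly (sketch, output not shown):

print(y_train_undersample['Class'].value_counts())
print(y_test_undersample['Class'].value_counts())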
In [14]:
# Logistic Regression
In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report 
D:\Anaconda_new\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [16]:
def printing_Kfold_scores(x_train_data,y_train_data):
    # sklearn.cross_validation.KFold(n, n_folds, shuffle=False, random_state=None)
    fold = KFold(len(y_train_data), 5, shuffle=False)
    
    #Different C parameters
    c_param_range = [0.01,0.1,1,10,100]
    
    results_table = pd.DataFrame(index = range(len(c_param_range)), columns = ['C_parameter','Mean recall score'])
    results_table['C_parameter'] = c_param_range

    # the k-fold will give 2 lists: train_indices = indices[0], test_indices = indices[1]
    j = 0
    for c_param in c_param_range:
        print('-------------------------------------------')
        print('C parameter: ', c_param)
        print('-------------------------------------------')
        print('')

        recall_accs = []
        for iteration, indices in enumerate(fold,start=1):

            # Call the logistic regression model with a certain C parameter
            lr = LogisticRegression(C = c_param, penalty = 'l1')

            # Use the training data to fit the model. In this case, we use the portion of the fold to train the model
            # with indices[0]. We then predict on the portion assigned as the 'test cross validation' with indices[1]
            lr.fit(x_train_data.iloc[indices[0],:],y_train_data.iloc[indices[0],:].values.ravel())

            # Predict values using the test indices in the training data
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1],:].values)

            # Calculate the recall score and append it to a list for recall scores representing the current c_parameter
            recall_acc = recall_score(y_train_data.iloc[indices[1],:].values,y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration,': recall score = ', recall_acc)

        # The mean value of those recall scores is the metric we want to save and get hold of.
        results_table.ix[j,'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall score ', np.mean(recall_accs))
        print('')
    
    max_mean_recall_score = results_table["Mean recall score"].max()
    best_c = None
    for i in range(results_table["Mean recall score"].count()):
        if results_table.ix[i,'Mean recall score'] == max_mean_recall_score:
            best_c = c_param_range[i]
        
            
    # Finally, report which C parameter is the best amongst those tried.
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')
    
    return best_c
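For reference, the same search over C with recall as the selection metric can be written much more compactly against the current API using GridSearchCV. This is a sketch only, not a drop-in replacement for the function above; solver='liblinear' is an assumption needed for the l1 penalty on newer scikit-learn:

from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(LogisticRegression(penalty = 'l1', solver = 'liblinear'),
                    param_grid, scoring = 'recall', cv = 5)       # 5-fold CV, select on mean recall
grid.fit(X_train_undersample, y_train_undersample.values.ravel())
print(grid.best_params_)                                          # e.g. {'C': ...}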
In [17]:
best_c = printing_Kfold_scores(X_train_undersample,y_train_undersample)
-------------------------------------------
C parameter:  0.01
-------------------------------------------

Iteration  1 : recall score =  0.6506300114547537
Iteration  2 : recall score =  0.6465997770345596
Iteration  3 : recall score =  0.6594036697247706
Iteration  4 : recall score =  0.6475869809203143
Iteration  5 : recall score =  0.6685205784204672

Mean recall score  0.6545482035109731

-------------------------------------------
C parameter:  0.1
-------------------------------------------

D:\Anaconda_new\lib\site-packages\ipykernel_launcher.py:39: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
Iteration  1 : recall score =  0.6540664375715922
Iteration  2 : recall score =  0.6432552954292085
Iteration  3 : recall score =  0.6490825688073395
Iteration  4 : recall score =  0.6386083052749719
Iteration  5 : recall score =  0.6507230255839822

Mean recall score  0.6471471265334189

-------------------------------------------
C parameter:  1
-------------------------------------------

Iteration  1 : recall score =  0.6529209621993127
Iteration  2 : recall score =  0.6421404682274248
Iteration  3 : recall score =  0.6571100917431193
Iteration  4 : recall score =  0.6464646464646465
Iteration  5 : recall score =  0.6685205784204672

Mean recall score  0.653431349410994

-------------------------------------------
C parameter:  10
-------------------------------------------

Iteration  1 : recall score =  0.6494845360824743
Iteration  2 : recall score =  0.6432552954292085
Iteration  3 : recall score =  0.6571100917431193
Iteration  4 : recall score =  0.6363636363636364
Iteration  5 : recall score =  0.6674082313681868

Mean recall score  0.650724358197325

-------------------------------------------
C parameter:  100
-------------------------------------------

Iteration  1 : recall score =  0.6540664375715922
Iteration  2 : recall score =  0.6432552954292085
Iteration  3 : recall score =  0.6559633027522935
Iteration  4 : recall score =  0.6464646464646465
Iteration  5 : recall score =  0.6674082313681868

Mean recall score  0.6534315827171855

*********************************************************************************
Best model to choose from cross validation is with C parameter =  0.01
*********************************************************************************
D:\Anaconda_new\lib\site-packages\ipykernel_launcher.py:49: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
In [19]:
# Use this C parameter to build the final model on the undersampled training dataset
# and predict the classes in the undersampled test dataset.
best_c = 0.01  # the value selected by the cross-validation above
lr = LogisticRegression(C = best_c, penalty = 'l1') # with penalty = 'l2', recall is about 90%
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)

# Compute and plot confusion matrix
cnf_matrix = confusion_matrix(y_test_undersample,y_pred_undersample)

# Model overall accuracy
print("the Model overall accuracy is :",(cnf_matrix[1,1]+cnf_matrix[0,0])/(cnf_matrix[1,1]+cnf_matrix[1,0]+cnf_matrix[0,1]+cnf_matrix[0,0]))
print()
print("the recall of fraud is :",cnf_matrix[1,1]/(cnf_matrix[1,1]+cnf_matrix[1,0]))
print("the precision of fraud is :",cnf_matrix[1,1]/(cnf_matrix[1,1]+cnf_matrix[0,1]))
print()
print("the recall of normal is :",cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[0,1]))
print("the precision of normal is :",cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[1,0]))

fig = plt.figure(figsize=(6,3))  # figure for the confusion-matrix heatmap
print("TP",cnf_matrix[1,1]) # number of fraud transactions predicted as fraud
print("TN",cnf_matrix[0,0]) # number of normal transactions predicted as normal
print("FP",cnf_matrix[0,1]) # number of normal transactions predicted as fraud
print("FN",cnf_matrix[1,0]) # number of fraud transactions predicted as normal
sns.heatmap(cnf_matrix,cmap="coolwarm_r",annot=True,linewidths=0.5)
plt.title("Confusion_matrix")
plt.xlabel("Predicted_class")
plt.ylabel("Real class")
plt.show()
the Model overall accuracy is : 0.7097030752916225

the recall of fraud is : 0.6488673139158576
the precision of fraud is : 0.7304189435336976

the recall of normal is : 0.7685088633993743
the precision of normal is : 0.6936470588235294
TP 1203
TN 1474
FP 444
FN 651
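The per-class recall and precision computed by hand above can be cross-checked with classification_report, which is already imported; a quick sketch (output omitted):

print(classification_report(y_test_undersample, y_pred_undersample,
                            target_names = ['normal', 'fraud']))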