In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [2]:
pre_data = pd.read_csv("./preprocessed_data.csv")
In [3]:
data = pre_data.drop(columns=['Company_Name'])
In [4]:
data.head()
Out[4]:
Organization_Number Registration_Month Bransje Fylke Kommune Stiftet Share_Capital Organization_Form Ansatte Class
0 811108952 8 41 7 704 1988 120000 10 0 1
1 811549452 4 74 1 101 2013 30000 10 0 1
2 811594962 2 43 50 5001 2013 30000 10 0 1
3 811618462 11 41 18 1860 2013 0 2 1 1
4 811678252 10 43 50 5024 2013 30000 10 0 1
In [5]:
data.shape
Out[5]:
(26531, 10)
In [6]:
### Get the train and test data-set, with and without sampling Train - Test data split without resampling
In [7]:
X = data.iloc[:, data.columns != 'Class'].values
y = data.iloc[:, data.columns == 'Class'].values
In [8]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)

print("Original number transactions train dataset: ", len(X_train))
print("Original number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))
Original number transactions train dataset:  18571
Original number transactions test dataset:  7960
Total number of transactions:  26531
In [9]:
# Number of data points in the minority class
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
print(number_records_fraud)
#print(data[data.Class == 1])
6286
In [10]:
# Picking the indices of the normal classes
normal_indices = data[data.Class == 0].index

# Out of the indices we picked, randomly select "x" number (number_records_fraud)
#np.random.choice(). By using this, the numbers of fraud indices and non-fraud indices become equal. 
random_normal_indices = np.random.choice(a = normal_indices, size = number_records_fraud, replace = False)
random_normal_indices = np.array(random_normal_indices)
In [11]:
# Appending the 2 indices
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])
print(under_sample_indices)
[    0     1     2 ... 11945 22539 19118]
In [12]:
# Under sample dataset
under_sample_data = data.iloc[under_sample_indices,:]


X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']

# Showing ratio
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
Percentage of normal transactions:  0.5
Percentage of fraud transactions:  0.5
Total number of transactions in resampled data:  12572
In [13]:
# Undersampled dataset
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample
                                                                                                   ,y_undersample
                                                                                                   ,test_size = 0.3
                                                                                                   ,random_state = 0)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))
Number transactions train dataset:  8800
Number transactions test dataset:  3772
Total number of transactions:  12572
In [14]:
# MLP
In [15]:
# 从sklearn.preprocessing里导入数据标准化模块。
from sklearn.preprocessing import StandardScaler

# 从sklearn.svm里导入基于线性假设的支持向量机分类器LinearSVC。
from sklearn.neural_network import MLPClassifier

# 从仍然需要对训练和测试的特征数据进行标准化。
ss = StandardScaler()
X_train_undersample = ss.fit_transform(X_train_undersample)
X_test_undersample = ss.transform(X_test_undersample)

# 初始化线性假设的支持向量机分类器LinearSVC。
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
#进行模型训练
mlp.fit(X_train_undersample, y_train_undersample)
# 利用训练好的模型对测试样本的数字类别进行预测,预测结果储存在变量y_predict中。
y_predict_undersample = mlp.predict(X_test_undersample)
D:\Anaconda_new\lib\site-packages\sklearn\neural_network\multilayer_perceptron.py:912: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
In [16]:
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report 
In [17]:
# Compute and plot confusion matrix
cnf_matrix = confusion_matrix(y_test_undersample,y_predict_undersample)

#Model overall accuracy
print("the Model overall accuracy is :",(cnf_matrix[1,1]+cnf_matrix[0,0])/(cnf_matrix[1,1]+cnf_matrix[1,0]+cnf_matrix[1,0]+cnf_matrix[0,0]))
print()
print("the recall of fraud is :",cnf_matrix[1,1]/(cnf_matrix[1,1]+cnf_matrix[1,0]))
print("the precision of fraud is :",cnf_matrix[1,1]/(cnf_matrix[1,1]+cnf_matrix[0,1]))
print()
print("the recall of normal is :",cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[0,1]))
print("the precision of normal is :",cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[1,0]))

fig= plt.figure(figsize=(6,3))# to plot the graph
print("TP",cnf_matrix[1,1]) # no of fraud transaction which are predicted fraud
print("TN",cnf_matrix[0,0]) # no.of normal transaction which are predited normal
print("FP",cnf_matrix[0,1]) # no of normal transaction which are predicted fraud
print("FN",cnf_matrix[1,0]) # no of fraud Transaction which are predicted normal
sns.heatmap(cnf_matrix,cmap="coolwarm_r",annot=True,linewidths=0.5)
plt.title("Confusion_matrix")
plt.xlabel("Predicted_class")
plt.ylabel("Real class")
plt.show()
the Model overall accuracy is : 0.9883720930232558

the recall of fraud is : 0.9881337648327939
the precision of fraud is : 0.99457111834962

the recall of normal is : 0.9947862356621481
the precision of normal is : 0.9886010362694301
TP 1832
TN 1908
FP 10
FN 22
In [18]:
#NEXT IS USE THE GLOBAL DATA
In [19]:
# 从sklearn.preprocessing里导入数据标准化模块。
from sklearn.preprocessing import StandardScaler

# 从sklearn.svm里导入基于线性假设的支持向量机分类器LinearSVC。
from sklearn.svm import LinearSVC

# 从仍然需要对训练和测试的特征数据进行标准化。
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# 初始化线性假设的支持向量机分类器LinearSVC。
mlp_ = MLPClassifier(solver='lbfgs',alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
#进行模型训练
mlp_.fit(X_train, y_train)
# 利用训练好的模型对测试样本的数字类别进行预测,预测结果储存在变量y_predict中。
y_predict = mlp_.predict(X_test)
D:\Anaconda_new\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
D:\Anaconda_new\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
D:\Anaconda_new\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
D:\Anaconda_new\lib\site-packages\sklearn\neural_network\multilayer_perceptron.py:912: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
In [21]:
# Compute and plot confusion matrix
cnf_matrix = confusion_matrix(y_test,y_predict)

#Model overall accuracy
print("the Model overall accuracy is :",(cnf_matrix[1,1]+cnf_matrix[0,0])/(cnf_matrix[1,1]+cnf_matrix[1,0]+cnf_matrix[1,0]+cnf_matrix[0,0]))
print()
print("the recall of fraud is :",cnf_matrix[1,1]/(cnf_matrix[1,1]+cnf_matrix[1,0]))
print("the precision of fraud is :",cnf_matrix[1,1]/(cnf_matrix[1,1]+cnf_matrix[0,1]))
print()
print("the recall of normal is :",cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[0,1]))
print("the precision of normal is :",cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[1,0]))

fig= plt.figure(figsize=(6,3))# to plot the graph
print("TP",cnf_matrix[1,1]) # no of fraud transaction which are predicted fraud
print("TN",cnf_matrix[0,0]) # no.of normal transaction which are predited normal
print("FP",cnf_matrix[0,1]) # no of normal transaction which are predicted fraud
print("FN",cnf_matrix[1,0]) # no of fraud Transaction which are predicted normal
sns.heatmap(cnf_matrix,cmap="coolwarm_r",annot=True,linewidths=0.5)
plt.title("Confusion_matrix")
plt.xlabel("Predicted_class")
plt.ylabel("Real class")
plt.show()
the Model overall accuracy is : 0.9922276545067068

the recall of fraud is : 0.9835106382978723
the precision of fraud is : 0.9924852388620504

the recall of normal is : 0.9976973684210526
the precision of normal is : 0.9949155322289651
TP 1849
TN 6066
FP 14
FN 31