In [112]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [124]:
pre_data = pd.read_csv("./preprocessed_data.csv")
In [125]:
data = pre_data.drop(columns=['Company_Name'])
In [126]:
data.head()
Out[126]:
Organization_Number Registration_Month Bransje Fylke Kommune Stiftet Share_Capital Organization_Form Ansatte Class
0 811108952 8 41 7 704 1988 120000 10 0 1
1 811549452 4 74 1 101 2013 30000 10 0 1
2 811594962 2 43 50 5001 2013 30000 10 0 1
3 811618462 11 41 18 1860 2013 0 2 1 1
4 811678252 10 43 50 5024 2013 30000 10 0 1
In [127]:
data.shape
Out[127]:
(26531, 10)

Get the train and test data-set, with and without sampling Train - Test data split without resampling

In [128]:
X = data.iloc[:, data.columns != 'Class'].values
y = data.iloc[:, data.columns == 'Class'].values
In [129]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)

print("Original number transactions train dataset: ", len(X_train))
print("Original number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))
Original number transactions train dataset:  18571
Original number transactions test dataset:  7960
Total number of transactions:  26531
In [130]:
# Number of data points in the minority class
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
print(number_records_fraud)
#print(data[data.Class == 1])
6286
In [131]:
# Picking the indices of the normal classes
normal_indices = data[data.Class == 0].index

# Out of the indices we picked, randomly select "x" number (number_records_fraud)
#np.random.choice(). By using this, the numbers of fraud indices and non-fraud indices become equal. 
random_normal_indices = np.random.choice(a = normal_indices, size = number_records_fraud, replace = False)
random_normal_indices = np.array(random_normal_indices)
In [132]:
# Appending the 2 indices
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])
print(under_sample_indices)
[    0     1     2 ... 20749 14453 18236]
In [133]:
# Under sample dataset
under_sample_data = data.iloc[under_sample_indices,:]


X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']

# Showing ratio
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
Percentage of normal transactions:  0.5
Percentage of fraud transactions:  0.5
Total number of transactions in resampled data:  12572
In [134]:
# Undersampled dataset
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample
                                                                                                   ,y_undersample
                                                                                                   ,test_size = 0.3
                                                                                                   ,random_state = 0)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))
Number transactions train dataset:  8800
Number transactions test dataset:  3772
Total number of transactions:  12572
In [135]:
# 从sklearn.preprocessing里导入数据标准化模块。
from sklearn.preprocessing import StandardScaler

# 从sklearn.svm里导入基于线性假设的支持向量机分类器LinearSVC。
from sklearn.svm import LinearSVC

# 从仍然需要对训练和测试的特征数据进行标准化。
ss = StandardScaler()
X_train_undersample = ss.fit_transform(X_train_undersample)
X_test_undersample = ss.transform(X_test_undersample)

# 初始化线性假设的支持向量机分类器LinearSVC。
lsvc = LinearSVC()
#进行模型训练
lsvc.fit(X_train_undersample, y_train_undersample)
# 利用训练好的模型对测试样本的数字类别进行预测,预测结果储存在变量y_predict中。
y_predict_undersample = lsvc.predict(X_test_undersample)
D:\Anaconda_new\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
In [136]:
# 使用模型自带的评估函数进行准确性测评。
print ('The Accuracy of Linear SVC is', lsvc.score(X_test_undersample, y_test_undersample))
The Accuracy of Linear SVC is 0.9888653234358431
In [139]:
# 依然使用sklearn.metrics里面的classification_report模块对预测结果做更加详细的分析。
from sklearn.metrics import classification_report
print (classification_report(y_test_undersample, y_predict_undersample, target_names=data["Class"].astype(str)))
             precision    recall  f1-score   support

          1       0.98      1.00      0.99      1918
          1       1.00      0.98      0.99      1854

avg / total       0.99      0.99      0.99      3772

D:\Anaconda_new\lib\site-packages\sklearn\metrics\classification.py:1428: UserWarning: labels size, 2, does not match size of target_names, 26531
  .format(len(labels), len(target_names))

part2: 直接对整个数据进行svm

In [141]:
# 从sklearn.preprocessing里导入数据标准化模块。
from sklearn.preprocessing import StandardScaler

# 从sklearn.svm里导入基于线性假设的支持向量机分类器LinearSVC。
from sklearn.svm import LinearSVC

# 从仍然需要对训练和测试的特征数据进行标准化。
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# 初始化线性假设的支持向量机分类器LinearSVC。
lsvc_2 = LinearSVC()
#进行模型训练
lsvc_2.fit(X_train, y_train)
# 利用训练好的模型对测试样本的数字类别进行预测,预测结果储存在变量y_predict中。
y_predict = lsvc_2.predict(X_test)
D:\Anaconda_new\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
D:\Anaconda_new\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
D:\Anaconda_new\lib\site-packages\sklearn\utils\validation.py:475: DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
D:\Anaconda_new\lib\site-packages\sklearn\utils\validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
In [142]:
# 对原sample上检验
# 使用模型自带的评估函数进行准确性测评。
print ('The Accuracy of Linear SVC is', lsvc_2.score(X_test, y_test))
The Accuracy of Linear SVC is 0.9928391959798994
In [143]:
# 依然使用sklearn.metrics里面的classification_report模块对预测结果做更加详细的分析。
from sklearn.metrics import classification_report
print (classification_report(y_test, y_predict, target_names=data["Class"].astype(str)))
             precision    recall  f1-score   support

          1       0.99      1.00      1.00      6080
          1       1.00      0.97      0.98      1880

avg / total       0.99      0.99      0.99      7960

D:\Anaconda_new\lib\site-packages\sklearn\metrics\classification.py:1428: UserWarning: labels size, 2, does not match size of target_names, 26531
  .format(len(labels), len(target_names))