import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pre_data = pd.read_csv("./preprocessed_data.csv")
data = pre_data.drop(columns=['Company_Name'])
data.head()
data.shape
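# Supplementary check (not in the original notebook): look at the class balance before splitting.
# It motivates the undersampling below; class 1 is assumed to be the fraud/minority class, as in the rest of the code.
print(data['Class'].value_counts())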
X = data.iloc[:, data.columns != 'Class'].values
y = data.iloc[:, data.columns == 'Class'].values.ravel()  # ravel to a 1d array so the classifiers below do not warn about a column vector
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("Original number transactions train dataset: ", len(X_train))
print("Original number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))
# Number of data points in the minority class
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
print(number_records_fraud)
#print(data[data.Class == 1])
# Picking the indices of the normal classes
normal_indices = data[data.Class == 0].index
# Out of the normal indices, randomly select number_records_fraud of them with np.random.choice,
# so that the numbers of fraud and non-fraud indices become equal.
random_normal_indices = np.random.choice(a = normal_indices, size = number_records_fraud, replace = False)
random_normal_indices = np.array(random_normal_indices)
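# Note: the draw above is unseeded, so the undersampled rows change between runs; calling np.random.seed(0)
# before np.random.choice (an optional step, not in the original code) would make it reproducible.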
# Concatenating the two sets of indices
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])
print(under_sample_indices)
# Build the undersampled dataset
under_sample_data = data.iloc[under_sample_indices,:]
X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']
# Showing ratio
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
# Split the undersampled dataset into training and test sets
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))
# Import the data standardisation module from sklearn.preprocessing.
from sklearn.preprocessing import StandardScaler
# Import the linear support vector classifier LinearSVC from sklearn.svm.
from sklearn.svm import LinearSVC
# The training and test feature data still need to be standardised.
ss = StandardScaler()
X_train_undersample = ss.fit_transform(X_train_undersample)
X_test_undersample = ss.transform(X_test_undersample)
# Initialise the linear support vector classifier LinearSVC.
lsvc = LinearSVC()
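# LinearSVC() is used with its default settings here; passing random_state (and raising max_iter if a
# convergence warning appears) are optional tweaks, not part of the original run.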
# Train the model; the label column is ravelled to a 1d array to avoid the column-vector warning.
lsvc.fit(X_train_undersample, y_train_undersample.values.ravel())
# Use the trained model to predict the classes of the test samples; the predictions are stored in y_predict_undersample.
y_predict_undersample = lsvc.predict(X_test_undersample)
# Evaluate the accuracy with the model's built-in score function.
print ('The Accuracy of Linear SVC is', lsvc.score(X_test_undersample, y_test_undersample))
# Use the classification_report module from sklearn.metrics for a more detailed analysis of the predictions.
from sklearn.metrics import classification_report
# target_names needs one label per class (0 = normal, 1 = fraud), not the whole label column.
print(classification_report(y_test_undersample, y_predict_undersample, target_names=['normal', 'fraud']))
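# Supplementary to the report above: a confusion matrix on the (balanced) undersampled test set,
# assuming labels 0 = normal and 1 = fraud as elsewhere in this notebook.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test_undersample, y_predict_undersample))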
# Import the data standardisation module from sklearn.preprocessing.
from sklearn.preprocessing import StandardScaler
# Import the linear support vector classifier LinearSVC from sklearn.svm.
from sklearn.svm import LinearSVC
# The training and test feature data still need to be standardised.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# Initialise the linear support vector classifier LinearSVC.
lsvc_2 = LinearSVC()
# Train the model on the full (imbalanced) training set.
lsvc_2.fit(X_train, y_train)
# Use the trained model to predict the classes of the test samples; the predictions are stored in y_predict.
y_predict = lsvc_2.predict(X_test)
# Evaluate on the original (full) test set.
# Evaluate the accuracy with the model's built-in score function.
print ('The Accuracy of Linear SVC is', lsvc_2.score(X_test, y_test))
# Use the classification_report module from sklearn.metrics for a more detailed analysis of the predictions.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict, target_names=['normal', 'fraud']))
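# Supplementary to the report above: on the original imbalanced test set, accuracy is dominated by the
# majority class, so the confusion matrix and the recall on the fraud class (label 1) are more informative.
from sklearn.metrics import confusion_matrix, recall_score
print(confusion_matrix(y_test, y_predict))
print("Recall on the fraud class:", recall_score(y_test, y_predict))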