Credit_Card_fraud_detection Using ML - Jupyter Notebook2
Credit_Card_fraud_detection Using ML - Jupyter Notebook2
In [24]:
!pip install scikit-learn
In [2]:
In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler as SS
In [2]:
#LOAD THE DATASET USING PANDAS LIBRARY
In [3]:
# Read the transactions file from the working directory into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='creditcard.csv')
In [4]:
#VIEWING THE DATASET USING head() and tail()
In [5]:
# Preview the first five rows (the default for head()).
dataset.head()
Out[5]:
Time V1 V2 V3 V4 V5 V6 V7 V8
5 rows × 31 columns
In [6]:
# Preview the last five rows (the default for tail()).
dataset.tail()
Out[6]:
Time V1 V2 V3 V4 V5 V6 V7
5 rows × 31 columns
In [7]:
#VIEW THE SHAPE OF THE DATASET
In [14]:
# (rows, columns) of the frame.
dataset.shape
Out[14]:
(284807, 31)
In [ ]:
#DROPPING THE TIME COLUMN
In [8]:
# Remove the 'Time' column. errors='ignore' keeps the cell idempotent: running it
# again after the column is already gone is a no-op instead of a KeyError.
dataset = dataset.drop(columns=['Time'], errors='ignore')
In [9]:
# Confirm the column count after dropping 'Time'.
dataset.shape
Out[9]:
(284807, 30)
In [10]:
#CHECKING FOR NULL VALUES IN THE DATASET
In [11]:
# Per-column count of missing values (isna is the canonical name for isnull).
dataset.isna().sum()
Out[11]:
V1 0
V2 0
V3 0
V4 0
V5 0
V6 0
V7 0
V8 0
V9 0
V10 0
V11 0
V12 0
V13 0
V14 0
V15 0
V16 0
V17 0
V18 0
V19 0
V20 0
V21 0
V22 0
V23 0
V24 0
V25 0
V26 0
V27 0
V28 0
Amount 0
Class 0
dtype: int64
In [12]:
#INFORMATION ABOUT DATASET FEATURES
In [13]:
# Print column names, non-null counts, dtypes and memory usage.
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 V1 284807 non-null float64
1 V2 284807 non-null float64
2 V3 284807 non-null float64
3 V4 284807 non-null float64
4 V5 284807 non-null float64
5 V6 284807 non-null float64
6 V7 284807 non-null float64
7 V8 284807 non-null float64
8 V9 284807 non-null float64
9 V10 284807 non-null float64
10 V11 284807 non-null float64
11 V12 284807 non-null float64
12 V13 284807 non-null float64
13 V14 284807 non-null float64
14 V15 284807 non-null float64
15 V16 284807 non-null float64
16 V17 284807 non-null float64
17 V18 284807 non-null float64
18 V19 284807 non-null float64
19 V20 284807 non-null float64
20 V21 284807 non-null float64
21 V22 284807 non-null float64
22 V23 284807 non-null float64
23 V24 284807 non-null float64
24 V25 284807 non-null float64
25 V26 284807 non-null float64
26 V27 284807 non-null float64
27 V28 284807 non-null float64
28 Amount 284807 non-null float64
29 Class 284807 non-null int64
dtypes: float64(29), int64(1)
memory usage: 65.2 MB
In [14]:
#STATISTICAL MEASURES OF THE DATASET FEATURES
In [15]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
dataset.describe()
Out[15]:
V1 V2 V3 V4 V5 V6
8 rows × 30 columns
In [16]:
#COUNTING THE TARGET VALUES OF VALID AND FRAUD TRANSACTIONS
In [17]:
# Number of rows per target label (0 and 1).
dataset.Class.value_counts()
Out[17]:
0 284315
1 492
Name: Class, dtype: int64
In [18]:
#SEPARATING VALID AND FRAUD TRANSACTIONS FOR ANALYSIS
In [20]:
# Partition the rows by label: Class == 0 goes to `real`, Class == 1 to `fraud`.
real = dataset.loc[dataset['Class'].eq(0)]
fraud = dataset.loc[dataset['Class'].eq(1)]
In [21]:
#VIEW THE SHAPES OF VALID AND FRAUD TRANSACTIONS DATA
In [22]:
# Show the (rows, columns) of the two subsets side by side.
print(real.shape,fraud.shape)
In [23]:
#STATISTICAL MEASURES ON VALID AND FRAUD DATA - AMOUNT COLUMN
In [24]:
# Summary statistics of the Amount column for the Class == 0 rows.
real['Amount'].describe()
Out[24]:
count 284315.000000
mean 88.291022
std 250.105092
min 0.000000
25% 5.650000
50% 22.000000
75% 77.050000
max 25691.160000
Name: Amount, dtype: float64
In [25]:
# Summary statistics of the Amount column for the Class == 1 rows.
fraud['Amount'].describe()
Out[25]:
count 492.000000
mean 122.211321
std 256.683288
min 0.000000
25% 1.000000
50% 9.250000
75% 105.890000
max 2125.870000
Name: Amount, dtype: float64
In [26]:
In [27]:
# Undersample the majority class to the same number of rows as the fraud subset.
# Sizing from len(fraud) avoids a hard-coded count, and a fixed random_state makes
# the drawn sample (and everything trained on it) reproducible across runs.
real_trans = real.sample(n=len(fraud), random_state=42)
In [28]:
#SHAPE OF CONVERTED VALID TRANSACTIONS DATA
In [29]:
# Check the size of the undersampled subset.
print(real_trans.shape)
(492, 30)
In [30]:
#CONCATENATING THE NEW VALID TRANSACTIONS DATA AND FRAUD DATA TO FORM NEW DATASET
In [31]:
# Stack the sampled Class == 0 rows on top of the fraud rows (row-wise concatenation).
new_dataset = pd.concat((real_trans, fraud), axis='index')
In [32]:
# (rows, columns) of the combined frame.
new_dataset.shape
Out[32]:
(984, 30)
In [33]:
# Plain-text dump of the combined frame (pandas truncates the middle rows and wraps wide columns).
print(new_dataset)
V1 V2 V3 V4 V5 V6 V7
\
12944 1.244831 -0.323488 0.669388 -0.759828 -0.533733 0.174007 -0.728925
197932 -2.553810 -0.083991 -1.936271 -1.333610 1.151432 -1.256007 0.601789
118594 1.274797 0.134625 0.175297 0.316727 -0.072007 -0.208877 -0.098056
62022 -2.494865 2.322834 -0.348792 0.227640 -2.164643 -0.456763 -0.424845
33852 1.234848 0.693741 -0.570387 0.812455 0.402725 -0.960529 0.412969
... ... ... ... ... ... ... ...
279863 -1.927883 1.125653 -4.518331 1.749293 -1.566487 -2.010494 -0.882850
280143 1.378559 1.289381 -5.004247 1.411850 0.442581 -1.326536 -1.413170
280149 -0.676143 1.126366 -2.213700 0.468308 -1.120541 -0.003346 -2.234739
281144 -3.113832 0.585864 -5.399730 1.817092 -0.840618 -2.943548 -2.208002
281674 1.991976 0.158476 -2.583441 0.408670 1.151147 -0.096695 0.223050
In [34]:
In [35]:
# Features are every column except the target; selecting by name instead of by
# position keeps this correct even if 'Class' is not the last column.
X = new_dataset.drop(columns='Class')
# Target labels.
y = new_dataset['Class']
In [36]:
# Plain-text dump of the feature frame followed by the label series.
print(X)
print(y)
V1 V2 V3 V4 V5 V6 V7
\
12944 1.244831 -0.323488 0.669388 -0.759828 -0.533733 0.174007 -0.728925
197932 -2.553810 -0.083991 -1.936271 -1.333610 1.151432 -1.256007 0.601789
118594 1.274797 0.134625 0.175297 0.316727 -0.072007 -0.208877 -0.098056
62022 -2.494865 2.322834 -0.348792 0.227640 -2.164643 -0.456763 -0.424845
33852 1.234848 0.693741 -0.570387 0.812455 0.402725 -0.960529 0.412969
... ... ... ... ... ... ... ...
279863 -1.927883 1.125653 -4.518331 1.749293 -1.566487 -2.010494 -0.882850
280143 1.378559 1.289381 -5.004247 1.411850 0.442581 -1.326536 -1.413170
280149 -0.676143 1.126366 -2.213700 0.468308 -1.120541 -0.003346 -2.234739
281144 -3.113832 0.585864 -5.399730 1.817092 -0.840618 -2.943548 -2.208002
281674 1.991976 0.158476 -2.583441 0.408670 1.151147 -0.096695 0.223050
In [37]:
#DATA STANDARDIZATION USING THE STANDARDSCALER CLASS
In [38]:
#data standardization - rescaling each feature to zero mean and unit variance
In [39]:
scaler = SS()
# fit_transform() learns the per-feature mean/std and applies them in one step,
# so a separate scaler.fit(X) beforehand is redundant.
# NOTE(review): the scaler is fitted on all rows of X before the train/test split,
# so statistics from the held-out rows influence the scaling — consider fitting on
# the training rows only.
standard_X = scaler.fit_transform(X)
In [40]:
# Show the standardized feature array.
print(standard_X)
In [41]:
#SPLITTING THE DATASET INTO TRAINING AND TESTING
In [42]:
# Hold out 10% of the rows for testing; stratify=y keeps the class ratio the same
# in both parts.
# NOTE(review): the original line is cut off after "random_" in this export; the
# seed below is a placeholder — restore the original random_state value if known.
x_train, x_test, y_train, y_test = train_test_split(
    standard_X, y, test_size=0.1, stratify=y, random_state=42
)
In [43]:
# Shapes of the train/test features and labels.
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)
In [44]:
#MODEL TRAINING - LOGISTIC REGRESSION
In [45]:
# Logistic regression with default hyper-parameters, trained on the training split.
classifier = LogisticRegression()
classifier.fit(X=x_train, y=y_train)
Out[45]:
▾ LogisticRegression
LogisticRegression()
In [46]:
In [47]:
# Predictions on the rows the model was trained on.
train_acc = classifier.predict(x_train)
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
train_acc_score = accuracy_score(y_train, train_acc)
print("The Accuracy on training data is :", train_acc_score)
In [48]:
#PREDICT THE OUTPUTS USING TEST DATA ON THE MODEL
In [49]:
# Predicted labels for the held-out test rows.
y_pred=classifier.predict(x_test)
In [50]:
# Show the predicted labels.
print(y_pred)
[1 0 0 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 0 0 1
0 0 1 1 0 1 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0
0 0 1 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1]
In [51]:
# Show the true labels of the test rows for comparison with the predictions.
print(y_test)
198868 1
107690 0
217010 0
269315 0
146418 0
..
221515 0
30100 1
123238 1
166719 0
262560 1
Name: Class, Length: 99, dtype: int64
In [52]:
#CHECKING THE ACCURACY OF PREDICTED OUTPUTS BY THE MODEL
In [53]:
# Fraction of test rows whose predicted label matches the true label.
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
accuracy = accuracy_score(y_test, y_pred)
print("The Accuracy Score of the model is : ", accuracy)
In [56]:
In [54]:
# One transaction to classify: 29 values in the training feature order
# (V1..V28 followed by Amount). The list literal is now closed; without the
# closing bracket the cell is a syntax error.
input_data = [
    -1.359807134, -0.072781173, 2.536346738, 1.378155224, -0.33832077,
    0.462387778, 0.239598554, 0.098697901, 0.3637869, 0.090794172, -0.551599533,
    -0.617800856, -0.991389847, -0.311169354, 1.468176972, -0.470400525,
    0.207971242, 0.02579058, 0.40399296, 0.251412098, -0.018306778, 0.277837576,
    -0.11047391, 0.066928075, 0.128539358, -0.189114844, 0.133558377, -0.21053053,
    149.62,
]
In [55]:
#CHANGING THIS INPUT DATA INTO NUMPY ARRAY
In [56]:
# Turn the Python list into a 1-D array, then view it as a single row (1, n_features),
# the 2-D layout that transform()/predict() expect for one sample.
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = np.reshape(input_data_as_numpy_array, (1, -1))
In [57]:
#STANDARDIZE THE RESHAPED ARRAY DATA
In [58]:
# The scaler was fitted on a DataFrame, so it remembers the column names. Passing a
# bare ndarray makes scikit-learn warn that X has no valid feature names; wrapping
# the row with the fitted names avoids that and guards against a column-order mix-up.
stand_input_data = scaler.transform(
    pd.DataFrame(input_data_reshaped, columns=scaler.feature_names_in_)
)
print(stand_input_data)
In [59]:
#PREDICT THE CLASS OF THIS UNKNOWN DATA
In [60]:
# Logistic-regression prediction for the single standardized row.
prediction_label=classifier.predict(stand_input_data)
In [61]:
# predict() returns an array with one label per input row; index the single
# prediction explicitly rather than relying on the truthiness of the array.
if prediction_label[0] == 1:
    print("Fraud Transaction..............👎")
else:
    print("Valid Transaction...............👍")
Valid Transaction...............👍
In [62]:
#-----------LET US FIT THE RandomForestClassifier ON THE SAME DATA AND CHECK THE ACCURACY AND PREDICTION
In [63]:
# Random forest with default hyper-parameters; a fixed random_state makes the
# bootstrap sampling and feature selection (and therefore the scores) reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train, y_train)
# Predicted labels for the held-out test rows.
pred_rfc = rfc.predict(x_test)
print("Predicted lables using RFC\n", pred_rfc)
In [64]:
# Test-set accuracy of the random forest.
# accuracy_score is documented as (y_true, y_pred): pass the true labels first.
acc_score_rfc = accuracy_score(y_test, pred_rfc)
print("Accuracy Score using RFC is :", acc_score_rfc)
In [65]:
# Random-forest prediction for the same standardized row.
prediction_label_rfc=rfc.predict(stand_input_data)
In [66]:
# predict() returns an array with one label per input row; index the single
# prediction explicitly rather than relying on the truthiness of the array.
if prediction_label_rfc[0] == 1:
    print("Fraud Transaction..............👎")
else:
    print("Valid Transaction...............👍")
Valid Transaction...............👍
In [ ]: