Project-Password Strength Classifier
Project-Password Strength Classifier
In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv('data.csv',error_bad_lines=False)
In [3]:
data.head(5)
Out[3]:
password strength
0 kzde5577 1
1 kino3434 1
2 visi7k1yr 1
3 megzy123 1
4 lamborghin1 1
In [4]:
data['strength'].unique()
Out[4]:
In [5]:
password strength
0 False False
1 False False
2 False False
3 False False
4 False False
In [6]:
data.isnull().sum()
Out[6]:
password 1
strength 0
dtype: int64
In [7]:
data.isnull().sum()
Out[8]:
password 0
strength 0
dtype: int64
In [9]:
data[data['strength']==0].count()
Out[9]:
password 89701
strength 89701
dtype: int64
In [10]:
data[data['strength']==1].count()
Out[10]:
password 496801
strength 496801
dtype: int64
In [11]:
data[data['strength']==2].count()
Out[11]:
password 83137
strength 83137
dtype: int64
In [12]:
array([['kzde5577', 1],
['kino3434', 1],
['visi7k1yr', 1],
...,
['184520socram', 1],
['marken22a', 1],
['fxx4pw4g', 1]], dtype=object)
In [13]:
(669639, 2)
In [14]:
import random
random.shuffle(password_tuple) #shuffle the array
In [15]:
array([['kzde5577', 1],
['kino3434', 1],
['kzde5577', 1],
...,
['kobeji659', 1],
['kt5tu2o0', 1],
['killi48', 0]], dtype=object)
In [16]:
In [18]:
len(X)
Out[18]:
669639
In [82]:
len(y)
Out[82]:
669639
In [21]:
In [22]:
In [23]:
In [24]:
vectorizer=TfidfVectorizer(tokenizer=word_divide_char)
In [26]:
X = vectorizer.fit_transform(X)
In [27]:
(669639, 132)
In [28]:
vectorizer.get_feature_names()
Out[29]:
['\x02',
'\x05',
'\x06',
'\x08',
'\x0f',
'\x10',
'\x11',
'\x16',
'\x17',
'\x19',
'\x1b',
'\x1c',
'\x1e',
' ',
'!',
'"',
'#',
'$',
In [30]:
X.shape
Out[30]:
(669639, 132)
In [31]:
first_document_vector= X[0]
first document vector
Out[31]:
In [32]:
In [33]:
In [34]:
first_document_vector.T.todense()
[0. ], #Dense matrix representation of Transpose of first_document_vector
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0. ],
[0.56653315],
[0. ],
[0.59175205],
[0 ]
In [35]:
pd.DataFrame(first_document_vector.T.todense(),
index=vectorizer.get_feature_names(),
columns=['Tf-Idf'],
).sort values(by='Tf-Idf',ascending=False)
Out[35]:
Tf-Idf
7 0.591752
5 0.566533
z 0.336021
k 0.292210
d 0.285529
... ...
= 0.000000
< 0.000000
; 0.000000
9 0.000000
™ 0.000000
In [37]:
In [38]:
type(X train)
Out[38]:
scipy.sparse._csr.csr_matrix
In [39]:
X_train.shape
Out[39]:
(535711, 132)
In [40]:
type(y_train)
Out[40]:
list
In [41]:
In [42]:
clf = LogisticRegression(random_state=0,multi_class='multinomial')
In [43]:
▾ LogisticRegression
LogisticRegression(multi_class='multinomial', random_state=0)
In [44]:
y_pred=clf.predict(X_test)
y_pred
Out[44]:
In [45]:
In [46]:
confusion_matrix(y_test,y_pred)
Out[46]:
In [47]:
accuracy_score(y_test,y_pred)
Out[47]:
0.8197538976166299
In [68]:
array([1])
array([0])
In [80]:
array([2])
Complete!!