Python CA 4
REG NO : 12215814
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('spam.csv',encoding='latin1')
df.sample(5)
df.shape
## Data Cleaning
df.info()
df
# rename columns
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.head()
# encode the target labels (LabelEncoder maps ham -> 0, spam -> 1)
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target']=encoder.fit_transform(df['target'])
df.head()
# null values
df.isnull().sum()
#check duplicates
df.duplicated().sum()
#drop duplicates
df=df.drop_duplicates(keep='first')
df.duplicated().sum()
df.shape
# EDA
df['target'].value_counts()
plt.pie(df['target'].value_counts(),labels=['ham','spam'],autopct="%0.2f")
plt.show()
import nltk
nltk.download('punkt')
df['num_chars']=df['text'].apply(len)
df.sample(3)
# number of words and sentences per message
df['num_words']=df['text'].apply(lambda x:len(nltk.word_tokenize(x)))
df['num_sentences']=df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))
df.head()
df[['num_chars','num_words','num_sentences']].describe()
# ham messages (target == 0)
df[df['target']==0][['num_chars','num_words','num_sentences']].describe()
# spam messages (target == 1)
df[df['target']==1][['num_chars','num_words','num_sentences']].describe()
sns.histplot(df[df['target']==0]['num_chars'])
sns.histplot(df[df['target']==1]['num_chars'],color='red')
sns.histplot(df[df['target']==0]['num_words'])
sns.histplot(df[df['target']==1]['num_words'],color='red')
sns.pairplot(df,hue='target')
sns.heatmap(df.corr(numeric_only=True),annot=True)
# Data Preprocessing
### Tokenization
### Stemming
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import string
puncs = string.punctuation
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def transform_text(text):
    # lowercase and tokenize
    text = text.lower()
    text = nltk.word_tokenize(text)
    # keep alphanumeric tokens only
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    # drop stopwords and punctuation
    text = y[:]
    y.clear()
    for i in text:
        if i not in stop_words + list(puncs):
            y.append(i)
    # stem what remains and rejoin into a single string
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
    return " ".join(y)
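A quick sanity check on a made-up message (the sample string is illustrative, not from the dataset):
transform_text("Did you WIN the lottery?? Claim your FREE prize now!!!")
# expected output along the lines of: 'win lotteri claim free prize'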
df['transformed_text']=df['text'].apply(transform_text)
df.sample(5)
from wordcloud import WordCloud
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')
spam_wc=wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))
# plt.figure(figsize=(15,6))
plt.imshow(spam_wc)
ham_wc=wc.generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))
# plt.figure(figsize=(15,6))
plt.imshow(ham_wc)
# collect every token from the spam messages
spam_words = []
for msg in df[df['target']==1]['transformed_text'].tolist():
    for word in msg.split():
        spam_words.append(word)
len(spam_words)
from collections import Counter
spam_common = pd.DataFrame(Counter(spam_words).most_common(30))
plt.bar(spam_common[0], spam_common[1])
plt.xticks(rotation='vertical')
plt.show()
# collect every token from the ham messages
ham_words = []
for msg in df[df['target']==0]['transformed_text'].tolist():
    for word in msg.split():
        ham_words.append(word)
len(ham_words)
ham_common = pd.DataFrame(Counter(ham_words).most_common(30))
plt.bar(ham_common[0], ham_common[1])
plt.xticks(rotation='vertical')
plt.show()
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['transformed_text']).toarray()
X.shape
y = df['target'].values
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
bc = BaggingClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)
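The cells below also reference svc, rfc, abc, and etc, which are never instantiated in the code as given. A minimal sketch that fills the gap; the hyperparameters are assumptions chosen to match the n_estimators=50, random_state=2 pattern above, not values recovered from the original:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
svc = SVC(kernel='sigmoid', gamma=1.0)  # kernel/gamma are assumed, not from the original
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)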
def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    return accuracy,precision
train_classifier(svc,X_train,y_train,X_test,y_test)
clfs = {
    'SVC': svc,
    'KN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
}
accuracy_scores = []
precision_scores = []
for name,clf in clfs.items():
    # the call that computes the scores was missing here; restored from train_classifier above
    current_accuracy,current_precision = train_classifier(clf,X_train,y_train,X_test,y_test)
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)
performance_df
# long-form reshape for a grouped bar chart of accuracy vs. precision;
# the plotting call itself was missing, so the catplot below is an assumed reconstruction
performance_df1 = pd.melt(performance_df, id_vars='Algorithm')
sns.catplot(x='Algorithm', y='value', hue='variable', data=performance_df1, kind='bar', height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()
# Model improvement: each temp_df below is meant to hold the scores from re-running
# the training loop after one change (TF-IDF max_features=3000, feature scaling,
# and adding the num_chars feature, respectively), merged into one comparison table
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores}).sort_values('Precision_max_ft_3000',ascending=False)
new_df = performance_df.merge(temp_df,on='Algorithm')
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_scaling':accuracy_scores,'Precision_scaling':precision_scores}).sort_values('Precision_scaling',ascending=False)
new_df_scaled = new_df.merge(temp_df,on='Algorithm')
temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_num_chars':accuracy_scores,'Precision_num_chars':precision_scores}).sort_values('Precision_num_chars',ascending=False)
new_df_scaled.merge(temp_df,on='Algorithm')
# Voting Classifier
mnb = MultinomialNB()
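The voting object fit below is never constructed in the code as given. A minimal sketch, assuming a soft-voting ensemble over three of the individual models; the member list, and probability=True on the SVC, are assumptions:
from sklearn.ensemble import VotingClassifier
svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)  # probability=True is required for soft voting
voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)], voting='soft')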
voting.fit(X_train,y_train)
y_pred = voting.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))
# Applying stacking
final_estimator=RandomForestClassifier()
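The stacking model itself (clf) is missing from the code as given; a minimal sketch, with the base-estimator list assumed to mirror the voting ensemble above:
from sklearn.ensemble import StackingClassifier
estimators = [('svm', svc), ('nb', mnb), ('et', etc)]  # assumed base models
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)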
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))
import pickle
# refit the final model before saving: mnb was re-instantiated for the voting
# ensemble above and never refit on its own
mnb.fit(X_train,y_train)
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))
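For completeness, a sketch of how the two pickles would be used at inference time; the sample message is made up for illustration:
vectorizer = pickle.load(open('vectorizer.pkl','rb'))
model = pickle.load(open('model.pkl','rb'))
msg = transform_text("Congratulations! You have won a FREE ticket. Call now!")
pred = model.predict(vectorizer.transform([msg]).toarray())[0]
print('spam' if pred == 1 else 'ham')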