
Python CA 4

Name : Subham Patra

REG NO : 12215814

# SMS SPAM DETECTION

This notebook loads and cleans an SMS text dataset, applies NLP preprocessing (lower-casing, tokenization, stop-word removal, stemming), builds and compares several classifiers (Naive Bayes variants, Logistic Regression, Random Forest, XGBoost and others), combines the best performers with voting and stacking ensembles to improve accuracy and precision, and saves the final vectorizer and model with pickle.

import numpy as np

import pandas as pd

import warnings

warnings.filterwarnings('ignore')

df = pd.read_csv('spam.csv', encoding='latin1')  # the file is not UTF-8 encoded, so latin1 is used

df.sample(5)

df.shape

## Data Cleaning

df.info()

# drop last 3 columns

df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

df

# rename columns

df.rename(columns={'v1':'target','v2':'text'},inplace=True)

df.head()

# encode target labels as binary (ham -> 0, spam -> 1)

from sklearn.preprocessing import LabelEncoder as LE

encoder=LE()

df['target']=encoder.fit_transform(df['target'])

df.head()

# null values

df.isnull().sum()

# check duplicates
df.duplicated().sum()

# drop duplicates

df=df.drop_duplicates(keep='first')

df.duplicated().sum()

df.shape

# EDA

df['target'].value_counts()

import matplotlib.pyplot as plt

plt.pie(df['target'].value_counts(),labels=['ham','spam'],autopct="%0.2f")

plt.show()

# add new columns (number of characters, words and sentences) for better analysis

!pip install nltk

import nltk

nltk.download('punkt')

df['num_chars']=df['text'].apply(len)

df.sample(3)

# number of words and sentences per message

df['num_words']=df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

df['num_sentences']=df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

df.head()

df[['num_chars','num_words','num_sentences']].describe()

# summary statistics for ham messages

df[df['target']==0][['num_chars','num_words','num_sentences']].describe()

# summary statistics for spam messages

df[df['target']==1][['num_chars','num_words','num_sentences']].describe()

import seaborn as sns

sns.histplot(df[df['target']==0]['num_chars'])

sns.histplot(df[df['target']==1]['num_chars'],color='red')

sns.histplot(df[df['target']==0]['num_words'])

sns.histplot(df[df['target']==1]['num_words'],color='red')

sns.pairplot(df,hue='target')

# restrict to numeric columns so .corr() does not fail on the text columns
sns.heatmap(df.corr(numeric_only=True), annot=True)

# Data Preprocessing

### Lower case

### Tokenization

### Removing special characters

### Removing stop words and punctuation

### Stemming

import nltk
import string

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')

# use a separate name so the stopwords module is not shadowed
stop_words = stopwords.words('english')

puncs = string.punctuation

ps = PorterStemmer()

def transform_text(text):
    # lower case
    text = text.lower()

    # tokenization
    text = nltk.word_tokenize(text)

    # keep only alphanumeric tokens (removes special characters)
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    # remove stop words and punctuation
    text = y[:]
    y.clear()
    for i in text:
        if i not in stop_words + list(puncs):
            y.append(i)

    # stemming
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)
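A quick sanity check (not part of the original notebook) of what transform_text does to a message; the exact output depends on the NLTK tokenizer and stemmer versions:

# illustrative example; the printed output is approximate
sample = "Congratulations!! You've WON a free ticket. Call now!!!"
print(transform_text(sample))
# expected output roughly: "congratul won free ticket call"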

df['transformed_text']=df['text'].apply(transform_text)

df.sample(5)

# !pip install wordcloud

from wordcloud import WordCloud

wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')

# word cloud of spam messages
spam_wc = wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))

plt.figure(figsize=(15,6))
plt.imshow(spam_wc)
plt.show()

# word cloud of ham messages
ham_wc = wc.generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))

plt.figure(figsize=(15,6))
plt.imshow(ham_wc)
plt.show()

spam_words = []
for msg in df[df['target']==1]['transformed_text'].tolist():
    for word in msg.split():
        spam_words.append(word)

len(spam_words)

from collections import Counter

# 30 most common words in spam messages
most_common_spam = pd.DataFrame(Counter(spam_words).most_common(30))
plt.bar(most_common_spam[0], most_common_spam[1])
plt.xticks(rotation='vertical')
plt.show()
ham_words = []
for msg in df[df['target']==0]['transformed_text'].tolist():
    for word in msg.split():
        ham_words.append(word)

len(ham_words)

# 30 most common words in ham messages
most_common_ham = pd.DataFrame(Counter(ham_words).most_common(30))
plt.bar(most_common_ham[0], most_common_ham[1])
plt.xticks(rotation='vertical')
plt.show()

# MODEL BUILDING -> Naive Bayes

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cv = CountVectorizer()  # defined but not used below; TF-IDF features are used instead

tfidf = TfidfVectorizer(max_features=3000)

X = tfidf.fit_transform(df['transformed_text']).toarray()

X.shape
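As a quick check (not in the original notebook), a few of the 3000 learned vocabulary terms can be inspected; get_feature_names_out is available in scikit-learn 1.0 and later:

# show a sample of the TF-IDF vocabulary (illustrative)
print(tfidf.get_feature_names_out()[:10])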

y = df['target'].values

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)
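Since the classes are imbalanced (see the pie chart above), a stratified split is a common alternative to the plain split used here; this variant is a suggestion, not part of the original notebook:

# hypothetical stratified split that preserves the ham/spam ratio in both sets
Xtr_s, Xte_s, ytr_s, yte_s = train_test_split(X, y, test_size=0.2, random_state=2, stratify=y)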

from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

gnb = GaussianNB()

mnb = MultinomialNB()

bnb = BernoulliNB()

# GaussianNB
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

# MultinomialNB (chosen later as the final model)
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

# BernoulliNB
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC

from sklearn.naive_bayes import MultinomialNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import AdaBoostClassifier

from sklearn.ensemble import BaggingClassifier

from sklearn.ensemble import ExtraTreesClassifier

from sklearn.ensemble import GradientBoostingClassifier

!pip install xgboost

from xgboost import XGBClassifier

svc = SVC(kernel='sigmoid', gamma=1.0)

knc = KNeighborsClassifier()

mnb = MultinomialNB()

dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')

rfc = RandomForestClassifier(n_estimators=50, random_state=2)

abc = AdaBoostClassifier(n_estimators=50, random_state=2)

bc = BaggingClassifier(n_estimators=50, random_state=2)

etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)

xgb = XGBClassifier(n_estimators=50,random_state=2)

# fit a classifier and return its test accuracy and precision
def train_classifier(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy, precision

train_classifier(svc, X_train, y_train, X_test, y_test)

clfs = {
    'SVC': svc,
    'KN': knc,
    'NB': mnb,
    'DT': dtc,
    'LR': lrc,
    'RF': rfc,
    'AdaBoost': abc,
    'BgC': bc,
    'ETC': etc,
    'GBDT': gbdt,
    'xgb': xgb
}

accuracy_scores = []
precision_scores = []

for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print("For ", name)
    print("Accuracy - ", current_accuracy)
    print("Precision - ", current_precision)
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

performance_df = pd.DataFrame({'Algorithm': clfs.keys(), 'Accuracy': accuracy_scores, 'Precision': precision_scores}).sort_values('Precision', ascending=False)

performance_df

performance_df1 = pd.melt(performance_df, id_vars="Algorithm")

performance_df1

sns.catplot(x='Algorithm', y='value', hue='variable', data=performance_df1, kind='bar', height=5)
plt.ylim(0.5, 1.0)
plt.xticks(rotation='vertical')
plt.show()

# model improvement

# 1. Change the max_features parameter of TF-IDF
# (each temp_df below records the scores from one re-run of the pipeline above
# with the named change, and is merged into the running comparison table)

temp_df = pd.DataFrame({'Algorithm': clfs.keys(), 'Accuracy_max_ft_3000': accuracy_scores, 'Precision_max_ft_3000': precision_scores}).sort_values('Precision_max_ft_3000', ascending=False)

new_df = performance_df.merge(temp_df, on='Algorithm')

temp_df = pd.DataFrame({'Algorithm': clfs.keys(), 'Accuracy_scaling': accuracy_scores, 'Precision_scaling': precision_scores}).sort_values('Precision_scaling', ascending=False)

new_df_scaled = new_df.merge(temp_df, on='Algorithm')

temp_df = pd.DataFrame({'Algorithm': clfs.keys(), 'Accuracy_num_chars': accuracy_scores, 'Precision_num_chars': precision_scores}).sort_values('Precision_num_chars', ascending=False)

# new_df_scaled.merge(temp_df, on='Algorithm')

# Voting Classifier

svc = SVC(kernel='sigmoid', gamma=1.0, probability=True)  # probability=True is required for soft voting

mnb = MultinomialNB()

etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier

# soft voting averages the predicted class probabilities of the three models
voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)], voting='soft')

voting.fit(X_train,y_train)

y_pred = voting.predict(X_test)

print("Accuracy",accuracy_score(y_test,y_pred))

print("Precision",precision_score(y_test,y_pred))

# Applying stacking

estimators = [('svm', svc), ('nb', mnb), ('et', etc)]

final_estimator = RandomForestClassifier()

from sklearn.ensemble import StackingClassifier

clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy", accuracy_score(y_test, y_pred))

print("Precision", precision_score(y_test, y_pred))

import pickle

# save the fitted TF-IDF vectorizer and the final MultinomialNB model for reuse
pickle.dump(tfidf, open('vectorizer.pkl', 'wb'))
pickle.dump(mnb, open('model.pkl', 'wb'))
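A minimal inference sketch (not in the original notebook) showing how the saved artifacts can be reloaded to classify a new message; the input must go through transform_text first, since the vectorizer was fitted on the preprocessed text:

# reload the saved vectorizer and model and classify one hypothetical message
import pickle

tfidf = pickle.load(open('vectorizer.pkl', 'rb'))
model = pickle.load(open('model.pkl', 'rb'))

msg = "Free entry! Text WIN to claim your prize"  # hypothetical input
vec = tfidf.transform([transform_text(msg)]).toarray()
print("spam" if model.predict(vec)[0] == 1 else "ham")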
