
# 1
import pandas as pd
import re
from textblob import TextBlob
import matplotlib.pyplot as plt
# 2
# Load dataset
file_path = '/content/Nri_Textual_Survey_Data.csv'  # Replace with your file path
survey_data = pd.read_csv(file_path)
# 3
# ### 1. Text Preprocessing ###

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Updated preprocessing function that also removes stop words


def preprocess_text_with_stopwords(text):
    """Cleans text: lowercases, strips URLs, HTML, punctuation, digits,
    extra whitespace, non-ASCII characters, and stop words."""
    try:
        text = str(text).lower()
        # Remove URLs and HTML tags first, while their punctuation is still intact
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\d+', '', text)  # Remove digits
        text = re.sub(r'\n', ' ', text)  # Replace newlines with spaces
        text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
        text = re.sub(r'\s+', ' ', text).strip()  # Collapse extra spaces

        # Remove stop words
        text = " ".join(word for word in text.split()
                        if word not in ENGLISH_STOP_WORDS)
        return text
    except Exception:
        return text

# Apply updated preprocessing
processed_data = survey_data.copy()
for col in processed_data.columns:
    processed_data[col] = processed_data[col].apply(preprocess_text_with_stopwords)
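
# Quick sanity check of the cleaning function on a made-up string (not from
# the dataset); it should print something like "hostel wifi slow"
sample = "The hostel Wi-Fi at https://example.com is <b>very</b> slow!!!"
print(preprocess_text_with_stopwords(sample))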
# 4
# 4
processed_data.to_csv('preprocessed_data.csv', index=False)  # Save to Colab environment

# Download the file
from google.colab import files
files.download('preprocessed_data.csv')
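
# google.colab is only importable inside Colab; if this script may also run
# locally, a guarded download is one option (a sketch):
try:
    from google.colab import files
    files.download('preprocessed_data.csv')
except ImportError:
    print("Not in Colab; preprocessed_data.csv saved to the working directory.")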
# 5
### 2. Sentiment Analysis ###
def analyze_sentiment(text):
    """Classifies sentiment as 'happy', 'neutral', or 'unhappy'."""
    try:
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity  # Polarity ranges from -1 (negative) to 1 (positive)
        if polarity > 0:
            return 'happy'
        elif polarity == 0:
            return 'neutral'
        else:
            return 'unhappy'
    except Exception:
        return 'neutral'

# Add sentiment columns for each facility
sentiment_data = processed_data.copy()
for col in processed_data.columns:
    sentiment_data[col + '_sentiment'] = sentiment_data[col].apply(analyze_sentiment)
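
# Illustration of how TextBlob polarity maps onto the three classes, using
# hypothetical sentences (not from the survey):
for s in ["The library is excellent", "The canteen is on campus",
          "The hostel food is terrible"]:
    print(s, '->', TextBlob(s).sentiment.polarity, '->', analyze_sentiment(s))
# "excellent" scores positive (happy), "terrible" negative (unhappy), and the
# middle sentence has no polar words, so it lands at 0.0 (neutral)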
# 6
### 3. Sentiment Analysis Summary ###
# Count sentiments for each facility
facility_sentiment_cols = [col for col in sentiment_data.columns if '_sentiment' in col]
sentiment_summary = (
    sentiment_data[facility_sentiment_cols]
    .apply(pd.Series.value_counts)
    .fillna(0)
    .astype(int)
    .T
)
# value_counts() does not guarantee row order, so fix the column order
# explicitly instead of assigning names positionally
sentiment_summary = sentiment_summary.reindex(columns=['happy', 'neutral', 'unhappy'], fill_value=0)

# Overall sentiment counts
overall_sentiment_counts = sentiment_summary.sum()
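
# Sanity check: each facility's counts should sum to the number of responses
print(sentiment_summary.head())
print(sentiment_summary.sum(axis=1).eq(len(sentiment_data)).all())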
# 7
### Visualization ###
# Overall Sentiment Distribution (Bar and Pie Charts)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
overall_sentiment_counts.plot(kind='bar', color=['green', 'orange', 'red'], ax=axes[0])
axes[0].set_title('Overall Sentiment Distribution (Bar Chart)')
axes[0].set_xlabel('Sentiment')
axes[0].set_ylabel('Count')
overall_sentiment_counts.plot(kind='pie', autopct='%1.1f%%', colors=['green', 'orange', 'red'], ax=axes[1])
axes[1].set_title('Overall Sentiment Distribution (Pie Chart)')
axes[1].set_ylabel('')
plt.tight_layout()
plt.show()
# 8
# Facility-Wise Sentiment Distribution (Stacked Bar Chart)
sentiment_summary.plot(
    kind='bar',
    stacked=True,
    figsize=(12, 8),
    title='Facility-Wise Sentiment Distribution',
    color=['green', 'orange', 'red']
)
plt.xlabel('Facilities')
plt.ylabel('Count')
plt.legend(title='Sentiment')
plt.show()
# 9
# Individual Facility Sentiment Pie Charts
rows = (len(sentiment_summary) // 3) + (1 if len(sentiment_summary) % 3 else 0)
fig, axes = plt.subplots(rows, 3, figsize=(18, 5 * rows))

axes = axes.flatten()
for idx, facility in enumerate(sentiment_summary.index):
    sentiment_summary.loc[facility].plot(
        kind='pie',
        ax=axes[idx],
        autopct='%1.1f%%',
        colors=['green', 'orange', 'red'],
        title=f'{facility} Sentiment Distribution'
    )
    axes[idx].set_ylabel('')

# Hide any unused subplot axes
for ax in axes[len(sentiment_summary):]:
    ax.axis('off')

plt.tight_layout()
plt.show()
# 10
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd

# Assuming 'processed_data' contains the preprocessed text data

# Dynamically set the column name
column_name = 'Internship'  # Replace this with your desired column

# Bag-of-Words Extraction
bow_vectorizer = CountVectorizer(max_features=1000)  # Limit vocabulary size
bow_features = bow_vectorizer.fit_transform(processed_data[column_name])  # Use dynamic column name
bow_term_frequencies = bow_features.sum(axis=0).A1  # Convert sparse matrix to array

# Create BoW DataFrame
bow_term_df = pd.DataFrame({
    'Term': bow_vectorizer.get_feature_names_out(),
    'Frequency': bow_term_frequencies
}).sort_values(by='Frequency', ascending=False)

# TF-IDF Extraction
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limit vocabulary size
tfidf_features = tfidf_vectorizer.fit_transform(processed_data[column_name])  # Use dynamic column name
tfidf_term_scores = tfidf_features.sum(axis=0).A1  # Convert sparse matrix to array

# Create TF-IDF DataFrame
tfidf_term_df = pd.DataFrame({
    'Term': tfidf_vectorizer.get_feature_names_out(),
    'TF-IDF Score': tfidf_term_scores
}).sort_values(by='TF-IDF Score', ascending=False)

# Merge BoW and TF-IDF for comparison
comparison_df = pd.merge(
    bow_term_df.rename(columns={"Frequency": "Frequency_BoW"}).head(20),
    tfidf_term_df.rename(columns={"TF-IDF Score": "Frequency_TF-IDF"}).head(20),
    on="Term",
    how="outer"
).fillna(0)

# Sort by Bag-of-Words frequency for consistency
comparison_df = comparison_df.sort_values(by="Frequency_BoW", ascending=False)
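
# Eyeballing the top terms from each representation before plotting helps
# verify the vocabulary looks sensible (output depends on the survey text):
print(bow_term_df.head(5).to_string(index=False))
print(tfidf_term_df.head(5).to_string(index=False))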

# Plot the comparison graph
plt.figure(figsize=(12, 8))

# Bar width for side-by-side bars
bar_width = 0.35
index = range(len(comparison_df))

# Bag-of-Words bars
plt.bar(index, comparison_df["Frequency_BoW"], bar_width, label="Bag-of-Words", color="skyblue")

# TF-IDF bars
plt.bar([i + bar_width for i in index], comparison_df["Frequency_TF-IDF"], bar_width, label="TF-IDF", color="orange")

# Add labels and title
plt.xlabel("Terms")
plt.ylabel("Frequency / TF-IDF Score")
plt.title(f"Comparison of Bag-of-Words and TF-IDF Representations for '{column_name}'")
plt.xticks([i + bar_width / 2 for i in index], comparison_df["Term"], rotation=45, ha="right")
plt.legend()
plt.tight_layout()
plt.show()

# 11
from sklearn.model_selection import train_test_split

# Target variable based on the dynamic column name
target_column_name = column_name + '_sentiment'  # Append '_sentiment' dynamically
y = sentiment_data[target_column_name]  # Use the dynamic sentiment column name

# Split for BoW features
X_train_bow, X_test_bow, y_train, y_test = train_test_split(
    bow_features, y, test_size=0.2, random_state=42
)

# Split for TF-IDF features (same random_state and test_size, so the rows
# align with the BoW split and y_train/y_test can be shared)
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(
    tfidf_features, y, test_size=0.2, random_state=42
)
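
# TextBlob-derived labels are often imbalanced, which inflates plain accuracy;
# a quick look at the class proportions in each split (a sketch):
print(y_train.value_counts(normalize=True).round(2))
print(y_test.value_counts(normalize=True).round(2))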

# 12
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Initialize one set of models per feature representation: a model fitted on
# BoW features cannot be meaningfully evaluated on TF-IDF features, since the
# two vectorizers produce different scalings
log_reg = LogisticRegression(max_iter=1000, random_state=42)
random_forest = RandomForestClassifier(random_state=42)
svm = SVC(kernel='linear', random_state=42)

log_reg_tfidf = LogisticRegression(max_iter=1000, random_state=42)
random_forest_tfidf = RandomForestClassifier(random_state=42)
svm_tfidf = SVC(kernel='linear', random_state=42)

# Train models on BoW features
log_reg.fit(X_train_bow, y_train)
random_forest.fit(X_train_bow, y_train)
svm.fit(X_train_bow, y_train)

# Train models on TF-IDF features
log_reg_tfidf.fit(X_train_tfidf, y_train)
random_forest_tfidf.fit(X_train_tfidf, y_train)
svm_tfidf.fit(X_train_tfidf, y_train)
# 13
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score

# Dynamically calculate model accuracies, using the models trained on the
# matching feature representation
bow_accuracies = [
    accuracy_score(y_test, log_reg.predict(X_test_bow)),
    accuracy_score(y_test, random_forest.predict(X_test_bow)),
    accuracy_score(y_test, svm.predict(X_test_bow))
]

tfidf_accuracies = [
    accuracy_score(y_test, log_reg_tfidf.predict(X_test_tfidf)),
    accuracy_score(y_test, random_forest_tfidf.predict(X_test_tfidf)),
    accuracy_score(y_test, svm_tfidf.predict(X_test_tfidf))
]
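
# Printing the raw numbers alongside the chart keeps the comparison
# reproducible in text form:
for name, a_bow, a_tfidf in zip(['Logistic Regression', 'Random Forest', 'SVM'],
                                bow_accuracies, tfidf_accuracies):
    print(f"{name}: BoW={a_bow:.3f}, TF-IDF={a_tfidf:.3f}")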

# Plotting model accuracy comparison
model_names = ['Logistic Regression', 'Random Forest', 'SVM']
x = range(len(model_names))
bar_width = 0.35

plt.figure(figsize=(10, 6))
plt.bar(x, bow_accuracies, width=bar_width, label='BoW', color='skyblue')
plt.bar([i + bar_width for i in x], tfidf_accuracies, width=bar_width, label='TF-IDF', color='orange')

# Add labels and title
plt.xticks([i + bar_width / 2 for i in x], model_names)
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison (BoW vs. TF-IDF)')
plt.legend()
plt.tight_layout()
plt.show()
# 14
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score

# Dynamically calculate metrics for a specific model (e.g., Logistic Regression)
bow_scores = [
    precision_score(y_test, log_reg.predict(X_test_bow), average='weighted'),
    recall_score(y_test, log_reg.predict(X_test_bow), average='weighted'),
    f1_score(y_test, log_reg.predict(X_test_bow), average='weighted')
]

tfidf_scores = [
    precision_score(y_test, log_reg_tfidf.predict(X_test_tfidf), average='weighted'),
    recall_score(y_test, log_reg_tfidf.predict(X_test_tfidf), average='weighted'),
    f1_score(y_test, log_reg_tfidf.predict(X_test_tfidf), average='weighted')
]

# Plot grouped bar chart
metrics = ['Precision', 'Recall', 'F1-Score']
x = np.arange(len(metrics))
bar_width = 0.35

plt.figure(figsize=(10, 6))
plt.bar(x, bow_scores, width=bar_width, label='BoW', color='skyblue')
plt.bar(x + bar_width, tfidf_scores, width=bar_width, label='TF-IDF', color='orange')

# Add labels and title
plt.xticks(x + bar_width / 2, metrics)
plt.ylabel('Score')
plt.title('Model Performance Metrics (BoW vs. TF-IDF)')
plt.legend()
plt.tight_layout()
plt.show()

# 15
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Predictions for BoW features
y_pred_log_reg_bow = log_reg.predict(X_test_bow)
y_pred_rf_bow = random_forest.predict(X_test_bow)
y_pred_svm_bow = svm.predict(X_test_bow)

# Predictions for TF-IDF features (using the TF-IDF-trained models)
y_pred_log_reg_tfidf = log_reg_tfidf.predict(X_test_tfidf)
y_pred_rf_tfidf = random_forest_tfidf.predict(X_test_tfidf)
y_pred_svm_tfidf = svm_tfidf.predict(X_test_tfidf)

# Confusion matrices
cm_log_reg_bow = confusion_matrix(y_test, y_pred_log_reg_bow)
cm_rf_bow = confusion_matrix(y_test, y_pred_rf_bow)
cm_svm_bow = confusion_matrix(y_test, y_pred_svm_bow)

cm_log_reg_tfidf = confusion_matrix(y_test, y_pred_log_reg_tfidf)
cm_rf_tfidf = confusion_matrix(y_test, y_pred_rf_tfidf)
cm_svm_tfidf = confusion_matrix(y_test, y_pred_svm_tfidf)

# Plotting all confusion matrices
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Titles for the matrices
titles = [
    "Logistic Regression (BoW)", "Random Forest (BoW)", "SVM (BoW)",
    "Logistic Regression (TF-IDF)", "Random Forest (TF-IDF)", "SVM (TF-IDF)"
]

# All confusion matrices
conf_matrices = [
    cm_log_reg_bow, cm_rf_bow, cm_svm_bow,
    cm_log_reg_tfidf, cm_rf_tfidf, cm_svm_tfidf
]

# Plotting each heatmap
for i, ax in enumerate(axes.flat):
    sns.heatmap(
        conf_matrices[i], annot=True, fmt='d', cmap='Blues',
        xticklabels=['Happy', 'Neutral', 'Unhappy'],
        yticklabels=['Happy', 'Neutral', 'Unhappy'], ax=ax
    )
    ax.set_title(titles[i])
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

plt.tight_layout()
plt.show()

# 16
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Use the already defined `processed_data` from your script.

# Step 2: Combine text from all columns to build a unified vocabulary
combined_text_all = processed_data.apply(
    lambda row: ' '.join(row.astype(str)), axis=1
)

# Fit the TF-IDF vectorizer on the combined text
tfidf_vectorizer_all = TfidfVectorizer()
tfidf_vectorizer_all.fit(combined_text_all)

# Step 3: Transform each column using the unified vocabulary
tfidf_vectors_all = {
    col: tfidf_vectorizer_all.transform(processed_data[col].astype(str))
    for col in processed_data.columns
}

# Step 4: Compute pairwise cosine similarity for all labels
similarity_matrix_all = np.zeros((len(processed_data.columns), len(processed_data.columns)))

for i, col1 in enumerate(processed_data.columns):
    for j, col2 in enumerate(processed_data.columns):
        if i == j:  # Self-similarity
            similarity_matrix_all[i, j] = 1.0
        else:  # Pairwise similarity, averaged over all response pairs
            similarity_matrix_all[i, j] = cosine_similarity(
                tfidf_vectors_all[col1], tfidf_vectors_all[col2]
            ).mean()
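
# Lightweight checks on the result: the matrix should be symmetric with a
# unit diagonal, since cosine similarity is symmetric
assert np.allclose(np.diag(similarity_matrix_all), 1.0)
assert np.allclose(similarity_matrix_all, similarity_matrix_all.T)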

# Step 5: Visualize the similarity matrix for all labels
plt.figure(figsize=(12, 10))
sns.heatmap(
    similarity_matrix_all,
    xticklabels=processed_data.columns,
    yticklabels=processed_data.columns,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    annot_kws={"size": 10},  # Customize annotation font size
    cbar_kws={"shrink": 0.8, "label": "Similarity Score"}  # Color bar customization
)
plt.title("Text Similarity Between All Labels (Cosine Similarity)", fontsize=16)
plt.xlabel("Labels", fontsize=12)
plt.ylabel("Labels", fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)  # Rotate x-axis labels for readability
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()

# Step 6: Identify the most and least similar pairs across all labels
similarity_df_all = pd.DataFrame(
    similarity_matrix_all,
    index=processed_data.columns,
    columns=processed_data.columns
)

# Melt the matrix for pairwise comparison
similarity_melted_all = similarity_df_all.reset_index().melt(
    id_vars='index',
    var_name='Label 2',
    value_name='Similarity'
).rename(columns={'index': 'Label 1'})

# Remove self-similarity (diagonal values)
similarity_melted_all = similarity_melted_all[
    similarity_melted_all['Label 1'] != similarity_melted_all['Label 2']
]

# Sort for most and least similar pairs
most_similar_all = similarity_melted_all.sort_values(by='Similarity', ascending=False).head(1)
least_similar_all = similarity_melted_all.sort_values(by='Similarity', ascending=True).head(1)

# Output results
print("Most Similar Pair:")
print(most_similar_all)

print("\nLeast Similar Pair:")
print(least_similar_all)
# 17
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
from tqdm import tqdm
import matplotlib.pyplot as plt

# Step 1: Load and Preprocess Dataset
file_path = '/content/Nri_Textual_Survey_Data.csv'  # Replace with your dataset path
data = pd.read_csv(file_path)

# Preprocess text
def preprocess_text_dl(text_dl):
    """Clean text."""
    text_dl = str(text_dl).lower()
    text_dl = re.sub(r'[^\w\s]', '', text_dl)  # Remove punctuation
    text_dl = re.sub(r'\s+', ' ', text_dl).strip()  # Remove extra spaces
    return text_dl

# Apply preprocessing to the text column dynamically
text_column = data.columns[0]  # Dynamically use the first column as text
data[text_column] = data[text_column].apply(preprocess_text_dl)

# Analyze sentiment dynamically (renamed to avoid shadowing the earlier
# analyze_sentiment, which returns string labels rather than integers)
def analyze_sentiment_dl(text_dl):
    """Classify sentiment using polarity: 0 = happy, 1 = neutral, 2 = unhappy."""
    from textblob import TextBlob
    try:
        polarity = TextBlob(text_dl).sentiment.polarity
        if polarity > 0:
            return 0  # Happy
        elif polarity == 0:
            return 1  # Neutral
        else:
            return 2  # Unhappy
    except Exception:
        return 1  # Default to neutral on failure

data['label'] = data[text_column].apply(analyze_sentiment_dl)

# Step 2: Split Data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data[text_column], data['label'], test_size=0.2, random_state=42
)

# Step 3: Tokenize Using BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
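
# Pulling one batch confirms the tensor shapes before committing to training
# (shapes below assume batch_size=16 and max_len=128):
batch = next(iter(train_loader))
print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['labels'].shape)
# Expected: torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16])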

# Step 4: Define BERT Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)

# Step 5: Train Model
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 1
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

# Step 6: Evaluate Model
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Confusion matrix
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Happy", "Neutral", "Unhappy"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Sentiment Analysis")
plt.show()
