code
code
import re
from textblob import TextBlob
import matplotlib.pyplot as plt
# 2
# Load the survey CSV into a DataFrame.
# Replace the path below with the location of your own copy of the file.
file_path = '/content/Nri_Textual_Survey_Data.csv'
survey_data = pd.read_csv(file_path)
# 3
# ### 1. Text Preprocessing ###
# Flatten the subplot grid so each chart can be addressed with a single
# positional index.
axes = axes.flatten()

# Draw one pie chart per row of ``sentiment_summary``.
for idx, facility in enumerate(sentiment_summary.index):
    sentiment_summary.loc[facility].plot(
        kind='pie',
        ax=axes[idx],
        autopct='%1.1f%%',
        colors=['green', 'orange', 'red'],
        title=f'{facility} Sentiment Distribution',
    )
    # Clear the y-axis label on this subplot.
    axes[idx].set_ylabel('')

# Switch off any subplot slots beyond the last row that was plotted.
for ax in axes[len(sentiment_summary):]:
    ax.axis('off')

plt.tight_layout()
plt.show()
# 10
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfVectorizer,
)
import matplotlib.pyplot as plt
import pandas as pd

# Bag-of-Words extraction; max_features limits the vocabulary size.
bow_vectorizer = CountVectorizer(max_features=1000)
# ``column_name`` selects which column of ``processed_data`` is vectorized.
bow_features = bow_vectorizer.fit_transform(processed_data[column_name])
# Column-wise totals of the sparse matrix, flattened to a 1-D array.
bow_term_frequencies = bow_features.sum(axis=0).A1

# TF-IDF extraction; max_features limits the vocabulary size.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
# Same dynamically chosen column as the Bag-of-Words step above.
tfidf_features = tfidf_vectorizer.fit_transform(processed_data[column_name])
# Column-wise totals of the sparse matrix, flattened to a 1-D array.
tfidf_term_scores = tfidf_features.sum(axis=0).A1
# Bag-of-Words bars, drawn at the base positions in ``index``.
plt.bar(
    index,
    comparison_df["Frequency_BoW"],
    bar_width,
    label="Bag-of-Words",
    color="skyblue",
)
# TF-IDF bars, shifted right by one bar width so the two series sit side by
# side instead of overlapping.
# NOTE(review): confirm "Frequency_TF-IDF" matches the column name produced
# when comparison_df was built.
plt.bar(
    [i + bar_width for i in index],
    comparison_df["Frequency_TF-IDF"],
    bar_width,
    label="TF-IDF",
    color="orange",
)
# 11
from sklearn.model_selection import train_test_split
# 12
# Instantiate the three classifiers; every one is given random_state=42.
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42,
)
random_forest = RandomForestClassifier(
    random_state=42,
)
svm = SVC(
    kernel='linear',
    random_state=42,
)
# Test-set accuracy on the TF-IDF features, in the order:
# logistic regression, random forest, SVM.
tfidf_accuracies = [
    accuracy_score(y_test, model.predict(X_test_tfidf))
    for model in (log_reg, random_forest, svm)
]

# Grouped bar chart: BoW bars at ``x``, TF-IDF bars offset by one bar width.
plt.figure(figsize=(10, 6))
plt.bar(
    x,
    bow_accuracies,
    width=bar_width,
    label='BoW',
    color='skyblue',
)
plt.bar(
    [pos + bar_width for pos in x],
    tfidf_accuracies,
    width=bar_width,
    label='TF-IDF',
    color='orange',
)
# Weighted precision, recall and F1 (in that order) for the logistic
# regression model on the TF-IDF test features. The prediction is computed
# once and reused by all three metrics.
_log_reg_tfidf_pred = log_reg.predict(X_test_tfidf)
tfidf_scores = [
    metric(y_test, _log_reg_tfidf_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
]

plt.figure(figsize=(10, 6))
plt.bar(x, bow_scores, width=bar_width, label='BoW', color='skyblue')
# Offset each position element-wise so this works whether ``x`` is a list,
# a range or a NumPy array (``x + bar_width`` fails for list/range).
plt.bar(
    [pos + bar_width for pos in x],
    tfidf_scores,
    width=bar_width,
    label='TF-IDF',
    color='orange',
)
# 15
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
# Confusion matrices for the three sets of Bag-of-Words predictions
# (logistic regression, random forest, SVM), each against y_test.
cm_log_reg_bow, cm_rf_bow, cm_svm_bow = (
    confusion_matrix(y_test, predicted)
    for predicted in (y_pred_log_reg_bow, y_pred_rf_bow, y_pred_svm_bow)
)

# Tidy the current figure's layout and render it.
plt.tight_layout()
plt.show()
# Collapse every row into one string: each cell is cast to str and the
# cells are joined with single spaces.
combined_text_all = processed_data.apply(
    lambda record: ' '.join(record.astype(str).tolist()),
    axis=1,
)
# Step 6: Identify the Most and Least Similar Pairs Across All Labels
# Wrap the similarity matrix in a DataFrame whose rows and columns are both
# labelled with the column names of ``processed_data``.
similarity_df_all = pd.DataFrame(
    data=similarity_matrix_all,
    index=processed_data.columns,
    columns=processed_data.columns,
)

# Output results: heading on one line, the pair on the next.
print("Most Similar Pair:", most_similar_all, sep="\n")
# Preprocess text
def preprocess_text_dl(text_dl):
    """Clean text.

    The input is converted to ``str`` and lower-cased, every character that
    is neither a word character nor whitespace is removed, and runs of
    whitespace are collapsed to single spaces with the ends trimmed.

    Args:
        text_dl: Value to clean. Any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        The cleaned string.
    """
    text_dl = str(text_dl).lower()
    text_dl = re.sub(r'[^\w\s]', '', text_dl)  # Remove punctuation
    text_dl = re.sub(r'\s+', ' ', text_dl).strip()  # Remove extra spaces
    return text_dl
# Create (or overwrite) the 'label' column by applying analyze_sentiment to
# each value of the column named by ``text_column``.
data['label'] = data[text_column].apply(analyze_sentiment)
def __len__(self):
    """Return the number of entries in ``self.texts``."""
    return len(self.texts)
# Iterate the test loader with gradient tracking disabled.
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        # Move this batch's tensors onto ``device``.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # NOTE(review): ``preds`` is not assigned anywhere in the visible
        # code; the step that produces predictions from input_ids and
        # attention_mask appears to be missing here. Confirm against the
        # full notebook.
        # Accumulate predictions and ground-truth labels as NumPy values
        # on the CPU for the metrics computed afterwards.
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
# Overall accuracy of the collected predictions, printed as a percentage
# with two decimals.
accuracy = accuracy_score(all_labels, all_preds)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Confusion matrix over class ids 0, 1 and 2, displayed with the names
# "Happy", "Neutral" and "Unhappy" in that order.
cm = confusion_matrix(
    all_labels,
    all_preds,
    labels=[0, 1, 2],
)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Happy", "Neutral", "Unhappy"],
)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Sentiment Analysis")
plt.show()