AIL303 M
import io, os, sys, types, time, datetime, math, random, requests, subprocess, tempfile
# Data Manipulation
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
from pandas.plotting import scatter_matrix
# Machine learning
import sklearn.ensemble as ske
from sklearn import datasets, model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDClassifier
from sklearn.decomposition import PCA            # needed for the PCA sections below
from sklearn.feature_selection import RFECV      # needed for the RFE section below
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf
# Metrics
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc
# Managing Warnings
import warnings
warnings.filterwarnings('ignore')
# # Download
# DATASET = (
# "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
# "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names",
# "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
#)
# #download_data()
path = "D:\\TUANTA\\VTC Academiy\\AI-Tuan-Class\\"
path += "Specialist Lesson 05 - Machine Learning - Classification Methods\\"
# Join Datasets
dataset_raw = pd.concat([training_raw, test_raw])  # DataFrame.append was removed in pandas 2.x; concat is equivalent here
dataset_raw.reset_index(inplace=True)
dataset_raw.drop('index', inplace=True, axis=1)
There are multiple ways of manipulating each feature type; for simplicity we will work with just two versions of the data: a discretised one (dataset_bin, where continuous variables are binned) and a continuous one (dataset_con). Before building them, the raw data should be checked for the usual quality issues:
● Missing values: either omit the elements of the dataset that contain missing values or impute them (fill them in).
● Special values: numeric variables can carry formalized special values such as ±Inf, NA and NaN. Calculations involving special values often produce special values and need to be handled/cleaned.
● Outliers: they should be detected, but not necessarily removed; whether to include them in the analysis is a statistical decision.
● Obvious inconsistencies: a person's age cannot be negative, a man cannot be pregnant and an under-aged person cannot hold a driver's license. Find such inconsistencies and plan for them.
A minimal set-up sketch follows the list.
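The handout does not show the cell that performs these checks, maps the income label to a numeric predclass, or creates the two working frames; a minimal sketch under those assumptions (the names predclass, dataset_bin and dataset_con come from the code below):
# Assumed set-up (not shown in the handout): sanity-check an obvious inconsistency,
# map the income label to a numeric predclass, and create the two working frames
# used throughout the rest of the notebook.
assert (dataset_raw['age'] >= 0).all()                                # obvious-inconsistency check
dataset_raw.loc[dataset_raw['predclass'].astype(str).str.contains('>50K'), 'predclass'] = 1  # ">50K" and ">50K."
dataset_raw.loc[dataset_raw['predclass'] != 1, 'predclass'] = 0       # "<=50K" and "<=50K."
dataset_bin = pd.DataFrame()   # will hold the discretised (binned) features
dataset_con = pd.DataFrame()   # will hold the continuous features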
dataset_bin['predclass'] = dataset_raw['predclass']
dataset_con['predclass'] = dataset_raw['predclass']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,1))
sns.countplot(y="predclass", data=dataset_bin);
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
plt.subplot(1, 2, 1)
sns.countplot(y="age", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 1]['age'], kde_kws={"label": ">$50K"});
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 0]['age'], kde_kws={"label": "<$50K"});
# There are too many groups here, so we can group some of them together.
# Create buckets for Workclass
dataset_raw.loc[dataset_raw['workclass'] == 'Without-pay', 'workclass'] = 'Not Working'
dataset_raw.loc[dataset_raw['workclass'] == 'Never-worked', 'workclass'] = 'Not Working'
dataset_raw.loc[dataset_raw['workclass'] == 'Federal-gov', 'workclass'] = 'Fed-gov'
dataset_raw.loc[dataset_raw['workclass'] == 'State-gov', 'workclass'] = 'Non-fed-gov'
dataset_raw.loc[dataset_raw['workclass'] == 'Local-gov', 'workclass'] = 'Non-fed-gov'
dataset_raw.loc[dataset_raw['workclass'] == 'Self-emp-not-inc', 'workclass'] = 'Self-emp'
dataset_raw.loc[dataset_raw['workclass'] == 'Self-emp-inc', 'workclass'] = 'Self-emp'
dataset_bin['workclass'] = dataset_raw['workclass']
dataset_con['workclass'] = dataset_raw['workclass']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,2))
sns.countplot(y="workclass", data=dataset_bin);
dataset_bin['occupation'] = dataset_raw['occupation']
dataset_con['occupation'] = dataset_raw['occupation']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
sns.countplot(y="occupation", data=dataset_bin);
dataset_bin['native-country'] = dataset_raw['native-country']
dataset_con['native-country'] = dataset_raw['native-country']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
sns.countplot(y="native-country", data=dataset_bin);
dataset_bin['education'] = dataset_raw['education']
dataset_con['education'] = dataset_raw['education']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
sns.countplot(y="education", data=dataset_bin);
dataset_bin['marital-status'] = dataset_raw['marital-status']
dataset_con['marital-status'] = dataset_raw['marital-status']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
sns.countplot(y="marital-status", data=dataset_bin);
# Let's use the Pandas Cut function to bin the data in equally sized buckets
dataset_bin['fnlwgt'] = pd.cut(dataset_raw['fnlwgt'], 10)
dataset_con['fnlwgt'] = dataset_raw['fnlwgt']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
sns.countplot(y="fnlwgt", data=dataset_bin);
# Let's use the Pandas Cut function to bin the data in equally sized buckets
dataset_bin['education-num'] = pd.cut(dataset_raw['education-num'], 10)
dataset_con['education-num'] = dataset_raw['education-num']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
sns.countplot(y="education-num", data=dataset_bin);
# Let's use the Pandas Cut function to bin the data in equally sized buckets
dataset_bin['hours-per-week'] = pd.cut(dataset_raw['hours-per-week'], 10)
dataset_con['hours-per-week'] = dataset_raw['hours-per-week']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
plt.subplot(1, 2, 1)
sns.countplot(y="hours-per-week", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con['hours-per-week']);
# Let's use the Pandas Cut function to bin the data in equally sized buckets
dataset_bin['capital-gain'] = pd.cut(dataset_raw['capital-gain'], 5)
dataset_con['capital-gain'] = dataset_raw['capital-gain']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
plt.subplot(1, 2, 1)
sns.countplot(y="capital-gain", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con['capital-gain']);
# Let's use the Pandas Cut function to bin the data in equally sized buckets
dataset_bin['capital-loss'] = pd.cut(dataset_raw['capital-loss'], 5)
dataset_con['capital-loss'] = dataset_raw['capital-loss']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
plt.subplot(1, 2, 1)
sns.countplot(y="capital-loss", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con['capital-loss']);
# Plot a count of the categories from each categorical feature, split by our prediction class: salary (predclass).
def plot_bivariate_bar(dataset, hue, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    # dataset = dataset.select_dtypes(include=['object'])
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure(figsize=(width, height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=wspace, hspace=hspace)
    rows = math.ceil(float(dataset.shape[1]) / cols)
    for i, column in enumerate(dataset.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        if dataset.dtypes[column] == object:  # np.object is removed in recent NumPy; plain object works
            g = sns.countplot(y=column, hue=hue, data=dataset)
            substrings = [s.get_text()[:10] for s in g.get_yticklabels()]
            g.set(yticklabels=substrings)
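# Example call (assumed, not shown in the handout): plot every categorical feature
# against the prediction class.
plot_bivariate_bar(dataset_con, hue='predclass', cols=3, width=20, height=12, hspace=0.4, wspace=0.5)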
# Trends in how sex, education, hours-per-week and age affect income.
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
plt.subplot(1, 3, 1)
sns.violinplot(x='sex', y='education-num', hue='predclass', data=dataset_con, split=True, scale='count');
plt.subplot(1, 3, 2)
sns.violinplot(x='sex', y='hours-per-week', hue='predclass', data=dataset_con, split=True, scale='count');
plt.subplot(1, 3, 3)
sns.violinplot(x='sex', y='age', hue='predclass', data=dataset_con, split=True, scale='count');
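# NOTE: the cells that engineer the crossed features used below ('age-hours' and 'sex-marital')
# are not shown in this handout; a sketch of one plausible construction (assumed):
dataset_con['age-hours'] = dataset_con['age'] * dataset_con['hours-per-week']   # numeric interaction
dataset_bin['age-hours'] = pd.cut(dataset_con['age-hours'], 10)                 # binned version
dataset_bin['sex-marital'] = dataset_con['sex-marital'] = dataset_raw['sex'] + dataset_raw['marital-status']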
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
plt.subplot(1, 2, 1)
sns.countplot(y="age-hours", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 1]['age-hours'], kde_kws={"label":
">$50K"});
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 0]['age-hours'], kde_kws={"label":
"<$50K"});
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
sns.countplot(y="sex-marital", data=dataset_bin);
# One Hot Encodes all labels before Machine Learning
one_hot_cols = dataset_bin.columns.tolist()
one_hot_cols.remove('predclass')
dataset_bin_enc = pd.get_dummies(dataset_bin, columns=one_hot_cols)
dataset_bin_enc.head()
dataset_con['workclass'].fillna(dataset_con['workclass'].mode()[0], inplace=True)
dataset_con["workclass"].unique()
dataset_con["occupation"].unique()
feature = 'occupation'
dataset_con[feature].fillna(dataset_con[feature].mode()[0], inplace=True)
dataset_con[feature].unique()
dataset_con["native-country"].unique()
feature = 'native-country'
dataset_con[feature].fillna(dataset_con[feature].mode()[0], inplace=True)
dataset_con[feature].unique()
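# NOTE: dataset_con_enc is used below but its construction is not shown; a sketch (assumed)
# that label-encodes every remaining categorical column of the continuous dataset:
dataset_con_enc = dataset_con.copy()
for col in dataset_con_enc.select_dtypes(include='object').columns:
    dataset_con_enc[col] = preprocessing.LabelEncoder().fit_transform(dataset_con_enc[col].astype(str))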
dataset_con_enc.head()
plt.subplot(1, 2, 1)
# Generate a mask for the upper triangle
mask = np.zeros_like(dataset_bin_enc.corr(), dtype=bool)  # np.bool is removed in recent NumPy; plain bool works
mask[np.triu_indices_from(mask)] = True
sns.heatmap(dataset_bin_enc.corr(),
            vmin=-1, vmax=1,
            square=True,
            cmap=sns.color_palette("RdBu_r", 100),
            mask=mask,
            linewidths=.5);
plt.subplot(1, 2, 2)
mask = np.zeros_like(dataset_con_enc.corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(dataset_con_enc.corr(),
            vmin=-1, vmax=1,
            square=True,
            cmap=sns.color_palette("RdBu_r", 100),
            mask=mask,
            linewidths=.5);
plt.style.use('seaborn-whitegrid')
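# NOTE: 'clf' below is not defined anywhere in this handout; presumably it is a tree-based
# model fitted on the label-encoded dataset so its feature_importances_ can be plotted.
# A sketch (assumed):
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(dataset_con_enc.drop('predclass', axis=1), dataset_con_enc['predclass'])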
importance = clf.feature_importances_
importance = pd.DataFrame(importance,
                          index=dataset_con_enc.drop('predclass', axis=1).columns,
                          columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh', figsize=(20, len(importance) / 2));
# Calculating PCA for both datasets, and graphing the variance for each feature, per dataset
std_scale = preprocessing.StandardScaler().fit(dataset_bin_enc.drop('predclass', axis=1))
X = std_scale.transform(dataset_bin_enc.drop('predclass', axis=1))
pca1 = PCA(n_components=len(dataset_bin_enc.columns) - 1)
fit1 = pca1.fit(X)
# fit2 is referenced below but never defined in the handout; presumably the continuous
# dataset is scaled and decomposed the same way:
X2 = preprocessing.StandardScaler().fit_transform(dataset_con_enc.drop('predclass', axis=1))
pca2 = PCA(n_components=len(dataset_con_enc.columns) - 1)
fit2 = pca2.fit(X2)
plt.subplot(1, 2, 1)
plt.xlabel('PCA Feature')
plt.ylabel('Variance')
plt.title('PCA for Discretised Dataset')
plt.bar(range(0, fit1.explained_variance_ratio_.size), fit1.explained_variance_ratio_);
plt.subplot(1, 2, 2)
plt.xlabel('PCA Feature')
plt.ylabel('Variance')
plt.title('PCA for Continuous Dataset')
plt.bar(range(0, fit2.explained_variance_ratio_.size), fit2.explained_variance_ratio_);
# Formatting
target_names = [0,1]
colors = ['navy','darkorange']
lw = 2
alpha = 0.3
# 2 Components PCA
plt.style.use('seaborn-whitegrid')
plt.figure(2, figsize=(20, 8))
plt.subplot(1, 2, 1)
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)
for color, i, target_name in zip(colors, [0, 1], target_names):
    # 'y' is assumed to be the target vector, e.g. y = dataset_bin_enc['predclass'].values
    plt.scatter(X_r[y == i, 0], X_r[y == i, 1],
                color=color,
                alpha=alpha,
                lw=lw,
                label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('First two PCA directions');
# 3 Components PCA
# ax = plt.subplot(1, 2, 2, projection='3d')
# pca = PCA(n_components=3)
# X_reduced = pca.fit(X).transform(X)
# for color, i, target_name in zip(colors, [0, 1], target_names):
# ax.scatter(X_reduced[y == i, 0], X_reduced[y == i, 1], X_reduced[y == i, 2],
# color=color,
# alpha=alpha,
# lw=lw,
# label=target_name)
# plt.legend(loc='best', shadow=False, scatterpoints=1)
# ax.set_title("First three PCA directions")
# ax.set_xlabel("1st eigenvector")
# ax.set_ylabel("2nd eigenvector")
# ax.set_zlabel("3rd eigenvector")
# Calculating RFE for the non-discretised dataset, and graphing the importance of each feature, per dataset
selector1 = RFECV(LogisticRegression(), step=1, cv=5, n_jobs=-1)
selector1 = selector1.fit(dataset_con_enc.drop('predclass', axis=1).values,
                          dataset_con_enc['predclass'].values)
print("Feature Ranking For Non-Discretised: %s" % selector1.ranking_)
print("Optimal number of features : %d" % selector1.n_features_)
# Plot number of features VS. cross-validation scores
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(20,5))
plt.xlabel("Number of features selected - Non-Discretised")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(selector1.grid_scores_) + 1), selector1.grid_scores_);
# OPTIONS:
# - dataset_bin_enc
# - dataset_con_enc
# Change the dataset to test how the algorithms would perform under a differently encoded dataset.
selected_dataset = dataset_con_enc
selected_dataset.head(2)
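# NOTE: 'train' and 'test' are used below but the split cell is not shown;
# a sketch (assumed 70/30 split with a fixed seed):
train, test = model_selection.train_test_split(selected_dataset, test_size=0.3, random_state=0)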
X_train_w_label = train
X_train = train.drop(['predclass'], axis=1)
y_train = train['predclass'].astype('int64')
X_test = test.drop(['predclass'], axis=1)
y_test = test['predclass'].astype('int64')
X_train.shape
X_train.head()
y_train.head()
# calculate the fpr and tpr for all thresholds of the classification
def plot_roc_curve(y_test, preds):
    fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
    roc_auc = metrics.auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
# Function that runs the requested algorithm and returns the accuracy metrics
def fit_ml_algo(algo, X_train, y_train, X_test, cv):
    # One pass: fit on the training set and predict the held-out test set
    model = algo.fit(X_train, y_train)
    test_pred = model.predict(X_test)
    if isinstance(algo, (LogisticRegression,
                         KNeighborsClassifier,
                         GaussianNB,
                         DecisionTreeClassifier,
                         RandomForestClassifier,
                         GradientBoostingClassifier)):
        probs = model.predict_proba(X_test)[:, 1]
    else:
        probs = "Not Available"
    # Note: y_test is not passed in; this relies on the global y_test defined above
    acc = round(model.score(X_test, y_test) * 100, 2)
    # Cross-validated predictions on the training set
    train_pred = model_selection.cross_val_predict(algo,
                                                   X_train,
                                                   y_train,
                                                   cv=cv,
                                                   n_jobs=-1)
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, 2)
    return train_pred, test_pred, acc, acc_cv, probs
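# NOTE: random_search, n_iter_search and report() are used below but their set-up is not shown.
# A minimal sketch (assumed search space over a RandomForestClassifier):
from scipy.stats import randint
n_iter_search = 10
param_dist = {"max_depth": [3, None],
              "max_features": randint(1, 11),
              "min_samples_split": randint(2, 11),
              "criterion": ["gini", "entropy"]}
random_search = model_selection.RandomizedSearchCV(RandomForestClassifier(n_jobs=-1),
                                                   param_distributions=param_dist,
                                                   n_iter=n_iter_search, cv=5)

def report(results, n_top=3):
    # Print the top-ranked parameter settings from a fitted search
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")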
start = time.time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
" parameter settings." % ((time.time() - start), n_iter_search))
report(random_search.cv_results_)
# Logistic Regression
start_time = time.time()
train_pred_log, test_pred_log, acc_log, acc_cv_log, probs_log = fit_ml_algo(
    LogisticRegression(n_jobs=-1), X_train, y_train, X_test, 10)
log_time = (time.time() - start_time)
print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))
print(metrics.classification_report(y_train, train_pred_log))
print(metrics.classification_report(y_test, test_pred_log))
plot_roc_curve(y_test, probs_log)
# k-Nearest Neighbors
start_time = time.time()
train_pred_knn, test_pred_knn, acc_knn, acc_cv_knn, probs_knn = fit_ml_algo(
    KNeighborsClassifier(n_neighbors=3, n_jobs=-1), X_train, y_train, X_test, 10)
knn_time = (time.time() - start_time)
print("Accuracy: %s" % acc_knn)
print("Accuracy CV 10-Fold: %s" % acc_cv_knn)
print("Running Time: %s" % datetime.timedelta(seconds=knn_time))
print(metrics.classification_report(y_train, train_pred_knn))
print(metrics.classification_report(y_test, test_pred_knn))
plot_roc_curve(y_test, probs_knn)
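# Gaussian Naive Bayes
# NOTE: the cell that fits this model is not shown in the handout; a sketch following the
# same fit_ml_algo pattern as the models above (assumed):
start_time = time.time()
train_pred_gaussian, test_pred_gaussian, acc_gaussian, acc_cv_gaussian, probs_gau = fit_ml_algo(
    GaussianNB(), X_train, y_train, X_test, 10)
gaussian_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gaussian)
print("Accuracy CV 10-Fold: %s" % acc_cv_gaussian)
print("Running Time: %s" % datetime.timedelta(seconds=gaussian_time))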
print(metrics.classification_report(y_train, train_pred_gaussian))
print(metrics.classification_report(y_test, test_pred_gaussian))
plot_roc_curve(y_test, probs_gau)
# Linear SVC
start_time = time.time()
train_pred_svc, test_pred_svc, acc_linear_svc, acc_cv_linear_svc, _ = fit_ml_algo(
    LinearSVC(), X_train, y_train, X_test, 10)
linear_svc_time = (time.time() - start_time)
print("Accuracy: %s" % acc_linear_svc)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svc)
print("Running Time: %s" % datetime.timedelta(seconds=linear_svc_time))
print(metrics.classification_report(y_train, train_pred_svc))
print(metrics.classification_report(y_test, test_pred_svc))
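# Stochastic Gradient Descent and Decision Tree
# NOTE: the fitting cells for these two models are also elided in the handout; sketches that
# follow the same pattern and produce the variables used in the reports below (assumed):
start_time = time.time()
train_pred_sgd, test_pred_sgd, acc_sgd, acc_cv_sgd, _ = fit_ml_algo(
    SGDClassifier(n_jobs=-1), X_train, y_train, X_test, 10)
print("Accuracy: %s" % acc_sgd)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgd)

start_time = time.time()
train_pred_dt, test_pred_dt, acc_dt, acc_cv_dt, probs_dt = fit_ml_algo(
    DecisionTreeClassifier(), X_train, y_train, X_test, 10)
print("Accuracy: %s" % acc_dt)
print("Accuracy CV 10-Fold: %s" % acc_cv_dt)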
print(metrics.classification_report(y_train, train_pred_sgd))
print(metrics.classification_report(y_test, test_pred_sgd))
print(metrics.classification_report(y_train, train_pred_dt))
print(metrics.classification_report(y_test, test_pred_dt))
plot_roc_curve(y_test, probs_dt)
start = time.time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
" parameter settings." % ((time.time() - start), n_iter_search))
report(random_search.cv_results_)
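# Random Forest
# NOTE: the cell that fits the final random forest is not shown; a sketch (assumed) that reuses
# the best parameters found by the randomised search above:
start_time = time.time()
rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, **random_search.best_params_)
train_pred_rf, test_pred_rf, acc_rf, acc_cv_rf, probs_rf = fit_ml_algo(
    rfc, X_train, y_train, X_test, 10)
print("Accuracy: %s" % acc_rf)
print("Accuracy CV 10-Fold: %s" % acc_cv_rf)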
print(metrics.classification_report(y_train, train_pred_rf) )
print(metrics.classification_report(y_test, test_pred_rf))
plot_roc_curve(y_test, probs_rf)
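# Gradient Boosting Trees
# NOTE: this fitting cell is also elided; a sketch (assumed) following the same pattern:
start_time = time.time()
train_pred_gbt, test_pred_gbt, acc_gbt, acc_cv_gbt, probs_gbt = fit_ml_algo(
    GradientBoostingClassifier(), X_train, y_train, X_test, 10)
print("Accuracy: %s" % acc_gbt)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbt)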
print(metrics.classification_report(y_test, test_pred_gbt))
plot_roc_curve(y_test, probs_gbt)
models = pd.DataFrame({
'Model': ['KNN', 'Logistic Regression',
'Random Forest', 'Naive Bayes',
'Stochastic Gradient Descent', 'Linear SVC',
'Decision Tree', 'Gradient Boosting Trees'],
'Score': [
acc_knn,
acc_log,
acc_rf,
acc_gaussian,
acc_sgd,
acc_linear_svc,
acc_dt,
acc_gbt
]})
models.sort_values(by='Score', ascending=False)
models = pd.DataFrame({
'Model': ['KNN', 'Logistic Regression',
'Random Forest', 'Naive Bayes',
'Stochastic Gradient Descent', 'Linear SVC',
'Decision Tree', 'Gradient Boosting Trees'],
'Score': [
acc_cv_knn,
acc_cv_log,
acc_cv_rf,
acc_cv_gaussian,
acc_cv_sgd,
acc_cv_linear_svc,
acc_cv_dt,
acc_cv_gbt
]})
models.sort_values(by='Score', ascending=False)
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(10,10))
models = [
'KNN',
'Logistic Regression',
'Random Forest',
'Naive Bayes',
'Decision Tree',
'Gradient Boosting Trees'
]
probs = [
probs_knn,
probs_log,
probs_rf,
probs_gau,
probs_dt,
probs_gbt
]
colors = [
'blue',
'green',
'red',
'cyan',
'magenta',
'yellow',
]
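# NOTE: the loop that actually draws the combined ROC curves is missing from the handout;
# a sketch (assumed) that pairs each model's test-set probabilities with a colour:
for model_name, prob, color in zip(models, probs, colors):
    fpr, tpr, _ = metrics.roc_curve(y_test, prob)
    plt.plot(fpr, tpr, color=color, lw=2,
             label='%s (AUC = %0.2f)' % (model_name, metrics.auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves by Model')
plt.legend(loc='lower right')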
plt.show()