AIL303 M


Census Income Analysis

Data: adult.data, adult.names, adult.test, index, old.adult.names

Reference: Census Income Analysis.html (attachment)
Task: Write a program that reproduces Census Income Analysis.html (the Census Income Analysis) using the data files above, and explain the code.
Examples:

import io, os, sys, types, time, datetime, math, random, requests, subprocess, tempfile

from io import StringIO

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
from pandas.plotting import scatter_matrix

# Feature Selection and Encoding


from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize

# Machine learning
import sklearn.ensemble as ske
from sklearn import datasets, model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import tensorflow as tf

# Grid and Random Search


import scipy.stats as st
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc

# Managing Warnings
import warnings
warnings.filterwarnings('ignore')

# Plot the Figures Inline


%matplotlib inline

# conda_packages_list = StringIO(subprocess.Popen(["conda", "list"],
#                                                 stdout=subprocess.PIPE).communicate()[0])
# conda_packages_list = pd.read_csv(conda_packages_list,
#                                   names=['Package Name','Version','Python Version','Repo','Other'],
#                                   delim_whitespace=True, engine='python', skiprows=3)
# conda_packages_list.head(5)

# # Download
# DATASET = (
#     "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
#     "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names",
#     "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
# )

# def download_data(path='dataset', urls=DATASET):
#     if not os.path.exists(path):
#         os.mkdir(path)
#     for url in urls:
#         response = requests.get(url)
#         name = os.path.basename(url)
#         # open in binary mode, since response.content is bytes
#         with open(os.path.join(path, name), 'wb') as f:
#             f.write(response.content)

# download_data()
path = "D:\\TUANTA\\VTC Academiy\\AI-Tuan-Class\\"
path += "Specialist Lesson 05 - Machine Learning - Classification Methods\\"

# Load Training and Test Data Sets


headers = ['age', 'workclass', 'fnlwgt',
           'education', 'education-num',
           'marital-status', 'occupation',
           'relationship', 'race', 'sex',
           'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country',
           'predclass']
training_raw = pd.read_csv(path + 'adult.data',
                           header=None,
                           names=headers,
                           sep=r',\s',
                           na_values=["?"],
                           engine='python')
test_raw = pd.read_csv(path + 'adult.test',
                       header=None,
                       names=headers,
                       sep=r',\s',
                       na_values=["?"],
                       engine='python',
                       skiprows=1)

# Join Datasets
dataset_raw = training_raw.append(test_raw)
dataset_raw.reset_index(inplace=True)
dataset_raw.drop('index',inplace=True,axis=1)

# Displaying the size of the Dataframe in Memory


def convert_size(size_bytes):
    if size_bytes == 0:
        return "0B"
    size_name = ("Bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return "%s %s" % (s, size_name[i])

convert_size(dataset_raw.memory_usage().sum())

Data Exploration - Univariate


When exploring our dataset and its features, we have many options available to us. We can
explore each feature individually, or compare pairs of features and look at the correlation between them.
Let's start with some simple univariate (single-feature) analysis.

Features can be of multiple types (a quick mapping of some census columns to these scales is sketched after this list):

● Nominal: mutually exclusive, but not ordered, categories.
● Ordinal: the order matters, but the difference between values does not.
● Interval: a measurement where the difference between two values is meaningful.
● Ratio: has all the properties of an interval variable, and also has a clear definition of 0.0.
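As a small, hedged illustration (not part of the original notebook), a few of the census columns could be mapped to these measurement scales roughly as follows:

# Assumed mapping of a few adult-census features to measurement scales.
feature_scales = {
    'sex':            'nominal',   # categories without order
    'education':      'ordinal',   # ordered levels; differences are not meaningful
    'education-num':  'ordinal',   # numeric encoding of the ordered education levels
    'age':            'ratio',     # meaningful differences and a true zero
    'hours-per-week': 'ratio',
}
for feature, scale in feature_scales.items():
    print("%-15s -> %s" % (feature, scale))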

There are multiple ways of manipulating each feature type, but for simplicity we'll define only two
feature types (a quick way to split the columns along these lines is sketched after this list):

● Numerical: any feature that contains numeric values.
● Categorical: any feature that contains categories or text.
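A minimal sketch (not from the original notebook), assuming dataset_raw has been loaded as above, of splitting the columns into these two working types:

# Split the columns of dataset_raw into the two working types defined above.
numerical_cols = dataset_raw.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = dataset_raw.select_dtypes(include=['object']).columns.tolist()
print("Numerical features:  ", numerical_cols)
print("Categorical features:", categorical_cols)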
# Describing all the Numerical Features
dataset_raw.describe()

# Describing all the Categorical Features


dataset_raw.describe(include=['O'])

# Let's have a quick look at our data


dataset_raw.head()

# Let’s plot the distribution of each feature


def plot_distribution(dataset, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure(figsize=(width, height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None,
                        wspace=wspace, hspace=hspace)
    rows = math.ceil(float(dataset.shape[1]) / cols)
    for i, column in enumerate(dataset.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        if dataset.dtypes[column] == np.object:
            g = sns.countplot(y=column, data=dataset)
            substrings = [s.get_text()[:18] for s in g.get_yticklabels()]
            g.set(yticklabels=substrings)
            plt.xticks(rotation=25)
        else:
            g = sns.distplot(dataset[column])
            plt.xticks(rotation=25)

plot_distribution(dataset_raw, cols=3, width=20, height=20, hspace=0.45, wspace=0.5)

# How many missing values are there in our dataset?


missingno.matrix(dataset_raw, figsize = (30,5))

missingno.bar(dataset_raw, sort='ascending', figsize = (30,5))

Feature Cleaning, Engineering, and Imputation

Cleaning: To clean our data, we'll need to work with (a few of these checks are sketched after this list):

● Missing values: Either omit elements from a dataset that contain missing values or impute them (fill them in).
● Special values: Numeric variables are endowed with several formalized special values, including ±Inf, NA and NaN. Calculations involving special values often result in special values, and these need to be handled/cleaned.
● Outliers: They should be detected, but not necessarily removed. Their inclusion in the analysis is a statistical decision.
● Obvious inconsistencies: A person's age cannot be negative, a man cannot be pregnant and an under-aged person cannot possess a driver's license. Find the inconsistencies and plan for them.
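As a hedged illustration (not from the original notebook), assuming dataset_raw is loaded as above, a few of these checks might look like:

# Missing values per feature.
print(dataset_raw.isnull().sum())

# Special values: check a numeric column for +/-Inf.
print("Infinite capital-gain values:", np.isinf(dataset_raw['capital-gain']).sum())

# Outliers: flag ages outside 1.5 * IQR, but keep them (removal is a statistical decision).
q1, q3 = dataset_raw['age'].quantile([0.25, 0.75])
iqr = q3 - q1
age_outliers = dataset_raw[(dataset_raw['age'] < q1 - 1.5 * iqr) |
                           (dataset_raw['age'] > q3 + 1.5 * iqr)]
print("Potential age outliers:", len(age_outliers))

# Obvious inconsistencies: a negative age should never occur.
print("Rows with a negative age:", (dataset_raw['age'] < 0).sum())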

Engineering: There are multiple techniques for feature engineering (a small decomposition example is sketched after this list):

● Decompose: Converting a timestamp such as 2014-09-20T20:45:40Z into categorical attributes like hour_of_the_day, part_of_day, etc.
● Discretization: We can choose to discretise some of the continuous variables we have, as some algorithms will perform faster on them. We are going to do both, and compare the results of the ML algorithms on the discretised and non-discretised datasets. We'll call these datasets:
  ● dataset_bin => where continuous variables are discretised
  ● dataset_con => where continuous variables are left continuous
● Reframe Numerical Quantities: Changing units (e.g. from grams to kilograms) and losing detail might be both desirable and more efficient for calculation.
● Feature Crossing: Creating new features as a combination of existing features, for example by multiplying numerical features or combining categorical variables. This is a great way to add domain expertise to the dataset.
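A small, illustrative sketch of the Decompose idea. The census data has no timestamp column, so the event_time column below is hypothetical and only serves to show the transformation:

# Hypothetical timestamps; the adult census data contains no such column.
events = pd.DataFrame({'event_time': pd.to_datetime(['2014-09-20T20:45:40Z',
                                                     '2014-09-21T08:10:05Z'])})
events['hour_of_the_day'] = events['event_time'].dt.hour
# Bucket the hour into a coarser categorical attribute.
events['part_of_day'] = pd.cut(events['hour_of_the_day'],
                               bins=[-1, 5, 11, 17, 23],
                               labels=['night', 'morning', 'afternoon', 'evening'])
print(events)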

Imputation: We can impute missing values in a number of different ways (two of these are sketched after this list):

● Hot-Deck: Finds each missing value and uses the cell value immediately prior to the missing data to impute it.
● Cold-Deck: Selects donors from another dataset to complete missing data.
● Mean-substitution: Replaces any missing value with the mean of that variable for all other cases, which has the benefit of not changing the sample mean for that variable.
● Regression: A regression model is estimated to predict observed values of a variable based on other variables, and that model is then used to impute values in cases where that variable is missing.
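A hedged sketch of the mean-substitution and regression approaches, applied to a copy of the data and reusing the pandas and LinearRegression imports from the top of the notebook (the notebook itself later uses mode substitution for the categorical features):

df = dataset_raw.copy()

# Mean substitution: fill missing hours-per-week values with the column mean.
df['hours-per-week'] = df['hours-per-week'].fillna(df['hours-per-week'].mean())

# Regression imputation: predict missing 'age' values from two other numeric features.
predictors = ['education-num', 'hours-per-week']
known = df[df['age'].notnull()]
unknown = df[df['age'].isnull()]
if len(unknown) > 0:
    reg = LinearRegression().fit(known[predictors], known['age'])
    df.loc[unknown.index, 'age'] = reg.predict(unknown[predictors])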

missingno.bar(dataset_raw, sort='ascending', figsize = (30,5))

# To perform our data analysis, let's create new dataframes.
dataset_bin = pd.DataFrame()  # To contain our dataframe with our discretised continuous variables
dataset_con = pd.DataFrame()  # To contain our dataframe with our continuous variables

# Let's fix the Class Feature


dataset_raw.loc[dataset_raw['predclass'] == '>50K', 'predclass'] = 1
dataset_raw.loc[dataset_raw['predclass'] == '>50K.', 'predclass'] = 1
dataset_raw.loc[dataset_raw['predclass'] == '<=50K', 'predclass'] = 0
dataset_raw.loc[dataset_raw['predclass'] == '<=50K.', 'predclass'] = 0

dataset_bin['predclass'] = dataset_raw['predclass']
dataset_con['predclass'] = dataset_raw['predclass']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,1))
sns.countplot(y="predclass", data=dataset_bin);

dataset_bin['age'] = pd.cut(dataset_raw['age'], 10) # discretised


dataset_con['age'] = dataset_raw['age'] # non-discretised

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
plt.subplot(1, 2, 1)
sns.countplot(y="age", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 1]['age'], kde_kws={"label": ">$50K"});
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 0]['age'], kde_kws={"label": "<$50K"});

# Can we bucket some of these groups?


plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(20,3))
sns.countplot(y="workclass", data=dataset_raw);

# There are too many groups here; we can group some of them together.
# Create buckets for Workclass
dataset_raw.loc[dataset_raw['workclass'] == 'Without-pay', 'workclass'] = 'Not Working'
dataset_raw.loc[dataset_raw['workclass'] == 'Never-worked', 'workclass'] = 'Not Working'
dataset_raw.loc[dataset_raw['workclass'] == 'Federal-gov', 'workclass'] = 'Fed-gov'
dataset_raw.loc[dataset_raw['workclass'] == 'State-gov', 'workclass'] = 'Non-fed-gov'
dataset_raw.loc[dataset_raw['workclass'] == 'Local-gov', 'workclass'] = 'Non-fed-gov'
dataset_raw.loc[dataset_raw['workclass'] == 'Self-emp-not-inc', 'workclass'] = 'Self-emp'
dataset_raw.loc[dataset_raw['workclass'] == 'Self-emp-inc', 'workclass'] = 'Self-emp'

dataset_bin['workclass'] = dataset_raw['workclass']
dataset_con['workclass'] = dataset_raw['workclass']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,2))
sns.countplot(y="workclass", data=dataset_bin);

# Can we bucket some of these groups?


plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(20,5))
sns.countplot(y="occupation", data=dataset_raw);

# Create buckets for Occupation


dataset_raw.loc[dataset_raw['occupation'] == 'Adm-clerical', 'occupation'] = 'Admin'
dataset_raw.loc[dataset_raw['occupation'] == 'Armed-Forces', 'occupation'] = 'Military'
dataset_raw.loc[dataset_raw['occupation'] == 'Craft-repair', 'occupation'] = 'Manual Labour'
dataset_raw.loc[dataset_raw['occupation'] == 'Exec-managerial', 'occupation'] = 'Office Labour'
dataset_raw.loc[dataset_raw['occupation'] == 'Farming-fishing', 'occupation'] = 'Manual Labour'
dataset_raw.loc[dataset_raw['occupation'] == 'Handlers-cleaners', 'occupation'] = 'Manual Labour'
dataset_raw.loc[dataset_raw['occupation'] == 'Machine-op-inspct', 'occupation'] = 'Manual Labour'
dataset_raw.loc[dataset_raw['occupation'] == 'Other-service', 'occupation'] = 'Service'
dataset_raw.loc[dataset_raw['occupation'] == 'Priv-house-serv', 'occupation'] = 'Service'
dataset_raw.loc[dataset_raw['occupation'] == 'Prof-specialty', 'occupation'] = 'Professional'
dataset_raw.loc[dataset_raw['occupation'] == 'Protective-serv', 'occupation'] = 'Military'
dataset_raw.loc[dataset_raw['occupation'] == 'Sales', 'occupation'] = 'Office Labour'
dataset_raw.loc[dataset_raw['occupation'] == 'Tech-support', 'occupation'] = 'Office Labour'
dataset_raw.loc[dataset_raw['occupation'] == 'Transport-moving', 'occupation'] = 'Manual Labour'

dataset_bin['occupation'] = dataset_raw['occupation']
dataset_con['occupation'] = dataset_raw['occupation']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
sns.countplot(y="occupation", data=dataset_bin);

# Can we bucket some of these groups?


plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(20,10))
sns.countplot(y="native-country", data=dataset_raw);

dataset_raw.loc[dataset_raw['native-country'] == 'Cambodia', 'native-country'] = 'SE-Asia'
dataset_raw.loc[dataset_raw['native-country'] == 'Canada', 'native-country'] = 'British-Commonwealth'
dataset_raw.loc[dataset_raw['native-country'] == 'China', 'native-country'] = 'China'
dataset_raw.loc[dataset_raw['native-country'] == 'Columbia', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Cuba', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Dominican-Republic', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Ecuador', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'El-Salvador', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'England', 'native-country'] = 'British-Commonwealth'
dataset_raw.loc[dataset_raw['native-country'] == 'France', 'native-country'] = 'Euro_Group_1'
dataset_raw.loc[dataset_raw['native-country'] == 'Germany', 'native-country'] = 'Euro_Group_1'
dataset_raw.loc[dataset_raw['native-country'] == 'Greece', 'native-country'] = 'Euro_Group_2'
dataset_raw.loc[dataset_raw['native-country'] == 'Guatemala', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Haiti', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Holand-Netherlands', 'native-country'] = 'Euro_Group_1'
dataset_raw.loc[dataset_raw['native-country'] == 'Honduras', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Hong', 'native-country'] = 'China'
dataset_raw.loc[dataset_raw['native-country'] == 'Hungary', 'native-country'] = 'Euro_Group_2'
dataset_raw.loc[dataset_raw['native-country'] == 'India', 'native-country'] = 'British-Commonwealth'
dataset_raw.loc[dataset_raw['native-country'] == 'Iran', 'native-country'] = 'Euro_Group_2'
dataset_raw.loc[dataset_raw['native-country'] == 'Ireland', 'native-country'] = 'British-Commonwealth'
dataset_raw.loc[dataset_raw['native-country'] == 'Italy', 'native-country'] = 'Euro_Group_1'
dataset_raw.loc[dataset_raw['native-country'] == 'Jamaica', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Japan', 'native-country'] = 'APAC'
dataset_raw.loc[dataset_raw['native-country'] == 'Laos', 'native-country'] = 'SE-Asia'
dataset_raw.loc[dataset_raw['native-country'] == 'Mexico', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Nicaragua', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Outlying-US(Guam-USVI-etc)', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Peru', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Philippines', 'native-country'] = 'SE-Asia'
dataset_raw.loc[dataset_raw['native-country'] == 'Poland', 'native-country'] = 'Euro_Group_2'
dataset_raw.loc[dataset_raw['native-country'] == 'Portugal', 'native-country'] = 'Euro_Group_2'
dataset_raw.loc[dataset_raw['native-country'] == 'Puerto-Rico', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'Scotland', 'native-country'] = 'British-Commonwealth'
dataset_raw.loc[dataset_raw['native-country'] == 'South', 'native-country'] = 'Euro_Group_2'
dataset_raw.loc[dataset_raw['native-country'] == 'Taiwan', 'native-country'] = 'China'
dataset_raw.loc[dataset_raw['native-country'] == 'Thailand', 'native-country'] = 'SE-Asia'
dataset_raw.loc[dataset_raw['native-country'] == 'Trinadad&Tobago', 'native-country'] = 'South-America'
dataset_raw.loc[dataset_raw['native-country'] == 'United-States', 'native-country'] = 'United-States'
dataset_raw.loc[dataset_raw['native-country'] == 'Vietnam', 'native-country'] = 'SE-Asia'
dataset_raw.loc[dataset_raw['native-country'] == 'Yugoslavia', 'native-country'] = 'Euro_Group_2'

dataset_bin['native-country'] = dataset_raw['native-country']
dataset_con['native-country'] = dataset_raw['native-country']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
sns.countplot(y="native-country", data=dataset_bin);

# Can we bucket some of these groups?


plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(20,5))
sns.countplot(y="education", data=dataset_raw);

dataset_raw.loc[dataset_raw['education'] == '10th' , 'education'] = 'Dropout'


dataset_raw.loc[dataset_raw['education'] == '11th' , 'education'] = 'Dropout'
dataset_raw.loc[dataset_raw['education'] == '12th' , 'education'] = 'Dropout'
dataset_raw.loc[dataset_raw['education'] == '1st-4th' , 'education'] = 'Dropout'
dataset_raw.loc[dataset_raw['education'] == '5th-6th' , 'education'] = 'Dropout'
dataset_raw.loc[dataset_raw['education'] == '7th-8th' , 'education'] = 'Dropout'
dataset_raw.loc[dataset_raw['education'] == '9th' , 'education'] = 'Dropout'
dataset_raw.loc[dataset_raw['education'] == 'Assoc-acdm' , 'education'] = 'Associate'
dataset_raw.loc[dataset_raw['education'] == 'Assoc-voc' , 'education'] = 'Associate'
dataset_raw.loc[dataset_raw['education'] == 'Bachelors' , 'education'] = 'Bachelors'
dataset_raw.loc[dataset_raw['education'] == 'Doctorate' , 'education'] = 'Doctorate'
dataset_raw.loc[dataset_raw['education'] == 'HS-grad' , 'education'] = 'HS-Graduate'
dataset_raw.loc[dataset_raw['education'] == 'Masters' , 'education'] = 'Masters'
dataset_raw.loc[dataset_raw['education'] == 'Preschool' , 'education'] = 'Dropout'
dataset_raw.loc[dataset_raw['education'] == 'Prof-school' , 'education'] = 'Professor'
dataset_raw.loc[dataset_raw['education'] == 'Some-college' , 'education'] = 'HS-Graduate'

dataset_bin['education'] = dataset_raw['education']
dataset_con['education'] = dataset_raw['education']
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
sns.countplot(y="education", data=dataset_bin);

# Can we bucket some of these groups?


plt.figure(figsize=(20,3))
sns.countplot(y="marital-status", data=dataset_raw);

dataset_raw.loc[dataset_raw['marital-status'] == 'Never-married', 'marital-status'] = 'Never-Married'
dataset_raw.loc[dataset_raw['marital-status'] == 'Married-AF-spouse', 'marital-status'] = 'Married'
dataset_raw.loc[dataset_raw['marital-status'] == 'Married-civ-spouse', 'marital-status'] = 'Married'
dataset_raw.loc[dataset_raw['marital-status'] == 'Married-spouse-absent', 'marital-status'] = 'Not-Married'
dataset_raw.loc[dataset_raw['marital-status'] == 'Separated', 'marital-status'] = 'Separated'
dataset_raw.loc[dataset_raw['marital-status'] == 'Divorced', 'marital-status'] = 'Separated'
dataset_raw.loc[dataset_raw['marital-status'] == 'Widowed', 'marital-status'] = 'Widowed'

dataset_bin['marital-status'] = dataset_raw['marital-status']
dataset_con['marital-status'] = dataset_raw['marital-status']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
sns.countplot(y="marital-status", data=dataset_bin);

# Let's use the Pandas cut function to bin the data into equal-width buckets
dataset_bin['fnlwgt'] = pd.cut(dataset_raw['fnlwgt'], 10)
dataset_con['fnlwgt'] = dataset_raw['fnlwgt']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
sns.countplot(y="fnlwgt", data=dataset_bin);

# Let's use the Pandas cut function to bin the data into equal-width buckets
dataset_bin['education-num'] = pd.cut(dataset_raw['education-num'], 10)
dataset_con['education-num'] = dataset_raw['education-num']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
sns.countplot(y="education-num", data=dataset_bin);

# Let's use the Pandas cut function to bin the data into equal-width buckets
dataset_bin['hours-per-week'] = pd.cut(dataset_raw['hours-per-week'], 10)
dataset_con['hours-per-week'] = dataset_raw['hours-per-week']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
plt.subplot(1, 2, 1)
sns.countplot(y="hours-per-week", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con['hours-per-week']);

# Let's use the Pandas cut function to bin the data into equal-width buckets
dataset_bin['capital-gain'] = pd.cut(dataset_raw['capital-gain'], 5)
dataset_con['capital-gain'] = dataset_raw['capital-gain']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
plt.subplot(1, 2, 1)
sns.countplot(y="capital-gain", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con['capital-gain']);

# Let's use the Pandas cut function to bin the data into equal-width buckets
dataset_bin['capital-loss'] = pd.cut(dataset_raw['capital-loss'], 5)
dataset_con['capital-loss'] = dataset_raw['capital-loss']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,3))
plt.subplot(1, 2, 1)
sns.countplot(y="capital-loss", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con['capital-loss']);

# Some features we'll consider to be in good enough shape to pass through unchanged


dataset_con['sex'] = dataset_bin['sex'] = dataset_raw['sex']
dataset_con['race'] = dataset_bin['race'] = dataset_raw['race']
dataset_con['relationship'] = dataset_bin['relationship'] = dataset_raw['relationship']

# Plot a count of the categories from each categorical feature, split by our prediction class: salary (predclass).
def plot_bivariate_bar(dataset, hue, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    # dataset = dataset.select_dtypes(include=[np.object])
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure(figsize=(width, height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None,
                        wspace=wspace, hspace=hspace)
    rows = math.ceil(float(dataset.shape[1]) / cols)
    for i, column in enumerate(dataset.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        if dataset.dtypes[column] == np.object:
            g = sns.countplot(y=column, hue=hue, data=dataset)
            substrings = [s.get_text()[:10] for s in g.get_yticklabels()]
            g.set(yticklabels=substrings)

plot_bivariate_bar(dataset_con, hue='predclass', cols=3, width=20, height=12, hspace=0.4, wspace=0.5)

# Effect of Marital Status and Education on Income, across Marital Status.
plt.style.use('seaborn-whitegrid')
g = sns.FacetGrid(dataset_con, col='marital-status', size=4, aspect=.7)
g = g.map(sns.boxplot, 'predclass', 'education-num')

# Impact of Sex, Education, Hours-per-week (HPW) and Age on Income.
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,4))
plt.subplot(1, 3, 1)
sns.violinplot(x='sex', y='education-num', hue='predclass', data=dataset_con, split=True, scale='count');

plt.subplot(1, 3, 2)
sns.violinplot(x='sex', y='hours-per-week', hue='predclass', data=dataset_con, split=True, scale='count');

plt.subplot(1, 3, 3)
sns.violinplot(x='sex', y='age', hue='predclass', data=dataset_con, split=True, scale='count');

# Interaction between pairs of features.


sns.pairplot(dataset_con[['age', 'education-num', 'hours-per-week', 'predclass',
                          'capital-gain', 'capital-loss']],
             hue="predclass",
             diag_kind="hist",  # "histogram" is not a valid diag_kind; seaborn expects "hist" or "kde"
             size=4);

# Crossing Numerical Features


dataset_con['age-hours'] = dataset_con['age'] * dataset_con['hours-per-week']

dataset_bin['age-hours'] = pd.cut(dataset_con['age-hours'], 10)


dataset_con['age-hours'] = dataset_con['age-hours']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
plt.subplot(1, 2, 1)
sns.countplot(y="age-hours", data=dataset_bin);
plt.subplot(1, 2, 2)
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 1]['age-hours'], kde_kws={"label": ">$50K"});
sns.distplot(dataset_con.loc[dataset_con['predclass'] == 0]['age-hours'], kde_kws={"label": "<$50K"});

# Crossing Categorical Features


dataset_bin['sex-marital'] = dataset_con['sex-marital'] = dataset_con['sex'] + dataset_con['marital-status']

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(20,5))
sns.countplot(y="sex-marital", data=dataset_bin);
# One-hot encode all labels before Machine Learning
one_hot_cols = dataset_bin.columns.tolist()
one_hot_cols.remove('predclass')
dataset_bin_enc = pd.get_dummies(dataset_bin, columns=one_hot_cols)

dataset_bin_enc.head()

dataset_con['workclass'].fillna(dataset_con['workclass'].mode()[0], inplace=True)

dataset_con["workclass"].unique()

dataset_con["occupation"].unique()

feature = 'occupation'
dataset_con[feature].fillna(dataset_con[feature].mode()[0], inplace=True)
dataset_con[feature].unique()

dataset_con["native-country"].unique()

feature = 'native-country'
dataset_con[feature].fillna(dataset_con[feature].mode()[0], inplace=True)
dataset_con[feature].unique()

# Label Encode all labels


dataset_con_enc = dataset_con.apply(LabelEncoder().fit_transform)

dataset_con_enc.head()

# Create a correlation plot of both datasets.


plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(25,10))

plt.subplot(1, 2, 1)
# Generate a mask for the upper triangle
mask = np.zeros_like(dataset_bin_enc.corr(), dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(dataset_bin_enc.corr(),
vmin=-1, vmax=1,
square=True,
cmap=sns.color_palette("RdBu_r", 100),
mask=mask,
linewidths=.5);

plt.subplot(1, 2, 2)
mask = np.zeros_like(dataset_con_enc.corr(), dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(dataset_con_enc.corr(),
vmin=-1, vmax=1,
square=True,
cmap=sns.color_palette("RdBu_r", 100),
mask=mask,
linewidths=.5);

# Using Random Forest to gain an insight into Feature Importance


clf = RandomForestClassifier()
clf.fit(dataset_con_enc.drop('predclass', axis=1), dataset_con_enc['predclass'])

plt.style.use('seaborn-whitegrid')
importance = clf.feature_importances_
importance = pd.DataFrame(importance, index=dataset_con_enc.drop('predclass',
axis=1).columns, columns=["Importance"])
importance.sort_values(by='Importance', ascending=True).plot(kind='barh',
figsize=(20,len(importance)/2));

# Calculating PCA for both datasets, and graphing the Variance for each feature, per dataset
std_scale = preprocessing.StandardScaler().fit(dataset_bin_enc.drop('predclass', axis=1))
X = std_scale.transform(dataset_bin_enc.drop('predclass', axis=1))
pca1 = PCA(n_components=len(dataset_bin_enc.columns)-1)
fit1 = pca1.fit(X)

std_scale = preprocessing.StandardScaler().fit(dataset_con_enc.drop('predclass', axis=1))


X = std_scale.transform(dataset_con_enc.drop('predclass', axis=1))
pca2 = PCA(n_components=len(dataset_con_enc.columns)-2)
fit2 = pca2.fit(X)

# Graphing the variance per feature


plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(25,7))

plt.subplot(1, 2, 1)
plt.xlabel('PCA Feature')
plt.ylabel('Variance')
plt.title('PCA for Discretised Dataset')
plt.bar(range(0, fit1.explained_variance_ratio_.size), fit1.explained_variance_ratio_);

plt.subplot(1, 2, 2)
plt.xlabel('PCA Feature')
plt.ylabel('Variance')
plt.title('PCA for Continuous Dataset')
plt.bar(range(0, fit2.explained_variance_ratio_.size), fit2.explained_variance_ratio_);

# from mpl_toolkits.mplot3d import Axes3D

# PCA's components graphed in 2D and 3D


# Apply Scaling
std_scale = preprocessing.StandardScaler().fit(dataset_con_enc.drop('predclass', axis=1))
X = std_scale.transform(dataset_con_enc.drop('predclass', axis=1))
y = dataset_con_enc['predclass']

# Formatting
target_names = [0,1]
colors = ['navy','darkorange']
lw = 2
alpha = 0.3
# 2 Components PCA
plt.style.use('seaborn-whitegrid')
plt.figure(2, figsize=(20, 8))

plt.subplot(1, 2, 1)
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)
for color, i, target_name in zip(colors, [0, 1], target_names):
plt.scatter(X_r[y == i, 0], X_r[y == i, 1],
color=color,
alpha=alpha,
lw=lw,
label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('First two PCA directions');

# 3 Components PCA
# ax = plt.subplot(1, 2, 2, projection='3d')

# pca = PCA(n_components=3)
# X_reduced = pca.fit(X).transform(X)
# for color, i, target_name in zip(colors, [0, 1], target_names):
# ax.scatter(X_reduced[y == i, 0], X_reduced[y == i, 1], X_reduced[y == i, 2],
# color=color,
# alpha=alpha,
# lw=lw,
# label=target_name)
# plt.legend(loc='best', shadow=False, scatterpoints=1)
# ax.set_title("First three PCA directions")
# ax.set_xlabel("1st eigenvector")
# ax.set_ylabel("2nd eigenvector")
# ax.set_zlabel("3rd eigenvector")

# rotate the axes


# ax.view_init(30, 10)

# Calculating RFE for the non-discretised dataset, and graphing the Importance for each feature, per dataset
selector1 = RFECV(LogisticRegression(), step=1, cv=5, n_jobs=-1)
selector1 = selector1.fit(dataset_con_enc.drop('predclass', axis=1).values,
                          dataset_con_enc['predclass'].values)
print("Feature Ranking For Non-Discretised: %s" % selector1.ranking_)
print("Optimal number of features : %d" % selector1.n_features_)
# Plot number of features VS. cross-validation scores
plt.style.use('seaborn-whitegrid')
plt.figure(figsize=(20,5))
plt.xlabel("Number of features selected - Non-Discretised")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(selector1.grid_scores_) + 1), selector1.grid_scores_);

# Feature space could be subsetted like so:
dataset_con_enc = dataset_con_enc[dataset_con_enc.columns[np.insert(selector1.support_, 0, True)]]

# OPTIONS:
# - dataset_bin_enc
# - dataset_con_enc

# Change the dataset to test how the algorithms would perform under a differently encoded dataset.

selected_dataset = dataset_con_enc

selected_dataset.head(2)

# Splitting the Training and Test data sets


train = selected_dataset.loc[0:32560,:]
test = selected_dataset.loc[32560:,:]

# Given missing fields are a small percentage of the overall dataset,
# we have chosen to delete them.
train = train.dropna(axis=0)
test = test.dropna(axis=0)

X_train_w_label = train
X_train = train.drop(['predclass'], axis=1)
y_train = train['predclass'].astype('int64')
X_test = test.drop(['predclass'], axis=1)
y_test = test['predclass'].astype('int64')

X_train.shape

X_train.head()

y_train.head()

# Setting a random seed will guarantee we get the same results
# every time we run our training and testing.
random.seed(1)

# Calculate the FPR and TPR for all thresholds of the classification
def plot_roc_curve(y_test, preds):
    fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
    roc_auc = metrics.auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

# Function that runs the requested algorithm and returns the accuracy metrics
def fit_ml_algo(algo, X_train, y_train, X_test, cv):
    # One pass
    model = algo.fit(X_train, y_train)
    test_pred = model.predict(X_test)
    if (isinstance(algo, (LogisticRegression,
                          KNeighborsClassifier,
                          GaussianNB,
                          DecisionTreeClassifier,
                          RandomForestClassifier,
                          GradientBoostingClassifier))):
        probs = model.predict_proba(X_test)[:, 1]
    else:
        probs = "Not Available"
    acc = round(model.score(X_test, y_test) * 100, 2)
    # CV
    train_pred = model_selection.cross_val_predict(algo,
                                                   X_train,
                                                   y_train,
                                                   cv=cv,
                                                   n_jobs=-1)
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, 2)
    return train_pred, test_pred, acc, acc_cv, probs

# Logistic Regression - Random Search for Hyperparameters

# Utility function to report best scores


def report(results, n_top=5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# Specify parameters and distributions to sample from


param_dist = {'penalty': ['l2', 'l1'],
'class_weight': [None, 'balanced'],
'C': np.logspace(-20, 20, 10000),
'intercept_scaling': np.logspace(-20, 20, 10000)}

# Run Randomized Search


n_iter_search = 10
lrc = LogisticRegression()
random_search = RandomizedSearchCV(lrc,
n_jobs=-1,
param_distributions=param_dist,
n_iter=n_iter_search)

start = time.time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
" parameter settings." % ((time.time() - start), n_iter_search))
report(random_search.cv_results_)

# Logistic Regression
start_time = time.time()
train_pred_log, test_pred_log, acc_log, acc_cv_log, probs_log = fit_ml_algo(
    LogisticRegression(n_jobs=-1), X_train, y_train, X_test, 10)
log_time = (time.time() - start_time)
print("Accuracy: %s" % acc_log)
print("Accuracy CV 10-Fold: %s" % acc_cv_log)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))

print(metrics.classification_report(y_train, train_pred_log))

print(metrics.classification_report(y_test, test_pred_log))

plot_roc_curve(y_test, probs_log)

# k-Nearest Neighbors
start_time = time.time()
train_pred_knn, test_pred_knn, acc_knn, acc_cv_knn, probs_knn = fit_ml_algo(
    KNeighborsClassifier(n_neighbors=3, n_jobs=-1), X_train, y_train, X_test, 10)
knn_time = (time.time() - start_time)
print("Accuracy: %s" % acc_knn)
print("Accuracy CV 10-Fold: %s" % acc_cv_knn)
print("Running Time: %s" % datetime.timedelta(seconds=knn_time))
print(metrics.classification_report(y_train, train_pred_knn))

print(metrics.classification_report(y_test, test_pred_knn))

plot_roc_curve(y_test, probs_knn)

# Gaussian Naive Bayes


start_time = time.time()
train_pred_gaussian, test_pred_gaussian, acc_gaussian, acc_cv_gaussian, probs_gau = fit_ml_algo(
    GaussianNB(), X_train, y_train, X_test, 10)
gaussian_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gaussian)
print("Accuracy CV 10-Fold: %s" % acc_cv_gaussian)
print("Running Time: %s" % datetime.timedelta(seconds=gaussian_time))

print(metrics.classification_report(y_train, train_pred_gaussian))

print(metrics.classification_report(y_test, test_pred_gaussian))

plot_roc_curve(y_test, probs_gau)

# Linear SVC
start_time = time.time()
train_pred_svc, test_pred_svc, acc_linear_svc, acc_cv_linear_svc, _ = fit_ml_algo(
    LinearSVC(), X_train, y_train, X_test, 10)
linear_svc_time = (time.time() - start_time)
print("Accuracy: %s" % acc_linear_svc)
print("Accuracy CV 10-Fold: %s" % acc_cv_linear_svc)
print("Running Time: %s" % datetime.timedelta(seconds=linear_svc_time))

print(metrics.classification_report(y_train, train_pred_svc))

print(metrics.classification_report(y_test, test_pred_svc))

# Stochastic Gradient Descent


start_time = time.time()
train_pred_sgd, test_pred_sgd, acc_sgd, acc_cv_sgd, _ = fit_ml_algo(
    SGDClassifier(n_jobs=-1), X_train, y_train, X_test, 10)
sgd_time = (time.time() - start_time)
print("Accuracy: %s" % acc_sgd)
print("Accuracy CV 10-Fold: %s" % acc_cv_sgd)
print("Running Time: %s" % datetime.timedelta(seconds=sgd_time))

print(metrics.classification_report(y_train, train_pred_sgd))

print(metrics.classification_report(y_test, test_pred_sgd))

# Decision Tree Classifier


start_time = time.time()
train_pred_dt, test_pred_dt, acc_dt, acc_cv_dt, probs_dt = fit_ml_algo(
    DecisionTreeClassifier(), X_train, y_train, X_test, 10)
dt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_dt)
print("Accuracy CV 10-Fold: %s" % acc_cv_dt)
print("Running Time: %s" % datetime.timedelta(seconds=dt_time))

print(metrics.classification_report(y_train, train_pred_dt))

print(metrics.classification_report(y_test, test_pred_dt))

plot_roc_curve(y_test, probs_dt)

# Random Forest Classifier - Random Search for Hyperparameters

# Utility function to report best scores


def report(results, n_top=5):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# Specify parameters and distributions to sample from


param_dist = {"max_depth": [10, None],
"max_features": sp_randint(1, 11),
"min_samples_split": sp_randint(2, 20),
"min_samples_leaf": sp_randint(1, 11),
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
# Run Randomized Search
n_iter_search = 10
rfc = RandomForestClassifier(n_estimators=10)
random_search = RandomizedSearchCV(rfc,
n_jobs = -1,
param_distributions=param_dist,
n_iter=n_iter_search)

start = time.time()
random_search.fit(X_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
" parameter settings." % ((time.time() - start), n_iter_search))
report(random_search.cv_results_)

# Random Forest Classifier


start_time = time.time()
rfc = RandomForestClassifier(n_estimators=10,
                             min_samples_leaf=2,
                             min_samples_split=17,
                             criterion='gini',
                             max_features=8)
train_pred_rf, test_pred_rf, acc_rf, acc_cv_rf, probs_rf = fit_ml_algo(
    rfc, X_train, y_train, X_test, 10)
rf_time = (time.time() - start_time)
print("Accuracy: %s" % acc_rf)
print("Accuracy CV 10-Fold: %s" % acc_cv_rf)
print("Running Time: %s" % datetime.timedelta(seconds=rf_time))

print(metrics.classification_report(y_train, train_pred_rf) )

print(metrics.classification_report(y_test, test_pred_rf))

plot_roc_curve(y_test, probs_rf)

# Gradient Boosting Trees


start_time = time.time()
train_pred_gbt, test_pred_gbt, acc_gbt, acc_cv_gbt, probs_gbt = fit_ml_algo(
    GradientBoostingClassifier(), X_train, y_train, X_test, 10)
gbt_time = (time.time() - start_time)
print("Accuracy: %s" % acc_gbt)
print("Accuracy CV 10-Fold: %s" % acc_cv_gbt)
print("Running Time: %s" % datetime.timedelta(seconds=gbt_time))
print(metrics.classification_report(y_train, train_pred_gbt))

print(metrics.classification_report(y_test, test_pred_gbt))

plot_roc_curve(y_test, probs_gbt)

models = pd.DataFrame({
'Model': ['KNN', 'Logistic Regression',
'Random Forest', 'Naive Bayes',
'Stochastic Gradient Descent', 'Linear SVC',
'Decision Tree', 'Gradient Boosting Trees'],
'Score': [
acc_knn,
acc_log,
acc_rf,
acc_gaussian,
acc_sgd,
acc_linear_svc,
acc_dt,
acc_gbt
]})
models.sort_values(by='Score', ascending=False)

models = pd.DataFrame({
'Model': ['KNN', 'Logistic Regression',
'Random Forest', 'Naive Bayes',
'Stochastic Gradient Descent', 'Linear SVC',
'Decision Tree', 'Gradient Boosting Trees'],
'Score': [
acc_cv_knn,
acc_cv_log,
acc_cv_rf,
acc_cv_gaussian,
acc_cv_sgd,
acc_cv_linear_svc,
acc_cv_dt,
acc_cv_gbt
]})
models.sort_values(by='Score', ascending=False)

plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(10,10))

models = [
'KNN',
'Logistic Regression',
'Random Forest',
'Naive Bayes',
'Decision Tree',
'Gradient Boosting Trees'
]
probs = [
probs_knn,
probs_log,
probs_rf,
probs_gau,
probs_dt,
probs_gbt
]
colors = [
'blue',
'green',
'red',
'cyan',
'magenta',
'yellow',
]

plt.title('Receiver Operating Characteristic')


plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

def plot_roc_curves(y_test, prob, model):
    # Note: relies on the loop variable i (defined below) to pick the curve colour.
    fpr, tpr, threshold = metrics.roc_curve(y_test, prob)
    roc_auc = metrics.auc(fpr, tpr)
    plt.plot(fpr, tpr, label=model + ' AUC = %0.2f' % roc_auc, color=colors[i])
    plt.legend(loc='lower right')

for i, model in list(enumerate(models)):
    plot_roc_curves(y_test, probs[i], models[i])

plt.show()
