Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
0% found this document useful (0 votes)
3 views

Python Code Library

The document provides a comprehensive guide on data analysis and visualization using Python libraries such as Pandas, Matplotlib, and Seaborn. It covers methods for reading data from CSV files, performing exploratory data analysis, and creating various visualizations like histograms, boxplots, and scatter plots. Additionally, it includes sections on machine learning techniques such as linear regression, logistic regression, KNN, and decision trees, along with data preprocessing methods like standardization and normalization.

Uploaded by

Daniel Wu
Copyright
© © All Rights Reserved
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
3 views

Python Code Library

The document provides a comprehensive guide on data analysis and visualization using Python libraries such as Pandas, Matplotlib, and Seaborn. It covers methods for reading data from CSV files, performing exploratory data analysis, and creating various visualizations like histograms, boxplots, and scatter plots. Additionally, it includes sections on machine learning techniques such as linear regression, logistic regression, KNN, and decision trees, along with data preprocessing methods like standardization and normalization.

Uploaded by

Daniel Wu
Copyright
© © All Rights Reserved
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 8

----------------------------------Coding

Library---------------------------------------------
##Import popular library for EDA/Visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Be careful using warnings


import warnings
warnings.filterwarnings('ignore')

##Read file/get the data (csv/excel)


-----------------------------------------------------------------------------------
-----------
##Method 1: Read csv file
df = pd.read_csv('titanic_train.csv')

##Actual file path


df = pd.read_csv('D:\\For Dan\\Learning\\Udemy\\Python\\P4-Demographic-Data.csv')

##Method 2: Change Working Directory


import os

# Show the directory that relative paths are currently resolved against.
print(os.getcwd())
# Example output: C:/Users/wooju/Desktop/Python Programing

# Switch to the data folder so the CSV can be opened by its bare file name.
os.chdir('D:\\For Dan\\Learning\\Udemy\\Python')
df = pd.read_csv('P4-Demographic-Data.csv')

df.columns

##Column rename
stats.columns = ['CountryName', 'CountryCode', 'BirthRate',
'InternetUsers','IncomeGroup']

## [column name] to get the unique items within the column


df.IncomeGroup.unique()

df.info()
df.describe()
##df.describe().transpose()

##Passing the filter with more than 1 conditions ( and & or |)


df[(df.BirthRate >= 40) & (df.InternetUsers < 2)]
df[df.CountryName == 'Malta']

movies.Genre = movies.Genre.astype('category')
----------------------------------Visualization
---------------------------------------------

import seaborn as sns

sns.set_style('darkgrid')
sns.set_style('whitegrid')

plt.rcParams['figure.figsize'] = 8,4
plt.figure(figsize=(8,4))

##Histogram/Distribution
sns.set()
vis1 = sns.distplot(stats['InternetUsers'], hist_kws={"edgecolor":"Black"},
bins=20)
plt.show()

plt.hist(movies.AudienceRatings, bins = 15)


#With filter
h1 = plt.hist(movies[movies.Genre == 'Drama'].BudgetMillion)

##Stacked column chart


# One legend label per genre category, and the matching budget series for
# each of them, in the same order.
listlabel = list(movies.Genre.cat.categories)
listgen = [movies[movies.Genre == gen].BudgetMillion for gen in listlabel]

sns.set_style('darkgrid')
fig, ax = plt.subplots()
fig.set_size_inches(11.7,8.27)
# Plot the per-genre series built above (the original referenced an undefined
# name `list1` here); stacked=True piles the genres on top of each other.
h2 = plt.hist(listgen, bins = 20, stacked = True, rwidth = 1, label = listlabel)

#
plt.title('Movie Budget Distribution', fontsize=30)
plt.ylabel('Number of Movies',fontsize=15)
plt.xlabel('Budget',fontsize=15)
plt.yticks(fontsize=15)
plt.xticks(fontsize=15)
plt.legend(frameon = True, fancybox = True, shadow = True, fontsize=15)

plt.show()

##Subplot
# Two side-by-side panels created with shared x and y axes.
f, axes = plt.subplots(1,2,figsize = (12,6), sharex = True, sharey=True)
# x and y are passed by keyword: current seaborn only accepts `data`
# positionally, so the positional (x, y) form raises a TypeError.
k3 = sns.kdeplot(x = movies.BudgetMillion, y = movies.AudienceRatings,
                 cmap = 'Greens', ax = axes[0])
k4 = sns.kdeplot(x = movies.BudgetMillion, y = movies.CriticRatings,
                 ax = axes[1])
k3.set(xlim = (-20,160)) #custom x-axis range
plt.show()

##violin plot
w = sns.violinplot(data=movies, x = 'Genre', y = 'CriticRatings')

##Boxplot
sns.set()
vis2 = sns.boxplot(data = stats, x = 'IncomeGroup', y = 'BirthRate')

##Linear Model
# Scatter only (fit_reg=False), coloured by income group. `height` is the
# current name of the facet-size argument that older seaborn called `size`.
vis3 = sns.lmplot(data = stats, x = 'InternetUsers', y = 'BirthRate',
                  fit_reg = False, hue = 'IncomeGroup', height = 10, aspect=1)

##Jointplot
j = sns.jointplot(data = movies, x = 'CriticRatings', y = 'AudienceRatings')
j = sns.jointplot(data = movies, x = 'CriticRatings', y = 'AudienceRatings',
kind = 'kde')

##FacetGrid
# Controlling Axes and Adding Diagonals
# One panel per (Genre, YearRelease) pair, coloured by genre.
g = sns.FacetGrid(movies, row='Genre', col='YearRelease', hue='Genre')
# Marker styling for the scatter points; forwarded to plt.scatter via map().
kws = dict(s=50, edgecolor='black', linewidth=0.5)
g = g.map(plt.scatter, 'CriticRatings', 'AudienceRatings', **kws)
g.set(xlim=(0,100), ylim=(20,100))
# Draw the same dashed grey reference line on every panel.
for ax in g.axes.flat:
    ax.plot((0,100),(20,100), c='grey', ls='--')
g.add_legend()

plt.show()
-----------------------------------------------------------------------------------
-----------

#sns.set_style('darkgrid') #white, whitegrid, dark, darkgrid


# 2x2 dashboard on a black background: three bivariate KDE panels and one
# violin plot. Each KDE panel is drawn twice on the same axes: a filled
# layer first, then a contour-line layer on top with a second colormap.
#
# kdeplot notes:
#   * x / y are passed by keyword; current seaborn only accepts `data`
#     positionally.
#   * `fill` / `thresh` are the current spellings of `shade` /
#     `shade_lowest`. thresh=0 corresponds to shade_lowest=True, and
#     thresh=0.05 (seaborn's default) to shade_lowest=False.
#     The original `Shade_lowest` (capital S) was not a valid keyword.
sns.set_style('dark', {'axes.facecolor':'Black'})
f, axes = plt.subplots(2,2, figsize = (15,15))

# Top-left: budget vs audience rating.
k1 = sns.kdeplot(x = movies.BudgetMillion, y = movies.AudienceRatings,
                 fill = True, thresh = 0, cmap='inferno',
                 ax = axes[0,0])
k1b = sns.kdeplot(x = movies.BudgetMillion, y = movies.AudienceRatings,
                  cmap = 'PuBu',
                  ax = axes[0,0])

# Top-right: budget vs critic rating.
k2 = sns.kdeplot(x = movies.BudgetMillion, y = movies.CriticRatings,
                 fill = True, thresh = 0, cmap='inferno',
                 ax = axes[0,1])
k2b = sns.kdeplot(x = movies.BudgetMillion, y = movies.CriticRatings,
                  cmap = 'cool',
                  ax = axes[0,1])

# Bottom-left: budget distribution per release year.
v = sns.violinplot(data=movies, x = 'YearRelease', y = 'BudgetMillion',
                   palette='YlOrRd',
                   ax = axes[1,0])

# Bottom-right: critic rating vs audience rating.
k4 = sns.kdeplot(x = movies.CriticRatings, y = movies.AudienceRatings,
                 fill = True, thresh = 0.05, cmap = 'Blues_r',
                 ax = axes[1,1])
k4b = sns.kdeplot(x = movies.CriticRatings, y = movies.AudienceRatings,
                  cmap = 'gist_gray_r',
                  ax = axes[1,1])

# Same custom x-range on both budget panels.
k1.set(xlim = (-20,200))
k2.set(xlim = (-20,200))
plt.show()

def myplot(data, playerlist = Players):
    """Plot one dashed, marker-decorated line per player and show the figure.

    Parameters
    ----------
    data
        Indexed with ``Pdict[name]`` for each player to obtain the values to
        plot. Presumably a 2-D array with one row per player and one column
        per season; verify against the caller.
    playerlist
        Iterable of player names to draw. Every name must be a key of the
        colour and marker tables below and of the module-level ``Pdict``.
        Defaults to the module-level ``Players``.

    Relies on ``Pdict``, ``Seasons`` and ``Players`` being defined elsewhere
    in the notebook.
    """
    # Line colour per player.
    Col = {"KobeBryant":'Black',"JoeJohnson":'green',"LeBronJames":'red',
           "CarmeloAnthony":'y',"DwightHoward":'k',"ChrisBosh":'m',
           "ChrisPaul":'b',"KevinDurant":'k',"DerrickRose":'c',"DwayneWade":'m'}

    # Marker per player.
    # NOTE(review): ChrisBosh maps to '' (no marker drawn); confirm intended.
    Mkers = {"KobeBryant":"o","JoeJohnson":"D","LeBronJames":"^",
             "CarmeloAnthony":"*","DwightHoward":"v","ChrisBosh":'',
             "ChrisPaul":"p","KevinDurant":"D","DerrickRose":"H","DwayneWade":"^"}

    for name in playerlist:
        # `marker` must be lower-case: matplotlib property names are
        # case-sensitive, so the original `Marker=` is rejected.
        plt.plot(data[Pdict[name]], c=Col[name], ls = '--',
                 marker = Mkers[name], ms = 8, label = name)

    # Anchor the legend's upper-left corner at axes coordinates (1, 1).
    plt.legend(loc = 'upper left', bbox_to_anchor = (1,1))
    # Ticks 0..9 are labelled with Seasons (assumes ten entries; confirm).
    plt.xticks(list(range(0,10)), Seasons, rotation = 'horizontal')
    plt.show()
------------------------------------------Machine
Learning-------------------------------------

----------------LinearRegression
----------------LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression


from sklearn.linear_model import LogisticRegression

# Side-by-side templates for the two estimators; use whichever fits the task.
# X_train / X_test / y_train are expected to exist already; this snippet
# imports train_test_split above but does not call it.
lm = LinearRegression()
logmodel = LogisticRegression()

lm.fit(X_train,y_train)
logmodel.fit(X_train,y_train)

# NOTE: `predictions` is assigned twice, so after both lines run only the
# logistic-regression output is kept.
predictions = lm.predict(X_test)
predictions = logmodel.predict(X_test)

from sklearn.metrics import confusion_matrix


accuracy = confusion_matrix(y_test,predictions)
from sklearn.metrics import accuracy_score
acscore = accuracy_score(y_test,predictions)

#F1-Score??
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

---------------------KNN

from sklearn.preprocessing import StandardScaler

##Standardize
scaler = StandardScaler()
# Every column except the label; computed once and reused below.
features = df.drop('TARGET CLASS', axis=1)
scaler.fit(features)
scaled_features = scaler.transform(features)

# Label the scaled array with the feature frame's own column names, so the
# result is correct even when 'TARGET CLASS' is not the last column of df.
df_feat = pd.DataFrame(scaled_features, columns=features.columns)

from sklearn.model_selection import train_test_split


X = df_feat
y = df['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
random_state=101)

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)

knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix


print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

##Find the minimum K-value

# Misclassification rate on the test split for each K in 1..39.
error_rate = []

for i in range(1,40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    # Mean of the element-wise mismatch mask = fraction of wrong predictions.
    error_rate.append(np.mean(predictions != y_test))

plt.figure(figsize=(10,6))
plt.plot(range(1,40), error_rate, color='blue', linestyle='-', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs K-value')
plt.xlabel('K')
# The y-axis carries the error rate; the original label 'K-value' duplicated
# the x-axis meaning.
plt.ylabel('Error Rate')

knn = KNeighborsClassifier(n_neighbors=17)
knn.fit(X_train, y_train)
predictions = knn.predict(X_test)

print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))

--------------------------------------Decision
Tree-----------------------------------------

X = final_data.drop('not.fully.paid', axis=1)
y = final_data['not.fully.paid']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
random_state=101)
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

predictions = dtree.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix


print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))

from sklearn.ensemble import RandomForestClassifier


rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, rfc_pred))
print('\n')
print(classification_report(y_test, rfc_pred))

------------------------------Standardisation vs Max-Min
Normalization----------------------------------------------------

Unit/magnitude

Standardisation

#Import library
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_X = sc_X.fit_transform(df)
#Convert to table format - StandardScaler
sc_X = pd.DataFrame(data=sc_X, columns=["Age",
"Salary","Purchased","Country_France","Country_Germany", "Country_spain"])
sc_X

Max-Min Normalization

from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
scaler.fit(df)
scaled_features = scaler.transform(df)
#Convert to table format - MinMaxScaler
df_MinMax = pd.DataFrame(data=scaled_features, columns=["Age",
"Salary","Purchased","Country_France","Country_Germany", "Country_spain"])

You might also like