Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                

Titanic Classification

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 7

import pandas as pd

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Load the Titanic passenger table from a CSV file in the working directory.
df = pd.read_csv("titanic.csv")

# Print a summary of the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

# Preview the first five rows.
df.head()

PassengerId Survived Pclass \


0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1
2                             Heikkinen, Miss. Laina  female  26.0      0
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1
4                           Allen, Mr. William Henry    male  35.0      0

Parch Ticket Fare Cabin Embarked


0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S

# Count missing values per column.
df.isnull().sum()

PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64

# Box plot of Fare.
# NOTE(review): the null counts reported for this dataset show Fare has no
# missing values, so fillna(0) leaves the data unchanged here.
plt.boxplot(df[['Fare']].fillna(0))

{'whiskers': [<matplotlib.lines.Line2D at 0x7cc99da9aad0>,


<matplotlib.lines.Line2D at 0x7cc99da9ad70>],
'caps': [<matplotlib.lines.Line2D at 0x7cc99da9b010>,
<matplotlib.lines.Line2D at 0x7cc99da9b2b0>],
'boxes': [<matplotlib.lines.Line2D at 0x7cc99da9a830>],
'medians': [<matplotlib.lines.Line2D at 0x7cc99da9b550>],
'fliers': [<matplotlib.lines.Line2D at 0x7cc99da9b7f0>],
'means': []}
# Box plot of Age, with missing ages replaced by the column median.
# The fill applies only to the plotted copy; df is not modified here.
plt.boxplot(df[['Age']].fillna(df['Age'].median()))

{'whiskers': [<matplotlib.lines.Line2D at 0x7cc99db2d510>,


<matplotlib.lines.Line2D at 0x7cc99db2d7b0>],
'caps': [<matplotlib.lines.Line2D at 0x7cc99db2da50>,
<matplotlib.lines.Line2D at 0x7cc99db2dcf0>],
'boxes': [<matplotlib.lines.Line2D at 0x7cc99db2d270>],
'medians': [<matplotlib.lines.Line2D at 0x7cc99db2df90>],
'fliers': [<matplotlib.lines.Line2D at 0x7cc99db2e230>],
'means': []}
# Heatmap of the pairwise correlation matrix of the numeric columns
# Age, Fare, Pclass and Survived.
sns.heatmap(df[['Age','Fare','Pclass','Survived']].corr())

<ipython-input-20-9d1bb16c3506>:1: FutureWarning: The default value of


numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.
sns.heatmap(df[['Age','Fare','Sex','Pclass','Survived']].corr())

<Axes: >
# Pair plot of Age, Fare and Pclass, coloured by the Survived label.
sns.pairplot(df[['Age','Fare','Pclass','Survived']],hue='Survived')

<seaborn.axisgrid.PairGrid at 0x7cc9a265d6f0>
# Replace missing Age values in df with the column median.
# NOTE(review): the median is computed over all rows before the train/test
# split below, so held-out rows contribute to the imputed value.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split -- and therefore every score computed from it -- reproducible
# across runs instead of changing on each execution.
train, test = train_test_split(
    df[['Age', 'Fare', 'Pclass', 'Survived']],
    test_size=0.3,
    random_state=42,
)

# Random forest with default hyperparameters, trained on Age, Fare and Pclass.
rf = RandomForestClassifier()

# Pass the target as a 1-D Series (train['Survived']) rather than a
# one-column DataFrame (train[['Survived']]): scikit-learn expects y of
# shape (n_samples,) and emits a DataConversionWarning for a column vector.
rf.fit(train[['Age', 'Fare', 'Pclass']], train['Survived'])

<ipython-input-37-e5cca9f45d48>:1: DataConversionWarning: A column-


vector y was passed when a 1d array was expected. Please change the
shape of y to (n_samples,), for example using ravel().
rf.fit(train[['Age','Fare','Pclass']], train[['Survived']])

RandomForestClassifier()
# Accuracy of the forest on the rows it was trained on, rounded to two
# decimals for display.
train_accuracy = rf.score(
    train[['Age', 'Fare', 'Pclass']],
    train[['Survived']],
)
print("Train Accuracy: ", round(train_accuracy, 2))

Train Accuracy: 0.96

# Predict survival for the held-out rows.
result = rf.predict(test[['Age', 'Fare', 'Pclass']])

# Fraction of held-out rows whose predicted label equals the true label.
(result == test['Survived'].to_numpy()).mean()

0.6753731343283582

# Support vector classifier with a linear kernel, trained on the same
# three features as the random forest.
svm = SVC(kernel='linear')

# Pass the target as a 1-D Series rather than a one-column DataFrame so
# scikit-learn does not emit a DataConversionWarning about a column-vector y.
svm.fit(train[['Age', 'Fare', 'Pclass']], train['Survived'])

/usr/local/lib/python3.10/dist-packages/sklearn/utils/
validation.py:1143: DataConversionWarning: A column-vector y was
passed when a 1d array was expected. Please change the shape of y to
(n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)

SVC(kernel='linear')

# Print the linear SVC's accuracy on the held-out test rows.
print(svm.score(test[['Age','Fare','Pclass']], test[['Survived']]))

0.6940298507462687

You might also like