Titanic Classification

import pandas as pd

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Load the Titanic passenger table from a CSV file; the relative path is
# resolved against the current working directory.
df = pd.read_csv("titanic.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId Survived Pclass \

0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3

Name Sex Age

SibSp \
0 Braund, Mr. Owen Harris male 22.0
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0
2 Heikkinen, Miss. Laina female 26.0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0
4 Allen, Mr. William Henry male 35.0

Parch Ticket Fare Cabin Embarked

0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S


PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64


{'whiskers': [<matplotlib.lines.Line2D at 0x7cc99da9aad0>,

<matplotlib.lines.Line2D at 0x7cc99da9ad70>],
'caps': [<matplotlib.lines.Line2D at 0x7cc99da9b010>,
<matplotlib.lines.Line2D at 0x7cc99da9b2b0>],
'boxes': [<matplotlib.lines.Line2D at 0x7cc99da9a830>],
'medians': [<matplotlib.lines.Line2D at 0x7cc99da9b550>],
'fliers': [<matplotlib.lines.Line2D at 0x7cc99da9b7f0>],
'means': []}

{'whiskers': [<matplotlib.lines.Line2D at 0x7cc99db2d510>,

<matplotlib.lines.Line2D at 0x7cc99db2d7b0>],
'caps': [<matplotlib.lines.Line2D at 0x7cc99db2da50>,
<matplotlib.lines.Line2D at 0x7cc99db2dcf0>],
'boxes': [<matplotlib.lines.Line2D at 0x7cc99db2d270>],
'medians': [<matplotlib.lines.Line2D at 0x7cc99db2df90>],
'fliers': [<matplotlib.lines.Line2D at 0x7cc99db2e230>],
'means': []}

<ipython-input-20-9d1bb16c3506>:1: FutureWarning: The default value of
numeric_only in DataFrame.corr is deprecated. In a future version, it
will default to False. Select only valid columns or specify the value
of numeric_only to silence this warning.

<Axes: >

<seaborn.axisgrid.PairGrid at 0x7cc9a265d6f0>
# Replace missing ages with the median of the ages that are present.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

# Split the four modelling columns (three features plus the Survived label)
# into train and test partitions.
# NOTE(review): this call is cut off in the export — the remaining arguments
# (e.g. test_size / random_state) and the closing parenthesis are missing;
# restore them from the original notebook before running.
train, test = train_test_split(df[['Age','Fare','Pclass','Survived']],


# Random forest classifier with default hyperparameters.
rf = RandomForestClassifier()

# Fit on the three numeric features. The target is passed as a 1-D Series
# (train['Survived']) rather than a one-column DataFrame: scikit-learn expects
# y of shape (n_samples,) and otherwise emits a DataConversionWarning.
rf.fit(train[['Age','Fare','Pclass']], train['Survived'])

<ipython-input-37-e5cca9f45d48>:1: DataConversionWarning: A column-vector
y was passed when a 1d array was expected. Please change the
shape of y to (n_samples,), for example using ravel().
rf.fit(train[['Age','Fare','Pclass']], train[['Survived']])

# Report the random forest's accuracy on the training partition.
# NOTE(review): the original statement was cut off in the export (unclosed
# parentheses); the rounding precision of 2 is inferred from the recorded
# output "0.96" — confirm against the original notebook.
print("Train Accuracy: ",
      round(rf.score(train[['Age','Fare','Pclass']], train[['Survived']]), 2))

Train Accuracy: 0.96

# Predict labels for the test partition with the fitted random forest.
result = rf.predict(test[['Age','Fare','Pclass']])

# Fraction of test rows whose predicted label equals the true label.
np.equal(np.array(result), np.array(test['Survived'])).mean()


# Support vector classifier with a linear kernel.
svm = SVC(kernel='linear')

# Fit on the same three features. The target is passed as a 1-D Series
# (train['Survived']) rather than a one-column DataFrame: scikit-learn expects
# y of shape (n_samples,) and otherwise emits a DataConversionWarning.
svm.fit(train[['Age','Fare','Pclass']], train['Survived'])

validation.py:1143: DataConversionWarning: A column-vector y was
passed when a 1d array was expected. Please change the shape of y to
(n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)


# Score the fitted SVM on the test partition and print the result.
svm_features = test[['Age','Fare','Pclass']]
svm_target = test[['Survived']]
print(svm.score(svm_features, svm_target))


