Data Preprocessing
Data Preprocessing
import numpy as np
import pandas as pd
# Load the Adult census dataset.
# NOTE(review): the path is machine-specific; point it at your local copy of adult.csv.
df = pd.read_csv('/Users/nageshjadhav/Desktop/adult.csv')
df.head()
# Correlate the numeric columns only. The frame also holds object (string)
# columns, and pandas >= 2.0 raises on those instead of silently dropping
# them; selecting by dtype first works on every pandas version.
df.select_dtypes(include='number').corr()
capital-loss hours-per-week
age 0.056944 0.071558
fnlwgt -0.004366 -0.013519
educational-num 0.080972 0.143689
capital-gain -0.031441 0.082157
capital-loss 1.000000 0.054467
hours-per-week 0.054467 1.000000
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 48842 non-null int64
1 workclass 48842 non-null object
2 fnlwgt 48842 non-null int64
3 education 48842 non-null object
4 educational-num 48842 non-null int64
5 marital-status 48842 non-null object
6 occupation 48842 non-null object
7 relationship 48842 non-null object
8 race 48842 non-null object
9 gender 48842 non-null object
10 capital-gain 48842 non-null int64
11 capital-loss 48842 non-null int64
12 hours-per-week 48842 non-null int64
13 native-country 48842 non-null object
14 income 48842 non-null object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB
df.isna().sum()
age 0
workclass 0
fnlwgt 0
education 0
educational-num 0
marital-status 0
occupation 0
relationship 0
race 0
gender 0
capital-gain 0
capital-loss 0
hours-per-week 0
native-country 0
income 0
dtype: int64
df.isin(['?']).sum()
age 0
workclass 2799
fnlwgt 0
education 0
educational-num 0
marital-status 0
occupation 2809
relationship 0
race 0
gender 0
capital-gain 0
capital-loss 0
hours-per-week 0
native-country 857
income 0
dtype: int64
df.describe()
capital-loss hours-per-week
count 48842.000000 48842.000000
mean 87.502314 40.422382
std 403.004552 12.391444
min 0.000000 1.000000
25% 0.000000 40.000000
50% 0.000000 40.000000
75% 0.000000 45.000000
max 4356.000000 99.000000
df.duplicated().sum()
52
# Keep the first occurrence of every fully duplicated row, then verify that
# no duplicates are left before looking at the age distribution.
df = df.drop_duplicates(keep='first')
remaining_duplicates = df.duplicated().sum()
print(remaining_duplicates)
df['age'].value_counts()
36 1348
35 1336
33 1335
23 1325
31 1324
...
88 6
85 5
87 3
89 2
86 1
Name: age, Length: 74, dtype: int64
# Print the frequency table of every column, one column at a time.
for _, column in df.items():
    counts = column.value_counts()
    print(f'\n\n\nvalues of {counts}')
values of 36 1348
35 1336
33 1335
23 1325
31 1324
...
88 6
85 5
87 3
89 2
86 1
Name: age, Length: 74, dtype: int64
values of 203488 21
190290 19
120277 19
125892 18
126569 18
..
293579 1
114874 1
96279 1
509350 1
257302 1
Name: fnlwgt, Length: 28523, dtype: int64
values of 9 15770
10 10863
13 8013
14 2656
11 2060
7 1812
12 1601
6 1389
4 954
15 834
5 756
8 655
16 594
3 507
2 245
1 81
Name: educational-num, dtype: int64
values of Married-civ-spouse 22366
Never-married 16082
Divorced 6630
Separated 1530
Widowed 1518
Married-spouse-absent 627
Married-AF-spouse 37
Name: marital-status, dtype: int64
values of 0 44755
15024 513
7688 410
7298 364
99999 244
...
1111 1
7262 1
22040 1
1639 1
2387 1
Name: capital-gain, Length: 123, dtype: int64
values of 0 46508
1902 304
1977 253
1887 233
2415 72
...
2465 1
2080 1
155 1
1911 1
2201 1
Name: capital-loss, Length: 99, dtype: int64
values of 40 22773
50 4242
45 2715
60 2177
35 1934
...
69 1
87 1
94 1
82 1
79 1
Name: hours-per-week, Length: 96, dtype: int64
# '?' is the placeholder this dataset uses for unknown values (counted above
# with df.isin(['?'])). Convert it to NaN so that isna() and the imputers
# below treat those cells as missing.
df = df.replace('?',np.nan)
df.isna().sum()
age 0
workclass 2795
fnlwgt 0
education 0
educational-num 0
marital-status 0
occupation 2805
relationship 0
race 0
gender 0
capital-gain 0
capital-loss 0
hours-per-week 0
native-country 856
income 0
dtype: int64
[[ 2 3 7 30]
[ 9 4 6 1]
[ 8 15 2 40]
[20 10 2 6]]
[[0. 0. 1. 0.74358974]
[0.38888889 0.08333333 0.8 0. ]
[0.33333333 1. 0. 1. ]
[1. 0.58333333 0. 0.12820513]]
data.head()
College Salary
0 Texas 7730337.0
1 Marquette 6796117.0
2 Boston University NaN
3 Georgia State 1148640.0
4 NaN 5000000.0
data.isna().sum()
Name 1
Team 1
Number 1
Position 1
Age 1
Height 1
Weight 1
College 85
Salary 12
dtype: int64
data.head(10)
College Salary
0 Texas 7730337.0
1 Marquette 6796117.0
2 Boston University 6796117.0
3 Georgia State 1148640.0
4 NaN 5000000.0
5 NaN 12000000.0
6 LSU 1170960.0
7 Gonzaga 2165160.0
8 Louisville 1824360.0
9 Oklahoma State 3431040.0
Simple Imputer
SimpleImputer replaces the NaN values with a specified placeholder.
missing_values : The placeholder for the missing values that have to be imputed. By default it is NaN.
strategy : The statistic that will replace the NaN values in the dataset. The strategy
argument can take the values 'mean' (default), 'median', 'most_frequent' and 'constant'.
fill_value : The constant value given to the NaN data when the 'constant' strategy is used.
from sklearn.impute import SimpleImputer
# Numeric column: fill missing salaries with the mean of the observed ones.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# The double brackets select a one-column DataFrame (2-D input) rather than
# a 1-D Series.
data['Salary'] = imputer.fit_transform(data[['Salary']])
data.head()
College Salary
0 Texas 7730337.0
1 Marquette 6796117.0
2 Boston University 6796117.0
3 Georgia State 1148640.0
4 LSU 5000000.0
# Categorical column: a mean is undefined for strings, so fill missing
# colleges with the most frequent value instead.
imputer1 = SimpleImputer(strategy='most_frequent',
missing_values=np.nan)
# Double brackets again select a one-column DataFrame (2-D input).
data['College'] = imputer1.fit_transform(data[['College']])
data.head()
df['workclass'].unique()
df.isna().sum()
age 0
workclass 0
fnlwgt 0
education 0
educational-num 0
marital-status 0
occupation 0
relationship 0
race 0
gender 0
capital-gain 0
capital-loss 0
hours-per-week 0
native-country 0
income 0
dtype: int64
df
Outlier Detection
1. Using Boxplot
2. Using Scatter plot
3. Using Z score
4. Using Inter Quartile Range
# Using Boxplot
import seaborn as sns
# Pass the series as the keyword argument `x`: from seaborn 0.12 the only
# valid positional argument is `data`, so a positional series would be
# misinterpreted or rejected.
sns.boxplot(x=data['Weight'])
/Users/nageshjadhav/opt/anaconda3/envs/ISII_Lab/lib/python3.9/site-
packages/seaborn/_decorators.py:36: FutureWarning: Pass the following
variable as a keyword arg: x. From version 0.12, the only valid
positional argument will be `data`, and passing other arguments
without an explicit keyword will result in an error or
misinterpretation.
warnings.warn(
<AxesSubplot:xlabel='Weight'>
print(np.where(data['Weight']>290))
(array([405]),)
# Pass the series as the keyword argument `x`: from seaborn 0.12 the only
# valid positional argument is `data`, so a positional series would be
# misinterpreted or rejected.
sns.boxplot(x=df['age'])
/Users/nageshjadhav/opt/anaconda3/envs/ISII_Lab/lib/python3.9/site-
packages/seaborn/_decorators.py:36: FutureWarning: Pass the following
variable as a keyword arg: x. From version 0.12, the only valid
positional argument will be `data`, and passing other arguments
without an explicit keyword will result in an error or
misinterpretation.
warnings.warn(
<AxesSubplot:xlabel='age'>
print(df['age'].unique())
[25 38 28 44 18 34 29 63 24 55 65 36 26 58 48 43 20 37 40 72 45 22 23
54
32 46 56 17 39 52 21 42 33 30 47 41 19 69 50 31 59 49 51 27 57 61 64
79
73 53 77 80 62 35 68 66 75 60 67 71 70 90 81 74 78 82 83 85 76 84 89
88
87 86]
print(df.age[193])
79
20.0
# Inter-quartile-range rule: anything further than 1.5 * IQR outside the
# quartiles is treated as an outlier. The quartiles are computed here so
# that this cell does not depend on names defined elsewhere.
Q1 = df['age'].quantile(0.25)
Q3 = df['age'].quantile(0.75)
IQR = Q3 - Q1
lwr_bound = Q1-(1.5*IQR)
upr_bound = Q3+(1.5*IQR)
print(lwr_bound, upr_bound)
-2.0 78.0
# Collect every age that falls outside the [lwr_bound, upr_bound] fences,
# in row order (repeated ages are kept once per row).
outliers = [
    age for age in df['age']
    if age < lwr_bound or age > upr_bound
]
print(outliers)
print(len(outliers))
[79, 80, 90, 79, 80, 81, 82, 83, 81, 85, 80, 90, 81, 84, 81, 89, 81,
83, 81, 82, 80, 90, 81, 83, 80, 90, 90, 84, 80, 80, 80, 81, 90, 85,
90, 81, 81, 80, 80, 79, 81, 80, 88, 87, 90, 79, 83, 79, 80, 90, 79,
79, 81, 81, 90, 82, 90, 87, 81, 88, 80, 81, 80, 81, 90, 88, 89, 84,
80, 80, 83, 79, 81, 79, 90, 80, 81, 90, 88, 90, 90, 80, 90, 81, 82,
79, 81, 80, 83, 90, 90, 79, 81, 90, 80, 90, 90, 79, 79, 84, 90, 80,
90, 81, 83, 84, 81, 79, 85, 82, 79, 80, 90, 90, 90, 84, 80, 90, 90,
79, 84, 90, 79, 90, 90, 90, 82, 81, 90, 84, 79, 81, 82, 81, 80, 90,
80, 84, 82, 79, 90, 84, 90, 83, 79, 81, 80, 79, 80, 79, 80, 90, 90,
80, 90, 90, 81, 83, 82, 90, 90, 81, 80, 80, 90, 79, 80, 82, 85, 80,
79, 90, 81, 79, 80, 79, 81, 82, 88, 90, 82, 88, 84, 83, 79, 86, 90,
90, 82, 83, 81, 79, 90, 80, 81, 79, 84, 84, 79, 90, 80, 81, 81, 81,
90, 87, 90, 80, 80, 82, 90, 90, 85, 82, 81]
215
Handling Outliers
1. Trimming/Remove the outliers
2. Quantile based flooring and capping
3. Mean/Median imputation
# Median imputation: replace every outlying age with the median age.
median = np.median(df['age'])
print(median)
# Build the result in a single vectorised pass. Looping over `outliers` and
# calling np.where on the untouched df['age'] each time overwrites `c` on
# every iteration, so only the last outlier value would end up replaced.
# Using `median` instead of a hard-coded number keeps the replacement
# correct if the data changes; because the median is a float, `c` is a
# float array.
# NOTE: df itself is left unchanged; assign `c` back to df['age'] to apply.
c = np.where(df['age'].isin(outliers), median, df['age'])
print("New array: ",c)
print(c.shape)
37.0
New array: [25 38 28 ... 58 22 52]
(48790,)
df['income'].value_counts()
<=50K 37109
>50K 11681
Name: income, dtype: int64
# Pass the series as the keyword argument `x`: from seaborn 0.12 the only
# valid positional argument is `data`, so a positional series would be
# misinterpreted or rejected.
sns.countplot(x=df['income'])
/Users/nageshjadhav/opt/anaconda3/envs/ISII_Lab/lib/python3.9/site-
packages/seaborn/_decorators.py:36: FutureWarning: Pass the following
variable as a keyword arg: x. From version 0.12, the only valid
positional argument will be `data`, and passing other arguments
without an explicit keyword will result in an error or
misinterpretation.
warnings.warn(
<AxesSubplot:xlabel='income', ylabel='count'>
Handling Categorical Data
1. Replacing values
2. Encoding labels
3. One-Hot encoding
print(df['gender'].unique())
['Male' 'Female']
# Binary-encode gender with one explicit mapping. Chained replace() calls
# rely on pandas silently downcasting the object column to integers, which
# is deprecated; map() yields an integer column directly.
# NOTE: any value other than 'Male'/'Female' would become NaN; the column
# holds only these two values (see the unique() output above).
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})
df['income'].unique()
# Collapse the seven marital-status categories into four groups with one
# exact-match mapping. This replaces a chain of order-dependent substring
# replacements, two of which ('Separated' and 'Widowed') mapped a value to
# itself, and keeps each label on one line so the string literals are valid.
# Values not listed here (already grouped or missing) are left untouched.
marital_status_groups = {
    'Never-married': 'NotMarried',
    'Married-spouse-absent': 'NotMarried',
    'Married-AF-spouse': 'Married',
    'Married-civ-spouse': 'Married',
    'Divorced': 'Separated',
}
df['marital-status'] = df['marital-status'].replace(marital_status_groups)
# LabelEncoder is not imported anywhere earlier in this notebook; import it
# here so the cell runs on its own.
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column with integer codes. The encoder is refit
# on every column, so after the loop `le` only remembers the classes of the
# last column ('marital-status').
le = LabelEncoder()
for column in ('income', 'race', 'marital-status'):
    df[column] = le.fit_transform(df[column])
print(df['income'].unique())
[0 1]
df['education'].unique()
df['education'].unique()
df['education'] = le.fit_transform(df['education'])
df['education'].unique()
array([5, 3, 1, 4, 0, 2])
df['marital-status'].unique()
array([1, 0, 3, 2])
df.head()
1 38 Private 89814 3 9 0
2 28 Local-gov 336951 1 12 0
3 44 Private 160323 1 10 0
4 18 Private 103497 1 10 1
# Separate the predictors (every column except 'income') from the target.
y = df['income']
X = df.drop(columns='income')
Random Sampling
# Split the rows by encoded income class (1 vs 0) and report the size of
# each subset.
is_positive = df['income'] == 1
is_negative = df['income'] == 0
yes = df.loc[is_positive]
no = df.loc[is_negative]
for subset in (yes, no):
    print(subset.shape)
(11681, 15)
(37109, 15)
# Random undersampling: draw as many majority-class rows as there are
# minority-class rows. The size is derived from `yes` rather than
# hard-coded, so it stays correct if the data changes, and the fixed seed
# makes the sample reproducible between runs.
no_sample = no.sample(n=len(yes), random_state=42)
no_sample.shape
(11681, 15)
# Stack the minority rows and the sampled majority rows into one balanced
# frame (row-wise concatenation; the original index labels are kept).
balanced_parts = [yes, no_sample]
sampled_df = pd.concat(balanced_parts)
sampled_df.shape
(23362, 15)
# Pass the series as the keyword argument `x`: from seaborn 0.12 the only
# valid positional argument is `data`, so a positional series would be
# misinterpreted or rejected.
sns.countplot(x=sampled_df['income'])
/Users/nageshjadhav/opt/anaconda3/envs/ISII_Lab/lib/python3.9/site-
packages/seaborn/_decorators.py:36: FutureWarning: Pass the following
variable as a keyword arg: x. From version 0.12, the only valid
positional argument will be `data`, and passing other arguments
without an explicit keyword will result in an error or
misinterpretation.
warnings.warn(
<AxesSubplot:xlabel='income', ylabel='count'>
sampled_df.head()