# DSBDA2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import probplot, shapiro, skew, zscore
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
# Load the raw dataset from the CSV file in the working directory.
data = pd.read_csv("Test_Data.csv")
# Bare expression: shows the frame when run as a notebook cell; it has no
# visible effect when the file is executed as a plain script.
data
# Per-column count of missing (null) entries; likewise only displayed when
# evaluated as the last expression of a notebook cell.
data.isnull().sum()
# Output of data.isnull().sum():
# age 1
# sex 0
# bmi 1
# health_gradient 0
# smoker 0
# region 0
# children 1
# dtype: int64
# Captured notebook output (rows 0-4 of each column). The code that printed
# it is not present in this file.
# Column: age
# 0 40.000000
# 1 47.000000
# 2 54.000000
# 3 NaN
# 4 59.130049
# Name: age, dtype: float64
# Column: sex
# 0 male
# 1 male
# 2 female
# 3 male
# 4 male
# Name: sex, dtype: object
# Column: bmi
# 0 29.900000
# 1 32.300000
# 2 28.880000
# 3 30.568094
# 4 33.132854
# Name: bmi, dtype: float64
# Column: health_gradient
# 0 35760.40000
# 1 49034.63000
# 2 45038.93760
# 3 0.00000
# 4 64912.13924
# Name: health_gradient, dtype: float64
# Column: smoker
# 0 no
# 1 no
# 2 no
# 3 no
# 4 yes
# Name: smoker, dtype: object
# Column: region
# 0 southwest
# 1 southwest
# 2 northeast
# 3 northeast
# 4 northeast
# Name: region, dtype: object
# Column: children
# 0 2.0
# 1 1.0
# 2 2.0
# 3 3.0
# 4 4.0
# Name: children, dtype: float64
# --- Missing-value imputation ---------------------------------------------
# Categorical (non-numeric) columns: fill gaps with the column mode.
handle_missing_values_categorical = SimpleImputer(strategy='most_frequent')
data_categorical = data.select_dtypes(exclude='number')
# fit_transform learns the most frequent value of each categorical column
# and substitutes it for that column's missing entries.
data[data_categorical.columns] = (
    handle_missing_values_categorical.fit_transform(data_categorical)
)

# Numeric columns: fill gaps with the column mean.
handle_missing_values_numeric_mean = SimpleImputer(strategy='mean')
data_numeric = data.select_dtypes(include='number')
# fit_transform computes the mean of each numeric column and substitutes it
# for that column's missing entries.
data[data_numeric.columns] = (
    handle_missing_values_numeric_mean.fit_transform(data_numeric)
)

# Numeric columns again, this time with the column median.
# NOTE: the mean pass above has already filled every numeric gap, so this
# pass has nothing left to replace and leaves the values unchanged. To
# impute with the median instead, run this pass in place of the mean pass.
handle_missing_values_numeric_median = SimpleImputer(strategy='median')
data_numeric = data.select_dtypes(include='number')
data[data_numeric.columns] = (
    handle_missing_values_numeric_median.fit_transform(data_numeric)
)

print(data)
# Standardise every numeric column (z-score computed down each column).
z_scores = zscore(data.select_dtypes(include='number'), axis=0)
# Flag each value lying more than 3 standard deviations from its column
# mean, on either side.
outliers = np.abs(z_scores) > 3
# Captured notebook output (rows 0-4 of each numeric column). The code that
# printed it is not present in this file.
# Column: age
# 0 40.000000
# 1 47.000000
# 2 54.000000
# 3 38.844276
# 4 59.130049
# Name: age, dtype: float64
# Column: bmi
# 0 29.900000
# 1 32.300000
# 2 28.880000
# 3 30.568094
# 4 33.132854
# Name: bmi, dtype: float64
# Column: health_gradient
# 0 35760.40000
# 1 49034.63000
# 2 45038.93760
# 3 0.00000
# 4 64912.13924
# Name: health_gradient, dtype: float64
# Column: children
# 0 NaN
# 1 NaN
# 2 NaN
# 3 NaN
# 4 4.0
# Name: children, dtype: float64
# Skewness of the 'age' column, reported before any transformation.
# NOTE(review): `data_no_outliers` is not assigned anywhere in the visible
# part of this file - presumably it is `data` filtered with the `outliers`
# mask; confirm it is created before this line runs.
skew_before = data_no_outliers['age'].skew()
print(f"\nSkewness before transformation: {skew_before}")
# Bare expressions: presumably left over from notebook cells that displayed
# the frame; they have no visible effect in a plain script.
data
data