Practical 3 DS

1/24/24, 2:55 AM Practical 3 DS
Anuja Shelar (BE_A&R-62)
In [17]: import pandas as pd #data manipulation

import numpy as np #numerical computations
from sklearn.model_selection import train_test_split # scikit-learn for splitting data into train and test sets
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
In [3]: data = pd.read_csv("Telco-Customer-Churn.csv")

data
Out[3]: customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines
0 7590-VHVEG Female 0 Yes No 1 No No phone service
1 5575-GNVDE Male 0 No No 34 Yes No
2 3668-QPYBK Male 0 No No 2 Yes No
3 7795-CFOCW Male 0 No No 45 No No phone service
4 9237-HQITU Female 0 No No 2 Yes No
... ... ... ... ... ... ... ... ...
7038 6840-RESVB Male 0 Yes Yes 24 Yes Yes
7039 2234-XADUH Female 0 Yes Yes 72 Yes Yes
7040 4801-JZAZL Female 0 Yes Yes 11 No No phone service
7041 8361-LTMKD Male 1 Yes No 4 Yes Yes
7042 3186-AJIEK Male 0 No No 66 Yes No
7043 rows × 21 columns
 
In [5]: print(data.head())
# DataFrame.info() writes its summary itself and returns None, so wrapping it in
# print() is what emits the extra "None" line after the summary.
print(data.info())
localhost:8889/nbconvert/html/Downloads/Practical 3 DS.ipynb?download=false 1/8

customerID gender SeniorCitizen Partner Dependents tenure PhoneService \
0 7590-VHVEG Female 0 Yes No 1 No
1 5575-GNVDE Male 0 No No 34 Yes
2 3668-QPYBK Male 0 No No 2 Yes
3 7795-CFOCW Male 0 No No 45 No
4 9237-HQITU Female 0 No No 2 Yes
MultipleLines InternetService OnlineSecurity ... DeviceProtection \

0 No phone service DSL No ... No
1 No DSL Yes ... Yes
2 No DSL Yes ... No
3 No phone service DSL Yes ... Yes
4 No Fiber optic No ... No
TechSupport StreamingTV StreamingMovies Contract PaperlessBilling \

0 No No No Month-to-month Yes
1 No No No One year No
3 Yes No No One year No
PaymentMethod MonthlyCharges TotalCharges Churn

0 Electronic check 29.85 29.85 No
1 Mailed check 56.95 1889.5 No
2 Mailed check 53.85 108.15 Yes
3 Bank transfer (automatic) 42.30 1840.75 No
4 Electronic check 70.70 151.65 Yes
[5 rows x 21 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customerID 7043 non-null object
1 gender 7043 non-null object
2 SeniorCitizen 7043 non-null int64
3 Partner 7043 non-null object
4 Dependents 7043 non-null object
5 tenure 7043 non-null int64
6 PhoneService 7043 non-null object
7 MultipleLines 7043 non-null object
8 InternetService 7043 non-null object
9 OnlineSecurity 7043 non-null object
10 OnlineBackup 7043 non-null object
11 DeviceProtection 7043 non-null object
12 TechSupport 7043 non-null object
13 StreamingTV 7043 non-null object
14 StreamingMovies 7043 non-null object
15 Contract 7043 non-null object
16 PaperlessBilling 7043 non-null object
17 PaymentMethod 7043 non-null object
18 MonthlyCharges 7043 non-null float64
19 TotalCharges 7043 non-null object
20 Churn 7043 non-null object
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
None

In [7]: data.nunique()
customerID 7043
Out[7]:
gender 2
SeniorCitizen 2
Partner 2
Dependents 2
tenure 73
PhoneService 2
MultipleLines 3
InternetService 3
OnlineSecurity 3
OnlineBackup 3
DeviceProtection 3
TechSupport 3
StreamingTV 3
StreamingMovies 3
Contract 3
PaperlessBilling 2
PaymentMethod 4
MonthlyCharges 1585
TotalCharges 6531
Churn 2
dtype: int64
In [9]: data.isnull().sum()
customerID 0
Out[9]:
gender 0
SeniorCitizen 0
Partner 0
Dependents 0
tenure 0
PhoneService 0
MultipleLines 0
InternetService 0
OnlineSecurity 0
OnlineBackup 0
DeviceProtection 0
TechSupport 0
StreamingTV 0
StreamingMovies 0
Contract 0
PaperlessBilling 0
PaymentMethod 0
MonthlyCharges 0
TotalCharges 0
Churn 0
dtype: int64
In [10]: data_cleaned = (
    # Drop exact duplicate rows first.
    data.drop_duplicates()
    # TotalCharges is loaded as text (object dtype) and contains blank-string
    # entries that isnull() does not report; errors="coerce" converts those
    # blanks to NaN and every other entry to a float.
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame["TotalCharges"], errors="coerce"))
)
In [11]: data.describe()

Out[11]: SeniorCitizen tenure MonthlyCharges
count 7043.000000 7043.000000 7043.000000
mean 0.162147 32.371149 64.761692
std 0.368612 24.559481 30.090047
min 0.000000 0.000000 18.250000
25% 0.000000 9.000000 35.500000
50% 0.000000 29.000000 70.350000
75% 0.000000 55.000000 89.850000
max 1.000000 72.000000 118.750000
In [12]: unique, counts = np.unique(data['tenure'], return_counts=True)

print(unique, counts)
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
72] [ 11 613 238 200 176 133 110 131 123 119 116 99 117 109 76 99 80 87
97 73 71 63 90 85 94 79 79 72 57 72 72 65 69 64 65 88
50 65 59 56 64 70 65 65 51 61 74 68 64 66 68 68 80 70
68 64 80 65 67 60 76 76 70 72 80 76 89 98 100 95 119 170
362]
In [13]: unique, counts = np.unique(data['MonthlyCharges'], return_counts=True)

[ 18.25 18.4 18.55 ... 118.6 118.65 118.75] [1 1 1 ... 2 1 1]
In [14]: unique, counts = np.unique(data['TotalCharges'], return_counts=True)

[' ' '100.2' '100.25' ... '999.45' '999.8' '999.9'] [11 1 1 ... 1 1 1]
In [16]: sns.pairplot(data)
<seaborn.axisgrid.PairGrid at 0x2981866dae0>
Out[16]:

Check for Outliers

In [18]: plt.boxplot(data['tenure'])
plt.show()

In [19]: plt.boxplot(data['MonthlyCharges'])
plt.show()
Split the Data

In [20]: X = data.drop("Churn", axis=1)  # feature columns: everything except the target
y = data["Churn"]  # target column
# Split the dataset into training and testing sets (80% train / 20% test).
# NOTE(review): the seed value was cut off in this export; 42 is a placeholder —
# substitute the original value if it is known.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [21]: X_train.shape
(5634, 20)
Out[21]:
In [22]: y_train.shape
(5634,)
Out[22]:
In [23]: X_test.shape
(1409, 20)
Out[23]:
In [24]: y_test.shape
(1409,)
Out[24]:
Export the cleaned data

In [26]: # Export the cleaned dataset to a CSV file
data.to_csv("Cleaned_Telecom_Customer_Churn.csv", index=False)
In [27]: data

Out[27]: customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines
0 7590-VHVEG Female 0 Yes No 1 No No phone service
1 5575-GNVDE Male 0 No No 34 Yes No
2 3668-QPYBK Male 0 No No 2 Yes No
3 7795-CFOCW Male 0 No No 45 No No phone service
4 9237-HQITU Female 0 No No 2 Yes No
... ... ... ... ... ... ... ... ...
7038 6840-RESVB Male 0 Yes Yes 24 Yes Yes
7039 2234-XADUH Female 0 Yes Yes 72 Yes Yes
7040 4801-JZAZL Female 0 Yes Yes 11 No No phone service
7041 8361-LTMKD Male 1 Yes No 4 Yes Yes
7042 3186-AJIEK Male 0 No No 66 Yes No
7043 rows × 21 columns
 

Practical 3 DS

Uploaded by

Copyright:

Available Formats

Practical 3 DS

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Practical 3 DS

Uploaded by

Copyright:

Available Formats

1/24/24, 2:55 AM Practical 3 DS

Anuja Shelar (BE_A&R-62)

In [17]: import pandas as pd #data manipulation

import matplotlib.pyplot as plt

In [3]: data = pd.read_csv("Telco-Customer-Churn.csv")

Out[3]: customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines

... ... ... ... ... ... ... ... ...

7038 6840-RESVB Male 0 Yes Yes 24 Yes Yes

7042 3186-AJIEK Male 0 No No 66 Yes No

7043 rows × 21 columns

localhost:8889/nbconvert/html/Downloads/Practical 3 DS.ipynb?download=false 1/8

MultipleLines InternetService OnlineSecurity ... DeviceProtection \

TechSupport StreamingTV StreamingMovies Contract PaperlessBilling \

PaymentMethod MonthlyCharges TotalCharges Churn

localhost:8889/nbconvert/html/Downloads/Practical 3 DS.ipynb?download=false 2/8

In [10]: data_cleaned = data.drop_duplicates()

localhost:8889/nbconvert/html/Downloads/Practical 3 DS.ipynb?download=false 3/8

count 7043.000000 7043.000000 7043.000000

mean 0.162147 32.371149 64.761692

std 0.368612 24.559481 30.090047

min 0.000000 0.000000 18.250000

25% 0.000000 9.000000 35.500000

50% 0.000000 29.000000 70.350000

75% 0.000000 55.000000 89.850000

max 1.000000 72.000000 118.750000

In [12]: unique, counts = np.unique(data['tenure'], return_counts=True)

In [13]: unique, counts = np.unique(data['MonthlyCharges'], return_counts=True)

[ 18.25 18.4 18.55 ... 118.6 118.65 118.75] [1 1 1 ... 2 1 1]

In [14]: unique, counts = np.unique(data['TotalCharges'], return_counts=True)

localhost:8889/nbconvert/html/Downloads/Practical 3 DS.ipynb?download=false 4/8

Check for Outliers

localhost:8889/nbconvert/html/Downloads/Practical 3 DS.ipynb?download=false 5/8

Split the Data

localhost:8889/nbconvert/html/Downloads/Practical 3 DS.ipynb?download=false 6/8

Export the cleaned data

localhost:8889/nbconvert/html/Downloads/Practical 3 DS.ipynb?download=false 7/8

... ... ... ... ... ... ... ... ...

7038 6840-RESVB Male 0 Yes Yes 24 Yes Yes

7042 3186-AJIEK Male 0 No No 66 Yes No

7043 rows × 21 columns

localhost:8889/nbconvert/html/Downloads/Practical 3 DS.ipynb?download=false 8/8

You might also like