Python for Machine Learning Visualization 1735231185
Python for Machine Learning Visualization 1735231185
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
pd.set_option('display.precision', 2)
id age sex dataset cp trestbps chol fbs restecg thalch exang oldpeak slope ca thal num
lv fixed
0 1 63 Male Cleveland typical angina 145.0 233.0 True 150.0 False 2.3 downsloping 0.0 0
hypertrophy defect
lv
1 2 67 Male Cleveland asymptomatic 160.0 286.0 False 108.0 True 1.5 flat 3.0 normal 2
hypertrophy
print(f"Records: {df.shape[0]}")
print(f"Columns: {df.shape[1]}")
Records: 299
Columns: 16
top_leagues = df['cp'].value_counts().nlargest(4).index
display(top_leagues)
plt.figure(figsize=(15, 6))
sns.scatterplot(x='age', y='chol', data=df[df['cp'].isin(top_leagues)], hue='cp')
plt.title('Age vs. Cholesterol for Top 4 Chest Pain')
plt.xlabel('Age')
plt.ylabel('Cholesterol')
plt.legend(title='Chest Pain Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
import plotly.express as px
fig.update_layout(width=1000, height=500)
fig.update_layout(title_text='Scatter Plot of Cholesterol vs. Age (colored by Sex)')
fig.show()
80
70
60
age
50
40
30
chol
30 40 50 60 70
Age
import plotly.express as px
fig = px.scatter(df, x='chol', y='age', color='cp', size = 'oldpeak', size_max = 30, hover_name = 'exang')
fig.update_layout(width=1000, height=500)
80
cp
70
60
age
50
40
30
chol
Python Code Link: https://t.me/AIMLDeepThaught/573
fig.show()
Cholesterol Vs Age
cp
500
400
chol
300
200
100
30 40 50 60 70 80
age
import plotly.express as px
fig = px.scatter(df, x='chol', y='age', color='cp', size = 'oldpeak', size_max = 30, hover_name = 'exang',facet_col
fig.update_layout(width=1000, height=500)
fig.show()
70
60
age
50
40
30
100
200
300
400
500
600
700
100
200
300
400
500
600
700
100
200
300
400
500
600
700
100
200
300
400
500
600
700
chol chol chol chol
hover_name='exang' means that the value from the 'exang' column is shown in bold as the title of the tooltip that appears
when you hover over a data point on the scatter plot. This is useful for providing additional information about each data
point without cluttering the plot with labels.
fig=px.bar(df,x='age',y='chol',hover_data=['oldpeak'],color='sex',height=400)
fig.show()
sex
Male
4000
Female
3000
chol
2000
1000
0
30 40 50 60 70
age
def generate_rating_df(df):
    """Count records for every (cp, slope) combination.

    Args:
        df: DataFrame providing at least the columns ``cp``, ``slope``
            and ``id``.

    Returns:
        DataFrame with the columns ``cp``, ``slope`` and ``counts`` (the
        number of non-null ``id`` values per group), limited to groups
        with a non-zero count and sorted by ``slope``.
    """
    rating_df = (
        df.groupby(['cp', 'slope'])
        .agg({'id': 'count'})
        .reset_index()
        # Rename by name instead of assigning the column list by position.
        .rename(columns={'id': 'counts'})
    )
    # Keep only combinations that actually occur.
    rating_df = rating_df[rating_df['counts'] != 0]
    return rating_df.sort_values('slope')
rating_df = generate_rating_df(df)
fig = px.bar(rating_df, x='cp', y='counts', color='slope')
fig.update_traces(textposition='auto',
textfont_size=20)
fig.update_layout(barmode='stack')
fig.show()
slope
140 downsloping
flat
upsloping
120
100
counts
80
60
40
20
0
asymptomatic atypical angina non-anginal typical angina
cp
def generate_rating_df(df):
    """Count records for every (cp, slope) combination.

    Args:
        df: DataFrame providing at least the columns ``cp``, ``slope``
            and ``id``.

    Returns:
        DataFrame with the columns ``cp``, ``slope`` and ``counts`` (the
        number of non-null ``id`` values per group), limited to groups
        with a non-zero count and sorted by ``slope``.
    """
    rating_df = (
        df.groupby(['cp', 'slope'])
        .agg({'id': 'count'})
        .reset_index()
        # Rename by name instead of assigning the column list by position.
        .rename(columns={'id': 'counts'})
    )
    # Keep only combinations that actually occur.
    rating_df = rating_df[rating_df['counts'] != 0]
    return rating_df.sort_values('slope')
rating_df = generate_rating_df(df)
fig = px.bar(rating_df, x='cp', y='counts', color='slope')
fig.update_traces(textposition='auto',
textfont_size=20)
fig.update_layout(barmode='group')
slope
80 downsloping
flat
upsloping
70
60
50
counts
40
30
20
10
0
asymptomatic atypical angina non-anginal typical angina
cp
import plotly.express as px
def generate_rating_df(df):
    """Count records for every (cp, slope) combination.

    Args:
        df: DataFrame providing at least the columns ``cp``, ``slope``
            and ``id``.

    Returns:
        DataFrame with the columns ``cp``, ``slope`` and ``counts`` (the
        number of non-null ``id`` values per group), limited to groups
        with a non-zero count and sorted by ``slope``.
    """
    rating_df = (
        df.groupby(['cp', 'slope'])
        .agg({'id': 'count'})
        .reset_index()
        # Rename by name instead of assigning the column list by position.
        .rename(columns={'id': 'counts'})
    )
    # Keep only combinations that actually occur.
    rating_df = rating_df[rating_df['counts'] != 0]
    return rating_df.sort_values('slope')
rating_df = generate_rating_df(df)
fig.update_traces(textposition='auto',
textfont_size=20)
fig.show()
slope
80 84 downsloping
flat
upsloping
70
60
50
counts
49
40 45
36
30
33
20
10
11 5
2 11 3 11 9
0
asymptomatic atypical angina non-anginal typical angina
cp
def generate_rating_df(df):
    """Count records per (cp, slope) combination and add their share.

    Args:
        df: DataFrame providing at least the columns ``cp``, ``slope``
            and ``id``.

    Returns:
        DataFrame with the columns ``cp``, ``slope``, ``counts`` (the
        number of non-null ``id`` values per group) and ``percentage``
        (each group's share of the summed counts, in percent), limited
        to groups with a non-zero count and sorted by ``slope``.
    """
    rating_df = (
        df.groupby(['cp', 'slope'])
        .agg({'id': 'count'})
        .reset_index()
        # Rename by name instead of assigning the column list by position.
        .rename(columns={'id': 'counts'})
    )
    rating_df = rating_df[rating_df['counts'] != 0].sort_values('slope')

    # Calculate percentages. ``assign`` returns a new frame, so nothing is
    # written onto the filtered slice (avoids SettingWithCopyWarning).
    total_counts = rating_df['counts'].sum()
    return rating_df.assign(
        percentage=rating_df['counts'] / total_counts * 100
    )
rating_df = generate_rating_df(df)
fig.update_traces(
texttemplate='%{text:.1f}%',
textposition='outside',
textfont_size=16
)
fig.update_layout(
barmode='group',
yaxis_title='Count',
xaxis_title='CP',
legend_title='Slope'
)
fig.update_layout(
height=550,
width=1000,
title_text="Distribution of Chest Pain Type by Percentage",
title_font_size=24
)
fig.show()
70
60
50
16.4%
15.1%
Count
40
12.0%
11.0%
30
20
CP
fig.show()
cp
500
400
chol
300
200
100
30 40 50 60 70 80
age
fig.show()
Age vs Cholesterol
500
400
chol
300
200
100
30 40 50 60 70 80
age
fig.show()
Age vs Cholesterol
500
400
chol
300
200
100
30 40 50 60 70 80
age
sex
500
400
chol
300
200
100
30 40 50 60 70 80
age
title_x=0.2)
fig.update_layout(
font_family='classic-roman',
font_color= 'grey',
yaxis_title={'text': " count", 'font': {'size':18}},
xaxis_title={'text': " Age", 'font': {'size':18}}
)
fig.show()
Histogram of Persons by Age
sex
Male
30
Female
25
20
count
15
10
0
30 40 50 60 70
Age
import plotly.graph_objects as go
from plotly.subplots import make_subplots
fig = make_subplots(rows=2,
cols=2,
specs=[[{'type':'domain'}, {'type':'domain'}],
[{'type':'domain'}, {'type':'domain'}]],
subplot_titles=("Asymptomatic", "Non-Anginal",
"Atypical Angina", "Typical Angina"))
# Update layout to increase the size of the plot and add main title
fig.update_layout(
height=800,
width=1000,
title_text="Distribution of Cholesterol Levels by Chest Pain Type",
title_font_size=24
)
# Update traces
fig.update_traces(textposition='inside', textfont_size=16)
fig.update_annotations(font_size=20)
fig.show()
Python Code Link: https://t.me/AIMLDeepThaught/573
Distribution of Cholesterol Levels by Chest Pain Type
Asymptomatic Non-Anginal
27.5%
38.1%
54.3%
%
2.33
70.2%
3%
7.6
17%
3.7
4%
36.1%
56.2%
79.3%
%
68
7.
import plotly.express as px
fig = px.scatter(df, x='chol', y='age', color='cp', size = 'oldpeak', size_max = 30, hover_name = 'exang', range_x
labels = dict(oldpeak = 'oldpeak', chol = 'Cholestrol', age = "Age" ), animation_frame = "chol",
fig.update_layout(width=1000, height=600)
fig.show()
Scatter Plot of Cholesterol vs. Age (colored by cp) with Animation
100
cp
80
60
Age
40
20
0
100 200 300 400 500 600 700 800
Cholestrol
Cholestrol=233.0
▶ ◼
233.0 192.0 283.0 335.0 175.0 216.0 248.0 325.0 182.0 217.0 240.0 277.0 196.0 210.0 319.0 241.0
gender = df["sex"].value_counts()
display(gender.head().to_frame())
fig = px.bar(data_frame=gender,
x = gender.index,
y = gender,
color=gender.index,
text_auto="0.3s",
labels={"y": "Frequency", "index": "Gender"}
)
fig.update_traces(textfont_size=24)
sex
Male 203
Female 96
sex
200
203 Male
Female
150
Frequency
100
96.0
50
0
Male Female
sex
# Share of each chest-pain category, drawn as a bar chart with percentage labels.
category = df["cp"].value_counts()
# Compute the percentages once; the original re-summed the whole Series
# for every single bar label.
category_percent = (category / category.sum()) * 100
fig = px.bar(category,
             x=category.index,
             y=category_percent,
             color=category.index,
             labels={"y": "Frequency in (Percentage%)", "category": "Category"},
             title="Frequency of Chest Pain Category in Percentage",
             text=category_percent.apply(lambda pct: f'{pct:.1f}%'),
             template="plotly_dark"
             )
fig.update_layout(showlegend=False)
fig.update_traces(
    textfont={
        "family": "consolas",
        "size": 20,
    }
)
iplot(fig)
Frequency of Chest Pain Category in Percentage
50
48.2%
40
Frequency in (Percentage%)
30
27.8%
20
16.4%
10
7.7%
0
asymptomatic non-anginal atypical angina typical angina
cp
ChestPain = df["cp"].value_counts()
iplot(fig)
non-anginal
27.8%
asymptomatic
48.2%
a
gin
n
ala
pic .4%
y 16
ngina
at
7.69%
typical a
# Horizontal bar chart of the share of each chest-pain type.
cp = df["cp"].value_counts()
# Compute the percentages once; the original re-summed the whole Series
# for every single bar label.
cp_percent = (cp / cp.sum()) * 100
fig = px.bar(cp,
             y=cp.index,
             x=cp_percent,
             color=cp.index,
             labels={"x": "Frequency in Percentage(%)", "cp": "Chest Pain"},
             orientation="h",
             title="Frequency of Chest Pain",
             text=cp_percent.apply(lambda pct: f'{pct:.1f}%'),
             )
fig.update_layout(showlegend=True, width=1000, height=600)
fig.update_traces(
    textfont={
        "family": "consolas",
        "size": 20
    }
)
Chest Pai
asymptomatic 48.2%
non-anginal 27.8%
Chest Pain
0 10 20 30 40 50
Frequency in Percentage(%)
fig=px.pie(df.groupby('cp',as_index=False)['sex'].count().sort_values(by='sex',ascending=False).reset_index(drop
names='cp',values='sex',color='sex',color_discrete_sequence=px.colors.sequential.Plasma_r,
labels={'cp':'Chest Pain','Sex':'Count'}, template='seaborn',hole=0.4)
fig.update_traces(
textfont= {
"family": "consolas",
"size": 20
}
)
fig.show()
Chest Pain
27.8%
48.2%
16.4%
7.69%
import plotly.express as px
from plotly.offline import iplot
import plotly.graph_objects as go
from plotly.subplots import make_subplots
fig.append_trace(go.Histogram(x=df['age'],
name='Age Distribution') ,1,1)
fig.append_trace(go.Histogram(x=np.log10(df['age']),
name='Log Age Distribution') ,1,2)
iplot(dict(data=fig))
Age Distribution Log Age Distribution
45 Age Distribution
30 Log Age Distribution
40
25 35
30
20
25
15 20
15
10
10
5
5
0 0
40 60 1.5 1.6 1.7 1.8 1.9
import numpy as np
import plotly.graph_objs as go
from plotly.offline import iplot
# Define layout.
# Axis title fonts are set through ``title.font``; the flat ``titlefont``
# attribute is deprecated and no longer accepted by newer Plotly releases.
layout = go.Layout(
    xaxis=dict(title=dict(text='Chest Pain', font=dict(size=25))),
    yaxis=dict(title=dict(text='Values', font=dict(size=25))),
    showlegend=True,
    width=1300,
    height=600
)
30k
25k
Values
20k
20367
15k
12019
10k
8032
5k
4475
2510
0
asymptomatic non-anginal atypical angina
Chest Pain
import plotly.graph_objs as go
from plotly.offline import iplot
data = [
go.Bar(
x=top_03_cp.index,
y=top_03_cp,
name='Top 3 age',
text=top_03_cp,
textposition='auto'
),
go.Bar(
x=top_03_AGE.index,
y=top_03_AGE,
name='Top 3 cholesterol',
text=top_03_AGE,
textposition='auto'
)
]
layout = go.Layout(
title="Grouped Bar Plot For Age and Cholesterol<br>(For The Top Three types of Chest Pain)",
barmode='group'
)
iplot(dict(data=data, layout=layout))
Grouped Bar Plot For Age and Cholesterol
(For The Top Three types of Chest Pain)
Top 3 age
35k 35949 Top 3 cholesterol
30k
25k
20k
20367
15k
12019
10k
8032
5k
4475
2510
0
asymptomatic non-anginal atypical angina
gap_df = pd.read_csv("gapminder_full.csv")
display(gap_df.head(2))
fig = px.bar(data_frame=gap_df,
x="continent",
y="population",
color="continent",
animation_frame="year",
animation_group="country",
range_y=[0,4000000000])
fig.show()
Python Code Link: https://t.me/AIMLDeepThaught/573
country year population continent life_exp gdp_cap
4B
continent
3.5B Asia
Europe
Africa
3B
Americas
Oceania
2.5B
population
2B
1.5B
1B
0.5B
0
Asia Europe Africa Americas Oceania
continent
year=1952
▶ ◼
1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007
# Animated bubble chart of GDP per capita (log x-axis) against life
# expectancy: one animation frame per year, bubbles sized by population.
fig = px.scatter(
    gap_df,
    x='gdp_cap',
    y='life_exp',
    color='continent',
    size='population',
    size_max=60,
    hover_name="country",  # the comma after this argument was missing
    animation_frame="year",
    animation_group='country',
    log_x=True,
    range_x=[100, 100000],
    range_y=[25, 90],
    # Label keys must match the column names exactly, so the key is
    # 'population' (the column used for ``size``), not 'Population'.
    labels=dict(population="Populations",
                gdp_cap="Gdp Per Capital",
                life_exp="Life Expentacy"),
)
fig.update_layout(
    height=550,
    width=1500,
    title_text="Distribution of GDP Cap Vs Life Expentacy",
    title_font_size=24
)
fig.show()
80
70
Life Expentacy
60
50
40
30
2 3 4 5 6 7 8 9 2 3 4 5 6 7
100 1000
year=1952
▶ ◼
cp
import pandas as pd
import plotly.express as px
fig = px.bar(grouped_df,
y="cp",
x='count',
color='thal',
title='Count of Passengers by cp and thal',
labels={'count': 'Number of Patients'},
text_auto=True)
fig.show()
Count of Passengers by cp and thal
thal
fixed defect
typical angina 2 13 8
normal
reversable defect
non-anginal 2 59 22
cp
atypical angina 2 39 8
asymptomatic 12 53 79
0 50 100 150
Number of Patients
plt.text(-0.5,
-0.7,
'Color Palette',
{'font':'monospace',
'size': 24,
'weight':'normal'}
)
plt.show()
import plotly.figure_factory as ff
# Number of records for every (cp, thal) pair, pivoted so that cp values
# form the rows and thal values form the columns of the heatmap.
_ = df.groupby(['cp', 'thal']).chol.size().unstack()
z = _.values.tolist()
x = _.columns.tolist()
y = _.index.tolist()
fig = ff.create_annotated_heatmap(z=z,
                                  x=x,
                                  y=y,
                                  xgap=3,
                                  ygap=3,
                                  colorscale=['#53354A', '#E84545']
                                  )
# NOTE(review): format_title is defined outside this excerpt; its
# arguments are passed through unchanged.
title = format_title('cp',
                     'thal.',
                     'Chol',
                     12
                     )
# The title font is set through ``title.font`` (the flat ``titlefont``
# attribute is deprecated), and the call is now properly closed.
fig.update_layout(title=dict(text=title,
                             x=0.5,
                             font={'size': 24,
                                   'family': 'Proxima Nova',
                                   }),
                  template='plotly_dark',
                  paper_bgcolor='#2B2E4A',
                  plot_bgcolor='#2B2E4A',
                  )
fig.show()
cp
thal.
asymptomatic 12 53 79
atypical angina 2 39 8
non-anginal 2 59 22
typical angina 2 13 8
# available templates
template = ['ggplot2','plotly_dark', 'seaborn', 'simple_white', 'plotly']
# Percentage histogram of chest-pain type, split by sex.
fig = px.histogram(df,
                   x="cp",
                   color="sex",
                   width=1200,
                   height=450,
                   histnorm='percent',
                   # Keys must equal the values in the 'sex' column exactly;
                   # the data shown in this file uses 'Male' / 'Female', so
                   # the lower-case keys were never applied.
                   color_discrete_map={
                       "Male": "RebeccaPurple", "Female": "lightsalmon"
                   },
                   template="plotly_dark"
                   )
50
percent 40
30
20
10
0
typical angina asymptomatic non-anginal atypical angina
cp
fig.add_trace(
go.Pie(
labels=df['cp'],
title="Chest Pain",
titlefont={'size':20, 'family': 'Serif',},
values=None,
hole=0.5,
), col=2, row=1,
)
fig.update_traces(
hoverinfo='label+value',
textinfo='label+percent',
textfont_size=12,
)
fig.layout.update(title="<b> Heart Disesse <b>",
titlefont={'size':20, 'family': 'Serif',},
showlegend=False,
height=600,
width=1000,
template=None,
)
non-anginal
27.8%
non-anginal
27.8%
asymptomatic asymptomatic
Chest Pain 48.2%
Chest Pain 48.2%
at
yp 16
ic .4%
al
an
ngina
gi
atypical angina
na
7.69%
16.4%
typical a
typical angina
7.69%
# Two donut charts side by side: chest-pain type (left) and sex (right).
fig = make_subplots(rows=1, cols=2,
                    specs=[[{'type': 'domain'}, {'type': 'domain'}],
                           ])
# With labels only (no values), each pie slice counts label occurrences.
# Trace titles use ``title.font``; the flat ``titlefont`` is deprecated.
fig.add_trace(
    go.Pie(
        labels=df['cp'],
        hole=.4,
        title=dict(text='Chest Pain', font={'size': 24}),
    ),
    row=1, col=1
)
fig.add_trace(
    go.Pie(
        labels=df['sex'],
        hole=.4,
        title=dict(text='Sex', font={'size': 24}),
    ),
    row=1, col=2
)
# A single styling pass covers both pies. The original styled the first
# pie separately and then overwrote every one of those settings with this
# call, so the end result is the same.
fig.update_traces(
    hoverinfo='label+value',
    textinfo='label+percent',
    textfont_size=16,
    marker=dict(
        colors=['lightgray', 'lightseagreen'],
        line=dict(color='#000000',
                  width=2)
    )
)
fig.update_layout(title=dict(text="<b> Heart Desies <b>",
                             font={'size': 24, 'family': 'San-Serif'}),
                  showlegend=False,
                  height=600,
                  width=950,
                  )
fig.show()
Heart Desies
non-anginal
27.8% Female
32.1%
asymptomatic
48.2%
Chest Pain Sex
at
Male
yp 16
67.9%
ic .4%
al
an
gi
na
typical angina
7.69%
Male
Female
asymptomatic
typic Female
al an
gina
non-anginal
Male
Male atypical angina
Female
Female Male
fig.update_traces(marker_color=colors, marker_line_color=None,
marker_line_width=2.5, opacity=None)
fig.show()
Chest Pain Type
50
40
30
percent
20
10
0
asymptomatic non-anginal atypical angina typical angina
cp
40
30
percent
20
10
0
typical angina asymptomatic non-anginal atypical angina
cp
data = df[['sex']]
fig = px.histogram(df,
y="sex",
orientation='h',
width=800,
height=350,
histnorm='percent',
template="plotly_dark"
)
fig.update_layout(title="<b>Heart Disease<b>",
font_family="San Serif",
bargap=0.2,
barmode='group',
titlefont={'size': 28},
paper_bgcolor='lightgray',
plot_bgcolor='lightgray',
legend=dict(
orientation="v",
y=1,
yanchor="top",
x=1.250,
xanchor="right",)
)
annotations = []
annotations.append(dict(xref='paper', yref='paper',
x=0.0, y=1.2,
text='Heart Disease',
font=dict(family='Arial', size=16, color=colors[2]),
showarrow=False))
annotations.append(dict(xref='paper', yref='paper',
x=0.50, y=0.85,
text='30.4%',
font=dict(family='Arial', size=20, color=colors[2]),
showarrow=False))
annotations.append(dict(xref='paper', yref='paper',
x=1.08, y=0.19,
text='69.6%',
font=dict(family='Arial', size=20, color=colors[2]),
showarrow=False))
fig.update_layout(
autosize=False,
width=800,
height=350,
margin=dict(
l=50,
r=50,
b=50,
t=120,
),
)
fig.update_layout(annotations=annotations)
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()
# Python Code Link: https://t.me/AIMLDeepThaught/573
Heart Disease
Heart Disease
Female 30.4%
sex
Male 69.6%
0 10 20 30 40 50 60 70
percent
# Pie chart
plt.subplot(1, 2, 1)
quality_counts = df['cp'].value_counts()
plt.pie(quality_counts, labels=quality_counts.index, colors=sns.color_palette('PuBuGn', len(quality_counts)), autopct
plt.title('Chest Pain Distribution')
# Count plot
plt.subplot(1, 2, 2)
ax = sns.countplot(data=df, x='cp', palette='PuBuGn')
# Add count values above each bar
for container in ax.containers:
    ax.bar_label(container, label_type='edge')
# NOTE(review): plt.figure() opens a new, empty figure after the subplots
# have already been drawn; it probably belongs before the first
# plt.subplot() call - confirm against the original notebook.
plt.figure(figsize=(20, 5))
plt.show()
plt.figure(figsize=(20, 6))
plt.xlabel('Features')
plt.ylabel('Mean Value')
plt.title('Grouped Barplot by Chest Pain type')
plt.legend(title='Chest Pain')
plt.show()
for cp in cp_attributes_comparison['cp'].unique():
cp_data = cp_attributes_comparison.loc[cp_attributes_comparison['cp'] == cp]
# Add legend
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1))
# Add title
plt.title('Chest Pain Attributes Comparison')
(5000, 10)
job_title company descriptions location category subcategory role type salary listingDate
RM 3,000
AESD JOB –
MARKETING Marketing & Marketing marketing- Full 2024-03-
0 INTERNATIONAL DESCRIPTIONS\nWork Petaling RM 4,000
EXECUTIVE Communications Assistants/Coordinators executive time 21T08:08:18Z
(M) SDN. BHD. closely with the sales ... per
month
RM 2,500
Job –
E-Commerce JOBSGURU Administration & Client & Sales sales- Full 2024-05-
1 Description\nPerform Petaling RM 3,500
Sales Admin SDN. BHD. Office Support Administration administration time 24T12:59:40Z
CS activities by repl... per
month
# Drop rows with missing values and plot the resulting DataFrame
job = jobs.dropna()
msno.matrix(job, ax=axes[1])
axes[1].set_title("DataFrame after Dropping Missing Values",fontsize=24,color='Green')
plt.tight_layout()
plt.show()
import re
def clean_and_calculate_mean(salary):
try:
# Remove currency symbols, words, and extra characters
salary = salary.replace('RM', '').replace('MYR', '').replace('$', '').replace('per month', '').replace('p.m.'
except Exception as e:
print(f"Error processing salary '{salary}': {e}")
return None
job_title company descriptions location category subcategory role type listingDate Salary
AESD JOB
MARKETING Marketing & Marketing marketing- Full 2024-03-
0 INTERNATIONAL DESCRIPTIONS\nWork Petaling 3500.0
EXECUTIVE Communications Assistants/Coordinators executive time 21T08:08:18Z
(M) SDN. BHD. closely with the sales ...
Job
E-Commerce JOBSGURU Administration & Client & Sales sales- Full 2024-05-
1 Description\nPerform Petaling 3000.0
Sales Admin SDN. BHD. Office Support Administration administration time 24T12:59:40Z
CS activities by repl...
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
jobType = job['type'].value_counts()
fig.update_layout(width=1000, height=800)
fig.update_traces(textfont_size=20)
fig.show()
4.63%
0.408%
0.0454%
94.9%
top_n = 50
filtered_data = job['job_title'].value_counts().head(top_n).reset_index()
filtered_data.columns = ['job_title', 'count']
fig.update_traces(textfont_size=16)
fig.update_layout(width=1000, height=600)
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25),
title='Top Job Openings: Job Roles in Malaysia',
title_x=0.5,
title_y=0.98)
fig.show()
Python Code Link: https://t.me/AIMLDeepThaught/573
Top Job Openings: Job Roles in Malaysia
all
Business Development Executive Senior Account Executive Account Assistant
Marketing Executive
Human Resource Executive
Senior Marketing Executive Administrative Assistant Customer Service Executive E-Commerce Executive
HR Executive
Finance Executive Project Engineer
Admin Assistant
Admin Executive ACCOUNT EXECUTIVE
Finance Manager Sales Admin Executive
Customer Service Mechanical Engineer SALES EXECUTIVE
Sales Admin
Account Clerk
job_title_counts = filtered_data['job_title'].value_counts()
top_n = 10
top_job_titles = job_title_counts.head(top_n).index.tolist()
filtered_data = filtered_data[filtered_data['job_title'].isin(top_job_titles)]
all
Sales Executive Account Assistant Marketing Executive Account Executive Admin Assistant Finance Executive
Marketing Executive
Accounts Executive
Business Development Executive
Finance Executive Business Development Executiv
Admin Assistant
Penang
Business Development Executive
Account Assistant Admin Assistant
Account Executive Sales Executive
HR Assistant
fig.show()
Job Vacancies Available Across Malaysia by Category
Defencey
Governm & Technolog
Administr
perty
l
ent &
y & Tourism
g & Strategy
Lega
ent
Science
& Developm
n
ng
Trad Superann Media
& Pro
Hospitalit
uatio
Consultin
es
Info
ity Services
ical
ervic
tion
Estate
s&
Commun
ts
Med
, Art
tio
c
Accounting
rma
S
Educa
u
Sales
gin itect s
ruc
istic
rod
re &
es &
rin e
Adve
Ma
e&
ur
nst
Log
hca
tion
rP
g
ranc
Co
lt
ation & O
me
&
Insu
rke
Hea
ice
t
cia
& Co
por
En
Ba uring, onsu
ee
sig inan
rv
ing Trans
tin
Hu
gin
s
&C
Se
mm
&F
n
m
g&
M
tio
n
ee
an
ail
er
an
unic
Ca
Ret
t
ica
uf
Re
om
en
rin
e
ffice Supp
Re
t
nk
ll
fac
Co
D
ac
t
ation
ai
Ce
itm
C
un
so
nu
l&
t
Ed on
st
ur
nt
Ma
s C
ur
Ba uca tru
m
Cu
u
in
o
re
ct nsu
cr
mu
ti
ce
g
De nk
Tech
sig ing on
m
i
&
, T om
& on m
Re
Sales
&
n &
Ad Rea
er
Cu
Co
He vert Esta &
Ar ina Tra
F
ra
l
&
nic
a isin
Pr
tre
ch nc in
nolo
&
Ho lth te
st
ns r S
ca g &
re , Art Pro
od
sp
i
Le
ite ia
ita
ct l S ng
ga
Re
l lit
Sc y & s
M & pert
ort
ienc &
&
s
uc
To
po er
Con
ism ed M
Spo sult Tra
e ur
ica ed y ur e
rt ing des
& & &
Rec & Stra
Te
ce
en
Ser
e rvic
ati
ia
reat
tegyvice
ch
l
ts
ion s
no
cr ogi
gy
logy
rt vic
an ting
es
rt
ur
ui
M all C
&
po
on
so
tm tics
up
e
Re
s
Ac S
k
C
e
e
nt
co
ar
un fic
tin Of
um
Adm g n&
tio
H
inis
tra
tion i stra nolo
gy
Petaling n ech
Eng &O dmi icatio
nT
inee ffic A mm
un
ring e Sup &C
o
Man por tion
ufactu t Kuala Lumpur rma
Trans
ring, Info
Marketin port
g & Co
mmun & Log
Sales ications istics
Human Reso Johor Bahru Accounting
urces & Recr
uitment
Information & Comm
unication Techn
Real Estate & Property ology
Retail & Consumer Products
Construction
Call Centre & Customer Service Retail & Consumer Products
Banking & Financial Services
Construction
Administration & Office Support
g
& Office Support
Hospitality & Tourism Muar
Engineerin
& Energy Engineering
Mining, Resources Maluri
Malaysia Engineering
Healthcare &
ng
Kudat Medical
Kota Bharu Administration Accounting
Penang
& Office Support
Jasin Retail & Consumer
Bandar Bentong Products
Sri Permaisur Marketing & Accounting
Taman Ampang i Communications
Tun Dr Information Accounting
Ismail & Communication
Technology
ba
Sandakan
Sales
Manufacturing, Accounting
Pontian Transport
Pahang Healthcare & Logistics
Mid Valley & Medical
Kampun City Accounting
g Malaysi Kedah
ting
Sales
Su
a Raya Human Resources Accounting
Sibu &
Division Education Recruitment
Sabak & Training
Accoun
Bernam Administratio Sales
Negeri Accounting
Sembil n & Office
Manufacturin Support
an Accounting
g, Transport
/
Kucha
i Lama & Logistics
Bukit
Daman Design Accounting
Manufacturi & Architecture
sara Accounting
Bukit ng, Transport
Bintan Healthcare & Logistics
g Retail &
Consumer& Medical
m
Mir Per Human
Resources Products
i Div ak
Marketing & Recruitmen
Call Centre & Communica
Manufactu t
isio & Customer
Human ring, Transport
tions
Ala
Sabah n
Service
Resources & Logistics
Insurance & Recruitme
& Superannu
logy
nt
Kuan Engineerin ation
Ko
g
tan
chno
Ba Sales
Sales
ndar ta Se
Advertisin Accountin
g, Arts g
n Te
Marketing & Media
tar Accountin
s
Ma & Communi g
gistic
CEO & Healthcar
icatio
Mo lay Manufact General
cations
e & Medical
nt sia Retail
uring,
TransporManagem
ah
mmun
& Consume ent
t & Logistics
Kia Manufact
t & Lo
r
uring, Products
ther O ra Transpor
& Co
t & Logistics
Farming Engineer
spor
, Animals Accountiing
i
g
ation Ku K
& Conserv ng
Sh
Adminis
, Tran
Resourc
Inform
ra
tration
t
es &
& Office Recruitm
t Ja
ppor
Support ent
S
turing Office Su tment
Ku ela lil
Sales
or
Klan
Marketi Account
n
Retail
a
ng &
Commuing
ac
tion er Product nication
& Commu
uf
s
nications
ng
&
Kuala Lum
tu la L
s
Admini Consult
KL
& Office
istic
lu
& Re ns
y
istrat
AccounSuppor
lity & Accoun ting t
ng
s M D g Tourism ting
in ce io Ec ivisio at
Adm ela
Retail
Log
sour icat
& Consum
o
Inform Sales
gi/Serda
er
ationMarket EngineProduc
an Re mun C n
Admin
M
Constr ering ts
ka
& Comm ing
istratio & Comm
Ku
uction
ity
n& unicati unicati
rt &
Suppo
Hum
Manuf Retail logy
la
Inform rt
ationacturin & Consu
g& Ke eng ka
ch
& Comm g, Transp mer Sales
ng
ketin Custom
spo
Centre Bankin Hospit ion Logist ts
& Custo g & Financ ality Techn ics
& Tourisology
in
ah
mer ial Servic m
p
acturi Real Servic
Se
Huma es
ng, Estate e
ran
on
n ResouTrans
g
&
ort
rces port &Property
g
po
& RecruLogist
Ce Engin itmenics
D
Admin Const eering t
g
ng
,T
istrat & FinanEngin ructio
es ion cial eeringn
& Office
up
iv
Servic
Hulu Langat
AccouServic
ra
Se
cial Suppo
Edu nting es
ing
& Finan rt
ng Admi
i
Banki nistra Engin
si
n re Const eering
Go er
Manution &
eS
ructio
Sales
Kot
ructio tectu
r
Const Archi factu Office n
pa
n& al Admi
pur City Ce
Huma ring, Supp
on
cts
u
Desig & Medic Produ nistra n ResouTrans ort
t
hcare tion
umer & Office rces port
Se
Healt
fac
Cons & Logis
& Recru
l& ology
g/P
c
Supp
m
tics
be
Retai itmen
Ch ban
i
& Techn
ng
ort
ce Media ation t
f
Accou
ScienArts &rannu
g
/Ban
Bank Trade nting
u f
g, ing s&
rtisin & Supe
ba s
& FinanEngin Servi
Accou
re
n
tin & O
Adve ance
aK
Insur l cial eerinces
Ma
Lega Servi g
Manu
Reta
factu il
nting ces
ring,& Cons
Infor Call
m
Tran umer Sales
k
mati Cent
n ion
sport Prod
on re
& Com & CustEngin& Logisucts
Se
Ban
omer eerin tics
cou
mun
a
Adm Hosp icatioAcco Servg
K
inistr italit n untin ice
y Tech g
ina
ation
Desi Engi& Tour nolog
& Offic
Man gn neer ism y
Ku
ufac Minin & Arch Accoe Supp ing
Klan
Ma
int
t
turin g, itect untin ort
g, Reso
nu
Ac nistra
Tran urceure g
Kajang
factu Hosp
spor
italit t s & Ener
Johor
&
y & Logis gy
rin Ac Engi Tour tics
g, coun neer
t
Hum ism
balu
ing
Tran
gy
Adman Scie
tin
a
or
inistReso nce
gsar
g
lai
sp ratiource &
ort
Tech
n &s &
olo
Recr nolo
Offic
& Lo
mi
t
Info Engi uitm gy
e Supp
neer ent
en chn
pp
rma
gy
Ma tion gis ing ort
nu Mar & Com
tic
ntre
keti Trad
fact s
m
ng mun es
Edu icati & Serv
uit Te
&
Ad
ur Com
lo
catio on Saleices
Su
ing, Info mun
Tech s
icati n & Trai
cr ion
rma nolo
tion ons
Tr Scie Acco ning gy
les g no
an Adm& Com
Re
nce unti
inist mun
sp
t
&
Techng
s
ratio
or
& nica ion
icati nolo
n & on
t&
e
Sale gy
Offic Tech
s
ch
Man Acco
Lo e nolo
Sa erin
ufac unti Sup
s
fic
t
turi Scie gi ng port gy
Hu stics
ce u a En
ng, nce
m Tran
ur mm unic
gin
&
e
an spo Tech
f
rt nolo
Re Acc &
ee Sales
so
ounLog gy
T
Info Man
e so tingistic
o
rma ufacMar
rin
O
s
ur tion
gin Re & C mm
turiketi
ce Adm & ng,ng
g
Com
n
&
s& inis Tran Com
En an
tratmun spo mun
Co
ion icat rt
Re
& io
& ion & icat
at
m
logys
Scie tm ting por
Hum Ret
Hu rma n
ail nce
en t
g ed
an &
Mar Res Con &
Tec t
io ic
Info Adm sum
our hno
et ts
er Sale logy
tion Pro s
tin Adm trat &
fo
at
Hum ion EngRec Leg duc
inis &
un
g
In ark ising, Arrvices
trat Call an
Com & Con ineeruit al ts
& Offi stru
ringment
Ad
ion CenRes mu
Acc ce ctio
Co & tre our nica ounSup n
tr
Offi & ces tion
vice m ce Cus & Saletingport
M ve &
tomRecTechnos
m m
Ser Sup
g m
rt Se er ts un por er ruit
Ser me logy
is
tom duc Acc t
in icat
Man vicent
Cus Pro Ret oun
Ad es tre & er
in Com
Humufa Mar ail ting
io
ctu ket & Tra
ad l Cen Consum logsy
is ns
s
in
Tr Res , & sum
tr
hno Adm &
& ourTra Com er Ser
r
s
Cal
Tec vice
En
ail inis Hea cesnsp
& Ser mu Pro vice
tic
istic
Ret
at
e ial trat lthc & ort nica
Sa
duc
enc re are Rec& Log tion tss
Ad
ee &
ion
Sci Financ
Ac gin
ctu g ty & Eng & ruit isti s
m
n
io
& ininper Off ine Med me
g ctio hite
gis
le
Tra Hum Ma ice erin ical ntcs
kin stru & Arc& Pro
an rke
m
& Sup g
n
Ad
s
BanCon ign ion
co ee
ate Resting por
Des catEst our & t
Lo
Edu l
in on
in
&
cesCom
un rin
Rea
& mu Sal
Rec nic es
is
O
In
ruit atio
&
tin g
Ma me ns
g ti
Log
tr
ffic
fo
nuf nt
act Hu Ma
ns
at
t
ma rke
rm
uri
En rma
or
ng, n ting
Cal Res
e
Tra
io
l
Heour & Com
e atio
nsp Cen
ati
g
althces
re S un sp
Su
ort tre mu Sal
n
& car & Rec
& nic
on
Log Cus Enge & rui atioes
in
an
pp
&
gy
Ma
Ser g al
&
nuf
le unt
oun
fo
act Re vic
hit sto om Tr
O
g
rt &
or
ting
uri tail e
Co
ort gisti hnolo
Ad ng, &
ffic
ic
ver TraCo
t
nsu
In
tu er m
tisi nsp
,
Ac up
ng, me
& ing
ort
Art &r Pro
Sa cco
in
m
Adm
e
S
co
En s Log duc
gin
al &
un
Me isti ts
eer
es
S
un
dia cs
r
in
s
ing
tu
is
ec
ic
u C
Hu
tin
Ma n
spo
tr
ec m
ma
cs
A
ati
rke Re
Cal
at
ke fac
ort
tin sou
l Cen
g
po
io
er
nT
g rce
t
Ban n
on
De
n
tre
Co &
Edu Fin
kin
sig
&
mm Re ine
&
g
itm
u
rt
cat Coanc re
Ad
Cu
&
&
unicru ng g
sto
O
ion nstial
Arc C
Te
s entr tin
Arc
an
catitm
m
me
ffic
ru
tio
hite vice
ion ent
Tra tionvic
eri inin
r
&
in
Ma
Ser
upp
Rec
ch
s
ctu
e
Inf
istr
ran
M
Acc
Lo
orm
Sup
Ma atio
e
Hum
no
& O nsp nica
ne
Ma Res
oun
es
nuf n
&
at
rke
an
ting
Returin
act & Com
po
Ca ar
nu
Adm
es
Hu
io
ting ces
lo
ail g, mu ign
Man
&
rt
& Tra nic & Arc
rc
our
&
&
m
s
gy
Con nspatio
Res n
Com Rec
vice
Ac
M
&
De ll C
ou
Des nic
an ctio
Hum st n
an
sumort n Tec
Ser
mu
&
fact
Off
u
itm upp
Con ig
En Su
er & Loghno re
eS
ort
g
ial
Sal ducisti y
ufa
ru
Re Des
Pro
co
atio
ruit
anc
,T
gi
ice
mm
l
es ts cs
Man
dica
Info
me
hite
ns
Fin
rism
ne
so
Me
Ad ng,
Man tion
nt
tin
rma
&
ctu
gi
Tou
un
er
ctu
log
ur
g
&
inis
uf
ufa &
kin
Ret ket , mu
Rec sum inin Med s
Mar ring
are
uri
Tra s & vice
ecru e S
&
pp
in
ia
Ban
ce
ctu Com
en
lity
ac
Adm
ts
& Art Ser
g
duc
or
Co
s & Ar
Hea
ig
g
tin
ri
tu
Offic
tion Pro
Hos
t
n
Ret Legcat sing des
trat
pita
Edu erti Tra
&
Re
ra
Sa oun
ion ,
ion
istr nsp
ffic
Hu
er nicaLog log
Info
lity
g,
g
trati
Pro tion
En
&
rt & Con
cr
urin
ent
Offi
&
rma
man n
&
duc
istics
Spoail al
ch
Tr
Man
ati ng, T
ui
Tou
&
Sa
coketingmunica&catiCusontomnciaOffilceSerSuplepors
ort
ts
ati
istic
tm
rism
s
an
ite
Call Baninistrat
le
s
on
Adm
y
Re
en
sp
ct
s
Cenking
o
Ac
ment
ran
&R
cruitm
ur
tre &
or
t
uf
so
Hu minis
ing
Supp
tinmunication
Information
ati
e
t&
c
ur
on
ion
les
Ad
Fina&
ma
&
actu
co
Info
ri
on &
En
ce
g slogy
ort
on
Con
rma Call
Lo erin
Ac
rces
spo
n Re tio
Off
rm
hno g
s&
gine
act
Manufac
min factu
stru
Sa
tion
inin vices
un
gist g
Ac
Mar Comtre
ctio
&
Admi
&
ring,
keti mun
Re
n
sour n & Of Acco
& Recruit
&O
tra
ics
Info
t
fo
ou
l Esta
tin
crui
co
Lo
rt &
Enginee tion & Office
s & Re
&
Comion er
unt
Hum istr
te mun
Cus
ces
Su
In
Res
rma
&
gis
tm
En
nu
Prop
un
g
nuf
nistra
Tran
TechServ
Call Centre &
Administration
icat nolo
trati
ns
& Re e Su ting
en
erty
pp
ring
ices municatio
tion
ions gy
Mini
turing,
tics
ffice
t
Ma
gin
& Communicat
Acc
ice
an
ng,
tin
Log
ScieReso Cons
fic
ort
cru ppor
Marke munica
& Com
spor
Acco
Mark
& Tech
Serv
tion
itm
ce
y ucts
g
eeri
Finan eting
Ad
ure
s&
eting
Hum ing,
Ma
Farm
turing, Tran
r Prod
cal
isti
nolo gy
un
en
o
ting
itect
oun
Ener
Consing & Mark
ications
& Medi
nolog
Accounting
an Anim
inis
& Comm
gy
sour
ion
inee
Transpor
ume
thca& Arch
t
Resources
Reso als
Sup
& Tech
& Of
t
Human
& Lo
RetaLega re
ce Cons
ng
Banking
cs
Estat & Recruervat
Acc
Heal gn
ring
unica
Bank
gy
Informa
Hospitality
Scienil &
Marketing
Technolo
e&
tions
mun
Huma
eerin
tion
ti
nistra
Prope
Call
untin
Marketing & Communications
fice
gist
Customer Service
tion &
an Re
munica logy
n Resou
Adm
itmen
Resources
Centre
Human
rty t
port
ication
ng
Com
g
Design
ounting
ion
Commun
& Finan
Call Centre
Administr
Eng
ion Techno
Sales
ics
Commun
t & Log
Resource
Supp
Sale
Accounting
& Recruecture e
& Comm
ng &
Engineering
ture
tion
ication
tion
Manufacturing, Transport & Logistics
Retail
ation
Accoun
& Custome
Healthca
er
& Conserva
& Architec
tion &
s & Recruitm
Admi
itmen
Engineering
cial Servi
& Consum
Technolo
Design Constru
& Office
s
Hum
Educatio
Marketi
& Recruit
& Property
Construc Informa
Training
gy
Manufac
ort
& Tourism
t
re &
Banking
Human
& Technolo
s
Advertising, Arts & Media
r Service
g
Design
& Architec
Design & Architecture
Enginee
Animals
Support
gy
n & Training
unications
istics
Medical
er Product
Communications
Educatio tion
ent
Design
Consulting
Farming, n &
Sales
Estate
Acc
& Financial
Sales
ring
Education & Strategy
& Property
ction
ting
Science & Technology
Information & Training
ture
Science
ces
Sales
& Architecture
Science & Technology
Design & Architecture
logy
s
Education & Training
Trades & Services
ment
Real
Sales
& Strategy
Services
&
Consulting
Real Estate
IQR = Q3 - Q1
return df_outlier_free
#===========================================================================================================#
# Plotting
plt.figure(figsize=(16, 6))
sns.boxplot(x='Salary', y='job_title', data=job[job['job_title'].isin(top_leagues)])
plt.title('Distribution of Salary by Top 20 job_title from job')
plt.xlabel('Salary')
plt.ylabel('job_title')
plt.show()
plt.figure(figsize=(16, 6))
sns.boxplot(x='Salary', y='job_title', data=jobs[jobs['job_title'].isin(top_leagues)])
plt.title('Distribution of Salary by Top 20 job_title from After Removing Outliers')
plt.xlabel('Salary')
plt.ylabel('job_title')
plt.show()
top_leagues = jobs['job_title'].value_counts().nlargest(15)
plt.figure(figsize=(20, 8))
plt.show()
tips_df = pd.read_csv("tip.csv")
display(tips_df.head(2))
fig = px.bar(tips_df,
x="sex",
y="total_bill",
color="smoker",
barmode="group",
facet_row="time",
facet_col="day",
category_orders={"day": ["Thur", "Fri", "Sat", "Sun"],
"time": ["Lunch", "Dinner"]})
fig.show()
time=Lunch
600
total_bill
400
200
800
time=Dinner
600
total_bill
400
200
0
Male Female Male Female Male Female Male Female
fig = px.box(tips_df,
x="time",
y="total_bill",
points="all")
fig.show()
50
40
total_bill
30
20
10
Dinner Lunch
time
fig = px.box(tips_df,
x="time",
y="total_bill",
points="outliers")
fig.show()
50
40
total_bill
30
20
10
Dinner Lunch
time
fig = px.box(tips_df,
x="day",
y="total_bill",
color="smoker" )
fig.update_traces(quartilemethod="linear")
fig.show()
smoker
50
No
Yes
40
total_bill
30
20
10
day
fig = px.box(tips_df,
x="time",
y="total_bill",
color="smoker",
notched=True,
hover_data=["day"] # add day column to hover data
)
fig.show()
smoker
50
No
Yes
40
total_bill
30
20
10
Dinner Lunch
time
plt.figure(figsize=(20, 10))
x = jobs['job_title'].head(20)
y = jobs['Salary'].head(20)
# Plot the scatter plot with country names and numbers on y-axis
marker_sizes = jobs['Salary']
for i, country in enumerate(x):
plt.scatter(country, y.iloc[i], s=(marker_sizes.iloc[i])/20, label=country, alpha=0.7)
plt.text(country, y.iloc[i], f'{y.iloc[i]:,.0f}', ha='center', va='bottom', rotation='vertical', fontsize=10
# Set y-axis to display numbers in billions
plt.ticklabel_format(style='plain', axis='y', useOffset=False, scilimits=(9, 9))
plt.xlabel('Job Title')
plt.ylabel('Salary')
plt.title('Scatter Plot Job Title for Salary')
plt.xticks(rotation=90)
plt.grid(True)
plt.tight_layout()
plt.show()
# Load the job listings, keep a random 5000-row sample, drop the job_id
# column and renumber the rows from 0.
# NOTE(review): sample() is unseeded, so every run picks different rows;
# pass random_state=... if reproducible output is wanted.
jobs = (
    pd.read_csv("jobstreet_all_job_dataset.csv")
    .sample(5000)
    .drop(columns=['job_id'])
    # drop=True discards the old index directly instead of materialising
    # it as an 'index' column that then has to be dropped again.
    .reset_index(drop=True)
)
display(jobs.shape)
display(jobs.head(2))
plt.axis('off')
plt.show()
(5000, 10)
job_title company descriptions location category subcategory role type salary listingDate
Mass JOB
Manager,
Rapid PURPOSE
Government Kuala 2024-05-
0 Transit :\nTo organize Construction Project Management manager Contract/Temp NaN
& Authority Lumpur 10T04:06:31Z
Corporation & participate in
Liasion
Sdn Bhd a ...
Designing
Saraya Information & information-
Junior IT solutions, Seremban 2024-04-
1 Goodmaid Communication Developers/Programmers technology- Full time NaN
Executive implementation, District 08T00:14:09Z
Sdn Bhd Technology executive
customiza...
# Drop rows with missing values (no plotting call is present in this copy).
job = jobs.dropna()
import re
# Clean a raw salary string.
# NOTE(review): this function is incomplete in this copy -- the chained
# replace call below is cut off and whatever followed it inside the `try`
# (presumably the mean calculation the name refers to) is missing. Restore it
# from the original notebook before running.
def clean_and_calculate_mean(salary):
try:
# Remove currency symbols, words, and extra characters
salary = salary.replace('RM', '').replace('MYR', '').replace('$', '').replace('per month', '').replace('p.m.'
except Exception as e:
# Any failure is reported on stdout and signalled to the caller with None.
print(f"Error processing salary '{salary}': {e}")
return None
job_title company descriptions location category subcategory role type listingDate Salary
# Three horizontal bar traces (role, location, company) with a row of buttons
# that shows either every trace or a single one.
fig = go.Figure()

trace_specs = (
    (dfp['role'], 'Position', 'LightCoral'),
    (dfl['location'], 'Location', 'CadetBlue'),
    (dfc['company'], 'Company', 'SteelBlue'),
)
for categories, trace_name, bar_color in trace_specs:
    fig.add_trace(
        go.Bar(
            y=categories,
            orientation='h',
            name=trace_name,
            marker=dict(color=bar_color),
        )
    )

# One visibility mask per button, listed in the order the traces were added.
button_specs = (
    ("All", [True, True, True]),
    ("Position", [True, False, False]),
    ('Location', [False, True, False]),
    ('Company', [False, False, True]),
)
toggle_buttons = [
    dict(
        label=button_label,
        method="update",
        args=[{"visible": visibility}, {'showlegend': True}],
    )
    for button_label, visibility in button_specs
]

fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.16,
            xanchor="left",
            y=1.12,
            yanchor="top",
            font=dict(color='Indigo', size=14),
            buttons=toggle_buttons,
        )
    ]
)

# "Choose:" caption placed next to the button row.
choose_caption = dict(
    text="Choose:",
    showarrow=False,
    x=0,
    y=1.075,
    yref="paper",
    align="right",
    font=dict(size=16, color='DarkSlateBlue'),
)
fig.update_layout(annotations=[choose_caption])
fig.show()
# Read the US job-market CSV into `df` and preview its first three rows.
df = pd.read_csv('US_Job_Market.csv')
df.head(3)
2 Data Scientist Xpert Staffing Growing company located in the Atlanta, GA are... NaN Atlanta, GA
import pandas as pd
from dash import Dash, dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objects as go
# Add one bar trace per position: listing count by location.
# NOTE(review): `fig` and the redf1..redf4 frames are created outside this
# fragment -- confirm they exist before this cell runs.
position_traces = (
    (redf1, 'Data Scientist', 'Tomato'),
    (redf2, 'Senior Data Scientist', 'LightCoral'),
    (redf3, 'Research Analyst', 'SteelBlue'),
    (redf4, 'Data Engineer', 'CadetBlue'),
)
for frame, position_name, bar_color in position_traces:
    fig.add_trace(
        go.Bar(
            x=frame['location'],
            y=frame['count'],
            name=position_name,
            marker=dict(color=bar_color),
        )
    )

# "Choose:" caption shown above the plot area.
choose_caption = dict(
    text="Choose:",
    showarrow=False,
    x=0,
    y=1.075,
    yref="paper",
    align="right",
    font=dict(size=16, color='DarkSlateBlue'),
)
fig.update_layout(annotations=[choose_caption])
fig.show()
The distribution of states by four Positions
80 Choose: All
▼
Data Scientist
Senior Data Scientist
Research Analyst
60 Data Engineer
40
20
0
Austin, TX
San Diego, CA
Seattle, WA
Atlanta, GA
Los Angeles, CA
Washington, DC
Chicago, IL
Boston, MA
San Francisco, CA
New York, NY
Mountain View, CA
Alameda, CA
Washington, DC 20036
Sunnyvale, CA
San Mateo, CA
# Load the heart-disease CSV, keeping only rows without missing values.
df = pd.read_csv("heart_disease_uci.csv").dropna()
df.head(2)
id age sex dataset cp trestbps chol fbs restecg thalch exang oldpeak slope ca thal num
lv fixed
0 1 63 Male Cleveland typical angina 145.0 233.0 True 150.0 False 2.3 downsloping 0.0 0
hypertrophy defect
lv
1 2 67 Male Cleveland asymptomatic 160.0 286.0 False 108.0 True 1.5 flat 3.0 normal 2
hypertrophy
# Sum each labelled column of df.
# NOTE(review): `labels` and `bars` come from code that is not part of this
# fragment.
counts = []
for label in labels:
    counts.append(df[label].sum())

# Print each total just above its bar, centred horizontally.
for rect, total in zip(bars, counts):
    label_x = rect.get_x() + rect.get_width() / 2
    label_y = rect.get_height() + 0.1  # small vertical offset above the bar
    plt.text(label_x, label_y, str(total), ha='center')

# Right-hand panel: mean oldpeak per thal category, largest first.
plt.subplot(1, 2, 2)
mean_oldpeak = df.groupby('thal')['oldpeak'].mean()
mean_oldpeak = mean_oldpeak.sort_values(ascending=False)
chart = mean_oldpeak.plot(kind='bar', color='gold')
chart.set_xticklabels(chart.get_xticklabels(), rotation=0)
plt.title('Thal from Old Peak', fontsize=15, color='b', pad=12)
plt.xlabel('Thal')
plt.ylabel('Old Peak')
plt.show()
# Count plot of the cp column with the count written on every bar.
plt.figure(figsize=(12, 4))
ax = sns.countplot(x=df['cp'])
for container in ax.containers:
    ax.bar_label(container)
plt.title("Count of Levels", fontsize=15);
# Filled density curve of the age column, followed by its skewness/kurtosis.
plt.figure(figsize=(8, 5))
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
sns.kdeplot(x=df.age, fill=True, color="r")
plt.title("Age Histogram", fontsize=20)
plt.show()
print(f"Histogram's skewness is {df.age.skew()} and kurtosis is {df.age.kurtosis()}")
# Keep only the numeric columns of df.
df_numeric = df.select_dtypes(include='number')
# Empty accumulator list.
# NOTE(review): the code that fills `results` is not present in this copy.
results = []
0 id 0.90 3.95
6 ca 1.19 0.26
# Scatter of trestbps against chol: marker size follows oldpeak, color follows cp.
sns.set(rc={'figure.figsize': (20, 7)})
sns.relplot(
    data=df,
    x='chol',
    y='trestbps',
    hue='cp',
    size='oldpeak',
    kind='scatter',
    aspect=1.2,
);
plt.figure(figsize=(20, 7))
# Count of records per age, with the bars ordered by ascending age.
age_order = sorted(df['age'].unique())
sns.countplot(data=df, x='age', order=age_order)
# Three side-by-side diagnostics for one column: histogram with density
# curve, normal probability plot, and box plot.
# NOTE(review): `variable` is expected to hold a column name of df; it is
# defined outside this fragment.
plt.figure(figsize=(17, 5))

plt.subplot(1, 3, 1)
# histplot(kde=True, stat="density") is the replacement for the deprecated distplot.
sns.histplot(df[variable], kde=True, stat="density")
plt.title('Histogram')

plt.subplot(1, 3, 2)
stats.probplot(df[variable], dist="norm", plot=plt)
# Label the axis with the plotted column instead of a hard-coded 'RM'.
plt.ylabel(f'{variable} quantiles')

plt.subplot(1, 3, 3)
sns.boxplot(x=df[variable])
plt.title('Boxplot')

plt.show()
# Correlation matrix of the numeric columns, excluding the id column.
corr = df.select_dtypes('number').drop('id', axis=1).corr()
# Boolean mask for the upper triangle (diagonal included) so every pair of
# columns is drawn only once.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    corr,
    cmap='Spectral_r',
    mask=mask,
    square=True,
    annot=True,
    linewidth=0.5,
    cbar_kws={"shrink": 0.5},
    ax=ax,
)
# The id column is not needed from here on.
df = df.drop('id', axis=1)
df.head(2)
age sex dataset cp trestbps chol fbs restecg thalch exang oldpeak slope ca thal num
0 63 Male Cleveland typical angina 145.0 233.0 True lv hypertrophy 150.0 False 2.3 downsloping 0.0 fixed defect 0
1 67 Male Cleveland asymptomatic 160.0 286.0 False lv hypertrophy 108.0 True 1.5 flat 3.0 normal 2
# Density curves of chol, one per value of the sex column.
plt.figure(figsize=(20, 5))
sns.set_context("paper")
kdeplt = sns.kdeplot(data=df, x="chol", hue="sex", palette='Dark2', alpha=0.7, lw=2)
# Bar chart of chol for each chest-pain type (cp).
fig = px.bar(
    df,
    x='cp',
    y='chol',
    color='cp',  # one color per chest-pain type
    title='Chol Value',
)
fig.show()
# Python Code Link: https://t.me/AIMLDeepThaught/573
Chol Value
180
cp
asymptomatic
160 atypical angina
non-anginal
140 typical angina
120
100
chol
80
60
40
20
0
asymptomatic atypical angina non-anginal typical angina
cp
# Reload the heart-disease data: drop incomplete rows, then the id column.
df = pd.read_csv("heart_disease_uci.csv").dropna().drop('id', axis=1)
df.head(2)
age sex dataset cp trestbps chol fbs restecg thalch exang oldpeak slope ca thal num
0 63 Male Cleveland typical angina 145.0 233.0 True lv hypertrophy 150.0 False 2.3 downsloping 0.0 fixed defect 0
1 67 Male Cleveland asymptomatic 160.0 286.0 False lv hypertrophy 108.0 True 1.5 flat 3.0 normal 2
# Distribution (histogram + density curve) of four numeric columns, side by side.
panels = [
    ('age', 'DeepPink'),
    ('chol', 'Green'),
    ('thalch', 'Red'),
    ('oldpeak', 'Magenta'),
]
plt.figure(figsize=(18, 5))
for position, (column, color) in enumerate(panels, start=1):
    # Size the grid to the number of panels so no empty slot is left over.
    plt.subplot(1, len(panels), position)
    # histplot(kde=True, stat="density") replaces the deprecated distplot.
    sns.histplot(df[column], color=color, kde=True, stat="density")
plt.tight_layout()
plt.show()
# Work on an independent copy of df; `deep` is a boolean flag, so pass True
# rather than a string that merely happens to be truthy.
df_cpy = df.copy(deep=True)
df_cpy = df_cpy.select_dtypes("number")
df_cpy = df_cpy[['age', 'chol', 'thalch', 'oldpeak']]
# One row of four axes, flattened for positional access.
fig, axis = plt.subplots(ncols=4, nrows=1, figsize=(15, 5))
index = 0
axis = axis.flatten()
# Histogram and box plot, side by side, for each of four numeric columns.
df_cpy = df.copy(deep=True)
df_cpy = df_cpy.select_dtypes("number")
df_cpy = df_cpy[['age', 'chol', 'thalch', 'oldpeak']]
n_cols = 4
# Every column takes two panels (histogram + box plot).
n_rows = int(np.ceil(df_cpy.shape[-1] * 2 / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows))
flat_axes = axes.flatten()
for i, col in enumerate(df_cpy.columns):
    mean = df_cpy[col].mean()
    median = df_cpy[col].median()
    hist_ax = flat_axes[2 * i]
    box_ax = flat_axes[2 * i + 1]
    sns.histplot(df_cpy[col], ax=hist_ax, kde=True)
    sns.boxplot(x=df_cpy[col], orient='h', ax=box_ax, color='g')
    # Red vertical line at the mean; the legend entry reports mean and median.
    box_ax.vlines(mean, ymin=-1, ymax=1, color='r',
                  label=f"For [{col}]\nMean: {mean:.2}\nMedian: {median:.2}")
    box_ax.legend()
    # Only histograms in the leftmost grid column keep a y-axis label.
    if (2 * i) % n_cols == 0:
        hist_ax.set_ylabel('Frequency')
    else:
        hist_ax.set_ylabel('')
plt.tight_layout()
0 7.4 0.70 0.0 1.9 0.08 11.0 34.0 1.0 3.51 0.56 9.4 5 0
1 7.8 0.88 0.0 2.6 0.10 25.0 67.0 1.0 3.20 0.68 9.8 5 1
plt.legend();
# Two bivariate density plots of chol: against thalch (left) and oldpeak (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 9))
axs = [ax1, ax2]
kde_panels = (
    ('thalch', "red", 'Chol Vs Thalch'),
    ('oldpeak', 'Blue', 'Chol Vs Oldpeak'),
)
for panel_ax, (x_column, line_color, panel_title) in zip(axs, kde_panels):
    sns.kdeplot(data=df, x=x_column, y='chol', ax=panel_ax, color=line_color)
    panel_ax.set_title(panel_title, size=14, weight='bold', pad=20)
fig.suptitle(' Highest and Lowest Correlation ', size=20, weight='bold');
# Load the US job-market data, drop incomplete rows and renumber from 0
# without keeping the old index as a column.
df1 = pd.read_csv('US_Job_Market.csv')
df1 = df1.dropna().reset_index(drop=True)
df1.head(2)
# Python Code Link: https://t.me/AIMLDeepThaught/573
position company description reviews location
# Figure scaffolding for the company-frequency chart.
# NOTE(review): no plotting call is present in this fragment.
plt.figure(figsize=(20, 7))
plt.title('Top 10 Most Frequent Companies')
plt.xticks(rotation=0)
plt.show()
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud, STOPWORDS
import pandas as pd
import requests
from io import BytesIO
import plotly.graph_objs as go
# Donut chart of the ten most frequent values of the company column.
values = df1['company'].value_counts()[:10]
labels = values.index
text = values.index
fig = go.Figure(data=[go.Pie(values=values, labels=labels, hole=.3)])
# Show the raw count on each slice; label and percentage appear on hover.
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='value',
    textfont_size=20,
    marker=dict(line=dict(color='#000000', width=3)),
)
# `title.font` replaces the deprecated top-level `titlefont` layout attribute.
fig.update_layout(title=dict(text="Most popular Jobs in USA", font=dict(size=30)))
fig.show()
# Python Code Link: https://t.me/AIMLDeepThaught/573
Amazon.com
Ball Aerospace
Microsoft
Google
187 NYU Langone Health
Fred Hutchinson Cancer Research Center
357
KPMG
134
45
49
76
70 66 49
# Bar chart of the ten most frequent values of the position column.
z = df1['position'].value_counts().head(10)
fig = px.bar(
    z,
    x=z.index,
    y=z.values,
    color=z.index,
    text=z.values,
    labels={'index': 'job title', 'y': 'count', 'text': 'count'},
)
fig.show()
200
position
204
Data Scientist
Senior Data Scientist
Research Analyst
150 Data Engineer
Machine Learning Engineer
Sr. Data Scientist
count
position
# Box plots of the first ten columns of wine on a 2x5 grid.
plt.figure(figsize=(20, 10))
for col, i in enumerate(wine.columns[:10], start=1):
    plt.subplot(2, 5, col)
    plt.boxplot(wine[i])
    plt.xlabel(i)
# Count plot of cp with each bar annotated by its share of all rows.
s = sns.countplot(x='cp', data=df)
sizes = []
row_total = len(df)
for patch in s.patches:
    bar_height = patch.get_height()
    sizes.append(bar_height)
    bar_centre = patch.get_x() + patch.get_width() / 2.
    s.text(
        bar_centre,
        bar_height + 3,
        '{:1.2f}%'.format(bar_height / row_total * 100),
        ha="center",
        fontsize=16,
    )
total sulfur dioxide 1143.000000 45.914698 32.782130 6.000000 21.000000 37.000000 61.000000 289.000000
free sulfur dioxide 1143.000000 15.615486 10.250486 1.000000 7.000000 13.000000 21.000000 68.000000
fixed acidity 1143.000000 8.311111 1.747595 4.600000 7.100000 7.900000 9.100000 15.900000
residual sugar 1143.000000 2.532152 1.355917 0.900000 1.900000 2.200000 2.600000 15.500000
citric acid 1143.000000 0.268364 0.196686 0.000000 0.090000 0.250000 0.420000 1.000000
volatile acidity 1143.000000 0.531339 0.179633 0.120000 0.392500 0.520000 0.640000 1.580000