Ggplot 2

Data Visualisation in R (link for html version)
1. Correlation
1.1 Scatterplot
# Print large numbers in full instead of scientific notation like 1e+48.
options(scipen = 999)

library(ggplot2)
theme_set(theme_bw()) # pre-set the bw theme
data("midwest", package = "ggplot2")
# midwest <- read.csv("http://goo.gl/G1K41K") # bkup data source

# Scatterplot of area against total population. Points are coloured by state
# and sized by population density; a loess curve is drawn on top.
gg <- ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +
  # `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
  geom_smooth(method = "loess", se = FALSE) +
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  labs(
    subtitle = "Area Vs Population",
    y = "Population",
    x = "Area",
    title = "Scatterplot",
    caption = "Source: midwest"
  )
plot(gg)
Scatterplot
Area Vs Population
500000
state
IL
400000 IN
MI
OH
Population
300000
WI
200000 popdensity
20000
40000
100000
60000
80000
0
0.000 0.025 0.050 0.075 0.100

Area
Source: midwest
1
1.2 Scatterplot with Encircling
# geom_encircle() is provided by the 'ggalt' package; install it if needed:
# devtools::install_github("hrbrmstr/ggalt")
options(scipen = 999)
library(ggplot2)
library(ggalt)

# Rows to highlight: population in (350000, 500000] and area in (0.01, 0.1).
midwest_select <- midwest[midwest$poptotal > 350000 &
  midwest$poptotal <= 500000 &
  midwest$area > 0.01 &
  midwest$area < 0.1, ]

# Plot
ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) + # draw points
  geom_smooth(method = "loess", se = FALSE) + # draw smoothing line
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  geom_encircle(aes(x = area, y = poptotal),
    data = midwest_select,
    color = "red", size = 2, expand = 0.08
  ) + # encircle
  labs(
    subtitle = "Area Vs Population",
    y = "Population", x = "Area",
    title = "Scatterplot + Encircle", caption = "Source: midwest"
  )
Scatterplot + Encircle
Area Vs Population
500000
state
IL
400000 IN
MI
OH
Population
300000
WI
200000 popdensity
20000
40000
100000
60000
80000
0
0.000 0.025 0.050 0.075 0.100

Area
Source: midwest
2
1.3 Jitterplot
library(ggplot2)
data(mpg, package = "ggplot2") # alternate source: "http://goo.gl/uEeRGu")
theme_set(theme_bw()) # pre-set the bw theme.

# Base plot of city vs highway mileage; later chunks add other geoms to `g`.
g <- ggplot(mpg, aes(cty, hwy))

# Scatterplot with a linear fit and no confidence band.
g + geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    subtitle = "mpg: city vs highway mileage",
    y = "hwy",
    x = "cty",
    title = "Scatterplot with overlapping points",
    caption = "Source: mpg" # the data plotted here is mpg, not midwest
  )
Scatterplot with overlapping points

mpg: city vs highway mileage
40
hwy
30
20
10 15 20 25 30 35
cty
Source: midwest
3
1.3 Jitterplot (2nd version)
# load package and data
library(ggplot2)
data(mpg, package = "ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")

# Jittered scatterplot. `g` is the cty-vs-hwy base plot created in the
# previous Jitterplot chunk. The opening `labs(` call is restored here;
# without it the label arguments are not valid R.
g + geom_jitter(width = .5, size = 1) +
  labs(
    y = "hwy",
    x = "cty",
    title = "Jittered Points"
  )
Jittered Points
40
30
hwy
20
10 15 20 25 30 35
cty
4
1.4 Counts Chart

library(ggplot2)

# Counts chart on the cty-vs-hwy base plot `g` built in the Jitterplot
# section. The opening `labs(` call is restored so the chunk parses.
g + geom_count(col = "tomato3", show.legend = FALSE) +
  labs(
    y = "hwy",
    x = "cty",
    title = "Counts Plot"
  )
Counts Plot
40
30
hwy
20
10 15 20 25 30 35
cty
5
1.5 Bubble plot
While a scatterplot lets you compare the relationship between two continuous variables, a bubble chart serves
well if you want to understand the relationship within the underlying groups based on:
• A Categorical variable (by changing the color) and

• Another continuous variable (by changing the size of points).
Bubble charts are most suitable when you have four-dimensional data: two numeric variables (X and Y),
one categorical variable (color) and one further numeric variable (size).
1.5 Bubble plot
library(ggplot2)

# Restrict the data to four manufacturers.
mpg_select <- mpg[mpg$manufacturer %in% c("audi", "ford", "honda", "hyundai"), ]

# Scatterplot
theme_set(theme_bw()) # pre-set the bw theme
g <- ggplot(mpg_select, aes(displ, cty)) +
  labs(subtitle = "mpg: Displacement vs City Mileage", title = "Bubble chart")

# Colour encodes manufacturer, point size encodes hwy; one linear fit per
# manufacturer with the confidence band switched off.
g + geom_jitter(aes(col = manufacturer, size = hwy)) +
  geom_smooth(aes(col = manufacturer), method = "lm", se = FALSE)
Bubble chart
mpg: Displacement vs City Mileage
hwy
15
25
20
25
30
20 35
cty
manufacturer
audi
15 ford
honda
hyundai
10
2 3 4 5
displ
6
1.6 Marginal Histogram/Boxplot
If you want to show the relationship as well as the distribution in the same chart, use the marginal histogram.
It has a histogram of the X and Y variables at the margins of the scatterplot.
1.6 Marginal Histogram

library(ggplot2)
library(ggExtra)

# Scatterplot
# NOTE: mpg_select is assigned here but the plot below uses the full mpg data.
mpg_select <- mpg[mpg$hwy >= 35 & mpg$cty > 27, ]
g <- ggplot(mpg, aes(cty, hwy)) +
  geom_count() +
  geom_smooth(method = "lm", se = FALSE)

# ggMarginal(g, type = "density", fill = "transparent")

# Add histograms at the margins of the scatterplot.
ggMarginal(g, type = "histogram", fill = "transparent")
40
n
hwy
30 5
10
20
10 15 20 25 30 35
cty
7
1.6 Marginal Boxplot

library(ggplot2)
library(ggExtra)

# Scatterplot
# NOTE: mpg_select is assigned here but the plot below uses the full mpg data.
mpg_select <- mpg[mpg$hwy >= 35 & mpg$cty > 27, ]
g <- ggplot(mpg, aes(cty, hwy)) +
  geom_count() +
  geom_smooth(method = "lm", se = FALSE)

# ggMarginal(g, type = "density", fill = "transparent")

# Add boxplots at the margins of the scatterplot.
ggMarginal(g, type = "boxplot", fill = "transparent")
40
n
hwy
30 5
10
20
10 15 20 25 30 35
cty
8
1.7 Correlogram
A correlogram lets you examine the correlation between multiple continuous variables present in the same dataframe.
1.7 Correlogram
library(ggplot2)
library(ggcorrplot)

# Pairwise correlations of every mtcars column, rounded to one decimal.
data(mtcars)
corr <- round(cor(mtcars), digits = 1)

# Lower-triangle correlogram drawn with circles and labelled with the
# rounded coefficients.
ggcorrplot(
  corr,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of mtcars",
  ggtheme = theme_bw
)
Correlogram of mtcars
mpg 0.7
gear 0.5 0.7
am 0.8 0.6 0.7
Corr
vs 0.2 0.2 0.7 0.4 1.0
qsec 0.7 −0.2 −0.2 0.4 0.1 0.5
0.0
disp −0.4 −0.7 −0.6 −0.6 −0.8 −0.7
−0.5
cyl 0.9 −0.6 −0.8 −0.5 −0.5 −0.9 −0.7
−1.0
hp 0.8 0.8 −0.7 −0.7 −0.2 −0.1 −0.8 −0.4
wt 0.7 0.8 0.9 −0.2 −0.6 −0.7 −0.6 −0.9 −0.7
carb 0.4 0.7 0.5 0.4 −0.7 −0.6 0.1 0.3 −0.6 −0.1
t
hp
l
p
ec
vs
ar
pg
at
am
cy
w
dr
ge
di
qs
9
2. Deviation
Compare variation in values between a small number of items (or categories) with respect to a fixed reference.
2.1 Diverging bars
library(ggplot2)
theme_set(theme_bw())

# Data Prep
data("mtcars") # load data
# The column name contains a space, so it must be quoted with backticks.
mtcars$`car name` <- rownames(mtcars) # create new column for car names

# Compute normalized mpg (z-score rounded to two decimals)
mtcars$mpg_z <- round((mtcars$mpg - mean(mtcars$mpg)) / sd(mtcars$mpg), 2)
mtcars$mpg_type <- ifelse(mtcars$mpg_z < 0, "below", "above") # above/below avg flag
mtcars <- mtcars[order(mtcars$mpg_z), ] # sort

# Convert to factor to retain sorted order in plot
mtcars$`car name` <- factor(mtcars$`car name`, levels = mtcars$`car name`)

# Diverging Barcharts
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
  geom_bar(stat = "identity", aes(fill = mpg_type), width = .5) +
  scale_fill_manual(
    name = "Mileage",
    labels = c("Above Average", "Below Average"),
    values = c("above" = "#00ba38", "below" = "#f8766d")
  ) +
  labs(subtitle = "Normalised mileage from 'mtcars'", title = "Diverging Bars") +
  coord_flip()
Diverging Bars
Normalised mileage from 'mtcars'
Toyota Corolla
Fiat 128
Lotus Europa
Honda Civic
Fiat X1−9
Porsche 914−2
Merc 240D
Merc 230
Datsun 710
Toyota Corona
Volvo 142E
Hornet 4 Drive
Mazda RX4 Wag
Mileage
car name
Mazda RX4
Ferrari Dino
Pontiac Firebird Above Average
Merc 280
Hornet Sportabout
Valiant Below Average
Merc 280C
Merc 450SL
Merc 450SE
Ford Pantera L
Dodge Challenger
AMC Javelin
Merc 450SLC
Maserati Bora
Chrysler Imperial
Duster 360
Camaro Z28
Lincoln Continental
Cadillac Fleetwood
−1 0 1 2
mpg_z
10
2.2 Diverging Lollipop chart
library(ggplot2)

# Diverging lollipop chart. Uses the mtcars columns `car name` and mpg_z
# prepared in the Diverging bars chunk. The leading ggplot() call is restored;
# without it the layers below have no plot to be added to.
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
  geom_point(stat = "identity", fill = "black", size = 6) +
  # Stem from the zero baseline to each car's z-score.
  geom_segment(
    aes(
      y = 0,
      x = `car name`,
      yend = mpg_z,
      xend = `car name`
    ),
    color = "black"
  ) +
  geom_text(color = "white", size = 2) +
  labs(
    title = "Diverging Lollipop Chart",
    subtitle = "Normalized mileage from 'mtcars': Lollipop"
  ) +
  ylim(-2.5, 2.5) +
  coord_flip()
Diverging Lollipop Chart

Normalized mileage from 'mtcars': Lollipop
Toyota Corolla 2.29
Fiat 128 2.04
Lotus Europa 1.71
Honda Civic 1.71
Fiat X1−9 1.2
Porsche 914−2 0.98
Merc 240D 0.72
Merc 230 0.45
Datsun 710 0.45
Toyota Corona 0.23
Volvo 142E 0.22
Hornet 4 Drive 0.22
Mazda RX4 Wag 0.15
car name
Mazda RX4 0.15

Ferrari Dino −0.06
Pontiac Firebird −0.15
Merc 280 −0.15
Hornet Sportabout −0.23
Valiant −0.33
Merc 280C −0.38
Merc 450SL −0.46
Merc 450SE −0.61
Ford Pantera L −0.71
Dodge Challenger −0.76
AMC Javelin −0.81
Merc 450SLC −0.81
Maserati Bora −0.84
Chrysler Imperial −0.89
Duster 360 −0.96
Camaro Z28 −1.13
Lincoln Continental −1.61
Cadillac Fleetwood −1.61
−2 −1 0 1 2
mpg_z
11
2.3 Diverging Dot Plot
library(ggplot2)

# Plot
# Diverging dot plot. Uses the mtcars columns `car name`, mpg_z and mpg_type
# prepared in the Diverging bars chunk. The leading ggplot() call is restored
# so the layers below have a plot to attach to.
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
  geom_point(stat = "identity", aes(col = mpg_type), size = 6) +
  scale_color_manual(
    name = "Mileage",
    labels = c("Above Average", "Below Average"),
    values = c("above" = "#00ba38", "below" = "#f8766d")
  ) +
  geom_text(color = "white", size = 2) +
  labs(
    title = "Diverging Dot Plot",
    subtitle = "Normalized mileage from 'mtcars': Dotplot"
  ) +
  ylim(-2.5, 2.5) +
  coord_flip()
Diverging Dot Plot

Normalized mileage from 'mtcars': Dotplot
Toyota Corolla 2.29
Fiat 128 2.04
Lotus Europa 1.71
Honda Civic 1.71
Fiat X1−9 1.2
Porsche 914−2 0.98
Merc 240D 0.72
Merc 230 0.45
Datsun 710 0.45
Toyota Corona 0.23
Volvo 142E 0.22
Hornet 4 Drive 0.22
Mazda RX4 Wag 0.15
Mileage
car name
Mazda RX4 0.15

Ferrari Dino −0.06
Pontiac Firebird −0.15
Above Average
Merc 280 −0.15
Hornet Sportabout −0.23
Valiant −0.33 Below Average
Merc 280C −0.38
Merc 450SL −0.46
Merc 450SE −0.61
Ford Pantera L −0.71
Dodge Challenger −0.76
AMC Javelin −0.81
Merc 450SLC −0.81
Maserati Bora −0.84
Chrysler Imperial −0.89
Duster 360 −0.96
Camaro Z28 −1.13
Lincoln Continental −1.61
Cadillac Fleetwood −1.61
−2 −1 0 1 2
mpg_z
12
2.4 Area Chart
Area charts are typically used to visualize how a particular metric (such as % returns from a stock) performed
compared to a certain baseline.
Area Chart
library(ggplot2)
library(quantmod)
data("economics", package = "ggplot2")

# Compute % Returns: change in psavert relative to the previous row's value,
# with 0 for the first row.
savings_rate <- economics$psavert
previous_rate <- savings_rate[-length(savings_rate)]
economics$returns_perc <- c(0, diff(savings_rate) / previous_rate)

# Create break points and labels for axis ticks: every 12th date, labelled
# with its year.
tick_rows <- seq(1, length(economics$date), 12)
brks <- economics$date[tick_rows]
lbls <- lubridate::year(economics$date[tick_rows])

# Plot the first 100 rows only.
ggplot(economics[1:100, ], aes(date, returns_perc)) +
  geom_area() +
  labs(
    title = "Area Chart",
    subtitle = "Perc Returns for Personal Savings",
    y = "% Returns for Personal savings",
    caption = "Source: economics"
  ) +
  scale_x_date(breaks = brks, labels = lbls) +
  theme(axis.text.x = element_text(angle = 90))
Area Chart
Perc Returns for Personal Savings
0.2
% Returns for Personal savings
0.1
0.0
−0.1
−0.2
1967
1968
1969
1970
1971
1972
1973
1974
1975
date
Source: economics
13
3. Ranking
Used to compare the position or performance of multiple items with respect to each other. Actual values
matter somewhat less than the ranking.
3.1 Ordered Bar Chart
# Load ggplot2 first: it provides both the plotting functions and the mpg
# dataset used in the data preparation below.
library(ggplot2)

# Prepare data: group mean city mileage by manufacturer.
cty_mpg <- aggregate(mpg$cty, by = list(mpg$manufacturer), FUN = mean) # aggregate
colnames(cty_mpg) <- c("make", "mileage") # change column names
cty_mpg <- cty_mpg[order(cty_mpg$mileage), ] # sort
cty_mpg$make <- factor(cty_mpg$make, levels = cty_mpg$make) # to retain the order in plot

# Draw plot
ggplot(cty_mpg, aes(x = make, y = mileage)) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  labs(
    title = "Ordered Bar Chart",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
Ordered Bar Chart

Make Vs Avg. Mileage
25
20
15
mileage
10
0
n
ver
age
t
ry
le
dai
tiac
aru
san
oln
a
ota
rcu
vro
p
i
d ro
ford
g
d
aud
n
jee
sw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
k
lan
vol
make
source: mpg
14
3.2 Lollipop Chart
library(ggplot2)

# Plot
# Lollipop chart of cty_mpg (make / mileage) prepared in the Ordered Bar
# Chart chunk. The ggplot() call and the tail of labs() are restored so the
# chunk parses; caption and rotated tick labels follow the rendered figure.
ggplot(cty_mpg, aes(x = make, y = mileage)) +
  geom_point(size = 3) +
  # Stem from zero up to each make's mean mileage.
  geom_segment(aes(
    x = make,
    xend = make,
    y = 0,
    yend = mileage
  )) +
  labs(
    title = "Lollipop Chart",
    caption = "source: mpg"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
Lollipop Chart
25
20
15
mileage
10
0
n
ver
age
let
ry
dai
tiac
aru
san
oln
a
ota
rcu
vro
p
i
d ro
ford
g
d
aud
n
jee
sw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
k
lan
vol
make
source: mpg
15
3.3 Dot plot
library(ggplot2)
library(scales)
theme_set(theme_classic())

# Plot
# Dot plot of cty_mpg (make / mileage) prepared in the Ordered Bar Chart
# chunk. The ggplot() call and the closing of labs() are restored so the
# chunk parses; the caption follows the rendered figure.
ggplot(cty_mpg, aes(x = make, y = mileage)) +
  geom_point(col = "tomato2", size = 3) + # Draw points
  geom_segment(
    aes(
      x = make,
      xend = make,
      y = min(mileage),
      yend = max(mileage)
    ),
    linetype = "dashed",
    size = 0.1
  ) + # Draw dashed lines
  labs(
    title = "Dot Plot",
    caption = "source: mpg"
  ) +
  coord_flip()
Dot Plot
honda
volkswagen
subaru
hyundai
toyota
nissan
audi
make
pontiac
chevrolet
ford
jeep
mercury
dodge
land rover
lincoln
15 20 25
mileage
source: mpg
16
3.4 Slope Chart
library(ggplot2)
library(scales)

# Prep data
df <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/gdppercap.csv")
colnames(df) <- c("continent", "1952", "1957")
# Column names that begin with a digit have to be quoted with backticks.
left_label <- paste(df$continent, round(df$`1952`), sep = ", ")
right_label <- paste(df$continent, round(df$`1957`), sep = ", ")
# "red" when the value fell between 1952 and 1957, "green" otherwise.
df$class <- ifelse((df$`1957` - df$`1952`) < 0, "red", "green")

# Plot
p <- ggplot(df) +
  geom_segment(aes(x = 1, xend = 2, y = `1952`, yend = `1957`, col = class),
    size = .75, show.legend = FALSE
  ) +
  geom_vline(xintercept = 1, linetype = "dashed", size = .1) +
  geom_vline(xintercept = 2, linetype = "dashed", size = .1) +
  scale_color_manual(
    labels = c("Up", "Down"),
    values = c("green" = "#00ba38", "red" = "#f8766d")
  ) + # color of lines
  labs(x = "", y = "Mean GdpPerCap") + # axes labels
  xlim(.5, 2.5) +
  ylim(0, (1.1 * (max(df$`1952`, df$`1957`)))) # axes limits

# Add texts: one label per row at each end, then the two column headings
# placed at the top of the y range.
p <- p + geom_text(label = left_label, y = df$`1952`, x = rep(1, NROW(df)), hjust = 1.1, size = 3.5)
p <- p + geom_text(label = right_label, y = df$`1957`, x = rep(2, NROW(df)), hjust = -0.1, size = 3.5)
p <- p + geom_text(label = "Time 1", x = 1, y = 1.1 * (max(df$`1952`, df$`1957`)), hjust = 1.2, size = 5)
p <- p + geom_text(label = "Time 2", x = 2, y = 1.1 * (max(df$`1952`, df$`1957`)), hjust = -0.1, size = 5)

# Minify theme
p + theme(
  panel.background = element_blank(), panel.grid = element_blank(),
  axis.ticks = element_blank(), axis.text.x = element_blank(),
  panel.border = element_blank(), plot.margin = unit(c(1, 2, 1, 2), "cm")
)
Time 1 Time 2
Oceania, 11599
10000 Oceania, 10298

Mean GdpPerCap
Europe, 6963
Europe, 5661
5000 Asia, 5195
Americas, 4616
Americas, 4079 Asia, 4003
Africa, 1253 Africa, 1385

0
17
3.5 Dumbbell Plot
Dumbbell charts are a great tool if you wish to:
• Visualise relative positions (like growth and decline) between two points in time.
• Compare distance between two categories.
18
Dumbbell Plot
library(ggplot2)
library(ggalt)
health <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/health.csv")

# Fix the factor levels to the order in which the areas appear in the file.
health$Area <- factor(health$Area, levels = as.character(health$Area)) # right ordering
# health$Area <- factor(health$Area)

# One dumbbell per area running from pct_2013 to pct_2014.
gg <- ggplot(health, aes(x = pct_2013, xend = pct_2014, y = Area, group = Area)) +
  geom_dumbbell(
    color = "#a3c4dc",
    size = 0.75,
    point.colour.l = "#0e668b"
  ) +
  # Full argument name instead of the partial `label`; the formatter is
  # namespaced because this chunk does not attach the scales package.
  scale_x_continuous(labels = scales::percent) +
  labs(
    x = NULL, y = NULL, title = "Dumbbell Chart",
    subtitle = "Pct Change: 2013 vs 2014",
    caption = "Source: https://github.com/hrbrmstr/ggalt"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.background = element_rect(fill = "#f7f7f7"),
    panel.background = element_rect(fill = "#f7f7f7"), panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(), panel.grid.major.x = element_line(),
    axis.ticks = element_blank(), legend.position = "top", panel.border = element_blank()
  )
plot(gg)
Dumbbell Chart
Pct Change: 2013 vs 2014
Boston
Minneapolis
Pittsburgh
Baltimore
San Francisco
Seattle
Philadelphia
Detroit
St. Louis
Portland
Washington, D.C.
Denver
New York
Chicago
All Metro Areas
San Diego
Charlotte
Phoenix
Riverside, Calif.
Tampa
Los Angeles
Atlanta
San Antonio
Dallas
Miami
Houston
5% 10% 15% 20% 25%
Source: https://github.com/hrbrmstr/ggalt
19
4. Distribution
• When you have a lot of data points and want to study where and how the data points are distributed.
4.1 Histogram (with automatic binning)
library(ggplot2)

# Histogram on a Continuous (Numeric) Variable
# Base plot of engine displacement with the Spectral fill palette; the
# fixed-bins chunk that follows reuses `g`.
g <- ggplot(mpg, aes(displ)) +
  scale_fill_brewer(palette = "Spectral")

# Bars are filled by vehicle class and use a bin width of 0.1.
g +
  geom_histogram(
    aes(fill = class),
    binwidth = .1, # change binwidth
    col = "black",
    size = .1
  ) +
  labs(
    title = "Histogram with Auto Binning",
    subtitle = "Engine Displacement across Vehicle Classes"
  )
Histogram with Auto Binning

Engine Displacement across Vehicle Classes
20
class
15 2seater
compact
midsize
count
10 minivan
pickup
subcompact
5 suv
2 3 4 5 6 7
displ
20
4.1 Histogram (with fixed binning)
library(ggplot2)

# Same displacement histogram as before (base plot `g` comes from the
# preceding histogram chunk), this time split into exactly five bins.
g +
  geom_histogram(
    aes(fill = class),
    bins = 5, # change number of bins
    col = "black",
    size = .1
  ) +
  labs(
    title = "Histogram with Fixed Bins",
    subtitle = "Engine Displacement across Vehicle Classes"
  )
Histogram with Fixed Bins

Engine Displacement across Vehicle Classes
80
60 class
2seater
compact
midsize
count
40
minivan
pickup
subcompact
20 suv
2 4 6
displ
21
4.1 Histogram (on a categorical variable)
library(ggplot2)

# Histogram on a Categorical variable
# One bar per manufacturer, stacked by vehicle class.
g <- ggplot(mpg, aes(manufacturer))
g +
  geom_bar(aes(fill = class), width = 0.5) +
  labs(
    title = "Histogram on Categorical Variable",
    subtitle = "Manufacturer across Vehicle Classes"
  ) +
  # Rotate the manufacturer names on the x axis.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
Histogram on Categorical Variable

Manufacturer across Vehicle Classes
30 class
2seater
compact
midsize
count
20
minivan
pickup
subcompact
10
suv
0
n
ver
age
t
ry
le
dai
tiac
aru
san
oln
e
ota
vro
rcu
i
p
d ro
ford
g
d
aud
jee
ksw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
lan
vol
manufacturer
22
4.2 Density plot
library(ggplot2)

# Plot
# City mileage density, one semi-transparent curve per cylinder count.
g <- ggplot(mpg, aes(cty))
g +
  geom_density(aes(fill = factor(cyl)), alpha = 0.8) +
  labs(
    x = "City Mileage",
    fill = "# Cylinders",
    title = "Density plot",
    subtitle = "City Mileage Grouped by Number of cylinders",
    caption = "Source: mpg"
  )
Density plot
City Mileage Grouped by Number of cylinders
0.4
# Cylinders
4
density
5
6
0.2
8
0.0
10 15 20 25 30 35
City Mileage
Source: mpg
23
4.3 Boxplot
library(ggplot2)

# Plot
# City mileage by vehicle class. `g` is reused by the second boxplot version.
g <- ggplot(mpg, aes(class, cty))
g + geom_boxplot(varwidth = TRUE, fill = "plum") +
  labs(
    title = "Box plot",
    subtitle = "City Mileage grouped by Class of vehicle",
    caption = "Source: mpg", # shown in the rendered figure
    x = "Class of Vehicle",
    y = "City Mileage"
  )
Box plot
City Mileage grouped by Class of vehicle
35
30
City Mileage
25
20
15
10
2seater compact midsize minivan pickup subcompact suv

Class of Vehicle
Source: mpg
24
4.3 Boxplot (2nd version)
library(ggthemes)

# Boxplot of city mileage by class, split by cylinder count. `g` is the
# class-vs-cty base plot from the first Boxplot chunk. The x label, caption
# and rotated tick labels follow the rendered figure.
g + geom_boxplot(aes(fill = factor(cyl))) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(
    title = "Box plot",
    caption = "Source: mpg",
    x = "Class of Vehicle",
    y = "City Mileage"
  )
Box plot
35
30
factor(cyl)
City Mileage
25 4
5
20 6
8
15
10
t
pac
t
n
ize
r
pac
kup
ate
iva
com
suv
s
com
2se
mid
min
pic
sub
Class of Vehicle
Source: mpg
25
4.4 Dot + Boxplot
library(ggplot2)

# plot
# Boxplot of city mileage per manufacturer with the individual observations
# overlaid as centred, stacked dots.
g <- ggplot(mpg, aes(manufacturer, cty))
g + geom_boxplot() +
  geom_dotplot(
    binaxis = "y",
    stackdir = "center",
    dotsize = .5,
    fill = "red"
  ) +
  labs(
    title = "Box plot + Dot plot",
    subtitle = "City Mileage vs Class: Each dot represents 1 row in source data",
    y = "City Mileage"
  )
Box plot + Dot plot

City Mileage vs Class: Each dot represents 1 row in source data
30
City Mileage
20
10
n
ver
age
let
ry
ai
tiac
aru
san
oln
ge
da
ota
vro
nd
rcu
i
d ro
ford
aud
jee
ksw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
lan
vol
Class of Vehicle
Source: mpg
26
4.5 Tufte’s Boxplot
library(ggthemes)
library(ggplot2)

# Switch the session default theme to the Tufte theme.
theme_set(theme_tufte()) # from ggthemes

# plot
# Tufte-style boxplot of city mileage for each manufacturer.
g <- ggplot(mpg, aes(x = manufacturer, y = cty))
g +
  geom_tufteboxplot() +
  labs(
    y = "City Mileage",
    title = "Tufte Styled Boxplot"
  )
Tufte Styled Boxplot

35
30
City Mileage
25
20
15
10
gen
ver
t
ry
i
le
tiac
oln
aru
nda
ota
ge
da
a
i
vro
ford
jeep
rcu
d ro
a
aud
ksw
niss
dod
hon
linc
sub
pon
toy
hyu
me
che
lan
vol
Class of Vehicle
Source: mpg
27
4.6 Violin Plot
library(ggplot2)

# plot
# Define the base plot here: the `g` left over from the previous chunk maps
# manufacturer, while this chart (see its subtitle) is per vehicle class.
g <- ggplot(mpg, aes(class, cty))
g + geom_violin() +
  labs(
    title = "Violin plot",
    subtitle = "City Mileage vs Class of vehicle",
    caption = "Source: mpg", # shown in the rendered figure
    x = "Class of Vehicle",
    y = "City Mileage"
  )
Violin plot
City Mileage vs Class of vehicle
35
30
City Mileage
25
20
15
10
2seater compact midsize minivan pickup subcompact suv

Class of Vehicle
Source: mpg
28
4.7 Bean Plot
library(beanplot)
set.seed(1)

# Two panels side by side. par() returns the previous settings, which are
# kept so the layout can be restored once both plots are drawn.
old_par <- par(mfrow = c(1, 2), mai = c(0.5, 0.5, 0.5, 0.1))

mu <- 2
si <- 0.6
n <- 500 # sample size; named `n` so the variable does not mask base::c()

# Three samples of equal size with different shapes.
bimodal <- c(rnorm(n / 2, -mu, si), rnorm(n / 2, mu, si))
uniform <- runif(n, -4, 4)
normal <- rnorm(n, 0, 1.5)
y_range <- c(-7, 7) # common y-axis limits for both panels

boxplot(bimodal, uniform, normal, ylim = y_range, main = "boxplot", names = 1:3)
beanplot(bimodal, uniform, normal,
  ylim = y_range, main = "beanplot",
  col = c("#CAB2D6", "#33A02C", "#B2DF8A"), border = "#CAB2D6"
)

# Restore the graphics parameters changed above.
par(old_par)
boxplot beanplot
6
6
4
4
2
2
0
0
−2
−2
−4
−4
−6
−6
1 2 3 1 2 3
29
4.8 Population Pyramid
library(ggplot2)
library(ggthemes)
options(scipen = 999) # turns off scientific notation like 1e+40

# Read data
email_campaign_funnel <-
  read.csv("https://raw.githubusercontent.com/selva86/datasets/master/email_campaign_funnel.csv")

# X Axis Breaks and Labels
# Breaks run from -15 million to +15 million; the labels show magnitudes
# only ("15m" ... "0m" ... "15m"), so both sides of the axis read as positive.
brks <- seq(-15000000, 15000000, 5000000)
lbls <- paste0(as.character(c(seq(15, 0, -5), seq(5, 15, 5))), "m")

# Plot
ggplot(email_campaign_funnel, aes(x = Stage, y = Users, fill = Gender)) + # fill column
  geom_bar(stat = "identity", width = .6) + # draw the bars
  scale_y_continuous(
    breaks = brks, # breaks
    labels = lbls # labels
  ) +
  coord_flip() + # flip axes
  labs(title = "Email Campaign Funnel") +
  theme_tufte() + # tufte theme from ggthemes
  theme(
    plot.title = element_text(hjust = .5), # centre plot title
    axis.ticks = element_blank()
  ) +
  scale_fill_brewer(palette = "Dark2") # colour palette
Email Campaign Funnel

Stage 18: 5th Purchase
Stage 17: 4th Purchase
Stage 16: 3rd Purchase
Stage 15: 2nd Purchase
Stage 14: 1st Successful Purchase
Stage 13: Payment Successful
Stage 12: Payment
Stage 11: Submit Order Page Gender
Stage
Stage 10: Address Verification Page Female

Stage 09: Cart Confirmation Page
Male
Stage 08: Buy Button Clickers
Stage 07: Buy Button Page
Stage 06: Campaign−Email Clickthroughs
Stage 05: Campaign−Email Opens
Stage 04: Email Confirmed
Stage 03: Email Signups
Stage 02: Unbounced Users
Stage 01: Browsers
15m 10m 5m 0m 5m 10m 15m
Users
30
5. Composition
5.1 Waffle Chart
var <- mpg$class # categorical data

# Prep data (nothing to change here)
nrows <- 10
df <- expand.grid(y = 1:nrows, x = 1:nrows)

# Allocate the nrows^2 tiles to categories in proportion to their frequency.
# Plain rounding can give a total other than nrows^2, which would make the
# assignment to df$category fail. Instead, take the floor of each share and
# hand the remaining tiles to the categories with the largest fractional
# parts, so the counts always sum to nrows^2.
n_tiles <- nrows * nrows
scaled <- table(var) * (n_tiles / length(var))
categ_table <- floor(scaled)
shortfall <- n_tiles - sum(categ_table)
top_up <- order(scaled - categ_table, decreasing = TRUE)[seq_len(shortfall)]
categ_table[top_up] <- categ_table[top_up] + 1
df$category <- factor(rep(names(categ_table), categ_table))

# Plot
ggplot(df, aes(x = x, y = y, fill = category)) +
  geom_tile(color = "black", size = 0.5) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), trans = "reverse") +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Waffle Chart", subtitle = "'Class' of vehicles",
    caption = "Source: mpg"
  ) +
  theme(
    panel.border = element_rect(size = 2),
    plot.title = element_text(size = rel(1.2)),
    axis.text = element_blank(), axis.title = element_blank(),
    axis.ticks = element_blank(), legend.title = element_blank(),
    legend.position = "right"
  )
Waffle Chart
'Class' of vehicles
2seater
compact
midsize
minivan
pickup
subcompact
suv
Source: mpg
31
5.2 Pie Chart
library(ggplot2)

# Source: Frequency table
# Count the rows per vehicle class and name the two resulting columns.
class_counts <- table(mpg$class)
df <- setNames(as.data.frame(class_counts), c("class", "freq"))

# A single stacked bar of width 1 ...
pie <- ggplot(df, aes(x = "", y = freq, fill = factor(class))) +
  geom_bar(width = 1, stat = "identity") +
  labs(
    title = "Pie Chart of class",
    caption = "Source: mpg",
    fill = "class",
    x = NULL,
    y = NULL
  ) +
  theme(
    axis.line = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# ... drawn in polar coordinates with the angle mapped to y.
pie + coord_polar(theta = "y", start = 0)
Pie Chart of class

0
200
class
2seater
50 compact
midsize
minivan
pickup
subcompact
suv
150
100
Source: mpg
32
5.3 Treemap
library(ggplot2)
library(treemapify)

# Treemap of the G20 data: tile area from gdp_mil_usd, fill from hdi, and
# each tile labelled with its country.
ggplot(G20, aes(area = gdp_mil_usd, fill = hdi, label = country)) +
  geom_treemap() +
  geom_treemap_text(
    colour = "white",
    fontface = "italic",
    place = "centre",
    grow = TRUE
  )
Saudi Arabia South Africa
Italy Mexico
Turkey Argentina
Russia Australia South Korea Indonesia

United States
Brazil India Canada hdi
0.9
0.8
Germany France United Kingdom 0.7
0.6
European Union
China Japan
33
5.3 Treemap (second version)
# Treemap with countries grouped by region: each subgroup gets a border and
# a translucent region label drawn behind the country names.
ggplot(G20, aes(
  area = gdp_mil_usd, fill = hdi, label = country,
  subgroup = region
)) +
  geom_treemap() +
  geom_treemap_subgroup_border() +
  geom_treemap_subgroup_text(
    place = "centre", grow = TRUE, alpha = 0.5, colour = "black",
    fontface = "italic", min.size = 0
  ) +
  geom_treemap_text(colour = "white", place = "topleft", reflow = TRUE)
United States Mexico Russia Turkey South

Africa
Africa
Eurasia Saudi
Middle East
Arabia
Australia
North America Brazil Canada

South America Oceania
India South Indonesia

hdi
Korea 0.9
European Union Italy 0.8
Japan
United 0.7
Europe Asia Kingdom

France China
0.6
Germany
34
5.3 Treemap (third version)
# Treemap filled by region, with one panel per econ_classification value.
ggplot(G20, aes(area = gdp_mil_usd, fill = region, label = country)) +
  geom_treemap() +
  geom_treemap_text(grow = TRUE, reflow = TRUE, colour = "black") +
  facet_wrap(~econ_classification) +
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "bottom") +
  labs(
    title = "The G-20 major economies",
    caption = "The area of each country is proportional to its relative GDP
within the economic group (advanced or developing)",
    fill = "Region"
  )
The G−20 major economies

Advanced Developing
United
Canada Australia South
China Turkey Argentina South

Korea Africa
Indonesia Saudi
United
Italy Arabia
States Kingdom
Germany France
India Mexico
European
Brazil Russia
Union Japan
Africa Eurasia Middle East Oceania
Region
Asia Europe North America South America
The area of each country is proportional to its relative GDP

within the economic group (advanced or developing)
35
5.4 Bar Chart
# Frequency of each manufacturer as a data frame with columns Var1 and Freq.
freqtable <- table(mpg$manufacturer)
df <- as.data.frame.table(freqtable)

library(ggplot2)

# Plot
g <- ggplot(df, aes(Var1, Freq))
g + geom_bar(stat = "identity", width = 0.5, fill = "tomato2") +
  labs(
    title = "Bar Chart",
    subtitle = "Manufacturer of vehicles",
    caption = "Source: Frequency of Manufacturers from 'mpg' dataset"
  ) +
  # The chunk ended on a dangling `+`; the rotated tick labels seen in the
  # rendered figure are restored to complete the expression.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
Bar Chart
Manufacturer of vehicles
30
Freq
20
10
n
ver
age
let
ry
dai
tiac
aru
san
oln
e
ota
vro
rcu
i
d ro
ford
g
d
aud
jee
sw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
k
lan
vol
Var1
Source: Frequency of Manufacturers from 'mpg' dataset
36
5.4 Bar Chart (2nd version)
# Bar chart straight from the raw data: geom_bar() counts the rows per
# manufacturer and the bars are stacked by vehicle class.
g <- ggplot(mpg, aes(manufacturer))

g + geom_bar(aes(fill = class), width = 0.5) +
  # Rotated tick labels, as in the rendered figure and in the
  # categorical-histogram chunk, which draws the same chart.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(
    title = "Categorywise Bar Chart",
    subtitle = "Manufacturer of vehicles",
    caption = "Source: Manufacturers from 'mpg' dataset"
  )
Categorywise Bar Chart

Manufacturer of vehicles
30 class
2seater
compact
midsize
count
20
minivan
pickup
10 subcompact
suv
0
n
ver
age
let
ry
i
tiac
aru
nda
san
oln
ge
da
ota
vro
rcu
i
p
d ro
ford
aud
jee
ksw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
lan
vol
manufacturer
Source: Manufacturers from 'mpg' dataset
37
6. Change
Used to show how one or more quantities change over time. The overall shape of the trend usually
matters more than any individual value.
6.1 Time Series Plot (from a time series object)
# Time series plot drawn directly from a time series (ts) object.

## From Timeseries object (ts)
library(ggplot2)
library(ggfortify)

# Plot
# AirPassengers is passed to autoplot() as is; the title is then set and
# centred.
autoplot(AirPassengers) +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(title = "AirPassengers")
AirPassengers
600
400
200
1950 1955 1960
38
6.1 Time Series Plot (from a data frame)
library(ggplot2)
library(ggfortify)

# Allow Default X Axis Labels
# Line chart of psavert over date from the economics data frame; no custom
# breaks are supplied for the x axis.
ggplot(economics, aes(x = date)) +
  geom_line(aes(y = psavert)) +
  labs(
    title = "Time Series Chart",
    subtitle = "Personal savings rate from 'Economics' Dataset",
    caption = "Source: Economics",
    y = "Personal savings rate"
  )
Time Series Chart

Personal savings rate from 'Economics' Dataset
15
Personal savings rate
10
1970 1980 1990 2000 2010

date
Source: Economics
39
6.1 Time Series Plot (from long data format)
data(economics_long, package = "ggplot2")

library(ggplot2)
library(lubridate)

# Keep two series and the years 1967-1981.
df <- economics_long[economics_long$variable %in% c("psavert", "uempmed"), ]
df <- df[lubridate::year(df$date) %in% 1967:1981, ]

# labels and breaks for X axis text
# In long format every date appears once per variable, so taking every 12th
# row of df$date repeats year labels on the axis. Deduplicate and sort the
# dates first to get exactly one break per year.
all_dates <- sort(unique(df$date))
brks <- all_dates[seq(1, length(all_dates), 12)]
lbls <- lubridate::year(brks)

# plot
ggplot(df, aes(x = date)) +
  geom_line(aes(y = value, col = variable)) +
  labs(
    title = "Time Series of Returns Percentage",
    subtitle = "Drawn from Long Data format",
    caption = "Source: Economics",
    y = "Returns %",
    color = NULL
  ) + # title and caption
  scale_x_date(labels = lbls, breaks = brks) + # one tick and label per year
  scale_color_manual(
    labels = c("psavert", "uempmed"),
    values = c("psavert" = "#00ba38", "uempmed" = "#f8766d")
  ) + # line colour
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8), # rotate x axis text
    panel.grid.minor = element_blank() # turn off minor grid
  )
Time Series of Returns Percentage

Drawn from Long Data format
15
Returns %
12
psavert
uempmed
9
6
1967
1968
1968
1969
1969
1970
1970
1971
1971
1972
1972
1973
1973
1974
1974
1975
1975
1976
1976
1977
1977
1978
1978
1979
1979
1980
1980
1981
1981
date
Source: Economics
40
6.1 Time Series Plot (from wide data format)
library(ggplot2)
library(lubridate)
df <- economics[, c("date", "psavert", "uempmed")]
# Keep 1967-1981, the window shown on the rendered chart's x axis.
df <- df[lubridate::year(df$date) %in% 1967:1981, ]

# labels and breaks for X axis text: every 12th row, labelled with its year
brks <- df$date[seq(1, length(df$date), 12)]
lbls <- lubridate::year(brks)

# plot
# The ggplot() call, the x-axis scale and the colour values are restored;
# without them the chunk is an unterminated expression. One geom_line() is
# drawn per column, and the constant strings given to `col` become the
# legend keys matched by scale_color_manual().
ggplot(df, aes(x = date)) +
  geom_line(aes(y = psavert, col = "psavert")) +
  geom_line(aes(y = uempmed, col = "uempmed")) +
  labs(
    title = "Time Series of Returns Percentage",
    subtitle = "Drawn From Wide Data format",
    caption = "Source: Economics", y = "Returns %"
  ) + # title and caption
  scale_x_date(labels = lbls, breaks = brks) + # one tick and label per year
  scale_color_manual(
    name = "",
    values = c("psavert" = "#00ba38", "uempmed" = "#f8766d")
  ) + # line colour
  theme(panel.grid.minor = element_blank()) # turn off minor grid
Time Series of Returns Percentage

Drawn From Wide Data format
15
Returns %
12
psavert
uempmed
9
1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
date
Source: Economics
41
6.2 Stacked Area Chart
A stacked area chart is just like a line chart, except that the region below each line is filled with color. It is typically
used when:
• You want to describe how a quantity or volume (rather than something like price) changed over time.
• You have many data points. For very few data points, consider plotting a bar chart.
• You want to show the contribution from individual components.
Stacked Area Chart
library(ggplot2)
library(lubridate)
df <- economics[, c("date", "psavert", "uempmed")]
# Keep 1967-1981, the window shown on the rendered chart's x axis.
df <- df[lubridate::year(df$date) %in% 1967:1981, ]

# Breaks and labels for the x axis: every 12th row, labelled with its year.
brks <- df$date[seq(1, length(df$date), 12)]
lbls <- lubridate::year(brks)

# Plot
# The ggplot() call, the x-axis scale and the fill values are restored;
# without them the chunk is an unterminated expression. The first area is
# drawn at psavert + uempmed and the second (uempmed) on top of it, so the
# visible band for psavert sits above the uempmed area.
ggplot(df, aes(x = date)) +
  geom_area(aes(y = psavert + uempmed, fill = "psavert")) +
  geom_area(aes(y = uempmed, fill = "uempmed")) +
  labs(
    title = "Area Chart of Returns Percentage", subtitle = "From Wide Data format",
    caption = "Source: Economics", y = "Returns %"
  ) + # title and caption
  scale_x_date(labels = lbls, breaks = brks) + # one tick and label per year
  scale_fill_manual(
    name = "",
    values = c("psavert" = "#00ba38", "uempmed" = "#f8766d")
  ) + # fill colour
  theme(panel.grid.minor = element_blank()) # turn off minor grid
Area Chart of Returns Percentage

From Wide Data format
20
Returns %
psavert
uempmed
10
0
1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
date
Source: Economics
42
6.3 Calendar Heat Map
# http://margintale.blogspot.in/2012/04/ggplot2-time-series-heatmaps.html
library(ggplot2)
library(plyr)
library(scales)
library(zoo)

df <- read.csv(
  "https://raw.githubusercontent.com/selva86/datasets/master/yahoo.csv"
)
df$date <- as.Date(df$date) # format date
df <- df[df$year >= 2012, ] # filter reqd years

# Create Month Week
# Label each row with its year-month, then number the weeks inside each
# year-month starting from 1.
df$yearmonth <- as.yearmon(df$date)
df$yearmonthf <- factor(df$yearmonth)
df <- ddply(
  df, .(yearmonthf), transform,
  monthweek = 1 + week - min(week) # compute week number
)

# Keep only the columns listed here.
df <- df[, c(
  "year", "yearmonthf", "monthf", "week",
  "monthweek", "weekdayf", "VIX.Close"
)]

# Plot
# One tile per (week of month, weekday), one panel per year x month, with
# the tile fill taken from VIX.Close.
ggplot(df, aes(monthweek, weekdayf, fill = VIX.Close)) +
  geom_tile(colour = "white") +
  facet_grid(year ~ monthf) +
  labs(
    title = "Time-Series Calendar Heatmap",
    subtitle = "Yahoo Closing Price",
    x = "Week of Month",
    y = "",
    fill = "Close"
  ) +
  scale_fill_gradient(low = "red", high = "green")
Time−Series Calendar Heatmap

Yahoo Closing Price
Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
Wed
Tue
2012
Thu
Mon
Fri
Wed
Tue
2013
Thu Close
Mon
Fri 40
Wed
Tue
2014
30
Thu
Mon
Fri
20
Wed
Tue
2015
Thu
Mon
Fri
Wed
Tue
2016
Thu
Mon
Fri
12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345
Week of Month
43
6.4 Slope Chart
Slope charts are a great tool if you want to visualise change in value and ranking between categories. They are
more suitable than a time series plot when there are very few time points.
Slope Chart
library(slopegraph)
library(ggplot2)
data(states)

# 37 line/label colours: all black except the 7th, which is drawn in red.
cols <- rep("black", 37)
cols[7] <- "red"

ggslopegraph(states,
  offset.x = 0.06, yrev = TRUE,
  col.lines = cols, col.lab = cols,
  main = "Relative Rank of U.S. State Populations, 1790-1870"
) +
  theme_bw()
Relative Rank of U.S. State Populations, 1790−1870

Virginia 1 1 1 1 1 1 1 1 1 New York
Pennylvania 2 2 2 2 2 2 2 2 2 Pennylvania
North Carolina 3 3 3 3 3 3 3 3 3 Ohio
Massachusetts 4 4 4 4 4 4 4 4 4 Illinois
New York 5 5 5 5 5 5 5 5 5 Missouri
Maryland 6 6 6 6 6 6 6 6 6 Indiana
South Carolina 7 7 7 7 7 7 7 7 7 Massachusetts
Connecticut 8 8 8 8 8 8 8 8 8 Kentucky
New Jersey 9 9 9 9 9 9 9 9 9 Tennessee
New Hampshire 10 10 10 10 10 10 10 10 10 Virginia
Maine 11 11 11 11 11 11 11 11 11 Iowa
Vermont 12 12 12 12 12 12 12 12 12 Georgia
Georgia 13 13 13 13 13 13 13 13 13 Michigan
Kentucky 14 14 14 14 14 14 14 14 14 North Carolina
Rhode Island 15 15 15 15 15 15 15 15 15 Wisconsin
Delaware 16 16 16 16 16 16 16 16 Alabama
Tennessee 17 17 17 17 17 17 17 17 17 New Jersey
18 18 18 18 18 18 18 18 Mississippi
19 19 19 19 19 19 19 19 Texas
20 20 20 20 20 20 20 20 Maryland
21 21 21 21 21 21 21 Louisiana
22 22 22 22 22 22 22 South Carolina
23 23 23 23 23 23 23 Maine
24 24 24 24 24 24 24 California
25 25 25 25 25 25 Connecticut
26 26 26 26 26 Arkansas
27 27 27 27 27 27 West Virginia
28 28 28 28 Minnesota
29 29 29 29 Kansas
30 30 30 Vermont
31 31 31 New Hampshire
32 32 32 Rhode Island
33 33 33 Florida
34 34 Delaware
35 35 Nebraska
36 36 Oregon
37 Nevada
1790 1800 1810 1820 1830 1840 1850 1860 1870
44
6.5 Seasonal Plot
library(ggplot2)
library(forecast)
# Plot
# Seasonal plot of the AirPassengers series. The full series is plotted, so no
# subsetting is needed here (the nottem subset is built where it is used, in the
# next example).
ggseasonplot(AirPassengers) +
  labs(title = "Seasonal plot: International Airline Passengers")
Seasonal plot: International Airline Passengers
600
year
1949
1950
1951
1952
400 1953
1954
1955
1956
1957
1958
200
1959
1960
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
Month
45
6.5 Seasonal Plot
library(ggplot2)
library(forecast)
# Keep only the observations of `nottem` from period 1 of 1920 through
# period 12 of 1925.
nottem_small <- window(
  nottem,
  start = c(1920, 1),
  end = c(1925, 12)
)
# Seasonal plot of the reduced series, with a custom title.
ggseasonplot(nottem_small) +
  labs(title = "Seasonal plot: Air temperatures at Nottingham Castle")
Seasonal plot: Air temperatures at Nottingham Castle
60
year
1920
1921
1922
50
1923
1924
1925
40
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
Month
46
7. Groups
7.1 Hierarchical Dendrogram
library(ggplot2)
library(ggdendro)
# Hierarchical clustering of USArrests on its dist() matrix, using the
# "average" agglomeration method (spelled out rather than abbreviated).
hc <- hclust(d = dist(USArrests), method = "average")
# Draw the dendrogram, passing rotate = TRUE and size = 2.
ggdendrogram(hc, rotate = TRUE, size = 2)
New Hampshire
Iowa
Wisconsin
Minnesota
Vermont
North Dakota
South Dakota
Maine
West Virginia
Hawaii
Pennsylvania
Connecticut
Kansas
Indiana
Utah
Ohio
Montana
Kentucky
Nebraska
Idaho
Texas
Colorado
Georgia
Tennessee
Arkansas
Missouri
New Jersey
Massachusetts
Rhode Island
Virginia
Oklahoma
Wyoming
Oregon
Washington
South Carolina
Mississippi
Alaska
Nevada
Michigan
New York
Illinois
Louisiana
Alabama
Delaware
New Mexico
Arizona
Maryland
California
North Carolina
Florida
0 50 100 150
47
7.2 Clusters
library(ggplot2)
library(ggalt)
library(ggfortify)
# Principal components of the first four columns of iris
df <- iris[, 1:4]
pca_mod <- prcomp(df)

# Component scores together with the species label of each row
df_pc <- data.frame(pca_mod$x, Species = iris$Species)

# One data frame per species; each feeds its own encircling layer below
df_pc_vir <- subset(df_pc, Species == "virginica")
df_pc_set <- subset(df_pc, Species == "setosa")
df_pc_ver <- subset(df_pc, Species == "versicolor")

# Plot: points coloured and shaped by species on PC1 vs PC2
ggplot(df_pc, aes(PC1, PC2, col = Species)) +
  geom_point(aes(shape = Species), size = 2) +
  labs(
    title = "Iris Clustering",
    subtitle = "With principal components PC1 and PC2 as X and Y axis",
    caption = "Source: Iris"
  ) +
  # Axis limits are the observed PC1/PC2 ranges scaled by 1.2
  coord_cartesian(
    xlim = 1.2 * range(df_pc$PC1),
    ylim = 1.2 * range(df_pc$PC2)
  ) +
  # One encircling layer per species subset
  geom_encircle(data = df_pc_vir, aes(x = PC1, y = PC2)) +
  geom_encircle(data = df_pc_set, aes(x = PC1, y = PC2)) +
  geom_encircle(data = df_pc_ver, aes(x = PC1, y = PC2))
Iris Clustering
With principal components PC1 and PC2 as X and Y axis
Species
setosa
PC2
0
versicolor
virginica
−1
−2.5 0.0 2.5

PC1
Source: Iris
48

Ggplot 2

Uploaded by

Document Informationclick to expand document information

Copyright:

Available Formats

Ggplot 2

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Ggplot 2

Uploaded by

Copyright:

Available Formats

Data Visualisation in R (link for html version)

options(scipen=999) # turn-off scientific notation like 1e+48

0.000 0.025 0.050 0.075 0.100

# install ’ggalt’ pkg

0.000 0.025 0.050 0.075 0.100

g <- ggplot(mpg, aes(cty, hwy))

Scatterplot with overlapping points

# load package and data

# load package and data

• A Categorical variable (by changing the color) and

1.5 Bubble plot

1.6 Marginal Histogram

# load package and data

#ggMarginal(g, type = "density", fill="transparent")

# load package and data

#ggMarginal(g, type = "density", fill="transparent")

gear 0.5 0.7

am 0.8 0.6 0.7

qsec 0.7 −0.2 −0.2 0.4 0.1 0.5

hp 0.8 0.8 −0.7 −0.7 −0.2 −0.1 −0.8 −0.4

wt 0.7 0.8 0.9 −0.2 −0.6 −0.7 −0.6 −0.9 −0.7

2.1 Diverging bars

ggplot(mtcars, aes(x=‘car name‘, y=mpg_z, label=mpg_z)) +

Diverging Lollipop Chart

Mazda RX4 0.15

Diverging Dot Plot

Mazda RX4 0.15

3.1 Ordered Bar Chart

# Prepare data: group mean city mileage by manufacturer.

Ordered Bar Chart

10000 Oceania, 10298

Africa, 1253 Africa, 1385

health <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/health.csv")

4.1 Histogram (with automatic binning)

# Histogram on a Continuous (Numeric) Variable

Histogram with Auto Binning

Histogram with Fixed Bins

# Histogram on a Categorical variable

Histogram on Categorical Variable

2seater compact midsize minivan pickup subcompact suv

Box plot + Dot plot

Tufte Styled Boxplot

2seater compact midsize minivan pickup subcompact suv

# X Axis Breaks and Labels

Email Campaign Funnel

Stage 10: Address Verification Page Female

5.1 Waffle Chart

var <- mpg$class # categorical data

# Prep data (nothing to change here)

# Source: Frequency table

pie + coord_polar(theta = "y", start=0)

Pie Chart of class

Saudi Arabia South Africa

Russia Australia South Korea Indonesia

Germany France United Kingdom 0.7

ggplot(G20, aes(area = gdp_mil_usd, fill = hdi, label = country,

United States Mexico Russia Turkey South

North America Brazil Canada