Ggplot 2
Ggplot 2
Ggplot 2
1. Correlation
1.1 Scatterplot
# Scatterplot of county area vs. total population from the midwest data,
# coloured by state and sized by population density, with a loess trend line.
library(ggplot2) # provides ggplot() and the midwest dataset used below
gg <- ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +
  geom_smooth(method = "loess", se = FALSE) + # FALSE rather than F, which can be reassigned
  xlim(c(0, 0.1)) + # points outside the axis limits are removed from the plot
  ylim(c(0, 500000)) +
  labs(subtitle = "Area Vs Population", y = "Population", x = "Area",
       title = "Scatterplot", caption = "Source: midwest")
plot(gg)
Scatterplot
Area Vs Population
500000
state
IL
400000 IN
MI
OH
Population
300000
WI
200000 popdensity
20000
40000
100000
60000
80000
0
1
1.2 Scatterplot with Encircling
Scatterplot + Encircle
Area Vs Population
500000
state
IL
400000 IN
MI
OH
Population
300000
WI
200000 popdensity
20000
40000
100000
60000
80000
0
2
1.3 Jitterplot
library(ggplot2)
data(mpg, package = "ggplot2") # alternate source: "http://goo.gl/uEeRGu")
theme_set(theme_bw()) # pre-set the bw theme.
# Base plot object: city mileage on x, highway mileage on y.
# It has to exist before layers are added to it below.
g <- ggplot(mpg, aes(cty, hwy))
# Scatterplot with a linear fit and no confidence band
# NOTE(review): the caption names "midwest" although the data plotted is mpg; it is kept
# because the rendered figure carries the same caption — confirm the intended wording.
g + geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(subtitle = "mpg: city vs highway mileage",
       y = "hwy",
       x = "cty",
       title = "Scatterplot with overlapping points",
       caption = "Source: midwest")
40
hwy
30
20
10 15 20 25 30 35
cty
Source: midwest
3
1.3 Jitterplot (2nd version)
# Jittered scatterplot of city vs. highway mileage
theme_set(theme_bw()) # switch the default theme to black-and-white
g <- ggplot(data = mpg, mapping = aes(x = cty, y = hwy))
g +
  geom_jitter(size = 1, width = 0.5) +
  labs(
    title = "Jittered Points",
    subtitle = "mpg: city vs highway mileage",
    x = "cty",
    y = "hwy"
  )
Jittered Points
mpg: city vs highway mileage
40
30
hwy
20
10 15 20 25 30 35
cty
4
1.4 Counts Chart
# Counts chart: geom_count() draws one point per (cty, hwy) position,
# so overlapping observations are shown by the size of the point.
theme_set(theme_bw()) # pre-set the bw theme.
g <- ggplot(mpg, aes(cty, hwy))
g + geom_count(col = "tomato3", show.legend = FALSE) + # FALSE rather than the reassignable F
  labs(subtitle = "mpg: city vs highway mileage",
       y = "hwy",
       x = "cty",
       title = "Counts Plot")
Counts Plot
mpg: city vs highway mileage
40
30
hwy
20
10 15 20 25 30 35
cty
5
1.5 Bubble plot
While a scatterplot lets you compare the relationship between 2 continuous variables, a bubble chart serves
well if you want to understand the relationship within the underlying groups based on a categorical variable
(shown as color) and another continuous variable (shown as point size).
Bubble charts are more suitable if you have 4-dimensional data where two of the variables are numeric (X and Y),
one is categorical (color) and another is numeric (size).
library(ggplot2)
data(mpg, package = "ggplot2")
# mpg <- read.csv("http://goo.gl/uEeRGu")
# Keep only the four manufacturers compared in the chart
mpg_select <- mpg[mpg$manufacturer %in% c("audi", "ford", "honda", "hyundai"), ]
# Bubble chart: displacement vs. city mileage, colour = manufacturer, size = hwy
theme_set(theme_bw()) # pre-set the bw theme
g <- ggplot(mpg_select, aes(displ, cty)) +
  labs(subtitle = "mpg: Displacement vs City Mileage", title = "Bubble chart")
g + geom_jitter(aes(col = manufacturer, size = hwy)) +
  # one linear fit per manufacturer; FALSE rather than the reassignable F
  geom_smooth(aes(col = manufacturer), method = "lm", se = FALSE)
Bubble chart
mpg: Displacement vs City Mileage
hwy
15
25
20
25
30
20 35
cty
manufacturer
audi
15 ford
honda
hyundai
10
2 3 4 5
displ
6
1.6 Marginal Histogram/Boxplot
If you want to show the relationship as well as the distribution in the same chart, use the marginal histogram.
It has a histogram of the X and Y variables at the margins of the scatterplot.
40
n
hwy
30 5
10
20
10 15 20 25 30 35
cty
7
1.6 Marginal Boxplot
40
n
hwy
30 5
10
20
10 15 20 25 30 35
cty
8
1.7 Correlogram
A correlogram lets you examine the correlation of multiple continuous variables present in the same dataframe.
1.7 Correlogram
library(ggplot2)
library(ggcorrplot)
# Pairwise correlations of all mtcars columns, rounded to one decimal place
data(mtcars)
corr <- round(cor(mtcars), digits = 1)
# Draw the lower triangle as circles with the coefficient printed on each cell
ggcorrplot(
  corr,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of mtcars",
  ggtheme = theme_bw
)
Correlogram of mtcars
mpg 0.7
Corr
vs 0.2 0.2 0.7 0.4 1.0
0.0
disp −0.4 −0.7 −0.6 −0.6 −0.8 −0.7
−0.5
cyl 0.9 −0.6 −0.8 −0.5 −0.5 −0.9 −0.7
−1.0
carb 0.4 0.7 0.5 0.4 −0.7 −0.6 0.1 0.3 −0.6 −0.1
t
hp
l
p
ec
vs
ar
pg
at
am
cy
w
dr
ge
di
qs
9
2. Deviation
Compare variation in values between a small number of items (or categories) with respect to a fixed reference.
library(ggplot2)
theme_set(theme_bw())
# Data Prep: add a car-name column, a z-scored mpg and an above/below flag to mtcars,
# then sort by the z-score so the charts are drawn in that order.
data("mtcars") # load data
# `car name` contains a space, so it must be quoted with backticks
mtcars$`car name` <- rownames(mtcars) # create new column for car names
# Compute normalized mpg: (value - mean) / sd, rounded to 2 decimals
mtcars$mpg_z <- round((mtcars$mpg - mean(mtcars$mpg)) / sd(mtcars$mpg), 2)
mtcars$mpg_type <- ifelse(mtcars$mpg_z < 0, "below", "above") # above/below avg flag
mtcars <- mtcars[order(mtcars$mpg_z), ] # sort ascending by z-score
# Convert to factor to retain sorted order in plot
mtcars$`car name` <- factor(mtcars$`car name`, levels = mtcars$`car name`)
# Diverging Barcharts: one horizontal bar per car, length = normalised mpg,
# fill colour = above/below average flag.
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
  geom_bar(stat = "identity", aes(fill = mpg_type), width = .5) +
  scale_fill_manual(name = "Mileage",
                    labels = c("Above Average", "Below Average"),
                    values = c("above" = "#00ba38", "below" = "#f8766d")) +
  labs(subtitle = "Normalised mileage from 'mtcars'", title = "Diverging Bars") +
  coord_flip() # put the car names on the vertical axis
Diverging Bars
Normalised mileage from 'mtcars'
Toyota Corolla
Fiat 128
Lotus Europa
Honda Civic
Fiat X1−9
Porsche 914−2
Merc 240D
Merc 230
Datsun 710
Toyota Corona
Volvo 142E
Hornet 4 Drive
Mazda RX4 Wag
Mileage
car name
Mazda RX4
Ferrari Dino
Pontiac Firebird Above Average
Merc 280
Hornet Sportabout
Valiant Below Average
Merc 280C
Merc 450SL
Merc 450SE
Ford Pantera L
Dodge Challenger
AMC Javelin
Merc 450SLC
Maserati Bora
Chrysler Imperial
Duster 360
Camaro Z28
Lincoln Continental
Cadillac Fleetwood
−1 0 1 2
mpg_z
10
2.2 Diverging Lollipop chart
library(ggplot2)
theme_set(theme_bw())
−2 −1 0 1 2
mpg_z
11
2.3 Diverging Dot Plot
library(ggplot2)
theme_set(theme_bw())
# Plot: one large dot per car at its normalised mpg, with the value printed inside the dot
ggplot(mtcars, aes(x = `car name`, y = mpg_z, label = mpg_z)) +
  geom_point(stat = "identity", aes(col = mpg_type), size = 6) +
  scale_color_manual(name = "Mileage",
                     labels = c("Above Average", "Below Average"),
                     values = c("above" = "#00ba38", "below" = "#f8766d")) +
  geom_text(color = "white", size = 2) + # uses the label aesthetic (mpg_z)
  labs(title = "Diverging Dot Plot",
       subtitle = "Normalized mileage from 'mtcars': Dotplot") +
  ylim(-2.5, 2.5) +
  coord_flip()
−2 −1 0 1 2
mpg_z
12
2.4 Area Chart
Area charts are typically used to visualize how a particular metric (such as % returns from a stock) performed
compared to a certain baseline.
Area Chart
library(ggplot2)
library(quantmod)
data("economics", package = "ggplot2")
# Relative change of psavert versus the previous row; the first row has no
# predecessor, so its change is set to 0.
economics$returns_perc <- c(
  0,
  diff(economics$psavert) / head(economics$psavert, -1)
)
# Axis ticks: the date of every 12th row, labelled with its year
brks <- economics$date[seq(1, nrow(economics), by = 12)]
lbls <- lubridate::year(brks)
# Area chart of the first 100 rows
ggplot(head(economics, 100), aes(date, returns_perc)) +
  geom_area() +
  scale_x_date(breaks = brks, labels = lbls) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Area Chart",
    subtitle = "Perc Returns for Personal Savings",
    y = "% Returns for Personal savings",
    caption = "Source: economics"
  )
Area Chart
Perc Returns for Personal Savings
0.2
% Returns for Personal savings
0.1
0.0
−0.1
−0.2
1967
1968
1969
1970
1971
1972
1973
1974
1975
date
Source: economics
13
3. Ranking
Used to compare the position or performance of multiple items with respect to each other. Actual values
matter somewhat less than the ranking.
# Draw plot: one bar per make, bar height = mileage.
# NOTE(review): cty_mpg is not created in this snippet; it is presumably a data frame with
# columns `make` and `mileage` that was prepared (and ordered) beforehand — confirm before running.
ggplot(cty_mpg, aes(x=make, y=mileage)) +
geom_bar(stat="identity", width=.5, fill="tomato3") +
labs(title="Ordered Bar Chart",
subtitle="Make Vs Avg. Mileage",
caption="source: mpg") +
# rotate the x-axis labels
theme(axis.text.x = element_text(angle=65, vjust=0.6))
20
15
mileage
10
0
n
ver
age
t
ry
le
dai
tiac
aru
san
oln
a
ota
rcu
vro
p
i
d ro
ford
g
d
aud
n
jee
sw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
k
lan
vol
make
source: mpg
14
3.2 Lollipop Chart
library(ggplot2)
theme_set(theme_bw())
# Plot: lollipop chart — a point at each make's mileage plus a stem down to zero.
# NOTE(review): cty_mpg is not created in this snippet; presumably it is the same
# make/mileage data frame used by the ordered bar chart — confirm.
ggplot(cty_mpg, aes(x=make, y=mileage)) +
geom_point(size=3) +
# stem: vertical segment from y = 0 up to the point
geom_segment(aes(x=make,
xend=make,
y=0,
yend=mileage)) +
labs(title="Lollipop Chart",
subtitle="Make Vs Avg. Mileage",
caption="source: mpg") +
# rotate the x-axis labels
theme(axis.text.x = element_text(angle=65, vjust=0.6))
Lollipop Chart
Make Vs Avg. Mileage
25
20
15
mileage
10
0
n
ver
age
let
ry
dai
tiac
aru
san
oln
a
ota
rcu
vro
p
i
d ro
ford
g
d
aud
n
jee
sw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
k
lan
vol
make
source: mpg
15
3.3 Dot plot
library(ggplot2)
library(scales)
theme_set(theme_classic())
# Plot: dot plot — one point per make, with a dashed guide line behind every point.
# NOTE(review): cty_mpg is not created in this snippet; presumably a data frame with
# columns `make` and `mileage` prepared beforehand — confirm.
ggplot(cty_mpg, aes(x=make, y=mileage)) +
geom_point(col="tomato2", size=3) + # Draw points
# guide line for each make, spanning the smallest to the largest mileage
geom_segment(aes(x=make,
xend=make,
y=min(mileage),
yend=max(mileage)),
linetype="dashed",
size=0.1) + # Draw dashed lines
labs(title="Dot Plot",
subtitle="Make Vs Avg. Mileage",
caption="source: mpg") +
coord_flip() # makes on the vertical axis, mileage on the horizontal axis
Dot Plot
Make Vs Avg. Mileage
honda
volkswagen
subaru
hyundai
toyota
nissan
audi
make
pontiac
chevrolet
ford
jeep
mercury
dodge
land rover
lincoln
15 20 25
mileage
source: mpg
16
3.4 Slope Chart
library(ggplot2)
library(scales)
theme_set(theme_classic())
# Prep data: one row per continent with the value at two points in time
df <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/gdppercap.csv")
colnames(df) <- c("continent", "1952", "1957")
# The year columns are not syntactic names, so they are quoted with backticks
left_label <- paste(df$continent, round(df$`1952`), sep = ", ")
right_label <- paste(df$continent, round(df$`1957`), sep = ", ")
# Colour key: "red" when the value fell between the two years, "green" otherwise
df$class <- ifelse((df$`1957` - df$`1952`) < 0, "red", "green")
# Upper y limit, also used as the height of the "Time 1"/"Time 2" headers
y_top <- 1.1 * max(df$`1952`, df$`1957`)
# Plot: one segment per continent from x = 1 (1952) to x = 2 (1957)
p <- ggplot(df) +
  geom_segment(aes(x = 1, xend = 2, y = `1952`, yend = `1957`, col = class),
               size = .75, show.legend = FALSE) +
  geom_vline(xintercept = 1, linetype = "dashed", size = .1) +
  geom_vline(xintercept = 2, linetype = "dashed", size = .1) +
  scale_color_manual(labels = c("Up", "Down"),
                     values = c("green" = "#00ba38", "red" = "#f8766d")) +
  labs(x = "", y = "Mean GdpPerCap") + # color of lines and axes labels
  xlim(.5, 2.5) + ylim(0, y_top) # axes limits
# Add texts: continent/value labels beside each end, plus the two column headers
p <- p + geom_text(label = left_label, y = df$`1952`, x = rep(1, NROW(df)), hjust = 1.1, size = 3.5)
p <- p + geom_text(label = right_label, y = df$`1957`, x = rep(2, NROW(df)), hjust = -0.1, size = 3.5)
p <- p + geom_text(label = "Time 1", x = 1, y = y_top, hjust = 1.2, size = 5)
p <- p + geom_text(label = "Time 2", x = 2, y = y_top, hjust = -0.1, size = 5)
# Minify theme: strip background, grid, ticks and x-axis text
p + theme(panel.background = element_blank(), panel.grid = element_blank(),
          axis.ticks = element_blank(), axis.text.x = element_blank(),
          panel.border = element_blank(), plot.margin = unit(c(1, 2, 1, 2), "cm"))
Time 1 Time 2
Oceania, 11599
Europe, 6963
Europe, 5661
5000 Asia, 5195
Americas, 4616
Americas, 4079 Asia, 4003
17
3.5 Dumbbell Plot
Dumbbell charts are a great tool if you wish to:
• Visualise relative positions (like growth and decline) between two points in time.
• Compare distance between two categories.
18
Dumbbell Plot
library(ggplot2)
library(ggalt)
theme_set(theme_classic())
Dumbbell Chart
Pct Change: 2013 vs 2014
Boston
Minneapolis
Pittsburgh
Baltimore
San Francisco
Seattle
Philadelphia
Detroit
St. Louis
Portland
Washington, D.C.
Denver
New York
Chicago
All Metro Areas
San Diego
Charlotte
Phoenix
Riverside, Calif.
Tampa
Los Angeles
Atlanta
San Antonio
Dallas
Miami
Houston
5% 10% 15% 20% 25%
Source: https://github.com/hrbrmstr/ggalt
19
4. Distribution
• When you have a lot of data points and want to study where and how the data points are distributed.
library(ggplot2)
theme_set(theme_classic())
# Histogram with bars filled by vehicle class and a thin black outline.
# NOTE(review): g is not created in this snippet; the figure's x axis suggests it maps
# displ from mpg — confirm. The title says "Auto Binning" although binwidth is set explicitly.
g + geom_histogram(aes(fill=class),
binwidth = .1,
col="black",
size=.1) + # change binwidth
labs(title="Histogram with Auto Binning",
subtitle="Engine Displacement across Vehicle Classes")
20
class
15 2seater
compact
midsize
count
10 minivan
pickup
subcompact
5 suv
2 3 4 5 6 7
displ
20
4.1 Histogram (with fixed binning)
library(ggplot2)
# Histogram with a fixed number of bins (5), bars filled by vehicle class.
# NOTE(review): g is not created in this snippet; presumably the same base plot as the
# binwidth version of this histogram — confirm.
g + geom_histogram(aes(fill=class),
bins=5,
col="black",
size=.1) + # change number of bins
labs(title="Histogram with Fixed Bins",
subtitle="Engine Displacement across Vehicle Classes")
80
60 class
2seater
compact
midsize
count
40
minivan
pickup
subcompact
20 suv
2 4 6
displ
21
4.1 Histogram (on a categorical variable)
library(ggplot2)
theme_set(theme_classic())
30 class
2seater
compact
midsize
count
20
minivan
pickup
subcompact
10
suv
0
n
ver
age
t
ry
le
dai
tiac
aru
san
oln
e
ota
vro
rcu
i
p
d ro
ford
g
d
aud
jee
ksw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
lan
vol
manufacturer
22
4.2 Density plot
library(ggplot2)
theme_set(theme_classic())
# Density of city mileage, one semi-transparent curve per cylinder count
g <- ggplot(data = mpg, mapping = aes(x = cty))
g +
  geom_density(mapping = aes(fill = factor(cyl)), alpha = 0.8) +
  labs(
    title = "Density plot",
    subtitle = "City Mileage Grouped by Number of cylinders",
    x = "City Mileage",
    fill = "# Cylinders",
    caption = "Source: mpg"
  )
Density plot
City Mileage Grouped by Number of cylinders
0.4
# Cylinders
4
density
5
6
0.2
8
0.0
10 15 20 25 30 35
City Mileage
Source: mpg
23
4.3 Boxplot
library(ggplot2)
theme_set(theme_classic())
# Plot: box plot of city mileage for each vehicle class
g <- ggplot(mpg, aes(class, cty))
# varwidth = TRUE lets the box width reflect the number of observations in each class;
# TRUE is spelled out because T can be reassigned.
g + geom_boxplot(varwidth = TRUE, fill = "plum") +
  labs(title = "Box plot",
       subtitle = "City Mileage grouped by Class of vehicle",
       caption = "Source: mpg",
       x = "Class of Vehicle",
       y = "City Mileage")
Box plot
City Mileage grouped by Class of vehicle
35
30
City Mileage
25
20
15
10
24
4.3 Boxplot (2nd version)
library(ggthemes)
# Box plot of city mileage per vehicle class, split and filled by cylinder count
g <- ggplot(data = mpg, mapping = aes(x = class, y = cty))
g +
  geom_boxplot(mapping = aes(fill = factor(cyl))) +
  theme(axis.text.x = element_text(vjust = 0.6, angle = 65)) +
  labs(
    title = "Box plot",
    subtitle = "City Mileage grouped by Class of vehicle",
    x = "Class of Vehicle",
    y = "City Mileage",
    caption = "Source: mpg"
  )
Box plot
City Mileage grouped by Class of vehicle
35
30
factor(cyl)
City Mileage
25 4
5
20 6
8
15
10
t
pac
t
n
ize
r
pac
kup
ate
iva
com
suv
s
com
2se
mid
min
pic
sub
Class of Vehicle
Source: mpg
25
4.4 Dot + Boxplot
library(ggplot2)
theme_set(theme_bw())
# plot: box plot of city mileage per manufacturer with the observations overlaid as dots
# NOTE(review): x is mapped to manufacturer, yet the subtitle and x label say "Class";
# the strings are kept as they appear in the rendered figure — confirm the intended wording.
g <- ggplot(mpg, aes(manufacturer, cty))
g + geom_boxplot() +
  geom_dotplot(binaxis = "y", # bin the dots along the y axis (cty)
               stackdir = "center", # stack dots outwards from the centre of each group
               dotsize = .5,
               fill = "red") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(title = "Box plot + Dot plot",
       subtitle = "City Mileage vs Class: Each dot represents 1 row in source data",
       caption = "Source: mpg",
       x = "Class of Vehicle",
       y = "City Mileage")
30
City Mileage
20
10
n
ver
age
let
ry
ai
tiac
aru
san
oln
ge
da
ota
vro
nd
rcu
i
d ro
ford
aud
jee
ksw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
lan
vol
Class of Vehicle
Source: mpg
26
4.5 Tufte’s Boxplot
library(ggthemes)
library(ggplot2)
theme_set(theme_tufte()) # from ggthemes
# plot: Tufte-style box plot of city mileage per manufacturer
# NOTE(review): x is mapped to manufacturer, yet the subtitle and x label say "Class of
# vehicle" — confirm which wording is intended.
g <- ggplot(mpg, aes(manufacturer, cty))
g + geom_tufteboxplot() +
# rotate the x-axis labels
theme(axis.text.x = element_text(angle=65, vjust=0.6)) +
labs(title="Tufte Styled Boxplot",
subtitle="City Mileage grouped by Class of vehicle",
caption="Source: mpg",
x="Class of Vehicle",
y="City Mileage")
30
City Mileage
25
20
15
10
gen
ver
t
ry
i
le
tiac
oln
aru
nda
ota
ge
da
a
i
vro
ford
jeep
rcu
d ro
a
aud
ksw
niss
dod
hon
linc
sub
pon
toy
hyu
me
che
lan
vol
Class of Vehicle
Source: mpg
27
4.6 Violin Plot
library(ggplot2)
theme_set(theme_bw())
# Violin plot of city mileage for each vehicle class
g <- ggplot(data = mpg, mapping = aes(x = class, y = cty))
g +
  geom_violin() +
  labs(
    title = "Violin plot",
    subtitle = "City Mileage vs Class of vehicle",
    x = "Class of Vehicle",
    y = "City Mileage",
    caption = "Source: mpg"
  )
Violin plot
City Mileage vs Class of vehicle
35
30
City Mileage
25
20
15
10
28
4.7 Bean Plot
library(beanplot)
# Compare a box plot and a bean plot on three simulated samples
# (bimodal, uniform, normal) drawn side by side.
set.seed(1) # fixed seed so the simulated samples are reproducible
par(mfrow = c(1, 2), mai = c(0.5, 0.5, 0.5, 0.1)) # two panels in one row
mu <- 2
si <- 0.6
n <- 500 # sample size; named n so that base::c() is not masked by a variable called c
# Bimodal sample: half the points around -mu, half around +mu
bimodal <- c(rnorm(n / 2, -mu, si), rnorm(n / 2, mu, si))
uniform <- runif(n, -4, 4)
normal <- rnorm(n, 0, 1.5)
ylim <- c(-7, 7) # shared y range for both panels
boxplot(bimodal, uniform, normal, ylim = ylim, main = "boxplot", names = 1:3)
beanplot(bimodal, uniform, normal, ylim = ylim, main = "beanplot",
         col = c("#CAB2D6", "#33A02C",
                 "#B2DF8A"), border = "#CAB2D6")
boxplot beanplot
6
6
4
4
2
2
0
0
−2
−2
−4
−4
−6
−6
1 2 3 1 2 3
29
4.8 Population Pyramid
library(ggplot2)
library(ggthemes)
options(scipen = 999) # turns off scientific notation like 1e+40
# Read data
email_campaign_funnel <-
read.csv("https://raw.githubusercontent.com/selva86/datasets/master/email_campaign_funnel.csv")
# Plot: horizontal bars of Users per Stage, filled by Gender
# NOTE(review): brks and lbls are not defined in this snippet. Other snippets in this
# document assign them as date breaks and year labels, which presumably do not fit this
# numeric Users axis — define them for this plot before running.
ggplot(email_campaign_funnel, aes(x = Stage, y = Users, fill = Gender)) + # fill column
geom_bar(stat = "identity", width = .6) + # draw the bars
scale_y_continuous(breaks = brks, # breaks
labels = lbls) + # labels
coord_flip() + # flip axes
labs(title="Email Campaign Funnel") +
theme_tufte() + # tufte theme from ggthemes
theme(plot.title = element_text(hjust = .5), # centre plot title
axis.ticks = element_blank()) + # hide the axis ticks
scale_fill_brewer(palette = "Dark2") # colour palette
30
5. Composition
Waffle Chart
'Class' of vehicles
2seater
compact
midsize
minivan
pickup
subcompact
suv
Source: mpg
31
5.2 Pie Chart
library(ggplot2)
theme_set(theme_classic())
200
class
2seater
50 compact
midsize
minivan
pickup
subcompact
suv
150
100
Source: mpg
32
5.3 Treemap
library(ggplot2)
library(treemapify)
# Treemap of G20: tile area = GDP, tile colour = HDI, tile label = country
ggplot(data = G20, mapping = aes(area = gdp_mil_usd, fill = hdi, label = country)) +
  geom_treemap() +
  geom_treemap_text(
    colour = "white",
    fontface = "italic",
    place = "centre",
    grow = TRUE
  )
Italy Mexico
Turkey Argentina
0.8
0.6
European Union
China Japan
33
5.3 Treemap (second version)
Eurasia Saudi
Middle East
Arabia
Australia
Germany
34
5.3 Treemap (third version)
United
Canada Australia South
Indonesia Saudi
United
Italy Arabia
States Kingdom
Germany France
India Mexico
European
Brazil Russia
Union Japan
Africa Eurasia Middle East Oceania
Region
Asia Europe North America South America
35
5.4 Bar Chart
# Plot: bar chart with one bar per Var1 level, bar height = Freq
# NOTE(review): df is not built in this snippet; the Var1/Freq columns suggest it comes from
# as.data.frame(table(...)) on the mpg manufacturers — confirm.
g <- ggplot(df, aes(Var1, Freq))
g + geom_bar(stat = "identity", width = 0.5, fill = "tomato2") +
  labs(title = "Bar Chart",
       subtitle = "Manufacturer of vehicles",
       # straight quotes, matching the caption in the rendered figure
       caption = "Source: Frequency of Manufacturers from 'mpg' dataset") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
Bar Chart
Manufacturer of vehicles
30
Freq
20
10
n
ver
age
let
ry
dai
tiac
aru
san
oln
e
ota
vro
rcu
i
d ro
ford
g
d
aud
jee
sw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
k
lan
vol
Var1
Source: Frequency of Manufacturers from 'mpg' dataset
36
5.4 Bar Chart (2nd version)
30 class
2seater
compact
midsize
count
20
minivan
pickup
10 subcompact
suv
0
n
ver
age
let
ry
i
tiac
aru
nda
san
oln
ge
da
ota
vro
rcu
i
p
d ro
ford
aud
jee
ksw
dod
hon
linc
sub
pon
toy
hyu
nis
che
me
lan
vol
manufacturer
Source: Manufacturers from 'mpg' dataset
37
6. Change
Used to compare the position or performance of multiple items with respect to each other. Actual values
matter somewhat less than the ranking.
# Plot the AirPassengers series with a centred title.
# NOTE(review): no library() call is visible in this snippet; autoplot() on this object
# presumably relies on ggfortify (loaded in the next section) being attached — confirm.
autoplot(AirPassengers) +
labs(title="AirPassengers") +
theme(plot.title = element_text(hjust=0.5))
AirPassengers
600
400
200
38
6.1 Time Series Plot (from a data frame)
library(ggplot2)
library(ggfortify)
theme_set(theme_classic())
15
Personal savings rate
10
39
6.1 Time Series Plot (from long data format)
# plot: one line per level of `variable`, drawn from long-format data
# NOTE(review): df, lbls and brks are not defined in this snippet. df is presumably a
# long-format frame with columns date, variable and value — confirm before running.
ggplot(df, aes(x=date)) +
geom_line(aes(y=value, col=variable)) +
labs(title="Time Series of Returns Percentage",
subtitle="Drawn from Long Data format",
caption="Source: Economics",
y="Returns %",
color=NULL) + # title and caption
scale_x_date(labels = lbls, breaks = brks) + # change to monthly ticks and labels
scale_color_manual(labels = c("psavert", "uempmed"),
values = c("psavert"="#00ba38", "uempmed"="#f8766d")) + # line colour
theme(axis.text.x = element_text(angle = 90, vjust=0.5, size = 8), # rotate x axis text
panel.grid.minor = element_blank()) # turn off minor grid
15
Returns %
12
psavert
uempmed
9
6
1967
1968
1968
1969
1969
1970
1970
1971
1971
1972
1972
1973
1973
1974
1974
1975
1975
1976
1976
1977
1977
1978
1978
1979
1979
1980
1980
1981
1981
date
Source: Economics
40
6.1 Time Series Plot (from wide data format)
library(ggplot2)
library(lubridate)
theme_set(theme_bw())
# plot: one geom_line() per column of the wide-format data; the constant strings given
# to `col` become the legend keys that scale_color_manual() maps to colours.
# NOTE(review): df, lbls and brks are not defined in this snippet — confirm they are
# prepared before this runs.
ggplot(df, aes(x=date)) +
geom_line(aes(y=psavert, col="psavert")) +
geom_line(aes(y=uempmed, col="uempmed")) +
labs(title="Time Series of Returns Percentage",
subtitle="Drawn From Wide Data format",
caption="Source: Economics", y="Returns %") + # title and caption
scale_x_date(labels = lbls, breaks = brks) + # change to monthly ticks and labels
scale_color_manual(name="",
values = c("psavert"="#00ba38", "uempmed"="#f8766d")) + # line colour
theme(panel.grid.minor = element_blank()) # turn off minor grid
15
Returns %
12
psavert
uempmed
9
1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
date
Source: Economics
41
6.2 Stacked Area Chart
A stacked area chart is just like a line chart, except that the region below the line is filled with color.
It is typically used when:
• You want to describe how a quantity or volume (rather than something like price) changed over time.
• You have many data points. For very few data points, consider plotting a bar chart.
• You want to show the contribution from individual components.
library(ggplot2)
library(lubridate)
theme_set(theme_bw())
# Keep the date and the two series, restricted to the years 1967 through 1981
df <- economics[, c("date", "psavert", "uempmed")]
in_range <- lubridate::year(df$date) %in% 1967:1981
df <- df[in_range, ]
# X-axis ticks: the date of every 12th row, labelled with its year
brks <- df$date[seq(1, nrow(df), by = 12)]
lbls <- lubridate::year(brks)
# Two overlaid areas: the sum of both series behind, uempmed alone in front
ggplot(df, aes(x = date)) +
  geom_area(aes(y = psavert + uempmed, fill = "psavert")) +
  geom_area(aes(y = uempmed, fill = "uempmed")) +
  labs(
    title = "Area Chart of Returns Percentage",
    subtitle = "From Wide Data format",
    caption = "Source: Economics",
    y = "Returns %"
  ) +
  scale_x_date(breaks = brks, labels = lbls) +
  scale_fill_manual(
    name = "",
    values = c("psavert" = "#00ba38", "uempmed" = "#f8766d") # fill colours
  ) +
  theme(panel.grid.minor = element_blank()) # no minor grid lines
20
Returns %
psavert
uempmed
10
0
1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
date
Source: Economics
42
6.3 Calendar Heat Map
# http://margintale.blogspot.in/2012/04/ggplot2-time-series-heatmaps.html
library(ggplot2)
library(plyr)
library(scales)
library(zoo)
# Download the data, parse the date column and keep rows from 2012 onwards.
# NOTE(review): assumes the CSV already provides `date` and `year` columns — confirm.
df <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/yahoo.csv")
df$date <- as.Date(df$date) # convert to Date
df <- df[df$year >= 2012, ] # keep only the required years
Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
Wed
Tue
2012
Thu
Mon
Fri
Wed
Tue
2013
Thu Close
Mon
Fri 40
Wed
Tue
2014
30
Thu
Mon
Fri
20
Wed
Tue
2015
Thu
Mon
Fri
Wed
Tue
2016
Thu
Mon
Fri
12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345 12345
Week of Month
43
6.4 Slope Chart
Slope charts are a great tool if you want to visualise change in value and ranking between categories. They are
more suitable than a time series plot when there are very few time points.
Slope Chart
library(slopegraph)
library(ggplot2)
data(states)
# Line/label colours: 37 entries, all black except the 7th, which is highlighted in red
cols <- rep("black", 37)
cols[7] <- "red"
# Slope graph of the states data; cols supplies the colour of each line and its label
ggslopegraph(states, offset.x = 0.06, yrev = TRUE,
             col.lines = cols, col.lab = cols,
             main = "Relative Rank of U.S. State Populations, 1790-1870") +
  theme_bw()
44
6.5 Seasonal Plot
library(ggplot2)
library(forecast)
theme_set(theme_classic())
# Plot: seasonal plot of the full AirPassengers series.
# No subsetting is done here: this figure uses the whole series, so the unused
# nottem_small subset was dropped from this snippet.
ggseasonplot(AirPassengers) +
  labs(title = "Seasonal plot: International Airline Passengers")
600
year
1949
1950
1951
1952
400 1953
1954
1955
1956
1957
1958
200
1959
1960
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
Month
45
6.5 Seasonal Plot
library(ggplot2)
library(forecast)
theme_set(theme_classic())
# Restrict the nottem series to January 1920 through December 1925
nottem_small <- window(
  nottem,
  start = c(1920, 1),
  end = c(1925, 12)
)
# Seasonal plot of the subset
ggseasonplot(nottem_small) +
  labs(title = "Seasonal plot: Air temperatures at Nottingham Castle")
60
year
1920
1921
1922
50
1923
1924
1925
40
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
Month
46
7. Groups
library(ggplot2)
library(ggdendro)
theme_set(theme_bw())
# plot: draw hc as a rotated dendrogram
# NOTE(review): hc is not created in this snippet; presumably a hierarchical clustering
# result (e.g. from hclust()) computed beforehand — confirm.
ggdendrogram(hc, rotate = TRUE, size = 2)
New Hampshire
Iowa
Wisconsin
Minnesota
Vermont
North Dakota
South Dakota
Maine
West Virginia
Hawaii
Pennsylvania
Connecticut
Kansas
Indiana
Utah
Ohio
Montana
Kentucky
Nebraska
Idaho
Texas
Colorado
Georgia
Tennessee
Arkansas
Missouri
New Jersey
Massachusetts
Rhode Island
Virginia
Oklahoma
Wyoming
Oregon
Washington
South Carolina
Mississippi
Alaska
Nevada
Michigan
New York
Illinois
Louisiana
Alabama
Delaware
New Mexico
Arizona
Maryland
California
North Carolina
Florida
0 50 100 150
47
7.2 Clusters
library(ggplot2)
library(ggalt)
library(ggfortify)
theme_set(theme_classic())
Iris Clustering
With principal components PC1 and PC2 as X and Y axis
Species
setosa
PC2
0
versicolor
virginica
−1
48