Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
0% found this document useful (0 votes)
23 views

4027 Assignment Q5

Uploaded by

lewis.hastie
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
23 views

4027 Assignment Q5

Uploaded by

lewis.hastie
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 12

Question 5 Output

2023-08-27

Question 5

# Creating Long Dataframe


data <- AirPassengers
df <- as.data.frame(data)

year <- c(seq(1949, 1960, by = 1))


new <- c()
for (i in year){
new <- append(new, rep(i, 12))
}
df <- cbind(df, month = 1:12, year = new)

#ggplot(df, aes(x = month, y = x, group = year, colour = year)) +


# geom_line() +
# geom_point()

part a)

i)

# Model 1
df$month2 <- df$monthˆ2
m1 <- lm(x ~ month + month2, data = df)
#summary(m1)

# Model 2
df$logx <- log(df$x, base = exp(1))
m2 <- lm(logx ~ month + month2, data = df)
#summary(m2)

# Model 3
df$month3 <- df$monthˆ3
m3 <- lm(x ~ month + month2 + month3, data = df)
#summary(m3)

numbers <- function(model){


MSE = round(mean(residuals(model)ˆ2), 2)

1
MSE r2 p_val AIC BIC
m1 13391.36 0.0630151 0.0101659 1784.995 1796.874
m2 0.18 0.0516647 0.0237583 1780.547 1780.547
m3 13331.55 0.0672002 0.0205926 1786.350 1801.199

r2 = summary(model)$r.squared
f_dat = summary(model)$fstatistic
p_val = 1 - pf(f_dat[1], f_dat[2], f_dat[3])
aic = AIC(model)
bic = BIC(model)
data.frame(MSE = MSE, r2 = r2, p_val = p_val, AIC = aic, BIC = bic)
}

# need to adjust AIC


metric <- rbind(m1 = numbers(m1), m2 = numbers(m2), m3 = numbers(m3))

adjust_aic <- AIC(m2) + 2*sum(log(df$x, base = exp(1)))


adjust_bic <- BIC(m2) + 2*sum(log(df$x, base = exp(1)))

metric[2,4] <- adjust_bic


metric[2,5] <- adjust_bic

metric |> kbl() |> kable_styling()

Based on the above table, we can see that m1 provides a balance between all three models in terms of r-
squared, and AIC metrics: although m3 has a higher r-squared, this is due to it possessing more parameters,
thus artificially increasing the r-squared: as a result it has a higher AIC score. m3, whilst possessing the
lowest AIC, has significantly reduced r-squared compared to the other two models. Thus it is clear that m1
provides a parsimonious balance between model complexity and prediction accuracy.

ii)

# Fitting Model
quant <- c(0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975)
q <- rq(x ~ month + month2, tau = quant, method="br", data = df)

coef_table <- as.data.frame(round(q$coefficients,2))


names(coef_table) <- c(0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975)
coef_table |> kbl() |> kable_styling(full_width = FALSE, position = "left") |> add_header_above(c('', '

Quantiles
0.025 0.05 0.1 0.25 0.5 0.75 0.9 0.95 0.975
(Intercept) 106.21 102.12 102.18 126.07 172.64 236.6 287.44 296.55 358.87
month 6.30 10.76 13.86 20.35 33.79 51.0 56.47 68.20 62.86
month2 -0.51 -0.88 -1.05 -1.42 -2.42 -3.6 -3.91 -4.74 -4.73

set.seed(111)
# Graph

2
num_colors <- length(c("Quantile = 0.025", "Quantile = 0.05", "Quantile = 0.01", "Quantile = 0.25", "Qua
"Quantile = 0.75", "Quantile = 0.9", "Quantile = 0.95", "Quantile = 0.975", "Mean
random_colors <- sample(colors(), num_colors)

df %>%
ggplot() +
aes(x = month, y = x) +
geom_point() +
geom_quantile(quantiles = quant, formula = y ~ x + I(xˆ2),
aes(color = as.factor(..quantile..)), se = TRUE) +
geom_smooth(method = "lm", se = TRUE, aes(color = "black"), linewidth = 0.5) +
scale_x_continuous(breaks = 1:12) +
scale_color_manual(name = "Fitted Line",
labels = c("Quantile = 0.025", "Quantile = 0.05", "Quantile = 0.01", "Quantile = 0.
"Quantile = 0.75", "Quantile = 0.9", "Quantile = 0.95", "Quantile = 0.97
values = random_colors) + # Use the randomized colors here
labs(x = "Month", y = "y", title = "Fitted Line Plot")

## Don’t know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## ‘geom_smooth()‘ using formula = ’y ~ x’

Fitted Line Plot

600

Fitted Line
Quantile = 0.025
Quantile = 0.05
Quantile = 0.01
400 Quantile = 0.25
Quantile = 0.5
y

Quantile = 0.75
Quantile = 0.9
Quantile = 0.95
Quantile = 0.975
200
Mean Regression

1 2 3 4 5 6 7 8 9 10 11 12
Month

# Coefficient Graphs
plot(summary(q), ylim = c(50, 350))

3
Prediction Table for December 1962
Prediction Lower Bound Upper Bound
502.4587 471.6534 533.264

(Intercept)
350
200
50

0.0 0.2 0.4 0.6 0.8 1.0

month
350
200
50

0.0 0.2 0.4 0.6 0.8 1.0

month2
350
200
50

0.0 0.2 0.4 0.6 0.8 1.0

iii)

m1_refit <- lm(x ~ month + month2 + year, data = df)

m4 <- rq(x ~ month + month2 + year, data = df, tau = 0.75, method="br")

# Prediction
new_data <- data.frame(month = 12, month2 = 144, year = 1962)
pred <- predict(m4, newdata = new_data, interval = 'confidence')

pred <- as.data.frame(pred)


names(pred) <- c('Prediction', 'Lower Bound', 'Upper Bound')

pred |> kbl() |> kable_styling() |> add_header_above(c('Prediction Table for December 1962' = 3))

4
b)

i)

coef_func <- function(model){


sum = summary(model)
tab = as.data.frame(round(sum$p.table,3))
df <- as.data.frame(cbind(round(rbind(sum$r.sq, sum$dev.exp),4), c('N/A'), c('N/A'), c('N/A')))
rownames(df) <- c('r-squared', 'deviance')
names(df) <- names(tab)
rbind(tab, df)
}

# Linear Gam
gam0 <- gam(x ~ month, data = df, method = 'REML')

# Default Spline
gam1 <- gam(x ~ s(month), data = df, method = 'REML')

z <- summary(gam1)
z

##
## Family: gaussian
## Link function: identity
##
## Formula:
## x ~ s(month)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 280.299 9.687 28.94 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(month) 2.743 3.41 2.994 0.0316 *
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## R-sq.(adj) = 0.0611 Deviance explained = 7.91%
## -REML = 883.38 Scale est. = 13512 n = 144

z$p.table

## Estimate Std. Error t value Pr(>|t|)


## (Intercept) 280.2986 9.686865 28.93595 5.340708e-61

5
qq <- as.data.frame(z$s.table)
nam <- names(qq)
rbind(nam,qq)

## edf Ref.df F p-value


## 1 edf Ref.df F p-value
## s(month) 2.74306691727574 3.40978785243308 2.9944992617236 0.031550925135668

coef_func2 <- function(model){


sum = summary(model)
tab = as.data.frame(round(sum$p.table,3))
stab = as.data.frame(round(sum$s.table,3))
nam = names(stab)
stab = rbind(nam, stab)
names(stab) <- names(tab)
df = as.data.frame(cbind(round(rbind(sum$r.sq, sum$dev.exp),4), c('N/A'), c('N/A'), c('N/A')))
rownames(df) = c('r-squared', 'deviance')
names(df) = names(tab)
ddf = rbind(tab, df, stab)
rownames(ddf) <- c('Intercept', 'r-squared', 'deviance', '', 's(month)')
ddf
}

coef_func2(gam1)

## Estimate Std. Error t value Pr(>|t|)


## Intercept 280.299 9.687 28.936 0
## r-squared 0.0611 N/A N/A N/A
## deviance 0.0791 N/A N/A N/A
## edf Ref.df F p-value
## s(month) 2.743 3.41 2.994 0.032

# Cubic Spline
gam2 <- gam(x ~ s(month, bs = 'cr'), data = df, method = 'REML')
summary(gam2)

##
## Family: gaussian
## Link function: identity
##
## Formula:
## x ~ s(month, bs = "cr")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 280.299 9.687 28.94 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(month) 2.738 3.397 3.006 0.0312 *

6
Summary Table for GAM0
Estimate Std. Error t value Pr(>|t|)
(Intercept) 265.971 21.346 12.46 0
month 2.204 2.9 0.76 0.449
r-squared -0.003 N/A N/A N/A
deviance 0.0041 N/A N/A N/A

Summary Table for GAM1


Estimate Std. Error t value Pr(>|t|)
Intercept 280.299 9.687 28.936 0
r-squared 0.0611 N/A N/A N/A
deviance 0.0791 N/A N/A N/A
edf Ref.df F p-value
s(month) 2.743 3.41 2.994 0.032

## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## R-sq.(adj) = 0.0612 Deviance explained = 7.91%
## -REML = 882.21 Scale est. = 13512 n = 144

coef_func(gam0) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for GAM0' = 5))

coef_func2(gam1) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for GAM1' = 5))

coef_func2(gam2) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for GAM2' = 5))

R squared is low for all models as we are not considering the effect of year on the value. They do not capture
the true relatonship as it is being confounded by year.

ii)

gam11 <- gam(x ~ s(month) , data = df[df$year == '1949',], method = 'REML')


gam12 <- gam(x ~ s(month) , data = df[df$year == '1950',], method = 'REML')
gam13 <- gam(x ~ s(month) , data = df[df$year == '1951',], method = 'REML')
gam14 <- gam(x ~ s(month) , data = df[df$year == '1952',], method = 'REML')
gam15 <- gam(x ~ s(month) , data = df[df$year == '1953',], method = 'REML')
gam16 <- gam(x ~ s(month) , data = df[df$year == '1954',], method = 'REML')
gam17 <- gam(x ~ s(month) , data = df[df$year == '1955',], method = 'REML')

Summary Table for GAM2


Estimate Std. Error t value Pr(>|t|)
Intercept 280.299 9.687 28.937 0
r-squared 0.0612 N/A N/A N/A
deviance 0.0791 N/A N/A N/A
edf Ref.df F p-value
s(month) 2.738 3.397 3.006 0.031

7
Summary Table for 1949
Estimate Std. Error t value Pr(>|t|)
Intercept 126.667 0.447 283.182 0
r-squared 0.9872 N/A N/A N/A
deviance 0.9973 N/A N/A N/A
edf Ref.df F p-value
s(month) 8.643 8.964 95.114 0.01

Summary Table for 1950


Estimate Std. Error t value Pr(>|t|)
Intercept 139.667 3.945 35.401 0
r-squared 0.4864 N/A N/A N/A
deviance 0.6238 N/A N/A N/A
edf Ref.df F p-value
s(month) 2.942 3.654 2.936 0.097

gam18 <- gam(x ~ s(month) , data = df[df$year == '1956',], method = 'REML')


gam19 <- gam(x ~ s(month) , data = df[df$year == '1957',], method = 'REML')
gam20 <- gam(x ~ s(month) , data = df[df$year == '1958',], method = 'REML')
gam21 <- gam(x ~ s(month) , data = df[df$year == '1959',], method = 'REML')
gam22 <- gam(x ~ s(month) , data = df[df$year == '1960',], method = 'REML')

coef_func2(gam11) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1949' = 5))

coef_func2(gam12) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1950' = 5))

coef_func2(gam13) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1951' = 5))

coef_func2(gam14) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1952' = 5))

coef_func2(gam15) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1953' = 5))

coef_func2(gam16) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1954' = 5))

coef_func2(gam17) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1955' = 5))

Summary Table for 1951


Estimate Std. Error t value Pr(>|t|)
Intercept 170.167 3.348 50.833 0
r-squared 0.6045 N/A N/A N/A
deviance 0.718 N/A N/A N/A
edf Ref.df F p-value
s(month) 3.158 3.919 4.315 0.038

8
Summary Table for 1952
Estimate Std. Error t value Pr(>|t|)
Intercept 197 2.305 85.451 0
r-squared 0.8791 N/A N/A N/A
deviance 0.9478 N/A N/A N/A
edf Ref.df F p-value
s(month) 6.248 7.408 10.887 0.009

Summary Table for 1953


Estimate Std. Error t value Pr(>|t|)
Intercept 225 4.199 53.588 0
r-squared 0.7389 N/A N/A N/A
deviance 0.8279 N/A N/A N/A
edf Ref.df F p-value
s(month) 3.748 4.634 6.861 0.014

Summary Table for 1954


Estimate Std. Error t value Pr(>|t|)
Intercept 238.917 3.839 62.235 0
r-squared 0.855 N/A N/A N/A
deviance 0.9271 N/A N/A N/A
edf Ref.df F p-value
s(month) 5.471 6.612 9.884 0.007

Summary Table for 1955


Estimate Std. Error t value Pr(>|t|)
Intercept 284 1.41 201.355 0
r-squared 0.9866 N/A N/A N/A
deviance 0.997 N/A N/A N/A
edf Ref.df F p-value
s(month) 8.541 8.941 90.203 0.011

9
Summary Table for 1956
Estimate Std. Error t value Pr(>|t|)
Intercept 328.25 1.951 168.254 0
r-squared 0.9801 N/A N/A N/A
deviance 0.9948 N/A N/A N/A
edf Ref.df F p-value
s(month) 8.146 8.806 61.647 0.003

Summary Table for 1957


Estimate Std. Error t value Pr(>|t|)
Intercept 368.417 3.518 104.711 0
r-squared 0.9557 N/A N/A N/A
deviance 0.9849 N/A N/A N/A
edf Ref.df F p-value
s(month) 7.253 8.274 28.833 0.003

coef_func2(gam18) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1956' = 5))

coef_func2(gam19) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1957' = 5))

coef_func2(gam20) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1958' = 5))

coef_func2(gam21) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1959' = 5))

coef_func2(gam22) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1960' = 5))

ggplot(data = df, aes(y = x, x = month, group = year, colour = year)) +


geom_point() +
geom_smooth(method = "gam", formula = y ~ s(x)) +
theme(plot.title = element_text(hjust = 0.5)) +
labs(title = 'Fitted GAMs')

## Don’t know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.

Summary Table for 1958


Estimate Std. Error t value Pr(>|t|)
Intercept 381 5.405 70.494 0
r-squared 0.9158 N/A N/A N/A
deviance 0.9645 N/A N/A N/A
edf Ref.df F p-value
s(month) 6.36 7.515 16.078 0.004

10
Summary Table for 1959
Estimate Std. Error t value Pr(>|t|)
Intercept 428.333 5.53 77.461 0
r-squared 0.9248 N/A N/A N/A
deviance 0.9694 N/A N/A N/A
edf Ref.df F p-value
s(month) 6.528 7.671 17.787 0.007

Summary Table for 1960


Estimate Std. Error t value Pr(>|t|)
Intercept 476.167 4.168 114.255 0
r-squared 0.9655 N/A N/A N/A
deviance 0.9889 N/A N/A N/A
edf Ref.df F p-value
s(month) 7.475 8.431 36.404 0.002

Fitted GAMs

600

year
1960.0

400 1957.5
x

1955.0

1952.5

1950.0

200

2.5 5.0 7.5 10.0 12.5


month

iii)

df60 <- df[df$year == '1960',]

model_matrix=predict(gam22, type = "lpmatrix")


x_new = seq(1, 12, length.out = 100)

11
y_pred=predict(gam22, data.frame(month = x_new)) #pred y

plot(df60$x ~ df60$month, ylim=c(350, 650), type = "o", main = 'Fitted GAM for 1960 with Splines', xlab
abline(h = 0)
matplot(df60$month, model_matrix*50 + 400, type = "l", lty = 2, add = T)
lines(y_pred ~ x_new, col = "red", lwd = 2) #red fitted line

Fitted GAM for 1960 with Splines


650
No. of Passengers

550
450
350

2 4 6 8 10 12

Month

# Making Prediction for June 15th


predict(gam22, data.frame(year = 1960, month = 6))

## 1
## 539.4304

12

You might also like