4027 Assignment Q5
4027 Assignment Q5
2023-08-27
Question 5
part a)
i)
# Model 1
df$month2 <- df$monthˆ2
m1 <- lm(x ~ month + month2, data = df)
#summary(m1)
# Model 2
df$logx <- log(df$x, base = exp(1))
m2 <- lm(logx ~ month + month2, data = df)
#summary(m2)
# Model 3
df$month3 <- df$monthˆ3
m3 <- lm(x ~ month + month2 + month3, data = df)
#summary(m3)
1
MSE r2 p_val AIC BIC
m1 13391.36 0.0630151 0.0101659 1784.995 1796.874
m2 0.18 0.0516647 0.0237583 1780.547 1780.547
m3 13331.55 0.0672002 0.0205926 1786.350 1801.199
r2 = summary(model)$r.squared
f_dat = summary(model)$fstatistic
p_val = 1 - pf(f_dat[1], f_dat[2], f_dat[3])
aic = AIC(model)
bic = BIC(model)
data.frame(MSE = MSE, r2 = r2, p_val = p_val, AIC = aic, BIC = bic)
}
Based on the above table, we can see that m1 provides a balance between all three models in terms of r-
squared, and AIC metrics: although m3 has a higher r-squared, this is due to it possessing more parameters,
thus artificially increasing the r-squared: as a result it has a higher AIC score. m3, whilst possessing the
lowest AIC, has significantly reduced r-squared compared to the other two models. Thus it is clear that m1
provides a parsimonious balance between model complexity and prediction accuracy.
ii)
# Fitting Model
quant <- c(0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.975)
q <- rq(x ~ month + month2, tau = quant, method="br", data = df)
Quantiles
0.025 0.05 0.1 0.25 0.5 0.75 0.9 0.95 0.975
(Intercept) 106.21 102.12 102.18 126.07 172.64 236.6 287.44 296.55 358.87
month 6.30 10.76 13.86 20.35 33.79 51.0 56.47 68.20 62.86
month2 -0.51 -0.88 -1.05 -1.42 -2.42 -3.6 -3.91 -4.74 -4.73
set.seed(111)
# Graph
2
num_colors <- length(c("Quantile = 0.025", "Quantile = 0.05", "Quantile = 0.01", "Quantile = 0.25", "Qua
"Quantile = 0.75", "Quantile = 0.9", "Quantile = 0.95", "Quantile = 0.975", "Mean
random_colors <- sample(colors(), num_colors)
df %>%
ggplot() +
aes(x = month, y = x) +
geom_point() +
geom_quantile(quantiles = quant, formula = y ~ x + I(xˆ2),
aes(color = as.factor(..quantile..)), se = TRUE) +
geom_smooth(method = "lm", se = TRUE, aes(color = "black"), linewidth = 0.5) +
scale_x_continuous(breaks = 1:12) +
scale_color_manual(name = "Fitted Line",
labels = c("Quantile = 0.025", "Quantile = 0.05", "Quantile = 0.01", "Quantile = 0.
"Quantile = 0.75", "Quantile = 0.9", "Quantile = 0.95", "Quantile = 0.97
values = random_colors) + # Use the randomized colors here
labs(x = "Month", y = "y", title = "Fitted Line Plot")
## Don’t know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
## ‘geom_smooth()‘ using formula = ’y ~ x’
600
Fitted Line
Quantile = 0.025
Quantile = 0.05
Quantile = 0.01
400 Quantile = 0.25
Quantile = 0.5
y
Quantile = 0.75
Quantile = 0.9
Quantile = 0.95
Quantile = 0.975
200
Mean Regression
1 2 3 4 5 6 7 8 9 10 11 12
Month
# Coefficient Graphs
plot(summary(q), ylim = c(50, 350))
3
Prediction Table for December 1962
Prediction Lower Bound Upper Bound
502.4587 471.6534 533.264
(Intercept)
350
200
50
month
350
200
50
month2
350
200
50
iii)
m4 <- rq(x ~ month + month2 + year, data = df, tau = 0.75, method="br")
# Prediction
new_data <- data.frame(month = 12, month2 = 144, year = 1962)
pred <- predict(m4, newdata = new_data, interval = 'confidence')
pred |> kbl() |> kable_styling() |> add_header_above(c('Prediction Table for December 1962' = 3))
4
b)
i)
# Linear Gam
gam0 <- gam(x ~ month, data = df, method = 'REML')
# Default Spline
gam1 <- gam(x ~ s(month), data = df, method = 'REML')
z <- summary(gam1)
z
##
## Family: gaussian
## Link function: identity
##
## Formula:
## x ~ s(month)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 280.299 9.687 28.94 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(month) 2.743 3.41 2.994 0.0316 *
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## R-sq.(adj) = 0.0611 Deviance explained = 7.91%
## -REML = 883.38 Scale est. = 13512 n = 144
z$p.table
5
qq <- as.data.frame(z$s.table)
nam <- names(qq)
rbind(nam,qq)
coef_func2(gam1)
# Cubic Spline
gam2 <- gam(x ~ s(month, bs = 'cr'), data = df, method = 'REML')
summary(gam2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## x ~ s(month, bs = "cr")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 280.299 9.687 28.94 <2e-16 ***
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(month) 2.738 3.397 3.006 0.0312 *
6
Summary Table for GAM0
Estimate Std. Error t value Pr(>|t|)
(Intercept) 265.971 21.346 12.46 0
month 2.204 2.9 0.76 0.449
r-squared -0.003 N/A N/A N/A
deviance 0.0041 N/A N/A N/A
## ---
## Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ’*’ 0.05 ’.’ 0.1 ’ ’ 1
##
## R-sq.(adj) = 0.0612 Deviance explained = 7.91%
## -REML = 882.21 Scale est. = 13512 n = 144
coef_func(gam0) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for GAM0' = 5))
coef_func2(gam1) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for GAM1' = 5))
coef_func2(gam2) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for GAM2' = 5))
R squared is low for all models as we are not considering the effect of year on the value. They do not capture
the true relatonship as it is being confounded by year.
ii)
7
Summary Table for 1949
Estimate Std. Error t value Pr(>|t|)
Intercept 126.667 0.447 283.182 0
r-squared 0.9872 N/A N/A N/A
deviance 0.9973 N/A N/A N/A
edf Ref.df F p-value
s(month) 8.643 8.964 95.114 0.01
coef_func2(gam11) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1949' = 5))
coef_func2(gam12) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1950' = 5))
coef_func2(gam13) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1951' = 5))
coef_func2(gam14) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1952' = 5))
coef_func2(gam15) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1953' = 5))
coef_func2(gam16) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1954' = 5))
coef_func2(gam17) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1955' = 5))
8
Summary Table for 1952
Estimate Std. Error t value Pr(>|t|)
Intercept 197 2.305 85.451 0
r-squared 0.8791 N/A N/A N/A
deviance 0.9478 N/A N/A N/A
edf Ref.df F p-value
s(month) 6.248 7.408 10.887 0.009
9
Summary Table for 1956
Estimate Std. Error t value Pr(>|t|)
Intercept 328.25 1.951 168.254 0
r-squared 0.9801 N/A N/A N/A
deviance 0.9948 N/A N/A N/A
edf Ref.df F p-value
s(month) 8.146 8.806 61.647 0.003
coef_func2(gam18) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1956' = 5))
coef_func2(gam19) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1957' = 5))
coef_func2(gam20) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1958' = 5))
coef_func2(gam21) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1959' = 5))
coef_func2(gam22) |> kbl() |> kable_styling() |> add_header_above(c('Summary Table for 1960' = 5))
## Don’t know how to automatically pick scale for object of type <ts>. Defaulting
## to continuous.
10
Summary Table for 1959
Estimate Std. Error t value Pr(>|t|)
Intercept 428.333 5.53 77.461 0
r-squared 0.9248 N/A N/A N/A
deviance 0.9694 N/A N/A N/A
edf Ref.df F p-value
s(month) 6.528 7.671 17.787 0.007
Fitted GAMs
600
year
1960.0
400 1957.5
x
1955.0
1952.5
1950.0
200
iii)
11
y_pred=predict(gam22, data.frame(month = x_new)) #pred y
plot(df60$x ~ df60$month, ylim=c(350, 650), type = "o", main = 'Fitted GAM for 1960 with Splines', xlab
abline(h = 0)
matplot(df60$month, model_matrix*50 + 400, type = "l", lty = 2, add = T)
lines(y_pred ~ x_new, col = "red", lwd = 2) #red fitted line
550
450
350
2 4 6 8 10 12
Month
## 1
## 539.4304
12