
Introduction

Read
library(xlsx)
data = read.xlsx('Classify_risk_dataset.xlsx', sheetIndex = 1, header = TRUE,
stringsAsFactors = TRUE)

cleveland_data <- read.csv('cleveland_data.txt', header = T, sep = ",",
dec = ".", stringsAsFactors = F)
Apply

apply(data, 1, sum) # row sums of a data frame

unlist(lapply(1:2, function(x){sum(M[,x])})) # column sums of the first 2 columns of M (lapply returns a list)

sapply(1:2, function(x){sum(M[,x])}) # same, but sapply simplifies the result to a vector

# tapply(variable to summarise, grouping variable used to split the data,
# function to apply to each resulting group)

 tapply(medical.data$age, medical.data$treatment, mean)
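A minimal runnable sketch of tapply (medical.data here is a made-up example frame, not one of the course datasets):

medical.data <- data.frame(age = c(34, 51, 47, 62, 58, 40),
treatment = c('A', 'B', 'A', 'B', 'A', 'B'))
tapply(medical.data$age, medical.data$treatment, mean) # mean age per treatment group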

Others
cbind , rbind

df = data.frame(array1, array2) # similar to cbind

head(df), tail(df), colnames(df), nrow(df), str(df)

df$col1 EQUAL TO df[,1] EQUAL TO df[,'col1']

df['vol'] NOT EQUAL TO df[,'vol'] # the first returns a data frame, the second a vector

> df['vol']
  vol
1   5
2   6
3   7
4   8

> df[,'vol']
[1] 5 6 7 8

mydata[mydata$e=='red', c('d', 'e')] #or
mydata[mydata['e']=='red', 'd']

mydata[which(mydata$e== 'red'& mydata$d <=1),]

mydata[,-c(1, 2)]

Soil=Soil[order(-Soil$pH),] # order by decreasing

> Soil[nrow(Soil)+1,]<-apply(Soil,2,sum, na.rm = TRUE)

> rownames(Soil)[nrow(Soil)]<- ' Total'
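A hypothetical Soil data frame that makes the three lines above runnable (the real values are not given in these notes):

Soil <- data.frame(pH = c(5.2, 6.8, 4.9), H2O = c(18, 25, 22)) # hypothetical values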

Data Preparation
Read file
cleveland_data <- read.csv('cleveland_data.txt',header = T,sep =
",",dec = ".",stringsAsFactors = F)

Change colnames
colnames(data_switzerland)[colnames(data_switzerland)=="AGE"] <- "age"
#or colnames(switzerland_data)[4]<-'age'
ID column
data$id <- 1:nrow(data)

Functions
mean(data$age)
sd(data$age)
min(data$age)
max(data$age)
NA treatment
data[data=='?']<-NA
Cols to factors
cols_to_factors <- c('origin','sex','cp','fbs','slope','num')
for (i in cols_to_factors){
data[,i] <- as.factor(data[,i])
}

Data cleaning
Remove observations with 3 or more missing values
aux <- apply(data,1,function(x)sum(is.na(x)))
length(aux[aux<3]) #581 observations
data <- data[which(aux<3),]
Replace NA with regression results
model<-lm(PO4 ~ oPO4, data = algae)
AND
algae[28, "PO4"] <- model[["coefficients"]][1] +
model[["coefficients"]][2] * algae[28, "oPO4"]
OR (if missing values)
fillPO4 <- function(oP) {
if (is.na(oP))
return(NA)
else
(return(model[["coefficients"]][1]+model[["coefficients"]][2]*oP))}
algae[is.na(algae$PO4), "PO4"] <- sapply(algae[is.na(algae$PO4),
"oPO4"], fillPO4)
Replace NA with mode
for (i in c('fbs', 'slope', 'thal')){
data[is.na(data[,i]),i] <-
as.numeric(names(sort(-table(data[,i])))[1]) # mode = most frequent observed (non-missing) value
}
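The same idea wrapped in a small helper may read more clearly (a sketch; statistical_mode is a hypothetical helper, not a base R function):

statistical_mode <- function(x) as.numeric(names(which.max(table(x)))) # most frequent observed value
for (i in c('fbs', 'slope', 'thal')){
data[is.na(data[,i]), i] <- statistical_mode(data[,i])
}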
Replace NA with mean/median
algae[is.na(algae$Chla), "Chla"] <- median(algae$Chla, na.rm = T)
Replace NA with kNN
library(DMwR2) # knnImputation() comes from the DMwR/DMwR2 package
algae <- knnImputation(algae, k = 10, meth = "median")
Binning
library(infotheo)
discretize(algae$Cl, disc="equalfreq", nbins=sqrt(length(algae$Cl)))
#Create new attribute with bin values
algae$bin_2<-as.numeric(unlist(discretize(algae$Cl, disc="equalfreq",
nbins=sqrt(length(algae$Cl)))))

#Associate the mean value to each bin
for (k in 1:round(sqrt(NROW(algae$Cl)))){
algae$Cl[algae$bin_2 == k]<-mean(algae[algae$bin_2==k, "Cl"])}
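An equivalent one-liner with ave(), assuming bin_2 already holds the bin index of each row:

algae$Cl <- ave(algae$Cl, algae$bin_2, FUN = mean) # replace each value by the mean of its bin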
Eliminate outliers outside 3 SD
data <- data[-which(data$chol > (mean(data$chol)+3*sd(data$chol)) |
data$chol < (mean(data$chol)-3*sd(data$chol))),]
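Note that -which() drops every row when no outlier is found (which() then returns an empty vector); a safer variant of the same 3-SD rule:

z <- (data$chol - mean(data$chol, na.rm = TRUE)) / sd(data$chol, na.rm = TRUE)
data <- data[which(abs(z) <= 3), ] # which() also leaves out rows where chol is NA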

Data transformation
Data with threshold
data$var_2 <- 0
data[which(data$thalach>150),'var_2'] <- 1
Data with threshold (text labels)
Soil$aux_2[Soil$H2O > 20] <- "higher than 20"
Soil$aux_2[Soil$H2O <= 20] <- "lower than or equal to 20"

Z-score normalization
numeric_attrib <- c('age','trestbps')

z_score_function <-
function(x) {(x - mean(x, na.rm=TRUE))/(sd(x,na.rm=TRUE))}

data[,numeric_attrib] <-
apply(data[,numeric_attrib],2, z_score_function)
Min-max normalization
new_max<-2
new_min<-0

doit <- function(x) {(x - min(x, na.rm=TRUE))/(max(x,na.rm=TRUE) -
min(x, na.rm=TRUE))*(new_max-new_min)+new_min}

apply(data[,numeric_attrib], 2, doit) # apply only to numeric columns
Decimal scaling
maxvect <- apply(abs(mtcars), 2, max)
kvector <- ceiling(log10(maxvect))
scalefactor <- 10^kvector
normed <- scale(mtcars, center = FALSE, scale = scalefactor) # same data used to build maxvect
Random Sampling
Mysample_without_replacement <- mtcars[sample(1:nrow(mtcars), 20,
replace=FALSE),] # without replacement the sample size cannot exceed nrow(mtcars)
Mysample_with_replacement <- mtcars[sample(1:nrow(mtcars), 50,
replace=TRUE),]
Stratified Sampling keeping each class distribution
p=0.7
data_sample <- data.frame()
for (i in levels(data$origin)){
data_sub <- data[data$origin==i,]
B = ceiling(nrow(data_sub)*p) #rounds up
data_sub_aux <- data_sub[sample(1:nrow(data_sub),B),]
data_sample <- rbind(data_sample,data_sub_aux)
}
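The same stratified sample can also be drawn with splitTools::partition, the helper used in the classification sections below; a sketch:

library(splitTools)
set.seed(1)
idx <- partition(data$origin, p = c(sample = 0.7, rest = 0.3)) # stratified by origin
data_sample <- data[idx$sample, ]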

Classification
holdout approach with One R
library(splitTools)
library(OneR) # OneR() classifier
partitions <- partition(Data$RISK, p = c(train = 0.7, test = 0.3))
str(partitions)
Train_sample <- Data[partitions$train,]
Test_sample <- Data[partitions$test,]

Model_OneR<-OneR(Train_sample[,-c(1)])
prediction<-predict(Model_OneR, Test_sample[,-c(1)])
table(prediction, Test_sample$RISK)
accuracy<-sum(prediction==Test_sample$RISK)/nrow(Test_sample)

Repeated holdout approach with One R
number_iterations<-10
accuracy<-vector(length=number_iterations)

for (j in 1:number_iterations){
set.seed(j)
partitions <- partition(Data$RISK, p = c(train = 0.7, test = 0.3))
Test <- Data[partitions$test,]
Train<- Data[partitions$train,]
Model_OneR<-OneR(Train[,-c(1)])
prediction<-predict(Model_OneR, Test[,-c(1)])
accuracy[j]<-sum(prediction== Test$RISK)/nrow(Test)
}

mean_accuracy<-mean(accuracy)
Cross validation with LR and good performance estimation
library(caret) # createFolds() comes from caret, not splitTools
num_folds<-10
set.seed(123)
folds <- createFolds(Data$RISK, k=num_folds)
#index the elements of the test dataset

predictions_summary<-data.frame()

for (k in 1:num_folds){
Model_Logistic<-glm(RISK~.-ID,data=Data[-folds[[k]],], family
= binomial('logit'))

prediction <- predict(Model_Logistic, Data[folds[[k]],], type="response")
## for a binomial glm the default predictions are log-odds (probabilities on
## the logit scale), so type="response" is needed to obtain probabilities

predictions_summary<-rbind(predictions_summary,
data.frame(ID=Data[folds[[k]],1], RISK=Data$RISK[folds[[k]]],
prediction))
}

summary(predictions_summary)

predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))
#PresenceAbsence functions require both observed and predicted values to be numeric

#install.packages('PresenceAbsence')
library(PresenceAbsence)

confusion_matrix<-cmx(predictions_summary, 0.5)
confusion_matrix

performance_metrics<-presence.absence.accuracy(predictions_summary,
threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
performance_metrics

#install.packages('pROC')
library(pROC)

roc_curve<-plot.roc(predictions_summary$RISK,
predictions_summary$prediction, direction="<")
roc_curve

auc_list<-auc(predictions_summary$RISK,
predictions_summary$prediction, direction="<")
auc_list
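PresenceAbsence can also suggest a classification threshold instead of the fixed 0.5 cut-off; a minimal sketch, assuming predictions_summary keeps the ID/observed/predicted layout used above:

optimal.thresholds(predictions_summary, opt.methods = "MaxSens+Spec") # threshold maximising sensitivity + specificity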

Cross validation with Naïve Bayes and performance estimation

#install.packages('e1071')
library (e1071)

predictions_summary<-data.frame()

for (k in 1:num_folds){

Model_NaiveBayes<-naiveBayes(RISK~.-ID,data=Data[-folds[[k]],])

prediction <- predict(Model_NaiveBayes, newdata=Data[folds[[k]],],
type='raw', threshold = 0.001)
predictions_summary<-rbind(predictions_summary,
data.frame(ID=Data[folds[[k]],1], RISK=Data$RISK[folds[[k]]],
prediction=prediction[,'1']))
}

predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))

performance_metrics<-presence.absence.accuracy(predictions_summary,
threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
performance_metrics

confusion_matrix<-cmx(predictions_summary)
confusion_matrix

roc_curve<-plot.roc(predictions_summary$RISK,
predictions_summary$prediction, legacy.axes= T, direction="<")
roc_curve

auc_list<-auc(predictions_summary$RISK,
predictions_summary$prediction, direction="<")
auc_list

kNN with cross validation and performance estimation
library(FNN)

Data$GENDER<-NULL # knn() only accepts numeric inputs, so categorical/binary variables such as GENDER must be dropped

predictions_summary<-data.frame()
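Data_norm is assumed to be a min-max normalised copy of Data with the ID and RISK columns left untouched; a sketch of how it could be built:

Data_norm <- Data
num_cols <- setdiff(names(Data)[sapply(Data, is.numeric)], c('ID', 'RISK'))
Data_norm[, num_cols] <- apply(Data[, num_cols], 2,
function(x) (x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE)))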

for (k in 1:num_folds){
Model_knn<-knn(Data_norm[-folds[[k]],-c(1,8)],
Data_norm[folds[[k]],-c(1,8)], cl=Data_norm[-folds[[k]],8], k = 2,
prob=TRUE)

prob_predictions=ifelse(Model_knn==1,attr(Model_knn, 'prob'),1-
attr(Model_knn, 'prob'))
# This is necessary because "if prob is true, the proportion of
# the votes for the winning class are returned as attribute prob."
# To compute the AUC we need the probability corresponding to
# the positive class

predictions_summary<-rbind(predictions_summary,
data.frame(ID=Data_norm[folds[[k]],1],
RISK=Data_norm$RISK[folds[[k]]], pred=prob_predictions))
}

predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))

auc_list<-auc(predictions_summary$RISK, predictions_summary$pred,
direction="<")

auc_list
kNN with 1 parameter tuning
#create stratified train and test samples
library(splitTools)
library(caret)

partitions <- partition(Data_norm$RISK, p = c(train_validation = 0.8,
test = 0.2))

Test_sample <- data.frame()


Train_validation_sample <- data.frame()

Train_validation_sample <- Data_norm[partitions$train_validation,]


Test_sample <- Data_norm[partitions$test,]

num_folds<-10
max_num_neighbours<-40

set.seed(123)
folds <- createFolds(Train_validation_sample$RISK, k=num_folds)

performance_summary<-data.frame()

for (j in 1:max_num_neighbours){

predictions_summary<-data.frame()

for (k in 1:num_folds){

#8 is the column with the dependent variable
Model_knn<-knn(Train_validation_sample[-folds[[k]],-c(1,8)],
Train_validation_sample[folds[[k]],-c(1,8)],
Train_validation_sample[-folds[[k]],8], k = j, prob=TRUE)

prob_predictions=ifelse(Model_knn==1,attr(Model_knn,
'prob'),1-attr(Model_knn, 'prob'))
# if prob=TRUE, the proportion of votes for the winning class is
# returned as attribute prob; the AUC needs the probability of the
# positive class

predictions_summary<-rbind(predictions_summary,
data.frame(ID=Train_validation_sample[folds[[k]],1],
RISK=Train_validation_sample$RISK[folds[[k]]], pred=prob_predictions))
}

predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))

performance_metrics<-presence.absence.accuracy(predictions_summary,
threshold = 0.5, find.auc = FALSE, st.dev = FALSE)

performance_summary<-rbind(performance_summary,
data.frame(num_neighbors=j, accuracy=performance_metrics$PCC))
}
num_neighbors<-
performance_summary[performance_summary$accuracy==max(performance_summary$accuracy),"num_neighbors"][1]
# [1] guards against ties for the best accuracy
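Before fixing num_neighbors it can help to plot the tuning results:

plot(performance_summary$num_neighbors, performance_summary$accuracy, type = 'b',
xlab = 'number of neighbours (k)', ylab = 'cross-validated accuracy')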

Final_Model_knn<-knn(Train_validation_sample[,-c(1,8)], Test_sample[,-
c(1,8)], Train_validation_sample[,8], num_neighbors, prob=TRUE)

prob_predictions=ifelse(Final_Model_knn==1,attr(Final_Model_knn,
'prob'),1-attr(Final_Model_knn, 'prob'))
# if prob=TRUE, the proportion of votes for the winning class is returned
# as attribute prob; the AUC needs the probability of the positive class

predictions_summary<-data.frame(ID=Test_sample[,1],
RISK=Test_sample$RISK, pred=prob_predictions)

predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))

performance_metrics<-presence.absence.accuracy(predictions_summary,
threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
performance_metrics

DT (Decision trees) with 1 parameter tuning


library(rpart)

num_folds<-10

set.seed(123)
folds <- createFolds(Data$RISK, k=num_folds)
#index the elements of the test dataset

#########################GINI INDEX############################

predictions_summary<-data.frame()

for (k in 1:num_folds){

Model_tree<-rpart(RISK~.-ID,data=Data[-folds[[k]],], parms =
list(split = 'gini'))
# parms - optional parameters for the splitting function.

prediction <- predict(Model_tree, newdata=Data[folds[[k]],])

predictions_summary<-rbind(predictions_summary,
data.frame(ID=Data[folds[[k]],1], RISK=Data$RISK[folds[[k]]],
prediction=prediction[,'1']))
}

#check constructed tree
library(rattle) # fancyRpartPlot() comes from the rattle package
fancyRpartPlot(Model_tree)

predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))
performance_metrics<-presence.absence.accuracy(predictions_summary,
threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
performance_metrics

roc_curve<-plot.roc(predictions_summary$RISK,
predictions_summary$prediction, legacy.axes= T, direction="<")
auc_list<-auc(predictions_summary$RISK,
predictions_summary$prediction)
auc_list

#######Parameter Tuning ############


#create stratified train and test samples
partitions <- partition(Data$RISK, p = c(train_validation = 0.8, test = 0.2))

Test_sample <- data.frame()


Train_validation_sample <- data.frame()

Train_validation_sample <- Data[partitions$train_validation,]


Test_sample <- Data[partitions$test,]

set.seed(123)
folds <- createFolds(Train_validation_sample$RISK, k=num_folds)

performance_summary<-data.frame()
criterion<-c('information','gini')
min_num_objects<-20

aux=0

for (l in criterion)
{
for (i in c(0.01,0))
{
for (j in 2:min_num_objects)
{
predictions_summary<-data.frame()
for (k in 1:num_folds)
{

Model_tree<-rpart(RISK~.-ID,data=Train_validation_sample[-
folds[[k]],], parms = list(split = l), minbucket = j, cp=i)
#minsplit: minimum number of observations in a node before attempting a split
#minbucket: minimum number of observations in a leaf
#cp: complexity parameter; penalizes complexity; cp=0 means no pruning is done

prediction <- predict(Model_tree,
newdata=Train_validation_sample[folds[[k]],])

predictions_summary<-rbind(predictions_summary,
data.frame(ID=Train_validation_sample[folds[[k]],1],
RISK=Train_validation_sample$RISK[folds[[k]]],
prediction=prediction[,'1']))

}
predictions_summary$RISK<-
as.numeric(as.character( predictions_summary$RISK))

performance_metrics<-
presence.absence.accuracy(predictions_summary, threshold = 0.5,
find.auc = FALSE, st.dev = FALSE)

performance_summary<-rbind(performance_summary,
data.frame(criterion=l, cp=i, min_objects=j,
accuracy=performance_metrics$PCC))

}
}
}

l<-
performance_summary[which(performance_summary$accuracy==max(performance_summary$accuracy))[1],"criterion"]
i<-
performance_summary[which(performance_summary$accuracy==max(performance_summary$accuracy))[1],"cp"]
j<-
performance_summary[which(performance_summary$accuracy==max(performance_summary$accuracy))[1],"min_objects"]

####

Final_model_tree<-rpart(RISK~.-ID,data=Train_validation_sample,
parms = list(split = l), minbucket = as.numeric(j), cp=i)

prediction <- predict(Final_model_tree, newdata=Test_sample)

predictions_summary<-data.frame(ID=Test_sample[,1],
RISK=Test_sample$RISK, prediction=prediction[,'1'])

predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))

confusion_matrix<-cmx(predictions_summary, 0.5)
confusion_matrix

performance_metrics<-presence.absence.accuracy(predictions_summary,
threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
performance_metrics

Clustering Plots
## PositionsDesc attribute as a factor
ds$PositionsDesc <- as.factor(ds$PositionsDesc)

### Plot the histogram of the attribute Freekicks


hist(ds$Freekicks, breaks = seq(min(ds$Freekicks), max(ds$Freekicks),
by=2), xlab = "Freekicks", ylab="count", col="lightsteelblue",
border="black")
### Alternatively, using ggplot:
#install.packages("ggplot2")
#library(ggplot2)
ggplot(data=ds, aes(x=Freekicks)) + geom_histogram(binwidth = 2,
color="blue4", fill="lightblue", alpha=0.9) + theme_bw ()

### Plot the frequency polygon of attribute Heading


library(ggplot2)
qplot(ds$Heading,col="dark red",xlab="Heading",ylab="count", geom =
"freqpoly", binwidth=2)
### Alternatively, using ggplot:
ggplot(data=ds, aes(x=Heading)) + geom_freqpoly(binwidth = 2,
color = "red4", size=1.5) + theme_classic()

## variable to keep the column number for the last numerical variable
last_var <- ncol(ds)-1
## parallel coordinates and scatter plot matrix
#install.packages("GGally")
#install.packages("ggrepel")
library(GGally)
library(ggrepel)

ggparcoord(ds, columns = 3:last_var, groupColumn=last_var + 1,
scale="globalminmax")
# scale is a character string denoting how to scale the variables in the
# parallel coordinate plot
# scale="globalminmax": no scaling is done; the range of the graph is
# defined by the global minimum and the global maximum
ggpairs(ds[,3:last_var]) # Make a matrix of plots

## Save the plot into a file. It may be easier to analyze the plot that way
p <- ggpairs(ds[,3:last_var])
ggsave("pairs.pdf", plot = p , width = 30 , height = 30 )

## closer look at two variables (Strength and Flair)
## Observe the relationship between attributes Strength and Flair using a scatter plot
qplot(ds$Strength, ds$Flair ,col="red",xlab="Strength",ylab="Flair")
### Alternatively, using ggplot:
p <- ggplot(data=ds, aes(x=Strength, y=Flair)) +
geom_point(shape=21, color="red4", fill="red3" , size=4) + theme_bw()
## add names
p + geom_label(aes(label=Name), size=0.5)
library(ggrepel) ## to plot names without overlap
p + geom_label_repel (aes(label=Name) , size = 2 , fill = "gold3",
alpha = 0.75 )
# jitter to spread overlapped points (label_repel does not follow the jitter)
p2 <- ggplot(data=ds, aes(x=Strength, y=Flair)) +
geom_jitter(shape=21, color="red4", fill="red3", size=4) + theme_bw()
# geom_jitter adds a small amount of random variation to the location of each
# point, and is a useful way of handling overplotting caused by discreteness
# in smaller datasets

k-Means Full Approach


# Check the number of clusters (k) using the elbow method
k_num <- 2:15
num_tries <- 10 ## Run the K-means algorithm 10 times for each k
## vector to keep the average within sum of squares for each k
avg.tot.w.ss <-double(length(k_num))
for( v in k_num){ ## for each value of the range variable
v.tot.w.ss <-double(num_tries) # vector to hold the 10 tries
for(i in 1:num_tries){
k.temp <- kmeans( ds [, 3:last_var ] , centers=v) #Run k-means
v.tot.w.ss[i] <-k.temp$tot.withinss #Store the total withinss
}
avg.tot.w.ss[v-1] <-mean(v.tot.w.ss) #Average the 10 total withinss
}
plot(k_num, avg.tot.w.ss, type="b", main="Total Within SS by Various K",
ylab="Average Total Within Sum of Squares", xlab="Value of K")

# OR Check the number of clusters with the very useful NbClust package
library(NbClust)

## Davies and Bouldin index


NbClust(ds[, c(3:last_var)] ,distance = "euclidean", min.nc = 2,
max.nc = 15, method = "kmeans", index = "db")

#silhouette index
NbClust( ds[ , c(3:last_var ) ] ,distance = "euclidean", min.nc = 2,
max.nc = 15, method = "kmeans", index = "silhouette")

#all indexes
NbClust (ds[,c(3:last_var)] ,distance="euclidean", min.nc=2,
max.nc=20, method="kmeans",index="all")

## silhouette plot
#install.packages("cluster")
library(cluster)
dis <- dist(ds[,3:last_var])^2 # dist() computes and returns the distance matrix
cl <- kmeans(ds[,3:last_var], 2)
sil <- silhouette(cl$cluster, dis)
windows()
plot(sil)
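The average silhouette width summarises the plot in one number:

summary(sil)$avg.width # closer to 1 means better separated clusters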

#chosen number of clusters is 2


last_var <- ncol(ds)-1
k <- kmeans(ds[,3:last_var], centers = 2)

# create a dataframe with the centroids of the 2 clusters


k_centers <- data.frame(k$centers,cl=c(1,2))
k_centers$cl <- as.factor(k_centers$cl)

#attribute that enables to track the cluster assigned


ds <- cbind(ds , k$cluster)
colnames(ds)[ncol(ds)] <- "cluster"
ds$cluster <- factor(ds$cluster)

## parallel coordinates coloured by cluster


library(ggplot2)
library(GGally)
ggparcoord(data = ds , columns = c(3:last_var), groupColumn =
ncol(ds), scale = "globalminmax") + theme_bw()

## see the name in each cluster


ds <- cbind(ds , k$cluster)
colnames(ds)[ncol(ds)] <- "cluster"
ds$cluster <- factor(ds$cluster)
ds[order(ds$cluster), c('Name', 'cluster')]

PAM with k specified


ds_scaled <- cbind(ds[,1:2], scale(ds[ ,-c(1,2,19)]), ds[,19])

library(cluster)
pam_cl <- cluster::pam(ds_scaled[, 4:ncol(ds_scaled)], k=2)

# check and observe medoids


pam_cl$id.med # row indices of the medoids
ds_scaled[pam_cl$id.med,] # e.g. rows 3 and 28 in one particular run

Ensemble PAM and k-means


pam_cl <- cluster::pam(ds_scaled[, 4:ncol(ds_scaled)], k=2)
km_cl <- kmeans(ds_scaled[,4:ncol(ds_scaled)], centers=2)

ds_cl <- data.frame(ds_scaled, pam=pam_cl$clustering, km=km_cl$cluster)

#compare results
table(pam=ds_cl$pam, kmeans=ds_cl$km)
#OR
#length(which(ds_cl$pam==1 & ds_cl$km==1))
#length(which(ds_cl$pam==1 & ds_cl$km==2))
#length(which(ds_cl$pam==2 & ds_cl$km==1))
#length(which(ds_cl$pam==2 & ds_cl$km==2))

PAM without k specified (use silhouette metric)


#install.packages("fpc")
library(fpc)
pamk_cl <- fpc::pamk(ds_scaled[, 4:ncol(ds_scaled)])

##medoids
pamk_cl$pamobject$id.med

table(pam=ds_cl$pam, pamk=pamk_cl$pamobject$clustering)

Fuzzy clustering
f_kmeans <- cluster::fanny(ds_scaled[, 4:ncol(ds_scaled)], k = 2,
memb.exp = 1.3)
### memb.exp close to 1 gives increasingly rigid (crisp) clusterings, whereas
### higher values lead to complete fuzziness

data.frame(cbind(ds_scaled$Name, round(f_kmeans$membership*100,2)))
apply(f_kmeans$membership, 1, sum)

#plotting
#install.packages("factoextra")
library(factoextra)
fviz_cluster(f_kmeans, ellipse.type = "norm", choose.vars =
c("Heading", "Passing"),legend = "right")
Hierarchical clustering
dist_data <- dist(ds_scaled[, 4:ncol(ds_scaled)]) ## check options with ?dist

## try several methods, e.g. "ward.D", "single", "complete", "average", "median"
hc <- hclust(dist_data, method = "single" )

plot(hc, labels = ds_scaled$Name)

rect.hclust(hc, k=3)

groups <- cutree(hc, k=5)


Dataset_cluster<-cbind(Dataset,groups)
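One way to compare linkage methods is the cophenetic correlation between the dendrogram and the original distances:

cor(dist_data, cophenetic(hc)) # closer to 1 = the dendrogram preserves the original distances better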

DBSCAN
## play with the parameters, and decide whether to use the scaled data or not
dsc_cl <- fpc::dbscan(ds_scaled[, 4:ncol(ds)], eps = 3, MinPts = 5)
dsc_cl$cluster

## Plotting clusters resulting from dbscan, considering attributes 'Dribbling' and 'Heading'
plot(dsc_cl, ds_scaled[, c(4,6)], main = "Dribbling vs Heading")

## Plotting clusters resulting from dbscan, considering all attributes
plot(dsc_cl, ds_scaled[, 4:ncol(ds)], main = "DBScan")
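A quick way to inspect the result is the table of cluster sizes (cluster 0 holds the points labelled as noise):

table(dsc_cl$cluster)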

Visualization
library(readr)
library(dplyr)
library(lubridate)

countries <- c( "France","Spain" )


ds_raw <-
read_csv("https://opendata.ecdc.europa.eu/covid19/nationalcasedeath/csv/data.csv",
col_types ="cccncncnncc" )

ds <- ds_raw %>%
filter ( country %in% countries & indicator == 'cases' ) %>% ## select countries
arrange ( year_week ) %>% ## to make sure it is sorted
group_by ( country ) %>%
slice_tail ( n = 30 ) %>%
mutate ( seq = row_number() ) %>% ## sequential number
ungroup () %>%
select ( -source,-note ) %>% ## drop the 'source' and 'note' columns
rename ( cum_count = cumulative_count )

library ( ggplot2 )
p <- ggplot(data = ds, aes(x = seq , y = weekly_count, color =
country))
## in p1 properties are outside aes() and are fixed
p1 <- p + geom_line( size = 2 , linetype = "dashed" , alpha = 0.5)
## in p2 properties depend on the 'country' variable
p2 <- p + geom_line( aes ( linetype = country ) ) + geom_point( aes
( shape = country ) )

## geom_col makes bars whose heights represent values in the data
p + geom_col()

## violin represents a distribution


p + geom_violin( aes ( fill = country ) , color = "black", alpha = 0.5
)

#Faceting - divides a plot into subplots based on the value of discrete variable(s)
p + facet_wrap( ~ country, nrow = 1)

#crop x axis: e.g. adjust the slice_tail( n = 30 ) step in the pipeline above

# scales control the mapping from data to aesthetics


p2 <- p + scale_x_continuous( name = "week", limits = c( 0, 31 ) ,
breaks = c( 1 , 15 , 30 ) )
p3 <- p2 + scale_color_manual( values = c( "Portugal" = "red3" , Spain
= "blue4" , "Germany" = "green4", "France" =
"gold3") )
library( scales )
p4 <- p3 + scale_y_continuous( name = "weekly cases" , labels =
unit_format(unit = "k", scale = 1e-3 ) )
p5 <- p4 + guides ( color = "none" )

p + scale_color_grey()
p + scale_color_brewer( type = "seq" )
p + scale_color_brewer( type = "qual" )
p + scale_color_brewer( type = "div" )
p + scale_color_brewer( palette = "Dark2" )

p5 + theme_bw()
p5 + theme_classic()
p5 + theme_grey()
p5 + theme_minimal()

#saving the plot


ggsave(filename = "plot_p5.png", plot = p5 , width = 15, height = 10,
units = "cm", dpi = 600)
ggsave("plot_p5.pdf", plot = p5 , width = 15, height = 10, units =
"cm")

Association rules
# define several columns as factors in one line
data[,1:ncol(data)]<-lapply(data[,1:ncol(data)], as.factor)

#install.packages("arules")
library(arules)

# find frequent itemsets


frequent_sets<- apriori(data, parameter = list(support=0.5, target =
"frequent itemsets"))
inspect(sort(frequent_sets, by = 'support'))
# find association rules. Set minlen=2 to avoid rules with only one item
rules<- apriori(data, parameter = list(support=0.4, confidence = 0.6,
minlen=2))
inspect(rules)

#plot
library(arulesViz)
plot(rules, method = "graph", engine = "htmlwidget")

# Generate the rules, with 2 itemsets, that lead to column 'pep' = NO (pep = Personal Equity Plan)
no_pep_rules <- apriori(toy[,-c(1,ncol(toy))],
parameter = list(support=0.5, minlen=2, maxlen=2),
appearance = list(rhs="pep=NO", default="lhs")) # default="lhs": other items may only appear on the left-hand side
inspect(no_pep_rules)

Example Exam 1
1. Start by writing the code that enables you to read the 'sales_data.txt' dataset file and to set
the appropriate data type.
library(readr)
sales_data = read_csv('sales_data.txt')
sales_data$Prod = as.factor(sales_data$Prod)

2. Write the code that allows to compute how many reports do not include both the respective
Quantity and Value.

aux <- apply(sales_data[, c('Quant','Val')], 1, function(x) sum(is.na(x)))
length(aux[aux>0])
OR
Df = sales_data[which(is.na(sales_data$Quant) & is.na(sales_data$Val)),]
nrow(Df)

3. Create a new dataset excluding those reports (those that do not include both the respective
Quantity and
Value).

Df = sales_data[which(aux==0),]
Df = sales_data[-which(is.na(sales_data$Quant) & is.na(sales_data$Val)),]

4. In order to obtain a deeper knowledge on the new dataset, construct a barplot that depicts
the number of
reports per product (use the new dataset).
library(ggplot2)
p <- ggplot(data = Df, aes(x = Prod)) + geom_bar()

5. There are two products (Prod=2442 and Prod=2443) that have all their transactions with
unknown values of the quantity. Exclude from the new dataset all records refereeing to these
products.
Df = Df[-which((Df$Prod=="2442") | (Df$Prod=="2443")),]

6. Given the different quantity of products that are sold on each transaction, it is more correct
to center the
analysis on the unit price rather than on the total monetary value of the sale (Val). Add a new
column to the
dataframe (obtained in the last question) with the unit price and keep the Val column.
Df$Up = Df$Val / Df$Quant

7. You are considering to replace the missing values on the unit price by the average price of the
corresponding product. Create a new object with the mean prices by product.
mean_price_by_prod <- tapply(Df$Up, Df$Prod, mean, na.rm = TRUE)

8. Since you were not sure how to use the table just created to fill in the missing values, you
decided to replace the missing values on the unit price by the global median unit price. Write
the code for this purpose.
Df[is.na(Df$Up), "Up"] <- median(Df$Up, na.rm = T)

9. For those reports in which the quantity is not known, infer it based on the unit price and total
value.
Df[is.na(Df$Quant), "Quant"] <- Df$Val[is.na(Df$Quant)] / Df$Up[is.na(Df$Quant)]

10. Exclude the column Val of the dataset.
Df$Val = NULL

11. Write the code that enables to run a classification model based on k-NN (k=2). Use all data
records with a known target value as training data and predict the unknown target values.

sales_processed$X<-NULL
sales_processed$Prod<-as.factor(sales_processed$Prod)

#Min-max scaling
new_max<-1
new_min<-0
norm <- function(x) {(x - min(x, na.rm=TRUE))/(max(x,na.rm=TRUE) -
min(x, na.rm=TRUE))*((new_max-new_min)+new_min)}

sales_processed[,-c(1,2,3,5)] <- apply(sales_processed[,-


c(1,2,3,5)],2, norm)

# knn() needs numeric inputs, so categorical/binary variables must be dropped
library(FNN)

train = sales_processed[-which(sales_processed$Insp == 'unkn'),]
test = sales_processed[which(sales_processed$Insp == 'unkn'),]

predictions_summary<-data.frame()

Model_knn<-knn(train[,-c(1,8)], test[,-c(1,8)], cl=train[,8], k = 2,
prob=TRUE)

prob_predictions=ifelse(Model_knn==1,attr(Model_knn, 'prob'),1-
attr(Model_knn, 'prob'))

# This is necessary because "if prob is true, the proportion of
# the votes for the winning class are returned as attribute prob."
# To compute the AUC we need the probability corresponding to the positive class

predictions_summary<-rbind(predictions_summary,
data.frame(ID=test[,1], RISK=test$RISK, pred=prob_predictions))

predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))
auc_list<-auc(predictions_summary$RISK, predictions_summary$pred,
direction="<")

auc_list

12. Write the code that enables to identify the top 10% reports in terms of probability of fraud.
Those reports will be part of a rigorous control.

predictions_summary_ordered<-predictions_summary[order(-predictions_summary$pred),]
predictions_summary_ordered[1:round(nrow(test)/10),]

Exam example 2
1. Start by writing the code that enables you to read the dataset file and to set the appropriate
data type.
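No answer is given above; a sketch, assuming a hypothetical file name and that Artist should be treated as a factor:

Data <- read.csv('music_data.csv', header = TRUE, stringsAsFactors = FALSE) # hypothetical file name
Data$Artist <- as.factor(Data$Artist)
str(Data) # check the type assigned to each column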

2. Write the code that allows to estimate the standard deviation for each variable (if possible,
at once).
df_sd = apply(df, 2, sd) # numeric columns only

3. Write the code that enables to add a new column with the music style.
Dataset_ArtistStyle<-cbind(c("Abba","Beatles","Eels","Vivaldi",
"Mozart","Beethoven","Enya"),c("Rock","Rock","Rock","Classical",
"Classical","Classical","New Wave"))
colnames(Dataset_ArtistStyle)[1:2]<-c("V1","Music_Style")
Dataset2=merge(Dataset,Dataset_ArtistStyle,by.x = "Artist",by.y
= "V1",all.x = TRUE)
View(Dataset2)

4. Write the code that enables to estimate the average of “LVar” for each music style (if
possible at once).
tapply(Dataset2$LVar, Dataset2$Music_Style, mean)

5. Write the code that enables to normalize the numeric data using a Z-score approach (if
possible at once).

data[,2:6] <-
apply(data[,2:6], 2 , function(x) {(x - mean(x,
na.rm=TRUE))/(sd(x,na.rm=TRUE))} )

6. Write the code that enables to create a sample of 80% of the original data. This sample

p=0.8
data_sample <- data.frame()
for (i in levels(data$origin)){
data_sub <- data[data$origin==i,]
B = ceiling(nrow(data_sub)*p) #rounds up
data_sub_aux <- data_sub[sample(1:nrow(data_sub),B),]
data_sample <- rbind(data_sample,data_sub_aux)
}
BUT TO HAVE BOTH 80% AND 20%

set.seed(5)
p = 0.2
Data$Artist<-factor(Data$Artist)
Test_sample <- data.frame()
Train_sample <- data.frame()

for(i in levels(Data$Artist)) {
dsub <- Data[Data$Artist == i,]
B = ceiling(nrow(dsub) * p)
C = sample(1:nrow(dsub), B)
dsub_1 <- dsub[C, ]
dsub_2 <- dsub[-C,]
Test_sample <- rbind(Test_sample, dsub_1)
Train_sample <- rbind(Train_sample, dsub_2)
}
Data_train<-Train_sample

7. Write the code that allows to assign the musics of the sample obtained to one of five
clusters constructed using a hierarchical clustering with average link distance. Please consider
only the numeric variables.

dist_data <- dist(ds_scaled[, 4:ncol(ds_scaled)]) ## check options with ?dist

## try several methods, e.g. "ward.D", "single", "complete", "average", "median"
hc <- hclust(dist_data, method = "average" )

plot(hc, labels = ds_scaled$Name)

rect.hclust(hc, k=5)
groups <- cutree(hc, k=5)
Dataset_cluster<-cbind(Dataset,groups)

Example exam 3
3. number of visits and total amount spent per customer. Cluster according to these
Amount = tapply(data$UnitPrice*data$Quantity, data$CustomerID, sum) # total amount spent
Visits = tapply(data$InvoiceNo, data$CustomerID, function(x)
length(unique(x))) # number of distinct invoices = visits
Clustering = kmeans(data.frame(Amount, Visits),5)

4. discretize with equal width to create bins


library(infotheo)
discretize(algae$Cl, disc="equalwidth", nbins=5)

5. use boxplot to identify outliers of total amount
length(boxplot(data$UnitPrice*data$Quantity)$out)
6. STRATIFIED SAMPLING WITH HYPERPARAMETER TUNING OF DT AND AUC FOR PREDICTING
IF QUANTITY IS HIGHER THAN 100

set.seed(5)
p1 = 0.6
p2 = 0.2
data$dep = 0
data$dep[which(data$Quantity > 100)] = 1
data$dep = as.factor(data$dep)

Test_sample <- data.frame()


Train_sample <- data.frame()
Validation_sample <- data.frame()

for(i in levels(data$dep)) {
dsub <- data[data$dep == i,]
B1 = ceiling(nrow(dsub) * p1)
B2 = ceiling(nrow(dsub) * p2)
C1 = sample(1:nrow(dsub), B1)
dsub_1 <- dsub[C1, ]
dsub_2 <- dsub[-C1,]
C2 = sample(1:nrow(dsub_2), B2)
dsub_3 <- dsub_2[C2,]
dsub_4 <- dsub_2[-C2,]
Train_sample <- rbind(Train_sample, dsub_1) # 60% for training
Validation_sample <- rbind(Validation_sample, dsub_3) # 20% for validation
Test_sample <- rbind(Test_sample, dsub_4) # remaining 20% for testing
}

library(rpart)
Model_information<-rpart(dep~
StockCode+UnitPrice+CustomerID+Country,data=Train_sample, method = "class",
parms = list(split = "information"))

Model_gini<-rpart(dep~
StockCode+UnitPrice+CustomerID+Country,data=Train_sample, method = "class",
parms = list(split = "gini"))

Prediction_information <- predict(Model_information, newdata=Validation_sample)

Prediction_gini <- predict(Model_gini, newdata=Validation_sample)

library(pROC)
auc_list_info<-auc(as.numeric(as.character(Validation_sample$dep)),
Prediction_information[,'1'], direction='<')
auc_list_info

auc_list_gini<-auc(as.numeric(as.character(Validation_sample$dep)),
Prediction_gini[,'1'], direction='<')
auc_list_gini

if (auc_list_info > auc_list_gini){
prediction <- predict(Model_information, newdata=Test_sample)
auc <- auc(as.numeric(as.character(Test_sample$dep)), prediction[,'1'],
direction='<')
} else {
prediction <- predict(Model_gini, newdata=Test_sample)
auc <- auc(as.numeric(as.character(Test_sample$dep)), prediction[,'1'],
direction='<')
}
7. Write the R code that enables to ignore observations with missing values and then create a
graph that shows the relationship between the Quantity and the UnitPrice. The Country of the
transactions should be represented by the color of the observations
new_data <-na.omit(new_data)
library(ggplot2)
ggplot(data, aes(x= UnitPrice, y= Quantity,colour= Country)) +
geom_point()
plot(data$UnitPrice, data$Quantity, col=data$Country)

8. Write the R code that enables to replace the Description values starting by “?” by NA
data[which(substr(data$Description,1,1)=="?"),"Description"]<-NA # which() avoids problems with NA Descriptions

9. Write the R code that enables to know how many rows do not have a CustomerId
associated, i.e. are missing values
sum(is.na(data$CustomerID))

10. Write the R code that calculates simultaneously the median of the variables Quantity and
UnitPrice, per country
sapply(c('Quantity','UnitPrice'), function(x) tapply(data[,x],
data$Country, median))

PART 1
1.1 Write the code to load the datasets and classify each variable according to their type.

1.2 Write the code to include a variable that identifies which type of wine refers (red or white)
each observation, and integrate the two datasets.

1.3 Assume that the “alcohol” is dependent on the "volatileacidity" and "citricacid". Write the
code to infer the missing values of “alcohol” variable considering a linear regression.

1.4 Write the code to estimate the mean of "fixedacidity" and "volatileacidity" for each quality
level. Try to use one single code statement.

1.5 Write the code that uses the complete dataset to train an ensemble classification algorithm
that enables to predict the quality of the wine, using the remaining attributes. For this purpose
use oneR algorithm and decision tree algorithm (with default parameters). Estimate the
training error of this ensemble in order to measure the overfitting.

#part I
#1
setwd("C:/Users/afons/Downloads/AE_Normal_2022")

red <- read.csv('winequality-red.csv',header = T,sep = ";",dec = ".",stringsAsFactors = F)


white <- read.csv('winequality-white.csv',header = T,sep = ";",dec = ".",stringsAsFactors = F)

#2
red$type = 'red'
white$type = 'white'

colnames(red)[12] = 'quality'

wine = rbind(red,white)
wine$type = as.factor(wine$type)

#3
length(which(is.na(wine$volatileacidity)))
length(which(is.na(wine$citricacid)))

model<-lm(alcohol ~ volatileacidity+citricacid, data = wine[-which(is.na(wine$alcohol)),])

wine[which(is.na(wine$alcohol)),'alcohol'] = (model[["coefficients"]][1] +
model[["coefficients"]][2]*wine[which(is.na(wine$alcohol)),'volatileacidity'] +
model[["coefficients"]][3]*wine[which(is.na(wine$alcohol)),'citricacid'])

#4
sapply(c("fixedacidity","volatileacidity"),function(x)tapply(wine[,x], wine$quality, median))

#5
#install.packages("OneR")
library('OneR')
library(rpart)
wine$quality = as.character(wine$quality)
wine$quality = as.factor(wine$quality)

Model_OneR<-OneR(wine[,-c(13)])
prediction_OneR<-predict(Model_OneR, wine[,-c(13)])

Model_tree<-rpart(quality~.,data=wine[,-c(13)], parms = list(split = 'gini'))

prediction_tree<-predict(Model_tree, wine[,-c(13)])

prediction_DT = colnames(prediction_tree)[apply(prediction_tree,1,which.max)]
prediction_DT = as.factor(prediction_DT)

table(prediction_OneR, prediction_DT)

table(prediction_OneR, wine$quality)
1 - (612 + 2419 + 24) / nrow(wine)

table(prediction_DT, wine$quality)
1 - (1338 + 2107) / nrow(wine)
PART 2
Using the dataset that integrate the records for both white and red wine answer the following
questions.

2.1 Use ggplot to create a set of parallel boxplots showing the alcohol level for the different
levels of quality. Write a sentence commenting the plot.

2.2 Create a scatter plot having 'citricacid' in the x-axis and ' volatileacidity' in the y-axis. Color
the points by the wine type (red or white). Do something to deal with the points that overlap.
Write a sentence commenting the plot.

2.3 Use the k-means clustering algorithm to analyze if there are groups of records that can be
induced from the values of the chemical variables. Justify your choice for the number of
clusters and any other step in the analysis that you think was necessary. Write a sentence
commenting the results.

# Part 2
#1
library(ggplot2)
p <- ggplot(data = wine, aes(x = alcohol, color = quality))
p + geom_boxplot()
# From quality 3 to 6 the alcohol level seems to decrease more significantly.
# Then from 6 to 8, the opposite pattern.

#2
p1 <- ggplot(data = wine, aes(x = citricacid, y = volatileacidity, color = type))
p1 + geom_point() + facet_wrap(~type)
# to deal with overlap, plot is divided into subplots with facet_wrap

#red and white have symmetric behaviour regarding volatileacidity and citricacid
#red tends to have higher values of volatileacidity
#while white has higher values for citricacid
#in fact even the outliers of both show this pattern
#nevertheless both present a really wide range, with linearity not being an adequate assumption

#3

#It would be interesting to identify clusters either related to the quality level
# or to the type of wine

#To check the number of clusters, the very useful NbClust package will be used
library(NbClust)

## Davies and Bouldin index


NbClust(wine[, -c(12,13)] ,distance = "euclidean", min.nc = 2, max.nc = 10, method = "kmeans",
index = "db")

#silhouette index
NbClust( wine[, -c(12,13)] ,distance = "euclidean", min.nc = 2, max.nc = 10, method =
"kmeans", index = "silhouette")
#Both yield good results for 2 and 3 clusters

## silhouette plot
#install.packages("cluster")
library(cluster)
dis <- dist(wine[, -c(12,13)])^2 #dist() computes and returns the distance matrix
cl <- kmeans(wine[, -c(12,13)], 3)
sil <- silhouette(cl$cluster, dis)
windows()
plot(sil)

#An alternative would be to use the elbow method

k_num <- 2:10


num_tries <- 10 ## Run the K-means algorithm 10 times for each k

## vector to keep the average within sum of squares for each k


avg.tot.w.ss <-double(length(k_num))
for( v in k_num){ ## for each value of the range variable
v.tot.w.ss <-double(num_tries) # vector to hold the 10 tries
for(i in 1:num_tries){
k.temp <- kmeans( wine[, -c(12,13)] , centers=v) #Run k-means
v.tot.w.ss[i] <-k.temp$tot.withinss #Store the total withinss
}
avg.tot.w.ss[v-1] <-mean(v.tot.w.ss) #Average the 10 total withinss
}
plot(k_num, avg.tot.w.ss, type="b", main="Total Within SS by Various K", ylab="Average Total
Within Sum of Squares", xlab="Value of K")

#from 2 to 3 there is a big gap in the total within SS

#chosen number of clusters is 2 to identify type of wine


k <- kmeans(wine[, -c(12,13)], centers = 2)

# create a dataframe with the centroids of the 2 clusters


k_centers <- data.frame(k$centers,cl=c(1,2))
k_centers$cl <- as.factor(k_centers$cl)

#attribute that enables to track the cluster assigned


wine <- cbind(wine , k$cluster)
colnames(wine)[ncol(wine)] <- "cluster"
wine$cluster <- factor(wine$cluster)

## parallel coordinates coloured by cluster


library(GGally)
ggparcoord(data = wine , columns = c(1:11), groupColumn = ncol(wine), scale =
"globalminmax") + theme_bw()

table(wine$type,wine$cluster)
