Code
Read
library(xlsx)
data <- read.xlsx('Classify_risk_dataset.xlsx', sheetIndex = 1, header = TRUE,
                  stringsAsFactors = TRUE)
Others
cbind, rbind: combine objects by columns / by rows
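A quick sketch for reference (df is defined here only for illustration, matching the output shown below):
df <- data.frame(vol = 5:8)
df2 <- cbind(df, price = c(10, 20, 30, 40))  # add a column
df3 <- rbind(df, data.frame(vol = 9))        # add a row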
> df['vol']
vol
1 5
2 6
3 7
4 8
> df[,'vol']
[1] 5 6 7 8
mydata[,-c(1, 2)] # drop columns 1 and 2
Data Preparation
Read file
cleveland_data <- read.csv('cleveland_data.txt', header = TRUE, sep = ",",
                           dec = ".", stringsAsFactors = FALSE)
Change colnames
colnames(data_switzerland)[colnames(data_switzerland)=="AGE"] <- "age"
#or colnames(switzerland_data)[4]<-'age'
ID column
data$id <- 1:nrow(data)
Functions
mean(data$age)
sd(data$age)
min(data$age)
max(data$age)
NA treatment
data[data=='?']<-NA
Cols to factors
cols_to_factors <- c('origin','sex','cp','fbs','slope','num')
for (i in cols_to_factors){
data[,i] <- as.factor(data[,i])
}
Data cleaning
Remove observations with 3 or more missing values
aux <- apply(data,1,function(x)sum(is.na(x)))
length(aux[aux<3]) #581 observations
data <- data[which(aux<3),]
Replace NA with regression results
model<-lm(PO4 ~ oPO4, data = algae)
Then fill a single observation:
algae[28, "PO4"] <- model[["coefficients"]][1] +
model[["coefficients"]][2] * algae[28, "oPO4"]
OR (if oPO4 itself may be missing):
fillPO4 <- function(oP) {
  if (is.na(oP)) return(NA)
  else return(model[["coefficients"]][1] + model[["coefficients"]][2] * oP)
}
algae[is.na(algae$PO4), "PO4"] <- sapply(algae[is.na(algae$PO4), "oPO4"], fillPO4)
Replace NA with mode
for (i in c('fbs', 'slope', 'thal')){
  # mode = most frequent value among the non-missing entries of the column
  data[is.na(data[,i]),i] <- as.numeric(names(sort(-table(data[,i])))[1])
}
Replace NA with mean/median
algae[is.na(algae$Chla), "Chla"] <- median(algae$Chla, na.rm = T)
Replace NA with kNN
library(DMwR2)   # knnImputation() comes from DMwR2 (also in the older DMwR package)
algae <- knnImputation(algae, k = 10, meth = "median")
Binning
library(infotheo)
discretize(algae$Cl, disc="equalfreq", nbins=sqrt(length(algae$Cl)))
#Create new attribute with bin values
algae$bin_2<-as.numeric(unlist(discretize(algae$Cl, disc="equalfreq",
nbins=sqrt(length(algae$Cl)))))
Data transformation
Data with threshold
data$var_2 <- 0
data[which(data$thalach>150),'var_2'] <- 1
Data with threshold (vectorised alternative)
Soil$aux_2[Soil$H2O > 20] <- "higher than 20"
Z-score normalization
numeric_attrib <- c('age','trestbps')
z_score_function <-
function(x) {(x - mean(x, na.rm=TRUE))/(sd(x,na.rm=TRUE))}
data[,numeric_attrib] <-
apply(data[,numeric_attrib],2, z_score_function)
Min-max normalization
new_max<-2
new_min<-0
minmax <- function(x) {(x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) -
                        min(x, na.rm=TRUE)) * (new_max - new_min) + new_min}
data[,numeric_attrib] <- apply(data[,numeric_attrib], 2, minmax)
Decimal scaling
maxvect <- apply(abs(mtcars), 2, max)
kvector <- ceiling(log10(maxvect))
scalefactor <- 10^kvector
normed <- scale(mtcars, center = FALSE, scale = scalefactor)
Random Sampling
# mtcars has only 32 rows, so a sample without replacement must be smaller than that
Mysample_without_replacement <- mtcars[sample(1:nrow(mtcars), 20, replace=FALSE),]
Mysample_with_replacement <- mtcars[sample(1:nrow(mtcars), 50, replace=TRUE),]
Stratified Sampling keeping each class distribution
p=0.7
data_sample <- data.frame()
for (i in levels(data$origin)){
data_sub <- data[data$origin==i,]
B = ceiling(nrow(data_sub)*p) #rounds up
data_sub_aux <- data_sub[sample(1:nrow(data_sub),B),]
data_sample <- rbind(data_sample,data_sub_aux)
}
Classification
Holdout approach with OneR
library(splitTools)
library(OneR)
partitions <- partition(Data$RISK, p = c(train = 0.7, test = 0.3))
str(partitions)
Train_sample <- Data[partitions$train,]
Test_sample <- Data[partitions$test,]
Model_OneR<-OneR(Train_sample[,-c(1)])
prediction<-predict(Model_OneR, Test_sample[,-c(1)])
table(prediction, Test_sample$RISK)
accuracy<-sum(prediction==Test_sample$RISK)/nrow(Test_sample)
#Repeated holdout (e.g. 10 repetitions)
number_iterations<-10
accuracy<-numeric(number_iterations)
for (j in 1:number_iterations){
  set.seed(j)
  partitions <- partition(Data$RISK, p = c(train = 0.7, test = 0.3))
  Test <- Data[partitions$test,]
  Train <- Data[partitions$train,]
  Model_OneR<-OneR(Train[,-c(1)])
  prediction<-predict(Model_OneR, Test[,-c(1)])
  accuracy[j]<-sum(prediction==Test$RISK)/nrow(Test)
}
mean_accuracy<-mean(accuracy)
Cross validation with LR and good performance estimation
library(caret)   # createFolds() comes from caret, not splitTools
num_folds<-10
set.seed(123)
folds <- createFolds(Data$RISK, k=num_folds)
#index the elements of the test dataset
predictions_summary<-data.frame()
for (k in 1:num_folds){
  Model_Logistic<-glm(RISK~.-ID, data=Data[-folds[[k]],],
                      family = binomial('logit'))
  prediction<-predict(Model_Logistic, newdata=Data[folds[[k]],], type='response')
  predictions_summary<-rbind(predictions_summary,
    data.frame(ID=Data[folds[[k]],1], RISK=Data$RISK[folds[[k]]],
               prediction))
}
summary(predictions_summary)
predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))
#PresenceAbsence functions require both observed and predicted values to be numeric
#install.packages('PresenceAbsence')
library(PresenceAbsence)
confusion_matrix<-cmx(predictions_summary, 0.5)
confusion_matrix
performance_metrics<-presence.absence.accuracy(predictions_summary,
threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
performance_metrics
#install.packages('pROC')
library(pROC)
roc_curve<-plot.roc(predictions_summary$RISK,
predictions_summary$prediction, direction="<")
roc_curve
auc_list<-auc(predictions_summary$RISK,
predictions_summary$prediction, direction="<")
auc_list
#install.packages('e1071')
library (e1071)
predictions_summary<-data.frame()
for (k in 1:num_folds){
  Model_NaiveBayes<-naiveBayes(RISK~.-ID, data=Data[-folds[[k]],])
  prediction<-predict(Model_NaiveBayes, Data[folds[[k]],], type='raw')
  predictions_summary<-rbind(predictions_summary,
    data.frame(ID=Data[folds[[k]],1], RISK=Data$RISK[folds[[k]]],
               prediction=prediction[,'1']))
}
predictions_summary$RISK<-as.numeric(as.character(predictions_summary$RISK))
performance_metrics<-presence.absence.accuracy(predictions_summary,
threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
performance_metrics
confusion_matrix<-cmx(predictions_summary)
confusion_matrix
roc_curve<-plot.roc(predictions_summary$RISK,
predictions_summary$prediction, legacy.axes= T, direction="<")
roc_curve
auc_list<-auc(predictions_summary$RISK,
predictions_summary$prediction, direction="<")
auc_list
library(class)   # knn()
predictions_summary<-data.frame()
for (k in 1:num_folds){
  Model_knn<-knn(Data_norm[-folds[[k]],-c(1,8)], Data_norm[folds[[k]],-c(1,8)],
                 cl=Data_norm[-folds[[k]],8], k = 2, prob=TRUE)
  # if prob=TRUE, knn returns the proportion of votes for the *winning* class;
  # the AUC needs the probability of the positive class, hence the ifelse()
  prob_predictions=ifelse(Model_knn==1, attr(Model_knn,'prob'), 1-attr(Model_knn,'prob'))
  predictions_summary<-rbind(predictions_summary,
    data.frame(ID=Data_norm[folds[[k]],1], RISK=Data_norm$RISK[folds[[k]]],
               pred=prob_predictions))
}
predictions_summary$RISK<-as.numeric(as.character(predictions_summary$RISK))
auc_list<-auc(predictions_summary$RISK, predictions_summary$pred, direction="<")
auc_list
kNN with one tuned parameter (k, the number of neighbours)
#create stratified train and test samples
library(splitTools)
library(caret)
num_folds<-10
max_num_neighbours<-40
set.seed(123)
folds <- createFolds(Train_validation_sample$RISK, k=num_folds)
performance_summary<-data.frame()
for (j in 1:max_num_neighbours){
  predictions_summary<-data.frame()
  for (k in 1:num_folds){
    Model_knn<-knn(Train_validation_sample[-folds[[k]],-c(1,8)],
                   Train_validation_sample[folds[[k]],-c(1,8)],
                   cl=Train_validation_sample[-folds[[k]],8], k = j, prob=TRUE)
    # knn returns the proportion of votes for the winning class;
    # convert it into the probability of the positive class
    prob_predictions=ifelse(Model_knn==1, attr(Model_knn,'prob'), 1-attr(Model_knn,'prob'))
    predictions_summary<-rbind(predictions_summary,
      data.frame(ID=Train_validation_sample[folds[[k]],1],
                 RISK=Train_validation_sample$RISK[folds[[k]]], pred=prob_predictions))
  }
  predictions_summary$RISK<-as.numeric(as.character(predictions_summary$RISK))
  performance_metrics<-presence.absence.accuracy(predictions_summary,
    threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
  performance_summary<-rbind(performance_summary,
    data.frame(num_neighbors=j, accuracy=performance_metrics$PCC))
}
num_neighbors<-performance_summary[performance_summary$accuracy==
  max(performance_summary$accuracy), "num_neighbors"][1]
Final_Model_knn<-knn(Train_validation_sample[,-c(1,8)], Test_sample[,-c(1,8)],
                     Train_validation_sample[,8], num_neighbors, prob=TRUE)
# the proportion of votes for the winning class is converted into the
# probability of the positive class
prob_predictions=ifelse(Final_Model_knn==1, attr(Final_Model_knn,'prob'),
                        1-attr(Final_Model_knn,'prob'))
predictions_summary<-data.frame(ID=Test_sample[,1], RISK=Test_sample$RISK,
                                pred=prob_predictions)
predictions_summary$RISK<-as.numeric(as.character(predictions_summary$RISK))
performance_metrics<-presence.absence.accuracy(predictions_summary,
  threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
performance_metrics
num_folds<-10
set.seed(123)
folds <- createFolds(Data$RISK, k=num_folds)
#index the elements of the test dataset
#########################GINI INDEX############################
library(rpart)
predictions_summary<-data.frame()
for (k in 1:num_folds){
  Model_tree<-rpart(RISK~.-ID, data=Data[-folds[[k]],],
                    parms = list(split = 'gini'))
  # parms - optional parameters for the splitting function
  prediction<-predict(Model_tree, Data[folds[[k]],])
  predictions_summary<-rbind(predictions_summary,
    data.frame(ID=Data[folds[[k]],1], RISK=Data$RISK[folds[[k]]],
               prediction=prediction[,'1']))
}
predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))
performance_metrics<-presence.absence.accuracy(predictions_summary,
threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
performance_metrics
roc_curve<-plot.roc(predictions_summary$RISK,
predictions_summary$prediction, legacy.axes= T, direction="<")
auc_list<-auc(predictions_summary$RISK,
predictions_summary$prediction)
auc_list
set.seed(123)
folds <- createFolds(Train_validation_sample$RISK, k=num_folds)
performance_summary<-data.frame()
criterion<-c('information','gini')
min_num_objects<-20
aux=0
for (l in criterion){
  for (i in c(0.01,0)){
    for (j in 2:min_num_objects){
      predictions_summary<-data.frame()
      for (k in 1:num_folds){
        Model_tree<-rpart(RISK~.-ID, data=Train_validation_sample[-folds[[k]],],
                          parms = list(split = l), minbucket = j, cp=i)
        #minsplit: minimum number of observations in a node before attempting a split
        #minbucket: minimum number of observations in a leaf
        #cp: complexity parameter; it penalizes complexity; cp=0 means no pruning
        prediction<-predict(Model_tree, Train_validation_sample[folds[[k]],])
        predictions_summary<-rbind(predictions_summary,
          data.frame(ID=Train_validation_sample[folds[[k]],1],
                     RISK=Train_validation_sample$RISK[folds[[k]]],
                     prediction=prediction[,'1']))
      }
      predictions_summary$RISK<-as.numeric(as.character(predictions_summary$RISK))
      performance_metrics<-presence.absence.accuracy(predictions_summary,
        threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
      performance_summary<-rbind(performance_summary,
        data.frame(criterion=l, cp=i, min_objects=j,
                   accuracy=performance_metrics$PCC))
    }
  }
}
l<-performance_summary[which(performance_summary$accuracy==
  max(performance_summary$accuracy))[1], "criterion"]
i<-performance_summary[which(performance_summary$accuracy==
  max(performance_summary$accuracy))[1], "cp"]
j<-performance_summary[which(performance_summary$accuracy==
  max(performance_summary$accuracy))[1], "min_objects"]
####
Final_model_tree<-rpart(RISK~.-ID, data=Train_validation_sample,
  parms = list(split = l), minbucket = as.numeric(j), cp=i)
prediction<-predict(Final_model_tree, Test_sample)
predictions_summary<-data.frame(ID=Test_sample[,1], RISK=Test_sample$RISK,
                                prediction=prediction[,'1'])
predictions_summary$RISK<-
as.numeric(as.character(predictions_summary$RISK))
confusion_matrix<-cmx(predictions_summary, 0.5)
confusion_matrix
performance_metrics<-presence.absence.accuracy(predictions_summary,
threshold = 0.5, find.auc = FALSE, st.dev = FALSE)
performance_metrics
Clustering Plots
## PositionDesc attribute as a factor
ds$PositionsDesc <- as.factor(ds$PositionsDesc)
## variable to keep the column number for the last numerical variable
last_var <- ncol(ds)-1
## parallel coordinates and scatter plot matrix
#install.packages("GGally")
#install.packages("ggrepel")
library(GGally)
library(ggrepel)
## Save the plot into a file. It may be easier to analyze the plot there
p <- ggpairs(ds[,3:last_var])
ggsave("pairs.pdf", plot = p , width = 30 , height = 30 )
# OR Check the number of clusters with the very useful NbClust package
library(NbClust)
#silhouette index
NbClust( ds[ , c(3:last_var ) ] ,distance = "euclidean", min.nc = 2,
max.nc = 15, method = "kmeans", index = "silhouette")
#all indexes
NbClust (ds[,c(3:last_var)] ,distance="euclidean", min.nc=2,
max.nc=20, method="kmeans",index="all")
## silhouette plot
#install.packages("cluster")
library(cluster)
dis <- dist(ds[,3:last_var])^2   #dist() computes and returns the distance matrix
cl <- kmeans(ds[,3:last_var], 2)
sil <- silhouette(cl$cluster, dis)
windows()
plot(sil)
library(cluster)
pam_cl <- cluster::pam(ds_scaled[, 4:ncol(ds_scaled)], k=2)
#compare results (ds_cl holds the cluster labels assigned by each algorithm)
ds_cl <- data.frame(pam = pam_cl$clustering, km = cl$cluster)
table(pam=ds_cl$pam, kmeans=ds_cl$km)
#OR
#length(which(ds_cl$pam==1 & ds_cl$km==1))
#length(which(ds_cl$pam==1 & ds_cl$km==2))
#length(which(ds_cl$pam==2 & ds_cl$km==1))
#length(which(ds_cl$pam==2 & ds_cl$km==2))
##medoids (pamk_cl comes from fpc::pamk, which also chooses the number of clusters)
pamk_cl <- fpc::pamk(ds_scaled[, 4:ncol(ds_scaled)])
pamk_cl$pamobject$id.med
table(pam=ds_cl$pam, pamk=pamk_cl$pamobject$clustering)
Fuzzy clustering
f_kmeans <- cluster::fanny(ds_scaled[, 4:ncol(ds_scaled)], k = 2,
memb.exp = 1.3)
### memb.exp close to 1 gives increasingly rigid (crisp) clusterings, whereas
### higher values lead to complete fuzziness
data.frame(cbind(ds_scaled$Name, round(f_kmeans$membership*100,2)))
apply(f_kmeans$membership, 1, sum)
#plotting
#install.packages("factoextra")
library(factoextra)
fviz_cluster(f_kmeans, ellipse.type = "norm", choose.vars =
c("Heading", "Passing"),legend = "right")
Hierarchical clustering
dist_data <- dist(ds_scaled[, 4:ncol(ds_scaled)])   ## check options with ?dist
hc <- hclust(dist_data)   # default complete linkage; see ?hclust for other methods
plot(hc)
rect.hclust(hc, k=3)
DBSCAN
## play with the parameters and decide whether to use the scaled data or not
dsc_cl <- fpc::dbscan(ds_scaled[, 4:ncol(ds_scaled)], eps = 3, MinPts = 5)
dsc_cl$cluster
Visualization
library(readr)
library(dplyr)
library(lubridate)
library ( ggplot2 )
p <- ggplot(data = ds, aes(x = seq , y = weekly_count, color =
country))
## in p1 properties are outside aes() and are fixed
p1 <- p + geom_line( size = 2 , linetype = "dashed" , alpha = 0.5)
## in p2 properties depend on the 'country' variable
p2 <- p + geom_line( aes ( linetype = country ) ) + geom_point( aes
( shape = country ) )
## geom_col makes bars whose heights represent the values in the data
p + geom_col()
#crop the x axis to the last 30 observations before plotting
ds %>% slice_tail(n = 30) %>%
  ggplot(aes(x = seq, y = weekly_count, color = country)) + geom_line()
p + scale_color_grey()
p + scale_color_brewer( type = "seq" )
p + scale_color_brewer( type = "qual" )
p + scale_color_brewer( type = "div" )
p + scale_color_brewer( palette = "Dark2" )
p5 <- p2   # p5 stands for any previously built plot object
p5 + theme_bw()
p5 + theme_classic()
p5 + theme_grey()
p5 + theme_minimal()
Association rules
# define several columns as factors in one line
data[,1:ncol(data)]<-lapply(data[,1:ncol(data)], as.factor)
#install.packages("arules")
library(arules)
rules <- apriori(data)   # mine rules with the default support/confidence
inspect(head(sort(rules, by = "lift")))
#plot
library(arulesViz)
plot(rules, method = "graph", engine = "htmlwidget")
Example Exam 1
1. Start by writing the code that enables you to read the 'sales_data.txt' dataset file and to set
the appropriate data type.
library(readr)
sales_data = read_csv('sales_data.txt')
sales_data$Prod = as.factor(sales_data$Prod)
2. Write the code that allows to compute how many reports do not include both the respective
Quantity and Value.
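A possible answer (assuming the columns are named Quant and Val, as in the next questions):
aux <- is.na(sales_data$Quant) & is.na(sales_data$Val)
sum(aux) # number of reports missing both Quantity and Value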
3. Create a new dataset excluding those reports (those that do not include both the respective
Quantity and
Value).
Df = sales_data[!aux,]
Df = sales_data[-which(is.na(sales_data$Quant) & is.na(sales_data$Val)),]
4. In order to obtain a deeper knowledge on the new dataset, construct a barplot that depicts
the number of
reports per product (use the new dataset).
library(ggplot2)
p <- ggplot(data = Df, aes(x = Prod)) + geom_bar()
p
5. There are two products (Prod=2442 and Prod=2443) that have all their transactions with
unknown values of the quantity. Exclude from the new dataset all records referring to these
products.
Df = Df[-which(Df$Prod=="2442" | Df$Prod=="2443"),]
6. Given the different quantity of products that are sold on each transaction, it is more correct
to center the
analysis on the unit price rather than on the total monetary value of the sale (Val). Add a new
column to the
dataframe (obtained in the last question) with the unit price and keep the Val column.
Df$Up = Df$Val / Df$Quant
7. You are considering to replace the missing values on the unit price by the average price of the
corresponding product. Create a new object with the mean prices by product.
tapply(Df$Up, Df$Prod, mean, na.rm = TRUE)
8. Since you were not sure how to use the table just created to fill in the missing values, you
decided to replace the missing values on the unit price by the global median unit price. Write
the code for this purpose.
Df[is.na(Df$Up), "Up"] <- median(Df$Up, na.rm = TRUE)
9. For those reports in which the quantity is not known, infer it based on the unit price and total
value.
Df[is.na(Df$Quant), "Quant"] <- Df$Val[is.na(Df$Quant)] / Df$Up[is.na(Df$Quant)]
11. Write the code that enables to run a classification model based on k-NN (k=2). Use all data
records with a known target value as training data and predict the unknown target values.
sales_processed$X<-NULL
sales_processed$Prod<-as.factor(sales_processed$Prod)
#Min-max scaling
new_max<-1
new_min<-0
norm <- function(x) {(x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) -
                     min(x, na.rm=TRUE)) * (new_max-new_min) + new_min}
num_cols <- setdiff(names(sales_processed)[sapply(sales_processed, is.numeric)], "RISK")
sales_processed[, num_cols] <- apply(sales_processed[, num_cols], 2, norm)
#records with a known target value form the training set, records with an unknown
#target form the test set (target column assumed to be called RISK, as in the notes above)
train <- sales_processed[!is.na(sales_processed$RISK), ]
test  <- sales_processed[is.na(sales_processed$RISK), ]
library(class)
Model_knn <- knn(train[, num_cols], test[, num_cols], cl = train$RISK, k = 2, prob = TRUE)
#convert the winning-class vote proportion into the probability of class 1
prob_predictions=ifelse(Model_knn==1, attr(Model_knn,'prob'), 1-attr(Model_knn,'prob'))
predictions_summary<-data.frame(ID=test[,1], RISK=test$RISK, pred=prob_predictions)
#the AUC below only makes sense if the true labels of the test records are known
predictions_summary$RISK<-as.numeric(as.character(predictions_summary$RISK))
auc_list<-auc(predictions_summary$RISK, predictions_summary$pred, direction="<")
auc_list
12. Write the code that enables to identify the top 10% reports in terms of probability of fraud.
Those reports will be part of a rigorous control.
predictions_summary_ordered<-predictions_summary[order(-predictions_summary$pred),]
predictions_summary_ordered[1:round(nrow(predictions_summary_ordered)/10),]
Exam example 2
1. Start by writing the code that enables you to read the dataset file and to set the appropriate
data type.
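A possible answer (the file name and separator are assumptions):
Dataset <- read.csv('music_data.csv', stringsAsFactors = TRUE)
str(Dataset) # check that each variable got the appropriate type
Dataset$Artist <- as.factor(Dataset$Artist)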
2. Write the code that allows to estimate the standard deviation for each variable (if possible,
at once).
df_sd = apply(Dataset[, sapply(Dataset, is.numeric)], 2, sd)
3. Write the code that enables to add a new column with the music style.
Dataset_ArtistStyle<-cbind(c("Abba","Beatles","Eels","Vivaldi",
"Mozart","Beethoven","Enya"),c("Rock","Rock","Rock","Classical",
"Classical","Classical","New Wave"))
colnames(Dataset_ArtistStyle)[1:2]<-c("V1","Music_Style")
Dataset2=merge(Dataset,Dataset_ArtistStyle,by.x = "Artist",by.y
= "V1",all.x = TRUE)
View(Dataset2)
4. Write the code that enables to estimate the average of “LVar” for each music style (if
possible at once).
tapply(Dataset2$LVar, Dataset2$Music_Style, mean)
5. Write the code that enables to normalize the numeric data using a Z-score approach (if
possible at once).
data[,2:6] <-
apply(data[,2:6], 2 , function(x) {(x - mean(x,
na.rm=TRUE))/(sd(x,na.rm=TRUE))} )
6. Write the code that enables to create a sample of 80% of the original data. This sample
p=0.8
data_sample <- data.frame()
for (i in levels(Data$Artist)){
  data_sub <- Data[Data$Artist==i,]
  B = ceiling(nrow(data_sub)*p) #rounds up
  data_sub_aux <- data_sub[sample(1:nrow(data_sub),B),]
  data_sample <- rbind(data_sample,data_sub_aux)
}
BUT TO HAVE BOTH 80% AND 20%:
set.seed(5)
p = 0.2
Data$Artist<-factor(Data$Artist)
Test_sample <- data.frame()
Train_sample <- data.frame()
for(i in levels(Data$Artist)) {
dsub <- Data[Data$Artist == i,]
B = ceiling(nrow(dsub) * p)
C = sample(1:nrow(dsub), B)
dsub_1 <- dsub[C, ]
dsub_2 <- dsub[-C,]
Test_sample <- rbind(Test_sample, dsub_1)
Train_sample <- rbind(Train_sample, dsub_2)
}
Data_train<-Train_sample
7. Write the code that allows to assign the musics of the sample obtained to one of five
clusters constructed using a hierarchical clustering with average link distance. Please consider
only the numeric variables.
hc <- hclust(dist(Train_sample[, sapply(Train_sample, is.numeric)]), method = "average")
plot(hc)
rect.hclust(hc, k=5)
groups <- cutree(hc, k=5)
Dataset_cluster<-cbind(Train_sample, groups)
Example exam 3
3. Number of visits and total amount spent per customer. Cluster according to these.
Amount = tapply(data$UnitPrice*data$Quantity, data$CustomerID, sum)
Visits = tapply(data$InvoiceNo, data$CustomerID, function(x) length(unique(x)))
Clustering = kmeans(data.frame(Amount, Visits), 5)
set.seed(5)
p1 = 0.6
p2 = 0.2
data$dep = 0
data$dep[which(data$Quantity > 100)] = 1
data$dep = as.factor(data$dep)
Test_sample <- data.frame()
Validation_sample <- data.frame()
Train_sample <- data.frame()
for(i in levels(data$dep)) {   # stratify by the target variable just created
  dsub <- data[data$dep == i,]
  B1 = ceiling(nrow(dsub) * p1)
  B2 = ceiling(nrow(dsub) * p2)
  C1 = sample(1:nrow(dsub), B1)
  dsub_1 <- dsub[C1, ]
  dsub_2 <- dsub[-C1,]
  C2 = sample(1:nrow(dsub_2), B2)
  dsub_3 <- dsub_2[C2,]
  dsub_4 <- dsub_2[-C2,]
  Train_sample <- rbind(Train_sample, dsub_1)           # 60%
  Validation_sample <- rbind(Validation_sample, dsub_3) # 20%
  Test_sample <- rbind(Test_sample, dsub_4)             # 20%
}
library(rpart)
Model_information<-rpart(dep~StockCode+UnitPrice+CustomerID+Country,
  data=Train_sample, method = "class", parms = list(split = "information"))
Model_gini<-rpart(dep~StockCode+UnitPrice+CustomerID+Country,
  data=Train_sample, method = "class", parms = list(split = "gini"))
Prediction_information<-predict(Model_information, newdata=Validation_sample)
Prediction_gini<-predict(Model_gini, newdata=Validation_sample)
library(pROC)
auc_list_info<-auc(as.numeric(as.character(Validation_sample$dep)),
  Prediction_information[,'1'], direction='<')
auc_list_info
auc_list_gini<-auc(as.numeric(as.character(Validation_sample$dep)),
  Prediction_gini[,'1'], direction='<')
auc_list_gini
if (auc_list_info > auc_list_gini){
  prediction <- predict(Model_information, newdata=Test_sample)
} else {
  prediction <- predict(Model_gini, newdata=Test_sample)
}
auc <- auc(as.numeric(as.character(Test_sample$dep)), prediction[,'1'], direction='<')
7. Write the R code that enables to ignore observations with missing values and then create a
graph that shows the relationship between the Quantity and the UnitPrice. The Country of the
transactions should be represented by the color of the observations
new_data <-na.omit(new_data)
library(ggplot2)
ggplot(data, aes(x= UnitPrice, y= Quantity,colour= Country)) +
geom_point()
plot(data$UnitPrice, data$Quantity, col=data$Country)
8. Write the R code that enables to replace the Description values starting with "?" by NA
data[substr(data$Description,1,1)=="?","Description"]<-NA
9. Write the R code that enables to know how many rows do not have a CustomerId
associated, i.e. are missing values
sum(is.na(data$CustomerID))
10. Write the R code that calculates simultaneously the median of the variables Quantity and
UnitPrice, per country
sapply(c('Quantity','UnitPrice'), function(x) tapply(data[,x], data$Country,
  median, na.rm=TRUE))
PART 1
1.1 Write the code to load the datasets and classify each variable according to their type.
1.2 Write the code to include a variable that identifies which type of wine (red or white)
each observation refers to, and to integrate the two datasets.
1.3 Assume that the “alcohol” is dependent on the "volatileacidity" and "citricacid". Write the
code to infer the missing values of “alcohol” variable considering a linear regression.
1.4 Write the code to estimate the mean of "fixedacidity" and "volatileacidity" for each quality
level. Try to use one single code statement.
1.5 Write the code that uses the complete dataset to train an ensemble classification algorithm
that enables to predict the quality of the wine, using the remaining attributes. For this purpose
use oneR algorithm and decision tree algorithm (with default parameters). Estimate the
training error of this ensemble in order to measure the overfitting.
#part I
#1
setwd("C:/Users/afons/Downloads/AE_Normal_2022")
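A possible completion (file names and separator are assumptions; the UCI wine-quality files are semicolon-separated):
red <- read.csv('winequality-red.csv', sep = ';')
white <- read.csv('winequality-white.csv', sep = ';')
str(red); str(white) # chemical variables numeric, quality integer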
#2
red$type = 'red'
white$type = 'white'
colnames(red)[12] = 'quality'
wine = rbind(red,white)
wine$type = as.factor(wine$type)
#3
length(which(is.na(wine$volatileacidity)))
length(which(is.na(wine$citricacid)))
wine[which(is.na(wine$alcohol)),'alcohol'] = (model[["coefficients"]][1] +
model[["coefficients"]]
[2]*wine[which(is.na(wine$alcohol)),'volatileacidity']+
model[["coefficients"]][3]*wine[which(is.na(wine$alcohol)),'citricacid'])
#4
sapply(c("fixedacidity","volatileacidity"), function(x) tapply(wine[,x], wine$quality, mean))
#5
#install.packages("OneR")
library('OneR')
library(rpart)
wine$quality = as.character(wine$quality)
wine$quality = as.factor(wine$quality)
Model_OneR<-OneR(wine[,-c(13)])
prediction_OneR<-predict(Model_OneR, wine[,-c(13)])
Model_tree<-rpart(quality~., data=wine[,-c(13)], method="class")
prediction_tree<-predict(Model_tree, wine[,-c(13)])
#predict.rpart returns class probabilities; keep the most probable class
prediction_DT = colnames(prediction_tree)[apply(prediction_tree,1,which.max)]
prediction_DT = as.factor(prediction_DT)
table(prediction_OneR, prediction_DT)
table(prediction_OneR, wine$quality)
1 - (612 + 2419 + 24) / nrow(wine)   #training error of OneR (diagonal read from the table)
table(prediction_DT, wine$quality)
1 - (1338 + 2107) / nrow(wine)       #training error of the tree (diagonal read from the table)
PART 2
Using the dataset that integrate the records for both white and red wine answer the following
questions.
2.1 Use ggplot to create a set of parallel boxplots showing the alcohol level for the different
levels of quality. Write a sentence commenting the plot.
2.2 Create a scatter plot having 'citricacid' in the x-axis and ' volatileacidity' in the y-axis. Color
the points by the wine type (red or white). Do something to deal with the points that overlap.
Write a sentence commenting the plot.
2.3 Use the k-means clustering algorithm to analyze if there are groups of records that can be
induced from the values of the chemical variables. Justify your choice for the number of
clusters and any other step in the analysis that you think was necessary. Write a sentence
commenting the results.
# Part 2
#1
library(ggplot2)
p <- ggplot(data = wine, aes(x = quality, y = alcohol))
p + geom_boxplot()
# From quality 3 to 6 the alcohol level seems to decrease noticeably;
# from 6 to 8 the opposite pattern appears
#2
p1 <- ggplot(data = wine, aes(x = citricacid, y = volatileacidity, color = type))
p1 + geom_point() + facet_wrap(~type)
# to deal with overlap, the plot is divided into subplots with facet_wrap
#red and white show mirrored behaviour regarding volatileacidity and citricacid:
#red tends to have higher values of volatileacidity,
#while white has higher values of citricacid
#in fact even the outliers of both show this pattern
#nevertheless both cover a really wide range, so a linear relationship is not an
#adequate assumption
#3
#It would be interesting to identify clusters related either to the quality level
# or to the type of wine
#To check the number of clusters, the very useful NbClust package will be used
library(NbClust)
#silhouette index
NbClust(wine[, -c(12,13)], distance = "euclidean", min.nc = 2, max.nc = 10,
        method = "kmeans", index = "silhouette")
#The silhouette index yields good results for 2 and 3 clusters
## silhouette plot
#install.packages("cluster")
library(cluster)
dis <- dist(wine[, -c(12,13)])^2   #dist() computes and returns the distance matrix
cl <- kmeans(wine[, -c(12,13)], 3)
sil <- silhouette(cl$cluster, dis)
windows()
plot(sil)
wine$cluster <- cl$cluster
table(wine$type, wine$cluster)