R Programming
R Programming
Page |1
1.1.R intro & Mathematical Operation
16/12/2020
# Useful Shortcuts
# To Clear the R-Console - Ctrl + L
# To Execute a particular line of Code - Ctrl + Enter
## [1] 5 # execution
b = 10
b
## [1] 10
## [1] "numeric"
a = "Hello"
class(a)
## [1] "character"
a = TRUE
class(a)
## [1] "logical"
a = FALSE
class (a)
## [1] "logical"
## [1] 25
x-y
## [1] -5
x*y
## [1] 150
x/y
Page |2
## [1] 0.6666667
sqrt(x)
## [1] 3.162278
x^y
## [1] 1e+15
exp(x) # exponential
## [1] 22026.47
log(x, base=exp(1))
## [1] 2.302585
log10(x)
## [1] 1
factorial(x)
## [1] 3628800
cos(x)
## [1] -0.8390715
## [1] 10
1.2 - Function
17/12/2020
# Functions in R
# to create a function with function name divider
# divider(): divides x by y, prints the quotient to the console and
# returns it invisibly (the value handed back by print()).
divider <- function(x, y) {
  print(x / y)
}
divider(50,25) # x is assigned 50 and y is assigned 25
## [1] 2
Page |3
divider (100,25) # only need to assign specific values of x and y to
# execute the function
## [1] 4
## [1] 575
multiply (19,20)
## [1] 380
f = c(1,2,3,4,5)
f
## [1] 1 2 3 4 5
## [1] 5 6 7 8 9
f+d
f = c(1,2,3,4,5)
Page |4
rm (list = ls()) # to remove all variables (after executing this, the
# environment will be empty)
ls()
## character(0)
#A = 10
#Variable /Object -- > A (Case Sensitive)
#Value = 10
#Read from right to left.
# <- or = # Assignment.
# Simple Mathematical Operations.
# Remove the objects or variables created.
Current Topic
# 4 DATA TYPES (Nominal, Ordinal, Interval and Ratio).
# Our own view (NOIR) versus R's view (Numeric, Character, Logical, Date, Vector) - "two brains".
# We have to adjust ourselves to the way R understands data.
# DATA TYPES
x = 10
class(x)
## [1] "numeric"
# Numeric - Integer and Decimal. In R: Integer (whole number) and Numeric
# (float - decimal)
i = 5L # for integer we need to mention L specifically - Integer
class(i)
## [1] "integer"
is.integer(i)
## [1] TRUE
is.numeric(x)
## [1] TRUE
Page |5
# Character - Categorical Variable - Words/String (Nominal),
# Classification (Gender - Male, Female)
s = "R_Studio"
class(s)
## [1] "character"
FALSE * 5
K = TRUE
class(K)
## [1] "logical"
is.logical(K)
## [1] TRUE
date1 = as.Date("2012-06-28")
# as.Date()# Auto complete # How to enter
# ? as.Date # for help
date1
## [1] "2012-06-28"
class (date1)
## [1] "Date"
as.numeric(date1)
## [1] 15519
class(date2)
Page |6
as.numeric(date2)
## [1] 1340885520
3.1. Vectors
17/12/2020
# Vector - R is called as Vectorized language.
# Array - n-dimension collection of similar elements
# Matrix - special case of an array (two-dimensional array). A matrix generally
# contains numeric values.
# Vectorized form is used by R for calculation (used in solving linear
# regression).
# Creating Vectors
# The most common way to create a Vector is using 'c' [combine]
x = c(1,2,3,4,5,6,7,8,9,10) # c - combine. Combine these values as a vector.
x
## [1] 1 2 3 4 5 6 7 8 9 10
# Vector Operations
x*3 # multiplies each element by 3; No loops necessary!
## [1] 3 6 9 12 15 18 21 24 27 30
x+2
## [1] 3 4 5 6 7 8 9 10 11 12
x-3
## [1] -2 -1 0 1 2 3 4 5 6 7
x/4
## [1] 0.25 0.50 0.75 1.00 1.25 1.50 1.75 2.00 2.25 2.50
x^2
Page |7
## [1] 1 4 9 16 25 36 49 64 81 100
sqrt(x)
## [1] 1 2 3 4 5 6 7 8 9 10
10:1
## [1] 10 9 8 7 6 5 4 3 2 1
-2:3
## [1] -2 -1 0 1 2 3
5:-7
## [1] 5 4 3 2 1 0 -1 -2 -3 -4 -5 -6 -7
## [1] -4 -2 0 2 4 6 8 10 12 14
x-y
## [1] 6 6 6 6 6 6 6 6 6 6
x*y
## [1] -5 -8 -9 -8 -5 0 7 16 27 40
x/y
## [1] -0.2 -0.5 -1.0 -2.0 -5.0 Inf 7.0 4.0 3.0 2.5
x^y
## [1] 10
length(y)
## [1] 10
Page |8
# Unequal length vectors
x+c(1,2) # 1 & 2 will be added repeatedly.
## [1] 2 4 4 6 6 8 8 10 10 12
## [1] 2 4 6 5 7 9 8 10 12 11
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
x<y
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [1] TRUE
all(x<y)
## [1] FALSE
## [1] 6 8 8 6 5 8 10 6 7 6
## [1] 2 2 2 2 1 1 1 1 1 1
## [1] 10
x[1:2]
## [1] 10 9
Page |9
x[c(1:5,9)]
## [1] 10 9 8 7 6 2
## a b c
## 1 2 3
3.2. Factors in R
17/12/2020
# Factor Vectors - Ordinal data [Ordered Categorical]
# Factors are important concept in R, esp. when building models
# Nominal - unordered (sachin, rahul), Ordinal - ordered (supervisor,
# GM, AM, AGM)
# Nominal - character, Ordinal - Factor
q = c("Hockey","Lacrosse","Hockey","Water Polo","Hockey","Lacrosse")
q2 = c(q,"Hockey","Lacrosse","Hockey","Water Polo","Hockey","Lacrosse")
q2
class(q2)
## [1] "character"
as.numeric(q2)
## [1] NA NA NA NA NA NA NA NA NA NA NA NA
class(q2)
## [1] "character"
P a g e | 10
## [1] Hockey Lacrosse Hockey Water Polo Hockey Lacrosse
## [7] Hockey Lacrosse Hockey Water Polo Hockey Lacrosse
## Levels: Hockey Lacrosse Water Polo
# 3 Levels - the distinct names in "q2": Hockey, Lacrosse and Water Polo
# The "levels" of a factor are the unique values of that factor variable.
# Technically R assigns a unique integer to each distinct name, see
# below
as.numeric(q2_F)# In the O/P --> notice "1" = "Hockey" (the first level)
## [1] 1 2 1 3 1 2 1 2 1 3 1 2
## [1] 1 2 NA 8 3 NA 3
#Another example
z_char = c("Hockey", NA ,"Cricket")
z_char
is.na(z_char)
P a g e | 11
# NULL - Absence of anything. It is not exactly missingness, but
# nothingness
# Eg: Having Brain but thinking Nothing! - Makes Sense!!!
# Functions can sometimes return NULL and their arguments can be NULL.
# Important difference is, NULL is atomic and cannot exist within a
# vector...
# ...If used inside a vector, it simply disappears! Let's see...
z= c(1,NULL,3)
z
## [1] 1 3
x = c(1,NA,3)
x
## [1] 1 NA 3
# Notice, here the "NULL" did not get stored in "z"; in fact "z" has a
# length of only 2!
length(z)
length(x)
## [1] TRUE
17/12/2020
# Data Structures in R
# Data come in many types and structures, which can pose a problem for
# some analysis environments, but R handles them with ease.
## VECTOR
# The most common data structure is the one-dimensional vector
# Vector forms the basis of everything in R.
# A vector is collection of elements of same type.
# (ie) A vector cannot be of mixed type.
# R is a Vectorized Language. That means operations are applied to each
# element of the vector automatically,
# ...without the need to loop through the vector.
P a g e | 12
# This is a powerful concept and vectors play a crucial and significant
# role in R.
## DATA FRAME
# Data Frames (DF) - one of the most useful features of R and an often-cited
# reason for R's ease of use.
# In a data frame, each column is actually a vector, and all columns have the
# same length.
# Each column can hold a different type of data.
# Also, within each column, every element must be of the same type, as in
# vectors.
## MATRICES
# A matrix (plural matrices) is a rectangular array or table of numbers,
# symbols, or expressions...
# ...arranged in rows and columns, (i.e.) a 2-dimensional array
# Similar to data.frame(RxC) and also similar to Vector
# Matrix - Element by element operations are possible.
## ARRAYS
# Arrays - An array is essentially a multidimensional vector.
# It must all be of the same type and
# ...individual elements are accessed using Square Brackets.
# First element is Row(R) Index, Second Element is Column(C) Index and
# the remaining elements are for Outer Dimensions (OD).
## LIST
# Lists - Stores any number of items of any type.
# List can contain all numerics or characters or...
#...a mix of the two or data.frames or recursively other lists.
# Data Frames (DF) - one of the most useful features of R and an often-cited
# reason for R's ease of use.
# In a data frame, each column is actually a vector, and all columns have the
# same length.
# Each column can hold a different type of data.
# Also, within each column, every element must be of the same type, as in vectors
P a g e | 13
# Creating a Dataframe from vectors
x = 10:1
y = -4:5
q = c("Hockey","Football","Baseball","Curlin","Rugby","Lacrosse",
"Basketball","Tennis","Cricket","Soccer")
# to combine these 3 vectors, we will use data frame
theDF = data.frame(x,y,q) # this would create a 10x3 data.frame with x, y
# and q as variable names
theDF
## x y q
## 1 10 -4 Hockey
## 2 9 -3 Football
## 3 8 -2 Baseball
## 4 7 -1 Curlin
## 5 6 0 Rugby
## 6 5 1 Lacrosse
## 7 4 2 Basketball
## 8 3 3 Tennis
## 9 2 4 Cricket
## 10 1 5 Soccer
str(theDF) # This will give the structure of the data frame, like data types,
# levels etc.
## [1] 10
P a g e | 14
ncol(theDF)
## [1] 3
dim(theDF)
## [1] 10 3
names (theDF)
## [1] "Sport"
rownames(theDF)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
class(theDF)
## [1] "data.frame"
P a g e | 15
## [1] Hockey Football Baseball Curlin Rugby Lacrosse
## [7] Basketball Tennis Cricket Soccer
## 10 Levels: Baseball Basketball Cricket Curlin Football Hockey ...
Tennis
## [1] -2
## Second Sport
## 3 -2 Baseball
## [1] -2 0
# since only one column was selected, it was returned as a vector and hence
# there are no column names in the output.
## Second Sport
## 3 -2 Baseball
## 5 0 Rugby
## Second Sport
## 1 -4 Hockey
## 2 -3 Football
## 3 -2 Baseball
## 4 -1 Curlin
## 5 0 Rugby
## 6 1 Lacrosse
## 7 2 Basketball
## 8 3 Tennis
## 9 4 Cricket
## 10 5 Soccer
P a g e | 16
theDF[2:4,]
## First Sport
## 1 10 Hockey
## 2 9 Football
## 3 8 Baseball
## 4 7 Curlin
## 5 6 Rugby
## 6 5 Lacrosse
## 7 4 Basketball
## 8 3 Tennis
## 9 2 Cricket
## 10 1 Soccer
class(theDF[ ,"Sport"])
## [1] "factor"
## Sport
## 1 Hockey
## 2 Football
## 3 Baseball
## 4 Curlin
## 5 Rugby
## 6 Lacrosse
## 7 Basketball
## 8 Tennis
## 9 Cricket
## 10 Soccer
class(theDF["Sport"]) # Data.Frame
## [1] "data.frame"
P a g e | 17
class(theDF[["Sport"]]) # Factor
## [1] "factor"
## Sport
## 1 Hockey
## 2 Football
## 3 Baseball
## 4 Curlin
## 5 Rugby
## 6 Lacrosse
## 7 Basketball
## 8 Tennis
## 9 Cricket
## 10 Soccer
## [1] "data.frame"
## Sport
## 1 Hockey
## 2 Football
## 3 Baseball
## 4 Curlin
## 5 Rugby
## 6 Lacrosse
## 7 Basketball
## 8 Tennis
## 9 Cricket
## 10 Soccer
## [1] "data.frame"
# model.matrix(~newFactor -1)
# ? model.matrix() # To be understood
P a g e | 18
4.3 Data Structure Matrices
Vaibhav Kumar
17/12/2020
# A matrix (plural matrices) is a rectangular array or table of numbers,
# symbols, or expressions...
# ...arranged in rows and columns, (i.e.) a 2-dimensional array
## [,1] [,2]
## [1,] 1 6
## [2,] 2 7
## [3,] 3 8
## [4,] 4 9
## [5,] 5 10
## [,1] [,2]
## [1,] 21 26
## [2,] 22 27
## [3,] 23 28
## [4,] 24 29
## [5,] 25 30
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 21 23 25 27 29 31 33 35 37 39
## [2,] 22 24 26 28 30 32 34 36 38 40
nrow(A)
## [1] 5
ncol(A)
## [1] 2
dim(A)
## [1] 5 2
# Add Them
A+B
P a g e | 19
## [,1] [,2]
## [1,] 22 32
## [2,] 24 34
## [3,] 26 36
## [4,] 28 38
## [5,] 30 40
## [,1] [,2]
## [1,] 1 6
## [2,] 2 7
## [3,] 3 8
## [4,] 4 9
## [5,] 5 10
## [,1] [,2]
## [1,] 21 26
## [2,] 22 27
## [3,] 23 28
## [4,] 24 29
## [5,] 25 30
## [,1] [,2]
## [1,] 21 156
## [2,] 44 189
## [3,] 69 224
## [4,] 96 261
## [5,] 125 300
## [,1] [,2]
## [1,] FALSE FALSE
## [2,] FALSE FALSE
## [3,] FALSE FALSE
## [4,] FALSE FALSE
## [5,] FALSE FALSE
P a g e | 20
## [4,] 318 331 344 357 370
## [5,] 365 380 395 410 425
## NULL
rownames(A)
## NULL
colnames(A)= c("Left","Right")
rownames(A)= c("1st","2nd","3rd","4th","5th")
colnames(B)
## NULL
rownames(B)
## NULL
colnames(B)= c("First","Second")
rownames(B)= c("One","Two","Three","Four","Five")
colnames(C)
## NULL
rownames(C)
## NULL
## [1] 5 2
dim(C)
## [1] 2 10
t(A)
A %*% C
## A B C D E F G H I J
## 1st 153 167 181 195 209 223 237 251 265 279
## 2nd 196 214 232 250 268 286 304 322 340 358
## 3rd 239 261 283 305 327 349 371 393 415 437
## 4th 282 308 334 360 386 412 438 464 490 516
## 5th 325 355 385 415 445 475 505 535 565 595
P a g e | 21
4.4 Data Structure Array
Vaibhav Kumar
17/12/2020
# Arrays - An array is essentially a multidimensional vector.
# It must all be of the same type and
# ...individual elements are accessed using Square Brackets.
# First element is Row(R) Index, Second Element is Column(C) Index and
# the remaining elements are for Outer Dimensions (OD).
## , , 1
##
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
##
## , , 2
##
## [,1] [,2] [,3]
## [1,] 7 9 11
## [2,] 8 10 12
theArray [1, ,]# Accessing all elements from Row 1, all columns, all outer
# dimensions & build C x OD (R x C)
## [,1] [,2]
## [1,] 1 7
## [2,] 3 9
## [3,] 5 11
theArray[1, ,1]# Accessing all elements from Row 1, all columns, first
# outer dimension
## [1] 1 3 5
theArray[, ,1]# Accessing all rows, all columns, first outer dimension
## , , 1
##
P a g e | 22
## [,1] [,2] [,3] [,4]
## [1,] 1 3 5 7
## [2,] 2 4 6 8
##
## , , 2
##
## [,1] [,2] [,3] [,4]
## [1,] 9 11 13 15
## [2,] 10 12 14 16
##
## , , 3
##
## [,1] [,2] [,3] [,4]
## [1,] 17 19 21 23
## [2,] 18 20 22 24
##
## , , 4
##
## [,1] [,2] [,3] [,4]
## [1,] 25 27 29 31
## [2,] 26 28 30 32
theArray_4D [1, ,] # Accessing all elements from Row 1, all columns, all
# outer dimensions & build C x OD (R x C)
theArray_4D[1, ,1]
## [1] 1 3 5 7
theArray[, ,1]
17/12/2020
# Lists - Stores any number of items of any type.
# List can contain all numerics or characters or...
#...a mix of the two or data.frames or recursively other lists.
P a g e | 23
# Lists are created with the "list" function.
# Each argument in "list" becomes an element of the list.
## [[1]]
## [1] 1
##
## [[2]]
## [1] 2
##
## [[3]]
## [1] 3
## [[1]]
## [1] 1 2 3
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] 3 4 5 6 7
## [[1]]
## [1] 1 2 3
##
## [[2]]
## [1] 3 4 5 6 7
x = 10:1
y = -4:5
q = c("Hockey","Football","Baseball","Curlin","Rugby","Lacrosse",
"Basketball","Tennis","Cricket","Soccer")
theDF = data.frame(x,y,q) # this would create a 10x3 data.frame with x, y
# and q as variable names
theDF
## x y q
## 1 10 -4 Hockey
## 2 9 -3 Football
## 3 8 -2 Baseball
## 4 7 -1 Curlin
## 5 6 0 Rugby
P a g e | 24
## 6 5 1 Lacrosse
## 7 4 2 Basketball
## 8 3 3 Tennis
## 9 2 4 Cricket
## 10 1 5 Soccer
q = as.factor(q)
# Assigning Names
theDF = data.frame (First=x, Second =y, Sport = q)
theDF
## [1] 10
ncol(theDF)
## [1] 3
dim(theDF)
## [1] 10 3
names (theDF)
names(theDF)[3]
## [1] "Sport"
rownames(theDF)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
P a g e | 25
# Head and Tail
head(theDF)# First 6 rows with all columns
head(theDF, n=10)
class(theDF)
## [1] "data.frame"
## [1] -2
## Second Sport
## 3 -2 Baseball
P a g e | 26
theDF[c(3,5), 2]# Row 3&5 from Column 2;
## [1] -2 0
# since only one column was selected, it was returned as a vector and hence
# there are no column names in the output.
## Second Sport
## 3 -2 Baseball
## 5 0 Rugby
theDF[ , 2:3]
## Second Sport
## 1 -4 Hockey
## 2 -3 Football
## 3 -2 Baseball
## 4 -1 Curlin
## 5 0 Rugby
## 6 1 Lacrosse
## 7 2 Basketball
## 8 3 Tennis
## 9 4 Cricket
## 10 5 Soccer
theDF[2:4,]
## First Sport
## 1 10 Hockey
## 2 9 Football
## 3 8 Baseball
## 4 7 Curlin
## 5 6 Rugby
## 6 5 Lacrosse
## 7 4 Basketball
P a g e | 27
## 8 3 Tennis
## 9 2 Cricket
## 10 1 Soccer
class(theDF[ ,"Sport"])
## [1] "factor"
## Sport
## 1 Hockey
## 2 Football
## 3 Baseball
## 4 Curlin
## 5 Rugby
## 6 Lacrosse
## 7 Basketball
## 8 Tennis
## 9 Cricket
## 10 Soccer
class(theDF["Sport"]) # Data.Frame
## [1] "data.frame"
class(theDF[["Sport"]]) # Factor
## [1] "factor"
## Sport
## 1 Hockey
## 2 Football
## 3 Baseball
## 4 Curlin
## 5 Rugby
## 6 Lacrosse
## 7 Basketball
## 8 Tennis
P a g e | 28
## 9 Cricket
## 10 Soccer
## [1] "data.frame"
## Sport
## 1 Hockey
## 2 Football
## 3 Baseball
## 4 Curlin
## 5 Rugby
## 6 Lacrosse
## 7 Basketball
## 8 Tennis
## 9 Cricket
## 10 Soccer
## [1] "data.frame"
# model.matrix(~newFactor -1)
# ? model.matrix()
list(theDF, 1:10)# theDF is already created in previous exercise!
## [[1]]
## First Second Sport
## 1 10 -4 Hockey
## 2 9 -3 Football
## 3 8 -2 Baseball
## 4 7 -1 Curlin
## 5 6 0 Rugby
## 6 5 1 Lacrosse
## 7 4 2 Basketball
## 8 3 3 Tennis
## 9 2 4 Cricket
## 10 1 5 Soccer
##
## [[2]]
## [1] 1 2 3 4 5 6 7 8 9 10
P a g e | 29
# Three element list
list5 = list(theDF, 1:10, list3)
list5
## [[1]]
## First Second Sport
## 1 10 -4 Hockey
## 2 9 -3 Football
## 3 8 -2 Baseball
## 4 7 -1 Curlin
## 5 6 0 Rugby
## 6 5 1 Lacrosse
## 7 4 2 Basketball
## 8 3 3 Tennis
## 9 2 4 Cricket
## 10 1 5 Soccer
##
## [[2]]
## [1] 1 2 3 4 5 6 7 8 9 10
##
## [[3]]
## [[3]][[1]]
## [1] 1 2 3
##
## [[3]][[2]]
## [1] 3 4 5 6 7
list5
## $data.frame
## First Second Sport
## 1 10 -4 Hockey
## 2 9 -3 Football
## 3 8 -2 Baseball
## 4 7 -1 Curlin
## 5 6 0 Rugby
## 6 5 1 Lacrosse
## 7 4 2 Basketball
## 8 3 3 Tennis
## 9 2 4 Cricket
## 10 1 5 Soccer
##
## $vector
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $list
## $list[[1]]
## [1] 1 2 3
##
P a g e | 30
## $list[[2]]
## [1] 3 4 5 6 7
list6
## $TheDataFrame
## First Second Sport
## 1 10 -4 Hockey
## 2 9 -3 Football
## 3 8 -2 Baseball
## 4 7 -1 Curlin
## 5 6 0 Rugby
## 6 5 1 Lacrosse
## 7 4 2 Basketball
## 8 3 3 Tennis
## 9 2 4 Cricket
## 10 1 5 Soccer
##
## $TheVector
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $TheList
## $TheList[[1]]
## [1] 1 2 3
##
## $TheList[[2]]
## [1] 3 4 5 6 7
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
P a g e | 31
## 2 9 -3 Football
## 3 8 -2 Baseball
## 4 7 -1 Curlin
## 5 6 0 Rugby
## 6 5 1 Lacrosse
## 7 4 2 Basketball
## 8 3 3 Tennis
## 9 2 4 Cricket
## 10 1 5 Soccer
list5[["data.frame"]]
list5[[1]]$Sport
list5[[1]][,"Second"]
## [1] -4 -3 -2 -1 0 1 2 3 4 5
## Second
## 1 -4
## 2 -3
## 3 -2
## 4 -1
## 5 0
## 6 1
## 7 2
## 8 3
## 9 4
## 10 5
# LENGTH OF LIST
length(list5)
## [1] 3
names(list5)
P a g e | 32
## [1] "data.frame" "vector" "list"
list5
## $data.frame
## First Second Sport
## 1 10 -4 Hockey
## 2 9 -3 Football
## 3 8 -2 Baseball
## 4 7 -1 Curlin
## 5 6 0 Rugby
## 6 5 1 Lacrosse
## 7 4 2 Basketball
## 8 3 3 Tennis
## 9 2 4 Cricket
## 10 1 5 Soccer
##
## $vector
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $list
## $list[[1]]
## [1] 1 2 3
##
## $list[[2]]
## [1] 3 4 5 6 7
18/12/2020
# Its time that we load data in R.
# Most common way to get data is reading comma separated values(CSV)
# Reading CSVs
#theUrl = "https://www.jaredlander.com/data/RetailFood.csv"
# visit https://www.jaredlander.com/data/ for other Datasets
#RetailFood = read.table(file=theUrl, header=TRUE, sep =",")
# here sep = "," says the values are separated by commas, and header = TRUE reads the first row as the header
#head(RetailFood) # to read first 6 rows with all columns
# We can also use read.csv instead of read.table, but only when the file is
# in CSV format. It might be tempting to use read.csv, but that is more
# trouble than it is worth,
#...and all it does is call read.table with some arguments preset.
P a g e | 33
# In case the file uses a comma as the decimal mark (with ";" or tab as the
# separator), read.csv2 (or read.delim2) should be used instead of read.table.
#R Binary Files
# save the RetailFood data.frame to Disk
#save(RetailFood, file="C:\\Users\\Vaibhav\\Documents\\R\\Data Structure\\RetailFood.rdata")
# remove RetailFood from memory
#rm(RetailFood)
# Check if it still exists
#head(RetailFood)
# read it from the rdata file
#load("C:\\Users\\Vaibhav\\Documents\\R\\Data Structure\\RetailFood.rdata")
#head(RetailFood)
18/12/2020
# R has various packages which need to be installed, and many of them also
# contain data sets
# Built-in datasets in R
# for example
# data()# List of built-in Datasets in R. Open in different tab.
# Loading
# data(mtcars)
# Print the first 6 rows
# head(mtcars, 6)
P a g e | 34
5.1 Basic statistics – Mean, Median
Vaibhav Kumar
18/12/2020
# Basic Statistics - Mean, Variances,Correlations and T-tests
## [1] 96 71 18 26 15 10 18 39 28 68 58 13 25 38 57 95 60 22 89 93
## [1] 46.95
## [1] 84 57 86 18 12 48 23 93 69 20 29 70 7 66 82 76 65 99 67 56
## [1] 49 63 30 73 61 53 72 38 94 70 41 33 60 58 75 47 95 11 39 46
mean(y)# Will give NA when the sample contains missing values (NA); use
# na.rm = TRUE to ignore them
## [1] 55.4
## [1] 55.4
# Weighted Mean
Grades = c(65,90,54,78)
Weights = c(1/8, 1/8, 1/4, 1/2)
mean(Grades)# Simple Arithmetic mean
## [1] 71.75
## [1] 71.875
#Variance
var(x)
## [1] 909.4184
P a g e | 35
#Calculating Variance using formula!
sum((x-mean(x))^2)/ (length(x)-1)
## [1] 909.4184
# Standard Deviation
sqrt(var(x)) #square root of variance
## [1] 30.15657
sd(x)
## [1] 30.15657
sd(y)
## [1] 21.10226
## [1] 21.10226
## [1] 10
max(x)
## [1] 96
median(x)
## [1] 38.5
min(y)
## [1] 11
min(y, na.rm=TRUE)
## [1] 11
# Summary Statistics
summary(x) # it provides min, max, median, mean, 1st qu. and 3rd qu.
# Quantiles
quantile(x, probs = c(0.25, 0.75)) # Calculate 25th and 75th Quantile
## 25% 75%
## 21.00 68.75
P a g e | 36
quantile(x, probs = c(0.1,0.25,0.5, 0.75,0.99)) # to calculate the values at
# specific probabilities (10th, 25th, 50th, 75th and 99th percentiles)
## 25% 75%
## 40.5 70.5
## 25% 75%
## 40.5 70.5
## # A tibble: 6 x 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1967-07-01 507. 198712 12.6 4.5 2944
## 2 1967-08-01 510. 198911 12.6 4.7 2945
## 3 1967-09-01 516. 199113 11.9 4.6 2958
## 4 1967-10-01 512. 199311 12.9 4.9 3143
## 5 1967-11-01 517. 199498 12.8 4.7 3066
## 6 1967-12-01 525. 199657 11.8 4.8 3018
## [1] -0.7928546
# Let's install the required packages and load them into this R environment
# for executing!!!
P a g e | 37
# Also load the Scales package for some extra plotting features
#install.packages("scales")
library(scales)
## x y Correlation
## 2 psavert pce -0.7928546
## 5 pce psavert -0.7928546
## 7 uempmed psavert -0.3251377
## 10 psavert uempmed -0.3251377
## 8 unemploy psavert -0.3093769
## 14 psavert unemploy -0.3093769
## 4 unemploy pce 0.6145176
## 13 pce unemploy 0.6145176
## 3 uempmed pce 0.7269616
## 9 pce uempmed 0.7269616
## 12 unemploy uempmed 0.8693097
## 15 uempmed unemploy 0.8693097
## 1 pce pce 1.0000000
## 6 psavert psavert 1.0000000
## 11 uempmed uempmed 1.0000000
## 16 unemploy unemploy 1.0000000
P a g e | 38
5.2. Correlation Heatmap
Vaibhav Kumar
18/12/2020
# Correlation
P a g e | 39
## hp -0.78 0.79 1.00 -0.45 0.66 -0.71
## drat 0.68 -0.71 -0.45 1.00 -0.71 0.09
## wt -0.87 0.89 0.66 -0.71 1.00 -0.17
## qsec 0.42 -0.43 -0.71 0.09 -0.17 1.00
P a g e | 40
# Keep only the lower triangle (and diagonal) of a correlation matrix:
# every entry strictly above the diagonal is replaced with NA.
get_lower_tri <- function(cormat) {
  above_diag <- upper.tri(cormat)
  replace(cormat, above_diag, NA)
}
# Keep only the upper triangle (and diagonal) of a correlation matrix:
# every entry strictly below the diagonal is replaced with NA.
get_upper_tri <- function(cormat) {
  below_diag <- lower.tri(cormat)
  replace(cormat, below_diag, NA)
}
P a g e | 41
# negative correlations are in blue color and positive correlations in
# red.
# The function scale_fill_gradient2 is used with the argument
# limit = c(-1,1) as correlation coefficients range from -1 to 1.
# coord_fixed() : this function ensures that one unit on the x-axis is the
# same length as one unit on the y-axis.
P a g e | 42
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
# Print the heatmap
print(ggheatmap)
# Decorate the heatmap: write each cell's value on top of its tile, strip the
# axis titles, grid, border, background and ticks, and restyle the legend.
# NOTE(review): `ggheatmap` and the Var1/Var2/value columns are defined outside
# this excerpt - presumably the plot object printed just above; confirm.
ggheatmap +
# label every tile with `value`, in black text of size 4
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
# legend anchored by its bottom-right corner at (0.6, 0.7), laid out horizontally
legend.justification = c(1, 0),
legend.position = c(0.6, 0.7),
legend.direction = "horizontal")+
# colour bar 7 units wide and 1 unit high, with its title centred on top
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "top", title.hjust = 0.5))
P a g e | 43
5.3. Hypothesis Testing
Vaibhav Kumar
18/12/2020
# T-tests
# Dataset: Tips dependents on...
data(tips, package = "reshape2")
head(tips)
P a g e | 44
...
## $ size : int 2 3 3 2 4 4 2 4 2 2 ...
# Save the tips data set as a CSV file (which Excel can open) on this computer.
# The path must stay on one line: a line break inside the string literal would
# become part of the file name.
write.csv(tips, "C:\\Users\\Vaibhav\\Documents\\R\\Basic statistics\\tips.csv", row.names = FALSE)
# Gender
unique(tips$sex) # levels
##
## One Sample t-test
##
## data: tips$tip
## t = 5.6253, df = 243, p-value = 5.08e-08
## alternative hypothesis: true mean is not equal to 2.5
## 95 percent confidence interval:
## 2.823799 3.172758
## sample estimates:
## mean of x
## 2.998279
##
## One Sample t-test
##
## data: tips$tip
## t = 5.6253, df = 243, p-value = 2.54e-08
## alternative hypothesis: true mean is greater than 2.5
## 95 percent confidence interval:
## 2.852023 Inf
## sample estimates:
## mean of x
## 2.998279
P a g e | 45
t.test(tip ~ sex, data = tips, var.equal = TRUE) # Male and Female are
# independent. Assuming variance for both is equal.
##
## Two Sample t-test
##
## data: tip by sex
## t = -1.3879, df = 242, p-value = 0.1665
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.6197558 0.1074167
## sample estimates:
## mean in group Female mean in group Male
## 2.833448 3.089618
##
## Attaching package: 'Hmisc'
##
## Attaching package: 'UsingR'
head(father.son)
P a g e | 46
## fheight sheight
## 1 65.04851 59.77827
## 2 63.25094 63.21404
## 3 64.95532 63.34242
## 4 65.75250 62.79238
## 5 61.13723 64.28113
## 6 63.02254 64.24221
# Save the father.son data set as a CSV file on this computer.
# The path must stay on one line: a line break inside the string literal would
# become part of the file name.
write.csv(father.son, "C:\\Users\\Vaibhav\\Documents\\R\\Basic statistics\\father_son.csv", row.names = FALSE)
P a g e | 47
6. Learning from Assignment
P a g e | 48