IntroR 2
IntroR 2
> x1=11
> 1x=12
Error: unexpected symbol in "1x"
> x2="Rahmath"
> x2
[1] "Rahmath"
> 11+24
[1] 35
> 7*9
[1] 63
> x=9
> y=6
> x+y
[1] 15
> x-y
[1] 3
> sqrt(x)
[1] 3
> log(x)
[1] 2.197225
> log10(x)
[1] 0.9542425
> x1=c(1,3,5,7,9)
> gender=c("male","female")
> 2:7
[1] 2 3 4 5 6 7
> seq(from=1,to=7,by=2)
[1] 1 3 5 7
> rep(1,time=5)
[1] 1 1 1 1 1
> rep("rahma",times=3)
[1] "rahma" "rahma" "rahma"
> rep(1:3,time=3)
[1] 1 2 3 1 2 3 1 2 3
> x1
[1] 1 3 5 7 9
> x1[3]
[1] 5
> mat=matrix(c(1,2,3,4,5,6,7,8,9),nrow=3,byrow=TRUE)
> mat
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 4 5 6
[3,] 7 8 9
> mat=matrix(c(1,2,3,4,5,6,7,8,9),nrow=3,byrow=FALSE)
> mat
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
> mat*3
[,1] [,2] [,3]
[1,] 3 12 21
[2,] 6 15 24
[3,] 9 18 27
> mat-3
[,1] [,2] [,3]
[1,] -2 1 4
[2,] -1 2 5
[3,] 0 3 6
> mat*mat
[,1] [,2] [,3]
[1,] 1 16 49
[2,] 4 25 64
[3,] 9 36 81
> mat/6
[,1] [,2] [,3]
[1,] 0.1666667 0.6666667 1.166667
[2,] 0.3333333 0.8333333 1.333333
[3,] 0.5000000 1.0000000 1.500000
> m=mat
>m
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
> m[2,]
[1] 2 5 8
#Create an Excel file of the data and save it in csv (comma-separated values) format
>x2=read.table(file.choose(),header=T,sep=",")
#Create an Excel file of the data and save it in txt (tab-delimited) format
> x1=read.delim(file.choose(),header=T)
> x1=read.table(file.choose(),header=T,sep="\t")
> names(LungCapData)
[1] "LungCap" "Age" "Height" "Smoke" "Gender" "Caesarean"
>table(x2$Height)
> attach(x) # lets us refer to the column variables directly by name, without the data-frame prefix; detach(x) does the opposite
> mean(Height)
[1] 64.83628
> class(Age)
[1] "integer"
> class(Smoke)
[1] "character"
> class(Height)
[1] "numeric"
>x$Gender=as.factor(x$Gender) # to convert the character data to factor
> f=x[Gender=="female",] #separates the rows where Gender is female and stores them in f
> m=x[Gender=="male",] #separates the rows where Gender is male and stores them in m
> maleover15=x[Gender=='male'& Age>15,] #separates the rows where Gender is male and Age is over 15 and stores them in maleover15
> temp1=as.numeric(Age>15)
> temp1[1:5]
[1] 0 1 1 0 0
> malesmoke=Gender=="male" & Smoke=="yes"
> malesmoke[1:5]
[1] FALSE FALSE FALSE FALSE FALSE
> save.image("first.Rdata") # saves workspace in current directory under file name first
#can also be done using>>session>>save workspace As…
> load("first.Rdata") #loads workspace data saved in file first.Rdata
> load(file.choose())
#for R Studio preferences
#Box Plot
> boxplot(LungCap)
> quantile(LungCap,probs = c(0,0.25,0.5,0.75,1))
0% 25% 50% 75% 100%
0.507 6.150 8.000 9.800 14.675
> boxplot(LungCap,main="Boxplot",ylab="Lung Capacity", las=1)
> boxplot(LungCap,main="Boxplot",ylab="Lung Capacity", ylim=c(0,16), las=1)
> boxplot(LungCap~Gender)
> boxplot(LungCap[Gender=="female"], LungCap[Gender=="male"])
> AgeGroup=cut(Age,breaks = c(0,13,15,17,25),labels=c("<13","14/15","16/17","18+"))
#>cut-Divides x into intervals, breaks-cut points
> boxplot(LungCap~Smoke*AgeGroup,ylab="Lung
Capacity",main="BoxPlot",las=2,col=c(4,2))
#colours the boxes of the box plot blue (4) and red (2)
> hist(LungCap)
> hist(LungCap,freq=F)
> hist(LungCap,prob=T)
> hist(LungCap,prob=T,ylim = c(0,0.2))
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = 14)
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = 7)
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = c(0,2,4,6,8,10,12,14,16))
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = seq(from=0,to=16,by=1.5))
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = seq(from=0,to=16,by=1.5),main =
"Histogram", xlab = "Lung Capacity")
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = seq(from=0,to=16,by=1.5),main =
"Histogram", xlab = "Lung Capacity",las=1)
> lines(density(LungCap))
> lines(density(LungCap),col=2,lwd=3)
#Stem plot
> femaleLungCap=LungCap[Gender=="female"]
> stem(femaleLungCap)
> stem(femaleLungCap,scale=2)
#Scatter Plot
> cor(Age,Height)
[1] 0.8357368
#Legends
>plot(Age[Smoke=="no"], LungCap[Smoke=="no"],col=4, xlab = "Age", ylab = "LungCap")
# plots Age vs LungCap for non-smokers in blue, adding an xlab and a ylab
>points(Age[Smoke=="yes"], LungCap[Smoke=="yes"],col=2)
#adds points to existing plot for Age Vs LungCap for smokers with red color
>legend(x=3.5,y=12,legend=c("NON-SMOKE","SMOKE"),fill=c(4,2))
# adds a legend whose corner is placed at x=3.5, y=12, with entries NON-SMOKE and SMOKE filled with colour 4 (blue) and colour 2 (red)
>plot(Age[Smoke=="no"], LungCap[Smoke=="no"],col=4, xlab = "Age", ylab = "LungCap",
pch=16) #pch=16 plotting character 16 which is solid circle, 17 is triangle
> points(Age[Smoke=="yes"], LungCap[Smoke=="yes"],col=2,pch=17)
> legend(x=3.5,y=12,legend=c("NON-SMOKE","SMOKE"),col=c(4,2),pch=c(16,17))
>lines(smooth.spline(Age[Smoke=="no"],LungCap[Smoke=="no"]),col=4,lwd=3)
#adds a smoothing-spline line for Age vs LungCap for non-smokers, with col=4 (blue) and lwd=3 (line width 3 times the default)
>lines(smooth.spline(Age[Smoke=="yes"],LungCap[Smoke=="yes"]),col=2,lwd=3,lty=2)
#adds a smoothing-spline line for Age vs LungCap for smokers, with col=2 (red), lwd=3 (line width 3 times the default) and lty=2 (dashed line type)
>legend(x=3.5,y=12,legend=c("NON-SMOKE","SMOKE"),col=c(4,2),lty = c(1,2))
#Binomial Distribution
#Normal Distribution
#Correlation
#load the lung capacity data and attach
>plot(Age,LungCap,main = "scatter plot",las=1)
> cor(Age,LungCap,method = "pearson")
[1] 0.8196749
> cor(LungCap,Age,method = "pearson") #the order of the variables does not affect the result
[1] 0.8196749
> cor(LungCap,Age, method = "spearman")
[1] 0.8172464
> cor(LungCap,Age, method = "kendall")
[1] 0.639576
> cor.test(LungCap,Age, method = "pearson")
#Regression
>mod=lm(LungCap~Age) #LungCap is Y data and Age is X data
>summary(mod)
Residuals:
Min 1Q Median 3Q Max
-4.7799 -1.0203 -0.0005 0.9789 4.2650
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.14686 0.18353 6.249 7.06e-10 ***
Age 0.54485 0.01416 38.476 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> mod=lm(LungCap~Age)
> plot(mod)
> par(mfrow=c(2,2)) #plots the 4 plots in 2 by 2
> plot(mod)
#Cut command
#We will create height categories of A<50, B=50-55, C=55-60, D=60-65, E=65-70, F=70+
> catheight=cut(Height, breaks=c(0,50,55,60,65,70,100), labels=c("A","B","C","D","E","F"))
> Height[1:10]
[1] 62.1 74.7 69.7 71.0 56.9 58.7 63.3 70.4 70.5 59.2
> catheight[1:10]
[1] D F E F C C D F F C
Levels: A B C D E F
> catheight=cut(Height, breaks=c(0,50,55,60,65,70,100), labels=c("A","B","C","D","E","F"),
right = F) #Use right = F if we want, for example, 60 to fall in D (intervals closed on the left); by default intervals are closed on the right, so 60 falls in C
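#Multiple regression output: the rows below suggest a model with Age and Height as predictors of LungCap (presumably summary(lm(LungCap~Age+Height)); the command itself is not shown in these notes)
Coefficients: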
Estimate Std. Error t value Pr(>|t|)
(Intercept) -11.747065 0.476899 -24.632 < 2e-16 ***
#estimated mean Lung Capacity for someone with zero Height and Age
Age 0.126368 0.017851 7.079 3.45e-12 ***
#We associate an increase of one year in Age with an increase of 0.126 in Lung Capacity, holding Height fixed
Height 0.278432 0.009926 28.051 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> mod2=lm(LungCap~Age+Height+Smoke+Gender+Caesarean)
> summary(mod2)
> plot(mod2)
#Dummy indicators
#We will create height categories of A<50, B=50-55, C=55-60, D=60-65, E=65-70, F=70+
> catheight=cut(Height, breaks=c(0,50,55,60,65,70,100), labels=c("A","B","C","D","E","F"))
> mod1=lm(LungCap~catheight)
> summary(mod1)
Call:
lm(formula = LungCap ~ catheight)
Residuals:
Min 1Q Median 3Q Max
-4.0074 -0.7996 -0.0324 0.7935 3.8754
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.1486 0.2944 7.298 7.75e-13 ***
catheightB 1.5329 0.3424 4.476 8.83e-06 ***
catheightC 3.2768 0.3159 10.373 < 2e-16 ***
catheightD 5.0676 0.3102 16.335 < 2e-16 ***
catheightE 6.5837 0.3083 21.358 < 2e-16 ***
catheightF 8.6510 0.3083 28.065 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> mean(LungCap[catheight=="A"])
[1] 2.148611
> mean(LungCap[catheight=="C"]) #Mean=2.15+1.53(0)+3.28(1)+5.07(0)+…≈5.43 (intercept plus the catheightC coefficient)
[1] 5.42542
#More on explanatory variables that are not independent of each other, like Age and Smoke, which have a correlation of about 0.2
> head(Smoke)
[1] "no" "yes" "no" "no" "no" "no"
> r=as.numeric(Smoke=="yes")
> head(r)
[1] 0 1 0 0 0 0
> cor(Age,r)
[1] 0.2112322
> mod1=lm(LungCap~Age+Smoke)
> summary(mod1)
For a smoker For a non-Smoker
#SWITCH Statement
switch(as.character(x$gender[6]),
"Male"=x$MonthlyCharges[6]*0.8,"Female"=x$MonthlyCharges[6]*0.5)
#If the gender column of x is "Male" the charge gets a 20% discount; if it is "Female", a 50% discount
}
print(count1)
}
#####accuracy