Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
0% found this document useful (0 votes)
11 views

IntroR 2

Uploaded by

moad77181
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views

IntroR 2

Uploaded by

moad77181
Copyright
© © All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 18

> x=9

> x1=11

> 1x=12
Error: unexpected symbol in "1x"

> x2="Rahmath"

> x2
[1] "Rahmath"

> 11+24
[1] 35

> 7*9
[1] 63

> y=6

> x+y
[1] 15

> x^2 + y^2


[1] 117

> x-y
[1] 3

> sqrt(x)
[1] 3

> log(x)
[1] 2.197225

> ?log #help for log

> log10(x)
[1] 0.9542425

> #This is a comment line

> x1=c(1,3,5,7,9)

> gender=c("male","female")

> 2:7
[1] 2 3 4 5 6 7
> seq(from=1,to=7,by=2)
[1] 1 3 5 7

> rep(1,time=5)
[1] 1 1 1 1 1

> rep("rahma",times=3)
[1] "rahma" "rahma" "rahma"

> rep(1:3,time=3)
[1] 1 2 3 1 2 3 1 2 3

> x1
[1] 1 3 5 7 9

> x1[3]
[1] 5

#Clear Console (Ctrl L)

> mat=matrix(c(1,2,3,4,5,6,7,8,9),nrow=3,byrow=TRUE)
> mat
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 4 5 6
[3,] 7 8 9

> mat=matrix(c(1,2,3,4,5,6,7,8,9),nrow=3,byrow=FALSE)
> mat
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9

> mat*3
[,1] [,2] [,3]
[1,] 3 12 21
[2,] 6 15 24
[3,] 9 18 27

> mat-3
[,1] [,2] [,3]
[1,] -2 1 4
[2,] -1 2 5
[3,] 0 3 6

> mat*mat
[,1] [,2] [,3]
[1,] 1 16 49
[2,] 4 25 64
[3,] 9 36 81

> mat/6
[,1] [,2] [,3]
[1,] 0.1666667 0.6666667 1.166667
[2,] 0.3333333 0.8333333 1.333333
[3,] 0.5000000 1.0000000 1.500000

> m=mat

>m
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9

> m[2,]
[1] 2 5 8

#create a Excel File of data and save as csv format #comma separated value

>x1=read.csv(file.choose(),header=T) # to import the excel csv file

>x2=read.table(file.choose(),header=T,sep=",")

#create a Excel File of data and save as txt format #tab delimited file

> x1=read.delim(file.choose(),header=T)

> x1=read.table(file.choose(),header=T,sep="\t")

#To export data from R to other formats

> write.table(Book3_1_,file = "bee.csv", sep=",")


(Book3_1_.....file in R,file = "bee.csv"…….destination file name, sep=",")

> names(LungCapData)
[1] "LungCap" "Age" "Height" "Smoke" "Gender" "Caesarean"

> rm(x1) #To remove file or data from R workspace

> dim(LungCapData) #dimension of data file


[1] 10 6

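> head(x2) # head of file….first 6 rows
LungCap Age Height Smoke Gender Caesarean
1 6.475 6 62.1 no male no
2 10.125 18 74.7 yes female no
3 9.550 16 69.7 no female yes
4 11.125 14 71.0 no male no
5 4.800 5 56.9 no male no
6 6.225 11 58.7 no female no

>head(x2,4) # head of file….first 4 rows
LungCap Age Height Smoke Gender Caesarean
1 6.475 6 62.1 no male no
2 10.125 18 74.7 yes female no
3 9.550 16 69.7 no female yes
4 11.125 14 71.0 no male no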

> tail(x2) # Tail of file….last 6 rows

>sample(x2,5) #x2 is a data frame, so this picks 5 of its columns at random; for 5 random rows use x2[sample(nrow(x2),5),]

>table(x2$Height)

> names(x2) # names of data


[1] "LungCap" "Age" "Height" "Smoke" "Gender" "Caesarean"

> x2[-(4:722),] # all data except rows 4 to 722


LungCap Age Height Smoke Gender Caesarean
1 6.475 6 62.1 no male no
2 10.125 18 74.7 yes female no
3 9.550 16 69.7 no female yes
723 3.850 11 60.5 yes female no
724 9.825 15 64.9 no female no
725 7.100 10 67.7 no male no

> x2[5:9,] #data from 5 to 9 rows and all columns


LungCap Age Height Smoke Gender Caesarean
5 4.800 5 56.9 no male no
6 6.225 11 58.7 no female no
7 4.950 8 63.3 no male yes
8 7.325 11 70.4 no male no
9 8.875 15 70.5 no male no

> mean(x$Height) #mean of height column in file x


[1] 64.83628

> attach(x) # to separately identify the column variables without file name #detach opposite
> mean(Height)
[1] 64.83628

> class(Age)
[1] "integer"

> class(Smoke)
[1] "character"
> class(Height)
[1] "numeric"
>x$Gender=as.factor(x$Gender) # to convert the character data to factor

> f=x[Gender=="female",] #separates the rows of x where Gender is female and stores them in f
> m=x[Gender=="male",] #separates the rows of x where Gender is male and stores them in m

> maleover15=x[Gender=='male'& Age>15,] #separates the rows of x where Gender is male
#and Age is over 15, and stores them in maleover15

> temp=Age>15 #logical statements


> temp[1:5]
[1] FALSE TRUE TRUE FALSE FALSE

> temp1=as.numeric(Age>15)
> temp1[1:5]
[1] 0 1 1 0 0
> malesmoke=Gender=="male" & Smoke=="yes"
> malesmoke[1:5]
[1] FALSE FALSE FALSE FALSE FALSE

> malesmoke=as.numeric(Gender=="male" & Smoke=="yes")


> malesmoke[1:5]
[1] 0 0 0 0 0

> moredata=cbind(x,malesmoke) #binds column-wise data in file x and in file malesmoke


> View(moredata)

> getwd() #get the current working directory


[1] "/Users/rahmathullabaig"

> Rwd="/Users/rahmathullabaig/TheR" #giving a name to the path of working directory


> setwd(Rwd)
> getwd()
[1] "/Users/rahmathullabaig/TheR"

> save.image("first.Rdata") # saves workspace in current directory under file name first
#can also be done using>>session>>save workspace As…
> load("first.Rdata") #loads workspace data saved in file first.Rdata
> load(file.choose())
#for R Studio preferences

# Using the 'APPLY' function in R


# read in the "StockExample.csv" data, and attach it
>StockData <- read.table(file="~/TheR/StockExample.csv", sep=",",
header=T,row.names=1)
# check the data
>StockData

# get the help menu


>?apply

# calculate the mean price of each stock


>apply(X=StockData, MARGIN=2, FUN=mean)

# calculate the mean price of each stock, removing any NAs


>apply(X=StockData, MARGIN=2, FUN=mean, na.rm=TRUE)

# store the mean in an object called AVG


>AVG <- apply(X=StockData, MARGIN=2, FUN=mean, na.rm=TRUE)
>AVG

# notice that we don't need to include "MARGIN", etc, as long


# as we enter info in the specified order
>apply(StockData, 2, mean, na.rm=TRUE)

# do the same, but using the ColMeans command


>colMeans(StockData, na.rm=TRUE)

# find the MAXIMUM stock price, for each stock


>apply(X=StockData, MARGIN=2, FUN=max, na.rm=TRUE)

# find the 20th and 80th PERCENTILE, for each stock


>apply(X=StockData, MARGIN=2, FUN=quantile, probs=c(0.2, .80), na.rm=TRUE)

# create a plot of each column, using a "line"


>apply(X=StockData, MARGIN=2, FUN=plot, type="l")

# we can also send the plot function more arguments, such as


# titles, axes labels, and so forth...
>apply(X=StockData, MARGIN=2, FUN=plot, type="l", main="stock", ylab="Price",
xlab="Day")
# now let's calculate the SUM of each row (MARGIN=1)
>apply(X=StockData, MARGIN=1, FUN=sum, na.rm=TRUE)

# do the same, but with the rowSums command


>rowSums(StockData, na.rm=TRUE)

# make a nice plot of these...


>plot(apply(X=StockData, MARGIN=1, FUN=sum, na.rm=TRUE), type="l"
,ylab="Total Market Value", xlab="Day", main="Market Trend")
# and add in some nice coloured points...
>points(apply(X=StockData, MARGIN=1, FUN=sum, na.rm=TRUE),
pch=16, col="blue")

#BarPlot and Pie Chart


>load("~/TheR/first.Rdata")
> ?barplot
> attach(x)
> count=table(Gender)
> count
Gender
female male
358 367
> percent=table(Gender)/725
> percent
Gender
female male
0.4937931 0.5062069
> barplot(count)
> barplot(count, main="Title", xlab = "Gender", ylab = "Count")
> barplot(percent)
> barplot(percent, main="Title", xlab = "Gender", ylab = "Count")
> barplot(percent, main="Title", xlab = "Gender", ylab = "Count", las=1)
> barplot(percent, main="Title", xlab = "Gender", ylab = "Count", las=1, names.arg =
c("Female", "Male"))
> barplot(percent, main="Title", xlab = "Gender", ylab = "Count", las=1, names.arg =
c("Female", "Male"), horiz = T)
> pie(count,main = "Title")
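> pie(count,main = "Title", names.arg=c("Female","Male")) #note: names.arg is a barplot() argument; pie() takes slice labels via labels=, e.g. labels=c("Female","Male")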

#Box Plot

> boxplot(LungCap)
> quantile(LungCap,probs = c(0,0.25,0.5,0.75,1))
0% 25% 50% 75% 100%
0.507 6.150 8.000 9.800 14.675
> boxplot(LungCap,main="Boxplot",ylab="Lung Capacity", las=1)
> boxplot(LungCap,main="Boxplot",ylab="Lung Capacity", ylim=c(0,16), las=1)
> boxplot(LungCap~Gender)
> boxplot(LungCap[Gender=="female"], LungCap[Gender=="male"])
> AgeGroup=cut(Age,breaks = c(0,13,15,17,25),labels=c("<13","14/15","16/17","18+"))
#cut() divides x into intervals; breaks are the cut points. Intervals are right-closed by default, so the first group is (0,13] and includes age 13

> boxplot(LungCap,ylab="Lung Capacity",main="BoxPlot",las=1)


> boxplot(LungCap~Smoke,ylab="Lung Capacity",main="BoxPlot",las=1)
> boxplot(LungCap[Age>=18]~Smoke[Age>=18],ylab="Lung
Capacity",main="BoxPlot",las=1)

> boxplot(LungCap~Smoke*AgeGroup,ylab="Lung Capacity",main="BoxPlot",las=2)

> boxplot(LungCap~Smoke*AgeGroup,ylab="Lung
Capacity",main="BoxPlot",las=2,col=c(4,2))
#colur of box plot with blue and Red color

> hist(LungCap)
> hist(LungCap,freq=F)
> hist(LungCap,prob=T)
> hist(LungCap,prob=T,ylim = c(0,0.2))
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = 14)
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = 7)
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = c(0,2,4,6,8,10,12,14,16))
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = seq(from=0,to=16,by=1.5))
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = seq(from=0,to=16,by=1.5),main =
"Histogram", xlab = "Lung Capacity")
> hist(LungCap,prob=T,ylim = c(0,0.2),breaks = seq(from=0,to=16,by=1.5),main =
"Histogram", xlab = "Lung Capacity",las=1)

> lines(density(LungCap))
> lines(density(LungCap),col=2,lwd=3)

#Stem plot
> femaleLungCap=LungCap[Gender=="female"]
> stem(femaleLungCap)

> stem(femaleLungCap,scale=2)

#Scatter Plot
> cor(Age,Height)
[1] 0.8357368

> plot(Age,Height,main="ScatterPlot",xlab = "AGE",ylab = "HEIGHT",xlim =


c(0,20),pch=8, col=2,las=1)
>?text
>plot(Age,LungCap,main="Scatter Plot", las=1)
> text(x=5,y=12,label="Correlation=0.82") # text at x-axis “5” and y-axis “12” it writes label.
> text(x=5,y=12,label="Correlation=0.82",adj=0) # Label….begin at x-axis “5”
> text(x=5,y=12,label="Correlation=0.82",adj=1) # Label….ends at x-axis “5”
> text(x=5,y=12,label="Corr=0.82",adj=0,cex=0.5, col=4,font=4) #cex=size of font 50%,
col=4. Blue color, font=4 italic bold
>abline(h=mean(LungCap),col=2, lwd=2) # adds a horizontal line with col=2 red color with
lwd=2 line width is 2
>mtext(text="r=0.82", side=4, adj=1) # adds a text at Margins side=4 and at the end of
margin
>mtext(text="r=0.82", side=3, adj=1,las=1, col=2, font=4, cex=1.80)
# adds text in the margin on top of the plot as side=3, col=2 red, font=4 bold italic, adj=1 top right
# corner, cex=1.8 80% larger text size

#Legends
>plot(Age[Smoke=="no"], LungCap[Smoke=="no"],col=4, xlab = "Age", ylab = "LungCap")
# plots Age Vs LungCap for non smokers with blue color, adding a xlab and ylab
>points(Age[Smoke=="yes"], LungCap[Smoke=="yes"],col=2)
#adds points to existing plot for Age Vs LungCap for smokers with red color
>legend(x=3.5,y=12,legend=c("NON-SMOKE","SMOKE"),fill=c(4,2))
# adds a legend which begins x=3.5 and y=12 NON-SMOKE and SMOKE fill it 4 blue color
and 2 red color
>plot(Age[Smoke=="no"], LungCap[Smoke=="no"],col=4, xlab = "Age", ylab = "LungCap",
pch=16) #pch=16 plotting character 16 which is solid circle, 17 is triangle
> points(Age[Smoke=="yes"], LungCap[Smoke=="yes"],col=2,pch=17)
> legend(x=3.5,y=12,legend=c("NON-SMOKE","SMOKE"),col=c(4,2),pch=c(16,17))

>lines(smooth.spline(Age[Smoke=="no"],LungCap[Smoke=="no"]),col=4,lwd=3)
#adds lines with smooth spline for Age Vs LungCap for non-smokers with col=4 blue col and
lwd=3 line width 3 times
>lines(smooth.spline(Age[Smoke=="yes"],LungCap[Smoke=="yes"]),col=2,lwd=3,lty=2)
#adds lines with smooth spline for Age Vs LungCap for smokers with col=2 red col and
lwd=3 line width 3 times, lty=2 line type dashed line
>legend(x=3.5,y=12,legend=c("NON-SMOKE","SMOKE"),col=c(4,2),lty = c(1,2))

#Binomial Distribution

>dbinom(x=3,size=20,p=1/6) # probability of exactly 3 successes, P(X=3)


>dbinom(x=0:3,size=20,p=1/6) # probabilities of exactly 0,1,2,3 successes, P(X=0),...,P(X=3)
>pbinom(q=3,size=20,p=1/6,lower.tail=T) #P(x<=3)=P(x=0)+ P(x=1)+ P(x=2)+ P(x=3)

#Poisson Distribution Lambda=7


>dpois(x=4, lambda=7) # probability of exactly 4 occurrences, P(X=4)
>dpois(x=0:4, lambda=7) #probabilities of exactly 0,1,2,3,4 occurrences, P(X=0),...,P(X=4)
>ppois(q=4, lambda=7,lower.tail=T) #P(x<=4)=P(x=0)+ P(x=1)+ P(x=2)+ P(x=3)+P(x=4)
>ppois(q=12, lambda=7,lower.tail=F) #P(x>12)=P(x>=13); for P(x>=12) use q=11

#Normal Distribution

>pnorm(q=70, mean=75, sd=5, lower.tail=T) #P(x<=70)


>pnorm(q=85, mean=75, sd=5, lower.tail=F) #P(x>=85)
>qnorm(p=0.25, mean=75, sd=5, lower.tail=T) #Find Q1=First quartile
#plotting density
>x=seq(from=55,to=95,by=0.25)
>x1=dnorm(x,mean=75,sd=5)
>plot(x,x1)
>plot(x,x1,type = "l")
#random sample from normal distribution
>x2=rnorm(n=40, mean=75,sd=5)
>hist(x2)

#Correlation
#load the lung capacity data and attach
>plot(Age,LungCap,main = "scatter plot",las=1)
> cor(Age,LungCap,method = "pearson")
[1] 0.8196749
> cor(LungCap,Age,method = "pearson") #order does not affect the result
[1] 0.8196749
> cor(LungCap,Age, method = "spearman")
[1] 0.8172464
> cor(LungCap,Age, method = "kendall")
[1] 0.639576
> cor.test(LungCap,Age, method = "pearson")

Pearson's product-moment correlation

data: LungCap and Age


t = 38.476, df = 723, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.7942660 0.8422217
sample estimates:
cor
0.8196749
> cor.test(LungCap,Age, method = "pearson",conf.level = 0.99)

Pearson's product-moment correlation

data: LungCap and Age


t = 38.476, df = 723, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
99 percent confidence interval:
0.7856499 0.8487564
sample estimates:
cor
0.8196749

#Regression
>mod=lm(LungCap~Age) #LungCap is Y data and Age is X data
>summary(mod)
Residuals:
Min 1Q Median 3Q Max
-4.7799 -1.0203 -0.0005 0.9789 4.2650

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.14686 0.18353 6.249 7.06e-10 ***
Age 0.54485 0.01416 38.476 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.526 on 723 degrees of freedom #Sqrt(mse)


Multiple R-squared: 0.6719, Adjusted R-squared: 0.6714
F-statistic: 1480 on 1 and 723 DF, p-value: < 2.2e-16
> plot(Age,LungCap)
> abline(mod)
>anova(mod)

> mod=lm(LungCap~Age)
> plot(mod)
> par(mfrow=c(2,2)) #plots the 4 plots in 2 by 2
> plot(mod)

#Cut command
#We will create height categories of A<50, B=50-55, C=55-60, D=60-65, E=65-70, F=70+
> catheight=cut(Height, breaks=c(0,50,55,60,65,70,100), labels=c("A","B","C","D","E","F"))
> Height[1:10]
[1] 62.1 74.7 69.7 71.0 56.9 58.7 63.3 70.4 70.5 59.2
> catheight[1:10]
[1] D F E F C C D F F C
Levels: A B C D E F
> catheight=cut(Height, breaks=c(0,50,55,60,65,70,100), labels=c("A","B","C","D","E","F"),
right = F) #If we want for example 60 to lie in D (By default it lies to left in C)

#Multiple Regression Models


> mod2=lm(LungCap~Age+Height)
> summary(mod2)
Call:
lm(formula = LungCap ~ Age + Height)
Residuals:
Min 1Q Median 3Q Max
-3.4080 -0.7097 -0.0078 0.7167 3.1679

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -11.747065 0.476899 -24.632 < 2e-16 ***
#estimated mean Lung Capacity for someone with zero Height and Age
Age 0.126368 0.017851 7.079 3.45e-12 ***
#We associate an increase in one year of age with an increase of 0.126 in lung Capacity
Height 0.278432 0.009926 28.051 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.056 on 722 degrees of freedom


#How far the observed lung capacities are from the predicted or fitted lung capacities (Ŷ)……idea
#of the error e = Y - Ŷ
Multiple R-squared: 0.843, Adjusted R-squared: 0.8425
#84% variation in lung capacity can be explained by our model by Age And height
F-statistic: 1938 on 2 and 722 DF, p-value: < 2.2e-16

> mod2=lm(LungCap~Age+Height+Smoke+Gender+Caesarean)
> summary(mod2)
> plot(mod2)
#Dummy indicators
#We will create height categories of A<50, B=50-55, C=55-60, D=60-65, E=65-70, F=70+
> catheight=cut(Height, breaks=c(0,50,55,60,65,70,100), labels=c("A","B","C","D","E","F"))

> mod1=lm(LungCap~catheight)
> summary(mod1)

Call:
lm(formula = LungCap ~ catheight)

Residuals:
Min 1Q Median 3Q Max
-4.0074 -0.7996 -0.0324 0.7935 3.8754

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.1486 0.2944 7.298 7.75e-13 ***
catheightB 1.5329 0.3424 4.476 8.83e-06 ***
catheightC 3.2768 0.3159 10.373 < 2e-16 ***
catheightD 5.0676 0.3102 16.335 < 2e-16 ***
catheightE 6.5837 0.3083 21.358 < 2e-16 ***
catheightF 8.6510 0.3083 28.065 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.249 on 719 degrees of freedom


Multiple R-squared: 0.7814, Adjusted R-squared: 0.7798
F-statistic: 513.9 on 5 and 719 DF, p-value: < 2.2e-16

> mean(LungCap[catheight=="A"])
[1] 2.148611
> mean(LungCap[catheight=="C"]) #Mean=2.15+1.53(0)+3.28(1)+5.07(0)+….=5.43
[1] 5.42542

#More on non-dependent variables like Age and Smoke with correlation of 0.2
> head(Smoke)
[1] "no" "yes" "no" "no" "no" "no"
> r=as.numeric(Smoke=="yes")
> head(r)
[1] 0 1 0 0 0 0
> cor(Age,r)
[1] 0.2112322

> mod1=lm(LungCap~Age+Smoke)
> summary(mod1)
For a smoker For a non-Smoker

For a smoker, mean lung capacity at a given Age is lower by 0.649 (estimated coefficient -0.649)

#Interaction of multi-regression model


> mod=lm(LungCap~Age+Smoke+Age:Smoke) #Interaction Age:Smoke
> summary(mod)
The interaction is not statistically significant (0.377)

#IF ELSE Statement


# Report the internet service type recorded for the third row of x.
# The two named services are mutually exclusive, so the order in which
# they are tested does not change which branch runs.
if (x$InternetService[3] == "DSL") {
  print("DSL")
} else if (x$InternetService[3] == "Fiber optic") {
  print("Fiber optic")
} else {
  # Reached for any value other than the two services named above.
  print("No Internet service is not fiber optic nor DSL")
}

#SWITCH Statement
# Discounted monthly charge for the sixth row of x, selected by gender:
# "Male" is charged 80% of MonthlyCharges (a 20% discount) and "Female"
# 50% (a 50% discount). There is no default arm for other values.
switch(
  as.character(x$gender[6]),
  Male = x$MonthlyCharges[6] * 0.8,
  Female = x$MonthlyCharges[6] * 0.5
)

#For statement to count number of DSL service users


# Count the rows of x whose InternetService is "DSL", using an index loop.
count1 <- 0
# seq_len() gives an empty sequence when x has no rows, whereas
# 1:nrow(x) would iterate over c(1, 0) and fail on the missing element.
for (i in seq_len(nrow(x))) {
  if (x$InternetService[i] == "DSL") {
    count1 <- count1 + 1
  }
}
# Print once, after the loop, so that only the final total is shown
# instead of the running count on every iteration.
print(count1)

#For statement1 to count number of DSL service users


# Count the "DSL" entries by iterating over the column values directly
# rather than over row indices.
count1 <- 0
for (service in x$InternetService) {
  # Skip every entry that is not DSL; only DSL entries reach the increment.
  if (service != "DSL") {
    next
  }
  count1 <- count1 + 1
}
print(count1)

#While statement to count number of Tenure are 2 months


# Count the rows of x whose tenure equals 2, walking the rows with an
# explicit counter i that ends at nrow(x) + 1.
count12 <- 0
i <- 1
repeat {
  # Stop as soon as the counter has moved past the last row.
  if (i > nrow(x)) {
    break
  }
  if (x$tenure[i] == 2) {
    count12 <- count12 + 1
  }
  i <- i + 1
}

#####accuracy

# Fit several forecasting models to the first column of x and print a
# summary and accuracy measures for each.
# NOTE(review): rwf(), ses(), holt(), auto.arima() and accuracy() are not
# defined in this file; presumably they come from the 'forecast' package,
# which would have to be loaded before this runs — confirm.
# Baseline fit on the raw column (presumably a random-walk forecast), h = 1.
fit1 <- rwf(x[, 1], h = 1)


summary(fit1)
accuracy(fit1)
# Convert the first column of x to a time series.
# NOTE(review): start = 1, end = 25, frequency = 1 hard-codes a series of
# 25 observations — confirm this matches the number of rows in x.
dat_ts <- ts(x[, 1], start =1, end = 25, frequency = 1)
# SES model fitted to the time series, h = 1
se_model <- ses(dat_ts, h = 1)
summary(se_model)
accuracy(se_model)
# Holt model fitted to the same series, h = 1
holt_model <- holt(dat_ts, h = 1)
summary(holt_model)
accuracy(holt_model)
# ARIMA model chosen by auto.arima() for the same series
arima_model <- auto.arima(dat_ts)
summary(arima_model)
accuracy(arima_model)

You might also like