Simple Statistics Functions in R
Simple Statistics Functions in R
Susan Holmes
In this session, we’ll learn how to simulate data with R using
random number generators and how to use some of the most
useful statistical functions.
Getting started
To get the same results again from a random number generator, you
must fix its starting point (the seed) with set.seed(). This matters
whenever you want to reproduce the results of a simulation
or algorithm, and it is especially important when debugging.
# Switch the session's working directory to ~/RWork.
# NOTE(review): hard-coded, machine-specific path; setwd() stops with an error
# when the directory does not exist - confirm the path before sharing.
setwd("~/RWork")
library("dplyr")
##
## Attaching package: 'dplyr'
Taking a subsample
# Nine observations to subsample from.
x <- c(1.9, 4.0, 4.4, 7.2, 3.8, 8.3, 8.7, 5.4, 8.8)
# Draw three of the nine values; each call gives a fresh random subsample.
sample(x, size = 3)
sample(x, size = 3)
# Given a single number n, sample() draws from the integers 1..n.
sample(9, size = 3)
## [1] 2 6 9
sample(9, size = 3)
## [1] 3 1 9
# Three Uniform(0, 1) draws, twice: the values differ between calls.
runif(n = 3)
runif(n = 3)
# Ten thousand uniform draws and their histogram.
tenK <- runif(n = 10000)
hist(tenK)
Now try:
# Fix the seed, draw 100 uniforms and look at their mean.
set.seed(198911)
vecu <- runif(n = 100)
mean(vecu)
## [1] 0.4724584
# Resetting the same seed reproduces exactly the same draws and mean.
set.seed(198911)
vecu <- runif(n = 100)
mean(vecu)
## [1] 0.4724584
# Histogram of 1000 uniforms: counts first, then on the density scale.
u1 <- runif(n = 1000)
hist(u1)
hist(u1, freq = FALSE)
# Same density-scale histogram with 100000 draws.
u1 <- runif(n = 100000)
hist(u1, freq = FALSE)
# Grid of 51 equally spaced points on [0, 1] (step 0.02).
vals <- seq(0, 1, length.out = 51)
head(vals)
# Uniform(0, 1) cumulative distribution function on the grid.
punif(vals)
## [1] 0.00 0.02 0.04 0.06 0.08 0.10 0.12 0.14 0.16 0.18 0.20 0.22 0.24 0.26
## [15] 0.28 0.30 0.32 0.34 0.36 0.38 0.40 0.42 0.44 0.46 0.48 0.50 0.52 0.54
## [29] 0.56 0.58 0.60 0.62 0.64 0.66 0.68 0.70 0.72 0.74 0.76 0.78 0.80 0.82
## [43] 0.84 0.86 0.88 0.90 0.92 0.94 0.96 0.98 1.00
# Uniform(0, 1) density on the grid.
dunif(vals)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Uniform(0, 1) quantile function on the grid.
qunif(vals)
## [1] 0.00 0.02 0.04 0.06 0.08 0.10 0.12 0.14 0.16 0.18 0.20 0.22 0.24 0.26
## [15] 0.28 0.30 0.32 0.34 0.36 0.38 0.40 0.42 0.44 0.46 0.48 0.50 0.52 0.54
## [29] 0.56 0.58 0.60 0.62 0.64 0.66 0.68 0.70 0.72 0.74 0.76 0.78 0.80 0.82
## [43] 0.84 0.86 0.88 0.90 0.92 0.94 0.96 0.98 1.00
$P(X \le a) = a$ for $0 \le a \le 1$
$P(X \le q_{25}) = 0.25$
$P(X \le m) = 0.5$
# Show the first few entries of sum5.
# NOTE(review): sum5 is not assigned anywhere above this line in the visible
# text, so this call fails if run in order - confirm where sum5 is created.
head(sum5)
Here is how you might do the same thing with sapply(). You
can plot this using the same commands above.
set.seed(0)
# Number of simulated sums.
reps <- 10000
# Number of Uniform(0, 1) draws added together in each sum. The visible text
# never assigns it; 5 is the value that reproduces the recorded output below
# (after set.seed(0) the first five draws sum to 3.015391).
nuni <- 5
# Time the simulation: one sum of nuni uniforms per replicate, collected
# into a numeric vector.
system.time(
  x1 <- sapply(seq_len(reps), function(i) sum(runif(n = nuni)))
) # simple apply
head(x1)
## [1] 3.015391 3.334659 1.515444 3.357100 2.701522 1.918804
# Same simulation with lapply(): the result is a list holding one sum per
# replicate instead of a numeric vector.
set.seed(0)
system.time(
  x1 <- lapply(seq_len(reps), function(i) sum(runif(n = nuni)))
) # list apply
head(x1)
## [[1]]
## [1] 3.015391
##
## [[2]]
## [1] 3.334659
##
## [[3]]
## [1] 1.515444
##
## [[4]]
## [1] 3.3571
##
## [[5]]
## [1] 2.701522
##
## [[6]]
## [1] 1.918804
# Draw all nuni * reps uniforms at once into a matrix with nuni rows, so that
# every column holds the draws of one replicate, then add up each column.
set.seed(0)
system.time(
  sum5 <- apply(
    matrix(runif(n = nuni * reps), nrow = nuni),
    MARGIN = 2,
    FUN = sum
  )
) # apply on a matrix
# Same column sums computed with colSums().
set.seed(0)
system.time(
  sum5 <- colSums(matrix(runif(n = nuni * reps), nrow = nuni))
) # using colSums
head(sum5)
summary(sum5)
Question: Do you think all the values in the range are equally
likely?
# library() stops immediately when ggplot2 is not installed; require() only
# returns FALSE and lets the plotting calls below fail later.
library(ggplot2)
# ggplot() needs a data frame; its single column is named sum5.
d5 <- data.frame(sum5)
# Histogram of counts with bins of width 0.1.
ggplot(d5, aes(sum5)) +
  geom_histogram(binwidth = 0.1)
# Same histogram on the density scale. after_stat(density) replaces the
# deprecated ..density.. notation.
ggplot(d5, aes(sum5)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.1)
# color sets the outline of the bars ...
ggplot(d5, aes(sum5)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.1, color = "red")
# ... while fill sets their interior.
ggplot(d5, aes(sum5)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.1, fill = "red")
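Question
Let $M = \max(U_1, U_2, U_3)$ be the largest of three independent
Uniform(0, 1) random variables. Estimate by simulation the
probability $P(M > 0.75)$. Do not use a for loop for any part of this
question.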
Computing a probability by
Monte Carlo
Estimate (to 3 significant digits) the probability
$P(\max(U_1, U_2, U_3) > 0.75)$.
# Monte Carlo estimate of P(max(U1, U2, U3) > 0.75) for three Uniform(0, 1)
# draws. The exact value is 1 - 0.75^3 = 0.578125.
# Number of simulated triples; everything below is sized from B so the
# simulation size is set in one place.
B <- 1000000
# One triple per column: 3 rows, B columns.
m3 <- matrix(runif(3 * B), nrow = 3, ncol = B)
# Column-wise maximum, vectorised over the three rows (same values as
# apply(m3, 2, max) without the per-column function call).
triple_max <- pmax(m3[1, ], m3[2, ], m3[3, ])
# Proportion of triples whose maximum exceeds 0.75.
p_hat <- mean(triple_max > 0.75)
p_hat
## [1] 0.578239
Answer: 0.578.
Many random variable
distributions
?Distributions
# NOTE(review): the statements that created Norm10K and printed its mean are
# missing from the text (only their output survived). They are restored here
# assuming 10000 standard-normal draws, which matches the recorded mean near 0
# and sd near 1 - confirm against the original slides.
Norm10K <- rnorm(10000)
mean(Norm10K)
## [1] -0.001394208
sd(Norm10K)
## [1] 1.005359
hist(Norm10K)
# Density-scale histogram of Norm10K. after_stat(density) replaces the
# deprecated ..density.. notation.
ggplot(data.frame(Norm10K), aes(x = Norm10K)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.1)
# Quantiles of the standard normal: first quartile, the 0% point and the
# third quartile.
qnorm(0.25)
## [1] -0.6744898
qnorm(0)
## [1] -Inf
qnorm(0.75)
## [1] 0.6744898
# 50000 draws from a normal with mean 2.5 and standard deviation 0.66.
norm2 <- rnorm(50000, mean = 2.5, sd = 0.66)
ggplot(data.frame(norm2), aes(x = norm2)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.1)
# Histogram of sum5 with the Normal(2.5, 0.66) density drawn on top.
# sd is passed by name, and linewidth replaces size, which is deprecated for
# line thickness.
ggplot(data.frame(sum5), aes(x = sum5)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.1) +
  stat_function(
    fun = function(x) dnorm(x, mean = 2.5, sd = 0.66),
    color = "red", linewidth = 1
  )
Making a quantile-quantile plot
with ggplot.
# Data frame with the single column sum5, as ggplot() requires.
df5 <- data.frame(sum5)
# QQ plot of sum5 drawn as points through the "qq" stat (default settings).
ggplot(df5, aes(sample = sum5)) +
  geom_point(stat = "qq")
# QQ plot against normal quantiles, passing the sample mean and standard
# deviation of sum5 as the distribution parameters.
ggplot(df5, aes(sample = sum5)) +
  stat_qq(
    distribution = qnorm,
    dparams = list(mean(sum5), sd(sum5))
  )
Question
# For each sample size n, draw 1000 samples of n standard-normal values (one
# sample per row of mat10k) and summarise the 1000 row maxima. The loop
# replaces four copies of the same three statements; n, mat10k and maxs end up
# holding the values for the last sample size (8800), exactly as before.
for (n in c(100, 200, 1000, 8800)) {
  mat10k <- matrix(rnorm(1000 * n), ncol = n)
  maxs <- apply(mat10k, 1, max)
  # print() is required inside a loop; top-level auto-printing does not apply.
  print(summary(maxs))
}
# Summary recorded in the original session for n = 200:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.762 2.459 2.701 2.749 2.984 4.584
Question
# 100000 x 21 matrix of Poisson counts with rate 0.5.
resp <- matrix(rpois(100000 * 21, lambda = 0.5), nrow = 100000)
# Mean and median of every column (MARGIN = 2), giving 21 values each.
# NOTE(review): if each row is meant to be one sample of 21 responses,
# MARGIN = 1 may have been intended - confirm against the question text.
means <- apply(resp, MARGIN = 2, FUN = mean)
medians <- apply(resp, MARGIN = 2, FUN = median)
Testing
How do we test the difference
between two samples?
# Open the help page for t.test().
?t.test
# Print the sleep data set: 20 rows with columns extra, group and ID.
sleep
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
## 7 3.7 1 7
## 8 0.8 1 8
## 9 0.0 1 9
## 10 2.0 1 10
## 11 1.9 2 1
## 12 0.8 2 2
## 13 1.1 2 3
## 14 0.1 2 4
## 15 -0.1 2 5
## 16 4.4 2 6
## 17 5.5 2 7
## 18 1.6 2 8
## 19 4.6 2 9
## 20 3.4 2 10
?sleep
# The columns are reached through with() / data = instead of attach(sleep):
# attach() leaves `extra` and `group` on the search path, where they can
# silently mask, or be masked by, other objects of the same name.
# Per-subject difference in extra sleep, group 2 minus group 1 (both groups
# list subjects ID 1..10 in the same order).
sleep1 <- with(sleep, extra[group == 2] - extra[group == 1])
sleep1
## [1] 1.2 2.4 1.3 1.3 0.0 1.0 1.8 0.8 4.6 1.4
# One-sample t-test that the mean difference is zero.
t.test(sleep1)
##
## One Sample t-test
##
## data: sleep1
## t = 4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.7001142 2.4598858
## sample estimates:
## mean of x
## 1.58
# The test result is a list; individual pieces such as the p-value can be
# extracted by name.
res <- t.test(sleep1)
res$p.value
## [1] 0.00283289
# Paired t-test on the two groups (group 1 minus group 2, hence the flipped
# sign). The two vectors are passed explicitly because the formula method of
# t.test() no longer accepts `paired` in current R versions.
with(sleep, t.test(extra[group == 1], extra[group == 2], paired = TRUE))
##
## Paired t-test
##
## data: extra[group == 1] and extra[group == 2]
## t = -4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.4598858 -0.7001142
## sample estimates:
## mean of the differences
## -1.58
# Unpaired (Welch) two-sample t-test, ignoring the pairing by subject.
t.test(extra ~ group, data = sleep, paired = FALSE)
##
## Welch Two Sample t-test
##
## data: extra by group
## t = -1.8608, df = 17.776, p-value = 0.07939
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.3654832 0.2054832
## sample estimates:
## mean in group 1 mean in group 2
## 0.75 2.33
library(dplyr)
# Total number of births for each day of the week, one row per day.
# The column keeps the name `sum` because later code reads sumsperday$sum.
sumsperday <- birthn %>%
  group_by(day_of_week) %>%
  summarise(sum = sum(births)) %>%
  # arrange() with no columns leaves the rows untouched; name the sort key so
  # the ordering by day shown below is explicit.
  arrange(day_of_week)
sumsperday
## # A tibble: 7 × 2
## day_of_week sum
## <int> <int>
## 1 1 9316001
## 2 2 10274874
## 3 3 10109130
## 4 4 10045436
## 5 5 9850199
## 6 6 6704495
## 7 7 5886889
# Chi-squared test on the seven daily totals. With a single vector and no
# probabilities supplied, chisq.test() runs the "given probabilities" test
# reported in the output below.
chisq.test(sumsperday$sum)
##
## Chi-squared test for given probabilities
##
## data: sumsperday$sum
## X-squared = 2210500, df = 6, p-value < 2.2e-16
Regression
library(HistData)
?Galton
# The Galton columns are reached through with() / data = instead of
# attach(Galton), which would leave `parent` and `child` on the search path.
# Scatterplot of child height against parent height.
with(Galton, plot(parent, child))
# Histograms of both variables, default bins first and then 50 breaks.
with(Galton, hist(parent))
with(Galton, hist(child, breaks = 50))
with(Galton, hist(parent, breaks = 50))
# Many points coincide, so jitter both coordinates to show the overlap.
with(
  Galton,
  plot(jitter(parent, 3), jitter(child, 3), pch = 19, col = "orange")
)
with(Galton, cor(parent, child))
## [1] 0.4587624
# Linear regression of child on parent. The fit is stored in reslm because
# the str() and plot() calls further down use that name, which was never
# assigned in the visible text.
# Note: the str() dump further below was recorded from a fit made without
# `data =`, so its `$ call` entry reads lm(formula = child ~ parent).
reslm <- lm(child ~ parent, data = Galton)
reslm
##
## Call:
## lm(formula = child ~ parent, data = Galton)
##
## Coefficients:
## (Intercept) parent
## 23.9415 0.6463
# Show the internal structure of reslm (a list of class "lm", as the output
# below shows).
# NOTE(review): reslm is not assigned anywhere above in the visible text -
# presumably the stored result of lm(child ~ parent); confirm.
str(reslm)
## List of 12
## $ coefficients : Named num [1:2] 23.942 0.646
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "parent"
## $ residuals : Named num [1:928] -7.81 -6.51 -4.57 -3.93 -3.6 ...
## ..- attr(*, "names")= chr [1:928] "1" "2" "3" "4" ...
## $ effects : Named num [1:928] -2074.19 -35.17 -4.66 -4.12 -3.86 ...
## ..- attr(*, "names")= chr [1:928] "(Intercept)" "parent" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:928] 69.5 68.2 66.3 65.6 65.3 ...
## ..- attr(*, "names")= chr [1:928] "1" "2" "3" "4" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:928, 1:2] -30.4631 0.0328 0.0328 0.0328 0.0328 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:928] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "parent"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.03 1
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 926
## $ xlevels : Named list()
## $ call : language lm(formula = child ~ parent)
## $ terms :Classes 'terms', 'formula' language child ~ parent
## .. ..- attr(*, "variables")= language list(child, parent)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "child" "parent"
## .. .. .. ..$ : chr "parent"
## .. ..- attr(*, "term.labels")= chr "parent"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(child, parent)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "child" "parent"
## $ model :'data.frame': 928 obs. of 2 variables:
## ..$ child : num [1:928] 61.7 61.7 61.7 61.7 61.7 62.2 62.2 62.2 62.2 62.2 ...
## ..$ parent: num [1:928] 70.5 68.5 65.5 64.5 64 67.5 67.5 67.5 66.5 66.5 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language child ~ parent
## .. .. ..- attr(*, "variables")= language list(child, parent)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "child" "parent"
## .. .. .. .. ..$ : chr "parent"
## .. .. ..- attr(*, "term.labels")= chr "parent"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(child, parent)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "child" "parent"
## - attr(*, "class")= chr "lm"
# Plot the fitted model object (class "lm" according to the str() output
# above).
plot(reslm)
Summary of this Session:
The sample function generates random subsamples of the
data.
Followup activity