##### Demonstration Script #0c
##### MATH322
#####
##### Data analysis and Bootstrapping
#####


### Packages extend functionality
### Or, load several functions without creating a package
source("http://rfs.kvasaheim.com/math322.R")




### Load data, Method 1 (small amounts)

height = c(73, 68, 72, 68, 66)

# Measures of center
mean(height)
median(height)

# Measures of spread
sd(height)
var(height)
IQR(height)

# Measures of position
zscore(68, height)
quantile(height,0.90)

# Those are all sample statistics, because they _______


# If the heights are randomly drawn from the target population,
# we can use that sample to draw conclusions about the population.
#
# In STAT200, we learned how to estimate the population mean:

t.test(height)

# However, that t-test has one major assumption:
#  1. X ~ Normal
#

hist(height)
normoverlay(height, from=64, to=76)

# Do the data come from a Normal distribution??? Maybe, maybe not.
# If so, then the t-test is appropriate
# If not, then what do we do?
#
# Bootstrap!!!
#
# Also called "resampling"
#

mn = numeric()

for(i in 1:1e5) {
  samp = sample(height, size=5, replace=TRUE)
  mn[i] = mean(samp)
}

hist(mn)

mean(mn)

# Confidence interval
quantile(mn, c(0.025,0.975))

# We are 95% confident that the mean height is between 67.2 and 71.8 inches.





### This process can be used to estimate *any* measure of center

md = numeric()

for(i in 1:1e5) {
  samp = sample(height, size=5, replace=TRUE)
  md[i] = median(samp)
}
hist(md)

# Confidence interval
quantile(md, c(0.025,0.975))
# We are 95% confident that the median height is between 66 and 73 inches.



### You can use the process to obtain estimates of other statistics,
#   but the results are biased. They can be unbiased, but the fixes 
#   are beyond the scope of this example.

sg = numeric()
for(i in 1:1e5) {
  samp = sample(height, size=5, replace=TRUE)
  sg[i] = sd(samp)
}
hist(sg)
quantile(sg, c(0.025,0.975))
# We are 95% confident that the standard deviation of the
# height is between 0.89 and 3.56 inches.


ts = numeric()
for(i in 1:1e5) {
  samp = sample(height, size=5, replace=TRUE)
  ts[i] = mean(samp)/sd(samp)*sqrt(5)
}
hist(ts)
quantile(ts, c(0.025,0.975))
# We are 95% confident that the value of TS is between 43 and 169.






#####
#####
#####




### Load data, Method 2 (large)

dt = read.csv("http://rfs.kvasaheim.com/data/crime.csv")
attach(dt)

names(dt)
summary(dt)

hist(vcrime00)
t.test(vcrime00, mu=500)

plot(vcrime90,vcrime00)
abline(a=0,b=1, col="salmon",lwd=2)
t.test(vcrime90,vcrime00)



# Bootstrapping the mean
mn = numeric()
for(i in 1:1e5) {
  samp = sample(vcrime00, size=51, replace=TRUE)
  mn[i] = mean(samp)
}
normoverlay(vcrime00)
normoverlay(mn)

quantile(mn, c(0.025,0.975))

# Assuming Normality
t.test(vcrime00, mu=500)







# Bootstrapping the median
md = numeric()
for(i in 1:1e5) {
  samp = sample(vcrime90, size=51, replace=TRUE)
  md[i] = median(samp)
}
normoverlay(vcrime90)
normoverlay(md)

quantile(md, c(0.025,0.975))

# Assuming Normality
t.test(vcrime90, mu=500)