##### Demonstration Script #0c ##### MATH322 ##### ##### Data analysis and Bootstrapping ##### ### Packages extend functionality ### Or, load several functions without creating a package source("http://rfs.kvasaheim.com/math322.R") ### Load data, Method 1 (small amounts) height = c(73, 68, 72, 68, 66) # Measures of center mean(height) median(height) # Measures of spread sd(height) var(height) IQR(height) # Measures of position zscore(68, height) quantile(height,0.90) # Those are all sample statistics, because they _______ # If the heights are randomly drawn from the target population, # we can use that sample to draw conclusions about the population. # # In STAT200, we learned how to estimate the population mean: t.test(height) # However, that t-test has one major assumption: # 1. X ~ Normal # hist(height) normoverlay(height, from=64, to=76) # Do the data come from a Normal distribution??? Maybe, maybe not. # If so, then the t-test is appropriate # If not, then what do we do? # # Bootstrap!!! # # Also called "resampling" # mn = numeric() for(i in 1:1e5) { samp = sample(height, size=5, replace=TRUE) mn[i] = mean(samp) } hist(mn) mean(mn) # Confidence interval quantile(mn, c(0.025,0.975)) # We are 95% confident that the mean height is between 67.2 and 71.8 inches. ### This process can be used to estimate *any* measure of center md = numeric() for(i in 1:1e5) { samp = sample(height, size=5, replace=TRUE) md[i] = median(samp) } hist(md) # Confidence interval quantile(md, c(0.025,0.975)) # We are 95% confident that the median height is between 66 and 73 inches. ### You can use the process to obtain estimates of other statistics, # but the results are biased. They can be unbiased, but the fixes # are beyond the scope of this example. sg = numeric() for(i in 1:1e5) { samp = sample(height, size=5, replace=TRUE) sg[i] = sd(samp) } hist(sg) quantile(sg, c(0.025,0.975)) # We are 95% confident that the standard deviation of the # height is between 0.89 and 3.56 inches. ts = numeric() for(i in 1:1e5) { samp = sample(height, size=5, replace=TRUE) ts[i] = mean(samp)/sd(samp)*sqrt(5) } hist(ts) quantile(ts, c(0.025,0.975)) # We are 95% confident that the value of TS is between 43 and 169. ##### ##### ##### ### Load data, Method 2 (large) dt = read.csv("http://rfs.kvasaheim.com/data/crime.csv") attach(dt) names(dt) summary(dt) hist(vcrime00) t.test(vcrime00, mu=500) plot(vcrime90,vcrime00) abline(a=0,b=1, col="salmon",lwd=2) t.test(vcrime90,vcrime00) # Bootstrapping the mean mn = numeric() for(i in 1:1e5) { samp = sample(vcrime00, size=51, replace=TRUE) mn[i] = mean(samp) } normoverlay(vcrime00) normoverlay(mn) quantile(mn, c(0.025,0.975)) # Assuming Normality t.test(vcrime00, mu=500) # Bootstrapping the median md = numeric() for(i in 1:1e5) { samp = sample(vcrime90, size=51, replace=TRUE) md[i] = median(samp) } normoverlay(vcrime90) normoverlay(md) quantile(md, c(0.025,0.975)) # Assuming Normality t.test(vcrime90, mu=500)