########################################
#
# Script: 1 March 2011 (20110301.R)
#
########################################
# Today:
#
# ** The test
#  . Why t not z
#  . Accuracy function
#  . Sri Lanka
#  . Benford
#  . Beer
#
# ** The next section:
#  . Continuous independent variables
#

# Why t, not z?

# As there is no z.test() in R, let us use the one I wrote.  This
# is also how to dynamically link to external scripts.

source("http://courses.kvasaheim.com/stat40x3/scripts/z.test.R")

# Let us test the appropriateness of using a z-test when you use the
# sample variance as the variance of the Normal distribution.
# The number of trials should be as big as you have time for.  A rule
# of thumb is that for every two zeroes in the number of trials, the
# estimate is good for another digit.  Thus, for trials <- 10000, the
# estimates are good to two significant figures.

trials <- 1e4               # Set the number of trials

# Preallocate the result vector: growing a vector one element at a time
# inside a loop forces repeated copying, which is O(trials^2) work.
p1 <- numeric(trials)

for (i in seq_len(trials)) {   # Now, for 10000 times,
  x <- rnorm(10)               # take a sample of size 10 from a Std Normal, and
  p1[i] <- t.test(x)$p.value   # perform a t-test on the sample and keep the p-value.
}

hist(p1)                 # Plot an empirical pdf of the p-values.
abline(h = trials / 20)  # Expected count per bin: hist() cuts Uniform(0,1)
                         # data into 20 bins of width 0.05 here.

# So, the histogram indicates the p-values are Uniform(0,1) distributed,
# as they need to be for the test to be an appropriate test.

# What about using a z-test in this circumstance?  Let us use
# the same steps as above.

p2 <- numeric(trials)    # Preallocate, as before.

for (i in seq_len(trials)) {
  x <- rnorm(10)
  p2[i] <- z.test(x, sigma2 = var(x))$p.value
}

hist(p2)
abline(h = trials / 20)

# Note that the p-values are NOT Uniform(0,1) distributed.  That indicates
# that this test is inappropriate in this case.  In fact, the relative
# frequency in that first bar is the actual alpha value for the test, not
# the alpha = 0.05 you are reporting.
# Now, on to the test:

####################
# Accuracy function (Problem 5)

# accuracy: proportion of correct classifications in a confusion matrix.
#
# Arguments:
#   t - a square confusion matrix (predicted x actual), e.g. from table()
#       or matrix().
# Returns:
#   A single number in [0, 1]: sum of the diagonal (correct
#   classifications) divided by the total count.
#
# Generalized from the hard-coded 2x2 form (t[1,1] + t[2,2]) / sum(t):
# sum(diag(t)) gives the same answer for the 2x2 case and also works for
# any k-class confusion matrix.
accuracy <- function(t) {
  sum(diag(t)) / sum(t)
}

# Step Six (remember?):
# (Named conf.mat rather than t so we do not shadow the base transpose
# function t().)
conf.mat <- matrix(c(15, 2,
                     3, 17),
  byrow = TRUE, nrow = 2,
  dimnames = list(
    Predicted = c("Terrorist", "Non-Terrorist"),
    Actual    = c("Terrorist", "Non-Terrorist")
  )
)
accuracy(conf.mat)

# Make sure this is correct and that you save this function; you WILL
# need it in the next part of the course.

####################
# Sri Lanka (Problem 6)

sri <- read.csv("http://courses.kvasaheim.com/stat40x3/data/sri2010pres.csv")
names(sri)

# The above two lines are usual.
# The next four lines are a good habit.  Why?

v.raja  <- sri$RAJAPAKSE
v.total <- sri$TOTAL
prov    <- sri$PROVINCE
divn    <- sri$DIVISION

# It is good statistical work to ALWAYS alter the data within the
# statistical program and not in the original data set (except for tidying
# it up).  For the test, I expected you to alter the original data.  You
# are to never do that again.  There is a sense of unethicality in
# altering the data.

# (1) Drop the "Postal" and "Displace" rows, which are not geographic
# divisions.  A single logical mask with %in% is used instead of the
# x[-which(cond)] idiom: if which() matches nothing it returns
# integer(0), and x[-integer(0)] silently yields an EMPTY vector, wiping
# out the data.  The mask below is safe in that case and keeps all four
# vectors aligned.
keep    <- !(divn %in% c("Postal", "Displace"))
v.raja  <- v.raja[keep]
v.total <- v.total[keep]
prov    <- prov[keep]
divn    <- divn[keep]

n <- length(v.raja)  # I will need this information in the future.

# (2) Boxplot:
p.raja <- v.raja / v.total
par(mar = c(5, 11, 2, 2) + 0.1)  # wide left margin for province names
boxplot(p.raja ~ prov,
  las = 1, ylim = c(0, 1), horizontal = TRUE,
  xlab = "Proportion support for Rajapaksa"
)

# (3) Combine?
library(agricolae)
kruskal(p.raja, prov)

# As the assumptions of ANOVA are not met in the data, we use
# this non-parametric multiple-comparisons test.
#
####################
# Sri Lanka (Problem 7)

v.raja

# One way: do it by hand (which is frequently done for dabblers)
# This is what I expected you to do.

# Another way: compare the leading-digit distribution to Benford's law.
nt <- substring(v.raja, 1, 1)

# Tabulate against an explicit factor with levels 1:9 so that ALL nine
# digits appear in the table even if some leading digit never occurs in
# the data.  A plain table(nt) could be shorter than ex, and the
# rbind()/chi-square below would then silently misalign.
ob <- table(factor(nt, levels = 1:9))
ex <- n * c(0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046)
rbind(ex, ob)  # This lets you get a look at the data.

# The chisq.test for comparing distributions.  There is also a
# chisq.1d.test() available from the website:
# source("http://courses.kvasaheim.com/stat40x3/scripts/chisq.1d.test.R")

# Build up the chi-square statistic step by step:
ex - ob
(ex - ob)^2
(ex - ob)^2 / ex

X2 <- sum((ex - ob)^2 / ex)  # ~ chisq(df=8) thus, p << 0.0001
1 - pchisq(X2, df = 8)
1 - pchisq(X2, df = length(ex) - 1)  # same thing, without the magic number

# Does this result surprise us?  Let's look at the graph:
par(mar = c(5, 5, 2, 2) + 0.1)
plot(ex,
  type = "l", las = 1,
  xlab = "Digit", ylab = "Digit Frequency",
  ylim = c(0, 50), lwd = 2, col = "tomato"
)
lines(ob, col = "navy", type = "l")
legend("topright",
  legend = c("Expected", "Observed"),
  bty = "n", lwd = c(2, 1), col = c("tomato", "navy")
)

####################
# Guiness (Problem 8)

beer <- read.csv("http://courses.kvasaheim.com/stat40x3/data/beer1.csv")
names(beer)

par(mar = c(5, 5, 2, 2) + 0.1)
boxplot(rating ~ brand, data = beer, las = 1, ylab = "Rating")

# Non-parametric and parametric one-way comparisons of the brands:
kruskal.test(rating ~ brand, data = beer)
beer.model <- aov(rating ~ brand, data = beer)
summary(beer.model)

# Multiple-comparison procedures (kruskal, LSD, HSD, SNK, duncan come
# from the agricolae package loaded earlier):
with(beer, kruskal(rating, brand))
with(beer, pairwise.t.test(rating, brand))
LSD.test(beer.model, "brand")
HSD.test(beer.model, "brand")
SNK.test(beer.model, "brand")
duncan.test(beer.model, "brand")

# The assumption of the Kruskal-Wallis test is that the distributions
# differ only in the location.  Here is one way we would test that.
with(beer, ks.test(rating[brand == "Coors Light"], rating[brand == "Olympia"]))

# Another way for discrete data like this is to perform a chisq.1d.test()