########################################
#
# Script: 1 March 2011 (20110301.R)
#
########################################
# Today:
#
# ** The test
#  . Why t not z
#  . Accuracy function
#  . Sri Lanka
#  . Benford
#  . Beer
#
# ** The next section:
#  . Continuous independent variables
#

# Why t, not z?

# As there is no z.test() in R, let us use the one I wrote.  This
# is also how to dynamically link to external scripts.

source("http://courses.kvasaheim.com/stat40x3/scripts/z.test.R")

# Let us test the appropriateness of using a z-test when you use the
# sample variance as the variance of the Normal distribution.
# The number of trials should be as big as you have time for.  A rule
# of thumb is that for every two zeroes in the number of trials, the
# estimate is good for another digit.  Thus, for trials <- 10000, the
# estimates are good to two significant figures.

trials <- 1e4               # Set the number of trials

# Preallocate the result vector: growing a vector one element at a time
# inside a loop forces repeated copying, which is O(trials^2) work.
p1 <- numeric(trials)

for (i in seq_len(trials)) {   # Now, for 10000 times,
  x <- rnorm(10)               # take a sample of size 10 from a Std Normal, and
  p1[i] <- t.test(x)$p.value   # perform a t-test on the sample and keep the p-value.
}

hist(p1)                 # Plot an empirical pdf of the p-values.
abline(h = trials / 20)  # Expected count per bin: hist() cuts Uniform(0,1)
                         # data into 20 bins of width 0.05 here.

# So, the histogram indicates the p-values are Uniform(0,1) distributed,
# as they need to be for the test to be an appropriate test.

# What about using a z-test in this circumstance?  Let us use
# the same steps as above.

p2 <- numeric(trials)    # Preallocate, as before.

for (i in seq_len(trials)) {
  x <- rnorm(10)
  p2[i] <- z.test(x, sigma2 = var(x))$p.value
}

hist(p2)
abline(h = trials / 20)

# Note that the p-values are NOT Uniform(0,1) distributed.  That indicates
# that this test is inappropriate in this case.  In fact, the relative
# frequency in that first bar is the actual alpha value for the test, not
# the alpha = 0.05 you are reporting.
# Now, on to the test:

####################
# Accuracy function (Problem 5)

# accuracy: proportion of correct classifications in a confusion matrix.
#
# Arguments:
#   t - a square confusion matrix (predicted x actual), e.g. from table()
#       or matrix().
# Returns:
#   A single number in [0, 1]: sum of the diagonal (correct
#   classifications) divided by the total count.
#
# Generalized from the hard-coded 2x2 form (t[1,1] + t[2,2]) / sum(t):
# sum(diag(t)) gives the same answer for the 2x2 case and also works for
# any k-class confusion matrix.
accuracy <- function(t) {
  sum(diag(t)) / sum(t)
}

# Step Six (remember?):
# (Named conf.mat rather than t so we do not shadow the base transpose
# function t().)
conf.mat <- matrix(c(15, 2,
                     3, 17),
  byrow = TRUE, nrow = 2,
  dimnames = list(
    Predicted = c("Terrorist", "Non-Terrorist"),
    Actual    = c("Terrorist", "Non-Terrorist")
  )
)
accuracy(conf.mat)

# Make sure this is correct and that you save this function; you WILL
# need it in the next part of the course.

####################
# Sri Lanka (Problem 6)

sri <- read.csv("http://courses.kvasaheim.com/stat40x3/data/sri2010pres.csv")
names(sri)

# The above two lines are usual.
# The next four lines are a good habit.  Why?

v.raja  <- sri$RAJAPAKSE
v.total <- sri$TOTAL
prov    <- sri$PROVINCE
divn    <- sri$DIVISION

# It is good statistical work to ALWAYS alter the data within the
# statistical program and not in the original data set (except for tidying
# it up).  For the test, I expected you to alter the original data.  You
# are to never do that again.  There is a sense of unethicality in
# altering the data.

# (1) Drop the "Postal" and "Displace" rows, which are not geographic
# divisions.  A single logical mask with %in% is used instead of the
# x[-which(cond)] idiom: if which() matches nothing it returns
# integer(0), and x[-integer(0)] silently yields an EMPTY vector, wiping
# out the data.  The mask below is safe in that case and keeps all four
# vectors aligned.
keep    <- !(divn %in% c("Postal", "Displace"))
v.raja  <- v.raja[keep]
v.total <- v.total[keep]
prov    <- prov[keep]
divn    <- divn[keep]

n <- length(v.raja)  # I will need this information in the future.

# (2) Boxplot:
p.raja <- v.raja / v.total
par(mar = c(5, 11, 2, 2) + 0.1)  # wide left margin for province names
boxplot(p.raja ~ prov,
  las = 1, ylim = c(0, 1), horizontal = TRUE,
  xlab = "Proportion support for Rajapaksa"
)

# (3) Combine?
library(agricolae)
kruskal(p.raja, prov)

# As the assumptions of ANOVA are not met in the data, we use
# this non-parametric multiple-comparisons test.
#
####################
# Sri Lanka (Problem 7)

v.raja

# One way: do it by hand (which is frequently done for dabblers)
# This is what I expected you to do.

# Another way: compare the leading-digit distribution to Benford's law.
nt <- substring(v.raja, 1, 1)

# Tabulate against an explicit factor with levels 1:9 so that ALL nine
# digits appear in the table even if some leading digit never occurs in
# the data.  A plain table(nt) could be shorter than ex, and the
# rbind()/chi-square below would then silently misalign.
ob <- table(factor(nt, levels = 1:9))
ex <- n * c(0.301, 0.176, 0.125, 0.097, 0.079, 0.067, 0.058, 0.051, 0.046)
rbind(ex, ob)  # This lets you get a look at the data.

# The chisq.test for comparing distributions.  There is also a
# chisq.1d.test() available from the website:
# source("http://courses.kvasaheim.com/stat40x3/scripts/chisq.1d.test.R")

# Build up the chi-square statistic step by step:
ex - ob
(ex - ob)^2
(ex - ob)^2 / ex

X2 <- sum((ex - ob)^2 / ex)  # ~ chisq(df=8) thus, p << 0.0001
1 - pchisq(X2, df = 8)
1 - pchisq(X2, df = length(ex) - 1)  # same thing, without the magic number

# Does this result surprise us?  Let's look at the graph:
par(mar = c(5, 5, 2, 2) + 0.1)
plot(ex,
  type = "l", las = 1,
  xlab = "Digit", ylab = "Digit Frequency",
  ylim = c(0, 50), lwd = 2, col = "tomato"
)
lines(ob, col = "navy", type = "l")
legend("topright",
  legend = c("Expected", "Observed"),
  bty = "n", lwd = c(2, 1), col = c("tomato", "navy")
)

####################
# Guiness (Problem 8)

beer <- read.csv("http://courses.kvasaheim.com/stat40x3/data/beer1.csv")
names(beer)

par(mar = c(5, 5, 2, 2) + 0.1)
boxplot(rating ~ brand, data = beer, las = 1, ylab = "Rating")

# Non-parametric and parametric one-way comparisons of the brands:
kruskal.test(rating ~ brand, data = beer)
beer.model <- aov(rating ~ brand, data = beer)
summary(beer.model)

# Multiple-comparison procedures (kruskal, LSD, HSD, SNK, duncan come
# from the agricolae package loaded earlier):
with(beer, kruskal(rating, brand))
with(beer, pairwise.t.test(rating, brand))
LSD.test(beer.model, "brand")
HSD.test(beer.model, "brand")
SNK.test(beer.model, "brand")
duncan.test(beer.model, "brand")

# The assumption of the Kruskal-Wallis test is that the distributions
# differ only in the location.  Here is one way we would test that.
with(beer, ks.test(rating[brand == "Coors Light"], rating[brand == "Olympia"]))

# Another way for discrete data like this is to perform a chisq.1d.test()