########################################
#
#  Script: 18 January 2011 (20110118.R)
#
########################################


# Today, we will read in data, test means, and test
# assumptions of those tests


# Read in data
#
# The data set lives online, but a local copy is used when one is present.
# Previously both reads ran unconditionally, so the script stopped with an
# error on any machine that did not have the local file.
crime.url  <- "http://courses.kvasaheim.com/stat40x3/data/crime.csv"
crime.file <- "G:/STAT40x3/data/crime.csv"   # Change this to wherever your copy is

if (file.exists(crime.file)) {
  # Your data is local: load it from disk
  crime <- read.csv(crime.file, header = TRUE)
} else {
  # No local copy: load it from the course web site
  crime <- read.csv(crime.url, header = TRUE)
}



# ALWAYS get to know the data before analysing it!

# Variable names in the dataset
names(crime)

# Summary statistics for every variable in the dataset
summary(crime)

# Mean, standard deviation, and variance of the 1990 violent crime rate.
# Note the `dataset$variable` way of referring to a variable within the dataset.
mean(crime$vcrime90)
sd(crime$vcrime90)
var(crime$vcrime90)

# We want to compare 1990 violent crime rates in the South and the West.
# These two subsets get used over and over below, so store each one
# under a short name once instead of retyping the subsetting expression.

# Shortcut variables: vcrime90 restricted to one census region each
vcrime90.west  <- with(crime, vcrime90[census4 == "West"])
vcrime90.south <- with(crime, vcrime90[census4 == "South"])



# Check means of the two groups
mean(vcrime90.west)
mean(vcrime90.south)

# Graphically "see" how different the two groups are.
# The boxes are labelled explicitly so the plot can be read without
# having to remember the order of the arguments.
boxplot(vcrime90.west, vcrime90.south, names = c("West", "South"))



#
# Let us use a t-test to check the two means, because that
# is what we usually do by default.

# Check assumptions of t-test first

# 1: Normality
hist(vcrime90.west)    # Not a good choice for small n
hist(vcrime90.south)

qqnorm(vcrime90.west)  # Also not a good choice for small n
qqnorm(vcrime90.south)

shapiro.test(vcrime90.west)   # The null hypothesis is Normality
shapiro.test(vcrime90.south)  # The South is not Normal, so cannot use t-test

# The Kolmogorov-Smirnov test is more general, as it compares any
# two distributions. I tend to use the Shapiro-Wilk test (above) if possible.
# The null hypothesis is that the two distributions are the same.
#
# The extra arguments are handed on to pnorm(), so they are spelled out in
# full (mean=, sd=) rather than relying on partial matching of m= and s=.
#
# Caveat: the KS test expects the reference distribution's parameters to be
# specified in advance. Here they are estimated from the same sample, so
# treat these p-values as approximate only.
ks.test(vcrime90.west,  "pnorm", mean = mean(vcrime90.west),  sd = sd(vcrime90.west))
ks.test(vcrime90.south, "pnorm", mean = mean(vcrime90.south), sd = sd(vcrime90.south))

# 2: Check for equal variances
var.test(vcrime90.west, vcrime90.south)              # Only works for two groups

# Does not work: with two vectors, bartlett.test() treats the second one as
# the grouping variable. The call is wrapped in try() so that the error is
# still shown, but the script keeps running when it is sourced.
try(bartlett.test(vcrime90.west, vcrime90.south))

# Does work: pass the groups as a list of samples
bartlett.test(list(vcrime90.west, vcrime90.south))


# Is the difference between the two group means statistically significant?

# Pooled-variance version (assumes the two groups have equal variances)
t.test(vcrime90.west, vcrime90.south, var.equal = TRUE)

# Unequal-variance version; var.equal = FALSE is what t.test() does by default
t.test(vcrime90.west, vcrime90.south, var.equal = FALSE)


# The South was not Normal, so a non-parametric test is the better choice

# Ask for an exact p-value; this is slightly slower. When `exact` is left
# unspecified, R computes an exact p-value only for small samples without ties.
wilcox.test(vcrime90.west, vcrime90.south, exact = TRUE)

# Use the Normal approximation instead
wilcox.test(vcrime90.west, vcrime90.south, exact = FALSE)

# What are the assumptions of the Mann-Whitney? The two groups have the same
# distribution except for a shift in location. Let's try to check this.
old.par <- par(mfrow = c(1, 2))  # Two graphs side-by-side; keep the old settings
hist(vcrime90.west)
hist(vcrime90.south)             # Still not a good test for small n.
par(old.par)                     # Restore the layout so later plots are not affected


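# There is a test called the Permutation Test, which makes no assumptions
# about the shape of the distributions (it only assumes the observations are
# exchangeable between the groups under the null hypothesis). It is slow, however.
# It is the permTS() function in the perm package in R.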

# FYI: The permTS() function gives a p-value of 0.03193, so the conclusion is the same