########################################
#
# Script: 18 January 2011 (20110118.R)
#
########################################

# Today, we will read in data, test means, and test
# assumptions of those tests.
#
# Outline:
#   1. Read the crime data and get to know it
#   2. Build per-region shortcut vectors (West, South)
#   3. Check the t-test assumptions (Normality, equal variances)
#   4. Compare the two means (t-test, then Mann-Whitney)

# Read in data ----

# The data can come from the course website or from a local copy.
# The local copy is used when it exists (this matches the original script,
# where the local read came last); otherwise the online file is used, so the
# script no longer fails on machines without the G: drive.
crime_url <- "http://courses.kvasaheim.com/stat40x3/data/crime.csv"
crime_local <- "G:/STAT40x3/data/crime.csv"
crime_source <- if (file.exists(crime_local)) crime_local else crime_url
crime <- read.csv(crime_source, header = TRUE)

# Get to know the data ALWAYS!!! ----
names(crime)          # Gives the variable names
summary(crime)        # Gives summary stats for each variable in the dataset

mean(crime$vcrime90)  # Note the way of referring to the variable within the dataset
sd(crime$vcrime90)
var(crime$vcrime90)

# Let us examine violent crime rates in 1990 for South and West ----
# Because we will be doing this a lot, let us create 'shortcut' variables
# because we are lazy.

# Define our shortcut variables.
# %in% (rather than ==) never returns NA, so rows with a missing region
# cannot leak NA values into either group.
vcrime90.west <- crime$vcrime90[crime$census4 %in% "West"]
vcrime90.south <- crime$vcrime90[crime$census4 %in% "South"]

# Check means of the two groups
mean(vcrime90.west)
mean(vcrime90.south)

# Graphically "see" how different the means are.
# Category 1 is West, 2 is South (now also labelled on the axis).
boxplot(vcrime90.west, vcrime90.south, names = c("West", "South"))

# Let us use a t-test to check the two means, because that
# is what we usually do by default.

# Check assumptions of t-test first ----

# 1: Normality
hist(vcrime90.west)            # Not a good choice for small n
hist(vcrime90.south)

qqnorm(vcrime90.west)          # Also not a good choice for small n
qqnorm(vcrime90.south)

shapiro.test(vcrime90.west)    # The null hypothesis is Normality
shapiro.test(vcrime90.south)   # The South is not Normal, so cannot use t-test

# The Kolmogorov-Smirnov test is more general, as it compares any
# two distributions. I tend to use the Shapiro-Wilk test (above) if possible.
# The null hypothesis is that the two distributions are the same.
# NOTE: the Normal parameters below are estimated from the same sample being
# tested; the one-sample KS test assumes they were specified in advance, so
# treat these p-values as approximate.
ks.test(vcrime90.west, "pnorm",
        mean = mean(vcrime90.west), sd = sd(vcrime90.west))
ks.test(vcrime90.south, "pnorm",
        mean = mean(vcrime90.south), sd = sd(vcrime90.south))

# 2: Check for equal variances
var.test(vcrime90.west, vcrime90.south)             # Only works for two groups

# Does not work: with a numeric vector as the first argument, the second
# argument is taken as the grouping variable rather than a second sample.
# Wrapped in try() so the failure is shown without aborting the script.
try(bartlett.test(vcrime90.west, vcrime90.south))
bartlett.test(list(vcrime90.west, vcrime90.south))  # Does work

# Are the means significantly different? ----
t.test(vcrime90.west, vcrime90.south, var.equal = TRUE)
t.test(vcrime90.west, vcrime90.south, var.equal = FALSE)  # This is the default

# Since the South was not Normal, we should use a non-parametric test.
# exact = TRUE asks for the exact p-value, which is slightly slower. When
# 'exact' is left unspecified, R uses the exact p-value only for small
# samples (fewer than 50 values) without ties.
wilcox.test(vcrime90.west, vcrime90.south, exact = TRUE)
wilcox.test(vcrime90.west, vcrime90.south, exact = FALSE)

# What are the assumptions of the Mann-Whitney? The two groups have the same
# distributions except for the centrality parameter. Let's try to check this.
old_par <- par(mfrow = c(1, 2))  # Allows us to compare two graphs side-by-side
hist(vcrime90.west)
hist(vcrime90.south)
par(old_par)                     # Restore the previous plot layout
# Still not a good test for small n.

# There is a test called the Permutation Test, which makes no
# assumptions. It is slow, however.
# It is the permTS() function in the perm package in R.
# FYI: The permTS() function gives a p-value of 0.03193, so the conclusion
# is the same.