########################################
#
# Script: 17 February 2011 (20110217.R)
#
########################################

# Today:
#
#  ** Categorical vs Categorical
#      . Two-Way
#
#  ** Statistical effect
#      . Start of level of effect


##################################################
#
# Now, with regard to tables, here are some
# very useful functions
#

# Pearson chi-square test of independence; returns the p-value
pearson.x2 <- function(ob) {
  i  <- rowSums(ob)
  j  <- colSums(ob)
  n  <- sum(ob)
  ex <- i%*%t(j)/n                      # expected counts under independence
  stat <- sum( (ex-ob)^2/ex )
  df <- (length(i)-1)*(length(j)-1)
  p  <- 1-pchisq(stat,df=df)
  return(p)
}

# Likelihood-ratio (G-squared) test of independence; returns the p-value
# (note: this implementation needs all observed counts to be positive,
#  since ob*log(ob/ex) is NaN for a zero cell)
lrtest.g2 <- function(ob) {
  i  <- rowSums(ob)
  j  <- colSums(ob)
  n  <- sum(ob)
  ex <- i%*%t(j)/n
  stat <- 2 * sum( ob * log(ob/ex) )
  df <- (length(i)-1)*(length(j)-1)
  p  <- 1-pchisq(stat,df=df)
  return(p)
}

# Raw residuals: observed minus expected counts
resid.raw <- function(ob) {
  i  <- rowSums(ob)
  j  <- colSums(ob)
  n  <- sum(ob)
  ex <- i%*%t(j)/n
  t  <- ob-ex
  return(t)
}

# Pearson residuals: (observed - expected)/sqrt(expected)
resid.pearson <- function(ob) {
  i  <- rowSums(ob)
  j  <- colSums(ob)
  n  <- sum(ob)
  ex <- i%*%t(j)/n
  t  <- (ob-ex)/sqrt(ex)
  return(t)
}

# Haberman (adjusted, standardized) residuals
resid.haberman <- function(ob) {
  i  <- rowSums(ob)
  j  <- colSums(ob)
  n  <- sum(ob)
  ex <- i%*%t(j)/n
  pr <- i/n    # i+
  pc <- j/n    # +j
  t  <- (ob-ex)/( sqrt(ex*(1-pr)%*%t(1-pc)) )
  return(t)
}


##################################################
#
# Examples of use
#
# Note the change in how the data are presented.
# This is very important! Data for categorical vs.
# categorical analyses are in count (frequency) form.
# So, how do we get from the usual one-row-per-observation
# format to count form? See Example 3.

# Example 1: Gender by political party

d <- matrix( c(762,327,468,484,239,477), byrow=TRUE, nrow=2)

pearson.x2(d)
lrtest.g2(d)

resid.raw(d)
resid.pearson(d)
resid.haberman(d)


# Example 2: Working mother vs. Paid maternity leave (GSS)

d <- matrix( c(97,96,22,17,2, 102,199,48,38,5, 42,102,25,36,7, 9,18,7,10,2), byrow=TRUE, nrow=4)

pearson.x2(d)
lrtest.g2(d)

resid.raw(d)
resid.pearson(d)
resid.haberman(d)


# Example 3: Vitamin C vs Cold

data <- read.csv("http://courses.kvasaheim.com/stat40x3/data/vitaminC.csv", header=TRUE)

# Note that the style of the data is not frequency; it
# is the usual way we get our data: each row is an individual

ob <- table(data)   # This turns it into frequency (count) form

pearson.x2(ob)
lrtest.g2(ob)

resid.raw(ob)
resid.pearson(ob)
resid.haberman(ob)


##################################################
#
# Hypothesis tests
#
#  ** Pay attention to the null hypothesis in each case!
#      . differences are subtle

# Question 1: Independence (variables not related)
# Question 2: Homogeneity (grouping is useful)
# Question 3: Unrelated Classifications (both margins fixed by design)

# However, each is tested in the same way, since the
# underlying distributional hypothesis is the same.
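

##################################################
#
# (Added sketch -- not part of the original script.)
# Base R's chisq.test() can be used to double-check the
# hand-rolled functions above: with correct=FALSE its
# X-squared p-value should match pearson.x2(), its
# $expected component is the same ex matrix used inside
# each function, $residuals are the Pearson residuals,
# and $stdres are the Haberman (adjusted) residuals.
# The counts below simply reuse Example 1 (gender by party).

d  <- matrix( c(762,327,468,484,239,477), byrow=TRUE, nrow=2)
ct <- chisq.test(d, correct=FALSE)

ct$p.value                    # compare with pearson.x2(d)
ct$expected                   # expected counts under independence
ct$observed - ct$expected     # compare with resid.raw(d)
ct$residuals                  # compare with resid.pearson(d)
ct$stdres                     # compare with resid.haberman(d)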