########################################
#
# Script: 17 February 2011 (20110217.R)
#
########################################

# Today:
#
#  ** Categorical vs Categorical
#      . Two-Way
#
#  ** Statistical effect
#      . Start of level of effect


##################################################
#
# Now, with regard to tables, here are some
# very useful functions
#

# Pearson chi-square test of independence; returns the p-value
pearson.x2 <- function(ob) {
  i  <- rowSums(ob)
  j  <- colSums(ob)
  n  <- sum(ob)
  ex <- i%*%t(j)/n                      # expected counts under independence
  stat <- sum( (ex-ob)^2/ex )
  df <- (length(i)-1)*(length(j)-1)
  p  <- 1-pchisq(stat,df=df)
  return(p)
}

# Likelihood-ratio (G-squared) test of independence; returns the p-value
# (note: this implementation needs all observed counts to be positive,
#  since ob*log(ob/ex) is NaN for a zero cell)
lrtest.g2 <- function(ob) {
  i  <- rowSums(ob)
  j  <- colSums(ob)
  n  <- sum(ob)
  ex <- i%*%t(j)/n
  stat <- 2 * sum( ob * log(ob/ex) )
  df <- (length(i)-1)*(length(j)-1)
  p  <- 1-pchisq(stat,df=df)
  return(p)
}

# Raw residuals: observed minus expected counts
resid.raw <- function(ob) {
  i  <- rowSums(ob)
  j  <- colSums(ob)
  n  <- sum(ob)
  ex <- i%*%t(j)/n
  t  <- ob-ex
  return(t)
}

# Pearson residuals: (observed - expected)/sqrt(expected)
resid.pearson <- function(ob) {
  i  <- rowSums(ob)
  j  <- colSums(ob)
  n  <- sum(ob)
  ex <- i%*%t(j)/n
  t  <- (ob-ex)/sqrt(ex)
  return(t)
}

# Haberman (adjusted, standardized) residuals
resid.haberman <- function(ob) {
  i  <- rowSums(ob)
  j  <- colSums(ob)
  n  <- sum(ob)
  ex <- i%*%t(j)/n
  pr <- i/n    # i+
  pc <- j/n    # +j
  t  <- (ob-ex)/( sqrt(ex*(1-pr)%*%t(1-pc)) )
  return(t)
}


##################################################
#
# Examples of use
#
# Note the change in how the data are presented.
# This is very important! Data for categorical vs.
# categorical analyses are in count (frequency) form.
# So, how do we get from the usual one-row-per-observation
# format to count form? See Example 3.

# Example 1: Gender by political party

d <- matrix( c(762,327,468,484,239,477), byrow=TRUE, nrow=2)

pearson.x2(d)
lrtest.g2(d)

resid.raw(d)
resid.pearson(d)
resid.haberman(d)


# Example 2: Working mother vs. Paid maternity leave (GSS)

d <- matrix( c(97,96,22,17,2, 102,199,48,38,5, 42,102,25,36,7, 9,18,7,10,2), byrow=TRUE, nrow=4)

pearson.x2(d)
lrtest.g2(d)

resid.raw(d)
resid.pearson(d)
resid.haberman(d)


# Example 3: Vitamin C vs Cold

data <- read.csv("http://courses.kvasaheim.com/stat40x3/data/vitaminC.csv", header=TRUE)

# Note that the style of the data is not frequency; it
# is the usual way we get our data: each row is an individual

ob <- table(data)   # This turns it into frequency (count) form

pearson.x2(ob)
lrtest.g2(ob)

resid.raw(ob)
resid.pearson(ob)
resid.haberman(ob)


##################################################
#
# Hypothesis tests
#
#  ** Pay attention to the null hypothesis in each case!
#      . differences are subtle

# Question 1: Independence (variables not related)
# Question 2: Homogeneity (grouping is useful)
# Question 3: Unrelated Classifications (both margins fixed by design)

# However, each is tested in the same way, since the
# underlying distributional hypothesis is the same.
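

##################################################
#
# (Added sketch -- not part of the original script.)
# Base R's chisq.test() can be used to double-check the
# hand-rolled functions above: with correct=FALSE its
# X-squared p-value should match pearson.x2(), its
# $expected component is the same ex matrix used inside
# each function, $residuals are the Pearson residuals,
# and $stdres are the Haberman (adjusted) residuals.
# The counts below simply reuse Example 1 (gender by party).

d  <- matrix( c(762,327,468,484,239,477), byrow=TRUE, nrow=2)
ct <- chisq.test(d, correct=FALSE)

ct$p.value                    # compare with pearson.x2(d)
ct$expected                   # expected counts under independence
ct$observed - ct$expected     # compare with resid.raw(d)
ct$residuals                  # compare with resid.pearson(d)
ct$stdres                     # compare with resid.haberman(d)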