##### Demonstration Script #4a
##### MATH322
#####
##### Testing the Chi-Square test 
#####


### Binomial Experiment
#   Recall that the chi-square test should not work too
#   well for a binomial experiment. Let's see just how
#   bad it is.


B = 1e6


n = 5     # Num of Flips
p = 0.50  # Pr(Head)

# Draw a sample from the Head distribution
H = rbinom(B, size=n, prob=0.500)

# Calculate the sample from the Tail distribution
T = n - H

# Calculate the test statistic
TS = (H-n*p)^2 / (n*p*(1-p))+ (T-n*(1-p))^2 / (n*(1-p)*p)


# Look at the distribution of the data and test statistic
hist(H)
hist(T)
hist(TS)


# Let's calculate the endpoints of a 95% confidence interval
# under the assumption that TS does follow a chi-square dist

LB = qchisq(0.025, df=1)
UB = qchisq(0.975, df=1)

# Finally, let's calculate the Type I error rate

mean(TS<LB) + mean(TS>UB)

# If n=5,    then a = 0.06 > 0.05 = alpha (albeit close!)
# If n=50,   then a = 0.23 > 0.05 = alpha
# If n=5000, then a = 0.12 > 0.05 = alpha


## So, for two bins (H and T), the Chi-square test is not
#  that good of a test... at least from the Type I error 
#  rate perspective.


### Poisson Experiments
#   Recall that the chi-square test should not work too
#   well for a binomial experiment. Let's see just how
#   bad it is.

B = 1e6

EL = 200
p = 0.500

# Draw a sample from the Head distribution
H = rpois(m, lambda=EL*p)

# Calculate the sample from the Tail distribution
T = rpois(m, lambda=EL*(1-p))

# Calculate the test statistic
TS = (H-EL*p)^2 / (EL*p) + (T-EL*(1-p))^2 / (EL*(1-p))


# Look at the distribution of the data and test statistic
hist(H)
hist(T)
hist(TS)


# Let's calculate the endpoints of a 95% confidence interval
# under the assumption that TS does follow a chi-square dist

LB = qchisq(0.025, df=2)
UB = qchisq(0.975, df=2)

# Finally, let's calculate the Type I error rate

mean(TS<LB) + mean(TS>UB)


# When pEL = 10,  a = 0.04 
# When pEL = 30,  a = 0.05
# When pEL = 100, a = 0.05

# Note that these observed Type I error rates are much closer to 
# alpha = 0.05 than for the Binomial experiment. 
#
# This is as we expected. =)


### Dice Experiments
#   Maybe having more bins would allow the Binomial experiment (now
#   a multinomial experiment) to work with the Chi-square test better
#


# (this is what I started and skipped in class)


B = 1e4  ## Number of experiments

n = 50   ## Number of dice rolls in the experiment


# Roll a random die n times for each of the B experiments
x = rmultinom(B, size=n, prob=c(1,1,1,1,1,1)/6)

# Calculate the test statistic

TS = numeric()

for(i in 1:B) {
  TS[i] = sum( (x[,i]-n/6)^2/(n/6) )
}

#barplot( apply(x, MAR=1, FUN=mean), names=1:6); abline(h=n/6)
#hist(TS)


# Let's calculate the endpoints of a 95% confidence interval
# under the assumption that TS does follow a chi-square dist

LB = qchisq(0.025, df=5)
UB = qchisq(0.975, df=5)

# Finally, let's calculate the Type I error rate

mean(TS<LB) + mean(TS>UB)


# For n =  10, a = 0.07
# For n =  50, a = 0.05
# For n = 100, a = 0.05

# Note that these observed Type I error rates are much closer to 
# alpha = 0.05 than for the original Binomial experiment. 
#
# Thus, it appears as though the Chi-square test is better as the
# number of bins (choices) increases. g=2 was too few. g=6 seems good.


##### 
### Using real data

# Goodness of fit:
dt = read.csv("http://rfs.kvasaheim.com/data/someCollegeClean.csv")
summary(dt)

lev = table(dt$level)
chisq.test(lev)                 # H0 : The four class levels are equally represented in the pop

gen = table(dt$gender)
chisq.test(gen)                 # H0 : There are equal numbers of males and females in the pop
chisq.test(gen, p=c(1,2)/3)     # H0 : There are twice as many females as males in the pop
chisq.test(gen, p=c(2,1)/3)     # H0 : There are twice as many males as females in the pop
chisq.test(gen, p=c(1,1.5)/2.5) # H0 : There are 50% more males as females in the pop


# Test of independence
dt = read.csv("http://rfs.kvasaheim.com/data/crime.csv")
summary(dt)

obs = table(dt$cult_dom,dt$census4)
chisq.test( obs )   # H0 : There is no relationship between dominant culture and region


# Test of some things
dt = read.csv("http://rfs.kvasaheim.com/data/religion.csv")
summary(dt)


obs = table(dt$Vote,dt$Census4)
chisq.test( obs )      # H0 : ?????
obs


obs = table(dt$Vote)
chisq.test(obs)        # H0 : ?????
obs