##### Demonstration Script #4a ##### MATH322 ##### ##### Testing the Chi-Square test ##### ### Binomial Experiment # Recall that the chi-square test should not work too # well for a binomial experiment. Let's see just how # bad it is. B = 1e6 n = 5 # Num of Flips p = 0.50 # Pr(Head) # Draw a sample from the Head distribution H = rbinom(B, size=n, prob=0.500) # Calculate the sample from the Tail distribution T = n - H # Calculate the test statistic TS = (H-n*p)^2 / (n*p*(1-p))+ (T-n*(1-p))^2 / (n*(1-p)*p) # Look at the distribution of the data and test statistic hist(H) hist(T) hist(TS) # Let's calculate the endpoints of a 95% confidence interval # under the assumption that TS does follow a chi-square dist LB = qchisq(0.025, df=1) UB = qchisq(0.975, df=1) # Finally, let's calculate the Type I error rate mean(TSUB) # If n=5, then a = 0.06 > 0.05 = alpha (albeit close!) # If n=50, then a = 0.23 > 0.05 = alpha # If n=5000, then a = 0.12 > 0.05 = alpha ## So, for two bins (H and T), the Chi-square test is not # that good of a test... at least from the Type I error # rate perspective. ### Poisson Experiments # Recall that the chi-square test should not work too # well for a binomial experiment. Let's see just how # bad it is. B = 1e6 EL = 200 p = 0.500 # Draw a sample from the Head distribution H = rpois(m, lambda=EL*p) # Calculate the sample from the Tail distribution T = rpois(m, lambda=EL*(1-p)) # Calculate the test statistic TS = (H-EL*p)^2 / (EL*p) + (T-EL*(1-p))^2 / (EL*(1-p)) # Look at the distribution of the data and test statistic hist(H) hist(T) hist(TS) # Let's calculate the endpoints of a 95% confidence interval # under the assumption that TS does follow a chi-square dist LB = qchisq(0.025, df=2) UB = qchisq(0.975, df=2) # Finally, let's calculate the Type I error rate mean(TSUB) # When pEL = 10, a = 0.04 # When pEL = 30, a = 0.05 # When pEL = 100, a = 0.05 # Note that these observed Type I error rates are much closer to # alpha = 0.05 than for the Binomial experiment. # # This is as we expected. =) ### Dice Experiments # Maybe having more bins would allow the Binomial experiment (now # a multinomial experiment) to work with the Chi-square test better # # (this is what I started and skipped in class) B = 1e4 ## Number of experiments n = 50 ## Number of dice rolls in the experiment # Roll a random die n times for each of the B experiments x = rmultinom(B, size=n, prob=c(1,1,1,1,1,1)/6) # Calculate the test statistic TS = numeric() for(i in 1:B) { TS[i] = sum( (x[,i]-n/6)^2/(n/6) ) } #barplot( apply(x, MAR=1, FUN=mean), names=1:6); abline(h=n/6) #hist(TS) # Let's calculate the endpoints of a 95% confidence interval # under the assumption that TS does follow a chi-square dist LB = qchisq(0.025, df=5) UB = qchisq(0.975, df=5) # Finally, let's calculate the Type I error rate mean(TSUB) # For n = 10, a = 0.07 # For n = 50, a = 0.05 # For n = 100, a = 0.05 # Note that these observed Type I error rates are much closer to # alpha = 0.05 than for the original Binomial experiment. # # Thus, it appears as though the Chi-square test is better as the # number of bins (choices) increases. g=2 was too few. g=6 seems good. ##### ### Using real data # Goodness of fit: dt = read.csv("http://rfs.kvasaheim.com/data/someCollegeClean.csv") summary(dt) lev = table(dt$level) chisq.test(lev) # H0 : The four class levels are equally represented in the pop gen = table(dt$gender) chisq.test(gen) # H0 : There are equal numbers of males and females in the pop chisq.test(gen, p=c(1,2)/3) # H0 : There are twice as many females as males in the pop chisq.test(gen, p=c(2,1)/3) # H0 : There are twice as many males as females in the pop chisq.test(gen, p=c(1,1.5)/2.5) # H0 : There are 50% more males as females in the pop # Test of independence dt = read.csv("http://rfs.kvasaheim.com/data/crime.csv") summary(dt) obs = table(dt$cult_dom,dt$census4) chisq.test( obs ) # H0 : There is no relationship between dominant culture and region # Test of some things dt = read.csv("http://rfs.kvasaheim.com/data/religion.csv") summary(dt) obs = table(dt$Vote,dt$Census4) chisq.test( obs ) # H0 : ????? obs obs = table(dt$Vote) chisq.test(obs) # H0 : ????? obs