##### Demonstration Script #4d
#####   MATH322
#####
##### Three Goodness-of-Fit tests
#####

### Preamble

# Load a package
library(nortest)
# If you got this error:
#   Error in library(nortest) : there is no package called ‘nortest’
# then you will need to install this package.

# Import some extended functions
source("http://rfs.kvasaheim.com/stat200.R")

# Import the data
dt = read.csv("http://rfs.kvasaheim.com/data/crime.csv")
attach(dt)


### A discussion on the Pearson Chi-Square Goodness-of-Fit test,
#   and why it is not a good option

normoverlay(vcrime90, breaks=3)
normoverlay(vcrime90, breaks=11)
normoverlay(vcrime90, breaks=21)
normoverlay(vcrime90, breaks=51)

# That is the problem with the epdf (the histogram): the impression it
#   gives depends on how many bins we happen to choose.

# Here is the ecdf
plot(ecdf(vcrime90))
# No information is discarded.
# No need to make complex decisions on
#   which histogram is best.

# Overlay the fitted Normal CDF
x = seq(0, 2500)
y = pnorm(x, m=mean(vcrime90), s=sd(vcrime90))
lines(x, y, col="red", lwd=2)


### Three CDF tests

## KS Test
ks.test(vcrime90, "pnorm", m=600, s=400)
# Note that we had to know mu and sigma to use the KS test

## Anderson-Darling
ad.test(vcrime90)

## Shapiro-Wilk
shapiro.test(vcrime90)


## The KS test manually:
# Since the KS test statistic is based on the largest difference between the
#   observed ECDF and the theoretical CDF, we create both, find the
#   differences, then return the largest absolute difference:

d = dt$vcrime90
w = sort(d)

ww = numeric()
yn = numeric()
for(x in 1:length(d)) {
  ww[x] = mean(d <= w[x])                # empirical CDF at w[x]
  yn[x] = pnorm(w[x], m=2000, s=1000)    # hypothesized Normal CDF at w[x]
}

ks.test(d, pnorm, m=2000, s=1000)
max( abs(yn-ww) )


##### Investigation into the three tests #####

### The Kolmogorov-Smirnov test

# with known parameters
pvalue = numeric()
for(i in 1:1e4) {
  x = rnorm(100, m=0, s=1)
  pvalue[i] = ks.test(x, "pnorm", m=0, s=1)$p.value
}
mean(pvalue<0.05)
binom.test(x=sum(pvalue<0.05), n=length(pvalue))

# The 95% confidence interval is from 0.043 to 0.051. Since
#   this includes 0.05, we have no evidence that the test is
#   a "bad" test... at least in the run I did in my office. Your
#   results may differ.

# with unknown (estimated) parameters
pvalue = numeric()
for(i in 1:1e4) {
  x = rnorm(100, m=0, s=1)
  pvalue[i] = ks.test(x, "pnorm", m=mean(x), s=sd(x))$p.value
}
mean(pvalue<0.05)
binom.test(x=sum(pvalue<0.05), n=length(pvalue))

# The 95% confidence interval is from 0.00002 to 0.00072. Since
#   this does not include 0.05, we have strong evidence that the test is
#   a "bad" test... at least in the run I did in my office. Your
#   results may differ.
#
# The upshot: When using the K-S test and estimating the parameters
#   from the data, you will almost never reject the null hypothesis. This
#   is a sign of a low-power test.


### The Anderson-Darling test

pvalue = numeric()
for(i in 1:1e4) {
  x = rnorm(100, m=0, s=1)
  pvalue[i] = ad.test(x)$p.value
}
mean(pvalue<0.05)
binom.test(x=sum(pvalue<0.05), n=length(pvalue))

# The 95% confidence interval is from 0.047 to 0.055. Since
#   this includes 0.05, we have no evidence that the test is
#   a "bad" test... at least in the run I did in my office. Your
#   results may differ.


### The Shapiro-Wilk test

pvalue = numeric()
for(i in 1:1e4) {
  x = rnorm(100, m=0, s=1)
  pvalue[i] = shapiro.test(x)$p.value
}
mean(pvalue<0.05)
binom.test(x=sum(pvalue<0.05), n=length(pvalue))

# The 95% confidence interval is from 0.048 to 0.057. Since
#   this includes 0.05, we have no evidence that the test is
#   a "bad" test... at least in the run I did in my office. Your
#   results may differ.
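
# A possible extension of the simulation above, as a sketch: the nortest
#   package we already loaded also provides lillie.test(), the Lilliefors
#   correction of the K-S test for exactly the "unknown parameters" case,
#   where the mean and sd are estimated from the data. The same kind of
#   simulation can check whether its Type I error rate is near 0.05.
pvalue = numeric()
for(i in 1:1e4) {
  x = rnorm(100, m=0, s=1)
  pvalue[i] = lillie.test(x)$p.value   # Lilliefors: K-S with estimated mean and sd
}
mean(pvalue<0.05)
binom.test(x=sum(pvalue<0.05), n=length(pvalue))
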
##### Power Curves #####

### Recall that the power of a test is the probability of rejecting
#   a false null hypothesis; the power curve traces that probability
#   across a sequence of alternatives. That means we have to create a
#   series of distributions ranging from Normal to non-Normal.

#   As Nadine suggested, the t distribution will work... as
#   long as we realize we are only testing power against a
#   symmetric alternative.


### Shapiro-Wilk Test

pwr = numeric()
for(i in 1:20) {
  pvalue = numeric()
  for(j in 1:1e4) {
    x = rt(100, df=i)
    pvalue[j] = shapiro.test(x)$p.value
  }
  pwr[i] = mean(pvalue<0.05)
}

plot(pwr, ylim=c(0,1), xlim=c(0,20), pch=20, col="blue")


### The Anderson-Darling test

pwr = numeric()
for(i in 1:20) {
  pvalue = numeric()
  for(j in 1:1e4) {
    x = rt(100, df=i)
    pvalue[j] = ad.test(x)$p.value
  }
  pwr[i] = mean(pvalue<0.05)
}

points(pwr, col="red", pch=20)


### So, which of the two has a higher power against
#   symmetric alternatives?

### What about non-symmetric alternatives? We will need
#   a series of distributions that start with a
#   Normal and become more and more asymmetric.
#
#   And that is where we will pick up on Friday.
#
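
# One possible family for that, as a sketch (an assumption about where we
#   could go, not necessarily Friday's approach): the Gamma distribution
#   with rate 1 is strongly right-skewed for small shape values and
#   becomes nearly Normal as the shape parameter grows, so sweeping the
#   shape parameter gives alternatives that move from very asymmetric
#   toward Normal.
pwr    = numeric()
shapes = 1:20
for(i in seq_along(shapes)) {
  pvalue = numeric()
  for(j in 1:1e4) {
    x = rgamma(100, shape=shapes[i], rate=1)   # skewed for small shape, near-Normal for large
    pvalue[j] = shapiro.test(x)$p.value
  }
  pwr[i] = mean(pvalue<0.05)
}
plot(shapes, pwr, ylim=c(0,1), pch=20, col="blue",
     xlab="Gamma shape parameter", ylab="power")
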