########################################
#
# Script: 10 February 2011 (20110210.R)
#
########################################

# Today:
#  ** Data transforms
#  ** Two-way Analysis of Variance

d <- read.csv("http://courses.kvasaheim.com/stat40x3/data/gdpcap.csv", header=TRUE)
names(d)
summary(d)

#####
# Let us determine which of these affects wealth in the world

# Effects of each categorical variable?
summary( aov(gdpcap ~ region, data=d) )  # Statistically significant!
summary( aov(gdpcap ~ OPEC,   data=d) )  # Statistically significant!
summary( aov(gdpcap ~ govt,   data=d) )  # Statistically significant!

# But, these effects are /not/ independent:

# Correlations among the three categorical variables?
with(d, table(region, OPEC) )  # correlated means not independent
with(d, table(region, govt) )  # correlated means not independent
with(d, table(govt,   OPEC) )  # correlated means not independent

with(d, chisq.test(region, OPEC) )  # read the p-value
with(d, chisq.test(region, govt) )  # read the p-value
with(d, chisq.test(govt,   OPEC) )  # read the p-value

#####
# Let us now determine their conditional effects. Conditional effects
# are those effects that belong to that variable alone and not to the
# other variables in the analysis. As these three variables are
# correlated, we expect the conditional effects to differ from the
# unconditional effects (above).

model.1 <- aov(gdpcap ~ region + OPEC + govt, data=d)
summary(model.1)

# Wait! The conclusion differs from the one above... why?

# What are the effect levels?
model.1$coefficients

# Interpret these (see notes)

# What about interactions?
model.2 <- aov(gdpcap ~ region * OPEC * govt, data=d)
summary(model.2)

# Note the asterisk to indicate interactions (think of interactions as
# being products of the variables).

# Again, what are the effect levels?
coefficients(model.2)

# Is this larger model 'better enough'?
anova(model.2, model.1)

# The anova() function compares the two models to determine if the larger
# is statistically better at prediction than the smaller. The null
# hypothesis is that there is no statistical difference between the two
# models.

# What about dropping the region:govt interaction?
model.3 <- aov(gdpcap ~ region + OPEC + govt + OPEC:govt + region:OPEC, data=d)
summary(model.3)

# Is this dropping acceptable?
anova(model.2, model.3)

model.4 <- aov(gdpcap ~ region + OPEC + govt + OPEC:govt, data=d)
summary(model.4)

# Is this dropping acceptable?
anova(model.3, model.4)

# So, this is the "minimally acceptable" model
summary(model.4)

# Effect levels:
coefficients(model.4)

# Minimally acceptable models are important for us because they include
# only those variables that are important in predicting our outcome
# variable. If we include non-significant effects, then our predictions
# have a larger variance (lower precision) than otherwise.

# What is the expected gdp per capita in a Democratic, African OPEC member?
2540.4683 + 3377.4044

# Alternatively (and more useful in the future):
ourstate <- data.frame(govt="Democracy", region="Eastern", OPEC="Non-Member")
predict(model.4, newdata=ourstate)

# This is the preferred manner of getting our predictions. It is cleaner
# and can be copy-and-pasted, changing only the values.

##
# The following gives an error. Beware!
ourstate <- data.frame(govt="Democratic", region="Africa", OPEC="Member")
predict(model.4, newdata=ourstate)

# Learn this error, since you will probably see it in the future.
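
##
# A minimal sketch of how to avoid that error: the only spellings
# predict() accepts are the labels actually present in the data, so
# list them first. (The factor() wrapper is just a precaution in case
# the columns were read in as plain character strings; the labels that
# print depend, of course, on the data file itself.)
levels( factor(d$govt)   )  # valid labels for govt
levels( factor(d$region) )  # valid labels for region
levels( factor(d$OPEC)   )  # valid labels for OPEC

# Then build the prediction frame from those exact labels -- here,
# purely for illustration, the first label of each:
ourstate <- data.frame(govt   = levels(factor(d$govt))[1],
                       region = levels(factor(d$region))[1],
                       OPEC   = levels(factor(d$OPEC))[1])
predict(model.4, newdata=ourstate)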
###
# Alternatively:
model.5 <- lm(gdpcap ~ region + OPEC + govt + OPEC:govt, data=d)
summary(model.5)
summary(aov(model.5))

# Differences?

#########################
#
# Now, for Honesty in Government
#
#
# Do this one on your own.

#########################
#
# Data transformation

# This is done when the assumptions of ANOVA are not met, and we really
# want to use ANOVA. The process consists of applying a function (one-to-one
# and onto) to the original data in the hope that the transformed data
# meet the assumptions of ANOVA. Conclusions reached in the transformed
# state are valid in the un-transformed state (predictions need to be back-
# transformed, however).

d <- read.csv("http://courses.kvasaheim.com/stat40x3/data/population1.csv", header=TRUE)
names(d)

boxplot(population~group, data=d)

bartlett.test( population~group, data=d )   # p << 0.0001

shapiro.test( d$population[d$group=="A"] )  # p << 0.0001
shapiro.test( d$population[d$group=="B"] )  # p << 0.0001
shapiro.test( d$population[d$group=="C"] )  # p = 0.0005
shapiro.test( d$population[d$group=="D"] )  # p << 0.0001

# The data fail the assumptions. Let us try the transformation method
# so that we can still use the ANOVA procedure.

# There are several options for transformation:
#  . square
#  . square root
#  . cube
#  . cube root
#  . logarithm
# ANY function that is one-to-one and onto will work.

# Let us try the log transformation:
bartlett.test( log(population)~group, data=d )   # p = 0.255

shapiro.test( log(d$population[d$group=="A"]) )  # p = 0.451
shapiro.test( log(d$population[d$group=="B"]) )  # p = 0.235
shapiro.test( log(d$population[d$group=="C"]) )  # p = 0.302
shapiro.test( log(d$population[d$group=="D"]) )  # p = 0.804

# The log-transformed data meet the assumptions of ANOVA, so let
# us use the ANOVA procedure on the transformed data:
model.tr <- aov( log(population)~group, data=d )
summary( model.tr )

# Because the log function is one-to-one and onto, the conclusions
# regarding statistical significance you reach in the transformed
# space are applicable in the non-transformed space.

# We can also make our predictions, but we have to back-transform
# our predictions:
coefficients(model.tr)
pr <- predict( model.tr, newdata=data.frame(group=c("A","B","C","D")) )
exp(pr)

# Why did I use the exponential function? It is the inverse of the
# logarithm function. The predictions in `pr' are in log-units; they are
# in the transformed space. To get real predictions, we must get back
# into our original measurements. We do that by applying the inverse of
# the transformation function to the predictions.

##################################################
#
# Finally ...

# All of the multiple-comparison tests we covered last time were
# based (in some way) on the parametric t-test. There is a non-
# parametric alternative based on the Kruskal-Wallis test; the
# kruskal() function in the agricolae package runs it and also
# reports the pairwise group comparisons.
library(agricolae)
with(d, kruskal(population, group) )
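
##
# As a cross-check (an addition here, using only base R): kruskal.test()
# ships with R and performs the same omnibus Kruskal-Wallis test. It
# reports only the global p-value, not the pairwise groupings that
# agricolae's kruskal() gives, but it confirms the overall result:
kruskal.test(population ~ group, data=d)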