########################################
#
#  Script: 12 April 2011 (20110412.R)
#
########################################

# Today:
#
#   ** Analysis of variance (means)
#   ** Experimental design (blocking)
#


# Let us create a new dataset (Ex 15.1, page 960)

ins <- data.frame( matrix( c(1,1,56, 1,2,48, 1,3,66, 1,4,62, 
  2,1,83, 2,2,78, 2,3,94, 2,4,93,
  3,1,80, 3,2,72, 3,3,83, 3,4,85), ncol=3, byrow=T))
names(ins) <- c("Ins","Plot","Seedlings")
ins$Plot   <- as.factor(ins$Plot)
ins$Ins    <- as.factor(ins$Ins)
ins        <- data.frame(ins)


#### And now for something completely different...

# Note that the number of seedlings is the dependent variable
# Note that insecticide is the independent variable
# Note that the plot is a blocking variable


model1 <- aov(Seedlings ~ Ins + Plot, data=ins)
summary(model1)

# Is the insecticide an important variable in determing the
# number of seedlings?

# Yes (F = 211.39; df1 = 2, df2 = 6; p << 0.0001)


# What are the effect sizes of the three insecticides?

summary.lm(model1)

# For the insecticide, 
# Expected number of seedlings for Ins 1 = 56
# Expected number of seedlings for Ins 2 = 56+29 = 85
# Expected number of seedlings for Ins 3 = 56+22 = 78


# Is Insecticide 1 statistically worse than the other two?

# Yes. Comparing Ins1 to Ins2: p << 0.0001,
# and comparing Ins1 to Ins2: p << 0.0001


# Is Insecticide 2 significantly better than Ins3?

# Recall t = (29-22)/1.472. Thus, t=4.755. Thus, p <0.05

pt((29-22)/1.472,df=6, lower.tail=FALSE)*2

# The p-value is 0.00314



# Alternatively, we could have used a professor-defined function

source("http://courses.kvasaheim.com/stat40x3/R/set.base.R")

ins$Ins <- set.base(ins$Ins,'2', data=ins)
model2 <- aov(Seedlings ~ Ins + Plot, data=ins)
summary.lm(model2)


# From this, we can conclude the following order of insecticide
# efficacy: 2 > 3 > 1.



# Was the blocking necessary?

summary(model1)

# Yes, as the Plot variable was statistically significant (F=33.69,
# df1=3, df2=6, p=0.00037).