########################################
#
#	Topic: Linear Regression
#
#
#	Script: 20101119.R
#
#	Date: 19 November 2010
#
########################################

# Sets the wroking directroy for my directory. You would 
# either remove it or use another path.

setwd("E:/STAT4073 Engineering/regression")


# Read in the data from the file "corr-gdp.csv"
# The 'header=TRUE' parameter tells R that the 
# first row contains variable names

gd <- read.csv("corr-gdpcap.csv", header=TRUE)
attach(gd)

# This tells you the names of the variables in 
# the data set

names(gd)


# Now, perform some linear regression. The 'glm' command 
# requires two parameters: the research formula and the 
# data set to be used. 

# The research formula starts with the dependent
# variable (gdpcap), is followed by the tilde, which
# is followed by the independent variables, separated
# by plus signs.

# Save the results in the variable 'm1' so that you can use 
# the results in the future.

m1 <- glm( gdpcap ~ hig , data=gd )


# Now, let us see the regression results

summary(m1)


# Let us add the OPEC variable to the research formula

m2 <- glm( gdpcap ~ hig + OPEC, data=gd )
summary(m2)


# Let us now add the categorical variable 'region' to the 
# research formula. This allows us to compare the gdp values
# in different regions of the world.

m3 <- glm( gdpcap ~ hig + OPEC + region, data=gd )
summary(m3)

# Note that 'regionAfrica' was not included in the regression
# results. Africa is the base region because it comes first in
# the alphabet, compared to the other regions.

# To use other regions as bases, you will be unable to use this
# convenience function, and will have to do what all the other
# statistical programs require you to do: create a series of
# dichotomous indicator variables.

# Like this:

africa  <- gd$region=="Africa"
eastern <- gd$region=="Eastern"
islamic <- gd$region=="Islamic"
latamer <- gd$region=="Latin America"
western <- gd$region=="Western"
other   <- gd$region=="Other"


# If you did everything right, the folowing should give the same
# results as m3. (Why?)

m4 <- glm( gdpcap ~ hig + OPEC + eastern + islamic + latamer + western + other, data=gd )
summary(m4)
summary(m3)


# Now, to use whatever region as the base, just leave it out of the research
# formula and add all of the others. For instance, using Western as the base:

m5 <- glm( gdpcap ~ hig + OPEC + africa + eastern + islamic + latamer + other, data=gd )
summary(m5)

# Thankfully, we see Africa is statistically significant and that the estimate
# is familiar. (To what and why?)

# We also see that the West has a significantly higher gdpcap than Other. Why do we
# know that?

# What would you have to do to compare Latin America with Islamic countries?





##############################
#
# Extension: Graphing
#


# A picture is worth a thousand words because pictures show a lot of information
# in a little area. BUT, this is only true if the picture is helpful. Here are 
# some helpful plots. For each, you should be able to state what is being expressed


# standard, default scatterplot
plot(gdpcap ~ hig, data=gd)


# Fancied up a bit
plot(gdpcap ~ hig, data=gd, xlab="Honest in Government Index", ylab="GDP per capita", xlim=c(0,10), bty="L")


# Indicate the Islamic countries on the previous graph:
points(gdpcap~hig, data=gd, subset=(which(region=="Islamic")), pch=16, col=3 )


# Draw the line of best fit for model m4 on the previous graph
abline(m4, lwd=2, col=2)		    # only works because hig is the first variable in the model
abline(coef = coef(m4)[c(1,2)] , col=2) # this works in general (1 = intercept, 2 = hig)


# Indicate the OPEC countries on the graph
points(gdpcap~hig, data=gd, subset=(which(OPEC==1)), pch=2, col=4, lwd=2 )


# Where is the United States?
points(gdpcap~hig, data=gd, subset=(which(scode2=="us")), pch=16, col=14 )



#####
# Question: How would you fit the model only for the Western countries?

m5w <- glm( gdpcap ~ hig + OPEC + africa + eastern + islamic + latamer + other, data=gd, subset=(region=="Western") )
summary(m5w)

# What do the NAs mean?

#Let us plot this model 
plot(gdpcap ~ hig, data=gd, xlab="Honest in Government Index", ylab="GDP per capita", xlim=c(0,10), bty="L")
abline(coef = coef(m5w)[c(1,2)] , col=2, lwd=2)
abline(coef = coef(m4)[c(1,2)] , col=5, lwd=2)

# What does that plot tell us?


# Let's do the same, but with Africa
m5a <- glm( gdpcap ~ hig + OPEC, data=gd, subset=(region=="Africa") )
summary(m5a)
abline(coef = coef(m5a)[c(1,2)] , col=4, lwd=2)

# Now, what does *that* plot tell us? What question(s) is (are) answered by that plot?