######################################## # # Topic: Linear Regression # # # Script: 20101119.R # # Date: 19 November 2010 # ######################################## # Sets the wroking directroy for my directory. You would # either remove it or use another path. setwd("E:/STAT4073 Engineering/regression") # Read in the data from the file "corr-gdp.csv" # The 'header=TRUE' parameter tells R that the # first row contains variable names gd <- read.csv("corr-gdpcap.csv", header=TRUE) attach(gd) # This tells you the names of the variables in # the data set names(gd) # Now, perform some linear regression. The 'glm' command # requires two parameters: the research formula and the # data set to be used. # The research formula starts with the dependent # variable (gdpcap), is followed by the tilde, which # is followed by the independent variables, separated # by plus signs. # Save the results in the variable 'm1' so that you can use # the results in the future. m1 <- glm( gdpcap ~ hig , data=gd ) # Now, let us see the regression results summary(m1) # Let us add the OPEC variable to the research formula m2 <- glm( gdpcap ~ hig + OPEC, data=gd ) summary(m2) # Let us now add the categorical variable 'region' to the # research formula. This allows us to compare the gdp values # in different regions of the world. m3 <- glm( gdpcap ~ hig + OPEC + region, data=gd ) summary(m3) # Note that 'regionAfrica' was not included in the regression # results. Africa is the base region because it comes first in # the alphabet, compared to the other regions. # To use other regions as bases, you will be unable to use this # convenience function, and will have to do what all the other # statistical programs require you to do: create a series of # dichotomous indicator variables. # Like this: africa <- gd$region=="Africa" eastern <- gd$region=="Eastern" islamic <- gd$region=="Islamic" latamer <- gd$region=="Latin America" western <- gd$region=="Western" other <- gd$region=="Other" # If you did everything right, the folowing should give the same # results as m3. (Why?) m4 <- glm( gdpcap ~ hig + OPEC + eastern + islamic + latamer + western + other, data=gd ) summary(m4) summary(m3) # Now, to use whatever region as the base, just leave it out of the research # formula and add all of the others. For instance, using Western as the base: m5 <- glm( gdpcap ~ hig + OPEC + africa + eastern + islamic + latamer + other, data=gd ) summary(m5) # Thankfully, we see Africa is statistically significant and that the estimate # is familiar. (To what and why?) # We also see that the West has a significantly higher gdpcap than Other. Why do we # know that? # What would you have to do to compare Latin America with Islamic countries? ############################## # # Extension: Graphing # # A picture is worth a thousand words because pictures show a lot of information # in a little area. BUT, this is only true if the picture is helpful. Here are # some helpful plots. For each, you should be able to state what is being expressed # standard, default scatterplot plot(gdpcap ~ hig, data=gd) # Fancied up a bit plot(gdpcap ~ hig, data=gd, xlab="Honest in Government Index", ylab="GDP per capita", xlim=c(0,10), bty="L") # Indicate the Islamic countries on the previous graph: points(gdpcap~hig, data=gd, subset=(which(region=="Islamic")), pch=16, col=3 ) # Draw the line of best fit for model m4 on the previous graph abline(m4, lwd=2, col=2) # only works because hig is the first variable in the model abline(coef = coef(m4)[c(1,2)] , col=2) # this works in general (1 = intercept, 2 = hig) # Indicate the OPEC countries on the graph points(gdpcap~hig, data=gd, subset=(which(OPEC==1)), pch=2, col=4, lwd=2 ) # Where is the United States? points(gdpcap~hig, data=gd, subset=(which(scode2=="us")), pch=16, col=14 ) ##### # Question: How would you fit the model only for the Western countries? m5w <- glm( gdpcap ~ hig + OPEC + africa + eastern + islamic + latamer + other, data=gd, subset=(region=="Western") ) summary(m5w) # What do the NAs mean? #Let us plot this model plot(gdpcap ~ hig, data=gd, xlab="Honest in Government Index", ylab="GDP per capita", xlim=c(0,10), bty="L") abline(coef = coef(m5w)[c(1,2)] , col=2, lwd=2) abline(coef = coef(m4)[c(1,2)] , col=5, lwd=2) # What does that plot tell us? # Let's do the same, but with Africa m5a <- glm( gdpcap ~ hig + OPEC, data=gd, subset=(region=="Africa") ) summary(m5a) abline(coef = coef(m5a)[c(1,2)] , col=4, lwd=2) # Now, what does *that* plot tell us? What question(s) is (are) answered by that plot?