########################################
#                                      #
# Topic: Generalized Linear Models     #
#                                      #
# Script: 20101122.R                   #
#                                      #
# Date: 22 November 2010               #
########################################

# Read in the data from the file "crime.csv".
# The 'header = TRUE' parameter tells R that the
# first row contains variable names.
gd <- read.csv(
  "http://oforsber.kvasaheim.com/courses/stat4073/data/crime.csv",
  header = TRUE
)

# NOTE: the data frame is deliberately NOT attach()ed. Every model below
# receives it through the 'data' argument instead, so variable lookup is
# explicit and cannot be masked by (or mask) objects in the workspace.

# This tells you the names of the variables in the data set
names(gd)

# Let us now try to determine how much the violent crime rate
# in 1990 is dependent on the wealth of the state in 1990 (gsp90),
# the number of citizens' initiatives used (inituse), the inherent
# level of republicanism in the state (repub), and the total
# population in 1990 (pop90).

# First, a linear model (which assumes Normality). With no 'family'
# given, glm() uses gaussian with the identity link.
m1 <- glm(vcrime90 ~ gsp90 + inituse + repub + pop90, data = gd)
summary(m1)

# How would you determine if the Normality assumption was violated?

# Second, a model that still assumes Normality, but uses a log link
# (as the dependent variable is bounded below). Note that the link is
# applied to the *mean* of the response; the response itself is not
# transformed.
m2 <- glm(
  vcrime90 ~ gsp90 + inituse + repub + pop90,
  family = gaussian(link = "log"),
  data = gd
)
summary(m2)

# Now, let's predict initiative use from these four variables
m3 <- glm(
  inituse ~ gsp90 + vcrime90 + repub + pop90,
  family = poisson(link = "log"),
  data = gd
)
summary(m3)

# Note that inituse is a count variable. Thus, we use a different family
# and a different link function (why the log link?)

# Finally, let us predict 'South-ness' from the other four variables. In
# reality this would not make sense, but I wanted to demonstrate how one
# would model a binary dependent variable. The left-hand side is a logical
# vector: TRUE when census4 equals "South", FALSE otherwise.
m4 <- glm(
  census4 == "South" ~ gsp90 + inituse + repub + pop90,
  family = binomial(link = "logit"),
  data = gd
)
summary(m4)

# Here, you are actually just modeling the underlying probability that a
# state is in the South. The link takes that 0-1 bounded probability and
# unbounds it. Other links available for the binomial family in R are
# probit, cauchit, log, and cloglog. Of these, only the logit, probit, and
# cauchit are symmetric. Also, of these, only the logit and probit are
# frequently used.