###########################################################################
# PUBLG100: Introduction to Quantitative Methods
#
# Week 9 Seminar: Binary models: Logit
#
#

# Change your working directory
setwd("N:/PUBLG100")

# Check your working directory
getwd()


## ----message=FALSE-------------------------------------------------------
library(Zelig) # load BEFORE dplyr!!!
library(foreign) # to load .dta files
library(lmtest) # for likelihood ratio test
library(texreg) # regression tables
library(dplyr) # data manipulation/ summary statistics

# clear environment
rm(list = ls())

## ------------------------------------------------------------------------
# load British post election study
bes <- read.dta("http://uclspp.github.io/PUBLG100/data/bes.dta")
head(bes)

# frequency table of voter turnout
table(bes$Turnout) 

# rename Gender to male b/c 1 = male & remove missings
bes <- bes %>%
  rename(male = Gender) %>%
  na.omit() 

## ------------------------------------------------------------------------
# mean and standard deviation for Turnout by gender
bes %>%
  group_by(male) %>%
  summarise(avg_turnout = mean(Turnout), sd_turnout = sd(Turnout))

# mean for multiple columns using "summarise_each"
bes %>% 
  group_by(male) %>%
  summarise_each(funs(mean), Turnout, Vote2001, Age, LeftrightSelf, 
                 CivicDutyIndex, polinfoindex)

## ------------------------------------------------------------------------
# logit model
model1 <- glm(Turnout ~ Income + polinfoindex + male + edu15 + edu17 + edu18 + 
                edu19plus + in_school + in_uni, family = binomial(link = "logit"),
              data = bes)

# regression output
screenreg(model1)

## ------------------------------------------------------------------------
# predicted probabilities for all respondents
predicted.probabilities <- predict(model1, type = "response")

## ------------------------------------------------------------------------
# threshold to translate predicted probabilities into outcomes
threshold <- .5 

## ------------------------------------------------------------------------
# set prediction to 1 if predicted probability is larger than 0.5 and put 0 otherwise
expected.values <- ifelse(predicted.probabilities > threshold, yes = 1, no = 0)

## ------------------------------------------------------------------------
 # actually observed outcomes
observed <- bes$Turnout

# putting observed outcomes and predicted outcomes into a table
outcome.table <- table(observed,expected.values)
outcome.table

# correctly predicted cases:
# (correct negatives + correct positives) / total number of outcomes
correctly.predicted <- (outcome.table[1,1] + outcome.table[2,2]) / sum(outcome.table)
correctly.predicted

# comparing rate of correctly predicted to naive guess
mean(bes$Turnout)

## ------------------------------------------------------------------------
# esimate the new model 2 including Influence and Age
model2 <- glm(Turnout ~ Income + polinfoindex + Influence + male + Age + 
                edu15 + edu17 + edu18 + edu19plus + in_school + in_uni, 
              family = binomial(link = "logit"), data = bes)

# regression table comparing model 1 and model 2
screenreg( list(model1, model2) )

## ------------------------------------------------------------------------
# the likelihood ratio test
lrtest(model1, model2)

# Akaike's Information Criterion
AIC(model1, model2)

# Bayesian Infromation Criterion
BIC(model1, model2) 

## ------------------------------------------------------------------------
# re-estimate model 2 using Zelig
z.m2 <- zelig(Turnout ~ Income + polinfoindex + Influence + male + Age + edu15 + 
                edu17 + edu18 + edu19plus + in_school + in_uni, model = "logit", 
              data = bes, cite = FALSE)

## ------------------------------------------------------------------------
# average man with 18 years of education
x.male.18edu <- setx(z.m2, male = 1, edu18 = 1, Income = mean(bes$Income), 
                     polinfoindex = mean(bes$polinfoindex), Influence = mean(bes$Influence),  
                     Age = mean(bes$Age), edu15 = 0, edu17 = 0,  edu19plus = 0, 
                     in_school = 0, in_uni = 0)

# check covariate values (if you have missings here, the simulation will not work)
t(x.male.18edu$values) 

# average woman with 18 years of education
x.female.18edu <- setx(z.m2, male = 0, edu18 = 1, Income = mean(bes$Income), 
                       polinfoindex = mean(bes$polinfoindex), Influence = mean(bes$Influence),  
                       Age = mean(bes$Age), edu15 = 0, edu17 = 0,  edu19plus = 0,
                       in_school = 0, in_uni = 0)

# check covariate values (if you have missings here, the simulation will not work)
t(x.female.18edu$values)

## ------------------------------------------------------------------------
# make simulation replicable, the values in set.seed() do not matter
set.seed(123)

# simulate with our two scenarios
s.out <- sim(z.m2, x = x.female.18edu, x1 = x.male.18edu)

# outcomes, check especially first differences
summary(s.out)

## ------------------------------------------------------------------------
# women with income levels from lowest to highest (notice we put education to the mode)
x.fem <- setx(z.m2, male = 0, Income = 1:13, polinfoindex = mean(bes$polinfoindex), 
              Influence = mean(bes$Influence), Age = mean(bes$Age), 
              edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0)

# men with income levels from lowest to highest (notice we put education to the mode)
x.mal <- setx(z.m2, male = 1, Income = 1:13, polinfoindex = mean(bes$polinfoindex), 
              Influence = mean(bes$Influence), Age = mean(bes$Age), 
              edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0)

## ------------------------------------------------------------------------
# simulation
s.out2 <- sim(z.m2, x = x.fem, x1 = x.mal)

## ------------------------------------------------------------------------
# final plot
plot.ci (s.out2, 
         ci = 95,
         xlab = "income", 
         ylab = "predicted probability of Voting",
         main = "effect of income by gender")

# add labels manually
text( x = 2, y = .75, labels = "women" )
text( x = 7, y = .68, labels = "men" )