###########################################################################
# PUBLG100: Introduction to Quantitative Methods
#
# Week 9 Seminar: Binary models: Logit
#
#

# Set your working directory
#
# CAUTION: Make sure the directory you specify here matches the working directory on your computer.
#          We're using N:/PUBLG100 only for illustration purposes and it would only work if you're 
#          using a UCL dekstop. If you're using your own laptop, then replace N:/PUBLG100 with the 
#          appropriate directory (or folder)
setwd("N:/PUBLG100")


# Verify that your working directory is set correctly
getwd()


## ----message = FALSE-----------------------------------------------------
library(foreign) 
library(Zelig) 
library(texreg) 
library(lmtest)
library(dplyr) 

## ------------------------------------------------------------------------
rm(list = ls())

## ------------------------------------------------------------------------
bes <- read.dta("bes.dta")

## ----echo = FALSE--------------------------------------------------------
source("../common/knitr_hooks.R")

## ------------------------------------------------------------------------
bes$Gender <- factor(bes$Gender, levels = c(0, 1), labels = c("Female", "Male"))

## ------------------------------------------------------------------------
head(bes)

## ------------------------------------------------------------------------
bes <- filter(bes, 
              !is.na(Turnout), 
              !is.na(Income), 
              !is.na(polinfoindex), 
              !is.na(Gender), 
              !is.na(edu15), 
              !is.na(edu17), 
              !is.na(edu18), 
              !is.na(edu19plus), 
              !is.na(in_school), 
              !is.na(in_uni))

## ------------------------------------------------------------------------
model1 <- glm(Turnout ~ Income + polinfoindex + Gender + 
                edu15 + edu17 + edu18 + edu19plus + in_school + in_uni, 
              family = binomial(link = "logit"),
              data = bes)

screenreg(model1)

## ------------------------------------------------------------------------
table(bes$Turnout) 

## ----echo = FALSE--------------------------------------------------------
# THIS CODE IS HERE FOR DISPLAYING THE CONTENT ON THE COURSE WEBSITE. YOU DO NOT NEED TO WORRY ABOUT WHAT IT DOES
mean_round <- function(x, pct = 1) { sprintf("`%.2f%s`", mean(x)*pct, if_else(pct == 100, "%", "")) }

## ------------------------------------------------------------------------
predicted_probs <- predict(model1, type = "response")

## ------------------------------------------------------------------------
expected <- as.numeric(predicted_probs > 0.5)

## ------------------------------------------------------------------------
observed <- bes$Turnout

outcome <- table(observed,expected)
outcome

## ------------------------------------------------------------------------
(outcome[1,1] + outcome[2,2]) / sum(outcome)

## ------------------------------------------------------------------------
mean(bes$Turnout)

## ------------------------------------------------------------------------
model2 <- glm(Turnout ~ Income + polinfoindex + Influence + Gender + Age + 
                edu15 + edu17 + edu18 + edu19plus + in_school + in_uni, 
              family = binomial(link = "logit"), data = bes)

screenreg(list(model1, model2))

## ------------------------------------------------------------------------
lrtest(model1, model2)

## ------------------------------------------------------------------------
AIC(model1, model2)

BIC(model1, model2) 

## ------------------------------------------------------------------------
z.out <- zelig(Turnout ~ Income + polinfoindex + Influence + Gender + Age + 
                 edu15 + edu17 + edu18 + edu19plus + in_school + in_uni, 
               model = "logit", 
               data = bes)

## ----echo = FALSE--------------------------------------------------------
set.seed(666)

## ------------------------------------------------------------------------
x.male <- setx(z.out, 
               Gender = "Male", edu18 = 1, 
               Age = mean(bes$Age),
               Income = mean(bes$Income), 
               polinfoindex = mean(bes$polinfoindex), 
               Influence = mean(bes$Influence),  
               edu15 = 0, edu17 = 0, edu19plus = 0, in_school = 0, in_uni = 0)

x.female <- setx(z.out, Gender = "Female", edu18 = 1, 
                 Age = mean(bes$Age),
                 Income = mean(bes$Income), 
                 polinfoindex = mean(bes$polinfoindex), 
                 Influence = mean(bes$Influence),  
                 edu15 = 0, edu17 = 0, edu19plus = 0, in_school = 0, in_uni = 0)

s.out <- sim(z.out, x.male, x1 = x.female)
summary(s.out)

## ----echo = FALSE--------------------------------------------------------
# IGNORE THESE FUNCTIONS. They are used for generating course website
get_sim_out <- function(x, i, j) {
  sprintf("`%d%%`", round(mean(x$sim.out[[i]][[j]][[1]])*100))
}

get_margin_of_error <- function(x, i, j, s = "") {
  qi <- x$sim.out[[i]][[j]][[1]][,1]
  quantiles <- quantile(qi, c(0.025, 0.975))
  sprintf("`%d%s`", round(((quantiles[2] - quantiles[1])/2)*100), s)
}

get_ci_low <- function(x, i, j) {
  qi <- x$sim.out[[i]][[j]][[1]][,1]
  quantiles <- quantile(qi, c(0.025, 0.975))
  sprintf("`%d`", round(min(quantiles)*100))
}

get_ci_high <- function(x, i, j) {
  qi <- x$sim.out[[i]][[j]][[1]][,1]
  quantiles <- quantile(qi, c(0.025, 0.975))
  sprintf("`%d`", round(max(quantiles)*100))
}

## ------------------------------------------------------------------------
range(bes$Income)

## ----echo = FALSE--------------------------------------------------------
set.seed(666)

## ------------------------------------------------------------------------
x.male <- setx(z.out, 
               Gender = "Male", edu18 = 1, 
               Age = mean(bes$Age),
               Income = seq(1, 13), 
               polinfoindex = mean(bes$polinfoindex), 
               Influence = mean(bes$Influence),  
               edu15 = 0, edu17 = 0, edu19plus = 0, in_school = 0, in_uni = 0)

x.female <- setx(z.out, Gender = "Female", edu18 = 1, 
                 Age = mean(bes$Age),
                 Income = seq(1, 13), 
                 polinfoindex = mean(bes$polinfoindex), 
                 Influence = mean(bes$Influence),  
                 edu15 = 0, edu17 = 0, edu19plus = 0, in_school = 0, in_uni = 0)

s.out <- sim(z.out, x.male, x1 = x.female)

## ------------------------------------------------------------------------
ci.plot(s.out, 
        ci = 95,
        xlab = "Income", 
        ylab = "Predicted Probability of Voting",
        main = "Effect of Income by Gender")

# add labels manually
# NOTE: the x and y values below are just coordinates on the plot, you can change them based on 
#       where you want the labels placed
text(x = 3, y = .85, labels = "Women")  
text(x = 6, y = .75, labels = "Men")    

## ------------------------------------------------------------------------
bes %>%
  group_by(Gender) %>%
  summarise(avg_turnout = mean(Turnout), sd_turnout = sd(Turnout))