###########################################################################
# PUBLG100: Introduction to Quantitative Methods
#
# Week 9 Solutions: Binary models: Logit
#
#

## ----message = FALSE-----------------------------------------------------
# clearing workspace
rm(list = ls())

# loading packages we need
library(Zelig) # load BEFORE dplyr!!!
library(foreign) # to load .dta files
library(lmtest) # for likelihood ratio test
library(texreg) # regression tables
library(dplyr) # data manipulation/ summary statistics

# load data
bes <- read.dta("http://uclspp.github.io/PUBLG100/data/bes.dta")

# selecting variables we need
bes <- bes %>%
  select(turnout = Turnout,
         income = Income,
         age = Age,
         male = Gender,
         partyid = PartyID,
         influence = Influence,
         civicduty = CivicDutyScores,
         polinfoindex,
         edu15,
         edu16,
         edu17,
         edu18,
         edu19plus,
         in_school,
         in_uni) %>%
  na.omit()

# check if there are missings left
summary(bes)

# frequency table
table(bes$partyid)

# histogram
hist(bes$civicduty, main = "Distribution of Civic Duty",
     xlab = "Level of Civic Duty (larger values more feeling of duty)",
     ylab = "Frequency")

## ------------------------------------------------------------------------
# histogram Income
hist(bes$income, main = "Distribution of Income",
     xlab = "Income Level",
     ylab = "Frequency")

# histogram polinfoindex
hist(bes$polinfoindex, main = "Distribution of Knowledge of Politics",
     xlab = "Level of Knowledge",
     ylab = "Frequency",
     breaks = seq(0, 8, 1))

## ------------------------------------------------------------------------
# histogram influence
hist(bes$influence, main = "Distribution of Influence",
     xlab = "Influence Level",
     ylab = "Frequency")

## ------------------------------------------------------------------------
# logistic regression expanding on model 2 from the seminar (adds civicduty and partyid)
model.full <- glm(turnout ~ income + polinfoindex + influence + civicduty + partyid + 
                    male + age +edu15 + edu17 + edu18 + edu19plus + in_school + 
                    in_uni, family = binomial(link = "logit"), data = bes)

## ------------------------------------------------------------------------
# re-estimate model 2 from the seminar
model2 <- glm(turnout ~ income + polinfoindex + influence + male + age + edu15 + edu17 + 
                edu18 + edu19plus + in_school + in_uni, 
              family = binomial(link = "logit"), data = bes)

# likelihood ratio test
lrtest(model2, model.full)

## ------------------------------------------------------------------------
# predicted probabilities for all observations; full model
predicted.probs.full <- predict.glm(model.full, type = "response")

# predicted probabilities for all observations; model 2
predicted.probs.m2 <- predict.glm(model2, type = "response")

# expected values full model
expected.vals.full <- ifelse(predicted.probs.full > 0.5, yes = 1, no = 0)

# expected values model 2
expected.vals.m2 <- ifelse(predicted.probs.m2 > 0.5, yes = 1, no = 0)

# observed outcomes 
outcomes <- bes$turnout

# outcome table full model
outcome.table.full <- table(outcomes, expected.vals.full)
outcome.table.full

## percentage correctly predicted full model
# take the diagonal of the table (the correctly predicted cases)
# sum over the diagonal values and divide by the number of cases
sum( diag(outcome.table.full) ) / sum(outcome.table.full)

# outcome table model 2
outcome.table.m2 <- table(outcomes, expected.vals.m2)
outcome.table.m2

## percentage correctly predicted model 2
# take the diagonal of the table (the correctly predicted cases)
# sum over the diagonal values and divide by the number of cases
sum( diag(outcome.table.m2) ) / sum(outcome.table.m2)

## ------------------------------------------------------------------------
# cross-table comparing expected values and observed outcomes
outcome.table.full

## ------------------------------------------------------------------------
# expected values full model; cut-off 0.7
expected.vals.07 <- ifelse(predicted.probs.full > 0.7, yes = 1, no = 0)

# outcome table full model; cut-off 0.7
outcome.table.07 <- table(outcomes, expected.vals.07)
outcome.table.07

# percentage correctly predicted full model; cut-off 0.7
sum(diag(outcome.table.07)) / sum(outcome.table.07)


## ------------------------------------------------------------------------
# estimate full model with zelig
z.full <- zelig(turnout ~ income + polinfoindex + influence + civicduty
                + partyid + male + age +edu15 + edu17 + edu18 + edu19plus 
                + in_school + in_uni, model = "logit", data = bes, cite = FALSE)

# set covariates no identification with a party
no.party.id <- setx(z.full, partyid = 0, income = mean(bes$income), 
                    polinfoindex = mean(bes$polinfoindex), influence = mean(bes$influence),
                    civicduty = mean(bes$civicduty), male = 0, age = mean(bes$age), 
                    edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0)

# check values we set
t(no.party.id$values)

# set covariates person who identifies with a party
yes.party.id <- setx(z.full, partyid = 1, income = mean(bes$income), 
                    polinfoindex = mean(bes$polinfoindex), influence = mean(bes$influence),
                    civicduty = mean(bes$civicduty), male = 0, age = mean(bes$age), 
                    edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0)

# check values we set
t(yes.party.id$values)

# make results replicable
set.seed(123)

# simulate
s.out <- sim(z.full, x = no.party.id, x1 = yes.party.id)

# check results 
summary(s.out)

## ------------------------------------------------------------------------
screenreg(list (model2, model.full), 
          custom.model.names = c("Model 2", "Full Final Model"))

## ------------------------------------------------------------------------
# model 2 plus partid but excluding civicduty
m.partyid <- glm(turnout ~ income + polinfoindex + influence + male + age + edu15 + edu17 + 
                   edu18 + edu19plus + in_school + in_uni + partyid, 
                 family = binomial(link = "logit"), data = bes)

# model 2 plus civicduty but excluding partyid
m.civicduty <- glm(turnout ~ income + polinfoindex + influence + male + age + edu15 + edu17 + 
                     edu18 + edu19plus + in_school + in_uni + civicduty, 
                   family = binomial(link = "logit"), data = bes)

# table with all models
screenreg( list(model2, m.partyid, m.civicduty, model.full), custom.model.names = 
             c("Model 2", "M2 + PartyID", "M2 + Civic Duty", "Final Model") )

# check the the correlation between civic duty and influence
cor.test(bes$civicduty, bes$influence, mu = 0, use = "complete.obs" )

## ------------------------------------------------------------------------
# summary of variables of interest
summary( bes[, c("income", "polinfoindex")] )

# income for 1st quartile: 4
x.is.low.income <- setx(z.full, income = 4, polinfoindex = mean(bes$polinfoindex), 
                        influence = mean(bes$influence), civicduty = mean(bes$civicduty), 
                        partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0,
                        edu19plus = 0, in_school = 0, in_uni = 0)

# income for 3rd quartile: 7
x.is.high.income <- setx(z.full, income = 7, polinfoindex = mean(bes$polinfoindex), 
                        influence = mean(bes$influence), civicduty = mean(bes$civicduty), 
                        partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0,
                        edu19plus = 0, in_school = 0, in_uni = 0)

# simulation
income.sim <- sim(z.full, x = x.is.low.income, x1 = x.is.high.income)

# outcome
summary(income.sim)

## ------------------------------------------------------------------------
# polinfoindex for 1st quartile: 4
x.is.low.knowledge <- setx(z.full, polinfoindex = 4, income = mean(bes$income), 
                           influence = mean(bes$influence), civicduty = mean(bes$civicduty), 
                           partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0,
                           edu19plus = 0, in_school = 0, in_uni = 0)

# polinfoindex for 3rd quartile: 7
x.is.high.knowledge <- setx(z.full, polinfoindex = 7, income = mean(bes$income), 
                            influence = mean(bes$influence), civicduty = mean(bes$civicduty), 
                            partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0,
                            edu19plus = 0, in_school = 0, in_uni = 0)

# simulation
knowledge.sim <- sim(z.full, x = x.is.low.knowledge, x1 = x.is.high.knowledge)

# outcome
summary(knowledge.sim)

## ------------------------------------------------------------------------
# income varying from lowest to highest ceteris paribus (all other variables constant)
x.income.seq <- setx(z.full, income = 1:13, polinfoindex = mean(bes$polinfoindex), 
                        influence = mean(bes$influence), civicduty = mean(bes$civicduty), 
                        partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0,
                        edu19plus = 0, in_school = 0, in_uni = 0)

# polinfoindex varying from lowest to highest ceteris paribus (all other variables constant)
x.polinfoindex.seq <- setx(z.full,  polinfoindex = 1:13, income = mean(bes$income), 
                        influence = mean(bes$influence), civicduty = mean(bes$civicduty), 
                        partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0,
                        edu19plus = 0, in_school = 0, in_uni = 0)

# combination of both resource variables from lowest to highest all else equal
x.resource.seq <- setx(z.full,  polinfoindex = 1:13, income = 1:13, 
                       influence = mean(bes$influence), civicduty = mean(bes$civicduty), 
                       partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0,
                       edu19plus = 0, in_school = 0, in_uni = 0)

# simulations
sim.income.seq <- sim(z.full, x = x.income.seq)
sim.polinfoindex.seq <- sim(z.full, x = x.polinfoindex.seq)
sim.resource.seq <- sim(z.full, x = x.resource.seq)

# plot income from lowest to highest
plot.ci (sim.income.seq, 
         ci = 95,
         xlab = "income", 
         ylab = "predicted probability of Voting",
         main = "effect of income on turnout")

# plot polinfoindex from lowest to highest
plot.ci (sim.polinfoindex.seq, 
         ci = 95,
         xlab = "knowledge about politics", 
         ylab = "predicted probability of Voting",
         main = "effect of knowledge about politics on turnout")

# plot resource variables from lowest to highest
plot.ci (sim.resource.seq, 
         ci = 95,
         ylab = "predicted probability of Voting",
         main = "effect of knowledge by income levels")

## ------------------------------------------------------------------------
cor.test(bes$income, bes$polinfoindex, mu = 0, use = "complete.obs")

## ------------------------------------------------------------------------
# varying civic duty all else at appropriate measure of central tendency
x.civicduty.seq <- setx(z.full, civicduty = seq(min(bes$civicduty), max(bes$civicduty), 0.01), 
                        polinfoindex = mean(bes$polinfoindex), income = mean(bes$income), 
                        influence = mean(bes$influence),  partyid = 1, male = 0, 
                        age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0,
                        edu19plus = 0, in_school = 0, in_uni = 0)


# simulation for civic duty
sim.civicduty <- sim(z.full, x.civicduty.seq)

# plot effect of civic duty
plot.ci (sim.civicduty, 
         ci = 95,
         ylab = "predicted probability of Voting",
         main = "effect of civic duty on turnout")