########################################################################### # PUBLG100: Introduction to Quantitative Methods # # Week 9 Solutions: Binary models: Logit # # ## ----message = FALSE----------------------------------------------------- # clearing workspace rm(list = ls()) # loading packages we need library(Zelig) # load BEFORE dplyr!!! library(foreign) # to load .dta files library(lmtest) # for likelihood ratio test library(texreg) # regression tables library(dplyr) # data manipulation/ summary statistics # load data bes <- read.dta("http://uclspp.github.io/PUBLG100/data/bes.dta") # selecting variables we need bes <- bes %>% select(turnout = Turnout, income = Income, age = Age, male = Gender, partyid = PartyID, influence = Influence, civicduty = CivicDutyScores, polinfoindex, edu15, edu16, edu17, edu18, edu19plus, in_school, in_uni) %>% na.omit() # check if there are missings left summary(bes) # frequency table table(bes$partyid) # histogram hist(bes$civicduty, main = "Distribution of Civic Duty", xlab = "Level of Civic Duty (larger values more feeling of duty)", ylab = "Frequency") ## ------------------------------------------------------------------------ # histogram Income hist(bes$income, main = "Distribution of Income", xlab = "Income Level", ylab = "Frequency") # histogram polinfoindex hist(bes$polinfoindex, main = "Distribution of Knowledge of Politics", xlab = "Level of Knowledge", ylab = "Frequency", breaks = seq(0, 8, 1)) ## ------------------------------------------------------------------------ # histogram influence hist(bes$influence, main = "Distribution of Influence", xlab = "Influence Level", ylab = "Frequency") ## ------------------------------------------------------------------------ # logistic regression expanding on model 2 from the seminar (adds civicduty and partyid) model.full <- glm(turnout ~ income + polinfoindex + influence + civicduty + partyid + male + age +edu15 + edu17 + edu18 + edu19plus + in_school + in_uni, family = binomial(link = "logit"), data = bes) ## ------------------------------------------------------------------------ # re-estimate model 2 from the seminar model2 <- glm(turnout ~ income + polinfoindex + influence + male + age + edu15 + edu17 + edu18 + edu19plus + in_school + in_uni, family = binomial(link = "logit"), data = bes) # likelihood ratio test lrtest(model2, model.full) ## ------------------------------------------------------------------------ # predicted probabilities for all observations; full model predicted.probs.full <- predict.glm(model.full, type = "response") # predicted probabilities for all observations; model 2 predicted.probs.m2 <- predict.glm(model2, type = "response") # expected values full model expected.vals.full <- ifelse(predicted.probs.full > 0.5, yes = 1, no = 0) # expected values model 2 expected.vals.m2 <- ifelse(predicted.probs.m2 > 0.5, yes = 1, no = 0) # observed outcomes outcomes <- bes$turnout # outcome table full model outcome.table.full <- table(outcomes, expected.vals.full) outcome.table.full ## percentage correctly predicted full model # take the diagonal of the table (the correctly predicted cases) # sum over the diagonal values and divide by the number of cases sum( diag(outcome.table.full) ) / sum(outcome.table.full) # outcome table model 2 outcome.table.m2 <- table(outcomes, expected.vals.m2) outcome.table.m2 ## percentage correctly predicted model 2 # take the diagonal of the table (the correctly predicted cases) # sum over the diagonal values and divide by the number of cases sum( diag(outcome.table.m2) ) / sum(outcome.table.m2) ## ------------------------------------------------------------------------ # cross-table comparing expected values and observed outcomes outcome.table.full ## ------------------------------------------------------------------------ # expected values full model; cut-off 0.7 expected.vals.07 <- ifelse(predicted.probs.full > 0.7, yes = 1, no = 0) # outcome table full model; cut-off 0.7 outcome.table.07 <- table(outcomes, expected.vals.07) outcome.table.07 # percentage correctly predicted full model; cut-off 0.7 sum(diag(outcome.table.07)) / sum(outcome.table.07) ## ------------------------------------------------------------------------ # estimate full model with zelig z.full <- zelig(turnout ~ income + polinfoindex + influence + civicduty + partyid + male + age +edu15 + edu17 + edu18 + edu19plus + in_school + in_uni, model = "logit", data = bes, cite = FALSE) # set covariates no identification with a party no.party.id <- setx(z.full, partyid = 0, income = mean(bes$income), polinfoindex = mean(bes$polinfoindex), influence = mean(bes$influence), civicduty = mean(bes$civicduty), male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0) # check values we set t(no.party.id$values) # set covariates person who identifies with a party yes.party.id <- setx(z.full, partyid = 1, income = mean(bes$income), polinfoindex = mean(bes$polinfoindex), influence = mean(bes$influence), civicduty = mean(bes$civicduty), male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0) # check values we set t(yes.party.id$values) # make results replicable set.seed(123) # simulate s.out <- sim(z.full, x = no.party.id, x1 = yes.party.id) # check results summary(s.out) ## ------------------------------------------------------------------------ screenreg(list (model2, model.full), custom.model.names = c("Model 2", "Full Final Model")) ## ------------------------------------------------------------------------ # model 2 plus partid but excluding civicduty m.partyid <- glm(turnout ~ income + polinfoindex + influence + male + age + edu15 + edu17 + edu18 + edu19plus + in_school + in_uni + partyid, family = binomial(link = "logit"), data = bes) # model 2 plus civicduty but excluding partyid m.civicduty <- glm(turnout ~ income + polinfoindex + influence + male + age + edu15 + edu17 + edu18 + edu19plus + in_school + in_uni + civicduty, family = binomial(link = "logit"), data = bes) # table with all models screenreg( list(model2, m.partyid, m.civicduty, model.full), custom.model.names = c("Model 2", "M2 + PartyID", "M2 + Civic Duty", "Final Model") ) # check the the correlation between civic duty and influence cor.test(bes$civicduty, bes$influence, mu = 0, use = "complete.obs" ) ## ------------------------------------------------------------------------ # summary of variables of interest summary( bes[, c("income", "polinfoindex")] ) # income for 1st quartile: 4 x.is.low.income <- setx(z.full, income = 4, polinfoindex = mean(bes$polinfoindex), influence = mean(bes$influence), civicduty = mean(bes$civicduty), partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0) # income for 3rd quartile: 7 x.is.high.income <- setx(z.full, income = 7, polinfoindex = mean(bes$polinfoindex), influence = mean(bes$influence), civicduty = mean(bes$civicduty), partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0) # simulation income.sim <- sim(z.full, x = x.is.low.income, x1 = x.is.high.income) # outcome summary(income.sim) ## ------------------------------------------------------------------------ # polinfoindex for 1st quartile: 4 x.is.low.knowledge <- setx(z.full, polinfoindex = 4, income = mean(bes$income), influence = mean(bes$influence), civicduty = mean(bes$civicduty), partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0) # polinfoindex for 3rd quartile: 7 x.is.high.knowledge <- setx(z.full, polinfoindex = 7, income = mean(bes$income), influence = mean(bes$influence), civicduty = mean(bes$civicduty), partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0) # simulation knowledge.sim <- sim(z.full, x = x.is.low.knowledge, x1 = x.is.high.knowledge) # outcome summary(knowledge.sim) ## ------------------------------------------------------------------------ # income varying from lowest to highest ceteris paribus (all other variables constant) x.income.seq <- setx(z.full, income = 1:13, polinfoindex = mean(bes$polinfoindex), influence = mean(bes$influence), civicduty = mean(bes$civicduty), partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0) # polinfoindex varying from lowest to highest ceteris paribus (all other variables constant) x.polinfoindex.seq <- setx(z.full, polinfoindex = 1:13, income = mean(bes$income), influence = mean(bes$influence), civicduty = mean(bes$civicduty), partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0) # combination of both resource variables from lowest to highest all else equal x.resource.seq <- setx(z.full, polinfoindex = 1:13, income = 1:13, influence = mean(bes$influence), civicduty = mean(bes$civicduty), partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0) # simulations sim.income.seq <- sim(z.full, x = x.income.seq) sim.polinfoindex.seq <- sim(z.full, x = x.polinfoindex.seq) sim.resource.seq <- sim(z.full, x = x.resource.seq) # plot income from lowest to highest plot.ci (sim.income.seq, ci = 95, xlab = "income", ylab = "predicted probability of Voting", main = "effect of income on turnout") # plot polinfoindex from lowest to highest plot.ci (sim.polinfoindex.seq, ci = 95, xlab = "knowledge about politics", ylab = "predicted probability of Voting", main = "effect of knowledge about politics on turnout") # plot resource variables from lowest to highest plot.ci (sim.resource.seq, ci = 95, ylab = "predicted probability of Voting", main = "effect of knowledge by income levels") ## ------------------------------------------------------------------------ cor.test(bes$income, bes$polinfoindex, mu = 0, use = "complete.obs") ## ------------------------------------------------------------------------ # varying civic duty all else at appropriate measure of central tendency x.civicduty.seq <- setx(z.full, civicduty = seq(min(bes$civicduty), max(bes$civicduty), 0.01), polinfoindex = mean(bes$polinfoindex), income = mean(bes$income), influence = mean(bes$influence), partyid = 1, male = 0, age = mean(bes$age), edu15 = 1, edu17 = 0, edu18 = 0, edu19plus = 0, in_school = 0, in_uni = 0) # simulation for civic duty sim.civicduty <- sim(z.full, x.civicduty.seq) # plot effect of civic duty plot.ci (sim.civicduty, ci = 95, ylab = "predicted probability of Voting", main = "effect of civic duty on turnout")