###########################################################################
# PUBLG100: Introduction to Quantitative Methods
#
# Week 7 Solutions: Interactions
#
#

## ----message=FALSE-------------------------------------------------------
library(foreign) # to load .dta files
library(texreg) # to print nice regression tables
library(Zelig) # to model & illustrate uncertainty

## ------------------------------------------------------------------------
# clear workspace
rm(list = ls())

# load data set from seminar page
data.set <- read.dta("http://uclspp.github.io/PUBLG100/data/QoG2012.dta")

# exercise subset
exercise.subset <- data.set[, c("undp_hdi", "wdi_gdpc")]

# removing NA's on our two variables of interest
exercise.subset <- na.omit(exercise.subset)

# running regression with squared GDP/captia
exercise1.reg <- zelig(undp_hdi ~ wdi_gdpc + I(wdi_gdpc^2), model = "ls",
                       data = exercise.subset, cite = FALSE)

screenreg(exercise1.reg)

## ------------------------------------------------------------------------
## effect of an increase from $5,000 to $15,000
# set.seed() makes your results replicable, it does not 
# matter which values you put inside
set.seed(123)

# setting covariates
set.gdp5000 <- setx(exercise1.reg, wdi_gdpc = 5000)
set.gdp15000 <- setx(exercise1.reg, wdi_gdpc = 15000)

# simulating
sim.5000.to.15000 <- sim(exercise1.reg, x = set.gdp5000, x1 = set.gdp15000)

# checking for the first differences
summary(sim.5000.to.15000)

## effect of an increase from 25,000 to 35,000
# setting covariates
set.gdp25000 <- setx(exercise1.reg, wdi_gdpc = 25000)
set.gdp35000 <- setx(exercise1.reg, wdi_gdpc = 35000)

# simulating
sim.25000.to.35000 <- sim(exercise1.reg, x = set.gdp25000, x1 = set.gdp35000)

# checking for the first differences
summary(sim.25000.to.35000)

# checking summary statistics of wealth
summary(exercise.subset$wdi_gdpc)

# checking 25th percentile and 75th percentile
quantile(exercise.subset$wdi_gdpc, probs = c(0.25, 0.75))

# looking at the distribution of wealth
hist(exercise.subset$wdi_gdpc)

# checking summary statistic of the dependent variable
summary(exercise.subset$undp_hdi)
diff(range(exercise.subset$undp_hdi))

# the effect of increasing from $5000 to $15000
summary(sim.5000.to.15000)
# calculating the effect in percentage terms (compared to the range of DV)
0.182 / diff(range(exercise.subset$undp_hdi))

# the effect of increasing from $25000 to $35000
summary(sim.25000.to.35000)
# calculating the effect in percentage (compared to range of DV)
0.041 / diff(range(exercise.subset$undp_hdi))

## ------------------------------------------------------------------------
# clear workspace
rm(list = ls())

# load data set
data.set <- read.dta("http://uclspp.github.io/PUBLG100/data/QoG2012.dta")

# subset for analysis
exercise2.subset <- data.set[, c("wbgi_pse", "h_j", "former_col")]

# remove missings from subset
exercise2.subset <- na.omit(exercise2.subset)

# estimate model without interaction
base.model <- zelig(wbgi_pse ~ h_j + former_col, model = "ls",
                    data = exercise2.subset, cite = FALSE)

# estimate model including the interaction
interaction.model <- zelig(wbgi_pse ~ h_j + former_col + h_j*former_col, 
                           model = "ls", data = exercise2.subset, cite = FALSE)

# comparing the models
screenreg(list(base.model, interaction.model))

# F-test
anova(base.model$result, interaction.model$result)

## ------------------------------------------------------------------------
# summary statistics of the dependent variable
summary(exercise2.subset$wbgi_pse)

# range of the dependnet variable
diff(range(exercise2.subset$wbgi_pse))

# percentage change in pol. stability of having indep. judiciary if NOT colonised
0.24 / diff(range(exercise2.subset$wbgi_pse))

# percentage change in pol. stability of having indep. judiciary if colonised
0.10 / diff(range(exercise2.subset$wbgi_pse))

## ------------------------------------------------------------------------
# clear workspace
rm(list = ls())

# load data set
data.set <- read.dta("http://uclspp.github.io/PUBLG100/data/QoG2012.dta")

# subset for analysis
exercise3.subset <- data.set[, c("undp_hdi", "h_j", "wbgi_cce")]

# remove missings
exercise3.subset <- na.omit(exercise3.subset)

# estimate model without interaction
base.model <- zelig(undp_hdi ~ wbgi_cce + h_j, model = "ls", data = exercise3.subset,
                    cite = FALSE)

# estimate interaction model
interaction.model <- zelig(undp_hdi ~ wbgi_cce + h_j + wbgi_cce:h_j, 
                           model = "ls", data = exercise3.subset, cite = FALSE)

# output table
screenreg(list(base.model, interaction.model))

# F-Test
anova(base.model$result, interaction.model$result)

## ------------------------------------------------------------------------
# illustration of results

# range of control of corruption
summary(exercise3.subset$wbgi_cce)

# setting the two scenarios
no.indep.jud.setx <- setx(interaction.model, h_j = 0, wbgi_cce = seq(-1.7, 2.5, 0.1) )
an.indep.jud.setx <- setx(interaction.model, h_j = 1, wbgi_cce = seq(-1.7, 2.5, 0.1) )

# simulation
simulation.result <- sim(interaction.model, x = no.indep.jud.setx, x1 = an.indep.jud.setx)

# plot results
plot(simulation.result,
     main = "Effect of Control of Corruption Conditional on Indep. Judiciary",
     xlab = "Control of Corruption",
     ylab = "Quality of Life")

## ------------------------------------------------------------------------
# clear workspace
rm(list = ls())

# load data set from your local folder where you saved the data set
california.schools <- read.dta("caschool.dta")

# check variable names
names(california.schools)

# summary stats of variables of interest
summary(california.schools[, c("testscr", "avginc")])

# checking for missings in data set
summary(california.schools)

# scatter plot
plot( testscr ~ avginc, data = california.schools,
      main = "Relationship b/w Income and Test Scores",
      xlab = "Average District Income in $1000's",
      ylab = "Average Test Score of Reading and Math")

# regression exluding squared income
base.reg <- zelig( testscr ~ avginc, model = "ls", data = california.schools,
                   cite = FALSE)

# regression including squared income
square.reg <- zelig( testscr ~ avginc + I(avginc^2), model = "ls", 
                     data = california.schools, cite = FALSE)

# regression tables
screenreg(list(base.reg, square.reg))

# F-test
anova(base.reg$result, square.reg$result)

## ------------------------------------------------------------------------
# descriptive summary statistics of district income
summary(california.schools$avginc)

# what is the income in the 10th percentile?
low1 <- quantile(california.schools$avginc, 0.10)
low1

# what is the income in the 15th percentile
low2 <- quantile(california.schools$avginc, 0.15)
low2

# how big is difference in terms of money
low2 - low1

# illustrating effect of a move from the 10th to the 15th percentile
x.out.low1 <- setx(square.reg, avginc = low1)
x.out.low2 <- setx(square.reg, avginc = low2)

# simulation
s.out.low <- sim(square.reg, x = x.out.low1, x1 = x.out.low2)

# check first difference
summary(s.out.low)

# dependent variable summary
summary(california.schools$testscr)

# range of the depdendent variable
diff(range(california.schools$testscr))

# percent change of DV (percent change based on range of DV as observed in the sample) caused by IV
2.6 / diff(range(california.schools$testscr))

## ------------------------------------------------------------------------
# what is the avg. income at the 75th percentile?
high1 <- quantile(california.schools$avginc, 0.75)
high1

# we add the same amount of wealth as we did for the poor districts
high2 <- high1 + (low2 - low1)
high2

# illustrating effect of adding $840 avg. district income to a district at 75th percentile
x.out.high1 <- setx(square.reg, avginc = high1)
x.out.high2 <- setx(square.reg, avginc = high2)

# simulation
s.out.low <- sim(square.reg, x = x.out.high1, x1 = x.out.high2)

# check first difference
summary(s.out.low)

# percent change of DV (percent change based on range of DV as observed in the sample) caused by IV
2 / diff(range(california.schools$testscr))