###########################################################################
# PUBLG100: Introduction to Quantitative Methods
#
# Week 7 Seminar: Interactions
#
#

# Change your working directory
setwd("N:/PUBLG100")

# Check your working directory
getwd()


## ----message=FALSE-------------------------------------------------------
library(foreign)
library(Zelig)
library(texreg)
library(dplyr)
rm(list=ls())

## ------------------------------------------------------------------------
# load quality of government institute 2015 dataset
world.data <- read.dta("http://uclspp.github.io/PUBLG100/data/QoG2012.dta")

# look at variable names
names(world.data)

# look at first 5 observations
world.data[1:5,]

# descriptive summary statistics
summary(world.data[, c("undp_hdi", "wbgi_cce", "former_col", "wdi_gdpc")]) 

# frequency table for categorical variable
table(world.data$former_col) 

## ------------------------------------------------------------------------
# we remove NA's
world.data <- na.omit(world.data)

# run the multiple regression
m1 <- lm(undp_hdi ~ wbgi_cce + former_col, data = world.data)

# regression table
screenreg(m1)

## ------------------------------------------------------------------------
# look at cofficients of m1
m1$coefficients

## ------------------------------------------------------------------------
# scatter plot
plot(undp_hdi ~ wbgi_cce, data = world.data,
     xlab = "control of corruption",
     ylab = "quality of life",
     main = "effect of a dummy")

# the regression line when former_col = 0
abline(coef = c(m1$coefficients[1], m1$coefficients[2]), lty = "solid")

# the regression line when former_col = 1
abline(coef = c(m1$coefficients[1] + m1$coefficients[3], m1$coefficients[2]), lty = "dashed")

# add a legend to the plot
legend("bottomright", c("former_col = 0", "former_col = 1"), lty = c("solid", "dashed"))

## ------------------------------------------------------------------------
# first model
z.out1 <- zelig(undp_hdi ~ wbgi_cce + former_col, 
                model = "ls", data = world.data, cite = FALSE)
# second model
z.out2 <- zelig(undp_hdi ~ wbgi_cce + former_col + wbgi_cce:former_col, 
                model = "ls", data = world.data, cite = FALSE)

# F-test (see seminar 5)
anova(z.out1$result, z.out2$result) 

# regression table
screenreg(list(z.out1, z.out2))

## ------------------------------------------------------------------------
# intercepts and slopes of control of corruption
intercepts <- c(z.out2$res$coeff[1], z.out2$res$coeff[1] + z.out2$res$coeff[3]) 
slopes <- c(z.out2$res$coeff[2], z.out2$res$coeff[2] + z.out2$res$coeff[4])

# rbind combines separate vectors or matrices or data frames row-wise 
cond.efcts <- rbind(intercepts,slopes)

# colnames lets you define names of columns of a matrix like object
colnames(cond.efcts) <- c("never colony", "former colony")

# lets examine the effects
# the function round lets us round to 2 digits
round(cond.efcts, digits = 2)

## ------------------------------------------------------------------------
# set covariates for countries that weren't colonised
x.out1 <- setx(z.out2, former_col = 0, wbgi_cce = -3:2)

# set covariates for colonised countries
x.out2 <- setx(z.out2, former_col = 1, wbgi_cce = -3:2)

# simulate 
s.out <- sim(z.out2, x = x.out1, x1 = x.out2)

# plot results
# you will run into problems here if you forgot to do na.omit(world.data)
plot(s.out, 
     xlab = "Control of Corruption",
     ylab = "predicted value of Human Development Index",
     main = "Effect of Control of Corruption by Colonial Past"
    ) 

# add text to Zelig plot where x and y are plot coordinates
text( x = -1, y = .8, labels = "no former colonies")
text( x = -1, y = .4, labels = "former colonies")

## ------------------------------------------------------------------------
# we check min and max of variables of interest to draw plot limits (xlim & ylim)
summary(world.data[, c("undp_hdi", "wdi_gdpc")  ])

# plot of relationship b/w income & the human development index
plot( undp_hdi ~ wdi_gdpc,
      data = world.data,
      xlim = c( xmin = 0, xmax = 65000),
      ylim = c( ymin = 0, ymax = 1),
      frame = FALSE,
      xlab = "World Bank GDP/captia",
      ylab = "Human Development Index",
      main = "Relationship b/w Income and Quality of Life")

# add the regression line using zelig() within abline()
abline(zelig(undp_hdi ~ wdi_gdpc, data = world.data, model = "ls", cite = FALSE))

# lowess line
lines(lowess(world.data$wdi_gdpc, world.data$undp_hdi), col="red")

## ------------------------------------------------------------------------
# we include a quadradic term for income
z.out <- zelig(undp_hdi ~ wdi_gdpc + I(wdi_gdpc^2), 
               data = world.data, model = "ls", cite = FALSE)

# regression output
summary(z.out)

# do we need the quadratic term or is the relationship in fact linear?
# use an F-Test to check
z.out.base <- zelig(undp_hdi ~ wdi_gdpc, data = world.data, model = "ls", cite = FALSE)
anova(z.out.base$res, z.out$res)

# setting covariates; GDP/captia is a sequence from 0 to 45000 by steps of 1000
x.out <- setx(z.out, wdi_gdpc = seq(0, 45000, 1000))

# simulate using our model and our covariates
s.out <- sim(z.out, x = x.out)

# plot the results
plot(s.out)

## ------------------------------------------------------------------------
# we order by ex-colony and then hdi 
head(arrange(world.data, former_col, undp_hdi))
# note: to change the data set you would have to assign it:
# world.data <- arrange(world.data, former_col, undp_hdi)

# the default order is ascending, for descending use:
head(arrange(world.data, desc(former_col, undp_hdi)))