###########################################################################
# PUBLG100: Introduction to Quantitative Methods
#
# Week 3 Seminar: T-test for Difference in Means and Hypothesis Testing
#
#

# Change your working directory
setwd("N:/PUBLG100")

# Check your working directory
getwd()


## ------------------------------------------------------------------------
rm(list=ls()) # clear workspace

## ------------------------------------------------------------------------
library(foreign) # to work with foreign file formats

# loading a STATA format data set (remember to load the library foreign 1st)
world.data <- read.dta("http://uclspp.github.io/PUBLG100/data/QoG2012.dta")

# the dimensions: rows (observations) and columns (variables) 
dim(world.data)

# the variable names
names(world.data) 

# the top ten rows and all columns
world.data[ 1:10 , ]

## ------------------------------------------------------------------------
install.packages("dplyr")

## ----message=FALSE-------------------------------------------------------
# load dplyr
library(dplyr)

## ------------------------------------------------------------------------
# dataset.name <- rename(argument1, argument2 = argument3)
# h_j = 1 means there is an independent judiciary
# rename h_j to indep.judi

# rename a variable and save the result in our data frame
world.data <- rename( world.data, indep.judi = h_j )

# check the result
names(world.data)

## ------------------------------------------------------------------------
# frequency table of binary independent variable
table(world.data$indep.judi)

## ------------------------------------------------------------------------
# creating a factor variable
world.data$indep.judi <- factor(world.data$indep.judi, 
                                labels = c("contr. judiciary", "indep. judiciary"))

# checking the result
world.data[ 1:10, ]

# a frequency table of indep.judi
table(world.data$indep.judi)

## ------------------------------------------------------------------------
summary(world.data$wdi_gdpc)

## ------------------------------------------------------------------------
# creating subsets of our data by the status of the judiciary
free.legal <- subset(world.data, indep.judi == "indep. judiciary")
contr.legal <- subset(world.data, indep.judi == "contr. judiciary")

# mean income levels, we remove missings
mean(free.legal$wdi_gdpc, na.rm = TRUE)
mean(contr.legal$wdi_gdpc, na.rm = TRUE)

## ------------------------------------------------------------------------
# t.test
# Interval DV (GDP per captia)
# Binary IV (independent judiciary)
t.test(world.data$wdi_gdpc ~ world.data$indep.judi, mu=0, alt="two.sided", conf=0.95)

## ------------------------------------------------------------------------
# renaming variables
world.data <- rename(world.data, hdi = undp_hdi)
world.data <- rename(world.data, cont.corr = wbgi_cce)

## ------------------------------------------------------------------------
# scatterplot
plot( x = world.data$cont.corr,
      y = world.data$hdi,
      xlim = c(xmin = -2, xmax = 3),
      ylim = c(ymin = 0, ymax = 1),
      frame = FALSE,
      xlab = "World Bank Control of Corruption Index",
      ylab = "UN Development Programme Human Development Index",
      main = "Relationship b/w Quality of Institutions and Quality of Life")

## ------------------------------------------------------------------------
#  Pearson's r including test statistic
r <- cor.test(world.data$cont.corr, world.data$hdi, use="complete.obs", conf.level = 0.99)
r

## ------------------------------------------------------------------------
# creating fake hdi and fake corruption by adding 3 outliers
# to each vector
fake.hdi <- c(world.data$hdi, 0.0, 0.1, 0.2)
fake.corr <- c(world.data$cont.corr, 3, 2.8, 2.9)

## ------------------------------------------------------------------------
# correlation coeffiecient with outliers
r.outliers <- cor.test(fake.corr, fake.hdi, use="complete.obs", conf.level = 0.99)
r.outliers

# compare this to non-faked data
r

## ------------------------------------------------------------------------
# plot the difference in corrlations
plot(x = world.data$cont.corr,
     y = world.data$hdi,
     xlim = c(xmin = -2, xmax = 3),
     ylim = c(ymin = 0, ymax = 1),
     frame = FALSE,
     xlab = "World Bank Control of Corruption Index",
     ylab = "UN Development Programme Human Development Index",
     main = "Relationship b/w Quality of Institutions and Quality of Life")

# the fake data points
# this gives us the last 3 elements of a vector
tail(fake.corr,3)
points(x = tail(fake.corr, 3), y = tail(fake.hdi, 3), 
       col = "red", pch = 20)

# line of best fit real data
reg1 <- lm(world.data$hdi ~ world.data$cont.corr, data = world.data)
abline(reg1)
# line of best fit with fake data
reg2 <- lm(fake.hdi ~ fake.corr, data = world.data)
abline(reg2, col = "red")