########################################################################### # PUBLG100: Introduction to Quantitative Methods # # Week 3 Seminar: T-test for Difference in Means and Hypothesis Testing # # # Change your working directory setwd("N:/PUBLG100") # Check your working directory getwd() ## ------------------------------------------------------------------------ rm(list=ls()) # clear workspace ## ------------------------------------------------------------------------ library(foreign) # to work with foreign file formats # loading a STATA format data set (remember to load the library foreign 1st) world.data <- read.dta("http://uclspp.github.io/PUBLG100/data/QoG2012.dta") # the dimensions: rows (observations) and columns (variables) dim(world.data) # the variable names names(world.data) # the top ten rows and all columns world.data[ 1:10 , ] ## ------------------------------------------------------------------------ install.packages("dplyr") ## ----message=FALSE------------------------------------------------------- # load dplyr library(dplyr) ## ------------------------------------------------------------------------ # dataset.name <- rename(argument1, argument2 = argument3) # h_j = 1 means there is an independent judiciary # rename h_j to indep.judi # rename a variable and save the result in our data frame world.data <- rename( world.data, indep.judi = h_j ) # check the result names(world.data) ## ------------------------------------------------------------------------ # frequency table of binary independent variable table(world.data$indep.judi) ## ------------------------------------------------------------------------ # creating a factor variable world.data$indep.judi <- factor(world.data$indep.judi, labels = c("contr. judiciary", "indep. judiciary")) # checking the result world.data[ 1:10, ] # a frequency table of indep.judi table(world.data$indep.judi) ## ------------------------------------------------------------------------ summary(world.data$wdi_gdpc) ## ------------------------------------------------------------------------ # creating subsets of our data by the status of the judiciary free.legal <- subset(world.data, indep.judi == "indep. judiciary") contr.legal <- subset(world.data, indep.judi == "contr. judiciary") # mean income levels, we remove missings mean(free.legal$wdi_gdpc, na.rm = TRUE) mean(contr.legal$wdi_gdpc, na.rm = TRUE) ## ------------------------------------------------------------------------ # t.test # Interval DV (GDP per captia) # Binary IV (independent judiciary) t.test(world.data$wdi_gdpc ~ world.data$indep.judi, mu=0, alt="two.sided", conf=0.95) ## ------------------------------------------------------------------------ # renaming variables world.data <- rename(world.data, hdi = undp_hdi) world.data <- rename(world.data, cont.corr = wbgi_cce) ## ------------------------------------------------------------------------ # scatterplot plot( x = world.data$cont.corr, y = world.data$hdi, xlim = c(xmin = -2, xmax = 3), ylim = c(ymin = 0, ymax = 1), frame = FALSE, xlab = "World Bank Control of Corruption Index", ylab = "UN Development Programme Human Development Index", main = "Relationship b/w Quality of Institutions and Quality of Life") ## ------------------------------------------------------------------------ # Pearson's r including test statistic r <- cor.test(world.data$cont.corr, world.data$hdi, use="complete.obs", conf.level = 0.99) r ## ------------------------------------------------------------------------ # creating fake hdi and fake corruption by adding 3 outliers # to each vector fake.hdi <- c(world.data$hdi, 0.0, 0.1, 0.2) fake.corr <- c(world.data$cont.corr, 3, 2.8, 2.9) ## ------------------------------------------------------------------------ # correlation coeffiecient with outliers r.outliers <- cor.test(fake.corr, fake.hdi, use="complete.obs", conf.level = 0.99) r.outliers # compare this to non-faked data r ## ------------------------------------------------------------------------ # plot the difference in corrlations plot(x = world.data$cont.corr, y = world.data$hdi, xlim = c(xmin = -2, xmax = 3), ylim = c(ymin = 0, ymax = 1), frame = FALSE, xlab = "World Bank Control of Corruption Index", ylab = "UN Development Programme Human Development Index", main = "Relationship b/w Quality of Institutions and Quality of Life") # the fake data points # this gives us the last 3 elements of a vector tail(fake.corr,3) points(x = tail(fake.corr, 3), y = tail(fake.hdi, 3), col = "red", pch = 20) # line of best fit real data reg1 <- lm(world.data$hdi ~ world.data$cont.corr, data = world.data) abline(reg1) # line of best fit with fake data reg2 <- lm(fake.hdi ~ fake.corr, data = world.data) abline(reg2, col = "red")