###########################################################################
# PUBLG100: Introduction to Quantitative Methods
#
# Week 2 Seminar: Descriptive Statistics
#
#

# Change your working directory.
# All relative paths below (e.g. "hsb2.xlsx") are resolved against this folder.
setwd("N:/PUBLG100")

# Check your working directory
getwd()


## ------------------------------------------------------------------------
# Clear environment: removes every (non-hidden) object from the workspace so
# the seminar starts from a clean slate. Loaded packages are not affected.
rm(list = ls())

## ------------------------------------------------------------------------
# Install readxl only when it is not already available. An unconditional
# install.packages() call would re-download the package every time the script
# is run and fails when no network or CRAN mirror is available.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}

## ------------------------------------------------------------------------
# Attach readxl so that read_excel() can be called directly.
library(readxl)

## ------------------------------------------------------------------------
# Make sure you've downloaded the dataset from
# http://uclspp.github.io/PUBLG100/data/hsb2.xlsx
# to your PUBLG100 working directory.

# Read the spreadsheet into a data frame used throughout the seminar.
student_data <- read_excel("hsb2.xlsx")

## ------------------------------------------------------------------------
# Variable (column) names of the dataset
colnames(student_data)

## ------------------------------------------------------------------------
# Number of rows followed by number of columns
c(nrow(student_data), ncol(student_data))

## ------------------------------------------------------------------------
# Show the first ten observations
head(student_data, 10)

## ------------------------------------------------------------------------
# Distribution of the science scores
hist(
  student_data$science,
  xlab = "Science Score",
  main = "Histogram of Science Scores"
)

## ------------------------------------------------------------------------
# Mean computed by hand: total of all science scores divided by their count
with(student_data, sum(science) / length(science))

## ------------------------------------------------------------------------
# The same quantity using the built-in mean()
science_mean <- mean(student_data[["science"]])
science_mean

## ------------------------------------------------------------------------
# Middle value of the science scores
science_median <- median(student_data[["science"]])
science_median

## ------------------------------------------------------------------------
# Frequency table: one entry per distinct science score, holding its count
science_scores <- table(student_data[["science"]])
science_scores

## ------------------------------------------------------------------------
# Reorder the table so the most frequent scores come first
sorted_science_scores <- science_scores[order(science_scores, decreasing = TRUE)]
sorted_science_scores

## ------------------------------------------------------------------------
# The first entry is the mode: its name is the score itself and its value is
# how many times that score occurs.
science_mode <- sorted_science_scores[1L]
science_mode

## ------------------------------------------------------------------------
# Highest science score, and the row position where it first occurs
max(student_data[["science"]])
which.max(student_data[["science"]])

## ------------------------------------------------------------------------
# Look up the id of the student sitting in that row
top_science_student <- which.max(student_data[["science"]])
top_science_student_id <- student_data[["id"]][top_science_student]
top_science_student_id

## ------------------------------------------------------------------------
# range() returns c(minimum, maximum); the spread is their difference
science_range <- range(student_data[["science"]])
science_range[2] - science_range[1]

## ------------------------------------------------------------------------
# Sample variance
var(student_data[["science"]])

## ------------------------------------------------------------------------
# Sample standard deviation
sd(student_data[["science"]])

## ------------------------------------------------------------------------
# Lower quartile, median and upper quartile
quantile(student_data[["science"]], probs = c(0.25, 0.5, 0.75))

## ------------------------------------------------------------------------
# Turn the numeric codes into labelled factors. Without an explicit `levels`
# argument, factor() pairs the labels with the sorted distinct codes in order.
# NOTE(review): the code-to-label mapping is assumed from the label order here;
# confirm it against the dataset's codebook.
student_data[["gender"]] <- factor(
  student_data[["female"]],
  labels = c("Male", "Female")
)
student_data[["socioeconomic_status"]] <- factor(
  student_data[["ses"]],
  labels = c("Low", "Middle", "High")
)
student_data[["racial_group"]] <- factor(
  student_data[["race"]],
  labels = c("Black", "Asian", "Hispanic", "White")
)

## ------------------------------------------------------------------------
# Check that the new factor columns were added
head(student_data, n = 6)

## ------------------------------------------------------------------------
# Number of students per gender
table(student_data[["gender"]])

## ------------------------------------------------------------------------
# Number of students per socioeconomic status
table(student_data[["socioeconomic_status"]])

## ------------------------------------------------------------------------
# Cross-tabulation: socioeconomic status (rows) by racial group (columns)
table(student_data[["socioeconomic_status"]], student_data[["racial_group"]])

## ------------------------------------------------------------------------
# Split the data by gender; which() keeps only rows where the comparison is
# TRUE, so rows with a missing gender are left out of both subsets.
boys <- student_data[which(student_data[["gender"]] == "Male"), ]
girls <- student_data[which(student_data[["gender"]] == "Female"), ]

## ------------------------------------------------------------------------
# Average science score within each group
mean(boys[["science"]])
mean(girls[["science"]])

## ------------------------------------------------------------------------
# Bar chart of the number of students in each socioeconomic status category
# (bar heights are the counts produced by table()).
barplot(table(student_data$socioeconomic_status))

## ------------------------------------------------------------------------
# Science score by gender, race and socioeconomic status.
# Split the plotting device into 1 row and 3 columns so the next three plots
# are drawn side by side.
par(mfrow=c(1,3))

# Categorical variables are plotted as boxplots: with a factor on the x side
# and a numeric variable on the y side, plot() draws one box per factor level.
# las = 2 draws the axis labels perpendicular to the axis.
plot(student_data$gender, student_data$science, main = "Gender", las = 2)
plot(student_data$racial_group, student_data$science, main = "Race", las = 2)
plot(student_data$socioeconomic_status, student_data$science, main = "Socioeconomic Status", las = 2)

## ------------------------------------------------------------------------
# Restore the single-panel layout, then draw a scatter plot of science scores
# (y axis) against math scores (x axis).
par(mfrow=c(1,1))
plot(student_data$math, student_data$science)

## ------------------------------------------------------------------------
# Scatter plot matrix: one panel for every pair of the five test scores.
pairs(~ read + write + math + science + socst, data = student_data)

## ------------------------------------------------------------------------
# English score = per-student average of the reading and writing scores.
# rowMeans() is the vectorised form of apply(x, 1, mean): it returns the same
# values for these two numeric columns without an R-level function call per row.
student_data$english <- rowMeans(student_data[c("read", "write")])

## ------------------------------------------------------------------------
# Check that the new english column was added
head(student_data)

## ------------------------------------------------------------------------
# Ten draws from the standard normal distribution (mean 0, sd 1)
normal_dist <- rnorm(n = 10, mean = 0, sd = 1)
normal_dist

## ------------------------------------------------------------------------
# Simulate rolling a six-sided die: draw uniformly between 1 and 7 and drop
# the fractional part, which leaves whole numbers from 1 to 6.
num_rolls <- 10 # number of times to roll the dice
rolls <- as.integer(
  runif(n = num_rolls, min = 1, max = 7)
)
rolls