# Analysis of Baltimore City employee salaries for the 2015 fiscal year

library(magrittr)
library(plyr)
library(dplyr)
library(ggplot2)
library(stringi)
library(gridExtra)
library(scales)

salaries <- read.csv("Baltimore_City_Employee_Salaries_FY2015.csv")

# remove dollar signs for AnnualSalary and GrossPay; convert to numeric
salaries$AnnualSalary %<>% as.character()
salaries$GrossPay %<>% as.character()
for (x in 1:dim(salaries)[1]) {
  salaries$AnnualSalary[x] %<>% sub(pattern = "\\$", 
                                    replacement = "", 
                                    x = . )
  salaries$GrossPay[x] %<>% sub(pattern = "\\$", 
                                replacement = "", 
                                x = . )
}
salaries$AnnualSalary %<>% as.numeric()
salaries$GrossPay %<>% as.numeric()

# histogram of employees per sub-agency before collapsing
agency.prefilter <- data.frame(Agencies = salaries$Agency)
hist.agencies.pre <- ggplot(agency.prefilter, aes(Agencies)) + 
                        geom_bar() + 
                        theme_classic() +
                        theme(axis.text.x = element_blank(), 
                              axis.ticks.x = element_blank(),
                              axis.title.x = element_blank(),
                              axis.text.y = element_text(size = 18),
                              axis.title.y = element_text(size = 18)) + 
                        scale_y_continuous (limits = c(0, 600), 
                                            breaks = seq(0, 600, 200), 
                                            expand = c(0,0)) +
                        ylab("Number of employees")

# According to Baltimore Sun (http://data.baltimoresun.com/salaries/city/fy2015/?showvals=true&NullVariableName=&sortby=AnnualSalary&sortdirection=DESC),
# the salary for Michael Swift is erroneously recorded ($5000 instead of 
# $187,000).
salaries$AnnualSalary[salaries$name == "Swift,Michael"] <- 5000

# top 10 annual salaries for individuals. 
salaries %>% arrange(., desc(AnnualSalary)) %>% 
  select(. , name, JobTitle, Agency, AnnualSalary) %>%
  head(., 10)

# top 10 gross salaries for individuals
salaries %>% arrange(., desc(GrossPay)) %>% 
  select(. , name, JobTitle, Agency, GrossPay) %>%
  head(., 10)

# crossing guard info before collapsing:
cross_guard_ID <- grep(pattern = "TRANS-Crossing Guards*", x = salaries$Agency)
cross_guard <- salaries[cross_guard_ID, ]
dim(cross_guard) # 305 crossing guards
median(cross_guard$AnnualSalary) # median annual salary: $10,546


#### collapse across sub-agencies (673 sub-agencies -> 25 agencies)

# removes sub-agencies IDs
salaries$Agency %<>% as.character()
for (x in 1:dim(salaries)[1]) {
  salaries$Agency[x] %<>% stri_replace_all_regex(. , 
                                                 pattern = " \\(.*", 
                                                 replacement = "")
  salaries$Agency[x] %<>% stri_replace_all_regex(. , 
                                                 pattern = "-.*", 
                                                 replacement = "")
  if (salaries$Agency[x] == "Mayors Office") {
    salaries$Agency[x] <- "Mayor's Office"
  }
}
salaries$Agency %<>% as.factor()

# drop agencies with 10 or fewer employees
a <- summary(salaries$Agency)
keep.agency <- names(a[a > 10])
keep <- salaries$Agency %in% keep.agency
salaries.filtered <- salaries[keep, ]

# histogram of agencies after collapsing data across sub-agencies
agency.postfilter <- data.frame(Agencies = salaries.filtered$Agency)

hist.agencies.post <- ggplot(agency.postfilter, aes(Agencies)) + 
                        geom_bar() + 
                        theme_classic() +
                        theme(axis.text.x = element_blank(), 
                              axis.ticks.x = element_blank(),
                              axis.text.y = element_text(size = 18), 
                              axis.title.y = element_text(size = 18),
                              axis.title.x = element_text(size = 18)) + 
                        scale_y_continuous (limits = c(0, 4000), 
                                            breaks = seq(0, 4000, 1000), 
                                            expand = c(0,0)) +
                        ylab("Number of employees") +
                        xlab("Agency")

# facetted histogram of agencies pre- and post-collapsed
png("baltimore_govt_agencies.png", width = 800)
grid.arrange(hist.agencies.pre, hist.agencies.post)
dev.off()

# renamed agencies for clarity
salaries.filtered$Agency %<>% revalue(c("Civil Rights & Wage Enforce" = "Civil Right and Wage Enfore",
                                        "COMP" = "Comptroller", 
                                        "DPW" = "Department of Public Works",
                                        "ERS/EOS Admin" = "Employee/Elected Officials Retirement Service",
                                        "FIN" = "Finance",
                                        "FPR Admin" = "Fire and Police Retirement",
                                        "HLTH" = "Health Department",
                                        "Housing & Community Dev" = "Housing and Community Development",
                                        "HR" = "Human Resources",
                                        "M" = "Media Relations",                                        
                                        "OED" = "Employment Development",
                                        "R&P" = "Recreation and Parks",
                                        "TRANS" = "Transportation"))

#### boxplot of annual salaries descending by median
salaries.filtered$Agency <- reorder(x = salaries.filtered$Agency, 
                                    X = salaries.filtered$AnnualSalary, 
                                    FUN = median)

salaries.boxplot <- ggplot(salaries.filtered, aes(x = Agency, y = AnnualSalary)) + 
                      geom_boxplot(outlier.colour = "#2D2D83", 
                                   outlier.size = 3) + 
                      scale_y_continuous(labels = dollar) + 
                      coord_flip() + 
                      theme_linedraw() +
                      theme(axis.title.y = element_blank(),
                            axis.text.y = element_text(size = 13),
                            axis.title.x = element_text(size = 18),
                            axis.text.x = element_text(size = 16, 
                                                       angle = 60, 
                                                       hjust = 1)) +
                      ylab("Annual Salary")

png("baltimore_government_salaries_2015.png", height = 800, width = 600)
salaries.boxplot
dev.off()

#### max salary per agency along with difference from the next highest salary
maxes <- data.frame(Agency = rep(NA, 25), 
                    name = rep(NA, 25), 
                    AnnualSalary = rep(NA, 25), 
                    JobTitle = rep(NA, 25),
                    NextSalary = rep(NA, 25),
                    NextSalaryNormed = rep(NA, 25))

filtered.agencies <- unique(salaries.filtered$Agency)

for (x in 1:25) {
  temp <- salaries.filtered[salaries.filtered$Agency == filtered.agencies[x], ] %>% 
              select(. , Agency, name, AnnualSalary, JobTitle) %>% 
              arrange(., desc(AnnualSalary))
  for (y in c(1, 2, 4)) {
    maxes[x, y] <- as.character(temp[1, y]) 
  }
  maxes[x, 3] <- temp[1, 3]
  maxes[x, 5] <-  temp[2, 3] - temp[1, 3]
  maxes[x, 6] <-  ((temp[2, 3] - temp[1, 3]) / temp[1, 3])
}

# table ordered by absolute differences from next highest salary
arrange(maxes, NextSalary)

# correlation between annual salary and difference from next highest salary
salaries.corr <- ggplot(maxes, aes(x = AnnualSalary, y = NextSalary)) + 
                    stat_smooth(method = lm, 
                                color = "#2D2D83", 
                                size = 1.5) +
                    geom_point(size = 4) + 
                    theme_classic() +
                    scale_x_continuous (limits = c(90000, 250000), 
                                        breaks = seq(100000, 250000, 50000), 
                                        expand = c(0,0),
                                        labels = dollar) +
                    scale_y_continuous(breaks = seq(40000, -120000, -40000),
                                       labels = dollar) + 
                    theme(axis.text.x = element_text(size = 15, 
                                                     angle = 60, 
                                                     hjust = 1),
                          axis.title.x = element_text(size = 18),
                          axis.text.y = element_text(size = 15),
                          axis.title.y = element_text(size = 18)) +
                    ylab("Second highest salary") +
                    xlab("Highest Annual Salary")

png("baltimore_highest_salary_correlation.png")
salaries.corr
dev.off()

# hypothetical example where second highest salary earns 20% less than
# max salary produces negative correlation; normalization results in 
# zero correlation

x <- seq(100000, 200000, 20000)
y <- x * -0.2
y_normed = y/x
norm_example <- data.frame(annual_salary = x, 
                           next_highest_salary = y,
                           normalized_next = y_normed)

corr <- ggplot(norm_example, aes(x = x, y = y)) + 
          geom_point(size = 6) + 
          theme_classic() + 
          scale_x_continuous(labels = dollar) + 
          scale_y_continuous(labels = dollar) + 
          theme(axis.text.x = element_text(size = 12, 
                                           angle = 60, 
                                           hjust = 1),
                axis.title.x = element_text(size = 15),
                axis.text.y = element_text(size = 12),
                axis.title.y = element_text(size = 15)) +
          ylab("Next highest salary") +
          xlab("Highest salary") + 
          ggtitle("Absolute Differences")

normed_corr <- ggplot(norm_example, aes(x = x, y = y_normed)) + 
                  geom_point(size = 6) + 
                  theme_classic() + 
                  scale_x_continuous(labels = dollar) + 
                  scale_y_continuous(breaks = seq(0.2, -0.6, -0.2),
                                     labels = percent) + 
                  theme(axis.text.x = element_text(size = 12, 
                                                   angle = 60, 
                                                   hjust = 1),
                        axis.title.x = element_text(size = 15),
                        axis.text.y = element_text(size = 12),
                        axis.title.y = element_text(size = 15)) + 
                  xlab("Highest salary") +
                  ylab("") +
                  ggtitle("Normalized Differences")

png("example_salary_normalized.png")
grid.arrange(corr, normed_corr, ncol = 2)
dev.off()

# correlation between annual salary and difference from next highest salary
# normalized to highest salary
salaries.corr.normed <- ggplot(maxes, aes(x = AnnualSalary, y = NextSalaryNormed)) + 
                          stat_smooth(method = lm, 
                                      color = "#2D2D83", 
                                      size = 1.5) +
                          geom_point(size = 4) + 
                          theme_classic() +
                          scale_x_continuous(limits = c(90000, 250000), 
                                             breaks = seq(100000, 250000, 50000),
                                             expand = c(0,0),
                                             labels = dollar) +
                          scale_y_continuous(labels = percent) + 
                          theme(axis.text.x = element_text(size = 15, 
                                                           angle = 60, 
                                                           hjust = 1),
                                axis.title.x = element_text(size = 18),
                                axis.text.y = element_text(size = 15),
                                axis.title.y = element_text(size = 18)) +
                          ylab("Second highest salary (% of highest)") +
                          xlab("Highest Annual Salary")

png("baltimore_highest_salary_correlation_normed.png")
salaries.corr.normed
dev.off()

# table ordered by absolute differences from next highest salary
arrange(maxes, NextSalaryNormed) %>% select(. , Agency, AnnualSalary, NextSalaryNormed)


# violin plot of fire department, sheriff's office, fire department and
# law department
fpsl <- salaries.filtered[salaries.filtered$Agency == "Police Department" | 
                            salaries.filtered$Agency == "Fire Department" | 
                            salaries.filtered$Agency ==  "Sheriff's Office" |
                            salaries.filtered$Agency ==  "Law Department", ]

# reorder levels
levels(fpsl$Agency)[1] <- "Police Department"
levels(fpsl$Agency)[2] <- "Fire Department"
levels(fpsl$Agency)[3] <- "Sheriff's Office"
levels(fpsl$Agency)[4] <- "Law Department"

fpsl.violin <- ggplot(fpsl, aes(x = Agency, y = AnnualSalary, fill = Agency)) + 
                  geom_violin() + 
                  theme_classic() +
                  theme(axis.title.x = element_blank(),
                        axis.text.x = element_text(size = 15, 
                                                   angle = 60, 
                                                   hjust = 1),
                        axis.text.y = element_text(size = 15),
                        axis.title.y = element_text(size = 18),
                        legend.position="none") +
                  scale_fill_manual(values = c("#2D2D83", "#2D2D83", 
                                               "#2D2D83", "#6E6E6E")) +
                  scale_y_continuous(labels = dollar,
                                     limits = c(0, 150000),
                                     breaks = seq(0, 150000, 50000), 
                                     expand = c(0, 0)) +
                  ylab("Annual Salary")

png("baltimore_fire_police_sheriff_legal_salaries_dist.png")
fpsl.violin
dev.off()