# Analysis of Baltimore City employee salaries for the 2015 fiscal year
library(magrittr)
library(plyr)
library(dplyr)
library(ggplot2)
library(stringi)
library(gridExtra)
library(scales)
salaries <- read.csv("Baltimore_City_Employee_Salaries_FY2015.csv")
# remove dollar signs for AnnualSalary and GrossPay; convert to numeric
salaries$AnnualSalary %<>% as.character()
salaries$GrossPay %<>% as.character()
for (x in 1:dim(salaries)[1]) {
salaries$AnnualSalary[x] %<>% sub(pattern = "\\$",
replacement = "",
x = . )
salaries$GrossPay[x] %<>% sub(pattern = "\\$",
replacement = "",
x = . )
}
salaries$AnnualSalary %<>% as.numeric()
salaries$GrossPay %<>% as.numeric()
# histogram of employees per sub-agency before collapsing
agency.prefilter <- data.frame(Agencies = salaries$Agency)
hist.agencies.pre <- ggplot(agency.prefilter, aes(Agencies)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
axis.text.y = element_text(size = 18),
axis.title.y = element_text(size = 18)) +
scale_y_continuous (limits = c(0, 600),
breaks = seq(0, 600, 200),
expand = c(0,0)) +
ylab("Number of employees")
# According to Baltimore Sun (http://data.baltimoresun.com/salaries/city/fy2015/?showvals=true&NullVariableName=&sortby=AnnualSalary&sortdirection=DESC),
# the salary for Michael Swift is erroneously recorded ($5000 instead of
# $187,000).
salaries$AnnualSalary[salaries$name == "Swift,Michael"] <- 5000
# top 10 annual salaries for individuals.
salaries %>% arrange(., desc(AnnualSalary)) %>%
select(. , name, JobTitle, Agency, AnnualSalary) %>%
head(., 10)
# top 10 gross salaries for individuals
salaries %>% arrange(., desc(GrossPay)) %>%
select(. , name, JobTitle, Agency, GrossPay) %>%
head(., 10)
# crossing guard info before collapsing:
cross_guard_ID <- grep(pattern = "TRANS-Crossing Guards*", x = salaries$Agency)
cross_guard <- salaries[cross_guard_ID, ]
dim(cross_guard) # 305 crossing guards
median(cross_guard$AnnualSalary) # median annual salary: $10,546
#### collapse across sub-agencies (673 sub-agencies -> 25 agencies)
# removes sub-agencies IDs
salaries$Agency %<>% as.character()
for (x in 1:dim(salaries)[1]) {
salaries$Agency[x] %<>% stri_replace_all_regex(. ,
pattern = " \\(.*",
replacement = "")
salaries$Agency[x] %<>% stri_replace_all_regex(. ,
pattern = "-.*",
replacement = "")
if (salaries$Agency[x] == "Mayors Office") {
salaries$Agency[x] <- "Mayor's Office"
}
}
salaries$Agency %<>% as.factor()
# drop agencies with 10 or fewer employees
a <- summary(salaries$Agency)
keep.agency <- names(a[a > 10])
keep <- salaries$Agency %in% keep.agency
salaries.filtered <- salaries[keep, ]
# histogram of agencies after collapsing data across sub-agencies
agency.postfilter <- data.frame(Agencies = salaries.filtered$Agency)
hist.agencies.post <- ggplot(agency.postfilter, aes(Agencies)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 18),
axis.title.y = element_text(size = 18),
axis.title.x = element_text(size = 18)) +
scale_y_continuous (limits = c(0, 4000),
breaks = seq(0, 4000, 1000),
expand = c(0,0)) +
ylab("Number of employees") +
xlab("Agency")
# facetted histogram of agencies pre- and post-collapsed
png("baltimore_govt_agencies.png", width = 800)
grid.arrange(hist.agencies.pre, hist.agencies.post)
dev.off()
# renamed agencies for clarity
salaries.filtered$Agency %<>% revalue(c("Civil Rights & Wage Enforce" = "Civil Right and Wage Enfore",
"COMP" = "Comptroller",
"DPW" = "Department of Public Works",
"ERS/EOS Admin" = "Employee/Elected Officials Retirement Service",
"FIN" = "Finance",
"FPR Admin" = "Fire and Police Retirement",
"HLTH" = "Health Department",
"Housing & Community Dev" = "Housing and Community Development",
"HR" = "Human Resources",
"M" = "Media Relations",
"OED" = "Employment Development",
"R&P" = "Recreation and Parks",
"TRANS" = "Transportation"))
#### boxplot of annual salaries descending by median
salaries.filtered$Agency <- reorder(x = salaries.filtered$Agency,
X = salaries.filtered$AnnualSalary,
FUN = median)
salaries.boxplot <- ggplot(salaries.filtered, aes(x = Agency, y = AnnualSalary)) +
geom_boxplot(outlier.colour = "#2D2D83",
outlier.size = 3) +
scale_y_continuous(labels = dollar) +
coord_flip() +
theme_linedraw() +
theme(axis.title.y = element_blank(),
axis.text.y = element_text(size = 13),
axis.title.x = element_text(size = 18),
axis.text.x = element_text(size = 16,
angle = 60,
hjust = 1)) +
ylab("Annual Salary")
png("baltimore_government_salaries_2015.png", height = 800, width = 600)
salaries.boxplot
dev.off()
#### max salary per agency along with difference from the next highest salary
maxes <- data.frame(Agency = rep(NA, 25),
name = rep(NA, 25),
AnnualSalary = rep(NA, 25),
JobTitle = rep(NA, 25),
NextSalary = rep(NA, 25),
NextSalaryNormed = rep(NA, 25))
filtered.agencies <- unique(salaries.filtered$Agency)
for (x in 1:25) {
temp <- salaries.filtered[salaries.filtered$Agency == filtered.agencies[x], ] %>%
select(. , Agency, name, AnnualSalary, JobTitle) %>%
arrange(., desc(AnnualSalary))
for (y in c(1, 2, 4)) {
maxes[x, y] <- as.character(temp[1, y])
}
maxes[x, 3] <- temp[1, 3]
maxes[x, 5] <- temp[2, 3] - temp[1, 3]
maxes[x, 6] <- ((temp[2, 3] - temp[1, 3]) / temp[1, 3])
}
# table ordered by absolute differences from next highest salary
arrange(maxes, NextSalary)
# correlation between annual salary and difference from next highest salary
salaries.corr <- ggplot(maxes, aes(x = AnnualSalary, y = NextSalary)) +
stat_smooth(method = lm,
color = "#2D2D83",
size = 1.5) +
geom_point(size = 4) +
theme_classic() +
scale_x_continuous (limits = c(90000, 250000),
breaks = seq(100000, 250000, 50000),
expand = c(0,0),
labels = dollar) +
scale_y_continuous(breaks = seq(40000, -120000, -40000),
labels = dollar) +
theme(axis.text.x = element_text(size = 15,
angle = 60,
hjust = 1),
axis.title.x = element_text(size = 18),
axis.text.y = element_text(size = 15),
axis.title.y = element_text(size = 18)) +
ylab("Second highest salary") +
xlab("Highest Annual Salary")
png("baltimore_highest_salary_correlation.png")
salaries.corr
dev.off()
# hypothetical example where second highest salary earns 20% less than
# max salary produces negative correlation; normalization results in
# zero correlation
x <- seq(100000, 200000, 20000)
y <- x * -0.2
y_normed = y/x
norm_example <- data.frame(annual_salary = x,
next_highest_salary = y,
normalized_next = y_normed)
corr <- ggplot(norm_example, aes(x = x, y = y)) +
geom_point(size = 6) +
theme_classic() +
scale_x_continuous(labels = dollar) +
scale_y_continuous(labels = dollar) +
theme(axis.text.x = element_text(size = 12,
angle = 60,
hjust = 1),
axis.title.x = element_text(size = 15),
axis.text.y = element_text(size = 12),
axis.title.y = element_text(size = 15)) +
ylab("Next highest salary") +
xlab("Highest salary") +
ggtitle("Absolute Differences")
normed_corr <- ggplot(norm_example, aes(x = x, y = y_normed)) +
geom_point(size = 6) +
theme_classic() +
scale_x_continuous(labels = dollar) +
scale_y_continuous(breaks = seq(0.2, -0.6, -0.2),
labels = percent) +
theme(axis.text.x = element_text(size = 12,
angle = 60,
hjust = 1),
axis.title.x = element_text(size = 15),
axis.text.y = element_text(size = 12),
axis.title.y = element_text(size = 15)) +
xlab("Highest salary") +
ylab("") +
ggtitle("Normalized Differences")
png("example_salary_normalized.png")
grid.arrange(corr, normed_corr, ncol = 2)
dev.off()
# correlation between annual salary and difference from next highest salary
# normalized to highest salary
salaries.corr.normed <- ggplot(maxes, aes(x = AnnualSalary, y = NextSalaryNormed)) +
stat_smooth(method = lm,
color = "#2D2D83",
size = 1.5) +
geom_point(size = 4) +
theme_classic() +
scale_x_continuous(limits = c(90000, 250000),
breaks = seq(100000, 250000, 50000),
expand = c(0,0),
labels = dollar) +
scale_y_continuous(labels = percent) +
theme(axis.text.x = element_text(size = 15,
angle = 60,
hjust = 1),
axis.title.x = element_text(size = 18),
axis.text.y = element_text(size = 15),
axis.title.y = element_text(size = 18)) +
ylab("Second highest salary (% of highest)") +
xlab("Highest Annual Salary")
png("baltimore_highest_salary_correlation_normed.png")
salaries.corr.normed
dev.off()
# table ordered by absolute differences from next highest salary
arrange(maxes, NextSalaryNormed) %>% select(. , Agency, AnnualSalary, NextSalaryNormed)
# violin plot of fire department, sheriff's office, fire department and
# law department
fpsl <- salaries.filtered[salaries.filtered$Agency == "Police Department" |
salaries.filtered$Agency == "Fire Department" |
salaries.filtered$Agency == "Sheriff's Office" |
salaries.filtered$Agency == "Law Department", ]
# reorder levels
levels(fpsl$Agency)[1] <- "Police Department"
levels(fpsl$Agency)[2] <- "Fire Department"
levels(fpsl$Agency)[3] <- "Sheriff's Office"
levels(fpsl$Agency)[4] <- "Law Department"
fpsl.violin <- ggplot(fpsl, aes(x = Agency, y = AnnualSalary, fill = Agency)) +
geom_violin() +
theme_classic() +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(size = 15,
angle = 60,
hjust = 1),
axis.text.y = element_text(size = 15),
axis.title.y = element_text(size = 18),
legend.position="none") +
scale_fill_manual(values = c("#2D2D83", "#2D2D83",
"#2D2D83", "#6E6E6E")) +
scale_y_continuous(labels = dollar,
limits = c(0, 150000),
breaks = seq(0, 150000, 50000),
expand = c(0, 0)) +
ylab("Annual Salary")
png("baltimore_fire_police_sheriff_legal_salaries_dist.png")
fpsl.violin
dev.off()