# modules
import pandas as pd
import seaborn as sb
import fnmatch
# data
sals = pd.read_csv("/Users/michaelfeyder/Desktop/Bmore_demo/Baltimore_City_Employee_Salaries_FY2015.csv")
# remove dollar signs for AnnualSalary and GrossPay; skip over NAs, which are
# interpreted as floats and throw error
for i in range(sals.shape[0]):
if not type(sals["GrossPay"][i]) == float:
sals["GrossPay"][i] = sals["GrossPay"][i].replace("$", "")
if not type(sals["AnnualSalary"][i]) == float:
sals["AnnualSalary"][i] = sals["AnnualSalary"][i].replace("$", "")
# AnnualSalary and GrossPay to float
sals[["AnnualSalary", "GrossPay"]] = sals[["AnnualSalary", "GrossPay"]].astype(float)
# histogram of all agencies before collapsing data across sub-agencies
hist_all_agencies = sals.groupby("Agency").count()
hist_all_agencies["Agency"] = hist_all_agencies.index
hist_pre = sb.factorplot(data = hist_all_agencies,
x = "Agency",
y = "name",
kind = "bar")
hist_pre.set_axis_labels("Agency", "Number of Employees")
hist_pre.set_xticklabels([])
# top 10 annual salaries for individuals
sals.sort("AnnualSalary", ascending = False).head(10)
# top 10 gross pay for individuals
sals.sort("GrossPay", ascending = False).head(10)
# crossing guard info before collapsing
set(fnmatch.filter(sals["Agency"], "TRANS-Crossing Guards*"))
CGs = sals[sals["Agency"] == ("TRANS-Crossing Guards (786)" or "TRANS-Crossing Guards (900)" or "TRANS-Crossing Guards (910)")]
CGs["AnnualSalary"].median() # median $10546.00
# collapse across sub-agencies
for i in xrange(sals.shape[0]):
if "M-R" in sals.loc[i, "Agency"]:
slice_index = sals.loc[i, "Agency"].find(" ")
sals.loc[i, "Agency"] = sals.loc[i, "Agency"][:slice_index]
else:
slice_index = sals.loc[i, "Agency"].find("(")
sals.loc[i, "Agency"] = sals.loc[i, "Agency"][:slice_index - 1]
if "-" in sals.loc[:, "Agency"][i]:
slice_index2 = sals.loc[i, "Agency"].find("-")
sals.loc[i, "Agency"] = sals.loc[i, "Agency"][:slice_index2]
# re-name some agencies for clarity
clean_agency_names = {
"DPW": "Department of Public Works",
"R&P": "Recreation and Parks",
"Civil Rights & Wage Enforce": "Civil Right and Wage Enfore",
"FPR Admin": "Fire and Police Retirement",
"HR": "Human Resources",
"COMP": "Comptroller",
"ERS/EOS Admin": "Employee/Elected Officials Retirement Service",
"M-R": "Media Relations",
"TRANS": "Transportation",
"FIN": "Finance",
"Housing & Community Dev": "Housing and Community Development",
"HLTH": "Health Department",
"OED": "Employment Development"
}
for i in sals.index.values:
if sals.loc[i, "Agency"] not in clean_agency_names.keys():
continue
else:
sals.loc[i, "Agency"] = clean_agency_names[sals.loc[i, "Agency"]]
# drop agencies with 10 or fewer employees
agency_counts = sals.groupby("Agency").count()
agency_keep = list(agency_counts[agency_counts["name"] > 10].index.values)
keep = []
for i in range(shape(sals)[0]):
keep.append(sals.loc[i, "Agency"] in agency_keep)
sals = sals[keep]
# histogram of agencies after collapsing data across sub-agencies
collapsed_agencies_grouped = sals.groupby("Agency")
hist_collapsed_agencies = collapsed_agencies_grouped.count()
hist_collapsed_agencies["Agency"] = hist_collapsed_agencies.index
hist_post = sb.factorplot(data = hist_collapsed_agencies,
x = "Agency",
y = "name",
kind = "bar")
hist_post.set_axis_labels("Agency", "Number of Employees")
hist_post.set_xticklabels([])
# box plot of annual salaries
sals_median_ascend = sals.groupby("Agency").median().sort("AnnualSalary").index.values
sb.factorplot(data = sals,
y = "Agency",
x = "AnnualSalary",
order = sals_median_ascend[::-1],
kind = "box",
size = 10)
# violin plot of fire department, sheriff's office,
# fire department and law department
agency_keep2 = ["Police Department", "Fire Department","Sheriff's Office"]
keep2 = []
for j in sals.index.values:
keep2.append(sals.loc[j, "Agency"] in agency_keep2)
sals_pfs = sals[keep2]
sb.factorplot(data = sals_pfs,
x= "Agency",
y = "AnnualSalary",
kind = "violin")