# modules
import pandas as pd
import seaborn as sb
import fnmatch

# data
sals = pd.read_csv("/Users/michaelfeyder/Desktop/Bmore_demo/Baltimore_City_Employee_Salaries_FY2015.csv")

# remove dollar signs for AnnualSalary and GrossPay; skip over NAs, which are
# interpreted as floats and throw error
for i in range(sals.shape[0]):

    if not type(sals["GrossPay"][i]) == float:
        sals["GrossPay"][i] = sals["GrossPay"][i].replace("$", "")

    if not type(sals["AnnualSalary"][i]) == float:
        sals["AnnualSalary"][i] = sals["AnnualSalary"][i].replace("$", "")
        
# AnnualSalary and GrossPay to float
sals[["AnnualSalary", "GrossPay"]] = sals[["AnnualSalary", "GrossPay"]].astype(float)

# histogram of all agencies before collapsing data across sub-agencies
hist_all_agencies = sals.groupby("Agency").count()
hist_all_agencies["Agency"] = hist_all_agencies.index

hist_pre = sb.factorplot(data = hist_all_agencies, 
                         x = "Agency", 
                         y = "name",  
                         kind = "bar")
hist_pre.set_axis_labels("Agency", "Number of Employees")
hist_pre.set_xticklabels([])

# top 10 annual salaries for individuals
sals.sort("AnnualSalary", ascending = False).head(10)

# top 10 gross pay for individuals
sals.sort("GrossPay", ascending = False).head(10)

# crossing guard info before collapsing
set(fnmatch.filter(sals["Agency"], "TRANS-Crossing Guards*"))
CGs = sals[sals["Agency"] == ("TRANS-Crossing Guards (786)" or "TRANS-Crossing Guards (900)" or "TRANS-Crossing Guards (910)")]
CGs["AnnualSalary"].median()    # median $10546.00

# collapse across sub-agencies
for i in xrange(sals.shape[0]):
    if "M-R" in sals.loc[i, "Agency"]:
        slice_index = sals.loc[i, "Agency"].find(" ")
        sals.loc[i, "Agency"] = sals.loc[i, "Agency"][:slice_index]
    
    else:
        slice_index = sals.loc[i, "Agency"].find("(")
        sals.loc[i, "Agency"] = sals.loc[i, "Agency"][:slice_index - 1] 

        if "-" in sals.loc[:, "Agency"][i]:
            slice_index2 = sals.loc[i, "Agency"].find("-")
            sals.loc[i, "Agency"] = sals.loc[i, "Agency"][:slice_index2] 

# re-name some agencies for clarity
clean_agency_names = {
"DPW": "Department of Public Works", 
"R&P": "Recreation and Parks", 
"Civil Rights & Wage Enforce": "Civil Right and Wage Enfore", 
"FPR Admin": "Fire and Police Retirement",
"HR": "Human Resources", 
"COMP": "Comptroller", 
"ERS/EOS Admin": "Employee/Elected Officials Retirement Service",
"M-R": "Media Relations", 
"TRANS": "Transportation", 
"FIN": "Finance", 
"Housing & Community Dev": "Housing and Community Development", 
"HLTH": "Health Department",
"OED": "Employment Development"
}

for i in sals.index.values:
    if sals.loc[i, "Agency"] not in clean_agency_names.keys():
        continue
    else:
        sals.loc[i, "Agency"] = clean_agency_names[sals.loc[i, "Agency"]]

# drop agencies with 10 or fewer employees
agency_counts = sals.groupby("Agency").count()
agency_keep = list(agency_counts[agency_counts["name"] > 10].index.values)
keep = []
for i in range(shape(sals)[0]):
    keep.append(sals.loc[i, "Agency"] in agency_keep)
sals = sals[keep]

# histogram of agencies after collapsing data across sub-agencies
collapsed_agencies_grouped = sals.groupby("Agency")
hist_collapsed_agencies = collapsed_agencies_grouped.count()
hist_collapsed_agencies["Agency"] = hist_collapsed_agencies.index

hist_post = sb.factorplot(data = hist_collapsed_agencies, 
                          x = "Agency", 
                          y = "name",  
                          kind = "bar")
hist_post.set_axis_labels("Agency", "Number of Employees")
hist_post.set_xticklabels([])

# box plot of annual salaries
sals_median_ascend = sals.groupby("Agency").median().sort("AnnualSalary").index.values
sb.factorplot(data = sals, 
              y = "Agency", 
              x = "AnnualSalary", 
              order = sals_median_ascend[::-1], 
              kind = "box", 
              size = 10)

# violin plot of fire department, sheriff's office, 
# fire department and law department
agency_keep2 = ["Police Department", "Fire Department","Sheriff's Office"]
keep2 = []
for j in sals.index.values:
    keep2.append(sals.loc[j, "Agency"] in agency_keep2)
sals_pfs = sals[keep2]

sb.factorplot(data = sals_pfs, 
              x= "Agency", 
              y = "AnnualSalary", 
              kind = "violin")