In [26]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
import matplotlib.ticker as mtick
from matplotlib.ticker import PercentFormatter
from matplotlib.pyplot import figure
import seaborn as sns
import pandas as pd
In [27]:
# Load the salary-by-major table from a local CSV.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where that exact file exists.
df = pd.read_csv(r"C:\Users\james\Downloads\Degrees that Pay Back\degrees_that_pay_back.csv")
In [28]:
# Strip "$" and "," from every column after the first one (the major name) and
# cast the result to float.
# The pattern is a raw string: "\$" inside a normal string literal is an invalid
# escape sequence and triggers a warning on newer Python versions. The regex
# itself is unchanged.
df[df.columns[1:]] = df[df.columns[1:]].replace(r'[\$,]', '', regex=True).astype(float)
In [29]:
# Preview the last rows of the cleaned frame.
df.tail()
Out[29]:
Undergraduate Major Starting Median Salary Mid-Career 10th Percentile Salary Mid-Career 25th Percentile Salary Mid-Career Median Salary Percent change from Starting to Mid-Career Salary Mid-Career 75th Percentile Salary Mid-Career 90th Percentile Salary
45 Political Science 40800.0 41200.0 55300.0 78200.0 91.7 114000.0 168000.0
46 Psychology 35900.0 31600.0 42100.0 60400.0 68.2 87500.0 127000.0
47 Religion 34100.0 29700.0 36500.0 52000.0 52.5 70900.0 96400.0
48 Sociology 36500.0 30700.0 40400.0 58200.0 59.5 81200.0 118000.0
49 Spanish 34000.0 31000.0 40000.0 53100.0 56.2 76800.0 96400.0
In [75]:
# Pull out the three columns that the histogram and violin cells plot.
starting_median_salary, mid_career_median_salary, percent_change = (
    df[column_name]
    for column_name in (
        'Starting Median Salary',
        'Mid-Career Median Salary',
        'Percent change from Starting to Mid-Career Salary',
    )
)
In [86]:
# Histogram of starting median salaries across majors. Bars are coloured by their
# height relative to the tallest bin, and the y-axis is labelled as a percentage
# of the number of majors.
n_bins = 6
fig, axs = plt.subplots(1, 1, figsize=(20, 6))
fig.suptitle('Starting Median Salaries Histogram', fontsize = 16)

# Dashed white vertical line at the mean of the plotted series.
axs.axvline(starting_median_salary.mean(), color='w', linestyle='dashed', linewidth=1)

N, bins, patches = axs.hist(starting_median_salary, bins=n_bins, color='green', edgecolor='black')

# Recolour each bar on the viridis scale according to its count relative to the
# tallest bin; this overrides the uniform 'green' passed to hist above.
fracs = N / N.max()
norm = colors.Normalize(fracs.min(), fracs.max())
for thispatch, color in zip(patches, plt.cm.viridis(norm(fracs))):
    thispatch.set_facecolor(color)

# Tick values are raw counts; the formatter shows them as a share of all rows.
axs.yaxis.set_major_formatter(PercentFormatter(xmax=len(starting_median_salary)))

axs.set_xlabel('Salaries', fontsize=14)
axs.set_ylabel('Percentage', fontsize=14, labelpad=15)

axs.tick_params(labelsize=12)
axs.tick_params(axis='x', labelrotation=45)

# Y ticks every 2 counts, from 0 to just past the current upper axis limit.
major_ticks = np.arange(0, max(axs.get_ylim()) + 2.5, 2)
axs.set_yticks(major_ticks)

fig.set_facecolor('w')
axs.set_facecolor('k')

plt.savefig('dtpb1.png')
In [87]:
# Histogram of mid-career median salaries across majors. Bars are coloured by
# their height relative to the tallest bin, and the y-axis is labelled as a
# percentage of the number of majors.
n_bins = 6
fig, axs = plt.subplots(1, 1, figsize=(20, 6))
fig.suptitle('Mid-Career Median Salaries Histogram', fontsize = 16)

# Dashed white vertical line at the mean of the plotted series.
axs.axvline(mid_career_median_salary.mean(), color='w', linestyle='dashed', linewidth=1)

N, bins, patches = axs.hist(mid_career_median_salary, bins=n_bins, color='green', edgecolor='black')

# Recolour each bar on the viridis scale according to its count relative to the
# tallest bin; this overrides the uniform 'green' passed to hist above.
fracs = N / N.max()
norm = colors.Normalize(fracs.min(), fracs.max())
for thisfrac, thispatch in zip(fracs, patches):
    color = plt.cm.viridis(norm(thisfrac))
    thispatch.set_facecolor(color)

# Tick values are raw counts; show them as a share of the series being plotted
# (not of an unrelated series that merely happens to have the same length).
axs.yaxis.set_major_formatter(PercentFormatter(xmax=len(mid_career_median_salary)))

axs.set_ylabel('Percentage', fontsize=14, labelpad=15)
axs.set_xlabel('Salaries', fontsize=14)

axs.tick_params(labelsize=12)
axs.tick_params(axis='x', labelrotation=45)

# Y ticks every 2 counts, from 0 to just past the current upper axis limit.
major_ticks = np.arange(0, max(axs.get_ylim()) + 2.5, 2)
axs.set_yticks(major_ticks)

axs.set_facecolor('k')
fig.set_facecolor('w')

# Save through the figure handle so the file always contains this figure.
fig.savefig('dtpb2.png')
In [88]:
# Histogram of the percentage salary change from starting to mid-career. Bars are
# coloured by their height relative to the tallest bin, and the y-axis is
# labelled as a percentage of the number of majors.
n_bins = 9

fig, axs = plt.subplots(1, 1, figsize=(20, 6))
fig.suptitle('Percentage Salary Change from Starting to Mid-Career Histogram', fontsize = 16)

# Dashed white vertical line at the mean of the plotted series.
axs.axvline(percent_change.mean(), color='w', linestyle='dashed', linewidth=1)

N, bins, patches = axs.hist(percent_change, bins=n_bins, color='green', edgecolor='black')

# Recolour each bar on the viridis scale according to its count relative to the
# tallest bin; this overrides the uniform 'green' passed to hist above.
fracs = N / N.max()
norm = colors.Normalize(fracs.min(), fracs.max())
for thisfrac, thispatch in zip(fracs, patches):
    color = plt.cm.viridis(norm(thisfrac))
    thispatch.set_facecolor(color)

# Tick values are raw counts; show them as a share of the series being plotted
# (not of an unrelated series that merely happens to have the same length).
axs.yaxis.set_major_formatter(PercentFormatter(xmax=len(percent_change)))

axs.set_ylabel('Percentage', fontsize=14, labelpad=15)
axs.set_xlabel('Percentage Change', fontsize=14)

axs.tick_params(labelsize=12)
axs.tick_params(axis='x', labelrotation=45)

# Y ticks every 2 counts, from 0 to just past the current upper axis limit.
major_ticks = np.arange(0, max(axs.get_ylim()) + 2.5, 2)
axs.set_yticks(major_ticks)

axs.set_facecolor('k')
fig.set_facecolor('w')

# Save through the figure handle so the file always contains this figure.
fig.savefig('dtpb3.png')
In [121]:
# Reload the raw CSV into a second frame for the per-major plots, clean the
# currency columns the same way as `df`, and order majors from highest to lowest
# mid-career median salary.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs
# where that exact file exists.
df2 = pd.read_csv(r"C:\Users\james\Downloads\Degrees that Pay Back\degrees_that_pay_back.csv")
# Raw-string pattern: "\$" inside a normal string literal is an invalid escape
# sequence and triggers a warning on newer Python versions. Same regex as before.
df2[df2.columns[1:]] = df2[df2.columns[1:]].replace(r'[\$,]', '', regex=True).astype(float)
df2 = df2.sort_values('Mid-Career Median Salary', ascending=False)
In [122]:
# Preview the first rows after sorting by mid-career median salary (descending).
df2.head()
Out[122]:
Undergraduate Major Starting Median Salary Mid-Career 10th Percentile Salary Mid-Career 25th Percentile Salary Mid-Career Median Salary Percent change from Starting to Mid-Career Salary Mid-Career 75th Percentile Salary Mid-Career 90th Percentile Salary
8 Chemical Engineering 63200.0 71900.0 87300.0 107000.0 69.3 143000.0 194000.0
12 Computer Engineering 61400.0 66100.0 84100.0 105000.0 71.0 135000.0 162000.0
19 Electrical Engineering 60900.0 69300.0 83800.0 103000.0 69.1 130000.0 168000.0
1 Aerospace Engineering 57700.0 64300.0 82100.0 101000.0 75.0 127000.0 161000.0
17 Economics 50100.0 50600.0 70600.0 98600.0 96.8 145000.0 210000.0
In [123]:
# Reshape to long format: one row per (major, salary measure) pair, with the
# measure name in 'variable' and the amount in 'value'. The percent-change
# column is not listed in value_vars, so it is not carried over.
df2 = df2.melt(
    id_vars=['Undergraduate Major'],
    value_vars=[
        'Starting Median Salary',
        'Mid-Career Median Salary',
        'Mid-Career 10th Percentile Salary',
        'Mid-Career 25th Percentile Salary',
        'Mid-Career 75th Percentile Salary',
        'Mid-Career 90th Percentile Salary',
    ],
)
df2.tail()
Out[123]:
Undergraduate Major variable value
295 Music Mid-Career 90th Percentile Salary 134000.0
296 Interior Design Mid-Career 90th Percentile Salary 107000.0
297 Spanish Mid-Career 90th Percentile Salary 96400.0
298 Education Mid-Career 90th Percentile Salary 102000.0
299 Religion Mid-Career 90th Percentile Salary 96400.0
In [124]:
# Three side-by-side violin plots: starting median salary, mid-career median
# salary, and percentage change. The first two mark the median, the third marks
# the mean.
fig, axs = plt.subplots(1, 3, figsize=(20, 6))

# (series, marker option for violinplot, panel title, y-axis label)
panels = [
    (starting_median_salary, {'showmedians': True}, 'Starting Median Salary', 'Salary'),
    (mid_career_median_salary, {'showmedians': True}, 'Mid-Career Median Salary', 'Salary'),
    (percent_change, {'showmeans': True}, 'Percentage Change', 'Percentage Change'),
]

for panel_ax, (series, marker_option, panel_title, y_label) in zip(axs, panels):
    violin_parts = panel_ax.violinplot(series, **marker_option)
    # Red body with a black outline for every violin.
    for pc in violin_parts['bodies']:
        pc.set_facecolor('red')
        pc.set_edgecolor('black')
    panel_ax.title.set_text(panel_title)
    panel_ax.set_ylabel(y_label, fontsize=14, labelpad=15)
    panel_ax.set_facecolor('lightgrey')

# The third panel's values are already percentages, so only append a "%" sign.
axs[2].yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'))

# Widen the horizontal gap between panels; every other spacing keeps its default.
plt.subplots_adjust(wspace=0.3)

fig.set_facecolor('w')
plt.savefig('dtpb4.png')
In [125]:
#boxplots sorted by highest median salary
# One box per major over its six salary measures (the long-format `value`
# column), with the individual points overlaid and horizontal reference lines
# for the pooled quartiles and mean.

fig, ax = plt.subplots(1, 1, figsize=(20, 6))
# Runs before anything is drawn, so it only accounts for the empty axes.
plt.tight_layout()
sns.set_style("whitegrid")
# Pass the axes explicitly so both layers are drawn on `ax` regardless of which
# axes pyplot currently considers active.
bplot = sns.boxplot(y='value', x='Undergraduate Major', data=df2, width=0.5, color='white', ax=ax)
bplot = sns.stripplot(y='value', x='Undergraduate Major', data=df2, jitter=True, marker='o', alpha=0.5, color='k', edgecolor='c', ax=ax)
plt.xticks(rotation='vertical', fontsize=12)

dark_brown = '#B25116'
dark_pink = '#FB84D1'

# iterate over boxes
# Older matplotlib listed the box patches in ax.artists; from matplotlib 3.5 they
# are reported through ax.patches and ax.artists is empty for this plot, which
# silently skipped the recolouring. Use whichever list is populated.
boxes = list(ax.artists) or list(ax.patches)
for i, box in enumerate(boxes):
    # Alternate the outline colour so neighbouring boxes are easy to tell apart.
    col = dark_pink if i % 2 == 0 else dark_brown
    box.set_edgecolor(col)
    box.set_facecolor('white')

# Whiskers, caps and median lines in black (the reference lines are added after
# this call, so they keep their own colours).
plt.setp(ax.lines, color='k')

# Reference lines computed over every salary value in df2 (all majors and all
# measures pooled): 25th percentile, median, 75th percentile and mean.
ax.axhline(df2['value'].quantile(0.25), color='r')
ax.axhline(df2['value'].quantile(0.5), color='orange')
ax.axhline(df2['value'].quantile(0.75), color='g')
ax.axhline(df2['value'].mean(), color='b')

ax.set_xlabel('Undergraduate Major', fontsize=14)
# Final tick-label rotation; this overrides the 'vertical' rotation set above.
plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right')
ax.set_title('Salaries by Major Boxplot Sorted by Highest Mid-Career Median Income', fontsize=14)

# Leave room at the bottom for the rotated major names.
fig.subplots_adjust(bottom=0.15)

ax.set_facecolor('lightgrey')
fig.set_facecolor('w')
fig.savefig('dtpb5.png')
In [126]:
# Salary measures, in the order the sparkline cell uses them as x positions.
salaries = [
    'Starting Median Salary',
    'Mid-Career 10th Percentile Salary',
    'Mid-Career 25th Percentile Salary',
    'Mid-Career Median Salary',
    'Mid-Career 75th Percentile Salary',
    'Mid-Career 90th Percentile Salary',
]
In [127]:
# Grid of small line charts ("sparklines"), one per major, each showing that
# major's six salary values with a dashed line and a label at the mid-career
# median.

# Majors in the order they currently appear in df2.
degrees = df2['Undergraduate Major'].unique()
# Sort into a new frame instead of rebinding df2, so re-running this cell does
# not change the order of `degrees` (and therefore the panel order).
salaries_sorted = df2.sort_values('value', ascending=True)
# Seeded generator so the per-panel line colours are the same on every run.
color_rng = np.random.RandomState(0)

fig, axs = plt.subplots(5, 10, figsize=(30, 15))
fig.suptitle('Salaries Sparkline by Major sorted by Highest Mid-Career Median Salary', y=0.94, fontsize = 14)
axs = axs.flatten()
for i, degree in enumerate(degrees):
    degree_rows = salaries_sorted[salaries_sorted['Undergraduate Major'] == degree]
    # NOTE(review): the values are drawn in ascending order, which matches the
    # label order in `salaries` only when a major's starting median is its
    # smallest value. That is not always true (Psychology: starting 35,900 vs
    # 10th percentile 31,600). X ticks are hidden, so only the line shape is
    # affected -- confirm whether ascending order is what is wanted.
    axs[i].plot(salaries, degree_rows['value'], c=color_rng.rand(3))
    axs[i].set_title(degree, fontsize=10)
    # Shared y range so panels are directly comparable.
    axs[i].set_ylim(0, 220000)
    axs[i].set_xticks([])
    mediansalary = int(degree_rows.loc[degree_rows['variable'] == 'Mid-Career Median Salary', 'value'].max())
    axs[i].axhline(mediansalary, color='k', linestyle='dashed', linewidth=1)
    axs[i].text(1, mediansalary + 10000, f'{mediansalary:,}',
            size=12)

# Widen the horizontal gap between panels; every other spacing keeps its default.
fig.subplots_adjust(wspace=0.4)

# Save before showing: once the figure has been shown, pyplot's current figure
# is a new empty one, so saving afterwards writes a blank image.
fig.savefig('dtpb6.png')

plt.show()
<Figure size 432x288 with 0 Axes>
In [ ]: