%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
import matplotlib.ticker as mtick
from matplotlib.ticker import PercentFormatter
from matplotlib.pyplot import figure
import seaborn as sns
import pandas as pd
# Load the per-major salary table. The path is specific to the author's machine.
df = pd.read_csv(r"C:\Users\james\Downloads\Degrees that Pay Back\degrees_that_pay_back.csv")

# Strip "$" and "," from every column after the first and cast the result to
# float. The pattern is a raw string so that "\$" reaches the regex engine
# unchanged instead of being parsed as an (invalid) Python string escape.
# NOTE(review): presumably these columns arrive as currency-formatted text
# such as "$46,000.00" - confirm against the CSV.
numeric_cols = df.columns[1:]
df[numeric_cols] = df[numeric_cols].replace(r'[\$,]', '', regex=True).astype(float)
df.tail()

# Convenience handles on the three columns that the plots below read.
starting_median_salary = df['Starting Median Salary']
mid_career_median_salary = df['Mid-Career Median Salary']
percent_change = df['Percent change from Starting to Mid-Career Salary']
# Histogram of starting median salaries: six bins, bars tinted with the
# viridis colormap according to their height, y axis shown as a percentage of
# the number of rows, saved to dtpb1.png.
n_bins = 6
fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(20, 6))
fig.suptitle('Starting Median Salaries Histogram', fontsize = 16)

# Dashed white vertical line at the mean of the plotted series.
series_mean = starting_median_salary.mean()
axs.axvline(series_mean, color='w', linestyle='dashed', linewidth=1)

N, bins, patches = axs.hist(
    starting_median_salary,
    bins=n_bins,
    color='green',
    edgecolor='black',
)

# Scale the bin counts by the tallest bin, then map that range onto viridis;
# this replaces the uniform green face colour passed to hist() above.
fracs = N / N.max()
norm = colors.Normalize(vmin=fracs.min(), vmax=fracs.max())
for bar, height_frac in zip(patches, fracs):
    bar.set_facecolor(plt.cm.viridis(norm(height_frac)))

# Tick labels render each count as a percentage of the series length.
axs.yaxis.set_major_formatter(PercentFormatter(xmax=len(starting_median_salary)))
axs.set_xlabel('Salaries', fontsize=14)
axs.set_ylabel('Percentage', fontsize=14, labelpad=15)
axs.tick_params(labelsize=12)
axs.tick_params(axis='x', labelrotation=45)

# Y ticks every 2 counts, extending a little past the current upper limit.
y_upper = max(axs.get_ylim())
major_ticks = np.arange(0, y_upper + 2.5, 2)
axs.set_yticks(major_ticks)

# Black plotting area on a white figure background.
fig.set_facecolor('w')
axs.set_facecolor('k')
plt.savefig('dtpb1.png')
# Histogram of mid-career median salaries: six bins, bars tinted with the
# viridis colormap according to their height, y axis shown as a percentage of
# the number of rows, saved to dtpb2.png.
n_bins = 6
fig, axs = plt.subplots(1, 1, figsize=(20, 6))
fig.suptitle('Mid-Career Median Salaries Histogram', fontsize = 16)

# Dashed white vertical line at the mean of the plotted series.
axs.axvline(mid_career_median_salary.mean(), color='w', linestyle='dashed', linewidth=1)
N, bins, patches = axs.hist(mid_career_median_salary, bins=n_bins, color='green', edgecolor='black')

# Scale the bin counts by the tallest bin, then map that range onto viridis;
# this replaces the uniform green face colour passed to hist() above.
fracs = N / N.max()
norm = colors.Normalize(fracs.min(), fracs.max())
for thisfrac, thispatch in zip(fracs, patches):
    thispatch.set_facecolor(plt.cm.viridis(norm(thisfrac)))

# Express counts as a percentage of the series that is actually plotted here,
# rather than borrowing the length of a different column.
axs.yaxis.set_major_formatter(PercentFormatter(xmax=len(mid_career_median_salary)))
axs.set_ylabel('Percentage', fontsize=14, labelpad=15)
axs.set_xlabel('Salaries', fontsize=14)
axs.tick_params(labelsize=12)
axs.tick_params(axis='x', labelrotation=45)

# Y ticks every 2 counts, extending a little past the current upper limit.
major_ticks = np.arange(0, max(axs.get_ylim()) + 2.5, 2)
axs.set_yticks(major_ticks)

# Black plotting area on a white figure background.
axs.set_facecolor('k')
fig.set_facecolor('w')
plt.savefig('dtpb2.png')
# Histogram of the starting-to-mid-career percentage change: nine bins, bars
# tinted with the viridis colormap according to their height, y axis shown as
# a percentage of the number of rows, saved to dtpb3.png.
n_bins = 9
fig, axs = plt.subplots(1, 1, figsize=(20, 6))
fig.suptitle('Percentage Salary Change from Starting to Mid-Career Histogram', fontsize = 16)

# Dashed white vertical line at the mean of the plotted series.
axs.axvline(percent_change.mean(), color='w', linestyle='dashed', linewidth=1)
N, bins, patches = axs.hist(percent_change, bins=n_bins, color='green', edgecolor='black')

# Scale the bin counts by the tallest bin, then map that range onto viridis;
# this replaces the uniform green face colour passed to hist() above.
fracs = N / N.max()
norm = colors.Normalize(fracs.min(), fracs.max())
for thisfrac, thispatch in zip(fracs, patches):
    thispatch.set_facecolor(plt.cm.viridis(norm(thisfrac)))

# Express counts as a percentage of the series that is actually plotted here,
# rather than borrowing the length of a different column.
axs.yaxis.set_major_formatter(PercentFormatter(xmax=len(percent_change)))
axs.set_ylabel('Percentage', fontsize=14, labelpad=15)
axs.set_xlabel('Percentage Change', fontsize=14)
axs.tick_params(labelsize=12)
axs.tick_params(axis='x', labelrotation=45)

# Y ticks every 2 counts, extending a little past the current upper limit.
major_ticks = np.arange(0, max(axs.get_ylim()) + 2.5, 2)
axs.set_yticks(major_ticks)

# Black plotting area on a white figure background.
axs.set_facecolor('k')
fig.set_facecolor('w')
plt.savefig('dtpb3.png')
# Second, independent read of the same CSV; this copy is reshaped to long
# format for the per-major plots further down.
df2 = pd.read_csv(r"C:\Users\james\Downloads\Degrees that Pay Back\degrees_that_pay_back.csv")

# Strip "$" and "," from every column after the first and cast to float. The
# pattern is a raw string so that "\$" is not parsed as an (invalid) Python
# string escape.
value_cols = df2.columns[1:]
df2[value_cols] = df2[value_cols].replace(r'[\$,]', '', regex=True).astype(float)

# Highest mid-career median first; melt() keeps this row order within each
# variable, which is what fixes the order of majors in the later plots.
df2 = df2.sort_values('Mid-Career Median Salary',ascending = False)
df2.head()

# Wide -> long: one row per (major, salary figure), with the figure's column
# name in 'variable' and its amount in 'value'.
df2 = pd.melt(
    df2,
    id_vars=['Undergraduate Major'],
    value_vars=[
        'Starting Median Salary',
        'Mid-Career Median Salary',
        'Mid-Career 10th Percentile Salary',
        'Mid-Career 25th Percentile Salary',
        'Mid-Career 75th Percentile Salary',
        'Mid-Career 90th Percentile Salary',
    ],
)
df2.tail()
# Three side-by-side violin plots (starting salary, mid-career salary and
# percentage change), saved to dtpb4.png.
fig, axs = plt.subplots(1, 3, figsize=(20, 6))

# One entry per panel: data, which summary marker violinplot() should draw,
# panel title, and y-axis label. The first two panels mark the median, the
# third marks the mean.
panels = [
    (starting_median_salary, {'showmedians': True}, 'Starting Median Salary', 'Salary'),
    (mid_career_median_salary, {'showmedians': True}, 'Mid-Career Median Salary', 'Salary'),
    (percent_change, {'showmeans': True}, 'Percentage Change', 'Percentage Change'),
]

for panel_ax, (series, marker_opts, panel_title, y_label) in zip(axs, panels):
    violin_parts = panel_ax.violinplot(series, **marker_opts)
    # Red violin bodies with a black outline.
    for body in violin_parts['bodies']:
        body.set_facecolor('red')
        body.set_edgecolor('black')
    panel_ax.title.set_text(panel_title)
    panel_ax.set_ylabel(y_label, fontsize=14, labelpad=15)
    panel_ax.set_facecolor('lightgrey')

# Only the third panel gets a trailing "%" on its y tick labels.
axs[2].yaxis.set_major_formatter(mtick.FormatStrFormatter('%.0f%%'))

# Widen the horizontal gap between panels; every other spacing keeps its default.
plt.subplots_adjust(wspace=0.3)
fig.set_facecolor('w')
plt.savefig('dtpb4.png')
# Box plot plus strip plot of every salary figure per major, saved to
# dtpb5.png. Majors appear in the order they occur in df2.
# NOTE(review): the title says that order is highest mid-career median first;
# it relies on how df2 was sorted before this point - confirm.
fig, ax = plt.subplots(1, 1, figsize=(20, 6))
plt.tight_layout()
sns.set_style("whitegrid")
bplot = sns.boxplot(y='value', x='Undergraduate Major', data=df2, width=0.5, color='white')
bplot = sns.stripplot(y='value', x='Undergraduate Major', data=df2, jitter=True, marker='o', alpha=0.5, color='k', edgecolor='c')

dark_brown = '#B25116'
dark_pink = '#FB84D1'

# Alternate the box outline colour between pink and brown. Older matplotlib
# releases keep the box patches in ax.artists, newer ones in ax.patches, so
# fall back to the latter when the former is empty; otherwise this loop would
# silently do nothing on current versions.
boxes = list(ax.artists) or list(ax.patches)
for i, box in enumerate(boxes):
    col = dark_pink if i % 2 == 0 else dark_brown
    box.set_edgecolor(col)
    box.set_facecolor('white')

# Recolour every line drawn so far black. This runs before the reference
# lines below are added, so those keep their own colours.
plt.setp(ax.lines, color='k')

# Reference lines across all salary figures pooled together:
# 25th percentile (red), median (orange), 75th percentile (green), mean (blue).
plt.axhline(df2['value'].quantile(0.25), color='r')
plt.axhline(df2['value'].quantile(0.5), color='orange')
plt.axhline(df2['value'].quantile(0.75), color='g')
plt.axhline(df2['value'].mean(), color='b')

ax.set_xlabel('Undergraduate Major', fontsize=14)
# Slanted, right-aligned major names at 12pt.
plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment='right', fontsize=12)
plt.title('Salaries by Major Boxplot Sorted by Highest Mid-Career Median Income', fontsize=14)
# Leave room at the bottom for the rotated tick labels.
plt.gcf().subplots_adjust(bottom=0.15)
ax.set_facecolor('lightgrey')
fig.set_facecolor('w')
plt.savefig('dtpb5.png')
# Grid of small line charts, one per major, each annotated with that major's
# mid-career median salary; saved to dtpb6.png.
salaries = ['Starting Median Salary',
            'Mid-Career 10th Percentile Salary',
            'Mid-Career 25th Percentile Salary',
            'Mid-Career Median Salary',
            'Mid-Career 75th Percentile Salary',
            'Mid-Career 90th Percentile Salary']

# Capture the major order before re-sorting, so panels follow df2's current
# row order rather than the value order imposed on the next line.
degrees = df2['Undergraduate Major'].unique()
df2 = df2.sort_values('value', ascending = True)

# 5 x 10 grid, flattened so panels can be indexed by position.
# NOTE(review): this assumes at most 50 distinct majors - confirm.
fig, axs = plt.subplots(5, 10, figsize=(30, 15))
fig.suptitle('Salaries Sparkline by Major sorted by Highest Mid-Career Median Salary', y=0.94, fontsize = 14)
axs = axs.flatten()

for i, degree in enumerate(degrees):
    # Filter this major's rows once and reuse them for the line and the median.
    major_rows = df2.loc[df2['Undergraduate Major'] == degree]

    # NOTE(review): the values are plotted in ascending order against the
    # fixed `salaries` labels, so a point only sits under its own label when
    # the figures happen to rank in that order. The x ticks are hidden, so
    # this is not visible on the chart - confirm it is intended.
    axs[i].plot(salaries, major_rows['value'], c=np.random.rand(3,))
    axs[i].set_title(degree, fontsize=10)
    axs[i].set_ylim(0, 220000)
    axs[i].set_xticks([])

    # Dashed line and text label at the mid-career median for this major.
    is_median = major_rows['variable'] == 'Mid-Career Median Salary'
    mediansalary = int(major_rows.loc[is_median, 'value'].max())
    axs[i].axhline(mediansalary, color='k', linestyle='dashed', linewidth=1)
    axs[i].text(1, mediansalary + 10000, f'{mediansalary:,}',
                size=12)

plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.4, hspace=None)
# Save before showing: show() can release the figure, after which savefig()
# would write an empty image.
plt.savefig('dtpb6.png')
plt.show()