'''
UNIVERSITY RANKINGS FROM THE TIMES 2011 - 2016
'''
import pandas as pd
import plotly.offline as pyo
from plotly.offline import init_notebook_mode, plot_mpl
import plotly.graph_objects as go
import plotly.io as pio
import plotly.express as px
import plotly
import pandas as pd
from random import choice
from plotly.subplots import make_subplots
import math
import numpy as np
import ipywidgets
from ipywidgets import widgets
from ipywidgets import interactive, HBox, VBox
import statistics
# Select the JupyterLab renderer and initialise plotly's offline notebook mode.
pio.renderers.default='jupyterlab'
pyo.init_notebook_mode(connected=True)
# NOTE(review): absolute, machine-specific path; the script only runs where this file exists.
df=pd.read_csv(r"C:\Users\james\Downloads\University Rankings\timesData.csv", encoding='latin-1')
# Bare expressions such as the two below only display output as the last line of a notebook cell.
df
df.dtypes
# Cast the country column to Python strings before it is used as a grouping key.
df['country'] = df['country'].astype(str)
df['country'].dtypes
# Collapse the raw table to one row per (country, year, university), averaging
# the numeric columns. numeric_only=True keeps text columns from raising a
# TypeError on pandas >= 2.0 (older versions dropped them silently).
df_with_count = df.groupby(['country', 'year', 'university_name'], as_index=False).mean(numeric_only=True)

# Each remaining row is exactly one appearance on the list, so both marker
# columns are a constant 1. This replaces the nested per-row .loc loops, which
# wrote the same value into every row.
df_with_count['count'] = 1
df_with_count['uni_count'] = 1

# Running count and group size per (country, year).
df_with_count['running_country_sum_year'] = df_with_count.groupby(['country', 'year'])['count'].cumsum()
df_with_count['max_country_sum_year'] = df_with_count.groupby(['country', 'year'])['running_country_sum_year'].transform('max')

# Running count and total appearances per country across all years.
df_with_count['running_country_sum'] = df_with_count.groupby(['country'])['count'].cumsum()
df_with_count['countries total times on list'] = df_with_count.groupby(['country'])['running_country_sum'].transform('max')

# Running count and total appearances per university across all years.
df_with_count['running_uni_sum'] = df_with_count.groupby(['university_name'])['uni_count'].cumsum()
df_with_count['uni total times on list'] = df_with_count.groupby(['university_name'])['running_uni_sum'].transform('max')
df_with_count = df_with_count.sort_values('income', ascending=False)

# Country-level means, per year and over all years. numeric_only=True skips
# the university_name text column, which pandas >= 2.0 refuses to average.
df_count_country_with_year = df_with_count.groupby(['country', 'year'], as_index=False).mean(numeric_only=True)
df_count_country_with_year['income'] = df_count_country_with_year['income'].round(2)
df_count_country = df_with_count.groupby('country', as_index=False).mean(numeric_only=True)

# One row per (year, university, country). 'count' is the number of rows that
# share the same university and year; this frame and df_with_count are built
# from the same three grouping keys, so the counts agree with the original
# per-row lookup into df_with_count.
df_university_count = df.groupby(['year', 'university_name', 'country'], as_index=False).mean(numeric_only=True)
df_university_count['count'] = df_university_count.groupby(['university_name', 'year'])['country'].transform('size')
df_university_count['running_sum'] = df_university_count.groupby(['university_name'])['count'].cumsum()
df_university_count['total times on list'] = df_university_count.groupby(['university_name'])['running_sum'].transform('max')

# Combined score used by most charts: the mean of the research and income columns.
df_with_count['income and research average'] = (df_with_count['research'] + df_with_count['income'])/2
df_count_country['income and research average'] = (df_count_country['research'] + df_count_country['income'])/2
df_university_count['income and research average'] = (df_university_count['research'] + df_university_count['income'])/2
df_count_country_with_year['income and research average'] = round((df_count_country_with_year['research'] + df_count_country_with_year['income'])/2, 2)
# 2016 snapshot, ranked by the combined income/research score.
# .copy() makes the slice an independent frame, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
df_with_count_2016 = df_with_count.query('year==2016').copy()
df_with_count_2016['income and research average'] = (df_with_count_2016['research'] + df_with_count_2016['income'])/2
df_with_count_2016 = df_with_count_2016.sort_values('income and research average', ascending=False)

# 2x2 grid: two xy panels on top and one double-width cell underneath.
fig = make_subplots(
    rows=2, cols=2,
    specs=[[{'type': 'xy'}, {'type': 'xy'}],
           [{"colspan": 2}, None]],
    vertical_spacing=0.35)

# Top-left panel: one bar per university, coloured by its combined score.
fig.add_trace(
    go.Bar(
        x=df_with_count_2016['university_name'],
        y=df_with_count_2016['income and research average'],
        # customdata columns: [0] income, [1] total appearances,
        # [2] research, [3] combined score.
        customdata=np.stack((list(round(df_with_count_2016['income'], 2)),
                             list(df_with_count_2016['uni total times on list']),
                             list(df_with_count_2016['research']),
                             df_with_count_2016['income and research average']), axis=-1),
        text=df_with_count_2016['country'],
        hovertemplate=(
            '<br><b>University Name</b>: %{x}' +
            '<br><b># of Appearances from 2011 - 2016</b>: %{customdata[1]}' +
            '<br><b>Income Score/100</b>: %{customdata[0]}' +
            '<br><b>Research Score/100</b>: %{customdata[2]}' +
            '<br><b>Income and Research Score/100</b>: %{customdata[3]}' +
            '<br><b>Country</b>: %{text}<extra></extra>'),
        marker=dict(color=df_with_count_2016['income and research average'],
                    colorscale='Inferno'),
        showlegend=False),
    row=1, col=1
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})

# Caption for the overall mean, anchored on the middle row of the sorted frame.
# .iloc is required here: the frame keeps df_with_count's index labels after
# filtering and sorting, so a positional number passed to .loc could hit a
# missing label (KeyError) or an unrelated row.
# The font size is given directly as 12 instead of being patched afterwards
# through fig['layout']['annotations'][0].
fig.add_annotation(
    x=df_with_count_2016['university_name'].iloc[len(df_with_count_2016) // 2],
    y=round(df_with_count_2016['income and research average'].mean(), 2)+5,
    xref="x1",
    yref="y1",
    text="Average of research and income / 100: " + str(round(df_with_count_2016['income and research average'].mean(), 2)),
    showarrow=False,
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="white"
    ))
# Random colours handed out to the per-country scatter traces.
colours = []


def colour() -> str:
    """Return a random colour as a '#' followed by six lowercase hex digits."""
    hex_chars = '0123456789abcdef'
    # One choice() call per digit, joined once instead of growing a string in a loop.
    return '#' + ''.join(choice(hex_chars) for _ in range(6))
# Order countries by total appearances so the legend lists the largest first,
# then draw one random colour per country.
df_with_count_2016 = df_with_count_2016.sort_values('countries total times on list', ascending=False)
countries = df_with_count_2016['country'].unique()
for c in countries:
    colours.append(colour())

# Top-right panel: income (x) against research (y), one trace per country.
for i, c in enumerate(countries):
    # Filter once per country instead of re-evaluating the mask for every argument.
    country_rows = df_with_count_2016[df_with_count_2016['country'] == c]
    fig.add_trace(
        go.Scatter(
            y=country_rows['research'],
            x=country_rows['income'],
            mode='markers',
            marker=dict(
                size=country_rows['num_students']/100,
                color=colours[i],
                line_color='rgb(40,40,40)',
                line_width=0.5,
                sizemode='area',
                colorscale='Inferno',
                reversescale=False),
            # customdata columns: [0] income, [1] total appearances,
            # [2] research, [3] combined score.
            customdata=np.stack((list(round(country_rows['income'], 2)),
                                 list(country_rows['uni total times on list']),
                                 list(country_rows['research']),
                                 list(country_rows['income and research average'])), axis=-1),
            # text columns: [0] university name, [1] country.
            text=np.stack((list(country_rows['university_name']),
                           list(country_rows['country'])), axis=-1),
            hovertemplate=(
                '<br><b>University Name</b>: %{text[0]}' +
                '<br><b># of Appearances from 2011 - 2016</b>: %{customdata[1]}' +
                '<br><b>Income Score/100</b>: %{customdata[0]}' +
                '<br><b>Research Score/100</b>: %{customdata[2]}' +
                '<br><b>Average Income and Research Score/100</b>: %{customdata[3]}' +
                '<br><b>Country</b>: %{text[1]}<extra></extra>'),
            legendgroup=c,
            showlegend=True,
            name=str(c),
            opacity=.9),
        row=1, col=2)
# Quadrant captions for the income-vs-research scatter (x2/y2 axes).
# They are appended with add_annotation rather than
# fig['layout'].update(annotations=[...]): update() merges its list
# element-by-element into annotations that already exist, so the first entry
# (previously an empty-text placeholder) overwrote the position and text of the
# annotation already stored at index 0.
quadrant_labels = [
    (92, 98, "High income, high research"),
    (8, 5, "Low income, low research"),
    (5, 98, "High research, low income"),
    (92, 5, "High income, low research"),
]
for label_x, label_y, label_text in quadrant_labels:
    fig.add_annotation(
        x=label_x, y=label_y,  # annotation point
        xref='x2',
        yref='y2',
        text=label_text,
        showarrow=False,
        font=dict(
            color="white",
            size=14
        ),
    )
# Caption on the scatter axes (x2/y2), placed at x=5 and 5 units above the mean
# research score; its text reports the mean income.
# NOTE(review): the horizontal dashed line added further down sits at the mean
# *research* score, so this caption's text may be paired with the wrong line. Confirm.
fig.add_annotation(
x=5,
y=round(df_with_count_2016['research'].mean(), 2)+5,
xref="x2",
yref="y2",
text="Average income: " + str(round(df_with_count_2016['income'].mean(), 2)),
showarrow=False,
font=dict(
family="Courier New, monospace",
size=12,
color="white"
))
# NOTE(review): this restyles the annotation stored at index 1, which is not the
# one added just above (that one is last in the list). Confirm the intended index.
fig['layout']['annotations'][1].update({'font': {'size': 12}})
# Second caption, rotated 90 degrees, fixed at x=0, y=17 on the scatter axes;
# its text reports the mean research score.
fig.add_annotation(
x=0,
y=17,
xref="x2",
yref="y2",
text="Average research: " +str(round(df_with_count_2016['research'].mean(), 2)),
textangle=-90,
showarrow=False,
font=dict(
family="Courier New, monospace",
size=12,
color="white"
))
# Same index as above: sets the font size of annotation 1 to 14, overriding the 12 set earlier.
fig['layout']['annotations'][1].update({'font': {'size': 14}})
# Per-country means of the 2016 rows. numeric_only=True skips the text
# columns, which pandas >= 2.0 would otherwise refuse to average.
df_with_count_2016_mean = df_with_count_2016.groupby('country', as_index=False).mean(numeric_only=True)

# World map coloured by each country's combined score. No row/col is given,
# so the trace is not tied to a cell of the subplot grid.
fig.add_trace(go.Choropleth(
    locations=df_with_count_2016_mean['country'],
    locationmode='country names',
    z=df_with_count_2016_mean['income and research average'],
    colorscale='Inferno',
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    # customdata columns: [0] income, [1] appearances in 2016,
    # [2] research, [3] combined score.
    customdata=np.stack((list(round(df_with_count_2016_mean['income'], 2)),
                         list(df_with_count_2016_mean['max_country_sum_year']),
                         list(round(df_with_count_2016_mean['research'], 2)),
                         list(round(df_with_count_2016_mean['income and research average'], 2))), axis=-1),
    text=df_with_count_2016_mean['country'],
    hovertemplate=(
        '<br><b># of Appearances in 2016</b>: %{customdata[1]}' +
        '<br><b>Income Score/100</b>: %{customdata[0]}' +
        '<br><b>Research Score/100</b>: %{customdata[2]}' +
        '<br><b>Average Income and Research Score/100</b>: %{customdata[3]}' +
        '<br><b>Country</b>: %{text}<extra></extra>'),
    showlegend=False,
    showscale=False
))
# Dashed reference lines at the 2016 means. Each mean is computed once with
# pandas (which skips NaN), the same way the annotation captions compute theirs.
# The previous statistics.mean over a zipped list was evaluated twice and
# returns NaN as soon as one row is missing a score.
combined_mean_2016 = df_with_count_2016['income and research average'].mean()
research_mean_2016 = df_with_count_2016['research'].mean()
income_mean_2016 = df_with_count_2016['income'].mean()

# Bar panel: horizontal line at the mean combined score, spanning every bar.
fig.add_shape(go.layout.Shape(
    type="line",
    yref="y1",
    xref="x1",
    x0=0,
    y0=combined_mean_2016,
    x1=len(df_with_count_2016['university_name']),
    y1=combined_mean_2016,
    line=dict(color='blue', width=3, dash='dash')
))

# Scatter panel: horizontal line at the mean research score.
fig.add_shape(go.layout.Shape(
    type="line",
    yref="y2",
    xref="x2",
    x0=0,
    y0=research_mean_2016,
    x1=100,
    y1=research_mean_2016,
    line=dict(color='blue', width=3, dash='dash')
))

# Scatter panel: vertical line at the mean income score.
fig.add_shape(go.layout.Shape(
    type="line",
    yref="y2",
    xref="x2",
    x0=income_mean_2016,
    y0=0,
    x1=income_mean_2016,
    y1=100,
    line=dict(color='blue', width=3, dash='dash')
))
# Final styling for the 2016 dashboard, applied in the same order as before.

# The bar panel has one tick per university, so its x tick labels are hidden.
fig.update_xaxes(row=1, col=1, showticklabels=False)

# Hover boxes: white background, 16pt Rockwell.
fig.update_layout(hoverlabel={
    'bgcolor': "white",
    'font_size': 16,
    'font_family': "Rockwell",
})

fig.update_layout(plot_bgcolor='grey')
fig.update_layout(width=1250, height=2000)

# Centred title pinned to the top edge, plus the figure-wide font.
fig.update_layout(
    title={
        'text': "Income and Research Average Score / 100",
        'x': 0.5,
        'y': 1,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    font={'color': "white", 'size': 18},
)

fig.update_layout(template='plotly_dark')
fig.show()
# Notebook display only.
df_with_count_2016_mean.columns
# Re-slice the 2016 rows from df_with_count for the interactive widget below.
df_with_count_2016 = df_with_count[df_with_count['year']==2016]
# Interactive scatter. It starts with 'income' on both axes; marker area
# follows num_students/100 and marker colour follows income.
f = go.FigureWidget([go.Scatter(y = df_with_count_2016['income'], x = df_with_count_2016['income'], mode = 'markers',
marker = dict(
size=df_with_count_2016['num_students']/100,
color = df_with_count_2016['income'],
line_color='rgb(40,40,40)',
line_width=0.5,
sizemode = 'area',
colorscale='Inferno',
reversescale=False,
colorbar={'title': 'Income Score/100', "tickvals": list(range(0, 100, 10)), "nticks": 10, 'ticksuffix': '%'
},
),
# customdata columns: [0] income, [1] total appearances, [2] research, [3] num_students.
# Only [1] and [3] are referenced by the hover template below.
customdata=np.stack((list(round(df_with_count_2016['income'], 2)),
list(df_with_count_2016['uni total times on list']), list(round(df_with_count_2016['research'], 2)), list(df_with_count_2016['num_students'])), axis=-1),
text=df_with_count_2016['university_name'],
hovertemplate =
'<br><b>University</b>: %{text}' +
'<br><b>Times on List</b>: %{customdata[1]}' +
'<br><b># Students</b>: %{customdata[3]}' +
'<br><b>x</b>: %{x}' +
'<br><b>y</b>: %{y}<extra></extra>'
,
)])
scatter = f.data[0]
N = len(df_with_count_2016)
# Jitter: add a random offset of up to one tenth of the income range to each coordinate.
scatter.x = scatter.x + np.random.rand(N)/10 *(df_with_count_2016['income'].max() - df_with_count_2016['income'].min())
scatter.y = scatter.y + np.random.rand(N)/10 *(df_with_count_2016['income'].max() - df_with_count_2016['income'].min())
scatter.marker.opacity = 0.5
def update_axes(xaxis, yaxis):
    """Re-plot the widget scatter with the chosen columns on each axis.

    ``xaxis`` and ``yaxis`` are column names of ``df_with_count_2016``. Each
    coordinate gets a random offset of up to one tenth of its column's range.
    The axis titles and both coordinate arrays are assigned inside a single
    ``batch_update`` so the widget receives them together; the un-jittered
    values are no longer assigned first outside the batch.
    """
    scatter = f.data[0]
    x_values = df_with_count_2016[xaxis]
    y_values = df_with_count_2016[yaxis]
    with f.batch_update():
        f.layout.xaxis.title = xaxis
        f.layout.yaxis.title = yaxis
        # Size the jitter from the column itself rather than the module-level N.
        scatter.x = x_values + np.random.rand(len(x_values))/10 * (x_values.max() - x_values.min())
        scatter.y = y_values + np.random.rand(len(y_values))/10 * (y_values.max() - y_values.min())
# Two controls (yaxis, xaxis) that call update_axes; both offer the same six columns.
axis_dropdowns = interactive(update_axes, yaxis = df_with_count_2016[['teaching', 'research',
'citations', 'income', 'num_students', 'student_staff_ratio']].columns, xaxis = df_with_count_2016[['teaching', 'research',
'citations', 'income', 'num_students', 'student_staff_ratio']].columns)
# Table widget, initially filled with every 2016 row; selection_fn later
# replaces its cell values with the selected points.
# NOTE(review): `align` lists 5 entries for 10 columns. All entries are 'left'; confirm the length is intended.
t = go.FigureWidget([go.Table(
header=dict(values=['country', 'year', 'university_name', 'teaching', 'research',
'citations', 'income', 'num_students', 'student_staff_ratio','uni total times on list'],
fill = dict(color='#C2D4FF'),
align = ['left'] * 5),
cells=dict(values=[df_with_count_2016[col] for col in ['country', 'year', 'university_name', 'teaching', 'research',
'citations', 'income', 'num_students', 'student_staff_ratio', 'uni total times on list']],
fill = dict(color='#F5F8FF'),
align = ['left'] * 5))])
def selection_fn(trace, points, selector):
    """Show only the selected scatter points in the table widget ``t``.

    ``points.point_inds`` holds positional indices, which are used to pick
    rows from ``df_with_count_2016``.
    """
    table_columns = ['country', 'year', 'university_name', 'teaching', 'research',
                     'citations', 'income', 'num_students', 'student_staff_ratio',
                     'uni total times on list']
    cell_values = []
    for column in table_columns:
        cell_values.append(df_with_count_2016.iloc[points.point_inds][column])
    t.data[0].cells.values = cell_values


# Register the callback and lay out the controls, scatter and table vertically.
scatter.on_selection(selection_fn)
VBox((HBox(axis_dropdowns.children), f, t))
# Animated world map with one frame per year and one equally sized marker per
# country, coloured by the country's combined income/research score.
# Assumes the source CSV supplies numeric 'latitude'/'longitude' columns. TODO confirm.
fig = px.scatter_geo(
    df_count_country_with_year.sort_values('year', ascending=True),
    lat='latitude',
    lon='longitude',
    size=[1]*df_count_country_with_year['country'].size,
    color="income and research average",
    hover_name="country",
    hover_data={
        'income': True,
        'research': True,
        'income and research average': True,
        'max_country_sum_year': True,
        'count': False,
        'latitude': False,
        'longitude': False,
    },
    animation_frame="year",
    projection='natural earth',
    opacity=0.45
)
fig.update_layout(
    autosize=False,
    width=900,
    height=600,)
# The title used to say "Median Income", but the colour is the per-country
# mean (groupby().mean()) of the combined income and research score.
fig.update_layout(
    title_text='Universities by Country Average Income and Research Score/100', title_x=0.5)
fig.update_layout(
    template='plotly_dark')
fig.show()
# Notebook displays only.
df_count_country_with_year
df_with_count_2016_mean
# One bar per country: height and colour are the combined score, and the label
# above each bar is the country's total number of list appearances.
fig = px.bar(df_count_country, y='income and research average', x='country', text='countries total times on list', color='income and research average', color_continuous_scale='Jet')
fig.update_traces(texttemplate='%{text:}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.update_layout(
    title_text='Country Average Income Score/100 and # of Universities from 2011 - 2016',
    title_x=0.5)
fig.update_layout(
    autosize=False,
    width=900,
    height=600,)

# Mean of the per-country scores, computed once for the line and its caption.
country_score_mean = df_count_country['income and research average'].mean()
fig.add_shape(
    # Horizontal line at the mean (y0 == y1), spanning all bars.
    dict(
        type="line",
        x0=-0.7,
        y0=country_score_mean,
        x1=len(df_count_country['country']),
        y1=country_score_mean,
        line=dict(
            color="LightSeaGreen",
            width=4,
            dash="dashdot",
        )
    ))
fig.update_layout(
    template='plotly_dark')
# The caption is anchored on a country picked by position, so .iloc is used;
# .loc only worked while the index happened to be 0..n-1.
fig.add_annotation(
    x=df_count_country['country'].iloc[len(df_count_country) // 2 + 1],
    y=round(country_score_mean, 2)+5,
    xref="x1",
    yref="y1",
    text="Average of research and income / 100: " + str(round(country_score_mean, 2)),
    showarrow=False,
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="white"
    ))
fig.show()
# Per-university means over all years. The counter columns are coerced to
# numeric on a copy first so that numeric_only=True (needed on pandas >= 2.0,
# where averaging the text 'country' column raises) does not drop them.
uni_numeric = df_university_count.copy()
uni_counter_columns = ['count', 'running_sum', 'total times on list']
uni_numeric[uni_counter_columns] = uni_numeric[uni_counter_columns].apply(pd.to_numeric)
df_university_count_2 = uni_numeric.groupby('university_name', as_index=False).mean(numeric_only=True)

fig = px.bar(df_university_count_2, y='income and research average', x='university_name', text='total times on list', color='income and research average', color_continuous_scale='jet',
             hover_name="university_name", hover_data={
                 'income and research average': True,
                 'total times on list': True,})
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.update_layout(
    title_text='Universities by Average Income Score/100 and # of Universities from 2011 - 2016', title_x=0.5
)
fig.update_layout(
    autosize=False,
    width=900,
    height=700,
    font=dict(
        size=10,
    ))

# Mean of the plotted per-university scores, shared by the line and its caption.
university_score_mean = df_university_count_2['income and research average'].mean()
fig.add_shape(
    dict(
        type="line",
        x0=-0.7,
        y0=university_score_mean,
        x1=len(df_university_count_2['university_name']),
        y1=university_score_mean,
        line=dict(
            color="LightSeaGreen",
            width=4,
            dash="dashdot",
        )
    ))
# The caption now uses the plotted frame (df_university_count_2). It used to
# read its anchor and value from the per-year frame, so the number shown did
# not match the height of the line above.
fig.add_annotation(
    x=df_university_count_2['university_name'].iloc[len(df_university_count_2) // 2],
    y=round(university_score_mean, 2)+2,
    xref="x1",
    yref="y1",
    text="Average of research and income / 100: " + str(round(university_score_mean, 2)),
    showarrow=False,
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="white"
    ))
fig.update_layout(
    template='plotly_dark')
fig.update_xaxes(tickangle=45)
fig.show()
# All-years mean per (university, country). numeric_only=True keeps pandas >= 2.0
# from raising if any non-key text column is present.
df_all_years_groupby = df_with_count.groupby(['university_name', 'country'], as_index=False).mean(numeric_only=True)
# Countries with the most appearances come first; this fixes the subplot order.
df_all_years_groupby = df_all_years_groupby.sort_values('countries total times on list', ascending=False)
country_names = df_all_years_groupby['country'].unique()
country_appearances = {}
# n: number of countries to draw; n_cols: subplot columns per row.
n = len(country_names)
n_cols = 3
def create_bar(df, row, col, c):
    """Add country ``c``'s universities as a bar trace at (row, col) of the module-level ``fig``.

    Bar height and colour are the 'income and research average' column of the
    rows of ``df`` whose 'country' equals ``c``.
    """
    country_rows = df[df['country'] == c]
    scores = country_rows['income and research average']
    # customdata columns: [0] rounded score, [1] total appearances.
    hover_columns = np.stack(
        (list(round(scores, 2)), list(country_rows['uni total times on list'])),
        axis=-1)
    bar_trace = go.Bar(
        x=country_rows['university_name'],
        y=scores,
        customdata=hover_columns,
        text=country_rows['country'],
        hovertemplate=(
            '<br><b>University Name</b>: %{x}' +
            '<br><b># of Appearances</b>: %{customdata[1]}' +
            '<br><b>Average Income and Resarch Score/100</b>: %{y}<extra></extra>'),
        name=str(c),
        marker=dict(color=scores),
    )
    fig.add_trace(bar_trace, row=row, col=col)
def create_line(df, row, col, c):
    """Draw a flat line at country ``c``'s mean score at (row, col) of the module-level ``fig``.

    The line has one point per university of the country, all at the mean of
    'income and research average' rounded to two decimals.
    """
    country_rows = df[df['country'] == c]
    names = list(country_rows['university_name'])
    mean_score = round(country_rows['income and research average'].mean(), 2)
    mean_trace = go.Scatter(
        x=names,
        y=[mean_score] * len(names),
        hovertemplate='<br><b>Average Income Score</b>: %{y:}</br><extra></extra>',
        showlegend=False,
        mode="lines",
    )
    fig.add_trace(mean_trace, row=row, col=col)
def annotate(df, i, country=None):
    """Fix the y-axis of subplot ``i`` and caption it with the country's mean score.

    Args:
        df: frame with 'country', 'university_name' and
            'income and research average' columns.
        i: zero-based subplot index; axes ``x{i+1}`` / ``y{i+1}`` are used.
        country: country whose rows are captioned. When omitted, the
            module-level loop variable ``c`` is used, which is what callers
            passing only ``(df, i)`` have always relied on.
    """
    if country is None:
        country = c
    country_rows = df[df['country'] == country]
    names = list(country_rows['university_name'])
    mean_score = round(country_rows['income and research average'].mean(), 2)
    axis_id = str(i + 1)
    fig['layout']['yaxis' + axis_id].update(title='', range=[0, 100], dtick=20, autorange=False)
    fig.add_annotation(
        # Anchor on the university just before the middle of the country's list.
        x=names[int(len(names) / 2) - 1],
        y=mean_score + 10,
        xref="x" + axis_id,
        yref="y" + axis_id,
        text="Average: " + str(mean_score),
        showarrow=False,
        font=dict(
            family="Courier New, monospace",
            size=16,
            color="white"
        ))
    # NOTE(review): this restyles whichever annotation is stored at position i,
    # which is not necessarily the caption added above when the figure already
    # holds other annotations. Confirm the intended target.
    fig['layout']['annotations'][i].update({'font': {'size': 12}})
# Map each country to its total number of list appearances. The loop names
# deliberately avoid `t`: that name is bound to the table FigureWidget that
# selection_fn writes to, and reusing it here replaced the widget with a number.
for country_name, total_appearances in zip(df_all_years_groupby['country'], df_all_years_groupby['countries total times on list']):
    country_appearances[country_name] = total_appearances
title_items = [str(k) + ', ' + str(int(v)) + ' appearances' for k, v in country_appearances.items()]

# One subplot per country, n_cols per row, all sharing the y-axis.
fig = make_subplots(math.ceil(len(country_names[:n])/n_cols), n_cols, shared_yaxes=True,
                    subplot_titles=title_items)
df_all_years_groupby = df_all_years_groupby.sort_values('income and research average', ascending=False)

# Fill the grid left-to-right, top-to-bottom. divmod yields the same cell
# sequence as the previous hand-rolled row/col counters with their duplicated
# if/else bodies. The loop variable stays named `c` because annotate() reads it.
for i, c in enumerate(country_names[:n]):
    row, col = divmod(i, n_cols)
    row += 1
    col += 1
    create_bar(df_all_years_groupby, row, col, c)
    create_line(df_all_years_groupby, row, col, c)
    fig['layout']['yaxis'+str(i+1)].update(title='', range=[0, 100], dtick=20, autorange=False)
    annotate(df_all_years_groupby, i)

fig.update_layout(
    autosize=False,
    width=1000,
    height=3000,
    font=dict(
        size=10,
    ))
fig.update_xaxes(showticklabels=False)
fig.update_layout(
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    )
)
fig.update_layout(
    template='plotly_dark')
fig.update_layout(
    title_text='Average University Income and Research Scores by All Years (2011 - 2016)', title_x=0.5
)
fig.show()
# World map of the all-years country averages: one marker per country,
# coloured by the mean income score.
# NOTE(review): 'count' in df_count_country is the per-country mean of a column
# that is 1 on every row, so every marker gets the same size and the hover shows
# 1 for "# of Universities". 'countries total times on list' may have been
# intended. Confirm.
fig = go.Figure()
fig.add_trace(go.Scattergeo(
    lon=df_count_country['longitude'],
    lat=df_count_country['latitude'],
    marker=dict(
        size=df_count_country['count']*10,
        color=df_count_country['income'],
        line_color='rgb(40,40,40)',
        line_width=0.5,
        sizemode='area',
        colorscale='Inferno',
        reversescale=False,
        colorbar={'title': 'Income Score/100', "tickvals": list(range(0, 100, 10)), "nticks": 10, 'ticksuffix': '%'
                  },
    ),
    text=df_count_country['country'],
    # customdata columns: [0] rounded income, [1] rounded count.
    customdata=np.stack((list(round(df_count_country['income'], 2)), list(round(df_count_country['count'], 0))), axis=-1),
    hovertemplate=(
        '<br><b>Country</b>: %{text}' +
        '<br><b># of Universities</b>: %{customdata[1]}' +
        '<br><b>Average Income Score/100</b>: %{customdata[0]}<extra></extra>'),
    opacity=0.7
))
fig.update_layout(
    autosize=False,
    width=900,
    height=600,)
# The title now says "Average": the plotted values come from groupby().mean()
# and the hover text already calls them the average income score.
fig.update_layout(
    title_text='Universities by Country Average Income Score/100', title_x=0.5,
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='natural earth'
    ))
fig.show()
# The trailing bare `my_list` expression was removed: the name is never defined
# in this file, so evaluating it raised NameError.