import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
# Apply the plotting styles in order; each later style overrides the earlier ones.
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and newer
# releases no longer accept the old names, so fall back to the renamed style
# whenever the original name is not available.
for _style in ('fivethirtyeight', 'seaborn', 'seaborn-colorblind'):
    if _style not in plt.style.available:
        _style = _style.replace('seaborn', 'seaborn-v0_8', 1)
    plt.style.use(_style)
# Default size (inches) for every figure created below.
plt.rcParams['figure.figsize'] = [12, 6]
x = str
# One column per directory year, 2020 back to 1976; these four years have no
# entry in the dtype mapping.
_years_without_column = {1977, 1980, 1983, 1985}
# NOTE(review): 'Trainee prof' is kept exactly as written, but every later
# reference in this file uses 'Trainee prog' -- this key looks like a typo.
# Confirm against the CSV header before changing it.
_column_types = {'Trainee prof': x}
_column_types.update(
    {str(_year): x for _year in range(2020, 1975, -1) if _year not in _years_without_column}
)
df = pd.read_csv('Core_Data/Film_Bang_Personnel_Master_Step_2.csv', dtype=_column_types)
# Widen the display limits so whole tables are shown.
for _option, _limit in (('display.max_columns', 50), ('display.max_rows', 300)):
    pd.set_option(_option, _limit)
# Palette indexed as colours[0] .. colours[9] by the charts below.
colours = sns.color_palette('colorblind', as_cmap=True)
# Alternative palettes tried previously:
#colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
#colours = sns.color_palette("rocket_r", as_cmap=True)
def format_percent(x):
    """Return the fraction *x* as a whole-number percentage string.

    The value is multiplied by 100 and formatted with no decimal places,
    e.g. ``0.256`` -> ``'26'``. No ``%`` sign is appended.
    """
    return format(x * 100, '.0f')
The code below brings the basic analysis together in one document and shows the output of the various calculations. The data was anonymised in a separate notebook, so names, full postcodes, locations, etc. have been removed. What remains is the first 3 characters of the 'last known' postcode for those entries that have one.
This notebook will have other subsidiary notebooks added to it over the next few days (3/10/20)
Index:
Number of Entries, Columns
# (rows, columns) of the loaded data. These are bare expressions: their values
# are only shown when run as notebook cells.
df.shape
# Names of all columns.
df.columns
# Display of sample of data
df[['Trainee prog', 'Gender', 'Role 1', 'Role 1 Category', 'Rural', 'No of Yrs']].head(20)
# The per-year columns, selected by label from '2020' through '1976'.
col_range = df.loc[:, '2020':'1976']
# Column labels of that slice, used as x tick labels.
plot_years = list(col_range.columns)
# Filled by crew_count() with one total per year column.
basic_totals_crew = list()
def crew_count(output_array, dataframe, first_col='2020', last_col='1976'):
    """Append one listing total per directory-year column to *output_array*.

    For every column in the label slice ``first_col:last_col`` of *dataframe*,
    the count of that column's most frequent non-null value is appended to
    *output_array*. A column with no non-null values contributes ``0``.

    Args:
        output_array: list that is extended in place.
        dataframe: frame holding the year columns.
        first_col: label of the first column in the slice.
        last_col: label of the last column in the slice (inclusive).

    Returns:
        None; the result is delivered through *output_array*.
    """
    for col in dataframe.loc[:, first_col:last_col]:
        counts = dataframe[col].value_counts(dropna=True)
        # `counts` is indexed by the cell values, so take the first row by
        # position explicitly, and guard the empty (all-NaN) case.
        output_array.append(int(counts.iloc[0]) if len(counts) else 0)
# Fill the per-year totals, then draw them as a single line.
crew_count(basic_totals_crew, df)
# Positions count down because the year columns are sliced '2020' -> '1976';
# the first column therefore gets the largest x position.
x_indexes = list(reversed(range(len(plot_years))))
fig, ax = plt.subplots()
ax.plot(x_indexes, basic_totals_crew, linewidth=3, label='Total Freelancers', color='#2E0014')
ax.set(xlabel='', ylabel='')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize': 12})
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
ax.set_facecolor('white')
fig.tight_layout()
fig.savefig('Outputs/Final/film_bang_freelancers_over_time.png', facecolor='#ffffff')
plt.show()
# Flip totals and year labels in place so both run in the opposite order to
# the column slice, then number the positions 0..n-1.
basic_totals_crew.reverse()
plot_years.reverse()
x_indexes = list(range(len(plot_years)))
#print(x_indexes)
# Positional windows for the four periods (the last window holds 11 positions).
_win_76_89 = slice(0, 10)
_win_90_99 = slice(10, 20)
_win_00_09 = slice(20, 30)
_win_10_20 = slice(30, 41)
list_76_89, plot_years_76_89, x_indexes_76_89 = (
    basic_totals_crew[_win_76_89], plot_years[_win_76_89], x_indexes[_win_76_89])
list_90_99, plot_years_90_99, x_indexes_90_99 = (
    basic_totals_crew[_win_90_99], plot_years[_win_90_99], x_indexes[_win_90_99])
list_2000_2009, plot_years_00_09, x_indexes_00_09 = (
    basic_totals_crew[_win_00_09], plot_years[_win_00_09], x_indexes[_win_00_09])
list_2010_2020, plot_years_10_20, x_indexes_10_20 = (
    basic_totals_crew[_win_10_20], plot_years[_win_10_20], x_indexes[_win_10_20])
def _plot_decade(x_vals, totals, year_labels, span_colour, out_file, title='',
                 legend_loc='upper right', legend_anchor=(0.4, 0., 0.5, 0.9)):
    """Draw, save and show the totals line for one period on a shaded background.

    Args:
        x_vals: x positions for the period.
        totals: listing totals, one per position.
        year_labels: tick labels, one per position.
        span_colour: face colour of the shaded band behind the line.
        out_file: path the figure is saved to.
        title: axes title (empty by default).
        legend_loc: ``loc`` passed to ``ax.legend``.
        legend_anchor: ``bbox_to_anchor`` passed to ``ax.legend``.
    """
    fig, ax = plt.subplots()
    ax.plot(x_vals, totals, color='#2E0014', label='Total Freelancers', linewidth=3)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(title)
    ax.set_xticks(ticks=x_vals)
    ax.set_xticklabels(year_labels, {'fontsize': 12})
    # Shade from the first to the last position, whatever the period length.
    ax.axvspan(x_vals[0], x_vals[-1], facecolor=span_colour, alpha=0.5)
    ax.set_facecolor('white')
    ax.legend(loc=legend_loc, bbox_to_anchor=legend_anchor, fontsize=14)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(out_file, facecolor='#ffffff')
    plt.show()


# One chart per period; only the data window, band colour, file name and
# (for two of them) the legend placement or title differ.
_plot_decade(x_indexes_76_89, list_76_89, plot_years_76_89, colours[0],
             'Outputs/Final/proto-type_creative_cluster_76_89.png')
_plot_decade(x_indexes_90_99, list_90_99, plot_years_90_99, colours[1],
             'Outputs/Final/de-regulation_90_99.png',
             legend_loc='upper left', legend_anchor=(0.1, 0., 0.5, 0.9))
_plot_decade(x_indexes_00_09, list_2000_2009, plot_years_00_09, colours[2],
             'Outputs/Final/limits_to_growth_00_09.png')
_plot_decade(x_indexes_10_20, list_2010_2020, plot_years_10_20, colours[3],
             'Outputs/Final/new_Expansion_10_20.png',
             title='New expansion, new productions')
# Whole timeline with all four phases shaded behind the totals line.
fig, ax = plt.subplots()
ax.plot(x_indexes, basic_totals_crew, linewidth=3, color='#2E0014', label=' Total Entries')
ax.set(xlabel='', ylabel='', title='')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize':12})
fig.autofmt_xdate(rotation=90)
# (first position, second position, palette index, legend text) per band.
_phase_bands = (
    (10, 0, 0, ' Development of screen industry in Scotland'),
    (20, 10, 1, ' Deregulation, changing technology, flexible working'),
    (30, 20, 2, ' Decline of industry'),
    (40, 30, 3, ' Increased production, new broadcasters'),
)
for _from, _to, _shade, _text in _phase_bands:
    ax.axvspan(x_indexes[_from], x_indexes[_to], facecolor=colours[_shade], alpha=0.5, label=_text)
ax.set_facecolor('white')
ax.legend(frameon=1, loc='lower right', bbox_to_anchor=(0.4, 0.1, 0.5, 0.9), facecolor='inherit', edgecolor="#000000", fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig('Outputs/Final/highlighting_growth_decline.png', facecolor='#ffffff')
plt.show()
def _plot_phase(span_from, span_to, span_colour, span_label, out_file):
    """Draw the full totals line with one phase highlighted, then save and show it.

    Args:
        span_from: index into ``x_indexes`` for one edge of the shaded band.
        span_to: index into ``x_indexes`` for the other edge.
        span_colour: face colour of the band.
        span_label: legend text for the band.
        out_file: path the figure is saved to.
    """
    fig, ax = plt.subplots()
    ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title('')
    ax.set_xticks(ticks=x_indexes)
    # Same 12pt tick labels on all four charts (phases 3 and 4 previously
    # omitted the font size and so differed from phases 1 and 2).
    ax.set_xticklabels(plot_years, {'fontsize': 12})
    fig.autofmt_xdate(rotation=90)
    ax.axvspan(x_indexes[span_from], x_indexes[span_to], facecolor=span_colour, alpha=0.5, label=span_label)
    ax.set_facecolor('white')
    ax.legend(fontsize=14)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(out_file, facecolor='#ffffff')
    plt.show()


_plot_phase(10, 0, colours[0], 'Development of Screen Industry in Scotland',
            'Outputs/Final/timeframes_phase_1.png')
_plot_phase(20, 10, colours[1], 'Deregulation, changing technology, flexible working',
            'Outputs/Final/timeframesphase_2.png')
_plot_phase(30, 20, colours[2], 'Decline of industry',
            'Outputs/Final/timeframesphase_3.png')
_plot_phase(40, 30, colours[3], 'Increased Production, New Broadcasters',
            'Outputs/Final/timeframesphase_4.png')
# Bar chart of the per-year totals, using the year labels themselves as x values.
width=0.8
fig, ax = plt.subplots()
ax.bar(plot_years, basic_totals_crew, label='Total Freelancers Per Directory', color=colours[5], width=width)
ax.set(xlabel='', ylabel='', title='')
ax.set_facecolor('white')
ax.set_xticks(ticks=plot_years)
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
fig.tight_layout()
fig.savefig('Outputs/Final/listings_by_year.png', facecolor='#ffffff')
plt.show()
# Totals line with a vertical marker at each key date.
x_indexes = [*range(len(plot_years))]
fig, ax = plt.subplots()
ax.plot(x_indexes, basic_totals_crew, linewidth=3, color='#2E0014', label='Total Entries')
ax.set(xlabel='', ylabel='', title='')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize': 12})
fig.autofmt_xdate(rotation=90)
# (position in x_indexes, palette index, legend text) for each marker.
_key_dates = (
    (4, 5, 'Channel 4 Starts'),
    (16, 6, 'Tartan Shorts Launched'),
    (21, 7, 'STV Crisis'),
    (28, 8, 'Financial Crisis'),
    (39, 9, 'Film Bang Website Relaunched'),
)
for _pos, _shade, _text in _key_dates:
    ax.axvline(x=x_indexes[_pos], linewidth=3, color=colours[_shade], label=_text)
ax.set_facecolor('white')
ax.legend(fontsize=14, frameon=1)
ax.grid(True)
fig.tight_layout()
fig.savefig('Outputs/Final/workers_by_year_key_dates.png', facecolor='#ffffff')
plt.show()
# Column 'AL i' holds 1 for every entry whose 'No of Yrs' is greater than i - 1
# (i.e. at least i years for whole-number values) and NaN otherwise.
at_leasts = [f'AL {i}' for i in range(1, 41)]
df_year_exclusions = pd.DataFrame({
    label: np.where(df['No of Yrs'] > (i - 1), 1, np.nan)
    for i, label in enumerate(at_leasts, start=1)
})
# Non-null count per column = number of entries reaching that threshold.
# Unlike value_counts().values[0], this yields 0 instead of raising
# IndexError when no entry reaches a threshold.
at_least_vals = [int(df_year_exclusions[col].count()) for col in at_leasts]
print(at_least_vals)
x_vals = range(1,41)
width=0.8
fig, ax = plt.subplots()
ax.bar(x_vals, at_least_vals, color=colours[2], width = width, label='At least x years listing length')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=x_vals)
ax.legend(fontsize=14)
plt.tight_layout()
plt.savefig('Outputs/Final/at_least_x_listing_length.png', facecolor='#ffffff')
plt.show()
Figures for the number of people in each department. Departments were allocated according to keywords found in the role descriptions. There are 536 unique roles given throughout the data (Role 1). Film Bang used to offer a free-text field in which crew could define their own department or role. The current (post-2019) website instead offers a dropdown list of 124 options to choose from when signing up. We have grouped roles into the categories listed below based on __ .
# Unique values in the Role 1 Column
# (bare expression: the number is only shown when run as a notebook cell)
df['Role 1'].nunique()
# Number of entries per department (Role 1 Category).
df['Role 1 Category'].value_counts()
Percentages for above figures
# Share of entries per department, formatted as whole-number percentages.
df['Role 1 Category'].value_counts(normalize=True).apply(format_percent)
# Equivalent inline form of the line above, kept for reference:
#df['Role 1 Category'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f'))
This figure shows, for each entry, whether the Role 1, Role 2 and Role 3 columns fall into the same category. Same category = consistent; differing categories across Roles 1, 2 and 3 = not consistent. We are trying to determine whether or not people list themselves in different departments, spreading their net wide, so to speak.
False = Director/Producer, Producer/Director
# Counts of entries whose roles are consistent across the three role columns
# (empty Role 2 / Role 3 values included) versus those that differ by department.
df['Has Consistent Role'].value_counts(dropna=False)
# The same split expressed as percentages.
df['Has Consistent Role'].value_counts(normalize=True, dropna=False).apply(format_percent)
# Per-department breakdown: counts first, then percentages within each department.
filt = df.groupby(['Role 1 Category'])
filt['Has Consistent Role'].value_counts(dropna=False)
filt['Has Consistent Role'].value_counts(normalize=True, dropna=False).apply(format_percent)
In the above data we can see the departments Direction and Producer show the greatest instances of people having different roles across Role Categories 1, 2 & 3
# Data: departments and their listing counts, in value_counts order.
_dept_counts = df['Role 1 Category'].value_counts()
depts = _dept_counts.index.tolist()
nums = _dept_counts.tolist()
print(depts)
print(nums)
tot = sum(nums)
# Each department's share of all listings, rounded to a whole percent.
nums_p_cents = [round((_n / tot) * 100) for _n in nums]
print('Percentages', nums_p_cents)
# Survey response percentages, entered by hand.
# NOTE(review): presumably listed in the same order as `depts` -- confirm.
survey_dept_p_cents = [20, 15, 12, 5, 7, 5, 4, 0, 3, 0, 0, 2, 0, 2, 0, 0]
width=0.8
fig, ax = plt.subplots()
# Both series are drawn at the same x positions, the survey bars last.
_bar_series = (
    (nums_p_cents, 3, '% of Film Bang Listings by Department'),
    (survey_dept_p_cents, 4, "% of Survey Responses by Department"),
)
for _heights, _shade, _text in _bar_series:
    ax.bar(depts, _heights, color=colours[_shade], width = width, label=_text)
ax.set(xlabel='', ylabel='', title='')
ax.set_facecolor('white')
ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
ax.legend(fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig('Outputs/Final/percentages_dept_correlation.png', facecolor='#ffffff')
plt.show()
# Vertical bars: listings per department, each bar annotated with its count.
width=0.8
fig, ax = plt.subplots()
ax.bar(depts, nums, label='Listings by Department', color=colours[3], width = width)
for _dept, _count in zip(depts, nums):
    ax.text(x = _dept, y = _count, s = _count, size = 15, horizontalalignment='center')
ax.set(xlabel='', ylabel='', title='')
ax.set_facecolor('white')
ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
ax.legend(fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig('Outputs/Final/department_role_category_v.png', facecolor='#ffffff')
plt.show()
# Horizontal version of the same chart; the count sits at the end of each bar.
width=0.8
fig, ax = plt.subplots()
ax.barh(depts, nums, label='Listings by Department', color=colours[3])
for _dept, _count in zip(depts, nums):
    ax.text(x = _count, y = _dept, s = _count, size = 15, verticalalignment='center', horizontalalignment='left')
ax.set(xlabel='', ylabel='', title='')
ax.set_facecolor('white')
ax.legend(fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig('Outputs/Final/department_role_category_h.png', facecolor='#ffffff')
plt.show()
# Split the 'Production' department into sub-categories by keyword match on
# the 'Role 1' description, then chart the split as a ring.
filt = (df['Role 1 Category'] == 'Production')
# Explicit copy so the new column is written to this frame and not to a view of df.
production = df[filt].copy()
# Second name for the same frame, so the column written below is visible
# through both `production` and `df_production`.
df_production = production
_sub_category_patterns = (
    ('Location', 'Unit|Director|ProductionAssistant|Engineer|Floor|Script|Autocue|Location|Property|Scout|Runner|Newcomer|Production Assistant|Stagehand|Stage Hand|Studio|Production Executive|Drapes'),
    ('Admin', 'Line|Researcher|Production Manager|Secretary|Accountant|Payroll|Production Co-ordinator|Production Co-Ordinator|Office|Publicist'),
    ('Transport', 'Driver|Pilot|Transport'),
)
# Applied in order: a role matching several patterns keeps the last match.
for _sub_category, _pattern in _sub_category_patterns:
    # na=False treats a missing role description as "no match" instead of
    # putting NaN into the boolean mask.
    _matches = production['Role 1'].str.contains(_pattern, na=False)
    production.loc[_matches, 'Role 1 SubCategory'] = _sub_category
_sub_counts = production['Role 1 SubCategory'].value_counts()
breakdown = _sub_counts.array
# Labels are taken from the counts themselves: value_counts orders by
# frequency, so a fixed label list could name the wrong wedge.
labels = _sub_counts.index.tolist()
fig, ax = plt.subplots()
ax.pie(breakdown, labels=labels, textprops={'fontsize': 14}, pctdistance=0.85,
       labeldistance=1.2, colors=colours, shadow=False,
       startangle=45, autopct='%1.0f%%', wedgeprops={'edgecolor':'black'})
ax.set_title('')
ax.text(.0,.0,' ', fontsize=14, ha='right')
# White disc over the centre turns the pie into a ring.
circle=plt.Circle( (0,0), 0.7, color='white')
p=plt.gcf()
p.gca().add_artist(circle)
plt.tight_layout()
plt.savefig('Outputs/Final/production_sub_categories_pie_chart_ring.png', facecolor='#ffffff')
plt.show()
# Create new df for storing filtered values: one row per directory year,
# one column per department.
structure = {
    'Year':[],
    'Direction':[],
    'Music':[],
    'Producer':[],
    'Construction':[],
    'Costume':[],
    'Hair & Make-Up':[],
    'Sound':[],
    'Post-Production':[],
    'Casting':[],
    'Support':[],
    'Art':[],
    'Camera':[],
    'Production':[],
    'Script':[],
    'Special FX':[]
}
_role_categories = [name for name in structure if name != 'Year']
_role_rows = []
for column in df.loc[:, '2020':'1976']:
    # An entry counts as listed in a year when that year's cell equals the
    # column label itself.
    _listed = (df[column] == column)
    _row = {'Year': column}
    for _category in _role_categories:
        _row[_category] = int((_listed & (df['Role 1 Category'] == _category)).sum())
    _role_rows.append(_row)
# Built in one step from the collected rows: DataFrame.append, used here
# previously, was removed in pandas 2.0.
df_roles = pd.DataFrame(_role_rows, columns=list(structure))
#df_roles
# Pull each department's yearly counts out of df_roles.
years = df_roles['Year']
producers, construction, direction = df_roles['Producer'], df_roles['Construction'], df_roles['Direction']
costume, hairs, sound = df_roles['Costume'], df_roles['Hair & Make-Up'], df_roles['Sound']
postp, casting, support = df_roles['Post-Production'], df_roles['Casting'], df_roles['Support']
art, camera, production = df_roles['Art'], df_roles['Camera'], df_roles['Production']
script, special = df_roles['Script'], df_roles['Special FX']
# Descending positions, matching the newest-first row order of df_roles.
x_indexes = list(range(len(years)))[::-1]
linewidth=3


def _plot_department_lines(line_specs, legend_size, out_file):
    """Draw one line per (series, palette index, label) spec, then save and show."""
    fig, ax = plt.subplots()
    for _series, _shade, _text in line_specs:
        ax.plot(x_indexes, _series, color=colours[_shade], label=_text, linewidth=linewidth)
    ax.set(xlabel='', ylabel='', title='')
    ax.set_xticks(ticks=x_indexes)
    ax.set_xticklabels(years)
    fig.autofmt_xdate(rotation=90)
    ax.set_facecolor('white')
    ax.legend(fontsize=legend_size)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(out_file, facecolor='#ffffff')
    plt.show()


# All departments on one chart.
# NOTE(review): several lines share a palette index (8, 9, 5 and 2 each appear
# twice), so those pairs are drawn in the same colour.
_plot_department_lines(
    [
        (camera, 8, 'Camera'),
        (production, 9, 'Production'),
        (art, 7, 'Art'),
        (direction, 5, 'Direction'),
        (producers, 0, 'Producers'),
        (sound, 3, 'Sound'),
        (hairs, 2, 'Hair & Make-Up'),
        (postp, 4, 'Post-Production'),
        (costume, 1, 'Costume'),
        (script, 8, 'Script'),
        (support, 6, 'Support'),
        (construction, 9, 'Construction'),
        (casting, 5, 'Casting'),
        (special, 2, 'Special FX'),
    ],
    12,
    'Outputs/Final/numbers_in_departmet_by_year.png',
)
# Subset saved as the "high growth" chart.
_plot_department_lines(
    [
        (camera, 8, 'Camera'),
        (production, 9, 'Production'),
        (art, 7, 'Art'),
        (producers, 0, 'Producers'),
        (sound, 3, 'Sound'),
        (postp, 4, 'Post-Production'),
        (hairs, 2, 'Hair & Make-Up'),
    ],
    14,
    'Outputs/Final/departments_high_growth.png',
)
# Subset saved as the "low growth" chart.
_plot_department_lines(
    [
        (construction, 9, 'Construction'),
        (costume, 1, 'Costume'),
        (casting, 5, 'Casting'),
        (support, 6, 'Support'),
        (script, 2, 'Script'),
        (special, 3, 'Special FX'),
    ],
    14,
    'Outputs/Final/low_growth_depts.png',
)
# Ten most common Role 1 descriptions within the 'Production' department
# (bare expression: only displayed when run as a notebook cell).
filt_production = (df['Role 1 Category'] == 'Production')
production_filtered = df[filt_production]
production_filtered['Role 1'].value_counts().head(10)
# Get list of departments
depts_for_top_10 = df['Role 1 Category'].value_counts().index.tolist()
def csv_maker(val, ind, dep, out_dir='Outputs/CSV'):
    """Write a two-column ``Role``/``Number`` CSV for one department.

    The file is named ``<dep in lower case>_top_10.csv`` and written without
    an index column.

    Args:
        val: counts, one per role.
        ind: role names, in the same order as *val*.
        dep: department name; lower-cased to build the file name.
        out_dir: directory the file is written to; created if missing.

    Raises:
        ValueError: if *val* and *ind* differ in length.
    """
    from pathlib import Path  # local import keeps this function self-contained

    target_dir = Path(out_dir)
    # Create the output directory up front so to_csv cannot fail on a missing path.
    target_dir.mkdir(parents=True, exist_ok=True)
    df_to_csv = pd.DataFrame({'Role': ind, 'Number': val})
    # Export to CSV
    df_to_csv.to_csv(target_dir / f'{dep.lower()}_top_10.csv', index=False)
def department_top_10(dept_array):
    """Export the ten most common 'Role 1' values of each department to CSV.

    Args:
        dept_array: iterable of 'Role 1 Category' values to export. The
            departments passed in are the ones processed (previously the
            argument was ignored in favour of the module-level list).
    """
    for dept in dept_array:
        in_dept = (df['Role 1 Category'] == dept)
        # Count once and reuse for both the role names and their counts.
        top_roles = df.loc[in_dept, 'Role 1'].value_counts().head(10)
        csv_maker(top_roles.values.tolist(), top_roles.index.tolist(), dept)


department_top_10(depts_for_top_10)
The dataset is compromised in terms of location data. We take the last known postcode of entries that include a postcode and check the postcode against a list of rural postcodes.
# How many entries have no postcode (True) versus a postcode (False)
df['Postcode1'].isna().value_counts(dropna=False)
# 'Rural' flag counts over all entries, missing values included
df['Rural'].value_counts(dropna=False)
# Restrict to entries that do have a postcode
filt = df['Postcode1'].notna()
postcodes = df[filt]
# Urban / rural counts among entries with a postcode
postcodes['Rural'].value_counts(dropna=False)
# ...and the same split as percentages
postcodes['Rural'].value_counts(normalize=True, dropna=False).apply(format_percent)
# 'Rural' flag percentages over all entries, missing values included
df['Rural'].value_counts(normalize=True, dropna=False).apply(format_percent)
Data for location is inconsistent and not tracked accurately over time. We've reduced this to a look at what is effectively a 'last known postcode area' for each entry.
# The 20 most popular postcodes:
_p_code_counts = df['Postcode1'].value_counts()
p_code_values = _p_code_counts.values.tolist()
p_code_indexes = _p_code_counts.index.tolist()
# Built directly from the two lists. The frame was previously declared with a
# 'PostCode' column but filled through 'Postcode', which left an empty
# 'PostCode' column beside the real data.
df_p_codes = pd.DataFrame({'Postcode': p_code_indexes, 'Number': p_code_values})
# Export to CSV
#df_p_codes.to_csv('Outputs/CSV/df_p_codes.csv', index=False)
df_p_codes.head(20)
# Number of crew in the 20 most popular postcodes
_p_code_counts.head(20).sum()
# First two characters of each postcode. The .str accessor passes missing
# values through as NaN, so no guard is needed (the previous
# `if ...notna:` tested the method object itself and was always true).
df['PostCodeStrip'] = df['Postcode1'].str[:2]
df['PostCodeStrip'].head(10)
print(df['PostCodeStrip'].value_counts().tolist())
# NOTE(review): these counts are entered by hand, presumably copied from the
# list printed above -- they will not follow changes in the data. Confirm.
p_code_nums = [697, 434, 358, 203, 191, 172, 132, 108, 89, 17, 1, 1]
sum_p_codes = sum(p_code_nums)
print('Glasgow and Edinburgh Postcodes: ' + str(sum_p_codes))
# Trainee data float to string: missing values travel through the integer
# conversion as -1 and are turned back into NaN afterwards.
_prog_as_text = df['Trainee prog'].fillna(-1).astype(int).astype(str)
df['Trainee prog'] = _prog_as_text.replace('-1', np.nan)
df['Trainee prog'].count()
# Show number of trainees on Trainee Program in a given year
df['Trainee prog'].value_counts()
_prog_counts = df['Trainee prog'].value_counts()
# One-row table: header cells are the programme years, the row holds the counts;
# the first cell of each is a caption.
trainee_index = ['Year of Trainee Program'] + _prog_counts.index.tolist()
trainee_values = ['Numbers'] + _prog_counts.values.tolist()
df_trainees = pd.DataFrame(columns=trainee_index)
df_trainees.loc[1] = trainee_values
# Export to CSV
df_trainees.to_csv('Outputs/CSV/df_trainees.csv', index=False)
# NOTE(review): the message says "Average" but the figure is the median of the
# per-year counts -- confirm which is intended.
_typical_intake = str(_prog_counts.median())
print('Average number of trainees over years the programs ran: ' + _typical_intake
      + ' trainees')
# Entries whose department is 'Trainee', and how many years each was listed
filt = (df['Role 1 Category'] == 'Trainee')
trainees = df[filt]
trainees['No of Yrs']
# Listing lengths grouped by trainee programme year: distribution, then mean
years = df.groupby(['Trainee prog'])
years['No of Yrs'].value_counts()
years['No of Yrs'].mean()
# Revise this, it's wrong
# Entries with all four fields present, sorted descending by department then programme year
_trainee_fields = ['Role 1 Category', 'Gender', 'Trainee prog', 'No of Yrs']
trainees = df[_trainee_fields].dropna().sort_values(['Role 1 Category','Trainee prog'], ascending=False)
trainees
# Gender split: counts, then percentages
trainees['Gender'].value_counts()
trainees['Gender'].value_counts(normalize=True).apply(format_percent)
# Gender split within each department: counts, then proportions
dept_grp = trainees.groupby(['Role 1 Category'])
dept_grp['Gender'].value_counts()
dept_grp['Gender'].value_counts(normalize=True)
# Departments represented in each programme year
year_grp = trainees.groupby(['Trainee prog'])
year_grp['Role 1 Category'].value_counts()
# Data: trainee counts per programme year, ordered by year label
df_trainee = df['Trainee prog'].value_counts().sort_index()
trainee_year_index = list(df_trainee.index)
trainee_values = df_trainee.values
width=0.7
fig, ax = plt.subplots()
ax.bar(trainee_year_index, trainee_values, label='Trainees', color=colours[2], width = width)
ax.set(xlabel='', ylabel='', title='')
ax.set_facecolor('white')
ax.set_xticks(ticks=trainee_year_index)
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig('Outputs/Final/trainees_per_year.png', facecolor='#ffffff')
plt.show()
Basic figures for Career Lengths
# Number of Years in Directory, number of entries present for that duration
# (bare expression: only displayed when run as a notebook cell)
df['No of Yrs'].value_counts()
Career length grouped into 5 year bins
# Assign each entry's 'No of Yrs' to an interval with edges every 5 years from 0 to 45.
bins = pd.cut(df['No of Yrs'], [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])
# Number of entries per interval (displayed only when run as a notebook cell).
df.groupby(bins)['No of Yrs'].agg(['count'])
def _ring_chart(sizes, wedge_labels, label_distance, pct_distance, out_file):
    """Draw a pie with a white centre disc (a ring), then save and show it."""
    fig, ax = plt.subplots()
    ax.pie(sizes, labels=wedge_labels, labeldistance=label_distance, pctdistance=pct_distance,
           textprops={'fontsize': 14}, shadow=False,
           startangle=90, autopct='%1.0f%%', colors=colours, wedgeprops={'edgecolor':'black'})
    ax.set_title('', fontsize=16, color='#635DC6', weight='bold')
    ax.text(.0,.0,' ', fontsize=14, ha='right')
    # The white disc over the centre turns the pie into a ring.
    ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))
    fig.tight_layout()
    fig.savefig(out_file, facecolor='#ffffff')
    plt.show()


# First ring: uneven bins (1, 2-3, 4-6, 7-20 and 20-41 years).
indexes = ['1 year','2-3 years', '4-6 years', '7-20 years', '20-41 years']
# Data
bins = pd.cut(df['No of Yrs'], [0, 1, 3, 6, 20, 41])
values = df.groupby(bins)['No of Yrs'].agg(['count'])
print(values)
# Simplify Data: flatten the single-column table into a plain list of counts
simple_values = (values.values)
value_list = list(simple_values.ravel())
# NOTE(review): `colors` is assigned but the charts use `colours`.
colors = sns.cubehelix_palette(start=2, rot=1, dark=0.3)
_ring_chart(value_list, indexes, 1.2, 0.85,
            'Outputs/Final/longevity_bins_percentages_ring.png')
# Second ring: five-year bins, with everything above 20 years grouped together.
indexes = ['1-5 years','6-10 years', '11-15 years', '16-20 years', '>20']
bins = pd.cut(df['No of Yrs'], [0, 5, 10, 15, 20, 41])
values = df.groupby(bins)['No of Yrs'].agg(['count'])
print(values)
simple_values = (values.values)
value_list = list(simple_values.ravel())
# Plot
colors = sns.cubehelix_palette(start=2, rot=1, dark=0.3)
_ring_chart(value_list, indexes, 1.3, 1.1,
            'Outputs/Final/longevity_bins_percentages_ring_directory_1.png')
# For each threshold, print how many entries were listed for more than that
# many years (True) or not (False), as counts and as percentages.
for _min_years in (1, 6, 15, 20):
    filtered = df['No of Yrs'].apply(lambda yrs, _limit=_min_years: yrs > _limit)
    _split = filtered.value_counts()
    _split_pct = filtered.value_counts(normalize=True).apply(format_percent)
    print(_split)
    print()
    print('Percentages')
    print(_split_pct)
Median Career Length All Entries
# Median of 'No of Yrs' over all entries (displayed only when run as a notebook cell).
df['No of Yrs'].median()
Median Career Length for Entries in for more than 1 Year
# Keep only entries listed for more than one year, then take their median
# (displayed only when run as a notebook cell).
filt = (df['No of Yrs'] > 1)
df2 = df[filt]
df2['No of Yrs'].median()
# Department counts for each listing length.
lengths = df.groupby(['No of Yrs'])
_dept_by_length = lengths['Role 1 Category'].value_counts()
# Showing Long Careers & Breakdown by Department (last 107 rows of the table)
_dept_by_length.tail(107)
# Showing Short Careers Breakdown by Department Numbers (first 104 rows)
_dept_by_length.head(104)
The following two charts show longevity in the directory: 'career' length is plotted along the x axis and the number of entries with that length is shown as bar height. We chart the pattern for all entries, and for entries that are present for more than 1 year. Entries lasting only 1 year look at first glance like an anomaly: they are by far the biggest single group in the dataset, yet they carry the least information given their short presence. However, we see the same pattern (scaled down) when those entries are removed from the chart.
def _longevity_bars(lengths_x, counts_y, median_x, bar_colour, out_file):
    """Bar chart of entry counts per listing length with a line at the median."""
    fig, ax = plt.subplots()
    ax.bar(lengths_x, counts_y, color=bar_colour, width = width, label='Crew')
    ax.axvline(median_x, linewidth=3, color='black', label='Median')
    ax.set(xlabel='', ylabel='', title='')
    ax.set_facecolor('white')
    ax.set_xticks(ticks=lengths_x)
    ax.legend(fontsize=14)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(out_file, facecolor='#ffffff')
    plt.show()


# Data: every entry
_length_counts = df['No of Yrs'].value_counts()
keys_list = _length_counts.index.tolist()
values = _length_counts.values
median = df['No of Yrs'].median()
# Plot
width=0.8
_longevity_bars(keys_list, values, median, colours[2],
                'Outputs/Final/basic_crew_longevity.png')
# Data: only entries listed for more than one year
filt = (df['No of Yrs'] > 1)
df2 = df[filt]
_length_counts = df2['No of Yrs'].value_counts()
keys_list = _length_counts.index.tolist()
values = _length_counts.values
median = df2['No of Yrs'].median()
# Plot
width=0.8
# NOTE(review): the '>' in this file name is not accepted by every file system.
_longevity_bars(keys_list, values, median, colours[4],
                'Outputs/Final/basic_crew_longevity>1yr.png')
We are interested in the factors that allow people to develop longer careers. The following chart shows the frequency of career lengths for entries listed for between 4 and 20 years. This offers a wide sample of entries who are clearly getting something out of being in the directory.
# Entries listed for less than two years, counted per department.
filt_1 = (df['No of Yrs'] < 2)
df_1 = df[filt_1]
msg = 'Number in department in for only 1 year, historical'
print(msg)
print(len(msg) *'-')
entries = []
for dept in depts:
    filt_dept = (df_1['Role 1 Category'] == dept)
    df_depts = df_1[filt_dept]
    count = df_depts['No of Yrs'].value_counts().values
    # A department with no such entries contributes 0 instead of raising IndexError.
    entries.append(int(count[0]) if len(count) else 0)
    #print(dept, count)
print(entries)
# Drop the last department from the chart. A new list is built so that
# `depts` itself is left untouched (`depts2 = depts` only aliased the list,
# so popping from it also shortened `depts`).
depts2 = list(depts[:-1])
entries.pop(-1)
width=0.7
color = sns.cubehelix_palette(start=2, dark=0.5)
fig, ax = plt.subplots()
ax.bar(depts2, entries, color=colours[5], width = width, label='Crew')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
# Ticks follow the departments actually plotted.
ax.set_xticks(ticks=depts2)
fig.autofmt_xdate(rotation=45)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/freelancers_by_department_in_for_1_year.png', facecolor='#ffffff')
plt.show()
# Career-length counts for entries present for more than 20 years.
filt5 = df['No of Yrs'] > 20
df4 = df.loc[filt5]
long_counts = df4['No of Yrs'].value_counts()
keys_list = long_counts.index.tolist()
values = long_counts.values

# Bar chart of the counts (the cubehelix palette is created but the bars
# use the shared `colours` list).
color = sns.cubehelix_palette(start=1, dark=0.5)
width = 0.7
fig, ax = plt.subplots()
ax.bar(keys_list, values, width=width, color=colours[6], label='Crew')
ax.set(xlabel='', ylabel='', title='', facecolor='white')
ax.set_xticks(keys_list)
ax.legend(fontsize=14)
ax.grid(True)
fig.tight_layout()
# NOTE(review): '>' in the file name is not a legal character on Windows.
fig.savefig('Outputs/Final/freelancer_longevity_>20_yrs.png', facecolor='#ffffff')
plt.show()
import seaborn as sns
import matplotlib.pylab as plt
# Shared inputs for the per-department career-length charts below:
# department names with their entry totals, bin labels and the palette.
dept_counts = df['Role 1 Category'].value_counts()
totals = dept_counts.array
depts = dept_counts.index.array
indexes = [
    '1 year',
    '2-3 years',
    '4-6 years',
    '7-11 years',
    '12-18 years',
    '19-29 years',
    '30-42 years',
]
color = sns.color_palette('colorblind', as_cmap=True)
def bins_plotter(_index, department, med, values):
    """Draw, save and show the binned career-length chart of one department.

    _index: position of the department in the module-level `totals`.
    department: department name; used in the legend and the file name.
    med: median career length, already converted to a string.
    values: one bar height per label in the module-level `indexes`.
    """
    legend_text = '\n'.join([
        department,
        'Dept total = ' + str(totals[_index]),
        'Median= ' + med + ' years',
    ])
    fig, ax = plt.subplots()
    ax.bar(indexes, values, width=0.7, color=color[9], label=legend_text)
    ax.set(xlabel='', ylabel='', title='', facecolor='white')
    ax.legend(fontsize=14)
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(
        'Outputs/Final/crew_longevity_by_year_prime_bins_bar_' + department + '.png',
        facecolor='#ffffff',
    )
    plt.show()
def career_bins(series, dataframe):
    """Plot binned career lengths for every department in `series`.

    series: iterable of department names; the position of each name is
        passed on to bins_plotter as its index.
    dataframe: frame with 'Role 1 Category' and 'No of Yrs' columns.

    For each department the 'No of Yrs' values are cut into the intervals
    (0, 1], (1, 3], (3, 6], (6, 11], (11, 18], (18, 29], (29, 42] and the
    number of entries per interval is handed to bins_plotter together with
    the department's median career length (as a string).
    """
    bin_edges = [0, 1, 3, 6, 11, 18, 29, 42]
    for index, dept in enumerate(series):
        df_depts = dataframe[dataframe['Role 1 Category'] == dept]
        bins = pd.cut(df_depts['No of Yrs'], bin_edges)
        # observed=False keeps intervals with no entries (count 0), so there
        # is always one value per chart label regardless of the pandas
        # default for categorical groupers.
        value_list = (
            df_depts.groupby(bins, observed=False)['No of Yrs'].count().tolist()
        )
        median = str(df_depts['No of Yrs'].median())
        bins_plotter(index, dept, median, value_list)


career_bins(depts, df)
Using the career-length calculation for each year (a count produced for each year an entry is present in the directory), we chart career lengths over time.
# Build a matrix with one row per career length (1-41) and one column per
# directory year, holding how many listed entries had that running tally.
years = list(range(1, 42))
df_master = pd.DataFrame({'Years': years})
for year in df.loc[:, '2020':'1976']:
    # Only entries that are listed in this year.
    filtyear = df[year].notna()
    dfyear = df[filtyear]
    # Frequency of each running year count among those entries.
    yr_cnt_counts = dfyear['Yr Cnt ' + year].value_counts()
    values_year = yr_cnt_counts.array
    keys_year = yr_cnt_counts.index
    # Small frame for this year, joined onto the master by career length.
    df_year = pd.DataFrame({f'{year} Keys': keys_year, f'{year} Values': values_year})
    df_master = df_master.join(df_year.set_index(f'{year} Keys'), on='Years')
longevity_data = df_master.set_index('Years')

# Median count across years for rows 19-21 of the matrix.
longevity_data.iloc[20].median()
for row in (19, 20, 21):
    print(longevity_data.iloc[row].median())
# The drop_vals figures were read off another chart by hand, not computed.
# TODO: write a function to retrieve this data.
dropout_years = [1978, 1979, 1981, 1982, 1984] + list(range(1986, 2021))
df_one_year_dropouts = pd.DataFrame(columns=dropout_years)
drop_vals = [
    12, 7, 3, 3, 2, 39, 6, 5, 9, 23,
    24, 29, 10, 13, 35, 18, 16, 20, 36, 23,
    38, 28, 11, 30, 44, 35, 30, 32, 29, 68,
    15, 46, 19, 30, 18, 35, 16, 32, 40, 40,
]
# Wide layout: a single row labelled 1, one column per year.
df_one_year_dropouts.loc[1] = drop_vals
print(df_one_year_dropouts)

# Long layout (one row per year) exported for DataWrapper.
df_one_year_dropouts2 = pd.DataFrame({'Years': dropout_years, 'Dropouts': drop_vals})
df_one_year_dropouts2.to_csv('Outputs/CSV/To_DataWrapper/df_dropouts.csv', index=False)
df_one_year_dropouts.loc[1].median()
# Heatmap of longevity_data: career length on the rows, directory year on
# the columns, each cell annotated with its count.
import matplotlib.pylab as pylt
# Slice of the year columns; iterating a DataFrame yields its column labels,
# which is what set_xticklabels receives below.
plot_years = df.loc[:, '2020':'1976']
hm_colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
pylt.figure(figsize=(22,10))
ax = sns.heatmap(longevity_data, linewidth=0.3, cmap=hm_colours, annot=True, fmt=".0f")
pylt.yticks(rotation=0)
# Year labels go along the top edge, rotated to fit.
ax.xaxis.set_ticks_position('top')
ax.set_xticklabels(plot_years,rotation=90)
ax.set_facecolor('white')
pylt.xlabel('')
pylt.ylabel('Length of Career in Years')
#plt.title(f'Film Bang Freelancers Length of Career over Time' , fontsize=16, color='#635DC6', weight='bold')
#plt.figtext(.0,.0,'Fig. 1.3', fontsize=14, ha='right')
# Saved at high resolution (dpi=500) before the figure is shown.
plt.savefig(f"Outputs/Final/career_longevity_basic_1.png", facecolor='#ffffff', dpi=500)
pylt.show()
# Convert to html table for screen readers
#longevity_data.to_html()
The chart above shows overall presence in the listings over the history of Film Bang.
Each horizontal row contains numbers of people in the data with that given length of career (top level is whoever has a 1 year presence in the listings). Columns are for each year. So we can track how many people have a given career length in each year.
We can see a bulge of darker colour emerging from the early nineties, reaching a peak in the early 2000s and dropping off to early 90s density from 2012 onwards.
We can also see an initial high density influx of listings in the first year, and a comparative drop off of 'new starts' until 1984 when the number of new entries jumps. We can see this cohort lingering on in the listings in the form of a darker trail leading down and left from the figure '89' at the top of the 1984 column.
# Setup for the per-department heatmaps: department names ordered by
# number of entries (value_counts order).
depts = df['Role 1 Category'].value_counts().index.array
# Empty master frame keyed by career length; `years` comes from an earlier cell.
df_master = pd.DataFrame({'Years': years})
# NOTE: this rebinds the name `plt` to matplotlib.pylab for the code that follows.
import matplotlib.pylab as plt
# Slice of the year columns, used later as heatmap tick labels.
plot_years = df.loc[:, '2020':'1976']
def plot(dept, data=None):
    """Draw, save and show the career-length heatmap for one department.

    dept: department name; used as the chart title and in the file name.
    data: frame to draw (career length on the rows, year on the columns).
        When omitted, the module-level `longevity_data_2` is used, which is
        how existing callers pass the data in.
    """
    if data is None:
        data = longevity_data_2
    hm_colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
    plt.figure(figsize=(22,10))
    ax = sns.heatmap(data, linewidth=0.3, cmap=hm_colours, annot=True, fmt=".0f")
    plt.yticks(rotation=0)
    # Year labels go along the top edge; `plot_years` is a module-level
    # slice of the year columns whose labels are used as tick text.
    ax.xaxis.set_ticks_position('top')
    ax.set_facecolor('white')
    ax.set_xticklabels(plot_years, rotation=90)
    plt.xlabel('')
    plt.ylabel('Number of Years in Directory')
    plt.title(f'{dept}', fontsize=14)
    plt.savefig(f"Outputs/Final/career_longevity_{dept}_2.png", facecolor="#ffffff", dpi=500)
    plt.show()
# One heatmap per department: rebuild the career-length matrix from that
# department's rows only, then hand it to plot().
years = list(range(1, 42))
for i in depts:
    # Rows belonging to the department of this pass.
    deptfilt = df['Role 1 Category'] == i
    df_dept = df.loc[deptfilt]
    # Fresh master frame for every department.
    df_master = pd.DataFrame({'Years': years})
    for year in df.loc[:, '2020':'1976']:
        # Only entries listed in this year.
        filtyear = df_dept[year].notna()
        dfyear = df_dept[filtyear]
        # Frequency of each running year count among those entries.
        yr_cnt_counts = dfyear['Yr Cnt ' + year].value_counts()
        values_year = yr_cnt_counts.array
        keys_year = yr_cnt_counts.index
        # Small frame for this year, joined onto the master by career length.
        df_year = pd.DataFrame({f'{year} Keys': keys_year, f'{year} Values': values_year})
        df_master = df_master.join(df_year.set_index(f'{year} Keys'), on='Years')
    # plot() reads the finished matrix from the global longevity_data_2.
    df_master = df_master.set_index('Years')
    longevity_data_2 = df_master
    plot(i)
The charts above show density of career lengths over time. The darker the colour, the more entries. For each incremental year that cohort moves down one square and left one square. Under Production, for example, there are 6 entries in 1976. In 1978 there are 4 entries with a two year career, meaning 2 dropped out and 4 stayed in. So in 1978, 4 people have a two year career and 4 people have a 1 year career. In 2011, 6 entries show a career length of 10 years. In 2012 there are 5 entries with a career of 11 years (meaning one dropped out). And so on.
The darker the trail leading down and left, the more sustainable the career during that period. So we see careers starting in the mid 90s lasting comparatively well. Patches of light colour relatively high up in the charts indicate (relatively) high numbers of dropouts.
Presence in the listings after 10 years is comparatively rare, so seeing any people lasting this long is significant in its own right.
# Median across years of the first row (career length of 1 year) of the
# most recently built department matrix.
long_series = longevity_data_2.iloc[0]
long_series.median()
long_series.median()
# Entries per department, for all listings and for those present for more
# than 1 year, drawn as two overlaid bar series.
dept_counts = df['Role 1 Category'].value_counts()
depts = dept_counts.index.tolist()
nums = dept_counts.tolist()

filt_1_yr = (df['No of Yrs'] > 1)
df_more_than_1_year = df[filt_1_yr]
# value_counts() sorts by its own counts, so the filtered result has to be
# re-ordered to the department order of `depts`; otherwise a bar could be
# drawn against the wrong department. Departments without any entry over
# one year get 0 so both series keep the same length.
nums_more_than_1_year = (
    df_more_than_1_year['Role 1 Category']
    .value_counts()
    .reindex(depts, fill_value=0)
    .tolist()
)

width = 0.8
fig, ax = plt.subplots()
ax.bar(depts, nums, color=colours[3], width=width, label='All listings')
ax.bar(depts, nums_more_than_1_year, color=colours[4], width=width, label="Listings in Directory for more than 1 year")
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/department_role_category_comparison.png', facecolor='#ffffff')
plt.show()
Gender is inferred. The Film Bang directory does not ask people to declare gender, or other personal information. The gender presented here is inferred from the names of entries (done during data capture, prior to anonymisation of the data). Where gender is recorded here as 'unknown' it refers to a name that might conventionally refer to either a man or a woman.
# Overall gender split: absolute counts, then shares formatted as whole percentages.
df['Gender'].value_counts()
df['Gender'].value_counts(normalize=True).map(format_percent)
# Gender split inside each department.
role_grp = df.groupby(['Role 1 Category'])
role_grp['Gender'].value_counts().tail(300)
# Count listed entries per gender for every directory year and draw them
# as a stacked bar chart.
ct = {
    'Year': [],
    'Male': [],
    'Female': [],
    'Unknown': [],
}
# Rows are collected in a list and turned into a frame once;
# DataFrame.append no longer exists in pandas 2.x.
gender_rows = []
for column in df.loc[:, '2020':'1976']:
    # An entry is listed in a year when the year column holds that year's string.
    present = (df[column] == column)
    women = int((present & (df['Gender'] == 'Female')).sum())
    men = int((present & (df['Gender'] == 'Male')).sum())
    no_gender = int((present & (df['Gender'] == 'Unknown')).sum())
    gender_rows.append({'Year': column, 'Male': men, 'Female': women, 'Unknown': no_gender})
gender_ct = pd.DataFrame(gender_rows, columns=list(ct))

# Plot Data
gender_data = gender_ct
years_x = gender_data['Year']
no_g_y = gender_data['Unknown']
male_y = gender_data['Male']
female_y = gender_data['Female']
width = 0.7
fig, ax = plt.subplots()
# Stack order from the bottom: Female, Male, Unknown.
bars = np.add(male_y, female_y).tolist()
ax.bar(years_x, no_g_y, bottom=bars, color=colours[0], width=width, label='Gender Unknown')
ax.bar(years_x, male_y, bottom=female_y, color=colours[1], width=width, label='Male')
ax.bar(years_x, female_y, color=colours[2], width=width, label='Female')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_xticks(ticks=years_x)
# Font size is passed as a keyword: current Matplotlib does not accept a
# positional fontdict here.
ax.set_xticklabels(years_x, fontsize=12)
fig.autofmt_xdate(rotation=90, ha='right')
# The columns run from 2020 back to 1976; flip the axis so time runs left to right.
ax.invert_xaxis()
ax.set_facecolor('white')
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Final/basic_gender_chart_dp.png", facecolor='#ffffff')
plt.show()
# Convert the yearly gender counts to percentages of that year's total.
# 'Div' is the per-row factor 100 / (Male + Female + Unknown).
gender_ct['Div'] = 100 / (gender_ct['Male'] + gender_ct['Female'] + gender_ct['Unknown'])
for perc_col, count_col in (('m_perc', 'Male'), ('f_perc', 'Female'), ('u_perc', 'Unknown')):
    gender_ct[perc_col] = gender_ct['Div'] * gender_ct[count_col]
gender_ct
# Stacked bar chart of the yearly gender split in percent.
years_x = gender_ct['Year']
no_g_y = gender_ct['u_perc']
male_y = gender_ct['m_perc']
female_y = gender_ct['f_perc']
width = 0.7
# Stack order from the bottom: Female, Male, Unknown.
bars = np.add(male_y, female_y).tolist()
plt.bar(years_x, no_g_y, bottom=bars, color=colours[0], width=width, label='Gender Unknown')
plt.bar(years_x, male_y, bottom=female_y, color=colours[1], width=width, label='Male')
plt.bar(years_x, female_y, color=colours[2], width=width, label='Female')
plt.xlabel('')
plt.ylabel('')
plt.xticks(ticks=years_x, rotation=90)
# The years run from 2020 back to 1976; flip the axis so time runs left to right.
plt.gca().invert_xaxis()
# Style the Axes that already holds the bars. plt.axes() would add a new,
# empty Axes on top of it in current Matplotlib and hide the chart.
ax = plt.gca()
ax.set_facecolor('white')
plt.legend(frameon=1, fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Final/basic_gender_chart_percent.png", facecolor='#ffffff')
plt.show()
# Create CSV for basic stats: crew, crew gender, companies ADD Workshops
# Both lists are hard-coded and hold one value per row of gender_ct.
crew_totals = [688, 644, 636, 627, 634, 675, 627, 668, 664, 692, 639, 726, 708, 759, 786, 793, 806, 782, 728, 710, 713, 676, 651, 585, 535, 492, 476, 397, 361, 355, 313, 266, 206, 210, 197, 200, 119, 104, 85, 83, 66]
production_companies = [50, 51, 49, 49, 55, 56, 61, 60, 65, 64, 62, 77, 79, 78, 70, 72, 74, 78, 81, 87, 85, 82, 81, 72, 62, 60, 65, 56, 53, 48, 39, 35, 27, 34, 29, 26, 22, 16, 15, 13, 11]
for column_name, figures in (
    ('Total Crew', crew_totals),
    ('Companies', production_companies),
):
    gender_ct[column_name] = figures

# Export to CSV
gender_ct.to_csv('Outputs/CSV/To_DataWrapper/gender_ct.csv', index=False)
# Empty per-year gender table, filled in for each department further down.
gender_by_year = {key: [] for key in ('Year', 'Male', 'Female', 'Unknown')}
df_gender = pd.DataFrame(gender_by_year)
def gender_dept_plot(d, years_x, no_g_y, male_y, female_y, index):
    """Draw, save and show the stacked gender chart of one department.

    d: not used by the function; kept so existing calls keep working.
    years_x: year labels for the x axis.
    no_g_y, male_y, female_y: counts per year for each gender group.
    index: department name; shown as the first legend entry and used in
        the output file name.

    Bar width is read from the module-level `width`.
    """
    bars = np.add(male_y, female_y).tolist()
    # Invisible zero-height bar whose only job is to put the department
    # name into the legend. The name now comes from the `index` argument
    # rather than from whatever the global loop variable `i` happens to be.
    plt.bar(0, 0, color='none', label=f"{index}")
    # Stack order from the bottom: Female, Male, Unknown.
    plt.bar(years_x, no_g_y, bottom=bars, color=colours[0], width=width, label='Gender Unknown')
    plt.bar(years_x, male_y, bottom=female_y, color=colours[1], width=width, label='Male')
    plt.bar(years_x, female_y, color=colours[2], width=width, label='Female')
    plt.xlabel('')
    plt.ylabel('')
    plt.title(f'')
    # Style the Axes that already holds the bars. plt.axes() would add a
    # new, empty Axes on top of it in current Matplotlib.
    ax = plt.gca()
    ax.set_facecolor('white')
    plt.xticks(ticks=years_x, rotation=90, fontsize=14)
    plt.yticks(fontsize=14)
    plt.gca().invert_xaxis()
    plt.legend(fontsize=16)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"Outputs/Final/Gender/{index}_department_workers_by_gender.png", facecolor='#ffffff')
    plt.show()
# For every department: count listed entries per gender and year, then
# draw the department's stacked chart with gender_dept_plot().
# TODO: split further into data preparation and plotting.
width = 0.7
depts = df['Role 1 Category'].dropna().unique()
for i in depts:
    filt = (df['Role 1 Category'] == i)
    department = df[filt]
    # Rows are collected in a list and turned into a frame once per
    # department; DataFrame.append no longer exists in pandas 2.x.
    gender_rows = []
    for column in department.loc[:, '2020':'1976']:
        # An entry is listed in a year when the year column holds that year's string.
        present = (department[column] == column)
        women = int((present & (department['Gender'] == 'Female')).sum())
        men = int((present & (department['Gender'] == 'Male')).sum())
        no_gender = int((present & (department['Gender'] == 'Unknown')).sum())
        gender_rows.append({'Year': column, 'Male': men, 'Female': women, 'Unknown': no_gender})
    df_gender = pd.DataFrame(gender_rows, columns=['Year', 'Male', 'Female', 'Unknown'])
    years_x = df_gender['Year']
    no_g_y = df_gender['Unknown']
    male_y = df_gender['Male']
    female_y = df_gender['Female']
    # call plot function
    gender_dept_plot(depts, years_x, no_g_y, male_y, female_y, i)
# Gender counts within every career length, printed for reference.
g = df.groupby(['No of Yrs'])
g_count = g['Gender'].value_counts()
print(g_count)

# Table with one row per career length (1-42) that receives one count
# column per gender below.
df_clg = pd.DataFrame({'Yrs': range(1, 43)})

# Split the data by gender.
m_filt = df['Gender'] == 'Male'
f_filt = df['Gender'] == 'Female'
u_filt = df['Gender'] == 'Unknown'
df_m = df.loc[m_filt]
df_f = df.loc[f_filt]
df_u = df.loc[u_filt]

# For each gender: entries per career length. The grouped value_counts is
# keyed by (career length, gender); only the career length is kept.
m_length = df_m.groupby(['No of Yrs'])
m_counts = m_length['Gender'].value_counts()
m_c_length_values = m_counts.array
m_c_length_keys = m_counts.index.tolist()
m_indexes = [key[0] for key in m_c_length_keys]

f_length = df_f.groupby(['No of Yrs'])
f_counts = f_length['Gender'].value_counts()
f_c_length_values = f_counts.array
f_c_length_keys = f_counts.index.tolist()
f_indexes = [key[0] for key in f_c_length_keys]

u_length = df_u.groupby(['No of Yrs'])
u_counts = u_length['Gender'].value_counts()
u_c_length_values = u_counts.array
u_c_length_keys = u_counts.index.tolist()
u_indexes = [key[0] for key in u_c_length_keys]

# Attach the three count columns to the career-length table.
df_male = pd.DataFrame({'M Yrs': m_indexes, 'M Values': m_c_length_values})
df_female = pd.DataFrame({'F Yrs': f_indexes, 'F Values': f_c_length_values})
df_unknown = pd.DataFrame({'U Yrs': u_indexes, 'U Values': u_c_length_values})
df_clg = df_clg.join(df_male.set_index('M Yrs'), on='Yrs')
df_clg = df_clg.join(df_female.set_index('F Yrs'), on='Yrs')
df_clg = df_clg.join(df_unknown.set_index('U Yrs'), on='Yrs')
#print(df_clg)
# Grouped bar chart (log scale) of entries per career length and gender.
width = 0.3
yrs = df_clg['Yrs']
male = df_clg['M Values']
female = df_clg['F Values']
unknown = df_clg['U Values']

fig, ax = plt.subplots()
# Three bars per career length: Female left, Male centred, Unknown right.
ax.bar(yrs + width, unknown, width=width, color=colours[0], label='Gender Unknown', log=True)
ax.bar(yrs, male, width=width, color=colours[1], label='Male', log=True)
ax.bar(yrs - width, female, width=width, color=colours[2], label='Female', log=True)
ax.set(xlabel='', ylabel='', facecolor='white')
ax.set_xticks(yrs)
ax.legend(loc='upper right', fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig("Outputs/Final/gender_longevity_log.png", facecolor='#ffffff')
plt.show()
# Sum the per-career-length gender counts into seven career-length bins
# and draw them as grouped bars.
bins = pd.cut(df_clg['Yrs'], [0, 1, 3, 6, 11, 18, 29, 42])
# observed=False states explicitly that every bin is kept as a group.
m_values = male.groupby(bins, observed=False)
f_values = female.groupby(bins, observed=False)
u_values = unknown.groupby(bins, observed=False)
m_out = list(m_values)
f_out = list(f_values)
u_out = list(u_values)

# Data: one total per bin for each gender (missing counts are skipped by sum()).
m_totals = [group.sum() for _, group in m_out]
f_totals = [group.sum() for _, group in f_out]
u_totals = [group.sum() for _, group in u_out]

indexes = ['1 year','2-3 years', '4-6 years', '7-11 years', '12-18 years', '19-29 years', '30-42 years']
width = 0.3
# Numeric bar positions let the three bars of a bin sit side by side:
# Female left, Male centred, Unknown right. Previously the centred
# 'Unknown' bar was drawn over half of the Male and Female bars.
positions = np.arange(len(indexes))
fig, ax = plt.subplots()
ax.bar(positions, m_totals, color=colours[1], width=width, label='Male')
ax.bar(positions - width, f_totals, color=colours[2], width=width, label='Female')
ax.bar(positions + width, u_totals, color=colours[0], width=width, label='Unknown Gender')
ax.set_xticks(positions)
ax.set_xticklabels(indexes)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_facecolor('white')
ax.legend(fontsize=12)
plt.tight_layout()
plt.savefig('Outputs/Final/longevity_gender_crew_bins.png', facecolor='#ffffff')
plt.show()
# Table of the binned totals: rows 1, 2, 3 hold Male, Female and Unknown.
df_gender_bins = pd.DataFrame(columns=indexes)
for row_label, bin_totals in enumerate((m_totals, f_totals, u_totals), start=1):
    df_gender_bins.loc[row_label] = bin_totals
df_gender_bins
depts = df['Role 1 Category'].value_counts().index.tolist()

# Duplicate 'Yr Cnt 1976' so that the column-wise .diff() used further down
# has a column to the left of the first real year-count column.
df['Yr Cnt 1976 x'] = df['Yr Cnt 1976']

# Put the year columns (and the year-count columns) in chronological order.
# The directory has no edition for some early years, hence the explicit list.
year_labels = ['1976', '1978', '1979', '1981', '1982', '1984']
year_labels += [str(year_number) for year_number in range(1986, 2021)]
leading_cols = [
    'UUID', 'Codes', 'Trainee prog', 'Gender', 'Role 1', 'Role 1 Category',
    'Role 2', 'Role 2 Category', 'Role 3', 'Role 3 Category', 'Has Consistent Role',
    'Postcode1', 'Rural', 'No of Yrs',
]
yr_cnt_labels = ['Yr Cnt ' + label for label in year_labels]
ordered_cols = (
    leading_cols
    + year_labels
    + ['Company', 'Description', 'Yr Cnt 1976 x']
    + yr_cnt_labels
)
df = df[ordered_cols]
df.head(1)
We start with two sets of year columns. The basic year columns contain strings such as '2020' or '1999'; presence or absence in the directory is represented by the presence or absence of a year string in those columns. The 'Yr Cnt Year' columns present a running tally of the number of years a person has been in the directory. Due to the way the code runs, the total years (career length) is present in all subsequent columns regardless of when the person left the directory. The code below creates two things. First, a dataframe indicating the difference between one year and the next (inserting a 1.0 or a 0.0 value depending on whether the year tally increases or stays the same). Then it multiplies the 'Yr Cnt Year' cols by the difference values: (year tally × 1 = year tally), (year tally × 0 = 0).
# Keep the original 'Yr Cnt 1976' values so they can be put back after the
# .diff() calculation.
temp = df['Yr Cnt 1976']
temp.value_counts()
# Sanity check: the duplicate column carries the same value frequencies.
if temp.value_counts().values.tolist() == df['Yr Cnt 1976 x'].value_counts().values.tolist():
    print("Values Equal")

# Year-on-year difference of the running tallies, column by column.
yr_cnt_block = df.loc[:, 'Yr Cnt 1976 x':'Yr Cnt 2020']
df_diff = yr_cnt_block.diff(axis='columns')
#df_diff

# Put the original 1976 values back into the diff frame, replacing what
# .diff() produced for the first real year column.
df_diff['Yr Cnt 1976'] = temp
# Sanity check: the restored column matches the source column.
if df_diff['Yr Cnt 1976'].value_counts().values.tolist() == df['Yr Cnt 1976'].value_counts().values.tolist():
    print("Values Equal")

# Tally times difference: the tally where the difference is 1, otherwise 0.
df_results = yr_cnt_block.multiply(df_diff)
df_results.head()
## Checking when dropouts occur
This code inserts the year into a new column wherever the value in a given year is lower than the value in the previous year (a dropout year will contain 0), capturing the year when an entry drops out. Multiple columns are required because many entries drop out and re-enter the directory over the years, sometimes several times.
# Year-count column names and the matching year labels (chronological).
yr_cnt_range = df_results.loc[:, 'Yr Cnt 1976':'Yr Cnt 2020'].columns.tolist()
yr_range = df.loc[:, '1976':'2020'].columns.tolist()
# Integer version of the year labels.
int_yr_range = [int(label) for label in yr_range]
#print(int_yr_range)
nIYR = len(int_yr_range)

# For every year after the first, add a 'dropout YY' column that holds the
# year where the previous year's value is greater than this year's, else NaN.
for index, i in enumerate(int_yr_range[1:], start=1):
    j = str(i)
    previous_col = yr_cnt_range[index - 1]
    current_col = yr_cnt_range[index]
    dropped = df_results[previous_col] > df_results[current_col]
    df_results['dropout ' + j[2:]] = np.where(dropped, i, np.nan)
# Number of dropouts recorded for each year after the first.
dropouts = []
for i in yr_range[1:]:
    year_counts = df_results['dropout ' + i[2:]].value_counts()
    # No recorded dropout gives an empty result; count it as 0.
    dropouts.append((i, year_counts.iloc[0] if len(year_counts) else 0))
df_results['dropout 11'].value_counts().values[0]

dropout_list = [count for _, count in dropouts]
# Add a 0 value for 1976 at the front of dropout_list to account for no
# dropouts in the first year.
insert = 0  # index position
dropout_list[insert:insert] = [0]

# Optional: put dropouts in a new dataframe
df2 = pd.DataFrame({'values': dropout_list})
# Median number of dropouts per year
dropout_median = df2['values'].median()

# Get totals for each year
general_count = []
for column in df.loc[:, '1976':'2020']:
    step = df[column].value_counts()
    # .iloc[0] is an explicit positional lookup. step[0] on this
    # string-indexed result is deprecated positional access and becomes a
    # KeyError, which the former `except IndexError` would not catch.
    general_count.append(step.iloc[0] if len(step) else 0)
print(general_count)
We can now count the instances of a career length of 1 year and produce stats for the number of new entries per year. The 1 count is based on the presence of a year string in the original dataset, so there's no case of a first year counting as 0.
# New entries per year: rows whose result value for that year equals 1.
new_entries = [
    df_results[column].isin([1]).sum(axis=0)
    for column in df_results.loc[:, 'Yr Cnt 1976':'Yr Cnt 2020']
]
print(new_entries)

# Median number of new entries per year
df3 = pd.DataFrame({'values': new_entries})
new_entry_median = df3['values'].median()
print(new_entry_median)
df2 = pd.DataFrame({'values': dropout_list})
#print(df2)

# Number of dropout years recorded for every entry.
df_dropouts = df_results.loc[:, 'dropout 78':'dropout 20']
df['multi dropouts'] = df_dropouts.notna().sum(axis=1)
df['multi dropouts'].head(10)
df['multi dropouts'].value_counts(dropna=False)
dropout_tally_list = df['multi dropouts'].value_counts().tolist()
dropout_tally_list

# Discard the two largest tallies; what remains is summed into drop_sum.
for _ in range(2):
    try:
        dropout_tally_list.pop(0)
    except IndexError:
        print('Index Error')
drop_sum = sum(dropout_tally_list)

# Shares of all entries.
# NOTE(review): the entry total and the counts below are hard-coded rather
# than read from the tallies above - confirm they still match the data.
total_entries = 3634
d = 100 / total_entries
one_dropout = d * 2573
two_dropouts = d * 419
three_dropouts = d * 95
four_dropouts = d * 14
five_dropouts = d * 4
multi_dropout_total = d * 532
for heading, share in (
    ('One:', one_dropout),
    ('Two:', two_dropouts),
    ('Three:', three_dropouts),
    ('Four:', four_dropouts),
    ('Five:', five_dropouts),
    ('Multiple Dropouts:', multi_dropout_total),
):
    print(heading, share, ' %')
# Line chart of total entries, dropouts and new entries per year.
plt.rcParams['axes.facecolor'] = 'white'
plot_years = yr_range
fig, ax = plt.subplots()
ax.plot(plot_years, general_count, label="Total Entries", color="black", linewidth=3)
ax.plot(plot_years, dropout_list, label='Dropouts', color=colours[3], linewidth=3)
ax.plot(plot_years, new_entries, label='New Entries', color=colours[2], linewidth=3)
ax.set(xlabel='', ylabel='', title='')
ax.set_xticks(plot_years)
fig.autofmt_xdate(rotation=90)
ax.set_facecolor('white')
ax.legend(fontsize=14)
ax.grid(True)
fig.tight_layout()
fig.savefig('Outputs/Final/turnover_new_entries_dropouts_totals.png', facecolor='#ffffff')
plt.show()
# Mirror chart: new entries as bars going up in the top panel, dropouts as
# bars going down in the bottom panel, on a shared x axis.
width = 0.5
fig, axs = plt.subplots(2, 1, sharex=True)
upper_ax, lower_ax = axs
# Dropouts are negated so that their bars point downwards.
neg_dropout_list = [-count for count in dropout_list]

upper_ax.bar(plot_years, new_entries, width=width, color=colours[2], label='New Entries')
upper_ax.set_xticks(plot_years)
lower_ax.bar(plot_years, neg_dropout_list, width=width, color=colours[3], label='Dropouts')
lower_ax.set_xticks(plot_years)
upper_ax.legend(fontsize=14)
lower_ax.legend(fontsize=14, loc=2, bbox_to_anchor=(0.0,0.9))

plt.xticks(rotation=90)
plt.tight_layout()
# Remove the vertical gap so the two panels touch.
plt.subplots_adjust(hspace=0)
lower_ax.set_xlabel('')
upper_ax.set_ylabel('')
upper_ax.yaxis.set_label_coords(-0.05,-0.2)
fig.set_facecolor('white')
for panel in (upper_ax, lower_ax):
    panel.set_facecolor('white')

# Label the downward axis with absolute values instead of negatives.
ticks = lower_ax.get_yticks()
lower_ax.set_yticklabels([int(abs(tick)) for tick in ticks])
for panel in (upper_ax, lower_ax):
    panel.grid(False)
plt.savefig('Outputs/Final/turnover_new_entries_dropouts_mirror_bars.png', facecolor='#ffffff')
plt.show()
Dropouts and new entries, while presented above as somewhat symmetrical, are not opposing poles. A person can be a new entry only once, in one particular year. A person can drop out and re-enter the directory as many times as they like; those dropouts are all counted and included in the chart above. So there will be more dropouts than new entries regardless. The meaning of a dropout is also ambiguous. Leaving the Film Bang directory does not mean leaving the industry or failing in the industry.
Dropouts are measured by comparing a value with a (lower) value in the following year. The dropout is located in the year with the low value. So if a person is present in the directory in 2018 and absent in 2019, the dropout is recorded as happening in 2019.
# Table with one column per year: row 1 holds new entries, row 2 the
# (negated) dropouts used in the mirror chart.
dropouts_index = ['New Entries', 'Dropouts']
df_dropouts = pd.DataFrame(columns=plot_years)
for row_label, row_values in ((1, new_entries), (2, neg_dropout_list)):
    df_dropouts.loc[row_label] = row_values
#df_dropouts.to_html()
#df_dropouts.iloc[1]
# Collapse the per-year dropout columns into one 'dropout year' column.
df_test = df_results
# NOTE(review): the result of this call is not assigned, so df_test is unchanged by it.
df_test.replace('%', '', regex=True).astype('float')
yr_cnt_cols = df_test.loc[:, 'Yr Cnt 1976 x':'Yr Cnt 2020'].columns
df_test = df_test.drop(columns=yr_cnt_cols)
df_test
# Row-wise sum of the remaining dropout columns.
# NOTE(review): for an entry with several dropouts this adds the years
# together instead of picking one of them - confirm that is intended.
df_test["dropout year"] = df_test.sum(axis=1)
df_test.head(10)
df_test.shape
df.shape

# Attach the dropout columns to the main frame.
df_cox = df.join(df_test)
df_cox.head(30)

# Remove the columns that are not needed for the statistical analysis.
df_cox = df_cox.drop(columns=['Codes', 'Role 1', 'Role 2', 'Role 3', 'Company', 'Description'])
df_cox = df_cox.drop(columns=df_cox.loc[:, 'dropout 78':'dropout 20'].columns)
df_cox = df_cox.drop(columns=df_cox.loc[:, 'Yr Cnt 1976 x':'Yr Cnt 2020'].columns)
df_cox.to_csv('Outputs/CSV/df_for_statistical_analysis.csv', index=False)

# Preparation for the per-variable frames: missing trainee programme becomes 0
# (this happens after the CSV above has been written).
df_cox['Trainee prog'] = df_cox['Trainee prog'].fillna(0)
We took the UK Film Council and BFI statistical yearbooks from 2002 - 2019. These are not a stable, reliable source of data due to changes in the way the ONS counted companies in the screen industries. We found multiple errors in the transcription of data from the ONS into the yearbooks themselves, and noted multiple variants in the figures for given years over time. However, these figures present a 'good enough' comparison as long as we don't try to read anything very deep into the detail. We took the figures for the years 1996 - 2018 and went for the most plausible. MORE DETAIL ON PROCESS...
# Self-employed numbers, BFI yearbooks against Film Bang, 1996-2018.
# All three lists are in chronological order and of equal length.
# (An earlier 41-value Film Bang list that was immediately overwritten by
# the one below has been removed.)
bfi_se_nums = [
    8000,9000,13000,11000,11000,12000,15000,15000,15000,11000,13000,11000,
    11000,15000,20000,24000,22000,24000,28000,24000,27000,32000,31000]
bfi_years = [
    1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
film_bang_se_nums = [
    535, 585, 651, 676, 713, 710, 728, 782, 806, 793, 786, 759, 708, 726,
    639, 692, 664, 668, 627, 675, 634, 627, 636]

# Line chart of both series over integer x positions labelled with the years.
x_indexes = list(range(len(bfi_years)))
fig, ax = plt.subplots()
ax.plot(x_indexes, bfi_se_nums, color='magenta', label='BFI Numbers', linewidth=3)
ax.plot(x_indexes, film_bang_se_nums, color='blue', label='Film Bang Numbers', linewidth=3)
ax.set_facecolor('white')
ax.set_xlabel('')
ax.set_ylabel('Number of Self Employed')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
# Font size is passed as a keyword: current Matplotlib does not accept a
# positional fontdict here.
ax.set_xticklabels(bfi_years, fontsize=12)
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/bfi_film_bang_plot_comparison.png', facecolor='#ffffff')
plt.show()
# Log-scale bar chart of the same BFI / Film Bang comparison.
# film_bang_se_nums is already chronological (1996-2018), like bfi_se_nums
# and bfi_years, so it is used as it is: reversing it paired each Film Bang
# value with the wrong year, and flipped again on every re-run of the cell.
print(film_bang_se_nums)
width = 0.8
fig, ax = plt.subplots()
ax.bar(x_indexes, bfi_se_nums, color=colours[6], width=width, label='BFI', log=True)
ax.bar(x_indexes, film_bang_se_nums, color=colours[2], width=width, label='Film Bang', log=True)
for x_pos, bfi_value, film_bang_value in zip(x_indexes, bfi_se_nums, film_bang_se_nums):
    # The +500 / +50 offsets only lift the label above the bar; the text
    # itself is the actual value.
    ax.text(x=x_pos, y=bfi_value + 500, s=str(bfi_value), size=8, horizontalalignment='center')
    ax.text(x=x_pos, y=film_bang_value + 50, s=str(film_bang_value), size=8, horizontalalignment='center')
ax.set_facecolor('white')
ax.set_xlabel('')
ax.set_ylabel('No. Entries - Log Scale')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
# Font size is passed as a keyword: current Matplotlib does not accept a
# positional fontdict here.
ax.set_xticklabels(bfi_years, fontsize=12)
fig.autofmt_xdate(rotation=90)
plt.legend(loc='upper left', fontsize=14)
plt.grid(True)
plt.savefig("Outputs/Final/bfi_film_bang_bar_comparison.png", facecolor='#ffffff')
plt.show()