Film Bang Directory¶

Jupyter Notebook for computing and displaying basic figures¶

Crew Data - Growth & Decline
Departments
Urban / Rural
Trainee Program
Career Lengths & Longevity
Gender
Dropouts
Data for Cox Proportional Hazard Modelling
Comparing Film Bang Data With BFI

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns

plt.style.use('fivethirtyeight')
plt.style.use('seaborn')
plt.style.use('seaborn-colorblind')
plt.rcParams['figure.figsize'] = [12, 6]

x = str
df = pd.read_csv('Core_Data/Film_Bang_Personnel_Master_Step_2.csv', dtype={'Trainee prof':x, '2020': x, '2019': x, '2018': x,
       '2017': x, '2016': x, '2015': x, '2014': x, '2013': x, '2012': x, '2011': x, '2010': x, '2009': x,
       '2008': x, '2007': x, '2006': x, '2005': x, '2004': x, '2003': x, '2002': x, '2001': x, '2000': x,
       '1999': x, '1998': x, '1997': x, '1996': x, '1995': x, '1994': x, '1993': x, '1992': x, '1991': x,
       '1990': x, '1989': x, '1988': x, '1987': x, '1986': x, '1984': x, '1982': x, '1981': x,
       '1979': x, '1978': x, '1976': x})

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 300)

colours = sns.color_palette('colorblind', as_cmap=True)
#colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
#colours = sns.color_palette("rocket_r", as_cmap=True)

# Function to format percentage values for visualisation
format_percent = lambda x: format((x*100), '.0f')

The code below brings basic analysis into one document and shows the output of various calculations. The anonymisation of the data has been carried out in a separate notebook so names, full postcodes, locations etc are removed. We're left with the first 3 characters of the 'last known' postcode where entries have one.

This notebook will have other subsidiary notebooks added to it over the next few days (3/10/20)

Index:

Basic Description of Data
Departments
Location
Trainee Program
Longevity

Number of Entries, Columns

df.shape

(3634, 98)

df.columns

Index(['UUID', 'Codes', 'Trainee prog', 'Gender', 'Role 1', 'Role 1 Category',
       'Role 2', 'Role 2 Category', 'Role 3', 'Role 3 Category',
       'Has Consistent Role', 'Postcode1', 'Rural', 'No of Yrs', '2020',
       '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011',
       '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002',
       '2001', '2000', '1999', '1998', '1997', '1996', '1995', '1994', '1993',
       '1992', '1991', '1990', '1989', '1988', '1987', '1986', '1984', '1982',
       '1981', '1979', '1978', '1976', 'Company', 'Description', 'Yr Cnt 2020',
       'Yr Cnt 2019', 'Yr Cnt 2018', 'Yr Cnt 2017', 'Yr Cnt 2016',
       'Yr Cnt 2015', 'Yr Cnt 2014', 'Yr Cnt 2013', 'Yr Cnt 2012',
       'Yr Cnt 2011', 'Yr Cnt 2010', 'Yr Cnt 2009', 'Yr Cnt 2008',
       'Yr Cnt 2007', 'Yr Cnt 2006', 'Yr Cnt 2005', 'Yr Cnt 2004',
       'Yr Cnt 2003', 'Yr Cnt 2002', 'Yr Cnt 2001', 'Yr Cnt 2000',
       'Yr Cnt 1999', 'Yr Cnt 1998', 'Yr Cnt 1997', 'Yr Cnt 1996',
       'Yr Cnt 1995', 'Yr Cnt 1994', 'Yr Cnt 1993', 'Yr Cnt 1992',
       'Yr Cnt 1991', 'Yr Cnt 1990', 'Yr Cnt 1989', 'Yr Cnt 1988',
       'Yr Cnt 1987', 'Yr Cnt 1986', 'Yr Cnt 1984', 'Yr Cnt 1982',
       'Yr Cnt 1981', 'Yr Cnt 1979', 'Yr Cnt 1978', 'Yr Cnt 1976'],
      dtype='object')

# Display of sample of data
df[['Trainee prog', 'Gender', 'Role 1', 'Role 1 Category', 'Rural', 'No of Yrs']].head(20)

Crew Data¶

plot_years = df.loc[:, '2020':'1976'].columns.tolist()

basic_totals_crew = []
col_range = df.loc[:, '2020':'1976']

def crew_count(output_array, dataframe):
    for col in dataframe.loc[:, '2020':'1976']:
        total = dataframe[col].value_counts(dropna=True)
        output_array.append(total[0])
    #print(output_array)

crew_count(basic_totals_crew, df)

Growth & Decline¶

x_indexes = list(range(len(plot_years)))
x_indexes.reverse()

fig, ax = plt.subplots()

ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize': 12})
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
ax.set_facecolor('white')

plt.tight_layout()
plt.savefig('Outputs/Final/film_bang_freelancers_over_time.png', facecolor='#ffffff')
plt.show()

basic_totals_crew.reverse()

list_76_89 = basic_totals_crew[0:10]
list_90_99 = basic_totals_crew[10:20]
list_2000_2009 = basic_totals_crew[20:30]
list_2010_2020 = basic_totals_crew[30:41]

plot_years.reverse()

plot_years_76_89 = plot_years[0:10]
plot_years_90_99 = plot_years[10:20]
plot_years_00_09 = plot_years[20:30]
plot_years_10_20 = plot_years[30:41]

x_indexes = list(range(len(plot_years)))
#print(x_indexes)
x_indexes_76_89 = x_indexes[0:10]
x_indexes_90_99 = x_indexes[10:20]
x_indexes_00_09 = x_indexes[20:30]
x_indexes_10_20 = x_indexes[30:41]

fig, ax = plt.subplots()

ax.plot(x_indexes_76_89, list_76_89, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes_76_89)
ax.set_xticklabels(plot_years_76_89, {'fontsize': 12})
ax.axvspan(x_indexes_76_89[0], x_indexes_76_89[9], facecolor=colours[0], alpha=0.5)
ax.set_facecolor('white')

ax.legend(loc='upper right', bbox_to_anchor=(0.4, 0., 0.5, 0.9), fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/proto-type_creative_cluster_76_89.png', facecolor='#ffffff')
plt.show()

fig, ax = plt.subplots()

ax.plot(x_indexes_90_99, list_90_99, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes_90_99)
ax.set_xticklabels(plot_years_90_99, {'fontsize': 12})
ax.axvspan(x_indexes_90_99[0], x_indexes_90_99[9], facecolor=colours[1], alpha=0.5)
ax.set_facecolor('white')

ax.legend(loc='upper left', bbox_to_anchor=(0.1, 0., 0.5, 0.9), fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/de-regulation_90_99.png', facecolor='#ffffff')
plt.show()

fig, ax = plt.subplots()

ax.plot(x_indexes_00_09, list_2000_2009, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes_00_09)
ax.set_xticklabels(plot_years_00_09, {'fontsize': 12})
plt.axvspan(x_indexes_00_09[0], x_indexes_00_09[9], facecolor=colours[2], alpha=0.5)
ax.set_facecolor('white')
ax.legend(loc='upper right', bbox_to_anchor=(0.4, 0., 0.5, 0.9), fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/limits_to_growth_00_09.png', facecolor='#ffffff')
plt.show()

fig, ax = plt.subplots()

ax.plot(x_indexes_10_20, list_2010_2020, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('New expansion, new productions')
ax.set_xticks(ticks=x_indexes_10_20)
ax.set_xticklabels(plot_years_10_20, {'fontsize': 12})
ax.axvspan(x_indexes_10_20[0], x_indexes_10_20[10], facecolor=colours[3], alpha=0.5)
ax.set_facecolor('white')

ax.legend(loc='upper right', bbox_to_anchor=(0.4, 0., 0.5, 0.9), fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/new_Expansion_10_20.png', facecolor='#ffffff')
plt.show()

Periods of Growth and Decline¶

fig, ax = plt.subplots()

ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='  Total Entries', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize':12})
fig.autofmt_xdate(rotation=90)

ax.axvspan(x_indexes[10], x_indexes[0], facecolor=colours[0], alpha=0.5, label='  Development of screen industry in Scotland')
ax.axvspan(x_indexes[20], x_indexes[10], facecolor=colours[1], alpha=0.5, label='  Deregulation, changing technology, flexible working')
ax.axvspan(x_indexes[30], x_indexes[20], facecolor=colours[2], alpha=0.5, label='  Decline of industry')
ax.axvspan(x_indexes[40], x_indexes[30], facecolor=colours[3], alpha=0.5, label='  Increased production, new broadcasters')
ax.set_facecolor('white')
ax.legend(frameon=1, loc='lower right', bbox_to_anchor=(0.4, 0.1, 0.5, 0.9), facecolor='inherit', edgecolor="#000000", fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/highlighting_growth_decline.png', facecolor='#ffffff')
plt.show()

fig, ax = plt.subplots()

ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize': 12})
fig.autofmt_xdate(rotation = 90)
ax.axvspan(x_indexes[10], x_indexes[0], facecolor=colours[0], alpha=0.5, label='Development of Screen Industry in Scotland')
ax.set_facecolor('white')
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/timeframes_phase_1.png', facecolor='#ffffff')
plt.show()

fig, ax = plt.subplots()

ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize': 12})
fig.autofmt_xdate(rotation=90)
ax.axvspan(x_indexes[20], x_indexes[10], facecolor=colours[1], alpha=0.5, label='Deregulation, changing technology, flexible working')
ax.set_facecolor('white')
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/timeframesphase_2.png', facecolor='#ffffff')
plt.show()

fig, ax = plt.subplots()
ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years)
fig.autofmt_xdate(rotation=90)
ax.axvspan(x_indexes[30], x_indexes[20], facecolor=colours[2], alpha=0.5, label='Decline of industry')
ax.set_facecolor('white')

ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/timeframesphase_3.png', facecolor='#ffffff')
plt.show()

fig, ax = plt.subplots()

ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years)
ax.axvspan(x_indexes[40], x_indexes[30], facecolor=colours[3], alpha=0.5, label='Increased Production, New Broadcasters')
fig.autofmt_xdate(rotation=90)
ax.set_facecolor('white')
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/timeframesphase_4.png', facecolor='#ffffff')
plt.show()

Total Entries per Year¶

fig, ax = plt.subplots()
width=0.8
ax.bar(plot_years, basic_totals_crew, width=width, color=colours[5], label='Total Freelancers Per Directory')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=plot_years)
fig.autofmt_xdate(rotation=90)

ax.legend(fontsize=14)
plt.tight_layout()
plt.savefig('Outputs/Final/listings_by_year.png', facecolor='#ffffff')
plt.show()

Showing Growth Alongside Historical Events¶

x_indexes = list(range(len(plot_years)))

fig, ax = plt.subplots()
ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Entries', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize': 12})
fig.autofmt_xdate(rotation=90)
ax.axvline(x=x_indexes[4], linewidth=3, color=colours[5], label='Channel 4 Starts')
ax.axvline(x=x_indexes[16], linewidth=3, color=colours[6], label='Tartan Shorts Launched')
ax.axvline(x=x_indexes[21], linewidth=3, color=colours[7], label='STV Crisis')
ax.axvline(x=x_indexes[28], linewidth=3, color=colours[8], label='Financial Crisis')
ax.axvline(x=x_indexes[39], linewidth=3, color=colours[9], label='Film Bang Website Relaunched')
ax.set_facecolor('white')
ax.legend(fontsize=14, frameon=1)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/workers_by_year_key_dates.png', facecolor='#ffffff')
plt.show()

at_leasts = []
df_year_exclusions = pd.DataFrame(columns=at_leasts)

for i in range(1, 41):
    string = f'AL {i}'
    at_leasts.append(string)
    df_year_exclusions[string] = np.where((df['No of Yrs'] > (i -1)), 1, np.nan)

at_least_vals = []
for col in at_leasts:
    at_least_vals.append(df_year_exclusions[col].value_counts().values[0])
print(at_least_vals)

[3632, 2512, 1982, 1627, 1380, 1163, 1035, 914, 813, 716, 631, 563, 509, 455, 408, 357, 326, 291, 260, 233, 207, 180, 150, 136, 115, 96, 79, 70, 59, 45, 35, 31, 25, 21, 14, 7, 4, 2, 1, 1]

x_vals = range(1,41)
width=0.8
fig, ax = plt.subplots()
ax.bar(x_vals, at_least_vals, color=colours[2], width = width, label='At least x years listing length')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=x_vals)
ax.legend(fontsize=14)
plt.tight_layout()
plt.savefig('Outputs/Final/at_least_x_listing_length.png', facecolor='#ffffff')
plt.show()

Departments¶

Figures for number of people in each department. Departments were allocated according to keywords found in role descriptions. There are 536 unique roles given throughout the data (Role 1). Film Bang used to offer a free field for crew to define their department or role. The current post 2019 website offers a dropdown list of options to choose from when signing up. There are 124 options to choose from. We have grouped roles into the categories listed below based on __ .

# Unique values in the Role 1 Column
df['Role 1'].nunique()

536

df['Role 1 Category'].value_counts()

Production         816
Camera             634
Art                470
Direction          282
Producer           242
Post-Production    226
Sound              205
Hair & Make-Up     201
Costume            154
Music               86
Support             83
Script              79
Casting             76
Construction        51
Special FX          22
Trainee              6
Name: Role 1 Category, dtype: int64

Percentages for above figures

df['Role 1 Category'].value_counts(normalize=True).apply(format_percent)
#df['Role 1 Category'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f'))

Production         22
Camera             17
Art                13
Direction           8
Producer            7
Post-Production     6
Sound               6
Hair & Make-Up      6
Costume             4
Music               2
Support             2
Script              2
Casting             2
Construction        1
Special FX          1
Trainee             0
Name: Role 1 Category, dtype: object

Has Consitent Role¶

This is a figure for where, for each entry, the Role 1, Role 2, Role 3 columns fall into the same category or not. Same category = consistent. Differing categories across 1, 2, 3 = not consistent. We are trying to determine if people are listing in different departments, spreading their net wide so to speak. Or not.

False = Director/Producer, Producer/Director

# Shows figures for those whose roles are consistent over the 3 columns (including empty values for 2 and 3), 
# and those where the roles differ by department
df['Has Consistent Role'].value_counts(dropna=False)

True     3093
NaN       469
False      72
Name: Has Consistent Role, dtype: int64

# Percentages for the above
df['Has Consistent Role'].value_counts(normalize=True, dropna=False).apply(format_percent)

True     85
NaN      13
False     2
Name: Has Consistent Role, dtype: object

filt = df.groupby(['Role 1 Category'])
filt['Has Consistent Role'].value_counts(dropna=False)

Role 1 Category  Has Consistent Role
Art              True                   429
                 NaN                     41
Camera           True                   580
                 NaN                     54
Casting          True                    72
                 NaN                      4
Construction     True                    51
Costume          True                   140
                 NaN                     14
Direction        True                   161
                 NaN                    100
                 False                   21
Hair & Make-Up   True                   195
                 NaN                      6
Music            True                    83
                 NaN                      3
Post-Production  True                   201
                 NaN                     25
Producer         NaN                    110
                 True                    81
                 False                   51
Production       True                   750
                 NaN                     66
Script           True                    59
                 NaN                     20
Sound            True                   196
                 NaN                      9
Special FX       True                    17
                 NaN                      5
Support          True                    72
                 NaN                     11
Trainee          True                     6
Name: Has Consistent Role, dtype: int64

# Percentages of the above
filt = df.groupby(['Role 1 Category'])
filt['Has Consistent Role'].value_counts(normalize=True, dropna=False).apply(format_percent)

Role 1 Category  Has Consistent Role
Art              True                    91
                 NaN                      9
Camera           True                    91
                 NaN                      9
Casting          True                    95
                 NaN                      5
Construction     True                   100
Costume          True                    91
                 NaN                      9
Direction        True                    57
                 NaN                     35
                 False                    7
Hair & Make-Up   True                    97
                 NaN                      3
Music            True                    97
                 NaN                      3
Post-Production  True                    89
                 NaN                     11
Producer         NaN                     45
                 True                    33
                 False                   21
Production       True                    92
                 NaN                      8
Script           True                    75
                 NaN                     25
Sound            True                    96
                 NaN                      4
Special FX       True                    77
                 NaN                     23
Support          True                    87
                 NaN                     13
Trainee          True                   100
Name: Has Consistent Role, dtype: object

In the above data we can see the departments Direction and Producer show the greatest instances of people having different roles across Role Categories 1, 2 & 3

# Data
depts = df['Role 1 Category'].value_counts().index.tolist()
nums = df['Role 1 Category'].value_counts().tolist()

print(depts)
print(nums)

['Production', 'Camera', 'Art', 'Direction', 'Producer', 'Post-Production', 'Sound', 'Hair & Make-Up', 'Costume', 'Music', 'Support', 'Script', 'Casting', 'Construction', 'Special FX', 'Trainee']
[816, 634, 470, 282, 242, 226, 205, 201, 154, 86, 83, 79, 76, 51, 22, 6]

tot = sum(nums)

nums_p_cents = []
for i in nums:
    x = (i / tot) * 100
    x = round(x)
    nums_p_cents.append(x)

print('Percentages', nums_p_cents)

Percentages [22, 17, 13, 8, 7, 6, 6, 6, 4, 2, 2, 2, 2, 1, 1, 0]

survey_dept_p_cents = [20, 15, 12, 5, 7, 5, 4, 0, 3, 0, 0, 2, 0, 2, 0, 0]

width=0.8
fig, ax = plt.subplots()
ax.bar(depts, nums_p_cents, color=colours[3], width = width, label='% of Film Bang Listings by Department')
ax.bar(depts, survey_dept_p_cents, color=colours[4], width = width, label="% of Survey Responses by Department")

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/percentages_dept_correlation.png', facecolor='#ffffff')
plt.show()

width=0.8
fig, ax = plt.subplots()
ax.bar(depts, nums, color=colours[3], width = width, label='Listings by Department')
for i in range(len(depts)):
    ax.text(x = depts[i], y = nums[i], s = nums[i], size = 15, horizontalalignment='center')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/department_role_category_v.png', facecolor='#ffffff')
plt.show()

# Plot
width=0.8
fig, ax = plt.subplots()
ax.barh(depts, nums, color=colours[3], label='Listings by Department')
for i in range(len(depts)):
    ax.text(x = nums[i], y = depts[i], s = nums[i], size = 15, verticalalignment='center', horizontalalignment='left')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/department_role_category_h.png', facecolor='#ffffff')
plt.show()

Production - Breakdown of this department given its size¶

filt = (df['Role 1 Category'] == 'Production')
production = df[filt]
df_production = pd.DataFrame(production)

df_production.loc[production['Role 1'].str.contains('Unit|Director|ProductionAssistant|Engineer|Floor|Script|Autocue|Location|Property|Scout|Runner|Newcomer|Production Assistant|Stagehand|Stage Hand|Studio|Production Executive|Drapes'), 'Role 1 SubCategory'] = 'Location'
df_production.loc[production['Role 1'].str.contains('Line|Researcher|Production Manager|Secretary|Accountant|Payroll|Production Co-ordinator|Production Co-Ordinator|Office|Publicist'), 'Role 1 SubCategory'] = 'Admin'
df_production.loc[production['Role 1'].str.contains('Driver|Pilot|Transport'), 'Role 1 SubCategory'] = 'Transport'

breakdown = production['Role 1 SubCategory'].value_counts().array
labels = ['Location', 'Admin', 'Transport']

fig, ax = plt.subplots()
ax.pie(breakdown,  labels=labels, textprops={'fontsize': 14}, pctdistance=0.85,
        labeldistance=1.2, colors=colours, shadow=False,
        startangle=45, autopct='%1.0f%%', wedgeprops={'edgecolor':'black'})
ax.set_title('')
ax.text(.0,.0,' ', fontsize=14, ha='right')

circle=plt.Circle( (0,0), 0.7, color='white')
p=plt.gcf()
p.gca().add_artist(circle)
plt.tight_layout()
plt.savefig('Outputs/Final/production_sub_categories_pie_chart_ring.png', facecolor='#ffffff')
plt.show()

Growth¶

# Create new df for storing filtered values
structure = {
    'Year':[],
    'Direction':[],
    'Music':[],
    'Producer':[],
    'Construction':[],
    'Costume':[],
    'Hair & Make-Up':[],
    'Sound':[],
    'Post-Production':[],
    'Casting':[],
    'Support':[],
    'Art':[],
    'Camera':[],
    'Production':[],
    'Script':[],
    'Special FX':[]
}

df_roles = pd.DataFrame(structure)

for column in df.loc[:, '2020':'1976']:
    # filter Role 1 Category for entries of 'Direction'
    filt1 = (df[column] == column) & (df['Role 1 Category'] == 'Direction')
    # apply filter
    direction = df[filt1]
    # count items in dataframe oject
    direction_count = len(direction.index)

    filt2 = (df[column] == column) & (df['Role 1 Category'] == 'Music')
    music = df[filt2]
    music_count = len(music.index)

    filt3 = (df[column] == column) & (df['Role 1 Category'] == 'Producer')
    producer = df[filt3]
    producer_count = len(producer.index)

    filt4 = (df[column] == column) & (df['Role 1 Category'] == 'Construction')
    construction = df[filt4]
    construction_count = len(construction.index)

    filt5 = (df[column] == column) & (df['Role 1 Category'] == 'Costume')
    costume = df[filt5]
    costume_count = len(costume.index)

    filt6 = (df[column] == column) & (df['Role 1 Category'] == 'Hair & Make-Up')
    hair = df[filt6]
    hair_count = len(hair.index)

    filt7 = (df[column] == column) & (df['Role 1 Category'] == 'Sound')
    sound = df[filt7]
    sound_count = len(sound.index)

    filt8 = (df[column] == column) & (df['Role 1 Category'] == 'Post-Production')
    postp = df[filt8]
    postp_count = len(postp.index)

    filt9 = (df[column] == column) & (df['Role 1 Category'] == 'Casting')
    casting = df[filt9]
    casting_count = len(casting.index)

    filt10 = (df[column] == column) & (df['Role 1 Category'] == 'Support')
    support = df[filt10]
    support_count = len(support.index)

    filt11 = (df[column] == column) & (df['Role 1 Category'] == 'Art')
    art = df[filt11]
    art_count = len(art.index)

    filt12 = (df[column] == column) & (df['Role 1 Category'] == 'Camera')
    camera = df[filt12]
    camera_count = len(camera.index)

    filt13 = (df[column] == column) & (df['Role 1 Category'] == 'Production')
    production = df[filt13]
    production_count = len(production.index)

    filt14 = (df[column] == column) & (df['Role 1 Category'] == 'Script')
    script = df[filt14]
    script_count = len(script.index)

    filt15 = (df[column] == column) & (df['Role 1 Category'] == 'Special FX')
    special = df[filt15]
    special_count = len(special.index)

    df_roles = df_roles.append({
        'Year': column,
        'Direction': direction_count,
        'Music': music_count,
        'Producer': producer_count,
        'Construction': construction_count,
        'Costume': costume_count,
        'Hair & Make-Up': hair_count,
        'Sound': sound_count,
        'Post-Production': postp_count,
        'Casting': casting_count,
        'Support': support_count,
        'Art': art_count,
        'Camera': camera_count,
        'Production': production_count,
        'Script': script_count,
        'Special FX': special_count
    }, ignore_index=True)
#df_roles

years = df_roles['Year']
producers = df_roles['Producer']
construction = df_roles['Construction']
direction = df_roles['Direction']
costume = df_roles['Costume']
hairs = df_roles['Hair & Make-Up']
sound = df_roles['Sound']
postp = df_roles['Post-Production']
casting = df_roles['Casting']
support = df_roles['Support']
art = df_roles['Art']
camera = df_roles['Camera']
production = df_roles['Production']
script = df_roles['Script']
special = df_roles['Special FX']

x_indexes = list(range(len(years)))
x_indexes.reverse()

linewidth=3
fig, ax = plt.subplots()
ax.plot(x_indexes, camera, color=colours[8], label='Camera', linewidth=linewidth)
ax.plot(x_indexes, production, color=colours[9], label='Production', linewidth=linewidth)
ax.plot(x_indexes, art, color=colours[7], label='Art', linewidth=linewidth)
ax.plot(x_indexes, direction, color=colours[5], label='Direction', linewidth=linewidth)
ax.plot(x_indexes, producers, color=colours[0], label='Producers', linewidth=linewidth)
ax.plot(x_indexes, sound, color=colours[3], label='Sound', linewidth=linewidth)
ax.plot(x_indexes, hairs, color=colours[2], label='Hair & Make-Up', linewidth=linewidth)
ax.plot(x_indexes, postp, color=colours[4], label='Post-Production', linewidth=linewidth)
ax.plot(x_indexes, costume, color=colours[1], label='Costume', linewidth=linewidth)
ax.plot(x_indexes, script, color=colours[8], label='Script', linewidth=linewidth)
ax.plot(x_indexes, support, color=colours[6], label='Support', linewidth=linewidth)
ax.plot(x_indexes, construction, color=colours[9], label='Construction', linewidth=linewidth)
ax.plot(x_indexes, casting, color=colours[5], label='Casting', linewidth=linewidth)
ax.plot(x_indexes, special, color=colours[2], label='Special FX', linewidth=linewidth)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(years)
fig.autofmt_xdate(rotation=90)
ax.set_facecolor('white')
ax.legend(fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/numbers_in_departmet_by_year.png', facecolor='#ffffff')
plt.show()

fig, ax = plt.subplots()
ax.plot(x_indexes, camera, color=colours[8], label='Camera', linewidth=linewidth)
ax.plot(x_indexes, production, color=colours[9], label='Production', linewidth=linewidth)
ax.plot(x_indexes, art, color=colours[7], label='Art', linewidth=linewidth)
ax.plot(x_indexes, producers, color=colours[0], label='Producers', linewidth=linewidth)
ax.plot(x_indexes, sound, color=colours[3], label='Sound', linewidth=linewidth)
ax.plot(x_indexes, postp, color=colours[4], label='Post-Production', linewidth=linewidth)
ax.plot(x_indexes, hairs, color=colours[2], label='Hair & Make-Up', linewidth=linewidth)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(years)
fig.autofmt_xdate(rotation=90)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/departments_high_growth.png', facecolor='#ffffff')
plt.show()

fig, ax = plt.subplots()
ax.plot(x_indexes, construction, color=colours[9], label='Construction', linewidth=linewidth)
ax.plot(x_indexes, costume, color=colours[1], label='Costume', linewidth=linewidth)
ax.plot(x_indexes, casting, color=colours[5], label='Casting', linewidth=linewidth)
ax.plot(x_indexes, support, color=colours[6], label='Support', linewidth=linewidth)
ax.plot(x_indexes, script, color=colours[2], label='Script', linewidth=linewidth)
ax.plot(x_indexes, special, color=colours[3], label='Special FX', linewidth=linewidth)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(years)
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/low_growth_depts.png', facecolor='#ffffff')
plt.show()

Production - Top 10 Roles , example¶

filt_production = (df['Role 1 Category'] == 'Production')
production_filtered = df[filt_production]
production_filtered['Role 1'].value_counts().head(10)

Runner                             138
Production Manager                  86
Production Assistant                85
Researcher                          75
Production Co-Ordinator             72
Location Manager                    40
Script Supervisor                   38
Production Accountant               24
Location Assistant                  23
Assistant Production Accountant     22
Name: Role 1, dtype: int64

# Get list of departments
depts_for_top_10 = df['Role 1 Category'].value_counts().index.tolist()

def csv_maker(val, ind, dep):
    df_to_csv = pd.DataFrame(columns=('Role', 'Number'))
    # Add to DataFrame
    df_to_csv['Role'] = ind
    df_to_csv['Number'] = val
    df_to_csv
    # Export to CSV
    dep = dep.lower()
    df_to_csv.to_csv(f'Outputs/CSV/{dep}_top_10.csv', index=False)
    #print(dep)
    #print(df_to_csv)

def department_top_10(dept_array):
    for i in depts_for_top_10:
        filt = (df['Role 1 Category'] == i)
        dept_filtered = df[filt]
        values = dept_filtered['Role 1'].value_counts().head(10).values.tolist()
        indexes = dept_filtered['Role 1'].value_counts().head(10).index.tolist()
        csv_maker(values, indexes, i)

department_top_10(depts_for_top_10)

Location¶

Urban / Rural¶

The dataset is compromised in terms of location data. We take the last known postcode of entries that include a postcode and check the postcode against a list of rural postcodes.

# Where no postcode is given
df['Postcode1'].isna().value_counts(dropna=False)

False    3104
True      530
Name: Postcode1, dtype: int64

# Number of Entries with Rural Postcode
df['Rural'].value_counts(dropna=False)

NaN     3479
True     155
Name: Rural, dtype: int64

# Entries with Urban Postcode / Entries with Rural Postcode 
filt = df['Postcode1'].notna()
postcodes = df[filt]
postcodes['Rural'].value_counts(dropna=False)

NaN     2949
True     155
Name: Rural, dtype: int64

# Percentage of Urban to Rural Postcodes 
filt = df['Postcode1'].notna()
postcodes = df[filt]
postcodes['Rural'].value_counts(normalize=True, dropna=False).apply(format_percent)

NaN     95
True     5
Name: Rural, dtype: object

# Percentage of all postcode values inc nan to Rural Postcodes 
df['Rural'].value_counts(normalize=True, dropna=False).apply(format_percent)

NaN     96
True     4
Name: Rural, dtype: object

Data for location is inconsistent and not tracked accurately over time. We've reduced this to a look at what is effectively a 'last known postcode area' for each entry.

20 Most Popular Postcode Areas¶

# The 20 most popular postcodes:
p_code_values = df['Postcode1'].value_counts().values.tolist()
p_code_indexes = df['Postcode1'].value_counts().index.tolist()

df_p_codes = pd.DataFrame(columns=('PostCode', 'Number'))
df_p_codes['Postcode'] = p_code_indexes
df_p_codes['Number'] = p_code_values
# Export to CSV
#df_p_codes.to_csv('Outputs/CSV/df_p_codes.csv', index=False)
df_p_codes.head(20)

# Number of crew in the 20 most popular postcodes
df['Postcode1'].value_counts().head(20).sum()

1530

Edinburgh & Glasgow Postcodes¶

df['PostCodeStrip'] = df['Postcode1']

if df['PostCodeStrip'].notna:
    df['PostCodeStrip'] = df['PostCodeStrip'].str[:2]
df['PostCodeStrip'].head(10)

0    NaN
1     DD
2     EH
3     G1
4     G1
5    NaN
6     G4
7    NaN
8    NaN
9    NaN
Name: PostCodeStrip, dtype: object

print(df['PostCodeStrip'].value_counts().tolist())

[697, 434, 358, 203, 191, 172, 132, 124, 108, 93, 89, 68, 64, 57, 56, 47, 46, 44, 21, 21, 17, 14, 10, 8, 5, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

p_code_nums = [697, 434, 358, 203, 191, 172, 132, 108, 89, 17, 1, 1]
sum_p_codes = sum(p_code_nums)
print('Glasgow and Edinburgh Postcodes: ' + str(sum_p_codes))

Glasgow and Edinburgh Postcodes: 2403

Trainee Program¶

# Trainee data float to string
df['Trainee prog'] = df['Trainee prog'].fillna(-1)
df['Trainee prog'] = df['Trainee prog'].astype(int)
df['Trainee prog'] = df['Trainee prog'].astype(str)
df['Trainee prog'] = df['Trainee prog'].replace('-1', np.nan)

df['Trainee prog'].count()

116

# Show number of trainees on Trainee Program in a given year
df['Trainee prog'].value_counts()

2007    12
2009     9
2004     8
2006     7
2002     7
1997     6
2000     6
1994     6
2019     5
1996     5
1990     4
1991     4
1993     4
1987     3
1978     3
1982     3
1986     3
1984     3
2011     3
1989     3
1983     3
2013     3
1979     2
2015     2
1981     1
1988     1
Name: Trainee prog, dtype: int64

trainee_index = df['Trainee prog'].value_counts().index.tolist()
trainee_values = df['Trainee prog'].value_counts().values.tolist()
trainee_index.insert(0, 'Year of Trainee Program')
df_trainees = pd.DataFrame(columns=trainee_index)

trainee_values.insert(0, 'Numbers')
df_trainees.loc[1] = trainee_values
# Export to CSV
df_trainees.to_csv('Outputs/CSV/df_trainees.csv', index=False)

Get Median Value for Numbers of Trainees¶

print('Average number of trainees over years the programs ran: ' + str(df['Trainee prog'].value_counts().median())
      + ' trainees')

Average number of trainees over years the programs ran: 3.5 trainees

Trainees self declaring as Trainees and not on the Trainee Program¶

# Filter Department for 'Trainee', return number of years in directory for each
filt = df['Role 1 Category'] == 'Trainee'
trainees = df[filt]
trainees['No of Yrs']

75      1.0
461     1.0
468     1.0
627     1.0
1016    3.0
2537    1.0
Name: No of Yrs, dtype: float64

Length of career of trainees ordered by the year they became trainees¶

years = df.groupby(['Trainee prog'])
years['No of Yrs'].value_counts()

Trainee prog  No of Yrs
1978          6.0          1
              36.0         1
              37.0         1
1979          7.0          1
              32.0         1
1981          29.0         1
1982          3.0          1
              26.0         1
              28.0         1
1983          3.0          1
              21.0         1
              36.0         1
1984          2.0          1
              14.0         1
              34.0         1
1986          17.0         1
              19.0         1
              34.0         1
1987          8.0          1
              12.0         1
              32.0         1
1988          7.0          1
1989          6.0          1
              10.0         1
              13.0         1
1990          2.0          1
              4.0          1
              9.0          1
              30.0         1
1991          1.0          1
              4.0          1
              5.0          1
              29.0         1
1993          11.0         2
              5.0          1
              7.0          1
1994          3.0          1
              4.0          1
              6.0          1
              7.0          1
              8.0          1
              22.0         1
1996          2.0          2
              10.0         1
              21.0         1
              23.0         1
1997          1.0          1
              3.0          1
              4.0          1
              7.0          1
              8.0          1
              16.0         1
2000          7.0          2
              1.0          1
              5.0          1
              8.0          1
              19.0         1
2002          2.0          3
              1.0          1
              3.0          1
              5.0          1
              11.0         1
2004          4.0          2
              7.0          2
              1.0          1
              3.0          1
              5.0          1
              6.0          1
2006          1.0          4
              3.0          2
              13.0         1
2007          1.0          3
              5.0          3
              6.0          2
              2.0          1
              3.0          1
              12.0         1
              13.0         1
2009          1.0          4
              2.0          2
              3.0          1
              4.0          1
              10.0         1
2011          1.0          1
              3.0          1
              4.0          1
2013          2.0          1
              8.0          1
              9.0          1
2015          3.0          2
2019          1.0          5
Name: No of Yrs, dtype: int64

years = df.groupby(['Trainee prog'])
years['No of Yrs'].mean()

Trainee prog
1978    26.333333
1979    19.500000
1981    29.000000
1982    19.000000
1983    20.000000
1984    16.666667
1986    23.333333
1987    17.333333
1988     7.000000
1989     9.666667
1990    11.250000
1991     9.750000
1993     8.500000
1994     8.333333
1996    11.600000
1997     6.500000
2000     7.833333
2002     3.714286
2004     4.625000
2006     3.285714
2007     5.000000
2009     2.777778
2011     2.666667
2013     6.333333
2015     3.000000
2019     1.000000
Name: No of Yrs, dtype: float64

New DataFrame from selected columns¶

# Revise this, it's wrong 
trainees = df[['Role 1 Category', 'Gender', 'Trainee prog', 'No of Yrs']].dropna()
trainees = trainees.sort_values(['Role 1 Category','Trainee prog'], ascending=False)
trainees

Trainee Gender¶

trainees['Gender'].value_counts()

Male       59
Female     54
Unknown     3
Name: Gender, dtype: int64

trainees['Gender'].value_counts(normalize=True).apply(format_percent)

Male       51
Female     47
Unknown     3
Name: Gender, dtype: object

Trainee Gender & Department¶

dept_grp = trainees.groupby(['Role 1 Category'])
dept_grp['Gender'].value_counts()

Role 1 Category  Gender
Art              Female      5
                 Male        5
Camera           Male       15
                 Female      7
                 Unknown     1
Costume          Female      6
Direction        Female      6
                 Male        3
Hair & Make-Up   Female      3
                 Male        1
Post-Production  Male        8
                 Female      5
Producer         Female      2
                 Male        1
Production       Female     19
                 Male       16
                 Unknown     2
Sound            Male       10
Support          Female      1
Name: Gender, dtype: int64

dept_grp = trainees.groupby(['Role 1 Category'])
dept_grp['Gender'].value_counts(normalize=True)

Role 1 Category  Gender
Art              Female     0.500000
                 Male       0.500000
Camera           Male       0.652174
                 Female     0.304348
                 Unknown    0.043478
Costume          Female     1.000000
Direction        Female     0.666667
                 Male       0.333333
Hair & Make-Up   Female     0.750000
                 Male       0.250000
Post-Production  Male       0.615385
                 Female     0.384615
Producer         Female     0.666667
                 Male       0.333333
Production       Female     0.513514
                 Male       0.432432
                 Unknown    0.054054
Sound            Male       1.000000
Support          Female     1.000000
Name: Gender, dtype: float64

Number of trainees in each department by year¶

year_grp = trainees.groupby(['Trainee prog'])
year_grp['Role 1 Category'].value_counts()

Trainee prog  Role 1 Category
1978          Post-Production    2
              Camera             1
1979          Camera             1
              Direction          1
1981          Production         1
1982          Camera             1
              Producer           1
              Production         1
1983          Camera             2
              Post-Production    1
1984          Camera             1
              Producer           1
              Sound              1
1986          Camera             1
              Post-Production    1
              Sound              1
1987          Production         1
              Sound              1
              Support            1
1988          Post-Production    1
1989          Camera             1
              Direction          1
              Sound              1
1990          Camera             2
              Art                1
              Post-Production    1
1991          Camera             1
              Direction          1
              Post-Production    1
              Production         1
1993          Direction          2
              Camera             1
              Hair & Make-Up     1
1994          Art                2
              Camera             1
              Direction          1
              Post-Production    1
              Production         1
1996          Camera             1
              Costume            1
              Direction          1
              Production         1
              Sound              1
1997          Art                1
              Costume            1
              Hair & Make-Up     1
              Post-Production    1
              Production         1
              Sound              1
2000          Production         2
              Art                1
              Camera             1
              Post-Production    1
              Sound              1
2002          Production         2
              Art                1
              Camera             1
              Direction          1
              Post-Production    1
              Producer           1
2004          Production         4
              Art                1
              Costume            1
              Hair & Make-Up     1
              Sound              1
2006          Production         4
              Camera             1
              Costume            1
              Sound              1
2007          Production         6
              Camera             2
              Art                1
              Costume            1
              Post-Production    1
              Sound              1
2009          Production         6
              Art                1
              Camera             1
              Hair & Make-Up     1
2011          Production         2
              Art                1
2013          Direction          1
              Post-Production    1
              Production         1
2015          Camera             1
              Production         1
2019          Camera             2
              Production         2
              Costume            1
Name: Role 1 Category, dtype: int64

Trainee Plots¶

#data
df_trainee = df['Trainee prog'].value_counts().sort_index()
trainee_year_index = df_trainee.index.tolist()
trainee_values = df_trainee.values

fig, ax = plt.subplots()
width=0.7
ax.bar(trainee_year_index, trainee_values, color=colours[2], width = width, label='Trainees')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=trainee_year_index)
fig.autofmt_xdate(rotation=90)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/trainees_per_year.png', facecolor='#ffffff')
plt.show()

Career Lengths & Longevity¶

Basic figures for Career Lengths

# Number of Years in Directory, number of entries present for that duration
df['No of Yrs'].value_counts()

1.0     1120
2.0      530
3.0      355
4.0      247
5.0      217
6.0      128
7.0      121
8.0      101
9.0       97
10.0      85
11.0      68
13.0      54
12.0      54
15.0      51
14.0      47
17.0      35
18.0      31
16.0      31
22.0      30
19.0      27
21.0      27
20.0      26
24.0      21
25.0      19
26.0      17
29.0      14
23.0      14
28.0      11
30.0      10
27.0       9
34.0       7
35.0       7
32.0       6
31.0       4
33.0       4
36.0       3
37.0       2
38.0       1
41.0       1
Name: No of Yrs, dtype: int64

Career length grouped into 5 year bins

bins = pd.cut(df['No of Yrs'], [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])
df.groupby(bins)['No of Yrs'].agg(['count'])

indexes = ['1 year','2-3 years', '4-6 years', '7-20 years', '20-41 years']

# Data
bins = pd.cut(df['No of Yrs'], [0, 1, 3, 6, 20, 41])
values = df.groupby(bins)['No of Yrs'].agg(['count'])
print(values)

           count
No of Yrs
(0, 1]      1120
(1, 3]       885
(3, 6]       592
(6, 20]      828
(20, 41]     207

# Simplify Data
simple_values = (values.values)
value_list = [ item for elem in simple_values for item in elem]

colors = sns.cubehelix_palette(start=2, rot=1, dark=0.3)
fig, ax = plt.subplots()
ax.pie(value_list,  labels=indexes, labeldistance=1.2, pctdistance=0.85, textprops={'fontsize': 14}, shadow=False,
        startangle=90, autopct='%1.0f%%', colors=colours, wedgeprops={'edgecolor':'black'})
ax.set_title('', fontsize=16, color='#635DC6', weight='bold')
ax.text(.0,.0,' ', fontsize=14, ha='right')
circle=plt.Circle( (0,0), 0.7, color='white')
p=plt.gcf()
p.gca().add_artist(circle)
plt.tight_layout()
plt.savefig('Outputs/Final/longevity_bins_percentages_ring.png', facecolor='#ffffff')
plt.show()

# Data
indexes = ['1-5 years','6-10 years', '11-15 years', '16-20 years', '>20']
bins = pd.cut(df['No of Yrs'], [0, 5, 10, 15, 20, 41])
values = df.groupby(bins)['No of Yrs'].agg(['count'])
print(values)

           count
No of Yrs
(0, 5]      2469
(5, 10]      532
(10, 15]     274
(15, 20]     150
(20, 41]     207

simple_values = (values.values)
value_list = [ item for elem in simple_values for item in elem]
# Plot
colors = sns.cubehelix_palette(start=2, rot=1, dark=0.3)
fig, ax = plt.subplots()
ax.pie(value_list,  labels=indexes, labeldistance=1.3, pctdistance=1.1, textprops={'fontsize': 14}, shadow=False,
        startangle=90, autopct='%1.0f%%', colors=colours, wedgeprops={'edgecolor':'black'})
ax.set_title('', fontsize=16, color='#635DC6', weight='bold')
ax.text(.0,.0,' ', fontsize=14, ha='right')
circle=plt.Circle( (0,0), 0.7, color='white')
p=plt.gcf()
p.gca().add_artist(circle)
plt.tight_layout()
plt.savefig('Outputs/Final/longevity_bins_percentages_ring_directory_1.png', facecolor='#ffffff')
plt.show()

Personnel in Directory for more than 1 year¶

filtered = df['No of Yrs'].apply(lambda x: x > 1)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(format_percent))

True     2512
False    1122
Name: No of Yrs, dtype: int64

Percentages
True     69
False    31
Name: No of Yrs, dtype: object

Personnel in Directory for more than 6 years¶

filtered = df['No of Yrs'].apply(lambda x: x > 6)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(format_percent))

False    2599
True     1035
Name: No of Yrs, dtype: int64

Percentages
False    72
True     28
Name: No of Yrs, dtype: object

Personnel in Directory for more than 15 years¶

filtered = df['No of Yrs'].apply(lambda x: x > 15)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(format_percent))

False    3277
True      357
Name: No of Yrs, dtype: int64

Percentages
False    90
True     10
Name: No of Yrs, dtype: object

Personnel in Directory for more than 20 years¶

filtered = df['No of Yrs'].apply(lambda x: x > 20)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(format_percent))

False    3427
True      207
Name: No of Yrs, dtype: int64

Percentages
False    94
True      6
Name: No of Yrs, dtype: object

Median Career Length All Entries

df['No of Yrs'].median()

3.0

Median Career Length for Entries in for more than 1 Year

filt = (df['No of Yrs'] > 1)
df2 = df[filt]
df2['No of Yrs'].median()

5.0

Career Lengths by Department¶

# Showing Long Careers & Breakdown by Department
lengths = df.groupby(['No of Yrs'])
lengths['Role 1 Category'].value_counts().tail(107)

No of Yrs  Role 1 Category
20.0       Art                6
           Producer           5
           Sound              5
           Direction          3
           Production         3
           Camera             2
           Costume            1
           Hair & Make-Up     1
21.0       Camera             8
           Sound              4
           Art                3
           Post-Production    3
           Production         3
           Producer           2
           Costume            1
           Direction          1
           Music              1
           Script             1
22.0       Art                7
           Camera             7
           Costume            3
           Production         3
           Direction          2
           Hair & Make-Up     2
           Post-Production    2
           Sound              2
           Casting            1
           Producer           1
23.0       Camera             5
           Art                2
           Production         2
           Direction          1
           Music              1
           Producer           1
           Script             1
           Sound              1
24.0       Camera             6
           Production         5
           Art                3
           Post-Production    2
           Costume            1
           Hair & Make-Up     1
           Producer           1
           Script             1
           Support            1
25.0       Art                5
           Sound              3
           Camera             2
           Hair & Make-Up     2
           Production         2
           Costume            1
           Direction          1
           Post-Production    1
           Producer           1
           Support            1
26.0       Camera             6
           Art                3
           Hair & Make-Up     3
           Direction          2
           Construction       1
           Producer           1
           Production         1
27.0       Camera             4
           Producer           2
           Direction          1
           Production         1
           Sound              1
28.0       Producer           3
           Camera             2
           Sound              2
           Art                1
           Post-Production    1
           Production         1
           Script             1
29.0       Camera             5
           Direction          3
           Production         2
           Sound              2
           Hair & Make-Up     1
           Producer           1
30.0       Camera             5
           Costume            2
           Art                1
           Post-Production    1
           Sound              1
31.0       Camera             2
           Post-Production    1
           Sound              1
32.0       Camera             3
           Sound              2
           Hair & Make-Up     1
33.0       Camera             2
           Sound              2
34.0       Camera             2
           Post-Production    2
           Art                1
           Direction          1
           Sound              1
35.0       Camera             3
           Art                2
           Costume            1
           Support            1
36.0       Camera             3
37.0       Art                1
           Post-Production    1
38.0       Sound              1
41.0       Camera             1
Name: Role 1 Category, dtype: int64

# Showing Short Careers Breakdown by Department Numbers
lengths = df.groupby(['No of Yrs'])
lengths['Role 1 Category'].value_counts().head(104)

No of Yrs  Role 1 Category
1.0        Production         315
           Camera             167
           Art                148
           Direction           65
           Post-Production     62
           Sound               61
           Hair & Make-Up      57
           Costume             51
           Producer            43
           Music               35
           Support             31
           Casting             28
           Construction        24
           Script              17
           Special FX          11
           Trainee              5
2.0        Production         139
           Camera              88
           Art                 62
           Post-Production     35
           Hair & Make-Up      32
           Direction           31
           Producer            31
           Music               21
           Sound               20
           Casting             19
           Support             18
           Costume             14
           Script              13
           Construction         4
           Special FX           2
3.0        Production          92
           Camera              42
           Art                 34
           Direction           33
           Producer            28
           Hair & Make-Up      24
           Post-Production     22
           Costume             17
           Script              17
           Sound               13
           Casting              9
           Support              9
           Construction         7
           Music                5
           Special FX           2
           Trainee              1
4.0        Production          59
           Camera              43
           Direction           25
           Art                 24
           Producer            19
           Hair & Make-Up      17
           Costume             14
           Post-Production     13
           Sound               11
           Music                7
           Script               6
           Support              4
           Casting              3
           Construction         2
5.0        Production          50
           Camera              40
           Art                 35
           Producer            16
           Direction           15
           Post-Production     15
           Hair & Make-Up      12
           Costume             10
           Sound               10
           Music                5
           Casting              3
           Construction         2
           Support              2
           Script               1
           Special FX           1
6.0        Production          24
           Camera              22
           Art                 18
           Sound               12
           Direction           11
           Post-Production     10
           Costume              8
           Producer             8
           Hair & Make-Up       4
           Construction         3
           Script               3
           Music                2
           Support              2
           Casting              1
7.0        Production          22
           Camera              19
           Art                 18
           Direction           15
           Producer            12
           Hair & Make-Up       8
           Post-Production      7
           Costume              6
           Sound                6
           Script               4
           Casting              1
           Construction         1
           Music                1
           Special FX           1
Name: Role 1 Category, dtype: int64

Visualizing Career Lengths¶

The following two charts show Longevity in the directory with frequency of entries with the given 'career' lengths plotted along the x axis. We chart the pattern for all entries, and for entries who are present for more than 1 year. While entries lasting only 1 year are at first glance an anomaly, being by far the biggest single group in the dataset, but declaring the least information given their short presence, we see the same pattern (scaling down) with the removal of those entries from the chart.

# Data
keys_list = df['No of Yrs'].value_counts().index.tolist()
values = df['No of Yrs'].value_counts().values
median = df['No of Yrs'].median()

# Plot
width=0.8
fig, ax = plt.subplots()
ax.bar(keys_list, values, color=colours[2], width = width, label='Crew')
ax.axvline(median, linewidth=3, color='black', label='Median')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=keys_list)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/basic_crew_longevity.png', facecolor='#ffffff')
plt.show()

# Data
filt = (df['No of Yrs'] > 1)
df2 = df[filt]
keys_list = df2['No of Yrs'].value_counts().index.tolist()
values = df2['No of Yrs'].value_counts().values
median = df2['No of Yrs'].median()

# Plot
width=0.8
fig, ax = plt.subplots()
ax.bar(keys_list, values, color=colours[4], width = width, label='Crew')
ax.axvline(median, linewidth=3, color='black', label='Median')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=keys_list)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/basic_crew_longevity>1yr.png', facecolor='#ffffff')
plt.show()

Key window of career lengths: between 4 and 20 years¶

We are interested in the factors that allow people to develop longer careers. The following chart shows frequency of career lengths in the entries for between 4 and 20 years. This offers a wide sample of entries who are clearly getting something out of being in the directory

1 Year Entries - Their Departments¶

filt_1 = (df['No of Yrs'] < 2)
df_1 = df[filt_1]
msg = 'Number in department in for only 1 year, historical'
print(msg)
print(len(msg) *'-')
entries = []
for dept in depts:
    filt_dept = (df_1['Role 1 Category'] == dept)
    df_depts = df_1[filt_dept]
    count = df_depts['No of Yrs'].value_counts().values
    entries.append(count[0])
    #print(dept, count)
print(entries)

Number in department in for only 1 year, historical
---------------------------------------------------
[315, 167, 148, 65, 43, 62, 61, 57, 51, 35, 31, 17, 28, 24, 11, 5]

depts2 = depts
depts2.pop(-1)
entries.pop(-1)

5

width=0.7
color = sns.cubehelix_palette(start=2, dark=0.5)

fig, ax = plt.subplots()
ax.bar(depts2, entries, color=colours[5], width = width, label='Crew')

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')

ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/freelancers_by_department_in_for_1_year.png', facecolor='#ffffff')
plt.show()

Long Careers In Film Bang¶

# Data
filt5 = (df['No of Yrs'] > 20 )
df4 = df[filt5]
keys_list = df4['No of Yrs'].value_counts().index.tolist()
values = df4['No of Yrs'].value_counts().values
# Plot
color = sns.cubehelix_palette(start=1, dark=0.5)
width=0.7
fig, ax = plt.subplots()
ax.bar(keys_list, values, color=colours[6], width = width, label='Crew')

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=keys_list)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/freelancer_longevity_>20_yrs.png', facecolor='#ffffff')
plt.show()

Career Lengths in Departments - Bins¶

import seaborn as sns
import matplotlib.pylab as plt

totals = df['Role 1 Category'].value_counts().array
depts = df['Role 1 Category'].value_counts().index.array
indexes = ['1 year','2-3 years', '4-6 years', '7-11 years', '12-18 years', '19-29 years', '30-42 years']
color = sns.color_palette('colorblind', as_cmap=True)

def bins_plotter(_index, department, med, values):
    width = 0.7
    total = str(totals[_index])
    label = department +'\n'+'Dept total = '+total+'\n' +'Median= '+med+' years'

    fig, ax = plt.subplots()
    ax.bar(indexes, values, color=color[9], width = width, label=label)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title('')
    ax.set_facecolor('white')
    ax.legend(fontsize=14)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('Outputs/Final/crew_longevity_by_year_prime_bins_bar_'+department+'.png', facecolor='#ffffff')
    plt.show()

def career_bins(series, dataframe):
    for index, dept in enumerate(series):
        filt_dept = (dataframe['Role 1 Category'] == dept)
        df_depts = dataframe[filt_dept]
        bins = pd.cut(df_depts['No of Yrs'], [0, 1, 3, 6, 11, 18, 29, 42])
        values = df_depts.groupby(bins)['No of Yrs'].agg(['count'])
        simple_values = (values.values)
        value_list = [ item for elem in simple_values for item in elem]
        median = str(df_depts['No of Yrs'].median())
        # call bins_plotter function
        bins_plotter(index, dept, median, value_list)

career_bins(depts, df)

Career Longevity / Turnover¶

Using the calculation for career length each year (a count produced for each year an entry is present in the directory) chart career lengths over time

years = [*range(1,42)]

df_master = pd.DataFrame({'Years': years})
for year in df.loc[:,'2020':'1976']:
    # filter out NAN values
    filtyear = df[year].notna()
    dfyear = df[filtyear]
    # get data
    values_year = dfyear['Yr Cnt '+year].value_counts().array
    keys_year = dfyear['Yr Cnt '+year].value_counts().index
    # second dataframe
    df_year = pd.DataFrame({f'{year} Keys':keys_year, f'{year} Values':values_year})
    # join dataframes 
    df_master = df_master.join(df_year.set_index(f'{year} Keys'), on='Years')
longevity_data = df_master.set_index('Years')

longevity_data.iloc[20].median()

10.0

print(longevity_data.iloc[19].median())
print(longevity_data.iloc[20].median())
print(longevity_data.iloc[21].median())

10.5
10.0
8.0

# drop_vals used in this code were read from another chart, not coded here. 
# write a function to retrieve this data.
dropout_years = [1978,1979,1981,1982,1984,1986,1987,1988,1989,1990,1991, 1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020]
df_one_year_dropouts = pd.DataFrame(columns=dropout_years)
drop_vals = [12, 7, 3, 3,  2,  39, 6,  5,  9,  23, 24, 29, 10, 13, 35, 18, 16, 20, 36, 23, 38, 28,11,30, 44, 35,30, 32, 29, 68,15, 46, 19, 30, 18, 35, 16, 32, 40, 40]
df_one_year_dropouts.loc[1] = drop_vals
print(df_one_year_dropouts)

df_one_year_dropouts2 = pd.DataFrame()
df_one_year_dropouts2['Years'] = dropout_years
df_one_year_dropouts2['Dropouts'] = drop_vals
df_one_year_dropouts2.to_csv('Outputs/CSV/To_DataWrapper/df_dropouts.csv', index=False)

  1978 1979 1981 1982 1984 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995  \
1   12    7    3    3    2   39    6    5    9   23   24   29   10   13   35

  1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010  \
1   18   16   20   36   23   38   28   11   30   44   35   30   32   29   68

  2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
1   15   46   19   30   18   35   16   32   40   40

df_one_year_dropouts.loc[1].median()

23.5

import matplotlib.pylab as pylt

plot_years = df.loc[:, '2020':'1976']
hm_colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)

pylt.figure(figsize=(22,10))
ax = sns.heatmap(longevity_data, linewidth=0.3, cmap=hm_colours, annot=True, fmt=".0f")
pylt.yticks(rotation=0)
ax.xaxis.set_ticks_position('top')
ax.set_xticklabels(plot_years,rotation=90)
ax.set_facecolor('white')
pylt.xlabel('')
pylt.ylabel('Length of Career in Years')

#plt.title(f'Film Bang Freelancers Length of Career over Time' , fontsize=16, color='#635DC6', weight='bold')
#plt.figtext(.0,.0,'Fig. 1.3', fontsize=14, ha='right')
plt.savefig(f"Outputs/Final/career_longevity_basic_1.png", facecolor='#ffffff', dpi=500)
pylt.show()

# Convert to html table for screen readers
#longevity_data.to_html()

The chart above shows overall presence in the listings over the history of Film Bang.

Each horizontal row contains numbers of people in the data with that given length of career (top level is whoever has a 1 year presence in the listings). Columns are for each year. So we can track how many people have a given career length in each year.

We can see a bulge of darker colour emerging from the early nineties, reaching a peak in the early 2000s and dropping off to early 90s density from 2012 onwards.

We can also see an initial high density influx of listings in the first year, and a comparative drop off of 'new starts' until 1984 when the number of new entries jumps. We can see this cohort lingering on in the listings in the form of a darker trail leading down and left from the figure '89' at the top of the 1984 column.

Department Longevity¶

depts = df['Role 1 Category'].value_counts().index.array

df_master = pd.DataFrame({'Years': years})

import matplotlib.pylab as plt

plot_years = df.loc[:, '2020':'1976']

def plot(dept):
    hm_colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
    plt.figure(figsize=(22,10))
    ax = sns.heatmap(longevity_data_2, linewidth=0.3, cmap=hm_colours, annot=True, fmt=".0f")
    plt.yticks(rotation=0)
    ax.xaxis.set_ticks_position('top')
    ax.set_facecolor('white')
    ax.set_xticklabels(plot_years,rotation=90)
    plt.xlabel('')
    plt.ylabel('Number of Years in Directory')
    plt.title(f'{dept}', fontsize=14)
    #plt.figtext(.0,.0,'Fig. #', fontsize=14, ha='right')
    #plt.legend(dept)
    plt.savefig(f"Outputs/Final/career_longevity_{dept}_2.png", facecolor="#ffffff", dpi=500)
    plt.show()

# Loop through departments
for i in depts:
    #initialise a new df_master dataframe for each pass through departments
    years = [*range(1,42)]
    df_master = pd.DataFrame({'Years': years})
    # filter for department of given iteration
    deptfilt = (df['Role 1 Category'] == i)
    df_dept = df[deptfilt]
    # loop through year columns and filter out NAN values
    for year in df.loc[:, '2020':'1976']:
        filtyear = df_dept[year].notna()
        dfyear = df_dept[filtyear]
        # get numbers
        values_year = dfyear['Yr Cnt '+year].value_counts().array
        keys_year = dfyear['Yr Cnt '+year].value_counts().index
        # put them in a new mini dataframe
        df_year = pd.DataFrame({f'{year} Keys':keys_year, f'{year} Values':values_year})
        # join the mini dataframes to a temporary master dataframe
        df_master = df_master.join(df_year.set_index(f'{year} Keys'), on='Years')
    # assign the new master dataframe to data variable and plot it 
    # plot function will expect data to be the current dataframe
    df_master = df_master.set_index('Years')
    longevity_data_2 = df_master

    plot(i)

The charts above show density of career lengths over time. The darker the colour, the more entries. For each incremental year that cohort moves down one square and left one square. Under Production for example there are 6 entries under production in 1976. In 1978 there are 4 entries. Meaning 2 dropped out and 4 stayed in. In 1978 4 people have a two year career and 4 people have a 1 year career. In 2011 6 entries show a career length of 10 years. In 2012 there are 5 entries with a career of 11 years (meaning one dropped out). And so on.

The darker the trail leading down and left, the more sustainable the career during that period. So we see careers starting in the mid 90s lasting comparitively well. Patches of light colour relatively high up in the charts indicate (relatively) high numbers of drop outs.

Presence in the listings after 10 years is comparatively rare so seeing any people lasting this long is significant in its own right

long_series = longevity_data_2.iloc[0]
long_series.median()

1.0

longevity_data_2.iloc[0].median()

1.0

Dept data total and after 1 year¶

depts = df['Role 1 Category'].value_counts().index.tolist()
nums = df['Role 1 Category'].value_counts().tolist()

filt_1_yr = (df['No of Yrs'] > 1)
df_more_than_1_year = df[filt_1_yr]
nums_more_than_1_year = df_more_than_1_year['Role 1 Category'].value_counts().tolist()

width=0.8
fig, ax = plt.subplots()
ax.bar(depts, nums, color=colours[3], width = width, label='All listings')
ax.bar(depts, nums_more_than_1_year, color=colours[4], width=width, label="Listings in Directory for more than 1 year")
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/department_role_category_comparison.png', facecolor='#ffffff')
plt.show()

Gender¶

Gender is inferred. The Film Bang directory does not ask people to declare gender, or other personal information. The gender presented here is inferred from the names of entries (done during data capture, prior to anonymization of the data). Where gender is recorded here as 'unknown' it refers to a name that might conventionally refer to both a man or woman.

df['Gender'].value_counts()

Male       1982
Female     1542
Unknown     110
Name: Gender, dtype: int64

df['Gender'].value_counts(normalize=True).apply(format_percent)

Male       55
Female     42
Unknown     3
Name: Gender, dtype: object

role_grp = df.groupby(['Role 1 Category'])
role_grp['Gender'].value_counts().tail(300)

Role 1 Category  Gender
Art              Female     229
                 Male       221
                 Unknown     20
Camera           Male       537
                 Female      80
                 Unknown     17
Casting          Female      44
                 Male        29
                 Unknown      3
Construction     Male        51
Costume          Female     130
                 Male        18
                 Unknown      6
Direction        Male       182
                 Female      93
                 Unknown      7
Hair & Make-Up   Female     183
                 Male        15
                 Unknown      3
Music            Male        73
                 Female      11
                 Unknown      2
Post-Production  Male       161
                 Female      56
                 Unknown      9
Producer         Female     121
                 Male       119
                 Unknown      2
Production       Female     491
                 Male       300
                 Unknown     25
Script           Male        50
                 Female      28
                 Unknown      1
Sound            Male       177
                 Female      22
                 Unknown      6
Special FX       Male        17
                 Female       5
Support          Female      48
                 Male        27
                 Unknown      8
Trainee          Male         5
                 Female       1
Name: Gender, dtype: int64

ct = {
    'Year':[],
    'Male':[],
    'Female':[],
    'Unknown':[],
}
gender_ct = pd.DataFrame(ct)

for column in df.loc[:, '2020':'1976']:

    filt1 = (df[column] == column) & (df['Gender'] == 'Female')
    new_df = df[filt1]
    filt2 = (df[column] == column) & (df['Gender'] == 'Male')
    new_df2 = df[filt2]
    filt3 = (df[column] == column) & (df['Gender'] == 'Unknown')
    new_df3 = df[filt3]

    women = len(new_df.index)
    men = len(new_df2.index)
    no_gender = len(new_df3.index)
    gender_ct = gender_ct.append({'Year': column, 'Male': men,'Female': women, 'Unknown': no_gender}, ignore_index=True)

# Plot Data
gender_data = gender_ct
years_x = gender_data['Year']
no_g_y = gender_data['Unknown']
male_y = gender_data['Male']
female_y = gender_data['Female']

width=0.7
fig, ax = plt.subplots()

bars = np.add(male_y, female_y).tolist()
ax.bar(years_x, no_g_y, bottom=bars, color=colours[0], width=width, label='Gender Unknown')
ax.bar(years_x, male_y, bottom=female_y, color=colours[1], width=width, label='Male')
ax.bar(years_x, female_y, color=colours[2], width=width, label='Female')

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_xticks(ticks=years_x)
ax.set_xticklabels(years_x,{'fontsize':12})
fig.autofmt_xdate(rotation=90, ha='right')

fig.gca().invert_xaxis()
ax.set_facecolor('white')

ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Final/basic_gender_chart_dp.png", facecolor='#ffffff')
plt.show()

gender_ct['Div'] = 100/(gender_ct['Male'] + gender_ct['Female'] + gender_ct['Unknown'])
gender_ct['m_perc'] = gender_ct['Div'] * gender_ct['Male']
gender_ct['f_perc'] = gender_ct['Div'] * gender_ct['Female']
gender_ct['u_perc'] = gender_ct['Div'] * gender_ct['Unknown']

gender_ct
years_x = gender_ct['Year']
no_g_y = gender_ct['u_perc']
male_y = gender_ct['m_perc']
female_y = gender_ct['f_perc']
width=0.7

bars = np.add(male_y, female_y).tolist()
plt.bar(years_x, no_g_y, bottom=bars, color=colours[0], width=width, label='Gender Unknown')
plt.bar(years_x, male_y, bottom=female_y, color=colours[1], width=width, label='Male')
plt.bar(years_x, female_y, color=colours[2], width=width, label='Female')

plt.xlabel('')
plt.ylabel('')

plt.xticks(ticks=years_x, rotation=90)

plt.gca().invert_xaxis()

ax = plt.axes()
ax.set_facecolor('white')

plt.legend(frameon=1, fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Final/basic_gender_chart_percent.png", facecolor='#ffffff')

plt.show()

# Create CSV for basic stats: crew, crew gender, companies ADD Workshops
crew_totals = [688, 644, 636, 627, 634, 675, 627, 668, 664, 692, 639, 726, 708, 759, 786, 793, 806, 782, 728, 710, 713, 676, 651, 585, 535, 492, 476, 397, 361, 355, 313, 266, 206, 210, 197, 200, 119, 104, 85, 83, 66]
production_companies = [50, 51, 49, 49, 55, 56, 61, 60, 65, 64, 62, 77, 79, 78, 70, 72, 74, 78, 81, 87, 85, 82, 81, 72, 62, 60, 65, 56, 53, 48, 39, 35, 27, 34, 29, 26, 22, 16, 15, 13, 11]
gender_ct['Total Crew'] = crew_totals
gender_ct['Companies'] = production_companies
# Export to CSV
gender_ct.to_csv('Outputs/CSV/To_DataWrapper/gender_ct.csv', index=False)

gender_by_year = {
    'Year':[],
    'Male':[],
    'Female':[],
    'Unknown':[],
}
df_gender = pd.DataFrame(gender_by_year)

def gender_dept_plot(d, years_x, no_g_y, male_y, female_y, index):
    bars = np.add(male_y, female_y).tolist()
    plt.bar(0, 0, color='none', label=f"{i}")
    plt.bar(years_x, no_g_y, bottom=bars, color=colours[0], width=width, label='Gender Unknown')
    plt.bar(years_x, male_y, bottom=female_y, color=colours[1], width=width, label='Male')
    plt.bar(years_x, female_y, color=colours[2], width=width, label='Female')

    plt.xlabel('')
    plt.ylabel('')
    plt.title(f'')
    ax = plt.axes()
    ax.set_facecolor('white')
    plt.xticks(ticks=years_x, rotation=90, fontsize=14)
    plt.yticks(fontsize=14)
    plt.gca().invert_xaxis()
    plt.legend(fontsize=16)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"Outputs/Final/Gender/{index}_department_workers_by_gender.png", facecolor='#ffffff')
    plt.show()

# Refactor this to break into smaller parts: data manipulation and plots. 
# Create plot function with a dept parameter
# Call plot function within the loop, passing the dept / column variable as argument. 

width = 0.7
depts = df['Role 1 Category'].dropna().unique()

for i in depts:
    df_gender = df_gender[0:0]
    filt = (df['Role 1 Category'] == i)
    department = df[filt]
    for column in department.loc[:, '2020':'1976']:
        filt1 = (department[column] == column) & (department['Gender'] == 'Female')
        new_df = department[filt1]
        filt2 = (department[column] == column) & (department['Gender'] == 'Male')
        new_df2 = department[filt2]
        filt3 = (department[column] == column) & (department['Gender'] == 'Unknown')
        new_df3 = department[filt3]
        women = len(new_df.index)
        men = len(new_df2.index)
        no_gender = len(new_df3.index)
        df_gender = df_gender.append({'Year': column, 'Male': men,'Female': women, 'Unknown': no_gender}, ignore_index=True)

    #data = df_gender
    years_x = df_gender['Year']
    no_g_y = df_gender['Unknown']
    male_y = df_gender['Male']
    female_y = df_gender['Female']
    # call plot function
    gender_dept_plot(depts, years_x, no_g_y, male_y, female_y, i)

Career Length by Gender¶

g = df.groupby(['No of Yrs'])
g_count = g['Gender'].value_counts()
print(g_count)

No of Yrs  Gender
1.0        Male       589
           Female     482
           Unknown     49
2.0        Male       283
           Female     227
           Unknown     20
3.0        Male       176
           Female     168
           Unknown     11
4.0        Male       130
           Female     109
           Unknown      8
5.0        Male       110
           Female     105
           Unknown      2
6.0        Male        75
           Female      50
           Unknown      3
7.0        Male        62
           Female      59
8.0        Male        64
           Female      34
           Unknown      3
9.0        Male        57
           Female      36
           Unknown      4
10.0       Male        51
           Female      34
11.0       Male        40
           Female      26
           Unknown      2
12.0       Male        32
           Female      21
           Unknown      1
13.0       Female      27
           Male        27
14.0       Female      24
           Male        22
           Unknown      1
15.0       Male        31
           Female      19
           Unknown      1
16.0       Male        21
           Female      10
17.0       Male        26
           Female       9
18.0       Male        20
           Female      11
19.0       Male        17
           Female      10
20.0       Male        16
           Female      10
21.0       Male        18
           Female       7
           Unknown      2
22.0       Male        17
           Female      12
           Unknown      1
23.0       Female       7
           Male         7
24.0       Female      11
           Male        10
25.0       Male        10
           Female       8
           Unknown      1
26.0       Male        10
           Female       7
27.0       Male         9
28.0       Male         9
           Female       2
29.0       Male         8
           Female       6
30.0       Male         8
           Female       2
31.0       Male         4
32.0       Male         5
           Female       1
33.0       Female       2
           Male         2
34.0       Male         5
           Female       2
35.0       Male         4
           Female       3
36.0       Male         3
37.0       Female       1
           Male         1
38.0       Male         1
41.0       Male         1
Name: Gender, dtype: int64

Data¶

# Create Dataframe with Yrs column
df_clg = pd.DataFrame({'Yrs': range(1, 43)})

# Filtering
m_filt = (df['Gender'] == 'Male')
df_m = df[m_filt]
f_filt = (df['Gender'] == 'Female')
df_f = df[f_filt]
u_filt = (df['Gender'] == 'Unknown')
df_u = df[u_filt]

# parse data
m_length = df_m.groupby(['No of Yrs'])
m_c_length_values = m_length['Gender'].value_counts().array
m_c_length_keys = m_length['Gender'].value_counts().index.tolist()
m_indexes = [i[0] for i in m_c_length_keys]

f_length = df_f.groupby(['No of Yrs'])
f_c_length_values = f_length['Gender'].value_counts().array
f_c_length_keys = f_length['Gender'].value_counts().index.tolist()
f_indexes = [i[0] for i in f_c_length_keys]

u_length = df_u.groupby(['No of Yrs'])
u_c_length_values = u_length['Gender'].value_counts().array
u_c_length_keys = u_length['Gender'].value_counts().index.tolist()
u_indexes = [i[0] for i in u_c_length_keys]

# Join Data
df_male = pd.DataFrame({'M Yrs':m_indexes, 'M Values':m_c_length_values})
df_clg = df_clg.join(df_male.set_index('M Yrs'), on='Yrs')
df_female = pd.DataFrame({'F Yrs':f_indexes, 'F Values':f_c_length_values})
df_clg = df_clg.join(df_female.set_index('F Yrs'), on='Yrs')
df_unknown = pd.DataFrame({'U Yrs':u_indexes, 'U Values':u_c_length_values})
df_clg = df_clg.join(df_unknown.set_index('U Yrs'), on='Yrs')
#print(df_clg)

Plots¶

width = 0.3

yrs = df_clg['Yrs']
male = df_clg['M Values']
female = df_clg['F Values']
unknown = df_clg['U Values']

fig, ax = plt.subplots()

ax.bar(yrs + width, unknown, color=colours[0], width=width, label='Gender Unknown', log=True)
ax.bar(yrs, male, color=colours[1], width=width, label='Male',log=True)
ax.bar(yrs - width, female, color=colours[2], width=width, label='Female', log=True)

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_facecolor('white')
ax.set_xticks(ticks=yrs)

ax.legend(loc='upper right', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Final/gender_longevity_log.png", facecolor='#ffffff')
plt.show()

bins = pd.cut(df_clg['Yrs'], [0, 1, 3, 6, 11, 18, 29, 42])
m_values = male.groupby(bins)
f_values = female.groupby(bins)
u_values = unknown.groupby(bins)
m_out = [i for i in m_values]
f_out = [i for i in f_values]
u_out = [i for i in u_values]

# Data
m_totals = []
f_totals = []
u_totals = []
for i in m_out:
    m_totals.append(i[1].sum())
for i in f_out:
    f_totals.append(i[1].sum())
for i in u_out:
    u_totals.append(i[1].sum())

indexes = ['1 year','2-3 years', '4-6 years', '7-11 years', '12-18 years', '19-29 years', '30-42 years']
width = 0.3
fig, ax = plt.subplots()

ax.bar(indexes, m_totals, color=colours[1], width=width, align='edge', label='Male')
ax.bar(indexes, f_totals, color=colours[2], width=-width, align='edge', label='Female')
ax.bar(indexes, u_totals, color=colours[0], width=width,  label='Unknown Gender')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_facecolor('white')
ax.legend(fontsize=12)
plt.tight_layout()
plt.savefig('Outputs/Final/longevity_gender_crew_bins.png', facecolor='#ffffff')
plt.show()

df_gender_bins = pd.DataFrame(columns=indexes)
# Add to DataFrame
df_gender_bins.loc[1] = m_totals
df_gender_bins.loc[2] = f_totals
df_gender_bins.loc[3] = u_totals
df_gender_bins

Dropouts¶

depts = df['Role 1 Category'].value_counts().index.tolist()

# Create new column duplicating values from Yr Cnt 1976 
# in order to allow the .diff calculation below to work
# Calculation uses values in column to the left, so there has to be a duplicate column 
# 'to the left' of the first column to be calculated. 
df['Yr Cnt 1976 x'] = df['Yr Cnt 1976']

# Reverse year cols
df = df[['UUID', 'Codes', 'Trainee prog', 'Gender', 'Role 1', 'Role 1 Category',
       'Role 2', 'Role 2 Category', 'Role 3', 'Role 3 Category','Has Consistent Role',
         'Postcode1', 'Rural', 'No of Yrs', '1976', '1978', '1979', '1981', '1982',
         '1984', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
         '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002',
         '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011',
         '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
         'Company', 'Description', 'Yr Cnt 1976 x', 'Yr Cnt 1976', 'Yr Cnt 1978',
         'Yr Cnt 1979', 'Yr Cnt 1981', 'Yr Cnt 1982', 'Yr Cnt 1984', 'Yr Cnt 1986',
         'Yr Cnt 1987', 'Yr Cnt 1988', 'Yr Cnt 1989', 'Yr Cnt 1990', 'Yr Cnt 1991',
         'Yr Cnt 1992', 'Yr Cnt 1993', 'Yr Cnt 1994', 'Yr Cnt 1995', 'Yr Cnt 1996',
         'Yr Cnt 1997', 'Yr Cnt 1998', 'Yr Cnt 1999', 'Yr Cnt 2000', 'Yr Cnt 2001',
         'Yr Cnt 2002', 'Yr Cnt 2003', 'Yr Cnt 2004', 'Yr Cnt 2005', 'Yr Cnt 2006',
         'Yr Cnt 2007', 'Yr Cnt 2008', 'Yr Cnt 2009', 'Yr Cnt 2010', 'Yr Cnt 2011',
         'Yr Cnt 2012', 'Yr Cnt 2013', 'Yr Cnt 2014', 'Yr Cnt 2015', 'Yr Cnt 2016',
         'Yr Cnt 2017', 'Yr Cnt 2018', 'Yr Cnt 2019', 'Yr Cnt 2020']]
df.head(1)

We start with two sets of year columns. The basic year columns containing strings such as '2020' or '1999'. Presence or absence in the directory is represented by the presence or absence of a year string in those columns. Yr Cnt Year columns present a running tally of the number of years a person has been in the directory. Due to the way the code runs, the total years (career length) is present in all subsequent columns regardless of when the person left the directory. The code below creates two things: a dataframe indicating the difference between one year and the next (inserting a 1.0 or a 0.0 value depending on whether the year tally increases or stays the same. Then it multiplies the 'Yr Cnt Year' cols by the difference values: (year tally 1 = year tally), (year tally 0 = 0).

# Assign values from Yr Cnt 1976 col to a variable. 
# This allows retrieval of the original values once 
# the .diff calculation is run.
temp = df['Yr Cnt 1976']
temp.value_counts()

0    3568
1      66
Name: Yr Cnt 1976, dtype: int64

# check those values against the duplicate column
if temp.value_counts().values.tolist() == df['Yr Cnt 1976 x'].value_counts().values.tolist(): print("Values Equal")

Values Equal

# create diff dataframe
df_diff = df.loc[:, 'Yr Cnt 1976 x':'Yr Cnt 2020'].diff(axis='columns')
#df_diff

# Multiply the year count by the diff value to either get the total or 0

# Re-assign values from original Yr Cnt 1976 column to the current one. 
# diff operation was inserting NAN values in Yr Cnt 1976 col because the 
# column previous to it was an index.
df_diff['Yr Cnt 1976'] = temp

# check operation has worked
if df_diff['Yr Cnt 1976'].value_counts().values.tolist() == df['Yr Cnt 1976'].value_counts().values.tolist(): print("Values Equal")

Values Equal

# Multiply selected cols in dataframe by diff dataframe & show results
df_results = df.loc[:, 'Yr Cnt 1976 x':'Yr Cnt 2020'].multiply(df_diff)
df_results.head()

## Checking when dropouts occur

This code inserts a string 'year' - where the value in the given year is greater than the following (a dropout year will contain 0) - into a new column to capture the year when an entry drops out. Multiple columns are required because many entries drop out and re enter the directory over the years, sometimes several times.

# Get list of column names
yr_cnt_range = df_results.loc[:, 'Yr Cnt 1976':'Yr Cnt 2020'].columns.tolist()

# Get list of years
yr_range = df.loc[:, '1976':'2020'].columns.tolist()

# Get int version of years list
int_yr_range = []
for i in yr_range:
    int_yr_range.append(int(i))
#print(int_yr_range)

nIYR = len(int_yr_range)
for index in range(1, nIYR):
    i = int_yr_range[index]
    j = str(i)
    df_results['dropout '+ (j[2:])] = np.where((df_results[yr_cnt_range[index-1]] >
              df_results[yr_cnt_range[index]]), i, np.nan)

dropouts = []

# 
for i in yr_range[1:]:
    try:
        dropouts.append((i, df_results['dropout '+i[2:]].value_counts().values[0]))
    except IndexError:
        dropouts.append((i, 0))

df_results['dropout 11'].value_counts().values[0]

108

dropout_list = []
for i in dropouts:
    dropout_list.append(i[1])

# at a 0 value for 1976 in dropout_list to account for no dropouts in the first year
insert = 0 # index position
dropout_list[insert:insert] = [0]

# Optional put dropouts in new dataframe
df2 = pd.DataFrame({'values':dropout_list})

# Median number of dropouts per year
dropout_median = df2['values'].median()

# Get totals for each year
general_count = []
for column in df.loc[:, '1976':'2020']:
    try:
        step = df[column].value_counts()
        general_count.append(step[0])
    except IndexError:
        general_count.append(0)
print(general_count)

[66, 83, 85, 104, 119, 200, 197, 210, 206, 266, 313, 355, 361, 397, 476, 492, 535, 585, 651, 676, 713, 710, 728, 782, 806, 793, 786, 759, 708, 726, 639, 692, 664, 668, 627, 675, 634, 627, 636, 644, 688]

Calculating New Entries¶

We can now count the instances of a career length of 1 year and produce stats for the number of new entries per year. The 1 count is based on the presence of a year string in the original dataset, so there's no case of a first year counting as 0.

new_entries = []
for column in df_results.loc[:, 'Yr Cnt 1976':'Yr Cnt 2020']:
    current = df_results[column].isin([1]).sum(axis=0)
    new_entries.append(current)

print(new_entries)

[66, 29, 18, 29, 30, 89, 59, 33, 32, 77, 82, 83, 77, 77, 119, 93, 99, 115, 119, 132, 122, 106, 97, 124, 129, 109, 107, 108, 83, 112, 68, 98, 75, 96, 80, 104, 72, 102, 95, 114, 171]

# Median number of new entries per year
df3 = pd.DataFrame({'values':new_entries})
new_entry_median = df3['values'].median()
print(new_entry_median)

95.0

df2 = pd.DataFrame({'values':dropout_list})
#print(df2)

Where people drop out more than once¶

df_dropouts = df_results.loc[:, 'dropout 78':'dropout 20']

df['multi dropouts'] = df_dropouts.notna().sum(axis=1)

df['multi dropouts'].head(10)

0    0
1    1
2    1
3    1
4    1
5    0
6    1
7    0
8    0
9    0
Name: multi dropouts, dtype: int64

df['multi dropouts'].value_counts(dropna=False)

1    2573
0     529
2     419
3      95
4      14
5       4
Name: multi dropouts, dtype: int64

dropout_tally_list = df['multi dropouts'].value_counts().tolist()

dropout_tally_list

[2573, 529, 419, 95, 14, 4]

try:
    dropout_tally_list.pop(0)
except IndexError:
        print('Index Error')
try:
    dropout_tally_list.pop(0)
except IndexError:
        print('Index Error')
drop_sum = sum(dropout_tally_list)

total_entries = 3634
d = 100 / total_entries
one_dropout = d * 2573
two_dropouts = d * 419
three_dropouts = d * 95
four_dropouts = d * 14
five_dropouts = d * 4

multi_dropout_total = d * 532

print('One:' , one_dropout , ' %')
print('Two:' , two_dropouts , ' %')
print('Three:' , three_dropouts , ' %')
print('Four:' , four_dropouts , ' %')
print('Five:' , five_dropouts , ' %')
print('Multiple Dropouts:' , multi_dropout_total , ' %')

One: 70.80352228948816  %
Two: 11.529994496422674  %
Three: 2.6141992294991745  %
Four: 0.38525041276829936  %
Five: 0.1100715465052284  %
Multiple Dropouts: 14.639515685195377  %

Plots¶

plt.rcParams['axes.facecolor']='white'

plot_years = yr_range

fig, ax = plt.subplots()
ax.plot(plot_years, general_count, color="black", label="Total Entries", linewidth=3)
ax.plot(plot_years, dropout_list, color=colours[3], label='Dropouts', linewidth=3)
ax.plot(plot_years, new_entries, color=colours[2], label='New Entries', linewidth=3)

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=plot_years)
fig.autofmt_xdate(rotation=90)
ax.set_facecolor('white')
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()

plt.savefig('Outputs/Final/turnover_new_entries_dropouts_totals.png', facecolor='#ffffff')
plt.show()

width = 0.5
fig, axs = plt.subplots(2, 1, sharex=True)
neg_dropout_list = [ -x for x in dropout_list]

axs[0].bar(plot_years, new_entries, width=width, color=colours[2], label='New Entries')
axs[0].set_xticks(plot_years)
axs[1].bar(plot_years, neg_dropout_list, width=width, color=colours[3], label='Dropouts')
axs[1].set_xticks(plot_years)

axs[0].legend(fontsize=14)
axs[1].legend(fontsize=14, loc=2, bbox_to_anchor=(0.0,0.9))

plt.xticks(rotation=90)
plt.tight_layout()

plt.subplots_adjust(hspace=0)
axs[1].set_xlabel('')
axs[0].set_ylabel('')
axs[0].yaxis.set_label_coords(-0.05,-0.2)

fig.set_facecolor('white')
axs[0].set_facecolor('white')
axs[1].set_facecolor('white')

ticks =  axs[1].get_yticks()
axs[1].set_yticklabels([int(abs(tick)) for tick in ticks])
axs[0].grid(False)
axs[1].grid(False)
plt.savefig(f'Outputs/Final/turnover_new_entries_dropouts_mirror_bars.png', facecolor='#ffffff')

plt.show()

Dropouts and new entries, while presented above as somewhat symetrical, are not opposing poles. A person can be a new entry only once, in one particular year. A person can drop out and re-enter the directory as many times as they like; those dropouts are all counted and included in the chart above. So there will be more dropouts than new entries regardless. The meaning of a dropout is also ambiguous. Leaving the film bang directory does not mean leaving the industry or failing in the industry.

Dropouts are measured by comparing a value with a (lower) value in the following year. The dropout is located in the year with the low value. So if a person is present in the directory in 2018 and absent in 2019, the dropout is recorded as happening in 2019.

dropouts_index=['New Entries', 'Dropouts']
df_dropouts = pd.DataFrame(columns=plot_years)
# Add to DataFrame
df_dropouts.loc[1] = new_entries
df_dropouts.loc[2] = neg_dropout_list
#df_dropouts.to_html()

#df_dropouts.iloc[1]

# go through each row and create a new column with their dropout year in it. 
df_test = df_results
df_test.replace('%','',regex=True).astype('float')
df_test = df_test.drop(df_test.loc[:,'Yr Cnt 1976 x':'Yr Cnt 2020'].columns, axis = 1)
df_test
df_test["dropout year"] = df_test.sum(axis=1)
df_test.head(10)

df_test.shape

(3634, 41)

df.shape

(3634, 100)

Data for Cox Proportional Hazard Modelling¶

# take the above column and add it to the main df
df_cox = df.join(df_test)
df_cox.head(30)

# drop useless cols from that df
df_cox = df_cox.drop(['Codes', 'Role 1', 'Role 2', 'Role 3', 'Company', 'Description'], axis=1)

df_cox = df_cox.drop(df_cox.loc[:,'dropout 78':'dropout 20'].columns, axis = 1)

df_cox = df_cox.drop(df_cox.loc[:,'Yr Cnt 1976 x':'Yr Cnt 2020'].columns, axis = 1)

df_cox.to_csv(f'Outputs/CSV/df_for_statistical_analysis.csv', index=False)

# create new dfs for each variable to be checked
df_cox['Trainee prog'] = df_cox['Trainee prog'].fillna(0)

Comparing Film Bang data with BFI¶

Methodology for comparison¶

We took the UK Film Council and BFI statistical yearbooks from 2002 - 2019. These are not a stable reliable source of data due to changes in the way the ONS counted companies in the screen industries. We found multiple errors in transcribing data from ONS into the yearbooks themselves. And noted multiple variants in figures for years through time. However, these figures present a 'good enough' comparison as long as we don't try to read anything very deep into the detail. We took the figures for the years 1996 - 2018 and went for the most plausible. MORE DETAIL ON PROCESS...

film_bang_se_nums = [
    688, 644, 636, 627, 634, 675, 627, 668, 664, 692, 639, 726, 708,
    759, 786, 793, 806, 782, 728, 710, 713, 676, 651, 585, 535, 492,
    476, 397, 361, 355, 313, 266, 206, 210, 197, 200, 119, 104, 85,
    83, 66]

bfi_se_nums = [
    8000,9000,13000,11000,11000,12000,15000,15000,15000,11000,13000,11000,
    11000,15000,20000,24000,22000,24000,28000,24000,27000,32000,31000]
bfi_years = [
    1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
film_bang_se_nums = [
    535, 585, 651, 676, 713, 710, 728, 782, 806, 793, 786, 759, 708, 726,
    639, 692, 664, 668, 627, 675, 634, 627, 636]

x_indexes = list(range(len(bfi_years)))
fig, ax = plt.subplots()

ax.plot(x_indexes, bfi_se_nums, color='magenta', label='BFI Numbers', linewidth=3)
ax.plot(x_indexes, film_bang_se_nums, color='blue', label='Film Bang Numbers', linewidth=3)

ax.set_facecolor('white')
ax.set_xlabel('')
ax.set_ylabel('Number of Self Employed')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(bfi_years, {'fontsize':12})
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/bfi_film_bang_plot_comparison.png', facecolor='#ffffff')
plt.show()

# reverse list
film_bang_se_nums = list(reversed(film_bang_se_nums))
print(film_bang_se_nums)

[636, 627, 634, 675, 627, 668, 664, 692, 639, 726, 708, 759, 786, 793, 806, 782, 728, 710, 713, 676, 651, 585, 535]

width = 0.8
fig, ax = plt.subplots()

ax.bar(x_indexes, bfi_se_nums, color=colours[6], width=width, label='BFI', log=True)
ax.bar(x_indexes, film_bang_se_nums, color=colours[2], width=width, label='Film Bang', log=True)

for i in range(len(bfi_years)):
    ax.text(x = x_indexes[i], y = bfi_se_nums[i] + 500, s = bfi_se_nums[i] + 500, size = 8, horizontalalignment='center')
    ax.text(x = x_indexes[i], y = film_bang_se_nums[i] + 50, s = film_bang_se_nums[i] + 50, size = 8, horizontalalignment='center')

ax.set_facecolor('white')
ax.set_xlabel('')
ax.set_ylabel('No. Entries - Log Scale')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(bfi_years, {'fontsize':12})
fig.autofmt_xdate(rotation=90)

plt.legend(loc='upper left', fontsize=14)
plt.grid(True)
plt.savefig("Outputs/Final/bfi_film_bang_bar_comparison.png", facecolor='#ffffff')
plt.show()

	Trainee prog	Gender	Role 1	Role 1 Category	Rural	No of Yrs
0	NaN	Unknown	Location Scout	Production	NaN	1.0
1	NaN	Male	Camera (Lighting)	Camera	NaN	3.0
2	NaN	Male	Editor	Post-Production	NaN	3.0
3	NaN	Female	Costume Designer	Costume	NaN	22.0
4	NaN	Male	Sound Boom Operator	Sound	NaN	4.0
5	NaN	Female	Camera 2nd Assistant	Camera	NaN	1.0
6	NaN	Male	Camera (Lighting)	Camera	NaN	3.0
7	NaN	Unknown	Stunt Co-Ordinator	Casting	NaN	2.0
8	NaN	Unknown	Photographer	Camera	NaN	1.0
9	NaN	Unknown	Photographer	Camera	NaN	2.0
10	NaN	Male	Construction	Construction	NaN	13.0
11	NaN	Male	Runner	Production	NaN	1.0
12	NaN	Male	Assistant Editor	Post-Production	NaN	11.0
13	NaN	Male	Director	Direction	NaN	9.0
14	NaN	Male	Producer	Producer	NaN	7.0
15	NaN	Male	Petty Cash Buyer	Art	NaN	2.0
16	NaN	Male	Writer/Producer	Script	NaN	1.0
17	NaN	Female	Make-Up Assistant	Hair & Make-Up	NaN	2.0
18	NaN	Female	Unit Manager	Production	NaN	4.0
19	NaN	Male	Location Manager	Production	NaN	2.0

	PostCode	Number	Postcode
0	NaN	203	G12
1	NaN	144	G20
2	NaN	131	G41
3	NaN	124	G11
4	NaN	108	G42
5	NaN	91	EH6
6	NaN	91	EH3
7	NaN	89	G
8	NaN	63	G51
9	NaN	57	EH7
10	NaN	52	EH
11	NaN	51	G31
12	NaN	47	G3 8
13	NaN	46	G61
14	NaN	45	EH10
15	NaN	43	EH1
16	NaN	37	EH4
17	NaN	36	EH9
18	NaN	36	G3 6
19	NaN	36	G3 7

	Role 1 Category	Gender	Trainee prog	No of Yrs
3067	Support	Female	1987	12.0
3437	Sound	Male	2007	3.0
1060	Sound	Male	2006	1.0
937	Sound	Male	2004	6.0
1550	Sound	Male	2000	7.0
759	Sound	Male	1997	4.0
2046	Sound	Male	1996	21.0
585	Sound	Male	1989	10.0
3614	Sound	Male	1987	32.0
3570	Sound	Male	1986	19.0
1493	Sound	Male	1984	34.0
1833	Production	Male	2019	1.0
2302	Production	Female	2019	1.0
935	Production	Female	2015	3.0
1075	Production	Female	2013	9.0
198	Production	Unknown	2011	4.0
442	Production	Male	2011	3.0
382	Production	Female	2009	2.0
588	Production	Female	2009	1.0
767	Production	Male	2009	1.0
1029	Production	Female	2009	4.0
2444	Production	Male	2009	10.0
2684	Production	Male	2009	1.0
584	Production	Female	2007	1.0
1693	Production	Male	2007	5.0
2002	Production	Female	2007	2.0
2482	Production	Male	2007	1.0
2795	Production	Female	2007	13.0
2945	Production	Female	2007	1.0
1328	Production	Male	2006	3.0
1538	Production	Male	2006	13.0
2430	Production	Female	2006	1.0
2753	Production	Female	2006	1.0
1735	Production	Male	2004	3.0
1798	Production	Male	2004	1.0
2091	Production	Female	2004	7.0
2539	Production	Male	2004	4.0
1757	Production	Male	2002	2.0
3609	Production	Female	2002	2.0
1469	Production	Unknown	2000	5.0
2798	Production	Male	2000	1.0
1113	Production	Female	1997	8.0
3583	Production	Female	1996	10.0
268	Production	Female	1994	7.0
586	Production	Female	1991	1.0
463	Production	Female	1987	8.0
2182	Production	Male	1982	3.0
381	Production	Male	1981	29.0
2519	Producer	Female	2002	5.0
278	Producer	Female	1984	14.0
3619	Producer	Male	1982	26.0
218	Post-Production	Male	2013	2.0
829	Post-Production	Male	2007	6.0
1024	Post-Production	Male	2002	3.0
2333	Post-Production	Female	2000	8.0
932	Post-Production	Female	1997	1.0
363	Post-Production	Male	1994	22.0
2262	Post-Production	Male	1991	5.0
1183	Post-Production	Male	1990	4.0
2688	Post-Production	Female	1988	7.0
480	Post-Production	Female	1986	34.0
1939	Post-Production	Male	1983	3.0
1558	Post-Production	Male	1978	6.0
2950	Post-Production	Female	1978	37.0
2343	Hair & Make-Up	Male	2009	3.0
3095	Hair & Make-Up	Female	2004	7.0
2134	Hair & Make-Up	Female	1997	16.0
1568	Hair & Make-Up	Female	1993	11.0
2749	Direction	Female	2013	8.0
3428	Direction	Female	2002	1.0
427	Direction	Male	1996	2.0
3573	Direction	Female	1994	3.0
1763	Direction	Female	1993	5.0
2464	Direction	Male	1993	7.0
640	Direction	Female	1991	4.0
868	Direction	Female	1989	13.0
337	Direction	Male	1979	7.0
832	Costume	Female	2019	1.0
1187	Costume	Female	2007	6.0
2419	Costume	Female	2006	1.0
3291	Costume	Female	2004	5.0
2201	Costume	Female	1997	3.0
3077	Costume	Female	1996	2.0
1948	Camera	Male	2019	1.0
3010	Camera	Male	2019	1.0
3387	Camera	Unknown	2015	3.0
2628	Camera	Male	2009	2.0
545	Camera	Female	2007	5.0
1775	Camera	Male	2007	12.0
1190	Camera	Male	2006	3.0
86	Camera	Male	2002	11.0
1632	Camera	Male	2000	19.0
262	Camera	Female	1996	23.0
3415	Camera	Male	1994	6.0
2283	Camera	Female	1993	11.0
2062	Camera	Male	1991	29.0
1408	Camera	Female	1990	2.0
2816	Camera	Male	1990	9.0
1285	Camera	Male	1989	6.0
3532	Camera	Female	1986	17.0
2083	Camera	Female	1984	2.0
207	Camera	Female	1983	21.0
2635	Camera	Male	1983	36.0
476	Camera	Male	1982	28.0
454	Camera	Male	1979	32.0
1417	Camera	Male	1978	36.0
1138	Art	Female	2011	1.0
3537	Art	Female	2009	1.0
1002	Art	Female	2007	5.0
177	Art	Male	2004	4.0
3393	Art	Female	2002	2.0
633	Art	Female	2000	7.0
2429	Art	Male	1997	7.0
2296	Art	Male	1994	4.0
3581	Art	Male	1994	8.0
3069	Art	Male	1990	30.0

	count
No of Yrs
(0, 5]	2469
(5, 10]	532
(10, 15]	274
(15, 20]	150
(20, 25]	111
(25, 30]	61
(30, 35]	28
(35, 40]	6
(40, 45]	1

	1 year	2-3 years	4-6 years	7-11 years	12-18 years	19-29 years	30-42 years
1	589.0	459.0	315.0	274.0	179.0	131.0	34.0
2	482.0	395.0	264.0	189.0	121.0	80.0	11.0
3	49.0	31.0	13.0	9.0	3.0	4.0	0.0

	dropout 78	dropout 79	dropout 81	dropout 82	dropout 84	dropout 86	dropout 87	dropout 88	dropout 89	dropout 90	dropout 91	dropout 92	dropout 93	dropout 94	dropout 95	dropout 96	dropout 97	dropout 98	dropout 99	dropout 00	dropout 01	dropout 02	dropout 03	dropout 04	dropout 05	dropout 06	dropout 07	dropout 08	dropout 09	dropout 10	dropout 11	dropout 12	dropout 13	dropout 14	dropout 15	dropout 16	dropout 17	dropout 18	dropout 19	dropout 20	dropout year
0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2007.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2007.0
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2002.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2002.0
3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2020.0	2020.0
4	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2013.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2013.0
5	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
6	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1999.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1999.0
7	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
8	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
9	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0

	UUID	Codes	Trainee prog	Gender	Role 1	Role 1 Category	Role 2	Role 2 Category	Role 3	Role 3 Category	Has Consistent Role	Postcode1	Rural	No of Yrs	1976	1978	1979	1981	1982	1984	1986	1987	1988	1989	1990	...	dropout 97	dropout 98	dropout 99	dropout 00	dropout 01	dropout 02	dropout 03	dropout 04	dropout 05	dropout 06	dropout 07	dropout 08	dropout 09	dropout 10	dropout 11	dropout 12	dropout 13	dropout 14	dropout 15	dropout 16	dropout 17	dropout 18	dropout 19	dropout 20	dropout year
0	b8da19b3-1a0d-4dd6-9cd1-1cf69c8bd615	NaN	NaN	Unknown	Location Scout	Production	Missing	NaN	Missing	NaN	True	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
1	18c1559c-081b-4cbf-bd32-03b6103aa603	NaN	NaN	Male	Camera (Lighting)	Camera	Missing	NaN	Missing	NaN	True	DD10	NaN	3.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2007.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2007.0
2	978118a2-08fd-4621-b732-037b592e19b7	NaN	NaN	Male	Editor	Post-Production	Missing	NaN	Missing	NaN	True	EH8	NaN	3.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	2002.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2002.0
3	78597596-2381-4fc8-aa96-faa77cc79de5	NaN	NaN	Female	Costume Designer	Costume	Wardrobe Assistant	Costume	Missing	NaN	True	G14	NaN	22.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2020.0	2020.0
4	e1bf07fb-079d-4457-a4c7-9d3e784d873e	NaN	NaN	Male	Sound Boom Operator	Sound	Sound Assistant	Sound	Missing	NaN	True	G12	NaN	4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2013.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2013.0
5	375204bd-e245-44a3-bf11-d8721aa69fa7	NaN	NaN	Female	Camera 2nd Assistant	Camera	Missing	NaN	Missing	NaN	True	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
6	064372a8-468d-485a-9d58-c8b07d272fc8	NaN	NaN	Male	Camera (Lighting)	Camera	Missing	NaN	Missing	NaN	True	G41	NaN	3.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	1999.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1999.0
7	41051e49-8db1-4dd7-b9c5-6e096f00070d	NaN	NaN	Unknown	Stunt Co-Ordinator	Casting	Missing	NaN	Missing	NaN	True	NaN	NaN	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
8	453f96fd-618d-42f1-9549-9abdeaaa9ca0	NaN	NaN	Unknown	Photographer	Camera	Missing	NaN	Missing	NaN	True	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
9	ed55eaf9-b358-4158-adf3-4cb40678d1de	NaN	NaN	Unknown	Photographer	Camera	Missing	NaN	Missing	NaN	True	NaN	NaN	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
10	bf9dbc48-bb4d-4e95-b899-1178e97068b8	NaN	NaN	Male	Construction	Construction	Standby Carpenter	Construction	Missing	NaN	True	G3 6	NaN	13.0	NaN	NaN	NaN	NaN	NaN	NaN	1986	1987	1988	1989	1990	...	NaN	NaN	1999.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1999.0
11	4c6844ec-739e-4858-aa84-45b5b37b0c7d	NaN	NaN	Male	Runner	Production	Missing	NaN	Missing	NaN	True	G52	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2007.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2007.0
12	535d5e26-9be0-48e7-976e-80a485a69dbe	NaN	NaN	Male	Assistant Editor	Post-Production	Missing	NaN	Missing	NaN	True	EH8	NaN	11.0	NaN	NaN	NaN	NaN	NaN	1984	1986	1987	1988	1989	1990	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1996.0
13	3534f501-b86e-42e6-8109-567b79d47c3c	NaN	NaN	Male	Director	Direction	Missing	NaN	Missing	NaN	True	EH3	NaN	9.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2011.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2011.0
14	8d9f231e-5dd6-42a1-9fef-949984e225a6	NaN	NaN	Male	Producer	Producer	Director	Direction	Missing	NaN	NaN	G11	NaN	7.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1990	...	1997.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1997.0
15	c83e70e2-1b8a-4f19-a2e1-0a7dfa9e1a57	NaN	NaN	Male	Petty Cash Buyer	Art	Missing	NaN	Missing	NaN	True	G66	NaN	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2019.0	NaN	2019.0
16	c79c0679-df2a-4112-9c1e-ac3d1bea6fe7	NaN	NaN	Male	Writer/Producer	Script	Director	Direction	Missing	NaN	NaN	PH1	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1993.0
17	4ae7939e-629c-4bc2-91b5-c16dc3147351	NaN	NaN	Female	Make-Up Assistant	Hair & Make-Up	Make-Up Artist	Hair & Make-Up	Missing	NaN	True	G22	NaN	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2019.0	NaN	2019.0
18	b59e0b45-a387-4ade-8e4b-70182be0f5b3	NaN	NaN	Female	Unit Manager	Production	Missing	NaN	Missing	NaN	True	EH3	NaN	4.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1996.0
19	b4b4982b-2610-4be7-a2fb-054e78023ef4	NaN	NaN	Male	Location Manager	Production	Location Scout	Production	Missing	NaN	True	EH15	NaN	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2020.0	2020.0
20	3834ae52-72ac-4ceb-ad64-8c98c0876096	NaN	NaN	Female	Researcher	Production	Missing	NaN	Missing	NaN	True	DG7	NaN	3.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1995.0
21	c16d3aa3-e2c5-4062-9685-20d6dff94901	NaN	NaN	Male	Director Of Photography	Camera	Camera Operator	Camera	Camera (Lighting)	Camera	True	G75	NaN	11.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2018.0	NaN	NaN	2018.0
22	65bb41e8-2084-4f8b-a5de-7050a0f0c565	NaN	NaN	Male	Researcher	Production	Missing	NaN	Missing	NaN	True	G84	NaN	3.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2020.0	2020.0
23	e007b526-0fff-46f1-be19-5c0d20d1c968	NaN	NaN	Female	Wardrobe Assistant	Costume	Missing	NaN	Missing	NaN	True	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
24	76e9ddcd-9da6-43c4-add2-144f7b85bd0b	NaN	NaN	Male	Camera Assistant	Camera	Missing	NaN	Missing	NaN	True	EH	NaN	2.0	NaN	NaN	NaN	NaN	1982	1984	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1986.0
25	57865408-6927-4153-abcc-37e806177c70	NaN	NaN	Female	Production Assistant	Production	Missing	NaN	Missing	NaN	True	EH5	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1994.0
26	03eaf2dc-b55a-46a8-ae0b-da6e03b63a1f	NaN	NaN	Male	Production Manager	Production	Missing	NaN	Missing	NaN	True	NaN	NaN	1.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2020.0	2020.0
27	cab4d104-e02d-44dc-b1b8-2141786eb57a	NaN	NaN	Male	Camera (Lighting)	Camera	Director (Camera)	Camera	Missing	NaN	True	PA31	True	11.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2013.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2013.0
28	d26f25e4-3f9f-4b5a-a92c-4ab0e07b6dce	NaN	NaN	Male	Production Secretary	Production	Production Assistant	Production	Runner	Production	True	EH	NaN	6.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0
29	b63a6a84-f03b-4e4c-980b-6b3277a543e1	NaN	NaN	Female	Producer	Producer	Production Manager	Production	Production Assistant	Production	NaN	EH6	NaN	5.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1990	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1995.0