Film Bang Directory

Jupyter Notebook for computing and displaying basic figures

  • Crew Data - Growth & Decline
  • Departments
  • Urban / Rural
  • Trainee Program
  • Career Lengths & Longevity
  • Gender
  • Dropouts
  • Data for Cox Proportional Hazard Modelling
  • Comparing Film Bang Data With BFI
In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
In [2]:
plt.style.use('fivethirtyeight')
plt.style.use('seaborn')
plt.style.use('seaborn-colorblind')
plt.rcParams['figure.figsize'] = [12, 6]
In [3]:
x = str
df = pd.read_csv('Core_Data/Film_Bang_Personnel_Master_Step_2.csv', dtype={'Trainee prof':x, '2020': x, '2019': x, '2018': x,
       '2017': x, '2016': x, '2015': x, '2014': x, '2013': x, '2012': x, '2011': x, '2010': x, '2009': x,
       '2008': x, '2007': x, '2006': x, '2005': x, '2004': x, '2003': x, '2002': x, '2001': x, '2000': x,
       '1999': x, '1998': x, '1997': x, '1996': x, '1995': x, '1994': x, '1993': x, '1992': x, '1991': x,
       '1990': x, '1989': x, '1988': x, '1987': x, '1986': x, '1984': x, '1982': x, '1981': x,
       '1979': x, '1978': x, '1976': x})
In [4]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 300)
In [5]:
colours = sns.color_palette('colorblind', as_cmap=True)
#colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
#colours = sns.color_palette("rocket_r", as_cmap=True)
In [6]:
# Function to format percentage values for visualisation
format_percent = lambda x: format((x*100), '.0f')

The code below brings basic analysis into one document and shows the output of various calculations. The anonymisation of the data has been carried out in a separate notebook so names, full postcodes, locations etc are removed. We're left with the first 3 characters of the 'last known' postcode where entries have one.

This notebook will have other subsidiary notebooks added to it over the next few days (3/10/20)

Index:

  • Basic Description of Data
  • Departments
  • Location
  • Trainee Program
  • Longevity

Number of Entries, Columns

In [7]:
df.shape
Out[7]:
(3634, 98)
In [8]:
df.columns
Out[8]:
Index(['UUID', 'Codes', 'Trainee prog', 'Gender', 'Role 1', 'Role 1 Category',
       'Role 2', 'Role 2 Category', 'Role 3', 'Role 3 Category',
       'Has Consistent Role', 'Postcode1', 'Rural', 'No of Yrs', '2020',
       '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011',
       '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002',
       '2001', '2000', '1999', '1998', '1997', '1996', '1995', '1994', '1993',
       '1992', '1991', '1990', '1989', '1988', '1987', '1986', '1984', '1982',
       '1981', '1979', '1978', '1976', 'Company', 'Description', 'Yr Cnt 2020',
       'Yr Cnt 2019', 'Yr Cnt 2018', 'Yr Cnt 2017', 'Yr Cnt 2016',
       'Yr Cnt 2015', 'Yr Cnt 2014', 'Yr Cnt 2013', 'Yr Cnt 2012',
       'Yr Cnt 2011', 'Yr Cnt 2010', 'Yr Cnt 2009', 'Yr Cnt 2008',
       'Yr Cnt 2007', 'Yr Cnt 2006', 'Yr Cnt 2005', 'Yr Cnt 2004',
       'Yr Cnt 2003', 'Yr Cnt 2002', 'Yr Cnt 2001', 'Yr Cnt 2000',
       'Yr Cnt 1999', 'Yr Cnt 1998', 'Yr Cnt 1997', 'Yr Cnt 1996',
       'Yr Cnt 1995', 'Yr Cnt 1994', 'Yr Cnt 1993', 'Yr Cnt 1992',
       'Yr Cnt 1991', 'Yr Cnt 1990', 'Yr Cnt 1989', 'Yr Cnt 1988',
       'Yr Cnt 1987', 'Yr Cnt 1986', 'Yr Cnt 1984', 'Yr Cnt 1982',
       'Yr Cnt 1981', 'Yr Cnt 1979', 'Yr Cnt 1978', 'Yr Cnt 1976'],
      dtype='object')
In [9]:
# Display of sample of data
df[['Trainee prog', 'Gender', 'Role 1', 'Role 1 Category', 'Rural', 'No of Yrs']].head(20)
Out[9]:
Trainee prog Gender Role 1 Role 1 Category Rural No of Yrs
0 NaN Unknown Location Scout Production NaN 1.0
1 NaN Male Camera (Lighting) Camera NaN 3.0
2 NaN Male Editor Post-Production NaN 3.0
3 NaN Female Costume Designer Costume NaN 22.0
4 NaN Male Sound Boom Operator Sound NaN 4.0
5 NaN Female Camera 2nd Assistant Camera NaN 1.0
6 NaN Male Camera (Lighting) Camera NaN 3.0
7 NaN Unknown Stunt Co-Ordinator Casting NaN 2.0
8 NaN Unknown Photographer Camera NaN 1.0
9 NaN Unknown Photographer Camera NaN 2.0
10 NaN Male Construction Construction NaN 13.0
11 NaN Male Runner Production NaN 1.0
12 NaN Male Assistant Editor Post-Production NaN 11.0
13 NaN Male Director Direction NaN 9.0
14 NaN Male Producer Producer NaN 7.0
15 NaN Male Petty Cash Buyer Art NaN 2.0
16 NaN Male Writer/Producer Script NaN 1.0
17 NaN Female Make-Up Assistant Hair & Make-Up NaN 2.0
18 NaN Female Unit Manager Production NaN 4.0
19 NaN Male Location Manager Production NaN 2.0

Crew Data

In [10]:
plot_years = df.loc[:, '2020':'1976'].columns.tolist()
In [11]:
basic_totals_crew = []
col_range = df.loc[:, '2020':'1976']
In [12]:
def crew_count(output_array, dataframe):
    for col in dataframe.loc[:, '2020':'1976']:
        total = dataframe[col].value_counts(dropna=True)
        output_array.append(total[0])
    #print(output_array)
In [13]:
crew_count(basic_totals_crew, df)

Growth & Decline

In [14]:
x_indexes = list(range(len(plot_years)))
x_indexes.reverse()

fig, ax = plt.subplots()

ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize': 12})
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
ax.set_facecolor('white')

plt.tight_layout()
plt.savefig('Outputs/Final/film_bang_freelancers_over_time.png', facecolor='#ffffff')
plt.show()
In [15]:
basic_totals_crew.reverse()
In [16]:
list_76_89 = basic_totals_crew[0:10]
list_90_99 = basic_totals_crew[10:20]
list_2000_2009 = basic_totals_crew[20:30]
list_2010_2020 = basic_totals_crew[30:41]
In [17]:
plot_years.reverse()
In [18]:
plot_years_76_89 = plot_years[0:10]
plot_years_90_99 = plot_years[10:20]
plot_years_00_09 = plot_years[20:30]
plot_years_10_20 = plot_years[30:41]
In [19]:
x_indexes = list(range(len(plot_years)))
#print(x_indexes)
x_indexes_76_89 = x_indexes[0:10]
x_indexes_90_99 = x_indexes[10:20]
x_indexes_00_09 = x_indexes[20:30]
x_indexes_10_20 = x_indexes[30:41]
In [20]:
fig, ax = plt.subplots()

ax.plot(x_indexes_76_89, list_76_89, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes_76_89)
ax.set_xticklabels(plot_years_76_89, {'fontsize': 12})
ax.axvspan(x_indexes_76_89[0], x_indexes_76_89[9], facecolor=colours[0], alpha=0.5)
ax.set_facecolor('white')

ax.legend(loc='upper right', bbox_to_anchor=(0.4, 0., 0.5, 0.9), fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/proto-type_creative_cluster_76_89.png', facecolor='#ffffff')
plt.show()
In [21]:
fig, ax = plt.subplots()

ax.plot(x_indexes_90_99, list_90_99, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes_90_99)
ax.set_xticklabels(plot_years_90_99, {'fontsize': 12})
ax.axvspan(x_indexes_90_99[0], x_indexes_90_99[9], facecolor=colours[1], alpha=0.5)
ax.set_facecolor('white')

ax.legend(loc='upper left', bbox_to_anchor=(0.1, 0., 0.5, 0.9), fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/de-regulation_90_99.png', facecolor='#ffffff')
plt.show()
In [22]:
fig, ax = plt.subplots()

ax.plot(x_indexes_00_09, list_2000_2009, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes_00_09)
ax.set_xticklabels(plot_years_00_09, {'fontsize': 12})
plt.axvspan(x_indexes_00_09[0], x_indexes_00_09[9], facecolor=colours[2], alpha=0.5)
ax.set_facecolor('white')
ax.legend(loc='upper right', bbox_to_anchor=(0.4, 0., 0.5, 0.9), fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/limits_to_growth_00_09.png', facecolor='#ffffff')
plt.show()
In [23]:
fig, ax = plt.subplots()

ax.plot(x_indexes_10_20, list_2010_2020, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('New expansion, new productions')
ax.set_xticks(ticks=x_indexes_10_20)
ax.set_xticklabels(plot_years_10_20, {'fontsize': 12})
ax.axvspan(x_indexes_10_20[0], x_indexes_10_20[10], facecolor=colours[3], alpha=0.5)
ax.set_facecolor('white')

ax.legend(loc='upper right', bbox_to_anchor=(0.4, 0., 0.5, 0.9), fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/new_Expansion_10_20.png', facecolor='#ffffff')
plt.show()

Periods of Growth and Decline

In [24]:
fig, ax = plt.subplots()

ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='  Total Entries', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize':12})
fig.autofmt_xdate(rotation=90)

ax.axvspan(x_indexes[10], x_indexes[0], facecolor=colours[0], alpha=0.5, label='  Development of screen industry in Scotland')
ax.axvspan(x_indexes[20], x_indexes[10], facecolor=colours[1], alpha=0.5, label='  Deregulation, changing technology, flexible working')
ax.axvspan(x_indexes[30], x_indexes[20], facecolor=colours[2], alpha=0.5, label='  Decline of industry')
ax.axvspan(x_indexes[40], x_indexes[30], facecolor=colours[3], alpha=0.5, label='  Increased production, new broadcasters')
ax.set_facecolor('white')
ax.legend(frameon=1, loc='lower right', bbox_to_anchor=(0.4, 0.1, 0.5, 0.9), facecolor='inherit', edgecolor="#000000", fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/highlighting_growth_decline.png', facecolor='#ffffff')
plt.show()
In [25]:
fig, ax = plt.subplots()

ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize': 12})
fig.autofmt_xdate(rotation = 90)
ax.axvspan(x_indexes[10], x_indexes[0], facecolor=colours[0], alpha=0.5, label='Development of Screen Industry in Scotland')
ax.set_facecolor('white')
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/timeframes_phase_1.png', facecolor='#ffffff')
plt.show()
In [26]:
fig, ax = plt.subplots()

ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize': 12})
fig.autofmt_xdate(rotation=90)
ax.axvspan(x_indexes[20], x_indexes[10], facecolor=colours[1], alpha=0.5, label='Deregulation, changing technology, flexible working')
ax.set_facecolor('white')
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/timeframesphase_2.png', facecolor='#ffffff')
plt.show()
In [27]:
fig, ax = plt.subplots()
ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years)
fig.autofmt_xdate(rotation=90)
ax.axvspan(x_indexes[30], x_indexes[20], facecolor=colours[2], alpha=0.5, label='Decline of industry')
ax.set_facecolor('white')

ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/timeframesphase_3.png', facecolor='#ffffff')
plt.show()
In [28]:
fig, ax = plt.subplots()

ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Freelancers', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years)
ax.axvspan(x_indexes[40], x_indexes[30], facecolor=colours[3], alpha=0.5, label='Increased Production, New Broadcasters')
fig.autofmt_xdate(rotation=90)
ax.set_facecolor('white')
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/timeframesphase_4.png', facecolor='#ffffff')
plt.show()

Total Entries per Year

In [29]:
fig, ax = plt.subplots()
width=0.8
ax.bar(plot_years, basic_totals_crew, width=width, color=colours[5], label='Total Freelancers Per Directory')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=plot_years)
fig.autofmt_xdate(rotation=90)

ax.legend(fontsize=14)
plt.tight_layout()
plt.savefig('Outputs/Final/listings_by_year.png', facecolor='#ffffff')
plt.show()

Showing Growth Alongside Historical Events

In [30]:
x_indexes = list(range(len(plot_years)))

fig, ax = plt.subplots()
ax.plot(x_indexes, basic_totals_crew, color='#2E0014', label='Total Entries', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years, {'fontsize': 12})
fig.autofmt_xdate(rotation=90)
ax.axvline(x=x_indexes[4], linewidth=3, color=colours[5], label='Channel 4 Starts')
ax.axvline(x=x_indexes[16], linewidth=3, color=colours[6], label='Tartan Shorts Launched')
ax.axvline(x=x_indexes[21], linewidth=3, color=colours[7], label='STV Crisis')
ax.axvline(x=x_indexes[28], linewidth=3, color=colours[8], label='Financial Crisis')
ax.axvline(x=x_indexes[39], linewidth=3, color=colours[9], label='Film Bang Website Relaunched')
ax.set_facecolor('white')
ax.legend(fontsize=14, frameon=1)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/workers_by_year_key_dates.png', facecolor='#ffffff')
plt.show()
In [31]:
at_leasts = []
df_year_exclusions = pd.DataFrame(columns=at_leasts)

for i in range(1, 41):
    string = f'AL {i}'
    at_leasts.append(string)
    df_year_exclusions[string] = np.where((df['No of Yrs'] > (i -1)), 1, np.nan)
In [32]:
at_least_vals = []
for col in at_leasts:
    at_least_vals.append(df_year_exclusions[col].value_counts().values[0])
print(at_least_vals)
[3632, 2512, 1982, 1627, 1380, 1163, 1035, 914, 813, 716, 631, 563, 509, 455, 408, 357, 326, 291, 260, 233, 207, 180, 150, 136, 115, 96, 79, 70, 59, 45, 35, 31, 25, 21, 14, 7, 4, 2, 1, 1]
In [33]:
x_vals = range(1,41)
width=0.8
fig, ax = plt.subplots()
ax.bar(x_vals, at_least_vals, color=colours[2], width = width, label='At least x years listing length')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=x_vals)
ax.legend(fontsize=14)
plt.tight_layout()
plt.savefig('Outputs/Final/at_least_x_listing_length.png', facecolor='#ffffff')
plt.show()

Departments

Figures for number of people in each department. Departments were allocated according to keywords found in role descriptions. There are 536 unique roles given throughout the data (Role 1). Film Bang used to offer a free field for crew to define their department or role. The current post 2019 website offers a dropdown list of options to choose from when signing up. There are 124 options to choose from. We have grouped roles into the categories listed below based on __ .

In [34]:
# Unique values in the Role 1 Column
df['Role 1'].nunique()
Out[34]:
536
In [35]:
df['Role 1 Category'].value_counts()
Out[35]:
Production         816
Camera             634
Art                470
Direction          282
Producer           242
Post-Production    226
Sound              205
Hair & Make-Up     201
Costume            154
Music               86
Support             83
Script              79
Casting             76
Construction        51
Special FX          22
Trainee              6
Name: Role 1 Category, dtype: int64

Percentages for above figures

In [36]:
df['Role 1 Category'].value_counts(normalize=True).apply(format_percent)
#df['Role 1 Category'].value_counts(normalize=True).apply(lambda x: format((x*100), '.0f'))
Out[36]:
Production         22
Camera             17
Art                13
Direction           8
Producer            7
Post-Production     6
Sound               6
Hair & Make-Up      6
Costume             4
Music               2
Support             2
Script              2
Casting             2
Construction        1
Special FX          1
Trainee             0
Name: Role 1 Category, dtype: object

Has Consitent Role

This is a figure for where, for each entry, the Role 1, Role 2, Role 3 columns fall into the same category or not. Same category = consistent. Differing categories across 1, 2, 3 = not consistent. We are trying to determine if people are listing in different departments, spreading their net wide so to speak. Or not.

False = Director/Producer, Producer/Director

In [37]:
# Shows figures for those whose roles are consistent over the 3 columns (including empty values for 2 and 3), 
# and those where the roles differ by department
df['Has Consistent Role'].value_counts(dropna=False)
Out[37]:
True     3093
NaN       469
False      72
Name: Has Consistent Role, dtype: int64
In [38]:
# Percentages for the above
df['Has Consistent Role'].value_counts(normalize=True, dropna=False).apply(format_percent)
Out[38]:
True     85
NaN      13
False     2
Name: Has Consistent Role, dtype: object
In [39]:
filt = df.groupby(['Role 1 Category'])
filt['Has Consistent Role'].value_counts(dropna=False)
Out[39]:
Role 1 Category  Has Consistent Role
Art              True                   429
                 NaN                     41
Camera           True                   580
                 NaN                     54
Casting          True                    72
                 NaN                      4
Construction     True                    51
Costume          True                   140
                 NaN                     14
Direction        True                   161
                 NaN                    100
                 False                   21
Hair & Make-Up   True                   195
                 NaN                      6
Music            True                    83
                 NaN                      3
Post-Production  True                   201
                 NaN                     25
Producer         NaN                    110
                 True                    81
                 False                   51
Production       True                   750
                 NaN                     66
Script           True                    59
                 NaN                     20
Sound            True                   196
                 NaN                      9
Special FX       True                    17
                 NaN                      5
Support          True                    72
                 NaN                     11
Trainee          True                     6
Name: Has Consistent Role, dtype: int64
In [40]:
# Percentages of the above
filt = df.groupby(['Role 1 Category'])
filt['Has Consistent Role'].value_counts(normalize=True, dropna=False).apply(format_percent)
Out[40]:
Role 1 Category  Has Consistent Role
Art              True                    91
                 NaN                      9
Camera           True                    91
                 NaN                      9
Casting          True                    95
                 NaN                      5
Construction     True                   100
Costume          True                    91
                 NaN                      9
Direction        True                    57
                 NaN                     35
                 False                    7
Hair & Make-Up   True                    97
                 NaN                      3
Music            True                    97
                 NaN                      3
Post-Production  True                    89
                 NaN                     11
Producer         NaN                     45
                 True                    33
                 False                   21
Production       True                    92
                 NaN                      8
Script           True                    75
                 NaN                     25
Sound            True                    96
                 NaN                      4
Special FX       True                    77
                 NaN                     23
Support          True                    87
                 NaN                     13
Trainee          True                   100
Name: Has Consistent Role, dtype: object

In the above data we can see the departments Direction and Producer show the greatest instances of people having different roles across Role Categories 1, 2 & 3

In [41]:
# Data
depts = df['Role 1 Category'].value_counts().index.tolist()
nums = df['Role 1 Category'].value_counts().tolist()
In [42]:
print(depts)
print(nums)
['Production', 'Camera', 'Art', 'Direction', 'Producer', 'Post-Production', 'Sound', 'Hair & Make-Up', 'Costume', 'Music', 'Support', 'Script', 'Casting', 'Construction', 'Special FX', 'Trainee']
[816, 634, 470, 282, 242, 226, 205, 201, 154, 86, 83, 79, 76, 51, 22, 6]
In [43]:
tot = sum(nums)
In [44]:
nums_p_cents = []
for i in nums:
    x = (i / tot) * 100
    x = round(x)
    nums_p_cents.append(x)
In [45]:
print('Percentages', nums_p_cents)
Percentages [22, 17, 13, 8, 7, 6, 6, 6, 4, 2, 2, 2, 2, 1, 1, 0]
In [46]:
survey_dept_p_cents = [20, 15, 12, 5, 7, 5, 4, 0, 3, 0, 0, 2, 0, 2, 0, 0]
In [47]:
width=0.8
fig, ax = plt.subplots()
ax.bar(depts, nums_p_cents, color=colours[3], width = width, label='% of Film Bang Listings by Department')
ax.bar(depts, survey_dept_p_cents, color=colours[4], width = width, label="% of Survey Responses by Department")

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/percentages_dept_correlation.png', facecolor='#ffffff')
plt.show()
In [48]:
width=0.8
fig, ax = plt.subplots()
ax.bar(depts, nums, color=colours[3], width = width, label='Listings by Department')
for i in range(len(depts)):
    ax.text(x = depts[i], y = nums[i], s = nums[i], size = 15, horizontalalignment='center')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/department_role_category_v.png', facecolor='#ffffff')
plt.show()
In [49]:
# Plot
width=0.8
fig, ax = plt.subplots()
ax.barh(depts, nums, color=colours[3], label='Listings by Department')
for i in range(len(depts)):
    ax.text(x = nums[i], y = depts[i], s = nums[i], size = 15, verticalalignment='center', horizontalalignment='left')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/department_role_category_h.png', facecolor='#ffffff')
plt.show()

Production - Breakdown of this department given its size

In [50]:
filt = (df['Role 1 Category'] == 'Production')
production = df[filt]
df_production = pd.DataFrame(production)
In [51]:
df_production.loc[production['Role 1'].str.contains('Unit|Director|ProductionAssistant|Engineer|Floor|Script|Autocue|Location|Property|Scout|Runner|Newcomer|Production Assistant|Stagehand|Stage Hand|Studio|Production Executive|Drapes'), 'Role 1 SubCategory'] = 'Location'
df_production.loc[production['Role 1'].str.contains('Line|Researcher|Production Manager|Secretary|Accountant|Payroll|Production Co-ordinator|Production Co-Ordinator|Office|Publicist'), 'Role 1 SubCategory'] = 'Admin'
df_production.loc[production['Role 1'].str.contains('Driver|Pilot|Transport'), 'Role 1 SubCategory'] = 'Transport'
In [52]:
breakdown = production['Role 1 SubCategory'].value_counts().array
labels = ['Location', 'Admin', 'Transport']

fig, ax = plt.subplots()
ax.pie(breakdown,  labels=labels, textprops={'fontsize': 14}, pctdistance=0.85,
        labeldistance=1.2, colors=colours, shadow=False,
        startangle=45, autopct='%1.0f%%', wedgeprops={'edgecolor':'black'})
ax.set_title('')
ax.text(.0,.0,' ', fontsize=14, ha='right')

circle=plt.Circle( (0,0), 0.7, color='white')
p=plt.gcf()
p.gca().add_artist(circle)
plt.tight_layout()
plt.savefig('Outputs/Final/production_sub_categories_pie_chart_ring.png', facecolor='#ffffff')
plt.show()

Growth

In [53]:
# Create new df for storing filtered values
structure = {
    'Year':[],
    'Direction':[],
    'Music':[],
    'Producer':[],
    'Construction':[],
    'Costume':[],
    'Hair & Make-Up':[],
    'Sound':[],
    'Post-Production':[],
    'Casting':[],
    'Support':[],
    'Art':[],
    'Camera':[],
    'Production':[],
    'Script':[],
    'Special FX':[]
}

df_roles = pd.DataFrame(structure)

for column in df.loc[:, '2020':'1976']:
    # filter Role 1 Category for entries of 'Direction'
    filt1 = (df[column] == column) & (df['Role 1 Category'] == 'Direction')
    # apply filter
    direction = df[filt1]
    # count items in dataframe oject
    direction_count = len(direction.index)

    filt2 = (df[column] == column) & (df['Role 1 Category'] == 'Music')
    music = df[filt2]
    music_count = len(music.index)

    filt3 = (df[column] == column) & (df['Role 1 Category'] == 'Producer')
    producer = df[filt3]
    producer_count = len(producer.index)

    filt4 = (df[column] == column) & (df['Role 1 Category'] == 'Construction')
    construction = df[filt4]
    construction_count = len(construction.index)

    filt5 = (df[column] == column) & (df['Role 1 Category'] == 'Costume')
    costume = df[filt5]
    costume_count = len(costume.index)

    filt6 = (df[column] == column) & (df['Role 1 Category'] == 'Hair & Make-Up')
    hair = df[filt6]
    hair_count = len(hair.index)

    filt7 = (df[column] == column) & (df['Role 1 Category'] == 'Sound')
    sound = df[filt7]
    sound_count = len(sound.index)

    filt8 = (df[column] == column) & (df['Role 1 Category'] == 'Post-Production')
    postp = df[filt8]
    postp_count = len(postp.index)

    filt9 = (df[column] == column) & (df['Role 1 Category'] == 'Casting')
    casting = df[filt9]
    casting_count = len(casting.index)

    filt10 = (df[column] == column) & (df['Role 1 Category'] == 'Support')
    support = df[filt10]
    support_count = len(support.index)

    filt11 = (df[column] == column) & (df['Role 1 Category'] == 'Art')
    art = df[filt11]
    art_count = len(art.index)

    filt12 = (df[column] == column) & (df['Role 1 Category'] == 'Camera')
    camera = df[filt12]
    camera_count = len(camera.index)

    filt13 = (df[column] == column) & (df['Role 1 Category'] == 'Production')
    production = df[filt13]
    production_count = len(production.index)

    filt14 = (df[column] == column) & (df['Role 1 Category'] == 'Script')
    script = df[filt14]
    script_count = len(script.index)

    filt15 = (df[column] == column) & (df['Role 1 Category'] == 'Special FX')
    special = df[filt15]
    special_count = len(special.index)

    df_roles = df_roles.append({
        'Year': column,
        'Direction': direction_count,
        'Music': music_count,
        'Producer': producer_count,
        'Construction': construction_count,
        'Costume': costume_count,
        'Hair & Make-Up': hair_count,
        'Sound': sound_count,
        'Post-Production': postp_count,
        'Casting': casting_count,
        'Support': support_count,
        'Art': art_count,
        'Camera': camera_count,
        'Production': production_count,
        'Script': script_count,
        'Special FX': special_count
    }, ignore_index=True)
#df_roles
In [54]:
years = df_roles['Year']
producers = df_roles['Producer']
construction = df_roles['Construction']
direction = df_roles['Direction']
costume = df_roles['Costume']
hairs = df_roles['Hair & Make-Up']
sound = df_roles['Sound']
postp = df_roles['Post-Production']
casting = df_roles['Casting']
support = df_roles['Support']
art = df_roles['Art']
camera = df_roles['Camera']
production = df_roles['Production']
script = df_roles['Script']
special = df_roles['Special FX']

x_indexes = list(range(len(years)))
x_indexes.reverse()
In [55]:
linewidth=3
fig, ax = plt.subplots()
ax.plot(x_indexes, camera, color=colours[8], label='Camera', linewidth=linewidth)
ax.plot(x_indexes, production, color=colours[9], label='Production', linewidth=linewidth)
ax.plot(x_indexes, art, color=colours[7], label='Art', linewidth=linewidth)
ax.plot(x_indexes, direction, color=colours[5], label='Direction', linewidth=linewidth)
ax.plot(x_indexes, producers, color=colours[0], label='Producers', linewidth=linewidth)
ax.plot(x_indexes, sound, color=colours[3], label='Sound', linewidth=linewidth)
ax.plot(x_indexes, hairs, color=colours[2], label='Hair & Make-Up', linewidth=linewidth)
ax.plot(x_indexes, postp, color=colours[4], label='Post-Production', linewidth=linewidth)
ax.plot(x_indexes, costume, color=colours[1], label='Costume', linewidth=linewidth)
ax.plot(x_indexes, script, color=colours[8], label='Script', linewidth=linewidth)
ax.plot(x_indexes, support, color=colours[6], label='Support', linewidth=linewidth)
ax.plot(x_indexes, construction, color=colours[9], label='Construction', linewidth=linewidth)
ax.plot(x_indexes, casting, color=colours[5], label='Casting', linewidth=linewidth)
ax.plot(x_indexes, special, color=colours[2], label='Special FX', linewidth=linewidth)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(years)
fig.autofmt_xdate(rotation=90)
ax.set_facecolor('white')
ax.legend(fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/numbers_in_departmet_by_year.png', facecolor='#ffffff')
plt.show()
In [56]:
fig, ax = plt.subplots()
ax.plot(x_indexes, camera, color=colours[8], label='Camera', linewidth=linewidth)
ax.plot(x_indexes, production, color=colours[9], label='Production', linewidth=linewidth)
ax.plot(x_indexes, art, color=colours[7], label='Art', linewidth=linewidth)
ax.plot(x_indexes, producers, color=colours[0], label='Producers', linewidth=linewidth)
ax.plot(x_indexes, sound, color=colours[3], label='Sound', linewidth=linewidth)
ax.plot(x_indexes, postp, color=colours[4], label='Post-Production', linewidth=linewidth)
ax.plot(x_indexes, hairs, color=colours[2], label='Hair & Make-Up', linewidth=linewidth)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(years)
fig.autofmt_xdate(rotation=90)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/departments_high_growth.png', facecolor='#ffffff')
plt.show()
In [57]:
fig, ax = plt.subplots()
ax.plot(x_indexes, construction, color=colours[9], label='Construction', linewidth=linewidth)
ax.plot(x_indexes, costume, color=colours[1], label='Costume', linewidth=linewidth)
ax.plot(x_indexes, casting, color=colours[5], label='Casting', linewidth=linewidth)
ax.plot(x_indexes, support, color=colours[6], label='Support', linewidth=linewidth)
ax.plot(x_indexes, script, color=colours[2], label='Script', linewidth=linewidth)
ax.plot(x_indexes, special, color=colours[3], label='Special FX', linewidth=linewidth)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(years)
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/low_growth_depts.png', facecolor='#ffffff')
plt.show()

Production - Top 10 Roles , example

In [58]:
filt_production = (df['Role 1 Category'] == 'Production')
production_filtered = df[filt_production]
production_filtered['Role 1'].value_counts().head(10)
Out[58]:
Runner                             138
Production Manager                  86
Production Assistant                85
Researcher                          75
Production Co-Ordinator             72
Location Manager                    40
Script Supervisor                   38
Production Accountant               24
Location Assistant                  23
Assistant Production Accountant     22
Name: Role 1, dtype: int64
In [59]:
# Get list of departments
depts_for_top_10 = df['Role 1 Category'].value_counts().index.tolist()

def csv_maker(val, ind, dep):
    df_to_csv = pd.DataFrame(columns=('Role', 'Number'))
    # Add to DataFrame
    df_to_csv['Role'] = ind
    df_to_csv['Number'] = val
    df_to_csv
    # Export to CSV
    dep = dep.lower()
    df_to_csv.to_csv(f'Outputs/CSV/{dep}_top_10.csv', index=False)
    #print(dep)
    #print(df_to_csv)

def department_top_10(dept_array):
    for i in depts_for_top_10:
        filt = (df['Role 1 Category'] == i)
        dept_filtered = df[filt]
        values = dept_filtered['Role 1'].value_counts().head(10).values.tolist()
        indexes = dept_filtered['Role 1'].value_counts().head(10).index.tolist()
        csv_maker(values, indexes, i)

department_top_10(depts_for_top_10)

Location

Urban / Rural

The dataset is compromised in terms of location data. We take the last known postcode of entries that include a postcode and check the postcode against a list of rural postcodes.

In [60]:
# Where no postcode is given
df['Postcode1'].isna().value_counts(dropna=False)
Out[60]:
False    3104
True      530
Name: Postcode1, dtype: int64
In [61]:
# Number of Entries with Rural Postcode
df['Rural'].value_counts(dropna=False)
Out[61]:
NaN     3479
True     155
Name: Rural, dtype: int64
In [62]:
# Entries with Urban Postcode / Entries with Rural Postcode 
filt = df['Postcode1'].notna()
postcodes = df[filt]
postcodes['Rural'].value_counts(dropna=False)
Out[62]:
NaN     2949
True     155
Name: Rural, dtype: int64
In [63]:
# Percentage of Urban to Rural Postcodes 
filt = df['Postcode1'].notna()
postcodes = df[filt]
postcodes['Rural'].value_counts(normalize=True, dropna=False).apply(format_percent)
Out[63]:
NaN     95
True     5
Name: Rural, dtype: object
In [64]:
# Percentage of all postcode values inc nan to Rural Postcodes 
df['Rural'].value_counts(normalize=True, dropna=False).apply(format_percent)
Out[64]:
NaN     96
True     4
Name: Rural, dtype: object

Data for location is inconsistent and not tracked accurately over time. We've reduced this to a look at what is effectively a 'last known postcode area' for each entry.

In [65]:
# The 20 most popular postcodes:
p_code_values = df['Postcode1'].value_counts().values.tolist()
p_code_indexes = df['Postcode1'].value_counts().index.tolist()
In [66]:
df_p_codes = pd.DataFrame(columns=('PostCode', 'Number'))
df_p_codes['Postcode'] = p_code_indexes
df_p_codes['Number'] = p_code_values
# Export to CSV
#df_p_codes.to_csv('Outputs/CSV/df_p_codes.csv', index=False)
df_p_codes.head(20)
Out[66]:
PostCode Number Postcode
0 NaN 203 G12
1 NaN 144 G20
2 NaN 131 G41
3 NaN 124 G11
4 NaN 108 G42
5 NaN 91 EH6
6 NaN 91 EH3
7 NaN 89 G
8 NaN 63 G51
9 NaN 57 EH7
10 NaN 52 EH
11 NaN 51 G31
12 NaN 47 G3 8
13 NaN 46 G61
14 NaN 45 EH10
15 NaN 43 EH1
16 NaN 37 EH4
17 NaN 36 EH9
18 NaN 36 G3 6
19 NaN 36 G3 7
In [67]:
# Number of crew in the 20 most popular postcodes
df['Postcode1'].value_counts().head(20).sum()
Out[67]:
1530

Edinburgh & Glasgow Postcodes

In [68]:
df['PostCodeStrip'] = df['Postcode1']
In [69]:
if df['PostCodeStrip'].notna:
    df['PostCodeStrip'] = df['PostCodeStrip'].str[:2]
df['PostCodeStrip'].head(10)
Out[69]:
0    NaN
1     DD
2     EH
3     G1
4     G1
5    NaN
6     G4
7    NaN
8    NaN
9    NaN
Name: PostCodeStrip, dtype: object
In [70]:
print(df['PostCodeStrip'].value_counts().tolist())
[697, 434, 358, 203, 191, 172, 132, 124, 108, 93, 89, 68, 64, 57, 56, 47, 46, 44, 21, 21, 17, 14, 10, 8, 5, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
In [71]:
p_code_nums = [697, 434, 358, 203, 191, 172, 132, 108, 89, 17, 1, 1]
sum_p_codes = sum(p_code_nums)
print('Glasgow and Edinburgh Postcodes: ' + str(sum_p_codes))
Glasgow and Edinburgh Postcodes: 2403

Trainee Program

In [72]:
# Trainee data float to string
df['Trainee prog'] = df['Trainee prog'].fillna(-1)
df['Trainee prog'] = df['Trainee prog'].astype(int)
df['Trainee prog'] = df['Trainee prog'].astype(str)
df['Trainee prog'] = df['Trainee prog'].replace('-1', np.nan)
In [73]:
df['Trainee prog'].count()
Out[73]:
116
In [74]:
# Show number of trainees on Trainee Program in a given year
df['Trainee prog'].value_counts()
Out[74]:
2007    12
2009     9
2004     8
2006     7
2002     7
1997     6
2000     6
1994     6
2019     5
1996     5
1990     4
1991     4
1993     4
1987     3
1978     3
1982     3
1986     3
1984     3
2011     3
1989     3
1983     3
2013     3
1979     2
2015     2
1981     1
1988     1
Name: Trainee prog, dtype: int64
In [75]:
trainee_index = df['Trainee prog'].value_counts().index.tolist()
trainee_values = df['Trainee prog'].value_counts().values.tolist()
trainee_index.insert(0, 'Year of Trainee Program')
df_trainees = pd.DataFrame(columns=trainee_index)

trainee_values.insert(0, 'Numbers')
df_trainees.loc[1] = trainee_values
# Export to CSV
df_trainees.to_csv('Outputs/CSV/df_trainees.csv', index=False)

Get Median Value for Numbers of Trainees

In [76]:
print('Average number of trainees over years the programs ran: ' + str(df['Trainee prog'].value_counts().median())
      + ' trainees')
Average number of trainees over years the programs ran: 3.5 trainees

Trainees self declaring as Trainees and not on the Trainee Program

In [77]:
# Filter Department for 'Trainee', return number of years in directory for each
filt = df['Role 1 Category'] == 'Trainee'
trainees = df[filt]
trainees['No of Yrs']
Out[77]:
75      1.0
461     1.0
468     1.0
627     1.0
1016    3.0
2537    1.0
Name: No of Yrs, dtype: float64

Length of career of trainees ordered by the year they became trainees

In [78]:
years = df.groupby(['Trainee prog'])
years['No of Yrs'].value_counts()
Out[78]:
Trainee prog  No of Yrs
1978          6.0          1
              36.0         1
              37.0         1
1979          7.0          1
              32.0         1
1981          29.0         1
1982          3.0          1
              26.0         1
              28.0         1
1983          3.0          1
              21.0         1
              36.0         1
1984          2.0          1
              14.0         1
              34.0         1
1986          17.0         1
              19.0         1
              34.0         1
1987          8.0          1
              12.0         1
              32.0         1
1988          7.0          1
1989          6.0          1
              10.0         1
              13.0         1
1990          2.0          1
              4.0          1
              9.0          1
              30.0         1
1991          1.0          1
              4.0          1
              5.0          1
              29.0         1
1993          11.0         2
              5.0          1
              7.0          1
1994          3.0          1
              4.0          1
              6.0          1
              7.0          1
              8.0          1
              22.0         1
1996          2.0          2
              10.0         1
              21.0         1
              23.0         1
1997          1.0          1
              3.0          1
              4.0          1
              7.0          1
              8.0          1
              16.0         1
2000          7.0          2
              1.0          1
              5.0          1
              8.0          1
              19.0         1
2002          2.0          3
              1.0          1
              3.0          1
              5.0          1
              11.0         1
2004          4.0          2
              7.0          2
              1.0          1
              3.0          1
              5.0          1
              6.0          1
2006          1.0          4
              3.0          2
              13.0         1
2007          1.0          3
              5.0          3
              6.0          2
              2.0          1
              3.0          1
              12.0         1
              13.0         1
2009          1.0          4
              2.0          2
              3.0          1
              4.0          1
              10.0         1
2011          1.0          1
              3.0          1
              4.0          1
2013          2.0          1
              8.0          1
              9.0          1
2015          3.0          2
2019          1.0          5
Name: No of Yrs, dtype: int64
In [79]:
years = df.groupby(['Trainee prog'])
years['No of Yrs'].mean()
Out[79]:
Trainee prog
1978    26.333333
1979    19.500000
1981    29.000000
1982    19.000000
1983    20.000000
1984    16.666667
1986    23.333333
1987    17.333333
1988     7.000000
1989     9.666667
1990    11.250000
1991     9.750000
1993     8.500000
1994     8.333333
1996    11.600000
1997     6.500000
2000     7.833333
2002     3.714286
2004     4.625000
2006     3.285714
2007     5.000000
2009     2.777778
2011     2.666667
2013     6.333333
2015     3.000000
2019     1.000000
Name: No of Yrs, dtype: float64

New DataFrame from selected columns

In [80]:
# Revise this, it's wrong 
trainees = df[['Role 1 Category', 'Gender', 'Trainee prog', 'No of Yrs']].dropna()
trainees = trainees.sort_values(['Role 1 Category','Trainee prog'], ascending=False)
trainees
Out[80]:
Role 1 Category Gender Trainee prog No of Yrs
3067 Support Female 1987 12.0
3437 Sound Male 2007 3.0
1060 Sound Male 2006 1.0
937 Sound Male 2004 6.0
1550 Sound Male 2000 7.0
759 Sound Male 1997 4.0
2046 Sound Male 1996 21.0
585 Sound Male 1989 10.0
3614 Sound Male 1987 32.0
3570 Sound Male 1986 19.0
1493 Sound Male 1984 34.0
1833 Production Male 2019 1.0
2302 Production Female 2019 1.0
935 Production Female 2015 3.0
1075 Production Female 2013 9.0
198 Production Unknown 2011 4.0
442 Production Male 2011 3.0
382 Production Female 2009 2.0
588 Production Female 2009 1.0
767 Production Male 2009 1.0
1029 Production Female 2009 4.0
2444 Production Male 2009 10.0
2684 Production Male 2009 1.0
584 Production Female 2007 1.0
1693 Production Male 2007 5.0
2002 Production Female 2007 2.0
2482 Production Male 2007 1.0
2795 Production Female 2007 13.0
2945 Production Female 2007 1.0
1328 Production Male 2006 3.0
1538 Production Male 2006 13.0
2430 Production Female 2006 1.0
2753 Production Female 2006 1.0
1735 Production Male 2004 3.0
1798 Production Male 2004 1.0
2091 Production Female 2004 7.0
2539 Production Male 2004 4.0
1757 Production Male 2002 2.0
3609 Production Female 2002 2.0
1469 Production Unknown 2000 5.0
2798 Production Male 2000 1.0
1113 Production Female 1997 8.0
3583 Production Female 1996 10.0
268 Production Female 1994 7.0
586 Production Female 1991 1.0
463 Production Female 1987 8.0
2182 Production Male 1982 3.0
381 Production Male 1981 29.0
2519 Producer Female 2002 5.0
278 Producer Female 1984 14.0
3619 Producer Male 1982 26.0
218 Post-Production Male 2013 2.0
829 Post-Production Male 2007 6.0
1024 Post-Production Male 2002 3.0
2333 Post-Production Female 2000 8.0
932 Post-Production Female 1997 1.0
363 Post-Production Male 1994 22.0
2262 Post-Production Male 1991 5.0
1183 Post-Production Male 1990 4.0
2688 Post-Production Female 1988 7.0
480 Post-Production Female 1986 34.0
1939 Post-Production Male 1983 3.0
1558 Post-Production Male 1978 6.0
2950 Post-Production Female 1978 37.0
2343 Hair & Make-Up Male 2009 3.0
3095 Hair & Make-Up Female 2004 7.0
2134 Hair & Make-Up Female 1997 16.0
1568 Hair & Make-Up Female 1993 11.0
2749 Direction Female 2013 8.0
3428 Direction Female 2002 1.0
427 Direction Male 1996 2.0
3573 Direction Female 1994 3.0
1763 Direction Female 1993 5.0
2464 Direction Male 1993 7.0
640 Direction Female 1991 4.0
868 Direction Female 1989 13.0
337 Direction Male 1979 7.0
832 Costume Female 2019 1.0
1187 Costume Female 2007 6.0
2419 Costume Female 2006 1.0
3291 Costume Female 2004 5.0
2201 Costume Female 1997 3.0
3077 Costume Female 1996 2.0
1948 Camera Male 2019 1.0
3010 Camera Male 2019 1.0
3387 Camera Unknown 2015 3.0
2628 Camera Male 2009 2.0
545 Camera Female 2007 5.0
1775 Camera Male 2007 12.0
1190 Camera Male 2006 3.0
86 Camera Male 2002 11.0
1632 Camera Male 2000 19.0
262 Camera Female 1996 23.0
3415 Camera Male 1994 6.0
2283 Camera Female 1993 11.0
2062 Camera Male 1991 29.0
1408 Camera Female 1990 2.0
2816 Camera Male 1990 9.0
1285 Camera Male 1989 6.0
3532 Camera Female 1986 17.0
2083 Camera Female 1984 2.0
207 Camera Female 1983 21.0
2635 Camera Male 1983 36.0
476 Camera Male 1982 28.0
454 Camera Male 1979 32.0
1417 Camera Male 1978 36.0
1138 Art Female 2011 1.0
3537 Art Female 2009 1.0
1002 Art Female 2007 5.0
177 Art Male 2004 4.0
3393 Art Female 2002 2.0
633 Art Female 2000 7.0
2429 Art Male 1997 7.0
2296 Art Male 1994 4.0
3581 Art Male 1994 8.0
3069 Art Male 1990 30.0

Trainee Gender

In [81]:
trainees['Gender'].value_counts()
Out[81]:
Male       59
Female     54
Unknown     3
Name: Gender, dtype: int64
In [82]:
trainees['Gender'].value_counts(normalize=True).apply(format_percent)
Out[82]:
Male       51
Female     47
Unknown     3
Name: Gender, dtype: object

Trainee Gender & Department

In [83]:
dept_grp = trainees.groupby(['Role 1 Category'])
dept_grp['Gender'].value_counts()
Out[83]:
Role 1 Category  Gender
Art              Female      5
                 Male        5
Camera           Male       15
                 Female      7
                 Unknown     1
Costume          Female      6
Direction        Female      6
                 Male        3
Hair & Make-Up   Female      3
                 Male        1
Post-Production  Male        8
                 Female      5
Producer         Female      2
                 Male        1
Production       Female     19
                 Male       16
                 Unknown     2
Sound            Male       10
Support          Female      1
Name: Gender, dtype: int64
In [84]:
dept_grp = trainees.groupby(['Role 1 Category'])
dept_grp['Gender'].value_counts(normalize=True)
Out[84]:
Role 1 Category  Gender
Art              Female     0.500000
                 Male       0.500000
Camera           Male       0.652174
                 Female     0.304348
                 Unknown    0.043478
Costume          Female     1.000000
Direction        Female     0.666667
                 Male       0.333333
Hair & Make-Up   Female     0.750000
                 Male       0.250000
Post-Production  Male       0.615385
                 Female     0.384615
Producer         Female     0.666667
                 Male       0.333333
Production       Female     0.513514
                 Male       0.432432
                 Unknown    0.054054
Sound            Male       1.000000
Support          Female     1.000000
Name: Gender, dtype: float64

Number of trainees in each department by year

In [85]:
year_grp = trainees.groupby(['Trainee prog'])
year_grp['Role 1 Category'].value_counts()
Out[85]:
Trainee prog  Role 1 Category
1978          Post-Production    2
              Camera             1
1979          Camera             1
              Direction          1
1981          Production         1
1982          Camera             1
              Producer           1
              Production         1
1983          Camera             2
              Post-Production    1
1984          Camera             1
              Producer           1
              Sound              1
1986          Camera             1
              Post-Production    1
              Sound              1
1987          Production         1
              Sound              1
              Support            1
1988          Post-Production    1
1989          Camera             1
              Direction          1
              Sound              1
1990          Camera             2
              Art                1
              Post-Production    1
1991          Camera             1
              Direction          1
              Post-Production    1
              Production         1
1993          Direction          2
              Camera             1
              Hair & Make-Up     1
1994          Art                2
              Camera             1
              Direction          1
              Post-Production    1
              Production         1
1996          Camera             1
              Costume            1
              Direction          1
              Production         1
              Sound              1
1997          Art                1
              Costume            1
              Hair & Make-Up     1
              Post-Production    1
              Production         1
              Sound              1
2000          Production         2
              Art                1
              Camera             1
              Post-Production    1
              Sound              1
2002          Production         2
              Art                1
              Camera             1
              Direction          1
              Post-Production    1
              Producer           1
2004          Production         4
              Art                1
              Costume            1
              Hair & Make-Up     1
              Sound              1
2006          Production         4
              Camera             1
              Costume            1
              Sound              1
2007          Production         6
              Camera             2
              Art                1
              Costume            1
              Post-Production    1
              Sound              1
2009          Production         6
              Art                1
              Camera             1
              Hair & Make-Up     1
2011          Production         2
              Art                1
2013          Direction          1
              Post-Production    1
              Production         1
2015          Camera             1
              Production         1
2019          Camera             2
              Production         2
              Costume            1
Name: Role 1 Category, dtype: int64

Trainee Plots

In [86]:
#data
df_trainee = df['Trainee prog'].value_counts().sort_index()
trainee_year_index = df_trainee.index.tolist()
trainee_values = df_trainee.values

fig, ax = plt.subplots()
width=0.7
ax.bar(trainee_year_index, trainee_values, color=colours[2], width = width, label='Trainees')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=trainee_year_index)
fig.autofmt_xdate(rotation=90)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/trainees_per_year.png', facecolor='#ffffff')
plt.show()

Career Lengths & Longevity

Basic figures for Career Lengths

In [87]:
# Number of Years in Directory, number of entries present for that duration
df['No of Yrs'].value_counts()
Out[87]:
1.0     1120
2.0      530
3.0      355
4.0      247
5.0      217
6.0      128
7.0      121
8.0      101
9.0       97
10.0      85
11.0      68
13.0      54
12.0      54
15.0      51
14.0      47
17.0      35
18.0      31
16.0      31
22.0      30
19.0      27
21.0      27
20.0      26
24.0      21
25.0      19
26.0      17
29.0      14
23.0      14
28.0      11
30.0      10
27.0       9
34.0       7
35.0       7
32.0       6
31.0       4
33.0       4
36.0       3
37.0       2
38.0       1
41.0       1
Name: No of Yrs, dtype: int64

Career length grouped into 5 year bins

In [88]:
bins = pd.cut(df['No of Yrs'], [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])
df.groupby(bins)['No of Yrs'].agg(['count'])
Out[88]:
count
No of Yrs
(0, 5] 2469
(5, 10] 532
(10, 15] 274
(15, 20] 150
(20, 25] 111
(25, 30] 61
(30, 35] 28
(35, 40] 6
(40, 45] 1
In [89]:
indexes = ['1 year','2-3 years', '4-6 years', '7-20 years', '20-41 years']
In [90]:
# Data
bins = pd.cut(df['No of Yrs'], [0, 1, 3, 6, 20, 41])
values = df.groupby(bins)['No of Yrs'].agg(['count'])
print(values)
           count
No of Yrs
(0, 1]      1120
(1, 3]       885
(3, 6]       592
(6, 20]      828
(20, 41]     207
In [91]:
# Simplify Data
simple_values = (values.values)
value_list = [ item for elem in simple_values for item in elem]

colors = sns.cubehelix_palette(start=2, rot=1, dark=0.3)
fig, ax = plt.subplots()
ax.pie(value_list,  labels=indexes, labeldistance=1.2, pctdistance=0.85, textprops={'fontsize': 14}, shadow=False,
        startangle=90, autopct='%1.0f%%', colors=colours, wedgeprops={'edgecolor':'black'})
ax.set_title('', fontsize=16, color='#635DC6', weight='bold')
ax.text(.0,.0,' ', fontsize=14, ha='right')
circle=plt.Circle( (0,0), 0.7, color='white')
p=plt.gcf()
p.gca().add_artist(circle)
plt.tight_layout()
plt.savefig('Outputs/Final/longevity_bins_percentages_ring.png', facecolor='#ffffff')
plt.show()
In [92]:
# Data
indexes = ['1-5 years','6-10 years', '11-15 years', '16-20 years', '>20']
bins = pd.cut(df['No of Yrs'], [0, 5, 10, 15, 20, 41])
values = df.groupby(bins)['No of Yrs'].agg(['count'])
print(values)
           count
No of Yrs
(0, 5]      2469
(5, 10]      532
(10, 15]     274
(15, 20]     150
(20, 41]     207
In [93]:
simple_values = (values.values)
value_list = [ item for elem in simple_values for item in elem]
# Plot
colors = sns.cubehelix_palette(start=2, rot=1, dark=0.3)
fig, ax = plt.subplots()
ax.pie(value_list,  labels=indexes, labeldistance=1.3, pctdistance=1.1, textprops={'fontsize': 14}, shadow=False,
        startangle=90, autopct='%1.0f%%', colors=colours, wedgeprops={'edgecolor':'black'})
ax.set_title('', fontsize=16, color='#635DC6', weight='bold')
ax.text(.0,.0,' ', fontsize=14, ha='right')
circle=plt.Circle( (0,0), 0.7, color='white')
p=plt.gcf()
p.gca().add_artist(circle)
plt.tight_layout()
plt.savefig('Outputs/Final/longevity_bins_percentages_ring_directory_1.png', facecolor='#ffffff')
plt.show()

Personnel in Directory for more than 1 year

In [94]:
filtered = df['No of Yrs'].apply(lambda x: x > 1)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(format_percent))
True     2512
False    1122
Name: No of Yrs, dtype: int64

Percentages
True     69
False    31
Name: No of Yrs, dtype: object

Personnel in Directory for more than 6 years

In [95]:
filtered = df['No of Yrs'].apply(lambda x: x > 6)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(format_percent))
False    2599
True     1035
Name: No of Yrs, dtype: int64

Percentages
False    72
True     28
Name: No of Yrs, dtype: object

Personnel in Directory for more than 15 years

In [96]:
filtered = df['No of Yrs'].apply(lambda x: x > 15)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(format_percent))
False    3277
True      357
Name: No of Yrs, dtype: int64

Percentages
False    90
True     10
Name: No of Yrs, dtype: object

Personnel in Directory for more than 20 years

In [97]:
filtered = df['No of Yrs'].apply(lambda x: x > 20)
print(filtered.value_counts())
print()
print('Percentages')
print(filtered.value_counts(normalize=True).apply(format_percent))
False    3427
True      207
Name: No of Yrs, dtype: int64

Percentages
False    94
True      6
Name: No of Yrs, dtype: object

Median Career Length All Entries

In [98]:
df['No of Yrs'].median()
Out[98]:
3.0

Median Career Length for Entries in for more than 1 Year

In [99]:
filt = (df['No of Yrs'] > 1)
df2 = df[filt]
df2['No of Yrs'].median()
Out[99]:
5.0

Career Lengths by Department

In [100]:
# Showing Long Careers & Breakdown by Department
lengths = df.groupby(['No of Yrs'])
lengths['Role 1 Category'].value_counts().tail(107)
Out[100]:
No of Yrs  Role 1 Category
20.0       Art                6
           Producer           5
           Sound              5
           Direction          3
           Production         3
           Camera             2
           Costume            1
           Hair & Make-Up     1
21.0       Camera             8
           Sound              4
           Art                3
           Post-Production    3
           Production         3
           Producer           2
           Costume            1
           Direction          1
           Music              1
           Script             1
22.0       Art                7
           Camera             7
           Costume            3
           Production         3
           Direction          2
           Hair & Make-Up     2
           Post-Production    2
           Sound              2
           Casting            1
           Producer           1
23.0       Camera             5
           Art                2
           Production         2
           Direction          1
           Music              1
           Producer           1
           Script             1
           Sound              1
24.0       Camera             6
           Production         5
           Art                3
           Post-Production    2
           Costume            1
           Hair & Make-Up     1
           Producer           1
           Script             1
           Support            1
25.0       Art                5
           Sound              3
           Camera             2
           Hair & Make-Up     2
           Production         2
           Costume            1
           Direction          1
           Post-Production    1
           Producer           1
           Support            1
26.0       Camera             6
           Art                3
           Hair & Make-Up     3
           Direction          2
           Construction       1
           Producer           1
           Production         1
27.0       Camera             4
           Producer           2
           Direction          1
           Production         1
           Sound              1
28.0       Producer           3
           Camera             2
           Sound              2
           Art                1
           Post-Production    1
           Production         1
           Script             1
29.0       Camera             5
           Direction          3
           Production         2
           Sound              2
           Hair & Make-Up     1
           Producer           1
30.0       Camera             5
           Costume            2
           Art                1
           Post-Production    1
           Sound              1
31.0       Camera             2
           Post-Production    1
           Sound              1
32.0       Camera             3
           Sound              2
           Hair & Make-Up     1
33.0       Camera             2
           Sound              2
34.0       Camera             2
           Post-Production    2
           Art                1
           Direction          1
           Sound              1
35.0       Camera             3
           Art                2
           Costume            1
           Support            1
36.0       Camera             3
37.0       Art                1
           Post-Production    1
38.0       Sound              1
41.0       Camera             1
Name: Role 1 Category, dtype: int64
In [101]:
# Showing Short Careers Breakdown by Department Numbers
lengths = df.groupby(['No of Yrs'])
lengths['Role 1 Category'].value_counts().head(104)
Out[101]:
No of Yrs  Role 1 Category
1.0        Production         315
           Camera             167
           Art                148
           Direction           65
           Post-Production     62
           Sound               61
           Hair & Make-Up      57
           Costume             51
           Producer            43
           Music               35
           Support             31
           Casting             28
           Construction        24
           Script              17
           Special FX          11
           Trainee              5
2.0        Production         139
           Camera              88
           Art                 62
           Post-Production     35
           Hair & Make-Up      32
           Direction           31
           Producer            31
           Music               21
           Sound               20
           Casting             19
           Support             18
           Costume             14
           Script              13
           Construction         4
           Special FX           2
3.0        Production          92
           Camera              42
           Art                 34
           Direction           33
           Producer            28
           Hair & Make-Up      24
           Post-Production     22
           Costume             17
           Script              17
           Sound               13
           Casting              9
           Support              9
           Construction         7
           Music                5
           Special FX           2
           Trainee              1
4.0        Production          59
           Camera              43
           Direction           25
           Art                 24
           Producer            19
           Hair & Make-Up      17
           Costume             14
           Post-Production     13
           Sound               11
           Music                7
           Script               6
           Support              4
           Casting              3
           Construction         2
5.0        Production          50
           Camera              40
           Art                 35
           Producer            16
           Direction           15
           Post-Production     15
           Hair & Make-Up      12
           Costume             10
           Sound               10
           Music                5
           Casting              3
           Construction         2
           Support              2
           Script               1
           Special FX           1
6.0        Production          24
           Camera              22
           Art                 18
           Sound               12
           Direction           11
           Post-Production     10
           Costume              8
           Producer             8
           Hair & Make-Up       4
           Construction         3
           Script               3
           Music                2
           Support              2
           Casting              1
7.0        Production          22
           Camera              19
           Art                 18
           Direction           15
           Producer            12
           Hair & Make-Up       8
           Post-Production      7
           Costume              6
           Sound                6
           Script               4
           Casting              1
           Construction         1
           Music                1
           Special FX           1
Name: Role 1 Category, dtype: int64

Visualizing Career Lengths

The following two charts show Longevity in the directory with frequency of entries with the given 'career' lengths plotted along the x axis. We chart the pattern for all entries, and for entries who are present for more than 1 year. While entries lasting only 1 year are at first glance an anomaly, being by far the biggest single group in the dataset, but declaring the least information given their short presence, we see the same pattern (scaling down) with the removal of those entries from the chart.

In [102]:
# Data
keys_list = df['No of Yrs'].value_counts().index.tolist()
values = df['No of Yrs'].value_counts().values
median = df['No of Yrs'].median()

# Plot
width=0.8
fig, ax = plt.subplots()
ax.bar(keys_list, values, color=colours[2], width = width, label='Crew')
ax.axvline(median, linewidth=3, color='black', label='Median')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=keys_list)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/basic_crew_longevity.png', facecolor='#ffffff')
plt.show()
In [103]:
# Data
filt = (df['No of Yrs'] > 1)
df2 = df[filt]
keys_list = df2['No of Yrs'].value_counts().index.tolist()
values = df2['No of Yrs'].value_counts().values
median = df2['No of Yrs'].median()

# Plot
width=0.8
fig, ax = plt.subplots()
ax.bar(keys_list, values, color=colours[4], width = width, label='Crew')
ax.axvline(median, linewidth=3, color='black', label='Median')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=keys_list)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/basic_crew_longevity>1yr.png', facecolor='#ffffff')
plt.show()

Key window of career lengths: between 4 and 20 years

We are interested in the factors that allow people to develop longer careers. The following chart shows frequency of career lengths in the entries for between 4 and 20 years. This offers a wide sample of entries who are clearly getting something out of being in the directory

1 Year Entries - Their Departments

In [104]:
filt_1 = (df['No of Yrs'] < 2)
df_1 = df[filt_1]
msg = 'Number in department in for only 1 year, historical'
print(msg)
print(len(msg) *'-')
entries = []
for dept in depts:
    filt_dept = (df_1['Role 1 Category'] == dept)
    df_depts = df_1[filt_dept]
    count = df_depts['No of Yrs'].value_counts().values
    entries.append(count[0])
    #print(dept, count)
print(entries)
Number in department in for only 1 year, historical
---------------------------------------------------
[315, 167, 148, 65, 43, 62, 61, 57, 51, 35, 31, 17, 28, 24, 11, 5]
In [105]:
depts2 = depts
depts2.pop(-1)
entries.pop(-1)
Out[105]:
5
In [106]:
width=0.7
color = sns.cubehelix_palette(start=2, dark=0.5)

fig, ax = plt.subplots()
ax.bar(depts2, entries, color=colours[5], width = width, label='Crew')

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')

ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/freelancers_by_department_in_for_1_year.png', facecolor='#ffffff')
plt.show()

Long Careers In Film Bang

In [107]:
# Data
filt5 = (df['No of Yrs'] > 20 )
df4 = df[filt5]
keys_list = df4['No of Yrs'].value_counts().index.tolist()
values = df4['No of Yrs'].value_counts().values
# Plot
color = sns.cubehelix_palette(start=1, dark=0.5)
width=0.7
fig, ax = plt.subplots()
ax.bar(keys_list, values, color=colours[6], width = width, label='Crew')

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=keys_list)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/freelancer_longevity_>20_yrs.png', facecolor='#ffffff')
plt.show()

Career Lengths in Departments - Bins

In [108]:
import seaborn as sns
import matplotlib.pylab as plt

totals = df['Role 1 Category'].value_counts().array
depts = df['Role 1 Category'].value_counts().index.array
indexes = ['1 year','2-3 years', '4-6 years', '7-11 years', '12-18 years', '19-29 years', '30-42 years']
color = sns.color_palette('colorblind', as_cmap=True)
In [109]:
def bins_plotter(_index, department, med, values):
    width = 0.7
    total = str(totals[_index])
    label = department +'\n'+'Dept total = '+total+'\n' +'Median= '+med+' years'

    fig, ax = plt.subplots()
    ax.bar(indexes, values, color=color[9], width = width, label=label)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title('')
    ax.set_facecolor('white')
    ax.legend(fontsize=14)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig('Outputs/Final/crew_longevity_by_year_prime_bins_bar_'+department+'.png', facecolor='#ffffff')
    plt.show()
In [110]:
def career_bins(series, dataframe):
    for index, dept in enumerate(series):
        filt_dept = (dataframe['Role 1 Category'] == dept)
        df_depts = dataframe[filt_dept]
        bins = pd.cut(df_depts['No of Yrs'], [0, 1, 3, 6, 11, 18, 29, 42])
        values = df_depts.groupby(bins)['No of Yrs'].agg(['count'])
        simple_values = (values.values)
        value_list = [ item for elem in simple_values for item in elem]
        median = str(df_depts['No of Yrs'].median())
        # call bins_plotter function
        bins_plotter(index, dept, median, value_list)
In [111]:
career_bins(depts, df)

Career Longevity / Turnover

Using the calculation for career length each year (a count produced for each year an entry is present in the directory) chart career lengths over time

In [112]:
years = [*range(1,42)]
In [113]:
df_master = pd.DataFrame({'Years': years})
for year in df.loc[:,'2020':'1976']:
    # filter out NAN values
    filtyear = df[year].notna()
    dfyear = df[filtyear]
    # get data
    values_year = dfyear['Yr Cnt '+year].value_counts().array
    keys_year = dfyear['Yr Cnt '+year].value_counts().index
    # second dataframe
    df_year = pd.DataFrame({f'{year} Keys':keys_year, f'{year} Values':values_year})
    # join dataframes 
    df_master = df_master.join(df_year.set_index(f'{year} Keys'), on='Years')
longevity_data = df_master.set_index('Years')
In [114]:
longevity_data.iloc[20].median()
Out[114]:
10.0
In [115]:
print(longevity_data.iloc[19].median())
print(longevity_data.iloc[20].median())
print(longevity_data.iloc[21].median())
10.5
10.0
8.0
In [116]:
# drop_vals used in this code were read from another chart, not coded here. 
# write a function to retrieve this data.
dropout_years = [1978,1979,1981,1982,1984,1986,1987,1988,1989,1990,1991, 1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020]
df_one_year_dropouts = pd.DataFrame(columns=dropout_years)
drop_vals = [12, 7, 3, 3,  2,  39, 6,  5,  9,  23, 24, 29, 10, 13, 35, 18, 16, 20, 36, 23, 38, 28,11,30, 44, 35,30, 32, 29, 68,15, 46, 19, 30, 18, 35, 16, 32, 40, 40]
df_one_year_dropouts.loc[1] = drop_vals
print(df_one_year_dropouts)

df_one_year_dropouts2 = pd.DataFrame()
df_one_year_dropouts2['Years'] = dropout_years
df_one_year_dropouts2['Dropouts'] = drop_vals
df_one_year_dropouts2.to_csv('Outputs/CSV/To_DataWrapper/df_dropouts.csv', index=False)
  1978 1979 1981 1982 1984 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995  \
1   12    7    3    3    2   39    6    5    9   23   24   29   10   13   35

  1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010  \
1   18   16   20   36   23   38   28   11   30   44   35   30   32   29   68

  2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
1   15   46   19   30   18   35   16   32   40   40
In [117]:
df_one_year_dropouts.loc[1].median()
Out[117]:
23.5
In [118]:
import matplotlib.pylab as pylt

plot_years = df.loc[:, '2020':'1976']
hm_colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)

pylt.figure(figsize=(22,10))
ax = sns.heatmap(longevity_data, linewidth=0.3, cmap=hm_colours, annot=True, fmt=".0f")
pylt.yticks(rotation=0)
ax.xaxis.set_ticks_position('top')
ax.set_xticklabels(plot_years,rotation=90)
ax.set_facecolor('white')
pylt.xlabel('')
pylt.ylabel('Length of Career in Years')

#plt.title(f'Film Bang Freelancers Length of Career over Time' , fontsize=16, color='#635DC6', weight='bold')
#plt.figtext(.0,.0,'Fig. 1.3', fontsize=14, ha='right')
plt.savefig(f"Outputs/Final/career_longevity_basic_1.png", facecolor='#ffffff', dpi=500)
pylt.show()
In [119]:
# Convert to html table for screen readers
#longevity_data.to_html()

The chart above shows overall presence in the listings over the history of Film Bang.

Each horizontal row contains numbers of people in the data with that given length of career (top level is whoever has a 1 year presence in the listings). Columns are for each year. So we can track how many people have a given career length in each year.

We can see a bulge of darker colour emerging from the early nineties, reaching a peak in the early 2000s and dropping off to early 90s density from 2012 onwards.

We can also see an initial high density influx of listings in the first year, and a comparative drop off of 'new starts' until 1984 when the number of new entries jumps. We can see this cohort lingering on in the listings in the form of a darker trail leading down and left from the figure '89' at the top of the 1984 column.

Department Longevity

In [120]:
depts = df['Role 1 Category'].value_counts().index.array
In [121]:
df_master = pd.DataFrame({'Years': years})
In [122]:
import matplotlib.pylab as plt

plot_years = df.loc[:, '2020':'1976']

def plot(dept):
    hm_colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
    plt.figure(figsize=(22,10))
    ax = sns.heatmap(longevity_data_2, linewidth=0.3, cmap=hm_colours, annot=True, fmt=".0f")
    plt.yticks(rotation=0)
    ax.xaxis.set_ticks_position('top')
    ax.set_facecolor('white')
    ax.set_xticklabels(plot_years,rotation=90)
    plt.xlabel('')
    plt.ylabel('Number of Years in Directory')
    plt.title(f'{dept}', fontsize=14)
    #plt.figtext(.0,.0,'Fig. #', fontsize=14, ha='right')
    #plt.legend(dept)
    plt.savefig(f"Outputs/Final/career_longevity_{dept}_2.png", facecolor="#ffffff", dpi=500)
    plt.show()
In [123]:
# Loop through departments
for i in depts:
    #initialise a new df_master dataframe for each pass through departments
    years = [*range(1,42)]
    df_master = pd.DataFrame({'Years': years})
    # filter for department of given iteration
    deptfilt = (df['Role 1 Category'] == i)
    df_dept = df[deptfilt]
    # loop through year columns and filter out NAN values
    for year in df.loc[:, '2020':'1976']:
        filtyear = df_dept[year].notna()
        dfyear = df_dept[filtyear]
        # get numbers
        values_year = dfyear['Yr Cnt '+year].value_counts().array
        keys_year = dfyear['Yr Cnt '+year].value_counts().index
        # put them in a new mini dataframe
        df_year = pd.DataFrame({f'{year} Keys':keys_year, f'{year} Values':values_year})
        # join the mini dataframes to a temporary master dataframe
        df_master = df_master.join(df_year.set_index(f'{year} Keys'), on='Years')
    # assign the new master dataframe to data variable and plot it 
    # plot function will expect data to be the current dataframe
    df_master = df_master.set_index('Years')
    longevity_data_2 = df_master

    plot(i)

The charts above show density of career lengths over time. The darker the colour, the more entries. For each incremental year that cohort moves down one square and left one square. Under Production for example there are 6 entries under production in 1976. In 1978 there are 4 entries. Meaning 2 dropped out and 4 stayed in. In 1978 4 people have a two year career and 4 people have a 1 year career. In 2011 6 entries show a career length of 10 years. In 2012 there are 5 entries with a career of 11 years (meaning one dropped out). And so on.

The darker the trail leading down and left, the more sustainable the career during that period. So we see careers starting in the mid 90s lasting comparitively well. Patches of light colour relatively high up in the charts indicate (relatively) high numbers of drop outs.

Presence in the listings after 10 years is comparatively rare so seeing any people lasting this long is significant in its own right

In [124]:
long_series = longevity_data_2.iloc[0]
long_series.median()
Out[124]:
1.0
In [125]:
longevity_data_2.iloc[0].median()
Out[125]:
1.0

Dept data total and after 1 year

In [126]:
depts = df['Role 1 Category'].value_counts().index.tolist()
nums = df['Role 1 Category'].value_counts().tolist()
In [127]:
filt_1_yr = (df['No of Yrs'] > 1)
df_more_than_1_year = df[filt_1_yr]
nums_more_than_1_year = df_more_than_1_year['Role 1 Category'].value_counts().tolist()
In [128]:
width=0.8
fig, ax = plt.subplots()
ax.bar(depts, nums, color=colours[3], width = width, label='All listings')
ax.bar(depts, nums_more_than_1_year, color=colours[4], width=width, label="Listings in Directory for more than 1 year")
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=depts)
fig.autofmt_xdate(rotation=45)
plt.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/department_role_category_comparison.png', facecolor='#ffffff')
plt.show()

Gender

Gender is inferred. The Film Bang directory does not ask people to declare gender, or other personal information. The gender presented here is inferred from the names of entries (done during data capture, prior to anonymization of the data). Where gender is recorded here as 'unknown' it refers to a name that might conventionally refer to both a man or woman.

In [129]:
df['Gender'].value_counts()
Out[129]:
Male       1982
Female     1542
Unknown     110
Name: Gender, dtype: int64
In [130]:
df['Gender'].value_counts(normalize=True).apply(format_percent)
Out[130]:
Male       55
Female     42
Unknown     3
Name: Gender, dtype: object
In [131]:
role_grp = df.groupby(['Role 1 Category'])
role_grp['Gender'].value_counts().tail(300)
Out[131]:
Role 1 Category  Gender
Art              Female     229
                 Male       221
                 Unknown     20
Camera           Male       537
                 Female      80
                 Unknown     17
Casting          Female      44
                 Male        29
                 Unknown      3
Construction     Male        51
Costume          Female     130
                 Male        18
                 Unknown      6
Direction        Male       182
                 Female      93
                 Unknown      7
Hair & Make-Up   Female     183
                 Male        15
                 Unknown      3
Music            Male        73
                 Female      11
                 Unknown      2
Post-Production  Male       161
                 Female      56
                 Unknown      9
Producer         Female     121
                 Male       119
                 Unknown      2
Production       Female     491
                 Male       300
                 Unknown     25
Script           Male        50
                 Female      28
                 Unknown      1
Sound            Male       177
                 Female      22
                 Unknown      6
Special FX       Male        17
                 Female       5
Support          Female      48
                 Male        27
                 Unknown      8
Trainee          Male         5
                 Female       1
Name: Gender, dtype: int64
In [132]:
ct = {
    'Year':[],
    'Male':[],
    'Female':[],
    'Unknown':[],
}
gender_ct = pd.DataFrame(ct)
In [133]:
for column in df.loc[:, '2020':'1976']:

    filt1 = (df[column] == column) & (df['Gender'] == 'Female')
    new_df = df[filt1]
    filt2 = (df[column] == column) & (df['Gender'] == 'Male')
    new_df2 = df[filt2]
    filt3 = (df[column] == column) & (df['Gender'] == 'Unknown')
    new_df3 = df[filt3]

    women = len(new_df.index)
    men = len(new_df2.index)
    no_gender = len(new_df3.index)
    gender_ct = gender_ct.append({'Year': column, 'Male': men,'Female': women, 'Unknown': no_gender}, ignore_index=True)
In [134]:
# Plot Data
gender_data = gender_ct
years_x = gender_data['Year']
no_g_y = gender_data['Unknown']
male_y = gender_data['Male']
female_y = gender_data['Female']
In [135]:
width=0.7
fig, ax = plt.subplots()

bars = np.add(male_y, female_y).tolist()
ax.bar(years_x, no_g_y, bottom=bars, color=colours[0], width=width, label='Gender Unknown')
ax.bar(years_x, male_y, bottom=female_y, color=colours[1], width=width, label='Male')
ax.bar(years_x, female_y, color=colours[2], width=width, label='Female')

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_xticks(ticks=years_x)
ax.set_xticklabels(years_x,{'fontsize':12})
fig.autofmt_xdate(rotation=90, ha='right')

fig.gca().invert_xaxis()
ax.set_facecolor('white')

ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Final/basic_gender_chart_dp.png", facecolor='#ffffff')
plt.show()
In [136]:
gender_ct['Div'] = 100/(gender_ct['Male'] + gender_ct['Female'] + gender_ct['Unknown'])
gender_ct['m_perc'] = gender_ct['Div'] * gender_ct['Male']
gender_ct['f_perc'] = gender_ct['Div'] * gender_ct['Female']
gender_ct['u_perc'] = gender_ct['Div'] * gender_ct['Unknown']
In [137]:
gender_ct
years_x = gender_ct['Year']
no_g_y = gender_ct['u_perc']
male_y = gender_ct['m_perc']
female_y = gender_ct['f_perc']
width=0.7

bars = np.add(male_y, female_y).tolist()
plt.bar(years_x, no_g_y, bottom=bars, color=colours[0], width=width, label='Gender Unknown')
plt.bar(years_x, male_y, bottom=female_y, color=colours[1], width=width, label='Male')
plt.bar(years_x, female_y, color=colours[2], width=width, label='Female')

plt.xlabel('')
plt.ylabel('')

plt.xticks(ticks=years_x, rotation=90)

plt.gca().invert_xaxis()

ax = plt.axes()
ax.set_facecolor('white')

plt.legend(frameon=1, fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Final/basic_gender_chart_percent.png", facecolor='#ffffff')

plt.show()
In [138]:
# Create CSV for basic stats: crew, crew gender, companies ADD Workshops
crew_totals = [688, 644, 636, 627, 634, 675, 627, 668, 664, 692, 639, 726, 708, 759, 786, 793, 806, 782, 728, 710, 713, 676, 651, 585, 535, 492, 476, 397, 361, 355, 313, 266, 206, 210, 197, 200, 119, 104, 85, 83, 66]
production_companies = [50, 51, 49, 49, 55, 56, 61, 60, 65, 64, 62, 77, 79, 78, 70, 72, 74, 78, 81, 87, 85, 82, 81, 72, 62, 60, 65, 56, 53, 48, 39, 35, 27, 34, 29, 26, 22, 16, 15, 13, 11]
gender_ct['Total Crew'] = crew_totals
gender_ct['Companies'] = production_companies
# Export to CSV
gender_ct.to_csv('Outputs/CSV/To_DataWrapper/gender_ct.csv', index=False)
In [139]:
gender_by_year = {
    'Year':[],
    'Male':[],
    'Female':[],
    'Unknown':[],
}
df_gender = pd.DataFrame(gender_by_year)
In [140]:
def gender_dept_plot(d, years_x, no_g_y, male_y, female_y, index):
    bars = np.add(male_y, female_y).tolist()
    plt.bar(0, 0, color='none', label=f"{i}")
    plt.bar(years_x, no_g_y, bottom=bars, color=colours[0], width=width, label='Gender Unknown')
    plt.bar(years_x, male_y, bottom=female_y, color=colours[1], width=width, label='Male')
    plt.bar(years_x, female_y, color=colours[2], width=width, label='Female')

    plt.xlabel('')
    plt.ylabel('')
    plt.title(f'')
    ax = plt.axes()
    ax.set_facecolor('white')
    plt.xticks(ticks=years_x, rotation=90, fontsize=14)
    plt.yticks(fontsize=14)
    plt.gca().invert_xaxis()
    plt.legend(fontsize=16)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f"Outputs/Final/Gender/{index}_department_workers_by_gender.png", facecolor='#ffffff')
    plt.show()
In [141]:
# Refactor this to break into smaller parts: data manipulation and plots. 
# Create plot function with a dept parameter
# Call plot function within the loop, passing the dept / column variable as argument. 

width = 0.7
depts = df['Role 1 Category'].dropna().unique()

for i in depts:
    df_gender = df_gender[0:0]
    filt = (df['Role 1 Category'] == i)
    department = df[filt]
    for column in department.loc[:, '2020':'1976']:
        filt1 = (department[column] == column) & (department['Gender'] == 'Female')
        new_df = department[filt1]
        filt2 = (department[column] == column) & (department['Gender'] == 'Male')
        new_df2 = department[filt2]
        filt3 = (department[column] == column) & (department['Gender'] == 'Unknown')
        new_df3 = department[filt3]
        women = len(new_df.index)
        men = len(new_df2.index)
        no_gender = len(new_df3.index)
        df_gender = df_gender.append({'Year': column, 'Male': men,'Female': women, 'Unknown': no_gender}, ignore_index=True)

    #data = df_gender
    years_x = df_gender['Year']
    no_g_y = df_gender['Unknown']
    male_y = df_gender['Male']
    female_y = df_gender['Female']
    # call plot function
    gender_dept_plot(depts, years_x, no_g_y, male_y, female_y, i)

Career Length by Gender

In [142]:
g = df.groupby(['No of Yrs'])
g_count = g['Gender'].value_counts()
print(g_count)
No of Yrs  Gender
1.0        Male       589
           Female     482
           Unknown     49
2.0        Male       283
           Female     227
           Unknown     20
3.0        Male       176
           Female     168
           Unknown     11
4.0        Male       130
           Female     109
           Unknown      8
5.0        Male       110
           Female     105
           Unknown      2
6.0        Male        75
           Female      50
           Unknown      3
7.0        Male        62
           Female      59
8.0        Male        64
           Female      34
           Unknown      3
9.0        Male        57
           Female      36
           Unknown      4
10.0       Male        51
           Female      34
11.0       Male        40
           Female      26
           Unknown      2
12.0       Male        32
           Female      21
           Unknown      1
13.0       Female      27
           Male        27
14.0       Female      24
           Male        22
           Unknown      1
15.0       Male        31
           Female      19
           Unknown      1
16.0       Male        21
           Female      10
17.0       Male        26
           Female       9
18.0       Male        20
           Female      11
19.0       Male        17
           Female      10
20.0       Male        16
           Female      10
21.0       Male        18
           Female       7
           Unknown      2
22.0       Male        17
           Female      12
           Unknown      1
23.0       Female       7
           Male         7
24.0       Female      11
           Male        10
25.0       Male        10
           Female       8
           Unknown      1
26.0       Male        10
           Female       7
27.0       Male         9
28.0       Male         9
           Female       2
29.0       Male         8
           Female       6
30.0       Male         8
           Female       2
31.0       Male         4
32.0       Male         5
           Female       1
33.0       Female       2
           Male         2
34.0       Male         5
           Female       2
35.0       Male         4
           Female       3
36.0       Male         3
37.0       Female       1
           Male         1
38.0       Male         1
41.0       Male         1
Name: Gender, dtype: int64

Data

In [143]:
# Create Dataframe with Yrs column
df_clg = pd.DataFrame({'Yrs': range(1, 43)})

# Filtering
m_filt = (df['Gender'] == 'Male')
df_m = df[m_filt]
f_filt = (df['Gender'] == 'Female')
df_f = df[f_filt]
u_filt = (df['Gender'] == 'Unknown')
df_u = df[u_filt]

# parse data
m_length = df_m.groupby(['No of Yrs'])
m_c_length_values = m_length['Gender'].value_counts().array
m_c_length_keys = m_length['Gender'].value_counts().index.tolist()
m_indexes = [i[0] for i in m_c_length_keys]

f_length = df_f.groupby(['No of Yrs'])
f_c_length_values = f_length['Gender'].value_counts().array
f_c_length_keys = f_length['Gender'].value_counts().index.tolist()
f_indexes = [i[0] for i in f_c_length_keys]

u_length = df_u.groupby(['No of Yrs'])
u_c_length_values = u_length['Gender'].value_counts().array
u_c_length_keys = u_length['Gender'].value_counts().index.tolist()
u_indexes = [i[0] for i in u_c_length_keys]

# Join Data
df_male = pd.DataFrame({'M Yrs':m_indexes, 'M Values':m_c_length_values})
df_clg = df_clg.join(df_male.set_index('M Yrs'), on='Yrs')
df_female = pd.DataFrame({'F Yrs':f_indexes, 'F Values':f_c_length_values})
df_clg = df_clg.join(df_female.set_index('F Yrs'), on='Yrs')
df_unknown = pd.DataFrame({'U Yrs':u_indexes, 'U Values':u_c_length_values})
df_clg = df_clg.join(df_unknown.set_index('U Yrs'), on='Yrs')
#print(df_clg)

Plots

In [144]:
width = 0.3

yrs = df_clg['Yrs']
male = df_clg['M Values']
female = df_clg['F Values']
unknown = df_clg['U Values']

fig, ax = plt.subplots()

ax.bar(yrs + width, unknown, color=colours[0], width=width, label='Gender Unknown', log=True)
ax.bar(yrs, male, color=colours[1], width=width, label='Male',log=True)
ax.bar(yrs - width, female, color=colours[2], width=width, label='Female', log=True)

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_facecolor('white')
ax.set_xticks(ticks=yrs)

ax.legend(loc='upper right', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig("Outputs/Final/gender_longevity_log.png", facecolor='#ffffff')
plt.show()
In [145]:
bins = pd.cut(df_clg['Yrs'], [0, 1, 3, 6, 11, 18, 29, 42])
m_values = male.groupby(bins)
f_values = female.groupby(bins)
u_values = unknown.groupby(bins)
m_out = [i for i in m_values]
f_out = [i for i in f_values]
u_out = [i for i in u_values]
In [146]:
# Data
m_totals = []
f_totals = []
u_totals = []
for i in m_out:
    m_totals.append(i[1].sum())
for i in f_out:
    f_totals.append(i[1].sum())
for i in u_out:
    u_totals.append(i[1].sum())

indexes = ['1 year','2-3 years', '4-6 years', '7-11 years', '12-18 years', '19-29 years', '30-42 years']
width = 0.3
fig, ax = plt.subplots()

ax.bar(indexes, m_totals, color=colours[1], width=width, align='edge', label='Male')
ax.bar(indexes, f_totals, color=colours[2], width=-width, align='edge', label='Female')
ax.bar(indexes, u_totals, color=colours[0], width=width,  label='Unknown Gender')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_facecolor('white')
ax.legend(fontsize=12)
plt.tight_layout()
plt.savefig('Outputs/Final/longevity_gender_crew_bins.png', facecolor='#ffffff')
plt.show()
In [147]:
df_gender_bins = pd.DataFrame(columns=indexes)
# Add to DataFrame
df_gender_bins.loc[1] = m_totals
df_gender_bins.loc[2] = f_totals
df_gender_bins.loc[3] = u_totals
df_gender_bins
Out[147]:
1 year 2-3 years 4-6 years 7-11 years 12-18 years 19-29 years 30-42 years
1 589.0 459.0 315.0 274.0 179.0 131.0 34.0
2 482.0 395.0 264.0 189.0 121.0 80.0 11.0
3 49.0 31.0 13.0 9.0 3.0 4.0 0.0

Dropouts

In [148]:
depts = df['Role 1 Category'].value_counts().index.tolist()
In [149]:
# Create new column duplicating values from Yr Cnt 1976 
# in order to allow the .diff calculation below to work
# Calculation uses values in column to the left, so there has to be a duplicate column 
# 'to the left' of the first column to be calculated. 
df['Yr Cnt 1976 x'] = df['Yr Cnt 1976']
In [150]:
# Reverse year cols
df = df[['UUID', 'Codes', 'Trainee prog', 'Gender', 'Role 1', 'Role 1 Category',
       'Role 2', 'Role 2 Category', 'Role 3', 'Role 3 Category','Has Consistent Role',
         'Postcode1', 'Rural', 'No of Yrs', '1976', '1978', '1979', '1981', '1982',
         '1984', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
         '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002',
         '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011',
         '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
         'Company', 'Description', 'Yr Cnt 1976 x', 'Yr Cnt 1976', 'Yr Cnt 1978',
         'Yr Cnt 1979', 'Yr Cnt 1981', 'Yr Cnt 1982', 'Yr Cnt 1984', 'Yr Cnt 1986',
         'Yr Cnt 1987', 'Yr Cnt 1988', 'Yr Cnt 1989', 'Yr Cnt 1990', 'Yr Cnt 1991',
         'Yr Cnt 1992', 'Yr Cnt 1993', 'Yr Cnt 1994', 'Yr Cnt 1995', 'Yr Cnt 1996',
         'Yr Cnt 1997', 'Yr Cnt 1998', 'Yr Cnt 1999', 'Yr Cnt 2000', 'Yr Cnt 2001',
         'Yr Cnt 2002', 'Yr Cnt 2003', 'Yr Cnt 2004', 'Yr Cnt 2005', 'Yr Cnt 2006',
         'Yr Cnt 2007', 'Yr Cnt 2008', 'Yr Cnt 2009', 'Yr Cnt 2010', 'Yr Cnt 2011',
         'Yr Cnt 2012', 'Yr Cnt 2013', 'Yr Cnt 2014', 'Yr Cnt 2015', 'Yr Cnt 2016',
         'Yr Cnt 2017', 'Yr Cnt 2018', 'Yr Cnt 2019', 'Yr Cnt 2020']]
df.head(1)
Out[150]:
UUID Codes Trainee prog Gender Role 1 Role 1 Category Role 2 Role 2 Category Role 3 Role 3 Category Has Consistent Role Postcode1 Rural No of Yrs 1976 1978 1979 1981 1982 1984 1986 1987 1988 1989 1990 ... Yr Cnt 1996 Yr Cnt 1997 Yr Cnt 1998 Yr Cnt 1999 Yr Cnt 2000 Yr Cnt 2001 Yr Cnt 2002 Yr Cnt 2003 Yr Cnt 2004 Yr Cnt 2005 Yr Cnt 2006 Yr Cnt 2007 Yr Cnt 2008 Yr Cnt 2009 Yr Cnt 2010 Yr Cnt 2011 Yr Cnt 2012 Yr Cnt 2013 Yr Cnt 2014 Yr Cnt 2015 Yr Cnt 2016 Yr Cnt 2017 Yr Cnt 2018 Yr Cnt 2019 Yr Cnt 2020
0 b8da19b3-1a0d-4dd6-9cd1-1cf69c8bd615 NaN NaN Unknown Location Scout Production Missing NaN Missing NaN True NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1

1 rows × 99 columns

We start with two sets of year columns. The basic year columns containing strings such as '2020' or '1999'. Presence or absence in the directory is represented by the presence or absence of a year string in those columns. Yr Cnt Year columns present a running tally of the number of years a person has been in the directory. Due to the way the code runs, the total years (career length) is present in all subsequent columns regardless of when the person left the directory. The code below creates two things: a dataframe indicating the difference between one year and the next (inserting a 1.0 or a 0.0 value depending on whether the year tally increases or stays the same. Then it multiplies the 'Yr Cnt Year' cols by the difference values: (year tally 1 = year tally), (year tally 0 = 0).

In [151]:
# Assign values from Yr Cnt 1976 col to a variable. 
# This allows retrieval of the original values once 
# the .diff calculation is run.
temp = df['Yr Cnt 1976']
temp.value_counts()
Out[151]:
0    3568
1      66
Name: Yr Cnt 1976, dtype: int64
In [152]:
# check those values against the duplicate column
if temp.value_counts().values.tolist() == df['Yr Cnt 1976 x'].value_counts().values.tolist(): print("Values Equal")
Values Equal
In [153]:
# create diff dataframe
df_diff = df.loc[:, 'Yr Cnt 1976 x':'Yr Cnt 2020'].diff(axis='columns')
#df_diff
In [154]:
# Multiply the year count by the diff value to either get the total or 0 
In [155]:
# Re-assign values from original Yr Cnt 1976 column to the current one. 
# diff operation was inserting NAN values in Yr Cnt 1976 col because the 
# column previous to it was an index.
df_diff['Yr Cnt 1976'] = temp
In [156]:
# check operation has worked
if df_diff['Yr Cnt 1976'].value_counts().values.tolist() == df['Yr Cnt 1976'].value_counts().values.tolist(): print("Values Equal")
Values Equal
In [157]:
# Multiply selected cols in dataframe by diff dataframe & show results
df_results = df.loc[:, 'Yr Cnt 1976 x':'Yr Cnt 2020'].multiply(df_diff)
df_results.head()
Out[157]:
Yr Cnt 1976 x Yr Cnt 1976 Yr Cnt 1978 Yr Cnt 1979 Yr Cnt 1981 Yr Cnt 1982 Yr Cnt 1984 Yr Cnt 1986 Yr Cnt 1987 Yr Cnt 1988 Yr Cnt 1989 Yr Cnt 1990 Yr Cnt 1991 Yr Cnt 1992 Yr Cnt 1993 Yr Cnt 1994 Yr Cnt 1995 Yr Cnt 1996 Yr Cnt 1997 Yr Cnt 1998 Yr Cnt 1999 Yr Cnt 2000 Yr Cnt 2001 Yr Cnt 2002 Yr Cnt 2003 Yr Cnt 2004 Yr Cnt 2005 Yr Cnt 2006 Yr Cnt 2007 Yr Cnt 2008 Yr Cnt 2009 Yr Cnt 2010 Yr Cnt 2011 Yr Cnt 2012 Yr Cnt 2013 Yr Cnt 2014 Yr Cnt 2015 Yr Cnt 2016 Yr Cnt 2017 Yr Cnt 2018 Yr Cnt 2019 Yr Cnt 2020
0 NaN 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1 NaN 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 2.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 NaN 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 2.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 NaN 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 12.0 13.0 14.0 15.0 16.0 17.0 18.0 19.0 20.0 21.0 22.0 0.0
4 NaN 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 2.0 3.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
In [158]:
## Checking when dropouts occur

This code inserts a string 'year' - where the value in the given year is greater than the following (a dropout year will contain 0) - into a new column to capture the year when an entry drops out. Multiple columns are required because many entries drop out and re enter the directory over the years, sometimes several times.

In [159]:
# Get list of column names
yr_cnt_range = df_results.loc[:, 'Yr Cnt 1976':'Yr Cnt 2020'].columns.tolist()
In [160]:
# Get list of years
yr_range = df.loc[:, '1976':'2020'].columns.tolist()
In [161]:
# Get int version of years list
int_yr_range = []
for i in yr_range:
    int_yr_range.append(int(i))
#print(int_yr_range)
In [162]:
nIYR = len(int_yr_range)
for index in range(1, nIYR):
    i = int_yr_range[index]
    j = str(i)
    df_results['dropout '+ (j[2:])] = np.where((df_results[yr_cnt_range[index-1]] >
              df_results[yr_cnt_range[index]]), i, np.nan)
In [163]:
dropouts = []
In [164]:
# 
for i in yr_range[1:]:
    try:
        dropouts.append((i, df_results['dropout '+i[2:]].value_counts().values[0]))
    except IndexError:
        dropouts.append((i, 0))
In [165]:
df_results['dropout 11'].value_counts().values[0]
Out[165]:
108
In [166]:
dropout_list = []
for i in dropouts:
    dropout_list.append(i[1])
In [167]:
# at a 0 value for 1976 in dropout_list to account for no dropouts in the first year
insert = 0 # index position
dropout_list[insert:insert] = [0]
In [168]:
# Optional put dropouts in new dataframe
df2 = pd.DataFrame({'values':dropout_list})
In [169]:
# Median number of dropouts per year
dropout_median = df2['values'].median()
In [170]:
# Get totals for each year
general_count = []
for column in df.loc[:, '1976':'2020']:
    try:
        step = df[column].value_counts()
        general_count.append(step[0])
    except IndexError:
        general_count.append(0)
print(general_count)
[66, 83, 85, 104, 119, 200, 197, 210, 206, 266, 313, 355, 361, 397, 476, 492, 535, 585, 651, 676, 713, 710, 728, 782, 806, 793, 786, 759, 708, 726, 639, 692, 664, 668, 627, 675, 634, 627, 636, 644, 688]

Calculating New Entries

We can now count the instances of a career length of 1 year and produce stats for the number of new entries per year. The 1 count is based on the presence of a year string in the original dataset, so there's no case of a first year counting as 0.

In [171]:
new_entries = []
for column in df_results.loc[:, 'Yr Cnt 1976':'Yr Cnt 2020']:
    current = df_results[column].isin([1]).sum(axis=0)
    new_entries.append(current)
In [172]:
print(new_entries)
[66, 29, 18, 29, 30, 89, 59, 33, 32, 77, 82, 83, 77, 77, 119, 93, 99, 115, 119, 132, 122, 106, 97, 124, 129, 109, 107, 108, 83, 112, 68, 98, 75, 96, 80, 104, 72, 102, 95, 114, 171]
In [173]:
# Median number of new entries per year
df3 = pd.DataFrame({'values':new_entries})
new_entry_median = df3['values'].median()
print(new_entry_median)
95.0
In [174]:
df2 = pd.DataFrame({'values':dropout_list})
#print(df2)

Where people drop out more than once

In [175]:
df_dropouts = df_results.loc[:, 'dropout 78':'dropout 20']
In [176]:
df['multi dropouts'] = df_dropouts.notna().sum(axis=1)
In [177]:
df['multi dropouts'].head(10)
Out[177]:
0    0
1    1
2    1
3    1
4    1
5    0
6    1
7    0
8    0
9    0
Name: multi dropouts, dtype: int64
In [178]:
df['multi dropouts'].value_counts(dropna=False)
Out[178]:
1    2573
0     529
2     419
3      95
4      14
5       4
Name: multi dropouts, dtype: int64
In [179]:
dropout_tally_list = df['multi dropouts'].value_counts().tolist()
In [180]:
dropout_tally_list
Out[180]:
[2573, 529, 419, 95, 14, 4]
In [181]:
try:
    dropout_tally_list.pop(0)
except IndexError:
        print('Index Error')
try:
    dropout_tally_list.pop(0)
except IndexError:
        print('Index Error')
drop_sum = sum(dropout_tally_list)
In [182]:
total_entries = 3634
d = 100 / total_entries
one_dropout = d * 2573
two_dropouts = d * 419
three_dropouts = d * 95
four_dropouts = d * 14
five_dropouts = d * 4

multi_dropout_total = d * 532

print('One:' , one_dropout , ' %')
print('Two:' , two_dropouts , ' %')
print('Three:' , three_dropouts , ' %')
print('Four:' , four_dropouts , ' %')
print('Five:' , five_dropouts , ' %')
print('Multiple Dropouts:' , multi_dropout_total , ' %')
One: 70.80352228948816  %
Two: 11.529994496422674  %
Three: 2.6141992294991745  %
Four: 0.38525041276829936  %
Five: 0.1100715465052284  %
Multiple Dropouts: 14.639515685195377  %

Plots

In [183]:
plt.rcParams['axes.facecolor']='white'
In [184]:
plot_years = yr_range
In [185]:
fig, ax = plt.subplots()
ax.plot(plot_years, general_count, color="black", label="Total Entries", linewidth=3)
ax.plot(plot_years, dropout_list, color=colours[3], label='Dropouts', linewidth=3)
ax.plot(plot_years, new_entries, color=colours[2], label='New Entries', linewidth=3)

ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=plot_years)
fig.autofmt_xdate(rotation=90)
ax.set_facecolor('white')
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()

plt.savefig('Outputs/Final/turnover_new_entries_dropouts_totals.png', facecolor='#ffffff')
plt.show()
In [186]:
width = 0.5
fig, axs = plt.subplots(2, 1, sharex=True)
neg_dropout_list = [ -x for x in dropout_list]

axs[0].bar(plot_years, new_entries, width=width, color=colours[2], label='New Entries')
axs[0].set_xticks(plot_years)
axs[1].bar(plot_years, neg_dropout_list, width=width, color=colours[3], label='Dropouts')
axs[1].set_xticks(plot_years)

axs[0].legend(fontsize=14)
axs[1].legend(fontsize=14, loc=2, bbox_to_anchor=(0.0,0.9))

plt.xticks(rotation=90)
plt.tight_layout()

plt.subplots_adjust(hspace=0)
axs[1].set_xlabel('')
axs[0].set_ylabel('')
axs[0].yaxis.set_label_coords(-0.05,-0.2)

fig.set_facecolor('white')
axs[0].set_facecolor('white')
axs[1].set_facecolor('white')

ticks =  axs[1].get_yticks()
axs[1].set_yticklabels([int(abs(tick)) for tick in ticks])
axs[0].grid(False)
axs[1].grid(False)
plt.savefig(f'Outputs/Final/turnover_new_entries_dropouts_mirror_bars.png', facecolor='#ffffff')

plt.show()

Dropouts and new entries, while presented above as somewhat symetrical, are not opposing poles. A person can be a new entry only once, in one particular year. A person can drop out and re-enter the directory as many times as they like; those dropouts are all counted and included in the chart above. So there will be more dropouts than new entries regardless. The meaning of a dropout is also ambiguous. Leaving the film bang directory does not mean leaving the industry or failing in the industry.

Dropouts are measured by comparing a value with a (lower) value in the following year. The dropout is located in the year with the low value. So if a person is present in the directory in 2018 and absent in 2019, the dropout is recorded as happening in 2019.

In [187]:
dropouts_index=['New Entries', 'Dropouts']
df_dropouts = pd.DataFrame(columns=plot_years)
# Add to DataFrame
df_dropouts.loc[1] = new_entries
df_dropouts.loc[2] = neg_dropout_list
#df_dropouts.to_html()
In [188]:
#df_dropouts.iloc[1]
In [189]:
# go through each row and create a new column with their dropout year in it. 
df_test = df_results
df_test.replace('%','',regex=True).astype('float')
df_test = df_test.drop(df_test.loc[:,'Yr Cnt 1976 x':'Yr Cnt 2020'].columns, axis = 1)
df_test
df_test["dropout year"] = df_test.sum(axis=1)
df_test.head(10)
Out[189]:
dropout 78 dropout 79 dropout 81 dropout 82 dropout 84 dropout 86 dropout 87 dropout 88 dropout 89 dropout 90 dropout 91 dropout 92 dropout 93 dropout 94 dropout 95 dropout 96 dropout 97 dropout 98 dropout 99 dropout 00 dropout 01 dropout 02 dropout 03 dropout 04 dropout 05 dropout 06 dropout 07 dropout 08 dropout 09 dropout 10 dropout 11 dropout 12 dropout 13 dropout 14 dropout 15 dropout 16 dropout 17 dropout 18 dropout 19 dropout 20 dropout year
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2007.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2007.0
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2002.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2002.0
3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020.0 2020.0
4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2013.0 NaN NaN NaN NaN NaN NaN NaN 2013.0
5 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
6 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1999.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1999.0
7 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
8 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
9 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
In [190]:
df_test.shape
Out[190]:
(3634, 41)
In [191]:
df.shape
Out[191]:
(3634, 100)

Data for Cox Proportional Hazard Modelling

In [192]:
# take the above column and add it to the main df
df_cox = df.join(df_test)
df_cox.head(30)
Out[192]:
UUID Codes Trainee prog Gender Role 1 Role 1 Category Role 2 Role 2 Category Role 3 Role 3 Category Has Consistent Role Postcode1 Rural No of Yrs 1976 1978 1979 1981 1982 1984 1986 1987 1988 1989 1990 ... dropout 97 dropout 98 dropout 99 dropout 00 dropout 01 dropout 02 dropout 03 dropout 04 dropout 05 dropout 06 dropout 07 dropout 08 dropout 09 dropout 10 dropout 11 dropout 12 dropout 13 dropout 14 dropout 15 dropout 16 dropout 17 dropout 18 dropout 19 dropout 20 dropout year
0 b8da19b3-1a0d-4dd6-9cd1-1cf69c8bd615 NaN NaN Unknown Location Scout Production Missing NaN Missing NaN True NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
1 18c1559c-081b-4cbf-bd32-03b6103aa603 NaN NaN Male Camera (Lighting) Camera Missing NaN Missing NaN True DD10 NaN 3.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2007.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2007.0
2 978118a2-08fd-4621-b732-037b592e19b7 NaN NaN Male Editor Post-Production Missing NaN Missing NaN True EH8 NaN 3.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN 2002.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2002.0
3 78597596-2381-4fc8-aa96-faa77cc79de5 NaN NaN Female Costume Designer Costume Wardrobe Assistant Costume Missing NaN True G14 NaN 22.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020.0 2020.0
4 e1bf07fb-079d-4457-a4c7-9d3e784d873e NaN NaN Male Sound Boom Operator Sound Sound Assistant Sound Missing NaN True G12 NaN 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2013.0 NaN NaN NaN NaN NaN NaN NaN 2013.0
5 375204bd-e245-44a3-bf11-d8721aa69fa7 NaN NaN Female Camera 2nd Assistant Camera Missing NaN Missing NaN True NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
6 064372a8-468d-485a-9d58-c8b07d272fc8 NaN NaN Male Camera (Lighting) Camera Missing NaN Missing NaN True G41 NaN 3.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 1999.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1999.0
7 41051e49-8db1-4dd7-b9c5-6e096f00070d NaN NaN Unknown Stunt Co-Ordinator Casting Missing NaN Missing NaN True NaN NaN 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
8 453f96fd-618d-42f1-9549-9abdeaaa9ca0 NaN NaN Unknown Photographer Camera Missing NaN Missing NaN True NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
9 ed55eaf9-b358-4158-adf3-4cb40678d1de NaN NaN Unknown Photographer Camera Missing NaN Missing NaN True NaN NaN 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
10 bf9dbc48-bb4d-4e95-b899-1178e97068b8 NaN NaN Male Construction Construction Standby Carpenter Construction Missing NaN True G3 6 NaN 13.0 NaN NaN NaN NaN NaN NaN 1986 1987 1988 1989 1990 ... NaN NaN 1999.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1999.0
11 4c6844ec-739e-4858-aa84-45b5b37b0c7d NaN NaN Male Runner Production Missing NaN Missing NaN True G52 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2007.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2007.0
12 535d5e26-9be0-48e7-976e-80a485a69dbe NaN NaN Male Assistant Editor Post-Production Missing NaN Missing NaN True EH8 NaN 11.0 NaN NaN NaN NaN NaN 1984 1986 1987 1988 1989 1990 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1996.0
13 3534f501-b86e-42e6-8109-567b79d47c3c NaN NaN Male Director Direction Missing NaN Missing NaN True EH3 NaN 9.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2011.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 2011.0
14 8d9f231e-5dd6-42a1-9fef-949984e225a6 NaN NaN Male Producer Producer Director Direction Missing NaN NaN G11 NaN 7.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1990 ... 1997.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1997.0
15 c83e70e2-1b8a-4f19-a2e1-0a7dfa9e1a57 NaN NaN Male Petty Cash Buyer Art Missing NaN Missing NaN True G66 NaN 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2019.0 NaN 2019.0
16 c79c0679-df2a-4112-9c1e-ac3d1bea6fe7 NaN NaN Male Writer/Producer Script Director Direction Missing NaN NaN PH1 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1993.0
17 4ae7939e-629c-4bc2-91b5-c16dc3147351 NaN NaN Female Make-Up Assistant Hair & Make-Up Make-Up Artist Hair & Make-Up Missing NaN True G22 NaN 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2019.0 NaN 2019.0
18 b59e0b45-a387-4ade-8e4b-70182be0f5b3 NaN NaN Female Unit Manager Production Missing NaN Missing NaN True EH3 NaN 4.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1996.0
19 b4b4982b-2610-4be7-a2fb-054e78023ef4 NaN NaN Male Location Manager Production Location Scout Production Missing NaN True EH15 NaN 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020.0 2020.0
20 3834ae52-72ac-4ceb-ad64-8c98c0876096 NaN NaN Female Researcher Production Missing NaN Missing NaN True DG7 NaN 3.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1995.0
21 c16d3aa3-e2c5-4062-9685-20d6dff94901 NaN NaN Male Director Of Photography Camera Camera Operator Camera Camera (Lighting) Camera True G75 NaN 11.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2018.0 NaN NaN 2018.0
22 65bb41e8-2084-4f8b-a5de-7050a0f0c565 NaN NaN Male Researcher Production Missing NaN Missing NaN True G84 NaN 3.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020.0 2020.0
23 e007b526-0fff-46f1-be19-5c0d20d1c968 NaN NaN Female Wardrobe Assistant Costume Missing NaN Missing NaN True NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
24 76e9ddcd-9da6-43c4-add2-144f7b85bd0b NaN NaN Male Camera Assistant Camera Missing NaN Missing NaN True EH NaN 2.0 NaN NaN NaN NaN 1982 1984 NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1986.0
25 57865408-6927-4153-abcc-37e806177c70 NaN NaN Female Production Assistant Production Missing NaN Missing NaN True EH5 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1994.0
26 03eaf2dc-b55a-46a8-ae0b-da6e03b63a1f NaN NaN Male Production Manager Production Missing NaN Missing NaN True NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020.0 2020.0
27 cab4d104-e02d-44dc-b1b8-2141786eb57a NaN NaN Male Camera (Lighting) Camera Director (Camera) Camera Missing NaN True PA31 True 11.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2013.0 NaN NaN NaN NaN NaN NaN NaN 2013.0
28 d26f25e4-3f9f-4b5a-a92c-4ab0e07b6dce NaN NaN Male Production Secretary Production Production Assistant Production Runner Production True EH NaN 6.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0
29 b63a6a84-f03b-4e4c-980b-6b3277a543e1 NaN NaN Female Producer Producer Production Manager Production Production Assistant Production NaN EH6 NaN 5.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1990 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1995.0

30 rows × 141 columns

In [193]:
# drop useless cols from that df
df_cox = df_cox.drop(['Codes', 'Role 1', 'Role 2', 'Role 3', 'Company', 'Description'], axis=1)
In [194]:
df_cox = df_cox.drop(df_cox.loc[:,'dropout 78':'dropout 20'].columns, axis = 1)
In [195]:
df_cox = df_cox.drop(df_cox.loc[:,'Yr Cnt 1976 x':'Yr Cnt 2020'].columns, axis = 1)
In [196]:
df_cox.to_csv(f'Outputs/CSV/df_for_statistical_analysis.csv', index=False)
In [197]:
# create new dfs for each variable to be checked
df_cox['Trainee prog'] = df_cox['Trainee prog'].fillna(0)

Comparing Film Bang data with BFI

Methodology for comparison

We took the UK Film Council and BFI statistical yearbooks from 2002 - 2019. These are not a stable reliable source of data due to changes in the way the ONS counted companies in the screen industries. We found multiple errors in transcribing data from ONS into the yearbooks themselves. And noted multiple variants in figures for years through time. However, these figures present a 'good enough' comparison as long as we don't try to read anything very deep into the detail. We took the figures for the years 1996 - 2018 and went for the most plausible. MORE DETAIL ON PROCESS...

In [198]:
film_bang_se_nums = [
    688, 644, 636, 627, 634, 675, 627, 668, 664, 692, 639, 726, 708,
    759, 786, 793, 806, 782, 728, 710, 713, 676, 651, 585, 535, 492,
    476, 397, 361, 355, 313, 266, 206, 210, 197, 200, 119, 104, 85,
    83, 66]
In [199]:
bfi_se_nums = [
    8000,9000,13000,11000,11000,12000,15000,15000,15000,11000,13000,11000,
    11000,15000,20000,24000,22000,24000,28000,24000,27000,32000,31000]
bfi_years = [
    1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
    2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
film_bang_se_nums = [
    535, 585, 651, 676, 713, 710, 728, 782, 806, 793, 786, 759, 708, 726,
    639, 692, 664, 668, 627, 675, 634, 627, 636]
In [200]:
x_indexes = list(range(len(bfi_years)))
fig, ax = plt.subplots()

ax.plot(x_indexes, bfi_se_nums, color='magenta', label='BFI Numbers', linewidth=3)
ax.plot(x_indexes, film_bang_se_nums, color='blue', label='Film Bang Numbers', linewidth=3)

ax.set_facecolor('white')
ax.set_xlabel('')
ax.set_ylabel('Number of Self Employed')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(bfi_years, {'fontsize':12})
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/Final/bfi_film_bang_plot_comparison.png', facecolor='#ffffff')
plt.show()
In [201]:
# reverse list
film_bang_se_nums = list(reversed(film_bang_se_nums))
print(film_bang_se_nums)
[636, 627, 634, 675, 627, 668, 664, 692, 639, 726, 708, 759, 786, 793, 806, 782, 728, 710, 713, 676, 651, 585, 535]
In [202]:
width = 0.8
fig, ax = plt.subplots()

ax.bar(x_indexes, bfi_se_nums, color=colours[6], width=width, label='BFI', log=True)
ax.bar(x_indexes, film_bang_se_nums, color=colours[2], width=width, label='Film Bang', log=True)

for i in range(len(bfi_years)):
    ax.text(x = x_indexes[i], y = bfi_se_nums[i] + 500, s = bfi_se_nums[i] + 500, size = 8, horizontalalignment='center')
    ax.text(x = x_indexes[i], y = film_bang_se_nums[i] + 50, s = film_bang_se_nums[i] + 50, size = 8, horizontalalignment='center')

ax.set_facecolor('white')
ax.set_xlabel('')
ax.set_ylabel('No. Entries - Log Scale')
ax.set_title('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(bfi_years, {'fontsize':12})
fig.autofmt_xdate(rotation=90)

plt.legend(loc='upper left', fontsize=14)
plt.grid(True)
plt.savefig("Outputs/Final/bfi_film_bang_bar_comparison.png", facecolor='#ffffff')
plt.show()