Analysis and charts of data on the companies listed in Film Bang from 1976 - 2020
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
plt.style.use('seaborn')
plt.style.use('seaborn-colorblind')
plt.rcParams['figure.figsize'] = [12, 6]
x = str
df = pd.read_excel('../Core_Data/020321_Master_List_Film_Bang_Production Companies.xlsx')
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 300)
colours = sns.color_palette('colorblind', as_cmap=True)
df.rename(columns={2021: '2021', 2020: '2020',2019: '2019', 2018: '2018',2017: '2017', 2016: '2016', 2015: '2015', 2014: '2014', 2013: '2013', 2012: '2012', 2011: '2011', 2010: '2010', 2009: '2009', 2008: '2008', 2007: '2007', 2006: '2006', 2005: '2005', 2004: '2004', 2003: '2003', 2002: '2002', 2001: '2001', 2000: '2000'}, inplace=True)
df.rename(columns={1999: '1999', 1998: '1998', 1997: '1997', 1996: '1996', 1995: '1995', 1994: '1994', 1993: '1993', 1992: '1992', 1991: '1991', 1990: '1990'}, inplace=True)
df.rename(columns={1989:'1989', 1988:'1988', 1987:'1987', 1986:'1986', 1984: '1984', 1982: '1982', 1981:'1981', 1979: '1979', 1978: '1978', 1976: '1976'}, inplace=True)
df.shape
# Example
df.loc[1, ['Company Name','Website','City','Key personnel']]
# Number of companies
df['Company Name'].count()
# Check for duplicates
df['Company Name'].value_counts().head(5)
df['Larger/Group'].value_counts()
# count instanges of & to count companies with more than one location
df['City'].str.contains('&').sum()
#NOTE number is actually 3 since one instance of '&' is part of a 'prev' qualification
# count instanges of / to count companies with more than one location
df['City'].str.contains('/').sum()
# count instances of 'prev' to count companies that moved
df['City'].str.contains('prev').sum()
# find 'mispelled entries and replace with correct string
df['City'] = df['City'].str.replace('Edinbburgh', 'Edinburgh')
# City :'last known' city. Split column by 'prev' and use only column 1 of the set
cities = df['City'].str.split('prev|&', expand=True,)
cities.fillna(value=np.nan, inplace=True)
df = df.join(cities)
# rename columns
df.rename(columns={0: 'City1', 1: 'City2', 2: 'City3'}, inplace=True)
# Replace NaN values with string to allow filtering
df['City'].replace(np.nan, 'Missing', inplace=True)
# remove whitespace from city columns
df['City1'] = df['City1'].str.strip()
df['City2'] = df['City2'].str.strip()
df['City3'] = df['City3'].str.strip()
region_numbers = df['Region'].value_counts(dropna=False).values.tolist()
region_keys = df["Region"].value_counts(dropna=False).index.tolist()
# Export Region Data for Datawrapper
df_regions = pd.DataFrame(columns=('Region','Number'))
df_regions['Region'] = region_keys
df_regions['Number'] = region_numbers
df_regions.to_csv('Outputs/regions.csv', index=False)
postcodes = df['Post code'].str.split('prev|Prev|(2001)|;', expand=True)
postcodes.fillna(value=np.nan, inplace=True)
df = df.join(postcodes)
df.rename(columns={0: 'P_Code1'}, inplace=True)
df['P_Code1'] = df['P_Code1'].str.replace(r'\;|\(','')
df['P_Code1'].replace(r"^ +| +$", r"", regex=True, inplace=True)
# Import second dataset in order to add a column with area code
df2 = pd.read_csv('../Core_Data/ukpostcodes.csv')
df = df.join(df2.set_index('postcode'), on='P_Code1')
locations = df['City1'].value_counts().index.tolist()
location_count = df['City1'].value_counts().values.tolist()
country = ['Scotland' for x in range(len(locations))]
df_locations = pd.DataFrame(columns=('Country','Location', 'Count'))
df_locations['Country'] = country
df_locations['Location'] = locations
df_locations['Count'] = location_count
df_locations.to_csv('Outputs/locations.csv', index=False)
df_locations
for use with other chart tools
lat = df['latitude']
lon = df['longitude']
df_lat_long = pd.DataFrame(columns=('latitude', 'longitude', 'count'))
df_lat_long['latitude'] = lat
df_lat_long['longitude'] = lon
df_lat_long['count'] = 1
df_lat_long.to_csv('Outputs/df_lat_long.csv', index=False)
df_lat_long.head(10)
df['Post code'].value_counts().head(5)
# Showing duplicates of key personnel.
# Cases where the same people set up more than one production company over the years
#df['Key personnel'].value_counts().head(10)
# Number of cases where multiple companies fall under the same name
len(df['Key personnel'].value_counts().head(8).values.tolist())
# Checking duplicate values in Website column
df['Website'].value_counts(dropna=True).head(10)
# Duplicate addresses / clusters
df['Address'].value_counts(dropna=True).head(10)
def year_count(range_start, range_end):
for year in df.loc[:, range_start:range_end]:
df['Yr Cnt '+year] = df.loc[:, year:range_end].count(axis=1)
year_count('2021', '1976')
df_entries = df.loc[:,'2021':'1976']
df['No of Yrs'] = df_entries.count(axis=1)
# Aggregate to 5 year bins
bins = pd.cut(df['No of Yrs'], [0, 5, 10, 15, 20, 25, 30, 35, 40, 45])
df.groupby(bins)['No of Yrs'].agg(['count'])
# Aggregate around bins used in Companies Survey
bins = pd.cut(df['No of Yrs'], [0, 2, 5, 10, 20, 45])
bins_values = df.groupby(bins)['No of Yrs'].agg(['count'])
bins_values
bin_indexes = ['1 year','2-5 years', '6-10 years', '11 - 20', '> 20 years']
# Drop entries with no year data (added to db in error)
# Get indexes where No of Yrs == 0
noEntriesIndexes = df[df['No of Yrs'] == 0].index
# Delete these row indexes from dataFrame
df.drop(noEntriesIndexes , inplace=True)
# Drop NFB entries in 2021 column
# Get indexes where name column has value 'NFB'
indexNames = df[(df['2021'] == 'NFB')].index
# Delete these row indexes from dataFrame
df.drop(indexNames , inplace=True)
indexNames
def company_count(output_array, dataframe, range_stop, range_start):
for col in dataframe.loc[:, range_stop:range_start]:
total = dataframe[col].value_counts(dropna=True).values.tolist()
output_array.append(total[0])
return(output_array)
# create output array
total_company = []
# call company count function
companytotal = company_count(total_company, df, '2021', '1976')
print(total_company)
# create indexes
plot_years = df.loc[:, '2021':'1976'].columns.tolist()
df_companies = pd.DataFrame(columns=('Years','Count'))
df_companies['Years'] = plot_years
df_companies['Count'] = total_company
df_companies.to_csv('Outputs/companies_figures.csv', index=False)
#df_companies
x_indexes = list(range(len(plot_years)))
x_indexes.reverse()
fig, ax = plt.subplots()
ax.plot(x_indexes, companytotal, color='#2E0014', label='Companies in Film Bang 1976 - 2021', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_xticks(ticks=x_indexes)
ax.set_xticklabels(plot_years)
fig.autofmt_xdate(rotation=90)
ax.legend(fontsize=14)
ax.set_facecolor('white')
plt.tight_layout()
plt.savefig('Outputs/companies_time.png', facecolor='#ffffff')
plt.show()
# Compile a DataFrame of companies operating from 2019 - 2021
# To filter out NFB values, change this to df_2021 = df[df'2021 == 2021]
df_2021 = df[df['2021'].notna()]
df_2020 = df[df['2020'].notna()]
df_2019 = df[df['2019'].notna()]
df_recent = pd.concat([df_2019, df_2020, df_2021])
df_recent = df_recent.drop_duplicates(subset=['Company Name'])
# Filter for those that set up recently (no of years < 4)
filt_yrs = (df['No of Yrs'] < 4)
df_recent = df_recent[filt_yrs]
df_recent['Region'].value_counts()
# Simplify Data
simple_values = (bins_values.values)
value_list = [ item for elem in simple_values for item in elem]
# Plot
colors = sns.cubehelix_palette(start=2, rot=1, dark=0.3)
#colors = ['blue', 'magenta', 'orange', 'green', 'lime', 'cyan', 'yellow']
fig, ax = plt.subplots()
ax.pie(value_list, labels=bin_indexes, labeldistance=1.2, pctdistance=0.85, textprops={'fontsize': 14}, shadow=False,
startangle=90, autopct='%1.0f%%', colors=colours, wedgeprops={'edgecolor':'black'})
ax.set_title('Companies', fontsize=16, color='#635DC6', weight='bold')
ax.text(.0,.0,' ', fontsize=14, ha='right')
circle=plt.Circle( (0,0), 0.7, color='white')
p=plt.gcf()
p.gca().add_artist(circle)
plt.tight_layout()
plt.savefig('Outputs/companies_longevity_bins_percentages_ring.png', facecolor='#ffffff')
plt.show()
# Data
keys_list = df['No of Yrs'].value_counts().index.tolist()
values = df['No of Yrs'].value_counts().values
median = df['No of Yrs'].median()
# Plot
width=0.8
fig, ax = plt.subplots()
ax.bar(keys_list, values, color=colours[2], width = width, label='Companies')
ax.axvline(median, linewidth=3, color='black', label='Median')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=keys_list)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/company_longevity.png', facecolor='#ffffff')
plt.show()
# Data
filt = (df['No of Yrs'] > 1)
df2 = df[filt]
keys_list = df2['No of Yrs'].value_counts().index.tolist()
values = df2['No of Yrs'].value_counts().values
median = df2['No of Yrs'].median()
# Plot
width=0.8
fig, ax = plt.subplots()
ax.bar(keys_list, values, color=colours[4], width = width, label='Companies')
ax.axvline(median, linewidth=3, color='black', label='Median')
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_facecolor('white')
ax.set_xticks(ticks=keys_list)
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/company_longevity>1yr.png', facecolor='#ffffff')
plt.show()
years = [*range(1,43)]
df_longevity = pd.DataFrame({'Years': years})
for year in df.loc[:,'2021':'1976']:
filtyear = df[year].notna()
dfyear = df[filtyear]
values_year = dfyear['Yr Cnt '+year].value_counts().array
#print(values_year)
keys_year = dfyear['Yr Cnt '+year].value_counts().index
#print(keys_year)
df_year = pd.DataFrame({f'{year} Keys':keys_year, f'{year} Values':values_year})
#print(df_year)
df_longevity = df_longevity.join(df_year.set_index(f'{year} Keys'), on='Years')
longevity_data = df_longevity.set_index('Years')
#longevity_data
import matplotlib.pylab as pylt
plot_years = df.loc[:, '2021':'1976']
hm_colours = sns.cubehelix_palette(start=2, rot=1, dark=0, light=.95, as_cmap=True)
pylt.figure(figsize=(22,10))
ax = sns.heatmap(longevity_data, linewidth=0.3, cmap=hm_colours, annot=True, fmt=".0f")
pylt.yticks(rotation=0)
ax.xaxis.set_ticks_position('top')
ax.set_xticklabels(plot_years,rotation=90)
ax.set_facecolor('white')
pylt.xlabel('')
pylt.ylabel('Length of Career in Years')
plt.title('Companies' , fontsize=16, color='black', weight='bold')
#plt.figtext(.0,.0,'Fig. 1.3', fontsize=14, ha='right')
plt.savefig(f"Outputs/company_longevity_basic_1.png", facecolor='#ffffff', dpi=500)
pylt.show()
df['Yr Cnt 1976 x'] = df['Yr Cnt 1976']
df = df[['1976', '1978', '1979', '1981', '1982',
'1984', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
'1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002',
'2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011',
'2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021',
'Yr Cnt 1976 x', 'Yr Cnt 1976', 'Yr Cnt 1978',
'Yr Cnt 1979', 'Yr Cnt 1981', 'Yr Cnt 1982', 'Yr Cnt 1984', 'Yr Cnt 1986',
'Yr Cnt 1987', 'Yr Cnt 1988', 'Yr Cnt 1989', 'Yr Cnt 1990', 'Yr Cnt 1991',
'Yr Cnt 1992', 'Yr Cnt 1993', 'Yr Cnt 1994', 'Yr Cnt 1995', 'Yr Cnt 1996',
'Yr Cnt 1997', 'Yr Cnt 1998', 'Yr Cnt 1999', 'Yr Cnt 2000', 'Yr Cnt 2001',
'Yr Cnt 2002', 'Yr Cnt 2003', 'Yr Cnt 2004', 'Yr Cnt 2005', 'Yr Cnt 2006',
'Yr Cnt 2007', 'Yr Cnt 2008', 'Yr Cnt 2009', 'Yr Cnt 2010', 'Yr Cnt 2011',
'Yr Cnt 2012', 'Yr Cnt 2013', 'Yr Cnt 2014', 'Yr Cnt 2015', 'Yr Cnt 2016',
'Yr Cnt 2017', 'Yr Cnt 2018', 'Yr Cnt 2019', 'Yr Cnt 2020', 'Yr Cnt 2021']]
#df.head(1)
temp = df['Yr Cnt 1976']
temp.value_counts()
# check those values against the duplicate column
if temp.value_counts().values.tolist() == df['Yr Cnt 1976 x'].value_counts().values.tolist(): print("Values Equal")
# create diff dataframe
df_diff = df.loc[:, 'Yr Cnt 1976 x':'Yr Cnt 2021'].diff(axis='columns')
#df_diff
df_diff['Yr Cnt 1976'] = temp
# check operation has worked
if df_diff['Yr Cnt 1976'].value_counts().values.tolist() == df['Yr Cnt 1976'].value_counts().values.tolist(): print("Values Equal")
# Multiply selected cols in dataframe by diff dataframe & show results
df_results = df.loc[:, 'Yr Cnt 1976 x':'Yr Cnt 2021'].multiply(df_diff)
#df_results.head()
# Get list of column names
yr_cnt_range = df_results.loc[:, 'Yr Cnt 1976':'Yr Cnt 2021'].columns.tolist()
# Get list of years
yr_range = df.loc[:, '1976':'2021'].columns.tolist()
# Get int version of years list
int_yr_range = []
for i in yr_range:
int_yr_range.append(int(i))
#print(int_yr_range)
nIYR = len(int_yr_range)
for index in range(1, nIYR):
i = int_yr_range[index]
j = str(i)
df_results['dropout '+ (j[2:])] = np.where((df_results[yr_cnt_range[index-1]] >
df_results[yr_cnt_range[index]]), i, np.nan)
dropouts = []
for i in yr_range[1:]:
try:
dropouts.append((i, df_results['dropout '+i[2:]].value_counts().values[0]))
except IndexError:
dropouts.append((i, 0))
df_results['dropout 11'].value_counts().values.tolist()
dropout_list = []
for i in dropouts:
dropout_list.append(i[1])
# at a 0 value for 1976 in dropout_list to account for no dropouts in the first year
insert = 0 # index position
dropout_list[insert:insert] = [0]
# Optional put dropouts in new dataframe
df2 = pd.DataFrame({'values':dropout_list})
# Median number of dropouts per year
dropout_median = df2['values'].median()
df['1976'].value_counts().values[0]
# Get totals for each year
general_count = []
for column in df.loc[:, '1976':'2021']:
try:
step = df[column].value_counts().values
#print(step[0])
general_count.append(step[0])
except IndexError:
general_count.append(0)
print(general_count)
new_entries = []
for column in df_results.loc[:, 'Yr Cnt 1976':'Yr Cnt 2021']:
current = df_results[column].isin([1]).sum(axis=0)
new_entries.append(current)
df3 = pd.DataFrame({'values':new_entries})
new_entry_median = df3['values'].median()
print(new_entry_median)
df2 = pd.DataFrame({'values':dropout_list})
#print(df2)
df_dropouts = df_results.loc[:, 'dropout 78':'dropout 21']
df['multi dropouts'] = df_dropouts.notna().sum(axis=1)
dropout_tally_list = df['multi dropouts'].value_counts().tolist()
dropout_tally_list
try:
dropout_tally_list.pop(0)
except IndexError:
print('Index Error')
try:
dropout_tally_list.pop(0)
except IndexError:
print('Index Error')
drop_sum = sum(dropout_tally_list)
total_entries = 431
d = 100 / total_entries
one_dropout = d * 341
two_dropouts = d * 42
three_dropouts = d * 42
four_dropouts = d * 5
five_dropouts = d * 1
multi_dropout_total = d * 90
print('One:' , one_dropout , ' %')
print('Two:' , two_dropouts , ' %')
print('Three:' , three_dropouts , ' %')
print('Four:' , four_dropouts , ' %')
print('Five:' , five_dropouts , ' %')
print('Multiple Dropouts:' , multi_dropout_total , ' %')
plt.rcParams['axes.facecolor']='white'
plot_years = yr_range
fig, ax = plt.subplots()
ax.plot(plot_years, general_count, color="black", label="Total Entries", linewidth=3)
ax.plot(plot_years, dropout_list, color=colours[3], label='Dropouts', linewidth=3)
ax.plot(plot_years, new_entries, color=colours[2], label='New Entries', linewidth=3)
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('')
ax.set_xticks(ticks=plot_years)
fig.autofmt_xdate(rotation=90)
ax.set_facecolor('white')
ax.legend(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('Outputs/turnover_new_entries_dropouts_totals.png', facecolor='#ffffff')
plt.show()
width = 0.5
fig, axs = plt.subplots(2, 1, sharex=True)
neg_dropout_list = [ -x for x in dropout_list]
axs[0].bar(plot_years, new_entries, width=width, color=colours[2], label='New Entries')
axs[0].set_xticks(plot_years)
axs[1].bar(plot_years, neg_dropout_list, width=width, color=colours[3], label='Dropouts')
axs[1].set_xticks(plot_years)
axs[0].legend(fontsize=14, loc=2, bbox_to_anchor=(0.0,0.9))
axs[1].legend(fontsize=14, loc=2, bbox_to_anchor=(0.0,1.8))
plt.xticks(rotation=90)
plt.tight_layout()
plt.subplots_adjust(hspace=0)
axs[1].set_xlabel('')
axs[0].set_ylabel('')
axs[0].set_title('Film Bang Companies 1976 - 2021 : New Entries & Dropouts')
fig.set_facecolor('white')
axs[0].set_facecolor('white')
axs[1].set_facecolor('white')
ticks = axs[1].get_yticks()
axs[1].set_yticklabels([int(abs(tick)) for tick in ticks])
axs[0].grid(False)
axs[1].grid(False)
plt.savefig('Outputs/turnover_new_entries_dropouts_mirror_bars.png', facecolor='#ffffff')
plt.show()
from IPython.display import IFrame
IFrame('https://datawrapper.dwcdn.net/nQITo/1/', width=700, height=900)