import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter
from lifelines.statistics import logrank_test
from lifelines import NelsonAalenFitter
# Load the Film Bang listings table and encode the covariates as small integers.
# Default size for every figure: [width, height] in inches.
plt.rcParams['figure.figsize'] = [12, 6]
df = pd.read_csv('Outputs/CSV/df_for_statistical_analysis.csv')
# One column per directory year, '1976' through '2020' (label-based slice, both ends included).
years = df.loc[:,'1976':'2020']
# Notebook-style display; has no effect when run as a script.
years
# Entry = row-wise minimum over the year columns.
# NOTE(review): presumably each year cell holds the year itself when the person is listed,
# making this the first year listed - confirm against the CSV.
df['Entry'] = years.min(axis=1)
# Rows whose year columns are all empty get a NaN Entry and are dropped.
df.dropna(subset=['Entry'], inplace=True)
# Replace empty cells in trainee prog with 0
df['Trainee prog'] = df['Trainee prog'].fillna(0)
# replace dates with 1 in trainee col
df.loc[df['Trainee prog'] > 0, 'Trainee prog'] = 1
# Replace gender col values with 0, 1, 2
df.loc[df['Gender']== 'Male', 'Gender'] = 0
df.loc[df['Gender']== 'Female', 'Gender'] = 1
df.loc[df['Gender']== 'Unknown', 'Gender'] = 2
# Convert consistent role (legend corrected to match the assignments below
# and the group labels used later in the analysis):
# 1 = True  (consistent)
# 2 = False (inconsistent)
# 0 = missing value (ambiguous: producer/director situation)
# The order of the three statements matters: the freshly written 1 does not
# compare equal to False, and 0 is only written after the False test has run.
df.loc[df['Has Consistent Role'] == True, 'Has Consistent Role'] = 1
df.loc[df['Has Consistent Role'] == False, 'Has Consistent Role'] = 2
df.loc[df['Has Consistent Role'].isnull(), 'Has Consistent Role'] = 0
# Notebook-style display of the resulting code frequencies.
df['Has Consistent Role'].value_counts()
# Rural: True -> 1, missing -> 0. Any other value is left unchanged.
df.loc[df['Rural'] == True, 'Rural'] = 1
df.loc[df['Rural'].isnull(), 'Rural'] = 0
# Distinct primary role categories; value_counts() orders them from most to
# least frequent, and that order is reused later for the integer encoding.
dept_list = df['Role 1 Category'].value_counts().index.tolist()
# Write one CSV per department so each can be analysed on its own.
for dept in dept_list:
    dept_rows = df[df['Role 1 Category'] == dept]
    # Spaces become underscores so that the file names match the ones read
    # back further down (e.g. 'Stats/Special_FX_df.csv',
    # 'Stats/Hair_&_Make-Up_df.csv').  The sanitised name used to be computed
    # but the raw category was written into the path instead.
    file_stem = dept.replace(' ', '_')
    dept_rows.to_csv(f'Stats/{file_stem}_df.csv', index=False)
# Encode every role category as its position in dept_list.
for code, dept in enumerate(dept_list):
    in_dept = df['Role 1 Category'] == dept
    df.loc[in_dept, 'Role 1 Category'] = code
# Rows without a primary role category cannot be used further on.
df = df.dropna(subset=['Role 1 Category'])
# Notebook-style check that no missing categories remain.
df['Role 1 Category'].isnull().value_counts()
# Remove the per-year columns ('1976' .. '2020') from the frame.
year_columns = df.loc[:, '1976':'2020'].columns
df = df.drop(columns=year_columns)
# Histogram of each covariate, saved as Stats/Outputs/<stem>_hist.png.
# NOTE(review): no new figure is opened between plots; when this runs as a
# plain script (not notebook cells) each histogram is drawn on top of the
# previous ones - confirm whether plt.figure()/plt.close() is wanted here.
histogram_specs = (
    # (column, plot title, file stem)
    ('Gender', "Gender", "gender"),
    ('Rural', "Rural", "rural"),
    ('Has Consistent Role', "Role Consistency", "consistent"),
    ('Trainee prog', "Trainee Program", "trainee"),
    ('Role 1 Category', "Role 1 Category", "role"),
    ('Entry', "Entry", "entry"),  # year of entry to Film Bang
)
for column, title, stem in histogram_specs:
    df[column].hist()
    plt.title(title)
    plt.tight_layout()
    plt.savefig(f"Stats/Outputs/{stem}_hist.png", facecolor='#ffffff')
# Notebook-style inspection of the frame.
df.columns
df['Trainee prog'].value_counts()
Investigating the longevity of freelancers in the Film Bang database. Longevity here is the number of years a freelancer is listed in the directory. The birth event is their first entry in the database, and the death event is the retirement of that individual from the directory. An individual is censored if they are still in the directory at the time of dataset compilation (2020).
# Kaplan-Meier estimate over all Film Bang listings.
kmf = KaplanMeierFitter()
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
df.loc[df['dropout year'] == 0.0, 'dead'] = 0
df.loc[df['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
df = df[df['No of Yrs'].notna()]
kmf.fit(durations=df['No of Yrs'], event_observed=df['dead'], label="Film Bang Listings")
# Notebook-style inspection of the fitted estimate.
kmf.event_table
kmf.survival_function_
kmf.confidence_interval_
kmf.median_survival_time_
kmf.plot_survival_function(at_risk_counts=True, label="Listings in Film Bang")
plt.title("Kaplan-Meier Estimate")
# plt.ylabel("Probability of freelancer still in FB")
# plt.xlabel("Years")
plt.tight_layout()
plt.savefig("Stats/Outputs/kaplan_meier_estimate_fb_all.png", facecolor='#ffffff')
# Previously a bare `plt.show`, which only looks the function up and never
# calls it.  Kept after savefig so the saved image is not affected.
plt.show()
The y-axis represents the probability that a freelancer is still around after $t$ years, where $t$ is on the x-axis. We see that very few freelancers make it past 20 years in the listings.
# Notebook-style inspection of the Kaplan-Meier confidence band.
kmf.confidence_interval_
# Cumulative density, i.e. the probability of leaving:
kmf.cumulative_density_
kmf.plot_cumulative_density()
plt.tight_layout()
plt.savefig(
    "Stats/Outputs/kaplan_meier_cumulative_density_fb_all.png",
    facecolor='#ffffff',
)
# Hazard function: Nelson-Aalen cumulative hazard on the same durations/events.
naf = NelsonAalenFitter()
naf.fit(
    durations=df['No of Yrs'],
    event_observed=df['dead'],
    label="Film Bang Listings",
)
naf.cumulative_hazard_
naf.plot_cumulative_hazard()
plt.tight_layout()
plt.savefig(
    "Stats/Outputs/naf_cumulative_hazard_fb_all.png",
    facecolor='#ffffff',
)
# Notebook-style check for missing role categories before modelling.
df['Role 1 Category'].isnull().value_counts()
# TO DO: REMOVE PARAMETERS WITH MORE THAN 2 VARIABLES - ENTRY AND ROLE
# Cox proportional-hazards model on all listings: duration, event flag and covariates.
cox_columns = [
    'No of Yrs',
    'Entry',
    'Role 1 Category',
    'Trainee prog',
    'Gender',
    'Rural',
    'Has Consistent Role',
    'dead',
]
data = df[cox_columns]
cph = CoxPHFitter()
cph.fit(data, duration_col='No of Yrs', event_col='dead')
cph.print_summary()
cph.plot()
plt.tight_layout()
plt.savefig("Stats/Outputs/cph_plot_fb_all.png", facecolor='#ffffff')
A hazard ratio (HR) greater than 1 means that as the value of the covariate increases, the event hazard increases. HR = 1: no effect; HR < 1: reduction in the hazard; HR > 1: increase in the hazard.
# Predicted survival curves for four rows of the Cox input frame
# (positions 0, 3, 4 and 6).
d_data = data.iloc[[0, 3, 4, 6]]
predicted_survival = cph.predict_survival_function(d_data)
predicted_survival.plot()
plt.tight_layout()
plt.savefig(
    "Stats/Outputs/cph_survival_prediction_4_individuals_fb_all.png",
    facecolor='#ffffff',
)
# Median time to event along the Kaplan-Meier timeline.
CTE = kmf.conditional_time_to_event_
plt.plot(CTE)
plt.tight_layout()
plt.savefig(
    "Stats/Outputs/kmf_conditional_time_to_event_medians_fb_all.png",
    facecolor='#ffffff',
)
Not sure yet...
# Kaplan-Meier comparison by gender code (0 = Male, 1 = Female, 2 = Unknown).
Male = df.query("Gender == 0")
Female = df.query("Gender == 1")
Unknown = df.query("Gender == 2")
# One fitter per gender group.
kmf_m = KaplanMeierFitter()
kmf_m.fit(durations=Male["No of Yrs"], event_observed=Male['dead'], label="Male")
kmf_f = KaplanMeierFitter()
kmf_f.fit(durations=Female["No of Yrs"], event_observed=Female['dead'], label="Female")
kmf_u = KaplanMeierFitter()
kmf_u.fit(durations=Unknown["No of Yrs"], event_observed=Unknown['dead'], label="Unknown")
gender_fitters = (kmf_m, kmf_f, kmf_u)
# Notebook-style inspection of each fit.
for gender_fit in gender_fitters:
    gender_fit.event_table
    gender_fit.survival_function_
# All three curves share one set of axes.
for gender_fit in gender_fitters:
    gender_fit.plot()
plt.xlabel("Years Passed")
plt.ylabel("Longevity")
plt.title("Film Bang - Gender KMF")
plt.tight_layout()
plt.savefig("Stats/Outputs/kaplan_meier_fb_all_gender.png", facecolor='#ffffff')
# Survival curve with at-risk counts for each gender group, saved one file per group.
per_gender_plots = (
    # (fitter, legend label, file suffix)
    (kmf_m, "Male", "male"),
    (kmf_f, "Female", "female"),
    (kmf_u, "Gender Unknown", "unknown"),
)
for gender_fit, legend_label, file_suffix in per_gender_plots:
    gender_fit.plot_survival_function(at_risk_counts=True, label=legend_label)
    plt.xlabel("Years Passed")
    plt.ylabel("Longevity")
    plt.title("Film Bang - Gender KMF")
    plt.tight_layout()
    plt.savefig(
        f"Stats/Outputs/kaplan_meier_fb_gender_{file_suffix}.png",
        facecolor='#ffffff',
    )
# Gender Cumulative Density Plot
gender_km_fits = (kmf_m, kmf_f, kmf_u)
for gender_fit in gender_km_fits:
    gender_fit.cumulative_density_  # notebook-style inspection
for gender_fit in gender_km_fits:
    gender_fit.plot_cumulative_density()
plt.tight_layout()
plt.savefig(
    "Stats/Outputs/kaplan_meier_cumulative_density_gender.png",
    facecolor='#ffffff',
)
# Nelson-Aalen cumulative hazard per gender group.
naf_m = NelsonAalenFitter()
naf_m.fit(durations=Male['No of Yrs'], event_observed=Male['dead'], label="Male")
naf_f = NelsonAalenFitter()
naf_f.fit(durations=Female['No of Yrs'], event_observed=Female['dead'], label="Female")
naf_u = NelsonAalenFitter()
naf_u.fit(durations=Unknown['No of Yrs'], event_observed=Unknown['dead'], label="Unknown")
naf_m.cumulative_hazard_
for gender_hazard in (naf_m, naf_f, naf_u):
    gender_hazard.plot_cumulative_hazard()
plt.tight_layout()
plt.savefig("Stats/Outputs/naf_cumulative_hazard_gender.png", facecolor='#ffffff')
# Log-rank test: Male vs Female (the Unknown group is left out).
T1, E1 = Male['No of Yrs'], Male['dead']
T2, E2 = Female['No of Yrs'], Female['dead']
# T3=Unknown['No of Yrs']
# E3=Unknown['dead']
# Notebook-style summaries of the two duration samples.
T1.describe()
T2.describe()
results = logrank_test(
    T1,
    T2,
    event_observed_A=E1,
    event_observed_B=E2,
)
results.print_summary()
# p value of <0.005 means gender is associated with longevity.
# Kaplan-Meier comparison by role consistency.
# The column is renamed to a single word, presumably so it can be written
# directly inside the query strings below.
df = df.rename(columns={"Has Consistent Role": "Consistent"})
df['Consistent']
# Codes: 1 -> consistent, 2 -> inconsistent, 0 -> director/producer group.
Consistent = df.query("Consistent == 1")
Inconsistent = df.query("Consistent == 2")
DirProd = df.query("Consistent == 0")
kmf_is = KaplanMeierFitter()
kmf_is.fit(durations=Consistent['No of Yrs'], event_observed=Consistent['dead'], label="Consistent")
kmf_isnt = KaplanMeierFitter()
kmf_isnt.fit(durations=Inconsistent['No of Yrs'], event_observed=Inconsistent['dead'], label="Inconsistent")
kmf_dp = KaplanMeierFitter()
kmf_dp.fit(durations=DirProd['No of Yrs'], event_observed=DirProd['dead'], label="Director/Producer")
consistency_fitters = (kmf_is, kmf_isnt, kmf_dp)
# Notebook-style inspection of each fit.
for consistency_fit in consistency_fitters:
    consistency_fit.event_table
    consistency_fit.survival_function_
# Survival curves on shared axes.
for consistency_fit in consistency_fitters:
    consistency_fit.plot()
plt.xlabel("Years")
plt.ylabel("Longevity")
plt.title("Film Bang - Role Consistency KMF")
plt.tight_layout()
plt.savefig(
    "Stats/Outputs/kapplan_meier_estimate_role_consistency_all.png",
    facecolor='#ffffff',
)
# Cumulative density curves.
for consistency_fit in consistency_fitters:
    consistency_fit.plot_cumulative_density()
plt.tight_layout()
plt.savefig(
    "Stats/Outputs/kaplan_meier_cumulative_density_role_consistency_all.png",
    facecolor='#ffffff',
)
# Nelson-Aalen cumulative hazard per role-consistency group; the legend
# labels are supplied at plot time rather than at fit time.
naf_is = NelsonAalenFitter()
naf_is.fit(durations=Consistent['No of Yrs'], event_observed=Consistent['dead'])
naf_isnt = NelsonAalenFitter()
naf_isnt.fit(durations=Inconsistent['No of Yrs'], event_observed=Inconsistent['dead'])
naf_dp = NelsonAalenFitter()
naf_dp.fit(durations=DirProd['No of Yrs'], event_observed=DirProd['dead'])
for consistency_hazard, legend_label in (
    (naf_is, "Consistent"),
    (naf_isnt, "Inconsistent"),
    (naf_dp, "Director/Producer"),
):
    consistency_hazard.plot_cumulative_hazard(label=legend_label)
plt.tight_layout()
plt.savefig(
    "Stats/Outputs/naf_cumulative_hazard_role_consistency.png",
    facecolor='#ffffff',
)
# Log-rank test: consistent vs inconsistent (director/producer group left out).
T1, E1 = Consistent['No of Yrs'], Consistent['dead']
T2, E2 = Inconsistent['No of Yrs'], Inconsistent['dead']
results = logrank_test(
    T1,
    T2,
    event_observed_A=E1,
    event_observed_B=E2,
)
results.print_summary()
# Kaplan-Meier comparison: trainee-programme participants vs everyone else.
# The column is renamed to a single word, presumably so it can be written
# directly inside the query strings below.
df = df.rename(columns={"Trainee prog": "Trainee"})
Trainee = df.query("Trainee == 1")
Non = df.query("Trainee == 0")
kmf_t = KaplanMeierFitter()
kmf_t.fit(durations=Trainee['No of Yrs'], event_observed=Trainee['dead'], label="Trainee")
kmf_null = KaplanMeierFitter()
kmf_null.fit(durations=Non['No of Yrs'], event_observed=Non['dead'], label="Non Trainee")
trainee_fitters = (kmf_t, kmf_null)
# Notebook-style inspection of each fit.
for trainee_fit in trainee_fitters:
    trainee_fit.event_table
    trainee_fit.survival_function_
# Survival curves on shared axes.
for trainee_fit in trainee_fitters:
    trainee_fit.plot()
plt.xlabel("Years")
plt.ylabel("Longevity")
plt.title("Film Bang - Trainee Prog KMF")
plt.tight_layout()
plt.savefig("Stats/Outputs/kaplan_meier_estimate_trainee.png", facecolor='#ffffff')
# Cumulative density curves.
for trainee_fit in trainee_fitters:
    trainee_fit.plot_cumulative_density()
plt.tight_layout()
plt.savefig(
    "Stats/Outputs/kaplan_meier_cumulative_density_trainee.png",
    facecolor='#ffffff',
)
# Nelson-Aalen cumulative hazard for trainees and non-trainees; the legend
# labels are supplied at plot time.
naf_t = NelsonAalenFitter()
naf_t.fit(durations=Trainee['No of Yrs'], event_observed=Trainee['dead'])
naf_null = NelsonAalenFitter()
naf_null.fit(durations=Non['No of Yrs'], event_observed=Non['dead'])
for trainee_hazard, legend_label in ((naf_t, "Trainee"), (naf_null, "Non Trainee")):
    trainee_hazard.plot_cumulative_hazard(label=legend_label)
plt.tight_layout()
plt.savefig("Stats/Outputs/naf_cumulative_hazard_trainee.png", facecolor='#ffffff')
# Log-rank test: trainee vs non-trainee.
T1, E1 = Trainee['No of Yrs'], Trainee['dead']
T2, E2 = Non['No of Yrs'], Non['dead']
# Notebook-style summaries of the two duration samples.
T1.describe()
T2.describe()
results = logrank_test(
    T1,
    T2,
    event_observed_A=E1,
    event_observed_B=E2,
)
results.print_summary()
#df.columns
#dept_list
# --- Production department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Production_df.csv.
production = pd.read_csv('Stats/Production_df.csv')
production.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
production.loc[production['dropout year'] == 0.0, 'dead'] = 0
production.loc[production['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
production = production[production['No of Yrs'].notna()]

kmf_production = KaplanMeierFitter()
kmf_production.fit(durations=production['No of Yrs'], event_observed=production['dead'], label="Production")
# Notebook-style inspection of the fit.
kmf_production.event_table
kmf_production.survival_function_
kmf_production.median_survival_time_
kmf_production.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
kmf_production.confidence_interval_
# Probability of leaving:
kmf_production.cumulative_density_
kmf_production.plot_cumulative_density()

naf_production = NelsonAalenFitter()
# label added so the legend matches the other department sections.
naf_production.fit(production['No of Yrs'], event_observed=production['dead'], label="Production")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_production.cumulative_hazard_
naf_production.plot_cumulative_hazard()

data_production = production[['No of Yrs', 'Entry', 'Trainee prog', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_production = CoxPHFitter()
cph_production.fit(data_production, 'No of Yrs', event_col='dead')
cph_production.print_summary()
# --- Camera department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Camera_df.csv.
camera = pd.read_csv('Stats/Camera_df.csv')
camera.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
camera.loc[camera['dropout year'] == 0.0, 'dead'] = 0
camera.loc[camera['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
camera = camera[camera['No of Yrs'].notna()]

kmf_camera = KaplanMeierFitter()
kmf_camera.fit(durations=camera['No of Yrs'], event_observed=camera['dead'], label="Camera")
# Notebook-style inspection; these lines used to reference kmf_production.
kmf_camera.event_table
kmf_camera.survival_function_
kmf_camera.median_survival_time_
kmf_camera.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Camera freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
kmf_camera.confidence_interval_
# Probability of leaving:
kmf_camera.cumulative_density_
kmf_camera.plot_cumulative_density()

naf_camera = NelsonAalenFitter()
naf_camera.fit(camera['No of Yrs'], event_observed=camera['dead'], label="Camera")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_camera.cumulative_hazard_
naf_camera.plot_cumulative_hazard()

# The Cox model needs a value for every covariate; drop rows without Entry.
camera = camera[camera['Entry'].notna()]
data_camera = camera[['No of Yrs', 'Entry', 'Trainee prog', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_camera = CoxPHFitter()
cph_camera.fit(data_camera, 'No of Yrs', event_col='dead')
cph_camera.print_summary()
# --- Art department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Art_df.csv.
art = pd.read_csv('Stats/Art_df.csv')
art.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
art.loc[art['dropout year'] == 0.0, 'dead'] = 0
art.loc[art['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
art = art[art['No of Yrs'].notna()]

kmf_art = KaplanMeierFitter()
kmf_art.fit(durations=art['No of Yrs'], event_observed=art['dead'], label="Art")
# Notebook-style inspection; these lines used to reference kmf_production.
kmf_art.event_table
kmf_art.survival_function_
kmf_art.median_survival_time_
kmf_art.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Art freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
kmf_art.confidence_interval_
# Probability of leaving:
kmf_art.cumulative_density_
kmf_art.plot_cumulative_density()

naf_art = NelsonAalenFitter()
# Fit on the Art rows only; this used to be fitted on the full `df`, so the
# curve labelled "Art" actually showed every listing.
naf_art.fit(art['No of Yrs'], event_observed=art['dead'], label="Art")
naf_art.cumulative_hazard_
naf_art.plot_cumulative_hazard()

# The Cox model needs a value for every covariate; drop rows without Entry.
art = art[art['Entry'].notna()]
data_art = art[['No of Yrs', 'Entry', 'Trainee prog', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_art = CoxPHFitter()
cph_art.fit(data_art, 'No of Yrs', event_col='dead')
cph_art.print_summary()
# --- Direction department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Direction_df.csv.
direction = pd.read_csv('Stats/Direction_df.csv')
direction.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
direction.loc[direction['dropout year'] == 0.0, 'dead'] = 0
direction.loc[direction['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
direction = direction[direction['No of Yrs'].notna()]

kmf_direction = KaplanMeierFitter()
kmf_direction.fit(durations=direction['No of Yrs'], event_observed=direction['dead'], label="Direction")
# Notebook-style inspection; these lines used to reference kmf_production.
kmf_direction.event_table
kmf_direction.survival_function_
kmf_direction.median_survival_time_
kmf_direction.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Direction freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
kmf_direction.confidence_interval_
# Probability of leaving:
kmf_direction.cumulative_density_
kmf_direction.plot_cumulative_density()

naf_direction = NelsonAalenFitter()
naf_direction.fit(direction['No of Yrs'], event_observed=direction['dead'], label="Direction")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_direction.cumulative_hazard_
naf_direction.plot_cumulative_hazard()

data_direction = direction[['No of Yrs', 'Entry', 'Trainee prog', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_direction = CoxPHFitter()
cph_direction.fit(data_direction, 'No of Yrs', event_col='dead')
cph_direction.print_summary()
# --- Producer department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Producer_df.csv.
producer = pd.read_csv('Stats/Producer_df.csv')
producer.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
producer.loc[producer['dropout year'] == 0.0, 'dead'] = 0
producer.loc[producer['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
producer = producer[producer['No of Yrs'].notna()]

kmf_producer = KaplanMeierFitter()
kmf_producer.fit(durations=producer['No of Yrs'], event_observed=producer['dead'], label="Producer")
# Notebook-style inspection; these lines used to reference kmf_production.
kmf_producer.event_table
kmf_producer.survival_function_
kmf_producer.median_survival_time_
kmf_producer.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Producer freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
kmf_producer.confidence_interval_
# Probability of leaving:
kmf_producer.cumulative_density_
kmf_producer.plot_cumulative_density()

naf_producer = NelsonAalenFitter()
naf_producer.fit(producer['No of Yrs'], event_observed=producer['dead'], label="Producer")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_producer.cumulative_hazard_
naf_producer.plot_cumulative_hazard()

data_producer = producer[['No of Yrs', 'Entry', 'Trainee prog', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_producer = CoxPHFitter()
cph_producer.fit(data_producer, 'No of Yrs', event_col='dead')
cph_producer.print_summary()
# --- Post-Production department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Post-Production_df.csv.
p_production = pd.read_csv('Stats/Post-Production_df.csv')
p_production.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
p_production.loc[p_production['dropout year'] == 0.0, 'dead'] = 0
p_production.loc[p_production['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
p_production = p_production[p_production['No of Yrs'].notna()]

kmf_p_production = KaplanMeierFitter()
kmf_p_production.fit(durations=p_production['No of Yrs'], event_observed=p_production['dead'], label="Post-Production")
# Notebook-style inspection; the first two lines used to reference kmf_production.
kmf_p_production.event_table
kmf_p_production.survival_function_
kmf_p_production.median_survival_time_
kmf_p_production.confidence_interval_
kmf_p_production.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Post-Production freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
# Probability of leaving:
kmf_p_production.cumulative_density_
kmf_p_production.plot_cumulative_density()

naf_p_production = NelsonAalenFitter()
naf_p_production.fit(p_production['No of Yrs'], event_observed=p_production['dead'], label="Post-Production")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_p_production.cumulative_hazard_
naf_p_production.plot_cumulative_hazard()

data_p_production = p_production[['No of Yrs', 'Entry', 'Trainee prog', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_p_production = CoxPHFitter()
cph_p_production.fit(data_p_production, 'No of Yrs', event_col='dead')
cph_p_production.print_summary()
# --- Sound department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Sound_df.csv.
sound = pd.read_csv('Stats/Sound_df.csv')
sound.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
sound.loc[sound['dropout year'] == 0.0, 'dead'] = 0
sound.loc[sound['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
sound = sound[sound['No of Yrs'].notna()]

kmf_sound = KaplanMeierFitter()
kmf_sound.fit(durations=sound['No of Yrs'], event_observed=sound['dead'], label="Sound")
# Notebook-style inspection; these lines used to reference kmf_production
# and kmf_p_production.
kmf_sound.event_table
kmf_sound.survival_function_
kmf_sound.median_survival_time_
kmf_sound.confidence_interval_
kmf_sound.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Sound freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
# Probability of leaving:
kmf_sound.cumulative_density_
kmf_sound.plot_cumulative_density()

naf_sound = NelsonAalenFitter()
naf_sound.fit(sound['No of Yrs'], event_observed=sound['dead'], label="Sound")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_sound.cumulative_hazard_
naf_sound.plot_cumulative_hazard()

data_sound = sound[['No of Yrs', 'Entry', 'Trainee prog', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_sound = CoxPHFitter()
cph_sound.fit(data_sound, 'No of Yrs', event_col='dead')
cph_sound.print_summary()
# --- Hair & Make-Up department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Hair_&_Make-Up_df.csv.
hair_make_up = pd.read_csv('Stats/Hair_&_Make-Up_df.csv')
hair_make_up.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
hair_make_up.loc[hair_make_up['dropout year'] == 0.0, 'dead'] = 0
hair_make_up.loc[hair_make_up['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
# (the filtered frame used to be assigned to a misspelled, unused name, so
# rows without a duration were never actually removed)
hair_make_up = hair_make_up[hair_make_up['No of Yrs'].notna()]

kmf_hair_make_up = KaplanMeierFitter()
kmf_hair_make_up.fit(durations=hair_make_up['No of Yrs'], event_observed=hair_make_up['dead'], label="Hair & Make-Up")
# Notebook-style inspection; these lines used to reference kmf_production
# and kmf_p_production.
kmf_hair_make_up.event_table
kmf_hair_make_up.survival_function_
kmf_hair_make_up.median_survival_time_
kmf_hair_make_up.confidence_interval_
kmf_hair_make_up.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Hair & Make-Up freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
# Probability of leaving:
kmf_hair_make_up.cumulative_density_
kmf_hair_make_up.plot_cumulative_density()

naf_hair_make_up = NelsonAalenFitter()
naf_hair_make_up.fit(hair_make_up['No of Yrs'], event_observed=hair_make_up['dead'], label="Hair & Make-Up")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_hair_make_up.cumulative_hazard_
naf_hair_make_up.plot_cumulative_hazard()

data_hair_make_up = hair_make_up[['No of Yrs', 'Entry', 'Trainee prog', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_hair_make_up = CoxPHFitter()
cph_hair_make_up.fit(data_hair_make_up, 'No of Yrs', event_col='dead')
cph_hair_make_up.print_summary()
# --- Costume department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Costume_df.csv.
costume = pd.read_csv('Stats/Costume_df.csv')
costume.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
costume.loc[costume['dropout year'] == 0.0, 'dead'] = 0
costume.loc[costume['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
costume = costume[costume['No of Yrs'].notna()]

kmf_costume = KaplanMeierFitter()
kmf_costume.fit(durations=costume['No of Yrs'], event_observed=costume['dead'], label="Costume")
# Notebook-style inspection; these lines used to reference kmf_production
# and kmf_p_production.
kmf_costume.event_table
kmf_costume.survival_function_
kmf_costume.median_survival_time_
kmf_costume.confidence_interval_
kmf_costume.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Costume freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
# Probability of leaving:
kmf_costume.cumulative_density_
kmf_costume.plot_cumulative_density()

naf_costume = NelsonAalenFitter()
naf_costume.fit(costume['No of Yrs'], event_observed=costume['dead'], label="Costume")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_costume.cumulative_hazard_
naf_costume.plot_cumulative_hazard()

data_costume = costume[['No of Yrs', 'Entry', 'Trainee prog', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_costume = CoxPHFitter()
cph_costume.fit(data_costume, 'No of Yrs', event_col='dead')
cph_costume.print_summary()
# --- Music department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Music_df.csv.
music = pd.read_csv('Stats/Music_df.csv')
music.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
music.loc[music['dropout year'] == 0.0, 'dead'] = 0
music.loc[music['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
music = music[music['No of Yrs'].notna()]

kmf_music = KaplanMeierFitter()
kmf_music.fit(durations=music['No of Yrs'], event_observed=music['dead'], label="Music")
# Notebook-style inspection; the confidence interval used to be read from
# kmf_p_production.
kmf_music.event_table
kmf_music.survival_function_
kmf_music.median_survival_time_
kmf_music.confidence_interval_
kmf_music.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Music freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
# Probability of leaving:
kmf_music.cumulative_density_
kmf_music.plot_cumulative_density()

naf_music = NelsonAalenFitter()
naf_music.fit(music['No of Yrs'], event_observed=music['dead'], label="Music")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_music.cumulative_hazard_
naf_music.plot_cumulative_hazard()
# Notebook-style check that no missing durations remain.
music['No of Yrs'].isnull().value_counts(dropna=False)

# Trainee Prog col dropped due to convergence issue: very low incidence.
data_music = music[['No of Yrs', 'Entry', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_music = CoxPHFitter()
cph_music.fit(data_music, 'No of Yrs', event_col='dead')
cph_music.print_summary()
# --- Support department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Support_df.csv.
support = pd.read_csv('Stats/Support_df.csv')
support.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
support.loc[support['dropout year'] == 0.0, 'dead'] = 0
support.loc[support['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
support = support[support['No of Yrs'].notna()]

kmf_support = KaplanMeierFitter()
kmf_support.fit(durations=support['No of Yrs'], event_observed=support['dead'], label="Support")
# Notebook-style inspection; the confidence interval used to be read from
# kmf_p_production.
kmf_support.event_table
kmf_support.survival_function_
kmf_support.median_survival_time_
kmf_support.confidence_interval_
kmf_support.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Support freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
# Probability of leaving:
kmf_support.cumulative_density_
kmf_support.plot_cumulative_density()

naf_support = NelsonAalenFitter()
naf_support.fit(support['No of Yrs'], event_observed=support['dead'], label="Support")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_support.cumulative_hazard_
naf_support.plot_cumulative_hazard()

data_support = support[['No of Yrs', 'Entry', 'Trainee prog', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
cph_support = CoxPHFitter()
cph_support.fit(data_support, 'No of Yrs', event_col='dead')
cph_support.print_summary()
# --- Script department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Script_df.csv.
script = pd.read_csv('Stats/Script_df.csv')
script.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
script.loc[script['dropout year'] == 0.0, 'dead'] = 0
script.loc[script['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
script = script[script['No of Yrs'].notna()]

kmf_script = KaplanMeierFitter()
kmf_script.fit(durations=script['No of Yrs'], event_observed=script['dead'], label="Script")
# Notebook-style inspection; the confidence interval used to be read from
# kmf_p_production.
kmf_script.event_table
kmf_script.survival_function_
kmf_script.median_survival_time_
kmf_script.confidence_interval_
kmf_script.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Script freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
# Probability of leaving:
kmf_script.cumulative_density_
kmf_script.plot_cumulative_density()

naf_script = NelsonAalenFitter()
naf_script.fit(script['No of Yrs'], event_observed=script['dead'], label="Script")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_script.cumulative_hazard_
naf_script.plot_cumulative_hazard()

# Trainee prog and Rural cols dropped due to convergence issue: low incidence
data_script = script[['No of Yrs', 'Entry', 'Gender', 'Has Consistent Role', 'dead']]
cph_script = CoxPHFitter()
cph_script.fit(data_script, 'No of Yrs', event_col='dead')
cph_script.print_summary()
# --- Casting department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Casting_df.csv.
casting = pd.read_csv('Stats/Casting_df.csv')
casting.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
casting.loc[casting['dropout year'] == 0.0, 'dead'] = 0
casting.loc[casting['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
casting = casting[casting['No of Yrs'].notna()]

kmf_casting = KaplanMeierFitter()
kmf_casting.fit(durations=casting['No of Yrs'], event_observed=casting['dead'], label="Casting")
# Notebook-style inspection; the confidence interval used to be read from
# kmf_p_production.
kmf_casting.event_table
kmf_casting.survival_function_
kmf_casting.median_survival_time_
kmf_casting.confidence_interval_
kmf_casting.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Casting freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
# Probability of leaving:
kmf_casting.cumulative_density_
kmf_casting.plot_cumulative_density()

naf_casting = NelsonAalenFitter()
naf_casting.fit(casting['No of Yrs'], event_observed=casting['dead'], label="Casting")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_casting.cumulative_hazard_
naf_casting.plot_cumulative_hazard()

# Trainee prog and Rural columns dropped due to low incidence causing convergence issue
data_casting = casting[['No of Yrs', 'Entry', 'Gender', 'Has Consistent Role', 'dead']]
cph_casting = CoxPHFitter()
cph_casting.fit(data_casting, 'No of Yrs', event_col='dead')
cph_casting.print_summary()
# --- Construction department ---
# Kaplan-Meier estimate, Nelson-Aalen cumulative hazard and Cox PH model for
# the rows stored in Stats/Construction_df.csv.
construction = pd.read_csv('Stats/Construction_df.csv')
construction.shape
# Event indicator passed to the fitters as event_observed:
# 'dropout year' == 0 -> 0 (no event observed), 'dropout year' > 0 -> 1.
construction.loc[construction['dropout year'] == 0.0, 'dead'] = 0
construction.loc[construction['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
construction = construction[construction['No of Yrs'].notna()]

kmf_construction = KaplanMeierFitter()
kmf_construction.fit(durations=construction['No of Yrs'], event_observed=construction['dead'], label="Construction")
# Notebook-style inspection of the fit.
kmf_construction.event_table
kmf_construction.survival_function_
kmf_construction.median_survival_time_
kmf_construction.confidence_interval_
kmf_construction.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
# Axis label corrected: it previously said "Production".
plt.ylabel("Probability of Construction freelancer still in FB")
# Was a bare `plt.show`, which never calls the function.
plt.show()
# Probability of leaving:
kmf_construction.cumulative_density_
kmf_construction.plot_cumulative_density()

naf_construction = NelsonAalenFitter()
naf_construction.fit(construction['No of Yrs'], event_observed=construction['dead'], label="Construction")
# Inspect this department's hazard (previously looked at the all-listings `naf`).
naf_construction.cumulative_hazard_
naf_construction.plot_cumulative_hazard()

# Rural, Gender, Has Consistent Role, Trainee prog removed due to convergence issue: low incidence.
data_construction = construction[['No of Yrs', 'Entry', 'dead']]
cph_construction = CoxPHFitter()
cph_construction.fit(data_construction, 'No of Yrs', event_col='dead')
cph_construction.print_summary()
# --- Special FX department: Kaplan-Meier, Nelson-Aalen and Cox PH outputs ---
special_fx = pd.read_csv('Stats/Special_FX_df.csv')
kmf_special_fx = KaplanMeierFitter()
special_fx.shape
# Build the event flag passed to the fitters as `event_observed`:
# 0 where 'dropout year' is 0.0, 1 where it is positive.
# NOTE(review): rows with a missing 'dropout year' match neither mask and are
# not assigned a flag here -- confirm none exist in this CSV.
special_fx.loc[special_fx['dropout year'] == 0.0, 'dead'] = 0
special_fx.loc[special_fx['dropout year'] > 0, 'dead'] = 1
# drop null values from No of Yrs col
special_fx = special_fx[special_fx['No of Yrs'].notna()]
kmf_special_fx.fit(
    durations=special_fx['No of Yrs'],
    event_observed=special_fx['dead'],
    label="Special FX",
)
# The bare attribute expressions below only produce output when evaluated
# interactively (e.g. as the last line of a notebook cell).
kmf_special_fx.event_table
kmf_special_fx.survival_function_
kmf_special_fx.median_survival_time_
# Fixed: this line used to inspect kmf_p_production (copy-paste slip).
kmf_special_fx.confidence_interval_
kmf_special_fx.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
# Fixed: `plt.show` without parentheses only referenced the function and
# never rendered the figure.
plt.show()
kmf_special_fx.confidence_interval_
# Probability of leaving:
kmf_special_fx.cumulative_density_
kmf_special_fx.plot_cumulative_density()
# Cumulative hazard estimate for the same durations / event flags.
naf_special_fx = NelsonAalenFitter()
naf_special_fx.fit(special_fx['No of Yrs'], event_observed=special_fx['dead'], label="Special FX")
# Fixed: inspect this department's fitter instead of the unrelated `naf`.
naf_special_fx.cumulative_hazard_
naf_special_fx.plot_cumulative_hazard()
# Trainee prog col dropped due to convergence issue: low incidence
data_special_fx = special_fx[['No of Yrs', 'Entry', 'Gender', 'Rural', 'Has Consistent Role', 'dead']]
# Cox proportional-hazards fit: duration is 'No of Yrs', event flag is 'dead'.
cph_special_fx = CoxPHFitter()
cph_special_fx.fit(data_special_fx, 'No of Yrs', event_col='dead')
cph_special_fx.print_summary()
# Test
# Overlay the survival function of every department on a single axes and
# save the figure to Stats/Outputs/dept_kmf_estimates_3.png.
fig1 = plt.figure()
# The fifteen former plt.subplot(211) calls all used identical arguments and
# therefore referred to one and the same Axes; a single handle is sufficient.
overlay_axes = plt.subplot(211)
# Plotting order is kept exactly as before because it drives the colour
# cycle and the legend order.
# NOTE(review): pandas may ignore `label=` for DataFrame.plot and use the
# column name instead -- confirm the legend text is what is intended.
for dept_kmf, dept_label in [
    (kmf_sound, "sound"),
    (kmf_camera, "camera"),
    (kmf_producer, "producer"),
    (kmf_art, "art"),
    (kmf_direction, "direction"),
    (kmf_p_production, "post production"),
    (kmf_hair_make_up, "hair & make-up"),
    (kmf_costume, "costume"),
    (kmf_script, "script"),
    (kmf_support, "support"),
    (kmf_casting, "casting"),
    (kmf_production, "production"),
    (kmf_special_fx, "special fx"),
    (kmf_construction, "construction"),
    (kmf_music, "music"),
]:
    dept_kmf.survival_function_.plot(ax=overlay_axes, label=dept_label)
fig1.set_size_inches(12, 10)
plt.ylabel("Probability of Production freelancer still in FB")
######
plt.tight_layout()
plt.savefig("Stats/Outputs/dept_kmf_estimates_3.png", facecolor='#ffffff')
# Fixed: `plt.show` without parentheses only referenced the function.
# The figure is saved first so the file is written before it is displayed.
plt.show()
# Version 2
# Test
# Two stacked panels: eight departments overlaid on the top axes and the
# remaining seven on the bottom axes.
fig1 = plt.figure()
# Repeated plt.subplot calls with identical arguments return the same Axes,
# so one handle per panel replaces the former ax1..ax15 aliases. The bottom
# panel is created last and is therefore the current axes for plt.ylabel.
top_axes = plt.subplot(211)
bottom_axes = plt.subplot(212)
# Plotting order is unchanged; it drives colour cycling and legend order.
for dept_kmf, dept_label, dept_axes in [
    (kmf_production, "production", top_axes),
    (kmf_direction, "direction", top_axes),
    (kmf_art, "art", top_axes),
    (kmf_camera, "camera", top_axes),
    (kmf_producer, "producer", top_axes),
    (kmf_p_production, "post production", top_axes),
    (kmf_sound, "sound", top_axes),
    (kmf_hair_make_up, "hair & make-up", top_axes),
    (kmf_costume, "costume", bottom_axes),
    (kmf_music, "music", bottom_axes),
    (kmf_support, "support", bottom_axes),
    (kmf_script, "script", bottom_axes),
    (kmf_casting, "casting", bottom_axes),
    (kmf_construction, "construction", bottom_axes),
    (kmf_special_fx, "special fx", bottom_axes),
]:
    dept_kmf.survival_function_.plot(ax=dept_axes, label=dept_label)
fig1.set_size_inches(12, 8)
plt.ylabel("Probability of Production freelancer still in FB")
# Fixed: `plt.show` without parentheses only referenced the function.
plt.show()
# Overlay the survival functions of the first eight departments on one axes.
fig1 = plt.figure()
# The eight former plt.subplot(211) calls used identical arguments and so
# all referred to the same Axes; one handle is sufficient.
overlay_axes = plt.subplot(211)
# Plotting order is unchanged; it drives colour cycling and legend order.
for dept_kmf, dept_label in [
    (kmf_production, "production"),
    (kmf_direction, "direction"),
    (kmf_art, "art"),
    (kmf_camera, "camera"),
    (kmf_producer, "producer"),
    (kmf_p_production, "post production"),
    (kmf_sound, "sound"),
    (kmf_hair_make_up, "hair & make-up"),
]:
    dept_kmf.survival_function_.plot(ax=overlay_axes, label=dept_label)
fig1.set_size_inches(12, 8)
plt.ylabel("Probability of Production freelancer still in FB")
# Fixed: `plt.show` without parentheses only referenced the function.
plt.show()
# Overlay the survival functions of the remaining seven departments on one axes.
fig1 = plt.figure()
# The seven former plt.subplot(211) calls used identical arguments and so
# all referred to the same Axes; one handle is sufficient.
overlay_axes = plt.subplot(211)
# Plotting order is unchanged; it drives colour cycling and legend order.
for dept_kmf, dept_label in [
    (kmf_costume, "costume"),
    (kmf_music, "music"),
    (kmf_support, "support"),
    (kmf_script, "script"),
    (kmf_casting, "casting"),
    (kmf_construction, "construction"),
    (kmf_special_fx, "special fx"),
]:
    dept_kmf.survival_function_.plot(ax=overlay_axes, label=dept_label)
fig1.set_size_inches(12, 8)
plt.ylabel("Probability of Production freelancer still in FB")
# Fixed: `plt.show` without parentheses only referenced the function.
plt.show()
# Two stacked panels: production fitter on top, direction fitter below.
fig = plt.figure()
ax1, ax2 = (plt.subplot(2, 1, panel_row) for panel_row in (1, 2))
for panel_kmf, panel_axes, panel_label in (
    (kmf_production, ax1, "production"),
    (kmf_direction, ax2, "direction"),
):
    panel_kmf.plot(ax=panel_axes, label=panel_label)
# 3x3 grid with one fitter plot per cell for nine departments, saved to
# Stats/Outputs/dept_kmf_estimates_1.png.
fig = plt.figure()
# Cells are created in order 1..9 of the 3x3 grid before anything is drawn.
ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9 = (
    plt.subplot(3, 3, grid_cell) for grid_cell in range(1, 10)
)
grid_panels = (
    (kmf_production, ax1, "production"),
    (kmf_direction, ax2, "direction"),
    (kmf_art, ax3, "art"),
    (kmf_camera, ax4, "camera"),
    (kmf_p_production, ax5, "Post Production"),
    (kmf_producer, ax6, "Producer"),
    (kmf_sound, ax7, "Sound"),
    (kmf_hair_make_up, ax8, "Hair & Make-Up"),
    (kmf_costume, ax9, "costume"),
)
for panel_kmf, panel_axes, panel_label in grid_panels:
    panel_kmf.plot(ax=panel_axes, label=panel_label)
plt.tight_layout()
plt.savefig("Stats/Outputs/dept_kmf_estimates_1.png", facecolor='#ffffff')
# First six cells of a 3x3 grid, one fitter plot per cell for the remaining
# departments, saved to Stats/Outputs/dept_kmf_estimates_2.png.
fig = plt.figure()
# Cells are created in order 1..6 of the 3x3 grid before anything is drawn.
ax1, ax2, ax3, ax4, ax5, ax6 = (
    plt.subplot(3, 3, grid_cell) for grid_cell in range(1, 7)
)
grid_panels = (
    (kmf_music, ax1, "music"),
    (kmf_support, ax2, "support"),
    (kmf_script, ax3, "script"),
    (kmf_casting, ax4, "casting"),
    (kmf_construction, ax5, "construction"),
    (kmf_special_fx, ax6, "special fx"),
)
for panel_kmf, panel_axes, panel_label in grid_panels:
    panel_kmf.plot(ax=panel_axes, label=panel_label)
plt.tight_layout()
plt.savefig("Stats/Outputs/dept_kmf_estimates_2.png", facecolor='#ffffff')