In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

from lifelines import KaplanMeierFitter
from lifelines import CoxPHFitter
from lifelines.statistics import logrank_test
from lifelines import NelsonAalenFitter
In [2]:
plt.rcParams['figure.figsize'] = [12, 6]
In [3]:
df = pd.read_csv('Outputs/CSV/df_for_statistical_analysis.csv')

Data Prep for Statistical Analysis

In [4]:
years = df.loc[:,'1976':'2020']
years
df['Entry'] = years.min(axis=1)
In [5]:
df.dropna(subset=['Entry'], inplace=True)
In [6]:
# Replace empty cells in trainee prog with 0
df['Trainee prog'] = df['Trainee prog'].fillna(0)
In [7]:
# replace dates with 1 in trainee col 
df.loc[df['Trainee prog'] > 0, 'Trainee prog'] = 1
In [8]:
# Replace gender col values with 0, 1, 2
df.loc[df['Gender']== 'Male', 'Gender'] = 0
df.loc[df['Gender']== 'Female', 'Gender'] = 1
df.loc[df['Gender']== 'Unknown', 'Gender'] = 2
In [9]:
# Convert consistent role
# 1 = True
# 0 = False
# 2 = Ambiguous: producer/director situation 
df.loc[df['Has Consistent Role'] == True, 'Has Consistent Role'] = 1
df.loc[df['Has Consistent Role'] == False, 'Has Consistent Role'] = 2
df.loc[df['Has Consistent Role'].isnull(), 'Has Consistent Role'] = 0
In [10]:
df['Has Consistent Role'].value_counts()
Out[10]:
1    3089
0     469
2      72
Name: Has Consistent Role, dtype: int64
In [11]:
df.loc[df['Rural'] == True, 'Rural'] = 1
df.loc[df['Rural'].isnull(), 'Rural'] = 0
In [12]:
dept_list = df['Role 1 Category'].value_counts().index.tolist()
In [13]:
# create distinct dfs for each dept 
for i in dept_list:
    filt = (df['Role 1 Category'] == i)
    df_export = df[filt]
    name = i.replace(' ', '_')
    df_export.to_csv(f'Stats/{i}_df.csv', index=False)
In [14]:
# Assign integers for role cat values
#df.loc[df['Role 1 Category'] == 'Production', 'Role 1 Category'] = 0
for index, val in enumerate(dept_list):
    df.loc[df['Role 1 Category'] == val, 'Role 1 Category'] = index
In [15]:
#df['Role 1 Category'] = df['Role 1 Category'].dropna()
df.dropna(subset=['Role 1 Category'], inplace=True)
In [16]:
df['Role 1 Category'].isnull().value_counts()
Out[16]:
False    3629
Name: Role 1 Category, dtype: int64
In [17]:
# drop useless cols 
#df = df.drop(['Role 1 Category', 'Role 2 Category', 'Role 3 Category'], axis=1)
df = df.drop(df.loc[:,'1976':'2020'].columns, axis = 1)

Analysis

In [18]:
df['Gender'].hist()
plt.title("Gender")
plt.tight_layout()
plt.savefig(f"Stats/Outputs/gender_hist.png", facecolor='#ffffff')
In [19]:
df['Rural'].hist()
plt.title("Rural")
plt.tight_layout()
plt.savefig(f"Stats/Outputs/rural_hist.png", facecolor='#ffffff')
In [20]:
df.columns
Out[20]:
Index(['UUID', 'Trainee prog', 'Gender', 'Role 1 Category', 'Role 2 Category',
       'Role 3 Category', 'Has Consistent Role', 'Postcode1', 'Rural',
       'No of Yrs', 'multi dropouts', 'dropout year', 'Entry'],
      dtype='object')
In [21]:
df['Has Consistent Role'].hist()
plt.title("Role Consistency")
plt.tight_layout()
plt.savefig(f"Stats/Outputs/consistent_hist.png", facecolor='#ffffff')
In [22]:
df['Trainee prog'].hist()
plt.title("Trainee Program")
plt.tight_layout()
plt.savefig(f"Stats/Outputs/trainee_hist.png", facecolor='#ffffff')
In [23]:
df['Trainee prog'].value_counts()
Out[23]:
0.0    3513
1.0     116
Name: Trainee prog, dtype: int64
In [24]:
df.columns
Out[24]:
Index(['UUID', 'Trainee prog', 'Gender', 'Role 1 Category', 'Role 2 Category',
       'Role 3 Category', 'Has Consistent Role', 'Postcode1', 'Rural',
       'No of Yrs', 'multi dropouts', 'dropout year', 'Entry'],
      dtype='object')
In [25]:
df['Role 1 Category'].hist()
plt.title("Role 1 Category")
plt.tight_layout()
plt.savefig(f"Stats/Outputs/role_hist.png", facecolor='#ffffff')
In [26]:
#Year of entry to Film Bang
df['Entry'].hist()
plt.title("Entry")
plt.tight_layout()
plt.savefig(f"Stats/Outputs/entry_hist.png", facecolor='#ffffff')

Kaplan-Meier Estimage for Entire Dataset

Investigating the longevity of freelancers in the Film Bang database. Longevity here concerns the number of years a freelancer listed in the directry. The birth event is their first entry in the database, and the death event is the retirement of that individual from the directory. Censoring can occur if they are a) still in the directory at the time of dataset compilation (2020).

In [27]:
kmf = KaplanMeierFitter()
In [28]:
df.loc[df['dropout year'] == 0.0, 'dead'] = 0
df.loc[df['dropout year'] > 0, 'dead'] = 1
In [29]:
# drop null values from No of Yrs col
df = df[df['No of Yrs'].notna()]
In [30]:
kmf.fit(durations = df['No of Yrs'],event_observed = df['dead'], label="Film Bang Listings")
Out[30]:
<lifelines.KaplanMeierFitter:"Film Bang Listings", fitted with 3629 total observations, 525 right-censored observations>
In [31]:
kmf.event_table
Out[31]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 3629 3629
1.0 1119 948 171 0 3629
2.0 529 464 65 0 2510
3.0 355 330 25 0 1981
4.0 247 220 27 0 1626
5.0 217 198 19 0 1379
6.0 128 108 20 0 1162
7.0 121 112 9 0 1034
8.0 101 83 18 0 913
9.0 96 86 10 0 812
10.0 85 74 11 0 716
11.0 68 57 11 0 631
12.0 54 47 7 0 563
13.0 54 48 6 0 509
14.0 47 34 13 0 455
15.0 51 46 5 0 408
16.0 31 26 5 0 357
17.0 35 27 8 0 326
18.0 31 22 9 0 291
19.0 27 21 6 0 260
20.0 26 18 8 0 233
21.0 27 21 6 0 207
22.0 30 22 8 0 180
23.0 14 8 6 0 150
24.0 21 13 8 0 136
25.0 19 8 11 0 115
26.0 17 12 5 0 96
27.0 9 5 4 0 79
28.0 11 9 2 0 70
29.0 14 8 6 0 59
30.0 10 7 3 0 45
31.0 4 3 1 0 35
32.0 6 5 1 0 31
33.0 4 3 1 0 25
34.0 7 4 3 0 21
35.0 7 3 4 0 14
36.0 3 2 1 0 7
37.0 2 1 1 0 4
38.0 1 1 0 0 2
41.0 1 0 1 0 1
In [32]:
kmf.survival_function_
Out[32]:
Film Bang Listings
timeline
0.0 1.000000
1.0 0.738771
2.0 0.602201
3.0 0.501885
4.0 0.433979
5.0 0.371668
6.0 0.337124
7.0 0.300607
8.0 0.273279
9.0 0.244336
10.0 0.219083
11.0 0.199293
12.0 0.182656
13.0 0.165431
14.0 0.153069
15.0 0.135811
16.0 0.125920
17.0 0.115491
18.0 0.106760
19.0 0.098137
20.0 0.090556
21.0 0.081369
22.0 0.071424
23.0 0.067614
24.0 0.061151
25.0 0.056897
26.0 0.049785
27.0 0.046634
28.0 0.040638
29.0 0.035128
30.0 0.029664
31.0 0.027121
32.0 0.022747
33.0 0.020017
34.0 0.016204
35.0 0.012732
36.0 0.009094
37.0 0.006821
38.0 0.003410
41.0 0.003410
In [33]:
kmf.confidence_interval_
Out[33]:
Film Bang Listings_lower_0.95 Film Bang Listings_upper_0.95
0.0 1.000000 1.000000
1.0 0.724158 0.752747
2.0 0.585818 0.618163
3.0 0.485050 0.518469
4.0 0.417233 0.450599
5.0 0.355275 0.388053
6.0 0.321047 0.353262
7.0 0.284959 0.316394
8.0 0.258031 0.288726
9.0 0.229577 0.259358
10.0 0.204826 0.233665
11.0 0.185478 0.213482
12.0 0.169249 0.196481
13.0 0.152489 0.178839
14.0 0.140492 0.166149
15.0 0.123777 0.148403
16.0 0.114227 0.138204
17.0 0.104182 0.127428
18.0 0.095788 0.118393
19.0 0.087511 0.109461
20.0 0.080251 0.101594
21.0 0.071472 0.092047
22.0 0.062009 0.081679
23.0 0.058389 0.077706
24.0 0.052256 0.070964
25.0 0.048219 0.066533
26.0 0.041454 0.059164
27.0 0.038465 0.055899
28.0 0.032803 0.049673
29.0 0.027667 0.043899
30.0 0.022586 0.038202
31.0 0.020239 0.035554
32.0 0.016271 0.030944
33.0 0.013848 0.028032
34.0 0.010545 0.023912
35.0 0.007565 0.020272
36.0 0.004379 0.017117
37.0 0.002593 0.015334
38.0 0.000507 0.014214
41.0 0.000507 0.014214
In [34]:
kmf.median_survival_time_
Out[34]:
4.0

Kaplan-Meier Plot

In [35]:
kmf.plot_survival_function(at_risk_counts=True, label="Listings in Film Bang")
plt.title("Kaplan-Meier Estimate")
# plt.ylabel("Probability of freelancer still in FB")
# plt.xlabel("Years")
plt.tight_layout()
plt.savefig(f"Stats/Outputs/kaplan_meier_estimate_fb_all.png", facecolor='#ffffff')
plt.show
Out[35]:
<function matplotlib.pyplot.show(*args, **kw)>

The y-axis represents the probability a freelancer is still around after 𝑡t years, where 𝑡t years is on the x-axis. We see that very few freelancers make it past 20 years in the listings.

In [36]:
kmf.confidence_interval_
Out[36]:
Film Bang Listings_lower_0.95 Film Bang Listings_upper_0.95
0.0 1.000000 1.000000
1.0 0.724158 0.752747
2.0 0.585818 0.618163
3.0 0.485050 0.518469
4.0 0.417233 0.450599
5.0 0.355275 0.388053
6.0 0.321047 0.353262
7.0 0.284959 0.316394
8.0 0.258031 0.288726
9.0 0.229577 0.259358
10.0 0.204826 0.233665
11.0 0.185478 0.213482
12.0 0.169249 0.196481
13.0 0.152489 0.178839
14.0 0.140492 0.166149
15.0 0.123777 0.148403
16.0 0.114227 0.138204
17.0 0.104182 0.127428
18.0 0.095788 0.118393
19.0 0.087511 0.109461
20.0 0.080251 0.101594
21.0 0.071472 0.092047
22.0 0.062009 0.081679
23.0 0.058389 0.077706
24.0 0.052256 0.070964
25.0 0.048219 0.066533
26.0 0.041454 0.059164
27.0 0.038465 0.055899
28.0 0.032803 0.049673
29.0 0.027667 0.043899
30.0 0.022586 0.038202
31.0 0.020239 0.035554
32.0 0.016271 0.030944
33.0 0.013848 0.028032
34.0 0.010545 0.023912
35.0 0.007565 0.020272
36.0 0.004379 0.017117
37.0 0.002593 0.015334
38.0 0.000507 0.014214
41.0 0.000507 0.014214
In [37]:
# Probability of leaving:
kmf.cumulative_density_
Out[37]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.261229
2.0 0.397799
3.0 0.498115
4.0 0.566021
5.0 0.628332
6.0 0.662876
7.0 0.699393
8.0 0.726721
9.0 0.755664
10.0 0.780917
11.0 0.800707
12.0 0.817344
13.0 0.834569
14.0 0.846931
15.0 0.864189
16.0 0.874080
17.0 0.884509
18.0 0.893240
19.0 0.901863
20.0 0.909444
21.0 0.918631
22.0 0.928576
23.0 0.932386
24.0 0.938849
25.0 0.943103
26.0 0.950215
27.0 0.953366
28.0 0.959362
29.0 0.964872
30.0 0.970336
31.0 0.972879
32.0 0.977253
33.0 0.979983
34.0 0.983796
35.0 0.987268
36.0 0.990906
37.0 0.993179
38.0 0.996590
41.0 0.996590
In [38]:
kmf.plot_cumulative_density()
plt.tight_layout()
plt.savefig(f"Stats/Outputs/kaplan_meier_cumulative_density_fb_all.png", facecolor='#ffffff')
In [39]:
# Hazard Function
In [40]:
naf = NelsonAalenFitter()
naf.fit(df['No of Yrs'], event_observed = df['dead'], label="Film Bang Listings")
Out[40]:
<lifelines.NelsonAalenFitter:"Film Bang Listings", fitted with 3629 total observations, 525 right-censored observations>
In [41]:
naf.cumulative_hazard_
Out[41]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [42]:
naf.plot_cumulative_hazard()
plt.tight_layout()
plt.savefig(f"Stats/Outputs/naf_cumulative_hazard_fb_all.png", facecolor='#ffffff')
In [43]:
df['Role 1 Category'].isnull().value_counts()
Out[43]:
False    3629
Name: Role 1 Category, dtype: int64
In [44]:
# TO DO: REMOVE PARAMETERS WITH MORE THAN 2 VARIABLES - ENTRY AND ROLE
data = df[['No of Yrs', 'Entry','Role 1 Category','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]
In [45]:
cph = CoxPHFitter()
cph.fit(data, 'No of Yrs',event_col='dead')
cph.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 3629
number of events observed 3104
partial log-likelihood -22442.48
time fit was run 2021-07-01 14:32:46 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.02 1.02 0.00 0.02 0.03 1.02 1.03 11.21 <0.005 94.54
Role 1 Category 0.02 1.02 0.01 0.01 0.03 1.01 1.03 3.52 <0.005 11.19
Trainee prog -0.31 0.74 0.10 -0.51 -0.10 0.60 0.90 -2.95 <0.005 8.29
Gender 0.11 1.12 0.03 0.05 0.18 1.05 1.19 3.44 <0.005 10.77
Rural 0.07 1.08 0.09 -0.09 0.24 0.91 1.27 0.87 0.39 1.37
Has Consistent Role 0.24 1.28 0.04 0.16 0.33 1.17 1.39 5.55 <0.005 25.07

Concordance 0.59
Partial AIC 44896.96
log-likelihood ratio test 215.44 on 6 df
-log2(p) of ll-ratio test 142.88
In [46]:
cph.plot()
plt.tight_layout()
plt.savefig(f"Stats/Outputs/cph_plot_fb_all.png", facecolor='#ffffff')

HR greater than 1 means that as the vlaue of the covariate increases, the event hazard increases HR = 1 : no effect HR < 1 : Reduction in the hazard HR > 1 : Increase in hazard

In [47]:
# survival probability for different people in our dataset
d_data = data.iloc[[0,3,4,6]]
cph.predict_survival_function(d_data).plot()
plt.tight_layout()
plt.savefig(f"Stats/Outputs/cph_survival_prediction_4_individuals_fb_all.png", facecolor='#ffffff')
In [48]:
# median time to event for timeline
CTE = kmf.conditional_time_to_event_
plt.plot(CTE)
plt.tight_layout()
plt.savefig(f"Stats/Outputs/kmf_conditional_time_to_event_medians_fb_all.png", facecolor='#ffffff')

Analysis of above chart:

Not sure yet...

Gender - Kaplan-Meier Estimate

In [49]:
# create 3 kmf objects
kmf_m = KaplanMeierFitter()
kmf_f = KaplanMeierFitter()
kmf_u = KaplanMeierFitter()
In [50]:
Male = df.query("Gender == 0")
Female = df.query("Gender == 1")
Unknown = df.query("Gender == 2")
In [51]:
kmf_m.fit(durations = Male["No of Yrs"],event_observed = Male['dead'],label="Male")
kmf_f.fit(durations = Female["No of Yrs"],event_observed = Female['dead'],label="Female")
kmf_u.fit(durations = Unknown["No of Yrs"],event_observed = Unknown['dead'],label="Unknown")
Out[51]:
<lifelines.KaplanMeierFitter:"Unknown", fitted with 108 total observations, 19 right-censored observations>
In [52]:
kmf_m.event_table
Out[52]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 1981 1981
1.0 589 497 92 0 1981
2.0 283 250 33 0 1392
3.0 176 166 10 0 1109
4.0 130 115 15 0 933
5.0 110 101 9 0 803
6.0 75 62 13 0 693
7.0 62 57 5 0 618
8.0 64 53 11 0 556
9.0 57 54 3 0 492
10.0 51 45 6 0 435
11.0 40 31 9 0 384
12.0 32 27 5 0 344
13.0 27 22 5 0 312
14.0 22 14 8 0 285
15.0 31 28 3 0 263
16.0 21 18 3 0 232
17.0 26 20 6 0 211
18.0 20 13 7 0 185
19.0 17 12 5 0 165
20.0 16 12 4 0 148
21.0 18 13 5 0 132
22.0 17 12 5 0 114
23.0 7 5 2 0 97
24.0 10 6 4 0 90
25.0 10 5 5 0 80
26.0 10 7 3 0 70
27.0 9 5 4 0 60
28.0 9 8 1 0 51
29.0 8 6 2 0 42
30.0 8 5 3 0 34
31.0 4 3 1 0 26
32.0 5 4 1 0 22
33.0 2 2 0 0 17
34.0 5 3 2 0 15
35.0 4 2 2 0 10
36.0 3 2 1 0 6
37.0 1 0 1 0 3
38.0 1 1 0 0 2
41.0 1 0 1 0 1
In [53]:
kmf_f.event_table
Out[53]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 1540 1540
1.0 481 410 71 0 1540
2.0 227 199 28 0 1059
3.0 168 154 14 0 832
4.0 109 98 11 0 664
5.0 105 96 9 0 555
6.0 50 43 7 0 450
7.0 59 55 4 0 400
8.0 34 27 7 0 341
9.0 35 29 6 0 307
10.0 34 29 5 0 272
11.0 26 24 2 0 238
12.0 21 19 2 0 212
13.0 27 26 1 0 191
14.0 24 19 5 0 164
15.0 19 17 2 0 140
16.0 10 8 2 0 121
17.0 9 7 2 0 111
18.0 11 9 2 0 102
19.0 10 9 1 0 91
20.0 10 6 4 0 81
21.0 7 7 0 0 71
22.0 12 10 2 0 64
23.0 7 3 4 0 52
24.0 11 7 4 0 45
25.0 8 3 5 0 34
26.0 7 5 2 0 26
28.0 2 1 1 0 19
29.0 6 2 4 0 17
30.0 2 2 0 0 11
32.0 1 1 0 0 9
33.0 2 1 1 0 8
34.0 2 1 1 0 6
35.0 3 1 2 0 4
37.0 1 1 0 0 1
In [54]:
kmf_u.event_table
Out[54]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 108 108
1.0 49 41 8 0 108
2.0 19 15 4 0 59
3.0 11 10 1 0 40
4.0 8 7 1 0 29
5.0 2 1 1 0 21
6.0 3 3 0 0 19
8.0 3 3 0 0 16
9.0 4 3 1 0 13
11.0 2 2 0 0 9
12.0 1 1 0 0 7
14.0 1 1 0 0 6
15.0 1 1 0 0 5
21.0 2 1 1 0 4
22.0 1 0 1 0 2
25.0 1 0 1 0 1
In [55]:
kmf_m.survival_function_
Out[55]:
Male
timeline
0.0 1.000000
1.0 0.749117
2.0 0.614577
3.0 0.522584
4.0 0.458172
5.0 0.400543
6.0 0.364708
7.0 0.331070
8.0 0.299511
9.0 0.266638
10.0 0.239055
11.0 0.219756
12.0 0.202508
13.0 0.188229
14.0 0.178982
15.0 0.159927
16.0 0.147519
17.0 0.133536
18.0 0.124152
19.0 0.115123
20.0 0.105789
21.0 0.095370
22.0 0.085331
23.0 0.080933
24.0 0.075537
25.0 0.070816
26.0 0.063735
27.0 0.058423
28.0 0.049259
29.0 0.042222
30.0 0.036013
31.0 0.031857
32.0 0.026065
33.0 0.022999
34.0 0.018399
35.0 0.014719
36.0 0.009813
37.0 0.009813
38.0 0.004906
41.0 0.004906
In [56]:
kmf_f.survival_function_
Out[56]:
Female
timeline
0.0 1.000000
1.0 0.733766
2.0 0.595882
3.0 0.485586
4.0 0.413919
5.0 0.342322
6.0 0.309611
7.0 0.267040
8.0 0.245896
9.0 0.222668
10.0 0.198927
11.0 0.178868
12.0 0.162837
13.0 0.140671
14.0 0.124373
15.0 0.109271
16.0 0.102046
17.0 0.095611
18.0 0.087175
19.0 0.078553
20.0 0.072734
21.0 0.065563
22.0 0.055319
23.0 0.052128
24.0 0.044019
25.0 0.040135
26.0 0.032417
28.0 0.030710
29.0 0.027097
30.0 0.022171
32.0 0.019707
33.0 0.017244
34.0 0.014370
35.0 0.010777
37.0 0.000000
In [57]:
kmf_u.survival_function_
Out[57]:
Unknown
timeline
0.0 1.000000
1.0 0.620370
2.0 0.462649
3.0 0.346987
4.0 0.263231
5.0 0.250697
6.0 0.211113
8.0 0.171529
9.0 0.131946
11.0 0.102624
12.0 0.087964
14.0 0.073303
15.0 0.058642
21.0 0.043982
22.0 0.043982
25.0 0.043982
In [58]:
kmf_m.plot()
kmf_f.plot()
kmf_u.plot()
plt.xlabel("Years Passed")
plt.ylabel("Longevity")
plt.title("Film Bang - Gender KMF")
plt.tight_layout()
plt.savefig(f"Stats/Outputs/kaplan_meier_fb_all_gender.png", facecolor='#ffffff')
In [59]:
kmf_m.plot_survival_function(at_risk_counts=True, label="Male")

plt.xlabel("Years Passed")
plt.ylabel("Longevity")
plt.title("Film Bang - Gender KMF")
plt.tight_layout()
plt.savefig("Stats/Outputs/kaplan_meier_fb_gender_male.png", facecolor='#ffffff')
In [60]:
kmf_f.plot_survival_function(at_risk_counts=True, label="Female")

plt.xlabel("Years Passed")
plt.ylabel("Longevity")
plt.title("Film Bang - Gender KMF")
plt.tight_layout()
plt.savefig("Stats/Outputs/kaplan_meier_fb_gender_female.png", facecolor='#ffffff')
In [61]:
kmf_u.plot_survival_function(at_risk_counts=True, label="Gender Unknown")

plt.xlabel("Years Passed")
plt.ylabel("Longevity")
plt.title("Film Bang - Gender KMF")
plt.tight_layout()
plt.savefig("Stats/Outputs/kaplan_meier_fb_gender_unknown.png", facecolor='#ffffff')
In [62]:
kmf_m.cumulative_density_
Out[62]:
Male
timeline
0.0 0.000000
1.0 0.250883
2.0 0.385423
3.0 0.477416
4.0 0.541828
5.0 0.599457
6.0 0.635292
7.0 0.668930
8.0 0.700489
9.0 0.733362
10.0 0.760945
11.0 0.780244
12.0 0.797492
13.0 0.811771
14.0 0.821018
15.0 0.840073
16.0 0.852481
17.0 0.866464
18.0 0.875848
19.0 0.884877
20.0 0.894211
21.0 0.904630
22.0 0.914669
23.0 0.919067
24.0 0.924463
25.0 0.929184
26.0 0.936265
27.0 0.941577
28.0 0.950741
29.0 0.957778
30.0 0.963987
31.0 0.968143
32.0 0.973935
33.0 0.977001
34.0 0.981601
35.0 0.985281
36.0 0.990187
37.0 0.990187
38.0 0.995094
41.0 0.995094
In [63]:
kmf_f.cumulative_density_
Out[63]:
Female
timeline
0.0 0.000000
1.0 0.266234
2.0 0.404118
3.0 0.514414
4.0 0.586081
5.0 0.657678
6.0 0.690389
7.0 0.732960
8.0 0.754104
9.0 0.777332
10.0 0.801073
11.0 0.821132
12.0 0.837163
13.0 0.859329
14.0 0.875627
15.0 0.890729
16.0 0.897954
17.0 0.904389
18.0 0.912825
19.0 0.921447
20.0 0.927266
21.0 0.934437
22.0 0.944681
23.0 0.947872
24.0 0.955981
25.0 0.959865
26.0 0.967583
28.0 0.969290
29.0 0.972903
30.0 0.977829
32.0 0.980293
33.0 0.982756
34.0 0.985630
35.0 0.989223
37.0 1.000000
In [64]:
kmf_u.cumulative_density_
Out[64]:
Unknown
timeline
0.0 0.000000
1.0 0.379630
2.0 0.537351
3.0 0.653013
4.0 0.736769
5.0 0.749303
6.0 0.788887
8.0 0.828471
9.0 0.868054
11.0 0.897376
12.0 0.912036
14.0 0.926697
15.0 0.941358
21.0 0.956018
22.0 0.956018
25.0 0.956018
In [65]:
# Gender Cumulative Density Plot
kmf_m.plot_cumulative_density()
kmf_f.plot_cumulative_density()
kmf_u.plot_cumulative_density()
plt.tight_layout()
plt.savefig("Stats/Outputs/kaplan_meier_cumulative_density_gender.png", facecolor='#ffffff')

Gender Hazard Function

In [66]:
naf_m = NelsonAalenFitter()
naf_f = NelsonAalenFitter()
naf_u = NelsonAalenFitter()

naf_m.fit(Male['No of Yrs'],event_observed = Male['dead'], label="Male")
naf_f.fit(Female['No of Yrs'],event_observed = Female['dead'],label="Female")
naf_u.fit(Unknown['No of Yrs'],event_observed = Unknown['dead'],label="Unknown")
Out[66]:
<lifelines.NelsonAalenFitter:"Unknown", fitted with 108 total observations, 19 right-censored observations>
In [67]:
naf_m.cumulative_hazard_
Out[67]:
Male
timeline
0.0 0.000000
1.0 0.288776
2.0 0.486658
3.0 0.648726
4.0 0.780194
5.0 0.914526
6.0 1.008179
7.0 1.104864
8.0 1.204948
9.0 1.321082
10.0 1.430149
11.0 1.514209
12.0 1.595826
13.0 1.668826
14.0 1.719106
15.0 1.831449
16.0 1.912029
17.0 2.011366
18.0 2.084023
19.0 2.159294
20.0 2.243554
21.0 2.346819
22.0 2.457531
23.0 2.510174
24.0 2.578771
25.0 2.642895
26.0 2.747466
27.0 2.833724
28.0 3.002539
29.0 3.154722
30.0 3.311279
31.0 3.431407
32.0 3.627112
33.0 3.748435
34.0 3.963454
35.0 4.174565
36.0 4.541232
37.0 4.541232
38.0 5.041232
41.0 5.041232
In [68]:
naf_m.plot_cumulative_hazard()
naf_f.plot_cumulative_hazard()
naf_u.plot_cumulative_hazard()
plt.tight_layout()
plt.savefig("Stats/Outputs/naf_cumulative_hazard_gender.png", facecolor='#ffffff')

Gender - Log Rank Test

In [69]:
T1=Male['No of Yrs']
E1=Male['dead']
T2=Female['No of Yrs']
E2=Female['dead']
# T3=Unknown['No of Yrs']
# E3=Unknown['dead']
In [70]:
T1.describe()
Out[70]:
count    1981.000000
mean        6.267037
std         7.229071
min         1.000000
25%         1.000000
50%         3.000000
75%         8.000000
max        41.000000
Name: No of Yrs, dtype: float64
In [71]:
T2.describe()
Out[71]:
count    1540.000000
mean        5.368182
std         6.227141
min         1.000000
25%         1.000000
50%         3.000000
75%         7.000000
max        37.000000
Name: No of Yrs, dtype: float64
In [72]:
results=logrank_test(T1,T2,event_observed_A=E1, event_observed_B=E2)
results.print_summary()
t_0 -1
null_distribution chi squared
degrees_of_freedom 1
test_name logrank_test
test_statistic p -log2(p)
0 13.63 <0.005 12.13
In [73]:
# p value of <0.005 means gender is associated with longevity. 

Has Consistent Role

In [74]:
kmf_is = KaplanMeierFitter()
kmf_isnt = KaplanMeierFitter()
kmf_dp = KaplanMeierFitter()
In [75]:
df = df.rename(columns={"Has Consistent Role": "Consistent"})
In [76]:
df['Consistent']
Out[76]:
0       1
1       1
2       1
3       1
4       1
       ..
3629    1
3630    2
3631    1
3632    1
3633    1
Name: Consistent, Length: 3629, dtype: object
In [77]:
Consistent = df.query("Consistent == 1")
Inconsistent = df.query("Consistent == 2")
DirProd = df.query("Consistent == 0")
In [78]:
kmf_is.fit(durations = Consistent['No of Yrs'],event_observed= Consistent['dead'], label="Consistent")
kmf_isnt.fit(durations = Inconsistent['No of Yrs'],event_observed= Inconsistent['dead'], label="Inconsistent")
kmf_dp.fit(durations = DirProd['No of Yrs'],event_observed= DirProd['dead'], label="Director/Producer")
Out[78]:
<lifelines.KaplanMeierFitter:"Director/Producer", fitted with 468 total observations, 41 right-censored observations>
In [79]:
kmf_is.event_table
Out[79]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 3089 3089
1.0 1084 913 171 0 3089
2.0 465 405 60 0 2005
3.0 301 276 25 0 1540
4.0 200 175 25 0 1239
5.0 177 159 18 0 1039
6.0 98 80 18 0 862
7.0 88 82 6 0 764
8.0 77 61 16 0 676
9.0 71 61 10 0 599
10.0 68 57 11 0 528
11.0 56 46 10 0 460
12.0 33 27 6 0 404
13.0 43 38 5 0 371
14.0 36 27 9 0 328
15.0 34 29 5 0 292
16.0 17 14 3 0 258
17.0 27 21 6 0 241
18.0 25 18 7 0 214
19.0 15 10 5 0 189
20.0 16 11 5 0 174
21.0 23 18 5 0 158
22.0 23 17 6 0 135
23.0 10 5 5 0 112
24.0 18 11 7 0 102
25.0 15 5 10 0 84
26.0 13 9 4 0 69
27.0 5 2 3 0 56
28.0 6 5 1 0 51
29.0 10 5 5 0 45
30.0 8 6 2 0 35
31.0 3 2 1 0 27
32.0 4 3 1 0 24
33.0 4 3 1 0 20
34.0 5 3 2 0 16
35.0 5 1 4 0 11
36.0 3 2 1 0 6
37.0 1 0 1 0 3
38.0 1 1 0 0 2
41.0 1 0 1 0 1
In [80]:
kmf_isnt.event_table
Out[80]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 72 72
1.0 9 9 0 0 72
2.0 8 8 0 0 63
3.0 7 7 0 0 55
4.0 7 7 0 0 48
5.0 4 4 0 0 41
6.0 2 2 0 0 37
7.0 1 1 0 0 35
8.0 4 3 1 0 34
9.0 4 4 0 0 30
10.0 3 3 0 0 26
11.0 4 3 1 0 23
12.0 1 0 1 0 19
13.0 1 1 0 0 18
14.0 2 2 0 0 17
15.0 4 4 0 0 15
16.0 3 3 0 0 11
17.0 2 2 0 0 8
19.0 1 1 0 0 6
20.0 1 1 0 0 5
21.0 2 2 0 0 4
27.0 1 1 0 0 2
28.0 1 1 0 0 1
In [81]:
kmf_dp.event_table
Out[81]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 468 468
1.0 26 26 0 0 468
2.0 56 51 5 0 442
3.0 47 47 0 0 386
4.0 40 38 2 0 339
5.0 36 35 1 0 299
6.0 28 26 2 0 263
7.0 32 29 3 0 235
8.0 20 19 1 0 203
9.0 21 21 0 0 183
10.0 14 14 0 0 162
11.0 8 8 0 0 148
12.0 20 20 0 0 140
13.0 10 9 1 0 120
14.0 9 5 4 0 110
15.0 13 13 0 0 101
16.0 11 9 2 0 88
17.0 6 4 2 0 77
18.0 6 4 2 0 71
19.0 11 10 1 0 65
20.0 9 6 3 0 54
21.0 2 1 1 0 45
22.0 7 5 2 0 43
23.0 4 3 1 0 36
24.0 3 2 1 0 32
25.0 4 3 1 0 29
26.0 4 3 1 0 25
27.0 3 2 1 0 21
28.0 4 3 1 0 18
29.0 4 3 1 0 14
30.0 2 1 1 0 10
31.0 1 1 0 0 8
32.0 2 2 0 0 7
34.0 2 1 1 0 5
35.0 2 2 0 0 3
37.0 1 1 0 0 1
In [82]:
kmf_is.survival_function_
Out[82]:
Consistent
timeline
0.0 1.000000
1.0 0.704435
2.0 0.562143
3.0 0.461395
4.0 0.396226
5.0 0.335591
6.0 0.304446
7.0 0.271770
8.0 0.247246
9.0 0.222067
10.0 0.198094
11.0 0.178285
12.0 0.166370
13.0 0.149329
14.0 0.137037
15.0 0.123427
16.0 0.116729
17.0 0.106558
18.0 0.097595
19.0 0.092431
20.0 0.086588
21.0 0.076724
22.0 0.067062
23.0 0.064068
24.0 0.057159
25.0 0.053757
26.0 0.046745
27.0 0.045075
28.0 0.040656
29.0 0.036139
30.0 0.029944
31.0 0.027726
32.0 0.024260
33.0 0.020621
34.0 0.016754
35.0 0.015231
36.0 0.010154
37.0 0.010154
38.0 0.005077
41.0 0.005077
In [83]:
kmf_isnt.survival_function_
Out[83]:
Inconsistent
timeline
0.0 1.000000
1.0 0.875000
2.0 0.763889
3.0 0.666667
4.0 0.569444
5.0 0.513889
6.0 0.486111
7.0 0.472222
8.0 0.430556
9.0 0.373148
10.0 0.330093
11.0 0.287037
12.0 0.287037
13.0 0.271091
14.0 0.239198
15.0 0.175412
16.0 0.127572
17.0 0.095679
19.0 0.079733
20.0 0.063786
21.0 0.031893
27.0 0.015947
28.0 0.000000
In [84]:
kmf_dp.survival_function_
Out[84]:
Director/Producer
timeline
0.0 1.000000
1.0 0.944444
2.0 0.835470
3.0 0.733742
4.0 0.651494
5.0 0.575232
6.0 0.518365
7.0 0.454396
8.0 0.411867
9.0 0.364603
10.0 0.333094
11.0 0.315089
12.0 0.270076
13.0 0.249821
14.0 0.238465
15.0 0.207772
16.0 0.186522
17.0 0.176833
18.0 0.166870
19.0 0.141198
20.0 0.125509
21.0 0.122720
22.0 0.108450
23.0 0.099413
24.0 0.093200
25.0 0.083558
26.0 0.073531
27.0 0.066528
28.0 0.055440
29.0 0.043560
30.0 0.039204
31.0 0.034304
32.0 0.024503
34.0 0.019602
35.0 0.006534
37.0 0.000000
In [85]:
kmf_is.plot()
kmf_isnt.plot()
kmf_dp.plot()
plt.xlabel("Years")
plt.ylabel("Longevity")
plt.title("Film Bang - Role Consistency KMF")
plt.tight_layout()
plt.savefig("Stats/Outputs/kapplan_meier_estimate_role_consistency_all.png", facecolor='#ffffff')
In [86]:
kmf_is.plot_cumulative_density()
kmf_isnt.plot_cumulative_density()
kmf_dp.plot_cumulative_density()
plt.tight_layout()
plt.savefig("Stats/Outputs/kaplan_meier_cumulative_density_role_consistency_all.png", facecolor='#ffffff')
In [87]:
naf_is = NelsonAalenFitter()
naf_isnt = NelsonAalenFitter()
naf_dp = NelsonAalenFitter()

naf_is.fit(Consistent['No of Yrs'],event_observed = Consistent['dead'])
naf_isnt.fit(Inconsistent['No of Yrs'],event_observed = Inconsistent['dead'])
naf_dp.fit(DirProd['No of Yrs'],event_observed = DirProd['dead'])
Out[87]:
<lifelines.NelsonAalenFitter:"NA_estimate", fitted with 468 total observations, 41 right-censored observations>
In [88]:
naf_is.plot_cumulative_hazard(label="Consistent")
naf_isnt.plot_cumulative_hazard(label="Inconsistent")
naf_dp.plot_cumulative_hazard(label="Director/Producer")
plt.tight_layout()
plt.savefig("Stats/Outputs/naf_cumulative_hazard_role_consistency.png", facecolor='#ffffff')

Consistent Role Log-Rank Test

In [89]:
T1=Consistent['No of Yrs']
T2=Inconsistent['No of Yrs']
E1=Consistent['dead']
E2=Inconsistent['dead']
In [90]:
results=logrank_test(T1,T2,event_observed_A=E1, event_observed_B=E2)
results.print_summary()
t_0 -1
null_distribution chi squared
degrees_of_freedom 1
test_name logrank_test
test_statistic p -log2(p)
0 3.76 0.05 4.25

Trainee Prog

In [91]:
kmf_t = KaplanMeierFitter()
kmf_null = KaplanMeierFitter()
In [92]:
df = df.rename(columns={"Trainee prog": "Trainee"})
In [93]:
Trainee = df.query("Trainee == 1")
Non = df.query("Trainee == 0")
In [94]:
kmf_t.fit(durations = Trainee['No of Yrs'],event_observed= Trainee['dead'], label="Trainee")
kmf_null.fit(durations = Non['No of Yrs'],event_observed= Non['dead'], label="Non Trainee")
Out[94]:
<lifelines.KaplanMeierFitter:"Non Trainee", fitted with 3513 total observations, 507 right-censored observations>
In [95]:
kmf_t.event_table
Out[95]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 116 116
1.0 22 17 5 0 116
2.0 11 11 0 0 94
3.0 13 11 2 0 83
4.0 8 8 0 0 70
5.0 8 8 0 0 62
6.0 6 6 0 0 54
7.0 9 9 0 0 48
8.0 5 4 1 0 39
9.0 2 1 1 0 34
10.0 3 3 0 0 32
11.0 3 3 0 0 29
12.0 2 1 1 0 26
13.0 3 1 2 0 24
14.0 1 1 0 0 21
16.0 1 1 0 0 20
17.0 1 1 0 0 19
19.0 2 1 1 0 18
21.0 2 2 0 0 16
22.0 1 1 0 0 14
23.0 1 0 1 0 13
26.0 1 1 0 0 12
28.0 1 1 0 0 11
29.0 2 1 1 0 10
30.0 1 0 1 0 8
32.0 2 2 0 0 7
34.0 2 1 1 0 5
36.0 2 1 1 0 3
37.0 1 1 0 0 1
In [96]:
kmf_null.event_table
Out[96]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 3513 3513
1.0 1097 931 166 0 3513
2.0 518 453 65 0 2416
3.0 342 319 23 0 1898
4.0 239 212 27 0 1556
5.0 209 190 19 0 1317
6.0 122 102 20 0 1108
7.0 112 103 9 0 986
8.0 96 79 17 0 874
9.0 94 85 9 0 778
10.0 82 71 11 0 684
11.0 65 54 11 0 602
12.0 52 46 6 0 537
13.0 51 47 4 0 485
14.0 46 33 13 0 434
15.0 51 46 5 0 388
16.0 30 25 5 0 337
17.0 34 26 8 0 307
18.0 31 22 9 0 273
19.0 25 20 5 0 242
20.0 26 18 8 0 217
21.0 25 19 6 0 191
22.0 29 21 8 0 166
23.0 13 8 5 0 137
24.0 21 13 8 0 124
25.0 19 8 11 0 103
26.0 16 11 5 0 84
27.0 9 5 4 0 68
28.0 10 8 2 0 59
29.0 12 7 5 0 49
30.0 9 7 2 0 37
31.0 4 3 1 0 28
32.0 4 3 1 0 24
33.0 4 3 1 0 20
34.0 5 3 2 0 16
35.0 7 3 4 0 11
36.0 1 1 0 0 4
37.0 1 0 1 0 3
38.0 1 1 0 0 2
41.0 1 0 1 0 1
In [97]:
kmf_t.survival_function_
Out[97]:
Trainee
timeline
0.0 1.000000
1.0 0.853448
2.0 0.753577
3.0 0.653705
4.0 0.578996
5.0 0.504287
6.0 0.448255
7.0 0.364207
8.0 0.326853
9.0 0.317239
10.0 0.287498
11.0 0.257757
12.0 0.247843
13.0 0.237516
14.0 0.226206
16.0 0.214896
17.0 0.203585
19.0 0.192275
21.0 0.168241
22.0 0.156224
23.0 0.156224
26.0 0.143205
28.0 0.130186
29.0 0.117168
30.0 0.117168
32.0 0.083691
34.0 0.066953
36.0 0.044635
37.0 0.000000
In [98]:
kmf_null.survival_function_
Out[98]:
Non Trainee
timeline
0.0 1.000000
1.0 0.734984
2.0 0.597175
3.0 0.496807
4.0 0.429118
5.0 0.367211
6.0 0.333406
7.0 0.298578
8.0 0.271589
9.0 0.241917
10.0 0.216806
11.0 0.197358
12.0 0.180452
13.0 0.162965
14.0 0.150574
15.0 0.132722
16.0 0.122876
17.0 0.112470
18.0 0.103406
19.0 0.094860
20.0 0.086992
21.0 0.078338
22.0 0.068428
23.0 0.064432
24.0 0.057677
25.0 0.053197
26.0 0.046231
27.0 0.042832
28.0 0.037024
29.0 0.031735
30.0 0.025731
31.0 0.022974
32.0 0.020102
33.0 0.017087
34.0 0.013883
35.0 0.010097
36.0 0.007573
37.0 0.007573
38.0 0.003786
41.0 0.003786
In [99]:
kmf_t.plot()
kmf_null.plot()
plt.xlabel("Years")
plt.ylabel("Longevity")
plt.title("Film Bang - Trainee Prog KMF")
plt.tight_layout()
plt.savefig("Stats/Outputs/kaplan_meier_estimate_trainee.png", facecolor='#ffffff')
In [100]:
kmf_t.plot_cumulative_density()
kmf_null.plot_cumulative_density()
plt.tight_layout()
plt.savefig("Stats/Outputs/kaplan_meier_cumulative_density_trainee.png", facecolor='#ffffff')
In [101]:
naf_t = NelsonAalenFitter()
naf_null = NelsonAalenFitter()

naf_t.fit(Trainee['No of Yrs'],event_observed = Trainee['dead'])
naf_null.fit(Non['No of Yrs'],event_observed = Non['dead'])
Out[101]:
<lifelines.NelsonAalenFitter:"NA_estimate", fitted with 3513 total observations, 507 right-censored observations>
In [102]:
naf_t.plot_cumulative_hazard(label="Trainee")
naf_null.plot_cumulative_hazard(label="Non Trainee")
plt.tight_layout()
plt.savefig("Stats/Outputs/naf_cumulative_hazard_trainee.png", facecolor='#ffffff')

Trainee Log-Rank Test

In [103]:
T1=Trainee['No of Yrs']
T2=Non['No of Yrs']
E1=Trainee['dead']
E2=Non['dead']
In [104]:
T1.describe()
Out[104]:
count    116.000000
mean       8.560345
std        9.477010
min        1.000000
25%        2.000000
50%        5.000000
75%       10.250000
max       37.000000
Name: No of Yrs, dtype: float64
In [105]:
T2.describe()
Out[105]:
count    3513.000000
mean        5.716197
std         6.655021
min         1.000000
25%         1.000000
50%         3.000000
75%         7.000000
max        41.000000
Name: No of Yrs, dtype: float64
In [106]:
results=logrank_test(T1,T2,event_observed_A=E1, event_observed_B=E2)
results.print_summary()
t_0 -1
null_distribution chi squared
degrees_of_freedom 1
test_name logrank_test
test_statistic p -log2(p)
0 13.23 <0.005 11.82

Depts

In [107]:
#df.columns
In [108]:
#dept_list

Production

In [109]:
production = pd.read_csv('Stats/Production_df.csv')
In [110]:
kmf_production = KaplanMeierFitter()
In [111]:
production.shape
Out[111]:
(816, 54)
In [112]:
production.loc[production['dropout year'] == 0.0, 'dead'] = 0
production.loc[production['dropout year'] > 0, 'dead'] = 1
In [113]:
# drop null values from No of Yrs col
production = production[production['No of Yrs'].notna()]
In [114]:
kmf_production.fit(durations = production['No of Yrs'],event_observed = production['dead'], label="Production")
Out[114]:
<lifelines.KaplanMeierFitter:"Production", fitted with 816 total observations, 123 right-censored observations>
In [115]:
kmf_production.event_table
Out[115]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 816 816
1.0 315 253 62 0 816
2.0 139 123 16 0 501
3.0 92 86 6 0 362
4.0 59 52 7 0 270
5.0 50 49 1 0 211
6.0 24 18 6 0 161
7.0 22 22 0 0 137
8.0 14 10 4 0 115
9.0 16 14 2 0 101
10.0 14 13 1 0 85
11.0 5 4 1 0 71
12.0 6 6 0 0 66
13.0 11 8 3 0 60
14.0 6 3 3 0 49
15.0 6 6 0 0 43
16.0 3 3 0 0 37
17.0 2 1 1 0 34
18.0 6 4 2 0 32
19.0 3 3 0 0 26
20.0 3 2 1 0 23
21.0 3 2 1 0 20
22.0 3 3 0 0 17
23.0 2 1 1 0 14
24.0 5 4 1 0 12
25.0 2 1 1 0 7
26.0 1 0 1 0 5
27.0 1 0 1 0 4
28.0 1 0 1 0 3
29.0 2 2 0 0 2
In [116]:
kmf_production.survival_function_
Out[116]:
Production
timeline
0.0 1.000000
1.0 0.689951
2.0 0.520562
3.0 0.396892
4.0 0.320454
5.0 0.246036
6.0 0.218529
7.0 0.183436
8.0 0.167485
9.0 0.144270
10.0 0.122205
11.0 0.115320
12.0 0.104836
13.0 0.090858
14.0 0.085295
15.0 0.073394
16.0 0.067443
17.0 0.065459
18.0 0.057277
19.0 0.050668
20.0 0.046262
21.0 0.041636
22.0 0.034288
23.0 0.031839
24.0 0.021226
25.0 0.018194
26.0 0.018194
27.0 0.018194
28.0 0.018194
29.0 0.000000
In [117]:
kmf_production.median_survival_time_
Out[117]:
3.0
In [118]:
kmf_production.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[118]:
<function matplotlib.pyplot.show(*args, **kw)>
In [119]:
kmf_production.confidence_interval_
Out[119]:
Production_lower_0.95 Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.656980 0.720453
2.0 0.484613 0.555238
3.0 0.361500 0.432030
4.0 0.286649 0.354700
5.0 0.214779 0.278479
6.0 0.188558 0.249981
7.0 0.155323 0.213442
8.0 0.140370 0.196689
9.0 0.118721 0.172218
10.0 0.098367 0.148768
11.0 0.092069 0.141404
12.0 0.082531 0.130150
13.0 0.069960 0.115006
14.0 0.064969 0.108987
15.0 0.054318 0.096132
16.0 0.049077 0.089627
17.0 0.047343 0.087446
18.0 0.040235 0.078432
19.0 0.034529 0.071172
20.0 0.030796 0.066267
21.0 0.026905 0.061122
22.0 0.020837 0.052921
23.0 0.018876 0.050131
24.0 0.010705 0.037958
25.0 0.008478 0.034550
26.0 0.008478 0.034550
27.0 0.008478 0.034550
28.0 0.008478 0.034550
29.0 0.000000 0.000000
In [120]:
# Probability of leaving:
kmf_production.cumulative_density_
Out[120]:
Production
timeline
0.0 0.000000
1.0 0.310049
2.0 0.479438
3.0 0.603108
4.0 0.679546
5.0 0.753964
6.0 0.781471
7.0 0.816564
8.0 0.832515
9.0 0.855730
10.0 0.877795
11.0 0.884680
12.0 0.895164
13.0 0.909142
14.0 0.914705
15.0 0.926606
16.0 0.932557
17.0 0.934541
18.0 0.942723
19.0 0.949332
20.0 0.953738
21.0 0.958364
22.0 0.965712
23.0 0.968161
24.0 0.978774
25.0 0.981806
26.0 0.981806
27.0 0.981806
28.0 0.981806
29.0 1.000000
In [121]:
kmf_production.plot_cumulative_density()
Out[121]:
<matplotlib.axes._subplots.AxesSubplot at 0x106c56e48>
In [122]:
naf_production = NelsonAalenFitter()
naf_production.fit(production['No of Yrs'], event_observed = production['dead'])
Out[122]:
<lifelines.NelsonAalenFitter:"NA_estimate", fitted with 816 total observations, 123 right-censored observations>
In [123]:
naf.cumulative_hazard_
Out[123]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [124]:
naf_production.plot_cumulative_hazard()
Out[124]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a00ab70>
In [125]:
data_production = production[['No of Yrs', 'Entry','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]

cph_production = CoxPHFitter()
cph_production.fit(data_production, 'No of Yrs',event_col='dead')
cph_production.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 816
number of events observed 693
partial log-likelihood -3965.28
time fit was run 2021-07-01 14:32:59 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.01 1.01 0.00 0.01 0.02 1.01 1.02 3.18 <0.005 9.41
Trainee prog -0.19 0.83 0.18 -0.55 0.18 0.58 1.19 -1.00 0.32 1.66
Gender 0.05 1.05 0.07 -0.10 0.19 0.91 1.21 0.64 0.52 0.93
Rural 0.14 1.15 0.18 -0.21 0.49 0.81 1.63 0.79 0.43 1.23
Has Consistent Role 0.57 1.77 0.14 0.30 0.84 1.34 2.32 4.10 <0.005 14.53

Concordance 0.56
Partial AIC 7940.55
log-likelihood ratio test 36.40 on 5 df
-log2(p) of ll-ratio test 20.27

Camera

In [126]:
camera = pd.read_csv('Stats/Camera_df.csv')
In [127]:
kmf_camera = KaplanMeierFitter()
In [128]:
camera.shape
Out[128]:
(633, 54)
In [129]:
camera.loc[camera['dropout year'] == 0.0, 'dead'] = 0
camera.loc[camera['dropout year'] > 0, 'dead'] = 1
In [130]:
# drop null values from No of Yrs col
camera = camera[camera['No of Yrs'].notna()]
In [131]:
kmf_camera.fit(durations = camera['No of Yrs'],event_observed = camera['dead'], label="Camera")
Out[131]:
<lifelines.KaplanMeierFitter:"Camera", fitted with 633 total observations, 139 right-censored observations>
In [132]:
kmf_production.event_table
Out[132]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 816 816
1.0 315 253 62 0 816
2.0 139 123 16 0 501
3.0 92 86 6 0 362
4.0 59 52 7 0 270
5.0 50 49 1 0 211
6.0 24 18 6 0 161
7.0 22 22 0 0 137
8.0 14 10 4 0 115
9.0 16 14 2 0 101
10.0 14 13 1 0 85
11.0 5 4 1 0 71
12.0 6 6 0 0 66
13.0 11 8 3 0 60
14.0 6 3 3 0 49
15.0 6 6 0 0 43
16.0 3 3 0 0 37
17.0 2 1 1 0 34
18.0 6 4 2 0 32
19.0 3 3 0 0 26
20.0 3 2 1 0 23
21.0 3 2 1 0 20
22.0 3 3 0 0 17
23.0 2 1 1 0 14
24.0 5 4 1 0 12
25.0 2 1 1 0 7
26.0 1 0 1 0 5
27.0 1 0 1 0 4
28.0 1 0 1 0 3
29.0 2 2 0 0 2
In [133]:
kmf_production.survival_function_
Out[133]:
Production
timeline
0.0 1.000000
1.0 0.689951
2.0 0.520562
3.0 0.396892
4.0 0.320454
5.0 0.246036
6.0 0.218529
7.0 0.183436
8.0 0.167485
9.0 0.144270
10.0 0.122205
11.0 0.115320
12.0 0.104836
13.0 0.090858
14.0 0.085295
15.0 0.073394
16.0 0.067443
17.0 0.065459
18.0 0.057277
19.0 0.050668
20.0 0.046262
21.0 0.041636
22.0 0.034288
23.0 0.031839
24.0 0.021226
25.0 0.018194
26.0 0.018194
27.0 0.018194
28.0 0.018194
29.0 0.000000
In [134]:
kmf_camera.median_survival_time_
Out[134]:
5.0
In [135]:
kmf_camera.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[135]:
<function matplotlib.pyplot.show(*args, **kw)>
In [136]:
kmf_production.confidence_interval_
Out[136]:
Production_lower_0.95 Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.656980 0.720453
2.0 0.484613 0.555238
3.0 0.361500 0.432030
4.0 0.286649 0.354700
5.0 0.214779 0.278479
6.0 0.188558 0.249981
7.0 0.155323 0.213442
8.0 0.140370 0.196689
9.0 0.118721 0.172218
10.0 0.098367 0.148768
11.0 0.092069 0.141404
12.0 0.082531 0.130150
13.0 0.069960 0.115006
14.0 0.064969 0.108987
15.0 0.054318 0.096132
16.0 0.049077 0.089627
17.0 0.047343 0.087446
18.0 0.040235 0.078432
19.0 0.034529 0.071172
20.0 0.030796 0.066267
21.0 0.026905 0.061122
22.0 0.020837 0.052921
23.0 0.018876 0.050131
24.0 0.010705 0.037958
25.0 0.008478 0.034550
26.0 0.008478 0.034550
27.0 0.008478 0.034550
28.0 0.008478 0.034550
29.0 0.000000 0.000000
In [137]:
# Probability of leaving:
kmf_production.cumulative_density_
Out[137]:
Production
timeline
0.0 0.000000
1.0 0.310049
2.0 0.479438
3.0 0.603108
4.0 0.679546
5.0 0.753964
6.0 0.781471
7.0 0.816564
8.0 0.832515
9.0 0.855730
10.0 0.877795
11.0 0.884680
12.0 0.895164
13.0 0.909142
14.0 0.914705
15.0 0.926606
16.0 0.932557
17.0 0.934541
18.0 0.942723
19.0 0.949332
20.0 0.953738
21.0 0.958364
22.0 0.965712
23.0 0.968161
24.0 0.978774
25.0 0.981806
26.0 0.981806
27.0 0.981806
28.0 0.981806
29.0 1.000000
In [138]:
kmf_camera.plot_cumulative_density()
Out[138]:
<matplotlib.axes._subplots.AxesSubplot at 0x11e388240>
In [139]:
naf_camera = NelsonAalenFitter()
naf_camera.fit(camera['No of Yrs'], event_observed = camera['dead'], label="Camera")
Out[139]:
<lifelines.NelsonAalenFitter:"Camera", fitted with 633 total observations, 139 right-censored observations>
In [140]:
naf.cumulative_hazard_
Out[140]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [141]:
naf_camera.plot_cumulative_hazard()
Out[141]:
<matplotlib.axes._subplots.AxesSubplot at 0x11df24630>
In [142]:
camera = camera[camera['Entry'].notna()]
In [143]:
data_camera = camera[['No of Yrs', 'Entry','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]

cph_camera = CoxPHFitter()
cph_camera.fit(data_camera, 'No of Yrs',event_col='dead')
cph_camera.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 633
number of events observed 494
partial log-likelihood -2725.09
time fit was run 2021-07-01 14:33:01 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.02 1.02 0.00 0.01 0.03 1.01 1.03 4.70 <0.005 18.52
Trainee prog -0.70 0.50 0.27 -1.22 -0.17 0.29 0.84 -2.61 0.01 6.77
Gender 0.00 1.00 0.11 -0.22 0.22 0.80 1.25 0.01 0.99 0.01
Rural 0.01 1.01 0.21 -0.40 0.43 0.67 1.54 0.06 0.95 0.08
Has Consistent Role 0.02 1.02 0.15 -0.27 0.31 0.76 1.36 0.13 0.90 0.16

Concordance 0.58
Partial AIC 5460.17
log-likelihood ratio test 35.10 on 5 df
-log2(p) of ll-ratio test 19.41

Art

In [144]:
art = pd.read_csv('Stats/Art_df.csv')
In [145]:
kmf_art = KaplanMeierFitter()
In [146]:
art.shape
Out[146]:
(468, 54)
In [147]:
art.loc[art['dropout year'] == 0.0, 'dead'] = 0
art.loc[art['dropout year'] > 0, 'dead'] = 1
In [148]:
# drop null values from No of Yrs col
art = art[art['No of Yrs'].notna()]
In [149]:
kmf_art.fit(durations = art['No of Yrs'],event_observed = art['dead'], label="Art")
Out[149]:
<lifelines.KaplanMeierFitter:"Art", fitted with 468 total observations, 80 right-censored observations>
In [150]:
kmf_production.event_table
Out[150]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 816 816
1.0 315 253 62 0 816
2.0 139 123 16 0 501
3.0 92 86 6 0 362
4.0 59 52 7 0 270
5.0 50 49 1 0 211
6.0 24 18 6 0 161
7.0 22 22 0 0 137
8.0 14 10 4 0 115
9.0 16 14 2 0 101
10.0 14 13 1 0 85
11.0 5 4 1 0 71
12.0 6 6 0 0 66
13.0 11 8 3 0 60
14.0 6 3 3 0 49
15.0 6 6 0 0 43
16.0 3 3 0 0 37
17.0 2 1 1 0 34
18.0 6 4 2 0 32
19.0 3 3 0 0 26
20.0 3 2 1 0 23
21.0 3 2 1 0 20
22.0 3 3 0 0 17
23.0 2 1 1 0 14
24.0 5 4 1 0 12
25.0 2 1 1 0 7
26.0 1 0 1 0 5
27.0 1 0 1 0 4
28.0 1 0 1 0 3
29.0 2 2 0 0 2
In [151]:
kmf_production.survival_function_
Out[151]:
Production
timeline
0.0 1.000000
1.0 0.689951
2.0 0.520562
3.0 0.396892
4.0 0.320454
5.0 0.246036
6.0 0.218529
7.0 0.183436
8.0 0.167485
9.0 0.144270
10.0 0.122205
11.0 0.115320
12.0 0.104836
13.0 0.090858
14.0 0.085295
15.0 0.073394
16.0 0.067443
17.0 0.065459
18.0 0.057277
19.0 0.050668
20.0 0.046262
21.0 0.041636
22.0 0.034288
23.0 0.031839
24.0 0.021226
25.0 0.018194
26.0 0.018194
27.0 0.018194
28.0 0.018194
29.0 0.000000
In [152]:
kmf_art.median_survival_time_
Out[152]:
4.0
In [153]:
kmf_art.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[153]:
<function matplotlib.pyplot.show(*args, **kw)>
In [154]:
kmf_production.confidence_interval_
Out[154]:
Production_lower_0.95 Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.656980 0.720453
2.0 0.484613 0.555238
3.0 0.361500 0.432030
4.0 0.286649 0.354700
5.0 0.214779 0.278479
6.0 0.188558 0.249981
7.0 0.155323 0.213442
8.0 0.140370 0.196689
9.0 0.118721 0.172218
10.0 0.098367 0.148768
11.0 0.092069 0.141404
12.0 0.082531 0.130150
13.0 0.069960 0.115006
14.0 0.064969 0.108987
15.0 0.054318 0.096132
16.0 0.049077 0.089627
17.0 0.047343 0.087446
18.0 0.040235 0.078432
19.0 0.034529 0.071172
20.0 0.030796 0.066267
21.0 0.026905 0.061122
22.0 0.020837 0.052921
23.0 0.018876 0.050131
24.0 0.010705 0.037958
25.0 0.008478 0.034550
26.0 0.008478 0.034550
27.0 0.008478 0.034550
28.0 0.008478 0.034550
29.0 0.000000 0.000000
In [155]:
# Probability of leaving:
kmf_production.cumulative_density_
Out[155]:
Production
timeline
0.0 0.000000
1.0 0.310049
2.0 0.479438
3.0 0.603108
4.0 0.679546
5.0 0.753964
6.0 0.781471
7.0 0.816564
8.0 0.832515
9.0 0.855730
10.0 0.877795
11.0 0.884680
12.0 0.895164
13.0 0.909142
14.0 0.914705
15.0 0.926606
16.0 0.932557
17.0 0.934541
18.0 0.942723
19.0 0.949332
20.0 0.953738
21.0 0.958364
22.0 0.965712
23.0 0.968161
24.0 0.978774
25.0 0.981806
26.0 0.981806
27.0 0.981806
28.0 0.981806
29.0 1.000000
In [156]:
kmf_art.plot_cumulative_density()
Out[156]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d933518>
In [157]:
naf_art = NelsonAalenFitter()
naf_art.fit(df['No of Yrs'], event_observed = df['dead'], label="Art")
Out[157]:
<lifelines.NelsonAalenFitter:"Art", fitted with 3629 total observations, 525 right-censored observations>
In [158]:
naf.cumulative_hazard_
Out[158]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [159]:
naf_art.plot_cumulative_hazard()
Out[159]:
<matplotlib.axes._subplots.AxesSubplot at 0x106cda6d8>
In [160]:
art = art[art['Entry'].notna()]
In [161]:
data_art = art[['No of Yrs', 'Entry','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]

cph_art = CoxPHFitter()
cph_art.fit(data_art, 'No of Yrs',event_col='dead')
cph_art.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 468
number of events observed 388
partial log-likelihood -2022.66
time fit was run 2021-07-01 14:33:02 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.03 1.03 0.01 0.01 0.04 1.01 1.04 4.39 <0.005 16.45
Trainee prog 0.12 1.13 0.34 -0.55 0.79 0.58 2.19 0.35 0.73 0.46
Gender 0.00 1.00 0.09 -0.18 0.18 0.84 1.20 0.00 1.00 0.01
Rural -0.02 0.98 0.24 -0.50 0.46 0.61 1.59 -0.07 0.94 0.09
Has Consistent Role 0.57 1.78 0.18 0.21 0.93 1.24 2.55 3.12 <0.005 9.10

Concordance 0.60
Partial AIC 4055.33
log-likelihood ratio test 33.12 on 5 df
-log2(p) of ll-ratio test 18.10

Direction

In [162]:
direction = pd.read_csv('Stats/Direction_df.csv')
In [163]:
kmf_direction = KaplanMeierFitter()
In [164]:
direction.shape
Out[164]:
(282, 54)
In [165]:
direction.loc[direction['dropout year'] == 0.0, 'dead'] = 0
direction.loc[direction['dropout year'] > 0, 'dead'] = 1
In [166]:
# drop null values from No of Yrs col
direction = direction[direction['No of Yrs'].notna()]
In [167]:
kmf_direction.fit(durations = direction['No of Yrs'],event_observed = direction['dead'], label="Direction")
Out[167]:
<lifelines.KaplanMeierFitter:"Direction", fitted with 282 total observations, 22 right-censored observations>
In [168]:
kmf_production.event_table
Out[168]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 816 816
1.0 315 253 62 0 816
2.0 139 123 16 0 501
3.0 92 86 6 0 362
4.0 59 52 7 0 270
5.0 50 49 1 0 211
6.0 24 18 6 0 161
7.0 22 22 0 0 137
8.0 14 10 4 0 115
9.0 16 14 2 0 101
10.0 14 13 1 0 85
11.0 5 4 1 0 71
12.0 6 6 0 0 66
13.0 11 8 3 0 60
14.0 6 3 3 0 49
15.0 6 6 0 0 43
16.0 3 3 0 0 37
17.0 2 1 1 0 34
18.0 6 4 2 0 32
19.0 3 3 0 0 26
20.0 3 2 1 0 23
21.0 3 2 1 0 20
22.0 3 3 0 0 17
23.0 2 1 1 0 14
24.0 5 4 1 0 12
25.0 2 1 1 0 7
26.0 1 0 1 0 5
27.0 1 0 1 0 4
28.0 1 0 1 0 3
29.0 2 2 0 0 2
In [169]:
kmf_production.survival_function_
Out[169]:
Production
timeline
0.0 1.000000
1.0 0.689951
2.0 0.520562
3.0 0.396892
4.0 0.320454
5.0 0.246036
6.0 0.218529
7.0 0.183436
8.0 0.167485
9.0 0.144270
10.0 0.122205
11.0 0.115320
12.0 0.104836
13.0 0.090858
14.0 0.085295
15.0 0.073394
16.0 0.067443
17.0 0.065459
18.0 0.057277
19.0 0.050668
20.0 0.046262
21.0 0.041636
22.0 0.034288
23.0 0.031839
24.0 0.021226
25.0 0.018194
26.0 0.018194
27.0 0.018194
28.0 0.018194
29.0 0.000000
In [170]:
kmf_direction.median_survival_time_
Out[170]:
4.0
In [171]:
kmf_direction.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[171]:
<function matplotlib.pyplot.show(*args, **kw)>
In [172]:
kmf_production.confidence_interval_
Out[172]:
Production_lower_0.95 Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.656980 0.720453
2.0 0.484613 0.555238
3.0 0.361500 0.432030
4.0 0.286649 0.354700
5.0 0.214779 0.278479
6.0 0.188558 0.249981
7.0 0.155323 0.213442
8.0 0.140370 0.196689
9.0 0.118721 0.172218
10.0 0.098367 0.148768
11.0 0.092069 0.141404
12.0 0.082531 0.130150
13.0 0.069960 0.115006
14.0 0.064969 0.108987
15.0 0.054318 0.096132
16.0 0.049077 0.089627
17.0 0.047343 0.087446
18.0 0.040235 0.078432
19.0 0.034529 0.071172
20.0 0.030796 0.066267
21.0 0.026905 0.061122
22.0 0.020837 0.052921
23.0 0.018876 0.050131
24.0 0.010705 0.037958
25.0 0.008478 0.034550
26.0 0.008478 0.034550
27.0 0.008478 0.034550
28.0 0.008478 0.034550
29.0 0.000000 0.000000
In [173]:
# Probability of leaving:
kmf_production.cumulative_density_
Out[173]:
Production
timeline
0.0 0.000000
1.0 0.310049
2.0 0.479438
3.0 0.603108
4.0 0.679546
5.0 0.753964
6.0 0.781471
7.0 0.816564
8.0 0.832515
9.0 0.855730
10.0 0.877795
11.0 0.884680
12.0 0.895164
13.0 0.909142
14.0 0.914705
15.0 0.926606
16.0 0.932557
17.0 0.934541
18.0 0.942723
19.0 0.949332
20.0 0.953738
21.0 0.958364
22.0 0.965712
23.0 0.968161
24.0 0.978774
25.0 0.981806
26.0 0.981806
27.0 0.981806
28.0 0.981806
29.0 1.000000
In [174]:
kmf_direction.plot_cumulative_density()
Out[174]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b05ad68>
In [175]:
naf_direction = NelsonAalenFitter()
naf_direction.fit(direction['No of Yrs'], event_observed = direction['dead'], label="Direction")
Out[175]:
<lifelines.NelsonAalenFitter:"Direction", fitted with 282 total observations, 22 right-censored observations>
In [176]:
naf.cumulative_hazard_
Out[176]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [177]:
naf_direction.plot_cumulative_hazard()
Out[177]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a5bc9b0>
In [178]:
data_direction = direction[['No of Yrs', 'Entry','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]
cph_direction = CoxPHFitter()
cph_direction.fit(data_direction, 'No of Yrs',event_col='dead')
cph_direction.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 282
number of events observed 260
partial log-likelihood -1213.51
time fit was run 2021-07-01 14:33:04 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.02 1.02 0.01 0.01 0.03 1.01 1.03 2.78 0.01 7.53
Trainee prog 0.23 1.26 0.37 -0.48 0.95 0.62 2.59 0.64 0.52 0.93
Gender -0.07 0.93 0.12 -0.31 0.17 0.74 1.19 -0.56 0.58 0.79
Rural 0.03 1.03 0.33 -0.61 0.68 0.54 1.97 0.10 0.92 0.12
Has Consistent Role 0.27 1.31 0.09 0.09 0.45 1.09 1.58 2.91 <0.005 8.11

Concordance 0.58
Partial AIC 2437.02
log-likelihood ratio test 17.33 on 5 df
-log2(p) of ll-ratio test 8.00

Producer

In [179]:
producer = pd.read_csv('Stats/Producer_df.csv')
In [180]:
kmf_producer = KaplanMeierFitter()
In [181]:
producer.shape
Out[181]:
(242, 54)
In [182]:
producer.loc[producer['dropout year'] == 0.0, 'dead'] = 0
producer.loc[producer['dropout year'] > 0, 'dead'] = 1
In [183]:
# drop null values from No of Yrs col
producer = producer[producer['No of Yrs'].notna()]
In [184]:
kmf_producer.fit(durations = producer['No of Yrs'],event_observed = producer['dead'], label="Producer")
Out[184]:
<lifelines.KaplanMeierFitter:"Producer", fitted with 242 total observations, 12 right-censored observations>
In [185]:
kmf_production.event_table
Out[185]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 816 816
1.0 315 253 62 0 816
2.0 139 123 16 0 501
3.0 92 86 6 0 362
4.0 59 52 7 0 270
5.0 50 49 1 0 211
6.0 24 18 6 0 161
7.0 22 22 0 0 137
8.0 14 10 4 0 115
9.0 16 14 2 0 101
10.0 14 13 1 0 85
11.0 5 4 1 0 71
12.0 6 6 0 0 66
13.0 11 8 3 0 60
14.0 6 3 3 0 49
15.0 6 6 0 0 43
16.0 3 3 0 0 37
17.0 2 1 1 0 34
18.0 6 4 2 0 32
19.0 3 3 0 0 26
20.0 3 2 1 0 23
21.0 3 2 1 0 20
22.0 3 3 0 0 17
23.0 2 1 1 0 14
24.0 5 4 1 0 12
25.0 2 1 1 0 7
26.0 1 0 1 0 5
27.0 1 0 1 0 4
28.0 1 0 1 0 3
29.0 2 2 0 0 2
In [186]:
kmf_production.survival_function_
Out[186]:
Production
timeline
0.0 1.000000
1.0 0.689951
2.0 0.520562
3.0 0.396892
4.0 0.320454
5.0 0.246036
6.0 0.218529
7.0 0.183436
8.0 0.167485
9.0 0.144270
10.0 0.122205
11.0 0.115320
12.0 0.104836
13.0 0.090858
14.0 0.085295
15.0 0.073394
16.0 0.067443
17.0 0.065459
18.0 0.057277
19.0 0.050668
20.0 0.046262
21.0 0.041636
22.0 0.034288
23.0 0.031839
24.0 0.021226
25.0 0.018194
26.0 0.018194
27.0 0.018194
28.0 0.018194
29.0 0.000000
In [187]:
kmf_producer.median_survival_time_
Out[187]:
5.0
In [188]:
kmf_producer.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[188]:
<function matplotlib.pyplot.show(*args, **kw)>
In [189]:
kmf_production.confidence_interval_
Out[189]:
Production_lower_0.95 Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.656980 0.720453
2.0 0.484613 0.555238
3.0 0.361500 0.432030
4.0 0.286649 0.354700
5.0 0.214779 0.278479
6.0 0.188558 0.249981
7.0 0.155323 0.213442
8.0 0.140370 0.196689
9.0 0.118721 0.172218
10.0 0.098367 0.148768
11.0 0.092069 0.141404
12.0 0.082531 0.130150
13.0 0.069960 0.115006
14.0 0.064969 0.108987
15.0 0.054318 0.096132
16.0 0.049077 0.089627
17.0 0.047343 0.087446
18.0 0.040235 0.078432
19.0 0.034529 0.071172
20.0 0.030796 0.066267
21.0 0.026905 0.061122
22.0 0.020837 0.052921
23.0 0.018876 0.050131
24.0 0.010705 0.037958
25.0 0.008478 0.034550
26.0 0.008478 0.034550
27.0 0.008478 0.034550
28.0 0.008478 0.034550
29.0 0.000000 0.000000
In [190]:
# Probability of leaving:
kmf_production.cumulative_density_
Out[190]:
Production
timeline
0.0 0.000000
1.0 0.310049
2.0 0.479438
3.0 0.603108
4.0 0.679546
5.0 0.753964
6.0 0.781471
7.0 0.816564
8.0 0.832515
9.0 0.855730
10.0 0.877795
11.0 0.884680
12.0 0.895164
13.0 0.909142
14.0 0.914705
15.0 0.926606
16.0 0.932557
17.0 0.934541
18.0 0.942723
19.0 0.949332
20.0 0.953738
21.0 0.958364
22.0 0.965712
23.0 0.968161
24.0 0.978774
25.0 0.981806
26.0 0.981806
27.0 0.981806
28.0 0.981806
29.0 1.000000
In [191]:
kmf_producer.plot_cumulative_density()
Out[191]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a7d25c0>
In [192]:
naf_producer = NelsonAalenFitter()
naf_producer.fit(producer['No of Yrs'], event_observed = producer['dead'], label="Producer")
Out[192]:
<lifelines.NelsonAalenFitter:"Producer", fitted with 242 total observations, 12 right-censored observations>
In [193]:
naf.cumulative_hazard_
Out[193]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [194]:
naf_producer.plot_cumulative_hazard()
Out[194]:
<matplotlib.axes._subplots.AxesSubplot at 0x11acbde80>
In [195]:
data_producer = producer[['No of Yrs', 'Entry','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]

cph_producer = CoxPHFitter()
cph_producer.fit(data_producer, 'No of Yrs',event_col='dead')
cph_producer.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 242
number of events observed 230
partial log-likelihood -1042.61
time fit was run 2021-07-01 14:33:06 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.01 1.01 0.01 -0.00 0.03 1.00 1.03 1.63 0.10 3.27
Trainee prog -0.74 0.48 0.61 -1.94 0.47 0.14 1.59 -1.20 0.23 2.12
Gender 0.05 1.05 0.14 -0.23 0.33 0.79 1.39 0.34 0.73 0.45
Rural 0.37 1.45 0.33 -0.27 1.02 0.76 2.77 1.13 0.26 1.95
Has Consistent Role 0.16 1.18 0.08 0.01 0.32 1.01 1.38 2.03 0.04 4.58

Concordance 0.59
Partial AIC 2095.22
log-likelihood ratio test 10.55 on 5 df
-log2(p) of ll-ratio test 4.03

Post-Production

In [196]:
p_production = pd.read_csv('Stats/Post-Production_df.csv')
In [197]:
kmf_p_production = KaplanMeierFitter()
In [198]:
p_production.shape
Out[198]:
(226, 54)
In [199]:
p_production.loc[p_production['dropout year'] == 0.0, 'dead'] = 0
p_production.loc[p_production['dropout year'] > 0, 'dead'] = 1
In [200]:
# drop null values from No of Yrs col
p_production = p_production[p_production['No of Yrs'].notna()]
In [201]:
kmf_p_production.fit(durations = p_production['No of Yrs'],event_observed = p_production['dead'], label="Post-Production")
Out[201]:
<lifelines.KaplanMeierFitter:"Post-Production", fitted with 226 total observations, 38 right-censored observations>
In [202]:
kmf_production.event_table
Out[202]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 816 816
1.0 315 253 62 0 816
2.0 139 123 16 0 501
3.0 92 86 6 0 362
4.0 59 52 7 0 270
5.0 50 49 1 0 211
6.0 24 18 6 0 161
7.0 22 22 0 0 137
8.0 14 10 4 0 115
9.0 16 14 2 0 101
10.0 14 13 1 0 85
11.0 5 4 1 0 71
12.0 6 6 0 0 66
13.0 11 8 3 0 60
14.0 6 3 3 0 49
15.0 6 6 0 0 43
16.0 3 3 0 0 37
17.0 2 1 1 0 34
18.0 6 4 2 0 32
19.0 3 3 0 0 26
20.0 3 2 1 0 23
21.0 3 2 1 0 20
22.0 3 3 0 0 17
23.0 2 1 1 0 14
24.0 5 4 1 0 12
25.0 2 1 1 0 7
26.0 1 0 1 0 5
27.0 1 0 1 0 4
28.0 1 0 1 0 3
29.0 2 2 0 0 2
In [203]:
kmf_production.survival_function_
Out[203]:
Production
timeline
0.0 1.000000
1.0 0.689951
2.0 0.520562
3.0 0.396892
4.0 0.320454
5.0 0.246036
6.0 0.218529
7.0 0.183436
8.0 0.167485
9.0 0.144270
10.0 0.122205
11.0 0.115320
12.0 0.104836
13.0 0.090858
14.0 0.085295
15.0 0.073394
16.0 0.067443
17.0 0.065459
18.0 0.057277
19.0 0.050668
20.0 0.046262
21.0 0.041636
22.0 0.034288
23.0 0.031839
24.0 0.021226
25.0 0.018194
26.0 0.018194
27.0 0.018194
28.0 0.018194
29.0 0.000000
In [204]:
kmf_p_production.median_survival_time_
Out[204]:
4.0
In [205]:
kmf_p_production.confidence_interval_
Out[205]:
Post-Production_lower_0.95 Post-Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.718811 0.827471
2.0 0.544364 0.673692
3.0 0.448833 0.581936
4.0 0.396309 0.529573
5.0 0.329108 0.460512
6.0 0.290851 0.419970
7.0 0.266438 0.393732
8.0 0.216642 0.339054
9.0 0.201617 0.322153
10.0 0.185970 0.304517
11.0 0.149148 0.261940
12.0 0.132604 0.242485
13.0 0.115687 0.222237
14.0 0.109560 0.214993
15.0 0.096077 0.199331
17.0 0.082948 0.183369
19.0 0.069223 0.166369
21.0 0.056035 0.148915
22.0 0.048922 0.139498
24.0 0.041048 0.129294
25.0 0.032152 0.118076
28.0 0.023953 0.106289
30.0 0.023953 0.106289
31.0 0.014136 0.093092
34.0 0.006426 0.078329
37.0 0.000000 0.000000
In [206]:
kmf_p_production.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Post-Production freelancer still in FB")
plt.show
Out[206]:
<function matplotlib.pyplot.show(*args, **kw)>
In [207]:
kmf_p_production.confidence_interval_
Out[207]:
Post-Production_lower_0.95 Post-Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.718811 0.827471
2.0 0.544364 0.673692
3.0 0.448833 0.581936
4.0 0.396309 0.529573
5.0 0.329108 0.460512
6.0 0.290851 0.419970
7.0 0.266438 0.393732
8.0 0.216642 0.339054
9.0 0.201617 0.322153
10.0 0.185970 0.304517
11.0 0.149148 0.261940
12.0 0.132604 0.242485
13.0 0.115687 0.222237
14.0 0.109560 0.214993
15.0 0.096077 0.199331
17.0 0.082948 0.183369
19.0 0.069223 0.166369
21.0 0.056035 0.148915
22.0 0.048922 0.139498
24.0 0.041048 0.129294
25.0 0.032152 0.118076
28.0 0.023953 0.106289
30.0 0.023953 0.106289
31.0 0.014136 0.093092
34.0 0.006426 0.078329
37.0 0.000000 0.000000
In [208]:
# Probability of leaving:
kmf_p_production.cumulative_density_
Out[208]:
Post-Production
timeline
0.0 0.000000
1.0 0.221239
2.0 0.387438
3.0 0.482409
4.0 0.535619
5.0 0.604782
6.0 0.644804
7.0 0.670543
8.0 0.723681
9.0 0.739935
10.0 0.756896
11.0 0.797413
12.0 0.815830
13.0 0.834882
14.0 0.841762
15.0 0.856833
17.0 0.871903
19.0 0.887915
21.0 0.903927
22.0 0.912661
24.0 0.922365
25.0 0.933456
28.0 0.944547
30.0 0.944547
31.0 0.958410
34.0 0.972273
37.0 1.000000
In [209]:
kmf_p_production.plot_cumulative_density()
Out[209]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d728c50>
In [210]:
naf_p_production = NelsonAalenFitter()
naf_p_production.fit(p_production['No of Yrs'], event_observed = p_production['dead'], label="Post-Production")
Out[210]:
<lifelines.NelsonAalenFitter:"Post-Production", fitted with 226 total observations, 38 right-censored observations>
In [211]:
naf.cumulative_hazard_
Out[211]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [212]:
naf_p_production.plot_cumulative_hazard()
Out[212]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d78fbe0>
In [213]:
data_p_production = p_production[['No of Yrs', 'Entry','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]

cph_p_production = CoxPHFitter()
cph_p_production.fit(data_p_production, 'No of Yrs',event_col='dead')
cph_p_production.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 226
number of events observed 188
partial log-likelihood -844.28
time fit was run 2021-07-01 14:33:08 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.02 1.02 0.01 0.00 0.03 1.00 1.03 2.11 0.03 4.86
Trainee prog -0.29 0.75 0.33 -0.93 0.35 0.40 1.42 -0.88 0.38 1.41
Gender 0.07 1.07 0.14 -0.20 0.34 0.82 1.41 0.52 0.61 0.72
Rural -0.46 0.63 0.40 -1.24 0.32 0.29 1.37 -1.16 0.25 2.02
Has Consistent Role 0.38 1.46 0.24 -0.09 0.84 0.91 2.33 1.58 0.11 3.13

Concordance 0.59
Partial AIC 1698.56
log-likelihood ratio test 12.84 on 5 df
-log2(p) of ll-ratio test 5.33

Sound

In [214]:
sound = pd.read_csv('Stats/Sound_df.csv')
In [215]:
kmf_sound = KaplanMeierFitter()
In [216]:
sound.shape
Out[216]:
(205, 54)
In [217]:
sound.loc[sound['dropout year'] == 0.0, 'dead'] = 0
sound.loc[sound['dropout year'] > 0, 'dead'] = 1
In [218]:
# drop null values from No of Yrs col
sound = sound[sound['No of Yrs'].notna()]
In [219]:
kmf_sound.fit(durations = sound['No of Yrs'],event_observed = sound['dead'], label="Sound")
Out[219]:
<lifelines.KaplanMeierFitter:"Sound", fitted with 205 total observations, 37 right-censored observations>
In [220]:
kmf_production.event_table
Out[220]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 816 816
1.0 315 253 62 0 816
2.0 139 123 16 0 501
3.0 92 86 6 0 362
4.0 59 52 7 0 270
5.0 50 49 1 0 211
6.0 24 18 6 0 161
7.0 22 22 0 0 137
8.0 14 10 4 0 115
9.0 16 14 2 0 101
10.0 14 13 1 0 85
11.0 5 4 1 0 71
12.0 6 6 0 0 66
13.0 11 8 3 0 60
14.0 6 3 3 0 49
15.0 6 6 0 0 43
16.0 3 3 0 0 37
17.0 2 1 1 0 34
18.0 6 4 2 0 32
19.0 3 3 0 0 26
20.0 3 2 1 0 23
21.0 3 2 1 0 20
22.0 3 3 0 0 17
23.0 2 1 1 0 14
24.0 5 4 1 0 12
25.0 2 1 1 0 7
26.0 1 0 1 0 5
27.0 1 0 1 0 4
28.0 1 0 1 0 3
29.0 2 2 0 0 2
In [221]:
kmf_production.survival_function_
Out[221]:
Production
timeline
0.0 1.000000
1.0 0.689951
2.0 0.520562
3.0 0.396892
4.0 0.320454
5.0 0.246036
6.0 0.218529
7.0 0.183436
8.0 0.167485
9.0 0.144270
10.0 0.122205
11.0 0.115320
12.0 0.104836
13.0 0.090858
14.0 0.085295
15.0 0.073394
16.0 0.067443
17.0 0.065459
18.0 0.057277
19.0 0.050668
20.0 0.046262
21.0 0.041636
22.0 0.034288
23.0 0.031839
24.0 0.021226
25.0 0.018194
26.0 0.018194
27.0 0.018194
28.0 0.018194
29.0 0.000000
In [222]:
kmf_sound.median_survival_time_
Out[222]:
6.0
In [223]:
kmf_p_production.confidence_interval_
Out[223]:
Post-Production_lower_0.95 Post-Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.718811 0.827471
2.0 0.544364 0.673692
3.0 0.448833 0.581936
4.0 0.396309 0.529573
5.0 0.329108 0.460512
6.0 0.290851 0.419970
7.0 0.266438 0.393732
8.0 0.216642 0.339054
9.0 0.201617 0.322153
10.0 0.185970 0.304517
11.0 0.149148 0.261940
12.0 0.132604 0.242485
13.0 0.115687 0.222237
14.0 0.109560 0.214993
15.0 0.096077 0.199331
17.0 0.082948 0.183369
19.0 0.069223 0.166369
21.0 0.056035 0.148915
22.0 0.048922 0.139498
24.0 0.041048 0.129294
25.0 0.032152 0.118076
28.0 0.023953 0.106289
30.0 0.023953 0.106289
31.0 0.014136 0.093092
34.0 0.006426 0.078329
37.0 0.000000 0.000000
In [224]:
kmf_sound.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[224]:
<function matplotlib.pyplot.show(*args, **kw)>
In [225]:
kmf_sound.confidence_interval_
Out[225]:
Sound_lower_0.95 Sound_upper_0.95
0.0 1.000000 1.000000
1.0 0.686088 0.804771
2.0 0.597895 0.728272
3.0 0.531131 0.667617
4.0 0.470594 0.610406
5.0 0.432678 0.573440
6.0 0.371811 0.512768
7.0 0.349675 0.490168
8.0 0.338321 0.478538
9.0 0.302831 0.442021
10.0 0.256393 0.392556
11.0 0.250662 0.386307
12.0 0.244816 0.379942
13.0 0.233178 0.367163
14.0 0.215434 0.347527
15.0 0.190776 0.320039
16.0 0.184679 0.313108
17.0 0.178611 0.306152
18.0 0.153824 0.277420
19.0 0.141638 0.262874
20.0 0.116714 0.232541
21.0 0.097582 0.208569
22.0 0.090925 0.200159
23.0 0.083870 0.191325
25.0 0.076927 0.182396
27.0 0.068755 0.172474
28.0 0.053087 0.152068
29.0 0.045619 0.141561
30.0 0.037250 0.130101
31.0 0.037250 0.130101
32.0 0.018987 0.103920
33.0 0.005336 0.074246
34.0 0.001170 0.058151
38.0 0.000000 0.000000
In [226]:
# Probability of leaving:
kmf_sound.cumulative_density_
Out[226]:
Sound
timeline
0.0 0.000000
1.0 0.248780
2.0 0.332249
3.0 0.396870
4.0 0.456640
5.0 0.494675
6.0 0.556437
7.0 0.579184
8.0 0.590873
9.0 0.627511
10.0 0.676362
11.0 0.682469
12.0 0.688695
13.0 0.701147
14.0 0.720223
15.0 0.746868
16.0 0.753530
17.0 0.760191
18.0 0.787598
19.0 0.801301
20.0 0.829687
21.0 0.851901
22.0 0.859696
23.0 0.867949
25.0 0.876202
27.0 0.885725
28.0 0.904771
29.0 0.914294
30.0 0.925007
31.0 0.925007
32.0 0.950005
33.0 0.975002
34.0 0.987501
38.0 1.000000
In [227]:
kmf_sound.plot_cumulative_density()
Out[227]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d987400>
In [228]:
naf_sound = NelsonAalenFitter()
naf_sound.fit(sound['No of Yrs'], event_observed = sound['dead'], label="Sound")
Out[228]:
<lifelines.NelsonAalenFitter:"Sound", fitted with 205 total observations, 37 right-censored observations>
In [229]:
naf.cumulative_hazard_
Out[229]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [230]:
naf_sound.plot_cumulative_hazard()
Out[230]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a759b70>
In [231]:
data_sound = sound[['No of Yrs', 'Entry','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]

cph_sound = CoxPHFitter()
cph_sound.fit(data_sound, 'No of Yrs',event_col='dead')
cph_sound.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 205
number of events observed 168
partial log-likelihood -727.63
time fit was run 2021-07-01 14:33:10 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.03 1.03 0.01 0.01 0.04 1.01 1.04 3.31 <0.005 10.04
Trainee prog -0.23 0.80 0.34 -0.89 0.44 0.41 1.55 -0.67 0.51 0.98
Gender 0.22 1.25 0.18 -0.13 0.57 0.88 1.76 1.24 0.21 2.22
Rural -0.25 0.78 0.33 -0.90 0.40 0.41 1.49 -0.76 0.44 1.17
Has Consistent Role 0.01 1.01 0.39 -0.76 0.77 0.47 2.16 0.01 0.99 0.02

Concordance 0.59
Partial AIC 1465.25
log-likelihood ratio test 15.84 on 5 df
-log2(p) of ll-ratio test 7.09

Hair & Make-Up

In [232]:
hair_make_up = pd.read_csv('Stats/Hair_&_Make-Up_df.csv')
In [233]:
kmf_hair_make_up = KaplanMeierFitter()
In [234]:
hair_make_up.shape
Out[234]:
(200, 54)
In [235]:
hair_make_up.loc[hair_make_up['dropout year'] == 0.0, 'dead'] = 0
hair_make_up.loc[hair_make_up['dropout year'] > 0, 'dead'] = 1
In [236]:
# drop null values from No of Yrs col
p_prodhair_make_upuction = hair_make_up[hair_make_up['No of Yrs'].notna()]
In [237]:
kmf_hair_make_up.fit(durations = hair_make_up['No of Yrs'],event_observed = hair_make_up['dead'], label="Hair & Make-Up")
Out[237]:
<lifelines.KaplanMeierFitter:"Hair & Make-Up", fitted with 200 total observations, 21 right-censored observations>
In [238]:
kmf_production.event_table
Out[238]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 816 816
1.0 315 253 62 0 816
2.0 139 123 16 0 501
3.0 92 86 6 0 362
4.0 59 52 7 0 270
5.0 50 49 1 0 211
6.0 24 18 6 0 161
7.0 22 22 0 0 137
8.0 14 10 4 0 115
9.0 16 14 2 0 101
10.0 14 13 1 0 85
11.0 5 4 1 0 71
12.0 6 6 0 0 66
13.0 11 8 3 0 60
14.0 6 3 3 0 49
15.0 6 6 0 0 43
16.0 3 3 0 0 37
17.0 2 1 1 0 34
18.0 6 4 2 0 32
19.0 3 3 0 0 26
20.0 3 2 1 0 23
21.0 3 2 1 0 20
22.0 3 3 0 0 17
23.0 2 1 1 0 14
24.0 5 4 1 0 12
25.0 2 1 1 0 7
26.0 1 0 1 0 5
27.0 1 0 1 0 4
28.0 1 0 1 0 3
29.0 2 2 0 0 2
In [239]:
kmf_production.survival_function_
Out[239]:
Production
timeline
0.0 1.000000
1.0 0.689951
2.0 0.520562
3.0 0.396892
4.0 0.320454
5.0 0.246036
6.0 0.218529
7.0 0.183436
8.0 0.167485
9.0 0.144270
10.0 0.122205
11.0 0.115320
12.0 0.104836
13.0 0.090858
14.0 0.085295
15.0 0.073394
16.0 0.067443
17.0 0.065459
18.0 0.057277
19.0 0.050668
20.0 0.046262
21.0 0.041636
22.0 0.034288
23.0 0.031839
24.0 0.021226
25.0 0.018194
26.0 0.018194
27.0 0.018194
28.0 0.018194
29.0 0.000000
In [240]:
kmf_hair_make_up.median_survival_time_
Out[240]:
3.0
In [241]:
kmf_p_production.confidence_interval_
Out[241]:
Post-Production_lower_0.95 Post-Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.718811 0.827471
2.0 0.544364 0.673692
3.0 0.448833 0.581936
4.0 0.396309 0.529573
5.0 0.329108 0.460512
6.0 0.290851 0.419970
7.0 0.266438 0.393732
8.0 0.216642 0.339054
9.0 0.201617 0.322153
10.0 0.185970 0.304517
11.0 0.149148 0.261940
12.0 0.132604 0.242485
13.0 0.115687 0.222237
14.0 0.109560 0.214993
15.0 0.096077 0.199331
17.0 0.082948 0.183369
19.0 0.069223 0.166369
21.0 0.056035 0.148915
22.0 0.048922 0.139498
24.0 0.041048 0.129294
25.0 0.032152 0.118076
28.0 0.023953 0.106289
30.0 0.023953 0.106289
31.0 0.014136 0.093092
34.0 0.006426 0.078329
37.0 0.000000 0.000000
In [242]:
kmf_hair_make_up.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[242]:
<function matplotlib.pyplot.show(*args, **kw)>
In [243]:
kmf_hair_make_up.confidence_interval_
Out[243]:
Hair & Make-Up_lower_0.95 Hair & Make-Up_upper_0.95
0.0 1.000000 1.000000
1.0 0.683908 0.804262
2.0 0.530902 0.667832
3.0 0.411086 0.552271
4.0 0.324622 0.463816
5.0 0.260529 0.394988
6.0 0.244775 0.377533
7.0 0.202470 0.329704
8.0 0.197249 0.323664
9.0 0.171392 0.293237
10.0 0.145978 0.262403
11.0 0.126010 0.237404
12.0 0.096770 0.199255
14.0 0.082532 0.179832
15.0 0.068605 0.160126
16.0 0.055049 0.140083
17.0 0.041957 0.119623
19.0 0.037274 0.112297
20.0 0.032695 0.104882
22.0 0.032695 0.104882
24.0 0.026769 0.096313
25.0 0.021160 0.087480
26.0 0.008789 0.067156
29.0 0.008789 0.067156
32.0 0.000000 0.000000
In [244]:
# Probability of leaving:
kmf_hair_make_up.cumulative_density_
Out[244]:
Hair & Make-Up
timeline
0.0 0.000000
1.0 0.250000
2.0 0.396853
3.0 0.516396
4.0 0.605334
5.0 0.672991
6.0 0.689906
7.0 0.735846
8.0 0.741588
9.0 0.770300
10.0 0.799013
11.0 0.821983
12.0 0.856438
14.0 0.873665
15.0 0.890893
16.0 0.908120
17.0 0.925348
19.0 0.931569
20.0 0.937790
22.0 0.937790
24.0 0.945566
25.0 0.953342
26.0 0.972005
29.0 0.972005
32.0 1.000000
In [245]:
kmf_hair_make_up.plot_cumulative_density()
Out[245]:
<matplotlib.axes._subplots.AxesSubplot at 0x11dea56a0>
In [246]:
naf_hair_make_up = NelsonAalenFitter()
naf_hair_make_up.fit(hair_make_up['No of Yrs'], event_observed = hair_make_up['dead'], label="Hair & Make-Up")
Out[246]:
<lifelines.NelsonAalenFitter:"Hair & Make-Up", fitted with 200 total observations, 21 right-censored observations>
In [247]:
naf.cumulative_hazard_
Out[247]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [248]:
naf_hair_make_up.plot_cumulative_hazard()
Out[248]:
<matplotlib.axes._subplots.AxesSubplot at 0x11e502128>
In [249]:
data_hair_make_up = hair_make_up[['No of Yrs', 'Entry','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]

cph_hair_make_up = CoxPHFitter()
cph_hair_make_up.fit(data_hair_make_up, 'No of Yrs',event_col='dead')
cph_hair_make_up.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 200
number of events observed 179
partial log-likelihood -764.85
time fit was run 2021-07-01 14:33:12 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.04 1.05 0.01 0.03 0.06 1.03 1.06 4.89 <0.005 19.89
Trainee prog -0.40 0.67 0.51 -1.40 0.59 0.25 1.81 -0.79 0.43 1.22
Gender -0.17 0.85 0.30 -0.75 0.42 0.47 1.52 -0.56 0.57 0.80
Rural 1.27 3.54 0.43 0.42 2.11 1.53 8.22 2.95 <0.005 8.28
Has Consistent Role 0.53 1.71 0.46 -0.38 1.44 0.69 4.24 1.15 0.25 2.00

Concordance 0.65
Partial AIC 1539.70
log-likelihood ratio test 33.28 on 5 df
-log2(p) of ll-ratio test 18.20

Costume

In [250]:
costume = pd.read_csv('Stats/Costume_df.csv')
In [251]:
kmf_costume = KaplanMeierFitter()
In [252]:
costume.shape
Out[252]:
(154, 54)
In [253]:
costume.loc[costume['dropout year'] == 0.0, 'dead'] = 0
costume.loc[costume['dropout year'] > 0, 'dead'] = 1
In [254]:
# drop null values from No of Yrs col
costume = costume[costume['No of Yrs'].notna()]
In [255]:
kmf_costume.fit(durations = costume['No of Yrs'],event_observed = costume['dead'], label="Costume")
Out[255]:
<lifelines.KaplanMeierFitter:"Costume", fitted with 154 total observations, 18 right-censored observations>
In [256]:
kmf_production.event_table
Out[256]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 816 816
1.0 315 253 62 0 816
2.0 139 123 16 0 501
3.0 92 86 6 0 362
4.0 59 52 7 0 270
5.0 50 49 1 0 211
6.0 24 18 6 0 161
7.0 22 22 0 0 137
8.0 14 10 4 0 115
9.0 16 14 2 0 101
10.0 14 13 1 0 85
11.0 5 4 1 0 71
12.0 6 6 0 0 66
13.0 11 8 3 0 60
14.0 6 3 3 0 49
15.0 6 6 0 0 43
16.0 3 3 0 0 37
17.0 2 1 1 0 34
18.0 6 4 2 0 32
19.0 3 3 0 0 26
20.0 3 2 1 0 23
21.0 3 2 1 0 20
22.0 3 3 0 0 17
23.0 2 1 1 0 14
24.0 5 4 1 0 12
25.0 2 1 1 0 7
26.0 1 0 1 0 5
27.0 1 0 1 0 4
28.0 1 0 1 0 3
29.0 2 2 0 0 2
In [257]:
kmf_production.survival_function_
Out[257]:
Production
timeline
0.0 1.000000
1.0 0.689951
2.0 0.520562
3.0 0.396892
4.0 0.320454
5.0 0.246036
6.0 0.218529
7.0 0.183436
8.0 0.167485
9.0 0.144270
10.0 0.122205
11.0 0.115320
12.0 0.104836
13.0 0.090858
14.0 0.085295
15.0 0.073394
16.0 0.067443
17.0 0.065459
18.0 0.057277
19.0 0.050668
20.0 0.046262
21.0 0.041636
22.0 0.034288
23.0 0.031839
24.0 0.021226
25.0 0.018194
26.0 0.018194
27.0 0.018194
28.0 0.018194
29.0 0.000000
In [258]:
kmf_costume.median_survival_time_
Out[258]:
4.0
In [259]:
kmf_p_production.confidence_interval_
Out[259]:
Post-Production_lower_0.95 Post-Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.718811 0.827471
2.0 0.544364 0.673692
3.0 0.448833 0.581936
4.0 0.396309 0.529573
5.0 0.329108 0.460512
6.0 0.290851 0.419970
7.0 0.266438 0.393732
8.0 0.216642 0.339054
9.0 0.201617 0.322153
10.0 0.185970 0.304517
11.0 0.149148 0.261940
12.0 0.132604 0.242485
13.0 0.115687 0.222237
14.0 0.109560 0.214993
15.0 0.096077 0.199331
17.0 0.082948 0.183369
19.0 0.069223 0.166369
21.0 0.056035 0.148915
22.0 0.048922 0.139498
24.0 0.041048 0.129294
25.0 0.032152 0.118076
28.0 0.023953 0.106289
30.0 0.023953 0.106289
31.0 0.014136 0.093092
34.0 0.006426 0.078329
37.0 0.000000 0.000000
In [260]:
kmf_costume.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[260]:
<function matplotlib.pyplot.show(*args, **kw)>
In [261]:
kmf_costume.confidence_interval_
Out[261]:
Costume_lower_0.95 Costume_upper_0.95
0.0 1.000000 1.000000
1.0 0.635864 0.778765
2.0 0.548757 0.702484
3.0 0.434036 0.594777
4.0 0.342879 0.502984
5.0 0.287320 0.444155
6.0 0.230579 0.381904
7.0 0.189228 0.334140
8.0 0.168982 0.309870
9.0 0.149055 0.285314
10.0 0.123038 0.252075
11.0 0.116304 0.243385
12.0 0.109627 0.234645
13.0 0.077207 0.190092
14.0 0.064767 0.171803
15.0 0.046834 0.143730
18.0 0.041090 0.134167
20.0 0.041090 0.134167
21.0 0.034686 0.123811
22.0 0.017112 0.091374
24.0 0.017112 0.091374
25.0 0.017112 0.091374
30.0 0.001555 0.063791
35.0 0.001555 0.063791
In [262]:
# Probability of leaving:
kmf_costume.cumulative_density_
Out[262]:
Costume
timeline
0.0 0.000000
1.0 0.285714
2.0 0.368932
3.0 0.482382
4.0 0.575841
5.0 0.634346
6.0 0.695288
7.0 0.740995
8.0 0.763848
9.0 0.786702
10.0 0.817173
11.0 0.825122
12.0 0.833071
13.0 0.872816
14.0 0.888714
15.0 0.912561
18.0 0.920510
20.0 0.920510
21.0 0.929342
22.0 0.955839
24.0 0.955839
25.0 0.955839
30.0 0.985280
35.0 0.985280
In [263]:
kmf_costume.plot_cumulative_density()
Out[263]:
<matplotlib.axes._subplots.AxesSubplot at 0x106cd7710>
In [264]:
naf_costume = NelsonAalenFitter()
naf_costume.fit(costume['No of Yrs'], event_observed = costume['dead'], label="Costume")
Out[264]:
<lifelines.NelsonAalenFitter:"Costume", fitted with 154 total observations, 18 right-censored observations>
In [265]:
naf.cumulative_hazard_
Out[265]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [266]:
naf_costume.plot_cumulative_hazard()
Out[266]:
<matplotlib.axes._subplots.AxesSubplot at 0x106cbe0b8>
In [267]:
data_costume = costume[['No of Yrs', 'Entry','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]

cph_costume = CoxPHFitter()
cph_costume.fit(data_costume, 'No of Yrs',event_col='dead')
cph_costume.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 154
number of events observed 136
partial log-likelihood -549.84
time fit was run 2021-07-01 14:33:14 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.04 1.04 0.01 0.02 0.06 1.02 1.06 3.91 <0.005 13.41
Trainee prog 0.27 1.31 0.46 -0.64 1.17 0.53 3.23 0.58 0.56 0.83
Gender -0.33 0.72 0.25 -0.81 0.16 0.44 1.18 -1.31 0.19 2.39
Rural -0.24 0.79 0.59 -1.39 0.91 0.25 2.49 -0.41 0.69 0.55
Has Consistent Role 0.59 1.80 0.31 -0.03 1.20 0.98 3.31 1.88 0.06 4.05

Concordance 0.63
Partial AIC 1109.68
log-likelihood ratio test 17.85 on 5 df
-log2(p) of ll-ratio test 8.31
In [ ]:

Music

In [268]:
music = pd.read_csv('Stats/Music_df.csv')
In [269]:
kmf_music = KaplanMeierFitter()
In [270]:
music.shape
Out[270]:
(86, 54)
In [271]:
music.loc[music['dropout year'] == 0.0, 'dead'] = 0
music.loc[music['dropout year'] > 0, 'dead'] = 1
In [272]:
# drop null values from No of Yrs col
music = music[music['No of Yrs'].notna()]
In [273]:
kmf_music.fit(durations = music['No of Yrs'],event_observed = music['dead'], label="Music")
Out[273]:
<lifelines.KaplanMeierFitter:"Music", fitted with 86 total observations, 6 right-censored observations>
In [274]:
kmf_music.event_table
Out[274]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 86 86
1.0 35 33 2 0 86
2.0 21 20 1 0 51
3.0 5 5 0 0 30
4.0 7 6 1 0 25
5.0 5 5 0 0 18
6.0 2 2 0 0 13
7.0 1 1 0 0 11
8.0 1 1 0 0 10
9.0 4 3 1 0 9
10.0 1 1 0 0 5
12.0 1 1 0 0 4
14.0 1 0 1 0 3
21.0 1 1 0 0 2
23.0 1 1 0 0 1
In [275]:
kmf_music.survival_function_
Out[275]:
Music
timeline
0.0 1.000000
1.0 0.616279
2.0 0.374601
3.0 0.312168
4.0 0.237247
5.0 0.171345
6.0 0.144984
7.0 0.131804
8.0 0.118624
9.0 0.079082
10.0 0.063266
12.0 0.047449
14.0 0.047449
21.0 0.023725
23.0 0.000000
In [276]:
kmf_music.median_survival_time_
Out[276]:
2.0
In [277]:
kmf_p_production.confidence_interval_
Out[277]:
Post-Production_lower_0.95 Post-Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.718811 0.827471
2.0 0.544364 0.673692
3.0 0.448833 0.581936
4.0 0.396309 0.529573
5.0 0.329108 0.460512
6.0 0.290851 0.419970
7.0 0.266438 0.393732
8.0 0.216642 0.339054
9.0 0.201617 0.322153
10.0 0.185970 0.304517
11.0 0.149148 0.261940
12.0 0.132604 0.242485
13.0 0.115687 0.222237
14.0 0.109560 0.214993
15.0 0.096077 0.199331
17.0 0.082948 0.183369
19.0 0.069223 0.166369
21.0 0.056035 0.148915
22.0 0.048922 0.139498
24.0 0.041048 0.129294
25.0 0.032152 0.118076
28.0 0.023953 0.106289
30.0 0.023953 0.106289
31.0 0.014136 0.093092
34.0 0.006426 0.078329
37.0 0.000000 0.000000
In [278]:
kmf_music.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[278]:
<function matplotlib.pyplot.show(*args, **kw)>
In [279]:
kmf_music.confidence_interval_
Out[279]:
Music_lower_0.95 Music_upper_0.95
0.0 1.000000 1.000000
1.0 0.505020 0.709652
2.0 0.272220 0.476649
3.0 0.216162 0.412763
4.0 0.151980 0.333346
5.0 0.098395 0.261302
6.0 0.078259 0.231366
7.0 0.068532 0.216103
8.0 0.059066 0.200616
9.0 0.032609 0.152493
10.0 0.022529 0.134131
12.0 0.013665 0.114838
14.0 0.013665 0.114838
21.0 0.002609 0.095090
23.0 0.000000 0.000000
In [280]:
# Probability of leaving:
kmf_music.cumulative_density_
Out[280]:
Music
timeline
0.0 0.000000
1.0 0.383721
2.0 0.625399
3.0 0.687832
4.0 0.762753
5.0 0.828655
6.0 0.855016
7.0 0.868196
8.0 0.881376
9.0 0.920918
10.0 0.936734
12.0 0.952551
14.0 0.952551
21.0 0.976275
23.0 1.000000
In [281]:
kmf_music.plot_cumulative_density()
Out[281]:
<matplotlib.axes._subplots.AxesSubplot at 0x11dd5e128>
In [282]:
naf_music = NelsonAalenFitter()
naf_music.fit(music['No of Yrs'], event_observed = music['dead'], label="Music")
Out[282]:
<lifelines.NelsonAalenFitter:"Music", fitted with 86 total observations, 6 right-censored observations>
In [283]:
naf.cumulative_hazard_
Out[283]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [284]:
naf_music.plot_cumulative_hazard()
Out[284]:
<matplotlib.axes._subplots.AxesSubplot at 0x12106d518>
In [285]:
music['No of Yrs'].isnull().value_counts(dropna=False)
Out[285]:
False    86
Name: No of Yrs, dtype: int64
In [286]:
# Trainee Prog col dropped due to convergence issue: very low incidence. 
data_music = music[['No of Yrs', 'Entry','Gender','Rural', 'Has Consistent Role','dead']]

cph_music = CoxPHFitter()
cph_music.fit(data_music, 'No of Yrs',event_col='dead')
cph_music.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 86
number of events observed 80
partial log-likelihood -279.36
time fit was run 2021-07-01 14:33:15 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry -0.00 1.00 0.01 -0.03 0.02 0.97 1.02 -0.29 0.77 0.37
Gender 0.69 2.00 0.28 0.15 1.23 1.16 3.44 2.50 0.01 6.34
Rural -0.21 0.81 0.39 -0.97 0.55 0.38 1.74 -0.54 0.59 0.76
Has Consistent Role 0.87 2.39 0.75 -0.59 2.34 0.55 10.34 1.17 0.24 2.04

Concordance 0.58
Partial AIC 566.73
log-likelihood ratio test 7.33 on 4 df
-log2(p) of ll-ratio test 3.07

Support

In [287]:
support = pd.read_csv('Stats/Support_df.csv')
In [288]:
kmf_support = KaplanMeierFitter()
In [289]:
support.shape
Out[289]:
(83, 54)
In [290]:
support.loc[support['dropout year'] == 0.0, 'dead'] = 0
support.loc[support['dropout year'] > 0, 'dead'] = 1
In [291]:
# drop null values from No of Yrs col
support = support[support['No of Yrs'].notna()]
In [292]:
# check
#df['No of Yrs'].value_counts(dropna=False)
In [293]:
kmf_support.fit(durations = support['No of Yrs'],event_observed = support['dead'], label="Support")
Out[293]:
<lifelines.KaplanMeierFitter:"Support", fitted with 83 total observations, 11 right-censored observations>
In [294]:
kmf_support.event_table
Out[294]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 83 83
1.0 31 27 4 0 83
2.0 18 15 3 0 52
3.0 9 9 0 0 34
4.0 4 4 0 0 25
5.0 2 2 0 0 21
6.0 2 2 0 0 19
8.0 2 1 1 0 17
9.0 1 1 0 0 15
10.0 2 1 1 0 14
11.0 1 1 0 0 12
12.0 3 3 0 0 11
13.0 1 1 0 0 8
14.0 1 1 0 0 7
17.0 1 1 0 0 6
18.0 2 1 1 0 5
24.0 1 0 1 0 3
25.0 1 1 0 0 2
35.0 1 1 0 0 1
In [295]:
kmf_support.survival_function_
Out[295]:
Support
timeline
0.0 1.000000
1.0 0.674699
2.0 0.480074
3.0 0.352996
4.0 0.296516
5.0 0.268277
6.0 0.240037
8.0 0.225917
9.0 0.210856
10.0 0.195795
11.0 0.179479
12.0 0.130530
13.0 0.114214
14.0 0.097897
17.0 0.081581
18.0 0.065265
24.0 0.065265
25.0 0.032632
35.0 0.000000
In [296]:
kmf_support.median_survival_time_
Out[296]:
2.0
In [297]:
kmf_p_production.confidence_interval_
Out[297]:
Post-Production_lower_0.95 Post-Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.718811 0.827471
2.0 0.544364 0.673692
3.0 0.448833 0.581936
4.0 0.396309 0.529573
5.0 0.329108 0.460512
6.0 0.290851 0.419970
7.0 0.266438 0.393732
8.0 0.216642 0.339054
9.0 0.201617 0.322153
10.0 0.185970 0.304517
11.0 0.149148 0.261940
12.0 0.132604 0.242485
13.0 0.115687 0.222237
14.0 0.109560 0.214993
15.0 0.096077 0.199331
17.0 0.082948 0.183369
19.0 0.069223 0.166369
21.0 0.056035 0.148915
22.0 0.048922 0.139498
24.0 0.041048 0.129294
25.0 0.032152 0.118076
28.0 0.023953 0.106289
30.0 0.023953 0.106289
31.0 0.014136 0.093092
34.0 0.006426 0.078329
37.0 0.000000 0.000000
In [298]:
kmf_support.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[298]:
<function matplotlib.pyplot.show(*args, **kw)>
In [299]:
kmf_support.confidence_interval_
Out[299]:
Support_lower_0.95 Support_upper_0.95
0.0 1.000000 1.000000
1.0 0.562603 0.763998
2.0 0.367128 0.584274
3.0 0.247715 0.459777
4.0 0.197764 0.401782
5.0 0.173580 0.372096
6.0 0.149975 0.341903
8.0 0.138408 0.326601
9.0 0.126038 0.310416
10.0 0.113904 0.294034
11.0 0.100739 0.276515
12.0 0.063622 0.222022
13.0 0.052175 0.203098
14.0 0.041291 0.183712
17.0 0.031068 0.163785
18.0 0.021646 0.143210
24.0 0.021646 0.143210
25.0 0.003808 0.122131
35.0 0.000000 0.000000
In [300]:
# Probability of leaving:
kmf_support.cumulative_density_
Out[300]:
Support
timeline
0.0 0.000000
1.0 0.325301
2.0 0.519926
3.0 0.647004
4.0 0.703484
5.0 0.731723
6.0 0.759963
8.0 0.774083
9.0 0.789144
10.0 0.804205
11.0 0.820521
12.0 0.869470
13.0 0.885786
14.0 0.902103
17.0 0.918419
18.0 0.934735
24.0 0.934735
25.0 0.967368
35.0 1.000000
In [301]:
kmf_support.plot_cumulative_density()
Out[301]:
<matplotlib.axes._subplots.AxesSubplot at 0x1214a7320>
In [302]:
naf_support = NelsonAalenFitter()
naf_support.fit(support['No of Yrs'], event_observed = support['dead'], label="Support")
Out[302]:
<lifelines.NelsonAalenFitter:"Support", fitted with 83 total observations, 11 right-censored observations>
In [303]:
naf.cumulative_hazard_
Out[303]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [304]:
naf_support.plot_cumulative_hazard()
Out[304]:
<matplotlib.axes._subplots.AxesSubplot at 0x12168fc50>
In [305]:
data_support = support[['No of Yrs', 'Entry','Trainee prog','Gender','Rural', 'Has Consistent Role','dead']]

cph_support = CoxPHFitter()
cph_support.fit(data_support, 'No of Yrs',event_col='dead')
cph_support.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 83
number of events observed 72
partial log-likelihood -245.14
time fit was run 2021-07-01 14:33:17 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.03 1.03 0.01 -0.00 0.05 1.00 1.05 1.93 0.05 4.21
Trainee prog -0.01 0.99 1.07 -2.11 2.09 0.12 8.12 -0.01 0.99 0.01
Gender 0.29 1.34 0.23 -0.16 0.74 0.85 2.10 1.26 0.21 2.26
Rural 1.86 6.45 0.66 0.57 3.15 1.78 23.40 2.83 <0.005 7.77
Has Consistent Role 0.44 1.55 0.37 -0.28 1.16 0.75 3.19 1.19 0.23 2.10

Concordance 0.68
Partial AIC 500.29
log-likelihood ratio test 14.37 on 5 df
-log2(p) of ll-ratio test 6.22

Script

In [306]:
script = pd.read_csv('Stats/Script_df.csv')
In [307]:
kmf_script = KaplanMeierFitter()
In [308]:
script.shape
Out[308]:
(79, 54)
In [309]:
script.loc[script['dropout year'] == 0.0, 'dead'] = 0
script.loc[script['dropout year'] > 0, 'dead'] = 1
In [310]:
# drop null values from No of Yrs col
script = script[script['No of Yrs'].notna()]
In [311]:
kmf_script.fit(durations = script['No of Yrs'],event_observed = script['dead'], label="Script")
Out[311]:
<lifelines.KaplanMeierFitter:"Script", fitted with 79 total observations, 3 right-censored observations>
In [312]:
kmf_script.event_table
Out[312]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 79 79
1.0 17 17 0 0 79
2.0 13 13 0 0 62
3.0 17 16 1 0 49
4.0 6 5 1 0 32
5.0 1 1 0 0 26
6.0 3 3 0 0 25
7.0 4 3 1 0 22
8.0 2 2 0 0 18
9.0 3 3 0 0 16
10.0 1 1 0 0 13
11.0 2 2 0 0 12
15.0 2 2 0 0 10
16.0 1 1 0 0 8
18.0 3 3 0 0 7
21.0 1 1 0 0 4
23.0 1 1 0 0 3
24.0 1 1 0 0 2
28.0 1 1 0 0 1
In [313]:
kmf_script.survival_function_
Out[313]:
Script
timeline
0.0 1.000000
1.0 0.784810
2.0 0.620253
3.0 0.417722
4.0 0.352453
5.0 0.338897
6.0 0.298229
7.0 0.257561
8.0 0.228944
9.0 0.186017
10.0 0.171708
11.0 0.143090
15.0 0.114472
16.0 0.100163
18.0 0.057236
21.0 0.042927
23.0 0.028618
24.0 0.014309
28.0 0.000000
In [314]:
kmf_script.median_survival_time_
Out[314]:
3.0
In [315]:
kmf_p_production.confidence_interval_
Out[315]:
Post-Production_lower_0.95 Post-Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.718811 0.827471
2.0 0.544364 0.673692
3.0 0.448833 0.581936
4.0 0.396309 0.529573
5.0 0.329108 0.460512
6.0 0.290851 0.419970
7.0 0.266438 0.393732
8.0 0.216642 0.339054
9.0 0.201617 0.322153
10.0 0.185970 0.304517
11.0 0.149148 0.261940
12.0 0.132604 0.242485
13.0 0.115687 0.222237
14.0 0.109560 0.214993
15.0 0.096077 0.199331
17.0 0.082948 0.183369
19.0 0.069223 0.166369
21.0 0.056035 0.148915
22.0 0.048922 0.139498
24.0 0.041048 0.129294
25.0 0.032152 0.118076
28.0 0.023953 0.106289
30.0 0.023953 0.106289
31.0 0.014136 0.093092
34.0 0.006426 0.078329
37.0 0.000000 0.000000
In [316]:
kmf_script.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[316]:
<function matplotlib.pyplot.show(*args, **kw)>
In [317]:
kmf_script.confidence_interval_
Out[317]:
Script_lower_0.95 Script_upper_0.95
0.0 1.000000 1.000000
1.0 0.676896 0.860311
2.0 0.503862 0.716904
3.0 0.308424 0.523182
4.0 0.248958 0.457436
5.0 0.236774 0.443646
6.0 0.200899 0.401689
7.0 0.166104 0.358784
8.0 0.142016 0.328382
9.0 0.107309 0.281567
10.0 0.096169 0.265596
11.0 0.074635 0.233013
15.0 0.054278 0.199423
16.0 0.044636 0.182170
18.0 0.018701 0.127910
21.0 0.011495 0.108676
23.0 0.005452 0.088641
24.0 0.001219 0.068010
28.0 0.000000 0.000000
In [318]:
# Probability of leaving:
kmf_script.cumulative_density_
Out[318]:
Script
timeline
0.0 0.000000
1.0 0.215190
2.0 0.379747
3.0 0.582278
4.0 0.647547
5.0 0.661103
6.0 0.701771
7.0 0.742439
8.0 0.771056
9.0 0.813983
10.0 0.828292
11.0 0.856910
15.0 0.885528
16.0 0.899837
18.0 0.942764
21.0 0.957073
23.0 0.971382
24.0 0.985691
28.0 1.000000
In [319]:
kmf_script.plot_cumulative_density()
Out[319]:
<matplotlib.axes._subplots.AxesSubplot at 0x121b586d8>
In [320]:
naf_script = NelsonAalenFitter()
naf_script.fit(script['No of Yrs'], event_observed = script['dead'], label="Script")
Out[320]:
<lifelines.NelsonAalenFitter:"Script", fitted with 79 total observations, 3 right-censored observations>
In [321]:
naf.cumulative_hazard_
Out[321]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [322]:
naf_script.plot_cumulative_hazard()
Out[322]:
<matplotlib.axes._subplots.AxesSubplot at 0x121d2b710>
In [323]:
# Trainee prog and Rural cols dropped due to convegece issue: low incidence
data_script = script[['No of Yrs', 'Entry','Gender', 'Has Consistent Role','dead']]

cph_script = CoxPHFitter()
cph_script.fit(data_script, 'No of Yrs',event_col='dead')
cph_script.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 79
number of events observed 76
partial log-likelihood -258.08
time fit was run 2021-07-01 14:33:19 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.02 1.02 0.01 -0.00 0.04 1.00 1.04 1.69 0.09 3.46
Gender -0.07 0.93 0.25 -0.57 0.42 0.57 1.53 -0.29 0.77 0.37
Has Consistent Role 0.05 1.05 0.28 -0.50 0.60 0.61 1.83 0.18 0.86 0.23

Concordance 0.58
Partial AIC 522.17
log-likelihood ratio test 2.94 on 3 df
-log2(p) of ll-ratio test 1.32

Casting

In [324]:
casting = pd.read_csv('Stats/Casting_df.csv')
In [325]:
kmf_casting = KaplanMeierFitter()
In [326]:
casting.shape
Out[326]:
(76, 54)
In [327]:
casting.loc[casting['dropout year'] == 0.0, 'dead'] = 0
casting.loc[casting['dropout year'] > 0, 'dead'] = 1
In [328]:
# drop null values from No of Yrs col
casting = casting[casting['No of Yrs'].notna()]
In [329]:
kmf_casting.fit(durations = casting['No of Yrs'],event_observed = casting['dead'], label="Casting")
Out[329]:
<lifelines.KaplanMeierFitter:"Casting", fitted with 76 total observations, 12 right-censored observations>
In [330]:
kmf_casting.event_table
Out[330]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 76 76
1.0 28 24 4 0 76
2.0 19 16 3 0 48
3.0 9 9 0 0 29
4.0 3 3 0 0 20
5.0 3 2 1 0 17
6.0 1 1 0 0 14
7.0 1 1 0 0 13
8.0 2 2 0 0 12
9.0 1 0 1 0 10
11.0 1 1 0 0 9
13.0 2 2 0 0 8
14.0 2 1 1 0 6
15.0 1 1 0 0 4
16.0 1 1 0 0 3
19.0 1 0 1 0 2
22.0 1 0 1 0 1
In [331]:
kmf_casting.survival_function_
Out[331]:
Casting
timeline
0.0 1.000000
1.0 0.684211
2.0 0.456140
3.0 0.314580
4.0 0.267393
5.0 0.235935
6.0 0.219082
7.0 0.202230
8.0 0.168525
9.0 0.168525
11.0 0.149800
13.0 0.112350
14.0 0.093625
15.0 0.070219
16.0 0.046812
19.0 0.046812
22.0 0.046812
In [332]:
kmf_casting.median_survival_time_
Out[332]:
2.0
In [333]:
kmf_p_production.confidence_interval_
Out[333]:
Post-Production_lower_0.95 Post-Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.718811 0.827471
2.0 0.544364 0.673692
3.0 0.448833 0.581936
4.0 0.396309 0.529573
5.0 0.329108 0.460512
6.0 0.290851 0.419970
7.0 0.266438 0.393732
8.0 0.216642 0.339054
9.0 0.201617 0.322153
10.0 0.185970 0.304517
11.0 0.149148 0.261940
12.0 0.132604 0.242485
13.0 0.115687 0.222237
14.0 0.109560 0.214993
15.0 0.096077 0.199331
17.0 0.082948 0.183369
19.0 0.069223 0.166369
21.0 0.056035 0.148915
22.0 0.048922 0.139498
24.0 0.041048 0.129294
25.0 0.032152 0.118076
28.0 0.023953 0.106289
30.0 0.023953 0.106289
31.0 0.014136 0.093092
34.0 0.006426 0.078329
37.0 0.000000 0.000000
In [334]:
kmf_casting.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[334]:
<function matplotlib.pyplot.show(*args, **kw)>
In [335]:
kmf_casting.confidence_interval_
Out[335]:
Casting_lower_0.95 Casting_upper_0.95
0.0 1.000000 1.000000
1.0 0.566919 0.775886
2.0 0.339030 0.565733
3.0 0.208853 0.425692
4.0 0.168477 0.376468
5.0 0.142514 0.342830
6.0 0.128683 0.324882
7.0 0.115157 0.306681
8.0 0.089107 0.269449
9.0 0.089107 0.269449
11.0 0.074817 0.249050
13.0 0.048273 0.206628
14.0 0.036204 0.184452
15.0 0.021497 0.159241
16.0 0.009784 0.131886
19.0 0.009784 0.131886
22.0 0.009784 0.131886
In [336]:
# Probability of leaving:
kmf_casting.cumulative_density_
Out[336]:
Casting
timeline
0.0 0.000000
1.0 0.315789
2.0 0.543860
3.0 0.685420
4.0 0.732607
5.0 0.764065
6.0 0.780918
7.0 0.797770
8.0 0.831475
9.0 0.831475
11.0 0.850200
13.0 0.887650
14.0 0.906375
15.0 0.929781
16.0 0.953188
19.0 0.953188
22.0 0.953188
In [337]:
kmf_casting.plot_cumulative_density()
Out[337]:
<matplotlib.axes._subplots.AxesSubplot at 0x12145b9e8>
In [338]:
naf_casting = NelsonAalenFitter()
naf_casting.fit(casting['No of Yrs'], event_observed = casting['dead'], label="Casting")
Out[338]:
<lifelines.NelsonAalenFitter:"Casting", fitted with 76 total observations, 12 right-censored observations>
In [339]:
naf.cumulative_hazard_
Out[339]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [340]:
naf_casting.plot_cumulative_hazard()
Out[340]:
<matplotlib.axes._subplots.AxesSubplot at 0x1214b2898>
In [341]:
# Trainee prog and Rural columns dropped due to low incidence causing convergence issue
data_casting = casting[['No of Yrs', 'Entry','Gender', 'Has Consistent Role','dead']]

cph_casting = CoxPHFitter()
cph_casting.fit(data_casting, 'No of Yrs',event_col='dead')
cph_casting.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 76
number of events observed 64
partial log-likelihood -216.83
time fit was run 2021-07-01 14:33:21 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry -0.01 0.99 0.02 -0.04 0.02 0.96 1.02 -0.74 0.46 1.11
Gender -0.49 0.61 0.27 -1.01 0.03 0.36 1.03 -1.86 0.06 3.98
Has Consistent Role 1.88 6.54 0.75 0.41 3.35 1.51 28.39 2.51 0.01 6.36

Concordance 0.62
Partial AIC 439.65
log-likelihood ratio test 12.19 on 3 df
-log2(p) of ll-ratio test 7.21

Construction

In [342]:
construction = pd.read_csv('Stats/Construction_df.csv')
In [343]:
kmf_construction = KaplanMeierFitter()
In [344]:
construction.shape
Out[344]:
(51, 54)
In [345]:
construction.loc[construction['dropout year'] == 0.0, 'dead'] = 0
construction.loc[construction['dropout year'] > 0, 'dead'] = 1
In [346]:
# drop null values from No of Yrs col
construction = construction[construction['No of Yrs'].notna()]
In [347]:
kmf_construction.fit(durations = construction['No of Yrs'],event_observed = construction['dead'], label="Construction")
Out[347]:
<lifelines.KaplanMeierFitter:"Construction", fitted with 51 total observations, 2 right-censored observations>
In [348]:
kmf_construction.event_table
Out[348]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 51 51
1.0 24 22 2 0 51
2.0 4 4 0 0 27
3.0 7 7 0 0 23
4.0 2 2 0 0 16
5.0 2 2 0 0 14
6.0 3 3 0 0 12
7.0 1 1 0 0 9
9.0 2 2 0 0 8
11.0 2 2 0 0 6
13.0 2 2 0 0 4
17.0 1 1 0 0 2
26.0 1 1 0 0 1
In [349]:
kmf_construction.survival_function_
Out[349]:
Construction
timeline
0.0 1.000000
1.0 0.568627
2.0 0.484386
3.0 0.336964
4.0 0.294844
5.0 0.252723
6.0 0.189542
7.0 0.168482
9.0 0.126362
11.0 0.084241
13.0 0.042121
17.0 0.021060
26.0 0.000000
In [350]:
kmf_construction.median_survival_time_
Out[350]:
2.0
In [351]:
kmf_construction.confidence_interval_
Out[351]:
Construction_lower_0.95 Construction_upper_0.95
0.0 1.000000 1.000000
1.0 0.422253 0.690976
2.0 0.341046 0.613579
3.0 0.209700 0.468841
4.0 0.174836 0.425147
5.0 0.141335 0.380253
6.0 0.094082 0.310282
7.0 0.079290 0.286123
9.0 0.051534 0.236225
11.0 0.027023 0.183585
13.0 0.007766 0.126830
17.0 0.001699 0.096654
26.0 0.000000 0.000000
In [352]:
kmf_construction.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[352]:
<function matplotlib.pyplot.show(*args, **kw)>
In [353]:
kmf_construction.confidence_interval_
Out[353]:
Construction_lower_0.95 Construction_upper_0.95
0.0 1.000000 1.000000
1.0 0.422253 0.690976
2.0 0.341046 0.613579
3.0 0.209700 0.468841
4.0 0.174836 0.425147
5.0 0.141335 0.380253
6.0 0.094082 0.310282
7.0 0.079290 0.286123
9.0 0.051534 0.236225
11.0 0.027023 0.183585
13.0 0.007766 0.126830
17.0 0.001699 0.096654
26.0 0.000000 0.000000
In [354]:
# Probability of leaving:
kmf_construction.cumulative_density_
Out[354]:
Construction
timeline
0.0 0.000000
1.0 0.431373
2.0 0.515614
3.0 0.663036
4.0 0.705156
5.0 0.747277
6.0 0.810458
7.0 0.831518
9.0 0.873638
11.0 0.915759
13.0 0.957879
17.0 0.978940
26.0 1.000000
In [355]:
kmf_construction.plot_cumulative_density()
Out[355]:
<matplotlib.axes._subplots.AxesSubplot at 0x120ac9c50>
In [356]:
naf_construction = NelsonAalenFitter()
naf_construction.fit(construction['No of Yrs'], event_observed = construction['dead'], label="Construction")
Out[356]:
<lifelines.NelsonAalenFitter:"Construction", fitted with 51 total observations, 2 right-censored observations>
In [357]:
naf.cumulative_hazard_
Out[357]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [358]:
naf_construction.plot_cumulative_hazard()
Out[358]:
<matplotlib.axes._subplots.AxesSubplot at 0x120ade828>
In [359]:
#Rural, Gender, Has Consistent Role, Trainee prog removed due to onvergence issue: low incidence.
data_construction = construction[['No of Yrs', 'Entry','dead']]

cph_construction = CoxPHFitter()
cph_construction.fit(data_construction, 'No of Yrs',event_col='dead')
cph_construction.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 51
number of events observed 49
partial log-likelihood -145.35
time fit was run 2021-07-01 14:33:23 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.01 1.01 0.02 -0.02 0.05 0.98 1.05 0.86 0.39 1.35

Concordance 0.52
Partial AIC 292.70
log-likelihood ratio test 0.72 on 1 df
-log2(p) of ll-ratio test 1.34

Special FX

In [360]:
special_fx = pd.read_csv('Stats/Special_FX_df.csv')
In [361]:
kmf_special_fx = KaplanMeierFitter()
In [362]:
special_fx.shape
Out[362]:
(22, 54)
In [363]:
special_fx.loc[special_fx['dropout year'] == 0.0, 'dead'] = 0
special_fx.loc[special_fx['dropout year'] > 0, 'dead'] = 1
In [364]:
# drop null values from No of Yrs col
special_fx = special_fx[special_fx['No of Yrs'].notna()]
In [365]:
kmf_special_fx.fit(durations = special_fx['No of Yrs'],event_observed = special_fx['dead'], label="Special FX")
Out[365]:
<lifelines.KaplanMeierFitter:"Special FX", fitted with 22 total observations, 1 right-censored observations>
In [366]:
kmf_special_fx.event_table
Out[366]:
removed observed censored entrance at_risk
event_at
0.0 0 0 0 22 22
1.0 11 11 0 0 22
2.0 2 2 0 0 11
3.0 2 2 0 0 9
5.0 1 1 0 0 7
7.0 1 1 0 0 6
8.0 2 2 0 0 5
11.0 1 0 1 0 3
16.0 1 1 0 0 2
18.0 1 1 0 0 1
In [367]:
kmf_special_fx.survival_function_
Out[367]:
Special FX
timeline
0.0 1.000000
1.0 0.500000
2.0 0.409091
3.0 0.318182
5.0 0.272727
7.0 0.227273
8.0 0.136364
11.0 0.136364
16.0 0.068182
18.0 0.000000
In [368]:
kmf_special_fx.median_survival_time_
Out[368]:
1.0
In [369]:
kmf_p_production.confidence_interval_
Out[369]:
Post-Production_lower_0.95 Post-Production_upper_0.95
0.0 1.000000 1.000000
1.0 0.718811 0.827471
2.0 0.544364 0.673692
3.0 0.448833 0.581936
4.0 0.396309 0.529573
5.0 0.329108 0.460512
6.0 0.290851 0.419970
7.0 0.266438 0.393732
8.0 0.216642 0.339054
9.0 0.201617 0.322153
10.0 0.185970 0.304517
11.0 0.149148 0.261940
12.0 0.132604 0.242485
13.0 0.115687 0.222237
14.0 0.109560 0.214993
15.0 0.096077 0.199331
17.0 0.082948 0.183369
19.0 0.069223 0.166369
21.0 0.056035 0.148915
22.0 0.048922 0.139498
24.0 0.041048 0.129294
25.0 0.032152 0.118076
28.0 0.023953 0.106289
30.0 0.023953 0.106289
31.0 0.014136 0.093092
34.0 0.006426 0.078329
37.0 0.000000 0.000000
In [370]:
kmf_special_fx.plot_survival_function(at_risk_counts=True)
plt.title("Kaplan-Meier Estimate")
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[370]:
<function matplotlib.pyplot.show(*args, **kw)>
In [371]:
kmf_special_fx.confidence_interval_
Out[371]:
Special FX_lower_0.95 Special FX_upper_0.95
0.0 1.000000 1.000000
1.0 0.281787 0.684324
2.0 0.208521 0.600735
3.0 0.141759 0.511081
5.0 0.111158 0.463731
7.0 0.082726 0.414451
8.0 0.034130 0.308715
11.0 0.034130 0.308715
16.0 0.005899 0.245346
18.0 0.000000 0.000000
In [372]:
# Probability of leaving:
kmf_special_fx.cumulative_density_
Out[372]:
Special FX
timeline
0.0 0.000000
1.0 0.500000
2.0 0.590909
3.0 0.681818
5.0 0.727273
7.0 0.772727
8.0 0.863636
11.0 0.863636
16.0 0.931818
18.0 1.000000
In [373]:
kmf_special_fx.plot_cumulative_density()
Out[373]:
<matplotlib.axes._subplots.AxesSubplot at 0x1220e8c18>
In [374]:
naf_special_fx = NelsonAalenFitter()
naf_special_fx.fit(special_fx['No of Yrs'], event_observed = special_fx['dead'], label="Special FX")
Out[374]:
<lifelines.NelsonAalenFitter:"Special FX", fitted with 22 total observations, 1 right-censored observations>
In [375]:
naf.cumulative_hazard_
Out[375]:
Film Bang Listings
timeline
0.0 0.000000
1.0 0.302719
2.0 0.507069
3.0 0.689240
4.0 0.834566
5.0 0.989502
6.0 1.087008
7.0 1.201594
8.0 1.296850
9.0 1.408727
10.0 1.517739
11.0 1.612336
12.0 1.699428
13.0 1.798376
14.0 1.875952
15.0 1.995419
16.0 2.070927
17.0 2.157242
18.0 2.235714
19.0 2.319763
20.0 2.399984
21.0 2.506684
22.0 2.636660
23.0 2.691281
24.0 2.791364
25.0 2.863143
26.0 2.995933
27.0 3.060890
28.0 3.197463
29.0 3.341854
30.0 3.508900
31.0 3.597186
32.0 3.770011
33.0 3.895156
34.0 4.100962
35.0 4.332647
36.0 4.642171
37.0 4.892171
38.0 5.392171
41.0 5.392171
In [376]:
naf_special_fx.plot_cumulative_hazard()
Out[376]:
<matplotlib.axes._subplots.AxesSubplot at 0x1222bc0b8>
In [377]:
# Trainee prog col dropped due to convergence issue: low incidence
data_special_fx = special_fx[['No of Yrs', 'Entry','Gender','Rural', 'Has Consistent Role','dead']]

cph_special_fx = CoxPHFitter()
cph_special_fx.fit(data_special_fx, 'No of Yrs',event_col='dead')
cph_special_fx.print_summary()
model lifelines.CoxPHFitter
duration col 'No of Yrs'
event col 'dead'
baseline estimation breslow
number of observations 22
number of events observed 21
partial log-likelihood -43.27
time fit was run 2021-07-01 14:33:25 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% z p -log2(p)
Entry 0.03 1.03 0.04 -0.04 0.11 0.96 1.11 0.85 0.39 1.34
Gender 0.58 1.79 0.56 -0.53 1.69 0.59 5.40 1.03 0.30 1.72
Rural 0.79 2.20 1.14 -1.46 3.03 0.23 20.68 0.69 0.49 1.02
Has Consistent Role 1.11 3.02 0.69 -0.24 2.46 0.78 11.66 1.61 0.11 3.21

Concordance 0.77
Partial AIC 94.54
log-likelihood ratio test 8.20 on 4 df
-log2(p) of ll-ratio test 3.57
In [378]:
# Test
fig1 = plt.figure()

ax1 = plt.subplot(211)
ax2 = plt.subplot(211)
ax3 = plt.subplot(211)
ax4 = plt.subplot(211)
ax5 = plt.subplot(211)
ax6 = plt.subplot(211)
ax7 = plt.subplot(211)
ax8 = plt.subplot(211)
ax9 = plt.subplot(211)
ax10 = plt.subplot(211)
ax11 = plt.subplot(211)
ax12 = plt.subplot(211)
ax13 = plt.subplot(211)
ax14 = plt.subplot(211)
ax15 = plt.subplot(211)

kmf_sound.survival_function_.plot(ax=ax7, label="sound")
kmf_camera.survival_function_.plot(ax=ax4, label="camera")
kmf_producer.survival_function_.plot(ax=ax5, label="producer")
kmf_art.survival_function_.plot(ax=ax3, label="art")
kmf_direction.survival_function_.plot(ax=ax2, label="direction")
kmf_p_production.survival_function_.plot(ax=ax6, label="post production")
kmf_hair_make_up.survival_function_.plot(ax=ax8, label="hair & make-up")
kmf_costume.survival_function_.plot(ax=ax9, label="costume")
kmf_script.survival_function_.plot(ax=ax12, label="script")
kmf_support.survival_function_.plot(ax=ax11, label="support")
kmf_casting.survival_function_.plot(ax=ax13, label="casting")
kmf_production.survival_function_.plot(ax=ax1, label="production")
kmf_special_fx.survival_function_.plot(ax=ax15, label="special fx")
kmf_construction.survival_function_.plot(ax=ax14, label="construction")
kmf_music.survival_function_.plot(ax=ax10, label="music")

fig1.set_size_inches(12, 10)
plt.ylabel("Probability of Production freelancer still in FB")
######
plt.tight_layout()
plt.savefig("Stats/Outputs/dept_kmf_estimates_3.png", facecolor='#ffffff')
plt.show
Out[378]:
<function matplotlib.pyplot.show(*args, **kw)>
In [379]:
# Version 2
# Test
fig1 = plt.figure()

ax1 = plt.subplot(211)
ax2 = plt.subplot(211)
ax3 = plt.subplot(211)
ax4 = plt.subplot(211)
ax5 = plt.subplot(211)
ax6 = plt.subplot(211)
ax7 = plt.subplot(211)
ax8 = plt.subplot(211)
ax9 = plt.subplot(212)
ax10 = plt.subplot(212)
ax11 = plt.subplot(212)
ax12 = plt.subplot(212)
ax13 = plt.subplot(212)
ax14 = plt.subplot(212)
ax15 = plt.subplot(212)

kmf_production.survival_function_.plot(ax=ax1, label="production")
kmf_direction.survival_function_.plot(ax=ax2, label="direction")
kmf_art.survival_function_.plot(ax=ax3, label="art")
kmf_camera.survival_function_.plot(ax=ax4, label="camera")
kmf_producer.survival_function_.plot(ax=ax5, label="producer")
kmf_p_production.survival_function_.plot(ax=ax6, label="post production")
kmf_sound.survival_function_.plot(ax=ax7, label="sound")
kmf_hair_make_up.survival_function_.plot(ax=ax8, label="hair & make-up")
kmf_costume.survival_function_.plot(ax=ax9, label="costume")
kmf_music.survival_function_.plot(ax=ax10, label="music")
kmf_support.survival_function_.plot(ax=ax11, label="support")
kmf_script.survival_function_.plot(ax=ax12, label="script")
kmf_casting.survival_function_.plot(ax=ax13, label="casting")
kmf_construction.survival_function_.plot(ax=ax14, label="construction")
kmf_special_fx.survival_function_.plot(ax=ax15, label="special fx")

fig1.set_size_inches(12, 8)
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[379]:
<function matplotlib.pyplot.show(*args, **kw)>
In [380]:
fig1 = plt.figure()

ax1 = plt.subplot(211)
ax2 = plt.subplot(211)
ax3 = plt.subplot(211)
ax4 = plt.subplot(211)
ax5 = plt.subplot(211)
ax6 = plt.subplot(211)
ax7 = plt.subplot(211)
ax8 = plt.subplot(211)

kmf_production.survival_function_.plot(ax=ax1, label="production")
kmf_direction.survival_function_.plot(ax=ax2, label="direction")
kmf_art.survival_function_.plot(ax=ax3, label="art")
kmf_camera.survival_function_.plot(ax=ax4, label="camera")
kmf_producer.survival_function_.plot(ax=ax5, label="producer")
kmf_p_production.survival_function_.plot(ax=ax6, label="post production")
kmf_sound.survival_function_.plot(ax=ax7, label="sound")
kmf_hair_make_up.survival_function_.plot(ax=ax8, label="hair & make-up")

fig1.set_size_inches(12, 8)
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[380]:
<function matplotlib.pyplot.show(*args, **kw)>
In [381]:
fig1 = plt.figure()

ax9 = plt.subplot(211)
ax10 = plt.subplot(211)
ax11 = plt.subplot(211)
ax12 = plt.subplot(211)
ax13 = plt.subplot(211)
ax14 = plt.subplot(211)
ax15 = plt.subplot(211)

kmf_costume.survival_function_.plot(ax=ax9, label="costume")
kmf_music.survival_function_.plot(ax=ax10, label="music")
kmf_support.survival_function_.plot(ax=ax11, label="support")
kmf_script.survival_function_.plot(ax=ax12, label="script")
kmf_casting.survival_function_.plot(ax=ax13, label="casting")
kmf_construction.survival_function_.plot(ax=ax14, label="construction")
kmf_special_fx.survival_function_.plot(ax=ax15, label="special fx")

fig1.set_size_inches(12, 8)
plt.ylabel("Probability of Production freelancer still in FB")
plt.show
Out[381]:
<function matplotlib.pyplot.show(*args, **kw)>
In [382]:
fig = plt.figure()
ax1 = plt.subplot(211)
ax2 = plt.subplot(212)

kmf_production.plot(ax=ax1, label="production")
kmf_direction.plot(ax=ax2, label="direction")
Out[382]:
<matplotlib.axes._subplots.AxesSubplot at 0x11deb00b8>
In [383]:
fig = plt.figure()
ax1 = plt.subplot(331)
ax2 = plt.subplot(332)
ax3 = plt.subplot(333)
ax4 = plt.subplot(334)
ax5 = plt.subplot(335)
ax6 = plt.subplot(336)
ax7 = plt.subplot(337)
ax8 = plt.subplot(338)
ax9 = plt.subplot(339)

kmf_production.plot(ax=ax1, label="production")
kmf_direction.plot(ax=ax2, label="direction")
kmf_art.plot(ax=ax3, label="art")
kmf_camera.plot(ax=ax4, label="camera")
kmf_p_production.plot(ax=ax5, label="Post Production")
kmf_producer.plot(ax=ax6, label="Producer")
kmf_sound.plot(ax=ax7, label="Sound")
kmf_hair_make_up.plot(ax=ax8, label="Hair & Make-Up")
kmf_costume.plot(ax=ax9, label="costume")

plt.tight_layout()
plt.savefig("Stats/Outputs/dept_kmf_estimates_1.png", facecolor='#ffffff')
In [384]:
fig = plt.figure()
ax1 = plt.subplot(331)
ax2 = plt.subplot(332)
ax3 = plt.subplot(333)
ax4 = plt.subplot(334)
ax5 = plt.subplot(335)
ax6 = plt.subplot(336)

kmf_music.plot(ax=ax1, label="music")
kmf_support.plot(ax=ax2, label="support")
kmf_script.plot(ax=ax3, label="script")
kmf_casting.plot(ax=ax4, label="casting")
kmf_construction.plot(ax=ax5, label="construction")
kmf_special_fx.plot(ax=ax6, label="special fx")

plt.tight_layout()
plt.savefig("Stats/Outputs/dept_kmf_estimates_2.png", facecolor='#ffffff')