In [3]:
import pandas as pd
import numpy as np
from tabulate import tabulate
In [5]:
df_files = pd.read_csv('output/all_grandchild_files_wlm_with_actors_2010-2021.csv')
df_users = pd.read_csv('output/df_user_withmonths.csv')
In [6]:
print(df_files.columns)
print(df_users.columns)
Index(['Unnamed: 0.1', 'Unnamed: 0', 'maincat_title', 'img_name', 'file_id',
       'img_timestamp', 'uploader', 'uploader_old', 'year', 'subcat_country',
       'subcat_title'],
      dtype='object')
Index(['Unnamed: 0.1', 'Unnamed: 0', 'uploader', 'first_year',
       'total_contribs', 'first_img_wl', 'subcat_country', 'first_edit',
       'first_edit_wiki', 'new_user', 'jan_all', 'jan_com', 'feb_all',
       'feb_com', 'mar_all', 'mar_com', 'apr_all', 'apr_com', 'may_all',
       'may_com', 'jun_all', 'jun_com', 'jul_all', 'jul_com', 'aug_all',
       'aug_com', 'sep_all', 'sep_com', 'oct_all', 'oct_com', 'nov_all',
       'nov_com', 'dec_all', 'dec_com', 'year_all', 'year_com'],
      dtype='object')

Basic Stats¶

In [25]:
print("number of unique contributors per year:")
df_files.groupby('year').agg(
    unique_uploaders=('uploader', 'nunique'),
    unique_uploads=('img_name', 'nunique')
)
number of unique contributors per year:
Out[25]:
unique_uploaders unique_uploads
year
2010 239 12702
2011 5609 170959
2012 14480 353862
2013 11802 361050
2014 8125 299696
2015 5871 217538
2016 9874 255502
2017 8680 223346
2018 13617 259657
2019 7065 211861
2020 7488 230481
2021 4925 174301
In [29]:
print('of all participants, how many joined through the competition')
df_users.new_user.value_counts()
of all participants, how many joined through the competition
Out[29]:
True     70229
False    16282
Name: new_user, dtype: int64
In [27]:
print("new users per year")
df_users.groupby('first_year').agg(
    new_users=('new_user', 'sum')
)
new users per year
Out[27]:
new_users
first_year
2010 136
2011 3825
2012 10929
2013 8652
2014 5834
2015 3900
2016 7364
2017 6236
2018 10599
2019 4564
2020 5030
2021 3160
In [22]:
print("new users per year")
df_users.groupby('subcat_country').agg(
    new_users=('new_user', 'sum')
).sort_values('new_users', ascending=False)[:25]
new users per year
Out[22]:
new_users
subcat_country
India 9155
the_United_States 8064
Russia 6075
Germany 3748
Iran 3253
France 2932
Ukraine 2554
Poland 2450
Spain 2362
Italy 2254
the_United_Kingdom 1906
Mexico 1874
Pakistan 1694
Israel 1513
Brazil 1392
Romania 822
Ireland 812
Canada 776
Egypt 734
Bangladesh 719
the_Netherlands 674
Greece 623
Sweden 564
Switzerland 522
Taiwan 518

User analysis¶

In [24]:
df_users.head()
Out[24]:
Unnamed: 0.1 Unnamed: 0 uploader first_year total_contribs first_img_wl subcat_country first_edit first_edit_wiki new_user ... sep_all sep_com oct_all oct_com nov_all nov_com dec_all dec_com year_all year_com
0 0 0 "Vin.car27" 2019 2 2019-09-12 14:45:04+00:00 Italy 2019-09-12 14:45:04+00:00 commonswiki True ... NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0
1 1 1 "prottash jhd" 2019 1 2019-09-26 02:12:06+00:00 Bangladesh 2019-09-26 02:12:06+00:00 commonswiki True ... NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0
2 2 2 "vilma.mariani" 2019 17 2019-09-23 12:00:50+00:00 Italy 2019-09-23 12:00:50+00:00 commonswiki True ... NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0
3 3 3 $hubhanshu 2017 6 2017-09-08 09:13:49+00:00 India 2017-09-08 09:13:49+00:00 commonswiki True ... NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0
4 4 4 $ru$hti 2016 1 2016-09-03 06:48:29+00:00 India 2016-09-03 06:48:29+00:00 commonswiki True ... NaN NaN NaN NaN NaN NaN NaN NaN 0.0 0.0

5 rows × 36 columns

Exploration¶

upload stats per participant¶

Todo: filter for bots

Observations:

  • 50+ % of participants uploads 3 images or more
  • 5+ % participated in 2 years or more
  • 2 contributors have participated in all 12 years! (NB: may well include bots)
In [42]:
print('uploaders by number of uploads')
df_users['total_contribs'].sort_index().describe(percentiles=[0,0.25,0.5,0.75,0.9,0.99])
uploaders by number of uploads
Out[42]:
count    86511.000000
mean        32.029384
std        455.404829
min          1.000000
0%           1.000000
25%          1.000000
50%          3.000000
75%          9.000000
90%         29.000000
99%        443.000000
max      90362.000000
Name: total_contribs, dtype: float64
In [34]:
print('uploaders by number of years participated')
df_files.groupby('uploader')['year'].nunique().value_counts().sort_index()
uploaders by number of years participated
Out[34]:
1     80878
2      3323
3      1035
4       513
5       258
6       176
7       118
8        83
9        52
10       37
11       36
12        2
Name: year, dtype: int64
In [43]:
df_files.groupby('uploader')['year'].nunique().describe(percentiles=[0,0.75,0.9,0.95,0.99])
Out[43]:
count    86511.000000
mean         1.130203
std          0.659471
min          1.000000
0%           1.000000
50%          1.000000
75%          1.000000
90%          1.000000
95%          2.000000
99%          4.000000
max         12.000000
Name: year, dtype: float64
In [44]:
df_files.groupby('uploader')['subcat_country'].nunique().value_counts().sort_index()
Out[44]:
0         3
1     84141
2      1633
3       349
4       132
5        70
6        54
7        30
8        23
9        14
10       17
11       10
12        4
13        3
14        3
15        7
17        2
20        1
21        1
22        3
23        1
29        1
30        2
33        2
36        2
52        1
61        1
74        1
Name: subcat_country, dtype: int64
In [45]:
df_files.groupby('uploader')['subcat_country'].nunique().describe(percentiles=[0,0.75,0.9,0.95,0.99])
Out[45]:
count    86511.000000
mean         1.054987
std          0.647568
min          0.000000
0%           0.000000
50%          1.000000
75%          1.000000
90%          1.000000
95%          1.000000
99%          2.000000
max         74.000000
Name: subcat_country, dtype: float64
In [47]:
df_files.groupby('uploader')['subcat_country'].nunique()[df_files.groupby('uploader')['subcat_country'].nunique() > 30]
Out[47]:
uploader
Embedded Data Bot     61
Jdx                   36
R8cocin8              36
Rotatebot             52
Sebastian Wallroth    33
Slick-o-bot           33
SteinsplitterBot      74
Name: subcat_country, dtype: int64
In [ ]:
 

Retention¶

Observations:

  • a little less than 1% across all data for January retention
  • drop of retention over the year
  • small bump in May (WLE?)
  • big bump in September (WLM?)
  • retention over the whole year: ~6% across wiki's, 4.4% in Commons alone.
  • Both in absolute numbers and in contributors, most retention happened on Commons.
  • The years with higher counts of new users had lower retention.
  • Country with higest retention is Austria, more than 30% higher than the next on the list (the Netherlands).
In [31]:
print('retention by month in the year after the competition, on all wikis (_all) or on Commons (_com)')
(df_users.loc[df_users['new_user'],'jan_all':]>0).mean()
retention by month in the year after the competition, on all wikis (_all) or on Commons (_com)
Out[31]:
jan_all     0.009341
jan_com     0.005311
feb_all     0.008373
feb_com     0.004713
mar_all     0.008515
mar_com     0.004727
apr_all     0.007632
apr_com     0.004371
may_all     0.010765
may_com     0.007476
jun_all     0.007945
jun_com     0.004927
jul_all     0.006650
jul_com     0.003973
aug_all     0.006522
aug_com     0.003773
sep_all     0.024833
sep_com     0.022398
oct_all     0.009725
oct_com     0.006977
nov_all     0.006821
nov_com     0.004144
dec_all     0.005610
dec_com     0.003047
year_all    0.059035
year_com    0.043885
dtype: float64
In [50]:
df_users[df_users['new_user']].fillna(0).groupby('first_year').agg(
    new_users_n = ('new_user', 'count'),
    jan_all = ('jan_all', lambda val: (val > 0).mean()),
    jan_com = ('jan_com', lambda val: (val > 0).mean()),
    year_all = ('year_all', lambda val: (val > 0).mean()),
    year_all_n = ('year_all', 'sum'),
    year_com = ('year_com', lambda val: (val > 0).mean()),
    year_com_n = ('year_com', 'sum')
)
Out[50]:
new_users_n jan_all jan_com year_all year_all_n year_com year_com_n
first_year
2010 136 0.051471 0.051471 0.125000 1461.0 0.125000 1078.0
2011 3825 0.017255 0.011503 0.085490 184886.0 0.067451 168623.0
2012 10929 0.013359 0.006130 0.071004 80612.0 0.048952 61878.0
2013 8652 0.008206 0.004854 0.051896 40169.0 0.038488 24348.0
2014 5834 0.005999 0.002914 0.059650 31698.0 0.046109 23915.0
2015 3900 0.010000 0.005641 0.060256 13582.0 0.045385 10583.0
2016 7364 0.008963 0.005160 0.053775 20653.0 0.038566 18266.0
2017 6236 0.008820 0.005131 0.055164 16394.0 0.038967 10870.0
2018 10599 0.004623 0.001604 0.047457 21554.0 0.034154 18768.0
2019 4564 0.010955 0.007888 0.074058 38562.0 0.061569 33953.0
2020 5030 0.009940 0.006561 0.060835 11238.0 0.049105 8565.0
2021 3160 0.006962 0.005696 0.033861 11878.0 0.024051 9614.0
In [55]:
print('of the 50 countries with the most new users, which have the higest retention in january')
df_users[df_users['new_user']].fillna(0).groupby('subcat_country').agg(
    new_users_n = ('new_user', 'count'),
    jan_all = ('jan_all', lambda val: (val > 0).mean()),
    jan_com = ('jan_com', lambda val: (val > 0).mean()),
    year_all = ('year_all', lambda val: (val > 0).mean()),
    year_all_n = ('year_all', 'sum'),
    year_com = ('year_com', lambda val: (val > 0).mean()),
    year_com_n = ('year_com', 'sum')
).sort_values('new_users_n', ascending=False)[:50].sort_values('jan_all', ascending=False)
of the 50 countries with the most new users, which have the higest retention in january
Out[55]:
new_users_n jan_all jan_com year_all year_all_n year_com year_com_n
subcat_country
Austria 508 0.039370 0.027559 0.135827 13260.0 0.114173 10351.0
the_Netherlands 674 0.028190 0.020772 0.087537 3264.0 0.068249 2770.0
Norway 243 0.024691 0.016461 0.090535 383.0 0.074074 315.0
the_Czech_Republic 470 0.021277 0.012766 0.106383 10092.0 0.087234 7692.0
Belgium 505 0.019802 0.011881 0.112871 3952.0 0.081188 3615.0
Slovakia 177 0.016949 0.005650 0.186441 535.0 0.163842 414.0
Bangladesh 719 0.016690 0.008345 0.077886 1554.0 0.051460 1143.0
Germany 3748 0.016542 0.012273 0.087513 77732.0 0.066435 64398.0
Egypt 734 0.014986 0.010899 0.089918 1660.0 0.074932 1047.0
France 2932 0.014325 0.007844 0.065484 146833.0 0.048772 132304.0
Nepal 466 0.012876 0.010730 0.094421 8166.0 0.075107 6182.0
Chile 395 0.012658 0.005063 0.050633 621.0 0.035443 527.0
China 252 0.011905 0.000000 0.039683 193.0 0.015873 7.0
Canada 776 0.011598 0.006443 0.047680 2190.0 0.029639 1592.0
Switzerland 522 0.011494 0.005747 0.090038 3397.0 0.070881 1598.0
Ukraine 2554 0.011355 0.007439 0.093579 25358.0 0.075959 21196.0
Romania 822 0.010949 0.006083 0.057178 2590.0 0.049878 916.0
Finland 183 0.010929 0.005464 0.131148 545.0 0.087432 520.0
Sweden 564 0.010638 0.007092 0.097518 1700.0 0.085106 982.0
the_United_Kingdom 1906 0.009969 0.007345 0.052991 19172.0 0.038825 13696.0
Spain 2362 0.009738 0.003387 0.054191 16667.0 0.036833 13145.0
Algeria 415 0.009639 0.009639 0.050602 174.0 0.045783 143.0
Russia 6075 0.009053 0.004774 0.053992 21538.0 0.039671 18017.0
the_United_States 8064 0.008929 0.004092 0.047867 14864.0 0.032986 13250.0
Iran 3253 0.008607 0.002767 0.047034 2538.0 0.023363 1526.0
Poland 2450 0.008571 0.006122 0.068163 34588.0 0.054694 30618.0
Argentina 482 0.008299 0.006224 0.053942 372.0 0.035270 241.0
Italy 2254 0.007986 0.004880 0.085182 10345.0 0.078083 8710.0
South_Africa 256 0.007812 0.007812 0.054688 277.0 0.046875 267.0
Greece 623 0.006421 0.003210 0.054575 722.0 0.043339 675.0
India 9155 0.006335 0.003386 0.036046 13601.0 0.021081 12049.0
Portugal 360 0.005556 0.002778 0.025000 247.0 0.016667 222.0
Mexico 1874 0.005336 0.002134 0.033084 691.0 0.024546 424.0
Brazil 1392 0.005029 0.002874 0.040230 1237.0 0.033046 1157.0
Malta 227 0.004405 0.004405 0.096916 292.0 0.092511 282.0
Panama 250 0.004000 0.004000 0.020000 18.0 0.020000 15.0
Israel 1513 0.003966 0.000661 0.051553 1194.0 0.036352 886.0
Thailand 511 0.003914 0.001957 0.062622 2431.0 0.056751 2210.0
Taiwan 518 0.003861 0.000000 0.077220 653.0 0.063707 591.0
Serbia 260 0.003846 0.003846 0.065385 410.0 0.053846 278.0
Australia 272 0.003676 0.000000 0.066176 223.0 0.029412 166.0
Morocco 353 0.002833 0.002833 0.045326 247.0 0.039660 244.0
the_Philippines 389 0.002571 0.002571 0.066838 791.0 0.056555 544.0
Ireland 812 0.002463 0.002463 0.060345 1125.0 0.052956 1083.0
Peru 421 0.002375 0.002375 0.045131 482.0 0.040380 424.0
Bolivia 423 0.002364 0.002364 0.026005 300.0 0.014184 251.0
Pakistan 1694 0.001771 0.001181 0.049587 839.0 0.040732 740.0
Uruguay 460 0.000000 0.000000 0.010870 226.0 0.010870 167.0
Colombia 444 0.000000 0.000000 0.036036 424.0 0.024775 152.0
Iraq 300 0.000000 0.000000 0.016667 17.0 0.010000 15.0
In [ ]: