import pandas as pd
import numpy as np
from tabulate import tabulate
df_files = pd.read_csv('output/all_grandchild_files_wlm_with_actors_2010-2021.csv')
df_users = pd.read_csv('output/df_user_withmonths.csv')
print(df_files.columns)
print(df_users.columns)
Index(['Unnamed: 0.1', 'Unnamed: 0', 'maincat_title', 'img_name', 'file_id', 'img_timestamp', 'uploader', 'uploader_old', 'year', 'subcat_country', 'subcat_title'], dtype='object') Index(['Unnamed: 0.1', 'Unnamed: 0', 'uploader', 'first_year', 'total_contribs', 'first_img_wl', 'subcat_country', 'first_edit', 'first_edit_wiki', 'new_user', 'jan_all', 'jan_com', 'feb_all', 'feb_com', 'mar_all', 'mar_com', 'apr_all', 'apr_com', 'may_all', 'may_com', 'jun_all', 'jun_com', 'jul_all', 'jul_com', 'aug_all', 'aug_com', 'sep_all', 'sep_com', 'oct_all', 'oct_com', 'nov_all', 'nov_com', 'dec_all', 'dec_com', 'year_all', 'year_com'], dtype='object')
print("number of unique contributors per year:")
df_files.groupby('year').agg(
unique_uploaders=('uploader', 'nunique'),
unique_uploads=('img_name', 'nunique')
)
number of unique contributors per year:
unique_uploaders | unique_uploads | |
---|---|---|
year | ||
2010 | 239 | 12702 |
2011 | 5609 | 170959 |
2012 | 14480 | 353862 |
2013 | 11802 | 361050 |
2014 | 8125 | 299696 |
2015 | 5871 | 217538 |
2016 | 9874 | 255502 |
2017 | 8680 | 223346 |
2018 | 13617 | 259657 |
2019 | 7065 | 211861 |
2020 | 7488 | 230481 |
2021 | 4925 | 174301 |
print('of all participants, how many joined through the competition')
df_users.new_user.value_counts()
of all participants, how many joined through the competition
True 70229 False 16282 Name: new_user, dtype: int64
print("new users per year")
df_users.groupby('first_year').agg(
new_users=('new_user', 'sum')
)
new users per year
new_users | |
---|---|
first_year | |
2010 | 136 |
2011 | 3825 |
2012 | 10929 |
2013 | 8652 |
2014 | 5834 |
2015 | 3900 |
2016 | 7364 |
2017 | 6236 |
2018 | 10599 |
2019 | 4564 |
2020 | 5030 |
2021 | 3160 |
print("new users per year")
df_users.groupby('subcat_country').agg(
new_users=('new_user', 'sum')
).sort_values('new_users', ascending=False)[:25]
new users per year
new_users | |
---|---|
subcat_country | |
India | 9155 |
the_United_States | 8064 |
Russia | 6075 |
Germany | 3748 |
Iran | 3253 |
France | 2932 |
Ukraine | 2554 |
Poland | 2450 |
Spain | 2362 |
Italy | 2254 |
the_United_Kingdom | 1906 |
Mexico | 1874 |
Pakistan | 1694 |
Israel | 1513 |
Brazil | 1392 |
Romania | 822 |
Ireland | 812 |
Canada | 776 |
Egypt | 734 |
Bangladesh | 719 |
the_Netherlands | 674 |
Greece | 623 |
Sweden | 564 |
Switzerland | 522 |
Taiwan | 518 |
df_users.head()
Unnamed: 0.1 | Unnamed: 0 | uploader | first_year | total_contribs | first_img_wl | subcat_country | first_edit | first_edit_wiki | new_user | ... | sep_all | sep_com | oct_all | oct_com | nov_all | nov_com | dec_all | dec_com | year_all | year_com | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | "Vin.car27" | 2019 | 2 | 2019-09-12 14:45:04+00:00 | Italy | 2019-09-12 14:45:04+00:00 | commonswiki | True | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 |
1 | 1 | 1 | "prottash jhd" | 2019 | 1 | 2019-09-26 02:12:06+00:00 | Bangladesh | 2019-09-26 02:12:06+00:00 | commonswiki | True | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 |
2 | 2 | 2 | "vilma.mariani" | 2019 | 17 | 2019-09-23 12:00:50+00:00 | Italy | 2019-09-23 12:00:50+00:00 | commonswiki | True | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 |
3 | 3 | 3 | $hubhanshu | 2017 | 6 | 2017-09-08 09:13:49+00:00 | India | 2017-09-08 09:13:49+00:00 | commonswiki | True | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 |
4 | 4 | 4 | $ru$hti | 2016 | 1 | 2016-09-03 06:48:29+00:00 | India | 2016-09-03 06:48:29+00:00 | commonswiki | True | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 |
5 rows × 36 columns
Todo: filter for bots
Observations:
print('uploaders by number of uploads')
df_users['total_contribs'].sort_index().describe(percentiles=[0,0.25,0.5,0.75,0.9,0.99])
uploaders by number of uploads
count 86511.000000 mean 32.029384 std 455.404829 min 1.000000 0% 1.000000 25% 1.000000 50% 3.000000 75% 9.000000 90% 29.000000 99% 443.000000 max 90362.000000 Name: total_contribs, dtype: float64
print('uploaders by number of years participated')
df_files.groupby('uploader')['year'].nunique().value_counts().sort_index()
uploaders by number of years participated
1 80878 2 3323 3 1035 4 513 5 258 6 176 7 118 8 83 9 52 10 37 11 36 12 2 Name: year, dtype: int64
df_files.groupby('uploader')['year'].nunique().describe(percentiles=[0,0.75,0.9,0.95,0.99])
count 86511.000000 mean 1.130203 std 0.659471 min 1.000000 0% 1.000000 50% 1.000000 75% 1.000000 90% 1.000000 95% 2.000000 99% 4.000000 max 12.000000 Name: year, dtype: float64
df_files.groupby('uploader')['subcat_country'].nunique().value_counts().sort_index()
0 3 1 84141 2 1633 3 349 4 132 5 70 6 54 7 30 8 23 9 14 10 17 11 10 12 4 13 3 14 3 15 7 17 2 20 1 21 1 22 3 23 1 29 1 30 2 33 2 36 2 52 1 61 1 74 1 Name: subcat_country, dtype: int64
df_files.groupby('uploader')['subcat_country'].nunique().describe(percentiles=[0,0.75,0.9,0.95,0.99])
count 86511.000000 mean 1.054987 std 0.647568 min 0.000000 0% 0.000000 50% 1.000000 75% 1.000000 90% 1.000000 95% 1.000000 99% 2.000000 max 74.000000 Name: subcat_country, dtype: float64
df_files.groupby('uploader')['subcat_country'].nunique()[df_files.groupby('uploader')['subcat_country'].nunique() > 30]
uploader Embedded Data Bot 61 Jdx 36 R8cocin8 36 Rotatebot 52 Sebastian Wallroth 33 Slick-o-bot 33 SteinsplitterBot 74 Name: subcat_country, dtype: int64
Observations:
print('retention by month in the year after the competition, on all wikis (_all) or on Commons (_com)')
(df_users.loc[df_users['new_user'],'jan_all':]>0).mean()
retention by month in the year after the competition, on all wikis (_all) or on Commons (_com)
jan_all 0.009341 jan_com 0.005311 feb_all 0.008373 feb_com 0.004713 mar_all 0.008515 mar_com 0.004727 apr_all 0.007632 apr_com 0.004371 may_all 0.010765 may_com 0.007476 jun_all 0.007945 jun_com 0.004927 jul_all 0.006650 jul_com 0.003973 aug_all 0.006522 aug_com 0.003773 sep_all 0.024833 sep_com 0.022398 oct_all 0.009725 oct_com 0.006977 nov_all 0.006821 nov_com 0.004144 dec_all 0.005610 dec_com 0.003047 year_all 0.059035 year_com 0.043885 dtype: float64
df_users[df_users['new_user']].fillna(0).groupby('first_year').agg(
new_users_n = ('new_user', 'count'),
jan_all = ('jan_all', lambda val: (val > 0).mean()),
jan_com = ('jan_com', lambda val: (val > 0).mean()),
year_all = ('year_all', lambda val: (val > 0).mean()),
year_all_n = ('year_all', 'sum'),
year_com = ('year_com', lambda val: (val > 0).mean()),
year_com_n = ('year_com', 'sum')
)
new_users_n | jan_all | jan_com | year_all | year_all_n | year_com | year_com_n | |
---|---|---|---|---|---|---|---|
first_year | |||||||
2010 | 136 | 0.051471 | 0.051471 | 0.125000 | 1461.0 | 0.125000 | 1078.0 |
2011 | 3825 | 0.017255 | 0.011503 | 0.085490 | 184886.0 | 0.067451 | 168623.0 |
2012 | 10929 | 0.013359 | 0.006130 | 0.071004 | 80612.0 | 0.048952 | 61878.0 |
2013 | 8652 | 0.008206 | 0.004854 | 0.051896 | 40169.0 | 0.038488 | 24348.0 |
2014 | 5834 | 0.005999 | 0.002914 | 0.059650 | 31698.0 | 0.046109 | 23915.0 |
2015 | 3900 | 0.010000 | 0.005641 | 0.060256 | 13582.0 | 0.045385 | 10583.0 |
2016 | 7364 | 0.008963 | 0.005160 | 0.053775 | 20653.0 | 0.038566 | 18266.0 |
2017 | 6236 | 0.008820 | 0.005131 | 0.055164 | 16394.0 | 0.038967 | 10870.0 |
2018 | 10599 | 0.004623 | 0.001604 | 0.047457 | 21554.0 | 0.034154 | 18768.0 |
2019 | 4564 | 0.010955 | 0.007888 | 0.074058 | 38562.0 | 0.061569 | 33953.0 |
2020 | 5030 | 0.009940 | 0.006561 | 0.060835 | 11238.0 | 0.049105 | 8565.0 |
2021 | 3160 | 0.006962 | 0.005696 | 0.033861 | 11878.0 | 0.024051 | 9614.0 |
print('of the 50 countries with the most new users, which have the higest retention in january')
df_users[df_users['new_user']].fillna(0).groupby('subcat_country').agg(
new_users_n = ('new_user', 'count'),
jan_all = ('jan_all', lambda val: (val > 0).mean()),
jan_com = ('jan_com', lambda val: (val > 0).mean()),
year_all = ('year_all', lambda val: (val > 0).mean()),
year_all_n = ('year_all', 'sum'),
year_com = ('year_com', lambda val: (val > 0).mean()),
year_com_n = ('year_com', 'sum')
).sort_values('new_users_n', ascending=False)[:50].sort_values('jan_all', ascending=False)
of the 50 countries with the most new users, which have the higest retention in january
new_users_n | jan_all | jan_com | year_all | year_all_n | year_com | year_com_n | |
---|---|---|---|---|---|---|---|
subcat_country | |||||||
Austria | 508 | 0.039370 | 0.027559 | 0.135827 | 13260.0 | 0.114173 | 10351.0 |
the_Netherlands | 674 | 0.028190 | 0.020772 | 0.087537 | 3264.0 | 0.068249 | 2770.0 |
Norway | 243 | 0.024691 | 0.016461 | 0.090535 | 383.0 | 0.074074 | 315.0 |
the_Czech_Republic | 470 | 0.021277 | 0.012766 | 0.106383 | 10092.0 | 0.087234 | 7692.0 |
Belgium | 505 | 0.019802 | 0.011881 | 0.112871 | 3952.0 | 0.081188 | 3615.0 |
Slovakia | 177 | 0.016949 | 0.005650 | 0.186441 | 535.0 | 0.163842 | 414.0 |
Bangladesh | 719 | 0.016690 | 0.008345 | 0.077886 | 1554.0 | 0.051460 | 1143.0 |
Germany | 3748 | 0.016542 | 0.012273 | 0.087513 | 77732.0 | 0.066435 | 64398.0 |
Egypt | 734 | 0.014986 | 0.010899 | 0.089918 | 1660.0 | 0.074932 | 1047.0 |
France | 2932 | 0.014325 | 0.007844 | 0.065484 | 146833.0 | 0.048772 | 132304.0 |
Nepal | 466 | 0.012876 | 0.010730 | 0.094421 | 8166.0 | 0.075107 | 6182.0 |
Chile | 395 | 0.012658 | 0.005063 | 0.050633 | 621.0 | 0.035443 | 527.0 |
China | 252 | 0.011905 | 0.000000 | 0.039683 | 193.0 | 0.015873 | 7.0 |
Canada | 776 | 0.011598 | 0.006443 | 0.047680 | 2190.0 | 0.029639 | 1592.0 |
Switzerland | 522 | 0.011494 | 0.005747 | 0.090038 | 3397.0 | 0.070881 | 1598.0 |
Ukraine | 2554 | 0.011355 | 0.007439 | 0.093579 | 25358.0 | 0.075959 | 21196.0 |
Romania | 822 | 0.010949 | 0.006083 | 0.057178 | 2590.0 | 0.049878 | 916.0 |
Finland | 183 | 0.010929 | 0.005464 | 0.131148 | 545.0 | 0.087432 | 520.0 |
Sweden | 564 | 0.010638 | 0.007092 | 0.097518 | 1700.0 | 0.085106 | 982.0 |
the_United_Kingdom | 1906 | 0.009969 | 0.007345 | 0.052991 | 19172.0 | 0.038825 | 13696.0 |
Spain | 2362 | 0.009738 | 0.003387 | 0.054191 | 16667.0 | 0.036833 | 13145.0 |
Algeria | 415 | 0.009639 | 0.009639 | 0.050602 | 174.0 | 0.045783 | 143.0 |
Russia | 6075 | 0.009053 | 0.004774 | 0.053992 | 21538.0 | 0.039671 | 18017.0 |
the_United_States | 8064 | 0.008929 | 0.004092 | 0.047867 | 14864.0 | 0.032986 | 13250.0 |
Iran | 3253 | 0.008607 | 0.002767 | 0.047034 | 2538.0 | 0.023363 | 1526.0 |
Poland | 2450 | 0.008571 | 0.006122 | 0.068163 | 34588.0 | 0.054694 | 30618.0 |
Argentina | 482 | 0.008299 | 0.006224 | 0.053942 | 372.0 | 0.035270 | 241.0 |
Italy | 2254 | 0.007986 | 0.004880 | 0.085182 | 10345.0 | 0.078083 | 8710.0 |
South_Africa | 256 | 0.007812 | 0.007812 | 0.054688 | 277.0 | 0.046875 | 267.0 |
Greece | 623 | 0.006421 | 0.003210 | 0.054575 | 722.0 | 0.043339 | 675.0 |
India | 9155 | 0.006335 | 0.003386 | 0.036046 | 13601.0 | 0.021081 | 12049.0 |
Portugal | 360 | 0.005556 | 0.002778 | 0.025000 | 247.0 | 0.016667 | 222.0 |
Mexico | 1874 | 0.005336 | 0.002134 | 0.033084 | 691.0 | 0.024546 | 424.0 |
Brazil | 1392 | 0.005029 | 0.002874 | 0.040230 | 1237.0 | 0.033046 | 1157.0 |
Malta | 227 | 0.004405 | 0.004405 | 0.096916 | 292.0 | 0.092511 | 282.0 |
Panama | 250 | 0.004000 | 0.004000 | 0.020000 | 18.0 | 0.020000 | 15.0 |
Israel | 1513 | 0.003966 | 0.000661 | 0.051553 | 1194.0 | 0.036352 | 886.0 |
Thailand | 511 | 0.003914 | 0.001957 | 0.062622 | 2431.0 | 0.056751 | 2210.0 |
Taiwan | 518 | 0.003861 | 0.000000 | 0.077220 | 653.0 | 0.063707 | 591.0 |
Serbia | 260 | 0.003846 | 0.003846 | 0.065385 | 410.0 | 0.053846 | 278.0 |
Australia | 272 | 0.003676 | 0.000000 | 0.066176 | 223.0 | 0.029412 | 166.0 |
Morocco | 353 | 0.002833 | 0.002833 | 0.045326 | 247.0 | 0.039660 | 244.0 |
the_Philippines | 389 | 0.002571 | 0.002571 | 0.066838 | 791.0 | 0.056555 | 544.0 |
Ireland | 812 | 0.002463 | 0.002463 | 0.060345 | 1125.0 | 0.052956 | 1083.0 |
Peru | 421 | 0.002375 | 0.002375 | 0.045131 | 482.0 | 0.040380 | 424.0 |
Bolivia | 423 | 0.002364 | 0.002364 | 0.026005 | 300.0 | 0.014184 | 251.0 |
Pakistan | 1694 | 0.001771 | 0.001181 | 0.049587 | 839.0 | 0.040732 | 740.0 |
Uruguay | 460 | 0.000000 | 0.000000 | 0.010870 | 226.0 | 0.010870 | 167.0 |
Colombia | 444 | 0.000000 | 0.000000 | 0.036036 | 424.0 | 0.024775 | 152.0 |
Iraq | 300 | 0.000000 | 0.000000 | 0.016667 | 17.0 | 0.010000 | 15.0 |