This notebook provides a tutorial for how to study page protections on Wikipedia, either via the Mediawiki dumps or the API. It has three stages: parsing the page protection data from the Mediawiki dumps, accessing the same data via the API, and running some example analyses of the data.
This is an example of how to parse through Mediawiki dumps and determine what sorts of edit protections are applied to a given Wikipedia article.
# Import libraries
import gzip # necessary for decompressing the dump file for reading
# https://docs.python.org/3/library/gzip.html#gzip.open
import os # to manipulate paths with os.path
import sqlite3 # interface for SQLite databases
import pandas as pd # for data manipulation and analysis
# Every language on Wikipedia has its own page restrictions table
# you can find all the dbnames (e.g., enwiki) here: https://www.mediawiki.org/w/api.php?action=sitematrix
# for example, you could replace the LANGUAGE parameter of 'enwiki' with 'arwiki' to study Arabic Wikipedia
LANGUAGE = 'enwiki'
# e.g., enwiki -> en.wikipedia (this is necessary for the API section)
SITENAME = LANGUAGE.replace('wiki', '.wikipedia')
# directory on PAWS server that holds Wikimedia dumps
DUMP_DIR = "/public/dumps/public/{0}/latest/".format(LANGUAGE)
DUMP_FN = '{0}-latest-page_restrictions.sql.gz'.format(LANGUAGE)
# The dataset isn't huge -- 1.1 MB -- so should be quick to process in full
!ls -shH "{DUMP_DIR}{DUMP_FN}"
1.1M /public/dumps/public/enwiki/latest/enwiki-latest-page_restrictions.sql.gz
# Inspect the first 1000 characters of the page protections dump to see what it looks like
# As you can see from the CREATE TABLE statement, each datapoint has 7 fields (pr_page, pr_type, ... , pr_id)
# A description of the fields in the data can be found here:
# https://www.mediawiki.org/wiki/Manual:Page_restrictions_table
# And the data that we want is on lines that start with INSERT INTO `page_restrictions` VALUES...
# The first datapoint (1086732,'edit','sysop',0,NULL,'infinity',1307) can be interpreted as:
# 1086732: page ID 1086732 (en.wikipedia.org/wiki/?curid=1086732)
# 'edit': has edit protections
# 'sysop': that require sysop permissions (https://en.wikipedia.org/wiki/Wikipedia:User_access_levels#Administrator)
# 0: does not cascade to other pages
# NULL: no user-specific restrictions
# 'infinity': restriction does not expire automatically
# 1307: table primary key -- has no meaning by itself
!zcat "{DUMP_DIR}{DUMP_FN}" | head -46 | cut -c1-1000
-- MySQL dump 10.16  Distrib 10.1.45-MariaDB, for debian-linux-gnu (x86_64)
--
-- Host: 10.64.48.13    Database: enwiki
-- ------------------------------------------------------
-- Server version 10.1.43-MariaDB

/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8mb4 */;
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;

--
-- Table structure for table `page_restrictions`
--

DROP TABLE IF EXISTS `page_restrictions`;
/*!40101 SET @saved_cs_client     = @@character_set_client */;
/*!40101 SET character_set_client = utf8 */;
CREATE TABLE `page_restrictions` (
  `pr_page` int(8) NOT NULL DEFAULT '0',
  `pr_type` varbinary(255) NOT NULL DEFAULT '',
  `pr_level` varbinary(255) NOT NULL DEFAULT '',
  `pr_cascade` tinyint(4) NOT NULL DEFAULT '0',
  `pr_user` int(10) unsigned DEFAULT NULL,
  `pr_expiry` varbinary(14) DEFAULT NULL,
  `pr_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  PRIMARY KEY (`pr_id`),
  UNIQUE KEY `pr_pagetype` (`pr_page`,`pr_type`),
  KEY `pr_typelevel` (`pr_type`,`pr_level`),
  KEY `pr_level` (`pr_level`),
  KEY `pr_cascade` (`pr_cascade`)
) ENGINE=InnoDB AUTO_INCREMENT=869230 DEFAULT CHARSET=binary ROW_FORMAT=COMPRESSED;
/*!40101 SET character_set_client = @saved_cs_client */;

--
-- Dumping data for table `page_restrictions`
--

/*!40000 ALTER TABLE `page_restrictions` DISABLE KEYS */;
INSERT INTO `page_restrictions` VALUES (1086732,'edit','sysop',0,NULL,'infinity',1307),(1086732,'move','sysop',0,NULL,'infinity',1308),(1266562,'edit','autoconfirmed',0,NULL,'infinity',1358),(1266562,'move','autoconfirmed',0,NULL,'infinity',1359),(1534334,'edit','autoconfirmed',0,NULL,NULL,1437),(1534334,'move','autoconfirmed',0,NULL,NULL,1438),(1654125,'edit','autoconfirmed',0,NULL,NULL,1664),(1654125,'move','autoconfirmed',0,NULL,NULL,1665),(1654622,'edit','autoconfirmed',0,NULL,NULL,1672),(1654622,'move','autoconfirmed',0,NULL,NULL,1673),(1654633,'edit','autoconfirmed',0,NULL,NULL,1674),(1654633,'move','autoconfirmed',0,NULL,NULL,1675),(1654645,'edit','autoconfirmed',0,NULL,NULL,1676),(1654645,'move','autoconfirmed',0,NULL,NULL,1677),(1654656,'edit','autoconfirmed',0,NULL,NULL,1682),(1654656,'move','autoconfirmed',0,NULL,NULL,1683),(1654662,'edit','autoconfirmed',0,NULL,NULL,1684),(1654662,'move','autoconfirmed',0,NULL,NULL,1685),(1654673,'edit','autoconfirmed',0,NULL,NULL,1686),(16
gzip: stdout: Broken pipe
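Before loading the dump into a database, here is a minimal sanity-check sketch that simply pairs the first datapoint above with the field names from the CREATE TABLE statement (the tuple is copied from the INSERT statement shown above):
# Label the first datapoint from the dump with the field names listed in the
# CREATE TABLE statement / Manual:Page_restrictions_table
fields = ['pr_page', 'pr_type', 'pr_level', 'pr_cascade', 'pr_user', 'pr_expiry', 'pr_id']
example_row = (1086732, 'edit', 'sysop', 0, None, 'infinity', 1307)  # first datapoint in the dump
print(dict(zip(fields, example_row)))
# -> {'pr_page': 1086732, 'pr_type': 'edit', 'pr_level': 'sysop', 'pr_cascade': 0, 'pr_user': None, 'pr_expiry': 'infinity', 'pr_id': 1307}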
# create a Connection object that represents the database.
conn = sqlite3.connect(':memory:') # special name ":memory:" to create a database in RAM.
# Create a Cursor object and call its execute() method
# to perform SQL commands to create a table
c = conn.cursor()
c.execute('''CREATE TABLE `page_restrictions` (
`pr_page` int(8) NOT NULL DEFAULT '0',
`pr_type` varbinary(255) NOT NULL DEFAULT '',
`pr_level` varbinary(255) NOT NULL DEFAULT '',
`pr_cascade` tinyint(4) NOT NULL DEFAULT '0',
`pr_user` int(10) DEFAULT NULL,
`pr_expiry` varbinary(14) DEFAULT NULL,
`pr_id` int(10) NOT NULL ); ''')
<sqlite3.Cursor at 0x7fe7d7881420>
# Insert Data into the table created above
with gzip.open(os.path.join(DUMP_DIR, DUMP_FN), 'rt') as raw:
for line in raw:
line = line.strip()
if line.lower().startswith('insert'):
c.execute(line)
for row in c.execute('SELECT * FROM page_restrictions LIMIT 5'):
print(row)
(1086732, 'edit', 'sysop', 0, None, 'infinity', 1307)
(1086732, 'move', 'sysop', 0, None, 'infinity', 1308)
(1266562, 'edit', 'autoconfirmed', 0, None, 'infinity', 1358)
(1266562, 'move', 'autoconfirmed', 0, None, 'infinity', 1359)
(1534334, 'edit', 'autoconfirmed', 0, None, None, 1437)
for row in c.execute('SELECT COUNT(*) FROM page_restrictions '):
print(row)
(147325,)
Save the data into a Pandas DataFrame, because the Pandas library provides high-performance, easy-to-use data structures and data analysis tools.
# save table into a DataFrame using pandas
df = pd.read_sql_query('SELECT * FROM page_restrictions',conn)
conn.close() # close the database connection
df.head(4) # show DataFrame first 4 rows
 | pr_page | pr_type | pr_level | pr_cascade | pr_user | pr_expiry | pr_id
---|---|---|---|---|---|---|---
0 | 1086732 | edit | sysop | 0 | None | infinity | 1307 |
1 | 1086732 | move | sysop | 0 | None | infinity | 1308 |
2 | 1266562 | edit | autoconfirmed | 0 | None | infinity | 1358 |
3 | 1266562 | move | autoconfirmed | 0 | None | infinity | 1359 |
As the previous table shows, for each pr_page (page ID) there can be more than one record, one for each type of protection that the page has.
df.info() # print DataFrame summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147325 entries, 0 to 147324
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   pr_page     147325 non-null  int64
 1   pr_type     147325 non-null  object
 2   pr_level    147325 non-null  object
 3   pr_cascade  147325 non-null  int64
 4   pr_user     0 non-null       object
 5   pr_expiry   147154 non-null  object
 6   pr_id       147325 non-null  int64
dtypes: int64(3), object(4)
memory usage: 7.9+ MB
df["pr_page"].unique().size # Shows how many unique registers has pr_page column.
84338
The Page Protection API can be a much simpler way to access data about page protections if you already know which articles you are interested in and they are relatively few (e.g., hundreds or low thousands).
NOTE: the APIs are up-to-date while the Mediawiki dumps are always at least several days behind -- i.e., they are snapshots at specific points in time -- so the data you get from the Mediawiki dumps might be different from the APIs if a page's protections have changed in the intervening days.
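As a reference point, the same information can also be requested with a plain HTTP call against the Action API. This is only a sketch (the user-agent string and the example page ID, taken from the dump section above, are illustrative); the rest of this notebook uses the mwapi session set up below.
# Sketch: raw query to the MediaWiki Action API with the requests library
import requests  # only needed for this standalone sketch

api_url = 'https://{0}.org/w/api.php'.format(SITENAME)  # e.g., https://en.wikipedia.org/w/api.php
params = {'action': 'query',
          'prop': 'info',
          'inprop': 'protection',
          'pageids': '1086732',  # example page ID from the dump section above
          'format': 'json'}
headers = {'User-Agent': 'Page Protection API tutorial -- isaac@wikimedia.org'}
res = requests.get(api_url, params=params, headers=headers).json()
print(res['query']['pages']['1086732'].get('protection'))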
# import library
import mwapi # useful for accessing Wikimedia API
# select 10 random ids from df
ten_random_ids = df['pr_page'].sample(10, random_state = 1) # pandas Series
# save the ids as a list
list_ids = ten_random_ids.tolist()
# print the ids list
print(list_ids)
[37813208, 28938834, 19333018, 22798656, 22711463, 41170415, 13136934, 41511721, 3477349, 22857969]
# mwapi documentation: https://pypi.org/project/mwapi/
# user_agent helps identify the request if there's an issue and is best practice
tutorial_label = 'Page Protection API tutorial (mwapi)'
# NOTE: it is best practice to include a contact email in user agents
# generally this is private information though so do not change it to yours
# if you are working in the PAWS environment or adding to a Github repo
# for Outreachy, you can leave this as my (isaac's) email or switch it to your Mediawiki username
# e.g., Isaac (WMF) for https://www.mediawiki.org/wiki/User:Isaac_(WMF)
contact_email = 'isaac@wikimedia.org'
session = mwapi.Session('https://{0}.org'.format(SITENAME), user_agent='{0} -- {1}'.format(tutorial_label, contact_email))
# API endpoint: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Binfo
# More details: https://www.mediawiki.org/wiki/API:Info
params = {'action':'query',
'pageids':'|'.join([str(d) for d in list_ids]), # this parameter receives a list of integers, values separated with "|"
'prop':'info',
'inprop':'protection'
}
## make request to API for data
result = session.get(params)
# TODO: examine API results and compare to data from Mediawiki dump to see if
# they are the same and explain any discrepancies
# Shows API results
print("Quantity of elements in 'result'=",len(result['query']['pages']))
for k,v in result['query']['pages'].items():
print("Element in result: \n",k,v)
break
Quantity of elements in 'result'= 10
Element in result: 
 3477349 {'pageid': 3477349, 'ns': 0, 'title': 'Neilston', 'contentmodel': 'wikitext', 'pagelanguage': 'en', 'pagelanguagehtmlcode': 'en', 'pagelanguagedir': 'ltr', 'touched': '2020-10-30T02:28:22Z', 'lastrevid': 980151641, 'length': 55282, 'protection': [{'type': 'move', 'level': 'sysop', 'expiry': 'infinity'}], 'restrictiontypes': ['edit', 'move']}
After inspecting the protection data that the API provides for the selected pages, we can see that the number of attributes differs from the dump file. In the API, protection information is stored as a dict with 3 keys (type, level, expiry); there is no information about cascading protections or per-user edit restrictions as there is in the dump file. The missing per-user field is not a problem, however, because that field is not in use.
Now we are going to process the data into a format that allows us to easily compare the two sources of information field by field.
# sort the new data frame by pr_page and pr_type
df_ten_dump = df.loc[df['pr_page'].isin(list_ids)].sort_values(by=['pr_page','pr_type'])
df_ten_dump.head(4) # shows 4 rows
 | pr_page | pr_type | pr_level | pr_cascade | pr_user | pr_expiry | pr_id
---|---|---|---|---|---|---|---
22046 | 3477349 | move | sysop | 0 | None | infinity | 201435 |
136698 | 13136934 | edit | extendedconfirmed | 0 | None | infinity | 835682 |
136699 | 13136934 | move | extendedconfirmed | 0 | None | infinity | 835683 |
26567 | 19333018 | edit | autoconfirmed | 0 | None | infinity | 235073 |
df_ten_dump.reset_index(drop=True, inplace=True) # Reset the DataFrame index and drop the old index
# Use the Pandas classmethod DataFrame.from_dict to create a DF from API results
df_api = pd.DataFrame.from_dict(result['query']['pages'],orient='index')
df_api.head(3)
 | pageid | ns | title | contentmodel | pagelanguage | pagelanguagehtmlcode | pagelanguagedir | touched | lastrevid | length | protection | restrictiontypes | redirect
---|---|---|---|---|---|---|---|---|---|---|---|---|---
3477349 | 3477349 | 0 | Neilston | wikitext | en | en | ltr | 2020-10-30T02:28:22Z | 980151641 | 55282 | [{'type': 'move', 'level': 'sysop', 'expiry': ... | [edit, move] | NaN |
13136934 | 13136934 | 0 | Gurbaksh Chahal | wikitext | en | en | ltr | 2020-10-30T04:02:53Z | 981766586 | 39790 | [{'type': 'edit', 'level': 'extendedconfirmed'... | [edit, move] | NaN |
19333018 | 19333018 | 2 | User:Scjessey | wikitext | en | en | ltr | 2020-10-13T12:38:21Z | 947303256 | 9485 | [{'type': 'edit', 'level': 'autoconfirmed', 'e... | [edit, move] | NaN |
df_api.count() # count rows per column in DF
pageid                  10
ns                      10
title                   10
contentmodel            10
pagelanguage            10
pagelanguagehtmlcode    10
pagelanguagedir         10
touched                 10
lastrevid               10
length                  10
protection              10
restrictiontypes        10
redirect                 2
dtype: int64
# The "protection" column has a list per row with more than one element
# Use Pandas method _explode_ to transform each element of the list to a row
df_api = df_api.explode('protection')
# Count rows after use _explode_ in "protection" column
df_api.count()
pageid                  19
ns                      19
title                   19
contentmodel            19
pagelanguage            19
pagelanguagehtmlcode    19
pagelanguagedir         19
touched                 19
lastrevid               19
length                  19
protection              19
restrictiontypes        19
redirect                 4
dtype: int64
# df_api holds the main page protection information in the "protection" column,
# as a dictionary per row.
# With the _apply_ and pd.Series methods, the "protection" column is transformed into a DataFrame
df_api_protection = df_api['protection'].apply(pd.Series).reset_index()
df_api_protection.head(3)
 | index | type | level | expiry
---|---|---|---|---
0 | 3477349 | move | sysop | infinity |
1 | 13136934 | edit | extendedconfirmed | infinity |
2 | 13136934 | move | extendedconfirmed | infinity |
# Rename df_api_protection columns to match df_ten_dump so the two can be compared
cols = {'index':'pr_page', 'type':'pr_type', 'level':'pr_level', 'expiry':'pr_expiry'}
df_api_protection.rename(columns=cols, inplace=True)
df_api_protection.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   pr_page    19 non-null     object
 1   pr_type    19 non-null     object
 2   pr_level   19 non-null     object
 3   pr_expiry  19 non-null     object
dtypes: object(4)
memory usage: 736.0+ bytes
# Change 'pr_page' column type from object to int
df_api_protection["pr_page"] = pd.to_numeric(df_api_protection["pr_page"])
# Sort df_api_protection by 'pr_page' and 'pr_type'
df_api_protection.sort_values(by= ['pr_page','pr_type'], inplace=True)
The Pandas compare method compares one DataFrame with another and shows the differences, if any exist. The resulting DataFrame shows that there are no differences between df_api_protection and df_ten_dump, since every compared cell is NaN.
Note that this result is not necessarily representative of all the data, as we are only comparing 10 IDs out of a possible 84,338 (about 0.012% of the total); see the sketch after the comparison below for how this check could be scaled up.
# Use compare Pandas DataFrame method
df_api_protection.compare(df_ten_dump[['pr_page','pr_type','pr_level','pr_expiry']],
keep_shape=True)
 | pr_page (self) | pr_page (other) | pr_type (self) | pr_type (other) | pr_level (self) | pr_level (other) | pr_expiry (self) | pr_expiry (other)
---|---|---|---|---|---|---|---|---
0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
6 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
12 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
13 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
14 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
15 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
16 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
17 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
18 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
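If you wanted a more representative check than 10 IDs, you could batch larger samples of page IDs through the same API call. This is only a sketch, assuming the usual limit of 50 page IDs per request for non-bot accounts:
# Sketch: gather protections for a larger sample of page IDs, 50 per request
def get_protections(pageids, batch_size=50):
    rows = []
    for i in range(0, len(pageids), batch_size):
        batch = pageids[i:i + batch_size]
        res = session.get({'action': 'query',
                           'pageids': '|'.join(str(p) for p in batch),
                           'prop': 'info',
                           'inprop': 'protection'})
        for pid, page in res['query']['pages'].items():
            for pr in page.get('protection', []):
                rows.append({'pr_page': int(pid), 'pr_type': pr['type'],
                             'pr_level': pr['level'], 'pr_expiry': pr['expiry']})
    return pd.DataFrame(rows)

# e.g., protections for 100 random pages from the dump:
# df_api_100 = get_protections(df['pr_page'].drop_duplicates().sample(100, random_state=1).tolist())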
Here we show some examples of things we can do with the data that we gathered about the protections for various Wikipedia articles. You'll want to come up with some questions to ask of the data as well. For this, you might need to gather additional data, such as the page table dump, which is available in the same DUMP_DIR under the name {LANGUAGE}-latest-page.sql.gz.
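For example, a minimal sketch for peeking at the header of that page table dump, reusing the same gzip approach as for the page_restrictions dump above (this assumes the file is present in DUMP_DIR with the file name shown):
# Sketch: print the header (including the CREATE TABLE statement) of the page table dump
PAGE_DUMP_FN = '{0}-latest-page.sql.gz'.format(LANGUAGE)
with gzip.open(os.path.join(DUMP_DIR, PAGE_DUMP_FN), 'rt') as raw:
    for line in raw:
        if line.lower().startswith('insert'):
            break  # stop once the actual data rows start
        print(line.rstrip())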
# TODO: add any imports of data analysis / visualization libraries here as necessary
import random
import time
from scipy.stats import pearsonr, pointbiserialr
import seaborn as sns
from IPython.display import display
sns.set()
We are going to use the page protection data gathered from the Mediawiki dump and saved earlier in the DataFrame df.
There are 84,338 unique page IDs in the DataFrame. Each page can have more than one type of protection, such as edit, move, or upload, and as we can observe, the majority of the pages in this DataFrame have two types of protection: edit and move.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147325 entries, 0 to 147324
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   pr_page     147325 non-null  int64
 1   pr_type     147325 non-null  object
 2   pr_level    147325 non-null  object
 3   pr_cascade  147325 non-null  int64
 4   pr_user     0 non-null       object
 5   pr_expiry   147154 non-null  object
 6   pr_id       147325 non-null  int64
dtypes: int64(3), object(4)
memory usage: 7.9+ MB
df['pr_page'].unique().size # count unique IDs in df
84338
df.groupby(by='pr_type')["pr_page"].count() # explore categories in protection type
pr_type
edit      73221
move      73894
upload      210
Name: pr_page, dtype: int64
# Count number of protections per page and save in a df to plot it
pr_count = df.groupby(by='pr_page')["pr_type"].count().to_frame()
# Plot the number of protections per page
c = sns.catplot(x= 'pr_type',kind="count", data=pr_count)
_ = c.set_axis_labels("Number of protections per page")
The most common protection level for the move and upload types is full protection ('sysop' in the data), while for the edit type it is semi-protection ('autoconfirmed' in the data).
# Explore the levels of each protection type:
# Edit protection
df[df['pr_type']=='edit'].groupby(by='pr_level')['pr_page'].count()
pr_level
autoconfirmed        47995
extendedconfirmed     2372
sysop                14278
templateeditor        8576
Name: pr_page, dtype: int64
# Move protection
df[df['pr_type']=='move'].groupby(by='pr_level')['pr_page'].count()
pr_level
autoconfirmed        30075
extendedconfirmed     1967
sysop                33843
templateeditor        8009
Name: pr_page, dtype: int64
# Upload protection
df[df['pr_type']=='upload'].groupby(by='pr_level')['pr_page'].count()
pr_level
autoconfirmed         31
extendedconfirmed      2
sysop                176
templateeditor         1
Name: pr_page, dtype: int64
# Make a list with the unique page ids in df
unique_ids = df['pr_page'].unique()
unique_ids = unique_ids.tolist()
# Select 4000 random page IDs from the page_restrictions data gathered from the Mediawiki dump
number_sample = 4000
random_ids_3 = random.sample(unique_ids, number_sample)
# Create a DataFrame with the protection records for those 4000 IDs
df_random_3 = df.loc[df['pr_page'].isin(random_ids_3)].reset_index(drop=True)
df_random_3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7010 entries, 0 to 7009
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   pr_page     7010 non-null   int64
 1   pr_type     7010 non-null   object
 2   pr_level    7010 non-null   object
 3   pr_cascade  7010 non-null   int64
 4   pr_user     0 non-null      object
 5   pr_expiry   7003 non-null   object
 6   pr_id       7010 non-null   int64
dtypes: int64(3), object(4)
memory usage: 383.5+ KB
## remove 4 columns that are not required for analysis
df_random_3.drop(columns=['pr_id', 'pr_user','pr_cascade','pr_expiry'], inplace=True)
df_random_3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7010 entries, 0 to 7009
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   pr_page   7010 non-null   int64
 1   pr_type   7010 non-null   object
 2   pr_level  7010 non-null   object
dtypes: int64(1), object(2)
memory usage: 164.4+ KB
The most common protection types in this data are move and edit; from now on we are going to work only with those two types.
df_random_3.groupby(by='pr_type')["pr_page"].count() # explore categories in protection type
pr_type
edit      3468
move      3530
upload      12
Name: pr_page, dtype: int64
To make some kind of inference we need additional information about the articles, so we are going to request the number of anonymous contributors for the 4000 IDs randomly selected from the page_restrictions data gathered from the Mediawiki dump. The intuition is that the pages with the greatest number of anonymous contributors may be exposed to more vandalism and therefore need some kind of protection.
# list of logged-in user groups to exclude from the contributors request
list_exc =['accountcreator|bot|bureaucrat|checkuser|confirmed|import|interface-admin|ipblock-exempt|oversight|steward|sysop|transwiki']
start_time = time.time()
time_every = 500 # print elapsed time every 500 ids processed
contributors=[]
for i in range(0,number_sample,50):
params_50 = {'action':'query',
'pageids':'|'.join([str(d) for d in random_ids_3[i:i+50]]),
'prop':'contributors',
'pclimit': 'max',
'pcexcludegroup': list_exc
}
# make a request
result_50 = session.get(params_50)
# loop over request result to obtain the anonymous contributors count
pages = result_50['query']['pages']
for item in pages:
if 'anoncontributors' in pages[item]:
dic = {'pr_page':item, 'anonycontributors':int(pages[item]['anoncontributors'])}
contributors.append(dic)
else:
dic = {'pr_page':item, 'anonycontributors':0}
contributors.append(dic)
# check the elapsed time every 500 IDs processed, from the loop start until all IDs are processed
if i % time_every == 0:
elapsed_t_sec = time.time() - start_time
print("{0} IDs processed. {1:.2f} minutes elapsed.".format(i,elapsed_t_sec/60))
elapsed_t_sec = time.time() - start_time
print("total:{0} IDs processed. {1:.2f} minutes elapsed.".format(i+50,elapsed_t_sec/60))
0 IDs processed. 0.12 minutes elapsed.
500 IDs processed. 0.84 minutes elapsed.
1000 IDs processed. 1.60 minutes elapsed.
1500 IDs processed. 2.31 minutes elapsed.
2000 IDs processed. 3.43 minutes elapsed.
2500 IDs processed. 4.14 minutes elapsed.
3000 IDs processed. 4.82 minutes elapsed.
3500 IDs processed. 5.48 minutes elapsed.
total:4000 IDs processed. 6.18 minutes elapsed.
The main reason to request only this many IDs from the API is that it is a time-consuming process, as the timing reports above show.
print("There are {0} dictionaries in contributors list, suchs as {1}".format(len(contributors),
contributors[0]))
There are 4000 dictionaries in contributors list, such as {'pr_page': '29810', 'anonycontributors': 2664}
# Transform the contributors data gathered from the API into a DataFrame
df_contributors = pd.DataFrame(contributors)
# Inspect contributors DataFrame
df_contributors.head(2)
 | pr_page | anonycontributors
---|---|---
0 | 29810 | 2664 |
1 | 150011 | 203 |
df_contributors.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   pr_page            4000 non-null   object
 1   anonycontributors  4000 non-null   int64
dtypes: int64(1), object(1)
memory usage: 62.6+ KB
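Because gathering this data from the API takes several minutes (see the timings above), it can be worth caching the result to disk so the loop does not need to be re-run. A minimal sketch (the file name is just an example):
# Sketch: cache the contributors data so the API loop doesn't need to be re-run
cache_fn = 'contributors_sample.csv'  # arbitrary example file name
df_contributors.to_csv(cache_fn, index=False)
# later, or in another session:
# df_contributors = pd.read_csv(cache_fn)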
# Change 'pr_page' column type from object to int in contributors DataFrame
df_contributors["pr_page"] = pd.to_numeric(df_contributors["pr_page"])
# Reshape the data so that there is one column for edit protection
# and another for move protection, with just one row per page ID
df_edit_3 = df_random_3[df_random_3['pr_type']=='edit'] # edit
df_move_3 = df_random_3[df_random_3['pr_type']=='move'] # move
# Merge edit and move
df_merged = df_edit_3.merge(df_move_3, how='outer', left_on='pr_page',right_on='pr_page',
suffixes=('_edit','_move'))
# Merge contributors
df_merged = df_merged.merge(df_contributors,how='outer', left_on='pr_page', right_on='pr_page')
# inspect merged DataFrame
df_merged.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 0 to 3999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   pr_page            4000 non-null   int64
 1   pr_type_edit       3468 non-null   object
 2   pr_level_edit      3468 non-null   object
 3   pr_type_move       3530 non-null   object
 4   pr_level_move      3530 non-null   object
 5   anonycontributors  4000 non-null   int64
dtypes: int64(2), object(4)
memory usage: 218.8+ KB
# Set 'pr_page' the page's id as index in merged DataFrame
df_merged.set_index('pr_page', inplace=True)
df_merged.head(2)
pr_page | pr_type_edit | pr_level_edit | pr_type_move | pr_level_move | anonycontributors
---|---|---|---|---|---
1657925 | edit | autoconfirmed | move | autoconfirmed | 0 |
6119248 | edit | sysop | move | sysop | 0 |
Since not all pages have all of the protection types for which new columns were created, some null values were generated. We are going to replace those null values with 0.
df_merged.isna().sum() # check for null values generated
pr_type_edit         532
pr_level_edit        532
pr_type_move         470
pr_level_move        470
anonycontributors      0
dtype: int64
# Replace null values with 0
df_merged.fillna(0, inplace=True)
# mape categorical value "edit" for 1
df_merged['pr_type_edit'].replace('edit',1,inplace=True)
# mape categorical value "move" for 1
df_merged['pr_type_move'].replace('move',1,inplace=True)
df_merged['anonycontributors'].describe() # descriptive statistics for anonymous contributors
count     4000.000000
mean       123.913000
std        441.708597
min          0.000000
25%          0.000000
50%          0.000000
75%         12.000000
max      14306.000000
Name: anonycontributors, dtype: float64
As we can see, most of the articles in the sample have very few or no anonymous contributors.
a = sns.distplot(df_merged['anonycontributors'],
axlabel = "Number of anonymous contributors per article")
_ = a.set_ylabel('Proportion of Wikipedia articles')
If we explore the data by type of protection, we observe that pages with edit protection have a lower average number of anonymous contributors than pages without it.
And if we explore the levels of edit protection, we notice that the average number of anonymous contributors per level increases as the level of protection decreases.
display(df_merged.groupby('pr_type_edit')['anonycontributors'].describe(),
df_merged.groupby('pr_level_edit')['anonycontributors'].describe())
pr_type_edit | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---
0 | 532.0 | 364.624060 | 852.812638 | 0.0 | 0.0 | 55.0 | 416.25 | 14306.0 |
1 | 3468.0 | 86.987313 | 321.558111 | 0.0 | 0.0 | 0.0 | 4.00 | 3605.0 |
pr_level_edit | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---
0 | 532.0 | 364.624060 | 852.812638 | 0.0 | 0.00 | 55.0 | 416.25 | 14306.0 |
autoconfirmed | 2243.0 | 126.561748 | 385.689783 | 0.0 | 0.00 | 0.0 | 15.50 | 3605.0 |
extendedconfirmed | 122.0 | 128.729508 | 327.505928 | 0.0 | 0.25 | 15.5 | 62.75 | 1855.0 |
sysop | 684.0 | 2.717836 | 20.287196 | 0.0 | 0.00 | 0.0 | 0.00 | 397.0 |
templateeditor | 419.0 | 0.548926 | 1.727727 | 0.0 | 0.00 | 0.0 | 0.00 | 22.0 |
On the other hand, when it comes to move protection, the average number of anonymous contributors is roughly the same whether or not the page has this protection.
display(df_merged.groupby('pr_type_move')['anonycontributors'].describe(),
df_merged.groupby('pr_level_move')['anonycontributors'].describe())
pr_type_move | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---
0 | 470.0 | 124.676596 | 366.150519 | 0.0 | 0.0 | 0.0 | 26.0 | 3161.0 |
1 | 3530.0 | 123.811331 | 450.859031 | 0.0 | 0.0 | 0.0 | 11.0 | 14306.0 |
pr_level_move | count | mean | std | min | 25% | 50% | 75% | max
---|---|---|---|---|---|---|---|---
0 | 470.0 | 124.676596 | 366.150519 | 0.0 | 0.0 | 0.0 | 26.0 | 3161.0 |
autoconfirmed | 1426.0 | 92.848527 | 298.557857 | 0.0 | 0.0 | 0.0 | 14.0 | 2690.0 |
extendedconfirmed | 97.0 | 74.329897 | 204.306486 | 0.0 | 2.0 | 14.0 | 39.0 | 1308.0 |
sysop | 1605.0 | 185.157632 | 597.470689 | 0.0 | 0.0 | 0.0 | 28.0 | 14306.0 |
templateeditor | 402.0 | 0.656716 | 2.540754 | 0.0 | 0.0 | 0.0 | 0.0 | 36.0 |
The point-biserial correlation is used to measure the relationship between a binary variable and a continuous variable. For this data sample, the correlation between the edit protection type and the number of anonymous contributors is negative and weak,
but we are going to try to train a naive model with the limited data we have.
corr, p_value = pointbiserialr(df_merged['pr_type_edit'], df_merged['anonycontributors'])
print("The correlation between protection type edit and the anonymous contributors {0:.3f} (p={1:0.3f})".format(corr,p_value))
The correlation between protection type edit and the anonymous contributors -0.213 (p=0.000)
corr, p_value = pointbiserialr(df_merged['pr_type_move'], df_merged['anonycontributors'])
print("The correlation between protection type edit and the anonymous contributors {0:.3f} (p={1:0.3f})".format(corr,p_value))
The correlation between protection type move and the anonymous contributors -0.001 (p=0.968)
To protect pages from vandalism, Wikipedia allows some articles to be protected so that only certain users can make revisions to them. However, with over six million articles in the English Wikipedia, it is very difficult for editors to monitor all pages and suggest articles in need of edit protection.
Therefore, the problem of deciding whether or not a Wikipedia article should be edit-protected can be formulated as a binary classification task, with a very simplistic set of features for deciding which pages to protect: (1) the number of anonymous contributors and (2) other types of protection, such as move.
We are going to use a Naive Bayes (NB) classifier that predicts whether a page has an edit protection or not.
The model is very simplistic and the set of features is very limited, but it can be a good starting point for an improved version in future analysis.
# imports
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# preprocess data for modeling
X = df_merged[['anonycontributors', 'pr_type_move']]
y = df_merged[['pr_type_edit']]
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
# Print the first 3 elements of each set
print("X_train ({0}):\n{1}".format(len(X_train), X_train.head(3)))
print("\ny_train ({0}):\n{1}".format(len(y_train), y_train.head(3)))
print("\nX_test ({0}):\n{1}".format(len(X_test), X_test.head(3)))
print("\ny_test ({0}):\n{1}".format(len(y_test), y_test.head(3)))
X_train (3600):
          anonycontributors  pr_type_move
pr_page
56556988                  0             1
3653348                   0             1
30795                  2269             1

y_train (3600):
          pr_type_edit
pr_page
56556988             1
3653348              1
30795                0

X_test (400):
          anonycontributors  pr_type_move
pr_page
17282118                  0             1
51745047                  0             0
19942102                  0             1

y_test (400):
          pr_type_edit
pr_page
17282118             1
51745047             1
19942102             1
# Initialize our classifier
gnb = GaussianNB().fit(X_train.values, y_train.values.ravel())
# Make predictions
y_pred = gnb.predict(X_test)
# Evaluate model
print("Number of mislabeled points out of a total {0} points: {1}".format(X_test.shape[0]
,(y_test.values.ravel() != y_pred).sum()))
Number of mislabeled points out of a total 400 points: 57
print(accuracy_score(y_test.values.ravel(), y_pred))
0.8575
The NB classifier is 85.75% accurate on the held-out test set; that is, 85.75% of the time it correctly predicts whether or not a page has edit protection. These results suggest that our two features carry some signal about the edit-protection class, but we would need to add more features, and compare against a simple baseline, to draw stronger conclusions.
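Since about 87% of pages in this sample have edit protection (3,468 of 4,000), a useful sanity check is to compare against a majority-class baseline and look at per-class metrics. A minimal sketch using scikit-learn's DummyClassifier and classification_report, assuming the train/test split above is still in scope:
# Sketch: compare the NB classifier against a majority-class baseline
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

# baseline that always predicts the most frequent class in the training data
baseline = DummyClassifier(strategy='most_frequent').fit(X_train.values, y_train.values.ravel())
print("Majority-class baseline accuracy:",
      accuracy_score(y_test.values.ravel(), baseline.predict(X_test.values)))
# per-class precision/recall for the NB predictions gives a fuller picture than accuracy alone
print(classification_report(y_test.values.ravel(), y_pred))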