You can run MySQL queries against live replicas of all public Wikimedia project databases from within a PAWS notebook.
These databases contain all publicly-avaialable data from the production databases. Sensitive information such as user email addresses, and metadata about (some kinds of) deleted revisions, is redacted.
The MediaWiki database schema shows the fields available in each table, and the relationship between the tables, for all tables that are common to all Wikimedia projects. There are also special, one-off or extension-specific tables available on some wikis.
If you're just interested in information about users, edits, and pages, most of the data you'll be interested in can be found in the following tables:
The function below uses os
and pymysql
to connect to the database with your current credentials (the same ones you use to log into PAWS and quarry.wmflabs.org) and returns a connection object.
IMPORTANT do not print your os.environ
credentials in PAWS—these variables contain your secure login credentials and all notebooks are potentially public.
import os
import pymysql
def connect_to_db():
"""
Connect to the public enwiki db using quarry.wmflabs.org credentials
"""
conn = pymysql.connect(
host=os.environ['MYSQL_HOST'],
user=os.environ['MYSQL_USERNAME'],
password=os.environ['MYSQL_PASSWORD'], #don't print me!
database='enwiki_p',
charset='utf8',
use_unicode=True
)
return conn
Once you've built a connection, you can run any arbitrary SELECT queries you want. Queries are throttled if they run more than 30 minutes.
conn = connect_to_db()
with conn.cursor() as cur:
cur.execute('use enwiki_p;') #which database to use
cur.execute("""SELECT rev_id, rev_user, rev_user_text, rev_timestamp, rev_comment
FROM enwiki_p.revision_userindex
WHERE rev_user_text = "{}"
AND DATE_FORMAT(rev_timestamp, '%Y%m%d%H%i%s')
> DATE_FORMAT({}, '%Y%m%d%H%i%s')""".format("Jtmorgan", "20160101000000"))
revs = cur.fetchall()
for r in revs:
print(r)
(697802037, 7878375, b'Jtmorgan', b'20160102005810', b'/* Inappropriate hosts */ my 2 cents') (700508456, 7878375, b'Jtmorgan', b'20160118224556', b'/* Hosts MIA */ reply') (700667121, 7878375, b'Jtmorgan', b'20160119220941', b'/* Proposed updates to invite template and inviter critera */ update re: inviter block/activity checker') (700671249, 7878375, b'Jtmorgan', b'20160119223809', b'/* Inappropriate hosts */ my opinion') (700671367, 7878375, b'Jtmorgan', b'20160119223857', b"/* Inappropriate hosts */ I wasn't the only one!") (702967818, 7878375, b'Jtmorgan', b'20160202182119', b'/* Deleted Content */ + Teahouse invite') (704180534, 7878375, b'Jtmorgan', b'20160210011313', b"I don't see why citation needed span is necessary on a single sentence") (704180761, 7878375, b'Jtmorgan', b'20160210011521', b'Undid revision 704099717 by [[Special:Contributions/Guestajh|Guestajh]] ([[User talk:Guestajh|talk]]).') (705841076, 7878375, b'Jtmorgan', b'20160219213406', b'') (705841243, 7878375, b'Jtmorgan', b'20160219213430', b'/* Proposed updates to invite template and inviter critera */ subst invite sample') (705841385, 7878375, b'Jtmorgan', b'20160219213537', b'/* Hello World */ rmv empty section') (705841435, 7878375, b'Jtmorgan', b'20160219213556', b'/* Hello World */ rmv empty section') (705841474, 7878375, b'Jtmorgan', b'20160219213614', b'/* Hello World */ rmv empty section') (705843147, 7878375, b'Jtmorgan', b'20160219214925', b'/* Proposed updates to invite template and inviter critera */ reply to Cordless Larry') (705844656, 7878375, b'Jtmorgan', b'20160219220052', b'/* About the "Join this discussion" gadget */ reply to Diego') (705844978, 7878375, b'Jtmorgan', b'20160219220314', b'/* Proposed updates to invite template and inviter critera */ reply') (706181650, 7878375, b'Jtmorgan', b'20160221231936', b'/* Proposed updates to invite template and inviter critera */ reply to Liz') (710793092, 7878375, b'Jtmorgan', b'20160319021004', b'/* Why */ rmv section by [[User:Major_Dracula]] (personal insults)') (711092934, 7878375, b'Jtmorgan', b'20160320212646', b'/* Security issue - mail */ resolved') (711649332, 7878375, b'Jtmorgan', b'20160324021300', b'/* Early life and emigration */ add some sources') (711649485, 7878375, b'Jtmorgan', b'20160324021410', b'/* Career */ fix source link') (711649763, 7878375, b'Jtmorgan', b'20160324021648', b'/* Career */ one more cite') (711649836, 7878375, b'Jtmorgan', b'20160324021724', b'rmv multiple issues template') (711650590, 7878375, b'Jtmorgan', b'20160324022335', b'architect infobox') (711807679, 7878375, b'Jtmorgan', b'20160325004822', b'') (712584902, 7878375, b'Jtmorgan', b'20160329232028', b'/* Blocked user */ reply') (714974277, 7878375, b'Jtmorgan', b'20160412231027', b'fix wikilink') (714975741, 7878375, b'Jtmorgan', b'20160412232359', b'/* I hereby award you the Cup & Saucer Award of Integrity */ new section') (714975915, 7878375, b'Jtmorgan', b'20160412232543', b'/* For asking an excellent question at the Teahouse */ new section') (714976038, 7878375, b'Jtmorgan', b'20160412232647', b'/* TeaHouse Badge */ reply') (720109972, 7878375, b'Jtmorgan', b'20160513200453', b'/* About HostBot */ reply to JumpiMaus') (720959087, 7878375, b'Jtmorgan', b'20160518232628', b'rmv dupe entry') (723058795, 7878375, b'Jtmorgan', b'20160531185943', b'/* Recent questions section */ reply') (723063594, 7878375, b'Jtmorgan', b'20160531193419', b'Undid revision 723015428 by [[Special:Contributions/PrimeHunter|PrimeHunter]] ([[User talk:PrimeHunter|talk]]) (fixed the issue)') (723063980, 7878375, b'Jtmorgan', b'20160531193711', b'/* Recent questions section */ fixed the problem. Thanks again!') (724537329, 7878375, b'Jtmorgan', b'20160609220545', b'/* Example of good hosting */ reply') (724649719, 7878375, b'Jtmorgan', b'20160610160400', b'rmv unsourced information') (726709091, 7878375, b'Jtmorgan', b'20160623214653', b'Undid revision 726638534 by [[Special:Contributions/114.4.82.25|114.4.82.25]] ([[User talk:114.4.82.25|talk]])') (729345482, 7878375, b'Jtmorgan', b'20160711154128', b'fix invalid ref') (729388837, 7878375, b'Jtmorgan', b'20160711212753', b'/* Speedy deletion nomination of Wikipedia:Enoch-Jude Danquah */ reply') (731387050, 7878375, b'Jtmorgan', b'20160725012049', b"/* Pink tide: 2000\xe2\x80\x9306 */ cite Fidel's speech for response to 9/11") (731389833, 7878375, b'Jtmorgan', b'20160725014539', b'/* Background */') (731390656, 7878375, b'Jtmorgan', b'20160725015208', b'named the ABC diplomats in the body of the article') (731390784, 7878375, b'Jtmorgan', b'20160725015318', b'images of the ABC diplomats to top') (731392223, 7878375, b'Jtmorgan', b'20160725020855', b'/* Other sightings */ uh-huh') (731392517, 7878375, b'Jtmorgan', b'20160725021210', b"Undid revision 731368548 [[Wikipedia:External_links|external links]] shouldn't be placed in the body of an article. Try adding a citation instead") (731528232, 7878375, b'Jtmorgan', b'20160725225442', b'link to Wikipedia:Glossary; Wikipedia:Jargon is about technical terminology in articles')
Timestamps are stored in the DB as 14-character binary text strings, rather than datetime objects. The format is
... or "%Y%m%d%H%i%s"
in Python format-speak.
You can convert these back and forth to datetime objects using the following functions.
from datetime import datetime
def convert_to_db_timestamp(timestamp):
"""
- Accepts a datetime object
- Returns a db-timestamp formatted string e.g. '20160621195536'
"""
return datetime.strftime(timestamp, "%Y%m%d%H%M%S")
def convert_to_datetime(regstr):
"""
- Accepts a MediaWiki db-formatted timestamp formatted string
- Returns a datetime object
"""
return datetime.strptime(regstr, "%Y%m%d%H%M%S")
print(convert_to_db_timestamp(datetime.now()))
print(convert_to_datetime('20161111000000'))
20160726214838 2016-11-11 00:00:00
The example function below gets a user's edit count at a particular point in time.
NOTE The revision
table is huge. If you're trying to get revision information about a particular user from the revision table, use revision_userindex
instead—much faster!
def get_user_editcount(user, timestamp, conn):
"""
- Accepts a username, datetime, and db conn
- Returns the number of edits the user made before that date and time
OR None, if there's an error (there will be some errors)
"""
try:
dbts = convert_to_db_timestamp(timestamp)
except TypeError:
return None
with conn.cursor() as cur:
try:
cur.execute('use enwiki_p;')
cur.execute("""SELECT COUNT(rev_id)
FROM enwiki_p.revision_userindex
WHERE rev_user_text = "{}"
AND DATE_FORMAT(rev_timestamp, '%Y%m%d%H%i%s')
< DATE_FORMAT({}, '%Y%m%d%H%i%s')""".format(user, dbts))
cnt = cur.fetchone()[0]
except TypeError:
cnt = None
return cnt
conn = connect_to_db()
print(get_user_editcount("Jtmorgan", datetime.now(), conn))
5149
This example query retrieves the registration date for a particular user.
def get_user_registration(user, conn):
"""
- Accepts a username and db connection
- Returns the datetime they registered
OR None, if there's an error (e.g. user is IP editor, username not found )
"""
with conn.cursor() as cur:
try:
cur.execute('use enwiki_p;')
cur.execute('select user_registration from user where user_name = "%s"' % (user))
# reg = cur.fetchone()#fix name, variable assignment
reg = convert_to_datetime(cur.fetchone()[0].decode("utf-8"))
except (TypeError, AttributeError) as e:
reg = None
return reg
conn = connect_to_db()
print(get_user_registration("Jtmorgan", conn))
2008-09-17 14:31:25