import mwxml
import gzip
import dateutil.parser
import glob
from collections import Counter
from sqlalchemy import create_engine
import sys, os
import pandas as pd
# connect to the wiki database replicas, pulling credentials from the environment
constr = 'mysql+pymysql://{user}:{pwd}@{host}'.format(
    user=os.environ['MYSQL_USERNAME'],
    pwd=os.environ['MYSQL_PASSWORD'],
    host=os.environ['MYSQL_HOST'])
con = create_engine(constr)
revisions = mwxml.Dump.from_file(gzip.open('/public/dumps/public/enwiki/20180601/enwiki-20180601-stub-meta-history.xml.gz'))
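# Quick sanity check on the dump iterator: a Dump yields Page objects and each Page
# yields its Revisions. The cell that produced the output below is not in the
# transcript, so this reconstruction is an assumption:
page = next(iter(revisions))
rev = next(iter(page))
print(page, (rev.timestamp, rev.timestamp.unix()))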
Page(id=626, title='Auteur Theory Film', namespace=0, redirect='Auteur', restrictions=[]) (Timestamp('2001-03-08T00:27:40Z'), 984011260)
users = pd.read_sql("""select ukuser.user_id as uk_id,
enuser.user_id as en_id,
ukuser.user_name as username from ukwiki_p.user ukuser
join enwiki_p.user enuser ON ukuser.user_name = enuser.user_name
where ukuser.user_editcount between 10 and 1000 and enuser.user_editcount between 10 and 1000""", con)
en_users = set(users['en_id'])
len(en_users)
2763
uk_originals = pd.read_sql("""
SELECT ips_item_id as wikidataId, ips_site_page AS uktitle, ukwiki_p.page.page_id as ukpage_id, english.entitle as entitle, min(ukwiki_p.revision.rev_timestamp) as uk_created
FROM wikidatawiki_p.wb_items_per_site
INNER JOIN
(SELECT ips_item_id as enwikidata, ips_site_page as entitle FROM wikidatawiki_p.wb_items_per_site WHERE ips_site_id= 'enwiki')
as english
on ips_item_id=english.enwikidata
INNER JOIN ukwiki_p.page ON ukwiki_p.page.page_title = ips_site_page
INNER JOIN ukwiki_p.revision ON ukwiki_p.page.page_id = ukwiki_p.revision.rev_page
WHERE ips_site_id= 'ukwiki'
GROUP BY uktitle""", con)
uk_originals
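# Title columns come back from the replica as bytes (MediaWiki stores them as
# VARBINARY), which is why entitle is .decode()d below; a quick check, sketched here
# (output not shown in the original):
uk_originals.dtypes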
# map each English title to (a) the creation time of the matching Ukrainian article,
# as a Unix timestamp, and (b) the Ukrainian page id; entitle comes back as bytes
uk_created = {row.entitle.decode(): dateutil.parser.parse(row.uk_created).timestamp()
              for _, row in uk_originals.iterrows()}
ukpage_id = {row.entitle.decode(): row.ukpage_id
             for _, row in uk_originals.iterrows()}
uk_created['Biocybernetics'], ukpage_id['Biocybernetics']
(1177356881.0, 158009)
def filt(pages, path):
    """Yield English pages that match a Ukrainian original and share an editor with en_users."""
    for page in pages:
        if page.title not in uk_created:
            continue
        revs = list(page)
        # creation time of the English article = timestamp of its earliest revision
        en_created = min([rev.timestamp.unix() for rev in revs])
        #print(page.title, len(revs), en_created, uk_created[page.title])
        #if en_created < uk_created[page.title]:
        #    continue
        # user ids of the page's (non-deleted) editors
        authors = {rev.user.id for rev in revs if rev.user}
        #print(authors, authors & en_users)
        if not authors & en_users:
            continue
        yield (page.id, ukpage_id[page.title], page.title, en_created,
               uk_created[page.title], authors & en_users)
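# filt() is defined but not invoked in the cells shown; a minimal sketch of how it
# could be driven over the stub-history dumps with mwxml.map. The file list is an
# assumption mirroring the glob used further down, and this scans the full dump:
candidate_files = glob.glob('/public/dumps/public/enwiki/20181120/enwiki-20181120-stub-meta-history[1-9]*.xml.gz')
for match in mwxml.map(filt, candidate_files):
    print(match)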
def map_(pages, path):
    """Yield one TSV-ready tuple per mainspace revision."""
    for page in pages:
        for rev in page:
            # skip revisions whose user or page record is deleted/missing
            if not rev.user or not rev.page:
                continue
            # keep mainspace (article) revisions only
            if rev.page.namespace != 0:
                continue
            yield (rev.user.id, rev.user.text, rev.page.id,
                   rev.page.redirect or rev.page.title,
                   rev.minor, rev.comment, rev.bytes, rev.timestamp.unix())
files = glob.glob('/public/dumps/public/enwiki/20181120/enwiki-20181120-stub-meta-history[1-9]*.xml.gz')
#files = [gzip.open(f) for f in files]
# stream every mainspace revision from the stub-history dumps into a TSV
with open('en-revisions.tsv', 'w', buffering=100) as f:
    for filename in files:
        print(filename)
        for r in mwxml.map(map_, [filename]):
            f.write('\t'.join(map(str, r)) + '\n')
/public/dumps/public/enwiki/20181120/enwiki-20181120-stub-meta-history18.xml.gz
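# The TSV written above has no header row; a sketch of loading it back for analysis.
# The column names are assumptions that mirror the tuple yielded by map_(); note that
# free-text edit comments could contain tabs and break a naive parse.
rev_cols = ['user_id', 'user_text', 'page_id', 'page_title', 'minor',
            'comment', 'bytes', 'timestamp']
en_revisions = pd.read_csv('en-revisions.tsv', sep='\t', names=rev_cols)
en_revisions.head()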