import getpass
import glob
from IPython.display import display
import ipywidgets as widgets
import json
from operator import itemgetter, getitem
import os
from pprint import pprint
import requests
from random import choice, sample, seed, shuffle
from urllib import parse
import time
#get your username and password for the current PAWS session; used to edit testwiki
uname = os.environ['JPY_USER']
print(uname)
upass = getpass.getpass() #don't print me!
Jtmorgan
········
#input dates for the pageview API query
class date_input():
def __init__(self,
year = "",
month = "",
day = "",
):
self.year = widgets.Text(description = 'Year (4 digit)',value = year)
self.month = widgets.Text(description = 'Month (2 digit)',value = month)
self.day = widgets.Text(description = 'Day (2 digit)',value = day)
        #register the submit handler on all three fields, not just the year
        self.year.on_submit(self.handle_submit)
        self.month.on_submit(self.handle_submit)
        self.day.on_submit(self.handle_submit)
display(self.year, self.month, self.day)
def handle_submit(self, text):
self.v = text.value
return self.v
print("enter the year, month and day above, then press return in any field")
f = date_input()
enter the year, month and day above, then press return in any field
print("Date for pageviews: " + "/".join([f.year.value, f.month.value, f.day.value]))
Date for pageviews: 2017/07/24
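Note: the feed endpoint used below expects zero-padded month and day values (e.g. "07" rather than "7"). The 2-digit prompts above encourage this, but if in doubt the widget values could be normalized first; a minimal sketch (padded_month and padded_day are illustrative names, not used elsewhere in this notebook):
#sketch: zero-pad month and day in case they were entered as single digits
padded_month = f.month.value.strip().zfill(2)
padded_day = f.day.value.strip().zfill(2)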
Gather the current top 5 articles by pageviews and trending edits, and their metadata
def api_call(url):
    try:
        call = requests.get(url)
        response = call.json()
    except (requests.RequestException, ValueError): #network error or unparseable JSON
        response = None
    return response
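api_call gives up after a single attempt. If the API turns out to be flaky, a retry variant along these lines would be safer (a sketch, not used below):
def api_call_with_retries(url, tries=3, pause=1):
    #retry transient failures a few times before giving up
    for attempt in range(tries):
        try:
            call = requests.get(url)
            call.raise_for_status()
            return call.json()
        except (requests.RequestException, ValueError):
            time.sleep(pause)
    return None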
#https://en.wikipedia.org/api/rest_v1/feed/featured/2017/04/25
endpoint = 'https://en.wikipedia.org/api/rest_v1/feed/featured/{year}/{month}/{day}'
params = {'year' : f.year.value,
'month' : f.month.value,
'day' : f.day.value
}
response = api_call(endpoint.format(**params))
if response:
top_read = {}
    for a in response['mostread']['articles']:
        title = a['normalizedtitle']
        top_read[title] = {}
        top_read[title]['rank'] = a['rank']
        top_read[title]['description'] = a.get('description', '') #default to empty string
        if 'originalimage' in a:
            top_read[title]['image url'] = parse.unquote(a['originalimage']['source'])
        else:
            top_read[title]['image url'] = ''
else:
print("Error retrieving data from API")
def image_link_parsing(article_list):
    """
    Parse image filenames out of each image url,
    flag articles with non-Commons images, and
    fall back to a generic article icon if no image is available.
    """
    for k, v in article_list.items():
        if len(v['image url']) > 0:
            v['file name'] = v['image url'].split("/")[-1] #filename is the last path segment
        else:
            v['file name'] = 'OOjs_UI_icon_article-rtl.svg'
        v['on commons'] = 'commons' in v['image url']
    return article_list
top_read = image_link_parsing(top_read)
def counting_things(article_list):
print("how many items in the set?")
print(len(article_list))
print("\n")
print("how many have an image url?")
print(len([k for k,v in article_list.items() if v['image url']]))
print("\n")
print("how many have a non-commons image?")
print(len([k for k,v in article_list.items() if 'on commons' in v.keys() and not v['on commons']]))
counting_things(top_read)
how many items in the set?
39

how many have an image url?
33

how many have a non-commons image?
23
#sample
pprint(dict(list(top_read.items())[0:2]))
{'Battle of Dunkirk': {'description': 'important battle in the Second World '
                                      'War between the Allies and Germany',
                       'file name': 'Dunkirksoldier1.JPG',
                       'image url': 'https://upload.wikimedia.org/wikipedia/en/8/86/Dunkirksoldier1.JPG',
                       'on commons': False,
                       'rank': 8},
 'Planet of the Apes': {'description': 'science fiction media franchise',
                        'file name': 'Planet_of_the_Apes_(logo).svg',
                        'image url': 'https://upload.wikimedia.org/wikipedia/commons/7/75/Planet_of_the_Apes_(logo).svg',
                        'on commons': True,
                        'rank': 49}}
#save for later
timestr = time.strftime("%Y%m%d-%H%M%S")
with open('data/top_read_{}.json'.format(timestr), 'w') as fout:
json.dump(top_read,fout,sort_keys = True)
print("last saved version: " + timestr)
last saved version: 20170725-142749
Update 7/17/2017: filters out anything with a trendiness score below 1 and anything with fewer than 5 editors
#https://en.wikipedia.org/api/rest_v1/feed/trending/edits
endpoint = 'https://en.wikipedia.org/api/rest_v1/feed/trending/edits'
response = api_call(endpoint)
if response:
trending = {}
    for a in response['pages']:
        if a['editors'] >= 5 and a['trendiness'] >= 1:
            title = a['normalizedtitle']
            trending[title] = {}
            trending[title]['rank'] = a['trendiness']
            trending[title]['description'] = a.get('description', '') #default to empty string
            if 'originalimage' in a:
                trending[title]['image url'] = parse.unquote(a['originalimage']['source'])
            else:
                trending[title]['image url'] = ''
else:
print("Error retrieving data from API")
trending = image_link_parsing(trending)
counting_things(trending) #sanity check: we need at least 5 articles left after filtering
how many items in the set?
19

how many have an image url?
10

how many have a non-commons image?
12
#sample
pprint(dict(list(trending.items())[0:2]))
{'2017–18 United States network television schedule': {'description': '',
                                                       'file name': 'OOjs_UI_icon_article-rtl.svg',
                                                       'image url': '',
                                                       'on commons': False,
                                                       'rank': 3.0130928849244607},
 'Mitochondrial DNA depletion syndrome': {'description': '',
                                          'file name': 'Autosomal_recessive_-_en.svg',
                                          'image url': 'https://upload.wikimedia.org/wikipedia/commons/f/f1/Autosomal_recessive_-_en.svg',
                                          'on commons': True,
                                          'rank': 2.2671748363841844}}
#save for later
timestr = time.strftime("%Y%m%d-%H%M%S")
with open('data/trending_{}.json'.format(timestr), 'w') as fout:
json.dump(trending,fout,sort_keys = True)
print("last saved version: " + timestr)
last saved version: 20170725-142749
#how much overlap?
for k in trending.keys():
if k in top_read.keys():
print(k)
Justice League (film)
Deaths in 2017
Anthony Scaramucci
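The same overlap check can be written as a one-liner, since intersecting the dicts' key sets gives the shared titles directly:
#equivalent overlap check using set intersection on the dict keys
print(set(trending) & set(top_read))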
Convert the dicts into lists of tuples for ranking, then truncate each list to 5 items after filtering out any duplicates in the top 5
top_read_sorted = sorted(top_read.items(),key=lambda x:getitem(x[1],'rank'), reverse=False)
#sample
pprint(top_read_sorted[0:2])
[('Chester Bennington', {'description': 'American musician',
                         'file name': 'Linkin_Park-Rock_im_Park_2014-_by_2eight_3SC0327.jpg',
                         'image url': 'https://upload.wikimedia.org/wikipedia/commons/6/6d/Linkin_Park-Rock_im_Park_2014-_by_2eight_3SC0327.jpg',
                         'on commons': True,
                         'rank': 3}),
 ('Dunkirk (2017 film)', {'description': '2017 film by Christopher Nolan',
                          'file name': 'Dunkirk_Film_poster.jpg',
                          'image url': 'https://upload.wikimedia.org/wikipedia/en/1/15/Dunkirk_Film_poster.jpg',
                          'on commons': False,
                          'rank': 5})]
trending_sorted = sorted(trending.items(),key=lambda x:getitem(x[1],'rank'), reverse=False)
#sample
pprint(trending_sorted[0:2])
[('Mitochondrial DNA depletion syndrome', {'description': '',
                                           'file name': 'Autosomal_recessive_-_en.svg',
                                           'image url': 'https://upload.wikimedia.org/wikipedia/commons/f/f1/Autosomal_recessive_-_en.svg',
                                           'on commons': True,
                                           'rank': 2.2671748363841844}),
 ('2017–18 United States network television schedule', {'description': '',
                                                        'file name': 'OOjs_UI_icon_article-rtl.svg',
                                                        'image url': '',
                                                        'on commons': False,
                                                        'rank': 3.0130928849244607})]
def distinct_lists(my_lists):
    """
    Takes two lists and compares their top 5 items.
    If duplicates are found in the top 5, choose a random copy from one list
    and remove it, so the next sequential item moves up.
    Recheck the lists to see if there are any more dupes.
    """
    atop = my_lists[0][0:5]
    btop = my_lists[1][0:5]
    if not any(i in btop for i in atop):
        return atop, btop
    elif (len(my_lists[0]) > 5 and len(my_lists[1]) > 5):
        dupes = [x for x in atop if x in btop]
        item_to_remove = dupes[0]
        seed()
        list_to_prune = choice(my_lists) #prune the dupe from one of the two lists at random
        list_to_prune.remove(item_to_remove)
        return distinct_lists(my_lists)
    else:
        print("ran out of options, couldn't find five distinct items per list") #need to return something, so the script doesn't choke?
#input order matches output order
top5_distinct = distinct_lists(([x[0] for x in top_read_sorted], [x[0] for x in trending_sorted]))
top5_read_and_trending = {'top read' : top5_distinct[0], 'trending' : top5_distinct[1]}
pprint(top5_read_and_trending)
{'top read': ['Chester Bennington',
              'Dunkirk (2017 film)',
              'Jordan Spieth',
              'John Heard (actor)',
              'Battle of Dunkirk'],
 'trending': ['Mitochondrial DNA depletion syndrome',
              '2017–18 United States network television schedule',
              'Lone Echo',
              'Maynooth University',
              'Patriarchy']}
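A quick sanity check of distinct_lists on toy data. The duplicate 'B' appears in both top 5s, so it gets pruned from one list at random and the exact output varies between runs, but the two returned lists are always disjoint:
#toy example: 'B' is a duplicate in both top 5s
toy_a = ['A', 'B', 'C', 'D', 'E', 'F']
toy_b = ['B', 'V', 'W', 'X', 'Y', 'Z']
pprint(distinct_lists((toy_a, toy_b)))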
timestr = time.strftime("%Y%m%d-%H%M%S")
print("last saved version: " + timestr)
with open('data/top5_read_and_trending_{}.json'.format(timestr), 'w') as fout:
json.dump(top5_read_and_trending,fout,sort_keys = False)
last saved version: 20170725-142749
with open(max(glob.glob('data/top5_read_and_trending*'), key=os.path.getctime)) as fin:
top5 = json.load(fin)
print(fin)
with open(max(glob.glob('data/top_read*'), key=os.path.getctime)) as fin:
top_read = json.load(fin)
print(fin)
with open(max(glob.glob('data/trending*'), key=os.path.getctime)) as fin:
trending = json.load(fin)
print(fin)
<_io.TextIOWrapper name='data/top5_read_and_trending_20170725-142749.json' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='data/top_read_20170725-142749.json' mode='r' encoding='UTF-8'>
<_io.TextIOWrapper name='data/trending_20170725-142749.json' mode='r' encoding='UTF-8'>
def filter_articles(dict_to_filter, lookup_list): #renamed so it doesn't shadow the builtin filter()
    return {k:v for k,v in dict_to_filter.items() if k in lookup_list}
top5_read = filter_articles(top_read, top5['top read'])
top5_trending = filter_articles(trending, top5['trending'])
top5_read_sorted = sorted(top5_read.items(),key=lambda x:getitem(x[1],'rank'), reverse=False)
top5_trending_sorted = sorted(top5_trending.items(),key=lambda x:getitem(x[1],'rank'), reverse=False)
page_template = """
<!-- {condition} -->
__NOEDITSECTION__
__NOTOC__
{{|cellpadding="5" style="margin-left: auto; margin-right: auto; border-spacing: 10px; background-color: transparent; border-top: solid 2px; border-bottom: solid 2px;"
|- style="border-top: solid 1px; vertical-align:top;"
| [[File:{image_1}|100px]]
| '''{title_1}'''<br/><span style="color:gray">{description_1}</span>
|- style="border-top: solid 1px; vertical-align:top;"
| [[File:{image_2}|100px]]
| '''{title_2}'''<br/><span style="color:gray">{description_2}</span>
|- style="border-top: solid 1px; vertical-align:top;"
| [[File:{image_3}|100px]]
| '''{title_3}'''<br/><span style="color:gray">{description_3}</span>
|- style="border-top: solid 1px; vertical-align:top;"
|[[File:{image_4}|100px]]
| '''{title_4}'''<br/><span style="color:gray">{description_4}</span>
|- style="border-top: solid 1px; vertical-align:top;"
| [[File:{image_5}|100px]]
| '''{title_5}'''<br/><span style="color:gray">{description_5}</span>
|}}
[[Category:Top_articles_user_study_Apr-May_2017]]
"""
def recommendation_section(condition, articles, output_template):
    """
    Takes a condition parameter, a list of (ar_title, ar_data_dict) tuples, and a wikitext template.
    Formats the template with the condition and the relevant info from the articles.
    Placeholder images for articles that lack one were already set in image_link_parsing.
    """
    rec_params = {}
    rec_params['condition'] = condition
    for i, x in enumerate(articles):
        rec_params['title_' + str(i + 1)] = x[0]
        rec_params['description_' + str(i + 1)] = x[1]['description']
        rec_params['image_' + str(i + 1)] = x[1]['file name']
    formatted_output = output_template.format(**rec_params)
    return formatted_output
def login_request(baseurl, username, password):
# Login request
payload = {'action': 'query', 'format': 'json', 'utf8': '', 'meta': 'tokens', 'type': 'login'}
r1 = requests.post(baseurl + 'api.php', data=payload)
# login confirm
login_token = r1.json()['query']['tokens']['logintoken']
payload = {'action': 'login', 'format': 'json', 'utf8': '', 'lgname': username, 'lgpassword': password, 'lgtoken': login_token}
r2 = requests.post(baseurl + 'api.php', data=payload, cookies=r1.cookies)
return r2
def token_request(baseurl, r2):
# get edit token
params3 = '?format=json&action=query&meta=tokens&continue='
r3 = requests.get(baseurl + 'api.php' + params3, cookies=r2.cookies)
edit_token = r3.json()['query']['tokens']['csrftoken']
edit_cookie = r2.cookies.copy()
edit_cookie.update(r3.cookies)
return edit_token, edit_cookie
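The manual cookie copying above could also be delegated to a requests.Session, which carries cookies across calls automatically. A sketch of an equivalent login-and-token flow (get_edit_token is an illustrative helper, not the version used in this notebook):
def get_edit_token(baseurl, username, password):
    #a Session keeps cookies between requests, so no manual copying is needed
    s = requests.Session()
    r = s.get(baseurl + 'api.php', params={'action': 'query', 'format': 'json', 'meta': 'tokens', 'type': 'login'})
    login_token = r.json()['query']['tokens']['logintoken']
    s.post(baseurl + 'api.php', data={'action': 'login', 'format': 'json', 'lgname': username, 'lgpassword': password, 'lgtoken': login_token})
    r = s.get(baseurl + 'api.php', params={'action': 'query', 'format': 'json', 'meta': 'tokens'})
    return s, r.json()['query']['tokens']['csrftoken']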
def publish(page_title, page_content, uname, upass):
base_url = 'https://test.wikipedia.org/w/'
login = login_request(base_url, uname, upass)
edit_token, edit_cookie = token_request(base_url, login)
summary = 'building test page for [[meta:Research:Comparing_most_read_and_trending_edits_for_Top_Articles_feature|Top articles user study]]'
headers={'User-Agent' : 'TopArticles user study', 'From' : 'jmorgan@wikimedia.org'}
# save action
payload = {'action': 'edit', 'assert': 'user', 'format': 'json', 'utf8': '', 'text': page_content,'summary': summary, 'title': page_title, 'token': edit_token}
r4 = requests.post(base_url + 'api.php', data=payload, cookies=edit_cookie, headers=headers)
print(r4.text)
timestr = time.strftime("%Y%m%d-%H%M%S")
print("Published page: " + page_title + " at " + timestr)
#format the page template
top5_read_output = recommendation_section('top read', top5_read_sorted, page_template)
top5_trending_output = recommendation_section('trending', top5_trending_sorted, page_template)
#should randomly assign
publish("Top_articles_1", top5_read_output, uname, upass)
{"edit":{"result":"Success","pageid":95788,"title":"Top articles 1","contentmodel":"wikitext","oldrevid":323682,"newrevid":323697,"newtimestamp":"2017-07-25T14:27:52Z"}} Published page: Top_articles_1 at 20170725-142752
#should randomly assign
publish("Top_articles_2", top5_trending_output, uname, upass)
{"edit":{"result":"Success","pageid":95789,"title":"Top articles 2","contentmodel":"wikitext","oldrevid":323681,"newrevid":323698,"newtimestamp":"2017-07-25T14:27:54Z"}} Published page: Top_articles_2 at 20170725-142754
Images hosted locally on English Wikipedia (rather than on Commons) won't render on testwiki, so I will manually upload these to test.wikipedia.org
def non_commons_images(articles):
"""
Take a list of (ar_title, ar_data_dict) tuples
Print info on those with images that aren't from Wikimedia Commons
"""
for a in articles:
        if len(a[1]['image url']) > 0 and not a[1]['on commons']:
print(a[0] + "\t" + a[1]['image url'])
non_commons_images(top5_read_sorted)
non_commons_images(top5_trending_sorted)
Dunkirk (2017 film)	https://upload.wikimedia.org/wikipedia/en/1/15/Dunkirk_Film_poster.jpg
Battle of Dunkirk	https://upload.wikimedia.org/wikipedia/en/8/86/Dunkirksoldier1.JPG
Maynooth University	https://upload.wikimedia.org/wikipedia/en/3/3c/NUIM_Symbol.svg
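Before uploading, it might be worth checking whether a file already exists locally on testwiki. A sketch using the standard MediaWiki query API (file_exists_on_testwiki is a hypothetical helper, not part of this notebook's run):
def file_exists_on_testwiki(file_name):
    #pages missing from the local wiki come back under the special '-1' pageid key
    #note: files served from Commons also show as locally missing, but will still render
    params = {'action': 'query', 'format': 'json',
              'titles': 'File:' + file_name, 'prop': 'imageinfo'}
    r = requests.get('https://test.wikipedia.org/w/api.php', params=params)
    return '-1' not in r.json()['query']['pages']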
timestr = time.strftime("%Y%m%d-%H%M%S")
print("last run: " + timestr)
last run: 20170725-142754