"""
Author: Christian Bouwense
Program that gets the revision data for a set of Wikipedia users and measures the burstiness of their edits.
"""
import time
import datetime as dt

import dateutil.parser as dup
import mwapi
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches


def get_user_revisions(user, uc_prop='timestamp', uc_start='today', uc_end='2000-01-01T00:00:00Z'):
    """Fetch a user's revision timestamps and store their interevent times, B, and M in user_data."""
    # Information specifying the user we are interested in
    uc_user = user

    # We always want these query parameters to be the same
    action = 'query'
    uc_list = 'usercontribs'
    uc_limit = 'max'

    # The caller can pass the string "today" instead of an explicit timestamp
    if uc_start == 'today':
        uc_start = dt.datetime.utcfromtimestamp(time.time()).strftime('%Y-%m-%dT%H:%M:%SZ')

    # Connect to Wikipedia
    session = mwapi.Session('https://en.wikipedia.org', user_agent='cbouwense')

    # Query Wikipedia for the user's contributions.
    # The result is stored in the dictionary "rev_dict".
    rev_dict = session.get(action=action,
                           list=uc_list,
                           ucuser=uc_user,
                           ucprop=uc_prop,
                           uclimit=uc_limit,
                           ucstart=uc_start,
                           ucend=uc_end)

    # Collect the timestamp of every revision the user made
    rev_timestamps = []
    for rev in rev_dict['query']['usercontribs']:
        rev_timestamps.append(dup.parse(rev['timestamp']))

    # Check for a "continue" section in the response.
    # If it is present, the query did not return all of the data because of
    # the per-request result limit, so keep querying with the supplied
    # continuation token until it disappears.
    print("Retrieving data on %s from Wikipedia..." % uc_user)
    while 'continue' in rev_dict:
        continue_val = rev_dict['continue']['uccontinue']
        rev_dict = session.get(action=action,
                               list=uc_list,
                               ucuser=uc_user,
                               ucprop=uc_prop,
                               uclimit=uc_limit,
                               ucstart=uc_start,
                               ucend=uc_end,
                               uccontinue=continue_val)
        for rev in rev_dict['query']['usercontribs']:
            rev_timestamps.append(dup.parse(rev['timestamp']))

    # Compute the interevent times (in seconds) between consecutive revisions.
    # Timestamps come back newest first, so subtract the later entry from the earlier one.
    interevent_times = []
    for i in range(len(rev_timestamps) - 1):
        interevent_times.append((rev_timestamps[i] - rev_timestamps[i + 1]).total_seconds())

    # Create an entry in user_data for the current user and add the data
    user_data[uc_user] = {}
    user_data[uc_user]['interevent_times'] = interevent_times
    get_B(uc_user)
    get_M(uc_user)
    print("Data received successfully!")


def get_B(user):
    """Compute the burstiness parameter B from a user's interevent times."""
    # B compares the standard deviation and the mean of the interevent times
    interevent_mean = np.mean(user_data[user]['interevent_times'])
    interevent_std_dev = np.std(user_data[user]['interevent_times'])
    B = (interevent_std_dev - interevent_mean) / (interevent_std_dev + interevent_mean)
    user_data[user]['B'] = B
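# Intuition for B (illustrative values, not computed by this script): perfectly
# regular interevent times have standard deviation 0, so B = -1; a Poisson-like
# editor has standard deviation roughly equal to the mean, so B is near 0;
# heavy-tailed (bursty) interevent times push B toward +1.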


def get_M(user):
    """Compute the memory coefficient M from a user's interevent times."""
    # Store the interevent times in a variable with a much shorter name
    times = user_data[user]['interevent_times']

    # Means and standard deviations of all interevent times except the last one
    # (mean_1, std_dev_1) and of all except the first one (mean_2, std_dev_2)
    mean_1 = np.mean(times[:-1])
    mean_2 = np.mean(times[1:])
    std_dev_1 = np.std(times[:-1])
    std_dev_2 = np.std(times[1:])

    # Sum the normalized products of consecutive interevent times
    summation = 0
    for i in range(len(times) - 1):
        tau_i = times[i]
        tau_i_plus_one = times[i + 1]
        summation += ((tau_i - mean_1) * (tau_i_plus_one - mean_2)) / (std_dev_1 * std_dev_2)

    M = summation / (len(times) - 1)
    user_data[user]['M'] = M
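# Note: M as computed above is the Pearson correlation between consecutive
# interevent times, so (as a sanity check, not used by the script) it should
# match np.corrcoef(times[:-1], times[1:])[0, 1] whenever the list has at
# least three entries and the slices are not constant.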


# Global dictionary holding each user's interevent times, B, and M
user_data = {}

# Wikipedia editors to analyze
users = [
    'Ser Amantio di Nicolao',
    'koavf',
    'Rich Farmbrough',
    'Waacstats',
    'BD2412',
    'Materialscientist',
    'Bearcat',
    'Hmains',
    'Magioladitis',
    'Rjwilmsi']
# Retrieve revision data for every user in the list
for user in users:
    get_user_revisions(user)

# Print each user's burstiness B and memory coefficient M
for user in users:
    print("%s: " % user)
    print(user_data[user]['B'])
    print(user_data[user]['M'])
    print("\n")