First I import the requests library:
import requests
I included 'revids' as one of the parameters in order to get the appropriate response and parsed through the json_data with the right keys to get the qid.
def revid_to_qid(revid, lang):
    """Takes a Wikipedia article revision ID and returns the corresponding Wikidata ID.

    Note: the revision ID is language-specific, so if the revision is from English Wikipedia,
    the pageprops API that is called must be the one associated with English Wikipedia.

    Args:
        revid: integer revision ID associated with an article in the provided language
        lang: the Wikipedia language version -- e.g., 'en' corresponds with English Wikipedia
            (en.wikipedia.org)

    Returns:
        qid: Wikidata ID associated with the article corresponding to the revision ID,
            or None if the revision has no linked Wikidata item
    """
    session = requests.Session()
    # Bug fix: the endpoint must match the language edition the revision belongs to;
    # previously "en.wikipedia.org" was hard-coded and `lang` was ignored.
    url = "https://{}.wikipedia.org/w/api.php".format(lang)
    params = {
        "action": "query",
        "revids": revid,
        "prop": "pageprops",
        "format": "json",
    }
    response = session.get(url=url, params=params)
    json_data = response.json()
    # Bug fix: the page-ID key in the response varies per article, so iterate over
    # the pages dict instead of hard-coding '29828568' (which only worked for one page).
    pages = json_data.get('query', {}).get('pages', {})
    for page in pages.values():
        return page.get('pageprops', {}).get('wikibase_item')
    return None
# Example call: look up the Wikidata item for revision 935784560 on English Wikipedia.
print(revid_to_qid(935784560, 'en'))
Q2427544
#had to install the mwbase package
pip install mwbase
Collecting mwbase Downloading https://files.pythonhosted.org/packages/34/e3/15bc8df648967af0ae317be56cd67fe408dc147d059b3f4eac1c7c8de741/mwbase-0.1.4-py3-none-any.whl Installing collected packages: mwbase Successfully installed mwbase-0.1.4 Note: you may need to restart the kernel to use updated packages.
import mwbase
I included the 'entity' parameter for the function below and the value for the 'action' parameter set to 'wbgetclaims'.
def qid_to_claims(qid):
    """Takes a Wikidata ID and returns a sequence of claims.

    Args:
        qid: Wikidata ID (e.g., 'Q2427544')

    Returns:
        claims: sequence of claims tuples of form (property, value) when the claim's
            value is itself a Wikidata entity, or (property,) when the value does not
            have a QID
    """
    session = requests.Session()
    # Bug fix: query the API endpoint itself; the previous URL pointed at the
    # documentation page ('...?action=help&modules=wbgetclaims'), which sent a
    # conflicting 'action' parameter along with the request.
    url = 'https://www.wikidata.org/w/api.php'
    params = {
        "action": "wbgetclaims",
        "format": "json",
        "entity": qid,
    }
    response = session.get(url=url, params=params)
    json_data = response.json()
    claims = []
    # Bug fix: the old code read claim['id'][:8] -- the *statement* ID, which starts
    # with the entity's own QID -- so every returned "value" equaled the input QID.
    # Read the mainsnak's datavalue instead, which holds the claim's actual value.
    for prop, statements in json_data.get('claims', {}).items():
        for statement in statements:
            datavalue = statement.get('mainsnak', {}).get('datavalue', {})
            value = datavalue.get('value')
            if isinstance(value, dict) and 'id' in value:
                claims.append((prop, value['id']))
            else:
                # Value is a string/quantity/time/etc. with no QID.
                claims.append((prop,))
    return claims
# Example call: list the claims attached to Wikidata item Q2427544.
print(qid_to_claims('Q2427544'))
[('P214', 'q2427544'), ('P569', 'Q2427544'), ('P856', 'Q2427544'), ('P31', 'Q2427544'), ('P646', 'Q2427544'), ('P19', 'Q2427544'), ('P27', 'Q2427544'), ('P1273', 'Q2427544'), ('P735', 'Q2427544'), ('P69', 'Q2427544'), ('P106', 'Q2427544'), ('P1233', 'Q2427544'), ('P1412', 'Q2427544'), ('P18', 'Q2427544'), ('P172', 'Q2427544'), ('P166', 'Q2427544'), ('P373', 'Q2427544'), ('P244', 'Q2427544'), ('P1411', 'Q2427544'), ('P227', 'Q2427544'), ('P345', 'Q2427544'), ('P648', 'Q2427544'), ('P742', 'Q2427544'), ('P213', 'Q2427544'), ('P5570', 'Q2427544'), ('P2963', 'Q2427544'), ('P691', 'Q2427544'), ('P1315', 'Q2427544'), ('P950', 'Q2427544'), ('P3630', 'Q2427544'), ('P1280', 'Q2427544'), ('P5714', 'Q2427544'), ('P2002', 'Q2427544'), ('P800', 'Q2427544'), ('P136', 'Q2427544'), ('P269', 'Q2427544'), ('P268', 'Q2427544'), ('P21', 'Q2427544'), ('P5408', 'Q2427544'), ('P5357', 'Q2427544'), ('P6553', 'Q2427544'), ('P22', 'Q2427544'), ('P1343', 'Q2427544'), ('P734', 'Q2427544'), ('P7400', 'Q2427544'), ('P7704', 'Q2427544')]
For the function below, assuming it accepts a list of tuples as claims and a dictionary as embeddings:
The 'claims_array' below flattens the list of claims and each item is used as an index in embeddings to get the corresponding embedding of the property/value using a list comprehension and everything converted into a numpy array.
import numpy as np
def claims_to_doc_embedding(claims, embeddings):
    """Takes a sequence of Wikidata claims and produces a document embedding.

    Args:
        claims: sequence of claims tuples -- (property, value) or (property,).
        embeddings: dict mapping each property/entity ID to its embedding,
            a sequence of floats all of the same length.

    Returns:
        document embedding: numpy array that is the average of the claims'
            embeddings; items with no embedding contribute a zero vector.

    Raises:
        ValueError: if `claims` or `embeddings` is empty (no average exists).
    """
    # Flatten the claim tuples into a single list of property/entity IDs.
    items = [item for tup in claims for item in tup]
    if not items or not embeddings:
        raise ValueError("need at least one claim item and a non-empty embeddings dict")
    # All embeddings share one dimensionality; use it for the zero vector
    # substituted for items with no embedding.
    # (Bug fixes: the old code named its set `no_embed` but computed the
    # *intersection* with the known keys, so any item that DID have an embedding
    # was overwritten with zeros; the zero vector was also built with an
    # off-by-one length of dim + 1, producing ragged arrays; and it mutated the
    # caller's embeddings dict as a side effect.)
    dim = len(next(iter(embeddings.values())))
    zero = [0.0] * dim
    vectors = np.array([embeddings.get(item, zero) for item in items], dtype=float)
    return vectors.mean(axis=0)
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
def cos_similarity(doc1, doc2):
    """Takes two document embeddings and returns their pairwise cosine similarities.

    Args:
        doc1: sequence of floats (document embedding)
        doc2: sequence of floats (document embedding), same length as doc1

    Returns:
        2x2 numpy array S where S[i][j] is the cosine similarity between
        embedding i and embedding j (the diagonal is 1 for non-zero inputs).
    """
    a = np.array([doc1, doc2], dtype=float)
    # Computed directly with numpy: the previous dense -> scipy-sparse -> sklearn
    # round-trip added conversion overhead without changing the result.
    norms = np.linalg.norm(a, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0  # avoid division by zero for all-zero embeddings
    normalized = a / norms
    return normalized @ normalized.T