import pywikibot
# Notebook setup cells and their captured output (run these in the notebook or
# a shell; they are not Python statements):
#   pip install torch
#     Collecting torch Note: you may need to restart the kernel to use updated packages.
#   pip install semantic_text_similarity
#     Collecting semantic_text_similarity Using cached semantic_text_similarity-1.0.3-py3-none-any.whl (416 kB) Collecting torch Killed
#   pip install numpy
#     Requirement already satisfied: numpy in /srv/paws/lib/python3.8/site-packages (1.21.5) Note: you may need to restart the kernel to use updated packages.
#   pip install wikipedia
import wikipedia
import pywikibot
from pywikibot.data import api
import requests
from semantic_text_similarity.models import WebBertSimilarity
import numpy as np
# Inputs: the article title to look up and the description text that a
# Wikidata entry is expected to match.
article_title_to_search = "Daviesia argillacea"
description_to_match = (
    "Daviesia argillacea is a species of flowering plant in the family "
    "Fabaceae and is endemic to the south-west of Western Australia."
)

# Connection to Wikidata and its data repository.
# NOTE(review): despite the `ptwiki` prefix, both names point at the
# 'wikidata' site.
ptwiki = pywikibot.Site('wikidata', 'wikidata')
ptwiki_repo = ptwiki.data_repository()

# Summary text of the Wikipedia article with the same title.
summary_to_match = wikipedia.summary(article_title_to_search)

# Similarity model, from
# https://github.com/AndriyMulyar/semantic-text-similarity
# The library defaults to GPU prediction; request CPU explicitly here.
web_model = WebBertSimilarity(device='cpu', batch_size=10)
# From https://bitbucket.org/mikepeel/wikicode/src/master/example.py
def search_entities(site, itemtitle):
    """Run a ``wbsearchentities`` query for items matching ``itemtitle``.

    Args:
        site: Site/repository object the API request is issued against.
        itemtitle: Text to search for (sent as the ``search`` parameter,
            with ``language`` fixed to ``'en'`` and ``type`` to ``'item'``).

    Returns:
        The value returned by ``api.Request(...).submit()`` for the query.
    """
    query = dict(
        action='wbsearchentities',
        format='json',
        language='en',
        type='item',
        search=itemtitle,
    )
    return api.Request(site=site, parameters=query).submit()
def _first_score(predictions):
    """Return the first prediction as a native Python ``int`` (truncated).

    ``predictions`` is whatever ``web_model.predict`` returned. If it offers
    ``tolist()`` (as NumPy arrays do) that is used to get native Python
    values; otherwise it is treated as a plain sequence.
    """
    to_list = getattr(predictions, "tolist", None)
    values = to_list() if callable(to_list) else list(predictions)
    return int(values[0])


# Search Wikidata for the article title and compare every hit's description
# with the expected description: first by exact string equality, then with
# the semantic-similarity model.
wikidataEntries = search_entities(ptwiki_repo, article_title_to_search)
results = wikidataEntries.get('search')
if results:
    for result in results:
        qid = result['id']
        # 'description' is absent for entities that have none; skip them
        # instead of failing with KeyError.
        description = result.get('description')
        if not description:
            print(qid + " has no description to compare against; skipping.")
            continue

        if description == description_to_match:
            # Exact match with the provided description: report the QID
            # (several entries may be returned for one article title).
            print ("Matched QID using string match - " + article_title_to_search + " is " + qid + " - " + description + "\n")
            continue

        print ("Using String Match mathod given string doesnt match with the QID\'s discription of given article title!")
        # Fall back to the BERT-based semantic text similarity model and
        # report the QID when the score reaches the threshold.
        predicted_model = web_model.predict([(description, description_to_match)])
        # NOTE(review): 3 is the original cut-off; the score scale is
        # presumably 0-5 per the model's README - confirm.
        if _first_score(predicted_model) >= 3:
            print ("Matched QID using Semantic Text Similarity - " + article_title_to_search + " is " + qid + " - " + description)
        else:
            print ("Given string doesnt match with the QID\'s discription for given article title!" + "\nSimilarity found: " , end = '')
            print(predicted_model)

    # Matching the article's summary. Both inputs are independent of the
    # individual search results, so this comparison runs once, after the loop.
    summary_model = web_model.predict([(summary_to_match, description_to_match)])
    if _first_score(summary_model) >= 3:
        print ("Matched Article\'s Summary using Semantic Text Similarity.")
    else:
        print ("Given string doesnt match with the Article\'s discription for given article title!" + "\nSimilarity found: " , end = '')
        print(summary_model)
else:
    print("No QID was found against the given string - " + article_title_to_search + ".")