In [ ]:
import pywikibot
In [1]:
pip install torch
Collecting torch
Note: you may need to restart the kernel to use updated packages.
In [2]:
pip install semantic_text_similarity
Collecting semantic_text_similarity
  Using cached semantic_text_similarity-1.0.3-py3-none-any.whl (416 kB)
Collecting torch
Killed

In [2]:
pip install numpy
Requirement already satisfied: numpy in /srv/paws/lib/python3.8/site-packages (1.21.5)
Note: you may need to restart the kernel to use updated packages.
In [ ]:
pip install wikipedia
In [ ]:
import wikipedia
import pywikibot
from pywikibot.data import api
import requests

from semantic_text_similarity.models import WebBertSimilarity
import numpy as np

ptwiki = pywikibot.Site('wikidata', 'wikidata') 
ptwiki_repo = ptwiki.data_repository() 

article_title_to_search = "Daviesia argillacea"
description_to_match = "Daviesia argillacea is a species of flowering plant in the family Fabaceae and is endemic to the south-west of Western Australia."

summary_to_match = wikipedia.summary(article_title_to_search)

# from https://github.com/AndriyMulyar/semantic-text-similarity
# WebBertSimilarity defaults to GPU prediction; run on CPU here
web_model = WebBertSimilarity(device='cpu', batch_size=10)

# From https://bitbucket.org/mikepeel/wikicode/src/master/example.py
def search_entities(site, itemtitle):
    """Search using 'Article Title' and return Wikidata entries"""
    params = {'action': 'wbsearchentities',
              'format': 'json',
              'language': 'en',
              'type': 'item',
              'search': itemtitle}
    request = api.Request(site=site, parameters=params)
    return request.submit()      

wikidataEntries = search_entities(ptwiki_repo, article_title_to_search)
if wikidataEntries['search'] != []:
    results = wikidataEntries['search']
    for result in results:
        qid = result['id']
        label = result['label']
        # Some search results have no description, so fall back to an empty string
        description = result.get('description', '')

        # Compare the Wikidata description with the target description and print the
        # QID when multiple entries are found for the given article title
        if description == description_to_match:
            print("Matched QID using string match - " + article_title_to_search + " is " + qid + " - " + description + "\n")

        else:
            print("Using the string-match method, the given string does not match this QID's description for the given article title!")
            
        """
        Match with the description of the provided 'Article Title's wiki page using 
        BERT-based semantic text similarity models and print the QIDs when multiple terms will be found against given 'Article Title'
        """
        predicted_model = web_model.predict([(description, description_to_match)])
        # Convert numpy dtypes to native python types
        prediction = getattr(predicted_model, "tolist", lambda: value)() 
        int_prediction = int(prediction[0])
        if int_prediction >= 3: 
            print ("Matched QID using Semantic Text Similarity - " + article_title_to_search + " is " + qid + " - " + description)
        
        else:
            print("The given string does not match this QID's description for the given article title!" + "\nSimilarity found: ", end='')
            print(predicted_model)

            # Fall back to comparing against the article's summary
            summary_model = web_model.predict([(summary_to_match, description_to_match)])
            summary_prediction = getattr(summary_model, "tolist", lambda: summary_model)()
            int_summary_prediction = int(summary_prediction[0])
            if int_summary_prediction >= 3:
                print("Matched the article's summary using Semantic Text Similarity.")
            else:
                print("The given string does not match the article's summary for the given article title!" + "\nSimilarity found: ", end='')
                print(summary_model)

else:
    print("No QID was found against the given string - " + article_title_to_search + ".")
In [ ]: