In this notebook we focus on maximizing recall of articles related to COVID-19.
First we take all Wikidata items that link to the main COVID-19 pages, COVID-19 (Q84263196) and the 2019–20 COVID-19 pandemic (Q81068910); we then join both sets and do a final pass to extract the relationships between the seeds and all the resulting items.
# Install dependencies
!pip install SPARQLWrapper
!pip install -U pandas
Requirement already satisfied: SPARQLWrapper in /srv/paws/lib/python3.6/site-packages
Requirement already satisfied: rdflib>=4.0 in /srv/paws/lib/python3.6/site-packages (from SPARQLWrapper)
Requirement already satisfied: isodate in /srv/paws/lib/python3.6/site-packages (from rdflib>=4.0->SPARQLWrapper)
Requirement already satisfied: pyparsing in /srv/paws/lib/python3.6/site-packages (from rdflib>=4.0->SPARQLWrapper)
Requirement already satisfied: six in /srv/paws/lib/python3.6/site-packages (from isodate->rdflib>=4.0->SPARQLWrapper)
Requirement already up-to-date: pandas in /srv/paws/lib/python3.6/site-packages
Requirement already up-to-date: numpy>=1.13.3 in /srv/paws/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: python-dateutil>=2.6.1 in /srv/paws/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: pytz>=2017.2 in /srv/paws/lib/python3.6/site-packages (from pandas)
Requirement already up-to-date: six>=1.5 in /srv/paws/lib/python3.6/site-packages (from python-dateutil>=2.6.1->pandas)
import pandas as pd
now = pd.Timestamp.now()
# Getting the seed based on what links to coronavirus disease 2019 (Q84263196) in Wikidata
import requests
whatLinks = []
# COVID-19 (Q84263196)
base_url = 'https://www.wikidata.org/w/api.php?action=query&format=json&list=backlinks&bltitle=Q84263196&bllimit=500&blnamespace=0'
response = requests.get(url=base_url).json()
whatLinks.extend(response['query']['backlinks'])
while 'continue' in response:
    # rebuild the URL from the base on each pass so we don't accumulate blcontinue parameters
    url = base_url + '&blcontinue=' + response['continue']['blcontinue']
    response = requests.get(url=url).json()
    whatLinks.extend(response['query']['backlinks'])
QswhatLinks = {v['title'] for v in whatLinks}
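The same harvest-backlinks-and-follow-continuation pattern is repeated for each seed below, so it could be factored into a helper. A minimal sketch (the get_backlinks helper is not part of the original notebook):
def get_backlinks(qid):
    '''Collect all main-namespace backlinks to a Wikidata entity, following API continuation.'''
    base_url = ('https://www.wikidata.org/w/api.php?action=query&format=json'
                '&list=backlinks&bltitle=%s&bllimit=500&blnamespace=0' % qid)
    links = []
    response = requests.get(url=base_url).json()
    links.extend(response['query']['backlinks'])
    while 'continue' in response:
        response = requests.get(url=base_url + '&blcontinue=' + response['continue']['blcontinue']).json()
        links.extend(response['query']['backlinks'])
    return {v['title'] for v in links}
With it, each seed harvest becomes a one-liner, e.g. QswhatLinks = get_backlinks('Q84263196').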
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
#https://w.wiki/KvX (Thanks User:Dipsacus_fullonum)
# All statements with item, property, value and rank that have COVID-19 (Q84263196) as a qualifier value.
sparql.setQuery("""
SELECT ?item ?itemLabel ?property ?propertyLabel ?value ?valueLabel ?rank ?qualifier ?qualifierLabel
WHERE
{
?item ?claim ?statement.
?property wikibase:claim ?claim.
?property wikibase:statementProperty ?sprop.
?statement ?sprop ?value.
?statement wikibase:rank ?rank.
?statement ?qprop wd:Q84263196. # COVID-19
?qualifier wikibase:qualifier ?qprop.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
allStatements = pd.json_normalize(results['results']['bindings'])
allStatements['valueLabel.value'].value_counts()
disease outbreak                 437
human                              7
treatment                          2
epidemiological surveillance       1
vaccine                            1
drug repositioning                 1
mascot character                   1
diagnostic test                    1
pandemic                           1
hierarchy of hazard controls       1
pneumonia                          1
moe anthropomorphic character      1
drug development                   1
2020-03-05T00:00:00Z               1
medical diagnosis                  1
Name: valueLabel.value, dtype: int64
# All truthy statements with COVID-19 (Q84263196) as value.
#https://w.wiki/KvZ (Thanks User:Dipsacus_fullonum)
sparql.setQuery("""
SELECT ?item ?itemLabel ?property ?propertyLabel
WHERE
{
?item ?claim wd:Q84263196.
?property wikibase:directClaim ?claim.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
truthy = pd.json_normalize(results['results']['bindings'])
truthyQ = [ link.split('/')[-1] for link in truthy['item.value'].tolist()]
allStatementsQ = [ link.split('/')[-1] for link in allStatements['item.value'].tolist()]
allSPARQL = set(truthyQ).union(set(allStatementsQ))
allSPARQL - QswhatLinks
{'L253474-S1'}
QswhatLinks - allSPARQL
{'Q10304982', 'Q66777139', 'Q84420257', 'Q85110277', 'Q88019029', 'Q88870103'}
Q84263196AllItems = allSPARQL.union(QswhatLinks)
# 2019–20 COVID-19 pandemic (Q81068910)
whatLinks2 = []
base_url = 'https://www.wikidata.org/w/api.php?action=query&format=json&list=backlinks&bltitle=Q81068910&bllimit=500&blnamespace=0'
response = requests.get(url=base_url).json()
whatLinks2.extend(response['query']['backlinks'])
while 'continue' in response:
    url = base_url + '&blcontinue=' + response['continue']['blcontinue']
    response = requests.get(url=url).json()
    whatLinks2.extend(response['query']['backlinks'])
QswhatLinks2 = {v['title'] for v in whatLinks2}
# All truthy statements with 2019–20 COVID-19 pandemic (Q81068910) as value.
#https://w.wiki/Kvd (Thanks User:Dipsacus_fullonum)
sparql.setQuery("""
#
SELECT ?item ?itemLabel ?property ?propertyLabel WHERE {
?item ?claim wd:Q81068910. #2019–20 COVID-19 pandemic
?property wikibase:directClaim ?claim.
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
Q81068910 = pd.json_normalize(results['results']['bindings'])
Q81068910SPARQL = [ link.split('/')[-1] for link in Q81068910['item.value'].tolist()]
Q81068910SPARQL = set(Q81068910SPARQL)
len(QswhatLinks2 - Q81068910SPARQL)
1111
len(Q81068910SPARQL - QswhatLinks2)
510
Q81068910All = Q81068910SPARQL.union(QswhatLinks2)
# Joining the full set from both seeds
Qs = Q81068910All.union(Q84263196AllItems)
## add the seeds themselves (by definition an item does not link to itself, so they were left out)
Qs = Qs.union({'Q81068910','Q84263196'})
len(Qs)
2771
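As a quick sanity check (an addition, not part of the original run) we can assert that both seeds made it into the final set:
assert {'Q81068910', 'Q84263196'} <= Qs  # both seed items are present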
# Adding a third seed, SARS-CoV-2 (Q82069695)
whatLinks3 = []
base_url = 'https://www.wikidata.org/w/api.php?action=query&format=json&list=backlinks&bltitle=Q82069695&bllimit=500&blnamespace=0'
response = requests.get(url=base_url).json()
whatLinks3.extend(response['query']['backlinks'])
while 'continue' in response:
    url = base_url + '&blcontinue=' + response['continue']['blcontinue']
    response = requests.get(url=url).json()
    whatLinks3.extend(response['query']['backlinks'])
QswhatLinks3 = {v['title'] for v in whatLinks3}
Qs = Qs.union(QswhatLinks3)
len(Qs)
2771
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
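For instance, chunking five Qs into groups of two:
list(chunks(['Q1', 'Q2', 'Q3', 'Q4', 'Q5'], 2))  # [['Q1', 'Q2'], ['Q3', 'Q4'], ['Q5']]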
wikidata_query_base = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&props=aliases|claims|datatype|descriptions|info|labels|sitelinks|sitelinks/urls&ids='
itemsInfo = {}
c = 0
for items in chunks(list(Qs), 50):  # wbgetentities accepts at most 50 ids per request
    c += 50
    if c % 200 == 0: print(c, 'items reviewed')
    url = wikidata_query_base + '|'.join(items)
    itemsInfo.update(requests.get(url=url).json()['entities'])
200 items reviewed
400 items reviewed
600 items reviewed
800 items reviewed
1000 items reviewed
1200 items reviewed
1400 items reviewed
1600 items reviewed
1800 items reviewed
2000 items reviewed
2200 items reviewed
2400 items reviewed
2600 items reviewed
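Each entry of itemsInfo is a full wbgetentities record. For example, the English label of the COVID-19 seed (which is in Qs and therefore fetched) can be read like this:
itemsInfo['Q84263196']['labels']['en']['value']  # 'COVID-19'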
def getRelationships(claims, targetQs):
    '''
    This function receives the claims of a Wikidata item and a list of target Qs.
    It iterates over the claims looking for the target Qs and returns (property, target Q) pairs.
    For example, if it finds an instance of (P31) claim pointing to Q12323 (which is in the
    target list), it will return [['P31', 'Q12323']].
    inputs:
        claims: object, result from Wikidata queries like
                'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&ids=Q5'
        targetQs: list of str, where str are Q values
    output:
        a list of [prop, target] pairs
    '''
    pairs = []
    for prop, relationships in claims.items():
        for relationship in relationships:
            if 'mainsnak' in relationship:
                datatype = relationship['mainsnak'].get('datatype', '')
                if datatype == 'wikibase-item':
                    try:  # found some cases without an id even for a wikibase-item datatype
                        Qfound = relationship['mainsnak']['datavalue']['value'].get('id', '')
                        if Qfound in targetQs:
                            pairs.append([prop, Qfound])
                    except (KeyError, TypeError):
                        pass
    if not pairs:
        pairs.append(['unknown', 'unknown'])
    return pairs
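A small illustration with a hand-built claims fragment (hypothetical data, just to show the shape of the input and output):
exampleClaims = {'P921': [{'mainsnak': {'datatype': 'wikibase-item',
                                        'datavalue': {'value': {'id': 'Q84263196'}}}}]}
getRelationships(exampleClaims, ['Q81068910', 'Q84263196'])  # [['P921', 'Q84263196']]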
def getValueIfWikidataItem(claim):
    '''
    Returns the list of values for a given claim, if those values point to a Wikidata item
    (datatype == 'wikibase-item').
    input:
        claim: object
    output:
        wikidataItems: list of str
    '''
    output = []
    for relationship in claim:
        if 'mainsnak' in relationship:
            datatype = relationship['mainsnak'].get('datatype', '')
            if datatype == 'wikibase-item':
                # guard against snaks without a datavalue (novalue/somevalue)
                Qfound = relationship['mainsnak'].get('datavalue', {}).get('value', {}).get('id', '')
                output.append(Qfound)
    if not output:
        output.append('unknown')
    return output
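For instance (hypothetical snak data):
getValueIfWikidataItem([{'mainsnak': {'datatype': 'wikibase-item',
                                      'datavalue': {'value': {'id': 'Q5'}}}}])  # ['Q5']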
pagesPerProject = {}
pagesPerProjectTable = {}
itemsInfoTable = {}
labelsEn = {}
for item, v in itemsInfo.items():
    itemsInfoTable[item] = {}
    try:
        itemsInfoTable[item]['item Label'] = v['labels']['en']['value']
    except KeyError:
        itemsInfoTable[item]['item Label'] = 'unknown'
    # checking if there are claims for that Q; if there are none we use an empty dict, to avoid errors
    claims = v.get('claims', {})
    if 'P31' in claims:  # getting instance of (P31) to classify the item
        itemsInfoTable[item]['Instance Of'] = getValueIfWikidataItem(claims.get('P31'))
    else:
        itemsInfoTable[item]['Instance Of'] = ['unknown']
    # find COVID-19 / COVID-19 pandemic relationships
    itemsInfoTable[item]['RelationTuple'] = getRelationships(claims, ['Q81068910', 'Q84263196'])
    if 'sitelinks' in v:
        for wiki, data in v['sitelinks'].items():
            page = data['title']
            project = '%s.%s' % (data['url'][8:].split('.')[0], data['url'][8:].split('.')[1])  # could be more elegant with a regex
            pagesPerProject[project] = pagesPerProject.get(project, [])
            pagesPerProject[project].append(page)
            article_link = data['url']
            if project.split('.')[1] == 'wikipedia' or project.split('.')[0] == 'commons':  # iwlinks: https://meta.wikimedia.org/wiki/Help:Interwiki_linking
                projectcode = project.split('.')[0]
            else:
                projectcode = '%s:%s' % (project.split('.')[1], project.split('.')[0])
            wikilink = '[[%s:%s|%s]]' % (projectcode, page, page)
            pagesPerProjectTable[article_link] = {'project': project, 'page': page, 'wikidataItem': item, 'wikilink': wikilink}
itemsInfoTable = pd.DataFrame.from_dict(itemsInfoTable,orient='index')
pagesPerProjectTable = pd.DataFrame.from_dict(pagesPerProjectTable,orient='index')
# FINAL VERSION of pagesPerProjectTable
pagesPerProjectTable['url'] = pagesPerProjectTable.index
pagesPerProjectTable
 | project | page | wikidataItem | wikilink | url |
---|---|---|---|---|---
https://commons.wikimedia.org/wiki/Category:Thiago_Seyboth_Wild | commons.wikimedia | Category:Thiago Seyboth Wild | Q50198692 | [[commons:Category:Thiago Seyboth Wild|Categor... | https://commons.wikimedia.org/wiki/Category:Th... |
https://de.wikipedia.org/wiki/Thiago_Seyboth_Wild | de.wikipedia | Thiago Seyboth Wild | Q50198692 | [[de:Thiago Seyboth Wild|Thiago Seyboth Wild]] | https://de.wikipedia.org/wiki/Thiago_Seyboth_Wild |
https://en.wikipedia.org/wiki/Thiago_Seyboth_Wild | en.wikipedia | Thiago Seyboth Wild | Q50198692 | [[en:Thiago Seyboth Wild|Thiago Seyboth Wild]] | https://en.wikipedia.org/wiki/Thiago_Seyboth_Wild |
https://es.wikipedia.org/wiki/Thiago_Seyboth_Wild | es.wikipedia | Thiago Seyboth Wild | Q50198692 | [[es:Thiago Seyboth Wild|Thiago Seyboth Wild]] | https://es.wikipedia.org/wiki/Thiago_Seyboth_Wild |
https://fr.wikipedia.org/wiki/Thiago_Seyboth_Wild | fr.wikipedia | Thiago Seyboth Wild | Q50198692 | [[fr:Thiago Seyboth Wild|Thiago Seyboth Wild]] | https://fr.wikipedia.org/wiki/Thiago_Seyboth_Wild |
... | ... | ... | ... | ... | ... |
https://pt.wikipedia.org/wiki/Pandemia_de_COVID-19_no_Uzbequist%C3%A3o | pt.wikipedia | Pandemia de COVID-19 no Uzbequistão | Q87755912 | [[pt:Pandemia de COVID-19 no Uzbequistão|Pande... | https://pt.wikipedia.org/wiki/Pandemia_de_COVI... |
https://ta.wikipedia.org/wiki/2020_%E0%AE%89%E0%AE%9A%E0%AF%81%E0%AE%AA%E0%AF%86%E0%AE%95%E0%AF%8D%E0%AE%95%E0%AE%BF%E0%AE%9A%E0%AF%81%E0%AE%A4%E0%AE%BE%E0%AE%A9%E0%AE%BF%E0%AE%B2%E0%AF%8D_%E0%AE%95%E0%AF%8A%E0%AE%B0%E0%AF%8B%E0%AE%A9%E0%AE%BE%E0%AE%B5%E0%AF%88%E0%AE%B0%E0%AE%9A%E0%AF%81%E0%AE%A4%E0%AF%8D_%E0%AE%A4%E0%AF%8A%E0%AE%B1%E0%AF%8D%E0%AE%B1%E0%AF%81 | ta.wikipedia | 2020 உசுபெக்கிசுதானில் கொரோனாவைரசுத் தொற்று | Q87755912 | [[ta:2020 உசுபெக்கிசுதானில் கொரோனாவைரசுத் தொற்... | https://ta.wikipedia.org/wiki/2020_%E0%AE%89%E... |
https://tr.wikipedia.org/wiki/%C3%96zbekistan%27da_2020_koronavir%C3%BCs_pandemisi | tr.wikipedia | Özbekistan'da 2020 koronavirüs pandemisi | Q87755912 | [[tr:Özbekistan'da 2020 koronavirüs pandemisi|... | https://tr.wikipedia.org/wiki/%C3%96zbekistan%... |
https://uz.wikipedia.org/wiki/O%CA%BBzbekistonda_COVID-19_pandemiyasi | uz.wikipedia | Oʻzbekistonda COVID-19 pandemiyasi | Q87755912 | [[uz:Oʻzbekistonda COVID-19 pandemiyasi|Oʻzbek... | https://uz.wikipedia.org/wiki/O%CA%BBzbekiston... |
https://vi.wikipedia.org/wiki/%C4%90%E1%BA%A1i_d%E1%BB%8Bch_COVID-19_t%E1%BA%A1i_Uzbekistan | vi.wikipedia | Đại dịch COVID-19 tại Uzbekistan | Q87755912 | [[vi:Đại dịch COVID-19 tại Uzbekistan|Đại dịch... | https://vi.wikipedia.org/wiki/%C4%90%E1%BA%A1i... |
10538 rows × 5 columns
itemsInfoTable = itemsInfoTable.explode('Instance Of').explode('RelationTuple')
itemsInfoTable['connector'] = itemsInfoTable['RelationTuple'].apply(lambda x:x[0])
itemsInfoTable['connected To'] = itemsInfoTable['RelationTuple'].apply(lambda x:x[1])
itemsInfoTable.drop('RelationTuple',inplace=True,axis=1)
connectedToLabel = {'Q84263196':'COVID-19', 'Q81068910':'2019–20 COVID-19 pandemic'}
itemsInfoTable['connected To Label'] = itemsInfoTable['connected To'].apply(lambda x:connectedToLabel.get(x))
## Getting labels for connectors (properties)
Ps = list(itemsInfoTable['connector'].unique())
props = []
for P in Ps:
    props.append(requests.get('https://www.wikidata.org/w/api.php?action=wbgetentities&ids=%s&format=json' % P).json())
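As a side note, wbgetentities accepts up to 50 ids per request, so (assuming the number of distinct properties stays under 50, as it does here) the loop above could be collapsed into a single batched call. A sketch, not part of the original run:
# Batched alternative (sketch): one request for all property labels at once.
# The 'unknown' placeholder must be filtered out first, since a malformed id
# would fail the whole batched call. Wrapping the response in a list keeps the
# label-extraction loop below unchanged.
valid_Ps = [P for P in Ps if P.startswith('P')]
url = ('https://www.wikidata.org/w/api.php?action=wbgetentities'
       '&ids=%s&props=labels&languages=en&format=json' % '|'.join(valid_Ps))
props = [requests.get(url=url).json()]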
propLabels = {}
for P in props:
    if 'entities' in P:
        for Pid, data in P['entities'].items():
            tmplabel = data.get('labels', {}).get('en', {})
            propLabels[Pid] = tmplabel.get('value', 'unknown')
propLabels = pd.DataFrame.from_dict(propLabels,orient='index',columns=['connector Label'])
propLabels['connector'] = propLabels.index
itemsInfoTable = itemsInfoTable.join(propLabels, on='connector',rsuffix='_tmp').drop('connector_tmp',axis=1)
itemsInfoTable['item_id'] = itemsInfoTable.index
itemsInfoTable
 | item Label | Instance Of | connector | connected To | connected To Label | connector Label | item_id |
---|---|---|---|---|---|---|---
Q50198692 | Thiago Seyboth Wild | Q5 | P1050 | Q84263196 | COVID-19 | medical condition | Q50198692 |
Q88938156 | 2020 coronavirus pandemic in Swiss canton AR | Q3241045 | unknown | unknown | None | NaN | Q88938156 |
Q88976185 | CORONAVIRUS IN PREGNANCY AND DELIVERY: RAPID R... | Q13442814 | P921 | Q84263196 | COVID-19 | main subject | Q88976185 |
Q88976185 | CORONAVIRUS IN PREGNANCY AND DELIVERY: RAPID R... | Q13442814 | P921 | Q81068910 | 2019–20 COVID-19 pandemic | main subject | Q88976185 |
Q88973815 | Exploring the coronavirus epidemic using the n... | Q13442814 | P921 | Q84263196 | COVID-19 | main subject | Q88973815 |
... | ... | ... | ... | ... | ... | ... | ... |
Q87461608 | Potential interventions for novel coronavirus ... | Q13442814 | P921 | Q84263196 | COVID-19 | main subject | Q87461608 |
Q87349559 | Category:People with coronavirus disease 2019 | Q4167836 | P971 | Q84263196 | COVID-19 | category combines topics | Q87349559 |
Q87755912 | 2020 coronavirus pandemic in Uzbekistan | Q3241045 | unknown | unknown | None | NaN | Q87755912 |
Q88974700 | Comparison of throat swabs and sputum specimen... | Q13442814 | P921 | Q84263196 | COVID-19 | main subject | Q88974700 |
Q88974700 | Comparison of throat swabs and sputum specimen... | Q13442814 | P921 | Q81068910 | 2019–20 COVID-19 pandemic | main subject | Q88974700 |
3984 rows × 7 columns
## Getting instance of labels
instanceOfQs = list(itemsInfoTable['Instance Of'].unique())
print(len(instanceOfQs))
QiOf = []  # instance of Qs
for Q in instanceOfQs:
    QiOf.append(requests.get('https://www.wikidata.org/w/api.php?action=wbgetentities&ids=%s&format=json' % Q).json())
98
QiOfLabels = {}
for Q in QiOf:
    if 'entities' in Q:
        for Qid, data in Q['entities'].items():
            tmplabel = data.get('labels', {}).get('en', {})
            QiOfLabels[Qid] = tmplabel.get('value', 'unknown')
QiOfLabels = pd.DataFrame.from_dict(QiOfLabels, orient='index', columns=['Instance Of Label'])
QiOfLabels['Instance Of'] = QiOfLabels.index
# FINAL VERSION of the info table
itemsInfoTable = itemsInfoTable.join(QiOfLabels, on='Instance Of', rsuffix='_tmp').drop('Instance Of_tmp', axis=1)
nonHumans = itemsInfoTable[itemsInfoTable['Instance Of Label'] != 'human']
nonHumans
 | item Label | Instance Of | connector | connected To | connected To Label | connector Label | item_id | Instance Of Label |
---|---|---|---|---|---|---|---|---
Q88938156 | 2020 coronavirus pandemic in Swiss canton AR | Q3241045 | unknown | unknown | None | NaN | Q88938156 | disease outbreak |
Q88976185 | CORONAVIRUS IN PREGNANCY AND DELIVERY: RAPID R... | Q13442814 | P921 | Q84263196 | COVID-19 | main subject | Q88976185 | scholarly article |
Q88976185 | CORONAVIRUS IN PREGNANCY AND DELIVERY: RAPID R... | Q13442814 | P921 | Q81068910 | 2019–20 COVID-19 pandemic | main subject | Q88976185 | scholarly article |
Q88973815 | Exploring the coronavirus epidemic using the n... | Q13442814 | P921 | Q84263196 | COVID-19 | main subject | Q88973815 | scholarly article |
Q88973815 | Exploring the coronavirus epidemic using the n... | Q13442814 | P921 | Q81068910 | 2019–20 COVID-19 pandemic | main subject | Q88973815 | scholarly article |
... | ... | ... | ... | ... | ... | ... | ... | ... |
Q87461608 | Potential interventions for novel coronavirus ... | Q13442814 | P921 | Q84263196 | COVID-19 | main subject | Q87461608 | scholarly article |
Q87349559 | Category:People with coronavirus disease 2019 | Q4167836 | P971 | Q84263196 | COVID-19 | category combines topics | Q87349559 | Wikimedia category |
Q87755912 | 2020 coronavirus pandemic in Uzbekistan | Q3241045 | unknown | unknown | None | NaN | Q87755912 | disease outbreak |
Q88974700 | Comparison of throat swabs and sputum specimen... | Q13442814 | P921 | Q84263196 | COVID-19 | main subject | Q88974700 | scholarly article |
Q88974700 | Comparison of throat swabs and sputum specimen... | Q13442814 | P921 | Q81068910 | 2019–20 COVID-19 pandemic | main subject | Q88974700 | scholarly article |
3184 rows × 8 columns
nonHumansPages = nonHumans.join(pagesPerProjectTable.set_index('wikidataItem'))
nonHumansPages
 | item Label | Instance Of | connector | connected To | connected To Label | connector Label | item_id | Instance Of Label | project | page | wikilink | url |
---|---|---|---|---|---|---|---|---|---|---|---|---
Q103177 | severe acute respiratory syndrome | Q18123741 | P1542 | Q81068910 | 2019–20 COVID-19 pandemic | has effect | Q103177 | infectious disease | af.wikipedia | Ernstige akute respiratoriese sindroom | [[af:Ernstige akute respiratoriese sindroom|Er... | https://af.wikipedia.org/wiki/Ernstige_akute_r... |
Q103177 | severe acute respiratory syndrome | Q18123741 | P1542 | Q81068910 | 2019–20 COVID-19 pandemic | has effect | Q103177 | infectious disease | ar.wikipedia | متلازمة تنفسية حادة وخيمة | [[ar:متلازمة تنفسية حادة وخيمة|متلازمة تنفسية ... | https://ar.wikipedia.org/wiki/%D9%85%D8%AA%D9%... |
Q103177 | severe acute respiratory syndrome | Q18123741 | P1542 | Q81068910 | 2019–20 COVID-19 pandemic | has effect | Q103177 | infectious disease | ast.wikipedia | Síndrome respiratoriu agudu grave | [[ast:Síndrome respiratoriu agudu grave|Síndro... | https://ast.wikipedia.org/wiki/S%C3%ADndrome_r... |
Q103177 | severe acute respiratory syndrome | Q18123741 | P1542 | Q81068910 | 2019–20 COVID-19 pandemic | has effect | Q103177 | infectious disease | azb.wikipedia | سارس | [[azb:سارس|سارس]] | https://azb.wikipedia.org/wiki/%D8%B3%D8%A7%D8... |
Q103177 | severe acute respiratory syndrome | Q18123741 | P1542 | Q81068910 | 2019–20 COVID-19 pandemic | has effect | Q103177 | infectious disease | be.wikipedia | Цяжкі востры рэспіраторны сіндром | [[be:Цяжкі востры рэспіраторны сіндром|Цяжкі в... | https://be.wikipedia.org/wiki/%D0%A6%D1%8F%D0%... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Q89368376 | COVID-19 crisis 2020 at the GGD Hollands Midden | Q3241045 | unknown | unknown | None | NaN | Q89368376 | disease outbreak | NaN | NaN | NaN | NaN |
Q89368378 | COVID-19 crisis 2020 at the GGD Rotterdam-Rijn... | Q3241045 | unknown | unknown | None | NaN | Q89368378 | disease outbreak | NaN | NaN | NaN | NaN |
Q89368379 | COVID-19 crisis 2020 at the GGD Zuid-Holland-Zuid | Q3241045 | unknown | unknown | None | NaN | Q89368379 | disease outbreak | NaN | NaN | NaN | NaN |
Q89375395 | Novel Coronavirus (2019-nCoV) Situation Report 74 | Q88380217 | P921 | Q81068910 | 2019–20 COVID-19 pandemic | main subject | Q89375395 | WHO situation report | NaN | NaN | NaN | NaN |
Q89378072 | 2020 coronavirus pandemic in Washim district | Q3241045 | P361 | Q81068910 | 2019–20 COVID-19 pandemic | part of | Q89378072 | disease outbreak | NaN | NaN | NaN | NaN |
10326 rows × 12 columns
nonHumansPages.to_pickle('pagesPerProjectNonHumans20200403.pickle')
import numpy as np
with open('pagesPerProjectNonHumans20200403.wikitext','w') as f:
    for project, data in nonHumansPages.groupby('project'):
        if pd.notna(project):  # note: `project != np.nan` is always True; pd.notna does the intended check
            f.write('\n== %s == \n \n' % project)
            for wikilink, d in data.groupby('wikilink'):
                f.write('* %s (' % wikilink)
                output = ''
                for index, cause in d.iterrows():
                    if pd.isna(cause['connector Label']):  # `== np.nan` is always False; use pd.isna
                        cause['connector Label'] = 'unknown'
                        cause['connected To Label'] = 'unknown'
                    output += '%s: %s, ' % (cause['connector Label'], cause['connected To Label'])
                output = output[0:-2]  # drop the trailing ', '
                output += ')\n'
                f.write(output)
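For reference, an entry in the resulting wikitext looks roughly like this (illustrative, based on the severe acute respiratory syndrome rows above):
== af.wikipedia ==

* [[af:Ernstige akute respiratoriese sindroom|Ernstige akute respiratoriese sindroom]] (has effect: 2019–20 COVID-19 pandemic)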
!pip install xlwt
Requirement already satisfied: xlwt in /srv/paws/lib/python3.6/site-packages
nonHumansPages[['project','page','url','wikilink']].drop_duplicates().to_excel('pagesPerProjectNonHumans20200403.xls')
itemsInfoTablePages = itemsInfoTable.join(pagesPerProjectTable.set_index('wikidataItem'))
import numpy as np
with open('pagesPerProjectMethodologyMarch30-excuted.wikitext','w') as f:
    for project, data in nonHumansPages.groupby('project'):
        if pd.notna(project):  # `project != np.nan` is always True; pd.notna does the intended check
            f.write('\n== %s == \n \n' % project)
            for wikilink, d in data.groupby('wikilink'):
                f.write('* %s (' % wikilink)
                output = ''
                for index, cause in d.iterrows():
                    if pd.notna(cause['connector']):  # only rows with a known connector
                        output += '%s: %s, ' % (cause['connector Label'], cause['connected To Label'])
                output = output[0:-2]  # drop the trailing ', '
                output += ')\n'
                f.write(output)