import pywikibot
import re
from pywikibot import pagegenerators
# Connection to English Wikipedia; shared by the category and page
# lookups in the main loop below.
site = pywikibot.Site('en', 'wikipedia')
def extractfirst(text):
    """Reduce raw wikitext to the plain-text first sentence.

    Strips category/file/image links, section headings, bold/italic
    markup, wikilinks, templates, parentheticals, refs and HTML
    comments, then collapses whitespace and cuts down to the first
    sentence.

    @param text: raw wikitext of an article
    @type text: str
    @return: plain-text first sentence
    @rtype: str
    """
    result = text
    # Drop link-like constructs that contribute no prose.
    result = re.sub(r"\[\[[Cc]ategory:[^\]]*]]", "", result)
    result = re.sub(r"\[\[[Ff]ile:[^\]]*]]", "", result)
    result = re.sub(r"\[\[[Ii]mage:[^\]]*]]", "", result)
    result = re.sub(r"\n", " ", result)
    # Cut everything from the first section heading onwards.
    result = re.sub(r"==.*", "", result)
    # Remove bold/italic quote markup.
    result = re.sub(r"''+", "", result)
    # Replace [[target|label]] / [[target]] with the displayed text.
    result = re.sub(r"\[\[([^\|\]\[]*\|)?([^\|\]\[]*)]]", r"\2", result)
    # Five passes handle constructs nested up to five levels deep
    # (e.g. templates inside templates).
    for _ in range(5):
        result = re.sub(r"{{[^{}]*}}", "", result)
        result = re.sub(r"\([^\(\)]*\)", "", result)
        result = re.sub(r"<ref[^<>]*>[^<>]*<\/ref>", "", result)
        result = re.sub(r"<ref[^<>]*\/>", "", result)
        result = re.sub(r"<!--[^<>]*-->", "", result)
    result = re.sub(r"\n", "", result)
    # Collapse runs of spaces.  (Was " *": on Python >= 3.7 that also
    # replaces the empty match at every position, mangling the text.)
    result = re.sub(r" +", " ", result)
    # Keep only the first sentence: up to ./!/? followed by a capital
    # letter or end of string.
    result = re.sub(r"(^.*?[.!?](?=\s[A-Z]|$)).*", r"\1", result)
    result = re.sub(r"^\s*", "", result)
    # Remove whitespace left dangling before punctuation.
    result = re.sub(r"\s*(?=,|\.)", "", result)
    result = re.sub(r"\s*$", "", result)
    return result
def extractdescription(text):
    """Derive a short description from a plain-text first sentence.

    Extracts what follows "<Name> is a/an ..." and trims trailing
    clauses; only results up to 40 characters that start with a
    nationality-style word are trusted.

    @param text: first sentence of an article (output of extractfirst)
    @type text: str
    @return: the description, or False if none could be derived safely
    @rtype: str or bool
    """
    result = text
    # Bail out on initials ("J.R.") or honorifics/abbreviations
    # ("Dr.", "Jr.") — the upstream sentence splitter cannot handle
    # mid-sentence periods reliably.
    if re.search(r"\.[^\s]\.", result, re.IGNORECASE) or re.search(
            r"(br|chan|chapln|dr|fr|gov|miss|mr|mrs|ms|mme|m|msgr|pres|prof|rep|rev|revs|sen|sr|sra|srta|hon|esq|jr|ret|lt|col|sgt|gen|cpl|capt|bg|adm|cwo|ens|maj|msgt|st)\.",
            result, re.IGNORECASE):
        return False
    # A trailing conjunction means the sentence was cut short.
    # (Was "(and|or)$" without \b, which wrongly rejected every word
    # ending in "or"/"and": actor, author, professor, ...)
    if re.search(r"\b(and|or)$", result):
        return False
    if re.match(r'^(("?[A-Z][a-z]*"?|(Jr\.)),? )*?(is|was) (a|an) ', result):
        # Keep what follows "<Name> is a/an ...".
        result = re.sub(r'^(("?[A-Z][a-z]*"?|(Jr\.)),? )*?(is|was) (a|an) (.*)', r"\6", result)
        # Trim trailing clauses that add nothing to a short
        # description; order matches the original cascade of subs.
        for clause in (',? who', ',? and currently', ',? currently',
                       ',? as well', ',? better known', ',? best known',
                       ',? also known', ',? most known', ',? mostly known',
                       ',? generally known', ',? especially known',
                       ',? well known', ',? particularly known',
                       ',? primarily known', ',? also known', ',? known for',
                       ',? riding for( the)?', ',? active in', ',? born in'):
            result = re.sub(clause + '.*', "", result)
        result = re.sub(r"\.$", "", result)
        # Was "\bformer\b" in a non-raw string: "\b" is a literal
        # backspace character there, so the sub never matched.
        result = re.sub(r"\bformer\b ?", "", result)
        result = re.sub(r'[,;]? (he|she|they) (is|are|were|was).*', "", result)
        # Capitalise the first letter.
        result = re.sub(r'([a-zA-Z])', lambda m: m.group(1).upper(), result, 1)
        result = re.sub(r"[\.\,\;]$", "", result)
        # Strip a dangling auxiliary/conjunction; applied twice to
        # catch pairs like "... and is".  (Same backspace bug fixed.)
        result = re.sub(r"\s*\b(and|is|that|was|were|are|for)$", "", result)
        result = re.sub(r"\s*\b(and|is|that|was|were|are|for)$", "", result)
    # Only short, nationality-led results are trusted as descriptions.
    if len(result) <= 40:
        if re.match("(Afghan|Albanian|Algerian|Andorran|Angolan|Barbuda|Antiguan|Barbudan|Argentine|Armenian|Australian|Austrian|Azerbaijani|Azeri|Bahamas|Bahamian|Bahraini|Bengali|Barbadian|Belarusian|Belgian|Belizean|Beninese|Beninois|Bhutanese|Bolivian|Bosnian|Herzegovinian|Motswana|Botswanan|Brazilian|Bruneian|Bulgarian|Faso|Burkinabé|Burmese|Burundian|Verde|Cabo|Verdean|Cambodian|Cameroonian|Canadian|African|Chadian|Chilean|Chinese|Colombian|Comoran|Comorian|Congolese|Rican|Ivorian|Croatian|Cuban|Cypriot|Republic|Czech|Danish|Djiboutian|Dominican|Republic|Dominican|Timor|Timorese|Ecuadorian|Egyptian|Salvador|Salvadoran|Guinea|Equatorial|Guinean|Equatoguinean|Eritrean|Estonian|Ethiopian|Fijian|Finnish|French|Gabonese|Gambian|Georgian|German|Ghanaian|Gibraltar|Greek|Hellenic|Grenadian|Guatemalan|Guinean|Bissau|Guinean|Guyanese|Haitian|Honduran|Hungarian|Magyar|Icelandic|Indian|Indonesian|Iranian|Persian|Iraqi|Irish|Israeli|Italian|Coast|Ivorian|Jamaican|Japanese|Jordanian|Kazakhstani|Kazakh|Kenyan|Kiribati|Korea|North|Korean|Korea|South|Korean|Kuwaiti|Kyrgyzstani|Kyrgyz|Kirgiz|Kirghiz|Lao|Laotian|Latvian|Lettish|Lebanese|Basotho|Liberian|Libyan|Liechtensteiner|Lithuanian|Luxembourg|Luxembourgish|Macedonian|Malagasy|Malawian|Malaysian|Maldivian|Malian|Malinese|Maltese|Islands|Marshallese|Martiniquais|Martinican|Mauritanian|Mauritian|Mexican|Micronesian|Moldovan|Monégasque|Monacan|Mongolian|Montenegrin|Moroccan|Mozambican|Namibian|Nauruan|Nepali|Nepalese|Dutch|Netherlandic|Zealand|Zealand|Zelanian|Nicaraguan|Nigerien|Nigerian|Marianan|Norwegian|Omani|Pakistani|Palauan|Palestinian|Panamanian|Guinea|Papua|Guinean|Papuan|Paraguayan|Peruvian|Filipino|Philippine|Polish|Portuguese|Rico|Puerto|Rican|Qatari|Romanian|Russian|Rwandan|Kitts|and|Nevis|Kittitian|Nevisian|Saint|Lucian|Saint|Vincentian|Vincentian|Samoan|Marino|Sammarinese|Tomé|Príncipe|São|Toméan|Arabia|Saudi|Arabian|Senegalese|Serbian|Seychellois|Leone|Sierra|Leonean|Singapore|Singaporean|Slovak|Slovenian|Slovene|Islands|Solomon|Island|Somali|African?|South|African|Sudan|South|Sudanese|Spanish|Lanka|Sri|Lankan|Sudanese|Surinamese|Swazi|Swedish|Swiss|Syrian|Tajikistani|Tanzanian|Thai|Leste|Timorese|Togolese|Tokelauan|Tongan|Tobago|Trinidadian|Tobagonian|Tunisian|Turkish|Turkmen|Tuvaluan|Ugandan|Ukrainian|Arab|Emirates|Emirati|Emirian|Emiri|Kingdom|Great|Britain|Northern|Ireland|UK|British|America|United|States|U.S.|American|Uruguayan|Uzbekistani|Uzbek|Vanuatu|Vanuatuan|Vatican|Venezuelan|Vietnamese|Yemeni|Zambian|Zimbabwean)", result):
            return result
    return False
# NOTE(review): redundant — savecounter is re-initialized just before
# the main loop below and is never read in between.
savecounter = 0
def category_filter(generator, category):
    """
    Filter members of the specified category out of the generator.

    Pages that belong to ``category`` are dropped; all other pages are
    yielded unchanged.

    @param generator: Generator to filter
    @type generator: iterator
    @param category: Category to filter out
    @type category: L{pywikibot.page.Category}
    """
    for page in generator:
        # page.categories() yields the page's categories; skip the page
        # if the unwanted category is among them.
        if category not in page.categories():
            yield page
# --- Main bot loop -------------------------------------------------------
# Walk "Living people" (from 'Benjamin' onward), skip articles that
# already have a short description, and prepend one generated from the
# lead sentence.  A non-empty stop page acts as an emergency kill switch.
living_people_cat = pywikibot.Category(site, 'Living people')
sd_article_cat = pywikibot.Category(site, 'Articles with short description')
gen = pagegenerators.CategorizedPageGenerator(living_people_cat, start='Benjamin')
gen = category_filter(gen, sd_article_cat)
gen = pagegenerators.PreloadingGenerator(gen)
savecounter = 0
for page in gen:
    sd = extractdescription(extractfirst(page.text))
    # Skip pages with no usable description, or with a manually placed
    # short description that the category filter did not catch.
    if not sd or 'short description' in page.text:
        continue
    description = "{{short description|" + sd + "|bot=PearBOT 5}}\n"
    page.text = description + page.text
    savecounter += 1
    print(description)
    # Kill switch: save only while the operator's stop page is empty.
    if pywikibot.Page(site, u"User:PearBOT/Biography short descriptions/stop page").text == "":
        page.save('Adding automatically generated short description. For more information see [[Wikipedia:Bots/Requests for approval/PearBOT 5]] Feedback appreciated at [[User talk:Trialpears]]')
    else:
        break