#
# (C) Toto Azéro, 2012-2013
#
# Distributed under the terms of the GNU GPLv3 license
# http://www.gnu.org/licenses/gpl.html
#
import re
import time

import pywikibot


def extract_titles(text, beginning, match_title):
"""
Extracts all the titles of a text, starting at 'beginning'
Setting beginning to '' or None will start at the beginning of the text
[‹!› Not working] Setting beginning to anything else (but only unicode) will start ignore all the titles
before the first occurrence of the phrase given.
match_title should be a regular expression (use re.compile).
Returns a list of unicode strings.
"""
    # NB: 'beginning' is currently unused; see the docstring note above.
    if not text:
        return None
    titles = {}
    i = 0
    match = re.search(match_title, text)
    while match:
        title = match.group(0)
        # The match may start with the newline preceding the heading; drop it.
        if title[0] == '\n':
            title = title[1:]
        titles[i] = title
        # Continue the search after the title just found.
        text = text[text.index(title) + len(title):]
        i += 1
        match = re.search(match_title, text)
    return titles
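
# Example (a sketch: the sample string and results below are made up, not
# taken from the bot's target page):
#
#   sample = u"intro\n=== Foo ===\nbody\n=== Bar ===\nend"
#   extract_titles(sample, None, re.compile(u"=== *([^=].*?) *==="))
#   # -> {0: u'=== Foo ===', 1: u'=== Bar ==='}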


def extract_sections(text, titles):
    """
    Extract all the sections of a text, based on a dict of titles.

    You can use extract_titles() to build this dict.

    Returns a dictionary as follows:
        section_number (int): section_value (unicode)
    NB: section_value includes the section's title.
    """
    if not titles:
        return None
    sections = {}
    for section_number in titles:
        # If this is the last section's title, take everything up to the end
        # of the text, since there is no next title to stop at.
        if (section_number + 1) != len(titles):
            sections[section_number] = text[text.index(titles[section_number]):text.index(titles[section_number + 1])]
            text = text[text.index(sections[section_number]) + len(sections[section_number]):]
        else:
            sections[section_number] = text[text.index(titles[section_number]):]
    return sections
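
# Example (same made-up sample as above): the titles dict drives the slicing,
# and each chunk keeps its own heading:
#
#   titles = extract_titles(sample, None, re.compile(u"=== *([^=].*?) *==="))
#   extract_sections(sample, titles)
#   # -> {0: u'=== Foo ===\nbody\n', 1: u'=== Bar ===\nend'}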


def extract_sections_with_titles(text, beginning, match_title):
    """
    Extract all the titles and sections of a text, starting at 'beginning'.

    match_title should be a compiled regular expression (use re.compile).

    Returns a dictionary as follows:
        section_title (unicode): section_value (unicode)
    NB: section_value includes the section's title.
    """
    titles = extract_titles(text, beginning, match_title)
    if not titles:
        return None
    sections = {}
    for section_number in titles:
        current_title = titles[section_number]
        # If this is the last section's title, take everything up to the end
        # of the text, since there is no next title to stop at.
        if (section_number + 1) != len(titles):
            sections[current_title] = text[text.index(titles[section_number]):text.index(titles[section_number + 1])]
            text = text[text.index(sections[current_title]) + len(sections[current_title]):]
        else:
            sections[current_title] = text[text.index(titles[section_number]):]
    return sections
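
# Example (same made-up sample): identical slicing, but the result is keyed
# by title string rather than by section number:
#
#   extract_sections_with_titles(sample, None,
#                                re.compile(u"=== *([^=].*?) *==="))
#   # -> {u'=== Foo ===': u'=== Foo ===\nbody\n',
#   #     u'=== Bar ===': u'=== Bar ===\nend'}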


class WarnBot:
    def __init__(self):
        self.site = pywikibot.Site('fr', 'wikipedia')
        self.main_page = pywikibot.Page(self.site, u"Wikipédia:Respect de l'obligation de transparence à vérifier/En cours")
        # Matches level-3 wikitext headings such as "=== Some request ===".
        self.match_titre_requete = re.compile(u"=== *([^=].*?) *===")
    def analyse_une_section(self, page, match_debut):
        # TODO: handle, or at least avoid, problems caused by duplicate titles.
        text = page.get()
        # Keep only the text holding the requests to examine, since several
        # top-level sections live on the same page.
        if match_debut == u'Comptes bloqués en attente':
            text = text[0:text.index(u"= Comptes bloqués en attente =")]
        elif match_debut == u'En cours':
            text = text[text.index(u"= En cours ="):]
        titres = extract_titles(text, beginning=None, match_title=self.match_titre_requete)
        sections = extract_sections(text, titres)
        return {
            'titres': titres,
            'sections': sections
        }
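

# Sketch of the dict returned by analyse_une_section() (the titles and
# section bodies below are illustrative, not real page content):
#
#   {
#       'titres': {0: u'=== Requête A ===', 1: u'=== Requête B ==='},
#       'sections': {0: u'=== Requête A ===\n...', 1: u'=== Requête B ===\n...'}
#   }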

bot = WarnBot()
liste = bot.analyse_une_section(page=bot.main_page, match_debut=u"En cours")

# A request is ready for archiving once its status line reads {{Terminé}}.
regexp = re.compile(r'(Statut : {{Terminé}})')

site = pywikibot.Site('fr', 'wikipedia')
page = pywikibot.Page(site, u"Wikipédia:Respect de l'obligation de transparence à vérifier/En cours")
texte_en_cours = page.get()

nb_archivees = 0
sections_archivees = []
for section in liste['sections'].values():
    if regexp.search(section):
        nb_archivees += 1
        sections_archivees.append(section)
        # Remove the finished request from the "En cours" page.
        texte_en_cours = texte_en_cours.replace(section, '')
print(nb_archivees)

page.text = texte_en_cours
page.save(summary="[[wp:bot|robot]] : archivage de " + str(nb_archivees) + " sections")
# Give the wiki a moment before writing the archive page.
time.sleep(120)
site = pywikibot.Site('fr', 'wikipedia')
page = pywikibot.Page(site, u"Wikipédia:Respect de l'obligation de transparence à vérifier/Archives")
# The archive page may not exist yet; page.get() raises pywikibot.NoPage in
# that case, so start from an empty text instead.
if page.exists():
    texte_archives = page.get()
else:
    texte_archives = ''

for section in sections_archivees:
    texte_archives += section

page.text = texte_archives
page.save(summary="[[wp:bot|robot]] : archivage de " + str(nb_archivees) + " sections")
print(texte_archives)

# Observed run output: 4 sections were archived and the "En cours" page
# saved, after which page.get() raised pywikibot.NoPage because
# [[Wikipédia:Respect de l'obligation de transparence à vérifier/Archives]]
# did not exist yet.