Purple exclamation mark.svg Planning the future of Botwiki! - Help us bring Botwiki up to date, contribute to our strategy discussion, add bot scripts, and contribute manuals, guides, and tutorials! Almost anything related to bots, particularly those used to edit MediaWiki, is welcome.

Red exclamation mark.svg UNABLE TO EDIT? - We've experienced attacks by spambots lately and now require you to confirm your e-mail before you can edit (go to your preferences, enter an e-mail address, and request a confirmation e-mail, then go to your e-mail and click on the confirmation link). We also require new accounts to make a few edits and wait a few minutes before you can create a page; if this is a problem, contact us in #botwiki and we can manually confirm your account. Sorry for the inconvenience.

Python:Archive index.py

From Botwiki
Jump to: navigation, search
#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
 
Generates indexes of archived talk pages.
 
The following parameters are supported:
 
    -debug         If given, doesn't do any real changes, but only shows
                   what would have been changed.
 
    -log           Writes output to logfile
 
    -page:pagename
                   Create an index only on this page. 
                   Otherwise all pages which transclude 
                   the hometemplate will be processed.
 
    -logbook:pagename
                   Write a log to this page 
 
    -defaulttemplate:pagename
                   Default template to use 
 
    -hometemplate:pagename
                   page which is transcluded to generate the index.
                   *** This is required! ***
 
"""
__version__ = '$Id$'
import wikipedia
import pagegenerators
import re
import sys
import zlib
from time import strftime, localtime
from operator import itemgetter
 
# This is required for the text that is shown when you run this script
# with the parameter -help.
# It is empty in this script: no replacements are defined.
# NOTE(review): presumably the wikipedia framework looks this name up when
# printing the module docstring as help text -- confirm before removing it.
docuReplacements = {
}
 
# contains handy static functions
class TextFunctions:
    '''
    Namespace for stateless text helpers used while building the index.

    Every member is a static method; call them as TextFunctions.name(...).
    '''

    # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/145672/index_txt
    @staticmethod
    def formatBlock(block):
        '''
        Format the given block of text, trimming leading/trailing
        empty lines and the leading whitespace of the first line from
        every line that starts with it.
        The purpose is to let us list a code block as a multiline,
        triple-quoted Python string, taking care of indentation concerns.

        block is converted with str() first.  The result always ends with
        exactly one newline; a block without any non-empty line yields a
        lone newline instead of raising IndexError.
        '''
        # separate block into lines
        lines = str(block).split('\n')
        # remove leading/trailing empty lines
        while lines and not lines[0]:
            del lines[0]
        while lines and not lines[-1]:
            del lines[-1]
        if not lines:
            # nothing left to dedent; lines[0] below would fail
            return '\n'
        # look at first line to see how much indentation to trim
        ws = re.match(r'\s*', lines[0]).group(0)
        if ws:
            # Only strip the indent where it really is a prefix.  A plain
            # replace() would also eat the first matching run of whitespace
            # in the middle of a less-indented line.  Building a real list
            # (not map()) keeps the indexing below valid on Python 3 too.
            lines = [line[len(ws):] if line.startswith(ws) else line
                     for line in lines]
        # remove leading/trailing blank lines (after leading ws removal)
        # we do this again in case there were pure-whitespace lines
        while lines and not lines[0]:
            del lines[0]
        while lines and not lines[-1]:
            del lines[-1]
        return '\n'.join(lines) + '\n'

    @staticmethod
    def getanchor(sectiontext):
        '''
        Get the anchor link of a section based on its title.

        The wiki formatting is removed first: on the dutch wikipedia it is
        not enough to call wikipedia.sectionencode on the raw title.
        '''
        # remove [[ ]] around sectionname
        anchor = TextFunctions.removeformatting(sectiontext)

        anchor = wikipedia.sectionencode(anchor, wikipedia.getSite().encoding())

        # remove spaces at start and end (which are converted to underscores)
        return anchor.strip('_')

    @staticmethod
    def removeformatting(linktext):
        '''
        Remove [[ ]] and '' from a string, converting it to the text which
        would be shown: [[target|label]] becomes label, [[target]] becomes
        target, and bold/italic quote markers are dropped.
        '''
        # piped links: keep only the label
        linktext = re.sub(r'\[\[[^|\]]*\|([^|\]]*)\]\]', r'\1', linktext)
        # plain links: keep the target
        linktext = re.sub(r'\[\[([^\]]*)\]\]', r'\1', linktext)
        # Non-greedy so that every bold/italic span is unwrapped on its own;
        # a greedy match would pair the first opening marker with the last
        # closing one and leave stray quotes in between.
        linktext = re.sub(r"'''(.*?)'''", r'\1', linktext)
        linktext = re.sub(r"''(.*?)''", r'\1', linktext)
        return linktext
 
class Templates:
    '''
    Turns an index into a wiki-text table.

    A template is a python dictionary mapping a section name (LEAD, HEADER,
    ROW, ALT ROW, FOOTER, TAIL, ...) to the text of that section.
    get          returns a template by name, loading it from a wiki page
                 on first use and falling back to the built-in default
    parsetemplate creates a template from a string
    processindex creates the textual table of an index based on a template
    '''
    def __init__(self):
        self.default = 'default'
        self.templates = {self.default: self.getdefaulttemplate()}

    def loadpage(self, name):
        '''
        Fetch the wiki page called name and cache its parsed template.
        Does nothing when name is None, the page is missing or it is a
        redirect.
        '''
        if name is None:
            return
        source = wikipedia.Page(wikipedia.getSite(), name)
        try:
            raw = source.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            return
        self.templates[name] = self.parsetemplate(raw)

    def get(self, name):
        '''
        Return the template called name, trying to load it when it is not
        cached yet; the default template is returned if that fails.
        '''
        cache = self.templates
        if name not in cache:
            self.loadpage(name)
        try:
            return cache[name]
        except KeyError:
            return cache[self.default]

    def getdefaulttemplate(self):
        '''
        Return the parsed built-in template: a sortable two-column table
        with alternating row backgrounds.
        '''
        text=TextFunctions.formatBlock('''
            <!-- HEADER -->
            {| class="sortable"
            ! Onderwerp !! Link
 
            <!-- ROW -->
            |-
            | %%topic%% || [[%%link%%|%%page%%]]
 
            <!-- ALT ROW -->
            |- style="background: #dddddd;"
            | %%topic%% || [[%%link%%|%%page%%]]
 
            <!-- FOOTER -->
            |}
 
            <!-- END -->
            ''')
        return self.parsetemplate(text)

    def parsetemplate(self,text):
        '''
        Split text at its <!-- NAME --> markers and return a dictionary
        mapping each stripped NAME to the stripped text that follows it,
        terminated by a newline.  Text in front of the first marker is
        stored under the empty name.
        '''
        marker = re.compile(r'<!--([^-]*)-->')
        sections = {}
        current = ''
        # the capturing group makes split() hand back the markers as well
        for chunk in re.split(r'(<!--[^-]*-->)', text):
            found = marker.match(chunk)
            if found is None:
                sections[current] = chunk.strip() + '\n'
            else:
                current = found.group(1).strip()
        return sections

    def processindex(self,template,index):
        '''
        Render index (a list of dictionaries with the keys topic, link and
        page) using template and return the resulting wiki text.

        An empty index gives an empty string.  Rows use the ROW section;
        every second row uses ALT ROW when the template has one.  LEAD and
        HEADER are put in front, FOOTER and TAIL behind, each only when
        present.
        '''
        if len(index) == 0:
            return ''

        pieces = []
        for key in ('LEAD', 'HEADER'):
            if key in template:
                pieces.append(template[key])

        for number, entry in enumerate(index):
            # number is zero-based, so odd numbers are the 2nd, 4th, ... rows
            if number % 2 == 1 and 'ALT ROW' in template:
                row = template['ALT ROW']
            else:
                row = template['ROW']
            for field in ('topic', 'link', 'page'):
                row = row.replace('%%' + field + '%%', entry[field])
            pieces.append(row)

        for key in ('FOOTER', 'TAIL'):
            if key in template:
                pieces.append(template[key])

        rendered = ''.join(pieces)

        # allow %%subst%% and %%now%% to be replaced in all headers, not just ROW
        # (order matters: the double brace forms must go before the single ones)
        substitutions = (
            ('%%subst%%', 'subst:'),
            ('%%now%%', strftime("%d %b %Y %H:%M (%Z)")),
            ('%%((%%', '{{'),
            ('%%))%%', '}}'),
            ('%%(%%', '{'),
            ('%%)%%', '}'),
        )
        for placeholder, value in substitutions:
            rendered = rendered.replace(placeholder, value)
        return rendered
 
class IndexGenerator:
    '''
    Create an archive index of a number of pages.

    An index is a list of dictionaries with the following keys
        sortkey : lowercase text useful for sorting
        link    : link to the section (page title, '#', anchor)
        page    : title of the (sub) page as shown in the index
        topic   : title of section

    readoptions is used to read an optionstring
    getoptionstring returns the current options as an optionstring
    addpage is an internal function to process a single page
    retrieve is used to generate the index using the previously set options
    '''

    # Section headings of level 2, 3 and 4.  The title group accepts a single
    # character as well; without that a one-letter heading is not recognised
    # as a heading at all and is handled as section text, which adds the
    # previous section to the index a second time.
    _title2 = re.compile(r'^==\ *([^= ](?:.*[^= ])?)\ *==$')
    _title3 = re.compile(r'^===\ *([^= ](?:.*[^= ])?)\ *===$')
    _title4 = re.compile(r'====\ *([^= ](?:.*[^= ])?)\ *====')
    # Any heading line; the group makes split() return the headings too.
    _alltitles = re.compile(r'^(==.*==)$', re.MULTILINE)

    def __init__(self):
        # one dictionary per 'page=' / 'pageprefix=' option, in option order
        self.pages = []
        # options which are not bound to a page ('template', 'checksum')
        self.globaloptions = {}

    def setoption(self, name, value):
        '''
        Store value (as text) as global option name; None removes the option.
        '''
        if value is None:
            self.globaloptions.pop(name, None)
        else:
            # '%s' formatting instead of str(): on Python 2 str() raises
            # UnicodeEncodeError for unicode values with non-ASCII characters
            # (e.g. a template page name), '%s' keeps them unicode.
            self.globaloptions[name] = '%s' % (value,)

    def readoption(self, name):
        '''
        Return global option name, or None when it is not set.
        '''
        return self.globaloptions.get(name)

    def changedchecksum(self, checksum):
        '''
        Compare checksum with the stored 'checksum' option.
        Returns False when they are equal (as text); otherwise stores the
        new checksum and returns True.
        '''
        new = str(checksum)
        if ('checksum' in self.globaloptions) and (str(self.globaloptions['checksum']) == new):
            return False
        self.globaloptions['checksum'] = new
        return True

    def readoptions(self, txt, pagename):
        '''
        Parse txt, a string of 'key=value' options separated by ';'.

        'page' and 'pageprefix' start a new entry in self.pages; 'name',
        'include' and 'exclude' apply to the most recent entry; 'checksum'
        and 'template' are global options.  Anything else that looks like
        an option is reported.  Parts without '=' are ignored.  When no page
        was given at all, the subpages of pagename are indexed.
        '''
        for option in txt.split(';'):
            # Split at the first '=' only: a value may contain '=' itself
            # (e.g. an include/exclude regular expression).
            key, sep, value = option.partition('=')
            if not sep:
                continue
            # tolerate whitespace/newlines after the ';' of a hand-written
            # option string; the value is kept as it is
            key = key.strip()
            if key in ('page', 'pageprefix'):
                self.pages.append({key: value})
            elif key in ('name', 'include', 'exclude') and self.pages:
                self.pages[-1][key] = value
            elif key in ('checksum', 'template'):
                self.globaloptions[key] = value
            else:
                wikipedia.output('unknown/invalid option: %s=%s' % (key, value))
        if not self.pages:
            self.pages.append({'pageprefix': pagename + '/'})

    def getoptionstring(self):
        '''
        Return the current options in the format readoptions understands:
        the template first, then all pages, then the checksum.
        '''
        parts = []
        if 'template' in self.globaloptions:
            parts.append('template=%s;' % self.globaloptions['template'])
        for indexpage in self.pages:
            if 'page' in indexpage:
                # retrieve() fills in a missing name with the page title;
                # do the same here so this also works before retrieve().
                name = indexpage.get('name', indexpage['page'])
                parts.append('page=%s;name=%s;' % (indexpage['page'], name))
            elif 'pageprefix' in indexpage:
                parts.append('pageprefix=%s;' % indexpage['pageprefix'])
                if 'name' in indexpage:
                    parts.append('name=%s;' % indexpage['name'])
                # retrieve() only looks at exclude when there is no include
                if 'include' in indexpage:
                    parts.append('include=%s;' % indexpage['include'])
                elif 'exclude' in indexpage:
                    parts.append('exclude=%s;' % indexpage['exclude'])
        if 'checksum' in self.globaloptions:
            parts.append('checksum=%s;' % self.globaloptions['checksum'])
        return ''.join(parts)

    def _makeentry(self, pagetitle, shortname, heading, topic):
        '''
        Build one index entry linking to section heading of page pagetitle,
        shown as topic (wiki formatting is removed from it).
        '''
        anchor = TextFunctions.getanchor(heading)
        linktext = TextFunctions.removeformatting(topic)
        # sort on the plain letters a-z only
        sortkey = re.sub('[^a-z]', '', linktext.lower())
        return {'sortkey': sortkey, 'link': pagetitle + "#" + anchor,
                'page': shortname, 'topic': linktext}

    def addpage(self, page, shortname):
        '''
        Return the index entries for the level 2 and level 3 sections of
        page; shortname is stored as the 'page' value of every entry.
        An empty list is returned when the page text cannot be loaded.
        '''
        pagetitle = page.title()
        try:
            text = page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            wikipedia.output('error get()')
            return []

        # \n is enough...
        text = text.replace('\r', '')

        ret = []

        lasttitle2 = ""
        lasttitle3 = ""
        lasttitle4 = ""
        # split() alternates section text and heading lines
        for part in self._alltitles.split(text):
            t2 = self._title2.match(part)
            t3 = self._title3.match(part)
            t4 = self._title4.match(part)
            if t2 is not None:
                lasttitle2 = t2.group(1)
                lasttitle3 = ""
            elif t3 is not None:
                lasttitle3 = t3.group(1)
                lasttitle4 = ""
            elif t4 is not None:
                lasttitle4 = t4.group(1)
            elif lasttitle4 != "":
                # text of a level 4 section: not indexed on its own
                lasttitle4 = ""
            elif lasttitle3 != "":
                # text of a level 3 section, shown as "sub (parent)"
                ret.append(self._makeentry(pagetitle, shortname, lasttitle3,
                                           lasttitle3 + " (" + lasttitle2 + ")"))
            elif lasttitle2 != "":
                # text of a level 2 section
                ret.append(self._makeentry(pagetitle, shortname, lasttitle2,
                                           lasttitle2))
        return ret

    def _selected(self, indexpage, title):
        '''
        Tell whether title passes the include/exclude filter of indexpage.
        include wins when both are given; without a filter all titles pass.
        '''
        if 'include' in indexpage:
            return re.search(indexpage['include'], title) is not None
        if 'exclude' in indexpage:
            return re.search(indexpage['exclude'], title) is None
        return True

    def retrieve(self):
        '''
        Generate the index for all pages set by readoptions and return it
        sorted by sortkey.  Missing 'name' values in self.pages are filled
        in: the page title for a single page, '' for a page prefix.
        '''
        ret = []
        for indexpage in self.pages:
            if 'page' in indexpage:
                if 'name' not in indexpage:
                    indexpage['name'] = indexpage['page']
                page = wikipedia.Page(wikipedia.getSite(), indexpage['page'])
                ret.extend(self.addpage(page, indexpage['name']))
            elif 'pageprefix' in indexpage:
                if 'name' not in indexpage:
                    indexpage['name'] = ''
                prefix = indexpage['pageprefix']
                pagelist = []
                for subpage in pagegenerators.PrefixingPageGenerator(prefix=prefix):
                    title = subpage.title()
                    if self._selected(indexpage, title):
                        pagelist.append(title)
                if pagelist:
                    gen = iter([wikipedia.Page(wikipedia.getSite(), t) for t in pagelist])
                    gen = pagegenerators.PreloadingGenerator(gen)
                    for page in gen:
                        # shown name: the name option plus whatever follows the prefix
                        shortname = indexpage['name'] + page.title()[len(prefix):]
                        ret.extend(self.addpage(page, shortname))

        ret.sort(key=itemgetter('sortkey'))

        return ret
 
 
class ArchiveBot:
    '''
    Bot which (re)generates the archive index on every page it is given.

    A page requests an index by transcluding the home template; the
    template parameter holds the option string (see IndexGenerator).  The
    bot replaces the transclusion, together with any index it generated
    before (everything up to the <!--hometemplate--> end marker), by the
    updated options and the new index.
    '''
    # Edit summary message that should be used.
    # NOTE: Put a good description here, and add translations, if possible!
    msg = {
        'en': u'Robot: Create archive index',
        'nl': u'robot: Creëer archief index',
    }

    def __init__(self, debug, hometemplate, defaulttemplate, logbook, singlepage):
        """
        Constructor. Parameters:
            * debug           - If True, doesn't do any real changes, but only
                                shows what would have been changed.
            * hometemplate    - Title of the page which is transcluded to
                                request an index.
            * defaulttemplate - Title of the table template page to use when
                                a page does not name one, or None.
            * logbook         - Title of the page to append a log to, or None.
            * singlepage      - Title of the only page to process, or None to
                                process all pages which transclude
                                hometemplate.
        """
        self.generator = None
        self.debug = debug
        self.hometemplate = hometemplate
        self.defaulttemplate = defaulttemplate
        self.logbook = logbook
        self.singlepage = singlepage
        self.acceptall = False
        # counters reported by createlog()
        self.processed = 0
        self.changecount = 0
        self.errorcount = 0

        self.templates = Templates()

        if self.singlepage is not None:
            self.generator = iter([wikipedia.Page(wikipedia.getSite(), self.singlepage)])
        else:
            transclusionPage = wikipedia.Page(wikipedia.getSite(), self.hometemplate)
            self.generator = pagegenerators.ReferringPageGenerator(transclusionPage, onlyTemplateInclusion = True)
            self.generator = pagegenerators.PreloadingGenerator(self.generator)

    def createlog(self):
        """
        Append a summary of this run to the logbook page.

        Nothing is done without a logbook or when the run neither changed a
        page nor hit an error.  Uses self.starttime and self.endtime, which
        are set by run().  The diff is always shown; the page is only saved
        when not in debug mode.
        """
        if self.logbook is None:
            return
        if (self.changecount+self.errorcount)==0:
            return

        log_page = wikipedia.Page(wikipedia.getSite(), self.logbook)
        try:
            log_text = log_page.get()
        except (wikipedia.NoPage, wikipedia.IsRedirectPage):
            log_text = ''

        old_log_text = log_text

        # a real list (not map()) so the concatenation also works on Python 3
        args = [wikipedia.decodeArg(sys.argv[0])]
        args += [wikipedia.decodeArg('"%s"' % s) for s in sys.argv[1:]]

        log_text += '\n* Start: %s\n' % self.starttime
        log_text += r'* Command: <nowiki>' + u' '.join(args) + r'</nowiki>' + '\n'
        log_text += '* Processed: %d pages\n' % self.processed
        log_text += '* Changes: %d pages\n' % self.changecount
        log_text += '* Errors: %d pages\n' % self.errorcount
        log_text += '* End: %s\n' % self.endtime
        log_text += '----\n'

        com = wikipedia.translate(wikipedia.getSite(), self.msg) + ' (Log)'

        wikipedia.showDiff(old_log_text, log_text)

        if not self.debug:
            try:
                log_page.put(log_text, comment = com, minorEdit = True)
            except Exception:
                # Writing the log is best effort, but do not swallow
                # KeyboardInterrupt/SystemExit like a bare except would.
                wikipedia.output(u'Could not save log')

    def run(self):
        """
        Process all pages of the generator, then write the log.
        """
        self.starttime = strftime("%d %b %Y %H:%M (%Z)")
        # Set the edit summary message
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg))
        for page in self.generator:
            self.treat(page)

        self.endtime = strftime("%d %b %Y %H:%M (%Z)")
        self.createlog()

    def _index_patterns(self):
        """
        Return the compiled patterns (fulltext, tmplonly) for the home
        template: fulltext matches the transclusion plus a previously
        generated index up to the end marker, tmplonly captures the option
        string of a transclusion with parameters.
        """
        # The template name is literal text; names with e.g. parentheses or
        # dots must not be interpreted as regular expression syntax.
        name = re.escape(self.hometemplate)
        fulltext = re.compile(r'(\{\{' + name + r'(\|[^}]*)?\}\}(.*' + name + r'-->)?)', re.DOTALL)
        tmplonly = re.compile(r'\{\{' + name + r'\|([^}]*)\}\}')
        return fulltext, tmplonly

    def _replace_index(self, fulltext, text, optionstring, newtext):
        """
        Return text with the part matched by fulltext replaced by the
        transclusion with optionstring, followed by newtext and the end
        marker.
        """
        replacement = ('{{' + self.hometemplate + '|' + optionstring + '}}'
                       + newtext + '<!--' + self.hometemplate + '-->')
        # Use a function as replacement: in a replacement string re.sub()
        # would process backslashes, so a section title with a backslash
        # would be mangled or raise an error.
        return fulltext.sub(lambda match: replacement, text)

    def treat(self, page):
        """
        Loads the given page, regenerates its index, and saves the page
        when the index has changed (after confirmation, unless all changes
        were accepted before).
        """

        self.processed += 1

        # Show the title of the page we're working on.
        # Highlight the title in purple.
        wikipedia.output(u"\03{lightpurple}%s\03{default}:" % page.title())

        try:
            # Load the page
            original = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return

        if not page.botMayEdit(wikipedia.getSite().loggedInAs()):
            wikipedia.output(u"Page %s is locked for robot editing; skipping." % page.aslink())
            return

        thisindex = IndexGenerator()

        fulltext, tmplonly = self._index_patterns()
        tmplopt = tmplonly.search(original)
        if tmplopt is not None:
            thisindex.readoptions(tmplopt.group(1), page.title())
        else:
            wikipedia.output('cannot read options, using default')
            thisindex.readoptions('', page.title())

        if thisindex.readoption('template') is None:
            thisindex.setoption('template',self.defaulttemplate)

        idx = thisindex.retrieve()

        # The checksum is taken over the links only, so the page is left
        # alone as long as the set of indexed sections stays the same.
        checktemplate = self.templates.parsetemplate(r'<!--ROW-->%%link%%')
        checktext = self.templates.processindex(checktemplate,idx)
        # mask to an unsigned 32 bit value (no Python 2 only 'L' suffix)
        checksum = zlib.adler32(checktext.encode('utf8')) & 0xffffffff

        t = self.templates.get(thisindex.readoption('template'))
        newtext = self.templates.processindex(t,idx)

        text = original
        if thisindex.changedchecksum(checksum):
            text = self._replace_index(fulltext, original,
                                       thisindex.getoptionstring(), newtext)
        else:
            wikipedia.output('Not changed')

        ###############################
        # save if something was changed

        if text == original:
            return

        # show what was changed
        wikipedia.showDiff(original, text)

        if self.debug:
            return

        if self.acceptall:
            choice = 'y'
        else:
            choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')

        if choice == 'a':
            choice = 'y'
            self.acceptall = True

        if choice != 'y':
            return

        try:
            # Save the page
            page.put(text)
        except wikipedia.LockedPage:
            wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
            self.errorcount += 1
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
            self.errorcount += 1
        except wikipedia.SpamfilterError as error:
            wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
            self.errorcount += 1
        else:
            # only a page which was really saved counts as changed
            self.changecount += 1
 
 
def main():
    """
    Read the command line options and run the bot.

    -debug switches to debug mode (only show what would have been changed);
    -page:, -logbook:, -defaulttemplate: and -hometemplate: each carry a
    value after the colon.  Other arguments are ignored here.  Without
    -hometemplate: a message is shown and nothing is done.
    """
    debug = False

    # Options which carry a value: prefix -> value (None while not given).
    # No prefix is the start of another one, so the lookup order is free.
    values = {
        '-page:': None,
        '-logbook:': None,
        '-defaulttemplate:': None,
        '-hometemplate:': None,
    }

    # Parse command line arguments
    for arg in wikipedia.handleArgs():
        if arg.startswith("-debug"):
            debug = True
            continue
        for prefix in values:
            if arg.startswith(prefix):
                values[prefix] = arg[len(prefix):]
                break

    hometemplate = values['-hometemplate:']
    if hometemplate is None:
        wikipedia.output('hometemplate is required')
        return

    bot = ArchiveBot(debug, hometemplate, values['-defaulttemplate:'],
                     values['-logbook:'], values['-page:'])
    bot.run()
 
# Script entry point: only runs when the file is executed, not on import.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always called, also when main() raises or the run is interrupted.
        # NOTE(review): presumably tells the framework that this bot
        # instance has finished -- confirm against the wikipedia module.
        wikipedia.stopme()
Personal tools
Share