Python:Page list (en)

From Botwiki

Jump to: navigation, search

Most bots accept a -file parameter, which names a file from which the bot reads the list of pages it should act on. On a small wiki, it can be useful to generate a text list of all pages, edit that list by hand (or with a text editor's search and replace), and then pass the resulting file to your bot.

This script generates a list of all page titles.

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
prints a flat text list of page titles as wiki links
for use with other bots and the -file option
"""
import wikipedia
import pagegenerators
import sys
import urllib
import re
 
def listpages(self, start='!', namespace=0, throttle=True):
    """Yield the page titles listed on the wiki's Allpages special page.

    This is a hacked version of the allpages function from wikipedia.py,
    made to yield title text instead of page objects.

    Parameters:
        self      - the site object to query; it must provide encoding(),
                    allpages_address(), getUrl() and version().
        start     - title at which the listing starts.
        namespace - number of the namespace to list.
        throttle  - accepted so the signature matches the function this
                    was copied from; it is not used here.

    Raises wikipedia.ServerError when the table holding the page links
    cannot be located in the returned HTML.
    """
    while True:
        # encode Non-ASCII characters in hexadecimal format (e.g. %F6)
        start = start.encode(self.encoding())
        start = urllib.quote(start)
        # load a list which contains a series of article names (always 480)
        path = self.allpages_address(start, namespace)
        print('Retrieving Allpages special page for %s from %s, namespace %i' % (repr(self), start, namespace))
        returned_html = self.getUrl(path)
        # Try to find begin and end markers
        try:
            # In 1.4, another table was added above the navigational links
            # NOTE(review): this compares version strings lexically, so a
            # version such as "1.10" would sort before "1.4" - confirm
            # whether that can occur.
            if self.version() < "1.4":
                begin_s = '<table'
                end_s = '</table'
            else:
                begin_s = '</table><hr /><table'
                end_s = '</table'
            ibegin = returned_html.index(begin_s)
            iend = returned_html.index(end_s, ibegin + 3)
        except ValueError:
            # ServerError is not defined in this script, so it has to be
            # referenced through the wikipedia module; the bare name would
            # raise NameError instead of the intended error.
            raise wikipedia.ServerError('Couldn\'t extract allpages special page. Make sure you\'re using the MonoBook skin.')
        # remove the irrelevant sections
        returned_html = returned_html[ibegin:iend]
        if self.version() == "1.2":
            R = re.compile('/wiki/(.*?)" *class=[\'\"]printable')
        else:
            R = re.compile('title ?="(.*?)"')
        # Count the number of useful links on this page
        n = 0
        for hit in R.findall(returned_html):
            # count how many articles we found on the current page
            n = n + 1

            yield hit

            # save the last hit, so that we know where to continue when we
            # finished all articles on the current page. Append a '!' so that
            # we don't yield a page twice.
            start = wikipedia.Page(self, hit).titleWithoutNamespace() + '!'
        # A small shortcut: if there are less than 100 pages listed on this
        # page, there is certainly no next. Probably 480 would do as well,
        # but better be safe than sorry.
        if n < 100:
            break
 
# Script entry: print every page title as a wiki link, one per line,
# starting from the title given on the command line (or '!' if none).
try:
    start = []
    # -test is recognised so that it is not mistaken for part of the start
    # title; the flag itself is not used anywhere below.
    test = False
    for arg in wikipedia.handleArgs():
        if arg.startswith("-test"):
            test = True
        else:
            start.append(arg)
    # All remaining arguments together form the title to start from.
    if start:
        start = " ".join(start)
    else:
        start = "!"
    mysite = wikipedia.getSite()

    # Pass the start title on; it used to be computed and then ignored, so
    # the listing always began at the default '!'.
    for page in listpages(mysite, start):
        print("[[%s]]" % page)

finally:
    wikipedia.stopme()
Personal tools