Manual:Replacer.py

From Botwiki

Jump to: navigation, search
# -*- coding: utf-8 -*-
'''
Critical issues:
* max. 10 threads to not make too many edits and keep database connections below limit
* www.mediawiki.org must be supported
 
FIXMEs:
* Right after getting the CheckUsage results, it starts retrieving [[User:CommonsDelinker/replace-I18n]]. It does this multiple times for each site. That's quite inefficient.
'''
 
import wikipedia, config, codecs
import urllib2, re, time, thread
import MySQLdb 
 
# Month names indexed by month number; index 0 is an empty placeholder so that
# index 1 is "January".
months=['', "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
# Edit-summary templates. Only the 'default' text is defined here; summary()
# adds per-site entries of the form [text, fetch_time]. '$1' and '$2' are
# replaced with the old and new image names in replace_image().
summaries={'default':u'[[w:commons:User:Orgullobot/commands|Bot]]: Replacing $1 with $2. [[m:User:CommonsDelinker|Translate me]] [[User:CommonsDelinker/replace-I18n|here]]!'}
# Not referenced anywhere else in the visible part of this file.
done=[]
editing=[]# pages the bot is currently editing/checking, to avoid edit conflicts between threads
# Note: this is NOT a good way to do this.
existentes=[]
# Pages that exists() has already confirmed to exist, so they are not checked again.
def pageText(url):
    """Fetch *url* over HTTP and return the raw response body.

    A browser-like User-Agent header is sent with the request.  The response
    object is closed even when reading it fails.  Errors raised by urllib2
    are not caught here and propagate to the caller.
    """
    request = urllib2.Request(url)
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7'
    request.add_header("User-Agent", user_agent)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Always release the connection, including when read() raises.
        response.close()
 
def checanombres(page):
    """Return the namespace number of *page* as an int.

    *page* is a page object providing ``site().hostname()`` and ``urlname()``.
    The wiki's api.php is queried for the page and the ``ns="..."`` attribute
    of the first ``<page ...>`` element in the XML reply is parsed.

    Raises IndexError if the reply contains no ``<page `` element or no
    ``ns`` attribute, and ValueError if the attribute is not an integer.
    """
    crudo = pageText('http://'+page.site().hostname()+'/w/api.php?action=query&prop=revisions&titles='+page.urlname()+'&rvprop=content&format=xml')
    # Attribute list of the first <page ...> tag, e.g. 'pageid="1" ns="4" title="..."'.
    carne = crudo.split('<page ')[1].split('>')[0]
    ns = carne.split('ns="')[1].split('"')[0]
    return int(ns)
 
 
# Start-up self-test: look up the namespace number of one page on es.wikipedia
# and print it. NOTE: checanombres() fetches a URL, so this performs a network
# request as soon as the module is loaded.
pagelink=wikipedia.Page(wikipedia.Site('es', 'wikipedia'), 'Wikipedia:Putos')
wikipedia.output( u'ChecaNombres: '+str(checanombres(pagelink)) )
# Buffer of pending log rows; record() appends to it and flushes it to the database.
records=[]
def record(hora, page, img, new_image_name="NULL", status="ok"):
    """Buffer one log row and flush the buffer to the database when it is full.

    Parameters:
        hora           -- value stored in the first column of the row
        page           -- page object the replacement was made on
        img            -- name of the replaced image
        new_image_name -- name of the replacement image (default "NULL")
        status         -- outcome string stored with the row (default "ok")

    The row is appended to the module-level ``records`` list.  Once more than
    50 rows are buffered they are inserted into the ``delinker`` table and
    removed from the buffer.

    One throttle slot is taken via canIDB() for the whole flush, before the
    connection is opened, and is given back afterwards even when the database
    work fails.  Database and I/O errors are not caught here.
    """
    records.append((hora, page, img, new_image_name, status))
    if len(records) <= 50:
        return
    print('Recording...')
    # Work on a snapshot so that rows appended by other threads during the
    # flush are neither half-processed nor thrown away afterwards.
    batch = records[:]
    rightnow = str(time.time())
    canIDB(rightnow)
    try:
        conn = MySQLdb.connect(host="sql",user="orgullo", passwd="****",db="u_orgullo_logs", charset='utf8', use_unicode=1)
        try:
            cursor = conn.cursor()
            cursor.execute('set names utf8;')
            for row_hora, row_page, row_img, row_new, row_status in batch:
                # str(site) looks like 'family:lang'; turn it into 'lang.family.org'.
                wikiT = str(row_page.site()).split(':')
                if wikiT[0] == 'commons':
                    wikiT = ['wikimedia', 'commons']
                wiki = wikiT[1]+'.'+wikiT[0]+'.org'
                fortit = row_page.titleWithoutNamespace().replace(' ', '_')
                row_img = row_img.replace(' ', '_')
                # Let the driver quote every value instead of building the
                # statement by string concatenation.
                cursor.execute(
                    "insert into delinker VALUES(%s, %s, %s, %s, %s, %s, %s)",
                    (row_hora, row_img, wiki, fortit,
                     str(checanombres(row_page)), row_status, row_new))
            conn.commit()
            # Drop only the rows that were just written.
            del records[:len(batch)]
        finally:
            conn.close()
    finally:
        # Give the throttle slot back by removing our token from the file.
        dbt = open('dbthrottle.txt', 'r')
        try:
            dbtt = dbt.read()
        finally:
            dbt.close()
        dbtW = open('./dbthrottle.txt', 'w')
        try:
            dbtW.write(dbtt.replace(rightnow+'\n', ''))
        finally:
            dbtW.close()
 
#record(time.time(), wikipedia.Page(wikipedia.getSite(), 'This is a test'), 'Testimage.jpg')
#'%Y-%m-%d %H:%M
#'2006-09-22 21:01'
 
def exists(page):
    """Check through query.php whether *page* exists on its wiki.

    Pages already confirmed are remembered in ``existentes`` and answered
    without a request.  Otherwise the reply's ``<id>`` and ``<ns>`` values are
    inspected: an id of "0" means the page is missing, and a page in
    namespace 6 additionally needs an ``<image `` element in the reply.
    Positive answers are cached; negative ones are not.
    """
    #http://es.wikipedia.org/w/query.php?what=content&titles=Image:Punta%20del%20Este.jpg&aplimit=1&format=xml
    if page in existentes:
        return True
    host = page.site().hostname()
    crudo = pageText('http://'+host+'/w/query.php?what=imageinfo&titles='+page.urlname()+'&aplimit=1&format=xml')
    page_id = crudo.split('<id>')[1].split('</id>')[0]
    namespace = crudo.split('<ns>')[1].split('</ns>')[0]
    if page_id == "0":
        return False
    if namespace == "6" and '<image ' not in crudo:
        return False
    existentes.append(page)
    return True
 
def getcommands():
    """Return the text of the locally cached command file.

    Reads 'commons-commands.txt' from the current working directory, decoded
    as UTF-8.  The file is closed even if reading fails.  IOError is raised
    if the file cannot be opened.
    """
    lo = codecs.open('commons-commands.txt', 'r', 'utf-8')
    try:
        return lo.read()
    finally:
        lo.close()
 
def canIedit():
    """Return False when the cached command text contains '{{stop}}'.

    The comparison ignores letter case; in every other case True is returned.
    """
    return '{{stop}}' not in getcommands().lower()
def canIDB(rightnow):
    """Wait for a free slot in 'dbthrottle.txt' and claim it.

    The file holds one line per claimed slot.  While it contains 10 or more
    lines the function sleeps 10 seconds and checks again.  As soon as there
    are fewer, ``rightnow`` (a string token) plus a newline is appended and
    True is returned.

    If the file cannot be opened or read (IOError) the function sleeps 10
    seconds and returns None without claiming a slot.
    """
    try:
        while True:
            dbthrottleCheck = open('dbthrottle.txt', 'r')
            try:
                dbthrottleCheckTxt = dbthrottleCheck.read()
            finally:
                dbthrottleCheck.close()
            if dbthrottleCheckTxt.count('\n') < 10:
                dbthrottleA = open('dbthrottle.txt', 'a')
                try:
                    dbthrottleA.write(rightnow+'\n')
                finally:
                    # Close explicitly so the token is flushed to disk before
                    # another thread reads the file.
                    dbthrottleA.close()
                return True
            time.sleep(10)
    except IOError:
        time.sleep(10)
 
def summary(wiki_site):
    """Return the edit-summary template to use on *wiki_site*.

    Results are kept in ``summaries`` as ``[text, fetch_time]`` and reused
    for one hour.  Otherwise the page 'User:CommonsDelinker/replace-I18n' is
    read from the site.  If that page does not exist:

    * for a site whose name does not contain 'wikipedia', the lookup is
      repeated on the Wikipedia of the same language (English Wikipedia for
      incubator, meta, commons and species); a non-default result is cached
      for *wiki_site* too, so the missing page is not requested again on
      every call;
    * for a Wikipedia, the default text is cached and returned.

    Any other error yields the default text without caching it.
    """
    try:
        if wiki_site in summaries:
            cached = summaries[wiki_site]
            if time.time()-cached[1] < 3600:  # reload the summary if it's over an hour old
                return cached[0]
        pl = wikipedia.Page(wiki_site, u'User:CommonsDelinker/replace-I18n')
        try:
            x = pl.get()
            summaries[wiki_site] = [x, time.time()]
            return x
        except wikipedia.NoPage:
            if 'wikipedia' not in str(wiki_site):
                lang = str(wiki_site).split(':')[-1]
                if lang in ('incubator', 'meta', 'commons', 'species'):
                    new_site = wikipedia.Site('en', 'wikipedia')
                else:
                    new_site = wikipedia.Site(lang, 'wikipedia')
                fallback = summary(new_site)
                # The default may come from a transient error in the nested
                # call, so only a real localized text is remembered here.
                if fallback != summaries['default']:
                    summaries[wiki_site] = [fallback, time.time()]
                return fallback
            summaries[wiki_site] = [summaries['default'], time.time()]
            return summaries['default']
    except Exception:
        # Best effort: never let a summary lookup abort an edit, but do not
        # swallow KeyboardInterrupt/SystemExit as a bare except would.
        return summaries['default']
def _image_pattern(img):
    """Compile a regex matching the file name *img* as written in wikitext.

    The first character matches in either letter case, every space or
    underscore matches either of the two, and all other characters are
    matched literally.
    """
    first = img[0]
    pieces = [re.escape(piece) for piece in re.split('[ _]', img[1:])]
    return re.compile('['+re.escape(first.upper())+re.escape(first.lower())+']'+'[ _]'.join(pieces))


def replace_image(img, pg, newimg):
    """Replace every use of image *img* on page *pg* with *newimg* and save.

    Parameters:
        img    -- title of the image to be replaced
        pg     -- page object to edit
        newimg -- title of the replacement image

    Nothing is done when the command text contains '{{stop}}', when a
    non-SVG image would be replaced by an SVG, or when the wiki of *pg* (other
    than Commons) has a local 'Image:' page with the same name.  The outcome
    of a save attempt is logged through record() in a new thread.

    *pg* is listed in ``editing`` while it is being processed and is always
    removed again, including on early returns and on errors, so that other
    threads waiting for the page are not blocked forever.  Always returns
    None.
    """
    tocon = 'a'*14  # placeholder handed to record() as its `hora` argument
    if not canIedit():
        return None
    print ('Replacing image', img, pg, pg.site(), newimg)
    fix = wikipedia.Page(pg.site(), img)
    img = fix.titleWithoutNamespace()
    ext1 = img.split('.')[-1]
    ext2 = newimg.split('.')[-1]
    print('Extensions: '+ext1+', '+ext2)
    if ext2.lower() == 'svg' and ext1.lower() != 'svg':
        print('Ignoring non-SVG to SVG replacement.')
        return None
    newimg = wikipedia.Page(pg.site(), newimg).titleWithoutNamespace()
    # NOTE: check-then-append is not atomic; two threads can still slip
    # through together.  It only narrows the window for edit conflicts.
    while pg in editing:
        time.sleep(3)
    editing.append(pg)
    try:
        msg = summary(pg.site())
        msg = msg.replace('$1', img)
        msg = msg.replace('$2', newimg)

        wikipedia.output(pg.title())
        if pg.namespace() in [99999999]:
            return None
        txt = pg.get()
        if pg.site() != wikipedia.Site('commons', 'commons'):
            ce = wikipedia.Page(pg.site(), 'Image:'+img)
            if exists(ce):
                # A local file of the same name exists on this wiki; leave it alone.
                print('Pulling out')
                return None

        pattern = _image_pattern(img)
        print([pattern.pattern])
        posis = pattern.findall(txt)
        print(posis)
        # Single pass with a function replacement: text that was already
        # replaced is never matched again, and backslashes in the new name
        # are taken literally.
        newTxt = pattern.sub(lambda match: newimg, txt)

        if txt == newTxt:
            #thread.start_new_thread(record, (tocon, pg, img, newimg, "skipped"))
            wikipedia.output( u'No match: '+pg.site().hostname()+'/wiki/'+pg.urlname() )
            return None
        try:
            # We want to make sure the bot's user page is not empty on this
            # wiki; 'canedit.cdl' lists the sites already taken care of.
            filename = 'canedit.cdl'
            f = codecs.open(filename, 'r', 'utf-8')
            try:
                ftxt = f.read()
            finally:
                f.close()
            if '#'+str(pg.site()) not in ftxt:
                userpage = wikipedia.Page(pg.site(), 'User:CommonsDelinker')
                if not exists(userpage):
                    userpage.put('#Redirect[[m:User:CommonsDelinker]]', '')
                f = codecs.open(filename, 'a', 'utf-8')
                try:
                    f.write('#'+str(pg.site()))
                finally:
                    f.close()
            wikipedia.showDiff(txt, newTxt)
            pg.put(newTxt, msg)
            thread.start_new_thread(record, (tocon, pg, img, newimg, "ok"))
        except wikipedia.LockedPage:
            thread.start_new_thread(record, (tocon, pg, img, newimg, "failed"))
            print('Page is locked')
    finally:
        while pg in editing:
            editing.remove(pg)
def checkUsage(image, newimg):
    """Find the pages using *image* and start a replacement thread for each.

    The CheckUsage tool's HTML output is fetched and split into one chunk per
    project and one per page.  Each page URL is mapped to a (language, family)
    pair and a page title.  For sites with an entry in ``config.usernames``,
    replace_image(image, page, newimg) is started in a new thread once the
    page is no longer listed in ``editing``; other sites are skipped.
    """
    print ('check usage', image, newimg)
    imageU = wikipedia.Page(wikipedia.getSite(), image).urlname()
    ch = pageText('http://tools.wikimedia.de/%7Edaniel/WikiSense/CheckUsage.php?i='+imageU+'&w=_100000#end').decode('utf-8')
    projs = ch.split("class='project'")[1:]
    print (str(len(projs))+ u' projects for', image)

    baseR = u'wik(?:i[mp]edia|ibooks|tionary|iquote|inews|isource|iversity)'
    wpR = u'http://(commons|incubator|meta|species|www|[^\\.]*)\\.('+baseR+u')\\.org/wiki/(.*)'
    # Subdomains whose language and family codes are one and the same name.
    special = {
        u'commons': 'commons',
        u'meta': 'meta',
        u'incubator': 'incubator',
        u'www': 'mediawiki',
        u'species': 'species',
    }

    for proj in projs:
        proid = re.findall('http://([^\\.]*\\.'+baseR+'\\.org)', proj)[0]
        tabla = proj.split("<div class='page'>")[1:]
        wikipedia.output(u'Checkusage returns '+str(len(tabla))+' for '+image+' in '+proid+'.')
        for ta in tabla:
            url = ta.split('</div>')[0].split('<a href="')[1].split('?uselang=en"')[0]
            wikipedia.output( wpR )
            wp = re.findall(wpR, url)
            print(wp)
            if wp == []:
                continue
            subdomain, family, title = wp[0]
            code = special.get(subdomain)
            if code is not None:
                lang = family = code
            else:
                lang = subdomain
            pagelink = wikipedia.Page(wikipedia.Site(lang, family), title)
            try:
                # The lookup only checks that an account is configured for
                # this site; a KeyError means there is none, so skip the page.
                config.usernames[family][lang]
                while pagelink in editing:
                    time.sleep(3)
                thread.start_new_thread(replace_image, (image, pagelink, newimg))
            except KeyError:
                continue
 
# Command strings RUN() has already handled, so they are not dispatched twice.
hechas=[]
# Titles of the Commons pages from which RUN() reads commands ("fiables" = trusted).
fiables=['User:Orgullobot/commands']
def RUN():
    """Refresh the cached command page and dispatch every new command.

    Each page named in ``fiables`` is read from Commons and written to
    './commons-commands.txt'.  The cached text is then split on '{{'.  Every
    chunk not yet listed in ``hechas`` is remembered there, and for a
    'universal replace' / 'universal_replace' command a checkUsage(img,
    newimg) thread is started, followed by a 5 second pause.

    Chunks are never removed from the list being iterated, so a new command
    that follows already handled ones is not skipped.  A replace command with
    fewer than two arguments is reported and ignored instead of raising
    IndexError.
    """
    for fiable in fiables:
        cmd = wikipedia.Page(wikipedia.Site('commons', 'commons'), fiable)
        texto = cmd.get()
        lfile = codecs.open('./commons-commands.txt', 'w', 'utf-8')
        try:
            lfile.write(texto)
        finally:
            lfile.close()
    texto = getcommands()
    # Everything before the first '{{' is not a command.
    chuletas = texto.split('{{')[1:]
    for chuleta in chuletas:
        if chuleta in hechas:
            continue
        hechas.append(chuleta)

        campos = chuleta.split('|')
        if campos[0].lower() not in ('universal replace', 'universal_replace'):
            continue
        if len(campos) < 3:
            wikipedia.output(u'Skipping malformed command: '+chuleta)
            continue
        img = campos[1]
        newimg = campos[2].split('}}')[0]
        thread.start_new_thread(checkUsage, (img, newimg))
        time.sleep(5)
# Main loop: process the command page, then wait a minute, forever.
while True:
    RUN()
    time.sleep(60)
Personal tools