# -*- coding: utf-8 -*-
'''
Critical issues:
* max. 10 threads to not make too many edits and keep database connections below limit
* www.mediawiki.org must be supported
FIXMEs:
* Right after getting the CheckUsage results, it starts retrieving [[User:CommonsDelinker/replace-I18n]]. It does this multiple times for each site, which is quite inefficient.
'''
import wikipedia, config, codecs
import urllib2, re, time, thread
import MySQLdb
# English month names, 1-based: index 0 is an empty placeholder.
months = [
    '',
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

# Edit-summary templates. 'default' is the fallback text; $1 and $2 are
# substituted with the old and new image names.
summaries = {
    'default': u'[[w:commons:User:Orgullobot/commands|Bot]]: Replacing $1 with $2. [[m:User:CommonsDelinker|Translate me]] [[User:CommonsDelinker/replace-I18n|here]]!',
}

done = []

# Pages the bot is currently editing/checking, used to avoid edit conflicts
# between the threads.
# Note: This is NOT a good way to do this.
editing = []

# Pages already confirmed to exist, so they are not checked over again.
existentes = []
def pageText(url):
    """Download *url* and return the raw response body.

    The request is sent with a browser-like User-Agent header.

    url -- absolute URL to fetch.

    Returns the body exactly as read from the response (not decoded).
    Errors raised by urllib2 while opening or reading propagate to the caller.
    """
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.12) Gecko/20050915 Firefox/1.0.7'
    request = urllib2.Request(url)
    request.add_header("User-Agent", user_agent)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails part-way through.
        response.close()
def checanombres(page):
    """Return the namespace number of *page* as reported by the site's API.

    page -- page object providing site().hostname() and urlname().

    The API reply is parsed by plain string splitting: the value of the
    ns="..." attribute of the first <page ...> element is returned as an int.

    Raises IndexError when the reply contains no <page ...> element or no
    ns attribute, and ValueError when the attribute is not numeric. Errors
    from pageText() propagate unchanged.
    """
    crudo = pageText('http://'+page.site().hostname()+'/w/api.php?action=query&prop=revisions&titles='+page.urlname()+'&rvprop=content&format=xml')
    # Attribute text of the first <page ...> element.
    carne = crudo.split('<page ')[1].split('>')[0]
    ns = carne.split('ns="')[1].split('"')[0]
    return int(ns)
# Start-up check: looks up the namespace number of one hard-coded es.wikipedia
# page through checanombres() and logs the result.
# NOTE(review): this runs at import time and goes through checanombres(), so it
# needs network access; presumably a debugging leftover -- confirm before removing.
pagelink=wikipedia.Page(wikipedia.Site('es', 'wikipedia'), 'Wikipedia:Putos')
wikipedia.output( u'ChecaNombres: '+str(checanombres(pagelink)) )
# Log entries queued by record() until they are written to the database.
records=[]
def record(hora, page, img, new_image_name="NULL", status="ok"):
    """Queue one log entry; flush the queue to MySQL once it exceeds 50 entries.

    hora           -- value stored in the first column of the log row.
    page           -- page object that was processed.
    img            -- name of the image that was replaced.
    new_image_name -- name of the replacement image (default "NULL").
    status         -- outcome label stored with the row (default "ok").

    Flushing takes one slot in dbthrottle.txt (via canIDB) for the whole
    batch and gives it back when done, whether or not the flush succeeded.
    Rows are inserted with a parameterized query, so no manual escaping is
    needed. Only the entries that were actually written are removed from
    the queue; if the flush fails they stay queued and the error propagates.

    Note that checanombres() performs one HTTP request per flushed entry.
    """
    records.append((hora, page, img, new_image_name, status))
    if len(records) <= 50:
        return
    print('Recording...')
    # Work on a snapshot: other threads may keep appending during the flush.
    pending = records[:]
    # One throttle token per flush. Taking one per record would leave
    # tokens behind and block the flush once ten had accumulated.
    rightnow = str(time.time())
    canIDB(rightnow)
    try:
        conn = MySQLdb.connect(host="sql", user="orgullo", passwd="****", db="u_orgullo_logs", charset='utf8', use_unicode=1)
        try:
            cursor = conn.cursor()
            cursor.execute('set names utf8;')
            for hora, page, img, new_image_name, status in pending:
                # str(site) is "family:code"; commons lives on wikimedia.org.
                wikiT = str(page.site()).split(':')
                if wikiT[0] == 'commons':
                    wikiT = ['wikimedia', 'commons']
                wiki = wikiT[1] + '.' + wikiT[0] + '.org'
                fortit = page.titleWithoutNamespace().replace(' ', '_')
                img = img.replace(' ', '_')
                cursor.execute(
                    "insert into delinker VALUES(%s, %s, %s, %s, %s, %s, %s)",
                    (hora, img, wiki, fortit, str(checanombres(page)), status, new_image_name))
            conn.commit()
        finally:
            conn.close()
    finally:
        # Give the throttle slot back (only our own token, once).
        dbt = open('dbthrottle.txt', 'r')
        try:
            dbtt = dbt.read()
        finally:
            dbt.close()
        dbtW = open('./dbthrottle.txt', 'w')
        try:
            dbtW.write(dbtt.replace(rightnow + '\n', '', 1))
        finally:
            dbtW.close()
    # Drop only what was flushed; entries queued meanwhile are kept.
    del records[:len(pending)]
#record(time.time(), wikipedia.Page(wikipedia.getSite(), 'This is a test'), 'Testimage.jpg')
#'%Y-%m-%d %H:%M
#'2006-09-22 21:01'
def exists(page):
    """Tell whether *page* exists, using a single lightweight query.php call.

    Pages found to exist are remembered in ``existentes`` so later calls
    for the same page return True without another request.

    A page counts as existing when the reply carries a non-zero <id>. For
    namespace 6 the reply must additionally contain an '<image ' element.
    """
    # Example query:
    # http://es.wikipedia.org/w/query.php?what=content&titles=Image:Punta%20del%20Este.jpg&aplimit=1&format=xml
    if page in existentes:
        return True
    query_url = ('http://' + page.site().hostname()
                 + '/w/query.php?what=imageinfo&titles=' + page.urlname()
                 + '&aplimit=1&format=xml')
    reply = pageText(query_url)
    page_id = reply.split('<id>')[1].split('</id>')[0]
    namespace = reply.split('<ns>')[1].split('</ns>')[0]
    if page_id == "0":
        return False
    if namespace == "6" and '<image ' not in reply:
        return False
    existentes.append(page)
    return True
def getcommands():
    """Return the contents of the local command file as unicode text.

    Reads 'commons-commands.txt' from the current working directory,
    decoding it as UTF-8.

    Raises IOError when the file cannot be opened and UnicodeDecodeError
    when it is not valid UTF-8.
    """
    lo = codecs.open('commons-commands.txt', 'r', 'utf-8')
    try:
        return lo.read()
    finally:
        # Close the handle even when decoding fails.
        lo.close()
def canIedit():
    """Return False when the command text contains {{stop}}, True otherwise.

    The check is case-insensitive: the command text is lower-cased first.
    """
    return '{{stop}}' not in getcommands().lower()
def canIDB(rightnow, max_slots=10, poll_seconds=10):
    """Wait for a free database slot and claim it in dbthrottle.txt.

    Every line of 'dbthrottle.txt' stands for one slot in use. While the
    file holds *max_slots* lines or more, the call sleeps *poll_seconds*
    and checks again; otherwise it appends ``rightnow`` as a new line.

    rightnow     -- token string written to the file; the caller removes
                    the same token later to release the slot.
    max_slots    -- number of lines at which the file counts as full.
    poll_seconds -- pause between checks, and after an I/O error.

    Returns True once the token has been written. If the file cannot be
    read or written (IOError), sleeps once and returns None without
    claiming a slot.
    """
    try:
        while True:
            throttle = open('dbthrottle.txt', 'r')
            try:
                taken = throttle.read().count('\n')
            finally:
                throttle.close()
            if taken < max_slots:
                throttle = open('dbthrottle.txt', 'a')
                try:
                    throttle.write(rightnow + '\n')
                finally:
                    # Closing flushes the token so other readers see it.
                    throttle.close()
                return True
            time.sleep(poll_seconds)
    except IOError:
        time.sleep(poll_seconds)
        return None
def summary(wiki_site):
    """Return the edit-summary template to use on *wiki_site*.

    The template is the text of [[User:CommonsDelinker/replace-I18n]] on
    that site. When the page is missing on a non-Wikipedia site, the
    Wikipedia of the same language is used instead (the English Wikipedia
    for incubator, meta, commons and species). When it is missing on a
    Wikipedia, the built-in default is used.

    Results are cached in ``summaries`` for one hour. The result of a
    fallback is cached under the requesting site too, so the missing page
    is not fetched again on every call.

    Any other error yields the default template without caching it.
    """
    try:
        cached = summaries.get(wiki_site)
        # Reload the summary if it's over an hour old.
        if cached is not None and time.time() - cached[1] < 3600:
            return cached[0]
        pl = wikipedia.Page(wiki_site, u'User:CommonsDelinker/replace-I18n')
        try:
            text = pl.get()
        except wikipedia.NoPage:
            if 'wikipedia' not in str(wiki_site):
                lang = str(wiki_site).split(':')[-1]
                if lang in ('incubator', 'meta', 'commons', 'species'):
                    new_site = wikipedia.Site('en', 'wikipedia')
                else:
                    new_site = wikipedia.Site(lang, 'wikipedia')
                text = summary(new_site)
                if new_site not in summaries:
                    # The fallback lookup failed without caching anything;
                    # do not pin its default answer to this site for an hour.
                    return text
            else:
                text = summaries['default']
        summaries[wiki_site] = [text, time.time()]
        return text
    except Exception:
        return summaries['default']
def _image_pattern(img):
    """Build the regex source that matches *img* as written in wikitext.

    The first character matches in upper or lower case, every space or
    underscore matches either of the two, and all other characters are
    matched literally.
    """
    first, rest = img[:1], img[1:]
    head = '[' + re.escape(first.upper()) + re.escape(first.lower()) + ']'
    tail = '[ _]'.join([re.escape(part) for part in re.split('[ _]', rest)])
    return head + tail


def replace_image(img, pg, newimg):
    """Replace every use of image *img* on page *pg* with *newimg*.

    img    -- title of the image to replace (namespace prefix allowed).
    pg     -- page object to edit.
    newimg -- title of the replacement image.

    Nothing is done when the command text contains {{stop}}, when a
    non-SVG image would be replaced by an SVG, or, on sites other than
    commons, when a page 'Image:<img>' exists on that site.

    The page is listed in ``editing`` while it is processed and is always
    removed again, also on early return or error, so threads waiting for
    the page are not blocked forever. A successful or locked-page edit is
    logged through record() in a new thread.

    Always returns None. Errors other than wikipedia.LockedPage propagate.
    """
    # Placeholder passed to record() as its first argument.
    tocon = 'a' * 14
    if canIedit() == False:
        return None
    print ('Replacing image', img, pg, pg.site(), newimg)
    img = wikipedia.Page(pg.site(), img).titleWithoutNamespace()
    if not img:
        # Nothing to search for.
        return None
    ext1 = img.split('.')[-1]
    ext2 = newimg.split('.')[-1]
    print('Extensions: ' + ext1 + ', ' + ext2)
    if ext2.lower() == 'svg' and ext1.lower() != 'svg':
        print('Ignoring non-SVG to SVG replacement.')
        return None
    newimg = wikipedia.Page(pg.site(), newimg).titleWithoutNamespace()
    while pg in editing:
        time.sleep(3)
    editing.append(pg)
    try:
        msg = summary(pg.site())
        msg = msg.replace('$1', img).replace('$2', newimg)
        wikipedia.output(pg.title())
        # The list holds a single placeholder namespace number.
        if pg.namespace() not in [99999999]:
            txt = pg.get()
            if pg.site() != wikipedia.Site('commons', 'commons'):
                ce = wikipedia.Page(pg.site(), 'Image:' + img)
                if exists(ce):
                    print('Pulling out')
                    return None
            rx = _image_pattern(img)
            print([rx])
            posis = re.findall(rx, txt)
            print(posis)
            # A function replacement keeps backslashes in newimg literal.
            newTxt = re.sub(rx, lambda match: newimg, txt)
            if txt != newTxt:
                try:
                    ##We want to make sure the userpage is not empty
                    filename = 'canedit.cdl'
                    f = codecs.open(filename, 'r', 'utf-8')
                    try:
                        ftxt = f.read()
                    finally:
                        f.close()
                    if not '#' + str(pg.site()) in ftxt:
                        userpage = wikipedia.Page(pg.site(), 'User:CommonsDelinker')
                        if not exists(userpage):
                            userpage.put('#Redirect[[m:User:CommonsDelinker]]', '')
                        f = codecs.open(filename, 'a', 'utf-8')
                        try:
                            f.write('#' + str(pg.site()))
                        finally:
                            f.close()
                    wikipedia.showDiff(txt, newTxt)
                    pg.put(newTxt, msg)
                    thread.start_new_thread(record, (tocon, pg, img, newimg, "ok"))
                except wikipedia.LockedPage:
                    thread.start_new_thread(record, (tocon, pg, img, newimg, "failed"))
                    print('Page is locked')
            else:
                wikipedia.output( u'No match: '+pg.site().hostname()+'/wiki/'+pg.urlname() )
    finally:
        # Always release the page, whatever happened above.
        while pg in editing:
            editing.remove(pg)
def checkUsage(image, newimg):
print ('check usage', image, newimg)
imageU=wikipedia.Page(wikipedia.getSite(), image).urlname()
path='http://tools.wikimedia.de/%7Edaniel/WikiSense/CheckUsage.php?i='+imageU+'&w=_100000#end'
ch=pageText(path).decode('utf-8')
projs=ch.split("class='project'")[1:]
print (str(len(projs))+ u' projects for', image)
for proj in projs:
baseR=ur'wik(?:i[mp]edia|ibooks|tionary|iquote|inews|isource|iversity)'
proid=re.findall('http://([^\.]*\.'+baseR+'\.org)', proj)[0]
#couldbe=('<i><b>different image', '<i>found:', '<i>local duplicate')
#found=0
#for could in couldbe:
# if could in proj:
# found=1
#if found==1:
# #going on, this project has a local copy
# continue
tabla=proj.split("<div class='page'>")[1:]
wikipedia.output(u'Checkusage returns '+str(len(tabla))+' for '+image+' in '+proid+'.')
for ta in tabla:
ta=ta.split('</div>')[0]
url=ta.split('<a href="')[1].split('?uselang=en"')[0]
wpR=ur'http://(commons|incubator|meta|species|www|[^\.]*)\.('+baseR+')\.org/wiki/(.*)'
wikipedia.output( wpR )
wp=re.findall(wpR, url)
print wp
if wp != []:
if wp[0][0]==u'commons':
wp=['commons', 'commons', wp[0][2]]
elif wp[0][0]==u'meta':
wp=['meta', 'meta', wp[0][2]]
elif wp[0][0]==u'incubator':
wp=['incubator', 'incubator', wp[0][2]]
elif wp[0][0]==u'www':
wp=['mediawiki', 'mediawiki', wp[0][2]]
elif wp[0][0]==u'species':
wp=['species', 'species', wp[0][2]]
else:
wp=wp[0]
pagelink=wikipedia.Page(wikipedia.Site(wp[0], wp[1]), wp[2])
try:
user=config.usernames[wp[1]][wp[0]]
#retirar_imagen(image, pagelink, admin)
while pagelink in editing:
time.sleep(3)
thread.start_new_thread(replace_image, (image, pagelink, newimg))
except KeyError:
continue
# Command strings RUN() has already seen, so they are not dispatched twice.
hechas=[]
# Titles of the Commons pages RUN() reads commands from.
fiables=['User:Orgullobot/commands']
def RUN():
    """Fetch the command pages and dispatch every command not seen before.

    For each title in ``fiables`` the page is read from Commons and saved to
    './commons-commands.txt' (the file getcommands() and canIedit() read).
    The text is then split on '{{'. Each chunk not yet in ``hechas`` is
    remembered there, and a 'universal replace' / 'universal_replace'
    command with an image and a replacement starts checkUsage() in a new
    thread, followed by a 5 second pause.

    Chunks already handled are skipped without modifying the list being
    iterated, so a new command that follows old ones is never missed.
    Commands without both an image and a replacement are ignored.
    """
    for fiable in fiables:
        cmd = wikipedia.Page(wikipedia.Site('commons', 'commons'), fiable)
        texto = cmd.get()
        lfile = codecs.open('./commons-commands.txt', 'w', 'utf-8')
        try:
            lfile.write(texto)
        finally:
            lfile.close()
        texto = getcommands()
        # The text before the first '{{' is not a command.
        for chuleta in texto.split('{{')[1:]:
            if chuleta in hechas:
                continue
            hechas.append(chuleta)
            campos = chuleta.split('|')
            if campos[0].lower() not in ('universal replace', 'universal_replace'):
                continue
            if len(campos) < 3:
                # Malformed command: image or replacement is missing.
                continue
            img = campos[1]
            newimg = campos[2].split('}}')[0]
            thread.start_new_thread(checkUsage, (img, newimg))
            time.sleep(5)
# Main loop: process the command page, wait a minute, repeat forever.
while True:
    RUN()
    time.sleep(60)