#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Inspired by
http://www.ducdigital.com/2009/11/24/massive-download-from-pixiv/
modified by Nandaka. http://nandaka.wordpress.com
further modified by Kaens Bard http://kaen.su
Works well in Python 2.6.x and 2.7.x
Run cmd.exe, then:
pip install html5lib
pip install beautifulsoup4
pip install PySocks
pip install "https://github.com/spiiin/mechanize/archive/master.zip"
/!\ If you have previously installed mechanize from elsewhere, kill it (pip uninstall mechanize, [y]) before you do this last one. This is for SOCKS proxy support. Also find this in this script: "#socket module patch"; follow the instructions there.
Usage:
Set your pixiv account language to Japanese (the English translation is currently too unstable to parse reliably; Chinese is not supported).
EITHER:
create a text file (UTF-8 without BOM) in the script's folder and list all the links you want mass-downloaded/updated, one per line,
OR:
pass the links directly on the command line.
Set start_page and quickcheck in pixivUtil.ini, or toggle quickcheck on the command line (+q on, -q off).
Run the script with the file name(s) as parameter(s); links may be mixed in as parameters and will be processed too.
The links accepted in the list file have been tested as working for:
- user gallery pages
- your own bookmark pages (make sure pixivUtil.ini is set up with your own profile for that)
- tag searches
- ranking pages
- user stacc → user gallery (a bare user ID is also accepted as a list name; it is checked only after list file names)
- user profile → user gallery
Additionally, a user's (or your own) okiniiri list can be enumerated from the profile into a local file in list format.
If you create the list file with Windows Notepad, avoid Unicode characters in the links.
"""
import re
import os
import sys
import codecs
import socks
import socket
#socket module patch: every connection is routed through a SOCKS5 proxy.
#Edit the host/port in setdefaultproxy() below to match your own proxy,
#or set socksproxyenabled = 0 and comment this block out to connect directly.
def create_connection(address, timeout=None, source_address=None):
sock = socks.socksocket()
sock.connect(address)
return sock
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "localhost", 3128)
socket.socket = socks.socksocket
socket.create_connection = create_connection
socksproxyenabled = 1
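#A quick way to confirm the patch took effect (a sketch; assumes the proxy
#configured above is actually reachable) is to fetch any page through it:
#  import urllib2
#  print urllib2.urlopen('http://www.pixiv.net/', timeout=10).getcode()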
import urllib2
import mechanize
import time
import random
import calendar
from datetime import date,datetime
from mechanize import Browser,ProxyHandler
from bs4 import BeautifulSoup
from sockshandler import SocksiPyHandler
import ConfigParser
version = '2016-05-14'
#-------Defaults
url = 'http://www.pixiv.net/'
proxyAddress = ''
proxyIsSOCKS = 0
proxies = {}
username = ''
password = ''
useragent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:16.0) Gecko/16.0 Firefox/16.0'
debugHttp = False
numberOfPage = 0
startPage = 1
quickCheck = True
useRobots = False
genericfilename_format = r"_%pageline%/%artist% (%artist_id%)/%image_id% - %title%"
member_illust_format = r"(%artist_id%)/%image_id% - %title%"
bookmark_new_illust_format = r"[Following tracker]/%image_id% (%artist_id%) - %title%"
response_format = r"%inurl_id% res/%image_id% (%artist_id%) - %title%"
ranking_format = r"[%today%'s %inurl_id%'s best]/[#%num%] %image_id% (%artist_id%) - %title%"
description_format = u"Title:%title%\\n%Tags:%tags%\\nCommentary:%remarks%"
tokentag = r""
overwrite = False
logFile = ""
descript_ion = False
description_file = u"descript.ion"
blacklist = r"[腐 【腐 腐】 腐] 腐注 腐向 ※腐 BL注" #yaoi cut
min_id = 0 #lowest id to download down to on long downloads; useful if you ran into the 2000-latest okiniiri limit
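#Tokens available in the *_format strings above (substituted in makeFilename() below):
#  %pageline% %artist% %artist_id% %member_id% %image_id% %title% %inurl_id% %today% %date% %num%
#description_format takes %title%, %tags% and %remarks% instead (see makeDescription()).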
br = Browser()
pr = None
fQuickCheck = False
curPage = 0
ugoira = False
noOfImages = 0
debugid = 0
#-----------Start Download, input: image ID, type: VOID, save to disk
def downloadImage(id):
global br
global fQuickCheck
global curPage
global ugoira
global tokentag
if quickCheck and fQuickCheck:
return
print "\nGetting file id: " + str(id) + "\t\t",
tickity=0 #DEBUG
while True:
try:
mediumPage = br.open(url+"member_illust.php?mode=medium&illust_id="+str(id),timeout=10)
if logFile == "": print chr(8)+'`',
else: print '`',
ttt=mediumPage
mediumPage = mediumPage.read()
#time.sleep(random.random()*10+0.4) #i'm a human. human, not a robot. okay? right.
if logFile == "": print chr(8)+'-',
else: print '-',
parseTitle = BeautifulSoup(mediumPage,"html5lib")
if logFile == "": print chr(8)+'.',
else: print '.',
tickity += 1
if mediumPage.find(r"rkn.gov.ru")>-1:
print "...PKH: error 451, skipping..."
return
if mediumPage.find(r"例外エラーが発生しました")>-1:
print "...pixiv: unexpected error occurred, retrying..."
raise
elif mediumPage.find(r"該当作品は削除されたか、存在しない作品IDです。")>-1:
print "...pixiv: submission taken down by artist, skipping..."
return
elif mediumPage.find(r"マイピクにのみ公開されています")>-1:
print "...pixiv: submission MyPixiv-only, skipping..."
return
#DEBUG
if tickity==20:
print "-wrote tickity "+str(id)+" file-",
lolfile=open('tickity '+str(id),'wb+')
lolfile.write(str(parseTitle)) #str() renders the soup back to UTF-8 bytes
lolfile.close()
#/DEBUG
break
except:
if id == debugid:
print 1,
else:
print '.',
tickity += 1
if tickity == 20:
print "-wrote tickity "+str(id)+" file-",
lolfile=open('tickity '+str(id),'wb+')
lolfile.write(mediumPage)
lolfile.close()
time.sleep(5)
anilookup = parseTitle.find('div',{'class':'_ugoku-illust-player-container'})
ugoira = anilookup != None
if id == debugid:
f=open("logofpages","wb+")
f.write(str(parseTitle))
f.close()
#parse artist
try:
artist = parseTitle.find('div',{'class':'_unit profile-unit'}).h1.contents[0]
print "artist: ",artist
except UnicodeError:
print "(not supported by console)"
except (LookupError, AttributeError):
if mediumPage.find("エラーが発生しました")>-1:
print "...denied by pixiv server, skipping"
else:
print "...Oops. Submission was taken down by the artist while downloading the rest OR another error occurred."
f=open("submission %s down" % (id),"a+")
f.write(mediumPage)
f.close()
return
except:
print "...artist parsing failed, SKIPPING"
time.sleep(5)
return
#parse commentary
#print "Commentary:", #better not >_>
try: artist_id = parseTitle.find('div',{'class':'_unit _work-detail-unit'}).find('a',{'class':"tab-feed"})['href'].split('/')[-1]
except: print "artist_id not found:",parseTitle.find('div',{'class':'_unit _work-detail-unit'})
try: works_caption = parseTitle.find('p',{'class':'works_caption'}).getText(separator=u'\\n')
except: works_caption=u'n/a'
#parse tags, for blacklist as well
tagsline=u'' #for descript.ion
tagslist=[] #for log
addtokentag=False #for the token tag
try:
for x in parseTitle.find('span',{'class':'tags-container'}).find_all('a',{'class':'text'}):
if not x.string: continue
if x.string==u'*':
tagsline+=x.string
else:
tagslist+=[x.string]
tagsline+=x.string+' '
if not addtokentag:
addtokentag = (tokentag==x.string)
except:
print "...tag parsing failed, retrying"
if tagsline==u'': tagsline=u'(n/a)'
print "Tags:",
for x in tagslist:
try:
print x,
except UnicodeError:
print "-",
except LookupError:
print "-",
for i in blacklist.decode("utf8").split(" "):
if (len(i) > 0) and (x.find(i) >= 0):
try:
print "blacklisted by %s, skipping..."%(i)
except:
print "blacklisted, skipping..."
return
print ""
#parse imagedate and number of manga pages
manga = False
classmeta = parseTitle.find('ul',{'class':'meta'})
try:
if classmeta != None:
imagedate = classmeta.find(text=re.compile(r"(\d{4}年\d*月\d*日 \d{2}:\d{2}|\d*/\d*/\d{4} \d{2}:\d{2})".decode('utf8')))
else:
print "WARNING: Either pixiv changed page format or page not found"
z = re.search(r"(\d{4})年(\d*)月(\d*)日 (\d{2}):(\d{2})".decode('utf8'),imagedate)
if z: # Japanese date
imagedate = datetime(int(z.group(1)),int(z.group(2)),int(z.group(3)),
int(z.group(4)),int(z.group(5)))
else:
imagedate = classmeta.find_all('li',text=re.compile(r"\d{2}/\d{2}/\d{4} \d{2}:\d{2}".decode('utf8')))[0].split("\n")[0]
z = re.search(r"(\d*)/(\d*)/(\d{4}) (\d{2}):(\d{2})".decode('utf8'),imagedate)
if z: # American date
imagedate = datetime(int(z.group(3)),int(z.group(1)),int(z.group(2)),
int(z.group(4)),int(z.group(5)))
except Exception,e:
print "(a)",str(e),
print "(timestamp not found, assuming now/UTC)",
imagedate = datetime.utcnow()
if (not ugoira) and (parseTitle.find('img',{'class':'original-image'}) is None):
manga = True
mangapages = 1
if classmeta != None:
mlookup = classmeta.find(text=re.compile(r"複数枚投稿 \d*P".decode('utf8')))
if mlookup != None:
mangapages = int(mlookup.split(" ")[1][:-1])
imagedates = imagedate.strftime('%Y-%m-%d %H:%M')
print "Date:",imagedates,
#parse title
title = "untitled" #default in case parsing below fails
try:
#for a in parseTitle.find_all('h1',{'class':'title'}): print a.text
title = parseTitle.find_all('h1',{'class':'title'})[2].text
print "title:",title
except UnicodeError:
print "(not supported by console)"
except LookupError:
print "(unknown console encoding)"
except:
title = "untitled"
#parse actual image(s)
if ugoira:
z = re.search(r"pixiv\.context\.ugokuIllustFullscreenData.+(http:.+ugoira1920x1080\.zip)",mediumPage)
if z:
anilink = z.group(1).replace('\\',"")
else:
print "Failed to find the Cinematic zip, skipping"
return
works_display = parseTitle.find('div',{"id":"wrapper"})
if not manga:
tickity = 0
if not ugoira: #ergo, a single pic
if parseTitle.find("再度ログインしなおしてください".decode("utf8")):
print "Attempting to re-login...",
if login(username, password) == 0:
print "success!"
if manga:
print "Getting manga,",mangapages,"pages..."
imgList = []
if mangapages>1:
for i in range(mangapages):
imgList.append("member_illust.php?mode=manga_big&illust_id="+str(id)+"&page="+str(i))
else:
imgList.append("member_illust.php?mode=big&illust_id="+str(id))
elif ugoira:
imgList = [anilink]
else: #ergo, single page
imgList = [parseTitle.find('img',{"class":"original-image"})]
for imgFile in imgList: #each imgFile becomes a big page link-to-follow in case of manga ._.
if quickCheck and fQuickCheck:
break
ext = '#'
if ugoira:
ext = re.sub(r'http:.*/(\d+_ugoira.*\.zip)',r'\1',anilink)
elif manga:
while 1:
try:
req = urllib2.Request(url+imgFile)
req.add_header("Referer", url+"member_illust.php?mode=medium&illust_id="+str(id))
viewPage = br.open(req)
parser = BeautifulSoup(viewPage.read(),"html5lib")
imgFileM = parser('img')[0]
ext = os.path.basename(imgFileM['src'])
break
except Exception,e:
print "(c)",str(e),
if str(e).find('global name')>-1:
print "...skipping..."
break
if str(e).startswith('HTTP Error 404'):
print "[404]The submission is rendered unloadable, skipping..."
break
if str(e).startswith('HTTP Error 400'):
print "[400]The submission is rendered unloadable, skipping..."
break
time.sleep(5)
else: #not manga or cine
try:
ext = os.path.basename(imgFile['data-src'])
except Exception,e:
print "No original image link found..."
if ext.split('.')[0].startswith(str(id)):
image_id = ext.split('.')[0]
if manga:
#for comfort browsing of manga stuff, zero-pad it to 2 digits:
z = re.search(r'_p(\d*)',image_id)
if z:
image_id = re.sub(r'_p\d*','_p{0:02}'.format(int(z.group(1))),image_id)
elif ugoira:
image_id = re.sub(r'http://.*/(\d*)_ugoira.*',r'\1_ani',anilink)
global _pager
fileName = makeFilename(_pager, id, artist_id, artist, title, image_id, noOfImages, imagedates, addtokentag)
fileName = fileName+"."+ext.split('.')[1].split('?')[0]
print 'Saving to:',
fileName = sanitizeFilename(fileName)
try:
print fileName
except UnicodeError:
print "(not supported by console)"
except LookupError:
print "(unknown console encoding)"
if manga:
dl(imgFileM['src'], fileName, viewPage.geturl(), imagedate)
elif ugoira:
dl(anilink, fileName, ttt.geturl(), imagedate)
else:
dl(imgFile['data-src'], fileName, br.geturl(), imagedate)
#descript.ion update time
# assuming UTF-8 (so what if it doesn't work in most two-panel commanders and isn't in the spec at http://jpsoft.com/ascii/descfile.txt? It just won't work otherwise, so there)
# Using ^D for description line breaks the way Ghisler's Total Commander 7.55a does, instead of what the spec says.
# Likewise, EoF characters aren't specially processed.
# Assuming the file is available for writing.
# I hate everything about this implementation.
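# A stored entry thus ends up looking like this (hypothetical name; "\n" is the
# literal two-character sequence):
#   "123 - t.jpg" Title:t\nTags:tag1 tag2 \nCommentary:hi
# followed by the \x04\xC2 multi-line marker and a newline, supplied by _descmagic below.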
if descript_ion:
dfile = os.path.dirname(fileName)+'/'+description_file
_descmagic='\x04\xC2\x0A' #writelines() will autoreplace \x0A with \x0D\x0A on w32 O_o
if os.path.exists(dfile):
descfile = open(dfile,'r')
curdesc=descfile.readlines()
descfile.close()
else:
curdesc=[]
notyet=True
for x in curdesc:
if x.find(os.path.basename(fileName).encode('utf-8'))>-1:
notyet=False
break
if notyet:
print 'Updating descript.ion...'
curdesc.append( makeDescription( os.path.basename(fileName), title, tagsline, works_caption ).encode('utf-8') + _descmagic )
descfile = open(dfile,'w')
descfile.writelines(curdesc)
descfile.close()
curdesc=[]
ttt.close()
#-----------List all images
#@profile
def downloadAllImages(pager):
global fQuickCheck
global _pager
_pager = pager
print "Getting pages from ", pager
fQuickCheck = False
global startPage
global curPage
curPage = startPage
hasMorePage = 1
global noOfImages
noOfImages = 1
id = None
previd = [0]
weirdvar = 5
relogined = 0
listpage = '見つかりませんでした' #pixiv's "nothing found" marker
while (hasMorePage != 0) and (weirdvar > 0):
if quickCheck and fQuickCheck:
break
print "\nListing page #%d\t" % (curPage),
while True:
try:
listPage = br.open(pager+"&p="+str(curPage),timeout=10)
if logFile == "": print chr(8)+'`',
else: print '`',
ttt=listPage
listPage=listPage.read()
ttt.close()
if logFile == "": print chr(8)+'-',
else: print '-',
#time.sleep(random.random()*10+2) #netiquette wwwhy is it here at all
parseList = BeautifulSoup(listPage,"html5lib")
if logFile == "": print chr(8)+'.',
else: print '.',
break
except Exception, e:
print "(d)",str(e)
if str(e).find("403"):
print "Attempting to re-login...",
if login(username, password) == 0:
print "success!"
else:
print '.',
time.sleep(5)
itsranking = False
itsbookmarks = False
illust_c = parseList.find('li',{'class':r'image-item'})
#print "* * * illust_c:",illust_c
if illust_c == None: #bookmarks?
illust_c = parseList.find('div', {'class':re.compile(r'^display_works')})
if illust_c == None:
#if illust_c == None:
# illust_c = parseList.find('ul', {'class':re.compile(r'image-items^')})
illust_c = parseList.find('div', {'class':re.compile('^ranking-items')})
if illust_c == None: #ranking?
if listPage.find(listpage)>-1:
print "End of list"
else:
print "Unknown webpage design, ask the dev to support it!"
return
else:
itsranking = True
print "Parsed as ranking"
else:
itsbookmarks = True
print "Parsed as bookmarks"
else:
print "Parsed as generic"
if illust_c != None: #found
#if itsranking: illust = illust_c.find_all('a',href=re.compile(r'illust_id=\d*'))
##else:
illust = parseList.find_all('a',{'class':re.compile('^work')})
for link in illust:
if quickCheck and fQuickCheck:
break
try:
id = link['href'].split('=')[2].split('&')[0].split('_')[0].split('?')[0]
except IndexError:
if link['href'].find('response.php')>-1:
continue
print link['href'], 'has failed: unable to pick illust_id'
if (id in previd) or (int(id) < 11):
continue #skip ids already seen, and obviously invalid ids
if int(id) < min_id:
print "Lower id than minimum, stopping"
break
print "#"+ str(noOfImages)+':',
downloadImage(id)
previd.append(id)
noOfImages = noOfImages + 1
hasMorePage = len(illust)
curPage += 1
if numberOfPage == curPage:
hasMorePage = 0
elif pager.find("bookmark_new_illust.php") and (curPage > 100):
hasMorePage = 0
elif id != None:
if int(id) < min_id:
hasMorePage = 0
else:
if relogined==0:
print "\nNothing found on the page (div class * parsing error?), retrying.\n"
weirdvar -= 1
time.sleep(5)
if weirdvar==0:
print "This is taking too long, attempting re-login..."
configBrowser()
if login(username, password) == 0:
print "success!"
relogined = 1
weirdvar = 5
else:
print "That didn't help, skipping.\n"
weirdvar = 0
print "Listing complete"
previd = [0]
re.purge()
#-----------Download file
def dl(url,filename,referer=None,imagedate=None):
if imagedate is None: imagedate = datetime.now()
#circumventing some weirdass bug where we're downloading the same thing twice
global prevname
global fQuickCheck
if quickCheck and fQuickCheck:
return
#url = re.sub(r'_p(\d*)\.',r'_big_p\1.',url) #add big_ to manga #OBSOLETED
try:
print "Downloading:", url,
except LookupError:
print "(unknown console encoding)"
if os.path.exists(filename) and os.path.isfile(filename):
if quickCheck:
fQuickCheck = True
print "\tFile exists, quick check-skipping the rest.\n"
else:
print "\tFile exists!\n"
return
print " "
print "Trying to request ",
req = urllib2.Request(url)
if referer != None: req.add_header("Referer", referer)
while True:
try:
res = br.open(req,timeout=10)
break
except Exception, e:
if str(e).startswith('HTTP Error 404'):
url = re.sub(r'_big_p(\d*)\.',r'_p\1.',url) #remove big_ from manga back >_>
req = urllib2.Request(url)
if referer != None:
req.add_header("Referer", referer)
print "\nURL change to",url
while True:
try:
res = br.open(req,timeout=10)
break
except Exception, e:
if str(e).startswith('HTTP Error 404'):
print "Error 404 on fullsize, skipping the picture..."
return
else:
print "(e)",str(e),
print '.',
time.sleep(5)
else:
print "(f)",str(e),
print '.',
time.sleep(5)
dir = os.path.dirname(filename)
if not os.path.exists(dir):
try:
print " Creating directory", dir,
except UnicodeError:
print "(not supported by console)",
except LookupError:
print "(unknown console encoding)",
os.makedirs(dir)
fretry = False
fretrying = True
save = open(filename, "w+b", 32768)
while fretrying:
try:
prev = 0
if logFile == "":
print '{0:10d} bytes'.format(prev),
while 1:
save.write(res.read(1024*256))
curr = save.tell()
if logFile == "":
print '\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b',
print '{0:10d} bytes'.format(curr),
if curr == prev:
fretrying = False
break
prev = curr
except IOError:
if not fretry:
fretry = True
#save.seek(0,0)
#save.truncate()
save.flush()
res = br.open(req,timeout=10)
#print "and on...",
else:
print ".",
time.sleep(5)
save.close()
if logFile != "":
print '{0:10d} bytes'.format(curr),
os.utime( filename, (calendar.timegm(imagedate.timetuple()), calendar.timegm(imagedate.timetuple())) )
print " done"
#-------------produce an artist's okiniiri list.txt
def dlOkiniiriList(pager):
curpage=1
gotlist=''
artistid=''
print "Retrieving okiniiri list at:", pager,
while True:
req = urllib2.Request(pager+'&p=%d'%(curpage))
while True:
try:
res = br.open(req,timeout=10)
break
except Exception, e:
if str(e).startswith('HTTP Error 404'):
print "Not found, skipping."
return
else:
print '.',
time.sleep(5)
while True:
try:
parseList = BeautifulSoup(res.read(),"html5lib")
break
except:
time.sleep(5)
print ',',
if artistid=='':
artistid = re.search( r'^http://i\d*\.pixiv\.net/img\d*/profile/([^/]*)/(mobile/)?.*', parseList.find('div',{'class':'members'}).find('img')['src'] ).group(1)
print '\nUser='+artistid, 'Pages:',
list_person = parseList.find_all('div',{'class':'usericon'})
for au in list_person:
z = re.search( r'^http://i\d*\.pixiv\.net/img\d*/profile/([^/]*)/(mobile/)?.*', au.find('img')['src'] )
if z: tehid = z.group(1)
else:
print "~", #extracting stacc name from artist's page
q=urllib2.Request(url+au.find('a')['href'])
try:
a=br.open(q,timeout=10)
p=BeautifulSoup(a.read(),"html5lib")
p=p.find('div',{'class':'extaraNavi'}).find('a',href=re.compile(r'.*net/stacc/.*'))['href']
tehid=p.split('/')[-1]
#print tehid,
except:
print "?", #failed to extract. bah, forget all this omfgletsfinditnomatterwhat shit, use full link
tehid=url+'member_illust.php?id='+au.find('a')['href'].split('=')[1]
gotlist+= tehid+'\x0A'
if list_person:
print curpage,
curpage+=1
else:
filename = 'Okiniiri of %s.txt'%(artistid)
print "\nSaving the list as",filename
open(filename,'w+').write(gotlist.encode('utf8'))
return
#-------------process list.txt
def processList(filename):
def ___commonpart___(pager):
if re.search('^[a-zA-Z0-9-_]*$',pager):
pager = "http://pixiv.me/"+pager
if not re.search(r'^(http://)*([w\.]*)pixiv\.(me|net|cc)',pager):
print "Not a pixiv address! Skipping..."
else:
pager = re.sub(r'member\.php',r'member_illust.php',pager)
pager = re.sub(r'([?&]page=\d*|[?&]p=\d*|[?&]num=\d*)','',pager)
if re.search(r'(/stacc/|pixiv\.me)',pager):
print "Converting a userid-based address...",
try:
ww = br.open(pager)
www = BeautifulSoup(ww.read(),"html5lib")
wwww = www.find('a',{'class':'tab-works'})
pager = wwww['href']
print "success!",
except Exception,e:
print "(h)",str(e),": skipping..."
return
if pager == "http://www.pixiv.net/bookmark_new_illust.php":
pager = "http://www.pixiv.net/bookmark_new_illust.php?mode=new"
if re.search(r'bookmark\.php.*type=user',pager):
dlOkiniiriList(pager)
else:
downloadAllImages(pager)
if filename[:7]=='http://':
print "Downloading from:", filename
___commonpart___(filename)
elif os.path.exists(filename) and os.path.isfile(filename):
print "Processing list from:", filename
reader = open(filename,'r')
for line in reader:
pager = line.replace(chr(10),"").replace(chr(13),"")
if pager.startswith('#'):
continue
___commonpart___(pager)
else:
print "File not found."
#-------------Sanitize filename (windows, but / counts as \)
badchars= re.compile(r'['+chr(01)+'-'+chr(31)+']|^\.|\.$|^ | $|^$|\?|:|<|>|\||\*|\"')
badnames= re.compile(r'(aux|com[1-9]|con|lpt[1-9]|prn)(\.|$)')
def sanitizeFilename(s):
name= badchars.sub('_', s)
if badnames.match(name):
name= '_'+name
return name
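# e.g. (values worked out from the patterns above):
#   sanitizeFilename(u'a:b?c') -> u'a_b_c' (bad Windows characters replaced)
#   sanitizeFilename(u'con.txt') -> u'_con.txt' (reserved device name prefixed)
#   sanitizeFilename(u'file.') -> u'file_' (trailing dot replaced)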
#------------Main Block
def main():
try:
prepare()
global logFile
if logFile != "":
print "Logging output to "+logFile
sys.stdout = codecs.open(logFile,'a+',encoding="utf-8-sig")
global version
print "Pixiv Mass Downloader ver." + version
if logFile != "":
print "By Duc Digital; Nandaka; Kaens Bard"
global username
if username == None or username == "":
if logFile != "":
print "Cannot log in from stdin while file-logging, terminating"
return
else:
username = raw_input("Username = ")
else:
print "Login as: "+username
global password
if password == None or password == "":
if logFile != "":
print "Cannot log in from stdin while file-logging, terminating"
return
else:
password = raw_input("Password = ")
global numberOfPage
if numberOfPage != 0:
print "Page processing limit = ", numberOfPage
global overwrite
global quickCheck
if overwrite:
if not quickCheck:
print "Overwrite mode"
else:
print "Overwrite mode--overridden by Quick check mode"
overwrite = False
if len(sys.argv) <= 1: #sys.argv[0] is the script name itself
print "\nUTC", str(datetime.utcnow()), "Empty command line! Nothing to process."
else:
if login(username, password) == 0:
for arg in sys.argv[1:]:
if arg=='-q':
print "\nQuick check mode now disabled"
quickCheck = False
elif arg=='+q':
print "\nQuick check mode now enabled, overriding overwrite mode"
quickCheck = True
overwrite = False
elif arg[:7]=='http://':
processList(arg)
elif os.path.exists(arg) and os.path.isfile(arg):
processList(arg)
else:
if re.search('^[a-zA-Z0-9-_]*$',arg):
print "Checking for a stacc address...",
try:
#ww = br.open(url+'stacc/'+arg)
ww = br.open("http://pixiv.me/"+arg)
www = BeautifulSoup(ww.read(),"html5lib")
#open(arg+'-dump','w+').writelines(www.encode('utf8'))
wwww = www.find("a",{'class':'tab-works'})
print 'success!'
processList(url+wwww['href'])
except Exception,e:
if str(e).startswith('HTTP Error 404'):
print 'failed, skipping...'
else:
print "(ii)",str(e)
if not quickCheck and (logFile==""):
print "UTC", str(datetime.utcnow()), "All done! Press Enter to exit."
raw_input()
else:
print "UTC", str(datetime.utcnow()), "Quick check complete."
else:
print "UTC", str(datetime.utcnow()), "Failed to log in."
except KeyboardInterrupt as ex:
print 'CTRL+C, aborted'
#-------load config
def loadConfig():
config = ConfigParser.RawConfigParser()
try:
config.read('pixivUtil.ini')
print "Reading values for",
global username
print "username",
username = config.get('Authentication','username')
global password
print "password",
password = config.get('Authentication','password')
global proxyAddress
global proxies
print "proxy_address",
proxyAddress = config.get('Settings','proxy_address')
if proxyAddress:
try:
proxies = {'http':proxyAddress}
except:
print "(couldn't parse proxy config)",
proxies = {}
global useragent
print "user_agent",
useragent = config.get('Settings','user_agent')
global numberOfPage
print "number_of_page",
numberOfPage = config.getint('Pixiv','number_of_page')
global startPage
print "start_page",
startPage = config.getint('Pixiv','start_page')
global quickCheck
print "quickcheck",
quickCheck = config.getboolean('Pixiv','quickcheck')
global genericfilename_format
print "genericfilename_format",
genericfilename_format = config.get('Pixiv','genericfilename_format').decode('utf8')
global member_illust_format
print "member_illust_format",
member_illust_format = config.get('Pixiv','member_illust_format').decode('utf8')
global bookmark_new_illust_format
print "bookmark_new_illust_format",
bookmark_new_illust_format = config.get('Pixiv','bookmark_new_illust_format').decode('utf8')
global response_format
print "response_format",
response_format = config.get('Pixiv','response_format').decode('utf8')
global ranking_format
print "ranking_format",
ranking_format = config.get('Pixiv','ranking_format').decode('utf8')
global description_format
print "description_format",
description_format = config.get('Pixiv','description_format').decode('utf8')
global tokentag
print "tokentag",
tokentag = config.get('Pixiv','tokentag').decode('utf8')
global blacklist
print "blacklist",
blacklist = config.get('Pixiv','blacklist')
global logFile
print "logfile",
logFile = config.get('Settings','logfile').decode('utf8')
global descript_ion
print "descript.ion",
descript_ion = config.getboolean('Settings','descript.ion')
global description_file
print "descript.ion_file",
description_file = config.get('Settings','descript.ion_file').decode('utf8')
global debugHttp
print "debug_http",
debugHttp = config.getboolean('Settings','debug_http')
global useRobots
print "use_robots."
useRobots = config.getboolean('Settings','use_robots')
except ConfigParser.NoOptionError:
print "Required option not found in config, writing defaults..."
writeConfig()
exit()
except ConfigParser.NoSectionError:
print "Required section not found in config, writing defaults..."
writeConfig()
exit()
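#A minimal pixivUtil.ini sketch (option names as read above; the values here are
#placeholders matching this script's defaults, adjust to taste):
#  [Authentication]
#  username = yourlogin
#  password = yourpassword
#  [Settings]
#  proxy_address =
#  user_agent = Mozilla/5.0 (Windows NT 6.1; WOW64; rv:16.0) Gecko/16.0 Firefox/16.0
#  logfile =
#  descript.ion = no
#  descript.ion_file = descript.ion
#  debug_http = no
#  use_robots = no
#  [Pixiv]
#  number_of_page = 0
#  start_page = 1
#  quickcheck = yes
#  genericfilename_format = _%pageline%/%artist% (%artist_id%)/%image_id% - %title%
#  member_illust_format = (%artist_id%)/%image_id% - %title%
#  bookmark_new_illust_format = [Following tracker]/%image_id% (%artist_id%) - %title%
#  response_format = %inurl_id% res/%image_id% (%artist_id%) - %title%
#  ranking_format = [%today%'s %inurl_id%'s best]/[#%num%] %image_id% (%artist_id%) - %title%
#  description_format = Title:%title%\nTags:%tags%\nCommentary:%remarks%
#  tokentag =
#  blacklist =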
#-------write config
def writeConfig():
print "Writing defaults is temporarily disabled, please add the missing option/section manually."
return
config = ConfigParser.RawConfigParser()
config.add_section('Settings')
config.add_section('Pixiv')
config.add_section('Authentication')
config.set('Authentication', 'username', username)
config.set('Authentication', 'password', password)
config.set('Pixiv', 'number_of_page', numberOfPage)
config.set('Pixiv', 'start_page', startPage)
config.set('Pixiv', 'quickcheck', quickCheck)
config.set('Pixiv', 'blacklist', blacklist)
config.set('Pixiv', 'genericfilename_format', genericfilename_format)
config.set('Pixiv', 'member_illust_format', member_illust_format)
config.set('Pixiv', 'bookmark_new_illust_format', bookmark_new_illust_format)
config.set('Pixiv', 'response_format', response_format)
config.set('Pixiv', 'ranking_format', ranking_format)
config.set('Pixiv', 'description_format', description_format)
config.set('Pixiv', 'tokentag',tokentag)
config.set('Settings', 'proxy_address', proxyAddress)
config.set('Settings', 'user_agent', useragent)
config.set('Settings', 'debug_http', debugHttp)
config.set('Settings', 'use_robots', useRobots)
config.set('Settings', 'logfile', logFile)
config.set('Settings','descript.ion',descript_ion)
config.set('Settings','descript.ion_file',description_file)
with open('pixivUtil.ini', 'wb') as configfile: #utf-8 dammit
config.write(configfile)
print "Configuration file saved."
#-------construct the filename
def makeFilename(pageline, member_id, artist_id, artist, title, image_id, num, imgdate, addtokentag):
global tokentag
image_id = str(image_id)
if (tokentag != "") and addtokentag:
image_id = image_id+r"["+tokentag+r"]"
inurl_id = ' '
q = urllib2.unquote(pageline).decode('utf8').split('/')[-1]
#specific format checks
z = re.search(r'member_illust\.php\?id=(\d*)',q)
if z:
inurl_id = z.group(1)
nameformat = member_illust_format
else:
z = re.search(r'bookmark_new_illust\.php',q)
if z:
nameformat = bookmark_new_illust_format
else:
z = re.search(r'response\.php\?illust_id=(\d*)',q)
if z:
inurl_id = z.group(1)
nameformat = response_format
else:
z = re.search(r'ranking.*(mode=([a-z0-9]*)|rookie)',q)
if z:
inurl_id = z.group(1)
nameformat = ranking_format
else:
nameformat = genericfilename_format
nameformat = nameformat.replace('%pageline%',q.replace(u'?',u'？'))\
.replace('%artist%',artist.replace(u'\\',u'_').replace(u'/',u'_'))\
.replace('%title%',title.replace(u'\\',u'_').replace(u'/',u'_'))\
.replace('%image_id%',image_id).replace('_big','')\
.replace('%member_id%',str(member_id))\
.replace('%artist_id%',artist_id)\
.replace('%inurl_id%',inurl_id)\
.replace('%today%',str(date.today()))\
.replace('%date%',imgdate.replace(u'\\',u'-').replace(u'/',u'-'))\
.replace('%num%','{0:03d}'.format(num)) #leaving at 3 because sorting on it is only important for ranking, and that's 500 max
return nameformat
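#Example (all names hypothetical): for pageline 'http://www.pixiv.net/member_illust.php?id=123'
#member_illust_format is selected, so with artist_id='123', image_id='456_p00', title=u'sometitle'
#the default format yields u'(123)/456_p00 - sometitle' (extension and sanitizeFilename are applied by the caller).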
#-------Construct the line of descript.ion
def makeDescription(file, title, tags, remarks):
_file = file
if _file.find(' ')>-1:
_file = '"'+_file+'"'
_file+=' '
return _file + description_format\
.replace(u'%title%',title)\
.replace(u'%tags%',tags)\
.replace(u'%remarks%',remarks)
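#Example (hypothetical values): a name containing spaces gets quoted, so
#makeDescription(u'123 - t.jpg', u't', u'tag1 tag2 ', u'hi') returns
#u'"123 - t.jpg" Title:t\nTags:tag1 tag2 \nCommentary:hi' ("\n" being the literal two characters).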
#-------Configure browser object
def configBrowser():
global br
global pr
global proxies
global proxyAddress
br.set_handle_equiv(True)
#br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
global useRobots
br.set_handle_robots(useRobots)
global debugHttp
br.set_debug_http(debugHttp)
global useragent
br.addheaders = [('User-agent', useragent)]
pr = ProxyHandler(proxies)
if socksproxyenabled==1:
print "Using SOCKS proxy."
#-------Login to pixiv
def login(username, password):
print "Login at %s\t\t" % (url),
req = urllib2.Request("https://accounts.pixiv.net/login")
while True:
try:
response = br.open(req,timeout=10)
break
except Exception,e:
print "(j)",str(e),':: sleeping for 5 sec'
time.sleep(5)
try:
br.form = mechanize.HTMLForm("https://www.pixiv.net/login.php", method="POST", enctype="multipart/form-data")
br.form.new_control('text','mode',{'value':"login"})
br.form.new_control('text','pixiv_id',{'value':username})
br.form.new_control('password','pass',{'value':password})
br.form.fixup()
response = br.submit() # LOGIN
except Exception,e:
print "(k)",str(e),"(assuming relogin)"
#print "(login form not found, assuming relogin)",
lolfile=open('loginness.log','wb+')
lolfile.write(response.read())
lolfile.close()
print response.geturl()
if response.geturl() == 'http://www.pixiv.net/':
print "DONE!"
return 0
else :
print 'Wrong username or password'
lolfile=open('loginness.log','wb+')
lolfile.write(response.read())
lolfile.close()
return 1
def printConfig():
print "Username :",username
print "Password :",password
print "Proxy Addr:",proxyAddress
def prepare():
loadConfig()
configBrowser()
if __name__ == "__main__":
main()