#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Inspired by http://www.ducdigital.com/2009/11/24/massive-download-from-pixiv/
modified by Nandaka. http://nandaka.wordpress.com
further modified by Kaens Bard http://kaen.su

Works well in Python 2.6.x and 2.7.x

Run cmd.exe, then:
    pip install html5lib
    pip install beautifulsoup4
    pip install PySocks
    pip install "https://github.com/spiiin/mechanize/archive/master.zip"
/!\ If you have previously installed mechanize from elsewhere, remove it first
(pip uninstall mechanize, [y]) before you run that last command. That build is
needed for SOCKS proxy support. Also find "#socket module patch" in this
script and follow the instructions there.

Usage:
set your account language to Japanese (their English translation still seems
shaky, so it may be supported later; Chinese is not planned)
EITHER: create a text file in UTF-8 without a BOM (signature) in the script's
folder and list all the links you want mass-downloaded/updated there, each on
its own line
OR: write the links directly on the command line
set up config.ini for startpage and quickcheck, OR set quickcheck on the
command line (+q on, -q off)
run the script with the file name (or several) as parameter(s); links passed
as parameters are processed as well

The links accepted in the list file have been tested as working for:
- user gallery pages
- your own bookmark pages (make sure you've set up config.ini for your own
  profile for that)
- tag searches
- ranking pages
- user stacc → user gallery (a bare user ID is also accepted as a list name;
  that check is done after the check for a list file name)
- user profile → user gallery
Additionally, it is possible to enumerate a user's (or your own) okiniiri list
from the profile into a local file in list format.

If you intend to use Windows Notepad to create the list, don't use Unicode
characters in links.
"""
import re
import os
import sys
import codecs
import socks
import socket

#socket module patch: the lines below route all sockets through a local
#SOCKS5 proxy (localhost:3128 by default)
def create_connection(address, timeout=None, source_address=None):
    sock = socks.socksocket()
    sock.connect(address)
    return sock

socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "localhost", 3128)
socket.socket = socks.socksocket
socket.create_connection = create_connection
socksproxyenabled = 1

import urllib2
import mechanize
import time
import random
import calendar
from datetime import date, datetime
from mechanize import Browser, ProxyHandler
from bs4 import BeautifulSoup
from sockshandler import SocksiPyHandler
import ConfigParser

version = '2016-05-14'

#-------Defaults
url = 'http://www.pixiv.net/'
proxyAddress = ''
proxyIsSOCKS = 0
proxies = {}
username = ''
password = ''
useragent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:16.0) Gecko/16.0 Firefox/16.0'
debugHttp = False
numberOfPage = 0
startPage = 1
quickCheck = True
useRobots = False
genericfilename_format = r"_%pageline%/%artist% (%artist_id%)/%image_id% - %title%"
member_illust_format = r"(%artist_id%)/%image_id% - %title%"
bookmark_new_illust_format = r"[Following tracker]/%image_id% (%artist_id%) - %title%"
response_format = r"%inurl_id% res/%image_id% (%artist_id%) - %title%"
ranking_format = r"[%today%'s %inurl_id%'s best]/[#%num%] %image_id% (%artist_id%) - %title%"
description_format = u"Title:%title%\\nTags:%tags%\\nCommentary:%remarks%"
tokentag = r""
overwrite = False
logFile = ""
descript_ion = False
description_file = u"descript.ion"
blacklist = r"[腐 【腐 腐】 腐] 腐注 腐向 ※腐 BL注"  #yaoi cut
min_id = 0  #down to which id to long-download; useful if you missed the 2000-latest okiniiri limit

br = Browser()
pr = None
fQuickCheck = False
curPage = 0
ugoira = False
noOfImages = 0
debugid = 0
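
# A minimal sketch (not part of the script's own logic) of how the %token%
# placeholders in the *_format strings above get expanded; the actual
# substitution is done by makeFilename()/makeDescription() further down,
# which may differ in detail.
def _format_example(fmt, **tokens):
    # e.g. _format_example(member_illust_format, artist_id=u'1234567',
    #                      image_id=u'56789012', title=u'Example')
    # returns u'(1234567)/56789012 - Example'
    out = fmt
    for name, value in tokens.items():
        out = out.replace(u'%' + name + u'%', value)
    return out
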
#-----------Start Download, input: image ID, type: VOID, save to disk
def downloadImage(id):
    global br
    global fQuickCheck
    global curPage
    global ugoira
    global tokentag
    if quickCheck and fQuickCheck:
        return
    print "\nGetting file id: " + str(id) + "\t\t",
    tickity = 0  #DEBUG
    while True:
        try:
            mediumPage = br.open(url + "member_illust.php?mode=medium&illust_id=" + str(id), timeout=10)
            if logFile == "":
                print chr(8) + '`',
            else:
                print '`',
            ttt = mediumPage
            mediumPage = mediumPage.read()
            #time.sleep(random.random()*10+0.4) #i'm a human. human, not a robot. okay? right.
            if logFile == "":
                print chr(8) + '-',
            else:
                print '-',
            parseTitle = BeautifulSoup(mediumPage, "html5lib")
            if logFile == "":
                print chr(8) + '.',
            else:
                print '.',
            tickity += 1
            if mediumPage.find(r"rkn.gov.ru") > -1:
                print "...PKH: error 451, skipping..."
                return
            if mediumPage.find(r"例外エラーが発生しました") > -1:
                print "...pixiv: unexpected error occurred, retrying..."
                raise
            elif mediumPage.find(r"該当作品は削除されたか、存在しない作品IDです。") > -1:
                print "...pixiv: submission taken down by artist, skipping..."
                return
            elif mediumPage.find(r"マイピクにのみ公開されています") > -1:
                print "...pixiv: submission MyPixiv-only, skipping..."
                return
            #DEBUG
            if tickity == 20:
                print "-wrote tickity " + str(id) + " file-",
                lolfile = open('tickity ' + str(id), 'wb+')
                lolfile.write(str(parseTitle))
                lolfile.close()
            #/DEBUG
            break
        except:
            if id == debugid:
                print 1,
            else:
                print '.',
            tickity += 1
            if tickity == 20:
                print "-wrote tickity " + str(id) + " file-",
                lolfile = open('tickity ' + str(id), 'wb+')
                lolfile.write(mediumPage)
                lolfile.close()
            time.sleep(5)
    anilookup = parseTitle.find('div', {'class': '_ugoku-illust-player-container'})
    ugoira = anilookup != None
    if id == debugid:
        f = open("logofpages", "wb+")
        f.write(str(parseTitle))
        f.close()
    #parse artist
    try:
        artist = parseTitle.find('div', {'class': '_unit profile-unit'}).h1.contents[0]
        print "artist: ", artist
    except UnicodeError:
        print "(not supported by console)"
    except (LookupError, AttributeError):
        if parseTitle.find("エラーが発生しました".decode('utf8')):
            print "...denied by pixiv server, skipping"
        else:
            print "...Oops. Submission was taken down by the artist while downloading the rest OR another error occurred."
            f = open("submission %s down" % (id), "a+")
            f.write(mediumPage)
            f.close()
        return
    except:
        print "...artist parsing failed, SKIPPING"
        time.sleep(5)
    #parse commentary
    #print "Commentary:", #better not >_>
    try:
        artist_id = parseTitle.find('div', {'class': '_unit _work-detail-unit'}) \
                              .find('a', {'class': "tab-feed"})['href'] \
                              .split('/')[-1]
    except:
        print "artist_id not found:", parseTitle.find('div', {'class': '_unit _work-detail-unit'})
    try:
        works_caption = parseTitle.find('p', {'class': 'works_caption'}).getText(separator=u'\\n')
    except:
        works_caption = u'n/a'
    #parse tags, for the blacklist as well
    tagsline = u''       #for descript.ion
    tagslist = []        #for the log
    addtokentag = False  #for the token tag
    try:
        for x in parseTitle.find('span', {'class': 'tags-container'}).find_all('a', {'class': 'text'}):
            if not x.string:
                continue
            if x.string == u'*':
                tagsline += x.string
            else:
                tagslist += [x.string]
                tagsline += x.string + ' '
                if not addtokentag:
                    addtokentag = (tokentag == x.string)
    except:
        print "...tag parsing failed, retrying"
    if tagsline == u'':
        tagsline = u'(n/a)'
    print "Tags:",
    for x in tagslist:
        try:
            print x,
        except UnicodeError:
            print "-",
        except LookupError:
            print "-",
        for i in blacklist.decode("utf8").split(" "):
            if (len(i) > 0) and (x.find(i) >= 0):
                try:
                    print "blacklisted by %s, skipping..." % (i)
                except:
                    print "blacklisted, skipping..."
return print "" #parse imagedate and number of manga pages manga = False classmeta = parseTitle.find('ul',{'class':'meta'}) try: if classmeta != None: imagedate = classmeta.find(text=re.compile(r"(\d{4}年\d*月\d*日 \d{2}:\d{2}|\d*/\d*/\d{4} \d{2}:\d{2})".decode('utf8'))) else: print "WARNING: Either pixiv changed page format or page not found" z = re.search(r"(\d{4})年(\d*)月(\d*)日 (\d{2}):(\d{2})".decode('utf8'),imagedate) if z: # Japanese date imagedate = datetime(int(z.group(1)),int(z.group(2)),int(z.group(3)), int(z.group(4)),int(z.group(5))) else: imagedate = work_info.find('ul',{'class':'meta'}).find_all('li',text=re.compile(r"\d{2}/\d{2}/\d{2} \d{2}:\d{2}".decode('utf8')))[0].split("\n")[0] z = re.search(r"(\d*)/(\d*)/(\d{4}) (\d{2}):(\d{2})".decode('utf8'),imagedate) if z: # American date imagedate = datetime(int(z.group(3)),int(z.group(1)),int(z.group(2)), int(z.group(4)),int(z.group(5))) except Exception,e: print "(a)",str(e), print "(timestamp not found, assuming now/UTC)", imagedate = datetime.utcnow() if (not ugoira) & (parseTitle.find('img',{'class':'original-image'}) == None): manga = True mangapages = 1 if classmeta != None: mlookup = classmeta.find(text=re.compile(r"複数枚投稿 \d*P".decode('utf8'))) if mlookup != None: mangapages = int(mlookup.split(" ")[1][:-1]) imagedates = imagedate.strftime('%Y-%m-%d %H:%M') print "Date:",imagedates, #parse title try: #for a in parseTitle.find_all('h1',{'class':'title'}): print a.text title = parseTitle.find_all('h1',{'class':'title'})[2].text print "title:",title except UnicodeError: print "(not supported by console)" except LookupError: print "(unknown console encoding)" except: title = "untitled" #parse actual image(s) if ugoira: z = re.search(r"pixiv\.context\.ugokuIllustFullscreenData.+(http:.+ugoira1920x1080\.zip)",mediumPage) if z: anilink = z.group(1).replace('\\',"") else: print "Failed to find the Cinematic zip, skipping" exit works_display = parseTitle.find('div',{"id":"wrapper"}) if not manga: tickity = 0 if not ugoira: #ergo, a single pic if parseTitle.find("再度ログインしなおしてください".decode("utf8")): print "Attempting to re-login...", if login(username, password) == 0: print "success!" if manga: print "Getting manga,",mangapages,"pages..." imgList = [] if mangapages>1: for i in range(mangapages): imgList.append("member_illust.php?mode=manga_big&illust_id="+str(id)+"&page="+str(i)) else: imgList.append("member_illust.php?mode=big&illust_id="+str(id)) elif ugoira: imgList = [anilink] else: #ergo, single page imgList = [parseTitle.find('img',{"class":"original-image"})] for imgFile in imgList: #each imgFile becomes a big page link-to-follow in case of manga ._. if quickCheck and fQuickCheck: break ext = '#' if ugoira: ext = re.sub(r'http:.*/(\d+_ugoira.*\.zip)',r'\1',anilink) elif manga: while 1: try: req = urllib2.Request(url+imgFile) req.add_header("Referer", url+"member_illust.php?mode=medium&illust_id="+str(id)) viewPage = br.open(req) parser = BeautifulSoup(viewPage.read(),"html5lib") imgFileM = parser('img')[0] ext = os.path.basename(imgFileM['src']) break except Exception,e: print "(c)",str(e), if str(e).find('global name')>-1: print "...skipping..." break if str(e).startswith('HTTP Error 404'): print "[404]The submission is rendered unloadable, skipping..." break if str(e).startswith('HTTP Error 400'): print "[400]The submission is rendered unloadable, skipping..." break time.sleep(5) else: #not manga or cine try: ext = os.path.basename(imgFile['data-src']) except Exception,e: print "No original image link found..." 
        if ext.split('.')[0].startswith(str(id)):
            image_id = ext.split('.')[0]
        if manga:
            #for comfortable browsing of manga stuff, zero-pad the page number to 2 digits:
            z = re.search(r'_p(\d*)', image_id)
            if z:
                image_id = re.sub(r'_p\d*', '_p{0:02}'.format(int(z.group(1))), image_id)
        elif ugoira:
            image_id = re.sub(r'http://.*/(\d*)_ugoira.*', r'\1_ani', anilink)
        global _pager
        fileName = makeFilename(_pager, id, artist_id, artist, title, image_id, noOfImages, imagedates, addtokentag)
        fileName = fileName + "." + ext.split('.')[1].split('?')[0]
        print 'Saving to:',
        fileName = sanitizeFilename(fileName)
        try:
            print fileName
        except UnicodeError:
            print "(not supported by console)"
        except LookupError:
            print "(unknown console encoding)"
        if manga:
            dl(imgFileM['src'], fileName, viewPage.geturl(), imagedate)
        elif ugoira:
            dl(anilink, fileName, ttt.geturl(), imagedate)
        else:
            dl(imgFile['data-src'], fileName, br.geturl(), imagedate)
        #descript.ion update time
        # Assuming UTF-8 (so what if it doesn't work on most two-panel commanders, and isn't in
        # the specs at http://jpsoft.com/ascii/descfile.txt? Who REALLY cares about this legacy?
        # It just won't work otherwise, so there.)
        # Using ^D for description breaks, the way Ghisler's Total Commander 7.55a does,
        # instead of what the specs say.
        # Likewise, EoF characters aren't specifically processed.
        # Assuming the file is available for writing.
        # I hate everything about this implementation.
        if descript_ion:
            dfile = os.path.dirname(fileName) + '/' + description_file
            _descmagic = '\x04\xC2\x0A'  #writelines() will autoreplace \x0A with \x0D\x0A on w32 O_o
            if os.path.exists(dfile):
                descfile = open(dfile, 'r')
                curdesc = descfile.readlines()
                descfile.close()
            else:
                curdesc = []
            notyet = True
            for x in curdesc:
                if x.find(os.path.basename(fileName).encode('utf-8')) > -1:
                    notyet = False
                    break
            if notyet:
                print 'Updating descript.ion...'
                curdesc.append(
                    makeDescription(os.path.basename(fileName), title, tagsline, works_caption).encode('utf-8')
                    + _descmagic)
                descfile = open(dfile, 'w')
                descfile.writelines(curdesc)
                descfile.close()
                curdesc = []
    ttt.close()


#-----------List all images
#@profile
def downloadAllImages(pager):
    global fQuickCheck
    global _pager
    _pager = pager
    print "Getting pages from ", pager
    fQuickCheck = False
    global startPage
    global curPage
    curPage = startPage
    hasMorePage = 1
    global noOfImages
    noOfImages = 1
    id = None
    previd = [0]
    weirdvar = 5
    relogined = 0
    listpage = '