Commits

Szymon Wróblewski committed 829f7a8

New page parsing method; more info is now extracted from the image title.

Comments (0)

Files changed (1)

 import os
+import re
 import logging
 import Queue as queue
 import shutil
 import threading
-from requests import session
+import requests
 from lxml import html
 
 log = logging.getLogger(__name__)
     user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 '
                   '(KHTML, like Gecko) Chrome/24.0.1312.56 Safari/537.17')
     max_tasks = 8
+    title_re = re.compile(
+        r'(?P<title>.+) by .(?P<author_name>.+), '
+        r'(?P<date>\w{3} \d{1,2}, \d{4}) in '
+        r'(?P<category>.*)'
+    )
 
     def __init__(self, path='.'):
-        self.session = session()
+        self.session = requests.session()
         self.session.headers.update({'User-Agent': self.user_agent})
         self.path = path
         self.img_urls = queue.Queue()
     def parse_group(self, url):
         pass
 
-    def parse_gallery(self, url):
+    def parse_gallery(self, url, limit=None):
         offset = 0
         total = 0
         page = 1
             log.info('Page %d [%d/%d]', page, offset, total)
             item_nr = 0
             for item in gallery.iterfind(".//div[@userid]"):
-                log.info(item)
                 item_nr += 1
                 item = item.find_class('thumb')
                 if not item:
                     log.info('Item not available')
                     continue
                 item = item[0]
-                img_url = item.attrib.get('data-super-full-img') or item.attrib.get('data-super-img')
+                img_url = (
+                    item.attrib.get('data-super-full-img') or 
+                    item.attrib.get('data-super-img')
+                )
                 if img_url:
-                    log.info('Queuing %s', item.attrib['title'])
-                    self.img_urls.put(img_url)
+                    info = self.title_re.match(item.attrib.get('title', ''))
+                    if info is not None:
+                        info = info.groupdict()
+                        log.info('Queuing %s by %s', info['title'], info['author_name'])
+                    else:
+                        log.info('Queuing download')
+                    self.img_urls.put((img_url, info))
                 else:
-                    self.parse_page(item.attrib['href'], item.attrib['title'])
+                    self.parse_page(item.attrib['href'])
             if item_nr == 0:
                 break
             offset += part
             page += 1
 
-    def parse_page(self, url, name='download'):
+    def parse_page(self, url):
+        oauth_url = 'http://backend.deviantart.com/oembed'
+        response = self.session.get(oauth_url, params={'url': url})
+        if response.status_code == requests.codes.ok:
+            info = response.json()
+            log.info('Queuing %s by %s', info['title'], info['author_name'])
+            self.img_urls.put((info['url'], info))
+        else:
+            log.info('Item not available: %s', response.text)
+    
+    def _parse_page_old(self, url, name='download'):
         response = self.session.get(url)
         tree = html.fromstring(response.text)
         item = tree.get_element_by_id('download-button', None)
         if item is not None:
             log.info('Queuing %s', name)
-            self.img_urls.put(item.attrib['href'])
+            self.img_urls.put((item.attrib['href'], {}))
         else:
             log.info('Item locked')
 
     def download_task(self):
         while not self.deactivate.is_set():
             try:
-                self.download_image(self.img_urls.get())
+                url, info = self.img_urls.get()
+                self.download_image(url, info=info)
             finally:
                 self.img_urls.task_done()
 
-    def download_image(self, url, path=None):
+    def download_image(self, url, path=None, info={}):
         if path is None:
             path = self.path
-        path = os.path.join(path, os.path.split(url))
+        path = os.path.join(path, os.path.split(url)[1])
         if os.path.exists(path):
             log.info('Skipping %s', url)
         else:
     br = DABrowser('download')
     if br.login():
         log.info('Login successful')
-    #br.start_downloading()
+    br.start_downloading()
+    br.parse_gallery('http://shadowsinking.deviantart.com/gallery/?catpath=/')
+    #br.parse_gallery('http://bluex-pl.deviantart.com/favourites/38926098')
     #br.parse_gallery('http://bluex-pl.deviantart.com/favourites/48404227')
-    #br.parse_gallery('http://bluex-pl.deviantart.com/favourites/38926098')
-    #br.parse_gallery('http://shadowsinking.deviantart.com/gallery/?catpath=/')
-    br.parse_gallery('http://browse.deviantart.com/?q=dragon')
-    #br.wait_for_end()
+    #br.parse_gallery('http://browse.deviantart.com/?q=dragon')
+    br.wait_for_end()
+    if br.logout():
+        log.info('Logout successful')
     log.info('end')
 
 if __name__ == '__main__':