Commits

luoboiqingcai committed 88cd330

初步完成了用requests替代urllib2的过程,并且修正了prex不起作用的bug。经测试,下载读秀已能正常工作。

Comments (0)

Files changed (1)

 
 """
 author: luoboiqingcai <sf.cumt@gmail.com>
+
+=======
+重构说明
+=======
+这一版本放弃继续使用urllib2库进行http连接,而是引进了requests包进行http连接处理。这么做的原因主要不是像requests自己吹的那样是由于requests的接口比较简单,第一位的原因是为了同时兼容python2和python3。由于python3的标准库中取消了urllib2,因此原本只考虑了python2的本程序不得不重写。如果python3中还保留urllib2我才不会没事改脚本呢。使用requests虽然引入了外部依赖,但它用起来确实很pythonic,也达到了我需要的python 2和3间的兼容性要求,不错。
+
+在对代码的http连接部分进行重构的同时,也期待解决下面三个问题:
+
+ - 与gui的兼容性
+ - 超星下载的问题
+ - prex不起作用的bug
+
+为了防止被服务器端发现是在用机器人下载资源,应避免总是从同一个服务器下载。每次文献传递都可能分配到一个不同的服务器来提供文献。因此每次将咨询得到的服务器与现有服务器列表对照,如果这个服务器还没有被列入已知服务器列表,则把它加进去。
+
 """
-import urllib2
+import io
+import requests
 import re
 import getopt
 import sys
 
 duxiu_source_list = ["www2.zhengzhifl.cn","www.zirankxzl.cn","www.zhexuezj.cn","www.junshilei.cn",]
 
-DEBUG = False
+DEBUG = True
 TEM_FILE = "remote_webpage_content.html"
 
 class DownloadLib(object):
     main class for this module
 
     from_remote
-        用浏览器下载下来的网页与urllib2下载下来的网页是不同的,浏览器下载下来的网页可能已经过浏览器渲染过了,而用urllib2下载下来的没有经过客户端渲染。因此两都表现不同。
+        用浏览器下载下来的网页与requests下载下来的网页是不同的,浏览器下载下来的网页可能已经被浏览器渲染过了,而用 requests 下载下来的没有经过客户端渲染。因此两者表现不同。
     lnp
         根据远程文件的名字取得用于本地存储的名字。
     pattern
             self.prex = prex
         else:
             self.prex = ''
+            sys.stderr.write("warning: prex is empty")
         if logfile != None and isinstance(logfile,str):
             self.logfile = logfile
         else:
         else:
             self.log.setLevel(logging.INFO)
         if logfile:
-            self.log.addHandler(logging.FileHandler(self.prex+logfile,mode='w'))
+            self.log.addHandler(logging.FileHandler(os.path.join(self.prex,logfile),mode='w'))
         else:
             self.log.addHandler(logging.StreamHandler(sys.stderr))
         self.log.debug("DownloadLib or its subclass is instanced")
     def get_content(self,url,remotep=True):
         """
-        从远程网址或下载下来的本地网页中读取内容
+        从远程网址或下载下来的本地网页中读取内容.
+        
+        返回一个*读打开*的*文件*对象(filelike object)。以便进行模式匹配。
+
         url
             试读地址或网页文件地址,根据此地址获得网页中的试读图片。
         remotep
             if DEBUG:
                 if os.path.exists(TEM_FILE):
                     self.log.debug("%s existed"%TEM_FILE)
-                    content = open(TEM_FILE,'rb')
+                    content = open(TEM_FILE,'r',encoding='utf-8')
+                    #如果是写入字节:
+                    #content = open(TEM_FILE,'rb')
+                    #with open(TEM_FILE,'wb') as f:
+                    #    print(bytes(something,'utf-8'),file=f)#错,因为print不支持二进制串
                 else:
-                    content = urllib2.urlopen(url)
-                    with open(TEM_FILE,'wb') as f:
-                        f.write(content.read())
+                    response = requests.get(url)
+                    with open(TEM_FILE,'w',encoding='utf-8') as f:
+                        self.log.debug("type of response.text:%s\ntype of f:%s"%(str(type(response.text)),str(type(f))))
+                        f.write(response.text)
+                        #f.write("hi") 因为文件是'b'打开的,因此写入的应是二进制而不是字符型!
+                        #print(response.text,f)
                     self.log.debug("%swriten"%TEM_FILE)
+                    content = io.StringIO(response.text)
             else:
-                content = urllib2.urlopen(url)
+                response = requests.get(url)
+                content = io.StringIO(response.text)
         else:
-            content = open(url,'rb')
+            content = open(url,'r')
         return content
     def get_img(self,url):
         """
+        返回一个二进制字节流
         url
             试读图片相对地址
         """
         s = self.img_url_prex + url
         try:
-            img = urllib2.urlopen(s)
-        except urllib2.URLError,e:
+            img = requests.get(s).content
+        except requests.exceptions.RequestException as e:
             self.log.warn('first time download failed: %s'%e)
             time.sleep(10)
             try:
-                img = urllib2.urlopen(s)
-            except urllib2.URLError,e:
+                img = requests.get(s).content
+            except requests.exceptions.RequestException as e:
                 self.log.warn('try second time failed: %s'%e)
                 self.log.info("%s wasn't downloaded."%s)
             else:
                     matched1 = match1
                     self.log.debug('p1 match1')
                 if matched and matched1: break
+            content.close()
             if not matched or not matched1:
                 raise SystemError('regular expression error.matched:%s,matched1:%s'%(matched,matched1))
             imgurl = matched.group(1)
             self.log.info('*** end:%d'%end_p)
             if ep and end_p > int(ep):
                 end_p = int(ep)
-            print start_p,end_p
+            sys.stderr.write(str((start_p,end_p)))
             #imgurls = map(lambda x:imgurl+x[-6:]+'?.',['000000%d'%p for p in range(start_p,end_p+1)])
             imgurls = [imgurl+'{0:06}?.'.format(p) for p in range(start_p,end_p+1)]
         else:
             content = self.get_content(url,remotep=False)
             for line in content:
                 imgurls = re.findall(p,line)
+            content.close()
         return [{'url':self.img_url_prex+imgurl,'localname':os.path.join(self.prex,self.get_localname(imgurl))} for imgurl in imgurls]
 
     def downloadit(self,
             count += 1
             try:
                 f = open(ul['localname'],'wb')
-                f.write(img.read())
-            except Exception, e:
+                self.log.debug("type of img:%s"%type(img))
+                #print(img,file=f) print不支持二进制写,而这里img是二进制流,故出错
+                f.write(img)
+            except Exception as e:
                 raise e
             else:
                 self.log.info("%s has been download and stored locally."%ul['localname'])
+            finally:
+                f.close()
         self.log.info("%d images are downloaded."%count)
         return count
 
     def get_content(self,url,remotep=True):
         """
         覆盖DownloadLib中的同名方法
+        通过get_content读取已下载下来的全文浏览网页
+        因为返回的文件对象要供下一步模式匹配用,所以要用'r'选项。
         """
-        content = open(url,'rb')
+        #content = open(url,'r',encoding='utf-8')
+        content = open(url,'r') #因为是浏览器下载的,而不是脚本下载的,需不需要encoding参数?
         return content
     def __call__(self,from_remote,url,sp='',ep=''):
         count = 0
     """
     img = None
     try:
-        img = urllib2.urlopen(pair['url'])
-    except urllib2.URLError,e:
+        img = requests.get(pair['url']).content
+    except requests.exceptions.RequestException as e:
         log_to_stderr('%s,trying second time...'%e)
         time.sleep(10)
         try:
-            img = urllib2.urlopen(pair['url'])
-        except urllib2.URLError,e:
+            img = requests.get(pair['url']).content
+        except requests.exceptions.RequestException as e:
             log_to_stderr("try second time failed: %s\n%s wasn't downloaded."%(e,pair['url']))
         else:
             log_to_stderr("second time succeed.")
     if img:
         with open(pair['localname'],'wb') as f:
-            f.write(img.read())
+            f.write(img)
     else:
         log_to_stderr('img empty')
 
     chaoxing_img_url_prex =  'http://img.sslibrary.com/n/' # 超星读秀
     try:
         opts, args = getopt.getopt(sys.argv[1:],'h',['help','local','cx','resume','prex=','logfile=','sp=','ep=','procnum='])
-    except getopt.GetoptError, err:
-        print str(err)
+    except getopt.GetoptError as err:
+        sys.stderr.write(str(err))
         usage()
         sys.exit(2)
     prex = None
             sys.exit(0)
         elif o == '--prex':
             if a[-1] == '/' or a[-1] == '\\':
-                prex = prex[:-1]
+                prex = a[:-1]
+            else:
+                prex = a
         elif o == '--logfile':
             logfile = a
         elif o == '--local':