1. luoboiqingcai
  2. downloadduxiu

Commits

luoboiqingcai  committed d0fe7ed

增加了strategy.py模块,其中定义了用以区分下载方式的类。改进了命令行的接口。修复了一些潜在的漏洞。

  • Participants
  • Parent commits a898683
  • Branches default

Comments (0)

Files changed (2)

File downloadlib.py

View file
 #import platform
 import os
 import os.path
+
+from strategy import (FromDuxiuRemote, FromDuxiuLocal, FromChaoxing)
 from multiprocessing import (cpu_count, Pool, log_to_stderr)
 
 __author__ = "luoboiqingcai"
         对于不同的策略(strategy),处理不同的情况。
         对于读秀下载下来的网页,其中包括完整的图片地址,而对于读秀远程和超星已下载,并不直接包含完整的图片地址,需要从其中提出信息进行处理。
 
-        1: 读秀远程
-        2: 读秀已下载
-        3: 超星已下载
+        - FromDuxiuRemote() 读秀远程
+        - FromDuxiuLocal() 读秀已下载
+        - FromChaoxing() 超星已下载
         """
         imgurls = []
         p = re.compile(self.pattern) # 匹配图片的相对地址
         p1 = re.compile(r'.?jpgRange = "(\d+)-(\d+)";') #匹配页码范围
-        if strategy in (1,3):
+        if isinstance(strategy, (FromDuxiuRemote,FromChaoxing)):
             #用于从读秀远程取得的网页和下载下来的超星网页
             matched = None
             matched1 = None
-            if strategy == 1:
-                content = self.get_content(url,remotep=True)
-            elif strategy == 3:
-                content = self.get_content(url,remotep=False)
+            if isinstance(strategy, FromDuxiuRemote):
+                content = self.get_content(url, remotep=True)
+            elif:
+                content = self.get_content(url, remotep=False)
             for line in content:
                 match = re.match(p,line)
                 match1 = re.search(p1,line)
             sys.stderr.flush() #如果没有这句,标准错误流就不会立即打印。
             #imgurls = map(lambda x:imgurl+x[-6:]+'?.',['000000%d'%p for p in range(start_p,end_p+1)])
             imgurls = [imgurl+'{0:06}?.'.format(p) for p in range(start_p,end_p+1)]
-        elif strategy == 2:
+        elif isinstance(strategy, FromDuxiuLocal):
             # 读秀已下载
             content = self.get_content(url,remotep=False)
             for line in content:
         strategy
             对于不同的策略(strategy),处理不同的情况。
             对于读秀下载下来的网页,其中包括完整的图片地址,而对于读秀远程和超星已下载,并不直接包含完整的图片地址,需要从其中提出信息进行处理。
-
-             1: 读秀远程
-             2: 读秀已下载
-             3: 超星已下载
+              
+             - FromDuxiuRemote() 读秀远程
+             - FromDuxiuLocal() 读秀已下载
+             - FromChaoxing() 超星已下载
 
         sp
             从sp页开始下载,如果sp小于网页上的最小页码,则还是按网页上的最小页码开始下载。对已经下载下来的duxiu网页没有作用(remote==false)
     print('''This Python script can be used to download duxiu documents.
 use this software at your own risk
 author:%s %s
-downlaodduxiu.py [options] arg
-Note: this script only support one argument.
+downlaodduxiu.py [options] arg0 arg1
+Note: this script only support two argument.
 Options:
 -h --help print help information
 --prex download dir
 --logfile logfile
---cx download from chaoxing
---method ,MUST BE PROVIDED when absence of --cx! 1:remote fetch duxiu; 2: fetch duxiu img whose address is fetched whithin downloaded page; 3:fetch chaoxing imag whose address can be inferred from downloaded page;
---resume ,resume recent breaken download schedule based on logfile, take preference to --sp and --ep. If the program run with --sp or --ep options and is terminated normally. --resume will NOT make any sense
+--resume resume recent breaken download schedule based on logfile, take preference to --sp and --ep. If the program run with --sp or --ep options and is terminated normally. --resume will NOT make any sense
 --sp specify where download start
 --ep specify where download end
 --procnum specify whether utilize multiprocessing, only work when ep and sp parameters are not used. This option apply for both chaoxing and duxiu
 
+arg0 "duxiu_url" :remote fetch duxiu; "duxiu_file": fetch duxiu img whose address is fetched whithin downloaded page; chaoxing:fetch chaoxing imag whose address can be inferred from downloaded page;
+arg1 the url or file.
+
 EXAMPLE:
-./downloadlib.py --prex=myprex --logfile=logfile --method=1 "url of duxiu book location"
+./downloadlib.py --prex=myprex --logfile=logfile duxiu_url "url of duxiu book location"
 
 '''%(__author__,__contact__))
 
         sys.exit(2)
     prex = None
     logfile = None
-    strategy = None
     resume = False
     sp = ep = ''
     processes = 1
                 prex = a
         elif o == '--logfile':
             logfile = a
-        elif o == '--method':
-            strategy = int(a)
         elif o == '--resume':
             resume = True
         elif o == '--sp':
             assert False, "unhandled option"
     if processes > 1:
         pool = Pool(processes=processes)
-    if len(args)>1:
-        print("only support one argument one time")
+    if len(args)>2:
+        print("only support two arguments one time")
         exit(1)
     else:
-        arg = args[0]
-    if strategy == 3:
+        libtype_ = args[0]
+        if args[0] == "chaoxing":
+            libtype = FromChaoxingRemote()
+        elif args[0] == "duxiu_url":
+            libtype = FromDuxiuRemote()
+        elif args[0] == "duxiu_file":
+            libtype = FromDuxiuLocal()
+        else:
+            raise ValueError("wrong libtype, should be chosen among chaoxing,duxiu_url,duxiu_file.")
+        arg = args[1]
+    if libtype_ == "chaoxing":
         instance = DownloadLib(lnp=lnp,pattern=pattern_for_chaoxing,img_url_prex=chaoxing_img_url_prex,prex=prex,logfile=logfile)
         if resume:
-            instance(from_remote,arg,sp=resume_point(os.path.join(prex,logfile)))
+            instance(libtype, arg, sp=resume_point(os.path.join(prex,logfile)))
             sys.exit(0)
         if sp and ep:
-            instance(from_remote,arg,sp=sp,ep=ep)
+            instance(libtype, arg, sp=sp, ep=ep)
             sys.exit(0)
         if sp:
-            instance(from_remote,arg,sp=sp)
+            instance(libtype, arg, sp=sp)
             sys.exit(0)
         if ep:
-            instance(from_remote,arg,ep=ep)
+            instance(libtype, arg, ep=ep)
             sys.exit(0)
         if processes > 1:
             #print instance.get_img_pairs(from_remote,arg)
             #TODO
-            pool.map_async(multidownloadlib,instance.get_img_pairs(strategy,arg),processes)
+            pool.map_async(multidownloadlib,instance.get_img_pairs(libtype, arg),processes)
             pool.close()
             pool.join()
             sys.exit(0)
         else:
-            instance(strategy,arg)
+            instance(libtype,arg)
             sys.exit(0)
-    elif strategy in (1,2): # from duxiu
-        if strategy == 1:
+    elif libtype_ in ("duxiu_url","duxiu_file"): # from duxiu
+        if libtype == "duxiu_url":
             instance = DownloadLib(lnp=lnp,pattern=pattern_for_duxiu_r,img_url_prex=img_url_prex,prex=prex,logfile=logfile)
         else:
             instance = DownloadLib(lnp=lnp,pattern=pattern_for_duxiu_l,img_url_prex=img_url_prex,prex=prex,logfile=logfile)
         if processes == 1:
-            instance(strategy,arg)
+            instance(libtype,arg)
         else:
-            pool.map_async(multidownloadlib,instance.get_img_pairs(strategy,arg),processes)
+            pool.map_async(multidownloadlib,instance.get_img_pairs(libtype,arg),processes)
             pool.close()
             pool.join()
     sys.exit(0)

File strategy.py

View file
+# -*- encoding:utf-8 -*-
+        1: ����Զ��
+        2: ����������
+        3: ����������
+
+class DownloadStrategy(object):
+    '''
+    Base class for download strategies.
+    '''
+    pass
+
+class FromDuxiuRemote(DownloadStrategy):
+    '''
+    download from duxiu remote url.
+    '''
+    pass
+
+class FromDuxiuLocal(DownloadStrategy):
+    '''
+    download from duxiu local file.
+    '''
+    pass
+
+class FromChaoxing(DownloadStrategy):
+    '''
+    download from chaoxing localfile.
+    '''
+    pass