Source

light_novel / ln.py

# coding:utf-8
# 下载轻小说网站的帖子,并排版
# 输出为txt档
import re
import os
import sys
import urllib2

def main(args=None):
	USAGE = "Usage: python *.py [url]"
	print "start..."
	f_rows = []
	l_rows = []
	handle_txt = False
	title = "10000"
	if args == None:
		args = sys.argv[1:]

	if not args:
		print USAGE
		sys.exit(1)

	site = r"http://www.lightnovel.cn/favicon.ico"

	#尝试连上网络
	try:
		print site
		req = urllib2.Request(url=site)
		req.add_header('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
		req.add_header('Referer','http://www.lightnovel.cn')
		page = urllib2.urlopen(req)
	except:
		print "connect false..."
		sys.exit(1)
	else:
		print "connect successful..."

	#不下载网页,只整理txt
	if args[0] == '-f':
		number = []
		handle_txt = True
	else:
		url = args[0]
		number = re.compile(r'(?<=thread-)\d+(?=-\d)').findall(url)

	if not number:
		print "not found page..."
		pass
	else:
		#下载网页
		urls = [r"http://www.lightnovel.cn/thread-"+number[0]+"-%d-1.html" %i for i in range(1,3)]
		if os.path.exists("temp.txt"):
			print "temp.txt is exist..."
			os.unlink("temp.txt")
			print "delete file: temp.txt"
		for i in range(len(urls)):
			print urls[i]
			try:
				req = urllib2.Request(url=urls[i])
				req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1')
				req.add_header('Referer','http://www.lightnovel.cn')
				#req.add_header("Accept-Encoding", "gzip, deflate")
				page = urllib2.urlopen(req)
			except:
				print "can not connect to page......"
				sys.exit(1)
			else:
				f_data = page.read()
				if i == 0:
					try:
						title = re.compile(r"(?<=<title>).+(?= - \W{14} -)").findall(f_data)[0].replace(".","").strip()
					except:
						print "not found title"
					else:
						print title
				with open("temp.txt",'a') as f:
					f.write(f_data)
				page.close()

		handle_txt = True

	#处理txt文本
	if handle_txt:
		with open(title+".txt","w") as l:
			with open("temp.txt","r") as f:
				lines = f.readlines()
			for line in lines:
				line = line.strip()
				if not line:
					pass
				else:
					line = re.sub(r'&nbsp;',"",line).strip()
					#以<p align="left">开头的行
					f_rows = re.compile(r'(?<=^<p align="left">).+').findall(line)
					if f_rows:
						rows = re.split(r'</p><p align="left">',f_rows[0])
						for row in rows:
							#删除通用<>标签
							c = re.sub(r'<[\s/\w]+>',"",row)
							#删除字体标签face|color|size
							c = re.sub(r'<font\sface=".{1,16}">',"",c)
							c = re.sub(r'<font\s\w{4,5}="\S{1,6}">',"",c)
							l.write(c+"\n")
					else:
						#以中文字符开头的行
						if line and list(line)[0] not in ('<','{','}','('):
							l_rows = re.compile(r'^\W{3}.+').findall(line)
							if l_rows:
								row = l_rows[0]
								#删除通用<>标签
								c = re.sub(r'<[\s/\w]+>',"",row)
								if "<a href" in c or "\xbe\xe0\xc0\xeb\xcf\xc2\xd2\xbb\xbc\xb6\xbb\xb9\xd0\xe8" in c:
									pass
								else:
									l.write(c+"\n")

# Script entry point: run main() with the command-line arguments, then report completion.
if __name__ == "__main__":
	main()
	print "done"
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.