Commits

mazesoul committed afe4173 Draft

init


Files changed (16)

.project

+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>jiepang</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.python.pydev.PyDevBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.python.pydev.pythonNature</nature>
+	</natures>
+</projectDescription>

.pydevproject

+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?>
+
+<pydev_project>
+<pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
+<path>/jiepang</path>
+</pydev_pathproperty>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">python</pydev_property>
+</pydev_project>

.settings/org.eclipse.core.resources.prefs

+eclipse.preferences.version=1
+encoding//jiepang/pipelines.py=utf-8
+encoding//jiepang/spiders/jiepang_spider.py=utf-8

jiepang/__init__.py

Empty file added.

jiepang/__init__.pyc

Binary file added.

jiepang/items.py

+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/topics/items.html
+
+from scrapy.item import Item, Field
+
+class AddressItem(Item):
+    name = Field()
+    city = Field()
+    address = Field()
+    lat = Field()
+    lon = Field()
+    address_type = Field()
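
For reference, a minimal sketch of how such an Item is used (the field values
here are made-up placeholders):

    from jiepang.items import AddressItem

    item = AddressItem(name=u'Example Venue', city=u'beijing',
                       address=u'1 Example Rd', lat=u'39.90', lon=u'116.40',
                       address_type=u'food')
    print dict(item)  # Items convert to plain dicts; the pipeline relies on this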

jiepang/items.pyc

Binary file added.

jiepang/pipelines.py

+#encoding:utf-8
+from pymongo import Connection
+from scrapy.exceptions import DropItem
+from scrapy.conf import settings
+from jiepang.items import AddressItem
+
+class JiepangPipeline(object):
+    """Store scraped AddressItems in the 'addresslist' MongoDB collection."""
+
+    def __init__(self):
+        connection = Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
+        db = connection[settings['MONGODB_DB']]
+        # Only authenticate when credentials are configured; calling
+        # authenticate() with the empty defaults from settings.py would fail.
+        if settings['MONGO_USER']:
+            db.authenticate(settings['MONGO_USER'], settings['MONGO_PWD'])
+        self.address = db['addresslist']
+
+    def process_item(self, item, spider):
+        if isinstance(item, AddressItem):
+            self.address.insert(dict(item))
+        else:
+            raise DropItem("Unexpected item type: %s" % type(item).__name__)
+        return item
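
A side note on the driver API: pymongo's Connection class was removed in
pymongo 3.0. If this pipeline is ever run against a newer driver, a rough
equivalent of the connection setup (same settings keys assumed) would be:

    from pymongo import MongoClient

    client = MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
    db = client[settings['MONGODB_DB']]
    address = db['addresslist']
    address.insert_one(dict(item))  # insert() is deprecated in newer drivers; insert_one() replaces it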

jiepang/pipelines.pyc

Binary file added.

jiepang/settings.py

+# Scrapy settings for jiepang project
+#
+# For simplicity, this file contains only the most important settings by
+# default. All the other settings are documented here:
+#
+#     http://doc.scrapy.org/topics/settings.html
+#
+
+BOT_NAME = 'jiepang'
+BOT_VERSION = '1.0'
+
+LOG_LEVEL = 'INFO'
+
+SPIDER_MODULES = ['jiepang.spiders']
+NEWSPIDER_MODULE = 'jiepang.spiders'
+USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
+
+# Use the non-_BASE setting so Scrapy's default scheduler middlewares are
+# merged in rather than replaced outright.
+SCHEDULER_MIDDLEWARES = {
+    'scrapy.contrib.schedulermiddleware.duplicatesfilter.DuplicatesFilterMiddleware': 500,
+}
+
+ITEM_PIPELINES = [
+    'jiepang.pipelines.JiepangPipeline',
+]
+
+MONGODB_SERVER = "localhost"
+MONGODB_PORT = 27017
+MONGODB_DB = "jiepang"
+MONGO_USER = ""
+MONGO_PWD = ""
+
+
+DOWNLOAD_DELAY = 1
+
+CITY_LIST = ["beijing", "shanghai", "tianjin", "chongqing", "akesu", "anning", "anqing", "anshan", "anshun", "anyang", "baicheng", "baishan", "baiyin", "bengbu", "baoding", "baoji", "baoshan", "bazhong", "beihai", "benxi", "binzhou", "bole", "bozhou", "cangzhou", "changde", "changji", "changshu", "changzhou", "chaohu", "chaoyang", "chaozhou", "chengde", "chengdu", "chenggu", "chenzhou", "chibi", "chifeng", "chishui", "chizhou", "chongzuo", "chuxiong", "chuzhou", "cixi", "conghua", "dali", "dalian", "dandong", "danyang", "daqing", "datong", "dazhou", "deyang", "dezhou", "dongguan", "dongyang", "dongying", "douyun", "dunhua", "eerduosi", "enshi", "fangchenggang", "feicheng", "fenghua", "fushun", "fuxin", "fuyang", "fuyang1", "fuzhou", "fuzhou1", "ganyu", "ganzhou", "gaoming", "gaoyou", "geermu", "gejiu", "gongyi", "guangan", "guangyuan", "guangzhou", "gubaotou", "guigang", "guilin", "guiyang", "guyuan", "haerbin", "haicheng", "haikou", "haimen", "haining", "hami", "handan", "hangzhou", "hanzhong", "hebi", "hefei", "hengshui", "hengyang", "hetian", "heyuan", "heze", "huadou", "huaian", "huaibei", "huaihua", "huainan", "huanggang", "huangshan", "huangshi", "huhehaote", "huizhou", "huludao", "huzhou", "jiamusi", "jian", "jiangdou", "jiangmen", "jiangyin", "jiaonan", "jiaozhou", "jiaozuo", "jiashan", "jiaxing", "jiexiu", "jilin", "jimo", "jinan", "jincheng", "jingdezhen", "jinghong", "jingjiang", "jingmen", "jingzhou", "jinhua", "jining1", "jining", "jinjiang", "jintan", "jinzhong", "jinzhou", "jishou", "jiujiang", "jiuquan", "jixi", "jiyuan", "jurong", "kaifeng", "kaili", "kaiping", "kaiyuan", "kashen", "kelamayi", "kuerle", "kuitun", "kunming", "kunshan", "laibin", "laiwu", "laixi", "laizhou", "langfang", "lanzhou", "lasa", "leshan", "lianyungang", "liaocheng", "liaoyang", "liaoyuan", "lijiang", "linan", "lincang", "linfen", "lingbao", "linhe", "linxia", "linyi", "lishui", "liuan", "liupanshui", "liuzhou", "liyang", "longhai", "longyan", "loudi", "luohe", "luoyang", "luxi", "luzhou", "lvliang", "lvshun", "maanshan", "maoming", "meihekou", "meishan", "meizhou", "mianxian", "mianyang", "mudanjiang", "nanan", "nanchang", "nanchong", "nanjing", "nanning", "nanping", "nantong", "nanyang", "neijiang", "ningbo", "ningde", "panjin", "panzhihua", "penglai", "pingdingshan", "pingdu", "pinghu", "pingliang", "pingxiang", "pulandian", "puning", "putian", "puyang", "qiannan", "qidong", "qingdao", "qingyang", "qingyuan", "qingzhou", "qinhuangdao", "qinzhou", "qionghai", "qiqihaer", "quanzhou", "qujing", "quzhou", "rikaze", "rizhao", "rongcheng", "rugao", "ruian", "rushan", "sanmenxia", "sanming", "sanya", "xiamen", "shan", "shangluo", "shangqiu", "shangrao", "shangyu", "shantou", "ankang", "shaoguan", "shaoxing", "shaoyang", "shenyang", "shenzhen", "shihezi", "shijiazhuang", "shilin", "shishi", "shiyan", "shouguang", "shuangyashan", "shuozhou", "shuyang", "simao", "siping", "songyuan", "suining", "suizhou", "suzhou", "tacheng", "taian", "taicang", "taixing", "taiyuan", "taizhou", "taizhou1", "tangshan", "tengchong", "tengzhou", "tianmen", "tianshui", "tieling", "tongchuan", "tongliao", "tongling", "tonglu", "tongren", "tongxiang", "tongzhou", "tonghua", "tulufan", "wafangdian", "weifang", "weihai", "weinan", "wendeng", "wenling", "wenzhou", "wuhai", "wuhan", "wuhu", "wujiang", "wulanhaote", "wuwei", "wuxi", "wuzhou", "xian", "xiangcheng", "xiangfan", "xianggelila", "xiangshan", "xiangtan", "xiangxiang", "xianning", "xiantao", "xianyang", "xicang", "xichang", "xingtai", "xingyi", "xining", "xinxiang", "xinyang", "xinyu", "xinzhou", "suqian", "suyu", "suzhou1", "xuancheng", "xuchang", "xuzhou", "yaan", "yakeshi", "yanan", "yanbian", "yancheng", "yangjiang", "yangquan", "yangzhou", "yanji", "yantai", "yanzhou", "yibin", "yichang", "yichun", "yichun1", "yili", "yinchuan", "yingkou", "yingtan", "yining", "yiwu", "yixing", "yiyang", "yongkang", "yongzhou", "yueyang", "yuhuan", "yulin1", "yulin", "yuncheng", "yuxi", "yuyao", "zaozhuang", "zengcheng", "changchun", "changhai", "zhangjiagang", "zhangjiajie", "zhangjiakou", "changle", "zhangqiu", "changsha", "zhangye", "changzhi", "zhangzhou", "zhanjiang", "zhaodong", "zhaoqing", "zhaotong", "zhengzhou", "zhenjiang", "zhongshan", "zhoukou", "zhoushan", "zhucheng", "zhuhai", "zhuji", "zhumadian", "zhuzhou", "zibo", "zigong", "zunyi", "wulumuqi", "fuqing", "ezhou", "baotou", "xiaoshan", "xuanhua", "jiangyou", "ziyang", "xinji", "foshan", "wanzhou", "zoucheng", "shaowu", "jiangyan", "xiangyin", "songjiang", "qitaihe", "liling", "fuling", "gongzhuling", "shexian", "xinghua"]
+CATEGORY_LIST = ["0601", "0603", "0604", "0701", "0702", "0704", "0705", "0706", "0711"]
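
One pitfall worth flagging for a list this long: Python silently concatenates
adjacent string literals, so a missing comma between two city names produces
one merged entry instead of a syntax error:

    >>> ["beijing", "shanghai" "tianjin"]  # note the missing comma
    ['beijing', 'shanghaitianjin']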

jiepang/settings.pyc

Binary file added.

jiepang/spiders/__init__.py

+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

jiepang/spiders/__init__.pyc

Binary file added.

jiepang/spiders/jiepang_spider.py

+#encoding:utf-8
+from pymongo import Connection
+from scrapy.spider import BaseSpider
+from scrapy.http import Request, FormRequest
+from scrapy.selector import HtmlXPathSelector
+from scrapy.conf import settings
+from jiepang.items import AddressItem
+
+class JiepangSpider(BaseSpider):
+    name = "jiepang"
+    allowed_domains = ["jiepang.com"]
+    start_urls = [
+                  "http://jiepang.com/"
+                  ]
+
+    def __init__(self):
+        super(JiepangSpider, self).__init__()
+        connection = Connection(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
+        db = connection[settings['MONGODB_DB']]
+        # Only authenticate when credentials are configured; the defaults in
+        # settings.py are empty strings.
+        if settings['MONGO_USER']:
+            db.authenticate(settings['MONGO_USER'], settings['MONGO_PWD'])
+        self.address = db['addresslist']
+        self.urllist = db['urllist']
+        self.crawled_count = 0
+
+    def parse(self, response):
+        # Log in first; the listing requests are issued from after_login
+        # once the session is established.
+        return [FormRequest(
+                    url="http://jiepang.com/index/login",
+                    formdata={'user': 'spider_007', 'pwd': '123456', 'save-pwd': 'on'},
+                    callback=self.after_login)]
+
+    def after_login(self, response):
+        if "header-login-fail" in response.body:
+            print "Login failed"
+            return
+        # Fan out one listing request per (category, city) pair.
+        parse_list = []
+        for c_id in settings['CATEGORY_LIST']:
+            for city in settings['CITY_LIST']:
+                request = Request(url="http://jiepang.com/city/category?c_id=%s&city=%s" % (c_id, city), callback=self.parsePage)
+                request.meta['city'] = city
+                parse_list.append(request)
+        return parse_list
+
+    def parsePage(self, response):
+        hxs = HtmlXPathSelector(response)
+        city = response.meta['city']
+        # One venue per <li>; pull each detail link relative to its own <li>
+        # rather than re-querying the document root by positional index.
+        items = hxs.select("//div[@id='user-venues']/ul[@class='venue-list']/li")
+        for itemLink in items:
+            url = itemLink.select(".//div[@class='info']/p[@class='title']/a/@href").extract()[0]
+            if self.urllist.find({"url": url}).count():
+                continue
+            self.urllist.insert({"url": url})
+            url = "http://jiepang.com%s" % url
+            request = Request(url, callback=self.parseItem)
+            request.meta['city'] = city
+            yield request
+        # Follow pagination links, skipping any page that is already recorded.
+        urls = hxs.select("//div[@id='user-venues']/ul[@class='paginator']/li/a/@href").extract()
+        for url in urls:
+            if self.urllist.find({"url": url}).count():
+                continue
+            self.urllist.insert({"url": url})
+            url = "http://jiepang.com%s" % url
+            request = Request(url, callback=self.parsePage)
+            request.meta['city'] = city
+            yield request
+
+    def parseItem(self, response):
+        city = response.meta['city']
+        try:
+            hxs = HtmlXPathSelector(response)
+            # The interesting elements all carry ids, so anchor on those
+            # instead of spelling out the full path from the document root.
+            name = hxs.select("//h2[@id='venue-title']/text()").extract()[0]
+            address = hxs.select("//div[@id='venue-basic']/p[1]/text()").extract()[0]
+            address_type = hxs.select("//h2[@id='venue-title']/img[@class='venueicon ']/@alt").extract()[0]
+            lat = hxs.select("//div[@id='big-venue-map']/@data-lat").extract()[0]
+            lon = hxs.select("//div[@id='big-venue-map']/@data-lon").extract()[0]
+            print city, address_type, name, address, lat, lon
+            addr = AddressItem(name=name, address_type=address_type, address=address, lon=lon, lat=lat, city=city)
+        except Exception, e:
+            # Log and skip venue pages that do not match the expected layout.
+            print e
+        else:
+            yield addr
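
With the settings, item, pipeline, and spider in place, the crawl is started
from the project root (the directory containing scrapy.cfg):

    scrapy crawl jiepang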

jiepang/spiders/jiepang_spider.pyc

Binary file added.

scrapy.cfg

+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# http://doc.scrapy.org/topics/scrapyd.html
+
+[settings]
+default = jiepang.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = jiepang
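
Uncommenting the url line allows the project to be pushed to a running scrapyd
instance; see the scrapyd documentation linked above for the deploy workflow.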