Commits

Grigoriy Petukhov committed 30243b6

Remove obsolete files

Comments (0)

Files changed (8)

docs-redirect/Makefile

-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-PAPER         =
-BUILDDIR      = _build
-
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-
-.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html      to make standalone HTML files"
-	@echo "  dirhtml   to make HTML files named index.html in directories"
-	@echo "  pickle    to make pickle files"
-	@echo "  json      to make JSON files"
-	@echo "  htmlhelp  to make HTML files and a HTML help project"
-	@echo "  qthelp    to make HTML files and a qthelp project"
-	@echo "  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  changes   to make an overview of all changed/added/deprecated items"
-	@echo "  linkcheck to check all external links for integrity"
-	@echo "  doctest   to run all doctests embedded in the documentation (if enabled)"
-
-clean:
-	-rm -rf $(BUILDDIR)/*
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
-	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
-	@echo
-	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
-	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/redirect-docs.qhcp"
-	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/redirect-docs.qhc"
-
-latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo
-	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
-	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
-	      "run these through (pdf)latex."
-
-changes:
-	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
-	@echo
-	@echo "The overview file is in $(BUILDDIR)/changes."
-
-linkcheck:
-	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
-	@echo
-	@echo "Link check complete; look for any errors in the above output " \
-	      "or in $(BUILDDIR)/linkcheck/output.txt."
-
-doctest:
-	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
-	@echo "Testing of doctests in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/doctest/output.txt."

docs-redirect/conf.py

-# -*- coding: utf-8 -*-
-#
-# redirect-docs documentation build configuration file, created by
-# sphinx-quickstart on Tue Jun 19 23:05:19 2012.
-#
-# This file is execfile()d with the current directory set to its containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-import sys, os
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.append(os.path.abspath('.'))
-
-# -- General configuration -----------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be extensions
-# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = []
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix of source filenames.
-source_suffix = '.rst'
-
-# The encoding of source files.
-#source_encoding = 'utf-8'
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = u'redirect-docs'
-copyright = u'2012, Grigoriy Petukhov'
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-version = '0.0'
-# The full version, including alpha/beta/rc tags.
-release = '0.0'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#language = None
-
-# There are two options for replacing |today|: either, you set today to some
-# non-false value, then it is used:
-#today = ''
-# Else, today_fmt is used as the format for a strftime call.
-#today_fmt = '%B %d, %Y'
-
-# List of documents that shouldn't be included in the build.
-#unused_docs = []
-
-# List of directories, relative to source directory, that shouldn't be searched
-# for source files.
-exclude_trees = ['_build']
-
-# The reST default role (used for this markup: `text`) to use for all documents.
-#default_role = None
-
-# If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
-
-# If true, the current module name will be prepended to all description
-# unit titles (such as .. function::).
-#add_module_names = True
-
-# If true, sectionauthor and moduleauthor directives will be shown in the
-# output. They are ignored by default.
-#show_authors = False
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# A list of ignored prefixes for module index sorting.
-#modindex_common_prefix = []
-
-
-# -- Options for HTML output ---------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  Major themes that come with
-# Sphinx are currently 'default' and 'sphinxdoc'.
-html_theme = 'default'
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-#html_theme_options = {}
-
-# Add any paths that contain custom themes here, relative to this directory.
-#html_theme_path = []
-
-# The name for this set of Sphinx documents.  If None, it defaults to
-# "<project> v<release> documentation".
-#html_title = None
-
-# A shorter title for the navigation bar.  Default is the same as html_title.
-#html_short_title = None
-
-# The name of an image file (relative to this directory) to place at the top
-# of the sidebar.
-#html_logo = None
-
-# The name of an image file (within the static path) to use as favicon of the
-# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-#html_favicon = None
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
-# using the given strftime format.
-#html_last_updated_fmt = '%b %d, %Y'
-
-# If true, SmartyPants will be used to convert quotes and dashes to
-# typographically correct entities.
-#html_use_smartypants = True
-
-# Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
-
-# Additional templates that should be rendered to pages, maps page names to
-# template names.
-#html_additional_pages = {}
-
-# If false, no module index is generated.
-#html_use_modindex = True
-
-# If false, no index is generated.
-#html_use_index = True
-
-# If true, the index is split into individual pages for each letter.
-#html_split_index = False
-
-# If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
-
-# If true, an OpenSearch description file will be output, and all pages will
-# contain a <link> tag referring to it.  The value of this option must be the
-# base URL from which the finished HTML is served.
-#html_use_opensearch = ''
-
-# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
-#html_file_suffix = ''
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'redirect-docsdoc'
-
-
-# -- Options for LaTeX output --------------------------------------------------
-
-# The paper size ('letter' or 'a4').
-#latex_paper_size = 'letter'
-
-# The font size ('10pt', '11pt' or '12pt').
-#latex_font_size = '10pt'
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title, author, documentclass [howto/manual]).
-latex_documents = [
-  ('index', 'redirect-docs.tex', u'redirect-docs Documentation',
-   u'Grigoriy Petukhov', 'manual'),
-]
-
-# The name of an image file (relative to this directory) to place at the top of
-# the title page.
-#latex_logo = None
-
-# For "manual" documents, if this is true, then toplevel headings are parts,
-# not chapters.
-#latex_use_parts = False
-
-# Additional stuff for the LaTeX preamble.
-#latex_preamble = ''
-
-# Documents to append as an appendix to all manuals.
-#latex_appendices = []
-
-# If false, no module index is generated.
-#latex_use_modindex = True

docs-redirect/index.rst

-.. redirect-docs documentation master file, created by
-   sphinx-quickstart on Tue Jun 19 23:05:19 2012.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
-Grab - site scraping framework
-==============================
-
-.. warning::
-
-    This document is outdated. You can find the current Grab documentation on the official Grab website: http://grablib.org/docs
-
-Useful links
-------------
-
-:official site: http://grablib.org
-:repo & bugtracker: http://bitbucket.org/lorien/grab
-:documentation: http://grablib.org/docs
-:jabber conference: grablab@conference.jabber.ru
-:mail list: http://groups.google.com/group/python-grab
-:email contact: lorien@lorien.name
-
-.. toctree::
-   :maxdepth: 2

py3k.py

-#!/usr/bin/python3
-from grab import Grab
-
-g = Grab()
-g.go('http://ya.ru/')
-print(g.response.code)

test/spider_cache.py

 
 db = pymongo.Connection()['spider_test']
 
+def build_html():
+    return """
+    <head>
+        <title>ABC</title>
+    </head>
+    <body>
+        <a href="%(BASE_URL)s/">link #1</a>
+        <a href="%(BASE_URL)s/">link #2</a>
+    </body>
+    """ % {'BASE_URL': SERVER.BASE_URL}
+
 class SimpleSpider(Spider):
     def task_foo(self, grab, task):
         grab.setup(url=SERVER.BASE_URL)
         yield Task('bar', grab=grab)
+        for elem in grab.doc.select('//a'):
+            yield Task('bar', url=elem.attr('href'))
 
     def task_bar(self, grab, task):
         pass
 class TestSpiderCache(TestCase):
     def setUp(self):
         SERVER.reset()
+        SERVER.RESPONSE['get'] = build_html()
 
     def test_bug1(self):
         """
         * got exception
         """
 
+        class Bug1Spider(Spider):
+            def task_foo(self, grab, task):
+                grab.setup(url=SERVER.BASE_URL)
+                yield Task('bar', grab=grab)
+
+            def task_bar(self, grab, task):
+                pass
+
+        db.cache.remove({})
+        bot = Bug1Spider()
+        bot.setup_cache(backend='mongo', database='spider_test')
+        bot.setup_queue()
+        bot.add_task(Task('foo', SERVER.BASE_URL))
+        bot.run()
+
+
+    def test_mysql_cache(self):
+        db.cache.remove({})
+        bot = SimpleSpider()
+        bot.setup_cache(backend='mysql', database='spider_test',
+                        user='web', passwd='web-**')
+        bot.setup_queue()
+        bot.add_task(Task('foo', SERVER.BASE_URL))
+        bot.run()
+
+
+    def test_mysql_cache(self):
         db.cache.remove({})
         bot = SimpleSpider()
         bot.setup_cache(backend='mongo', database='spider_test')

test_mysql_cache.py

-#!/usr/bin/env python
-# coding: utf-8
-from grab.spider import Spider, Task
-from grab.tools.logs import default_logging
-from grab import Grab
-import pymongo
-import logging
-from urlparse import urlsplit, parse_qs, parse_qsl, urlunsplit, urljoin
-from grab.tools.lxml_tools import parse_html, render_html, drop_node, clone_node
-import traceback
-import urllib
-from collections import defaultdict
-import re
-
-class DefaultSpider(Spider):
-    initial_urls = ['http://desconto.ru/']
-    base_url = 'http://desconto.ru'
-
-    def task_initial(self, grab, task):
-        print 'title', grab.xpath_text('//title', '')
-        for elem in grab.xpath_list('//a'):
-            yield Task('page', url=elem.get('href'))
-
-    def task_page(self, grab, task):
-        print 'title', grab.xpath_text('//title', '')
-
-
-def main():
-    import sys 
-    default_logging()
-    cls = globals()[sys.argv[1] if len(sys.argv) > 1 else 'DefaultSpider']
-    bot = cls(thread_number=1)
-    bot.setup_cache(
-        backend='mysql',
-        database='encdic_cache',
-        use_compression=True,
-        user='web', passwd='web-**'
-    )
-    #bot.setup_cache(
-        #backend='mongo',
-        #database='spider_test',
-        #use_compression=True,
-    #)
-    #bot.load_proxylist('/web/proxy.txt', 'text_file')
-    try:
-        bot.run()
-    except KeyboardInterrupt:
-        pass
-    #bot.save_all_lists('var')
-    bot.save_list('fatal', 'var/fatal.txt')
-    print bot.render_stats()
-
-
-if __name__ == '__main__':
-    main()

test_spider_integrity.py

-# coding: utf-8
-"""
-Quick test to check that the Spider refactoring
-has not completely broken things.
-"""
-from grab.spider import Spider, Task
-import pymongo
-import logging
-
-db = pymongo.Connection()['spider_test']
-
-class TestSpider(Spider):
-    def task_generator(self):
-        yield Task('yandex', url='http://yandex.ru/')
-        yield Task('google', url='http://google.ru/')
-        yield Task('bitbucket_login', url='https://bitbucket.org/account/signin/?next=/')
-
-    def task_yandex(self, grab, task):
-        assert grab.xpath_text('//title') == u'Яндекс'
-        yield Task('yandex_from_cache', url=task.url)
-
-    def task_yandex_from_cache(self, grab, task):
-        assert grab.xpath_text('//title') == u'Яндекс'
-
-    def task_google(self, grab, task):
-        assert grab.xpath_text('//title') == u'Google'
-        yield Task('google_from_cache', url=task.url)
-
-    def task_google_from_cache(self, grab, task):
-        assert grab.xpath_text('//title') == u'Google'
-
-    def task_bitbucket_login(self, grab, task):
-        grab.set_input('username', 'grabtest')
-        grab.set_input('password', 'grabtest')
-        assert 'Log in' in grab.xpath_text('//title')
-        grab.submit(make_request=False)
-        yield Task('bitbucket_dashboard', grab=grab)
-
-    def task_bitbucket_dashboard(self, grab, task):
-        grab.xpath_exists('//li[@id="user-dropdown"]/a/span')
-        grab.setup(url='https://bitbucket.org/account/user/grabtest/')
-        yield Task('bitbucket_account', grab=grab)
-
-    def task_bitbucket_account(self, grab, task):
-        grab.xpath_exists('//input[@value="Save settings"]')
-
-
-if __name__ == '__main__':
-    logging.basicConfig(level=logging.DEBUG)
-    db.cache.remove()
-    bot = TestSpider(use_cache=True, cache_db='spider_test',
-                     thread_number=4)#, request_limit=2)
-    bot.run()
+#!/usr/bin/python3
+from grab import Grab
+
+g = Grab()
+g.go('http://ya.ru/')
+print(g.response.code)
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.