Stephen Tanner / scavenger

Commits

Stephen Tanner committed 925826c

Reorganized my async calls so that the sync code makes a single call to start off the async worker class.

  • Parent commits 18c4b19
  • Branches default
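
A minimal sketch of the pattern the commit message describes, assuming nothing beyond what the diff below shows: the synchronous code makes a single run_until_complete call into one coroutine on the worker class, and that coroutine drives all further async work itself. The Worker, start, and process names are hypothetical stand-ins for WebMaster and its methods, written in the same Python 3.4-era @asyncio.coroutine style this repo uses.

import asyncio

class Worker:
    #hypothetical stand-in for WebMaster: one coroutine entry point
    #that fans out to the rest of the async work internally
    @asyncio.coroutine
    def start(self, items):
        for item in items:
            result = yield from self.process(item)
            print(result)

    @asyncio.coroutine
    def process(self, item):
        yield from asyncio.sleep(0)  #placeholder for real async I/O
        return item.upper()

#the sync code makes exactly one call into the async world
loop = asyncio.get_event_loop()
loop.run_until_complete(Worker().start(["a", "b"]))

The payoff is that the event loop plumbing lives in exactly one place in the sync script.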

Comments (0)

Files changed (3)

File config.json.example

 {
     "sites": {
         "site_name": {
-            "url": "http://somesite.com/path/to/links",
-            "class": "class_of_link"
+            "path": "/path/to/links",
+            "class": "class_of_link",
+            "domain": "http://somesite.com"
             }
         },
     "includes": ["term1", "term2", "terma", "termb", "termc"],

File scavenge.py

 from webmaster import WebMaster
 
 #setup argument parsing
-parser = argparse.ArgumentParser(description='Get notifications when search terms appear on certain BST forums.')
+parser = argparse.ArgumentParser(description='Get notifications when search terms appear on certain forums.')
 parser.add_argument('--config', help='Location of the config file.')
 
 
 
     webmaster = WebMaster(config["includes"], config["excludes"])
     loop = asyncio.get_event_loop()
-    f = asyncio.wait([webmaster.fetch_links(config["sites"][u]) for u in config["sites"]])
-    links_future = loop.run_until_complete(f)[0].pop()
-    links = links_future.result()
-
-    for link in links:
-        print(link)
+    # f = asyncio.wait([webmaster.fetch_links(config["sites"][u]) for u in config["sites"]])
+    loop.run_until_complete(webmaster.start_fetch(config["sites"]))
+    # links_future = loop.run_until_complete(f)[0].pop()
+    # links = links_future.result()
+    #
+    # for link in links:
+    #     print(link)
 
 
     #p = asyncio.wait([webmaster.get_link_data(l) for l in links])
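
For contrast, the lines removed above needed the [0].pop() and .result() steps because asyncio.wait resolves to a (done, pending) pair of future sets rather than to the results themselves. A toy illustration under the same Python 3.4-era asyncio (fetch is a hypothetical coroutine):

import asyncio

@asyncio.coroutine
def fetch(n):
    yield from asyncio.sleep(0)
    return n * 2

loop = asyncio.get_event_loop()

#asyncio.wait resolves to (done, pending) sets of futures, hence the
#old [0].pop() and .result() gymnastics to dig out a single result
done, pending = loop.run_until_complete(asyncio.wait([fetch(i) for i in range(3)]))
results = [f.result() for f in done]
print(sorted(results))  #[0, 2, 4]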

File webmaster.py

+import ipdb
 #This file will hold the helper functions to allow the scavenger app to handle the
 #generic features of fetching links and parsing posts.
 from bs4 import BeautifulSoup
         self.exclude_list = excludes
 
 
+    #This design was recommended by some helpful members on #python (freenode)
+    #thank you ε and whg
+    @asyncio.coroutine
+    def start_fetch(self, sites):
+        for site in sites:
+            url = sites[site]['domain'] + sites[site]['path']
+            links = yield from self.fetch_links(url, sites[site]['class'])
+            for link in links:
+                href = sites[site]['domain'] + link['href']
+                page = yield from self._GET(href, compress='True')
+                ipdb.set_trace()  ######### Break Point ###########
+
+
     #Fetch the links from the right URL
     #Look into how to modularize this method with specific sub methods for grepping links on
     #the right pages.
     @asyncio.coroutine
-    def fetch_links(self, site):
+    def fetch_links(self, url, link_class):
 
         #The yield from is required to call the @async.coroutine methods
-        page = yield from self._GET(site['url'], compress='True')
+        page = yield from self._GET(url, compress='True')
         bs_page = BeautifulSoup(page)
-        links = bs_page.find_all("a", class_=site['class'])
+        links = bs_page.find_all("a", class_=link_class)
 
         filtered = self.filter_links(links)
 
         return filtered
 
 
-    #Fetch all the pages in a given list
-    @asyncio.coroutine
-    def get_link_data(self, links):
-        pages = []
-        for link in links:
-            href = self._reddit_href(link['href'])
-            title = link.string
-
-            page = yield from self._GET(href, compress='True')
-            pages.append(page)
-
-        return pages
-
-
     #Async routine to run HTTP GET method on specific URL
     @asyncio.coroutine
     def _GET(self, *args, **kwargs):
         response = yield from aiohttp.request('GET', *args, **kwargs)
         return (yield from response.read_and_close())
-
-
-    #Method to convert reddit hrefs
-    def _reddit_href(self, href):
-        reddit = "http://reddit.com"
-        return reddit + href
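
The new start_fetch is the top of a plain coroutine chain: each @asyncio.coroutine method yield-froms the next, so the single run_until_complete call in scavenge.py drives everything down to _GET. A self-contained sketch of that delegation shape, with asyncio.sleep standing in for the aiohttp request and parsing elided:

import asyncio

@asyncio.coroutine
def _get(url):
    yield from asyncio.sleep(0)  #stand-in for the real aiohttp GET
    return "<html>page for %s</html>" % url

@asyncio.coroutine
def fetch_links(url):
    page = yield from _get(url)
    return [page]  #BeautifulSoup parsing elided

@asyncio.coroutine
def start_fetch(urls):
    #sequentially yield from each sub-coroutine, as in webmaster.py
    for url in urls:
        links = yield from fetch_links(url)
        print(links)

asyncio.get_event_loop().run_until_complete(start_fetch(["http://example.com/path"]))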