Commits

Stephen Tanner  committed 83ffc36

Added webmaster class to handle the heavy lifting. Can now grab links and filter based on include/exclude lists.
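
A minimal sketch of the new include/exclude filtering, assuming the WebMaster class added in webmaster.py below and a couple of hypothetical link titles:

    from bs4 import BeautifulSoup
    from webmaster import WebMaster

    #Hypothetical titles; only the first survives the include/exclude filter
    html = '<a class="title">[WTS] mechanical keyboard</a>' \
           '<a class="title">[SOLD] mechanical keyboard</a>'
    links = BeautifulSoup(html, "html.parser").find_all("a", class_="title")

    wm = WebMaster(includes=["keyboard"], excludes=["sold"])
    print([str(l.string) for l in wm.filter_links(links)])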

  • Parent commits c0e02ee

Comments (0)

Files changed (4)

+Requirements
+============
+
+- Beautiful Soup 4
+- aiohttp

File config.json.example

 {
-    sites: ["http://somesite.com/path/to/links"],
-    terms: ["term1", "term2", "terma", "termb" "termc"],
-    dabase: "path/to/sqlite.db"
-    phone: "5555555555"
+    "sites": ["http://somesite.com/path/to/links"],
+    "includes": ["term1", "term2", "terma", "termb" "termc"],
+    "database": "path/to/scavenge_sqlite.db"
+    "phone": "5555555555"
 }
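
A quick hedged sketch of loading a filled-in copy of this example (hypothetical filename config.json) and checking for the keys the main script below reads:

    import json

    #Hypothetical local copy of config.json.example
    with open("config.json") as cf:
        config = json.load(cf)

    #Keys the main script below actually reads right now
    for key in ("sites", "includes", "excludes"):
        assert key in config, "missing config key: " + key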
 #@author stanner
 #This is an async web scraper. It uses modules for individual sites to handle discrepancies
 #between the way they are organized and accessed.
 
 
+#imports
+import argparse
+import json
+import asyncio
+from webmaster import WebMaster
+
+#Set up argument parsing
+parser = argparse.ArgumentParser(description='Get notifications when search terms appear on certain BST forums.')
+parser.add_argument('--config', help='Location of the config file.')
+
+
 def main():
-    
+    cmdl_args = parser.parse_args()
+    config_file = cmdl_args.config
+
+    with open(config_file, 'r') as cf:
+        config = json.load(cf)
+
+    webmaster = WebMaster(config["includes"], config["excludes"])
+    loop = asyncio.get_event_loop()
+
+    #gather resolves to the coroutine results (one list of filtered links per site)
+    r = asyncio.gather(*[webmaster.fetch_reddit_links(u) for u in config["sites"]])
+    reddit_links = loop.run_until_complete(r)
+
+    #fetch the page behind every link that survived filtering
+    p = asyncio.gather(*[webmaster.get_link_data(links) for links in reddit_links])
+    page_data = loop.run_until_complete(p)
+    return page_data
+
+
+if __name__ == '__main__':
+    main()
 
 
 

File webmaster.py

+#This file holds the helper functions that let the scavenger app handle the
+#generic work of fetching links and parsing posts.
+from bs4 import BeautifulSoup
+import asyncio
+import aiohttp
+
+
+class WebMaster():
+
+    def __init__(self, includes, excludes):
+        self.include_list = includes
+        self.exclude_list = excludes
+
+
+    #Fetch the links from the right URL
+    #Look into how to modularize this method with specific sub methods for grepping links on
+    #the right pages.
+    @asyncio.coroutine
+    def fetch_reddit_links(self, URL):
+        #The yield from is required to call the @asyncio.coroutine methods
+        page = yield from self._GET(URL, compress=True)
+        bs_page = BeautifulSoup(page, "html.parser")
+        links = bs_page.find_all("a", class_="title")
+
+        filtered = self.filter_links(links)
+
+        return filtered
+
+    #Filter out links based on include/exclude lists
+    def filter_links(self, links):
+        keep = []
+        for link in links:
+            title = str(link.string).lower()
+            if self._filter_excludes(title) and self._filter_includes(title):
+                keep.append(link)
+        return keep
+
+
+    #Take a lowercased title string and return True if it should not be excluded.
+    #Returns False if a term from the exclude list is found in the title.
+    def _filter_excludes(self, title):
+        for exclude in self.exclude_list:
+            if exclude in title:
+                return False
+
+        return True
+
+
+    #Take a lowercased title string and return True if it should be included.
+    #Returns False if no term from the include list is found in the title.
+    def _filter_includes(self, title):
+        for include in self.include_list:
+            if include in title:
+                return True
+
+        return False
+
+
+    #Fetch all the pages behind a given list of links
+    @asyncio.coroutine
+    def get_link_data(self, links):
+        pages = []
+        for link in links:
+            href = self._reddit_href(link['href'])
+
+            page = yield from self._GET(href, compress=True)
+            pages.append(page)
+        return pages
+
+
+    #Async routine to run an HTTP GET on a specific URL
+    @asyncio.coroutine
+    def _GET(self, *args, **kwargs):
+        response = yield from aiohttp.request('GET', *args, **kwargs)
+        return (yield from response.read_and_close())
+
+
+    #Convert a relative reddit href into an absolute URL
+    def _reddit_href(self, href):
+        reddit = "http://reddit.com"
+        return reddit + href
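
For reference, a minimal sketch of driving a single fetch outside of main(), assuming a hypothetical subreddit URL and the same generator-based (yield from / @asyncio.coroutine) style used above:

    import asyncio
    from webmaster import WebMaster

    wm = WebMaster(includes=["keyboard"], excludes=["sold"])
    loop = asyncio.get_event_loop()

    #Hypothetical URL; run_until_complete drives the old-style coroutine to completion
    links = loop.run_until_complete(
        wm.fetch_reddit_links("http://reddit.com/r/somesubreddit"))
    print(len(links))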