Commits

Stephen Tanner committed ca56417

Can now parse out basic data for lists of links. Working on filtering out urls that have already been seen.

  • Parent commits 925826c


Files changed (3)

File config.json.example

     "sites": {
         "site_name": {
             "path": "/path/to/links",
-            "class": "class_of_link",
-            "domain": "http://somesite.com"
+            "link_class": "class_of_link",
+            "domain": "http://somesite.com",
+            "post_class": "class_of_post"
             }
         },
     "includes": ["term1", "term2", "terma", "termb", "termc"],
-    "excludes": ["termx", "termy", "termz"]
+    "excludes": ["termx", "termy", "termz"],
     "database": "path/to/sqlite_scavenge.db",
-    "email": "you@yourdomain.com"
+    "email": {
+        "to": "you@yourdomain.com",
+        "from": "you_other_email@yourdomain.com",
+        "host": "stmp_host",
+        "port": "smtp_port",
+        "user": "smtp_user",
+        "pass": "smtp_pass"
+    }
+
 }
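
For reference, the complete config.json.example after this change assembles to roughly the following (all values are placeholders):

    {
        "sites": {
            "site_name": {
                "path": "/path/to/links",
                "link_class": "class_of_link",
                "domain": "http://somesite.com",
                "post_class": "class_of_post"
            }
        },
        "includes": ["term1", "term2", "terma", "termb", "termc"],
        "excludes": ["termx", "termy", "termz"],
        "database": "path/to/sqlite_scavenge.db",
        "email": {
            "to": "you@yourdomain.com",
            "from": "you_other_email@yourdomain.com",
            "host": "smtp_host",
            "port": "smtp_port",
            "user": "smtp_user",
            "pass": "smtp_pass"
        }
    }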
         text = cf.read()
         config = json.loads(text)
 
-    webmaster = WebMaster(config["includes"], config["excludes"])
+    webmaster = WebMaster(config["includes"], config["excludes"], config["database"], config['email'])
     loop = asyncio.get_event_loop()
-    # f = asyncio.wait([webmaster.fetch_links(config["sites"][u]) for u in config["sites"]])
     loop.run_until_complete(webmaster.start_fetch(config["sites"]))
-    # links_future = loop.run_until_complete(f)[0].pop()
-    # links = links_future.result()
-    #
-    # for link in links:
-    #     print(link)
 
 
-    #p = asyncio.wait([webmaster.get_link_data(l) for l in links])
-    #page_future = loop.run_until_complete(p)[0].pop()
-    #page_data = page_future.result()
 
-
+    #close the event loop
+    loop.close()
 
 if __name__ == "__main__":
     # execute only if run as a script

File webmaster.py

 from bs4 import BeautifulSoup
 import asyncio
 import aiohttp
-
+import sqlite3
+import datetime
+import smtplib
+import hashlib
+import urllib.parse
 
 class WebMaster():
 
-    def __init__(self, includes, excludes):
+    def __init__(self, includes, excludes, db_location, email_config):
         self.include_list = includes
         self.exclude_list = excludes
+        self.db_location = db_location
+        self.db_conn = sqlite3.connect(self.db_location)
+        self.db_cursor = self.db_conn.cursor()
+        self.email = email_config
 
 
     #This design was recommended by some helpful members on #python (freenode)
     #thank you ε and whg
     @asyncio.coroutine
     def start_fetch(self, sites):
+        posts = []
         for site in sites:
+            url = sites[site]['domain'] + sites[site]['path']
+            links = yield from self.fetch_links(url, sites[site]['link_class'])
 
 
-            url = sites[site]['domain'] + sites[site]['path']
-            links = yield from self.fetch_links(url, sites[site]['class'])
-            for link in links:
-                href = sites[site]['domain'] + link['href']
+            #Build the full href for each link found on the page
+            hrefs = [sites[site]['domain'] + link['href'] for link in links]
+
+            # #Convert the list of hrefs/urls of the links to md5 sums
+            # for href in hrefs:
+            #     md5 = hashlib.md5()
+            #     md5.update(href.encode())
+            #     link_hash = md5.hexdigest()
+
+            #Filter out the urls that have already been seen
+            new_links = self.db_filter(hrefs)
+
+            #I probably need to turn the links list into a links dict so I can access
+            #the title easily instead of trying to reparse the data with BS4
+
+            for href in new_links:
+                short = yield from self.shorten_url(href)
                 page = yield from self._GET(href, compress='True')
-                ipdb.set_trace()  ######### Break Point ###########
+                content = self.parse_post(page, sites[site]['post_class'])
+                post = {
+                    "url": href,
+                    "site": site,  #store the site name; sqlite cannot bind a dict
+                    "title": "",
+                    "content": content,
+                    "short": short,
+                    "time_stamp": datetime.datetime.now()}
+                posts.append(post)
+        if posts:
+            self.insert_posts(posts)
+            self.email_posts(posts)
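
The TODO above (re-parsing titles with BS4) could be handled by having the loop work from an href-to-title dict instead of raw tags. A minimal sketch of that variant, assuming links holds BS4 anchor tags; none of these names are part of the commit:

    # Hypothetical replacement for the hrefs list and the loop above:
    # map each full href to its anchor text so the title comes for free.
    links_by_href = {
        sites[site]['domain'] + link['href']: link.get_text(strip=True)
        for link in links
    }
    new_links = self.db_filter(list(links_by_href))
    for href in new_links:
        title = links_by_href[href]  # no second BS4 pass needed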
 
 
+    #Send myself a message with the titles paired with the shortened links
+    def email_posts(self, posts):
+        body = "<h4>New Links</h4>" + self._build_table(posts)

+        headers = [
+            "From: <" + self.email["from"] + ">",
+            "To: <" + self.email["to"] + ">",
+            "Subject: " + str(len(posts)) + " New Links Posted",
+            "Content-Type: text/html"
+        ]
+
+        session = smtplib.SMTP(self.email['host'], self.email['port'])
+        session.ehlo()
+        session.starttls() # Omit if SMTPS
+        session.ehlo() # Omit if SMTPS
+        session.login(self.email['user'], self.email['pass'])
+        session.sendmail(self.email['from'], self.email['to'],
+                         "\r\n".join(headers) + "\r\n\r\n" + body)
+        session.quit()
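
Hand-assembling header strings is easy to get subtly wrong; a sketch of the same send using the stdlib email.mime helpers instead, assuming the same self.email config:

    from email.mime.text import MIMEText

    # Build the message object; headers become dict-style assignments
    msg = MIMEText(body, "html")
    msg["From"] = self.email["from"]
    msg["To"] = self.email["to"]
    msg["Subject"] = "%d New Links Posted" % len(posts)
    session.sendmail(self.email["from"], self.email["to"], msg.as_string())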
+
+    #Build an html table of title/link rows for the email body
+    def _build_table(self, posts):
+        tbl = "<table><thead><tr><th>Title</th><th>Link</th></tr></thead><tbody>"
+        for post in posts:
+            tbl += "<tr><td>" + post['title'] + "</td><td>" + post['short'] + "</td></tr>"
+        tbl += "</tbody></table>"
+        return tbl
+
+    #Take a list of links and check each URL against the DB;
+    #return only the links that have not been seen before
+    def db_filter(self, links):
+        #Query once for all candidate urls; for 3 links the statement reads
+        #SELECT url FROM posts WHERE url IN (?,?,?)
+        rows = self.db_cursor.execute(
+            "SELECT url FROM posts WHERE url IN (" + ",".join("?" * len(links)) + ")",
+            links).fetchall()
+        seen = {row[0] for row in rows}  #fetchall returns 1-tuples
+        return [link for link in links if link not in seen]
+
+
+
+    #We need to store each page and its data in the DB
+    def insert_posts(self, posts):
+        inserts = []
+        for post in posts:
+            inserts.append((post['url'], post['site'], post['content'], post['short'], post['time_stamp']))
+        self.db_cursor.executemany('INSERT INTO posts VALUES (?,?,?,?,?)', inserts)
+        self.db_conn.commit()
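
The executemany above assumes a five-column posts table that this diff does not create. If it does not already exist, a minimal schema sketch (column names assumed from the post dict; not part of this commit):

    # Hypothetical one-time setup, e.g. in __init__; url is the natural
    # primary key since db_filter deduplicates on it.
    self.db_cursor.execute(
        "CREATE TABLE IF NOT EXISTS posts "
        "(url TEXT PRIMARY KEY, site TEXT, content TEXT, short TEXT, time_stamp TEXT)")
    self.db_conn.commit()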
+
+
+
+    #Parse the post content div out of the page
+    def parse_post(self, page, post_class):
+        bs_page = BeautifulSoup(page, "html.parser")
+        post = bs_page.find("div", class_=post_class)
+        return str(post) if post else ""  #stringified so sqlite can store it
+
+
+    #Create a short link for the post url via is.gd
+    @asyncio.coroutine
+    def shorten_url(self, url):
+        #url-encode the target so its own query string survives intact
+        isgd_url = "http://is.gd/create.php?format=simple&url=" + urllib.parse.quote(url, safe="")
+        page = yield from self._GET(isgd_url, compress='True')
+        return page.decode("utf-8")
 
 
     #Fetch the links from the right URL