Commits

Stephen Tanner committed 6dfbe3a

Can now build posts to insert to the DB

  • Participants
  • Parent commits ca56417

Comments (0)

Files changed (1)

 import sqlite3
 import datetime
 import smtplib
-import hashlib
 
 class WebMaster():
 
     @asyncio.coroutine
     def start_fetch(self, sites):
         posts = []
+
         for site in sites:
             url = sites[site]['domain'] + sites[site]['path']
             links = yield from self.fetch_links(url, sites[site]['link_class'])
 
+            new_links = self.db_filter(links, sites[site]['domain'])
 
-            # #Convert the list of hrefs/urls of the links to md5 sums
-            # for href in hrefs:
-            #     md5 = hashlib.md5()
-            #     m.update(link)
-            #     link_hash = md5.hexdigest()
-
-            new_links = self.db_filter(hrefs)
-
-            #I probably need to turn the links list into a links dict so I can access
-            #the title easily instead of trying to reparse the data with BS4
 
             for link in new_links:
+                post = yield from self.build_post(link, sites[site])
+                ipdb.set_trace()  ######### Break Point ###########
+                posts.append(post)
 
-                ipdb.set_trace()  ######### Break Point ###########
-                #href = sites[site]['domain'] + link['href']
-                short = yield from self.shorten_url(href)
-                page = yield from self._GET(href, compress='True')
-                content = self.parse_post(page, sites[site]['post_class'])
-                post = {
-                    "url": link,
-                    "site": sites[site],
-                    "title": "",
-                    "content": content,
-                    "short": short,
-                    "time_stamp": datetime.datetime.now()}
-                posts.append(post)
-        if len(posts):
+        if posts:
             self.insert_posts(posts)
             self.email_posts(posts)
 
 
+    #Build a post dict from a link and its site's settings
+    @asyncio.coroutine
+    def build_post(self, link, site):
+        href = site['domain'] + link['href']
+        short = yield from self.shorten_url(href)
+        page = yield from self._GET(href, compress='True')
+        content = self.parse_post(page, site['post_class'])
+        title = str(link.string)
+        now = datetime.datetime.now().isoformat(' ')
+        post = {"url": href, "site": site, "title": title, "content": content, "short": short, "time_stamp": now}
+        ipdb.set_trace()  ######### Break Point ###########
+
+        return post
+
+
     #Send myself a message with the titles paired with the shortened links
     def email_posts(self, posts):
-        body = "<h4>New Links<h4>"
+        body = "<h4>Recent Links<h4><br />"
+
+        body += self._build_table(posts)
 
         headers = [
     		"From: <" + self.email["from"] + ">",
     def _build_table(self, posts):
         tbl = "<table><thead><tr><th>Title</th><th>Link</th></tr></thead><tbody>"
         for post in posts:
-            tbl += "<tr><td>" + post['title'] + "</td><td>" + + "</td></tr>"
+            tbl += "<tr><td>" + post['title'] + "</td><td>" + post['short'] + "</td></tr>"
+
+        tbl += "</tbody></table>"
+
+        return tbl
 
     #Take a set of links and check whether each URL is already in the DB
     #Return only the links whose URLs are not in the DB yet
-    def db_filter(self, links):
+    def db_filter(self, links, domain):
+        hrefs = [domain + l['href'] for l in links]
+        url_params = ','.join('?'*len(hrefs))
+        qry = 'SELECT url FROM posts WHERE url in (' + url_params + ')'
+        rows = self.db_cursor.execute(qry, hrefs).fetchall()
 
+        url_set = {url[0] for url in rows}
 
-        urls = ",".join(links)
-        #ipdb.set_trace()  ######### Break Point ###########
-
-        url_set = self.db_cursor.execute("SELECT url FROM posts WHERE url IN (" + ",".join("?"*len(links)) + ")", links).fetchall()
-        link_set = set(links)
-        return list(link_set - url_set)
+        return [l for l in links if (domain + l['href']) not in url_set]
 
 
 
     #We need to store each page and its data in the DB
     def insert_posts(self, posts):
-        inserts = []
-        for post in posts:
-            inserts.append[post['url'], post['site'], post['content'], post['short'], post['time_stamp']]
-        self.db_cursor.executemany('INSERT INTO posts VALUES (?,?,?,?,?)', inserts)
+        inserts = [[post['url'], post['site'], post['title'], post['content'], post['short'], post['time_stamp']] for post in posts]
+        self.db_cursor.executemany('INSERT INTO posts VALUES (?,?,?,?,?,?)', inserts)