Ed Brannin committed 167833f

A bit more progress.

Comments (0)

Files changed (2)

 def handle(path):
   print "Reading %s" % path
   try:
-    soup = BeautifulSoup(file(path))
+    html = open(path).read()
+    html = html.replace("&nbsp;", " ")
+    soup = BeautifulSoup(html)
   except:
     print "%s has an error: %s" % (path, sys.exc_info()[1])
     return
   for tag in ['style', 'script']:
     [s.extract() for s in soup.findAll(tag)]
-  strip_spans(soup)
+  soup = strip_spans(soup)
+  soup = strip_styles(soup)
+  soup = strip_empty(soup)
   for div in soup.findAll('div'):
-    print "New <div>"
-    print type(div)
+    # print "New <div>"
+    # print type(div)
     if type(div) == NavigableString:
-      print "NavigableString!"
+      # print "NavigableString!"
+      pass
     else:
-      print div.prettify()
+      if div.renderContents().strip() == "":
+        div.extract()
+      # print div.prettify()
       # print dir(div)
+      pass
     #print div.parent.parent
-    print
+    # print
   print "Total <div>s: %d" % len(soup.findAll('div')) 
+  empty_link_urls = set()
   for a in soup.findAll('a'):
     if len(a.contents) == 0:
+      empty_link_urls.add(a['href'])
       print "Empty link to %s" % a['href']
       a.extract()
     else: 
-      print "OK: %s" % a.prettify()
+      # print "OK: %s" % a.prettify()
+      pass
    
   print "Title: " + soup.title.renderContents()
-  # print soup.prettify()
+  print soup.prettify()
 
   raise Exception("stop after one file for now")
 
+def strip_empty(soup):
+  for t in soup.findAll(None):
+    print t
+    print
+    if t.renderContents().strip() == '':
+      t.extract()
+  return soup
+
 def strip_styles(soup):
   for attr in ('id', 'class', 'style'):
     for t in soup.findAll(None, attrs={attr: True}):
     soup = BeautifulSoup(soup.prettify())
   # print "Soup is: %s" % soup
   return soup
+
+def collapse(soup):
+  did_stuff = False
+  for tag in soup.findAll(None):
+    # if the tag is the only child of its parent and isn't a protected tag (like <html> or <a>) then we want to replace the tag with its contents.
+    pass #FIXME
+    #if tag
   
 
 if __name__ == '__main__':
   assert observed_answer == desired_answer
 
 
+def test_collapse():
+  soup = BeautifulSoup("""
+  <div id="Oobj19" >
+  <div id="Ggeo47" class="dfltt">
+  <font face="Times New Roman" class="fsx01"><B><I><a href="about.html" title="ABOUT">ABOUT</a><br></I></B></font></div>
+  </div>
+  """)
+  observed_answer = collapse(soup).prettify().strip()
+  desired_answer = """
+  <div><a href="about.html" title="ABOUT">ABOUT</a></div>
+  """.strip()
+  assert observed_answer == desired_answer
 
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.