Commits

Ed Brannin committed ef26fef

Trying to split this into testable chunks.

  • Participants
  • Parent commits 0d7df0a

Comments (0)

Files changed (1)

 import os, sys
 
 def handle(path):
+  print "Reading %s" % path
   try:
     soup = BeautifulSoup(file(path))
   except:
     return
   for tag in ['style', 'script']:
     [s.extract() for s in soup.findAll(tag)]
-  for tag in ('font', 'span', 'b', 'i'):
-    for s in soup.findAll(tag):
-      s.replaceWith(NavigableString(s.renderContents()))
+  strip_spans(soup)
   for div in soup.findAll('div'):
     print "New <div>"
     print type(div)
       print "NavigableString!"
     else:
       print div.prettify()
-      print dir(div)
+      # print dir(div)
     #print div.parent.parent
     print
-  
+  print "Total <div>s: %d" % len(soup.findAll('div')) 
   for a in soup.findAll('a'):
     if len(a.contents) == 0:
       print "Empty link to %s" % a['href']
   print "Title: " + soup.title.renderContents()
   # print soup.prettify()
 
-  raise "Stop!"
+  raise Exception("stop after one file for now")
+
+def strip_spans(soup):
+  for tag in ('font', 'span', 'b', 'i', 'B', 'I'):
+    for s in soup.findAll(tag):
+      s.replaceWith(NavigableString(s.renderContents()))
+  return soup
+  
+print strip_spans(BeautifulSoup("""
+<div id="Oobj19" style="position:absolute; z-index:2; visibility:visible; left:359px; top:86px; width:60px; height:20px;">
+<div id="Ggeo47" class="dfltt">
+<font face="Times New Roman" class="fsx01"><B><I><a href="about.html" title="ABOUT">ABOUT</a><br></I></B></font></div>
+</div>
+""")).prettify()
+
+raise Exception("hi")
 
 try:
   path = sys.argv[1]