Source

holyname-greece.org-converter / soupify.py

Full commit
#!/usr/bin/python

from BeautifulSoup import BeautifulSoup, NavigableString
import os, sys

def handle(path):
  """Clean up one HTML file and print the result to stdout.

  The file is read and parsed with BeautifulSoup, <style> and <script>
  elements are removed, the strip_spans / strip_styles / strip_empty passes
  are applied, remaining empty <div> and <a> elements are dropped, and
  finally the title and the prettified document are printed.

  Args:
    path: filesystem path of the HTML file to process.

  Returns:
    None. A file that cannot be read or parsed is reported on stdout and
    skipped.

  Raises:
    Exception: always, once a file has been processed successfully; this is
      the deliberate "stop after one file for now" development guard.
  """
  print("Reading %s" % path)
  try:
    source = open(path)
    try:
      html = source.read()
    finally:
      # Release the file handle even when read() fails.
      source.close()
    # NOTE(review): as written this replaces a space with a space; presumably
    # the first argument was meant to be a non-breaking space - confirm.
    html = html.replace(" ", " ")
    soup = BeautifulSoup(html)
  except Exception:
    # Exception rather than a bare except, so Ctrl-C and SystemExit still
    # propagate while unreadable/unparseable files are only reported.
    print("%s has an error: %s" % (path, sys.exc_info()[1]))
    return

  for tag in ['style', 'script']:
    for element in soup.findAll(tag):
      element.extract()

  soup = strip_spans(soup)
  soup = strip_styles(soup)
  soup = strip_empty(soup)

  # Second sweep for <div>s whose contents are blank at this point.
  for div in soup.findAll('div'):
    if div.renderContents().strip() == "":
      div.extract()
  print("Total <div>s: %d" % len(soup.findAll('div')))

  # Targets of the removed empty links; collected but not used further yet.
  empty_link_urls = set()
  for a in soup.findAll('a'):
    if len(a.contents) != 0:
      continue
    # An anchor may have no href at all (e.g. a named anchor), so look the
    # attribute up without raising KeyError.
    href = a.get('href')
    if href is None:
      print("Empty anchor without href")
    else:
      empty_link_urls.add(href)
      print("Empty link to %s" % href)
    a.extract()

  # soup.title is None when the document has no <title> element.
  title = soup.title
  if title is None:
    print("Title: ")
  else:
    print("Title: " + title.renderContents())
  print(soup.prettify())

  raise Exception("stop after one file for now")

def strip_empty(soup):
  """Remove every element whose rendered contents are blank.

  Each element returned by soup.findAll(None) is printed (debug output,
  followed by an empty line) and then extracted from the tree when its
  rendered contents consist only of whitespace.

  Void elements such as <img>, <br> and <hr> never have contents, so an
  empty rendering is their normal state; they are left in place instead of
  being stripped along with genuinely empty containers.

  Args:
    soup: parsed document (or any object offering findAll) to clean up.

  Returns:
    The same soup object, modified in place.
  """
  # Tags that are complete without any contents.
  void_tags = ('area', 'base', 'br', 'col', 'embed', 'frame', 'hr', 'img',
               'input', 'link', 'meta', 'param', 'spacer')
  for t in soup.findAll(None):
    print(t)
    print("")
    if t.name.lower() in void_tags:
      continue
    if t.renderContents().strip() == '':
      t.extract()
  return soup

def strip_styles(soup):
  """Delete the id, class and style attributes from every element that has one.

  Args:
    soup: parsed document to clean; it is modified in place.

  Returns:
    The same soup object.
  """
  unwanted = ('id', 'class', 'style')
  for name in unwanted:
    # Ask only for the elements that actually carry this attribute.
    carriers = soup.findAll(None, attrs={name: True})
    for element in carriers:
      del element[name]
  return soup

def strip_spans(soup):
  """Unwrap presentational inline tags while keeping what they contain.

  For each tag name in turn, every matching element is replaced by its
  rendered contents inserted as raw text, and the whole document is then
  re-parsed so that any markup inside that text becomes real elements again.

  Args:
    soup: parsed BeautifulSoup document; it is modified in place and should
      not be used again after the call.

  Returns:
    A new BeautifulSoup object built from the rewritten markup.
  """
  # NOTE(review): the upper-case names are kept from the original list; they
  # are presumably redundant if the parser lower-cases tag names - confirm.
  for tag in ('font', 'span', 'b', 'i', 'B', 'I'):
    # Work innermost-first (assumes findAll returns matches in document
    # order). Going outermost-first would replace an outer tag with a
    # rendering that still contains its nested same-name tag, and the
    # re-parse below would bring that nested tag back.
    for s in reversed(soup.findAll(tag)):
      s.replaceWith(NavigableString(s.renderContents()))
    # Re-parse so the raw text inserted above is turned back into elements.
    # NOTE(review): prettify() may change whitespace between adjacent text
    # nodes - confirm that this is acceptable for inline content.
    soup = BeautifulSoup(soup.prettify())
  return soup

def collapse(soup):
  """Unfinished placeholder: walks every tag but changes nothing yet.

  Planned behaviour (not implemented): when a tag is the only child of its
  parent and is not a protected tag (like <html> or <a>), replace the tag
  with its contents.

  Args:
    soup: parsed document that would be collapsed.

  Returns:
    None.
  """
  changed = False  # intended progress flag; nothing reads it yet
  every_tag = soup.findAll(None)
  for _candidate in every_tag:
    continue  # FIXME: the collapsing logic still has to be written
  

if __name__ == '__main__':
  # Directory to convert: the first command-line argument when given,
  # otherwise the default location of the mirrored site.
  try:
    path = sys.argv[1]
  except IndexError:
    path = "../current-site/www.holyname-greece.org/"

  # Walk the whole tree and process every file whose name contains "htm".
  for root, dirs, files in os.walk(path):
    for f in files:
      # Compare case-insensitively so that upper-case names such as
      # INDEX.HTM are picked up as well.
      if 'htm' in f.lower():
        handle(os.path.join(root, f))