1. yodeleyihu
  2. eyfo

Commits

unknown  committed 3c7d259

first commit ever

  • Participants
  • Branches master

Comments (0)

Files changed (64)

File add-to-init.py

View file
+try:
+    if 'home' in os.environ:
+        home = os.environ['home']
+    elif 'userprofile' in os.environ:
+        home = os.environ['userprofile']
+    sys.path.append(home)
+    import whooshconfig
+except:
+    fname = join(home,'whooshconfig.py')
+    open(fname,'w').write('')
+
+
+class Conf:
+    def __init__(self, fname):
+        self.conf = __import__(fname.replace('.py','')
+        self.fname = fname
+
+
+    def get_config(self, key, default=''):
+        try:
+            o = getattr(self.conf, key)
+            o = json.loads(o)
+        except:
+            o = default
+            setattr(self.conf, key, o)
+        return o
+
+
+
+    def set_config(self, key, value):
+        setattr(self.conf, key, value)
+        reg = re.compile(r'^\s*(.*?)\s*=\s*(.*)\s*$', re.I+re.M+re.S)
+        lines = open(self.fname).read()
+        regfind = re.compile(r'^\s*%s\s*=\s*(.*?)\s*$' % key, re.I+re.M+re.S)
+        regsub = re.compile(r'^\s*%s\s*=\s*(.*?)\s*$' % key, re.I+re.M+re.S)
+        if regfind.findall(line)[0]:
+            regsub = re.compile(r'^\s*%s\s*=\s*(.*?)\s*$' % key, re.I+re.M+re.S)
+            newline = '%s = %s' % (key, json.dumps(value))
+            regsub.sub(newline, lines)
+        open(self.fname, 'w').write(lines)
+
+
+
+    def order_tab_key(self):
+        order = (
+            self.search_words_combo,
+            self.search_button,
+            self.dont_search_words_combo,
+            self.exit_button,
+            self.where_to_search,
+            self.index_button,
+            self.add_path_button,
+            self.del_path_button,
+            self.select_all_path_button,
+        )
+        for i in xrange(len(order) - 1):
+            order[i+1].MoveAfterInTabOrder(order[i])

File archive_7z.py

View file
+# Archive plugin for 7z files: lists the archive with the external ``7z`` tool
+# and leaves the extracted member names in ``found_files``.
+#
+# Placeholders used in the command templates below:
+# $f is the filename
+# $d is the output dir (defaults to the current dir). However, the program should chdir to the desired dir anyway, because some tools don't support a destination dir.
+# $s is the path/file separator (/ or \)
+#
+# Earlier attempts at the listing filter, kept for reference:
+#    win_cmd_list_filter = re.compile('''^ +[0-9]+ +[0-9]+ +[0-9.]+ +[0-9-]+ +[0-9:]+ +[0-9a-z]+ +[a-z-]+ +[BTPMGVX0-9\+]+ +$''', re.I)
+#'^[^<>/\:*?"|]+ +[0-9]+ +[0-9]+ +[0-9.]+ +[0-9-]+ +[0-9:]+ +[0-9a-z]+ +[a-z-]+ +[BTPMGVX0-9\+]+ +$'
+
+wincmd = '7z -y x $f $d$s'
+wincmd_list = '7z l $f'
+# One match per listing line: a date-like field, a time-like field, five
+# arbitrary characters, two digit/space columns, then the captured remainder
+# of the line (presumably the member name - verify against real 7z output).
+win_cmd_list_filter = re.compile('^[0-9][0-9-]+ [0-9:]+ ..... [0-9 ]+ [0-9 ]+ (.*?)$', re.M)
+lincmd = 'un7z $f'
+outtype = 'dir'
+
+
+# `re`, `os`, `prepare_command`, `filename` and `temp_dir` are not defined in
+# this file; they have to be supplied by whatever scope runs it.
+#if os.platfrm...
+# Only the listing command is run; wincmd / lincmd are not used below.
+cmd = wincmd_list
+cmd = prepare_command(cmd, filename, temp_dir)
+#print cmd
+txt = os.popen(cmd).read()
+#print txt
+found_files = win_cmd_list_filter.findall(txt)
+#ret = os.system(cmd)
+#print '7z exit status:', found_files
+#sys.exit()

File archive_arj.py

View file
+# $f is the filename
+# $d is the output dir (defaults to curr-dir). however, program should chdir to desired dir anyway because some doesnt support destination dir.
+# $s is the path/file separator (/ or \)
+#    win_cmd_list_filter = re.compile('''^ +[0-9]+ +[0-9]+ +[0-9.]+ +[0-9-]+ +[0-9:]+ +[0-9a-z]+ +[a-z-]+ +[BTPMGVX0-9\+]+ +$''', re.I)
+#'^[^<>/\:*?"|]+ +[0-9]+ +[0-9]+ +[0-9.]+ +[0-9-]+ +[0-9:]+ +[0-9a-z]+ +[a-z-]+ +[BTPMGVX0-9\+]+ +$'
+ 
+wincmd = 'arj -y x $f $d$s'
+wincmd_list = 'arj t $f'
+win_cmd_list_filter = re.compile('^Testing(.*?)' + chr(8) + '+', re.M)
+lincmd = 'unarj $f'
+outtype = 'dir'
+
+
+#if os.platfrm...
+cmd = wincmd_list
+cmd = prepare_command(cmd, filename, temp_dir)
+#print 'cmd'
+#print cmd
+#sys.exit()
+txt = os.popen(cmd).read()
+#print 'txt[:300]'
+#print txt[:300]
+found_files = win_cmd_list_filter.findall(txt)
+found_files = map(strip, found_files) 
+#ret = os.system(cmd)
+#print 'arj exit status:', found_files
+

File archive_ipk.py

View file
+#import file_types
+#def archive(filename, temp_dir)
+file_type = 'ipk'
+
+if sys.platform in ['win32']:
+    os.windows_cmd = ''
+    temp_dir = os.environ['TEMP']
+if sys.platform in ['linux']:
+    linux_cmd = ''
+    temp_dir = '/tmp'
+
+import tarfile
+
+# 'filename' is from the parent scope
+try:
+    t = tarfile.TarFile.open(filename)
+    found_files = t.getnames()
+    #if 'control.tar.gz' in found_files:
+    t.extract('control.tar.gz', temp_dir)   # not multiuser-friendly!
+    t2 = tarfile.TarFile.open(temp_dir + os.sep + 'control.tar.gz')
+    file_text = t2.extractfile('control').read()
+    #if 'data.tar.gz' in found_files:
+    t.extract('data.tar.gz', temp_dir)   # not multiuser-friendly!
+    t2 = tarfile.TarFile.open(temp_dir + os.sep + 'data.tar.gz')
+    found_files = t2.getnames()
+
+except:
+    print "(not an IPK (tar.gz) file? - %s)" % filename
+
+

File archive_tar.py

View file
+# Archive plugin for tar files: leaves the archive's member names in
+# ``found_files``.
+#import file_types
+#def archive(filename, temp_dir)
+file_type = 'tar'
+
+# No external command is configured for either platform; the listing is done
+# with the standard tarfile module below.
+windows_cmd = ''
+linux_cmd = ''
+
+import tarfile
+
+# 'filename' is from the parent scope
+# NOTE(review): unlike the zip and ipk plugins there is no try/except here, so
+# an unreadable archive raises into the calling scope.
+t = tarfile.TarFile.open(filename)
+found_files = t.getnames()
+# debug trace
+print "in archive tar"
+

File archive_zip.py

View file
+# Archive plugin for zip files: leaves the archive's member names in
+# ``found_files``.
+#import file_types
+#def archive(filename, temp_dir)
+file_type = 'zip'
+
+# Neither command is used in this file; the listing is done with the standard
+# zipfile module below.
+# NOTE(review): 'unzip -d' normally expects a destination directory, not the
+# archive name - confirm this template before using it.
+windows_cmd = ''
+linux_cmd = 'unzip -d $f'
+
+import zipfile
+# 'filename' is from the parent scope
+
+# On any failure a message is printed and found_files is left unassigned by
+# this script.
+try:
+    z = zipfile.ZipFile(filename)
+    found_files = z.namelist()
+except:
+    print "(not a zip file? - %s)" % filename
+

File blank.html

View file
+  

File enc_hebrew - עברית.txt

File eyfo.py

View file
+'''
+
+EYFO - a local file indexer
+
+contact:    https://bitbucket.org/yodeleyihu/eyfo
+license:    LGPL
+created:    2013-02-03
+version:    0.4 (alpha)
+
+This will need to be heavily reimplemented.
+
+It was originally created with Boa Constructor, but is no longer compatible with it because of the "file_list" object. It would be nice to use Boa again; I just need to understand how to add file_list as a user control...
+
+Eyfo's constitution:
+    - responsiveness - speed IS an issue
+    - accessibility - the user interface must support keyboard navigation and shortcuts.
+    - multi-platform
+    - open source
+    - multi-lingual:  detect and convert file's encoding, and convert its contents to unicode (utf8). I don't care much about the UI language, btw.
+    - easy to read, write, maintain, and add plugins
+    - plugins:  support extracting semantically meaningful parts of documents from as many file formats as needed.
+    -
+
+'''
+
+import library
+from os.path import expanduser, isdir, isfile, join, basename, abspath, splitext
+from persistant_dict import persistant_dict
+from wx import MessageBox as msgbox
+from wx.lib.anchors import LayoutAnchors
+import filelist
+import os
+import whoosh_indexer
+import wx
+import wx.animate
+import wx.html
+import wx.html2
+import eyfoview
+
+
+# Base name of the per-user configuration file.
+CONF_FILE_BASE_NAME = 'eyfo-config.json'
+
+# Values merged into the loaded configuration for keys it does not have yet
+# (see the conf.update_missing() call at the bottom of this module).
+default_config_values = {
+    # program launched with a file's folder when a result is right-clicked
+    "filesystem_manager": "c:\\program files\\total commander\\totalcmd.exe",
+    # animation loaded into the progress indicator when the file exists
+    'progress_indicator_gif_animation_file': 'progress.gif',
+	# program launched with the focused file when F4 is released
+	"editor": "c:\\windows\\system32\\notepad.exe",
+	# folder handed to the indexer as its starting point
+	"index_start_folder": "d:\\",
+	# directory handed to whoosh_indexer as its index location
+	"whoosh_index_db_dir": "d:\\temp\\whoosh",
+    # copied to eyfoview.MAX_RESULTS
+    "max_results" : 10000,
+    # copied to eyfoview.exclude_patterns; wildcard-style path patterns
+    # (presumably matched by the indexer - verify against whoosh_indexer)
+    'exclude_patterns' : [
+        r'/dev/*',
+        r'/proc/*',
+        r'/tmp/*',
+        r'?:\RECYCLER\*',
+        r'*\nltk_data\*',
+        r'*\license.txt',
+        r'*\readme.txt',
+        r'*.tmp',
+        r'*.res',
+        r'*.swp',
+        r'*.pyo',
+        r'*.pyc',
+        r'*.frx',
+        r'?:\windows\AppPatch\*',
+        r'?:\windows\Config\*',
+        r'?:\windows\Connection Wizard\*',
+        r'?:\windows\CSC\*',
+        r'?:\windows\Debug\*',
+        r'?:\windows\IIS Temporary Compressed Files\*',
+        r'?:\windows\inf\*',
+        r'?:\windows\Installer\*',
+        r'?:\windows\java\*',
+        r'?:\windows\l2schemas\*',
+        r'?:\windows\Media\*',
+        r'?:\windows\Microsoft.NET\*',
+        r'?:\windows\Minidump\*',
+        r'?:\windows\msagent\*',
+        r'?:\windows\msapps\*',
+        r'?:\windows\msdownld.tmp\*',
+        r'?:\windows\mui\*',
+        r'?:\windows\Offline Web Pages\*',
+        r'?:\windows\pchealth\*',
+        r'?:\windows\Prefetch\*',
+        r'?:\windows\Registration\*',
+        r'?:\windows\Resources\*',
+        r'?:\windows\system32\wbem\*',
+        r'?:\windows\Temp\*',
+        r'?:\windows\WBEM\*',
+        r'?:\windows\Web\*',
+        r'?:\windows\WinSxS\*',
+    ]
+}
+
+
+class search_form_controller(eyfoview.search_form):
+
+    def init_eyfo(self):
+
+        if conf['whoosh_index_db_dir']:
+            d = conf['whoosh_index_db_dir']
+            if not isdir(d): d = "d:\\temp\\whoosh"   # xxx todo: find a better directory...
+            whoosh_indexer.whoosh_index_db_dir = d
+
+        if conf['index_start_folder']:
+            d = conf['index_start_folder']
+            if not isdir(d): d = "c:\\"   # xxx todo: find a better directory...
+            whoosh_indexer.index_start_folder = d
+
+        if conf['windowleft'] and conf['windowtop']:
+            self.SetPosition( (conf['windowleft'], conf['windowtop']) )
+
+        if conf['windowwidth'] and conf['windowheight']:
+            self.SetSize( (conf['windowwidth'], conf['windowheight']) )
+
+        if conf['maximized']:
+            self.Maximize()
+
+        if conf['max_results']:
+            eyfoview.MAX_RESULTS = conf['max_results']
+
+        if conf['exclude_patterns']:
+            eyfoview.exclude_patterns = conf['exclude_patterns']
+
+        for combo in [ self.search_words_combo, self.dont_search_words_combo ]:
+            items = conf[ combo.Name ]
+            if items and type(items) is list:
+                combo.Items = conf[ combo.Name ]
+                combo.SetSelection(0)  # not a good idea for reject list...?
+
+        if conf['progress_indicator_gif_animation_file']:
+            f = conf['progress_indicator_gif_animation_file']
+            if isfile(f):
+                self.progress_indicator.LoadFile( f )
+
+
+    def get_list_file(self, wxlist):
+        i = self.file_list.GetFocusedItem()
+        f = self.file_list.GetItem(i,0).Text
+        p = self.file_list.GetItem(i,1).Text
+        f=join(p,f)
+        if not isfile(f):
+            msgbox("file <%s> is missing" % f)
+            return ""
+        else:
+            return f
+
+    def __init__(self):
+
+        parent = None
+        self.conf = conf
+        self._init_ctrls(parent)
+        self.init_eyfo()
+
+    def OnIndex_buttonButton(self, event):
+        msg = "indexer is not fully implemented in the GUI.\n You'll get more from the command line\n python whoosh_indexer.py for usage info."
+        print msg
+        msgbox(msg)
+
+        d = wx.MessageDialog(self, u"IT IS GOING TO FREEZE THE GUI FOR AN HOUR OR SO.\n\n OK ? ", "alpha state")
+        ans = d.ShowModal()
+        if not ans in [wx.OK, wx.YES, 5100]:
+            return
+
+        self.progress_indicator.Play()
+        root = conf['index_start_folder']
+        i = whoosh_indexer.Eyfo_index()
+        i.only_fnames = False  #conf['index_names_only']
+        i.incremental_index( dirname=root )
+        self.progress_indicator.Stop()
+
+
+    def OnExit_buttonButton(self, event):
+        self.Close()
+
+    def OnSearch_buttonButton(self, event):
+        self.progress_indicator.Play()
+        wx.GetApp().Yield(True)   # lets start the progress indicator
+
+        self.file_list.DeleteAllItems() # .ClearAll()
+        print conf
+        s = whoosh_indexer.Eyfo_Search()
+        # todo
+        s.only_fnames = (self.where_to_search=='Search &File names only')
+        query = self.search_words_combo.Value
+        txt = s.search( query, gui=True )   #???
+        files = txt.splitlines()
+        if not files:
+            self.file_list.add_file(' { nothing found } ')
+            return 0
+
+        self.statusbar.SetStatusText(number=0, text=u'%d indexed files' % len(files))
+        for f in files:
+            if isfile(f):
+                wx.GetApp().Yield(True)   # i'd rather do it as a background thread, e.g. sendmessage but i don't know how. this list can have thousands of items in it. currently it's limited to max-something. there's an interesting answer by robin in the wxpython group.
+                self.file_list.add_file(f)
+
+        self.file_list.auto_width()
+
+        self.update_history( self.search_words_combo, query )
+        #self.update_history( self.dont_search_words_combo, query )
+
+        #self.file_list.SetFocus()
+        self.progress_indicator.Stop()
+
+    def update_history( self, combo, text ):
+        text = text.strip()
+        if not text: return
+
+        items = combo.Items
+        if text in items:
+            items.remove(text)
+        items.insert(0, text)
+        combo.Items = items
+        combo.SetSelection(0)
+        conf[combo.Name] = items
+
+
+    def OnFile_listKeyUp(self, event):
+        f = self.get_list_file(self.file_list)
+
+        key = event.GetUnicodeKey()
+        if key==13:
+            self.OnFile_listLeftDclick(None)
+
+        key = event.GetKeyCode()
+        # handle F4
+        if key == wx.WXK_F4:
+            editor = ''
+            if 'editor' in conf:
+                editor = conf['editor']
+            if not isfile(editor) and 'editor' in os.environ:
+                editor = os.environ['editor']
+            if not isfile(editor):
+                msgbox("Editor is not defined (as env-var or in eyfo-config.json)")
+                return
+            library.run('%s "%s"' % (editor, f), wait=False)
+
+
+    def OnFile_listLeftDclick(self, event):
+        f = self.get_list_file(self.file_list)
+        if f:
+            library.open_any_file(f)
+
+    def OnFile_listListItemRightClick(self, event):
+        f = self.get_list_file(self.file_list)
+        folder = os.path.split(f)[0]
+        conf['filesystem_manager']
+        library.run('"%s" "%s"' % (conf['filesystem_manager'], folder), wait=False )
+
+
+    def OnFile_listListItemSelected(self, event):
+        f = self.get_list_file(self.file_list)
+        if f:
+            #self.html.LoadURL(f)
+            ext = os.path.splitext(f)[1][1:]
+            if ext in ['jpg', 'jpeg','gif','png','bmp','ico','wmf','svg','tif','tiff','pnm','apng',
+                        'avi', 'mpg','mpeg','mp4','mp3','wav','ogg','svg','tif','tiff','pnm','apng',]:
+                html = '<embed src="%s" width="100%%" height="100%%" />' % f.replace('\\','/')
+                #self.html.PageSource(html)
+                #self.html.SetPage(html)
+                self.html.LoadFile(f)
+            elif ext in ['txt', 'ini','py','inf','aspx','html','htm','php','asp','aspx',]:
+                self.html.LoadFile(f)
+            elif ext in ['pdf', 'swf','ttf',]:
+                pass
+                self.SetLabel("zorba")
+            else:
+                print 'else', ext
+                self.html.LoadPage("blank.html")
+
+    def OnAdd_path_buttonButton(self, event):
+        event.Skip()
+
+    def OnDel_path_buttonButton(self, event):
+        event.Skip()
+
+    def OnSelect_all_path_buttonButton(self, event):
+        event.Skip()
+
+    def get_list_file(self, wxlist):
+        i = self.file_list.GetFocusedItem()
+        f = self.file_list.GetItem(i,0).Text
+        p = self.file_list.GetItem(i,1).Text
+        f=join(p,f)
+        if not isfile(f):
+            msgbox("file <%s> is missing" % f)
+            return ""
+        else:
+            return f
+
+    def keybaord_handler(self, event):
+        key = event.GetKeyCode()
+        if key == wx.WXK_ESCAPE:
+            self.Close()
+            return
+        event.Skip()
+
+    def order_tab_key(self):
+        #self.order_tab_key
+        order = (
+            self.search_words_combo,
+            self.search_button,
+            self.dont_search_words_combo,
+            self.exit_button,
+            self.where_to_search,
+            self.index_button,
+            self.add_path_button,
+            self.del_path_button,
+            self.select_all_path_button,
+        )
+        for i in xrange(len(order) - 1):
+            order[i+1].MoveAfterInTabOrder(order[i])
+
+    def OnDialog1Maximize(self, event):
+        conf['maximized'] = True
+        event.Skip()
+
+    def OnDialog1Move(self, event):
+        x,y = self.GetPosition()
+        conf['windowleft'] = x
+        conf['windowtop'] = y
+        event.Skip()
+
+    def OnDialog1Size(self, event):
+        conf['maximized'] = False
+        w,h = self.GetSize()
+        conf['windowwidth'] = w
+        conf['windowheight'] = h
+        event.Skip()
+
+
+
+
+# Module-level configuration shared by search_form_controller through the
+# global name ``conf``.  These statements run whenever the module is loaded,
+# not only when it is executed as a script.
+configfile = library.find_user_dir( particular_file=CONF_FILE_BASE_NAME, return_file_or_home='file', accept_current_dir=True, create_if_not_exist=True )
+print 'configfile',configfile
+conf = persistant_dict( configfile )
+# add the defaults for every key the stored configuration does not have yet
+conf.update_missing( default_config_values )
+
+if __name__ == '__main__':
+
+    # The wx.App object has to exist before any window is created.
+    app = wx.App()
+    dlg = search_form_controller()
+    try:
+        # blocks until the dialog is closed
+        dlg.ShowModal()
+    finally:
+        dlg.Destroy()
+    app.MainLoop()

File eyfoview.py

View file
+#Boa:Dialog:search_form
+'''
+
+EYFO - a local file indexer
+
+started:    02/03/2013
+contact:    https://bitbucket.org/yodeleyihu/eyfo
+license:    LGPL
+version:    0.4 (alpha)
+
+This will need to be heavily reimplemented.
+
+It was originally created with Boa Constructor, but is no longer compatible with it because of the "file_list" object. It would be nice to use Boa again; I just need to understand how to add file_list as a user control...
+
+Eyfo's constitution:
+    - responsiveness - speed IS an issue
+    - accessibility - the user interface must support keyboard navigation and shortcuts.
+    - multi-platform
+    - open source
+    - multi-lingual:  detect and convert file's encoding, and convert its contents to unicode (utf8). I don't care much about the UI language, btw.
+    - easy to read, write, maintain, and add plugins
+    - plugins:  support extracting semantically meaningful parts of documents from as many file formats as needed.
+    -
+
+'''
+
+from library import open_any_file, run
+from os.path import expanduser, isdir, isfile, join, basename, abspath, splitext
+from persistant_dict import persistant_dict
+from wx import MessageBox as msgbox
+from wx.lib.anchors import LayoutAnchors
+import filelist
+import os
+import whoosh_indexer
+import wx
+import wx.animate
+import wx.html
+import wx.html2
+
+
+def create(parent):
+    return search_form(parent)
+
+# Window ids for the dialog and its 17 controls: 18 names are unpacked from
+# 18 freshly allocated wx ids, in the order listed.
+# NOTE: on Python 2 the comprehension variable `_init_ctrls` is left behind as
+# a module-level name; it is unrelated to search_form._init_ctrls.
+[wxID_SEARCH_FORM, wxID_SEARCH_FORMADD_PATH_BUTTON,
+ wxID_SEARCH_FORMDEL_PATH_BUTTON, wxID_SEARCH_FORMDONT_SEARCH_WORDS_COMBO,
+ wxID_SEARCH_FORMEXIT_BUTTON, wxID_SEARCH_FORMFILE_LIST,
+ wxID_SEARCH_FORMGIFANIMATIONCTRL1, wxID_SEARCH_FORMHTML,
+ wxID_SEARCH_FORMINDEX_BUTTON, wxID_SEARCH_FORMPATHS_LIST,
+ wxID_SEARCH_FORMSEARCH_BUTTON, wxID_SEARCH_FORMSEARCH_WORDS_COMBO,
+ wxID_SEARCH_FORMSELECT_ALL_PATH_BUTTON, wxID_SEARCH_FORMSPLITTERWINDOW1,
+ wxID_SEARCH_FORMSTATICTEXT1, wxID_SEARCH_FORMSTATICTEXT2,
+ wxID_SEARCH_FORMSTATUSBAR, wxID_SEARCH_FORMWHERE_TO_SEARCH,
+] = [wx.NewId() for _init_ctrls in range(18)]
+
+
+class search_form(wx.Dialog):
+    def _init_coll_flexGridSizer1_Items(self, parent):
+        # generated method, don't edit
+
+        parent.AddWindow(self.staticText1, 2, border=0,
+              flag=wx.ALIGN_RIGHT | wx.ALIGN_CENTER_VERTICAL)
+        parent.AddWindow(self.search_words_combo, 6, border=0, flag=wx.EXPAND)
+        parent.AddSpacer(wx.Size(8, 8), border=0, flag=0)
+        parent.AddWindow(self.index_button, 1, border=0,
+              flag=wx.EXPAND | wx.ALIGN_CENTER_VERTICAL | wx.ALIGN_CENTER)
+        parent.AddWindow(self.staticText2, 2, border=0,
+              flag=wx.ALIGN_CENTER_VERTICAL)
+        parent.AddWindow(self.dont_search_words_combo, 6, border=0,
+              flag=wx.GROW | wx.EXPAND)
+        parent.AddSpacer(wx.Size(8, 8), border=0, flag=0)
+        parent.AddWindow(self.exit_button, 1, border=0,
+              flag=wx.ALIGN_CENTER | wx.ALIGN_CENTER_VERTICAL | wx.EXPAND)
+
+    def _init_coll_flexGridSizer1_Growables(self, parent):
+        # generated method, don't edit
+
+        parent.AddGrowableCol(0)
+        parent.AddGrowableCol(1)
+        parent.AddGrowableCol(3)
+
+    def _init_coll_paths_controls_sizer_Items(self, parent):
+        # generated method, don't edit
+
+        parent.AddWindow(self.add_path_button, 0, border=0,
+              flag=wx.ALIGN_CENTER_VERTICAL | wx.ALIGN_CENTER)
+        parent.AddWindow(self.del_path_button, 0, border=0,
+              flag=wx.ALIGN_CENTER | wx.ALIGN_CENTER_VERTICAL)
+        parent.AddWindow(self.select_all_path_button, 0, border=0,
+              flag=wx.ALIGN_CENTER | wx.ALIGN_CENTER_VERTICAL)
+
+    def _init_coll_paths_selection_sizer_Items(self, parent):
+        # generated method, don't edit
+
+        parent.AddWindow(self.paths_list, 8, border=0, flag=0)
+        parent.AddSpacer(wx.Size(8, 8), border=0, flag=0)
+        parent.AddSizer(self.paths_controls_sizer, 2, border=0, flag=0)
+
+    def _init_coll_boxSizer1_Items(self, parent):
+        # generated method, don't edit
+
+        parent.AddSpacer(wx.Size(8, 8), border=0, flag=0)
+        parent.AddSizer(self.flexGridSizer1, 0, border=0, flag=0)
+        parent.AddSpacer(wx.Size(8, 8), border=0, flag=0)
+        parent.AddSizer(self.boxSizer2, 0, border=0, flag=0)
+        parent.AddSpacer(wx.Size(8, 8), border=0, flag=0)
+        parent.AddSizer(self.paths_selection_sizer, 0, border=0, flag=0)
+        parent.AddSpacer(wx.Size(8, 8), border=0, flag=0)
+        parent.AddWindow(self.splitterWindow1, 6, border=3,
+              flag=wx.EXPAND | wx.GROW)
+        parent.AddWindow(self.statusbar, 0, border=0, flag=wx.GROW | wx.EXPAND)
+
+    def _init_coll_boxSizer2_Items(self, parent):
+        # generated method, don't edit
+
+        parent.AddWindow(self.where_to_search, 2, border=0,
+              flag=wx.GROW | wx.EXPAND)
+        parent.AddSpacer(wx.Size(8, 8), border=0, flag=0)
+        parent.AddWindow(self.progress_indicator, 1, border=0, flag=0)
+        parent.AddSpacer(wx.Size(8, 8), border=0, flag=0)
+        parent.AddWindow(self.search_button, 1, border=0,
+              flag=wx.GROW | wx.EXPAND)
+
+    def _init_coll_statusbar_Fields(self, parent):
+        # generated method, don't edit
+        parent.SetFieldsCount(3)
+
+        parent.SetStatusText(number=0, text=u'0 indexed files')
+        parent.SetStatusText(number=1, text=u'0 files found')
+        parent.SetStatusText(number=2, text=u'la la la')
+
+        parent.SetStatusWidths([-1, -1, -1])
+
+    def _init_sizers(self):
+        # generated method, don't edit
+        self.boxSizer1 = wx.BoxSizer(orient=wx.VERTICAL)
+
+        self.flexGridSizer1 = wx.FlexGridSizer(cols=4, hgap=0, rows=2, vgap=0)
+        self.flexGridSizer1.SetFlexibleDirection(wx.HORIZONTAL)
+        self.flexGridSizer1.SetNonFlexibleGrowMode(wx.FLEX_GROWMODE_ALL)
+
+        self.boxSizer2 = wx.BoxSizer(orient=wx.HORIZONTAL)
+
+        self.paths_selection_sizer = wx.BoxSizer(orient=wx.HORIZONTAL)
+
+        self.paths_controls_sizer = wx.BoxSizer(orient=wx.VERTICAL)
+
+        self._init_coll_boxSizer1_Items(self.boxSizer1)
+        self._init_coll_flexGridSizer1_Items(self.flexGridSizer1)
+        self._init_coll_flexGridSizer1_Growables(self.flexGridSizer1)
+        self._init_coll_boxSizer2_Items(self.boxSizer2)
+        self._init_coll_paths_selection_sizer_Items(self.paths_selection_sizer)
+        self._init_coll_paths_controls_sizer_Items(self.paths_controls_sizer)
+
+        self.SetSizer(self.boxSizer1)
+
+    def _init_ctrls(self, prnt):
+        # generated method, don't edit
+        wx.Dialog.__init__(self, id=wxID_SEARCH_FORM, name=u'search_form',
+              parent=prnt, pos=wx.Point(258, 274), size=wx.Size(632, 473),
+              style=wx.THICK_FRAME | wx.TAB_TRAVERSAL | wx.SYSTEM_MENU | wx.RAISED_BORDER | wx.RESIZE_BORDER | wx.MINIMIZE_BOX | wx.MAXIMIZE_BOX | wx.CLOSE_BOX | wx.CAPTION | wx.ALWAYS_SHOW_SB | wx.DEFAULT_DIALOG_STYLE,
+              title=u'WhooSearch')
+        self.SetClientSize(wx.Size(624, 446))
+        self.Center(wx.VERTICAL)
+        self.SetBackgroundStyle(wx.BG_STYLE_SYSTEM)
+
+        self.search_words_combo = wx.ComboBox(choices=[], id=wxID_SEARCH_FORMSEARCH_WORDS_COMBO, name=u'search_words_combo', parent=self, pos=wx.Point(128, 8), size=wx.Size(362, 33), value=u'', style=wx.TAB_TRAVERSAL)
+        self.search_words_combo.SetFont(wx.Font(14, wx.SWISS, wx.NORMAL, wx.NORMAL, False, u'Arial'))
+        self.search_words_combo.SetLabel(u'')
+
+        self.search_button = wx.Button(id=wxID_SEARCH_FORMSEARCH_BUTTON, label=u'&Search', name=u'search_button', parent=self, pos=wx.Point(507, 82), size=wx.Size(117, 70), style=wx.TAB_TRAVERSAL )
+        self.search_button.Center(wx.VERTICAL)
+        self.search_button.SetDefault()
+        self.search_button.SetFont(wx.Font(20, wx.SWISS, wx.NORMAL, wx.BOLD, False, u'Arial Black'))
+        self.search_button.Bind(wx.EVT_BUTTON, self.OnSearch_buttonButton, id=wxID_SEARCH_FORMSEARCH_BUTTON)
+
+        self.index_button = wx.Button(id=wxID_SEARCH_FORMINDEX_BUTTON, label=u'&Index', name=u'index_button', parent=self, pos=wx.Point(498, 8), size=wx.Size(111, 33), style = wx.TAB_TRAVERSAL)
+        self.index_button.Bind(wx.EVT_BUTTON, self.OnIndex_buttonButton, id=wxID_SEARCH_FORMINDEX_BUTTON)
+
+        self.exit_button = wx.Button(id=wxID_SEARCH_FORMEXIT_BUTTON,
+              label=u'E&xit', name=u'exit_button', parent=self,
+              pos=wx.Point(498, 41), size=wx.Size(111, 33), style=wx.TAB_TRAVERSAL )
+        self.exit_button.Bind(wx.EVT_BUTTON, self.OnExit_buttonButton,
+              id=wxID_SEARCH_FORMEXIT_BUTTON)
+
+        self.staticText2 = wx.StaticText(id=wxID_SEARCH_FORMSTATICTEXT2,
+              label=u'does &NOT contain words ', name='staticText2',
+              parent=self, pos=wx.Point(0, 50), size=wx.Size(128, 15),
+              style=wx.ALIGN_RIGHT)
+
+        #
+        self.dont_search_words_combo = wx.ComboBox(choices=[], id=wxID_SEARCH_FORMDONT_SEARCH_WORDS_COMBO, name=u'dont_search_words_combo', parent=self, pos=wx.Point(128, 41), size=wx.Size(362, 33), style=wx.TAB_TRAVERSAL , value=u'')
+        self.dont_search_words_combo.SetLabel(u'')
+
+        self.where_to_search = wx.RadioBox(choices=[u'Search &File names only',
+              u'Search also contents'], id=wxID_SEARCH_FORMWHERE_TO_SEARCH,
+              label=u'How to search', majorDimension=1, name=u'where_to_search',
+              parent=self, pos=wx.Point(0, 82), size=wx.Size(234, 70),
+              style=wx.TAB_TRAVERSAL | wx.RA_SPECIFY_COLS)
+
+        self.progress_indicator = wx.animate.GIFAnimationCtrl(filename=u'progress2.gif',
+              id=wxID_SEARCH_FORMGIFANIMATIONCTRL1, name='progress_indicator',
+              parent=self, pos=wx.Point(242, 82), size=wx.Size(257, 70),
+              style=wx.animate.AN_FIT_ANIMATION|wx.NO_BORDER)
+
+        self.splitterWindow1 = wx.SplitterWindow(id=wxID_SEARCH_FORMSPLITTERWINDOW1, name='splitterWindow1', parent=self, pos=wx.Point(0, 237), size=wx.Size(624, 188), style=wx.SP_3D)
+
+        self.html = wx.html.HtmlWindow(id=wxID_SEARCH_FORMHTML, name=u'html', parent=self.splitterWindow1, pos=wx.Point(209, 2), size=wx.Size(413, 184), style=wx.TAB_TRAVERSAL | wx.html.HW_SCROLLBAR_AUTO)
+        # self.html = wx.html2.WebView.New(id=wxID_SEARCH_FORMHTML, name=u'html', parent=self.splitterWindow1, pos=wx.Point(209, 2), size=wx.Size(413, 184), style=wx.html.HW_SCROLLBAR_AUTO | wx.TAB_TRAVERSAL)
+
+        # self.file_list = wx.ListView(id=wxID_SEARCH_FORMFILE_LIST, name=u'file_list', parent=self.splitterWindow1, pos=wx.Point(2, 2), size=wx.Size(200, 184), style=wx.LC_ICON)
+        # self.file_list = filelist.FileList(id=wxID_SEARCH_FORMFILE_LIST, parent=self)
+        self.file_list = filelist.FileList(id=wxID_SEARCH_FORMFILE_LIST, parent=self.splitterWindow1)
+
+        self.file_list.Bind(wx.EVT_KEY_UP, self.OnFile_listKeyUp)
+        self.file_list.Bind(wx.EVT_LEFT_DCLICK, self.OnFile_listLeftDclick)
+        self.file_list.Bind(wx.EVT_LIST_ITEM_RIGHT_CLICK, self.OnFile_listListItemRightClick, id=wxID_SEARCH_FORMFILE_LIST)
+        self.file_list.Bind(wx.EVT_LIST_ITEM_SELECTED,
+        self.OnFile_listListItemSelected, id=wxID_SEARCH_FORMFILE_LIST)
+        self.splitterWindow1.SplitVertically(self.file_list, self.html, 202)
+
+        self.statusbar = wx.StatusBar(id=wxID_SEARCH_FORMSTATUSBAR,
+              name=u'statusbar', parent=self, style=wx.CAPTION)
+        self.statusbar.SetAutoLayout(True)
+        self._init_coll_statusbar_Fields(self.statusbar)
+
+        self.staticText1 = wx.StaticText(id=wxID_SEARCH_FORMSTATICTEXT1,
+              label=u'Contain &Words ', name='staticText1', parent=self,
+              pos=wx.Point(54, 17), size=wx.Size(74, 15), style=wx.ALIGN_RIGHT)
+
+        self.paths_list = wx.CheckListBox(choices=[],
+              id=wxID_SEARCH_FORMPATHS_LIST, name=u'paths_list', parent=self,
+              pos=wx.Point(0, 160), size=wx.Size(488, 67), style=wx.TAB_TRAVERSAL)
+
+        self.add_path_button = wx.Button(id=wxID_SEARCH_FORMADD_PATH_BUTTON,
+              label=u'&Add', name=u'add_path_button', parent=self,
+              pos=wx.Point(519, 160), size=wx.Size(75, 23), style=wx.TAB_TRAVERSAL)
+        self.add_path_button.Bind(wx.EVT_BUTTON, self.OnAdd_path_buttonButton,
+              id=wxID_SEARCH_FORMADD_PATH_BUTTON)
+
+        self.del_path_button = wx.Button(id=wxID_SEARCH_FORMDEL_PATH_BUTTON,
+              label=u'&Del', name=u'del_path_button', parent=self,
+              pos=wx.Point(519, 183), size=wx.Size(75, 23), style=wx.TAB_TRAVERSAL)
+        self.del_path_button.Bind(wx.EVT_BUTTON, self.OnDel_path_buttonButton,
+              id=wxID_SEARCH_FORMDEL_PATH_BUTTON)
+
+        self.select_all_path_button = wx.Button(id=wxID_SEARCH_FORMSELECT_ALL_PATH_BUTTON,
+              label=u'Select Al&l', name=u'select_all_path_button', parent=self,
+              pos=wx.Point(519, 206), size=wx.Size(75, 23), style=wx.TAB_TRAVERSAL)
+        self.select_all_path_button.Bind(wx.EVT_BUTTON, self.OnSelect_all_path_buttonButton, id=wxID_SEARCH_FORMSELECT_ALL_PATH_BUTTON)
+
+        self.Bind(wx.EVT_CHAR_HOOK, self.keybaord_handler)
+        self.Bind(wx.EVT_MAXIMIZE, self.OnDialog1Maximize)
+        self.Bind(wx.EVT_MOVE, self.OnDialog1Move)
+        self.Bind(wx.EVT_SIZE, self.OnDialog1Size)
+
+        self._init_sizers()
+
+        self.order_tab_key
+
+
+    def __init__(self, parent):
+        self._init_ctrls(parent)
+        self.init_eyfo()
+
+    # to be overriden
+
+    def OnDialog1Maximize(self, event):
+        event.Skip()
+
+    def OnDialog1Move(self, event):
+        event.Skip()
+
+    def OnDialog1Size(self, event):
+        event.Skip()
+
+    def OnAdd_path_buttonButton(self, event):
+        event.Skip()
+
+    def OnDel_path_buttonButton(self, event):
+        event.Skip()
+
+    def OnExit_buttonButton(self, event):
+        event.Skip()
+
+    def OnFile_listKeyUp(self, event):
+        event.Skip()
+
+    def OnFile_listLeftDclick(self, event):
+        event.Skip()
+
+    def OnFile_listListItemRightClick(self, event):
+        event.Skip()
+
+    def OnFile_listListItemSelected(self, event):
+        event.Skip()
+
+    def OnIndex_buttonButton(self, event):
+        event.Skip()
+
+    def OnSearch_buttonButton(self, event):
+        event.Skip()
+
+    def OnSelect_all_path_buttonButton(self, event):
+        event.Skip()
+
+
+if __name__ == '__main__':
+    # Script entry point: resolve the config file through find_user_dir(),
+    # wrap it in a persistant_dict, then show the dialog modally.
+
+    f = 'eyfo-config.json'
+    configfile = find_user_dir( particular_file=f, return_file_or_home='file', accept_current_dir=True, create_if_not_exist=True )
+    conf = persistant_dict( configfile )
+
+    app = wx.PySimpleApp()
+    dlg = create(None)
+    # NOTE(review): `conf` is built above but never handed to the dialog
+    # while the next line stays commented out -- confirm this is intended.
+    #dlg.conf = conf
+    try:
+        dlg.ShowModal()
+    finally:
+        # Destroy the dialog even if ShowModal() raises.
+        dlg.Destroy()
+    app.MainLoop()

File html2text.py

View file
+#!/usr/bin/env python
+"""html2text: Turn HTML into equivalent Markdown-structured text."""
+__version__ = "2.35"
+__author__ = "Aaron Swartz (me@aaronsw.com)"
+__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
+__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
+
+# TODO:
+#   Support decoded entities with unifiable.
+#   Relative URL resolution
+
+if not hasattr(__builtins__, 'True'): True, False = 1, 0
+import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
+import sgmllib
+sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
+
+try: from textwrap import wrap
+except: pass
+
+# Use Unicode characters instead of their ASCII pseudo-replacements
+UNICODE_SNOB = 0
+
+# Put the links after each paragraph instead of at the end.
+LINKS_EACH_PARAGRAPH = 0
+
+# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
+BODY_WIDTH = 78
+
+# Don't show internal links (href="#local-anchor") -- corresponding link targets
+# won't be visible in the plain text file anyway.
+SKIP_INTERNAL_LINKS = False
+
+### Entity Nonsense ###
+
+def name2cp(k):
+    if k == 'apos': return ord("'")
+    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
+        return htmlentitydefs.name2codepoint[k]
+    else:
+        k = htmlentitydefs.entitydefs[k]
+        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
+        return ord(codecs.latin_1_decode(k)[0])
+
+unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', 
+'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
+'ndash':'-', 'oelig':'oe', 'aelig':'ae',
+'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', 
+'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', 
+'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
+'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 
+'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
+
+unifiable_n = {}
+
+for k in unifiable.keys():
+    unifiable_n[name2cp(k)] = unifiable[k]
+
+def charref(name):
+    if name[0] in ['x','X']:
+        c = int(name[1:], 16)
+    else:
+        c = int(name)
+    
+    if not UNICODE_SNOB and c in unifiable_n.keys():
+        return unifiable_n[c]
+    else:
+        return unichr(c)
+
+def entityref(c):
+    if not UNICODE_SNOB and c in unifiable.keys():
+        return unifiable[c]
+    else:
+        try: name2cp(c)
+        except KeyError: return "&" + c
+        else: return unichr(name2cp(c))
+
+def replaceEntities(s):
+    s = s.group(1)
+    if s[0] == "#": 
+        return charref(s[1:])
+    else: return entityref(s)
+
+r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
+def unescape(s):
+    return r_unescape.sub(replaceEntities, s)
+    
+def fixattrs(attrs):
+    # Fix bug in sgmllib.py
+    if not attrs: return attrs
+    newattrs = []
+    for attr in attrs:
+        newattrs.append((attr[0], unescape(attr[1])))
+    return newattrs
+
+### End Entity Nonsense ###
+
+def onlywhite(line):
+    """Return true if the line does only consist of whitespace characters."""
+    for c in line:
+        if c is not ' ' and c is not '  ':
+            return c is ' '
+    return line
+
+def optwrap(text):
+    """Wrap all paragraphs in the provided text."""
+    if not BODY_WIDTH:
+        return text
+    
+    assert wrap, "Requires Python 2.3."
+    result = ''
+    newlines = 0
+    for para in text.split("\n"):
+        if len(para) > 0:
+            if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
+                for line in wrap(para, BODY_WIDTH):
+                    result += line + "\n"
+                result += "\n"
+                newlines = 2
+            else:
+                if not onlywhite(para):
+                    result += para + "\n"
+                    newlines = 1
+        else:
+            if newlines < 2:
+                result += "\n"
+                newlines += 1
+    return result
+
+def hn(tag):
+    if tag[0] == 'h' and len(tag) == 2:
+        try:
+            n = int(tag[1])
+            if n in range(1, 10): return n
+        except ValueError: return 0
+
+class _html2text(sgmllib.SGMLParser):
+    """SGML parser that writes Markdown-structured text for the HTML fed to it.
+
+    Text is passed to the ``out`` callable given to the constructor.  With
+    ``out=None`` it is accumulated in ``self.outtext`` instead, which is the
+    value close() returns.
+    """
+    def __init__(self, out=sys.stdout.write):
+        sgmllib.SGMLParser.__init__(self)
+        
+        if out is None: self.out = self.outtextf
+        else: self.out = out
+        self.outtext = u''
+        self.quiet = 0  # raised by head/style/script start tags; o() writes nothing while non-zero
+        self.p_p = 0  # newlines pending before the next output: 1 (pbr) or 2 (p)
+        self.outcount = 0  # number of o() calls that wrote their data
+        self.start = 1  # set initially and after "> " / list markers; o() then drops pending breaks
+        self.space = 0  # a single space is pending
+        self.a = []  # link/image attribute dicts whose reference lines are not yet written
+        self.astack = []  # attrs of the <a> tags currently open (None for links that are skipped)
+        self.acount = 0  # number given to the most recently registered link
+        self.list = []  # stack of {'name', 'num'} dicts for the open <ol>/<ul> lists
+        self.blockquote = 0  # blockquote nesting depth
+        self.pre = 0  # 1 while inside <pre>
+        self.startpre = 0  # set when <pre> opens, cleared by the next o() that writes
+        self.lastWasNL = 0  # true when the data written last ended in "\n"
+        self.abbr_title = None # current abbreviation definition
+        self.abbr_data = None # last inner HTML (for abbr being defined)
+        self.abbr_list = {} # stack of abbreviations to write later
+    
+    def outtextf(self, s): 
+        """Default sink used when no ``out`` callable was given: buffer *s*."""
+        self.outtext += s
+    
+    def close(self):
+        """Finish parsing, flush the pending output and return self.outtext.
+
+        The final o() call uses force='end', which also writes the collected
+        link references and abbreviation definitions.
+        """
+        sgmllib.SGMLParser.close(self)
+        
+        self.pbr()
+        self.o('', 0, 'end')
+        
+        return self.outtext
+        
+    def handle_charref(self, c):
+        """Write the text for the numeric character reference *c*."""
+        self.o(charref(c))
+
+    def handle_entityref(self, c):
+        """Write the text for the named entity *c*."""
+        self.o(entityref(c))
+            
+    def unknown_starttag(self, tag, attrs):
+        """Route a start tag to handle_tag() with start=1."""
+        self.handle_tag(tag, attrs, 1)
+    
+    def unknown_endtag(self, tag):
+        """Route an end tag to handle_tag() with start=0 and no attributes."""
+        self.handle_tag(tag, None, 0)
+        
+    def previousIndex(self, attrs):
+        """Return the index in self.a of a link equal to *attrs*, or None.
+
+        Two links are equal when their 'href' values match and either
+        neither has a 'title' or both have the same 'title'.  None is also
+        returned when *attrs* has no 'href'.
+        """
+        if not attrs.has_key('href'): return None
+        
+        i = -1
+        for a in self.a:
+            i += 1
+            match = 0
+            
+            if a.has_key('href') and a['href'] == attrs['href']:
+                if a.has_key('title') or attrs.has_key('title'):
+                        if (a.has_key('title') and attrs.has_key('title') and
+                            a['title'] == attrs['title']):
+                            match = True
+                else:
+                    match = True
+
+            if match: return i
+
+    def handle_tag(self, tag, attrs, start):
+        """Emit the Markdown markup for one start (start=1) or end (start=0) tag.
+
+        *attrs* is the list of (name, value) pairs of a start tag, or None
+        for an end tag.  Tags that match none of the tests below produce no
+        output.
+        """
+        attrs = fixattrs(attrs)
+    
+        # Headings: paragraph break, then one "#" per level on the start tag.
+        if hn(tag):
+            self.p()
+            if start: self.o(hn(tag)*"#" + ' ')
+
+        if tag in ['p', 'div']: self.p()
+        
+        if tag == "br" and start: self.o("  \n")
+
+        if tag == "hr" and start:
+            self.p()
+            self.o("* * *")
+            self.p()
+
+        # Contents of head/style/script are suppressed through self.quiet.
+        if tag in ["head", "style", 'script']: 
+            if start: self.quiet += 1
+            else: self.quiet -= 1
+
+        if tag in ["body"]:
+            self.quiet = 0 # sites like 9rules.com never close <head>
+        
+        if tag == "blockquote":
+            if start: 
+                self.p(); self.o('> ', 0, 1); self.start = 1
+                self.blockquote += 1
+            else:
+                self.blockquote -= 1
+                self.p()
+        
+        # Inline emphasis: the same marker is written for start and end tags.
+        if tag in ['em', 'i', 'u']: self.o("_")
+        if tag in ['strong', 'b']: self.o("**")
+        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
+        if tag == "abbr":
+            if start:
+                attrsD = {}
+                for (x, y) in attrs: attrsD[x] = y
+                attrs = attrsD
+                
+                self.abbr_title = None
+                self.abbr_data = ''
+                if attrs.has_key('title'):
+                    self.abbr_title = attrs['title']
+            else:
+                # Remember "text -> title" for the definition list that o()
+                # writes at the end.
+                if self.abbr_title != None:
+                    self.abbr_list[self.abbr_data] = self.abbr_title
+                    self.abbr_title = None
+                self.abbr_data = ''
+        
+        if tag == "a":
+            if start:
+                attrsD = {}
+                for (x, y) in attrs: attrsD[x] = y
+                attrs = attrsD
+                if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): 
+                    self.astack.append(attrs)
+                    self.o("[")
+                else:
+                    # Keep the stack balanced for links that are not rendered.
+                    self.astack.append(None)
+            else:
+                if self.astack:
+                    a = self.astack.pop()
+                    if a:
+                        # Reuse the number of an identical earlier link,
+                        # otherwise register this one under a new number.
+                        i = self.previousIndex(a)
+                        if i is not None:
+                            a = self.a[i]
+                        else:
+                            self.acount += 1
+                            a['count'] = self.acount
+                            a['outcount'] = self.outcount
+                            self.a.append(a)
+                        self.o("][" + `a['count']` + "]")
+        
+        if tag == "img" and start:
+            attrsD = {}
+            for (x, y) in attrs: attrsD[x] = y
+            attrs = attrsD
+            if attrs.has_key('src'):
+                # Images share the reference list with links: src acts as href.
+                attrs['href'] = attrs['src']
+                alt = attrs.get('alt', '')
+                i = self.previousIndex(attrs)
+                if i is not None:
+                    attrs = self.a[i]
+                else:
+                    self.acount += 1
+                    attrs['count'] = self.acount
+                    attrs['outcount'] = self.outcount
+                    self.a.append(attrs)
+                self.o("![")
+                self.o(alt)
+                self.o("]["+`attrs['count']`+"]")
+        
+        if tag == 'dl' and start: self.p()
+        if tag == 'dt' and not start: self.pbr()
+        if tag == 'dd' and start: self.o('    ')
+        if tag == 'dd' and not start: self.pbr()
+        
+        if tag in ["ol", "ul"]:
+            if start:
+                self.list.append({'name':tag, 'num':0})
+            else:
+                if self.list: self.list.pop()
+            
+            self.p()
+        
+        if tag == 'li':
+            if start:
+                self.pbr()
+                # An <li> outside any list is rendered as a bullet item.
+                if self.list: li = self.list[-1]
+                else: li = {'name':'ul', 'num':0}
+                self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
+                if li['name'] == "ul": self.o("* ")
+                elif li['name'] == "ol":
+                    li['num'] += 1
+                    self.o(`li['num']`+". ")
+                self.start = 1
+            else:
+                self.pbr()
+        
+        if tag in ["table", "tr"] and start: self.p()
+        if tag == 'td': self.pbr()
+        
+        if tag == "pre":
+            if start:
+                self.startpre = 1
+                self.pre = 1
+            else:
+                self.pre = 0
+            self.p()
+            
+    def pbr(self):
+        """Request a single line break unless a break is already pending."""
+        if self.p_p == 0: self.p_p = 1
+
+    # Request a paragraph break (two newlines) before the next output.
+    def p(self): self.p_p = 2
+    
+    def o(self, data, puredata=0, force=0):
+        """Write *data*, preceded by whatever breaks and spaces are pending.
+
+        puredata -- true for document text (as opposed to generated markup);
+                    outside <pre> its whitespace runs are collapsed.
+        force    -- true to go on even when *data* is empty; the value 'end'
+                    additionally flushes the link references and the
+                    abbreviation definitions.
+        """
+        if self.abbr_data is not None: self.abbr_data += data
+        
+        if not self.quiet: 
+            if puredata and not self.pre:
+                # Collapse whitespace; a leading space is kept as a pending
+                # space so that it can be dropped after a break.
+                data = re.sub('\s+', ' ', data)
+                if data and data[0] == ' ':
+                    self.space = 1
+                    data = data[1:]
+            if not data and not force: return
+            
+            if self.startpre:
+                #self.out(" :") #TODO: not output when already one there
+                self.startpre = 0
+            
+            # Prefix that starts every new line inside blockquotes / <pre>.
+            bq = (">" * self.blockquote)
+            if not (force and data and data[0] == ">") and self.blockquote: bq += " "
+            
+            if self.pre:
+                bq += "    "
+                data = data.replace("\n", "\n"+bq)
+            
+            if self.start:
+                self.space = 0
+                self.p_p = 0
+                self.start = 0
+
+            if force == 'end':
+                # It's the end.
+                self.p_p = 0
+                self.out("\n")
+                self.space = 0
+
+
+            if self.p_p:
+                self.out(('\n'+bq)*self.p_p)
+                self.space = 0
+                
+            if self.space:
+                if not self.lastWasNL: self.out(' ')
+                self.space = 0
+
+            # Write the reference lines for links that were emitted earlier;
+            # links registered since the last output stay queued in newa.
+            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
+                if force == "end": self.out("\n")
+
+                newa = []
+                for link in self.a:
+                    if self.outcount > link['outcount']:
+                        self.out("   ["+`link['count']`+"]: " + link['href']) #TODO: base href
+                        if link.has_key('title'): self.out(" ("+link['title']+")")
+                        self.out("\n")
+                    else:
+                        newa.append(link)
+
+                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
+
+                self.a = newa
+            
+            if self.abbr_list and force == "end":
+                for abbr, definition in self.abbr_list.items():
+                    self.out("  *[" + abbr + "]: " + definition + "\n")
+
+            self.p_p = 0
+            self.out(data)
+            self.lastWasNL = data and data[-1] == '\n'
+            self.outcount += 1
+
+    def handle_data(self, data):
+        """Write document text, with whitespace collapsing enabled."""
+        # NOTE(review): text containing an escaped "\/script>" lowers quiet;
+        # presumably a workaround for script bodies the parser hands over as
+        # data -- confirm before changing.
+        if r'\/script>' in data: self.quiet -= 1
+        self.o(data, 1)
+    
+    # Declarations the parser does not know are ignored.
+    def unknown_decl(self, data): pass
+
+# Write *text* to stdout encoded as UTF-8.
+def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
+
+def html2text_file(html, out=wrapwrite):
+    """Convert *html*, passing the resulting text to *out*.
+
+    Returns the value of the parser's close(): its internal buffer, which
+    only receives the text when *out* is None.
+    """
+    h = _html2text(out)
+    h.feed(html)
+    h.feed("")
+    return h.close()
+
+def html2text(html):
+    """Return *html* converted to text and passed through optwrap()."""
+    return optwrap(html2text_file(html, None))
+
+if __name__ == "__main__":
+    # Command line: the first argument is an http:// URL or a file name; for
+    # a file, an optional second argument names its encoding (default utf8).
+    # Without arguments the HTML is read from stdin and decoded as UTF-8.
+    if sys.argv[1:]:
+        arg = sys.argv[1]
+        if arg.startswith('http://'):
+            j = urllib.urlopen(arg)
+            try:
+                from feedparser import _getCharacterEncoding as enc
+            except ImportError:
+                   # Without feedparser, always report UTF-8.
+                   enc = lambda x, y: ('utf-8', 1)
+            text = j.read()
+            encoding = enc(j.headers, text)[0]
+            if encoding == 'us-ascii': encoding = 'utf-8'
+            data = unicode(text, encoding, 'replace')
+
+        else:
+            encoding = 'utf8'
+            if len(sys.argv) > 2:
+                encoding = sys.argv[2]
+            data = open(arg, 'r').read().decode(encoding)
+    else:
+        data = sys.stdin.read().decode('utf8')
+    wrapwrite(html2text(data))

File pdfminer/Makefile

View file
+# Makefile for pdfminer
+
+RM=rm -f
+
+all:
+
+clean:
+	-$(RM) *.pyc *.pyo
+	cd cmap && make clean

File pdfminer/__init__.py

View file
+#!/usr/bin/env python2
+__version__ = '20110515'
+
+if __name__ == '__main__': print __version__

File pdfminer/arcfour.py

View file
+#!/usr/bin/env python2
+
+""" Python implementation of Arcfour encryption algorithm.
+
+This code is in the public domain.
+
+"""
+
+##  Arcfour
+##
+class Arcfour(object):
+
+    """
+    >>> Arcfour('Key').process('Plaintext').encode('hex')
+    'bbf316e8d940af0ad3'
+    >>> Arcfour('Wiki').process('pedia').encode('hex')
+    '1021bf0420'
+    >>> Arcfour('Secret').process('Attack at dawn').encode('hex')
+    '45a01f645fc35b383552544b9bf5'
+    """
+
+    def __init__(self, key):
+        s = range(256)
+        j = 0
+        klen = len(key)
+        for i in xrange(256):
+            j = (j + s[i] + ord(key[i % klen])) % 256
+            (s[i], s[j]) = (s[j], s[i])
+        self.s = s
+        (self.i, self.j) = (0, 0)
+        return
+
+    def process(self, data):
+        (i, j) = (self.i, self.j)
+        s = self.s
+        r = ''
+        for c in data:
+            i = (i+1) % 256
+            j = (j+s[i]) % 256
+            (s[i], s[j]) = (s[j], s[i])
+            k = s[(s[i]+s[j]) % 256]
+            r += chr(ord(c) ^ k)
+        (self.i, self.j) = (i, j)
+        return r
+
+# test
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()

File pdfminer/ascii85.py

View file
+#!/usr/bin/env python2
+
+""" Python implementation of ASCII85/ASCIIHex decoder (Adobe version).
+
+This code is in the public domain.
+
+"""
+
+import re
+import struct
+
+# ascii85decode(data)
+def ascii85decode(data):
+    """
+    In ASCII85 encoding, every four bytes are encoded with five ASCII
+    letters, using 85 different types of characters (as 256**4 < 85**5).
+    When the length of the original bytes is not a multiple of 4, a special
+    rule is used for round up.
+    
+    The Adobe's ASCII85 implementation is slightly different from
+    its original in handling the last characters.
+    
+    The sample string is taken from:
+      http://en.wikipedia.org/w/index.php?title=Ascii85
+    
+    >>> ascii85decode('9jqo^BlbD-BleB1DJ+*+F(f,q')
+    'Man is distinguished'
+    >>> ascii85decode('E,9)oF*2M7/c~>')
+    'pleasure.'
+    """
+    n = b = 0
+    out = ''
+    for c in data:
+        if '!' <= c and c <= 'u':
+            n += 1
+            b = b*85+(ord(c)-33)
+            if n == 5:
+                out += struct.pack('>L',b)
+                n = b = 0
+        elif c == 'z':
+            assert n == 0
+            out += '\0\0\0\0'
+        elif c == '~':
+            if n:
+                for _ in range(5-n):
+                    b = b*85+84
+                out += struct.pack('>L',b)[:n-1]
+            break
+    return out
+
+# asciihexdecode(data)
+hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
+trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
+def asciihexdecode(data):
+    """
+    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
+    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
+    ASCIIHexDecode filter produces one byte of binary data. All white-space
+    characters are ignored. A right angle bracket character (>) indicates
+    EOD. Any other characters will cause an error. If the filter encounters
+    the EOD marker after reading an odd number of hexadecimal digits, it
+    will behave as if a 0 followed the last digit.
+    
+    >>> asciihexdecode('61 62 2e6364   65')
+    'ab.cde'
+    >>> asciihexdecode('61 62 2e6364   657>')
+    'ab.cdep'
+    >>> asciihexdecode('7>')
+    'p'
+    """
+    decode = (lambda hx: chr(int(hx, 16)))
+    out = map(decode, hex_re.findall(data))
+    m = trail_re.search(data)
+    if m:
+        out.append(decode("%c0" % m.group(1)))
+    return ''.join(out)
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()

File pdfminer/cmap/Makefile

View file
+# Makefile for pdfminer.cmap
+
+all:
+
+clean:
+	-rm *.pyc *.pyo
+
+cmap_clean:
+	rm -f *.pickle.gz

File pdfminer/cmap/__init__.py

Empty file added.

File pdfminer/cmapdb.py

View file
+#!/usr/bin/env python2
+
+""" Adobe character mapping (CMap) support.
+
+CMaps provide the mapping between character codes and Unicode
+code-points to character ids (CIDs).
+
+More information is available on the Adobe website:
+
+  http://opensource.adobe.com/wiki/display/cmap/CMap+Resources
+
+"""
+
+import sys
+import re
+import os
+import os.path
+import gzip
+import cPickle as pickle
+import cmap
+import struct
+from psparser import PSStackParser
+from psparser import PSException, PSSyntaxError, PSTypeError, PSEOF
+from psparser import PSLiteral, PSKeyword
+from psparser import literal_name, keyword_name
+from encodingdb import name2unicode
+from utils import choplist, nunpack
+
+
+class CMapError(Exception): pass
+
+
+##  CMap
+##
+class CMap(object):
+
+    debug = 0
+
+    def __init__(self, code2cid=None):
+        self.code2cid = code2cid or {}
+        return
+
+    def is_vertical(self):
+        return False
+
+    def use_cmap(self, cmap):
+        assert isinstance(cmap, CMap)
+        def copy(dst, src):
+            for (k,v) in src.iteritems():
+                if isinstance(v, dict):
+                    d = {}
+                    dst[k] = d
+                    copy(d, v)
+                else:
+                    dst[k] = v
+        copy(self.code2cid, cmap.code2cid)
+        return
+
+    def decode(self, code):
+        if self.debug:
+            print >>sys.stderr, 'decode: %r, %r' % (self, code)
+        d = self.code2cid
+        for c in code:
+            c = ord(c)
+            if c in d:
+                d = d[c]
+                if isinstance(d, int):
+                    yield d
+                    d = self.code2cid
+            else:
+                d = self.code2cid
+        return
+
+    def dump(self, out=sys.stdout, code2cid=None, code=None):
+        if code2cid is None:
+            code2cid = self.code2cid
+            code = ()
+        for (k,v) in sorted(code2cid.iteritems()):
+            c = code+(k,)
+            if isinstance(v, int):
+                out.write('code %r = cid %d\n' % (c,v))
+            else:
+                self.dump(out=out, code2cid=v, code=c)
+        return
+    
+
+##  IdentityCMap
+##
+class IdentityCMap(object):
+
+    def __init__(self, vertical):
+        self.vertical = vertical
+        return
+
+    def is_vertical(self):
+        return self.vertical
+
+    def decode(self, code):
+        n = len(code)/2
+        if n:
+            return struct.unpack('>%dH' % n, code)
+        else:
+            return ()
+        
+            
+
+##  UnicodeMap
+##
+class UnicodeMap(object):
+
+    debug = 0
+
+    def __init__(self, cid2unichr=None):
+        self.cid2unichr = cid2unichr or {}
+        return
+
+    def get_unichr(self, cid):
+        if self.debug:
+            print >>sys.stderr, 'get_unichr: %r, %r' % (self, cid)
+        return self.cid2unichr[cid]
+
+    def dump(self, out=sys.stdout):
+        for (k,v) in sorted(self.cid2unichr.iteritems()):
+            out.write('cid %d = unicode %r\n' % (k,v))
+        return
+
+
+##  FileCMap
+##
+class FileCMap(CMap):
+
+    def __init__(self):
+        CMap.__init__(self)
+        self.attrs = {}
+        return
+
+    def __repr__(self):
+        return '<CMap: %s>' % self.attrs.get('CMapName')
+
+    def is_vertical(self):
+        return self.attrs.get('WMode', 0) != 0
+
+    def set_attr(self, k, v):
+        self.attrs[k] = v
+        return
+
+    def add_code2cid(self, code, cid):
+        assert isinstance(code, str) and isinstance(cid, int)
+        d = self.code2cid
+        for c in code[:-1]:
+            c = ord(c)
+            if c in d:
+                d = d[c]
+            else:
+                t = {}
+                d[c] = t
+                d =t
+        c = ord(code[-1])
+        d[c] = cid
+        return
+
+
+##  FileUnicodeMap
+##
+class FileUnicodeMap(UnicodeMap):
+    
+    def __init__(self):
+        UnicodeMap.__init__(self)
+        self.attrs = {}
+        return
+
+    def __repr__(self):
+        return '<UnicodeMap: %s>' % self.attrs.get('CMapName')
+
+    def set_attr(self, k, v):
+        self.attrs[k] = v
+        return
+
+    def add_cid2unichr(self, cid, code):
+        assert isinstance(cid, int)
+        if isinstance(code, PSLiteral):
+            # Interpret as an Adobe glyph name.
+            self.cid2unichr[cid] = name2unicode(code.name)
+        elif isinstance(code, str):
+            # Interpret as UTF-16BE.
+            self.cid2unichr[cid] = unicode(code, 'UTF-16BE', 'ignore')
+        elif isinstance(code, int):
+            self.cid2unichr[cid] = unichr(code)
+        else:
+            raise TypeError(code)
+        return
+
+
+##  PyCMap
+##
+class PyCMap(CMap):
+
+    def __init__(self, name, module):
+        CMap.__init__(self, module.CODE2CID)
+        self.name = name
+        self._is_vertical = module.IS_VERTICAL
+        return
+
+    def __repr__(self):
+        return '<PyCMap: %s>' % (self.name)
+
+    def is_vertical(self):
+        return self._is_vertical
+    
+
+##  PyUnicodeMap
+##
+class PyUnicodeMap(UnicodeMap):
+    
+    def __init__(self, name, module, vertical):
+        if vertical:
+            cid2unichr = module.CID2UNICHR_V
+        else:
+            cid2unichr = module.CID2UNICHR_H
+        UnicodeMap.__init__(self, cid2unichr)
+        self.name = name
+        return
+
+    def __repr__(self):
+        return '<PyUnicodeMap: %s>' % (self.name)
+
+
+##  CMapDB
+##
+class CMapDB(object):
+    """Loads CMap data from "<name>.pickle.gz" files and caches the results."""
+
+    debug = 0
+    _cmap_cache = {}  # CMap name -> PyCMap
+    _umap_cache = {}  # name -> [horizontal PyUnicodeMap, vertical PyUnicodeMap]
+    
+    class CMapNotFound(CMapError): pass
+
+    @classmethod
+    def _load_data(klass, name):
+        """Return the data stored in "<name>.pickle.gz" as a new class object.
+
+        The file is looked up in the directory of the ``cmap`` package and
+        then in $CMAP_PATH (default /usr/share/pdfminer/).  The unpickled
+        dict becomes the attributes of the returned class.  Raises
+        CMapDB.CMapNotFound when neither directory has the file.
+        """
+        filename = '%s.pickle.gz' % name
+        if klass.debug:
+            print >>sys.stderr, 'loading:', name
+        default_path = os.environ.get('CMAP_PATH', '/usr/share/pdfminer/')
+        for directory in (os.path.dirname(cmap.__file__), default_path):
+            path = os.path.join(directory, filename)
+            if os.path.exists(path):
+                gzfile = gzip.open(path)
+                try:
+                    # SECURITY: unpickling runs code chosen by the file's
+                    # author, so the directories searched above must only
+                    # hold trusted files.
+                    return type(name, (), pickle.loads(gzfile.read()))
+                finally:
+                    gzfile.close()
+        else:
+            # Reached only when the loop ends without returning.
+            raise CMapDB.CMapNotFound(name)
+
+    @classmethod
+    def get_cmap(klass, name):
+        """Return the CMap called *name*.
+
+        'Identity-H' and 'Identity-V' give a new IdentityCMap; every other
+        name is loaded once through _load_data() and then served from the
+        cache.
+        """
+        if name == 'Identity-H':
+            return IdentityCMap(False)
+        elif name == 'Identity-V':
+            return IdentityCMap(True)
+        try:
+            return klass._cmap_cache[name]
+        except KeyError:
+            pass
+        data = klass._load_data(name)
+        klass._cmap_cache[name] = cmap = PyCMap(name, data)
+        return cmap
+
+    @classmethod
+    def get_unicode_map(klass, name, vertical=False):
+        """Return the PyUnicodeMap for *name* in the requested direction.
+
+        Both directions are built from "to-unicode-<name>" on first use and
+        cached as a two-item list indexed by *vertical*.
+        """
+        try:
+            return klass._umap_cache[name][vertical]
+        except KeyError:
+            pass
+        data = klass._load_data('to-unicode-%s' % name)
+        klass._umap_cache[name] = umaps = [PyUnicodeMap(name, data, v) for v in (False, True)]
+        return umaps[vertical]
+
+
+##  CMapParser
+##
+class CMapParser(PSStackParser):
+    """Stack parser that feeds the operators of a CMap file into *cmap*.
+
+    *cmap* must provide the methods called below: set_attr(), use_cmap(),
+    add_code2cid() and add_cid2unichr(), depending on the operators that
+    occur in the file.
+    """
+
+    def __init__(self, cmap, fp):
+        PSStackParser.__init__(self, fp)
+        self.cmap = cmap
+        self._in_cmap = False  # True between 'begincmap' and 'endcmap'
+        return
+
+    def run(self):
+        """Parse the stream; hitting the end of input (PSEOF) ends the run."""
+        try:
+            self.nextobject()
+        except PSEOF:
+            pass
+        return
+
+    def do_keyword(self, pos, token):
+        """Handle one keyword, consuming its operands from the parser stack.
+
+        Keywords outside begincmap/endcmap are ignored.  Keywords that are
+        not recognised below are pushed onto the stack.
+        """
+        name = token.name
+        if name == 'begincmap':
+            self._in_cmap = True
+            self.popall()
+            return
+        elif name == 'endcmap':
+            self._in_cmap = False
+            return
+        if not self._in_cmap: return
+        #
+        if name == 'def':
+            # key value def -> attribute of the CMap
+            try:
+                ((_,k),(_,v)) = self.pop(2)
+                self.cmap.set_attr(literal_name(k), v)
+            except PSSyntaxError:
+                pass
+            return
+
+        if name == 'usecmap':
+            # Import the entries of another CMap; an unknown one is ignored.
+            try:
+                ((_,cmapname),) = self.pop(1)
+                self.cmap.use_cmap(CMapDB.get_cmap(literal_name(cmapname)))
+            except PSSyntaxError:
+                pass
+            except CMapDB.CMapNotFound:
+                pass
+            return
+
+        # Code space ranges are discarded.
+        if name == 'begincodespacerange':
+            self.popall()
+            return
+        if name == 'endcodespacerange':
+            self.popall()
+            return
+
+        if name == 'begincidrange':
+            self.popall()
+            return
+        if name == 'endcidrange':
+            # Operands are consumed three at a time: start code, end code
+            # and the CID of the start code.
+            objs = [ obj for (_,obj) in self.popall() ]
+            for (s,e,cid) in choplist(3, objs):
+                if (not isinstance(s, str) or not isinstance(e, str) or
+                    not isinstance(cid, int) or len(s) != len(e)): continue
+                # Only the last (up to) four bytes may differ between the
+                # start and the end code; the bytes before them must match.
+                sprefix = s[:-4]
+                eprefix = e[:-4]
+                if sprefix != eprefix: continue
+                svar = s[-4:]
+                evar = e[-4:]
+                s1 = nunpack(svar)
+                e1 = nunpack(evar)
+                vlen = len(svar)
+                #assert s1 <= e1
+                for i in xrange(e1-s1+1):
+                    # Rebuild each code with the same byte length as the
+                    # start code.
+                    x = sprefix+struct.pack('>L',s1+i)[-vlen:]
+                    self.cmap.add_code2cid(x, cid+i)
+            return
+
+        if name == 'begincidchar':
+            self.popall()
+            return
+        if name == 'endcidchar':
+            # NOTE(review): the first item of each pair is unpacked to a
+            # number and used as the CID, the second is used as the code,
+            # and both must be strings -- verify against the cidchar syntax.
+            objs = [ obj for (_,obj) in self.popall() ]
+            for (cid,code) in choplist(2, objs):
+                if isinstance(code, str) and isinstance(cid, str):
+                    self.cmap.add_code2cid(code, nunpack(cid))
+            return
+
+        if name == 'beginbfrange':
+            self.popall()
+            return
+        if name == 'endbfrange':
+            # Operands are consumed three at a time: start, end and either a
+            # list with one destination per code or a single destination
+            # string that is incremented along the range.
+            objs = [ obj for (_,obj) in self.popall() ]
+            for (s,e,code) in choplist(3, objs):
+                if (not isinstance(s, str) or not isinstance(e, str) or
+                    len(s) != len(e)): continue
+                s1 = nunpack(s)
+                e1 = nunpack(e)
+                #assert s1 <= e1
+                if isinstance(code, list):
+                    # NOTE(review): a list shorter than the range raises
+                    # IndexError here.
+                    for i in xrange(e1-s1+1):
+                        self.cmap.add_cid2unichr(s1+i, code[i])
+                else:
+                    var = code[-4:]
+                    base = nunpack(var)
+                    prefix = code[:-4]
+                    vlen = len(var)
+                    for i in xrange(e1-s1+1):
+                        x = prefix+struct.pack('>L',base+i)[-vlen:]
+                        self.cmap.add_cid2unichr(s1+i, x)
+            return
+
+        if name == 'beginbfchar':
+            self.popall()
+            return
+        if name == 'endbfchar':
+            # Pairs of source code and destination string.
+            objs = [ obj for (_,obj) in self.popall() ]
+            for (cid,code) in choplist(2, objs):
+                if isinstance(cid, str) and isinstance(code, str):
+                    self.cmap.add_cid2unichr(nunpack(cid), code)
+            return
+
+        # notdef ranges are discarded.
+        if name == 'beginnotdefrange':
+            self.popall()
+            return
+        if name == 'endnotdefrange':
+            self.popall()
+            return
+
+        self.push((pos, token))
+        return
+
+# test
+def main(argv):
+    args = argv[1:]
+    for fname in args:
+        fp = file(fname, 'rb')
+        cmap = FileUnicodeMap()
+        #cmap = FileCMap()
+        CMapParser(cmap, fp).run()
+        fp.close()
+        cmap.dump()
+    return
+
+if __name__ == '__main__': sys.exit(main(sys.argv))

File pdfminer/converter.py

View file
+#!/usr/bin/env python2
+import sys, os.path
+from pdfdevice import PDFDevice, PDFTextDevice
+from pdffont import PDFUnicodeNotDefined
+from pdftypes import LITERALS_DCT_DECODE
+from pdfcolor import LITERAL_DEVICE_GRAY, LITERAL_DEVICE_RGB
+from layout import LTContainer, LTPage, LTText, LTLine, LTRect, LTCurve
+from layout import LTFigure, LTImage, LTChar, LTTextLine
+from layout import LTTextBox, LTTextBoxVertical, LTTextGroup
+from utils import apply_matrix_pt, mult_matrix
+from utils import enc, bbox2str, create_bmp
+
+
+##  PDFLayoutAnalyzer
+##
+class PDFLayoutAnalyzer(PDFTextDevice):
+
+    def __init__(self, rsrcmgr, pageno=1, laparams=None):
+        PDFTextDevice.__init__(self, rsrcmgr)
+        self.pageno = pageno
+        self.laparams = laparams
+        self._stack = []
+        return
+
+    def begin_page(self, page, ctm):
+        (x0,y0,x1,y1) = page.mediabox
+        (x0,y0) = apply_matrix_pt(ctm, (x0,y0))
+        (x1,y1) = apply_matrix_pt(ctm, (x1,y1))
+        mediabox = (0, 0, abs(x0-x1), abs(y0-y1))
+        self.cur_item = LTPage(self.pageno, mediabox)
+        return
+
+    def end_page(self, page):
+        assert not self._stack
+        assert isinstance(self.cur_item, LTPage)
+        if self.laparams is not None:
+            self.cur_item.analyze(self.laparams)
+        self.pageno += 1
+        self.receive_layout(self.cur_item)
+        return
+
+    def begin_figure(self, name, bbox, matrix):
+        self._stack.append(self.cur_item)
+        self.cur_item = LTFigure(name, bbox, mult_matrix(matrix, self.ctm))
+        return
+
+    def end_figure(self, _):
+        fig = self.cur_item
+        assert isinstance(self.cur_item, LTFigure)
+        self.cur_item = self._stack.pop()
+        self.cur_item.add(fig)
+        return
+
+    def render_image(self, name, stream):
+        assert isinstance(self.cur_item, LTFigure)
+        item = LTImage(name, stream,
+                       (self.cur_item.x0, self.cur_item.y0,
+                        self.cur_item.x1, self.cur_item.y1))
+        self.cur_item.add(item)
+        return
+
+    def paint_path(self, gstate, stroke, fill, evenodd, path):
+        shape = ''.join(x[0] for x in path)
+        if shape == 'ml':
+            # horizontal/vertical line
+            (_,x0,y0) = path[0]
+            (_,x1,y1) = path[1]
+            (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
+            (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
+            if x0 == x1 or y0 == y1:
+                self.cur_item.add(LTLine(gstate.linewidth, (x0,y0), (x1,y1)))
+                return
+        if shape == 'mlllh':
+            # rectangle
+            (_,x0,y0) = path[0]
+            (_,x1,y1) = path[1]
+            (_,x2,y2) = path[2]
+            (_,x3,y3) = path[3]
+            (x0,y0) = apply_matrix_pt(self.ctm, (x0,y0))
+            (x1,y1) = apply_matrix_pt(self.ctm, (x1,y1))
+            (x2,y2) = apply_matrix_pt(self.ctm, (x2,y2))
+            (x3,y3) = apply_matrix_pt(self.ctm, (x3,y3))
+            if ((x0 == x1 and y1 == y2 and x2 == x3 and y3 == y0) or
+                (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)):
+                self.cur_item.add(LTRect(gstate.linewidth, (x0,y0,x2,y2)))
+                return
+        # other shapes
+        pts = []
+        for p in path:
+            for i in xrange(1, len(p), 2):
+                pts.append(apply_matrix_pt(self.ctm, (p[i], p[i+1])))
+        self.cur_item.add(LTCurve(gstate.linewidth, pts))
+        return
+
+    def render_char(self, matrix, font, fontsize, scaling, rise, cid):
+        try:
+            text = font.to_unichr(cid)
+            assert isinstance(text, unicode), text
+        except PDFUnicodeNotDefined:
+            text = self.handle_undefined_char(font, cid)
+        textwidth = font.char_width(cid)
+        textdisp = font.char_disp(cid)
+        item = LTChar(matrix, font, fontsize, scaling, rise, text, textwidth, textdisp)
+        self.cur_item.add(item)
+        return item.adv
+
+    def handle_undefined_char(self, font, cid):
+        if self.debug:
+            print >>sys.stderr, 'undefined: %r, %r' % (font, cid)
+        return '(cid:%d)' % cid
+
+    def receive_layout(self, ltpage):
+        return
+
+
+##  PDFPageAggregator
+##
+class PDFPageAggregator(PDFLayoutAnalyzer):
+
+    def __init__(self, rsrcmgr, pageno=1, laparams=None):
+        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
+        self.result = None
+        return
+    
+    def receive_layout(self, ltpage):
+        self.result = ltpage
+        return
+
+    def get_result(self):
+        return self.result
+
+
+##  PDFConverter
+##
+class PDFConverter(PDFLayoutAnalyzer):
+
+    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None):
+        PDFLayoutAnalyzer.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
+        self.outfp = outfp
+        self.codec = codec
+        return
+
+    def write_image(self, image):
+        stream = image.stream
+        filters = stream.get_filters()
+        if len(filters) == 1 and filters[0] in LITERALS_DCT_DECODE:
+            ext = '.jpg'
+            data = stream.get_rawdata()
+        elif stream.colorspace is LITERAL_DEVICE_RGB:
+            ext = '.bmp'
+            data = create_bmp(stream.get_data(), stream.bits*3, image.width, image.height)
+        elif stream.colorspace is LITERAL_DEVICE_GRAY:
+            ext = '.bmp'
+            data = create_bmp(stream.get_data(), stream.bits, image.width, image.height)
+        else:
+            ext = '.img'
+            data = stream.get_data()
+        name = image.name+ext
+        path = os.path.join(self.outdir, name)
+        fp = file(path, 'wb')
+        fp.write(data)
+        fp.close()
+        return name
+    
+
+##  TextConverter
+##
+class TextConverter(PDFConverter):
+
+    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None,
+                 showpageno=False):
+        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
+        self.showpageno = showpageno
+        return
+
+    def write_text(self, text):
+        self.outfp.write(text.encode(self.codec, 'ignore'))
+        return
+
+    def receive_layout(self, ltpage):
+        def render(item):
+            if isinstance(item, LTContainer):
+                for child in item:
+                    render(child)
+            elif isinstance(item, LTText):
+                self.write_text(item.get_text())
+            if isinstance(item, LTTextBox):
+                self.write_text('\n')
+        if self.showpageno:
+            self.write_text('Page %s\n' % ltpage.pageid)
+        render(ltpage)
+        self.write_text('\f')
+        return
+
+    # Some dummy functions to save memory/CPU when all that is wanted is text.
+    # This stops all the image and drawing ouput from being recorded and taking
+    # up RAM.
+    def render_image(self, name, stream):
+        pass
+    def paint_path(self, gstate, stroke, fill, evenodd, path):
+        pass
+
+
+##  HTMLConverter
+##
+class HTMLConverter(PDFConverter):
+
+    RECT_COLORS = {
+        #'char': 'green',
+        'figure': 'yellow',
+        'textline': 'magenta',
+        'textbox': 'cyan',
+        'textgroup': 'red',
+        'curve': 'black',
+        'page': 'gray',
+        }
+    
+    TEXT_COLORS = {
+        'textbox': 'blue',
+        'char': 'black',
+        }
+
+    def __init__(self, rsrcmgr, outfp, codec='utf-8', pageno=1, laparams=None, 
+                 scale=1, fontscale=0.7, layoutmode='normal', showpageno=True,
+                 pagemargin=50, outdir=None,
+                 rect_colors={'curve':'black', 'page':'gray'},
+                 text_colors={'char':'black'}):
+        PDFConverter.__init__(self, rsrcmgr, outfp, codec=codec, pageno=pageno, laparams=laparams)
+        self.scale = scale
+        self.fontscale = fontscale
+        self.layoutmode = layoutmode
+        self.showpageno = showpageno
+        self.pagemargin = pagemargin
+        self.outdir = outdir