Anonymous avatar Anonymous committed 05ed757

[oed][m]: (old: 2y) add in original script for automating ocring of oed 1st edition.

Comments (0)

Files changed (1)

+'''
+Grab original source files from:
+
+pdf: http://www.archive.org/download/oed01arch/oed01arch.pdf
+jp2: http://www.archive.org/download/oed01arch/oed01arch_jp2.tar   
+
+Put them inside oed/
+
+cache/ directory is created and used for storing results of processing.
+
+Dictionary
+==========
+
+Dictionary itself begins on p.25 of first scan volume.
+'''
+import os
+
+TIF_DIR = './cache/tif'
+PDF_DIR = './cache/pdf'
+def ensure_dir(dir):
+    if not os.path.exists(dir): os.makedirs(dir)
+ensure_dir(TIF_DIR)
+ensure_dir(PDF_DIR)
+
+def basename(path):
+    basename = os.path.basename(path)
+    return os.path.splitext(basename)[0]
+
+def pdf2tif(pdf):
+    basename = os.path.basename(pdf)
+    outfile = os.path.splitext(basename)[0] + '-%03d.tif'
+    outfile = os.path.join(TIF_DIR, outfile)
+    first_page = 1
+    last_page = 10
+    cmd = 'gs -q -dNOPAUSE -dBATCH -dSAFER -r300x300 -sDEVICE=tiffg3 ' + \
+        '-dFirstPage=%s -dLastPage=%s ' % (first_page, last_page) + \
+        '-sOutputFile="%s" -c save pop -f "%s"' % (outfile, pdf)
+    os.system(cmd)
+
+def chop_pdf(in_fp, out_fp):
+    '''Chop up the large pdf file (250MB, 1.3k pages) into more manageable
+    chunks (gs b0rks on the whole thing).
+    '''
+    from pyPdf import PdfFileWriter, PdfFileReader
+
+    output = PdfFileWriter()
+    input1 = PdfFileReader(file(in_fp, "rb"))
+    for ii in range(10):
+        output.addPage(input1.getPage(24+ii))
+    # finally, write "output" to document-output.pdf
+    outputStream = file(out_fp, "wb")
+    output.write(outputStream)
+    outputStream.close()
+
+def jp22tif(src, dest):
+    '''Convert jpeg 2000 documents to tiff.
+
+    Requires imagemagick's convert command line utility.
+    
+    Remarks: for our purposes this has the disadvantage that a 1.5MB jp2 doc
+    becomes a 30MB tiff!
+    '''
+    cmd = 'convert %s %s' % (src, dest)
+    os.system(cmd)
+
+def get_jp2_fp(page_num):
+    num = str(page_num).rjust(4, '0')
+    fp = 'oed/oed01arch_jp2/oed01arch_%s.jp2' % num
+    return fp
+
+def tesseract():
+    pass
+
+def main():
+    usage = \
+'''%prog [options] 
+
+Process oed to plain text.'''
+    import optparse
+    parser = optparse.OptionParser(usage)
+    options, args = parser.parse_args()
+    if len(args) > 0:
+        pdf2tif(args[0])
+
+def test_pdf_route():
+    '''Produces unusable tiffs ...'''
+    in_fp = 'oed/oed01arch.pdf'
+    out_fp = os.path.join(PDF_DIR, 'chunk1.pdf')
+    chop_pdf(in_fp, out_fp)
+    pdf2tif(out_fp)
+
+def test_jp2_route():
+    src = get_jp2_fp(26)
+    dest = os.path.join(TIF_DIR, basename(src) + '.tif')
+    jp22tif(src,dest)
+
+if __name__ == '__main__':
+    test_jp2_route()
+
Tip: Filter by directory path e.g. /media app.js to search for public/media/app.js.
Tip: Use camelCasing e.g. ProjME to search for ProjectModifiedEvent.java.
Tip: Filter by extension type e.g. /repo .js to search for all .js files in the /repo directory.
Tip: Separate your search with spaces e.g. /ssh pom.xml to search for src/ssh/pom.xml.
Tip: Use ↑ and ↓ arrow keys to navigate and return to view the file.
Tip: You can also navigate files with Ctrl+j (next) and Ctrl+k (previous) and view the file with Ctrl+o.
Tip: You can also navigate files with Alt+j (next) and Alt+k (previous) and view the file with Alt+o.