Commits

Anonymous committed e5403c4

sorting and other small fixes

Comments (0)

Files changed (3)

 
+__author__ = "Piotr Findeisen <piotr.findeisen@gmail.com>"
+
 import os, sys, unicodedata, codecs, signal
 from functools import wraps
 import version

src/unicode_blocks.py

 
+__author__ = "Piotr Findeisen <piotr.findeisen@gmail.com>"
+
 from bisect import bisect_left
+import traceback
+
 from helpers import *
 
 
 class BlockName(object):
-    def __init__(self, name):
+    def __init__(self, name, id):
         self.name = name
+        self.id = id
 
 _initialized = False
-fallback = BlockName("No_Block")
+fallback = BlockName("No_Block", 0)
 block_names = []
 
 def initialize(path, options):
     global _initialized
-    if _initialized: return
+    if _initialized:
+        return
+
+    block_id = 0
     for line in definition_file_xreadlines(path):
+        block_id += 1
         try:
             range, colon, name = line.partition(';')
             assert colon == ';'
             lo, _, hi = range.partition('..')
             assert hi is not None
             lo, hi = int(lo, 16), int(hi, 16)
-            block_names.append((lo, hi, BlockName(name.strip())))
+            block_names.append((lo, hi, BlockName(name.strip(), block_id)))
 
         except Exception:
-            import traceback; traceback.print_exc()
+            traceback.print_exc()
     _initialized = True
 
 def isinitialized():
 
 def block(unichr):
     assert len(unichr) == 1
-    if not block_names: return fallback
+    if not block_names:
+        return fallback
     cp = ord(unichr)
     j = bisect_left(block_names, (cp, None, None))
     if j > 0: j -= 1

src/unihistext.py

-#!/usr/bin/env python
 
 __author__ = "Piotr Findeisen <piotr.findeisen@gmail.com>"
 
 import unicodedata
 from itertools import imap
 import codecs
-from helpers import * 
+import operator
+
+from helpers import *
+import unicode_blocks
 
 @make_main
 def main():
     parser = OptionParser()
     parser.add_option("-i", "--input", dest="input", help="read Unicode stream from FILE ('-' means stdin, this is the default)", metavar="FILE", default="-")
     parser.add_option("-f", "--encoding", help="set input stream binary encoding ('utf-8' is the default)", default='utf-8')
-    parser.add_option("-l", "--list-encodings", help="list some available encodings and exit", default=False, action="store_true")
+    parser.add_option("-l", "--list-encodings", help="list available encodings and exit", default=False, action="store_true")
     parser.add_option("-V", "--version", help="print version and exit", default=False, action="store_true")
     parser.add_option("-c", "--combining", help="recognize combining character sequences", default=False, action="store_true")
     parser.add_option("-n", "--names", help="print names of Unicode characters or sequences", default=False, action="store_true")
     parser.add_option("-S", "--sequence-names-file", help="use file in format of NamedSequences.txt from Unicode instead of system default",
             default="/usr/share/unicode/NamedSequences.txt", metavar="FILE")
-    parser.add_option("--only-combining", help="print only combining character sequences", default=False, action="store_true")
+    parser.add_option("-C", "--only-combining", help="print only combining character sequences", default=False, action="store_true")
     parser.add_option("-b", "--blocks", help="make statistics of Unicode blocks instead of separate code points", default=False, action="store_true")
     parser.add_option("--blocks-definitions", help="read blocks definitions from FILE", metavar="FILE",
             default="/usr/share/unicode/Blocks.txt")
     parser.add_option("-B", "--filter-block", help="make statistics only for caracters in BLOCK_NAME as reported by --blocks; " + \
             "block names are case insensitive (may be repeated)", metavar="BLOCK_NAME", action="append")
+    parser.add_option("-s", "--sort",
+        help="sort statistics by METHOD which is 'block', 'code'" +
+            " or 'frequency' (default)",
+        choices=('block', 'code', 'frequency'), metavar="METHOD",
+        default='frequency')
 
     run(*parser.parse_args())
     parser.destroy()
                 sys.stdout.write(encs[pos].ljust(maxlen))
         sys.stdout.write("\n")
     print
-    print "Any encoding supported by Python's codecs module is suported."
+    print "Any encoding supported by Python's codecs module is supported."
     print "For a complete list of supported encodings"
     print "visit http://www.google.com/search?q=python+standard+encodings."
     print
     return unichr, line, i
 
 def make_block_abstracter(options):
-    import unicode_blocks
     unicode_blocks.initialize(options.blocks_definitions, options)
     return lambda unichr, line, i: (unicode_blocks.block(unichr), line, i)
 
 def make_block_filter(options):
-    import unicode_blocks
     unicode_blocks.initialize(options.blocks_definitions, options)
     names = [n.lower().strip() for n in options.filter_block]
     return lambda unichr, line, i: (unicode_blocks.block(unichr).name.lower().strip() in names)
 
 def make_stats(input, options, args):
-    if options.combining and options.blocks: die("You cannot use --combining and --blocks together.")
-    if options.combining: glue_combinings_ = glue_combinings
-    else: glue_combinings_ = lambda *args: args
+    if options.combining and options.blocks:
+        die("You cannot use --combining and --blocks together.")
+    if options.combining:
+        glue_combinings_ = glue_combinings
+    else:
+        glue_combinings_ = lambda *args: args
 
     if options.blocks: abstract_block = make_block_abstracter(options)
     else: abstract_block = lambda *args: args
 
 class BlockNameFmt(Formatter):
     def __init__(self):
-        import unicode_blocks
         assert unicode_blocks.isinitialized()
-        self.unicode_blocks = unicode_blocks
 
     def fmt(self, d, totals):
         return " %s " % d['unistr'].name
     if not stats:
         print >> sys.stderr, "Empty input or all Unicode code points filtered out."
         return
-    second = lambda t: t[1]
-    stats.sort(key = second, reverse=True) # sort by occurrances
+    if options.sort == 'frequency':
+        stats.sort(key=operator.itemgetter(1), reverse=True) # sort by occurrences
+    elif options.sort == 'code':
+        if options.blocks:
+            die("You cannot use --blocks and --sort 'code' together.")
+        stats.sort(key=operator.itemgetter(0), reverse=False)
+    elif options.sort == 'block':
+        if options.blocks:
+            assert unicode_blocks.isinitialized()
+            stats.sort(key=lambda t: t[0].id, reverse=False)
+        elif options.combining:
+            die("You cannot use --combining and --sort 'block' together.")
+        else:
+            unicode_blocks.initialize(options.blocks_definitions, options)
+            stats.sort(key=lambda t: unicode_blocks.block(t[0]).id, reverse=False)
+    else:
+        die("Sorting method %r is not implemented" % options.sort)
     totals = {}
     stats = [ {'unistr': unistr, 'count': count } for (unistr, count) in stats ] # convert each entry to dict
 
     fmts = [StatFmt()]
-    if not options.blocks: fmts.extend([HexFmt(), UnistrFmt()])
-    else: fmts.append(BlockNameFmt())
+    if not options.blocks:
+        fmts.extend([HexFmt(), UnistrFmt()])
+    else:
+        fmts.append(BlockNameFmt())
 
     if options.names:
         if options.blocks: die("You cannot use --names and --blocks together.")