Commits

Miki Tebeka committed a782937

Script to summarize, fix length edge case

  • Participants
  • Parent commits 474afac

Comments (0)

Files changed (1)

File summarize.py

 
 # Algorithm:
 # 1. For each word, caluclate it's frequency in the document
-# 2. For each sentence in the document 
+# 2. For each sentence in the document
 #       score(sentence) = sum([freq(word) for word in sentence])
 # 3. Print X top sentences such that their size < MAX_SUMMARY_SIZE
 # See http://en.wikipedia.org/wiki/Sentence_extraction for more
 
-__author__ = "Miki Tebeka <miki.tebeka@gmail.com>"
-__all__ =  [ "summarize" ]
+__author__ = 'Miki Tebeka <miki.tebeka@gmail.com>'
 
 from collections import defaultdict
-from itertools import repeat
 import re
 
 MAX_SUMMARY_SIZE = 300
 
+
 def tokenize(text):
     '''Very simple white space tokenizer, in real life we'll be much more
     fancy.
     '''
     return text.split()
 
+
 def split_to_sentences(text):
     '''Very simple spliting to sentences by [.!?] and paragraphs.
     In real life we'll be much more fancy.
     start = 0
 
     # We use finditer and not split since we want to keep the end
-    for match in re.finditer("(\s*[.!?]\s*)|(\n{2,})", text):
+    for match in re.finditer('(\s*[.!?]\s*)|(\n{2,})', text):
         sentences.append(text[start:match.end()].strip())
         start = match.end()
 
 
     return sentences
 
+
 def token_frequency(text):
     '''Return frequency (count) for each token in the text'''
-    frequencies = defaultdict(repeat(0).next)
+    frequencies = defaultdict(int)
     for token in tokenize(text):
         frequencies[token] += 1
 
     return frequencies
 
+
 def sentence_score(sentence, frequencies):
     return sum((frequencies[token] for token in tokenize(sentence)))
 
+
 def create_summary(sentences, max_length):
     summary = []
     size = 0
     for sentence in sentences:
+        summary.append(sentence)
         size += len(sentence)
         if size >= max_length:
             break
-        summary.append(sentence)
 
-    return "\n".join(summary)
+    summary = summary[:max_length]
+    return '\n'.join(summary)
+
 
 def summarize(text, max_summary_size=MAX_SUMMARY_SIZE):
     frequencies = token_frequency(text)
 
     return summary
 
-if __name__ == "__main__":
+
+def _test(size=MAX_SUMMARY_SIZE):
     text = '''
     Turkey has recalled its ambassador to Sweden after the Swedish Parliament
     voted to describe Turkey's killings of Armenians in World War I as
 
     The Swedish vote came despite the Swedish government's opposition to the
     resolution, as several parliament members crossed party lines in the vote,
-    which passed the resolution by a vote of 131-130, with 88 parliament members
-    absent. The Swedish government called the vote a "mistake," but added that
-    it will not influence their position on the matter. The Turkish government
-    released a statement saying, "our people and our government reject this
-    decision based upon major errors and without foundation," and Prime Minister
-    Tayyip Erdogan immediately cancelled a planned visit to Sweden. Despite the
-    reaction, Turkey said that the moves did "not correspond to the close
-    friendship of our two nations," and they were only recalling their
-    ambassador for consultations.
+    which passed the resolution by a vote of 131-130, with 88 parliament
+    members absent. The Swedish government called the vote a "mistake," but
+    added that it will not influence their position on the matter. The Turkish
+    government released a statement saying, "our people and our government
+    reject this decision based upon major errors and without foundation," and
+    Prime Minister Tayyip Erdogan immediately cancelled a planned visit to
+    Sweden. Despite the reaction, Turkey said that the moves did "not
+    correspond to the close friendship of our two nations," and they were only
+    recalling their ambassador for consultations.
 
     The resolution is particularly sensitive given that Sweden has long been a
     strong supporter of Turkey and their bid to join the European Union, and
     Turkey has been for years maintaining that their actions in World War I
     against Armenians did not amount to genocide. Despite Turkey's claims,
-    Armenians have been heavily campaigning for the killings to be recognized as
-    genocide, and many countries worldwide have done so.
+    Armenians have been heavily campaigning for the killings to be recognized
+    as genocide, and many countries worldwide have done so.
 
     Swedish Foreign Minister Carl Bildt said that the vote would likely have a
     significant effect on the fate of negotiations between Turkey and Armenia,
     which have been attempting to resume normal diplomatic relations since last
-    year. The Turkish ambassador that was recalled said that the vote would have
-    "drastic effects" on the negotiations, and it would have an impact for some
-    time.
+    year. The Turkish ambassador that was recalled said that the vote would
+    have "drastic effects" on the negotiations, and it would have an impact for
+    some time.
 
     The Swedish vote came not long after a similar vote by a US Congressional
-    panel, which also approved a resolution with similar terminology, leading to
-    the removal of Turkey's ambassador. In that case, the US government has been
-    trying to prevent the resolution from going further, in an attempt to limit
-    the consequences of the vote.'''
+    panel, which also approved a resolution with similar terminology, leading
+    to the removal of Turkey's ambassador. In that case, the US government has
+    been trying to prevent the resolution from going further, in an attempt to
+    limit the consequences of the vote.'''
 
-    print summarize(text)
+    print(summarize(text, size))
+
+
+def main(argv=None):
+    import sys
+    from argparse import ArgumentParser
+
+    argv = argv or sys.argv
+
+    parser = ArgumentParser(description='Summarize text')
+    parser.add_argument('file', nargs='?',
+        help='file to summarize (stdin will be used otherwise)')
+    parser.add_argument('-s', '--size', help='Size of summary',
+                        default=MAX_SUMMARY_SIZE, type=int)
+    parser.add_argument('--test', help='test')
+    args = parser.parse_args(argv[1:])
+
+    if args.test:
+        _test(args.size)
+        raise SystemExit
+
+    if args.file:
+        if args.file == '-':
+            fo = sys.stdin
+        else:
+            try:
+                fo = open(args.file)
+            except IOError as err:
+                raise SystemExit('error: cannot open {}: {}'.format(
+                    args.file, err))
+    else:
+        fo = sys.stdin
+
+    text = fo.read()
+    print(summarize(text, args.size))
+
+
+if __name__ == '__main__':
+    main()