Commits

yanchuan sim  committed efe72b0

added filter_rare_terms

  • Participants
  • Parent commits 8faa5a9

Comments (0)

Files changed (1)

File ycutils/bagofwords.py

     return self
   #end def
 
+  def filter_rare_terms(self, limit):
+    """
+    Filter rare terms from the :class:`BOW`.
+    If :attr:`limit` > 1, terms that appear < :attr:`limit` will be removed.
+    If 0 < :attr:`limit` < 1, the rarest :attr:`limit` * 100 percent of the :class:`BOW` (in terms of counts) will be removed.
+
+    :param limit: the cutoff for removing rare terms.
+
+    :returns: number of items removed.
+    """
+
+    count = 0
+    if limit > 1.0:
+      for w, c in self.items():
+        if c < limit:
+          count += 1
+          del self[w]
+        #end if
+      #end for
+
+    elif limit < 1.0 and limit >= 0:
+      n = int(math.floor(limit * len(self)))
+      for w, c in self.most_common()[:-n:-1]:
+        count += 1
+        del self[w]
+      #end for
+    #end if
+
+    return count
+  #end def
+
   def add_tokens(self, tokens):
     """Adds a list of tokenized words to the bag of words collections.