Commits

James committed 6f849ee

Merged utils written for entity matching into spyda.utils and removed a bunch of ad-hoc testing scripts

  • Parent commits 4368238

Files changed (6)

File merge.py

-#!/usr/bin/env python
-
-from glob import glob
-from json import dumps, loads
-
-
-files = glob("tmp/*.json")
-
-with open("data.json", "wb") as f:
-    f.write("[{0:s}]".format(",".join([dumps({"result": loads(open(file, "rb").read())}) for file in files])))
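
For the record, that deleted one-liner wrote every tmp/*.json payload into a single data.json array, each wrapped under a "result" key. A more explicit equivalent (same output, modulo whitespace) would be:

    from glob import glob
    from json import dumps, loads

    results = []
    for path in glob("tmp/*.json"):
        with open(path, "rb") as f:
            # Wrap each payload under a "result" key, as the one-liner did.
            results.append({"result": loads(f.read())})

    with open("data.json", "wb") as f:
        f.write(dumps(results))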

File nonpeople.py

-#!/usr/bin/env python
-
-from glob import glob
-from json import loads
-
-
-datum = (loads(open(f, "rb").read()) for f in glob("tmp/*.json"))
-articles = [d["_source"] for d in datum if not d["people"]]
-print(len(articles))

File spyda/matcher.py

 from multiprocessing.pool import Pool
 
 from . import __version__
-from .utils import is_url
 from . import fetch_url, log
-from utils import get_close_matches
+from .utils import is_url, get_close_matches
 
 USAGE = "%prog [options] [ data | url ] [ sources ]"
 VERSION = "%prog v" + __version__

File spyda/utils.py

 import re
 import htmlentitydefs
+from heapq import nlargest
+from difflib import SequenceMatcher
+from csv import DictReader, Sniffer
 
 
 UNICHAR_REPLACEMENTS = (
         text = text.replace(*replacement)
 
     return text
+
+
+def csv_to_dictlist(csvfile):
+    with open(csvfile, "rb") as f:
+        dialect = Sniffer().sniff(f.read(1024))
+        f.seek(0)
+        return list(dict((k.strip(), v.strip()) for k, v in d.items()) for d in DictReader(f, dialect=dialect))
+
+
+def get_close_matches(word, possibilities, n=3, cutoff=0.6):
+    """Use SequenceMatcher to return list of close matches.
+
+    word is a sequence for which close matches are desired (typically a string).
+
+    possibilities is a list of sequences against which to match word (typically a list of strings).
+
+    Optional arg n (default 3) is the maximum number of close matches to return. n must be > 0.
+
+    Optional arg cutoff (default 0.6) is a float in [0.0, 1.0].
+    Possibilities that don't score at least that similar to word are ignored.
+
+    The best (no more than n) matches among the possibilities are returned
+    in a list, sorted by similarity score, most similar first.
+
+    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
+    ['apple', 'ape']
+    >>> import keyword as _keyword
+    >>> get_close_matches("wheel", _keyword.kwlist)
+    ['while']
+    >>> get_close_matches("apple", _keyword.kwlist)
+    []
+    >>> get_close_matches("accept", _keyword.kwlist)
+    ['except']
+    """
+
+    if not n > 0:
+        raise ValueError("n must be > 0: %r" % (n,))
+    if not 0.0 <= cutoff <= 1.0:
+        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
+
+    result = []
+    s = SequenceMatcher()
+    s.set_seq2(word)
+    for x in possibilities:
+        s.set_seq1(x)
+        if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff:
+            result.append((s.ratio(), x))
+
+    # Keep only the n best scorers, most similar first, then strip the scores.
+    return [x for score, x in nlargest(n, result)]
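
A minimal usage sketch of the two helpers now living in spyda.utils — the people.csv filename and its given_name/family_name columns are illustrative, borrowed from the field names test_match.py used:

    from spyda.utils import csv_to_dictlist, get_close_matches

    # Each CSV row becomes a dict with whitespace-stripped keys and values.
    records = csv_to_dictlist("people.csv")
    names = ["{0:s} {1:s}".format(r["given_name"], r["family_name"]) for r in records]

    # Closest matches come back first, e.g. ['John Smith'] for a near-miss query.
    print(get_close_matches("Jon Smith", names))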

File test_match.py

-#!/usr/bin/env python
-
-from json import loads
-from operator import itemgetter
-
-from fuzzywuzzy.process import extract
-from nltk.metrics import edit_distance, masi_distance
-
-from utils import csv_to_dictlist, get_close_matches
-
-records = loads(open("data.json", "rb").read())
-keys = [("preferred_name", "family_name"), ("given_name", "family_name")]
-namesets = list(dict(("{0:s} {1:s}".format(*itemgetter(*k)(record)), record["uri"]) for record in records) for k in keys)
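
The deleted script stopped short of the actual lookup step. A hedged sketch of how that matching would proceed against its namesets (one name -> uri dict per key pair), with "Jane Doe" as a hypothetical query and the helper now imported from its new home:

    from spyda.utils import get_close_matches

    # namesets, as built above: each maps "given family" name -> record uri.
    for nameset in namesets:
        for name in get_close_matches("Jane Doe", list(nameset)):
            # Resolve each matched name back to its record URI.
            print("{0:s} -> {1:s}".format(name, nameset[name]))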

File utils.py

-#!/usr/bin/env python
-
-from heapq import nlargest
-from difflib import SequenceMatcher
-from csv import DictReader, Sniffer
-
-
-def csv_to_dictlist(csvfile):
-    with open(csvfile, "rb") as f:
-        dialect = Sniffer().sniff(f.read(1024))
-        f.seek(0)
-        return list(dict((k.strip(), v.strip()) for k, v in d.items()) for d in DictReader(f, dialect=dialect))
-
-
-def get_close_matches(word, possibilities, n=3, cutoff=0.6):
-    """Use SequenceMatcher to return list of close matches.
-
-    word is a sequence for which close matches are desired (typically a string).
-
-    possibilities is a list of sequences against which to match word (typically a list of strings).
-
-    Optional arg n (default 3) is the maximum number of close matches to return. n must be > 0.
-
-    Optional arg cutoff (default 0.6) is a float in [0.0, 1.0].
-    Possibilities that don't score at least that similar to word are ignored.
-
-    The best (no more than n) matches among the possibilities are returned
-    in a list, sorted by similarity score, most similar first.
-
-    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
-    ['apple', 'ape']
-    >>> import keyword as _keyword
-    >>> get_close_matches("wheel", _keyword.kwlist)
-    ['while']
-    >>> get_close_matches("apple", _keyword.kwlist)
-    []
-    >>> get_close_matches("accept", _keyword.kwlist)
-    ['except']
-    """
-
-    if not n > 0:
-        raise ValueError("n must be > 0: %r" % (n,))
-    if not 0.0 <= cutoff <= 1.0:
-        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
-
-    result = []
-    s = SequenceMatcher()
-    s.set_seq2(word)
-    for x in possibilities:
-        s.set_seq1(x)
-        if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff:
-            result.append((s.ratio(), x))
-
-    # Keep only the n best scorers, most similar first, then strip the scores.
-    return [x for score, x in nlargest(n, result)]
-
-
-def _test():
-    import doctest
-    return doctest.testmod()
-
-
-if __name__ == "__main__":
-    _test()