Commits

Kelsey Hightower committed fb1b7fb

first commit

  • Participants

Comments (0)

Files changed (1)

File ldifparse.py

+import os
+import re
+import sys
+
+
# Entries are separated by one or more blank (or whitespace-only) lines.
_ENTRY_PATTERN = re.compile(r'\n\s*\n')


def parse_ldif1(path):
    """Yield one dict per entry of the LDIF file at *path*.

    The whole file is read into memory and split into entries on blank
    lines.  Each entry is returned as a dict mapping an attribute name to
    the list of its values, in file order, with surrounding whitespace
    stripped from every value.

    Only the first ``:`` on a line separates the attribute from its value,
    so values that themselves contain colons (URLs, for example) are kept
    intact.

    Args:
        path: Path of the UTF-8 encoded LDIF file to read.

    Yields:
        dict: ``{attribute: [value, ...]}`` for every non-empty entry.

    Raises:
        ValueError: If a non-blank line contains no ``:`` separator.
    """
    # The fastest solution, but a memory hog.
    # Uses 5 to 6 times more memory than both `parse_ldif2`
    # and `parse_ldif3`.
    with open(path, 'r', encoding='utf-8') as ldif:
        records = _ENTRY_PATTERN.split(ldif.read())
    # The file is fully read at this point, so it is already closed while
    # the caller consumes the generator.
    for record in records:
        _record = {}
        for row in record.split('\n'):
            if not row.strip():
                # Leading/trailing newlines leave empty rows behind.
                continue
            attribute, separator, value = row.partition(':')
            if not separator:
                raise ValueError(
                    "malformed LDIF line (no ':'): {!r}".format(row))
            _record.setdefault(attribute, []).append(value.strip())
        if _record:
            yield _record
+
+
# "attribute: value" line; the trailing newline is optional so that a final
# line without a line terminator is still recognised.
_LDIF_E = re.compile(r'^(.*?): (.*)\n?')
# A blank line, which terminates the current entry.
_LDIF_B = re.compile(r'^\n')


def parse_ldif2(ldif):
    """Yield one dict per LDIF entry read from the iterable of lines *ldif*.

    Lines of the form ``attribute: value`` are collected into a dict that
    maps each attribute to the list of its values in input order.  A blank
    line ends the current entry.  Lines matching neither pattern are
    ignored.

    Args:
        ldif: Iterable of text lines (for example an open text file).

    Yields:
        dict: ``{attribute: [value, ...]}`` for every non-empty entry,
        including a final entry that is not followed by a blank line.
    """
    # Python regexes are SLOW.
    # This solution is about 3 - 10 times slower depending on the
    # dataset. Uses slightly more memory than `parse_ldif3`.
    row = {}
    for line in ldif:
        match = _LDIF_E.match(line)
        if match:
            attribute, value = match.groups()
            row.setdefault(attribute, []).append(value)
        elif _LDIF_B.match(line):
            # Consecutive blank lines must not produce empty entries.
            if row:
                yield row
            row = {}
    # Emit the last entry when the input does not end with a blank line.
    if row:
        yield row
+
def parse_ldif3(ldif):
    """Yield one dict per LDIF entry read from the iterable of lines *ldif*.

    Each non-blank line is split at its first ``:`` into an attribute and
    a value; values are stripped of surrounding whitespace and collected,
    in input order, into a list per attribute.  A blank (or
    whitespace-only) line ends the current entry.

    Args:
        ldif: Iterable of text lines (for example an open text file).

    Yields:
        dict: ``{attribute: [value, ...]}`` for every non-empty entry,
        including a final entry that is not followed by a blank line.

    Raises:
        ValueError: If a non-blank line contains no ``:`` separator.
    """
    # Slightly slower than `parse_ldif1` and uses the least
    # amount of memory.
    row = {}
    for line in ldif:
        if not line.strip():
            # Entry separator; repeated separators must not emit empty dicts.
            if row:
                yield row
            row = {}
            continue
        # Split on the first colon only so values may contain colons.
        attribute, separator, value = line.partition(':')
        if not separator:
            raise ValueError(
                "malformed LDIF line (no ':'): {!r}".format(line))
        row.setdefault(attribute, []).append(value.strip())
    # Emit the last entry when the input does not end with a blank line.
    if row:
        yield row
+
+
if __name__ == '__main__':
    # Script entry point: print every entry of the LDIF file named on the
    # command line, one dict per line.
    if len(sys.argv) < 2:
        print("usage: {} LDIF_FILE".format(sys.argv[0]), file=sys.stderr)
        sys.exit(1)
    # Kept as a module-level name: `parse_ldif1` historically read it.
    LDIF = sys.argv[1]
    if not os.path.isfile(LDIF):
        print("{} missing".format(LDIF))
        sys.exit(1)
    # The context manager closes the file even if parsing or printing fails.
    with open(LDIF, 'r', encoding='utf-8') as ldif:
        for e in parse_ldif3(ldif):
            print(e)