Commits

Mikhail Korobov committed ea4ddac

further simplification: join annotations for joined tokens

Comments (0)

Files changed (3)

 instances, annotated with a list of ``ruscorpora.Annotation`` instances.
 
 ``ruscorpora.simplify`` simplifies a result of ``ruscorpora.parse_xml`` by
-removing ambiguous annotations, joining split tokens and removing accent
-information.
+removing ambiguous annotations, joining split tokens (together with their
+annotations) and removing accent information.
 
 ::
 

ruscorpora/__init__.py

 Token = namedtuple('Token', 'text annotations')
 Annotation = namedtuple('Annotation', 'lex gr joined')
 
+FlatToken = namedtuple('FlatToken', 'text lex gr joined')
+
 def parse_xml(source):
     """
     Parse XML file ``source`` (which can be obtained from ruscorpora.ru);
 
 
 def simplify(sents, remove_accents=True, join_split=True,
-             join_hyphenated=True, punct_tag='PNCT', wrap_tags=True):
+             join_hyphenated=True, punct_tag='PNCT', wrap_tags=True,
+             flat_tokens=True):
     """
     Simplify the result of ``sents`` parsing:
 
     * join hyphenated words to a single token (if ``join_hyphenated==True``);
     * remove accents (if ``remove_accents==True``);
     * convert string tag representation to ruscorpora.Tag instances
-      (if ``wrap_tags==True``).
+      (if ``wrap_tags==True``);
+    * return tokens as FlatToken instances (if ``flat_tokens==True``).
     """
 
     def remove_extra_annotations(token):
             return (token.text, [None])
         return (token.text, [token.annotations[-1]])
 
+    def _token_to_flat(token):
+        """
+        Collapse ``token`` (text + list of annotations) into a single
+        ``FlatToken(text, lex, gr, joined)``:
+
+        * no annotation (empty list or leading ``None``) - ``lex``, ``gr``
+          and ``joined`` are all ``None``;
+        * every part is joined 'together' - lexemes are concatenated and
+          the tag of the last part is used;
+        * exactly two parts joined with 'hyphen' - lexemes are joined
+          with "-"; the tag of the second part is used unless it is one of
+          PART, NUM=ciph, PR, in which case the first part's tag is used;
+        * anything else - the first annotation is used as is.
+        """
+        ann = token.annotations
+        # An empty annotation list is treated like a missing annotation
+        # instead of failing with IndexError on ``ann[0]``.
+        if not ann or ann[0] is None:
+            return FlatToken(token.text, None, None, None)
+
+        if all(a.joined == 'together' for a in ann):
+            return FlatToken(
+                token.text,
+                "".join(a.lex for a in ann),
+                ann[-1].gr,
+                'together'
+            )
+
+        if len(ann) == 2 and all(a.joined == 'hyphen' for a in ann):
+            ann1, ann2 = ann
+
+            # Tags of a second part that should not describe the whole
+            # hyphenated token; the first part's tag is used for them.
+            if str(ann2.gr) in ('PART', 'NUM=ciph', 'PR'):
+                tag = ann1.gr
+            else:
+                tag = ann2.gr
+
+            return FlatToken(
+                token.text,
+                "-".join([ann1.lex, ann2.lex]),
+                tag,
+                'hyphen'
+            )
+
+        return FlatToken(token.text, ann[0].lex, ann[0].gr, ann[0].joined)
+
     def _combine_tokens(tokens):
         text = "".join(t[0] for t in tokens)
         annotations = [ann for t in tokens for ann in t[1] if ann]
         if wrap_tags:
             sent = with_wrapped_tags(sent)
 
-        yield [Token(*t) for t in sent]
+        sent = [Token(*t) for t in sent]
+        if flat_tokens:
+            sent = [_token_to_flat(t) for t in sent]
+
+        yield sent
+
+
+
+def parse_simple(source, **simplify_kwargs):
+    """
+    Parse XML file ``source`` with ``parse_xml`` and pass the result
+    through ``simplify``; ``simplify_kwargs`` are forwarded to
+    ``simplify`` unchanged.
+    """
+    sents = parse_xml(source)
+    return simplify(sents, **simplify_kwargs)
 
 
 if __name__ == '__main__':
     import sys
-    for sent in simplify(parse_xml(sys.argv[1])):
+    for sent in parse_simple(sys.argv[1]):
         for tok in sent:
             print(tok)
         print("\n")

tests/test_reader.py

 
     assert _parse(corpus) == [
         [
-            ('«', [rnc.Annotation(lex='«', gr='PNCT', joined=None)]),
-            ('Школа', [rnc.Annotation(lex='школа', gr='S,f,inan=sg,nom', joined=None)]),
-            ('злословия', [rnc.Annotation(lex='злословие', gr='S,n,inan=sg,gen', joined=None)]),
-            (' » ,-', [rnc.Annotation(lex=' » ,-', gr='PNCT', joined=None)]),
-            ('СМИ', [rnc.Annotation(lex='сми', gr='S,0=sg,nom', joined=None)]),
-            (' !', [rnc.Annotation(lex=' !', gr='PNCT', joined=None)])
+            ('«', '«', 'PNCT', None),
+            ('Школа', 'школа', 'S,f,inan=sg,nom', None),
+            ('злословия', 'злословие', 'S,n,inan=sg,gen', None),
+            (' » ,-', ' » ,-', 'PNCT', None),
+            ('СМИ', 'сми', 'S,0=sg,nom', None),
+            (' !', ' !', 'PNCT', None)
         ]
     ]
 
     <w><ana lex="Сегодня" gr="ADV" joined="hyphen"></ana>Сег`одня</w>-<w><ana lex="завтра" gr="ADV" joined="hyphen"></ana>з`автра</w>
     <w><ana lex="школа" gr="S,f,inan=sg,nom"></ana>шк`ола</w></se>
     """
-    assert _parse(corpus) == [
-        [
-            ('Сегодня-завтра', [
-                rnc.Annotation(lex='Сегодня', gr='ADV', joined='hyphen'),
-                rnc.Annotation(lex='завтра', gr='ADV', joined='hyphen')]),
-            ('школа', [rnc.Annotation(lex='школа', gr='S,f,inan=sg,nom', joined=None)])
-        ]
+    parsed = _parse(corpus)
+    assert len(parsed) == 1
+    assert parsed[0] == [
+        ('Сегодня-завтра', 'Сегодня-завтра', 'ADV', 'hyphen'),
+        ('школа', 'школа', 'S,f,inan=sg,nom', None),
     ]
 
 
+def test_joined_hyphen_complex():
+    # Hyphen-joined pairs must come out as one flat token each: lexemes
+    # joined with "-", tag taken from the second part, except for the
+    # last pair where the second part is a particle (PART) and the tag
+    # of the first part is expected.
+    corpus = """
+    <se>
+    <w><ana lex="певец" gr="S,m,anim=pl,ins" joined="hyphen"></ana>певц`ами</w>-<w><ana lex="солист" gr="S,m,anim=pl,ins" joined="hyphen"></ana>сол`истами</w> ,
+    <w><ana lex="интернет" gr="S,m,inan=sg,nom" joined="hyphen"></ana>интерн`ет</w>-<w><ana lex="торговля" gr="S,f,inan=sg,gen" joined="hyphen"></ana>торг`овли</w>
+    <w><ana lex="они" gr="S-PRO,pl,3p=gen" joined="hyphen"></ana>их</w>-<w><ana lex="то" gr="PART" joined="hyphen"></ana>то</w></se>
+    """
+    expected = [
+        ('певцами-солистами', 'певец-солист', 'S,m,anim=pl,ins', 'hyphen'),
+        (' ,', ' ,', 'PNCT', None),
+        ('интернет-торговли', 'интернет-торговля', 'S,f,inan=sg,gen', 'hyphen'),
+        ('их-то', 'они-то', 'S-PRO,pl,3p=gen', 'hyphen'),
+    ]
+
+    sents = _parse(corpus)
+    assert len(sents) == 1
+    assert sents[0] == expected
+
+
+
 def test_joined_together():
     corpus = """
     <se>
     """
     assert _parse(corpus) == [
         [
-            ('Злословия', [rnc.Annotation(lex='злословие', gr='S,n,inan=sg,gen', joined=None)]),
-            (' -', [rnc.Annotation(lex=' -', gr='PNCT', joined=None)]),
-            ('полдюжины', [
-                rnc.Annotation(lex='пол', gr='NUM', joined='together'),
-                rnc.Annotation(lex='дюжина', gr='S,f,inan=sg,gen', joined='together')])
+            ('Злословия', 'злословие', 'S,n,inan=sg,gen', None),
+            (' -', ' -', 'PNCT', None),
+            ('полдюжины', 'полдюжина', 'S,f,inan=sg,gen', 'together')
         ]
     ]