Source

htsql / src / htsql / core / classify.py

Full commit
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
#
# Copyright (c) 2006-2012, Prometheus Research, LLC
#


from .context import context
from .cache import once
from .adapter import Adapter, adapt
from .model import (Node, Arc, Label, HomeNode, TableNode, TableArc, ChainArc,
                    ColumnArc, SyntaxArc, AmbiguousArc)
from .entity import DirectJoin, ReverseJoin
from .introspect import introspect
import re
import unicodedata


def normalize(name):
    """
    Normalizes a name to provide a valid HTSQL identifier.

    We assume `name` is a Unicode string.  Then it is:

    - translated to Unicode normal form C;
    - converted to lowercase;
    - has non-alphanumeric characters replaced with underscores;
    - preceded with an underscore if it starts with a digit.

    The result is a valid HTSQL identifier.
    """
    assert isinstance(name, unicode) and len(name) > 0
    name = unicodedata.normalize('NFC', name).lower()
    name = re.sub(ur"(?u)^(?=\d)|\W", u"_", name)
    return name


class Classify(Adapter):
    """
    Assigns names to the arcs that originate from a node.

    Calling the adapter returns a list of `Label` objects.  Each arc
    found by `Trace` places bids (pairs `(name, weight)`) obtained from
    `Call`; bids are served from the heaviest to the lightest, and a
    name with a given arity is granted to an arc only when that arc is
    the sole contender and has not been named yet.  Contested names are
    reported as labels that wrap an `AmbiguousArc`.
    """

    adapt(Node)

    def __init__(self, node):
        self.node = node

    def __call__(self):
        arcs = self.trace(self.node)

        # Collect the bids placed by each arc.
        bids_by_arc = dict((arc, self.call(arc)) for arc in arcs)

        # Index the bids: names grouped by weight, arcs grouped by bid.
        names_by_weight = {}
        arcs_by_bid = {}
        for arc in arcs:
            for name, weight in bids_by_arc[arc]:
                names_by_weight.setdefault(weight, set()).add(name)
                arcs_by_bid.setdefault((name, weight), []).append(arc)

        winner_by_signature = {}
        name_by_arc = {}
        losers_by_signature = {}

        # Serve heavier bids first; within the same weight, shorter names
        # go first and ties are broken alphabetically.
        for weight in sorted(names_by_weight, reverse=True):
            ranked_names = sorted(names_by_weight[weight],
                                  key=(lambda text: (len(text), text)))
            for name in ranked_names:
                # Arcs with different arities do not compete with each other.
                by_arity = {}
                for arc in arcs_by_bid[name, weight]:
                    by_arity.setdefault(arc.arity, []).append(arc)
                for arity in sorted(by_arity):
                    signature = (name, arity)
                    if signature in winner_by_signature:
                        # Already granted to an arc by an earlier bid.
                        continue
                    contenders = by_arity[arity]
                    contested = (len(contenders) > 1 or
                                 signature in losers_by_signature)
                    if contested:
                        losers_by_signature.setdefault(signature, []) \
                                           .extend(contenders)
                        continue
                    arc = contenders[0]
                    if arc in name_by_arc:
                        # The arc got a name from an earlier bid; record
                        # this signature as rejected instead.
                        losers_by_signature[signature] = [arc]
                        continue
                    winner_by_signature[signature] = arc
                    name_by_arc[arc] = name

        # Labels for the named arcs, in the order the arcs were traced.
        labels = [Label(name_by_arc[arc], arc, False)
                  for arc in arcs if arc in name_by_arc]

        # Labels for the rejected signatures, each wrapping the distinct
        # contenders in the order they were recorded.
        for signature in sorted(losers_by_signature):
            name, arity = signature
            alternatives = []
            seen = set()
            for arc in losers_by_signature[signature]:
                if arc not in seen:
                    seen.add(arc)
                    alternatives.append(arc)
            ambiguous = AmbiguousArc(arity, alternatives)
            labels.append(Label(name, ambiguous, False))

        return self.order(labels)

    def trace(self, node):
        # Returns the arcs produced by `Trace` for the node with
        # duplicates removed; the first occurrence of each arc is kept.
        unique_arcs = []
        seen = set()
        for arc in Trace.__invoke__(node):
            if arc not in seen:
                seen.add(arc)
                unique_arcs.append(arc)
        return unique_arcs

    def call(self, arc):
        # Returns the bids produced by `Call` for the arc, with names
        # normalized and duplicate `(name, weight)` pairs removed.
        unique_bids = []
        seen = set()
        for name, weight in Call.__invoke__(arc):
            bid = (normalize(name), weight)
            if bid not in seen:
                seen.add(bid)
                unique_bids.append(bid)
        return unique_bids

    def order(self, labels):
        # Delegates the final arrangement of labels to `Order`.
        return Order.__invoke__(self.node, labels)


class Trace(Adapter):
    """
    Produces the arcs that originate from a node.

    This base implementation is declared for any `Node` and produces
    no arcs; subclasses declared for specific node types override
    `__call__` to generate them.
    """

    adapt(Node)

    def __init__(self, node):
        self.node = node

    def __call__(self):
        # A generic node has no known outgoing arcs.
        return []


class TraceHome(Trace):
    """
    Produces the arcs that originate from the home node.

    Generates a `TableArc` for every table of every schema found in
    the catalog returned by `introspect()`.
    """

    adapt(HomeNode)

    def __call__(self):
        for schema in introspect().schemas:
            for table in schema.tables:
                yield TableArc(table)


class TraceTable(Trace):
    """
    Produces the arcs that originate from a table node.

    Generates, in order:

    - a `ColumnArc` for each column of the table;
    - a `ChainArc` with a direct join for each foreign key of the table;
    - a `ChainArc` with a reverse join for each foreign key that refers
      to the table.
    """

    adapt(TableNode)

    def __call__(self):
        table = self.node.table
        for column in table.columns:
            link = self.find_link(column)
            yield ColumnArc(table, column, link)
        for foreign_key in table.foreign_keys:
            join = DirectJoin(foreign_key)
            yield ChainArc(table, [join])
        for foreign_key in table.referring_foreign_keys:
            join = ReverseJoin(foreign_key)
            yield ChainArc(table, [join])

    def find_link(self, column):
        """
        Determines if the column may represent a link to another table.

        This is the case when the column is the only origin column of
        some foreign key.  Returns `None` if there is no such foreign
        key, a `ChainArc` if there is exactly one, and an `AmbiguousArc`
        wrapping all candidate arcs otherwise.
        """
        # Get a list of single-column foreign keys associated with
        # the given column.
        candidates = [foreign_key for foreign_key in column.foreign_keys
                                  if len(foreign_key.origin_columns) == 1]

        # Return immediately if there are no candidate keys.
        if not candidates:
            return None

        # Generate the joins corresponding to each alternative.
        alternatives = []
        for foreign_key in candidates:
            join = DirectJoin(foreign_key)
            arc = ChainArc(column.table, [join])
            alternatives.append(arc)

        # We got an unambiguous link if there's only one foreign key
        # associated with the column.
        if len(alternatives) == 1:
            return alternatives[0]

        # `AmbiguousArc` takes the arity as its first argument (see the
        # other call in this module); omitting it would raise `TypeError`.
        # All alternatives are chain arcs built the same way, so the arity
        # of the first one is used for the wrapper.
        return AmbiguousArc(alternatives[0].arity, alternatives)


class Call(Adapter):
    """
    Produces the names an arc bids for.

    Implementations return or generate pairs `(name, weight)`.  This
    base implementation is declared for any `Arc` and produces no bids.
    """

    adapt(Arc)

    def __init__(self, arc):
        self.arc = arc

    def __call__(self):
        # A generic arc proposes no names.
        return []


class CallTable(Call):
    """
    Produces the names for a table arc.

    Bids for the table name with the weight equal to the priority of
    the table schema and, when the schema has a name, for the name
    `"<schema> <table>"` with the weight `-1`.
    """

    adapt(TableArc)

    def __call__(self):
        table = self.arc.table
        schema = table.schema
        yield table.name, schema.priority
        if schema.name:
            qualified_name = u"%s %s" % (schema.name, table.name)
            yield qualified_name, -1


class CallColumn(Call):
    """
    Produces the names for a column arc.

    Bids for the column name with the weight `10`.
    """

    adapt(ColumnArc)

    def __call__(self):
        column = self.arc.column
        yield column.name, 10


class CallChain(Call):
    """
    Produces the names for a chain arc (a sequence of joins).

    The bids, from the heaviest to the lightest, are:

    - weight 5: the foreign key column prefix (direct single joins only);
    - weight 4 or 3: the name of the target table; 4 is used when, for
      every join, all origin columns of the foreign key belong to the
      primary key of the origin table;
    - weight 2: `"<target> via <prefix>"` (non-direct single joins only);
    - weight 1: `"<target> via <column>"` (non-direct single joins only).
    """

    adapt(ChainArc)

    # The word that separates the target name from the link name.
    path_word = u"via"

    def __call__(self):
        joins = self.arc.joins

        # Check that each foreign key is covered by the primary key
        # of its origin table.
        is_primary = True
        for join in joins:
            key = join.foreign_key
            primary = key.origin.primary_key
            if primary is None or any(column not in primary.origin_columns
                                      for column in key.origin_columns):
                is_primary = False
                break

        is_direct = all(step.is_direct for step in joins)

        target = self.arc.target.table.name
        prefix = None
        column = None
        if len(joins) == 1:
            # For a single join, derive names from the last column
            # of the foreign key.
            key = joins[0].foreign_key
            origin_name = key.origin_columns[-1].name
            target_name = key.target_columns[-1].name
            if origin_name.endswith(target_name):
                # Strip the target column name and any separators left
                # at the end; fall back to the target table name when
                # nothing remains.
                stem = origin_name[:-len(target_name)]
                prefix = stem.rstrip(u' _-')
                if not prefix:
                    prefix = target
            column = origin_name

        if is_direct and prefix:
            yield prefix, 5
        yield target, (4 if is_primary else 3)
        if not is_direct:
            if prefix:
                yield u"%s %s %s" % (target, self.path_word, prefix), 2
            if column:
                yield u"%s %s %s" % (target, self.path_word, column), 1


class CallSyntax(Call):
    # Declared for `SyntaxArc`; defines no `__call__` of its own, so
    # the behavior of `Call` (no bids) is inherited.

    adapt(SyntaxArc)


class Order(Adapter):
    """
    Arranges the labels produced for a node.

    Takes the node and the list of labels; this base implementation,
    declared for any `Node`, returns the labels unchanged.
    """

    adapt(Node)

    def __init__(self, node, labels):
        self.node = node
        self.labels = labels

    def __call__(self):
        # Keep the labels exactly as they were given.
        return self.labels


class OrderHome(Order):
    # Declared for `HomeNode`; defines no `__call__` of its own, so
    # the labels are returned unchanged as in `Order`.

    adapt(HomeNode)


class OrderTable(Order):
    """
    Arranges the labels produced for a table node.

    Returns a clone of each label, in the original order; a label is
    marked public if it was public already or if it refers to a column
    arc.
    """

    adapt(TableNode)

    def __call__(self):
        arranged = []
        for label in self.labels:
            is_public = (label.is_public or
                         isinstance(label.arc, ColumnArc))
            arranged.append(label.clone(is_public=is_public))
        return arranged


class Localize(Adapter):
    """
    Finds the labels that identify an entity of a node.

    This base implementation is declared for any `Node` and returns
    `None`, i.e. no identity is available.
    """

    adapt(Node)

    def __init__(self, node):
        self.node = node

    def __call__(self):
        # A generic node has no identity.
        return None


class LocalizeTable(Localize):
    """
    Finds the labels that identify a row of a table node.

    The primary key (if the table has one) and then each unique key of
    the table are tried in order.  Partial keys and keys with nullable
    columns are skipped.  The columns of a key are covered left to right:
    a run of columns that matches the origin columns of a non-partial
    foreign key is represented by the label of the respective direct
    join, provided that the join has a label and its target is itself
    localizable; otherwise the leading column is represented by its
    column label.

    Returns the list of labels for the first key that is completely
    covered, or `None` if there is no such key.
    """

    adapt(TableNode)

    def __call__(self):
        # Index the labels by the column or by the single join they
        # represent; the first label found for each one is kept.
        label_by_column = {}
        label_by_join = {}
        for label in classify(self.node):
            if (isinstance(label.arc, ColumnArc) and
                    label.arc.column not in label_by_column):
                label_by_column[label.arc.column] = label
            if (isinstance(label.arc, ChainArc) and
                    len(label.arc.joins) == 1 and
                    label.arc.joins[0] not in label_by_join):
                label_by_join[label.arc.joins[0]] = label

        table = self.node.table
        # A table may lack a primary key (`primary_key` is `None`);
        # skip it in this case instead of failing on attribute access.
        keys = []
        if table.primary_key is not None:
            keys.append(table.primary_key)
        keys.extend(table.unique_keys)

        for key in keys:
            if key.is_partial:
                continue
            if any(column.is_nullable for column in key.origin_columns):
                continue
            # Work on a copy since covered columns are removed from it.
            columns = key.origin_columns[:]
            identity = []
            while columns:
                for foreign_key in table.foreign_keys:
                    if foreign_key.is_partial:
                        continue
                    width = len(foreign_key.origin_columns)
                    if foreign_key.origin_columns == columns[:width]:
                        join = DirectJoin(foreign_key)
                        if join not in label_by_join:
                            continue
                        label = label_by_join[join]
                        if localize(label.target) is None:
                            continue
                        identity.append(label)
                        columns = columns[width:]
                        break
                else:
                    # No foreign key covers the leading columns; fall
                    # back to the label of the first column.
                    column = columns[0]
                    if column not in label_by_column:
                        # The key cannot be covered; try the next one.
                        break
                    columns.pop(0)
                    identity.append(label_by_column[column])
            if not columns:
                return identity

        return None


# NOTE(review): `once` comes from `.cache`; presumably it memoizes the
# result per argument -- confirm against its definition.
@once
def classify(node):
    """
    Returns the list of labels produced by `Classify` for the given node.

    `node` must be a `Node` instance.
    """
    assert isinstance(node, Node)
    return Classify.__invoke__(node)


@once
def relabel(arc):
    """
    Returns the list of labels assigned to the given arc.

    `arc` must be an `Arc` instance.  The labels are taken from
    `classify(arc.origin)`; the result is an empty list if none of them
    refers to the arc.

    As a side effect, the label lists of all the other arcs that appear
    among these labels are stored in the application cache unless
    an entry is already present there.
    """
    assert isinstance(arc, Arc)
    cache = context.app.htsql.cache
    labels = classify(arc.origin)

    # Group the labels by arc; the requested arc always gets an entry
    # and goes first.
    labels_by_arc = {arc: []}
    arcs = [arc]
    signatures = set()
    for label in labels:
        # `Classify` grants a name once per arity, so uniqueness is
        # verified for the pair `(name, arity)` rather than the name alone.
        signature = (label.name, label.arc.arity)
        assert signature not in signatures, label
        signatures.add(signature)
        if label.arc not in labels_by_arc:
            labels_by_arc[label.arc] = []
            arcs.append(label.arc)
        labels_by_arc[label.arc].append(label)

    # Seed the cache for every arc met along the way.
    # NOTE(review): the key layout presumably mirrors the one used
    # by `@once` -- confirm against `.cache`.
    for other in arcs:
        key = (relabel.__module__, relabel.__name__, other)
        if key not in cache.values:
            cache.set(key, labels_by_arc[other])

    return labels_by_arc[arc]


# NOTE(review): `once` comes from `.cache`; presumably it memoizes the
# result per argument -- confirm against its definition.
@once
def localize(node):
    """
    Returns the result of `Localize` for the given node: a list of
    labels that identify an entity of the node, or `None`.

    `node` must be a `Node` instance.
    """
    assert isinstance(node, Node)
    return Localize.__invoke__(node)