Commits

Tianming Lu committed 8836918

add data generation, FPtree done

Comments (0)

Files changed (2)

+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#author: Tianming Lu
+
+import operator
+import data
+import sys
+from itertools import combinations
+
+class Pattern:
+    def __init__(self, itemset, support):
+        self.itemset = itemset
+        self.support = support
+        
+    def merge(self, p):
+        self.itemset += p.itemset
+        self.support = min(self.support, p.support)
+        
+class FPnode:
+    def __init__(self, item, count):
+        self.item = item
+        self.count = count
+        self.children = []
+        self.parent = None
+        self.mark = 0   #dummpy property, used when building conditional tree
+        self.condition_count = 0
+
+    def append(self, node):
+        self.children.append(node)
+        if node.parent != None:
+            node.parent.remove(node)
+        node.parent = self
+    def remove(self, node):
+        self.children.remove(node)
+        node.parent = None
+        
+    def is_single_prefix(self):
+        if len(self.children) == 1:
+            return True
+        else:
+            return False
+    
+    def seperate(self):
+        """
+        seperate into single prefix and branch
+        """
+        #find branch node
+        node = self
+        while len(node.children) == 1:
+            node = node.children[0]
+        parent = node.parent
+        parent.remove(node)
+        new_node = FPnode(node.item, node.count)
+        parent.append(new_node)
+        node.item = None
+        node.count = 0
+        return self, node
+    def items(self):
+        counts = {}
+        self._items(counts)
+        return counts.keys()
+        
+    def _items(self, counts):
+        if self.item != None:
+            if counts.has_key(self.item):
+                counts[self.item] += 1
+            else:
+                counts[self.item] = 1
+        for c in self.children:
+            c._items(counts)
+    
+def _frequent_items(transaction, threshold):
+    """
+    scan transaction once, to get the frequent items
+    """
+    counts = {}
+    for t in transactions:
+        for item in t:
+            if counts.has_key(item):
+                counts[item] += 1
+            else:
+                counts[item] = 1
+    frequent_items = {k : v for k, v in counts.iteritems() if v >= threshold}
+    sorted_items =  sorted(frequent_items.iteritems(), 
+                           key = operator.itemgetter(1), reverse = True)
+    return [i[0] for i in sorted_items]
+
+def _build_tree(transactions, frequent_items):
+    """
+    build the FP tree
+    """
+    root = FPnode(None, 0)
+    header_table = {i : [] for i in frequent_items}
+    node = root
+    for t in transactions:
+        #filter and sort items in t in the order of frequent_items
+        filtered_t = [i for i in t if i in frequent_items]
+        sorted_t = sorted(filtered_t, key = lambda x: frequent_items.index(x))
+
+        for item in sorted_t:
+            subnode = _find_subnode(node, item)
+            if subnode == None:
+                subnode = FPnode(item, 1)
+                node.append(subnode)
+
+                #new node, add to header_table
+                header_table[item].append(subnode)
+            else:
+                subnode.count += 1
+            node = subnode
+        node = root
+
+    return root, header_table 
+    
+def _build_conditional_tree(node, nodes_link, condition):
+    """
+    build conditional FP tree base on the condition
+    """
+    if node == None or (node.item == None and len(node.children) == 0):
+        return None, None
+        
+    #mark nodes that should be included in the conditional tree
+    for n in nodes_link:
+        p = n.parent
+        while p != None:
+            p.mark = 1
+            p.condition_count += n.count
+            p = p.parent
+    
+    #generate conditional tree
+    items = node.items()
+
+    header_table = {i : [] for i in items}
+    root = FPnode(node.item, node.count)
+
+    for subnode in node.children:
+        subn = _build_conditional_subtree(subnode, header_table)
+        if subn != None:
+            root.append(subn)
+    if len(root.children) == 0:
+        return None, header_table
+    return root, header_table
+    
+def _build_conditional_subtree(node, header_table):
+    """
+    sub routine recursive used when building conditional FP tree
+    """
+    if node.mark == 0:
+        return None
+
+    n = FPnode(node.item, node.condition_count)
+    node.mark = 0
+    node.condition_count = 0
+    
+    #new node, add to header_table
+    header_table[n.item].append(n)
+    for sub in node.children:
+        subn = _build_conditional_subtree(sub, header_table)
+        if subn != None:
+            n.append(subn)
+    return n
+    
+def _find_subnode(node, item):
+    for subnode in node.children:
+        if subnode.item == item:
+            return subnode
+    return None       
+ 
+
+            
+def FP_growth(node, header_table, prefix, threshold):
+    patterns_P = []
+    patterns_Q = []
+    patterns_all = []
+
+    if node.is_single_prefix():
+        P, Q = node.seperate()
+        patterns_P = single_freq_pattern_set(P, threshold)
+        for p in patterns_P:
+            p.merge(prefix)
+
+    else:
+        Q = node
+    
+    for i in Q.items():
+        support = 0
+        node_link = header_table[i]        
+        for n in node_link:
+            support += n.count        
+        p = Pattern([i], support)
+        p.merge(prefix)
+        if p.support < threshold:
+            continue
+        
+        patterns_Q.append(p)
+        tree_p, header_p = _build_conditional_tree(node, node_link, p)
+        if tree_p != None:
+            patterns = FP_growth(tree_p, header_p, p, threshold)
+            patterns_Q.extend(patterns)
+         
+    patterns_all.extend(patterns_P)
+    patterns_all.extend(patterns_Q)
+
+    for p in patterns_P:
+        for q in patterns_Q:
+            itemset = []
+            itemset.extend(p.itemset)
+            itemset.extend(q.itemset)
+            patterns_all.append(Pattern(itemset, min(p.support, q.support)))
+    return patterns_all        
+        
+
+def single_freq_pattern_set(node, threshold):
+    """
+    freq_pattern_set for single prefix path
+    """    
+    items = []
+    n = node
+    while True:    
+        if n.item == None:
+            n = n.children[0]
+            continue
+        items.append(n)
+        
+        if len(n.children) == 0:
+            break
+        else:
+            n = n.children[0]
+
+    patterns = []
+    for i in range(len(items)):
+        coms = combinations(items, i+1)
+        for c in coms:
+            min_support = sys.maxint
+            pattern = []
+            for i in c:
+                pattern.append(i.item)
+                if i.count < min_support:
+                    min_support = i.count
+            if min_support >= threshold:
+                patterns.append(Pattern(pattern, min_support))
+    return patterns
+
+def print_tree(node, depth):
+    s = ""
+    for i in range(depth):
+        s += "    "
+    print((s +"{0}:{1}").format(node.item, node.count))
+    for child in node.children:
+        print_tree(child, depth + 1)
+        
+def print_patterns(patterns):
+    for p in patterns:
+        print("pattern: {0}, support:{1}".format(p.itemset, p.support))      
+        
+if __name__ == "__main__":
+    transactions = data.read("test.txt")
+    frequent_items = _frequent_items(transactions, 3)
+    fp_tree, header_table = _build_tree(transactions, frequent_items)
+
+    p = Pattern([], sys.maxint)
+    patterns = FP_growth(fp_tree, header_table, p, 3)
+    print_patterns(patterns)
+        
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+#author: Tianming Lu
+
+from scipy import stats
+import numpy as np
+import random
+
+def generate(D, T, I, L, N):
+    #generate large itemsets
+    correlation_level = 0.5
+    large_sets = []
+    size = 0
+    for i in range(L):
+        sz = stats.poisson.rvs(I)
+        if sz == 0:
+            sz += 1
+
+        if i == 0:
+            itemsets = random.sample(xrange(N), sz)
+            itemsets.sort()
+            large_sets.append(itemsets)
+        else:
+            pick_items_p = 2
+            while pick_items_p > 1:
+                pick_items_p = stats.expon.rvs(scale=correlation_level)
+            pick_items_count = int(pick_items_p * sz)
+            pick_items = random.sample(large_sets[-1], min(pick_items_count, size))
+            random_items = random.sample([x for x in xrange(N) if x not in pick_items],
+                                         sz - len(pick_items))
+            itemset = pick_items + random_items
+            itemset.sort()
+            large_sets.append(itemset)
+
+        size = sz
+    sets_p = stats.expon.rvs(size = L)
+    sets_p = np.array(sets_p)
+    sump = sum(sets_p)
+    sets_p /= sump
+
+    corruption_levels = stats.norm.rvs(0.5, 0.1, size = L)
+    trans = []
+    tran_size = stats.poisson.rvs(T, size = D)
+    for i in range(D):
+        P = np.random.rand(L)
+        P = sets_p * P
+        index = np.argmax(P)
+        itemset = list(large_sets[index])
+
+        c = corruption_levels[index]
+        set_size = len(itemset)
+        while stats.uniform.rvs() < c and set_size > 1:
+            index = stats.randint.rvs(0, set_size)
+            itemset.pop(index)
+            set_size -= 1
+        trans.append(itemset)
+    return trans
+
+def write(transactions, filename):
+    with open(filename, 'w') as output:
+        for t in transactions:
+            for item in t:
+                output.write(str(item) + ' ')
+            output.write('\n')
+
+def read(filename):
+    transactions = []
+    with open(filename, 'r') as input:
+        while True:
+            line = input.readline()
+            if not line:
+                break
+            t = [item for item in line.split()]
+            transactions.append(t)
+    return transactions
+
+if __name__ == "__main__":
+    trans = generate(10000, 20, 4, 2000, 1000)
+    write(trans, "test.txt")