Source

quechua / modules / data / itemsets.cc

Full commit
/* 
 * Quechua - the lightweight data mining framework
 *
 * Copyright (C) 2012 Marek Denis <quechua@octogan.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "itemsets.h"

itemset::itemset() : size(1), count(0),copied(0), generator(false) {
    values = new index_t[size];
};

itemset::itemset(int s) : count(0), copied(0), generator(false) {
    size = s<0?1:s;
    if(size) values = new index_t[size];
};

itemset::~itemset() {
    size = 0;
    if(!copied) delete []values;
};

/* rather bad idea
index_t* itemset::giveaway() {
    copied = 1;
    return values;
}
*/
void itemset::debug() const {
    static char buf[32];
    static string_t s;
    
    for(int i=0;i<size;++i) {
        memset(buf,0,32);
        snprintf(buf,32,"%lu,",values[i]);
        s.append(buf);
    };
    LOG(DEBUG) << "itemset at: " << this
               << " size: " << size
               << " count: " << count
               << " values: " << s;
    s.clear();
};

int itemset::getsize() const {
    return size;
};

int itemset::getcount() const {
    return count;
};

void itemset::setcount(int x) {
    count = count + x;
};

bool itemset::getgenerator() const {
    return generator;
};

void itemset::setgenerator(bool g) {
    generator = g;
};

void itemset::inc() {
    ++count;
};

void itemset::dec() {
    --count;
};

index_t& itemset::val(int x) const {
    if(x >= 0 && x < size) {
        return values[x];
    }
    LOG(ERROR) << "itemset::val: Trying to acces out of range element, x: " << x
              << " size: " << size;

    return values[size-1];
};

int itemset::clone(itemset* item, int border,bool cloneall) const {
    if(border < 0 || border > getsize() || border > item->getsize() ) {
        LOG(WARN) << "itemset::clone: border greater than itemset size";
        return -1;
    }
    int n_size = size - border;
//    memcpy(&(item->val(0)),values,n_size);
    for(int i=0;i<n_size;++i) {
        item->val(i) = val(i);
    };
    if(cloneall) { 
        item->setcount(count);
        item->setgenerator(generator);
    }
    return 0;
};

int itemset::cmp(const itemset* a, const itemset* b) {
    return cmp_part(a,b,0);
}

/*
 *   Returns
 *   -1 -> a < b
 *   0  -> a == b 
 *   1  -> a > b
 *   The function compares values except last border values.
 */ 
int itemset::cmp_part(const itemset* a, const itemset* b, int border) {
    int size;
    if(a->getsize()==b->getsize())
        size = a->getsize();
    else
        size = a->getsize()>b->getsize()?b->getsize():a->getsize();
    if(size<border) {
    a->debug();
    b->debug();
    }
    assert(size>=border);
    size -= border;
    for(int i=0;i<size;++i) {
        if(a->val(i) < b->val(i))
            return  -1;
        if(a->val(i) > b->val(i))
            return 1;
    };
    return 0;
};

itemset* itemset::merge(const itemset* a, const itemset* b) {
    int size;
    if(a->getsize()==b->getsize())
        size = a->getsize();
    else
        size = a->getsize()>b->getsize()?b->getsize():a->getsize();
    int r_size = size + 1;
    itemset* r_itemset = new itemset(r_size);

    a->clone(r_itemset,1); // copy a, but size-1 elements
    
    if(a->val(size-1) >= b->val(size-1)) {
        r_itemset->val(size-1) = b->val(size-1);
        r_itemset->val(size) = a->val(size-1);
    } else {
        r_itemset->val(size-1) = a->val(size-1);
        r_itemset->val(size) = b->val(size-1);
    }
    return r_itemset;
};

/* itemsets */

itemsets::itemsets() : tree(NULL) {
}

itemsets::itemsets(int s) : tree(NULL) {
    s = s<1?1:s;
    values.resize(s);
}

itemsets::~itemsets() {
    if(tree) delete tree;
    values.clear();
    tree = NULL;
}

void itemsets::clear() {
    if(tree) delete tree;
    tree = NULL;
    std::for_each(values.begin(),values.end(),delitem);
    values.clear();
}

void itemsets::debug() const {
    for(int i=0;i<values.size();++i)
        values[i]->debug();
};

int itemsets::getsize() const {
    return values.size(); //  deque.size() is O(1)
}

itemset* itemsets::find(const itemset* item) const {
    int idx = find_idx(item);
    if(idx==-1)
        return NULL;
    return values[idx];
}

// TODO(marek):
// plain old linear insert - to be optimized
itemset* itemsets::insert(itemset* item) {
    deque<itemset*>::iterator it,end;
    it = values.begin();
    end= values.end();
    
    if(values.empty()) {
        values.push_back(item);
        return item;
    }

    int cmp;
    itemset* i;
    for(;it!=end;++it) {
        i = *it;
        cmp = itemset::cmp(item,i);
        if(!cmp)
            return i;
        else if(cmp<0) {
            values.insert(it,item);
            return item;
        }
    }
    values.push_back(item); // we got the end;
    return item;
};

void itemsets::append(itemset* item) {
    values.push_back(item);
};

bool itemsets::erase(const itemset* item) {
    int idx = find_idx(item);
    if(idx==-1)
        return false;
    itemset* i = values[idx];
    delete i;
    values.erase(values.begin()+idx);
    return true;
};

bool itemsets::erase(int x) {
    if(x < 0 || x>= getsize())
        return false;
    deque<itemset*>::iterator it = values.begin()+x;
    delete *it;
    values.erase(values.begin()+x);
    return true;
}

void itemsets::buildtree() {
    if(tree)
        delete tree;
    tree = new hashnode();
    for(int i=0;i<getsize();++i) {
        tree->add(values[i],0);
    }
}

itemset* itemsets::getitem(int x) const {
    if(x < 0 || x > getsize()-1 ) {
        return NULL;
    }
    return values[x];
};

itemsets* itemset::subsets() const {
    int idx,m,s=0; // m - master, s - slave
    m = getsize()-1;

   itemsets* r_subsets = new itemsets;

    for(;m>=0;--m) {
        itemset* iset = new itemset(getsize()-1);
        for(idx=0,s=0;s<getsize();++s) {
            if(m==s) continue;
            iset->val(idx++) = val(s);
        }
        r_subsets->append(iset);
    }
    return r_subsets;
};

bool itemset::intersection(const index_t* txn, int size) {
    int s,t;
    s=t=0;
    for(int j=0;j<size;++j){
        for(int i=s;i<getsize();++i) {
            if(values[i]==txn[j]) {
                if(++s == getsize()) return true; 
                break;
            }  
        }  
    }  
    return false; 
}
void itemsets::prunecandidates(int minsup) {
    deque<itemset*>::iterator it = values.begin();
    while(it!=values.end()) {
        if((*it)->getcount() < minsup) {
            delete *it;
            it = values.erase(it);
        } else ++it;
    }
};

void itemsets::batchinc(int x) {
    deque<itemset*>::iterator it, end;
    it = values.begin();
    end= values.end();
    for(;it!=end;++it) {
        (*it)->setcount(x);
    }
}

void itemsets::correlate(itemset* compare,itemsets* ct) {
    if(!tree)
        buildtree();
    set_t r_set;
    set_t tmp_set;
    if(values.empty())
        return;
    for(int i=0;i<1/*txn->size*/;++i) {
        tmp_set.clear();
        tmp_set = tree->find(&(compare->val(0)),compare->getsize(),i,0);
        r_set.insert(tmp_set.begin(),tmp_set.end());
    }
    int idx;
    set_it sit = r_set.begin();
    for(;sit!=r_set.end();++sit) {
        ct->append((*sit));
    }
};


const itemset* itemsets::utilize(const itemset* mstr, const itemset* slv) {
    if(mstr == slv)
        return mstr;
    delete slv;

    slv = mstr;
    return slv;
}

/*
 * Returns -1 if  not found,otherwise index of the element
 * Binary search algorithm implementation
 */
int itemsets::find_idx(const itemset* item) const {
    if(values.empty())
        return -1;
    int low, high, mid, cmp;
    low = 0;
    high = getsize() - 1;     
    while (low <= high) {
        mid = (low + high)/2;
        cmp = itemset::cmp(item,values[mid]);
        if(cmp < 0)
            high = mid - 1;
        else if(cmp > 0)
            low  = mid + 1;
        else
            return mid;
    }
    return -1;
};

void itemsets::delitem(itemset* i) {
    delete i;
};