Source

quechua / modules / algorithm / apriori / apriori.cc

Full commit
/* 
 * Quechua - the lightweight data mining framework
 *
 * Copyright (C) 2012 Marek Denis <quechua@octogan.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "apriori.h"

Apriori::Apriori() : minsup(0) {
};

Apriori::~Apriori() {
    clear();
};

void Apriori::clear() {
    for_each(sets.begin(),sets.end(),delitem);
    sets.clear();
}

bool Apriori::reset() {
    clear();
}

// We need only minimal support value
bool Apriori::load_misc_conf(const setting_t& misc) {
    if(!misc.exists("minsup")) {
        LOG(ERROR) << "Algorithm Apriori: Cannot find required parametr 'minsup'";
        return false;
    }
    misc.lookupValue("minsup",minsup);
    return true;
};

datapack_ptr Apriori::work(datapack_ptr data) {
    IndexTransactions* transactions = dynamic_cast<IndexTransactions*>(data.get());
    IndexTransactions* freqsets = new IndexTransactions();
    
    apriori(transactions);
      
    itemset* idxtxn; 
    for(int i=0;i<sets.size();++i) {
        itemsets* isets = sets[i];
        for(int j=0;j<isets->getsize();++j) {
            itemset* txn = isets->getitem(j);
            idxtxn = new itemset(txn->getsize());
            txn->clone(idxtxn,0,true);
            freqsets->append(idxtxn);
        }
        isets->clear();
    };

    transactions->clear();
    return datapack_ptr(freqsets);
};

void Apriori::generate_1_itemsets(IndexTransactions* transactions) {
    itemsets* isets = new itemsets;

    itemset *item,*txn;

    for(int i=0;i<transactions->getsize();++i) {
        txn = transactions->getitem(i); // just for convience;
        for(int j=0; j<txn->getsize() ;++j) {
            itemset* ii = new itemset(1);
            ii->val(0) = txn->val(j);
            item = isets->insert(ii);
            item->inc();
            itemsets::utilize(item,ii);
        }
    }
    isets->prunecandidates(minsup);
    if(isets->getsize())
        sets.push_front(isets);
};

itemsets* Apriori::apriorigen(const itemsets* isets) {
    int m=0; //master
    int s=1; //slave
    itemset* mm;
    itemset* ss; 
    itemset* iset;
    itemsets* r_itemsets = new itemsets;
    while(m<isets->getsize()) {
        mm = isets->getitem(m);
        while(s<isets->getsize()) {
            ss = isets->getitem(s);
            if(!itemset::cmp_part(mm,ss,1)) {// mergable
                iset = itemset::merge(mm,ss);
                if(prunable(iset,isets)) {
                    delete iset;
                }
                else {
                    r_itemsets->append(iset);
                }
                ++s;
            } else break;
        }
        ++m;
        s = m+1;
    }
    return r_itemsets;
};

void Apriori::apriori(IndexTransactions* transactions) {
    generate_1_itemsets(transactions);
    LOG(DEBUG) << "Generated k=1 itemsets";
    int k=1;
    while(!sets.empty() && sets.back()->getsize()) {
        itemsets* ck = apriorigen(sets[k-1]);
        if(!ck->getsize()) break;
        sets.push_back(ck);
        setupcounters(ck,transactions);
        ck->prunecandidates(minsup); 
        ++k;
    }
    //debug_work();
}
// TODO(marek): paralelize
void Apriori::setupcounters(itemsets* ck,IndexTransactions* itrans) {

    itemsets* ct;
    itemset* trans;
    for(int i=0;i<itrans->getsize();++i) {
        trans = itrans->getitem(i);
        ct = checktransactions(ck,trans);
        ct->batchinc(1);
        delete ct;
    }
};

// Returns itemsets that were found in
// raw transactions
itemsets* Apriori::checktransactions(itemsets* ck, itemset* txn) {
    itemsets* ct = new itemsets;
    ck->correlate(txn,ct);
    return ct;
};

void Apriori::debug_work() {
    for(int i=0;i<sets.size();++i) {
        {LOG(INFO) << "Apriori: Debug iteration " << i << " size: " << sets.size() ;};
        sets[i]->debug();
    }
};

bool Apriori::prunable(itemset* cand,const itemsets* ref) {
    itemsets* s_subsets = cand->subsets();

    itemset *tmp_itemset, *subs;
    bool res = false;
    
    deque<itemset*> subset_rollback_list;
    deque<bool>     bool_rollback_list;

    for(int j=0;j<s_subsets->getsize();++j) {
        tmp_itemset = s_subsets->getitem(j);
        if((subs = ref->find(tmp_itemset)) == NULL ) {
            res = true;
            break;
        } else {
            subset_rollback_list.push_back(subs);
            bool_rollback_list.push_back(subs->getgenerator());
            subs->setgenerator(true);
        }
    }

    /* check whether rollback is required or not */
    if(res) {
        bool val;
        for(int j=0;j<subset_rollback_list.size();++j) {
            val = bool_rollback_list[j];
            subset_rollback_list[j]->setgenerator(val);
        }
    }

    s_subsets->clear();
    delete s_subsets;
    return res;
}

void Apriori::delitem(itemsets* i) {
    i->clear();
    delete i;
}