# Source
#
# orange-modelmaps / archive / model_map_similarity.py
#
# Full commit
import numpy
import Orange

from operator import itemgetter
from tools import *

# Single-argument print call: same output under Python 2, valid on Python 3.
print('loading...')


def _load_symmetric(path_stem):
    """Load '<path_stem>.npy' and return the matrix plus its transpose.

    The saved matrices hold only the lower triangle, so adding the transpose
    fills in the upper triangle. Note that any non-zero diagonal would be
    doubled by this.
    """
    lower = numpy.load('%s.npy' % path_stem)
    return lower + lower.transpose()


fileName = 'zoo-1603'
fileCommon = ROOT + '_explore_/' + fileName
fileA = fileCommon + '-rank'
# Alternative source for model B used earlier: ROOT + 'new/zoo-allmodels-420',
# read through loadModel() and copied element by element into a numpy array.
fileB = fileCommon + '-class'

# Both models are described by the same item table (<fileCommon>.tab).
modelA = _load_symmetric(fileA)
itemsA = Orange.data.Table('%s.tab' % fileCommon)

modelB = _load_symmetric(fileB)
itemsB = Orange.data.Table('%s.tab' % fileCommon)

def _model_key(example):
    """Return the matching key for one table row.

    The key is the row's 'model' value directly followed by its 'attributes'
    value with the ', '-separated entries sorted, so rows that list the same
    attributes in a different order produce the same key.
    """
    attributes = sorted(example['attributes'].value.split(', '))
    return example['model'].value + ', '.join(attributes)


def _match_by_key(keyedA, keyedB):
    """Merge-join two sorted (key, index) lists into (indexA, indexB) pairs.

    Only correct when the keys within each list are unique: on a match both
    cursors advance, so repeated keys are paired at most once.
    """
    matches = []
    i, j = 0, 0
    while i < len(keyedA) and j < len(keyedB):
        keyA, indexA = keyedA[i]
        keyB, indexB = keyedB[j]
        if keyA == keyB:
            matches.append((indexA, indexB))
            i += 1
            j += 1
        elif keyA < keyB:
            i += 1
        else:
            j += 1
    return matches


def _shared_neighbor_counts(nnA, nnB):
    """Return c where c[k-1] is the summed size of the row-wise overlaps
    between the first k columns of nnA and the first k columns of nnB.

    Each row of nnA / nnB is a permutation (an argsort result), so the
    argsort of a row is its inverse: the position every item holds in that
    row. An item belongs to both k-prefixes of a row exactly when the larger
    of its two positions is below k, so a histogram of those maxima followed
    by a running sum yields the overlap total for every k in one pass.
    """
    rankA = numpy.argsort(nnA)
    rankB = numpy.argsort(nnB)
    first_shared = numpy.maximum(rankA, rankB)
    entered = numpy.bincount(first_shared.ravel(), minlength=len(nnA))
    return numpy.cumsum(entered)


def compare_model_similarity(modelA, itemsA, modelB, itemsB):
    """Score how similar the neighbourhoods of two models are.

    Rows of itemsA and itemsB are paired through a key built from their
    'model' and 'attributes' values. Both square matrices are then reduced
    to the paired rows/columns and every row is argsorted (ascending, so
    presumably the matrices hold distances - confirm). For every prefix
    length k from 2 up to the number of pairs, the score is

        sum over rows of (|first k of A  intersect  first k of B| - 1)
        ---------------------------------------------------------------
                          (k - 1) * number of pairs

    The "- 1" presumably discounts the item itself, which is expected to be
    its own nearest neighbour in both models - confirm.

    Parameters:
        modelA, modelB -- square 2-D numpy arrays indexed like itemsA/itemsB.
        itemsA, itemsB -- iterables of rows supporting row['model'].value
                          and row['attributes'].value.

    Returns:
        A list of floats, one per k = 2 .. number of pairs (element 0
        belongs to k = 2). Empty when fewer than two rows could be paired.

    Progress messages are printed to stdout.
    """
    print('%d read in model A, %d read in model B' % (len(modelA), len(modelB)))

    print('matching...')
    matchA = sorted((_model_key(ex), i) for i, ex in enumerate(itemsA))
    matchB = sorted((_model_key(ex), i) for i, ex in enumerate(itemsB))
    matches = _match_by_key(matchA, matchB)

    print('%d matched' % len(matches))
    if not matches:
        # zip(*[]) cannot be unpacked into two index tuples; nothing to score.
        return []
    indA, indB = zip(*matches)

    # Restrict both matrices to the paired items, in pairing order.
    matrixA = modelA.take(indA, axis=0).take(indA, axis=1)
    matrixB = modelB.take(indB, axis=0).take(indB, axis=1)

    nnA = numpy.argsort(matrixA)
    nnB = numpy.argsort(matrixB)

    print('comparing...')
    n = len(nnA)
    shared = _shared_neighbor_counts(nnA, nnB)
    scores = []
    for k in range(2, n + 1):
        # One item per row is discounted, hence "- n" on the overlap total.
        count = int(shared[k - 1]) - n
        scores.append(count / float((k - 1) * n))
        if k % 100 == 0:
            print(k)
    return scores

def plt(x, y, fn):
    """Draw y against x as a line plot and save the image to fn.

    Parameters:
        x  -- sequence of x values (labelled 'k-neighbors').
        y  -- sequence of y values (labelled 'similarity').
        fn -- output file name handed to matplotlib's savefig.

    Every call draws on a figure of its own and closes it afterwards.
    Drawing on pyplot's implicit current figure would make a later call's
    image also contain the lines plotted by earlier calls.
    """
    # Imported locally under a different name so it does not shadow this
    # function's own name inside the body.
    import matplotlib.pyplot as pyplot

    figure = pyplot.figure()
    try:
        pyplot.title('')
        pyplot.xlabel('k-neighbors')
        pyplot.ylabel('similarity')
        pyplot.grid(True)

        pyplot.plot(x, y, linewidth=1.0)

        pyplot.savefig(fn)
    finally:
        # Release the figure even if plotting or saving fails.
        pyplot.close(figure)
    
scores = compare_model_similarity(modelA, itemsA, modelB, itemsB)

# Two plots: the first 50 scores only, then the complete curve.
# The x axis simply numbers the scores starting at 1.
plt(range(1, len(scores[:50])+1), scores[:50], '%s%s-similarity-50.png' % (ROOT, fileName))
plt(range(1, len(scores)+1), scores, '%s%s-similarity.png' % (ROOT, fileName))

print('saving results...')
# NOTE(review): the record is labelled '<name>-class;<name>-prob' although the
# matrices compared above are read from the '-rank' and '-class' files -
# confirm which label is intended before relying on the results file.
# open() inside a with-block closes the file even if the write fails
# (the file() builtin used previously no longer exists on Python 3).
with open(ROOT + 'similarity_results.txt', 'a') as fp:
    fp.write('%s-class;%s-prob;%s\n' % (fileName, fileName, ';'.join(str(s) for s in scores)))