Source

woocode / py / md5_walk.py

Full commit
 #-*- encoding:utf-8 -*-

from __future__ import with_statement

import os

from hashlib import md5

def get_relpath(path):
    '''
    Drop the leading component of ``path``, splitting on ``os.sep``.

    Everything up to and including the first ``os.sep`` is removed. A path
    that contains no ``os.sep`` is returned unchanged. Only ``os.sep`` is
    recognised, so the result depends on the platform's separator.

    >>> get_relpath(os.sep.join(['a', 'b', 'c'])) == os.sep.join(['b', 'c'])
    True
    >>> get_relpath(os.sep + 'a')
    'a'
    >>> get_relpath('a')
    'a'
    >>> get_relpath('')
    ''
    '''
    _head, found_sep, tail = path.partition(os.sep)
    if not found_sep:
        # No separator at all: there is no leading component to strip.
        return path
    return tail

def gen_file_md5(fp, chunk=2*2048):
    '''
    Return the hexadecimal MD5 digest of the file at ``fp``.

    The file is opened in binary mode and fed to the hash ``chunk`` bytes at
    a time, so the whole file is never held in memory at once.
    '''
    digest = md5()
    with open(fp, 'rb') as stream:
        block = stream.read(chunk)
        # An empty read marks end-of-file.
        while block:
            digest.update(block)
            block = stream.read(chunk)
    return digest.hexdigest()

def get_files(path):
    '''
    Lazily yield the path of every file found by ``os.walk(path)``.

    Each yielded value is the walked directory joined with the file name, so
    it always starts with ``path`` exactly as it was passed in. Directories
    themselves are not yielded.
    '''
    for entry in os.walk(path):
        # os.walk yields (directory, sub-directory names, file names).
        parent, names = entry[0], entry[2]
        for full_path in [os.path.join(parent, name) for name in names]:
            yield full_path

def gen_dir_md5(dirname):
    '''
    Return a list of ``(file_path, md5_hexdigest)`` pairs for ``dirname``.

    One pair is produced for every path yielded by ``get_files(dirname)``,
    in the order they are yielded.
    '''
    pairs = []
    for file_path in get_files(dirname):
        pairs.append((file_path, gen_file_md5(file_path)))
    return pairs

def is_same(file_a, file_b):
    '''
    Placeholder that is not implemented yet.

    Both arguments are ignored and ``None`` is always returned.
    '''
    return None

def _format_md5_report(md5_by_path):
    '''Render ``path,md5`` lines sorted by path and joined with ``os.linesep``.'''
    return os.linesep.join(['%s,%s' % pair for pair in sorted(md5_by_path.items())])


def _index_by_relpath(base, md5_by_path):
    '''Re-key ``md5_by_path`` by each path's location relative to ``base``.'''
    # Every path produced by walking ``base`` starts with ``base`` itself.
    # Strip that prefix and then whatever separator follows, instead of
    # assuming exactly one separator character: ``base`` may already end
    # with one (e.g. 'dir/' or '/').
    separators = os.sep + (os.altsep or '')
    prefix_len = len(base)
    return dict([(full_path[prefix_len:].lstrip(separators), digest)
                 for full_path, digest in md5_by_path.items()])


def compare_dirs(da, db):
    '''
    Compare the files under directories ``da`` and ``db`` by MD5 digest.

    As a side effect, a ``path,md5`` listing of each tree is written to
    ``da + '.md5.txt'`` and ``db + '.md5.txt'`` (lines sorted by path).

    Returns a tuple ``(only_a, only_b, diff_files)`` of sorted lists of
    paths relative to the respective directory:

    * ``only_a``     -- files present under ``da`` but not under ``db``
    * ``only_b``     -- files present under ``db`` but not under ``da``
    * ``diff_files`` -- files present in both whose digests differ
    '''
    md5_by_path_a = dict(gen_dir_md5(da))
    md5_by_path_b = dict(gen_dir_md5(db))

    writefile(da + '.md5.txt', _format_md5_report(md5_by_path_a))
    writefile(db + '.md5.txt', _format_md5_report(md5_by_path_b))

    # Dicts keyed by relative path give O(1) membership tests and let the
    # digests be compared without rebuilding the absolute paths.
    rel_a = _index_by_relpath(da, md5_by_path_a)
    rel_b = _index_by_relpath(db, md5_by_path_b)

    only_a = sorted([p for p in rel_a if p not in rel_b])
    only_b = sorted([p for p in rel_b if p not in rel_a])
    diff_files = sorted([p for p in rel_a
                         if p in rel_b and rel_a[p] != rel_b[p]])

    return (only_a, only_b, diff_files)

def writefile(fp, content):
    '''
    Write ``content`` to the file at ``fp`` in binary mode.

    The file is created if missing and truncated if it already exists.
    '''
    with open(fp, 'wb') as out:
        out.write(content)

def _pick_dir(choice, dirs):
    '''
    Map what the user typed to an entry of ``dirs``.

    A menu index is tried first, then an exact directory name. Returns
    ``None`` when ``choice`` matches neither.
    '''
    choice = choice.strip()
    if choice.isdigit() and int(choice) < len(dirs):
        return dirs[int(choice)]
    if choice in dirs:
        return choice
    return None


def main():
    '''
    Interactively compare two sub-directories of the current directory.

    Lists the directories found in ``'.'`` with a numeric index, asks for two
    of them (by index or by name), runs ``compare_dirs`` on the pair and
    prints which files exist on one side only and which differ. If the input
    does not match a listed directory, a message is printed and nothing is
    compared.

    Written for Python 2: input is read with ``raw_input``. Each ``print``
    call is given a single argument, which behaves the same as the print
    statement there.
    '''
    # Sorted so the menu is shown in a stable, predictable order.
    cwd_dirs = sorted([d for d in os.listdir('.') if os.path.isdir(d)])
    if not cwd_dirs:
        print('No directories found in the current directory.')
        return

    for index, name in enumerate(cwd_dirs):
        print('[%s] %s' % (index, name))
    print('Choose two dirs to compare.')
    answer_a = raw_input('1st dirname:')
    answer_b = raw_input('2nd dirname:')

    dir_a = _pick_dir(answer_a, cwd_dirs)
    dir_b = _pick_dir(answer_b, cwd_dirs)
    # Validate explicitly: an assert would vanish under ``python -O`` and
    # otherwise end the program with a bare traceback.
    for answer, picked in ((answer_a, dir_a), (answer_b, dir_b)):
        if picked is None:
            print('Unknown directory: %s' % answer)
            return

    only_in_a, only_in_b, diff_files = compare_dirs(dir_a, dir_b)

    for d in only_in_a:
        print('Only in [%s]: %s' % (dir_a, d))
    for d in only_in_b:
        print('Only in [%s]: %s' % (dir_b, d))

    for df in diff_files:
        print('diff file: %s' % df)

    identical = not (only_in_a or only_in_b or diff_files)
    if identical:
        print('%s, %s are the same' % (dir_a, dir_b))

if __name__ == '__main__':
    # Run the interactive comparison only when executed as a script,
    # not when this module is imported.
    main()
    # Wait for the user before exiting (presumably so a console window
    # stays open long enough to read the output). raw_input reads a whole
    # line, so it returns only once Enter is pressed.
    raw_input('press any key to continue.')