Source

scripts / sort_files.py

Full commit
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# PYTHON_ARGCOMPLETE_OK
"""
Organize media files by date
----------------------------

Requires `exiftool` binary.

:copyright:  (c) Andrey Mikhaylenko, 2012--2013
:license:  LGPL3
:disclaimer:  ABSOLUTELY NO WARRANTY!  USE AT YOUR OWN RISK.

"""

from collections import OrderedDict
import datetime
import filecmp
import os
import re
import subprocess

import argh
import blessings


# Shared terminal helper; its colour methods (red, green, yellow) are used to
# format the messages printed by the functions below.
t = blessings.Terminal()


# Status codes returned by move_image() and tallied by move_images().
MOVED = 1
SKIPPED = 2
REMOVED = 3
ERROR = 4

# Default value of the `extensions` option: a comma-separated list of file
# extensions to process.  Extensions are compared case-sensitively, hence both
# spellings are listed.
DEFAULT_EXTENSIONS = '.jpg, .jpeg, .JPG, .JPEG'

# Matches a directory path that ends with a `YYYY/MM/DD` hierarchy, optionally
# followed by a digit-free suffix or by one more digit-free path component
# (e.g. `2013/05/17`, `2013/05/17-trip`, `2013/05/17/trip`).
DATE_HIERARCHY_PATTERN = re.compile(r'\d{4}/\d{2}/\d{2}([^0-9/]+|/[^0-9/]+)?$')


# Metadata fields (keys of the mapping built by get_meta()) that may hold the
# date of a file, in order of preference: the first that's found wins.
DATE_FIELDS = (
    'Date/Time Original',
    'Create Date',
    'Creation Date',
    'Media Create Date',
    'Track Create Date',
    'File Modification Date/Time',
)


def parse_date(value):
    """ Extracts the calendar date from a metadata date/time string.

    Only the part before the first space is looked at and it is expected to
    have the form ``YYYY:MM:DD``; the time of day (and anything else after the
    space) is ignored.

    :param value:
        A string such as ``'2013:05:17 12:34:56'``.
    :returns:
        A `datetime.date`, or `None` if the value does not contain a valid
        date (e.g. ``'0000:00:00 00:00:00'``), so that the caller can fall
        back to another field instead of crashing.

    """
    # rough; but we don't need precise date/time here
    raw_date, _, _ = value.partition(' ')
    try:
        return datetime.datetime.strptime(raw_date, '%Y:%m:%d').date()
    except ValueError:
        # malformed or out-of-range date: report "no date" rather than abort
        return None


def get_meta(file_path):
    """ Runs `exiftool` on the given file and returns its metadata.

    Requires the `exiftool` binary.  Metadata is returned even for
    non-EXIF-friendly files because basic file attributes are merged in.

    :param file_path:
        Path of the file to inspect.
    :returns:
        An `OrderedDict` mapping tag names to their values (both stripped
        strings), ordered by tag name.  Each non-blank output line is split
        at its first colon.
    :raises RuntimeError:
        If the tool produced no output.

    """
    process = subprocess.Popen(['exiftool', file_path], stdout=subprocess.PIPE)
    output, errors = process.communicate()

    # NOTE(review): stderr is not piped, so `errors` is None here and this
    # check cannot fire as written.
    if errors:
        raise RuntimeError('EXIF: {0}: {1}'.format(file_path, errors))

    if not output:
        raise RuntimeError('EXIF: {0}: no meta data found'.format(file_path))

    pairs = []
    for raw_line in output.decode(errors='replace').split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # "Tag Name   : value" -> ("Tag Name", "value")
        key, _, value = line.partition(':')
        pairs.append((key.strip(), value.strip()))

    pairs.sort(key=lambda pair: pair[0])
    return OrderedDict(pairs)


def get_file_datetime(path):
    """ Returns the date of the given file according to its metadata.

    The fields listed in `DATE_FIELDS` are tried in order; the first one that
    is present, non-empty and yields a date wins.

    :param path:
        Path of the file to inspect.
    :returns:
        The date produced by `parse_date()` for the winning field.
    :raises KeyError:
        If none of the known fields yields a date.  The message lists the
        tags that were found.
    :raises AssertionError:
        If the date found is not later than year 2000.

    """
    metadata = get_meta(path)

    for field in DATE_FIELDS:
        value = metadata.get(field)
        # missing or empty fields are passed over, and so are values for
        # which no date is obtained
        date = parse_date(value) if value else None
        if date:
            assert date.year > 2000, 'a reasonable date would be after year ~2000'
            return date

    message = (u'{0} has no date/time EXIF data. '
               u'Tags: {1}').format(path, sorted(metadata))
    raise KeyError(message)


def move_image(src, dest_dir, dry_run=False, verbosity=0,
               delete_dupes=False, movesorted=False):
    """ Moves the given file into a directory of the form year/month/day.
    The date is extracted from the file's metadata.

    Rough outline::

        $src → $dest_dir/$year/$month/$day/$name

    (where $name is the file name extracted from $src)

    * extract the date from the metadata
      * a file without a usable date is reported and counted as an error
    * check whether a file with this name is already at the destination
      * if it is, check whether the contents are identical
        * if identical, remove the source (only with `delete_dupes`),
          otherwise skip it
        * if different, touch nothing and complain about the name conflict
      * if it is not, create the directories (year, month, day) if they do
        not exist and move the file there

    :param src:
        Path of the file to move.
    :param dest_dir:
        Root directory under which the date hierarchy is created.
    :param dry_run:
        Report what would be done without touching the file system.
    :param verbosity:
        0 prints only warnings and errors, 1 also prints actions and skips,
        2 also prints per-file details.
    :param delete_dupes:
        Remove `src` if an identical file already exists at the destination.
    :param movesorted:
        Move the file even if it already sits in a date hierarchy that
        differs from the date found in its metadata.
    :returns:
        One of `MOVED`, `SKIPPED`, `REMOVED`, `ERROR`.

    """

    if verbosity >= 2:
        print()
        print(src)

    name = os.path.basename(src)
    if name.startswith('.'):
        print(t.red('WARNING: including hidden file {0}'.format(src)))

    # extract the date
    try:
        dt = get_file_datetime(src)
    except KeyError as e:
        # Python 3 exceptions have no `.message`; the text is the first arg
        message = e.args[0] if e.args else str(e)
        print('  ERROR:', t.red(message))
        return ERROR

    # year/month/day/ directory; it is created only if the file gets moved
    date_dirs = u'{0.year}/{0.month:02d}/{0.day:02d}'.format(dt)
    inner_dest_dir = os.path.join(dest_dir, date_dirs)

    dest = os.path.join(inner_dest_dir, name)
    if verbosity >= 2:
        print('  trying', src, '→', dest)

    if os.path.abspath(src) == os.path.abspath(dest):
        if verbosity >= 2:
            print(t.yellow('  SKIP: source equals destination for {0}'.format(src)))
        return SKIPPED

    # a file that already lives in a date hierarchy is left alone if that
    # hierarchy disagrees with the date from the metadata
    src_dir, _ = os.path.split(src)
    pre_sort_match = DATE_HIERARCHY_PATTERN.search(src_dir)
    if not movesorted and pre_sort_match and pre_sort_match.group() not in dest:
        if verbosity >= 1:
            print(t.yellow('  SKIP: file is within a hierarchy but under '
                           'a different date: {0} vs {1}'.format(src, dest)))
        return SKIPPED

    # make sure the file does not exist there
    if os.path.exists(dest):
        if verbosity >= 2:
            print(t.yellow('  the file is already there'))

        if not filecmp.cmp(src, dest):
            print(t.red('  ERROR: files differ, name conflict:\n'
                        '    {src}\n'
                        '    {dest}'.format(src=src, dest=dest)))
            return ERROR

        if verbosity >= 2:
            print(t.green('  files are identical'))
        if not delete_dupes:
            return SKIPPED
        if not dry_run:
            if verbosity >= 1:
                print(t.green('  removing {0}'.format(src)))
            os.remove(src)
        return REMOVED

    # the file is really going to be moved: only now create the directories,
    # so that skipped or conflicting files leave no empty directories behind
    if not dry_run and not os.path.exists(inner_dest_dir):
        if verbosity >= 1:
            print('  mkdir -p', inner_dest_dir)
        os.makedirs(inner_dest_dir)

    if verbosity >= 1:
        print(t.green('  moving "{0}" → "{1}"'.format(src, dest)))
    if not dry_run:
        os.rename(src, dest)
    return MOVED


def _split_csv(value):
    """ Splits a comma-separated string into stripped, non-empty items. """
    return [x.strip() for x in value.split(',') if x.strip()]


@argh.arg('-v', '--verbosity', action='count')
def move_images(src_dir, dest_dir, dry_run=False, verbosity=0,
                extensions=DEFAULT_EXTENSIONS, delete_dupes=False,
                recursive=False,
                checksorted:'check even files which seem to be already sorted'=False,
                movesorted:'move even files which seem to be already sorted'=False,
                include:'comma-separated list of patterns to include (e.g. "/screen_,/img_")'='',
                exclude:'comma-separated list of patterns to ignore'=''):
    """ Moves images from src_dir/foo.jpg to dest_dir/year/month/day/foo.jpg.
    The dates are extracted from EXIF.

    :param src_dir:
        Directory to take the files from.
    :param dest_dir:
        Root directory of the year/month/day hierarchy.
    :param dry_run:
        Report what would be done without touching the file system.
    :param verbosity:
        Level of detail of the output (number of `-v` flags).
    :param extensions:
        Comma-separated list of file extensions to process; the comparison
        is case-sensitive.
    :param delete_dupes:
        Removes previously copied files instead of skipping them.
    :param recursive:
        Descend into subdirectories of `src_dir`.

    Include/exclude patterns are plain substrings matched against the whole
    path.  A summary (timing and per-status counters) is printed at the end.

    """
    time_start = datetime.datetime.now()

    allowed_extensions = _split_csv(extensions)
    patterns_to_include = _split_csv(include)
    patterns_to_ignore = _split_csv(exclude)

    # The list of candidates is built eagerly so that nothing done inside the
    # loop below can affect which paths are visited.
    if recursive:
        paths = sorted(os.path.join(root, fname)
                       for root, _dirs, fnames in os.walk(src_dir)
                       for fname in fnames)
    else:
        paths = [os.path.join(src_dir, x) for x in sorted(os.listdir(src_dir))]

    statuses = {}
    for path in paths:
        if patterns_to_include and not any(
                pattern in path for pattern in patterns_to_include):
            continue

        if any(pattern in path for pattern in patterns_to_ignore):
            continue

        _, ext = os.path.splitext(path)
        if ext not in allowed_extensions:
            continue

        if not os.path.isfile(path):
            continue

        # files that already sit in a YYYY/MM/DD directory are considered
        # sorted and are only counted, unless `checksorted` is given
        parent_dir = os.path.dirname(path)
        if not checksorted and DATE_HIERARCHY_PATTERN.search(parent_dir):
            if verbosity >= 2:
                print(t.yellow('  file is already within a date hierarchy: '
                               '{0}'.format(path)))
            statuses[SKIPPED] = statuses.get(SKIPPED, 0) + 1
            continue

        status = move_image(path, dest_dir,
                            dry_run=dry_run, verbosity=verbosity,
                            delete_dupes=delete_dupes, movesorted=movesorted)
        statuses[status] = statuses.get(status, 0) + 1

    time_end = datetime.datetime.now()
    time_delta = time_end - time_start
    total_files = sum(statuses.values())
    if total_files:
        print('Done in {0} ({1:.02f} sec per file)'.format(time_delta,
                time_delta.total_seconds() / total_files))

    # label, status code, colour of the summary line
    report_matrix = (
        ('moved',   MOVED,   t.green),
        ('errors',  ERROR,   t.red),
        ('skipped', SKIPPED, t.yellow),
        ('removed', REMOVED, t.yellow),
    )

    # only the statuses that actually occurred are reported
    for name, code, colour in report_matrix:
        cnt = statuses.get(code, 0)
        if cnt:
            print(colour('{name}: {cnt}'.format(name=name, cnt=cnt)))

# When run as a script, hand move_images() to argh as the single command.
if __name__ == '__main__':
    argh.dispatch_command(move_images)