Source

freenet-spider-database / fdb.py

Full commit
#!/usr/bin/env python3

"""A simple flatfile database for freenet spiders."""

import fdb_io as io
from os import makedirs
from os import mkdir, rmdir # for locking
from time import sleep
from os.path import isdir, join
from copy import deepcopy

def get_data_for_key(key,
                     sites_dir="sites",
                     states_dir="states",
                     latest_revision=False):
    """Get the data for one site.

    Loads the dataset that contains ``key`` via ``io.get_dataset_for_key``
    and picks the site and state entries whose ``"path"`` equals the
    subkey computed by ``io.key_to_subkey``.

    @param key: the key to look up.
    @param sites_dir: directory holding the site data; created if missing.
    @param states_dir: directory holding the state data; created if missing.
    @param latest_revision: passed through to ``io.key_to_subkey``.
    @return: tuple ``(site, state)``; each is ``{}`` when no entry matches.
        If several entries share the path, the last one wins.

    >>> io._test_clean_dirs()
    >>> gpl = {'path': 'KSK@gpl.txt', 'description': 'The GNU General Public License. The most widespread license for free software.'}
    >>> edit_data_for_key("KSK@gpl.txt", site=gpl, sites_dir="test_sites", states_dir="test_states")
    >>> gpl == get_data_for_key("KSK@gpl.txt", sites_dir="test_sites", states_dir="test_states")[0]
    True
    """

    # make sure the data folders exist. exist_ok avoids the race where
    # another process creates the folder between a check and the creation.
    makedirs(sites_dir, exist_ok=True)
    makedirs(states_dir, exist_ok=True)

    sites, states = io.get_dataset_for_key(key, sites_dir, states_dir)
    subkey = io.key_to_subkey(key, latest_revision=latest_revision)

    def _last_match(entries):
        # no early exit: a later entry with the same path overrides earlier ones.
        found = {}
        for entry in entries:
            if entry.get("path", None) == subkey:
                found = entry
        return found

    site = _last_match(sites)
    state = _last_match(states)
    return site, state


def _put_data_for_key(key, subkey, site=None, state=None, 
                      sites_dir="sites",
                      states_dir="states",
                      path_from_key=True):
    """Put a site or state or both into the dataset of the key.

    This HAS TO be called in the CRITICAL SECTION of edit_data_for_key()
    to make it multiprocess-safe.

    The first existing entry with a matching path is replaced; if none
    matches, the new entry is appended. Arguments left at None leave the
    respective part of the dataset unchanged.

    @param key: the key whose dataset gets read and written back.
    @param subkey: the path to match against when path_from_key is True.
    @param site: the site dictionary to store, or None.
    @param state: the state dictionary to store, or None.
    @param path_from_key: if True, match entries against subkey; if False,
        match against the 'path' of the given site/state (an entry without
        'path' then matches nothing and is simply appended).
    """
    def _matches(entry, new_entry):
        # does the stored entry occupy the slot of new_entry?
        if path_from_key:
            return entry.get("path", None) == subkey
        return 'path' in new_entry and entry.get("path", None) == new_entry['path']

    def _upsert(entries, new_entry):
        # replace the first matching entry, else append at the end.
        merged = []
        replaced = False
        for entry in entries:
            if not replaced and _matches(entry, new_entry):
                merged.append(new_entry)
                replaced = True
            else:
                merged.append(entry)
        if not replaced:
            merged.append(new_entry)
        return merged

    sites, states = io.get_dataset_for_key(key, sites_dir, states_dir)
    if site is not None:
        sites = _upsert(sites, site)
    if state is not None:
        # the state is stored in the states, independent of the site.
        states = _upsert(states, state)

    io.put_dataset_for_key(key, sites, states, sites_dir, states_dir)
    

def _remove_data_for_key(key, subkey, site=None, state=None, 
                         sites_dir="sites",
                         states_dir="states",
                         path_from_key=True):
    """Remove a site or state or both from the dataset of the key.

    This HAS TO be called in the CRITICAL SECTION of edit_data_for_key()
    to make it multiprocess-safe.

    Set site or state to anything but None to remove the data. With
    path_from_key=False the argument has to be a dictionary; its 'path'
    selects the entries to remove. If it has no 'path', nothing is
    removed (instead of dropping every entry of the dataset).

    @param key: the key whose dataset gets read and written back.
    @param subkey: the path of the entries to remove when path_from_key is True.
    @param site: marker (or dictionary) selecting site removal, or None.
    @param state: marker (or dictionary) selecting state removal, or None.
    @param path_from_key: if True, remove entries whose path equals subkey;
        if False, remove entries whose path equals the 'path' of site/state.
    """
    def _matches(entry, marker):
        # is this stored entry the one which should be removed?
        if path_from_key:
            return entry.get("path", None) == subkey
        return 'path' in marker and entry.get("path", None) == marker['path']

    sites, states = io.get_dataset_for_key(key, sites_dir, states_dir)
    if site is not None:
        sites = [s for s in sites if not _matches(s, site)]
    if state is not None:
        states = [s for s in states if not _matches(s, state)]

    io.put_dataset_for_key(key, sites, states, sites_dir, states_dir)
    

def edit_data_for_key(key, site=None, state=None, action="put", 
                      sites_dir="sites",
                      states_dir="states",
                      latest_revision=False,
                      path_from_key=True):
    """Put a site or state or both into the dataset, or remove them from it.

    The change is done while holding a lock directory (sites_dir/.lock),
    so several processes can edit the same data folders.

    @param key: the key of the site to edit.
    @param site: the site dictionary to put or remove, or None to leave the sites alone.
    @param state: the state dictionary to put or remove, or None to leave the states alone.
    @param action: "put" or "remove"; anything else prints "Unknown action".
    @param sites_dir: directory holding the site data; created if missing.
    @param states_dir: directory holding the state data; created if missing.
    @param latest_revision: passed through to io.key_to_subkey.
    @param path_from_key: Should the path be computed from the key or taken from the site/state dictionary?

    If path_from_key is True and the path in site/state is missing or
    differs from the key, a warning is printed and a copy with the
    adjusted path is stored; the dictionaries of the caller stay untouched.

    TODO: Refactor: the site doesn’t need to contain the path. The path should come from the key alone.

    >>> io._test_clean_dirs()
    >>> gpl = {'path': 'KSK@gpl.txt', 'description': 'The GNU General Public License. The most widespread license for free software.'}
    >>> edit_data_for_key("KSK@gpl.txt", site=gpl, sites_dir="test_sites", states_dir="test_states")
    >>> gpl == get_data_for_key("KSK@gpl.txt", sites_dir="test_sites", states_dir="test_states")[0]
    True
    >>> rogar = {'path': 'KSK@rogar.txt', 'description': 'Rogar, the barbarian.'}
    >>> edit_data_for_key(rogar["path"], site=rogar, sites_dir="test_sites", states_dir="test_states")
    >>> sites, states = io.get_dataset_for_key("KSK@gpl.txt", sites_dir="test_sites", states_dir="test_states")
    >>> list(sites)
    [{'path': 'KSK@gpl.txt', 'description': 'The GNU General Public License. The most widespread license for free software.'}, {'path': 'KSK@rogar.txt', 'description': 'Rogar, the barbarian.'}]
    >>> list(states)
    []

    >>> infinite_hands = {'path': '/Infinite_Hands/2/', 'description': 'singing a part of the history of free software.', 'randomint': 5}
    >>> inf_subkey = io.key_to_subkey(infinite_hands['path'])
    >>> usk = "USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE"
    >>> edit_data_for_key(usk + infinite_hands["path"], site=infinite_hands, sites_dir="test_sites", states_dir="test_states")
    >>> get_data_for_key(usk + infinite_hands['path'], sites_dir="test_sites", states_dir="test_states")
    ({'path': '/Infinite_Hands/2/', 'description': 'singing a part of the history of free software.', 'randomint': 5}, {})
    >>> sites, states = io.get_dataset_for_key(usk, sites_dir="test_sites", states_dir="test_states")
    >>> list(sites)
    [{'path': '/Infinite_Hands/2/', 'description': 'singing a part of the history of free software.', 'randomint': 5}]

    >>> inf_subkey = io.key_to_subkey(infinite_hands['path'], latest_revision=True)
    >>> inf_subkey
    '/Infinite_Hands/-1/'
    >>> usk = "USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE"
    >>> edit_data_for_key(usk + inf_subkey, site=infinite_hands, sites_dir="test_sites", states_dir="test_states")
    Warning: the path in the site and the key don’t match. Adjusting the path.
    >>> get_data_for_key(usk + inf_subkey, sites_dir="test_sites", states_dir="test_states")
    ({'path': '/Infinite_Hands/-1/', 'description': 'singing a part of the history of free software.', 'randomint': 5}, {})

    >>> sites, states = io.get_dataset_for_key(usk, sites_dir="test_sites", states_dir="test_states")
    >>> list(sites)
    [{'path': '/Infinite_Hands/2/', 'description': 'singing a part of the history of free software.', 'randomint': 5}, {'path': '/Infinite_Hands/-1/', 'description': 'singing a part of the history of free software.', 'randomint': 5}]
    >>> edit_data_for_key("USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE" + inf_subkey, site=infinite_hands, action="remove", sites_dir="test_sites", states_dir="test_states")
    Warning: the path in the site and the key don’t match. Adjusting the path.
    >>> get_data_for_key("USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE" + inf_subkey, sites_dir="test_sites", states_dir="test_states")
    ({}, {})
    >>> edit_data_for_key(usk + infinite_hands['path'], site=infinite_hands, sites_dir="test_sites", states_dir="test_states", latest_revision=True)
    Warning: the path in the site and the key don’t match. Adjusting the path.
    >>> get_data_for_key("USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE" + inf_subkey, sites_dir="test_sites", states_dir="test_states")
    ({'path': '/Infinite_Hands/-1/', 'description': 'singing a part of the history of free software.', 'randomint': 5}, {})
    >>> get_data_for_key("USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE" + infinite_hands['path'], sites_dir="test_sites", states_dir="test_states", latest_revision=True)
    ({'path': '/Infinite_Hands/-1/', 'description': 'singing a part of the history of free software.', 'randomint': 5}, {})
    >>> edit_data_for_key("USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE" + infinite_hands['path'], site=infinite_hands, action="remove", sites_dir="test_sites", states_dir="test_states", latest_revision=True)
    Warning: the path in the site and the key don’t match. Adjusting the path.
    >>> get_data_for_key("USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE" + inf_subkey, sites_dir="test_sites", states_dir="test_states")
    ({}, {})
    >>> edit_data_for_key("USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE" + infinite_hands['path'], site={'description': 'singing a part of the history of free software.'}, sites_dir="test_sites", states_dir="test_states", latest_revision=True)
    Warning: the path in the site and the key don’t match. Adjusting the path.
    >>> get_data_for_key("USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE" + inf_subkey, sites_dir="test_sites", states_dir="test_states")
    ({'path': '/Infinite_Hands/-1/', 'description': 'singing a part of the history of free software.'}, {})
    >>> edit_data_for_key("USK@N7dmVKbxm5Q9YIzg74T1gUJd96eyAa2VLoMlPp0CQQs,AfZRxhqkvql5E~hDcW1s0mtHTAKrkhWpfcRm2kqkUjE,AQACAAE" + infinite_hands['path'], site={'description': 'singing a part of the history of free software.'}, sites_dir="test_sites", states_dir="test_states", latest_revision=True, path_from_key=False)
    >>> sites, states = io.get_dataset_for_key(usk, sites_dir="test_sites", states_dir="test_states")
    >>> list(sites)
    [{'path': '/Infinite_Hands/2/', 'description': 'singing a part of the history of free software.', 'randomint': 5}, {'path': '/Infinite_Hands/-1/', 'description': 'singing a part of the history of free software.'}, {'description': 'singing a part of the history of free software.'}]

    """
    # make sure the data folders exist. exist_ok avoids failing when
    # another process creates the folder at the same time.
    makedirs(sites_dir, exist_ok=True)
    makedirs(states_dir, exist_ok=True)

    # prepare local stuff for the critical section.
    subkey = io.key_to_subkey(key, latest_revision=latest_revision)
    if path_from_key:
        if site is not None and ('path' not in site or site['path'] != subkey):
            print("Warning: the path in the site and the key don’t match. Adjusting the path.")
            # work on a copy, so the dictionary of the caller is not changed.
            site = deepcopy(site)
            site['path'] = subkey
        if state is not None and ('path' not in state or state['path'] != subkey):
            print("Warning: the path in the state and the key don’t match. Adjusting the path.")
            # same for the state: copy it before adjusting the path.
            state = deepcopy(state)
            state['path'] = subkey

    # to make sure that we can work in multiple processes
    # we use a lock directory: sites_dir/.lock/
    # mkdir is atomic and fails atomically. 
    lockdir = join(sites_dir, ".lock")
    while True:
        try:
            mkdir(lockdir)
            break
        except OSError:
            # someone else holds the lock: retry shortly.
            sleep(0.01)
    #### critical section: insert the data and writeout the dataset ####
    try:
        if action == "put": 
            _put_data_for_key(key, subkey, site=site, state=state, sites_dir=sites_dir, states_dir=states_dir, path_from_key=path_from_key)
        elif action == "remove":
            _remove_data_for_key(key, subkey, site=site, state=state, sites_dir=sites_dir, states_dir=states_dir, path_from_key=path_from_key)
        else:
            print("Unknown action")
    except IOError:
        print("Couldn’t store the updated data. Dataset discarded.")
    finally:
        # always release the lock, even on unexpected errors; otherwise
        # every later call would wait forever for the stale lock.
        rmdir(lockdir)
    #### /critical section ####

def _test():
    from doctest import testmod
    True
    testmod()

if __name__ == "__main__":
    import sys
    # run the doctests when called as script with --test.
    if "--test" in sys.argv:
        _test()
        # sys.exit() instead of the exit() helper of the site module:
        # exit() is meant for the interactive shell and is missing when
        # python runs with -S.
        sys.exit()