aichallenge-py / mdp.py
# -*- coding: utf-8 -*-
#
#  mdp.py
#  aichallenge-py
#
#  Created by Lars Yencken on 2011-11-13.
#  Copyright 2011 Lars Yencken. All rights reserved.
#

"""
Solve Markov Decision Processes using value iteration, where the agent is on
a map and can only move in four directions.
"""

import numpy as np

import settings

def value_iteration(reward, torus=False, gamma=settings.MDP_GAMMA,
        eps=settings.MDP_EPS):
    "Solve the MDP problem with the value iteration method."
    value = reward

    if torus:
        propagate = _propagate_torus
    else:
        propagate = _propagate_plane

    last_value = reward
    value = propagate(reward, reward, gamma)
    while np.abs(last_value - value).mean() > eps:
        last_value = value
        value = propagate(value, reward, gamma)

    return value
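
# The value array returned above induces a greedy policy: from any square,
# step to whichever of the four neighbours has the highest value.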

def _propagate_plane(value, reward, gamma):
    rows, cols = value.shape
    shape = (4,) + value.shape
    actions = np.zeros(shape, dtype=np.float32)

    # s: value of the square one row down; the bottom row, with no southern
    # neighbour on the map, keeps the zero it was initialised with
    actions[0, 0:rows-1, :] = value[1:rows]
    # n: value of the square one row up; the top row stays at zero
    actions[1, 1:rows, :] = value[0:rows-1]
    # e: value of the square one column right; the last column stays at zero
    actions[2, :, 0:cols-1] = value[:, 1:cols]
    # w: value of the square one column left; the first column stays at zero
    actions[3, :, 1:cols] = value[:, 0:cols-1]

    # greedy backup: the best neighbouring value for each cell
    best = actions.max(axis=0)

    return reward + gamma * best

def _propagate_torus(value, reward, gamma):
    rows, cols = value.shape

    # each action is an offset on our torus
    shape = (4,) + value.shape
    actions = np.zeros(shape, dtype=np.float32)

    # s: value of the square one row down; the bottom row wraps to the top
    actions[0, 0:rows-1] = value[1:rows]
    actions[0, rows-1] = value[0]
    # n: value of the square one row up; the top row wraps to the bottom
    actions[1, 1:rows] = value[0:rows-1]
    actions[1, 0] = value[rows-1]
    # e: value of the square one column right; the last column wraps to the first
    actions[2, :, 0:cols-1] = value[:, 1:cols]
    actions[2, :, cols-1] = value[:, 0]
    # w: value of the square one column left; the first column wraps to the last
    actions[3, :, 1:cols] = value[:, 0:cols-1]
    actions[3, :, 0] = value[:, cols-1]

    # greedy backup: the best neighbouring value for each cell
    best = actions.max(axis=0)

    return reward + gamma * best
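
if __name__ == "__main__":
    # Minimal smoke test, illustrative only: the gamma and eps used here are
    # arbitrary choices, not the values from settings.  A lone rewarding cell
    # should produce values that decay with distance from it.
    demo_reward = np.zeros((5, 5), dtype=np.float32)
    demo_reward[2, 3] = 1.0
    print(value_iteration(demo_reward, torus=False, gamma=0.9, eps=1e-4))
    print(value_iteration(demo_reward, torus=True, gamma=0.9, eps=1e-4))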