Commits

Christoph Dann committed 10d807f

PST for policy evaluation

  • Parent commits 8f77b2e
  • Branches poleval

Comments (0)

Files changed (4)

File Domains/Domain.py

 import Tools
 from copy import copy
 from Tools.progressbar import ProgressBar
-
+from collections import Counter
 __copyright__ = "Copyright 2013, RLPy http://www.acl.mit.edu/RLPy"
 __credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann",
                "William Dabney", "Jonathan P. How"]
 
         return np.array(next_states), np.array(rewards)
 
+
+class BigDiscreteDomain(object):
+
+    @property
+    def num_dim(self):
+        return self.statespace_limits.shape[0]
+
+    batch_V_sampling = 1
+
+    """
+    def num_V_batches(self, policy, num_traj,  discretization):
+        xtest, stationary_dis = self.test_states(policy, discretization=discretization, num_traj=num_traj)
+        return int(np.ceil(float(xtest.shape[1]) / self.batch_V_sampling))
+
+    def precompute_V_batch_cache(self, discretization, policy, max_len_traj, num_traj, batch):
+        xtest, stationary_dis = self.test_states(policy, discretization=discretization, num_traj=num_traj)
+        rng = slice(batch * self.batch_V_sampling, min((batch + 1) * self.batch_V_sampling, xtest.shape[1]))
+        print "Compute batch", batch, "with indices", rng
+        # no need to return stuff. should be cached anyway
+        self.sample_V_batch(xtest[:,rng], policy, num_traj, max_len_traj)
+
+    """
+
+    def V(self, policy, discretization=None, max_len_traj=200, num_traj=1000, num_traj_stationary=1000):
+        stat_dis = self.stationary_distribution(policy, num_traj=num_traj_stationary)
+        #xtest, stationary_dis = self.test_states(policy, discretization=discretization, num_traj=num_traj_stationary)
+        stationary_dis = np.zeros(len(stat_dis), dtype="int")
+        R = np.zeros(len(stat_dis))
+        xtest = np.zeros((self.num_dim, len(stat_dis)), dtype="int64")
+        xtest_term = np.zeros(xtest.shape[1], dtype="bool")
+        if self.batch_V_sampling == 1:
+            with ProgressBar() as p:
+                i = 0
+                for k, count in stat_dis.iteritems():
+                    xtest[:,i] = np.array(k)
+                    stationary_dis[i] = count
+                    p.update(i, xtest.shape[1], "Sample V")
+                    if self.isTerminal(xtest[:,i]):
+                        R[i] = 0.
+                        xtest_term[i] = True
+                    else:
+                        R[i] = self.sample_V(xtest[:,i], policy, num_traj, max_len_traj)
+                    i += 1
+        else:
+            # batch mode to be able to do caching with smaller batches and
+            # precompute V in parallel with cluster jobs
+            i = 0
+            for k, count in stat_dis.iteritems():
+                xtest[:,i] = np.array(k)
+                stationary_dis[i] = count
+                i += 1
+            for i in xrange(0, xtest.shape[1], self.batch_V_sampling):
+                rng = slice(i, min(i + self.batch_V_sampling, xtest.shape[1]))
+
+                R[rng], xtest_term[rng] = self.sample_V_batch(xtest[:,rng], policy, num_traj, max_len_traj)
+        return R, xtest, xtest_term, stationary_dis
+    """
+    def sample_V_batch(self, xtest, policy, num_traj, max_len_traj):
+        R = np.zeros(xtest.shape[1])
+        xtest_term = np.zeros(xtest.shape[1], dtype="bool")
+        with ProgressBar() as p:
+            for i in xrange(xtest.shape[1]):
+                p.update(i, xtest.shape[1], "Sample V batch")
+                if self.isTerminal(xtest[:,i]):
+                    R[i] = 0.
+                    xtest_term[i] = True
+                else:
+                    R[i] = self.sample_V(xtest[:,i], policy, num_traj, max_len_traj)
+        return R, xtest_term
+    """
+    def sample_V(self, s, policy, num_traj, max_len_traj):
+        ret = 0.
+        for i in xrange(num_traj):
+            _, _, rt = self.sample_trajectory(s, policy, max_len_traj)
+            ret += np.sum(rt * np.power(self.gamma, np.arange(len(rt))))
+        return ret / num_traj
+
+    def stationary_distribution(self, policy, num_traj=1000):
+        counts = Counter()
+
+        for i in xrange(num_traj):
+            self.s0()
+            s, _, _ = self.sample_trajectory(self.state, policy, self.episodeCap)
+            for j in xrange(s.shape[0]):
+                counts[tuple(s[j,:])] += 1
+        return counts
+
+
+
 class ContinuousDomain(object):
 
     @property

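The new BigDiscreteDomain mixin estimates V(s) by plain Monte Carlo: sample_V rolls out num_traj trajectories from s under the policy and averages their discounted returns, while stationary_distribution counts how often each state is visited so the test states can be weighted by an empirical stationary distribution. A minimal standalone sketch of the discounted-return averaging (illustration only, not part of the commit; sample_trajectory is assumed to return the reward sequence of one rollout):

    import numpy as np

    def mc_value_estimate(sample_trajectory, s, policy, gamma, num_traj, max_len):
        # Average the discounted return over num_traj rollouts from state s,
        # mirroring BigDiscreteDomain.sample_V above.
        total = 0.
        for _ in range(num_traj):
            _, _, rewards = sample_trajectory(s, policy, max_len)
            total += np.sum(rewards * np.power(gamma, np.arange(len(rewards))))
        return total / num_traj
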
File Domains/PST.py

 """Persistent search and track mission domain."""
 
 from Tools import *
-from Domain import Domain
-
+from Domain import Domain, BigDiscreteDomain
+import copy
 __copyright__ = "Copyright 2013, RLPy http://www.acl.mit.edu/RLPy"
 __credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann",
             "William Dabney", "Jonathan P. How"]
 __author__ = ["Robert H. Klein", "Alborz Geramifard"]
 
 
-class PST(Domain):
+class PST(Domain, BigDiscreteDomain):
     """
     Persistent Search and Track Mission with multiple Unmanned Aerial Vehicle
     (UAV) agents.
     a penalty of -50 if any UAV crashes, and always a small penalty for burned fuel.
 
     **REFERENCE:**
-    
+
     .. seealso::
         J. D. Redding, T. Toksoz, N. Ure, A. Geramifard, J. P. How, M. Vavrina,
         and J. Vian. Distributed Multi-Agent Persistent Surveillance and
 
     # Domain variables
     motionNoise         = 0    #: Noise in action (with some probability, loiter rather than move)
-    numHealthySurveil   = 0    # Number of UAVs in surveillance area with working sensor and actuator [n_s]
-    fuelUnitsBurned     = 0
     LIMITS              = []   # Limits on action indices
-    isCommStatesCovered = False # All comms states are covered on a given timestep, enabling surveillance rewards
 
     # Plotting constants
     UAV_RADIUS = 0.3
     RECT_GAP            = 0.9   # Gap to leave between rectangles
     dist_between_locations = 0 # Total distance between location rectangles in plot, computed in init()
 
+    possible_action_cache = {}
     ###
     def __init__(self, NUM_UAV = 3, motionNoise = 0, logger = None):
         """
         self.statespace_limits      = vstack([locations_lim, fuel_lim, actuator_lim, sensor_lim])# Limits of each dimension of the state space. Each row corresponds to one dimension and has two elements [min, max]
         self.motionNoise            = motionNoise # with some noise, when uav desires to transition to new state, remains where it is (loiter)
         self.LIMITS                 = UAVAction.SIZE * ones(NUM_UAV, dtype='int') # eg [3,3,3,3], number of possible actions
-        self.isCommStatesCovered    = False # Don't have communications available yet, so no surveillance reward allowed.
         self.location_rect_vis      = None #
         self.location_coord         = None
         self.uav_circ_vis           = None
         [self.DimNames.append('UAV%d-sen' % i) for i in arange(NUM_UAV)]
         super(PST,self).__init__(logger)
         if self.logger: self.logger.log("NUM_UAV:\t\t%d" % self.NUM_UAV)
+        def gk(key):
+            key["policy"] = (key["policy"].__class__.__name__,
+                            copy.copy(key["policy"].__getstate__()))
+            if "domain" in key["policy"][1]:
+                del key["policy"][1]["domain"]
+            if "random_state" in key["policy"][1]:
+                del key["policy"][1]["random_state"]
+
+            key["self"] = copy.copy(key["self"].__getstate__())
+            lst = [
+                "random_state", "V", "domain_fig", "logger", "DirNames", "state",
+                "transition_samples", "_all_states", "chain_model", "policy",
+                "possible_action_cache", "location_rect_vis", "location_coord",
+                "uav_circ_vis", "uav_text_vis", "uav_sensor_vis",
+                "uav_actuator_vis", "comms_line", "DimNames", "stationary_distribution"]
+            for l in lst:
+                if l in key["self"]:
+                    del key["self"][l]
+            return key
+
+        self.V = memory.cache(self.V, get_key=gk)
+        self.stationary_distribution = memory.cache(self.stationary_distribution, get_key=gk)
+        self.s0()
 
     def showDomain(self, a=0):
         s = self.state
         sStruct = self.state2Struct(self.state)
         nsStruct = self.state2Struct(ns)
         # Subtract 1 below to give -1,0,1, easily sum actions
-        actionVector = array(id2vec(a,self.LIMITS)) # returns list of form [0,1,0,2] corresponding to action of each uav
+        actionVector = np.array(id2vec(a,self.LIMITS)) # returns list of form [0,1,0,2] corresponding to action of each uav
         nsStruct.locations += (actionVector-1)
 
         #TODO - incorporate cost graph as in matlab.
         fuelBurnedBool = [(actionVector[i] == UAVAction.LOITER and (nsStruct.locations[i] == UAVLocation.REFUEL or nsStruct.locations[i] == UAVLocation.BASE)) for i in arange(self.NUM_UAV)]
         fuelBurnedBool = array(fuelBurnedBool) == False
         nsStruct.fuel = array([sStruct.fuel[i] - self.NOM_FUEL_BURN * fuelBurnedBool[i] for i in arange(self.NUM_UAV)]) # if fuel
-        self.fuelUnitsBurned = sum(fuelBurnedBool)
+        fuelUnitsBurned = sum(fuelBurnedBool)
         distanceTraveled = sum(logical_and(nsStruct.locations, sStruct.locations))
 
         # Actuator failure transition
         nsStruct.sensor[baseIndices] = SensorState.RUNNING
 
         # Test if have communication
-        self.isCommStatesCovered = any(sStruct.locations == UAVLocation.COMMS)
+        isCommStatesCovered = any(sStruct.locations == UAVLocation.COMMS)
 
         surveillanceBool = (sStruct.locations == UAVLocation.SURVEIL)
-        self.numHealthySurveil = sum(logical_and(surveillanceBool, sStruct.sensor))
+        numHealthySurveil = sum(logical_and(surveillanceBool, sStruct.sensor))
 
         totalStepReward = 0
 
         self.state = ns.copy()
 
         ##### Compute reward #####
-        if self.isCommStatesCovered:
-            totalStepReward += self.SURVEIL_REWARD * min(self.NUM_TARGET, self.numHealthySurveil)
+        if isCommStatesCovered:
+            totalStepReward += self.SURVEIL_REWARD * min(self.NUM_TARGET, numHealthySurveil)
         if self.isTerminal(): totalStepReward += self.CRASH_REWARD
-        totalStepReward += self.FUEL_BURN_REWARD_COEFF * self.fuelUnitsBurned + self.MOVE_REWARD_COEFF * distanceTraveled # Presently movement penalty is set to 0
+        totalStepReward += self.FUEL_BURN_REWARD_COEFF * fuelUnitsBurned + self.MOVE_REWARD_COEFF * distanceTraveled # Presently movement penalty is set to 0
         return totalStepReward,ns,self.isTerminal(), self.possibleActions()
 
     def s0(self):
 
     def possibleActions(self):
         s = self.state
+        pp = self.possible_action_cache.get(tuple(s), None)
+        if pp is not None:
+            return pp
+
         # return the id of possible actions
-        # find empty blocks (nothing on top)
         validActions = [] # Contains a list of uav_actions lists, e.g. [[0,1,2],[0,1],[1,2]] with the index corresponding to a uav.
         # First, enumerate the possible actions for each uav
         sStruct = self.state2Struct(s)
                 validActions.append([uav_actions])
             else:
                 validActions.append(uav_actions)
-        return array(self.vecList2id(validActions, UAVAction.SIZE)) # TODO place this in tools
+        pa = np.array(self.vecList2id(validActions, UAVAction.SIZE))
+        self.possible_action_cache[tuple(s)] = pa
+        return pa
 
     # TODO place this in Tools
     def vecList2id(self,x,maxValue):
             else:
                 self.vecList2idHelper(x,actionIDs,ind+1,partialActionAssignment, maxValue,limits) # TODO remove self
 
-    def isTerminal(self):
-        sStruct = self.state2Struct(self.state)
+    def isTerminal(self, s=None):
+        if s is None:
+            s = self.state
+        sStruct = self.state2Struct(s)
         return any(logical_and(sStruct.fuel <= 0, sStruct.locations != UAVLocation.REFUEL))
 
 class UAVLocation:

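The gk helper above builds the cache key for memory.cache by replacing the policy with its class name plus a cleaned __getstate__ dict and by stripping volatile attributes (logger, figures, random state, caches) from the domain, so repeated calls with an equivalent configuration reuse the cached V and stationary_distribution results. A rough sketch of that idea (an assumption about how a get_key-based caching layer could behave, not the actual Tools implementation):

    import pickle

    def cache_with_key(fn, get_key):
        # Memoize fn, deriving the cache key only from the stable, picklable
        # parts of its keyword arguments (as gk does for V above).
        store = {}
        def wrapper(**kwargs):
            key = pickle.dumps(get_key(dict(kwargs)), protocol=2)
            if key not in store:
                store[key] = fn(**kwargs)
            return store[key]
        return wrapper
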
File Tools/GeneralTools.py

     lim_prod = cumprod(limits[:-1])
     return x[0] + sum(map(lambda (x,y):x*y,zip(x[1:],lim_prod)))
 def vec2id(x,limits):
-    # returns a unique id by calculating the enumerated number corresponding to a vector given the number of bins in each dimension
-    # I use a recursive calculation to save time by looping once backward on the array = O(n)
-    # for example:
-    # vec2id([0,1],[5,10]) = 5
+    """returns a unique id by calculating the enumerated number corresponding
+    to a vector given the number of bins in each dimension
+    I use a recursive calculation to save time by looping once backward on the array = O(n)
+    for example:
+    >>> vec2id([0,1],[5,10]) = 5
+
+    """
     if isinstance(x,int): return x
     _id = 0
     for d in arange(len(x)-1,-1,-1):
         _id += x[d]
 
     return _id
+
 ######################################################
 def id2vec(_id,limits):
     #returns the vector corresponding to an id given the number of buckets in each dimension (invers of vec2id)

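For reference, the enumeration in vec2id treats index 0 as the fastest-varying dimension, so with limits [5, 10] the id of [x0, x1] is x0 + 5*x1, and id2vec inverts the mapping. A small illustration (not part of the commit; assumes both helpers are importable from Tools.GeneralTools):

    from Tools.GeneralTools import vec2id, id2vec

    assert vec2id([0, 1], [5, 10]) == 5          # the docstring example
    assert list(id2vec(5, [5, 10])) == [0, 1]    # id2vec is the inverse mapping
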
File examples/uav/poleval/ifddk.py

+from Tools import Logger
+from ValueEstimators.TDLearning import TDLearning
+from Representations import *
+from Domains import PST
+from Tools import __rlpy_location__
+from Policies.FixedPolicy import GoodCartPoleSwingupPolicy
+from Experiments.PolicyEvaluationExperiment import PolicyEvaluationExperiment
+from Policies import StoredPolicy
+import numpy as np
+from hyperopt import hp
+
+param_space = {
+               'discover_threshold': hp.loguniform("discover_threshold",
+                   np.log(1e0), np.log(1e4)),
+               'lambda_': hp.uniform("lambda_", 0., 1.),
+               'kappa': hp.loguniform("kappa", np.log(1e-10), np.log(1e-3)),
+               'boyan_N0': hp.loguniform("boyan_N0", np.log(1e1), np.log(1e5)),
+               'initial_alpha': hp.loguniform("initial_alpha", np.log(1e-2), np.log(1))}
+
+
+def make_experiment(id=1, path="./Results/Temp/{domain}/poleval/ifdd/",
+                    discover_threshold=100., #0.42878655,
+                    lambda_=0.701309,
+                    kappa=1e-7,
+                    boyan_N0=1375.098,
+                    initial_alpha=0.016329):
+    logger = Logger()
+    max_steps = 500000
+    sparsify = 1
+    domain = PST(NUM_UAV=4, motionNoise=0, logger=logger)
+    pol = StoredPolicy(filename=__rlpy_location__+ "/Policies/PST_4UAV_mediocre_policy_nocache.pck")
+
+    initial_rep = IndependentDiscretization(domain, logger)
+    representation = iFDDK(domain, logger, discover_threshold, initial_rep,
+                           sparsify=sparsify, lambda_=lambda_,
+                           useCache=True, lazy=True, kappa=kappa)
+    estimator = TDLearning(representation=representation, lambda_=lambda_,
+                           boyan_N0=boyan_N0, initial_alpha=initial_alpha, alpha_decay_mode="boyan")
+    experiment = PolicyEvaluationExperiment(estimator, domain, pol, max_steps=max_steps, num_checks=20,
+                                            path=path, log_interval=10, id=id)
+    experiment.num_eval_points_per_dim=20
+    experiment.num_traj_V = 100
+    experiment.num_traj_stationary = 100
+    return experiment
+
+if __name__ == '__main__':
+    from Tools.run import run_profiled
+    #run_profiled(make_experiment)
+    experiment = make_experiment(1)
+    experiment.run()
+    #experiment.plot()
+    #experiment.save()
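
The param_space dict defines uniform and log-uniform priors over the iFDDK hyperparameters. One way it could drive a hyperopt search is sketched below; the objective and its extract_loss helper are hypothetical placeholders, not part of this commit:

    from hyperopt import fmin, tpe

    def objective(params):
        exp = make_experiment(id=1, **params)
        exp.run()
        return extract_loss(exp)  # hypothetical: map the finished run to a scalar loss

    # best = fmin(objective, param_space, algo=tpe.suggest, max_evals=50)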