diff --git a/docs/source/memory.rst b/docs/source/memory.rst
index 3446f3d..7cacd28 100644
--- a/docs/source/memory.rst
+++ b/docs/source/memory.rst
@@ -1,4 +1,8 @@
 Memory Structures
 =================
-.. automodule:: rltorch.memory
+.. autoclass:: rltorch.memory.ReplayMemory
+   :members:
+.. autoclass:: rltorch.memory.PrioritizedReplayMemory
+   :members:
+.. autoclass:: rltorch.memory.EpisodeMemory
    :members:
diff --git a/rltorch/memory/EpisodeMemory.py b/rltorch/memory/EpisodeMemory.py
index 0957465..27efa69 100644
--- a/rltorch/memory/EpisodeMemory.py
+++ b/rltorch/memory/EpisodeMemory.py
@@ -5,22 +5,43 @@
 Transition = namedtuple('Transition',
     ('state', 'action', 'reward', 'next_state', 'done'))
 class EpisodeMemory(object):
+    """
+    Memory structure that stores an entire episode and
+    each observation's associated log probability.
+    """
     def __init__(self):
         self.memory = []
         self.log_probs = []

     def append(self, *args):
-        """Saves a transition."""
+        """
+        Adds a transition to the memory.
+
+        Parameters
+        ----------
+        *args
+            The state, action, reward, next_state, done tuple
+        """
         self.memory.append(Transition(*args))

     def append_log_probs(self, logprob):
+        """
+        Adds an observation's log probability to the memory.
+        """
         self.log_probs.append(logprob)

     def clear(self):
+        """
+        Clears the stored transitions and log probabilities.
+        """
         self.memory.clear()
         self.log_probs.clear()

     def recall(self):
+        """
+        Returns a list of the transitions paired with their
+        associated log probabilities.
+        """
         if len(self.memory) != len(self.log_probs):
             raise ValueError("Memory and recorded log probabilities must be the same length.")
         return list(zip(*tuple(zip(*self.memory)), self.log_probs))
diff --git a/rltorch/memory/PrioritizedReplayMemory.py b/rltorch/memory/PrioritizedReplayMemory.py
index 58843e3..1bf153a 100644
--- a/rltorch/memory/PrioritizedReplayMemory.py
+++ b/rltorch/memory/PrioritizedReplayMemory.py
@@ -147,7 +147,9 @@ class MinSegmentTree(SegmentTree):

 class PrioritizedReplayMemory(ReplayMemory):
     def __init__(self, capacity, alpha):
-        """Create Prioritized Replay buffer.
+        """
+        Create Prioritized Replay buffer.
+
         Parameters
         ----------
         capacity: int
@@ -156,9 +158,6 @@ class PrioritizedReplayMemory(ReplayMemory):
         alpha: float
             how much prioritization is used
             (0 - no prioritization, 1 - full prioritization)
-        See Also
-        --------
-        ReplayBuffer.__init__
         """
         super(PrioritizedReplayMemory, self).__init__(capacity)
         assert alpha >= 0
@@ -173,7 +172,14 @@ class PrioritizedReplayMemory(ReplayMemory):
         self._max_priority = 1.0

     def append(self, *args, **kwargs):
-        """See ReplayBuffer.store_effect"""
+        """
+        Adds a transition to the buffer and assigns it an initial priority.
+
+        Parameters
+        ----------
+        *args
+            The state, action, reward, next_state, done tuple
+        """
         idx = self.position
         super().append(*args, **kwargs)
         self._it_sum[idx] = self._max_priority ** self._alpha
@@ -191,10 +197,11 @@ class PrioritizedReplayMemory(ReplayMemory):
         return res

     def sample(self, batch_size, beta):
-        """Sample a batch of experiences.
-        compared to ReplayBuffer.sample
-        it also returns importance weights and idxes
+        """
+        Sample a batch of experiences,
+        while also returning importance weights and idxes
+        of the sampled experiences.
+
         Parameters
         ----------
         batch_size: int
@@ -202,6 +209,7 @@ class PrioritizedReplayMemory(ReplayMemory):
         beta: float
             To what degree to use importance weights
             (0 - no corrections, 1 - full correction)
+
         Returns
         -------
         weights: np.array
             denoting importance weight of each sampled transition
         idxes: np.array
             Array of shape (batch_size,) and dtype np.int32
             indexes in buffer of sampled experiences
@@ -232,6 +240,32 @@ class PrioritizedReplayMemory(ReplayMemory):
         return batch

     def sample_n_steps(self, batch_size, steps, beta):
+        r"""
+        Sample a batch of sequential experiences,
+        while also returning importance weights and idxes
+        of the sampled experiences.
+
+        Parameters
+        ----------
+        batch_size: int
+            How many transitions to sample.
+        steps: int
+            The number of sequential transitions in each sampled sequence.
+        beta: float
+            To what degree to use importance weights
+            (0 - no corrections, 1 - full correction)
+
+        Notes
+        -----
+        The number of sequences sampled is :math:`\lfloor\frac{batch\_size}{steps}\rfloor`.
+
+        Returns
+        -------
+        weights: np.array
+            Array of shape (batch_size,) and dtype np.float32
+            denoting importance weight of each sampled transition
+        idxes: np.array
+            Array of shape (batch_size,) and dtype np.int32
+            indexes in buffer of sampled experiences
+        """
         assert beta > 0

         sample_size = batch_size // steps
@@ -262,9 +296,11 @@ class PrioritizedReplayMemory(ReplayMemory):

     @jit(forceobj = True)
     def update_priorities(self, idxes, priorities):
-        """Update priorities of sampled transitions.
+        """
+        Update priorities of sampled transitions.
         sets priority of transition at index idxes[i] in buffer
         to priorities[i].
+
         Parameters
         ----------
         idxes: [int]
diff --git a/rltorch/memory/ReplayMemory.py b/rltorch/memory/ReplayMemory.py
index aa32ab7..5507185 100644
--- a/rltorch/memory/ReplayMemory.py
+++ b/rltorch/memory/ReplayMemory.py
@@ -4,21 +4,38 @@ import torch

 Transition = namedtuple('Transition',
     ('state', 'action', 'reward', 'next_state', 'done'))

-# Implements a Ring Buffer
 class ReplayMemory(object):
+    """
+    Creates a ring buffer of a fixed size.
+
+    Parameters
+    ----------
+    capacity : int
+        The maximum size of the buffer
+    """
     def __init__(self, capacity):
         self.capacity = capacity
         self.memory = []
         self.position = 0

     def append(self, *args):
-        """Saves a transition."""
+        """
+        Adds a transition to the buffer.
+
+        Parameters
+        ----------
+        *args
+            The state, action, reward, next_state, done tuple
+        """
         if len(self.memory) < self.capacity:
             self.memory.append(None)
         self.memory[self.position] = Transition(*args)
         self.position = (self.position + 1) % self.capacity

     def clear(self):
+        """
+        Clears the buffer.
+        """
         self.memory.clear()
         self.position = 0
@@ -37,10 +54,35 @@ class ReplayMemory(object):
     def sample(self, batch_size):
+        """
+        Returns a random sample from the buffer.
+
+        Parameters
+        ----------
+        batch_size : int
+            The number of observations to sample.
+        """
         return random.sample(self.memory, batch_size)

     def sample_n_steps(self, batch_size, steps):
-        idxes = random.sample(range(len(self.memory) - steps), batch_size // steps)
+        r"""
+        Returns a random sample of sequences, each containing steps sequential observations.
+
+        Notes
+        -----
+        The number of sequences sampled is :math:`\lfloor\frac{batch\_size}{steps}\rfloor`.
+
+        Parameters
+        ----------
+        batch_size : int
+            The total number of observations to sample.
+        steps : int
+            The number of sequential observations to include, starting from each sampled index.
+ """ + idxes = random.sample( + range(len(self.memory) - steps), + batch_size // steps + ) step_idxes = [] for i in idxes: step_idxes += range(i, i + steps) @@ -56,10 +98,10 @@ class ReplayMemory(object): return value in self.memory def __getitem__(self, index): - return self.memory[index] + return self.memory[index % self.capacity] def __setitem__(self, index, value): - self.memory[index] = value + self.memory[index % self.capacity] = value def __reversed__(self): return reversed(self.memory)