From 190eb1f0c4f9f4647548a84e07ced2e6cf38a94f Mon Sep 17 00:00:00 2001 From: Brandon Rozek Date: Mon, 4 Mar 2019 21:59:02 -0500 Subject: [PATCH] Correct discount_rewards function to only multiply with gamma throughout --- rltorch/agents/A2CSingleAgent.py | 35 +++++++++++++++++++++++++------- rltorch/agents/PPOAgent.py | 9 ++------ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/rltorch/agents/A2CSingleAgent.py b/rltorch/agents/A2CSingleAgent.py index 305c543..5d382d8 100644 --- a/rltorch/agents/A2CSingleAgent.py +++ b/rltorch/agents/A2CSingleAgent.py @@ -1,4 +1,5 @@ from copy import deepcopy +import numpy as np import torch import torch.nn.functional as F import rltorch @@ -13,13 +14,27 @@ class A2CSingleAgent: self.logger = logger def _discount_rewards(self, rewards): - discounted_rewards = torch.zeros_like(rewards) - running_add = 0 - for t in reversed(range(len(rewards))): - running_add = running_add * self.config['discount_rate'] + rewards[t] - discounted_rewards[t] = running_add + gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards)), dim = 0) + return gammas * rewards + + # This function is currently not used since the performance gains haven't been shown + # May be due to a faulty implementation, need to investigate more.. 
+ def _generalized_advantage_estimation(self, states, rewards, next_states, not_done): + tradeoff = 0.5 + with torch.no_grad(): + next_values = torch.zeros_like(rewards) + next_values[not_done] = self.value_net(next_states[not_done]).squeeze(1) + values = self.value_net(states).squeeze(1) + + generalized_advantages = torch.zeros_like(rewards) + for i in range(len(generalized_advantages)): + weights = torch.ones_like(rewards[i:]) + if i != len(generalized_advantages) - 1: + weights[1:] = torch.cumprod(torch.tensor(self.config['discount_rate'] * tradeoff).repeat(len(rewards) - i - 1), dim = 0) + generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum() + + return generalized_advantages - return discounted_rewards def learn(self): episode_batch = self.memory.recall() @@ -35,7 +50,9 @@ class A2CSingleAgent: ## Value Loss # In A2C, the value loss is the difference between the discounted reward and the value from the first state # The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode - value_loss = F.mse_loss(self._discount_rewards(reward_batch).sum(), self.value_net(state_batch[0])) + discounted_reward = self._discount_rewards(reward_batch) + observed_value = discounted_reward.sum() + value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0])) self.value_net.zero_grad() value_loss.backward() self.value_net.step() @@ -50,6 +67,10 @@ class A2CSingleAgent: advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values advantages = advantages.squeeze(1) + # advantages = self._generalized_advantage_estimation(state_batch, reward_batch, next_state_batch, not_done_batch) + # Scale for more stable learning + advantages = advantages / (advantages.std() + np.finfo('float').eps) + policy_loss = (-log_prob_batch * advantages).sum() if self.logger is not None: diff --git a/rltorch/agents/PPOAgent.py 
b/rltorch/agents/PPOAgent.py index 44c1f5d..fa51740 100644 --- a/rltorch/agents/PPOAgent.py +++ b/rltorch/agents/PPOAgent.py @@ -18,13 +18,8 @@ class PPOAgent: self.logger = logger def _discount_rewards(self, rewards): - discounted_rewards = torch.zeros_like(rewards) - running_add = 0 - for t in reversed(range(len(rewards))): - running_add = running_add * self.config['discount_rate'] + rewards[t] - discounted_rewards[t] = running_add - - return discounted_rewards + gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards)), dim = 0) + return gammas * rewards def learn(self):