From 8683b75ad902bbf25b8c78b67dbf8e91a8d1feb8 Mon Sep 17 00:00:00 2001
From: Brandon Rozek
Date: Mon, 4 Mar 2019 22:04:13 -0500
Subject: [PATCH] Corrected gamma multiplication

---
 rltorch/agents/A2CSingleAgent.py | 6 ++++--
 rltorch/agents/PPOAgent.py       | 4 +++-
 rltorch/agents/REINFORCEAgent.py | 4 +++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/rltorch/agents/A2CSingleAgent.py b/rltorch/agents/A2CSingleAgent.py
index 5d382d8..c7f367e 100644
--- a/rltorch/agents/A2CSingleAgent.py
+++ b/rltorch/agents/A2CSingleAgent.py
@@ -14,9 +14,11 @@ class A2CSingleAgent:
     self.logger = logger
 
   def _discount_rewards(self, rewards):
-    gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards)), dim = 0)
+    gammas = torch.ones_like(rewards)
+    if len(rewards) > 1:
+      gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
     return gammas * rewards
-  
+
   # This function is currently not used since the performance gains hasn't been shown
   # May be due to a faulty implementation, need to investigate more..
   def _generalized_advantage_estimation(self, states, rewards, next_states, not_done):
diff --git a/rltorch/agents/PPOAgent.py b/rltorch/agents/PPOAgent.py
index fa51740..0a3ded4 100644
--- a/rltorch/agents/PPOAgent.py
+++ b/rltorch/agents/PPOAgent.py
@@ -18,7 +18,9 @@ class PPOAgent:
     self.logger = logger
 
   def _discount_rewards(self, rewards):
-    gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards)), dim = 0)
+    gammas = torch.ones_like(rewards)
+    if len(rewards) > 1:
+      gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
     return gammas * rewards
 
 
diff --git a/rltorch/agents/REINFORCEAgent.py b/rltorch/agents/REINFORCEAgent.py
index 7c3b163..7c8d8d6 100644
--- a/rltorch/agents/REINFORCEAgent.py
+++ b/rltorch/agents/REINFORCEAgent.py
@@ -21,7 +21,9 @@ class REINFORCEAgent:
     shaped_rewards = torch.zeros_like(rewards)
     baseline = rewards.mean()
     for i in range(len(rewards)):
-      gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i), dim = 0)
+      gammas = torch.ones_like(rewards[i:])
+      if i != len(rewards) - 1:
+        gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim = 0)
       advantages = rewards[i:] - baseline
       shaped_rewards[i] = (gammas * advantages).sum()
     return shaped_rewards
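
Note (not part of the patch): a minimal standalone sketch of the corrected discounting, assuming a discount rate of 0.99. Before the fix, the gamma vector started at gamma rather than 1, so even the immediate reward was discounted; after the fix the first reward is multiplied by 1.

    import torch

    # Sketch mirroring the patched _discount_rewards; the 0.99 rate is an
    # assumption for illustration, not a value taken from the repo's config.
    def discount_rewards(rewards, discount_rate=0.99):
        # gammas = [1, g, g^2, ...] so the first reward is undiscounted
        gammas = torch.ones_like(rewards)
        if len(rewards) > 1:
            gammas[1:] = torch.cumprod(
                torch.tensor(discount_rate).repeat(len(rewards) - 1), dim=0
            )
        return gammas * rewards

    rewards = torch.tensor([1.0, 1.0, 1.0])
    print(discount_rewards(rewards))  # tensor([1.0000, 0.9900, 0.9801])

    # The pre-patch version effectively used gammas = [g, g^2, g^3, ...]:
    old_gammas = torch.cumprod(torch.tensor(0.99).repeat(3), dim=0)
    print(old_gammas * rewards)       # tensor([0.9900, 0.9801, 0.9703])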