Corrected gamma multiplication
parent 190eb1f0c4
commit 8683b75ad9
3 changed files with 10 additions and 4 deletions
|
@@ -14,7 +14,9 @@ class A2CSingleAgent:
         self.logger = logger
 
     def _discount_rewards(self, rewards):
-        gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards)), dim = 0)
+        gammas = torch.ones_like(rewards)
+        if len(rewards) > 1:
+            gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
         return gammas * rewards
 
     # This function is currently not used since the performance gains hasn't been shown
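For reference, a minimal standalone sketch of the corrected discounting, assuming rewards is a 1-D float tensor and the discount rate is passed in directly (the self.config['discount_rate'] lookup and the class wrapper from the diff are omitted; the helper name is illustrative). With the fix, reward k is weighted by gamma**k instead of gamma**(k + 1):

import torch

def discount_rewards(rewards, gamma):
    # Weight vector [1, gamma, gamma**2, ...], matching the corrected diff above.
    gammas = torch.ones_like(rewards)
    if len(rewards) > 1:
        gammas[1:] = torch.cumprod(torch.tensor(gamma).repeat(len(rewards) - 1), dim=0)
    return gammas * rewards

rewards = torch.tensor([1.0, 1.0, 1.0, 1.0])
print(discount_rewards(rewards, 0.9))  # tensor([1.0000, 0.9000, 0.8100, 0.7290])
# The pre-fix version returned tensor([0.9000, 0.8100, 0.7290, 0.6561]),
# i.e. every reward carried one extra factor of gamma.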
@@ -18,7 +18,9 @@ class PPOAgent:
         self.logger = logger
 
     def _discount_rewards(self, rewards):
-        gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards)), dim = 0)
+        gammas = torch.ones_like(rewards)
+        if len(rewards) > 1:
+            gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
         return gammas * rewards
 
 
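The PPO agent receives the identical fix. As a quick sanity check (illustrative values only, assuming a discount rate of 0.99 and a rollout of length 3), the old weight vector equals the corrected one times one extra factor of the discount rate:

import torch

gamma, n = 0.99, 3
old = torch.cumprod(torch.tensor(gamma).repeat(n), dim=0)          # tensor([0.9900, 0.9801, 0.9703])
new = torch.ones(n)
new[1:] = torch.cumprod(torch.tensor(gamma).repeat(n - 1), dim=0)  # tensor([1.0000, 0.9900, 0.9801])
assert torch.allclose(old, gamma * new)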
@@ -21,7 +21,9 @@ class REINFORCEAgent:
         shaped_rewards = torch.zeros_like(rewards)
         baseline = rewards.mean()
         for i in range(len(rewards)):
-            gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i), dim = 0)
+            gammas = torch.ones_like(rewards[i:])
+            if i != len(rewards) - 1:
+                gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim = 0)
             advantages = rewards[i:] - baseline
             shaped_rewards[i] = (gammas * advantages).sum()
         return shaped_rewards
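For the REINFORCE case, a minimal sketch of the corrected reward shaping as a free function (the self.config lookup and class plumbing from the diff are omitted; names are illustrative). With the fix, shaped_rewards[i] becomes the discounted sum of baseline-subtracted rewards from step i onward, with the immediate term weighted by 1:

import torch

def shape_rewards(rewards, gamma):
    shaped_rewards = torch.zeros_like(rewards)
    baseline = rewards.mean()
    for i in range(len(rewards)):
        # Corrected weights [1, gamma, gamma**2, ...] over the remaining steps, so
        # shaped_rewards[i] = sum over k >= i of gamma**(k - i) * (rewards[k] - baseline).
        gammas = torch.ones_like(rewards[i:])
        if i != len(rewards) - 1:
            gammas[1:] = torch.cumprod(torch.tensor(gamma).repeat(len(rewards) - i - 1), dim=0)
        advantages = rewards[i:] - baseline
        shaped_rewards[i] = (gammas * advantages).sum()
    return shaped_rewards

rewards = torch.tensor([0.0, 0.0, 1.0])
print(shape_rewards(rewards, 0.5))  # tensor([-0.3333,  0.0000,  0.6667]), baseline = 1/3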