From 8683b75ad902bbf25b8c78b67dbf8e91a8d1feb8 Mon Sep 17 00:00:00 2001
From: Brandon Rozek
Date: Mon, 4 Mar 2019 22:04:13 -0500
Subject: [PATCH] Corrected gamma multiplication

---
 rltorch/agents/A2CSingleAgent.py | 6 ++++--
 rltorch/agents/PPOAgent.py       | 4 +++-
 rltorch/agents/REINFORCEAgent.py | 4 +++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/rltorch/agents/A2CSingleAgent.py b/rltorch/agents/A2CSingleAgent.py
index 5d382d8..c7f367e 100644
--- a/rltorch/agents/A2CSingleAgent.py
+++ b/rltorch/agents/A2CSingleAgent.py
@@ -14,9 +14,11 @@ class A2CSingleAgent:
     self.logger = logger
 
   def _discount_rewards(self, rewards):
-    gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards)), dim = 0)
+    gammas = torch.ones_like(rewards)
+    if len(rewards) > 1:
+      gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
     return gammas * rewards
-  
+
   # This function is currently not used since the performance gains hasn't been shown
   # May be due to a faulty implementation, need to investigate more..
   def _generalized_advantage_estimation(self, states, rewards, next_states, not_done):
diff --git a/rltorch/agents/PPOAgent.py b/rltorch/agents/PPOAgent.py
index fa51740..0a3ded4 100644
--- a/rltorch/agents/PPOAgent.py
+++ b/rltorch/agents/PPOAgent.py
@@ -18,7 +18,9 @@ class PPOAgent:
     self.logger = logger
 
   def _discount_rewards(self, rewards):
-    gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards)), dim = 0)
+    gammas = torch.ones_like(rewards)
+    if len(rewards) > 1:
+      gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
     return gammas * rewards
 
 
diff --git a/rltorch/agents/REINFORCEAgent.py b/rltorch/agents/REINFORCEAgent.py
index 7c3b163..7c8d8d6 100644
--- a/rltorch/agents/REINFORCEAgent.py
+++ b/rltorch/agents/REINFORCEAgent.py
@@ -21,7 +21,9 @@ class REINFORCEAgent:
     shaped_rewards = torch.zeros_like(rewards)
     baseline = rewards.mean()
     for i in range(len(rewards)):
-      gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i), dim = 0)
+      gammas = torch.ones_like(rewards[i:])
+      if i != len(rewards) - 1:
+        gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim = 0)
       advantages = rewards[i:] - baseline
       shaped_rewards[i] = (gammas * advantages).sum()
     return shaped_rewards
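
Note (not part of the patch): a minimal standalone sketch of the corrected discounting, assuming a discount rate of 0.99. Before the fix, the gamma vector started at gamma rather than 1, so even the immediate reward was discounted; after the fix the first reward is multiplied by 1.

    import torch

    # Sketch mirroring the patched _discount_rewards; the 0.99 rate is an
    # assumption for illustration, not a value taken from the repo's config.
    def discount_rewards(rewards, discount_rate=0.99):
        # gammas = [1, g, g^2, ...] so the first reward is undiscounted
        gammas = torch.ones_like(rewards)
        if len(rewards) > 1:
            gammas[1:] = torch.cumprod(
                torch.tensor(discount_rate).repeat(len(rewards) - 1), dim=0
            )
        return gammas * rewards

    rewards = torch.tensor([1.0, 1.0, 1.0])
    print(discount_rewards(rewards))  # tensor([1.0000, 0.9900, 0.9801])

    # The pre-patch version effectively used gammas = [g, g^2, g^3, ...]:
    old_gammas = torch.cumprod(torch.tensor(0.99).repeat(3), dim=0)
    print(old_gammas * rewards)       # tensor([0.9900, 0.9801, 0.9703])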