Corrected A2C and PPO to train at the end of an episode

2019-03-01 21:04:13 -05:00 · 2019-03-01 21:04:13 -05:00 · e42f5bba1b
commit e42f5bba1b
parent 1958fc7c7e
5 changed files with 48 additions and 28 deletions
--- a/rltorch/agents/A2CSingleAgent.py
+++ b/rltorch/agents/A2CSingleAgent.py
@ -27,9 +27,6 @@ class A2CSingleAgent:
  
  
  def learn(self):
-    if len(self.memory) < self.config['batch_size']:
-      return
-
    episode_batch = self.memory.recall()
    state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)  

@ -40,7 +37,7 @@ class A2CSingleAgent:
    log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)

    ## Value Loss
-    value_loss = F.mse_loss(self._discount_rewards(reward_batch), self.value_net(state_batch[0]))
+    value_loss = F.mse_loss(self._discount_rewards(reward_batch).sum(), self.value_net(state_batch[0]))
    self.value_net.zero_grad()
    value_loss.backward()
    self.value_net.step()
--- a/rltorch/agents/PPOAgent.py
+++ b/rltorch/agents/PPOAgent.py
@ -1,5 +1,3 @@
-# Deprecated since the idea of the idea shouldn't work without having some sort of "mental model" of the environment
-
 from copy import deepcopy
 import numpy as np
 import torch
@ -30,9 +28,6 @@ class PPOAgent:
  
  
  def learn(self):
-    if len(self.memory) < self.config['batch_size']:
-      return
-
    episode_batch = self.memory.recall()
    state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)  

@ -44,7 +39,7 @@ class PPOAgent:
    log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)

    ## Value Loss
-    value_loss = F.mse_loss(self._discount_rewards(reward_batch), self.value_net(state_batch[0]))
+    value_loss = F.mse_loss(self._discount_rewards(reward_batch).sum(), self.value_net(state_batch[0]))
    self.value_net.zero_grad()
    value_loss.backward()
    self.value_net.step()
--- a/rltorch/env/simulate.py
+++ b/rltorch/env/simulate.py
@ -62,4 +62,40 @@ class EnvironmentRunSync():
    if self.logwriter is not None:
      self.logwriter.write(logger)
    
-    self.last_state = state
+    self.last_state = state
+
+
+# Synchronous runner that plays exactly one complete episode per call to run().
+# Each call resets the environment, steps it with the actor's actions until the
+# environment reports done, optionally records every transition in `memory`,
+# and optionally writes the episode's total reward through `logwriter`.
+class EnvironmentEpisodeSync():
+  # env       -- environment object; run() calls env.reset() and env.step(action)
+  # actor     -- object whose act(state) result is passed to env.step
+  # config    -- mapping; run() reads 'print_stat_n_eps' and 'total_training_episodes'
+  # memory    -- optional store; when given, each transition is appended to it
+  # logwriter -- optional sink; when given, receives a logger with the episode reward
+  # name      -- prefix used for the logged '/EpisodeReward' tag
+  def __init__(self, env, actor, config, memory = None, logwriter = None, name = ""):
+    self.env = env
+    self.name = name
+    self.actor = actor
+    # Keep a private deep copy of the config rather than the caller's object
+    self.config = deepcopy(config)
+    self.logwriter = logwriter
+    self.memory = memory
+    # Episode counter is 1-based; it is incremented at the end of every run()
+    self.episode_num = 1
+
+  # Play one episode from reset until the environment signals done.
+  # Takes no arguments and returns nothing; results are delivered through
+  # self.memory, stdout and self.logwriter.
+  def run(self):
+    state = self.env.reset()
+    done = False
+    episodeReward = 0
+    # A logger is only created when there is a logwriter to hand it to
+    logger = rltorch.log.Logger() if self.logwriter is not None else None
+    while not done:
+      action = self.actor.act(state)
+      # step() is unpacked as a 4-tuple; the fourth element is ignored
+      next_state, reward, done, _ = self.env.step(action)
+       
+      episodeReward += reward
+      if self.memory is not None:
+        # NOTE(review): only five fields are stored here; confirm how agents
+        # that recall a sixth per-step field from memory obtain it
+        self.memory.append(state, action, reward, next_state, done)
+       
+      state = next_state
+
+    # Print a progress line every config['print_stat_n_eps'] episodes
+    if self.episode_num % self.config['print_stat_n_eps'] == 0:
+      print("episode: {}/{}, score: {}"
+        .format(self.episode_num, self.config['total_training_episodes'], episodeReward))
+          
+    if self.logwriter is not None:
+      # Tag the reward with this runner's name, then pass the logger to the writer
+      logger.append(self.name + '/EpisodeReward', episodeReward)
+      self.logwriter.write(logger)
+    
+    self.episode_num +=  1