Correct discount_rewards function to only multiply with gamma throughout
This commit is contained in:
parent
11d99df977
commit
190eb1f0c4
2 changed files with 30 additions and 14 deletions
|
@ -1,4 +1,5 @@
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
import rltorch
|
import rltorch
|
||||||
|
@ -13,13 +14,27 @@ class A2CSingleAgent:
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
|
|
||||||
def _discount_rewards(self, rewards):
|
def _discount_rewards(self, rewards):
|
||||||
discounted_rewards = torch.zeros_like(rewards)
|
gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards)), dim = 0)
|
||||||
running_add = 0
|
return gammas * rewards
|
||||||
for t in reversed(range(len(rewards))):
|
|
||||||
running_add = running_add * self.config['discount_rate'] + rewards[t]
|
# This function is currently not used since the performance gains hasn't been shown
|
||||||
discounted_rewards[t] = running_add
|
# May be due to a faulty implementation, need to investigate more..
|
||||||
|
def _generalized_advantage_estimation(self, states, rewards, next_states, not_done):
|
||||||
|
tradeoff = 0.5
|
||||||
|
with torch.no_grad():
|
||||||
|
next_values = torch.zeros_like(rewards)
|
||||||
|
next_values[not_done] = self.value_net(next_states[not_done]).squeeze(1)
|
||||||
|
values = self.value_net(states).squeeze(1)
|
||||||
|
|
||||||
|
generalized_advantages = torch.zeros_like(rewards)
|
||||||
|
for i in range(len(generalized_advantages)):
|
||||||
|
weights = torch.ones_like(rewards[i:])
|
||||||
|
if i != len(generalized_advantages) - 1:
|
||||||
|
weights[1:] = torch.cumprod(torch.tensor(self.config['discount_rate'] * tradeoff).repeat(len(rewards) - i - 1), dim = 0)
|
||||||
|
generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum()
|
||||||
|
|
||||||
|
return generalized_advantages
|
||||||
|
|
||||||
return discounted_rewards
|
|
||||||
|
|
||||||
def learn(self):
|
def learn(self):
|
||||||
episode_batch = self.memory.recall()
|
episode_batch = self.memory.recall()
|
||||||
|
@ -35,7 +50,9 @@ class A2CSingleAgent:
|
||||||
## Value Loss
|
## Value Loss
|
||||||
# In A2C, the value loss is the difference between the discounted reward and the value from the first state
|
# In A2C, the value loss is the difference between the discounted reward and the value from the first state
|
||||||
# The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode
|
# The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode
|
||||||
value_loss = F.mse_loss(self._discount_rewards(reward_batch).sum(), self.value_net(state_batch[0]))
|
discounted_reward = self._discount_rewards(reward_batch)
|
||||||
|
observed_value = discounted_reward.sum()
|
||||||
|
value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0]))
|
||||||
self.value_net.zero_grad()
|
self.value_net.zero_grad()
|
||||||
value_loss.backward()
|
value_loss.backward()
|
||||||
self.value_net.step()
|
self.value_net.step()
|
||||||
|
@ -50,6 +67,10 @@ class A2CSingleAgent:
|
||||||
advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values
|
advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values
|
||||||
advantages = advantages.squeeze(1)
|
advantages = advantages.squeeze(1)
|
||||||
|
|
||||||
|
# advantages = self._generalized_advantage_estimation(state_batch, reward_batch, next_state_batch, not_done_batch)
|
||||||
|
# Scale for more stable learning
|
||||||
|
advantages = advantages / (advantages.std() + np.finfo('float').eps)
|
||||||
|
|
||||||
policy_loss = (-log_prob_batch * advantages).sum()
|
policy_loss = (-log_prob_batch * advantages).sum()
|
||||||
|
|
||||||
if self.logger is not None:
|
if self.logger is not None:
|
||||||
|
|
|
@ -18,13 +18,8 @@ class PPOAgent:
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
|
|
||||||
def _discount_rewards(self, rewards):
|
def _discount_rewards(self, rewards):
|
||||||
discounted_rewards = torch.zeros_like(rewards)
|
gammas = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards)), dim = 0)
|
||||||
running_add = 0
|
return gammas * rewards
|
||||||
for t in reversed(range(len(rewards))):
|
|
||||||
running_add = running_add * self.config['discount_rate'] + rewards[t]
|
|
||||||
discounted_rewards[t] = running_add
|
|
||||||
|
|
||||||
return discounted_rewards
|
|
||||||
|
|
||||||
|
|
||||||
def learn(self):
|
def learn(self):
|
||||||
|
|
Loading…
Reference in a new issue