diff --git a/rltorch/action_selector/ArgMaxSelector.py b/rltorch/action_selector/ArgMaxSelector.py index 3f374f9..fb0ff07 100644 --- a/rltorch/action_selector/ArgMaxSelector.py +++ b/rltorch/action_selector/ArgMaxSelector.py @@ -1,7 +1,7 @@ from random import randrange import torch class ArgMaxSelector: - def __init__(self, model, action_size, device = None): + def __init__(self, model, action_size, device=None): self.model = model self.action_size = action_size self.device = device @@ -12,7 +12,8 @@ class ArgMaxSelector: if self.device is not None: state = state.to(self.device) action_values = self.model(state).squeeze(0) - action = self.random_act() if (action_values[0] == action_values).all() else action_values.argmax().item() + action = self.random_act() if (action_values[0] == action_values).all() \ + else action_values.argmax().item() return action def act(self, state): - return self.best_act(state) \ No newline at end of file + return self.best_act(state) diff --git a/rltorch/action_selector/EpsilonGreedySelector.py b/rltorch/action_selector/EpsilonGreedySelector.py index e537f2f..69eba32 100644 --- a/rltorch/action_selector/EpsilonGreedySelector.py +++ b/rltorch/action_selector/EpsilonGreedySelector.py @@ -1,13 +1,14 @@ -from .ArgMaxSelector import ArgMaxSelector -import numpy as np import collections +import numpy as np +from .ArgMaxSelector import ArgMaxSelector + class EpsilonGreedySelector(ArgMaxSelector): - def __init__(self, model, action_size, device = None, epsilon = 0.1): - super(EpsilonGreedySelector, self).__init__(model, action_size, device = device) + def __init__(self, model, action_size, device=None, epsilon=0.1): + super(EpsilonGreedySelector, self).__init__(model, action_size, device=device) self.epsilon = epsilon # random_act is already implemented in ArgMaxSelector # best_act is already implemented in ArgMaxSelector def act(self, state): eps = next(self.epsilon) if isinstance(self.epsilon, collections.Iterable) else self.epsilon action = self.random_act() if np.random.rand() < eps else self.best_act(state) - return action \ No newline at end of file + return action diff --git a/rltorch/action_selector/IdentitySelector.py b/rltorch/action_selector/IdentitySelector.py index 85da177..d25ba96 100644 --- a/rltorch/action_selector/IdentitySelector.py +++ b/rltorch/action_selector/IdentitySelector.py @@ -1,8 +1,9 @@ -from .ArgMaxSelector import ArgMaxSelector import torch +from .ArgMaxSelector import ArgMaxSelector + class IdentitySelector(ArgMaxSelector): - def __init__(self, model, action_size, device = None): - super(IdentitySelector, self).__init__(model, action_size, device = device) + def __init__(self, model, action_size, device=None): + super(IdentitySelector, self).__init__(model, action_size, device=device) # random_act is already implemented in ArgMaxSelector def best_act(self, state): with torch.no_grad(): @@ -11,4 +12,4 @@ class IdentitySelector(ArgMaxSelector): action = self.model(state).squeeze(0).item() return action def act(self, state): - return self.best_act(state) \ No newline at end of file + return self.best_act(state) diff --git a/rltorch/action_selector/RandomSelector.py b/rltorch/action_selector/RandomSelector.py index 441c512..eb2a0da 100644 --- a/rltorch/action_selector/RandomSelector.py +++ b/rltorch/action_selector/RandomSelector.py @@ -1,10 +1,10 @@ from random import randrange -class RandomSelector(): +class RandomSelector: def __init__(self, action_size): self.action_size = action_size def random_act(self): - return randrange(action_size) 
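For reference, a minimal usage sketch of the epsilon-greedy selector refactored above, assuming a toy Q-network and a hand-rolled decay generator (both hypothetical, not part of this patch). Because act() calls next() on epsilon whenever it is iterable, epsilon can be a fixed float or any iterator such as a scheduler; note the isinstance check relies on the collections.Iterable alias, so this assumes a Python version where that alias still exists.

import torch
import torch.nn as nn
from rltorch.action_selector import EpsilonGreedySelector

# Hypothetical 4-observation, 2-action Q-network, used only for illustration.
q_net = nn.Sequential(nn.Linear(4, 32), nn.ReLU(), nn.Linear(32, 2))

def epsilon_decay(start=1.0, rate=0.99, floor=0.05):
    # Multiplicative decay with a floor; any iterator works here.
    eps = start
    while True:
        yield max(eps, floor)
        eps *= rate

fixed = EpsilonGreedySelector(q_net, action_size=2, epsilon=0.1)
scheduled = EpsilonGreedySelector(q_net, action_size=2, epsilon=epsilon_decay())

state = torch.rand(1, 4)  # batch dimension expected by best_act()
print(fixed.act(state), scheduled.act(state))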
- def best_act(self, state): + return randrange(self.action_size) + def best_act(self, _): return self.random_act() - def act(self, state): + def act(self, _): return self.random_act() diff --git a/rltorch/action_selector/StochasticSelector.py b/rltorch/action_selector/StochasticSelector.py index 304862a..2ed5150 100644 --- a/rltorch/action_selector/StochasticSelector.py +++ b/rltorch/action_selector/StochasticSelector.py @@ -1,22 +1,19 @@ -from random import randrange -import torch from torch.distributions import Categorical -import rltorch -from rltorch.action_selector import ArgMaxSelector - +from .ArgMaxSelector import ArgMaxSelector +from ..memory.EpisodeMemory import EpisodeMemory class StochasticSelector(ArgMaxSelector): - def __init__(self, model, action_size, memory = None, device = None): - super(StochasticSelector, self).__init__(model, action_size, device = device) + def __init__(self, model, action_size, memory=None, device=None): + super(StochasticSelector, self).__init__(model, action_size, device=device) self.model = model self.action_size = action_size self.device = device self.memory = memory - def best_act(self, state, log_prob = True): + def best_act(self, state, log_prob=True): if self.device is not None: state = state.to(self.device) action_probabilities = self.model(state) distribution = Categorical(action_probabilities) action = distribution.sample() - if log_prob and isinstance(self.memory, rltorch.memory.EpisodeMemory): + if log_prob and isinstance(self.memory, EpisodeMemory): self.memory.append_log_probs(distribution.log_prob(action)) - return action.item() \ No newline at end of file + return action.item() diff --git a/rltorch/action_selector/__init__.py b/rltorch/action_selector/__init__.py index 4fe0da3..35ecaa6 100644 --- a/rltorch/action_selector/__init__.py +++ b/rltorch/action_selector/__init__.py @@ -1,5 +1,5 @@ -from .ArgMaxSelector import * -from .EpsilonGreedySelector import * -from .IdentitySelector import * -from .RandomSelector import * -from .StochasticSelector import * \ No newline at end of file +from .ArgMaxSelector import ArgMaxSelector +from .EpsilonGreedySelector import EpsilonGreedySelector +from .IdentitySelector import IdentitySelector +from .RandomSelector import RandomSelector +from .StochasticSelector import StochasticSelector diff --git a/rltorch/agents/A2CSingleAgent.py b/rltorch/agents/A2CSingleAgent.py index e7316ec..0c2541a 100644 --- a/rltorch/agents/A2CSingleAgent.py +++ b/rltorch/agents/A2CSingleAgent.py @@ -2,89 +2,91 @@ from copy import deepcopy import numpy as np import torch import torch.nn.functional as F -import rltorch -import rltorch.memory as M class A2CSingleAgent: - def __init__(self, policy_net, value_net, memory, config, logger = None): - self.policy_net = policy_net - self.value_net = value_net - self.memory = memory - self.config = deepcopy(config) - self.logger = logger + def __init__(self, policy_net, value_net, memory, config, logger=None): + self.policy_net = policy_net + self.value_net = value_net + self.memory = memory + self.config = deepcopy(config) + self.logger = logger - def _discount_rewards(self, rewards): - gammas = torch.ones_like(rewards) - if len(rewards) > 1: - gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0) - return gammas * rewards - - # This function is currently not used since the performance gains hasn't been shown - # May be due to a faulty implementation, need to investigate more.. 
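As a quick sanity check of the cumprod-based discounting used by _discount_rewards here (and again in PPOAgent below), a standalone sketch with a made-up reward sequence, not part of the patch: the repeated-then-cumprod tensor is [1, g, g^2, ...], so the element-wise product yields r_t * g^t.

import torch

def discount_rewards(rewards, discount_rate):
    # gammas = [1, g, g^2, ...]; element-wise product gives r_t * g^t.
    gammas = torch.ones_like(rewards)
    if len(rewards) > 1:
        gammas[1:] = torch.cumprod(
            torch.tensor(discount_rate).repeat(len(rewards) - 1), dim=0
        )
    return gammas * rewards

print(discount_rewards(torch.tensor([1.0, 1.0, 1.0, 1.0]), 0.9))
# tensor([1.0000, 0.9000, 0.8100, 0.7290])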
- def _generalized_advantage_estimation(self, states, rewards, next_states, not_done): - tradeoff = 0.5 - with torch.no_grad(): - next_values = torch.zeros_like(rewards) - next_values[not_done] = self.value_net(next_states[not_done]).squeeze(1) - values = self.value_net(states).squeeze(1) + def _discount_rewards(self, rewards): + gammas = torch.ones_like(rewards) + if len(rewards) > 1: + discount_tensor = torch.tensor(self.config['discount_rate']) + gammas[1:] = torch.cumprod( + discount_tensor.repeat(len(rewards) - 1), + dim=0 + ) + return gammas * rewards - generalized_advantages = torch.zeros_like(rewards) - for i in range(len(generalized_advantages)): - weights = torch.ones_like(rewards[i:]) - if i != len(generalized_advantages) - 1: - weights[1:] = torch.cumprod(torch.tensor(self.config['discount_rate'] * tradeoff).repeat(len(rewards) - i - 1), dim = 0) - generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum() + # This function is currently not used since the performance gains hasn't been shown + # May be due to a faulty implementation, need to investigate more.. + def _generalized_advantage_estimation(self, states, rewards, next_states, not_done): + tradeoff = 0.5 + with torch.no_grad(): + next_values = torch.zeros_like(rewards) + next_values[not_done] = self.value_net(next_states[not_done]).squeeze(1) + values = self.value_net(states).squeeze(1) - return generalized_advantages + generalized_advantages = torch.zeros_like(rewards) + discount_tensor = torch.tensor(self.config['discount_rate']) * tradeoff + for i, _ in enumerate(generalized_advantages): + weights = torch.ones_like(rewards[i:]) + if i != len(generalized_advantages) - 1: + weights[1:] = torch.cumprod(discount_tensor.repeat(len(rewards) - i - 1), dim=0) + generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum() - - def learn(self): - episode_batch = self.memory.recall() - state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch) + return generalized_advantages - # Send batches to the appropriate device - state_batch = torch.cat(state_batch).to(self.value_net.device) - reward_batch = torch.tensor(reward_batch).to(self.value_net.device).float() - not_done_batch = ~torch.tensor(done_batch).to(self.value_net.device) - next_state_batch = torch.cat(next_state_batch).to(self.value_net.device) - log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device) + def learn(self): + episode_batch = self.memory.recall() + state_batch, _, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch) - ## Value Loss - # In A2C, the value loss is the difference between the discounted reward and the value from the first state - # The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode - discounted_reward = self._discount_rewards(reward_batch) - observed_value = discounted_reward.sum() - value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0])) - self.value_net.zero_grad() - value_loss.backward() - self.value_net.step() + # Send batches to the appropriate device + state_batch = torch.cat(state_batch).to(self.value_net.device) + reward_batch = torch.tensor(reward_batch).to(self.value_net.device).float() + not_done_batch = ~torch.tensor(done_batch).to(self.value_net.device) + next_state_batch = torch.cat(next_state_batch).to(self.value_net.device) + log_prob_batch = 
torch.cat(log_prob_batch).to(self.value_net.device) - ## Policy Loss - # Increase probabilities of advantageous states - # and decrease the probabilities of non-advantageous ones - with torch.no_grad(): - state_values = self.value_net(state_batch) - next_state_values = torch.zeros_like(state_values) - next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch]) - advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values - advantages = advantages.squeeze(1) + ## Value Loss + # In A2C, the value loss is the difference between the discounted reward + # and the value from the first state. + # The value of the first state is supposed to tell us + # the expected reward from the current policy of the whole episode + discounted_reward = self._discount_rewards(reward_batch) + observed_value = discounted_reward.sum() + value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0])) + self.value_net.zero_grad() + value_loss.backward() + self.value_net.step() - # advantages = self._generalized_advantage_estimation(state_batch, reward_batch, next_state_batch, not_done_batch) - # Scale for more stable learning - advantages = advantages / (advantages.std() + np.finfo('float').eps) + ## Policy Loss + # Increase probabilities of advantageous states + # and decrease the probabilities of non-advantageous ones + with torch.no_grad(): + state_values = self.value_net(state_batch) + next_state_values = torch.zeros_like(state_values) + next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch]) + advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values + advantages = advantages.squeeze(1) - policy_loss = (-log_prob_batch * advantages).sum() + # advantages = self._generalized_advantage_estimation(state_batch, reward_batch, next_state_batch, not_done_batch) + # Scale for more stable learning + advantages = advantages / (advantages.std() + np.finfo('float').eps) + + policy_loss = (-log_prob_batch * advantages).sum() - if self.logger is not None: - self.logger.append("Loss/Policy", policy_loss.item()) - self.logger.append("Loss/Value", value_loss.item()) + if self.logger is not None: + self.logger.append("Loss/Policy", policy_loss.item()) + self.logger.append("Loss/Value", value_loss.item()) - self.policy_net.zero_grad() - policy_loss.backward() - self.policy_net.step() - - # Memory under the old policy is not needed for future training - self.memory.clear() - + self.policy_net.zero_grad() + policy_loss.backward() + self.policy_net.step() + # Memory under the old policy is not needed for future training + self.memory.clear() diff --git a/rltorch/agents/DQNAgent.py b/rltorch/agents/DQNAgent.py index 6f47c05..71f9015 100644 --- a/rltorch/agents/DQNAgent.py +++ b/rltorch/agents/DQNAgent.py @@ -1,13 +1,11 @@ import collections +from copy import deepcopy import rltorch.memory as M import torch import torch.nn.functional as F -from copy import deepcopy -import numpy as np -from pathlib import Path class DQNAgent: - def __init__(self, net , memory, config, target_net = None, logger = None): + def __init__(self, net, memory, config, target_net=None, logger=None): self.net = net self.target_net = target_net self.memory = memory @@ -20,16 +18,16 @@ class DQNAgent: self.net.model.to(self.net.device) self.target_net.sync() - def learn(self, logger = None): + def learn(self, logger=None): if len(self.memory) < self.config['batch_size']: return - if (isinstance(self.memory, 
M.PrioritizedReplayMemory)): + if isinstance(self.memory, M.PrioritizedReplayMemory): weight_importance = self.config['prioritized_replay_weight_importance'] # If it's a scheduler then get the next value by calling next, otherwise just use it's value beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance - minibatch = self.memory.sample(self.config['batch_size'], beta = beta) - state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True) + minibatch = self.memory.sample(self.config['batch_size'], beta=beta) + state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True) else: minibatch = self.memory.sample(self.config['batch_size']) state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch) @@ -49,7 +47,7 @@ class DQNAgent: # and the regular net to select the action # That way we decouple the value and action selecting processes (DOUBLE DQN) not_done_size = not_done_batch.sum() - next_state_values = torch.zeros_like(state_values, device = self.net.device) + next_state_values = torch.zeros_like(state_values, device=self.net.device) if self.target_net is not None: next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch]) next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1) @@ -57,15 +55,15 @@ class DQNAgent: next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch]) next_best_action = next_state_values[not_done_batch].argmax(1) - best_next_state_value = torch.zeros(self.config['batch_size'], device = self.net.device) + best_next_state_value = torch.zeros(self.config['batch_size'], device=self.net.device) best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1) expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1) # If we're sampling by TD error, multiply loss by a importance weight which helps decrease overfitting - if (isinstance(self.memory, M.PrioritizedReplayMemory)): + if isinstance(self.memory, M.PrioritizedReplayMemory): # loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.smooth_l1_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean() - loss = (torch.as_tensor(importance_weights, device = self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean() + loss = (torch.as_tensor(importance_weights, device=self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean() else: # loss = F.smooth_l1_loss(obtained_values, expected_values) loss = F.mse_loss(obtained_values, expected_values) @@ -85,8 +83,6 @@ class DQNAgent: self.target_net.sync() # If we're sampling by TD error, readjust the weights of the experiences - if (isinstance(self.memory, M.PrioritizedReplayMemory)): + if isinstance(self.memory, M.PrioritizedReplayMemory): td_error = (obtained_values - expected_values).detach().abs() self.memory.update_priorities(batch_indexes, td_error) - - diff --git a/rltorch/agents/DQfDAgent.py b/rltorch/agents/DQfDAgent.py index 43f2cc9..076e821 100644 --- a/rltorch/agents/DQfDAgent.py +++ b/rltorch/agents/DQfDAgent.py @@ -1,14 +1,12 @@ import collections +from copy import deepcopy import rltorch.memory as M import torch import torch.nn.functional as F -from 
copy import deepcopy -import numpy as np -from pathlib import Path -from rltorch.action_selector import ArgMaxSelector + class DQfDAgent: - def __init__(self, net, memory, config, target_net = None, logger = None): + def __init__(self, net, memory, config, target_net=None, logger=None): self.net = net self.target_net = target_net self.memory = memory @@ -21,7 +19,7 @@ class DQfDAgent: self.net.model.to(self.net.device) self.target_net.sync() - def learn(self, logger = None): + def learn(self, logger=None): if len(self.memory) < self.config['batch_size']: return @@ -32,29 +30,19 @@ class DQfDAgent: batch_size = self.config['batch_size'] steps = None - if isinstance(self.memory, M.DQfDMemory): - weight_importance = self.config['prioritized_replay_weight_importance'] - # If it's a scheduler then get the next value by calling next, otherwise just use it's value - beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance - - # Check to see if we are doing N-Step DQN - if steps is not None: - minibatch = self.memory.sample_n_steps(batch_size, steps, beta) - else: - minibatch = self.memory.sample(batch_size, beta = beta) - - # Process batch - state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True) - + weight_importance = self.config['prioritized_replay_weight_importance'] + # If it's a scheduler then get the next value by calling next, otherwise just use it's value + beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) \ + else weight_importance + + # Check to see if we are doing N-Step DQN + if steps is not None: + minibatch = self.memory.sample_n_steps(batch_size, steps, beta) else: - # Check to see if we're doing N-Step DQN - if steps is not None: - minibatch = self.memory.sample_n_steps(batch_size, steps) - else: - minibatch = self.memory.sample(batch_size) + minibatch = self.memory.sample(batch_size, beta=beta) - # Process batch - state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, batch_indexes = M.zip_batch(minibatch, want_indices = True) + # Process batch + state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True) batch_index_tensors = torch.tensor(batch_indexes) demo_mask = batch_index_tensors < self.memory.demo_position @@ -75,7 +63,7 @@ class DQfDAgent: # and the regular net to select the action # That way we decouple the value and action selecting processes (DOUBLE DQN) not_done_size = not_done_batch.sum() - next_state_values = torch.zeros_like(state_values, device = self.net.device) + next_state_values = torch.zeros_like(state_values, device=self.net.device) if self.target_net is not None: next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch]) next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1) @@ -83,14 +71,14 @@ class DQfDAgent: next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch]) next_best_action = next_state_values[not_done_batch].argmax(1) - best_next_state_value = torch.zeros(batch_size, device = self.net.device) + best_next_state_value = torch.zeros(batch_size, device=self.net.device) best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1) - expected_values = (reward_batch + (self.config['discount_rate'] * 
best_next_state_value)).unsqueeze(1) + expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1) # N-Step DQN Loss # num_steps capture how many steps actually exist before the end of episode - if steps != None: + if steps is not None: expected_n_step_values = [] with torch.no_grad(): for i in range(0, len(state_batch), steps): @@ -127,7 +115,7 @@ l = torch.ones_like(state_values[demo_mask]) expert_actions = action_batch[demo_mask] # l(s, a) is zero for every action the expert doesn't take - for i,a in zip(range(len(l)), expert_actions): + for (i, _), a in zip(enumerate(l), expert_actions): l[i].fill_(0.8) # According to paper l[i, a] = 0 if self.target_net is not None: @@ -139,35 +127,26 @@ # Iterate through hyperparamters if isinstance(self.config['dqfd_demo_loss_weight'], collections.Iterable): demo_importance = next(self.config['dqfd_demo_loss_weight']) - else: + else: demo_importance = self.config['dqfd_demo_loss_weight'] if isinstance(self.config['dqfd_td_loss_weight'], collections.Iterable): td_importance = next(self.config['dqfd_td_loss_weight']) - else: + else: td_importance = self.config['dqfd_td_loss_weight'] # Since dqn_loss and demo_loss are different sizes, the reduction has to happen before they are combined - if isinstance(self.memory, M.DQfDMemory): - dqn_loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.mse_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean() - else: - dqn_loss = F.mse_loss(obtained_values, expected_values) + dqn_loss = (torch.as_tensor(importance_weights, device=self.net.device) * F.mse_loss(obtained_values, expected_values, reduction='none').squeeze(1)).mean() - if steps != None: - if isinstance(self.memory, M.DQfDMemory): - dqn_n_step_loss = (torch.as_tensor(importance_weights[::steps], device = self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none')).mean() - else: - dqn_n_step_loss = F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none').mean() + if steps is not None: + dqn_n_step_loss = (torch.as_tensor(importance_weights[::steps], device=self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction='none')).mean() else: - dqn_n_step_loss = torch.tensor(0, device = self.net.device) + dqn_n_step_loss = torch.tensor(0, device=self.net.device) if demo_mask.sum() > 0: - if isinstance(self.memory, M.DQfDMemory): - demo_loss = (torch.as_tensor(importance_weights, device = self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1)).mean() - else: - demo_loss = F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1).mean() + demo_loss = (torch.as_tensor(importance_weights, device=self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction='none').squeeze(1)).mean() else: - demo_loss = 0.
+ demo_loss = 0 loss = td_importance * dqn_loss + td_importance * dqn_n_step_loss + demo_importance * demo_loss if self.logger is not None: diff --git a/rltorch/agents/PPOAgent.py b/rltorch/agents/PPOAgent.py index 8f6b78e..97a77e8 100644 --- a/rltorch/agents/PPOAgent.py +++ b/rltorch/agents/PPOAgent.py @@ -1,81 +1,72 @@ from copy import deepcopy -import numpy as np import torch import torch.nn.functional as F from torch.distributions import Categorical import rltorch -import rltorch.memory as M -import collections -import random class PPOAgent: - def __init__(self, policy_net, value_net, memory, config, logger = None): - self.policy_net = policy_net - self.old_policy_net = rltorch.network.TargetNetwork(policy_net) - self.value_net = value_net - self.memory = memory - self.config = deepcopy(config) - self.logger = logger + def __init__(self, policy_net, value_net, memory, config, logger=None): + self.policy_net = policy_net + self.old_policy_net = rltorch.network.TargetNetwork(policy_net) + self.value_net = value_net + self.memory = memory + self.config = deepcopy(config) + self.logger = logger - def _discount_rewards(self, rewards): - gammas = torch.ones_like(rewards) - if len(rewards) > 1: - gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0) - return gammas * rewards + def _discount_rewards(self, rewards): + gammas = torch.ones_like(rewards) + if len(rewards) > 1: + gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim=0) + return gammas * rewards - - def learn(self): - episode_batch = self.memory.recall() - state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch) + def learn(self): + episode_batch = self.memory.recall() + state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch) - # Send batches to the appropriate device - state_batch = torch.cat(state_batch).to(self.value_net.device) - action_batch = torch.tensor(action_batch).to(self.value_net.device) - reward_batch = torch.tensor(reward_batch).to(self.value_net.device).float() - not_done_batch = ~torch.tensor(done_batch).to(self.value_net.device) - next_state_batch = torch.cat(next_state_batch).to(self.value_net.device) - log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device) + # Send batches to the appropriate device + state_batch = torch.cat(state_batch).to(self.value_net.device) + action_batch = torch.tensor(action_batch).to(self.value_net.device) + reward_batch = torch.tensor(reward_batch).to(self.value_net.device).float() + not_done_batch = ~torch.tensor(done_batch).to(self.value_net.device) + next_state_batch = torch.cat(next_state_batch).to(self.value_net.device) + log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device) - ## Value Loss - # In PPO, the value loss is the difference between the discounted reward and the value from the first state - # The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode - value_loss = F.mse_loss(self._discount_rewards(reward_batch).sum(), self.value_net(state_batch[0])) - self.value_net.zero_grad() - value_loss.backward() - self.value_net.step() + ## Value Loss + # In PPO, the value loss is the difference between the discounted reward and the value from the first state + # The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode + value_loss = 
F.mse_loss(self._discount_rewards(reward_batch).sum(), self.value_net(state_batch[0])) + self.value_net.zero_grad() + value_loss.backward() + self.value_net.step() - ## Policy Loss - # Increase probabilities of advantageous states - # and decrease the probabilities of non-advantageous ones - with torch.no_grad(): - state_values = self.value_net(state_batch) - next_state_values = torch.zeros_like(state_values) - next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch]) - advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values - advantages = advantages.squeeze(1) + ## Policy Loss + # Increase probabilities of advantageous states + # and decrease the probabilities of non-advantageous ones + with torch.no_grad(): + state_values = self.value_net(state_batch) + next_state_values = torch.zeros_like(state_values) + next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch]) + advantages = (reward_batch.unsqueeze(1) + self.config['discount_rate'] * next_state_values) - state_values + advantages = advantages.squeeze(1) - action_probabilities = self.old_policy_net(state_batch).detach() - distributions = list(map(Categorical, action_probabilities)) - old_log_probs = torch.stack(list(map(lambda distribution, action: distribution.log_prob(action), distributions, action_batch))) + action_probabilities = self.old_policy_net(state_batch).detach() + distributions = list(map(Categorical, action_probabilities)) + old_log_probs = torch.stack(list(map(lambda distribution, action: distribution.log_prob(action), distributions, action_batch))) - # For PPO we want to stay within a certain ratio of the old policy - policy_ratio = torch.exp(log_prob_batch - old_log_probs) # Equivalent to (log_prob / old_log_prob) - policy_loss1 = policy_ratio * advantages - policy_loss2 = policy_ratio.clamp(min = 0.8, max = 1.2) * advantages # From original paper - policy_loss = -torch.min(policy_loss1, policy_loss2).sum() + # For PPO we want to stay within a certain ratio of the old policy + policy_ratio = torch.exp(log_prob_batch - old_log_probs) # Equivalent to (log_prob / old_log_prob) + policy_loss1 = policy_ratio * advantages + policy_loss2 = policy_ratio.clamp(min=0.8, max=1.2) * advantages # From original paper + policy_loss = -torch.min(policy_loss1, policy_loss2).sum() - if self.logger is not None: - self.logger.append("Loss/Policy", policy_loss.item()) - self.logger.append("Loss/Value", value_loss.item()) + if self.logger is not None: + self.logger.append("Loss/Policy", policy_loss.item()) + self.logger.append("Loss/Value", value_loss.item()) - - self.old_policy_net.sync() - self.policy_net.zero_grad() - policy_loss.backward() - self.policy_net.step() - - - # Memory under the old policy is not needed for future training - self.memory.clear() - + self.old_policy_net.sync() + self.policy_net.zero_grad() + policy_loss.backward() + self.policy_net.step() + # Memory under the old policy is not needed for future training + self.memory.clear() diff --git a/rltorch/agents/QEPAgent.py b/rltorch/agents/QEPAgent.py index 9dd0fd9..d46fab0 100644 --- a/rltorch/agents/QEPAgent.py +++ b/rltorch/agents/QEPAgent.py @@ -2,16 +2,17 @@ from copy import deepcopy import collections import numpy as np import torch +import torch.nn.functional as F from torch.distributions import Categorical import rltorch import rltorch.memory as M -import torch.nn.functional as F + # Q-Evolutionary Policy Agent # Maximizes the policy with respect to the Q-Value 
function. # Since function is non-differentiabile, depends on the Evolutionary Strategy algorithm class QEPAgent: - def __init__(self, policy_net, value_net, memory, config, target_value_net = None, logger = None, entropy_importance = 0, policy_skip = 4, after_value_train = None): + def __init__(self, policy_net, value_net, memory, config, target_value_net=None, logger=None, entropy_importance=0, policy_skip=4): self.policy_net = policy_net assert isinstance(self.policy_net, rltorch.network.ESNetwork) or isinstance(self.policy_net, rltorch.network.ESNetworkMP) self.policy_net.fitness = self.fitness @@ -22,7 +23,6 @@ class QEPAgent: self.logger = logger self.policy_skip = policy_skip self.entropy_importance = entropy_importance - self.after_value_train = after_value_train def save(self, file_location): torch.save({ @@ -42,43 +42,43 @@ batch_size = len(state_batch) with torch.no_grad(): action_probabilities = policy_net(state_batch) action_size = action_probabilities.shape[1] distributions = list(map(Categorical, action_probabilities)) actions = torch.stack([d.sample() for d in distributions]) with torch.no_grad(): state_values = value_net(state_batch) - + # Weird hacky solution where in multiprocess, it sometimes spits out nans # So have it try again while torch.isnan(state_values).any(): - print("NAN DETECTED") with torch.no_grad(): state_values = value_net(state_batch) - obtained_values = state_values.gather(1, actions.view(batch_size, 1)).squeeze(1) - + obtained_values = state_values.gather(1, actions.view(len(state_batch), 1)).squeeze(1) + # return -obtained_values.mean().item() + entropy_importance = 0 # Entropy accounting for 1% of loss seems to work well entropy_importance = next(self.entropy_importance) if isinstance(self.entropy_importance, collections.Iterable) else self.entropy_importance value_importance = 1 - entropy_importance # entropy_loss = (action_probabilities * torch.log2(action_probabilities)).sum(1) # Standard entropy loss from information theory - entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device = state_batch.device).repeat(batch_size, action_size)).abs().sum(1) + entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device=state_batch.device).repeat(len(state_batch), action_size)).abs().sum(1) return (entropy_importance * entropy_loss - value_importance * obtained_values).mean().item() - def learn(self, logger = None): + def learn(self, logger=None): if len(self.memory) < self.config['batch_size']: return - if (isinstance(self.memory, M.PrioritizedReplayMemory)): + if isinstance(self.memory, M.PrioritizedReplayMemory): weight_importance = self.config['prioritized_replay_weight_importance'] # If it's a scheduler then get the next value by calling next, otherwise just use it's value beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance - minibatch = self.memory.sample(self.config['batch_size'], beta = beta) - state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True) + minibatch = self.memory.sample(self.config['batch_size'], beta=beta) + state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True) else: minibatch = self.memory.sample(self.config['batch_size']) state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch) @@ -98,7 +96,7 @@
class QEPAgent: # and the regular net to select the action # That way we decouple the value and action selecting processes (DOUBLE DQN) not_done_size = not_done_batch.sum() - next_state_values = torch.zeros_like(state_values, device = self.value_net.device) + next_state_values = torch.zeros_like(state_values, device=self.value_net.device) if self.target_value_net is not None: next_state_values[not_done_batch] = self.target_value_net(next_state_batch[not_done_batch]) next_best_action = self.value_net(next_state_batch[not_done_batch]).argmax(1) @@ -106,13 +104,13 @@ class QEPAgent: next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch]) next_best_action = next_state_values[not_done_batch].argmax(1) - best_next_state_value = torch.zeros(self.config['batch_size'], device = self.value_net.device) + best_next_state_value = torch.zeros(self.config['batch_size'], device=self.value_net.device) best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1) expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1) - if (isinstance(self.memory, M.PrioritizedReplayMemory)): - value_loss = (torch.as_tensor(importance_weights, device = self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean() + if isinstance(self.memory, M.PrioritizedReplayMemory): + value_loss = (torch.as_tensor(importance_weights, device=self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean() else: value_loss = F.mse_loss(obtained_values, expected_values) @@ -124,28 +122,26 @@ class QEPAgent: self.value_net.clamp_gradients() self.value_net.step() - if callable(self.after_value_train): - self.after_value_train() - if self.target_value_net is not None: if 'target_sync_tau' in self.config: self.target_value_net.partial_sync(self.config['target_sync_tau']) else: self.target_value_net.sync() - if (isinstance(self.memory, M.PrioritizedReplayMemory)): + if isinstance(self.memory, M.PrioritizedReplayMemory): td_error = (obtained_values - expected_values).detach().abs() self.memory.update_priorities(batch_indexes, td_error) ## Policy Training if self.policy_skip > 0: - self.policy_skip -= 1 - return - self.policy_skip = self.config['policy_skip'] + self.policy_skip -= 1 + return + self.policy_skip = 4 + if self.target_value_net is not None: - self.policy_net.calc_gradients(self.target_value_net, state_batch) + self.policy_net.calc_gradients(self.target_value_net, state_batch) else: - self.policy_net.calc_gradients(self.value_net, state_batch) + self.policy_net.calc_gradients(self.value_net, state_batch) self.policy_net.step() diff --git a/rltorch/agents/REINFORCEAgent.py b/rltorch/agents/REINFORCEAgent.py index 7c8d8d6..12318dc 100644 --- a/rltorch/agents/REINFORCEAgent.py +++ b/rltorch/agents/REINFORCEAgent.py @@ -1,60 +1,60 @@ -import rltorch from copy import deepcopy -import torch import numpy as np +import torch +import rltorch class REINFORCEAgent: - def __init__(self, net , memory, config, target_net = None, logger = None): - self.net = net - if not isinstance(memory, rltorch.memory.EpisodeMemory): - raise ValueError("Memory must be of instance EpisodeMemory") - self.memory = memory - self.config = deepcopy(config) - self.target_net = target_net - self.logger = logger + def __init__(self, net, memory, config, target_net=None, logger=None): + self.net = net + if not isinstance(memory, rltorch.memory.EpisodeMemory): + raise 
ValueError("Memory must be of instance EpisodeMemory") + self.memory = memory + self.config = deepcopy(config) + self.target_net = target_net + self.logger = logger - # Shaped rewards implements three improvements to REINFORCE - # 1) Discounted rewards, future rewards matter less than current - # 2) Baselines: We use the mean reward to see if the current reward is advantageous or not - # 3) Causality: Your current actions do not affect your past. Only the present and future. - def _shape_rewards(self, rewards): - shaped_rewards = torch.zeros_like(rewards) - baseline = rewards.mean() - for i in range(len(rewards)): - gammas = torch.ones_like(rewards[i:]) - if i != len(rewards) - 1: - gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim = 0) - advantages = rewards[i:] - baseline - shaped_rewards[i] = (gammas * advantages).sum() - return shaped_rewards + # Shaped rewards implements three improvements to REINFORCE + # 1) Discounted rewards, future rewards matter less than current + # 2) Baselines: We use the mean reward to see if the current reward is advantageous or not + # 3) Causality: Your current actions do not affect your past. Only the present and future. + def _shape_rewards(self, rewards): + shaped_rewards = torch.zeros_like(rewards) + baseline = rewards.mean() + for i in range(len(rewards)): + gammas = torch.ones_like(rewards[i:]) + if i != len(rewards) - 1: + gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim=0) + advantages = rewards[i:] - baseline + shaped_rewards[i] = (gammas * advantages).sum() + return shaped_rewards - def learn(self): - episode_batch = self.memory.recall() - state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch) + def learn(self): + episode_batch = self.memory.recall() + _, _, reward_batch, _, _, log_prob_batch = zip(*episode_batch) - # Caluclate discounted rewards to place more importance to recent rewards - shaped_reward_batch = self._shape_rewards(torch.tensor(reward_batch)) + # Caluclate discounted rewards to place more importance to recent rewards + shaped_reward_batch = self._shape_rewards(torch.tensor(reward_batch)) - # Scale discounted rewards to have variance 1 (stabalizes training) - shaped_reward_batch = shaped_reward_batch / (shaped_reward_batch.std() + np.finfo('float').eps) + # Scale discounted rewards to have variance 1 (stabalizes training) + shaped_reward_batch = shaped_reward_batch / (shaped_reward_batch.std() + np.finfo('float').eps) - log_prob_batch = torch.cat(log_prob_batch) + log_prob_batch = torch.cat(log_prob_batch) - policy_loss = (-log_prob_batch * shaped_reward_batch).sum() + policy_loss = (-log_prob_batch * shaped_reward_batch).sum() - if self.logger is not None: + if self.logger is not None: self.logger.append("Loss", policy_loss.item()) - self.net.zero_grad() - policy_loss.backward() - self.net.clamp_gradients() - self.net.step() + self.net.zero_grad() + policy_loss.backward() + self.net.clamp_gradients() + self.net.step() - if self.target_net is not None: - if 'target_sync_tau' in self.config: - self.target_net.partial_sync(self.config['target_sync_tau']) - else: - self.target_net.sync() + if self.target_net is not None: + if 'target_sync_tau' in self.config: + self.target_net.partial_sync(self.config['target_sync_tau']) + else: + self.target_net.sync() - # Memory under the old policy is not needed for future training - self.memory.clear() + # Memory under the old policy is 
not needed for future training + self.memory.clear() diff --git a/rltorch/agents/__init__.py b/rltorch/agents/__init__.py index 2d12612..390d6be 100644 --- a/rltorch/agents/__init__.py +++ b/rltorch/agents/__init__.py @@ -1,6 +1,6 @@ -from .A2CSingleAgent import * -from .DQNAgent import * -from .DQfDAgent import * -from .PPOAgent import * -from .QEPAgent import * -from .REINFORCEAgent import * \ No newline at end of file +from .A2CSingleAgent import A2CSingleAgent +from .DQNAgent import DQNAgent +from .DQfDAgent import DQfDAgent +from .PPOAgent import PPOAgent +from .QEPAgent import QEPAgent +from .REINFORCEAgent import REINFORCEAgent diff --git a/rltorch/env/simulate.py b/rltorch/env/simulate.py index ec544cb..d9b65ea 100644 --- a/rltorch/env/simulate.py +++ b/rltorch/env/simulate.py @@ -1,108 +1,108 @@ -from copy import deepcopy -import rltorch +from copy import deepcopy import time +import rltorch -def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger = None, name = "", render = False): - for episode in range(total_episodes): - state = env.reset() - done = False - episode_reward = 0 - while not done: - action = actor.act(state) - next_state, reward, done, _ = env.step(action) - if render: - env.render() - time.sleep(0.01) +def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=None, name="", render=False): + for episode in range(total_episodes): + state = env.reset() + done = False + episode_reward = 0 + while not done: + action = actor.act(state) + next_state, reward, done, _ = env.step(action) + if render: + env.render() + time.sleep(0.01) - episode_reward = episode_reward + reward - if memory is not None: - memory.append(state, action, reward, next_state, done) - state = next_state + episode_reward = episode_reward + reward + if memory is not None: + memory.append(state, action, reward, next_state, done) + state = next_state - if episode % config['print_stat_n_eps'] == 0: - print("episode: {}/{}, score: {}" - .format(episode, total_episodes, episode_reward), flush=True) + if episode % config['print_stat_n_eps'] == 0: + print("episode: {}/{}, score: {}" + .format(episode, total_episodes, episode_reward), flush=True) - if logger is not None: - logger.append(name + '/EpisodeReward', episode_reward) + if logger is not None: + logger.append(name + '/EpisodeReward', episode_reward) -class EnvironmentRunSync(): - def __init__(self, env, actor, config, memory = None, logwriter = None, name = "", render = False): - self.env = env - self.name = name - self.actor = actor - self.config = deepcopy(config) - self.logwriter = logwriter - self.memory = memory - self.episode_num = 1 - self.episode_reward = 0 - self.last_state = env.reset() - self.render = render +class EnvironmentRunSync: + def __init__(self, env, actor, config, memory=None, logwriter=None, name="", render=False): + self.env = env + self.name = name + self.actor = actor + self.config = deepcopy(config) + self.logwriter = logwriter + self.memory = memory + self.episode_num = 1 + self.episode_reward = 0 + self.last_state = env.reset() + self.render = render - def run(self, iterations): - state = self.last_state - logger = rltorch.log.Logger() if self.logwriter is not None else None - for _ in range(iterations): - action = self.actor.act(state) - next_state, reward, done, _ = self.env.step(action) - if self.render: - self.env.render() + def run(self, iterations): + state = self.last_state + logger = rltorch.log.Logger() if self.logwriter is not None else None + for _ in range(iterations): 
+ action = self.actor.act(state) + next_state, reward, done, _ = self.env.step(action) + if self.render: + self.env.render() - self.episode_reward += reward - if self.memory is not None: - self.memory.append(state, action, reward, next_state, done) + self.episode_reward += reward + if self.memory is not None: + self.memory.append(state, action, reward, next_state, done) - state = next_state + state = next_state + + if done: + if self.episode_num % self.config['print_stat_n_eps'] == 0: + print("episode: {}/{}, score: {}" + .format(self.episode_num, self.config['total_training_episodes'], self.episode_reward), flush=True) + + if self.logwriter is not None: + logger.append(self.name + '/EpisodeReward', self.episode_reward) + self.episode_reward = 0 + state = self.env.reset() + self.episode_num += 1 + + if self.logwriter is not None: + self.logwriter.write(logger) + + self.last_state = state + + +class EnvironmentEpisodeSync: + def __init__(self, env, actor, config, memory=None, logwriter=None, name=""): + self.env = env + self.name = name + self.actor = actor + self.config = deepcopy(config) + self.logwriter = logwriter + self.memory = memory + self.episode_num = 1 + + def run(self): + state = self.env.reset() + done = False + episodeReward = 0 + logger = rltorch.log.Logger() if self.logwriter is not None else None + while not done: + action = self.actor.act(state) + next_state, reward, done, _ = self.env.step(action) + + episodeReward += reward + if self.memory is not None: + self.memory.append(state, action, reward, next_state, done) + + state = next_state - if done: if self.episode_num % self.config['print_stat_n_eps'] == 0: - print("episode: {}/{}, score: {}" - .format(self.episode_num, self.config['total_training_episodes'], self.episode_reward), flush=True) + print("episode: {}/{}, score: {}" + .format(self.episode_num, self.config['total_training_episodes'], episodeReward), flush=True) if self.logwriter is not None: - logger.append(self.name + '/EpisodeReward', self.episode_reward) - self.episode_reward = 0 - state = self.env.reset() - self.episode_num += 1 - - if self.logwriter is not None: - self.logwriter.write(logger) + logger.append(self.name + '/EpisodeReward', episodeReward) + self.logwriter.write(logger) - self.last_state = state - - -class EnvironmentEpisodeSync(): - def __init__(self, env, actor, config, memory = None, logwriter = None, name = ""): - self.env = env - self.name = name - self.actor = actor - self.config = deepcopy(config) - self.logwriter = logwriter - self.memory = memory - self.episode_num = 1 - - def run(self): - state = self.env.reset() - done = False - episodeReward = 0 - logger = rltorch.log.Logger() if self.logwriter is not None else None - while not done: - action = self.actor.act(state) - next_state, reward, done, _ = self.env.step(action) - - episodeReward += reward - if self.memory is not None: - self.memory.append(state, action, reward, next_state, done) - - state = next_state - - if self.episode_num % self.config['print_stat_n_eps'] == 0: - print("episode: {}/{}, score: {}" - .format(self.episode_num, self.config['total_training_episodes'], episodeReward), flush=True) - - if self.logwriter is not None: - logger.append(self.name + '/EpisodeReward', episodeReward) - self.logwriter.write(logger) - - self.episode_num += 1 + self.episode_num += 1 diff --git a/rltorch/env/wrappers.py b/rltorch/env/wrappers.py index 2bd5b97..0edfa99 100644 --- a/rltorch/env/wrappers.py +++ b/rltorch/env/wrappers.py @@ -1,8 +1,8 @@ +from collections import deque import gym 
import torch from gym import spaces import cv2 -from collections import deque import numpy as np class EpisodicLifeEnv(gym.Wrapper): @@ -111,129 +111,134 @@ class ClippedRewardsWrapper(gym.RewardWrapper): # Mostly derived from OpenAI baselines class FireResetEnv(gym.Wrapper): - def __init__(self, env): - """Take action on reset for environments that are fixed until firing.""" - gym.Wrapper.__init__(self, env) - assert env.unwrapped.get_action_meanings()[1] == 'FIRE' - assert len(env.unwrapped.get_action_meanings()) >= 3 + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + gym.Wrapper.__init__(self, env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 - def reset(self, **kwargs): - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(1) - if done: - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(2) - if done: - self.env.reset(**kwargs) - return obs + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs - def step(self, ac): - return self.env.step(ac) + def step(self, ac): + return self.env.step(ac) class LazyFrames(object): - def __init__(self, frames): - """This object ensures that common frames between the observations are only stored once. - It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay - buffers. - This object should only be converted to numpy array before being passed to the model. - You'd not believe how complex the previous solution was.""" - self._frames = frames - self._out = None + def __init__(self, frames): + """This object ensures that common frames between the observations are only stored once. + It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay + buffers. + This object should only be converted to numpy array before being passed to the model. + You'd not believe how complex the previous solution was.""" + self._frames = frames + self._out = None - def _force(self): - if self._out is None: - self._out = torch.stack(self._frames) - self._frames = None - return self._out + def _force(self): + if self._out is None: + self._out = torch.stack(self._frames) + self._frames = None + return self._out - def __array__(self, dtype=None): - out = self._force() - if dtype is not None: - out = out.astype(dtype) - return out + def __array__(self, dtype=None): + out = self._force() + if dtype is not None: + out = out.astype(dtype) + return out - def __len__(self): - return len(self._force()) + def __len__(self): + return len(self._force()) - def __getitem__(self, i): - return self._force()[i] + def __getitem__(self, i): + return self._force()[i] class FrameStack(gym.Wrapper): - def __init__(self, env, k): - """Stack k last frames. - Returns lazy array, which is much more memory efficient. - See Also - -------- - baselines.common.atari_wrappers.LazyFrames - """ - gym.Wrapper.__init__(self, env) - self.k = k - self.frames = deque([], maxlen=k) - shp = env.observation_space.shape - self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype) + def __init__(self, env, k): + """Stack k last frames. + Returns lazy array, which is much more memory efficient. 
+ See Also + -------- + baselines.common.atari_wrappers.LazyFrames + """ + gym.Wrapper.__init__(self, env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + self.observation_space = spaces.Box( + low=0, + high=255, + shape=(shp[:-1] + (shp[-1] * k,)), + dtype=env.observation_space.dtype + ) - def reset(self): - ob = self.env.reset() - for _ in range(self.k): - self.frames.append(ob) - return self._get_ob() + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return self._get_ob() - def step(self, action): - ob, reward, done, info = self.env.step(action) - self.frames.append(ob) - return self._get_ob(), reward, done, info + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return self._get_ob(), reward, done, info - def _get_ob(self): - assert len(self.frames) == self.k - # return LazyFrames(list(self.frames)) - return torch.cat(list(self.frames)).unsqueeze(0) + def _get_ob(self): + assert len(self.frames) == self.k + # return LazyFrames(list(self.frames)) + return torch.cat(list(self.frames)).unsqueeze(0) class ProcessFrame(gym.Wrapper): - def __init__(self, env, resize_shape = None, crop_bounds = None, grayscale = False): - gym.Wrapper.__init__(self, env) - self.resize_shape = resize_shape - self.crop_bounds = crop_bounds - self.grayscale = grayscale + def __init__(self, env, resize_shape=None, crop_bounds=None, grayscale=False): + gym.Wrapper.__init__(self, env) + self.resize_shape = resize_shape + self.crop_bounds = crop_bounds + self.grayscale = grayscale - def reset(self): - return self._preprocess(self.env.reset()) + def reset(self): + return self._preprocess(self.env.reset()) - def step(self, action): - next_state, reward, done, info = self.env.step(action) - next_state = self._preprocess(next_state) - return next_state, reward, done, info + def step(self, action): + next_state, reward, done, info = self.env.step(action) + next_state = self._preprocess(next_state) + return next_state, reward, done, info - def _preprocess(self, frame): - if self.grayscale: - frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) - if self.crop_bounds is not None and len(self.crop_bounds) == 4: - frame = frame[self.crop_bounds[0]:self.crop_bounds[1], self.crop_bounds[2]:self.crop_bounds[3]] - if self.resize_shape is not None and len(self.resize_shape) == 2: - frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA) - # Normalize - frame = frame / 255 - return frame - + def _preprocess(self, frame): + if self.grayscale: + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + if self.crop_bounds is not None and len(self.crop_bounds) == 4: + frame = frame[ + self.crop_bounds[0]:self.crop_bounds[1], + self.crop_bounds[2]:self.crop_bounds[3] + ] + if self.resize_shape is not None and len(self.resize_shape) == 2: + frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA) + # Normalize + frame = frame / 255 + return frame # Turns observations into torch tensors # Adds an additional dimension that's suppose to represent the batch dim class TorchWrap(gym.Wrapper): - def __init__(self, env): - gym.Wrapper.__init__(self, env) + def __init__(self, env): + gym.Wrapper.__init__(self, env) - def reset(self): - return self._convert(self.env.reset()) + def reset(self): + return self._convert(self.env.reset()) - def step(self, action): - next_state, reward, done, info = self.env.step(action) - next_state = self._convert(next_state) - return next_state, reward, done, 
info - - def _convert(self, frame): - frame = torch.from_numpy(frame).unsqueeze(0).float() - return frame - + def step(self, action): + next_state, reward, done, info = self.env.step(action) + next_state = self._convert(next_state) + return next_state, reward, done, info + def _convert(self, frame): + frame = torch.from_numpy(frame).unsqueeze(0).float() + return frame class ProcessFrame84(gym.ObservationWrapper): def __init__(self, env=None): @@ -256,4 +261,3 @@ class ProcessFrame84(gym.ObservationWrapper): x_t = resized_screen[18:102, :] x_t = np.reshape(x_t, [84, 84]) return x_t.astype(np.uint8) - diff --git a/rltorch/memory/DQfDMemory.py b/rltorch/memory/DQfDMemory.py index 3a8341b..f62b443 100644 --- a/rltorch/memory/DQfDMemory.py +++ b/rltorch/memory/DQfDMemory.py @@ -1,13 +1,13 @@ -from .PrioritizedReplayMemory import PrioritizedReplayMemory from collections import namedtuple import numpy as np +from .PrioritizedReplayMemory import PrioritizedReplayMemory Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done')) class DQfDMemory(PrioritizedReplayMemory): - def __init__(self, capacity, alpha, max_demo = -1): + def __init__(self, capacity, alpha, max_demo=-1): assert max_demo <= capacity super().__init__(capacity, alpha) self.demo_position = 0 @@ -47,7 +47,8 @@ class DQfDMemory(PrioritizedReplayMemory): idxes = self._sample_proportional(sample_size) step_idxes = [] for i in idxes: - # If the interval of experiences fall between demonstration and obtained, move it over to the demonstration half + # If the interval of experiences fall between demonstration and obtained, + # move it over to the demonstration half if i < self.demo_position and i + steps > self.demo_position: diff = i + steps - self.demo_position step_idxes += range(i - diff, i + steps - diff) diff --git a/rltorch/memory/EpisodeMemory.py b/rltorch/memory/EpisodeMemory.py index 27efa69..375d8d2 100644 --- a/rltorch/memory/EpisodeMemory.py +++ b/rltorch/memory/EpisodeMemory.py @@ -1,6 +1,4 @@ -import random from collections import namedtuple -import torch Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done')) @@ -39,7 +37,7 @@ class EpisodeMemory(object): def recall(self): """ - Return a list of the transitions with their + Return a list of the transitions with their associated log-based probabilities. 
""" if len(self.memory) != len(self.log_probs): diff --git a/rltorch/memory/PrioritizedReplayMemory.py b/rltorch/memory/PrioritizedReplayMemory.py index 1bf153a..8966850 100644 --- a/rltorch/memory/PrioritizedReplayMemory.py +++ b/rltorch/memory/PrioritizedReplayMemory.py @@ -1,10 +1,9 @@ # From OpenAI Baselines https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py - -from .ReplayMemory import ReplayMemory import operator import random import numpy as np from numba import jit +from .ReplayMemory import ReplayMemory class SegmentTree(object): def __init__(self, capacity, operation, neutral_element): @@ -34,7 +33,7 @@ class SegmentTree(object): self._value = [neutral_element for _ in range(2 * capacity)] self._operation = operation - @jit(forceobj = True) + @jit(forceobj=True) def _reduce_helper(self, start, end, node, node_start, node_end): if start == node_start and end == node_end: return self._value[node] @@ -50,7 +49,7 @@ class SegmentTree(object): self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) ) - @jit(forceobj = True) + @jit(forceobj=True) def reduce(self, start=0, end=None): """Returns result of applying `self.operation` to a contiguous subsequence of the array. @@ -73,7 +72,7 @@ class SegmentTree(object): end -= 1 return self._reduce_helper(start, end, 1, 0, self._capacity - 1) - @jit(forceobj = True) + @jit(forceobj=True) def __setitem__(self, idx, val): # index of the leaf idx += self._capacity @@ -86,7 +85,7 @@ class SegmentTree(object): ) idx //= 2 - @jit(forceobj = True) + @jit(forceobj=True) def __getitem__(self, idx): assert 0 <= idx < self._capacity return self._value[self._capacity + idx] @@ -100,12 +99,12 @@ class SumSegmentTree(SegmentTree): neutral_element=0.0 ) - @jit(forceobj = True) + @jit(forceobj=True) def sum(self, start=0, end=None): """Returns arr[start] + ... + arr[end]""" return super(SumSegmentTree, self).reduce(start, end) - @jit(forceobj = True, parallel = True) + @jit(forceobj=True, parallel=True) def find_prefixsum_idx(self, prefixsum): """Find the highest index `i` in the array such that sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum @@ -140,7 +139,7 @@ class MinSegmentTree(SegmentTree): neutral_element=float('inf') ) - @jit(forceobj = True) + @jit(forceobj=True) def min(self, start=0, end=None): """Returns min(arr[start], ..., arr[end])""" return super(MinSegmentTree, self).reduce(start, end) @@ -185,7 +184,7 @@ class PrioritizedReplayMemory(ReplayMemory): self._it_sum[idx] = self._max_priority ** self._alpha self._it_min[idx] = self._max_priority ** self._alpha - @jit(forceobj = True) + @jit(forceobj=True) def _sample_proportional(self, batch_size): res = [] p_total = self._it_sum.sum(0, len(self.memory) - 1) @@ -294,7 +293,7 @@ class PrioritizedReplayMemory(ReplayMemory): batch = list(zip(*encoded_sample, weights, step_idxes)) return batch - @jit(forceobj = True) + @jit(forceobj=True) def update_priorities(self, idxes, priorities): """ Update priorities of sampled transitions. @@ -320,4 +319,3 @@ class PrioritizedReplayMemory(ReplayMemory): self._it_min[idx] = priority ** self._alpha self._max_priority = max(self._max_priority, priority) - diff --git a/rltorch/memory/ReplayMemory.py b/rltorch/memory/ReplayMemory.py index 56b28f5..ad9f726 100644 --- a/rltorch/memory/ReplayMemory.py +++ b/rltorch/memory/ReplayMemory.py @@ -80,7 +80,7 @@ class ReplayMemory(object): The number of observations after the one selected to sample. 
""" idxes = random.sample( - range(len(self.memory) - steps), + range(len(self.memory) - steps), batch_size // steps ) step_idxes = [] @@ -106,11 +106,9 @@ class ReplayMemory(object): def __reversed__(self): return reversed(self.memory) -def zip_batch(minibatch, priority = False, want_indices = False): +def zip_batch(minibatch, priority=False): if priority: state_batch, action_batch, reward_batch, next_state_batch, done_batch, weights, indexes = zip(*minibatch) - elif want_indices: - state_batch, action_batch, reward_batch, next_state_batch, done_batch, indexes = zip(*minibatch) else: state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch) @@ -122,7 +120,5 @@ def zip_batch(minibatch, priority = False, want_indices = False): if priority: return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, weights, indexes - elif want_indices: - return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, indexes else: - return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch \ No newline at end of file + return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch diff --git a/rltorch/mp/EnvironmentEpisode.py b/rltorch/mp/EnvironmentEpisode.py index c753e29..87d3502 100644 --- a/rltorch/mp/EnvironmentEpisode.py +++ b/rltorch/mp/EnvironmentEpisode.py @@ -5,115 +5,32 @@ from copy import deepcopy import torch.multiprocessing as mp class EnvironmentEpisode(mp.Process): - def __init__(self, env, actor, config, logger = None, name = ""): - super(EnvironmentEpisode, self).__init__() - self.env = env - self.actor = actor - self.config = deepcopy(config) - self.logger = logger - self.name = name - self.episode_num = 1 + def __init__(self, env, actor, config, logger=None, name=""): + super(EnvironmentEpisode, self).__init__() + self.env = env + self.actor = actor + self.config = deepcopy(config) + self.logger = logger + self.name = name + self.episode_num = 1 - def run(self, printstat = False, memory = None): - state = self.env.reset() - done = False - episode_reward = 0 - while not done: - action = self.actor.act(state) - next_state, reward, done, _ = self.env.step(action) + def run(self, printstat=False, memory=None): + state = self.env.reset() + done = False + episode_reward = 0 + while not done: + action = self.actor.act(state) + next_state, reward, done, _ = self.env.step(action) - episode_reward = episode_reward + reward - if memory is not None: - memory.put((state, action, reward, next_state, done)) - state = next_state + episode_reward = episode_reward + reward + if memory is not None: + memory.put((state, action, reward, next_state, done)) + state = next_state - if printstat: - print("episode: {}/{}, score: {}" - .format(self.episode_num, self.config['total_training_episodes'], episode_reward)) - if self.logger is not None: - self.logger.append(self.name + '/EpisodeReward', episode_reward) + if printstat: + print("episode: {}/{}, score: {}" + .format(self.episode_num, self.config['total_training_episodes'], episode_reward)) + if self.logger is not None: + self.logger.append(self.name + '/EpisodeReward', episode_reward) - self.episode_num += 1 - - - - - - - - -# from copy import deepcopy -# import torch.multiprocessing as mp -# from ctypes import * -# import rltorch.log - -# def envepisode(actor, env, episode_num, config, runcondition, memoryqueue = None, logqueue = None, name = ""): -# # Wait for signal to start running through the environment -# while runcondition.wait(): -# # 
Start a logger to log the rewards -# logger = rltorch.log.Logger() -# state = env.reset() -# episode_reward = 0 -# done = False -# while not done: -# action = actor.act(state) -# next_state, reward, done, _ = env.step(action) - -# episode_reward += reward -# if memoryqueue is not None: -# memoryqueue.put((state, action, reward, next_state, done)) - -# state = next_state - -# if done: -# with episode_num.get_lock(): -# if episode_num.value % config['print_stat_n_eps'] == 0: -# print("episode: {}/{}, score: {}" -# .format(episode_num.value, config['total_training_episodes'], episode_reward)) - -# if logger is not None: -# logger.append(name + '/EpisodeReward', episode_reward) -# episode_reward = 0 -# state = env.reset() -# with episode_num.get_lock(): -# episode_num.value += 1 - -# logqueue.put(logger) - -# class EnvironmentRun(): -# def __init__(self, env_func, actor, config, memory = None, name = ""): -# self.config = deepcopy(config) -# self.memory = memory -# self.episode_num = mp.Value(c_uint) -# self.runcondition = mp.Event() -# # Interestingly enough, there isn't a good reliable way to know how many states an episode will have -# # Perhaps we can share a uint to keep track... -# self.memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1) -# self.logqueue = mp.Queue(maxsize = 1) -# with self.episode_num.get_lock(): -# self.episode_num.value = 1 -# self.runner = mp.Process(target=envrun, -# args=(actor, env_func, self.episode_num, config, self.runcondition), -# kwargs = {'iterations': config['replay_skip'] + 1, -# 'memoryqueue' : self.memory_queue, 'logqueue' : self.logqueue, 'name' : name}) -# self.runner.start() - -# def run(self): -# self.runcondition.set() - -# def join(self): -# self._sync_memory() -# if self.logwriter is not None: -# self.logwriter.write(self._get_reward_logger()) - -# def sync_memory(self): -# if self.memory is not None: -# for i in range(self.config['replay_skip'] + 1): -# self.memory.append(*self.memory_queue.get()) - -# def get_reward_logger(self): -# return self.logqueue.get() - -# def terminate(self): -# self.runner.terminate() - + self.episode_num += 1 diff --git a/rltorch/mp/EnvironmentRun.py b/rltorch/mp/EnvironmentRun.py index bcd82af..0d21a35 100644 --- a/rltorch/mp/EnvironmentRun.py +++ b/rltorch/mp/EnvironmentRun.py @@ -1,40 +1,40 @@ from copy import deepcopy +from ctypes import c_uint import torch.multiprocessing as mp -from ctypes import * import rltorch.log -def envrun(actor, env, episode_num, config, runcondition, iterations = 1, memoryqueue = None, logqueue = None, name = ""): - state = env.reset() - episode_reward = 0 - # Wait for signal to start running through the environment - while runcondition.wait(): - # Start a logger to log the rewards - logger = rltorch.log.Logger() if logqueue is not None else None +def envrun(actor, env, episode_num, config, runcondition, iterations=1, memoryqueue=None, logqueue=None, name=""): + state = env.reset() + episode_reward = 0 + # Wait for signal to start running through the environment + while runcondition.wait(): + # Start a logger to log the rewards + logger = rltorch.log.Logger() if logqueue is not None else None for _ in range(iterations): - action = actor.act(state) - next_state, reward, done, _ = env.step(action) - - episode_reward += reward - if memoryqueue is not None: - memoryqueue.put((state, action, reward, next_state, done)) - - state = next_state - - if done: - with episode_num.get_lock(): - if episode_num.value % config['print_stat_n_eps'] == 0: - print("episode: {}/{}, score: {}" - 
.format(episode_num.value, config['total_training_episodes'], episode_reward)) - - if logger is not None: - logger.append(name + '/EpisodeReward', episode_reward) - episode_reward = 0 - state = env.reset() - with episode_num.get_lock(): - episode_num.value += 1 - - if logqueue is not None: - logqueue.put(logger) + action = actor.act(state) + next_state, reward, done, _ = env.step(action) + + episode_reward += reward + if memoryqueue is not None: + memoryqueue.put((state, action, reward, next_state, done)) + + state = next_state + + if done: + with episode_num.get_lock(): + if episode_num.value % config['print_stat_n_eps'] == 0: + print("episode: {}/{}, score: {}" + .format(episode_num.value, config['total_training_episodes'], episode_reward)) + + if logger is not None: + logger.append(name + '/EpisodeReward', episode_reward) + episode_reward = 0 + state = env.reset() + with episode_num.get_lock(): + episode_num.value += 1 + + if logqueue is not None: + logqueue.put(logger) class EnvironmentRun(): def __init__(self, env, actor, config, memory = None, logwriter = None, name = ""): diff --git a/rltorch/network/ESNetwork.py b/rltorch/network/ESNetwork.py index 6c83def..f498806 100644 --- a/rltorch/network/ESNetwork.py +++ b/rltorch/network/ESNetwork.py @@ -1,7 +1,8 @@ +from copy import deepcopy import numpy as np import torch from .Network import Network -from copy import deepcopy + # [TODO] Should we torch.no_grad the __call__? # What if we want to sometimes do gradient descent as well? @@ -11,8 +12,8 @@ class ESNetwork(Network): Notes ----- - Derived from the paper - Evolutionary Strategies + Derived from the paper + Evolutionary Strategies (https://arxiv.org/abs/1703.03864) Parameters @@ -38,7 +39,7 @@ class ESNetwork(Network): name For use in logger to differentiate in analysis. """ - def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""): + def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""): super(ESNetwork, self).__init__(model, optimizer, config, device, logger, name) self.population_size = population_size self.fitness = fitness_fn @@ -49,7 +50,7 @@ class ESNetwork(Network): """ Notes ----- - Since gradients aren't going to be computed in the + Since gradients aren't going to be computed in the traditional fashion, there is no need to keep track of the computations performed on the tensors. @@ -64,7 +65,11 @@ class ESNetwork(Network): white_noise_dict = {} noise_dict = {} for key in model_dict.keys(): - white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device) + white_noise_dict[key] = torch.randn( + self.population_size, + *model_dict[key].shape, + device=self.device + ) noise_dict[key] = self.sigma * white_noise_dict[key] return white_noise_dict, noise_dict @@ -87,7 +92,7 @@ class ESNetwork(Network): This is calculated by evaluating the fitness of multiple networks according to the fitness function specified in - the class. + the class. 
""" ## Generate Noise white_noise_dict, noise_dict = self._generate_noise_dicts() @@ -96,7 +101,10 @@ class ESNetwork(Network): candidate_solutions = self._generate_candidate_solutions(noise_dict) ## Calculate fitness then mean shift, scale - fitness_values = torch.tensor([self.fitness(x, *args) for x in candidate_solutions], device = self.device) + fitness_values = torch.tensor( + [self.fitness(x, *args) for x in candidate_solutions], + device=self.device + ) if self.logger is not None: self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item()) fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps) @@ -107,4 +115,4 @@ class ESNetwork(Network): if param.requires_grad: noise_dim_n = len(white_noise_dict[name].shape) dim = np.repeat(1, noise_dim_n - 1).tolist() if noise_dim_n > 0 else [] - param.grad = (white_noise_dict[name] * fitness_values.float().reshape(self.population_size, *dim)).mean(0) / self.sigma \ No newline at end of file + param.grad = (white_noise_dict[name] * fitness_values.float().reshape(self.population_size, *dim)).mean(0) / self.sigma diff --git a/rltorch/network/ESNetworkMP.py b/rltorch/network/ESNetworkMP.py index 69b0d21..85372b4 100644 --- a/rltorch/network/ESNetworkMP.py +++ b/rltorch/network/ESNetworkMP.py @@ -1,9 +1,8 @@ +from copy import deepcopy import numpy as np import torch -from .Network import Network -from copy import deepcopy import torch.multiprocessing as mp -import functools +from .Network import Network class fn_copy: def __init__(self, fn, args): @@ -20,14 +19,15 @@ class ESNetworkMP(Network): fitness_fun := model, *args -> fitness_value (float) We wish to find a model that maximizes the fitness function """ - def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""): + def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""): super(ESNetworkMP, self).__init__(model, optimizer, config, device, logger, name) self.population_size = population_size self.fitness = fitness_fn self.sigma = sigma assert self.sigma > 0 mp_ctx = mp.get_context("spawn") - self.pool = mp_ctx.Pool(processes=2) #[TODO] Probably should make number of processes a config variable + #[TODO] Probably should make number of processes a config variable + self.pool = mp_ctx.Pool(processes=2) # We're not going to be calculating gradients in the traditional way # So there's no need to waste computation time keeping track @@ -42,7 +42,11 @@ class ESNetworkMP(Network): white_noise_dict = {} noise_dict = {} for key in model_dict.keys(): - white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device) + white_noise_dict[key] = torch.randn( + self.population_size, + *model_dict[key].shape, + device=self.device + ) noise_dict[key] = self.sigma * white_noise_dict[key] return white_noise_dict, noise_dict @@ -67,7 +71,10 @@ class ESNetworkMP(Network): candidate_solutions = self._generate_candidate_solutions(noise_dict) ## Calculate fitness then mean shift, scale - fitness_values = torch.tensor(list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)), device = self.device) + fitness_values = torch.tensor( + list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)), + device=self.device + ) if self.logger is not None: self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item()) @@ -87,4 +94,4 @@ class 
ESNetworkMP(Network): def __getstate__(self): self_dict = self.__dict__.copy() del self_dict['pool'] - return self_dict \ No newline at end of file + return self_dict diff --git a/rltorch/network/Network.py b/rltorch/network/Network.py index d16d436..db09d6a 100644 --- a/rltorch/network/Network.py +++ b/rltorch/network/Network.py @@ -17,12 +17,16 @@ class Network: name For use in logger to differentiate in analysis. """ - def __init__(self, model, optimizer, config, device = None, logger = None, name = ""): + def __init__(self, model, optimizer, config, device=None, logger=None, name=""): self.model = model if 'weight_decay' in config: - self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'], weight_decay = config['weight_decay']) + self.optimizer = optimizer( + model.parameters(), + lr=config['learning_rate'], + weight_decay=config['weight_decay'] + ) else: - self.optimizer = optimizer(model.parameters(), lr = config['learning_rate']) + self.optimizer = optimizer(model.parameters(), lr=config['learning_rate']) self.logger = logger self.name = name self.device = device @@ -32,7 +36,7 @@ class Network: def __call__(self, *args): return self.model(*args) - def clamp_gradients(self, x = 1): + def clamp_gradients(self, x=1): """ Forcing gradients to stay within a certain interval by setting it to the bound if it goes over it. diff --git a/rltorch/network/NoisyLinear.py b/rltorch/network/NoisyLinear.py index cd8b905..82e039c 100644 --- a/rltorch/network/NoisyLinear.py +++ b/rltorch/network/NoisyLinear.py @@ -1,67 +1,67 @@ +import math import torch import torch.nn as nn import torch.nn.functional as F -import math + # This class utilizes this property of the normal distribution # N(mu, sigma) = mu + sigma * N(0, 1) class NoisyLinear(nn.Linear): - """ - Draws the parameters of nn.Linear from a normal distribution. - The parameters of the normal distribution are registered as - learnable parameters in the neural network. - - Parameters - ---------- - in_features - Size of each input sample. - out_features - Size of each output sample. - sigma_init - The starting standard deviation of guassian noise. - bias - If set to False, the layer will not - learn an additive bias. - Default: True - """ - def __init__(self, in_features, out_features, sigma_init = 0.017, bias = True): - super(NoisyLinear, self).__init__(in_features, out_features, bias = bias) - # One of the parameters the network is going to tune is the - # standard deviation of the gaussian noise on the weights - self.sigma_weight = nn.Parameter(torch.Tensor(out_features, in_features).fill_(sigma_init)) - # Reserve space for N(0, 1) of weights in the forward() call - self.register_buffer("s_normal_weight", torch.zeros(out_features, in_features)) - if bias: - # If a bias exists, then we manipulate the standard deviation of the - # gaussion noise on them as well - self.sigma_bias = nn.Parameter(torch.Tensor(out_features).fill_(sigma_init)) - # Reserve space for N(0, 1) of bias in the foward() call - self.register_buffer("s_normal_bias", torch.zeros(out_features)) - self.reset_parameters() - - def reset_parameters(self): - std = math.sqrt(3 / self.in_features) - nn.init.uniform_(self.weight, -std, std) - nn.init.uniform_(self.bias, -std, std) - - def forward(self, x): - r""" - Calculates the output :math:`y` through the following: - - :math:`sigma \sim N(mu_1, std_1)` - - :math:`bias \sim N(mu_2, std_2)` - - :math:`y = sigma \cdot x + bias` """ + Draws the parameters of nn.Linear from a normal distribution. 
+ The parameters of the normal distribution are registered as + learnable parameters in the neural network. + Parameters + ---------- + in_features + Size of each input sample. + out_features + Size of each output sample. + sigma_init + The starting standard deviation of guassian noise. + bias + If set to False, the layer will not + learn an additive bias. + Default: True + """ + def __init__(self, in_features, out_features, sigma_init=0.017, bias=True): + super(NoisyLinear, self).__init__(in_features, out_features, bias=bias) + # One of the parameters the network is going to tune is the + # standard deviation of the gaussian noise on the weights + self.sigma_weight = nn.Parameter(torch.Tensor(out_features, in_features).fill_(sigma_init)) + # Reserve space for N(0, 1) of weights in the forward() call + self.register_buffer("s_normal_weight", torch.zeros(out_features, in_features)) + if bias: + # If a bias exists, then we manipulate the standard deviation of the + # gaussion noise on them as well + self.sigma_bias = nn.Parameter(torch.Tensor(out_features).fill_(sigma_init)) + # Reserve space for N(0, 1) of bias in the foward() call + self.register_buffer("s_normal_bias", torch.zeros(out_features)) + self.reset_parameters() + + def reset_parameters(self): + std = math.sqrt(3 / self.in_features) + nn.init.uniform_(self.weight, -std, std) + nn.init.uniform_(self.bias, -std, std) + + def forward(self, x): + r""" + Calculates the output :math:`y` through the following: + + :math:`sigma \sim N(mu_1, std_1)` + + :math:`bias \sim N(mu_2, std_2)` + + :math:`y = sigma \cdot x + bias` + """ # Fill s_normal_weight with values from the standard normal distribution - self.s_normal_weight.normal_() - weight_noise = self.sigma_weight * self.s_normal_weight.clone().requires_grad_() - - bias = None - if self.bias is not None: - # Fill s_normal_bias with values from standard normal - self.s_normal_bias.normal_() - bias = self.bias + self.sigma_bias * self.s_normal_bias.clone().requires_grad_() - - return F.linear(x, self.weight + weight_noise, bias) \ No newline at end of file + self.s_normal_weight.normal_() + weight_noise = self.sigma_weight * self.s_normal_weight.clone().requires_grad_() + + bias = None + if self.bias is not None: + # Fill s_normal_bias with values from standard normal + self.s_normal_bias.normal_() + bias = self.bias + self.sigma_bias * self.s_normal_bias.clone().requires_grad_() + + return F.linear(x, self.weight + weight_noise, bias) diff --git a/rltorch/network/TargetNetwork.py b/rltorch/network/TargetNetwork.py index 3339bdd..ddf6122 100644 --- a/rltorch/network/TargetNetwork.py +++ b/rltorch/network/TargetNetwork.py @@ -11,7 +11,7 @@ class TargetNetwork: device The device to put the cloned parameters in. """ - def __init__(self, network, device = None): + def __init__(self, network, device=None): self.model = network.model self.target_model = deepcopy(network.model) if device is not None: @@ -37,7 +37,8 @@ class TargetNetwork: Parameters ---------- tau : number - A number between 0-1 which indicates the proportion of the originator and clone in the new clone. + A number between 0-1 which indicates + the proportion of the originator and clone in the new clone. 
""" assert isinstance(tau, float) assert 0.0 < tau <= 1.0 @@ -45,4 +46,4 @@ class TargetNetwork: target_state = self.target_model.state_dict() for grad_index, grad in model_state.items(): target_state[grad_index].copy_((1 - tau) * target_state[grad_index] + tau * grad) - self.target_model.load_state_dict(target_state) \ No newline at end of file + self.target_model.load_state_dict(target_state) diff --git a/rltorch/network/__init__.py b/rltorch/network/__init__.py index d178654..6f35945 100644 --- a/rltorch/network/__init__.py +++ b/rltorch/network/__init__.py @@ -1,5 +1,5 @@ -from .ESNetwork import * -from .ESNetworkMP import * -from .Network import * -from .NoisyLinear import * -from .TargetNetwork import * \ No newline at end of file +from .ESNetwork import ESNetwork +from .ESNetworkMP import ESNetworkMP +from .Network import Network +from .NoisyLinear import NoisyLinear +from .TargetNetwork import TargetNetwork \ No newline at end of file diff --git a/rltorch/scheduler/ExponentialScheduler.py b/rltorch/scheduler/ExponentialScheduler.py index 97a9ebd..0f081d2 100644 --- a/rltorch/scheduler/ExponentialScheduler.py +++ b/rltorch/scheduler/ExponentialScheduler.py @@ -36,4 +36,3 @@ class ExponentialScheduler(Scheduler): return self.initial_value * (self.base ** (self.current_iteration - 1)) else: return self.end_value - diff --git a/rltorch/scheduler/Scheduler.py b/rltorch/scheduler/Scheduler.py index 0314907..df7a34b 100644 --- a/rltorch/scheduler/Scheduler.py +++ b/rltorch/scheduler/Scheduler.py @@ -7,4 +7,4 @@ class Scheduler(): def __iter__(self): return self def __next__(self): - raise NotImplementedError("Scheduler does not have it's function to create a value implemented") \ No newline at end of file + raise NotImplementedError("__next__ not implemented in Scheduler")