PEP8 Conformance
parent 9b81188a77
commit 8fa4691511
29 changed files with 652 additions and 755 deletions
@@ -12,7 +12,8 @@ class ArgMaxSelector:
         if self.device is not None:
             state = state.to(self.device)
         action_values = self.model(state).squeeze(0)
-        action = self.random_act() if (action_values[0] == action_values).all() else action_values.argmax().item()
+        action = self.random_act() if (action_values[0] == action_values).all() \
+            else action_values.argmax().item()
         return action
     def act(self, state):
         return self.best_act(state)
@@ -1,6 +1,7 @@
-from .ArgMaxSelector import ArgMaxSelector
-import numpy as np
+import collections
+import numpy as np
+from .ArgMaxSelector import ArgMaxSelector

 class EpsilonGreedySelector(ArgMaxSelector):
     def __init__(self, model, action_size, device=None, epsilon=0.1):
         super(EpsilonGreedySelector, self).__init__(model, action_size, device=device)
@@ -1,5 +1,6 @@
-from .ArgMaxSelector import ArgMaxSelector
+import torch
+from .ArgMaxSelector import ArgMaxSelector

 class IdentitySelector(ArgMaxSelector):
     def __init__(self, model, action_size, device=None):
         super(IdentitySelector, self).__init__(model, action_size, device=device)
@@ -1,10 +1,10 @@
 from random import randrange
-class RandomSelector():
+class RandomSelector:
     def __init__(self, action_size):
         self.action_size = action_size
     def random_act(self):
-        return randrange(action_size)
-    def best_act(self, state):
+        return randrange(self.action_size)
+    def best_act(self, _):
         return self.random_act()
-    def act(self, state):
+    def act(self, _):
         return self.random_act()
@@ -1,9 +1,6 @@
-from random import randrange
-import torch
 from torch.distributions import Categorical
-import rltorch
-from rltorch.action_selector import ArgMaxSelector
-
+from .ArgMaxSelector import ArgMaxSelector
+from ..memory.EpisodeMemory import EpisodeMemory
 class StochasticSelector(ArgMaxSelector):
     def __init__(self, model, action_size, memory=None, device=None):
         super(StochasticSelector, self).__init__(model, action_size, device=device)

@@ -17,6 +14,6 @@ class StochasticSelector(ArgMaxSelector):
         action_probabilities = self.model(state)
         distribution = Categorical(action_probabilities)
         action = distribution.sample()
-        if log_prob and isinstance(self.memory, rltorch.memory.EpisodeMemory):
+        if log_prob and isinstance(self.memory, EpisodeMemory):
             self.memory.append_log_probs(distribution.log_prob(action))
         return action.item()
@@ -1,5 +1,5 @@
-from .ArgMaxSelector import *
-from .EpsilonGreedySelector import *
-from .IdentitySelector import *
-from .RandomSelector import *
-from .StochasticSelector import *
+from .ArgMaxSelector import ArgMaxSelector
+from .EpsilonGreedySelector import EpsilonGreedySelector
+from .IdentitySelector import IdentitySelector
+from .RandomSelector import RandomSelector
+from .StochasticSelector import StochasticSelector
@@ -2,8 +2,6 @@ from copy import deepcopy
-import numpy as np
 import torch
 import torch.nn.functional as F
 import rltorch
-import rltorch.memory as M

 class A2CSingleAgent:
     def __init__(self, policy_net, value_net, memory, config, logger=None):
@@ -16,7 +14,11 @@ class A2CSingleAgent:
     def _discount_rewards(self, rewards):
         gammas = torch.ones_like(rewards)
         if len(rewards) > 1:
-            gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
+            discount_tensor = torch.tensor(self.config['discount_rate'])
+            gammas[1:] = torch.cumprod(
+                discount_tensor.repeat(len(rewards) - 1),
+                dim=0
+            )
         return gammas * rewards

     # This function is currently not used since the performance gains hasn't been shown
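Aside: the reflowed cumprod call above builds the per-step discount factors [1, g, g^2, ...] and multiplies them into the rewards. A minimal standalone sketch of that idea (PyTorch assumed; the tensors and the 0.99 rate below are illustrative, not taken from the commit):

import torch

def discount_rewards(rewards, discount_rate=0.99):
    # gammas = [1, g, g^2, ...] built with a running product
    gammas = torch.ones_like(rewards)
    if len(rewards) > 1:
        discount_tensor = torch.tensor(discount_rate)
        gammas[1:] = torch.cumprod(discount_tensor.repeat(len(rewards) - 1), dim=0)
    return gammas * rewards

print(discount_rewards(torch.tensor([1.0, 1.0, 1.0])))  # tensor([1.0000, 0.9900, 0.9801])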
@@ -29,18 +31,18 @@ class A2CSingleAgent:
         values = self.value_net(states).squeeze(1)

         generalized_advantages = torch.zeros_like(rewards)
-        for i in range(len(generalized_advantages)):
+        discount_tensor = torch.tensor(self.config['discount_rate']) * tradeoff
+        for i, _ in enumerate(generalized_advantages):
             weights = torch.ones_like(rewards[i:])
             if i != len(generalized_advantages) - 1:
-                weights[1:] = torch.cumprod(torch.tensor(self.config['discount_rate'] * tradeoff).repeat(len(rewards) - i - 1), dim = 0)
+                weights[1:] = torch.cumprod(discount_tensor.repeat(len(rewards) - i - 1), dim=0)
             generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum()

         return generalized_advantages
-

     def learn(self):
         episode_batch = self.memory.recall()
-        state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
+        state_batch, _, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)

         # Send batches to the appropriate device
         state_batch = torch.cat(state_batch).to(self.value_net.device)
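Aside: the loop above is an O(n^2) form of generalized advantage estimation, weighting each TD residual r_t + gamma*V(s_{t+1}) - V(s_t) by (gamma*lambda)^k. A minimal sketch of the same quantity via the usual reverse recursion (an equivalent formulation, not the code in this commit; gamma and tradeoff values are illustrative):

import torch

def gae(rewards, values, next_values, discount_rate=0.99, tradeoff=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rewards + discount_rate * next_values - values
    advantages = torch.zeros_like(rewards)
    running = 0.0
    # A_t = delta_t + (gamma * lambda) * A_{t+1}, accumulated from the end
    for t in reversed(range(len(rewards))):
        running = deltas[t] + discount_rate * tradeoff * running
        advantages[t] = running
    return advantages

rewards = torch.tensor([1.0, 1.0, 1.0])
values = torch.tensor([0.5, 0.5, 0.5])
next_values = torch.tensor([0.5, 0.5, 0.0])
print(gae(rewards, values, next_values))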
@@ -50,8 +52,10 @@ class A2CSingleAgent:
         log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)

         ## Value Loss
-        # In A2C, the value loss is the difference between the discounted reward and the value from the first state
-        # The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode
+        # In A2C, the value loss is the difference between the discounted reward
+        # and the value from the first state.
+        # The value of the first state is supposed to tell us
+        # the expected reward from the current policy of the whole episode
         discounted_reward = self._discount_rewards(reward_batch)
         observed_value = discounted_reward.sum()
         value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0]))
@@ -86,5 +90,3 @@ class A2CSingleAgent:

         # Memory under the old policy is not needed for future training
         self.memory.clear()
-
-
@@ -1,10 +1,8 @@
 import collections
+from copy import deepcopy
 import rltorch.memory as M
 import torch
 import torch.nn.functional as F
-from copy import deepcopy
-import numpy as np
-from pathlib import Path

 class DQNAgent:
     def __init__(self, net, memory, config, target_net=None, logger=None):

@@ -24,7 +22,7 @@ class DQNAgent:
         if len(self.memory) < self.config['batch_size']:
             return

-        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
+        if isinstance(self.memory, M.PrioritizedReplayMemory):
             weight_importance = self.config['prioritized_replay_weight_importance']
             # If it's a scheduler then get the next value by calling next, otherwise just use it's value
             beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance

@@ -63,7 +61,7 @@ class DQNAgent:
         expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)

         # If we're sampling by TD error, multiply loss by a importance weight which helps decrease overfitting
-        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
+        if isinstance(self.memory, M.PrioritizedReplayMemory):
             # loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.smooth_l1_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean()
             loss = (torch.as_tensor(importance_weights, device=self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
         else:

@@ -85,8 +83,6 @@ class DQNAgent:
                 self.target_net.sync()

         # If we're sampling by TD error, readjust the weights of the experiences
-        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
+        if isinstance(self.memory, M.PrioritizedReplayMemory):
             td_error = (obtained_values - expected_values).detach().abs()
             self.memory.update_priorities(batch_indexes, td_error)
-
-
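Aside: two patterns in the DQN hunks above recur throughout the library: a config entry may be either a constant or an iterator-style scheduler advanced with next(), and when sampling by TD error the per-sample squared error is rescaled by an importance-sampling weight before the mean. A minimal sketch of both (hypothetical tensors, not the agent's full update; collections.abc.Iterable is the modern spelling of the collections.Iterable used in the diff):

import collections.abc
import torch

def resolve_beta(weight_importance):
    # A plain float is used as-is; an iterator-style scheduler is advanced with next().
    if isinstance(weight_importance, collections.abc.Iterable):
        return next(weight_importance)
    return weight_importance

def weighted_td_loss(obtained_values, expected_values, importance_weights):
    # Scale each sample's squared TD error by its importance-sampling weight.
    weights = torch.as_tensor(importance_weights)
    squared_error = ((obtained_values - expected_values) ** 2).squeeze(1)
    return (weights * squared_error).mean()

beta = resolve_beta(iter([0.4, 0.5, 0.6]))          # -> 0.4
obtained = torch.tensor([[1.0], [2.0]])
expected = torch.tensor([[1.5], [2.5]])
print(beta, weighted_td_loss(obtained, expected, [0.5, 1.0]))  # 0.4 tensor(0.1875)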
@@ -1,11 +1,9 @@
 import collections
+from copy import deepcopy
 import rltorch.memory as M
 import torch
 import torch.nn.functional as F
-from copy import deepcopy
-import numpy as np
-from pathlib import Path
 from rltorch.action_selector import ArgMaxSelector

 class DQfDAgent:
     def __init__(self, net, memory, config, target_net=None, logger=None):

@@ -32,10 +30,10 @@ class DQfDAgent:
         batch_size = self.config['batch_size']
         steps = None

         if isinstance(self.memory, M.DQfDMemory):
             weight_importance = self.config['prioritized_replay_weight_importance']
             # If it's a scheduler then get the next value by calling next, otherwise just use it's value
-            beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
-
+            beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) \
+                else weight_importance
             # Check to see if we are doing N-Step DQN
             if steps is not None:

@@ -46,16 +44,6 @@ class DQfDAgent:
             # Process batch
             state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)

-        else:
-            # Check to see if we're doing N-Step DQN
-            if steps is not None:
-                minibatch = self.memory.sample_n_steps(batch_size, steps)
-            else:
-                minibatch = self.memory.sample(batch_size)
-
-            # Process batch
-            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, batch_indexes = M.zip_batch(minibatch, want_indices = True)
-
         batch_index_tensors = torch.tensor(batch_indexes)
         demo_mask = batch_index_tensors < self.memory.demo_position

@@ -86,11 +74,11 @@ class DQfDAgent:
             best_next_state_value = torch.zeros(batch_size, device=self.net.device)
             best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)

-        expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
+        expected_values = (reward_batch + (batch_size * best_next_state_value)).unsqueeze(1)

         # N-Step DQN Loss
         # num_steps capture how many steps actually exist before the end of episode
-        if steps != None:
+        if steps is not None:
             expected_n_step_values = []
             with torch.no_grad():
                 for i in range(0, len(state_batch), steps):

@@ -127,7 +115,7 @@ class DQfDAgent:
             l = torch.ones_like(state_values[demo_mask])
             expert_actions = action_batch[demo_mask]
             # l(s, a) is zero for every action the expert doesn't take
-            for i,a in zip(range(len(l)), expert_actions):
+            for i, _, a in zip(enumerate(l), expert_actions):
                 l[i].fill_(0.8) # According to paper
                 l[i, a] = 0
             if self.target_net is not None:
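Aside: the l tensor built above is the large-margin term used for demonstration samples: a positive margin (0.8 here) on every non-expert action and 0 on the expert's action, so that max_a (Q(s, a) + l(s, a)) is pushed above the expert action's value. A minimal sketch of that margin construction (hypothetical shapes, simplified from the agent code; the agent combines this with a squared-error reduction):

import torch

def margin_loss(state_values, expert_actions, margin=0.8):
    # l[i, a] = margin for non-expert actions, 0 for the expert action.
    l = torch.full_like(state_values, margin)
    l[torch.arange(len(expert_actions)), expert_actions] = 0
    expert_value = state_values.gather(1, expert_actions.unsqueeze(1))
    return ((state_values + l).max(1)[0].unsqueeze(1) - expert_value).mean()

q = torch.tensor([[1.0, 2.0, 0.5], [0.3, 0.1, 0.9]])
expert = torch.tensor([1, 2])
print(margin_loss(q, expert))  # tensor(0.1000)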
@@ -148,26 +136,17 @@ class DQfDAgent:


         # Since dqn_loss and demo_loss are different sizes, the reduction has to happen before they are combined
-        if isinstance(self.memory, M.DQfDMemory):
         dqn_loss = (torch.as_tensor(importance_weights, device=self.net.device) * F.mse_loss(obtained_values, expected_values, reduction='none').squeeze(1)).mean()
-        else:
-            dqn_loss = F.mse_loss(obtained_values, expected_values)

-        if steps != None:
-            if isinstance(self.memory, M.DQfDMemory):
+        if steps is not None:
             dqn_n_step_loss = (torch.as_tensor(importance_weights[::steps], device=self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction='none')).mean()
-            else:
-                dqn_n_step_loss = F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none').mean()
         else:
             dqn_n_step_loss = torch.tensor(0, device=self.net.device)

         if demo_mask.sum() > 0:
-            if isinstance(self.memory, M.DQfDMemory):
             demo_loss = (torch.as_tensor(importance_weights, device=self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction='none').squeeze(1)).mean()
-            else:
-                demo_loss = F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1).mean()
         else:
-            demo_loss = 0.
+            demo_loss = 0
         loss = td_importance * dqn_loss + td_importance * dqn_n_step_loss + demo_importance * demo_loss

         if self.logger is not None:
@@ -1,12 +1,8 @@
 from copy import deepcopy
-import numpy as np
 import torch
 import torch.nn.functional as F
 from torch.distributions import Categorical
 import rltorch
-import rltorch.memory as M
-import collections
-import random

 class PPOAgent:
     def __init__(self, policy_net, value_net, memory, config, logger=None):
@@ -23,7 +19,6 @@ class PPOAgent:
             gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim=0)
         return gammas * rewards

-
     def learn(self):
         episode_batch = self.memory.recall()
         state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)

@@ -68,14 +63,10 @@ class PPOAgent:
             self.logger.append("Loss/Policy", policy_loss.item())
             self.logger.append("Loss/Value", value_loss.item())

-
         self.old_policy_net.sync()
         self.policy_net.zero_grad()
         policy_loss.backward()
         self.policy_net.step()

-
         # Memory under the old policy is not needed for future training
         self.memory.clear()
-
-
@@ -2,16 +2,17 @@ from copy import deepcopy
 import collections
 import numpy as np
 import torch
-import torch.nn.functional as F
 from torch.distributions import Categorical
 import rltorch
 import rltorch.memory as M
+import torch.nn.functional as F
+

 # Q-Evolutionary Policy Agent
 # Maximizes the policy with respect to the Q-Value function.
 # Since function is non-differentiabile, depends on the Evolutionary Strategy algorithm
 class QEPAgent:
-    def __init__(self, policy_net, value_net, memory, config, target_value_net = None, logger = None, entropy_importance = 0, policy_skip = 4, after_value_train = None):
+    def __init__(self, policy_net, value_net, memory, config, target_value_net=None, logger=None, entropy_importance=0, policy_skip=4):
         self.policy_net = policy_net
         assert isinstance(self.policy_net, rltorch.network.ESNetwork) or isinstance(self.policy_net, rltorch.network.ESNetworkMP)
         self.policy_net.fitness = self.fitness

@@ -22,7 +23,6 @@ class QEPAgent:
         self.logger = logger
         self.policy_skip = policy_skip
         self.entropy_importance = entropy_importance
-        self.after_value_train = after_value_train

     def save(self, file_location):
         torch.save({
@@ -42,10 +42,8 @@ class QEPAgent:
-        batch_size = len(state_batch)
         with torch.no_grad():
             action_probabilities = policy_net(state_batch)
-
         action_size = action_probabilities.shape[1]
         distributions = list(map(Categorical, action_probabilities))
         actions = torch.stack([d.sample() for d in distributions])

         with torch.no_grad():

@@ -54,17 +52,17 @@ class QEPAgent:
         # Weird hacky solution where in multiprocess, it sometimes spits out nans
         # So have it try again
         while torch.isnan(state_values).any():
             print("NAN DETECTED")
             with torch.no_grad():
                 state_values = value_net(state_batch)

-        obtained_values = state_values.gather(1, actions.view(batch_size, 1)).squeeze(1)
+        obtained_values = state_values.gather(1, actions.view(len(state_batch), 1)).squeeze(1)

         # return -obtained_values.mean().item()
-        entropy_importance = 0 # Entropy accounting for 1% of loss seems to work well
+        entropy_importance = next(self.entropy_importance) if isinstance(self.entropy_importance, collections.Iterable) else self.entropy_importance
         value_importance = 1 - entropy_importance

         # entropy_loss = (action_probabilities * torch.log2(action_probabilities)).sum(1) # Standard entropy loss from information theory
-        entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device = state_batch.device).repeat(batch_size, action_size)).abs().sum(1)
+        entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device=state_batch.device).repeat(len(state_batch), action_size)).abs().sum(1)

         return (entropy_importance * entropy_loss - value_importance * obtained_values).mean().item()
@@ -73,7 +71,7 @@ class QEPAgent:
         if len(self.memory) < self.config['batch_size']:
             return

-        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
+        if isinstance(self.memory, M.PrioritizedReplayMemory):
             weight_importance = self.config['prioritized_replay_weight_importance']
             # If it's a scheduler then get the next value by calling next, otherwise just use it's value
             beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance

@@ -111,7 +109,7 @@ class QEPAgent:

         expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)

-        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
+        if isinstance(self.memory, M.PrioritizedReplayMemory):
             value_loss = (torch.as_tensor(importance_weights, device=self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
         else:
             value_loss = F.mse_loss(obtained_values, expected_values)
@@ -124,16 +122,13 @@ class QEPAgent:
             self.value_net.clamp_gradients()
             self.value_net.step()

-        if callable(self.after_value_train):
-            self.after_value_train()
-
         if self.target_value_net is not None:
             if 'target_sync_tau' in self.config:
                 self.target_value_net.partial_sync(self.config['target_sync_tau'])
             else:
                 self.target_value_net.sync()

-        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
+        if isinstance(self.memory, M.PrioritizedReplayMemory):
             td_error = (obtained_values - expected_values).detach().abs()
             self.memory.update_priorities(batch_indexes, td_error)

@@ -141,7 +136,8 @@ class QEPAgent:
         if self.policy_skip > 0:
             self.policy_skip -= 1
             return
-        self.policy_skip = self.config['policy_skip']
+        self.policy_skip = 4
+
         if self.target_value_net is not None:
             self.policy_net.calc_gradients(self.target_value_net, state_batch)
         else:
@@ -1,7 +1,7 @@
-import rltorch
 from copy import deepcopy
-import torch
 import numpy as np
+import torch
+import rltorch

 class REINFORCEAgent:
     def __init__(self, net, memory, config, target_net=None, logger=None):

@@ -30,7 +30,7 @@ class REINFORCEAgent:

     def learn(self):
         episode_batch = self.memory.recall()
-        state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
+        _, _, reward_batch, _, _, log_prob_batch = zip(*episode_batch)

         # Caluclate discounted rewards to place more importance to recent rewards
         shaped_reward_batch = self._shape_rewards(torch.tensor(reward_batch))
@@ -1,6 +1,6 @@
-from .A2CSingleAgent import *
-from .DQNAgent import *
-from .DQfDAgent import *
-from .PPOAgent import *
-from .QEPAgent import *
-from .REINFORCEAgent import *
+from .A2CSingleAgent import A2CSingleAgent
+from .DQNAgent import DQNAgent
+from .DQfDAgent import DQfDAgent
+from .PPOAgent import PPOAgent
+from .QEPAgent import QEPAgent
+from .REINFORCEAgent import REINFORCEAgent
rltorch/env/simulate.py (vendored, 6 changed lines)
@@ -1,6 +1,6 @@
 from copy import deepcopy
-import rltorch
 import time
+import rltorch

 def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=None, name="", render=False):
     for episode in range(total_episodes):

@@ -27,7 +27,7 @@ def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger
             logger.append(name + '/EpisodeReward', episode_reward)


-class EnvironmentRunSync():
+class EnvironmentRunSync:
     def __init__(self, env, actor, config, memory=None, logwriter=None, name="", render=False):
         self.env = env
         self.name = name

@@ -72,7 +72,7 @@ class EnvironmentRunSync():
         self.last_state = state


-class EnvironmentEpisodeSync():
+class EnvironmentEpisodeSync:
     def __init__(self, env, actor, config, memory=None, logwriter=None, name=""):
         self.env = env
         self.name = name
rltorch/env/wrappers.py (vendored, 18 changed lines)
@@ -1,8 +1,8 @@
+from collections import deque
 import gym
 import torch
 from gym import spaces
 import cv2
-from collections import deque
 import numpy as np

 class EpisodicLifeEnv(gym.Wrapper):

@@ -170,7 +170,12 @@ class FrameStack(gym.Wrapper):
         self.k = k
         self.frames = deque([], maxlen=k)
         shp = env.observation_space.shape
-        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
+        self.observation_space = spaces.Box(
+            low=0,
+            high=255,
+            shape=(shp[:-1] + (shp[-1] * k,)),
+            dtype=env.observation_space.dtype
+        )

     def reset(self):
         ob = self.env.reset()

@@ -207,14 +212,16 @@ class ProcessFrame(gym.Wrapper):
         if self.grayscale:
             frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
         if self.crop_bounds is not None and len(self.crop_bounds) == 4:
-            frame = frame[self.crop_bounds[0]:self.crop_bounds[1], self.crop_bounds[2]:self.crop_bounds[3]]
+            frame = frame[
+                self.crop_bounds[0]:self.crop_bounds[1],
+                self.crop_bounds[2]:self.crop_bounds[3]
+            ]
         if self.resize_shape is not None and len(self.resize_shape) == 2:
             frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA)
         # Normalize
         frame = frame / 255
         return frame

-
 # Turns observations into torch tensors
 # Adds an additional dimension that's suppose to represent the batch dim
 class TorchWrap(gym.Wrapper):

@@ -233,8 +240,6 @@ class TorchWrap(gym.Wrapper):
         frame = torch.from_numpy(frame).unsqueeze(0).float()
         return frame

-
-
 class ProcessFrame84(gym.ObservationWrapper):
     def __init__(self, env=None):
         super(ProcessFrame84, self).__init__(env)

@@ -256,4 +261,3 @@ class ProcessFrame84(gym.ObservationWrapper):
         x_t = resized_screen[18:102, :]
         x_t = np.reshape(x_t, [84, 84])
         return x_t.astype(np.uint8)
-
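Aside: the wrapper methods reflowed above implement a standard Atari-style preprocessing chain: grayscale, crop, resize, scale to [0, 1], then add the leading batch dimension the TorchWrap comment mentions. A minimal sketch of that chain outside of gym (cv2 and torch assumed; the crop bounds and 84x84 target below are illustrative):

import cv2
import numpy as np
import torch

def preprocess(frame, crop_bounds=(18, 102, 0, 84), resize_shape=(84, 84)):
    # Grayscale, crop, resize, normalize to [0, 1].
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    frame = frame[crop_bounds[0]:crop_bounds[1], crop_bounds[2]:crop_bounds[3]]
    frame = cv2.resize(frame, resize_shape, interpolation=cv2.INTER_AREA)
    frame = frame / 255
    # Add the leading batch dimension.
    return torch.from_numpy(frame).unsqueeze(0).float()

screen = np.random.randint(0, 256, (210, 160, 3), dtype=np.uint8)
print(preprocess(screen).shape)  # torch.Size([1, 84, 84])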
@@ -1,6 +1,6 @@
-from .PrioritizedReplayMemory import PrioritizedReplayMemory
 from collections import namedtuple
 import numpy as np
+from .PrioritizedReplayMemory import PrioritizedReplayMemory

 Transition = namedtuple('Transition',
                         ('state', 'action', 'reward', 'next_state', 'done'))

@@ -47,7 +47,8 @@ class DQfDMemory(PrioritizedReplayMemory):
         idxes = self._sample_proportional(sample_size)
         step_idxes = []
         for i in idxes:
-            # If the interval of experiences fall between demonstration and obtained, move it over to the demonstration half
+            # If the interval of experiences fall between demonstration and obtained,
+            # move it over to the demonstration half
             if i < self.demo_position and i + steps > self.demo_position:
                 diff = i + steps - self.demo_position
                 step_idxes += range(i - diff, i + steps - diff)
@@ -1,6 +1,4 @@
-import random
 from collections import namedtuple
-import torch
 Transition = namedtuple('Transition',
                         ('state', 'action', 'reward', 'next_state', 'done'))

@@ -1,10 +1,9 @@
 # From OpenAI Baselines https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
-
-from .ReplayMemory import ReplayMemory
 import operator
 import random
 import numpy as np
 from numba import jit
+from .ReplayMemory import ReplayMemory

 class SegmentTree(object):
     def __init__(self, capacity, operation, neutral_element):

@@ -320,4 +319,3 @@ class PrioritizedReplayMemory(ReplayMemory):
             self._it_min[idx] = priority ** self._alpha

             self._max_priority = max(self._max_priority, priority)
-
@@ -106,11 +106,9 @@ class ReplayMemory(object):
     def __reversed__(self):
         return reversed(self.memory)

-def zip_batch(minibatch, priority = False, want_indices = False):
+def zip_batch(minibatch, priority=False):
     if priority:
         state_batch, action_batch, reward_batch, next_state_batch, done_batch, weights, indexes = zip(*minibatch)
-    elif want_indices:
-        state_batch, action_batch, reward_batch, next_state_batch, done_batch, indexes = zip(*minibatch)
     else:
         state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch)

@@ -122,7 +120,5 @@ def zip_batch(minibatch, priority = False, want_indices = False):

     if priority:
         return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, weights, indexes
-    elif want_indices:
-        return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, indexes
     else:
         return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch
@@ -34,86 +34,3 @@ class EnvironmentEpisode(mp.Process):
             self.logger.append(self.name + '/EpisodeReward', episode_reward)

         self.episode_num += 1
-
-
-
-
-
-
-
-# from copy import deepcopy
-# import torch.multiprocessing as mp
-# from ctypes import *
-# import rltorch.log
-
-# def envepisode(actor, env, episode_num, config, runcondition, memoryqueue = None, logqueue = None, name = ""):
-#     # Wait for signal to start running through the environment
-#     while runcondition.wait():
-#         # Start a logger to log the rewards
-#         logger = rltorch.log.Logger()
-#         state = env.reset()
-#         episode_reward = 0
-#         done = False
-#         while not done:
-#             action = actor.act(state)
-#             next_state, reward, done, _ = env.step(action)
-
-#             episode_reward += reward
-#             if memoryqueue is not None:
-#                 memoryqueue.put((state, action, reward, next_state, done))
-
-#             state = next_state
-
-#             if done:
-#                 with episode_num.get_lock():
-#                     if episode_num.value % config['print_stat_n_eps'] == 0:
-#                         print("episode: {}/{}, score: {}"
-#                             .format(episode_num.value, config['total_training_episodes'], episode_reward))
-
-#                 if logger is not None:
-#                     logger.append(name + '/EpisodeReward', episode_reward)
-#                 episode_reward = 0
-#                 state = env.reset()
-#                 with episode_num.get_lock():
-#                     episode_num.value += 1
-
-#         logqueue.put(logger)
-
-# class EnvironmentRun():
-#     def __init__(self, env_func, actor, config, memory = None, name = ""):
-#         self.config = deepcopy(config)
-#         self.memory = memory
-#         self.episode_num = mp.Value(c_uint)
-#         self.runcondition = mp.Event()
-#         # Interestingly enough, there isn't a good reliable way to know how many states an episode will have
-#         # Perhaps we can share a uint to keep track...
-#         self.memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1)
-#         self.logqueue = mp.Queue(maxsize = 1)
-#         with self.episode_num.get_lock():
-#             self.episode_num.value = 1
-#         self.runner = mp.Process(target=envrun,
-#             args=(actor, env_func, self.episode_num, config, self.runcondition),
-#             kwargs = {'iterations': config['replay_skip'] + 1,
-#                 'memoryqueue' : self.memory_queue, 'logqueue' : self.logqueue, 'name' : name})
-#         self.runner.start()
-
-#     def run(self):
-#         self.runcondition.set()
-
-#     def join(self):
-#         self._sync_memory()
-#         if self.logwriter is not None:
-#             self.logwriter.write(self._get_reward_logger())
-
-#     def sync_memory(self):
-#         if self.memory is not None:
-#             for i in range(self.config['replay_skip'] + 1):
-#                 self.memory.append(*self.memory_queue.get())
-
-#     def get_reward_logger(self):
-#         return self.logqueue.get()
-
-#     def terminate(self):
-#         self.runner.terminate()
@@ -1,6 +1,6 @@
 from copy import deepcopy
+from ctypes import c_uint
 import torch.multiprocessing as mp
-from ctypes import *
 import rltorch.log

 def envrun(actor, env, episode_num, config, runcondition, iterations=1, memoryqueue=None, logqueue=None, name=""):
@@ -1,7 +1,8 @@
+from copy import deepcopy
 import numpy as np
 import torch
 from .Network import Network
-from copy import deepcopy

+
 # [TODO] Should we torch.no_grad the __call__?
 # What if we want to sometimes do gradient descent as well?

@@ -64,7 +65,11 @@ class ESNetwork(Network):
         white_noise_dict = {}
         noise_dict = {}
         for key in model_dict.keys():
-            white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device)
+            white_noise_dict[key] = torch.randn(
+                self.population_size,
+                *model_dict[key].shape,
+                device=self.device
+            )
             noise_dict[key] = self.sigma * white_noise_dict[key]
         return white_noise_dict, noise_dict

@@ -96,7 +101,10 @@ class ESNetwork(Network):
         candidate_solutions = self._generate_candidate_solutions(noise_dict)

         ## Calculate fitness then mean shift, scale
-        fitness_values = torch.tensor([self.fitness(x, *args) for x in candidate_solutions], device = self.device)
+        fitness_values = torch.tensor(
+            [self.fitness(x, *args) for x in candidate_solutions],
+            device=self.device
+        )
         if self.logger is not None:
             self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
         fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
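Aside: the reflowed calls above are the core of the evolution-strategy step: sample white noise per parameter, score the perturbed candidates with the fitness function, then mean-shift and scale the scores before combining them with the noise. A minimal sketch of that estimator on a plain parameter vector (illustrative fitness, hyperparameters, and update direction; this is not the ESNetwork API):

import torch

def es_step(theta, fitness, population_size=50, sigma=0.1, lr=0.02):
    # Sample perturbations, evaluate fitness, normalize, and move theta
    # along the fitness-weighted average of the noise.
    white_noise = torch.randn(population_size, *theta.shape)
    candidates = theta + sigma * white_noise
    fitness_values = torch.tensor([fitness(c) for c in candidates])
    fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + 1e-8)
    gradient = (white_noise * fitness_values.view(-1, 1)).mean(0) / sigma
    return theta + lr * gradient

theta = torch.zeros(3)
target = torch.tensor([1.0, -2.0, 0.5])
fitness = lambda w: -((w - target) ** 2).sum().item()   # higher is better
for _ in range(200):
    theta = es_step(theta, fitness)
print(theta)   # drifts toward the target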
@@ -1,9 +1,8 @@
+from copy import deepcopy
 import numpy as np
 import torch
-from .Network import Network
-from copy import deepcopy
 import torch.multiprocessing as mp
-import functools
+from .Network import Network

 class fn_copy:
     def __init__(self, fn, args):

@@ -27,7 +26,8 @@ class ESNetworkMP(Network):
         self.sigma = sigma
         assert self.sigma > 0
         mp_ctx = mp.get_context("spawn")
-        self.pool = mp_ctx.Pool(processes=2) #[TODO] Probably should make number of processes a config variable
+        #[TODO] Probably should make number of processes a config variable
+        self.pool = mp_ctx.Pool(processes=2)

         # We're not going to be calculating gradients in the traditional way
         # So there's no need to waste computation time keeping track

@@ -42,7 +42,11 @@ class ESNetworkMP(Network):
         white_noise_dict = {}
         noise_dict = {}
         for key in model_dict.keys():
-            white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device)
+            white_noise_dict[key] = torch.randn(
+                self.population_size,
+                *model_dict[key].shape,
+                device=self.device
+            )
             noise_dict[key] = self.sigma * white_noise_dict[key]
         return white_noise_dict, noise_dict

@@ -67,7 +71,10 @@ class ESNetworkMP(Network):
         candidate_solutions = self._generate_candidate_solutions(noise_dict)

         ## Calculate fitness then mean shift, scale
-        fitness_values = torch.tensor(list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)), device = self.device)
+        fitness_values = torch.tensor(
+            list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)),
+            device=self.device
+        )

         if self.logger is not None:
             self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
@@ -20,7 +20,11 @@ class Network:
     def __init__(self, model, optimizer, config, device=None, logger=None, name=""):
         self.model = model
         if 'weight_decay' in config:
-            self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'], weight_decay = config['weight_decay'])
+            self.optimizer = optimizer(
+                model.parameters(),
+                lr=config['learning_rate'],
+                weight_decay=config['weight_decay']
+            )
         else:
             self.optimizer = optimizer(model.parameters(), lr=config['learning_rate'])
         self.logger = logger
@@ -1,7 +1,8 @@
+import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import math

+
 # This class utilizes this property of the normal distribution
 # N(mu, sigma) = mu + sigma * N(0, 1)

@@ -10,7 +11,6 @@ class NoisyLinear(nn.Linear):
     Draws the parameters of nn.Linear from a normal distribution.
     The parameters of the normal distribution are registered as
     learnable parameters in the neural network.
-
     Parameters
     ----------
     in_features
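Aside: the comment kept above, N(mu, sigma) = mu + sigma * N(0, 1), is the reparameterization that lets the layer's mu and sigma stay learnable while the randomness comes from a fixed standard normal. A minimal standalone illustration of the property (not the NoisyLinear layer itself):

import torch

mu, sigma = 2.0, 0.5
standard_normal = torch.randn(100000)          # samples of N(0, 1)
samples = mu + sigma * standard_normal         # behaves like N(mu, sigma)

print(samples.mean().item())  # close to 2.0
print(samples.std().item())   # close to 0.5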
@@ -37,7 +37,8 @@ class TargetNetwork:
         Parameters
         ----------
         tau : number
-            A number between 0-1 which indicates the proportion of the originator and clone in the new clone.
+            A number between 0-1 which indicates
+            the proportion of the originator and clone in the new clone.
         """
         assert isinstance(tau, float)
         assert 0.0 < tau <= 1.0
@@ -1,5 +1,5 @@
-from .ESNetwork import *
-from .ESNetworkMP import *
-from .Network import *
-from .NoisyLinear import *
-from .TargetNetwork import *
+from .ESNetwork import ESNetwork
+from .ESNetworkMP import ESNetworkMP
+from .Network import Network
+from .NoisyLinear import NoisyLinear
+from .TargetNetwork import TargetNetwork
@@ -36,4 +36,3 @@ class ExponentialScheduler(Scheduler):
             return self.initial_value * (self.base ** (self.current_iteration - 1))
         else:
             return self.end_value
-
@@ -7,4 +7,4 @@ class Scheduler():
     def __iter__(self):
         return self
     def __next__(self):
-        raise NotImplementedError("Scheduler does not have it's function to create a value implemented")
+        raise NotImplementedError("__next__ not implemented in Scheduler")
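Aside: Scheduler's __iter__/__next__ pair is what lets config entries be consumed with next(...) in the agents above. A minimal sketch of a concrete subclass in that style (an illustrative linear decay written for this note, not one of the library's schedulers):

class Scheduler:
    def __iter__(self):
        return self
    def __next__(self):
        raise NotImplementedError("__next__ not implemented in Scheduler")

class LinearScheduler(Scheduler):
    # Decays from initial_value to end_value over max_iterations, then holds.
    def __init__(self, initial_value, end_value, max_iterations):
        self.initial_value = initial_value
        self.end_value = end_value
        self.max_iterations = max_iterations
        self.current_iteration = 0
    def __next__(self):
        self.current_iteration += 1
        if self.current_iteration >= self.max_iterations:
            return self.end_value
        fraction = self.current_iteration / self.max_iterations
        return self.initial_value + fraction * (self.end_value - self.initial_value)

epsilon = LinearScheduler(1.0, 0.1, max_iterations=10)
print([round(next(epsilon), 2) for _ in range(12)])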