PEP8 Conformance

This commit is contained in:
parent 9b81188a77
commit 8fa4691511

29 changed files with 652 additions and 755 deletions
@@ -1,7 +1,7 @@
from random import randrange
import torch

class ArgMaxSelector:
    def __init__(self, model, action_size, device = None):
    def __init__(self, model, action_size, device=None):
        self.model = model
        self.action_size = action_size
        self.device = device

@@ -12,7 +12,8 @@ class ArgMaxSelector:
            if self.device is not None:
                state = state.to(self.device)
            action_values = self.model(state).squeeze(0)
            action = self.random_act() if (action_values[0] == action_values).all() else action_values.argmax().item()
            action = self.random_act() if (action_values[0] == action_values).all() \
                        else action_values.argmax().item()
        return action
    def act(self, state):
        return self.best_act(state)
@@ -1,9 +1,10 @@
from .ArgMaxSelector import ArgMaxSelector
import numpy as np
import collections
import numpy as np
from .ArgMaxSelector import ArgMaxSelector

class EpsilonGreedySelector(ArgMaxSelector):
    def __init__(self, model, action_size, device = None, epsilon = 0.1):
        super(EpsilonGreedySelector, self).__init__(model, action_size, device = device)
    def __init__(self, model, action_size, device=None, epsilon=0.1):
        super(EpsilonGreedySelector, self).__init__(model, action_size, device=device)
        self.epsilon = epsilon
    # random_act is already implemented in ArgMaxSelector
    # best_act is already implemented in ArgMaxSelector
@@ -1,8 +1,9 @@
from .ArgMaxSelector import ArgMaxSelector
import torch
from .ArgMaxSelector import ArgMaxSelector

class IdentitySelector(ArgMaxSelector):
    def __init__(self, model, action_size, device = None):
        super(IdentitySelector, self).__init__(model, action_size, device = device)
    def __init__(self, model, action_size, device=None):
        super(IdentitySelector, self).__init__(model, action_size, device=device)
    # random_act is already implemented in ArgMaxSelector
    def best_act(self, state):
        with torch.no_grad():
@@ -1,10 +1,10 @@
from random import randrange
class RandomSelector():
class RandomSelector:
    def __init__(self, action_size):
        self.action_size = action_size
    def random_act(self):
        return randrange(action_size)
    def best_act(self, state):
        return randrange(self.action_size)
    def best_act(self, _):
        return self.random_act()
    def act(self, state):
    def act(self, _):
        return self.random_act()
@@ -1,22 +1,19 @@
from random import randrange
import torch
from torch.distributions import Categorical
import rltorch
from rltorch.action_selector import ArgMaxSelector

from .ArgMaxSelector import ArgMaxSelector
from ..memory.EpisodeMemory import EpisodeMemory
class StochasticSelector(ArgMaxSelector):
    def __init__(self, model, action_size, memory = None, device = None):
        super(StochasticSelector, self).__init__(model, action_size, device = device)
    def __init__(self, model, action_size, memory=None, device=None):
        super(StochasticSelector, self).__init__(model, action_size, device=device)
        self.model = model
        self.action_size = action_size
        self.device = device
        self.memory = memory
    def best_act(self, state, log_prob = True):
    def best_act(self, state, log_prob=True):
        if self.device is not None:
            state = state.to(self.device)
        action_probabilities = self.model(state)
        distribution = Categorical(action_probabilities)
        action = distribution.sample()
        if log_prob and isinstance(self.memory, rltorch.memory.EpisodeMemory):
        if log_prob and isinstance(self.memory, EpisodeMemory):
            self.memory.append_log_probs(distribution.log_prob(action))
        return action.item()
@@ -1,5 +1,5 @@
from .ArgMaxSelector import *
from .EpsilonGreedySelector import *
from .IdentitySelector import *
from .RandomSelector import *
from .StochasticSelector import *
from .ArgMaxSelector import ArgMaxSelector
from .EpsilonGreedySelector import EpsilonGreedySelector
from .IdentitySelector import IdentitySelector
from .RandomSelector import RandomSelector
from .StochasticSelector import StochasticSelector
@@ -2,11 +2,9 @@ from copy import deepcopy
import numpy as np
import torch
import torch.nn.functional as F
import rltorch
import rltorch.memory as M

class A2CSingleAgent:
  def __init__(self, policy_net, value_net, memory, config, logger = None):
    def __init__(self, policy_net, value_net, memory, config, logger=None):
        self.policy_net = policy_net
        self.value_net = value_net
        self.memory = memory

@@ -16,7 +14,11 @@ class A2CSingleAgent:
    def _discount_rewards(self, rewards):
        gammas = torch.ones_like(rewards)
        if len(rewards) > 1:
      gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
            discount_tensor = torch.tensor(self.config['discount_rate'])
            gammas[1:] = torch.cumprod(
                discount_tensor.repeat(len(rewards) - 1),
                dim=0
            )
        return gammas * rewards

    # This function is currently not used since the performance gains haven't been shown

@@ -29,18 +31,18 @@ class A2CSingleAgent:
            values = self.value_net(states).squeeze(1)

        generalized_advantages = torch.zeros_like(rewards)
    for i in range(len(generalized_advantages)):
        discount_tensor = torch.tensor(self.config['discount_rate']) * tradeoff
        for i, _ in enumerate(generalized_advantages):
            weights = torch.ones_like(rewards[i:])
            if i != len(generalized_advantages) - 1:
        weights[1:] = torch.cumprod(torch.tensor(self.config['discount_rate'] * tradeoff).repeat(len(rewards) - i - 1), dim = 0)
                weights[1:] = torch.cumprod(discount_tensor.repeat(len(rewards) - i - 1), dim=0)
            generalized_advantages[i] = (weights * (rewards[i:] + self.config['discount_rate'] * next_values[i:] - values[i:])).sum()

        return generalized_advantages

    def learn(self):
        episode_batch = self.memory.recall()
    state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
        state_batch, _, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)

        # Send batches to the appropriate device
        state_batch = torch.cat(state_batch).to(self.value_net.device)

@@ -50,8 +52,10 @@ class A2CSingleAgent:
        log_prob_batch = torch.cat(log_prob_batch).to(self.value_net.device)

        ## Value Loss
    # In A2C, the value loss is the difference between the discounted reward and the value from the first state
    # The value of the first state is supposed to tell us the expected reward from the current policy of the whole episode
        # In A2C, the value loss is the difference between the discounted reward
        # and the value from the first state.
        # The value of the first state is supposed to tell us
        # the expected reward from the current policy of the whole episode
        discounted_reward = self._discount_rewards(reward_batch)
        observed_value = discounted_reward.sum()
        value_loss = F.mse_loss(observed_value, self.value_net(state_batch[0]))

@@ -86,5 +90,3 @@ class A2CSingleAgent:

        # Memory under the old policy is not needed for future training
        self.memory.clear()
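For readers unfamiliar with the torch.cumprod trick used by _discount_rewards above, here is a small standalone sketch (my own example, not part of this commit; the discount rate is a hypothetical value of config['discount_rate']):

# Standalone sketch (not from the commit): _discount_rewards builds
# gammas = [1, g, g^2, ...] so that gammas * rewards gives per-step discounted rewards.
import torch

rewards = torch.tensor([1.0, 1.0, 1.0, 1.0])
discount_rate = 0.99  # hypothetical config['discount_rate']

gammas = torch.ones_like(rewards)
if len(rewards) > 1:
    discount_tensor = torch.tensor(discount_rate)
    # repeat -> [g, g, g]; cumprod along dim 0 -> [g, g^2, g^3]; index 0 stays 1
    gammas[1:] = torch.cumprod(discount_tensor.repeat(len(rewards) - 1), dim=0)

print(gammas)            # tensor([1.0000, 0.9900, 0.9801, 0.9703])
print(gammas * rewards)  # the discounted reward sequence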
@@ -1,13 +1,11 @@
import collections
from copy import deepcopy
import rltorch.memory as M
import torch
import torch.nn.functional as F
from copy import deepcopy
import numpy as np
from pathlib import Path

class DQNAgent:
    def __init__(self, net , memory, config, target_net = None, logger = None):
    def __init__(self, net, memory, config, target_net=None, logger=None):
        self.net = net
        self.target_net = target_net
        self.memory = memory

@@ -20,16 +18,16 @@ class DQNAgent:
        self.net.model.to(self.net.device)
        self.target_net.sync()

    def learn(self, logger = None):
    def learn(self, logger=None):
        if len(self.memory) < self.config['batch_size']:
            return

        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            weight_importance = self.config['prioritized_replay_weight_importance']
            # If it's a scheduler then get the next value by calling next, otherwise just use its value
            beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
            minibatch = self.memory.sample(self.config['batch_size'], beta = beta)
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)
            minibatch = self.memory.sample(self.config['batch_size'], beta=beta)
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
        else:
            minibatch = self.memory.sample(self.config['batch_size'])
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)

@@ -49,7 +47,7 @@ class DQNAgent:
            # and the regular net to select the action
            # That way we decouple the value and action selecting processes (DOUBLE DQN)
            not_done_size = not_done_batch.sum()
            next_state_values = torch.zeros_like(state_values, device = self.net.device)
            next_state_values = torch.zeros_like(state_values, device=self.net.device)
            if self.target_net is not None:
                next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch])
                next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1)

@@ -57,15 +55,15 @@ class DQNAgent:
                next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch])
                next_best_action = next_state_values[not_done_batch].argmax(1)

            best_next_state_value = torch.zeros(self.config['batch_size'], device = self.net.device)
            best_next_state_value = torch.zeros(self.config['batch_size'], device=self.net.device)
            best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)

        expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)

        # If we're sampling by TD error, multiply loss by an importance weight which helps decrease overfitting
        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            # loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.smooth_l1_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean()
             loss = (torch.as_tensor(importance_weights, device = self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
            loss = (torch.as_tensor(importance_weights, device=self.net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
        else:
            # loss = F.smooth_l1_loss(obtained_values, expected_values)
            loss = F.mse_loss(obtained_values, expected_values)

@@ -85,8 +83,6 @@ class DQNAgent:
                self.target_net.sync()

        # If we're sampling by TD error, readjust the weights of the experiences
        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            td_error = (obtained_values - expected_values).detach().abs()
            self.memory.update_priorities(batch_indexes, td_error)
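The importance-weighted loss line above is dense; the following minimal sketch (my own example with hypothetical weights, not code from this commit) shows the same idea in isolation:

# Sketch: with prioritized replay, the per-sample squared errors are scaled by the
# sampler's importance weights before the mean, correcting the bias of non-uniform sampling.
import torch

obtained_values = torch.tensor([[1.0], [2.0], [3.0]])
expected_values = torch.tensor([[1.5], [2.0], [2.0]])
importance_weights = [0.5, 1.0, 0.25]  # hypothetical weights returned by the sampler

weights = torch.as_tensor(importance_weights)
loss = (weights * ((obtained_values - expected_values) ** 2).squeeze(1)).mean()
print(loss)  # tensor(0.1250) -- the weighted mean squared error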
@@ -1,14 +1,12 @@
import collections
from copy import deepcopy
import rltorch.memory as M
import torch
import torch.nn.functional as F
from copy import deepcopy
import numpy as np
from pathlib import Path
from rltorch.action_selector import ArgMaxSelector

class DQfDAgent:
    def __init__(self, net, memory, config, target_net = None, logger = None):
    def __init__(self, net, memory, config, target_net=None, logger=None):
        self.net = net
        self.target_net = target_net
        self.memory = memory

@@ -21,7 +19,7 @@ class DQfDAgent:
        self.net.model.to(self.net.device)
        self.target_net.sync()

    def learn(self, logger = None):
    def learn(self, logger=None):
        if len(self.memory) < self.config['batch_size']:
            return

@@ -32,29 +30,19 @@ class DQfDAgent:
            batch_size = self.config['batch_size']
            steps = None

        if isinstance(self.memory, M.DQfDMemory):
        weight_importance = self.config['prioritized_replay_weight_importance']
        # If it's a scheduler then get the next value by calling next, otherwise just use its value
            beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
        beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) \
            else weight_importance

        # Check to see if we are doing N-Step DQN
        if steps is not None:
            minibatch = self.memory.sample_n_steps(batch_size, steps, beta)
        else:
                minibatch = self.memory.sample(batch_size, beta = beta)
            minibatch = self.memory.sample(batch_size, beta=beta)

        # Process batch
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)

        else:
            # Check to see if we're doing N-Step DQN
            if steps is not None:
                minibatch = self.memory.sample_n_steps(batch_size, steps)
            else:
                minibatch = self.memory.sample(batch_size)

            # Process batch
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, batch_indexes = M.zip_batch(minibatch, want_indices = True)
        state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)

        batch_index_tensors = torch.tensor(batch_indexes)
        demo_mask = batch_index_tensors < self.memory.demo_position

@@ -75,7 +63,7 @@ class DQfDAgent:
            # and the regular net to select the action
            # That way we decouple the value and action selecting processes (DOUBLE DQN)
            not_done_size = not_done_batch.sum()
            next_state_values = torch.zeros_like(state_values, device = self.net.device)
            next_state_values = torch.zeros_like(state_values, device=self.net.device)
            if self.target_net is not None:
                next_state_values[not_done_batch] = self.target_net(next_state_batch[not_done_batch])
                next_best_action = self.net(next_state_batch[not_done_batch]).argmax(1)

@@ -83,14 +71,14 @@ class DQfDAgent:
                next_state_values[not_done_batch] = self.net(next_state_batch[not_done_batch])
                next_best_action = next_state_values[not_done_batch].argmax(1)

            best_next_state_value = torch.zeros(batch_size, device = self.net.device)
            best_next_state_value = torch.zeros(batch_size, device=self.net.device)
            best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)

        expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)
        expected_values = (reward_batch + (batch_size * best_next_state_value)).unsqueeze(1)

        # N-Step DQN Loss
        # num_steps captures how many steps actually exist before the end of episode
        if steps != None:
        if steps is not None:
            expected_n_step_values = []
            with torch.no_grad():
                for i in range(0, len(state_batch), steps):

@@ -127,7 +115,7 @@ class DQfDAgent:
            l = torch.ones_like(state_values[demo_mask])
            expert_actions = action_batch[demo_mask]
            # l(s, a) is zero for every action the expert doesn't take
            for i,a in zip(range(len(l)), expert_actions):
            for (i, _), a in zip(enumerate(l), expert_actions):
                l[i].fill_(0.8) # According to paper
                l[i, a] = 0
            if self.target_net is not None:

@@ -148,26 +136,17 @@ class DQfDAgent:

        # Since dqn_loss and demo_loss are different sizes, the reduction has to happen before they are combined
        if isinstance(self.memory, M.DQfDMemory):
            dqn_loss = (torch.as_tensor(importance_weights, device = self.net.device) * F.mse_loss(obtained_values, expected_values, reduction = 'none').squeeze(1)).mean()
        else:
            dqn_loss = F.mse_loss(obtained_values, expected_values)
        dqn_loss = (torch.as_tensor(importance_weights, device=self.net.device) * F.mse_loss(obtained_values, expected_values, reduction='none').squeeze(1)).mean()

        if steps != None:
            if isinstance(self.memory, M.DQfDMemory):
                dqn_n_step_loss =  (torch.as_tensor(importance_weights[::steps], device = self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none')).mean()
        if steps is not None:
            dqn_n_step_loss = (torch.as_tensor(importance_weights[::steps], device=self.net.device) * F.mse_loss(observed_n_step_values, expected_n_step_values, reduction='none')).mean()
            else:
                dqn_n_step_loss =  F.mse_loss(observed_n_step_values, expected_n_step_values, reduction = 'none').mean()
        else:
            dqn_n_step_loss = torch.tensor(0, device = self.net.device)
            dqn_n_step_loss = torch.tensor(0, device=self.net.device)

        if demo_mask.sum() > 0:
            if isinstance(self.memory, M.DQfDMemory):
                demo_loss = (torch.as_tensor(importance_weights, device = self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1)).mean()
            demo_loss = (torch.as_tensor(importance_weights, device=self.net.device)[demo_mask] * F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction='none').squeeze(1)).mean()
            else:
                demo_loss = F.mse_loss((state_values[demo_mask] + l).max(1)[0].unsqueeze(1), expert_value, reduction = 'none').squeeze(1).mean()
        else:
            demo_loss = 0.
            demo_loss = 0
        loss = td_importance * dqn_loss + td_importance * dqn_n_step_loss + demo_importance * demo_loss

        if self.logger is not None:
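The margin term l(s, a) built in the loop above is easier to see in isolation. The following is my own illustrative sketch with hypothetical values (not code from this commit): every non-expert action gets a fixed margin of 0.8 added to its Q-value, so the expert's action must beat the rest by that margin before the demonstration loss goes to zero.

# Sketch (assumed/hypothetical values): the large-margin term used for the demo loss.
import torch

state_values = torch.tensor([[0.1, 0.4, 0.2],
                             [0.3, 0.0, 0.5]])   # hypothetical Q-values for demo states
expert_actions = torch.tensor([1, 2])            # actions the expert took

l = torch.full_like(state_values, 0.8)           # margin for every action...
for i, a in enumerate(expert_actions):
    l[i, a] = 0.0                                # ...except the expert's action

expert_value = state_values.gather(1, expert_actions.unsqueeze(1))
margin_term = (state_values + l).max(1)[0].unsqueeze(1)
print(margin_term - expert_value)                # how far each margin is violated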
@@ -1,15 +1,11 @@
from copy import deepcopy
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
import rltorch
import rltorch.memory as M
import collections
import random

class PPOAgent:
  def __init__(self, policy_net, value_net, memory, config, logger = None):
    def __init__(self, policy_net, value_net, memory, config, logger=None):
        self.policy_net = policy_net
        self.old_policy_net = rltorch.network.TargetNetwork(policy_net)
        self.value_net = value_net

@@ -20,10 +16,9 @@ class PPOAgent:
    def _discount_rewards(self, rewards):
        gammas = torch.ones_like(rewards)
        if len(rewards) > 1:
      gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim = 0)
            gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - 1), dim=0)
        return gammas * rewards

    def learn(self):
        episode_batch = self.memory.recall()
        state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)

@@ -61,21 +56,17 @@ class PPOAgent:
        # For PPO we want to stay within a certain ratio of the old policy
        policy_ratio = torch.exp(log_prob_batch - old_log_probs) # Equivalent to (log_prob / old_log_prob)
        policy_loss1 = policy_ratio * advantages
    policy_loss2 = policy_ratio.clamp(min = 0.8, max = 1.2) * advantages # From original paper
        policy_loss2 = policy_ratio.clamp(min=0.8, max=1.2) * advantages # From original paper
        policy_loss = -torch.min(policy_loss1, policy_loss2).sum()

        if self.logger is not None:
            self.logger.append("Loss/Policy", policy_loss.item())
            self.logger.append("Loss/Value", value_loss.item())

        self.old_policy_net.sync()
        self.policy_net.zero_grad()
        policy_loss.backward()
        self.policy_net.step()

        # Memory under the old policy is not needed for future training
        self.memory.clear()
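The clamp(min=0.8, max=1.2) line above is PPO's clipped surrogate objective. Here is a small self-contained sketch with my own illustrative values (not taken from the commit) of what that computation does:

# Sketch: the new/old probability ratio is clipped to [0.8, 1.2], and the pessimistic
# (element-wise minimum) of the clipped and unclipped terms is summed and negated.
import torch

log_prob_batch = torch.tensor([-0.9, -1.2, -0.3])  # hypothetical new log-probs
old_log_probs = torch.tensor([-1.0, -1.0, -1.0])   # hypothetical old log-probs
advantages = torch.tensor([1.0, -0.5, 2.0])        # hypothetical advantages

policy_ratio = torch.exp(log_prob_batch - old_log_probs)
policy_loss1 = policy_ratio * advantages
policy_loss2 = policy_ratio.clamp(min=0.8, max=1.2) * advantages
policy_loss = -torch.min(policy_loss1, policy_loss2).sum()
print(policy_loss)  # the clipped surrogate loss for this toy batch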
@@ -2,16 +2,17 @@ from copy import deepcopy
import collections
import numpy as np
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
import rltorch
import rltorch.memory as M
import torch.nn.functional as F

# Q-Evolutionary Policy Agent
# Maximizes the policy with respect to the Q-Value function.
# Since the function is non-differentiable, depends on the Evolutionary Strategy algorithm
class QEPAgent:
    def __init__(self, policy_net, value_net, memory, config, target_value_net = None, logger = None, entropy_importance = 0, policy_skip = 4, after_value_train = None):
    def __init__(self, policy_net, value_net, memory, config, target_value_net=None, logger=None, entropy_importance=0, policy_skip=4):
        self.policy_net = policy_net
        assert isinstance(self.policy_net, rltorch.network.ESNetwork) or isinstance(self.policy_net, rltorch.network.ESNetworkMP)
        self.policy_net.fitness = self.fitness

@@ -22,7 +23,6 @@ class QEPAgent:
        self.logger = logger
        self.policy_skip = policy_skip
        self.entropy_importance = entropy_importance
        self.after_value_train = after_value_train

    def save(self, file_location):
        torch.save({

@@ -42,10 +42,8 @@ class QEPAgent:
        batch_size = len(state_batch)
        with torch.no_grad():
            action_probabilities = policy_net(state_batch)

        action_size = action_probabilities.shape[1]
        distributions = list(map(Categorical, action_probabilities))

        actions = torch.stack([d.sample() for d in distributions])

        with torch.no_grad():

@@ -54,31 +52,31 @@ class QEPAgent:
        # Weird hacky solution where in multiprocess, it sometimes spits out nans
        # So have it try again
        while torch.isnan(state_values).any():
            print("NAN DETECTED")
            with torch.no_grad():
                state_values = value_net(state_batch)

        obtained_values = state_values.gather(1, actions.view(batch_size, 1)).squeeze(1)

        obtained_values = state_values.gather(1, actions.view(len(state_batch), 1)).squeeze(1)
        # return -obtained_values.mean().item()
        entropy_importance = 0 # Entropy accounting for 1% of loss seems to work well
        entropy_importance = next(self.entropy_importance) if isinstance(self.entropy_importance, collections.Iterable) else self.entropy_importance
        value_importance = 1 - entropy_importance

        # entropy_loss = (action_probabilities * torch.log2(action_probabilities)).sum(1) # Standard entropy loss from information theory
        entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device = state_batch.device).repeat(batch_size, action_size)).abs().sum(1)
        entropy_loss = (action_probabilities - torch.tensor(1 / action_size, device=state_batch.device).repeat(len(state_batch), action_size)).abs().sum(1)

        return (entropy_importance * entropy_loss - value_importance * obtained_values).mean().item()

    def learn(self, logger = None):
    def learn(self, logger=None):
        if len(self.memory) < self.config['batch_size']:
            return

        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            weight_importance = self.config['prioritized_replay_weight_importance']
            # If it's a scheduler then get the next value by calling next, otherwise just use its value
            beta = next(weight_importance) if isinstance(weight_importance, collections.Iterable) else weight_importance
            minibatch = self.memory.sample(self.config['batch_size'], beta = beta)
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority = True)
            minibatch = self.memory.sample(self.config['batch_size'], beta=beta)
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, importance_weights, batch_indexes = M.zip_batch(minibatch, priority=True)
        else:
            minibatch = self.memory.sample(self.config['batch_size'])
            state_batch, action_batch, reward_batch, next_state_batch, not_done_batch = M.zip_batch(minibatch)

@@ -98,7 +96,7 @@ class QEPAgent:
            # and the regular net to select the action
            # That way we decouple the value and action selecting processes (DOUBLE DQN)
            not_done_size = not_done_batch.sum()
            next_state_values = torch.zeros_like(state_values, device = self.value_net.device)
            next_state_values = torch.zeros_like(state_values, device=self.value_net.device)
            if self.target_value_net is not None:
                next_state_values[not_done_batch] = self.target_value_net(next_state_batch[not_done_batch])
                next_best_action = self.value_net(next_state_batch[not_done_batch]).argmax(1)

@@ -106,13 +104,13 @@ class QEPAgent:
                next_state_values[not_done_batch] = self.value_net(next_state_batch[not_done_batch])
                next_best_action = next_state_values[not_done_batch].argmax(1)

            best_next_state_value = torch.zeros(self.config['batch_size'], device = self.value_net.device)
            best_next_state_value = torch.zeros(self.config['batch_size'], device=self.value_net.device)
            best_next_state_value[not_done_batch] = next_state_values[not_done_batch].gather(1, next_best_action.view((not_done_size, 1))).squeeze(1)

        expected_values = (reward_batch + (self.config['discount_rate'] * best_next_state_value)).unsqueeze(1)

        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
            value_loss = (torch.as_tensor(importance_weights, device = self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            value_loss = (torch.as_tensor(importance_weights, device=self.value_net.device) * ((obtained_values - expected_values)**2).squeeze(1)).mean()
        else:
            value_loss = F.mse_loss(obtained_values, expected_values)

@@ -124,16 +122,13 @@ class QEPAgent:
        self.value_net.clamp_gradients()
        self.value_net.step()

        if callable(self.after_value_train):
            self.after_value_train()

        if self.target_value_net is not None:
            if 'target_sync_tau' in self.config:
                self.target_value_net.partial_sync(self.config['target_sync_tau'])
            else:
                self.target_value_net.sync()

        if (isinstance(self.memory, M.PrioritizedReplayMemory)):
        if isinstance(self.memory, M.PrioritizedReplayMemory):
            td_error = (obtained_values - expected_values).detach().abs()
            self.memory.update_priorities(batch_indexes, td_error)

@@ -141,7 +136,8 @@ class QEPAgent:
        if self.policy_skip > 0:
            self.policy_skip -= 1
            return
        self.policy_skip = self.config['policy_skip']
        self.policy_skip = 4

        if self.target_value_net is not None:
            self.policy_net.calc_gradients(self.target_value_net, state_batch)
        else:
@@ -1,10 +1,10 @@
import rltorch
from copy import deepcopy
import torch
import numpy as np
import torch
import rltorch

class REINFORCEAgent:
  def __init__(self, net , memory, config, target_net = None, logger = None):
    def __init__(self, net, memory, config, target_net=None, logger=None):
        self.net = net
        if not isinstance(memory, rltorch.memory.EpisodeMemory):
            raise ValueError("Memory must be of instance EpisodeMemory")

@@ -23,14 +23,14 @@ class REINFORCEAgent:
        for i in range(len(rewards)):
            gammas = torch.ones_like(rewards[i:])
            if i != len(rewards) - 1:
        gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim = 0)
                gammas[1:] = torch.cumprod(torch.tensor(self.config['discount_rate']).repeat(len(rewards) - i - 1), dim=0)
            advantages = rewards[i:] - baseline
            shaped_rewards[i] = (gammas * advantages).sum()
        return shaped_rewards

    def learn(self):
        episode_batch = self.memory.recall()
    state_batch, action_batch, reward_batch, next_state_batch, done_batch, log_prob_batch = zip(*episode_batch)
        _, _, reward_batch, _, _, log_prob_batch = zip(*episode_batch)

        # Calculate discounted rewards to place more importance on recent rewards
        shaped_reward_batch = self._shape_rewards(torch.tensor(reward_batch))
@@ -1,6 +1,6 @@
from .A2CSingleAgent import *
from .DQNAgent import *
from .DQfDAgent import *
from .PPOAgent import *
from .QEPAgent import *
from .REINFORCEAgent import *
from .A2CSingleAgent import A2CSingleAgent
from .DQNAgent import DQNAgent
from .DQfDAgent import DQfDAgent
from .PPOAgent import PPOAgent
from .QEPAgent import QEPAgent
from .REINFORCEAgent import REINFORCEAgent
rltorch/env/simulate.py (12 changes, vendored)

@@ -1,8 +1,8 @@
from copy import deepcopy
import rltorch
import time
import rltorch

def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger = None, name = "", render = False):
def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=None, name="", render=False):
    for episode in range(total_episodes):
        state = env.reset()
        done = False

@@ -27,8 +27,8 @@ def simulateEnvEps(env, actor, config, total_episodes = 1, memory = None, logger
            logger.append(name + '/EpisodeReward', episode_reward)


class EnvironmentRunSync():
  def __init__(self, env, actor, config, memory = None, logwriter = None, name = "", render = False):
class EnvironmentRunSync:
    def __init__(self, env, actor, config, memory=None, logwriter=None, name="", render=False):
        self.env = env
        self.name = name
        self.actor = actor

@@ -72,8 +72,8 @@ class EnvironmentRunSync():
        self.last_state = state


class EnvironmentEpisodeSync():
  def __init__(self, env, actor, config, memory = None, logwriter = None, name = ""):
class EnvironmentEpisodeSync:
    def __init__(self, env, actor, config, memory=None, logwriter=None, name=""):
        self.env = env
        self.name = name
        self.actor = actor
rltorch/env/wrappers.py (20 changes, vendored)

@@ -1,8 +1,8 @@
from collections import deque
import gym
import torch
from gym import spaces
import cv2
from collections import deque
import numpy as np

class EpisodicLifeEnv(gym.Wrapper):

@@ -170,7 +170,12 @@ class FrameStack(gym.Wrapper):
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
    self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(shp[:-1] + (shp[-1] * k,)),
            dtype=env.observation_space.dtype
        )

    def reset(self):
        ob = self.env.reset()

@@ -189,7 +194,7 @@ class FrameStack(gym.Wrapper):
        return torch.cat(list(self.frames)).unsqueeze(0)

class ProcessFrame(gym.Wrapper):
  def __init__(self, env, resize_shape = None, crop_bounds = None, grayscale = False):
    def __init__(self, env, resize_shape=None, crop_bounds=None, grayscale=False):
        gym.Wrapper.__init__(self, env)
        self.resize_shape = resize_shape
        self.crop_bounds = crop_bounds

@@ -207,14 +212,16 @@ class ProcessFrame(gym.Wrapper):
        if self.grayscale:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        if self.crop_bounds is not None and len(self.crop_bounds) == 4:
        frame = frame[self.crop_bounds[0]:self.crop_bounds[1], self.crop_bounds[2]:self.crop_bounds[3]]
            frame = frame[
                self.crop_bounds[0]:self.crop_bounds[1],
                self.crop_bounds[2]:self.crop_bounds[3]
            ]
        if self.resize_shape is not None and len(self.resize_shape) == 2:
            frame = cv2.resize(frame, self.resize_shape, interpolation=cv2.INTER_AREA)
        # Normalize
        frame = frame / 255
        return frame


# Turns observations into torch tensors
# Adds an additional dimension that's supposed to represent the batch dim
class TorchWrap(gym.Wrapper):

@@ -233,8 +240,6 @@ class TorchWrap(gym.Wrapper):
        frame = torch.from_numpy(frame).unsqueeze(0).float()
        return frame


class ProcessFrame84(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)

@@ -256,4 +261,3 @@ class ProcessFrame84(gym.ObservationWrapper):
        x_t = resized_screen[18:102, :]
        x_t = np.reshape(x_t, [84, 84])
        return x_t.astype(np.uint8)
@@ -1,13 +1,13 @@
from .PrioritizedReplayMemory import PrioritizedReplayMemory
from collections import namedtuple
import numpy as np
from .PrioritizedReplayMemory import PrioritizedReplayMemory

Transition = namedtuple('Transition',
    ('state', 'action', 'reward', 'next_state', 'done'))


class DQfDMemory(PrioritizedReplayMemory):
    def __init__(self, capacity, alpha, max_demo = -1):
    def __init__(self, capacity, alpha, max_demo=-1):
        assert max_demo <= capacity
        super().__init__(capacity, alpha)
        self.demo_position = 0

@@ -47,7 +47,8 @@ class DQfDMemory(PrioritizedReplayMemory):
        idxes = self._sample_proportional(sample_size)
        step_idxes = []
        for i in idxes:
            # If the interval of experiences fall between demonstration and obtained, move it over to the demonstration half
            # If the interval of experiences fall between demonstration and obtained,
            # move it over to the demonstration half
            if i < self.demo_position and i + steps > self.demo_position:
                diff = i + steps - self.demo_position
                step_idxes += range(i - diff, i + steps - diff)
@@ -1,6 +1,4 @@
import random
from collections import namedtuple
import torch
Transition = namedtuple('Transition',
    ('state', 'action', 'reward', 'next_state', 'done'))
@@ -1,10 +1,9 @@
# From OpenAI Baselines https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py

from .ReplayMemory import ReplayMemory
import operator
import random
import numpy as np
from numba import jit
from .ReplayMemory import ReplayMemory

class SegmentTree(object):
    def __init__(self, capacity, operation, neutral_element):

@@ -34,7 +33,7 @@ class SegmentTree(object):
        self._value = [neutral_element for _ in range(2 * capacity)]
        self._operation = operation

    @jit(forceobj = True)
    @jit(forceobj=True)
    def _reduce_helper(self, start, end, node, node_start, node_end):
        if start == node_start and end == node_end:
            return self._value[node]

@@ -50,7 +49,7 @@ class SegmentTree(object):
                    self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
                )

    @jit(forceobj = True)
    @jit(forceobj=True)
    def reduce(self, start=0, end=None):
        """Returns result of applying `self.operation`
        to a contiguous subsequence of the array.

@@ -73,7 +72,7 @@ class SegmentTree(object):
        end -= 1
        return self._reduce_helper(start, end, 1, 0, self._capacity - 1)

    @jit(forceobj = True)
    @jit(forceobj=True)
    def __setitem__(self, idx, val):
        # index of the leaf
        idx += self._capacity

@@ -86,7 +85,7 @@ class SegmentTree(object):
            )
            idx //= 2

    @jit(forceobj = True)
    @jit(forceobj=True)
    def __getitem__(self, idx):
        assert 0 <= idx < self._capacity
        return self._value[self._capacity + idx]

@@ -100,12 +99,12 @@ class SumSegmentTree(SegmentTree):
            neutral_element=0.0
        )

    @jit(forceobj = True)
    @jit(forceobj=True)
    def sum(self, start=0, end=None):
        """Returns arr[start] + ... + arr[end]"""
        return super(SumSegmentTree, self).reduce(start, end)

    @jit(forceobj = True, parallel = True)
    @jit(forceobj=True, parallel=True)
    def find_prefixsum_idx(self, prefixsum):
        """Find the highest index `i` in the array such that
            sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum

@@ -140,7 +139,7 @@ class MinSegmentTree(SegmentTree):
            neutral_element=float('inf')
        )

    @jit(forceobj = True)
    @jit(forceobj=True)
    def min(self, start=0, end=None):
        """Returns min(arr[start], ...,  arr[end])"""
        return super(MinSegmentTree, self).reduce(start, end)

@@ -185,7 +184,7 @@ class PrioritizedReplayMemory(ReplayMemory):
        self._it_sum[idx] = self._max_priority ** self._alpha
        self._it_min[idx] = self._max_priority ** self._alpha

    @jit(forceobj = True)
    @jit(forceobj=True)
    def _sample_proportional(self, batch_size):
        res = []
        p_total = self._it_sum.sum(0, len(self.memory) - 1)

@@ -294,7 +293,7 @@ class PrioritizedReplayMemory(ReplayMemory):
        batch = list(zip(*encoded_sample, weights, step_idxes))
        return batch

    @jit(forceobj = True)
    @jit(forceobj=True)
    def update_priorities(self, idxes, priorities):
        """
        Update priorities of sampled transitions.

@@ -320,4 +319,3 @@ class PrioritizedReplayMemory(ReplayMemory):
            self._it_min[idx] = priority ** self._alpha

            self._max_priority = max(self._max_priority, priority)
@@ -106,11 +106,9 @@ class ReplayMemory(object):
    def __reversed__(self):
        return reversed(self.memory)

def zip_batch(minibatch, priority = False, want_indices = False):
def zip_batch(minibatch, priority=False):
    if priority:
        state_batch, action_batch, reward_batch, next_state_batch, done_batch, weights, indexes = zip(*minibatch)
    elif want_indices:
        state_batch, action_batch, reward_batch, next_state_batch, done_batch, indexes = zip(*minibatch)
    else:
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*minibatch)

@@ -122,7 +120,5 @@ def zip_batch(minibatch, priority = False, want_indices = False):

    if priority:
        return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, weights, indexes
    elif want_indices:
        return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch, indexes
    else:
        return state_batch, action_batch, reward_batch, next_state_batch, not_done_batch
@@ -5,7 +5,7 @@ from copy import deepcopy
import torch.multiprocessing as mp

class EnvironmentEpisode(mp.Process):
  def __init__(self, env, actor, config, logger = None, name = ""):
    def __init__(self, env, actor, config, logger=None, name=""):
        super(EnvironmentEpisode, self).__init__()
        self.env = env
        self.actor = actor
@@ -14,7 +14,7 @@ class EnvironmentEpisode(mp.Process):
        self.name = name
        self.episode_num = 1

  def run(self, printstat = False, memory = None):
    def run(self, printstat=False, memory=None):
        state = self.env.reset()
        done = False
        episode_reward = 0
@@ -34,86 +34,3 @@ class EnvironmentEpisode(mp.Process):
            self.logger.append(self.name + '/EpisodeReward', episode_reward)

        self.episode_num += 1

# from copy import deepcopy
# import torch.multiprocessing as mp
# from ctypes import *
# import rltorch.log

# def envepisode(actor, env, episode_num, config, runcondition, memoryqueue = None, logqueue = None, name = ""):
#   # Wait for signal to start running through the environment
#   while runcondition.wait():
#     # Start a logger to log the rewards
#     logger = rltorch.log.Logger()
#     state = env.reset()
#     episode_reward = 0
#     done = False
#     while not done:
#       action = actor.act(state)
#       next_state, reward, done, _ = env.step(action)

#       episode_reward += reward
#       if memoryqueue is not None:
#         memoryqueue.put((state, action, reward, next_state, done))

#       state = next_state

#       if done:
#         with episode_num.get_lock():
#           if episode_num.value % config['print_stat_n_eps'] == 0:
#             print("episode: {}/{}, score: {}"
#               .format(episode_num.value, config['total_training_episodes'], episode_reward))

#         if logger is not None:
#           logger.append(name + '/EpisodeReward', episode_reward)
#         episode_reward = 0
#         state = env.reset()
#         with episode_num.get_lock():
#           episode_num.value +=  1

#     logqueue.put(logger)

# class EnvironmentRun():
#   def __init__(self, env_func, actor, config, memory = None, name = ""):
#     self.config = deepcopy(config)
#     self.memory = memory
#     self.episode_num = mp.Value(c_uint)
#     self.runcondition = mp.Event()
#     # Interestingly enough, there isn't a good reliable way to know how many states an episode will have
#     # Perhaps we can share a uint to keep track...
#     self.memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1)
#     self.logqueue = mp.Queue(maxsize = 1)
#     with self.episode_num.get_lock():
#       self.episode_num.value = 1
#     self.runner = mp.Process(target=envrun,
#       args=(actor, env_func, self.episode_num, config, self.runcondition),
#       kwargs = {'iterations': config['replay_skip'] + 1,
#         'memoryqueue' : self.memory_queue, 'logqueue' : self.logqueue, 'name' : name})
#     self.runner.start()

#   def run(self):
#     self.runcondition.set()

#   def join(self):
#     self._sync_memory()
#     if self.logwriter is not None:
#       self.logwriter.write(self._get_reward_logger())

#   def sync_memory(self):
#     if self.memory is not None:
#       for i in range(self.config['replay_skip'] + 1):
#         self.memory.append(*self.memory_queue.get())

#   def get_reward_logger(self):
#     return self.logqueue.get()

#   def terminate(self):
#     self.runner.terminate()
@@ -1,9 +1,9 @@
from copy import deepcopy
from ctypes import c_uint
import torch.multiprocessing as mp
from ctypes import *
import rltorch.log

def envrun(actor, env, episode_num, config, runcondition, iterations = 1, memoryqueue = None, logqueue = None, name = ""):
def envrun(actor, env, episode_num, config, runcondition, iterations=1, memoryqueue=None, logqueue=None, name=""):
    state = env.reset()
    episode_reward = 0
    # Wait for signal to start running through the environment
@@ -1,7 +1,8 @@
from copy import deepcopy
import numpy as np
import torch
from .Network import Network
from copy import deepcopy


# [TODO] Should we torch.no_grad the __call__?
# What if we want to sometimes do gradient descent as well?
@@ -38,7 +39,7 @@ class ESNetwork(Network):
    name
      For use in logger to differentiate in analysis.
    """
    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""):
    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""):
        super(ESNetwork, self).__init__(model, optimizer, config, device, logger, name)
        self.population_size = population_size
        self.fitness = fitness_fn
@@ -64,7 +65,11 @@ class ESNetwork(Network):
        white_noise_dict = {}
        noise_dict = {}
        for key in model_dict.keys():
            white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device)
            white_noise_dict[key] = torch.randn(
                self.population_size,
                *model_dict[key].shape,
                device=self.device
            )
            noise_dict[key] = self.sigma * white_noise_dict[key]
        return white_noise_dict, noise_dict

@@ -96,7 +101,10 @@ class ESNetwork(Network):
        candidate_solutions = self._generate_candidate_solutions(noise_dict)

        ## Calculate fitness then mean shift, scale
        fitness_values = torch.tensor([self.fitness(x, *args) for x in candidate_solutions], device = self.device)
        fitness_values = torch.tensor(
            [self.fitness(x, *args) for x in candidate_solutions],
            device=self.device
        )
        if self.logger is not None:
            self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
        fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
@@ -1,9 +1,8 @@
from copy import deepcopy
import numpy as np
import torch
from .Network import Network
from copy import deepcopy
import torch.multiprocessing as mp
import functools
from .Network import Network

class fn_copy:
    def __init__(self, fn, args):
@@ -20,14 +19,15 @@ class ESNetworkMP(Network):
    fitness_fun := model, *args -> fitness_value (float)
    We wish to find a model that maximizes the fitness function
    """
    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma = 0.05, device = None, logger = None, name = ""):
    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""):
        super(ESNetworkMP, self).__init__(model, optimizer, config, device, logger, name)
        self.population_size = population_size
        self.fitness = fitness_fn
        self.sigma = sigma
        assert self.sigma > 0
        mp_ctx = mp.get_context("spawn")
        self.pool = mp_ctx.Pool(processes=2) #[TODO] Probably should make number of processes a config variable
        #[TODO] Probably should make number of processes a config variable
        self.pool = mp_ctx.Pool(processes=2)

    # We're not going to be calculating gradients in the traditional way
    # So there's no need to waste computation time keeping track
@@ -42,7 +42,11 @@ class ESNetworkMP(Network):
        white_noise_dict = {}
        noise_dict = {}
        for key in model_dict.keys():
            white_noise_dict[key] = torch.randn(self.population_size, *model_dict[key].shape, device = self.device)
            white_noise_dict[key] = torch.randn(
                self.population_size,
                *model_dict[key].shape,
                device=self.device
            )
            noise_dict[key] = self.sigma * white_noise_dict[key]
        return white_noise_dict, noise_dict

@@ -67,7 +71,10 @@ class ESNetworkMP(Network):
        candidate_solutions = self._generate_candidate_solutions(noise_dict)

        ## Calculate fitness then mean shift, scale
        fitness_values = torch.tensor(list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)), device = self.device)
        fitness_values = torch.tensor(
            list(self.pool.map(fn_copy(self.fitness, args), candidate_solutions)),
            device=self.device
        )

        if self.logger is not None:
            self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
@@ -17,12 +17,16 @@ class Network:
    name
      For use in logger to differentiate in analysis.
    """
    def __init__(self, model, optimizer, config, device = None, logger = None, name = ""):
    def __init__(self, model, optimizer, config, device=None, logger=None, name=""):
        self.model = model
        if 'weight_decay' in config:
            self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'], weight_decay = config['weight_decay'])
            self.optimizer = optimizer(
                model.parameters(),
                lr=config['learning_rate'],
                weight_decay=config['weight_decay']
            )
        else:
            self.optimizer = optimizer(model.parameters(), lr = config['learning_rate'])
            self.optimizer = optimizer(model.parameters(), lr=config['learning_rate'])
        self.logger = logger
        self.name = name
        self.device = device
@@ -32,7 +36,7 @@ class Network:
    def __call__(self, *args):
        return self.model(*args)

    def clamp_gradients(self, x = 1):
    def clamp_gradients(self, x=1):
        """
        Forcing gradients to stay within a certain interval
        by setting it to the bound if it goes over it.
@@ -1,7 +1,8 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


# This class utilizes this property of the normal distribution
# N(mu, sigma) = mu + sigma * N(0, 1)
@@ -10,7 +11,6 @@ class NoisyLinear(nn.Linear):
    Draws the parameters of nn.Linear from a normal distribution.
    The parameters of the normal distribution are registered as
    learnable parameters in the neural network.

    Parameters
    ----------
    in_features
@@ -24,8 +24,8 @@ class NoisyLinear(nn.Linear):
       learn an additive bias.
       Default: True
    """
  def __init__(self, in_features, out_features, sigma_init = 0.017, bias = True):
    super(NoisyLinear, self).__init__(in_features, out_features, bias = bias)
    def __init__(self, in_features, out_features, sigma_init=0.017, bias=True):
        super(NoisyLinear, self).__init__(in_features, out_features, bias=bias)
        # One of the parameters the network is going to tune is the
        # standard deviation of the gaussian noise on the weights
        self.sigma_weight = nn.Parameter(torch.Tensor(out_features, in_features).fill_(sigma_init))
@@ -11,7 +11,7 @@ class TargetNetwork:
    device
      The device to put the cloned parameters in.
    """
    def __init__(self, network, device = None):
    def __init__(self, network, device=None):
        self.model = network.model
        self.target_model = deepcopy(network.model)
        if device is not None:
@@ -37,7 +37,8 @@ class TargetNetwork:
        Parameters
        ----------
        tau : number
          A number between 0-1 which indicates the proportion of the originator and clone in the new clone.
          A number between 0-1 which indicates
          the proportion of the originator and clone in the new clone.
        """
        assert isinstance(tau, float)
        assert 0.0 < tau <= 1.0
@@ -1,5 +1,5 @@
from .ESNetwork import *
from .ESNetworkMP import *
from .Network import *
from .NoisyLinear import *
from .TargetNetwork import *
from .ESNetwork import ESNetwork
from .ESNetworkMP import ESNetworkMP
from .Network import Network
from .NoisyLinear import NoisyLinear
from .TargetNetwork import TargetNetwork
@@ -36,4 +36,3 @@ class ExponentialScheduler(Scheduler):
            return self.initial_value * (self.base ** (self.current_iteration - 1))
        else:
            return self.end_value
@@ -7,4 +7,4 @@ class Scheduler():
    def __iter__(self):
        return self
    def __next__(self):
        raise NotImplementedError("Scheduler does not have it's function to create a value implemented")
        raise NotImplementedError("__next__ not implemented in Scheduler")