diff --git a/examples/acrobot.py b/examples/acrobot.py
index 8b33cc1..94692de 100644
--- a/examples/acrobot.py
+++ b/examples/acrobot.py
@@ -65,77 +65,77 @@ config['prioritized_replay_sampling_priority'] = 0.6
 # 1 - Lower the importance of high losses
 # Should ideally start from 0 and move your way to 1 to prevent overfitting
 config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 5000)
-
-def train(runner, agent, config, logwriter = None, memory = None):
+
+def train(runner, agent, config, logger = None, logwriter = None):
     finished = False
-    episode_num = 1
-    memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1)
+    last_episode_num = 1
     while not finished:
-        runner.run(config['replay_skip'] + 1, printstat = runner.episode_num % config['print_stat_n_eps'] == 0, memory = memory_queue)
+        runner.run()
         agent.learn()
         runner.join()
-        for i in range(config['replay_skip'] + 1):
-            memory.append(*memory_queue.get())
-        # When the episode number changes, write out the weight histograms
-        if logwriter is not None and episode_num < runner.episode_num:
-            episode_num = runner.episode_num
-            agent.net.log_named_parameters()
-
-        if logwriter is not None:
-            logwriter.write()
-        finished = runner.episode_num > config['total_training_episodes']
+        # When the episode number changes, log network parameters
+        with runner.episode_num.get_lock():
+            if logwriter is not None and last_episode_num < runner.episode_num.value:
+                last_episode_num = runner.episode_num.value
+                agent.net.log_named_parameters()
+        if logwriter is not None:
+            logwriter.write(logger)
+        finished = runner.episode_num.value > config['total_training_episodes']
 
-torch.multiprocessing.set_sharing_strategy('file_system') # To not hit file descriptor memory limit
-# Setting up the environment
-rltorch.set_seed(config['seed'])
-print("Setting up environment...", end = " ")
-env = E.TorchWrap(gym.make(config['environment_name']))
-env.seed(config['seed'])
-print("Done.")
+
+if __name__ == "__main__":
+    torch.multiprocessing.set_sharing_strategy('file_system') # To not hit file descriptor memory limit
+
+    # Setting up the environment
+    rltorch.set_seed(config['seed'])
+    print("Setting up environment...", end = " ")
+    env = E.TorchWrap(gym.make(config['environment_name']))
+    env.seed(config['seed'])
+    print("Done.")
+
+    state_size = env.observation_space.shape[0]
+    action_size = env.action_space.n
+
+    # Logging
+    logger = rltorch.log.Logger()
+    # logwriter = rltorch.log.LogWriter(logger, SummaryWriter())
+    logwriter = rltorch.log.LogWriter(SummaryWriter())
+
+    # Setting up the networks
+    device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
+    net = rn.Network(Value(state_size, action_size),
+                     torch.optim.Adam, config, device = device, name = "DQN")
+    target_net = rn.TargetNetwork(net, device = device)
+    net.model.share_memory()
+    target_net.model.share_memory()
+
+    # Actor takes a net and uses it to produce actions from given states
+    actor = ArgMaxSelector(net, action_size, device = device)
+    # Memory stores experiences for later training
+    memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
+    # memory = M.ReplayMemory(capacity = config['memory_size'])
+
+    # Runner performs a certain number of steps in the environment
+    runner = rltorch.mp.EnvironmentRun(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
+
+    # Agent is what performs the training
+    agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger)
 
-state_size = env.observation_space.shape[0]
-action_size = env.action_space.n
+    print("Training...")
 
-# Logging
-logger = rltorch.log.Logger()
-logwriter = rltorch.log.LogWriter(logger, SummaryWriter())
+    train(runner, agent, config, logger = logger, logwriter = logwriter)
+    # For profiling...
+    # import cProfile
+    # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
+    # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
 
-# Setting up the networks
-device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
-net = rn.Network(Value(state_size, action_size),
-                 torch.optim.Adam, config, device = device, logger = logger, name = "DQN")
-target_net = rn.TargetNetwork(net, device = device)
-net.model.share_memory()
-target_net.model.share_memory()
+    print("Training Finished.")
+    runner.terminate() # We don't need the extra process anymore
 
-# Actor takes a net and uses it to produce actions from given states
-actor = ArgMaxSelector(net, action_size, device = device)
-# Memory stores experiences for later training
-memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
-# memory = M.ReplayMemory(capacity = config['memory_size'])
+    print("Evaluating...")
+    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+    print("Evaluations Done.")
 
-# Runner performs a certain number of steps in the environment
-runner = rltorch.mp.EnvironmentRun(env, actor, config, logger = logger, name = "Training")
-runner.start()
-
-# Agent is what performs the training
-agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger)
-
-print("Training...")
-train(runner, agent, config, logwriter = logwriter, memory = memory)
-
-# For profiling...
-# import cProfile
-# cProfile.run('train(runner, agent, config, logwriter = logwriter )')
-# python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
-
-print("Training Finished.")
-runner.terminate() # We don't need the extra process anymore
-
-print("Evaluating...")
-rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
-print("Evaulations Done.")
-
-logwriter.close() # We don't need to write anything out to disk anymore
+    logwriter.close() # We don't need to write anything out to disk anymore
diff --git a/examples/pong.py b/examples/pong.py
index a9b67e4..c9820a2 100644
--- a/examples/pong.py
+++ b/examples/pong.py
@@ -88,76 +88,60 @@ config['prioritized_replay_sampling_priority'] = 0.6
 # Should ideally start from 0 and move your way to 1 to prevent overfitting
 config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 5000)
 
-def train(runner, agent, config, logwriter = None, memory = None):
-    finished = False
-    episode_num = 1
-    memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1)
-    while not finished:
-        runner.run(config['replay_skip'] + 1, printstat = runner.episode_num % config['print_stat_n_eps'] == 0, memory = memory_queue)
-        agent.learn()
-        runner.join()
-        for i in range(config['replay_skip'] + 1):
-            memory.append(*memory_queue.get())
-        # When the episode number changes, write out the weight histograms
-        if logwriter is not None and episode_num < runner.episode_num:
-            episode_num = runner.episode_num
-            agent.net.log_named_parameters()
-
-        if logwriter is not None:
-            logwriter.write()
-        finished = runner.episode_num > config['total_training_episodes']
+if __name__ == "__main__":
+    torch.multiprocessing.set_sharing_strategy('file_system') # To not hit file descriptor memory limit
 
-
-torch.multiprocessing.set_sharing_strategy('file_system') # To not hit file descriptor memory limit
-rltorch.set_seed(config['seed'])
-print("Setting up environment...", end = " ")
-env = E.FrameStack(E.TorchWrap(
+    # Setting up the environment
+    rltorch.set_seed(config['seed'])
+    print("Setting up environment...", end = " ")
+    env = E.FrameStack(E.TorchWrap(
         E.ProcessFrame(E.FireResetEnv(gym.make(config['environment_name'])),
-        resize_shape = (80, 80), crop_bounds = [34, 194, 15, 145], grayscale = True))
-, 4)
-env.seed(config['seed'])
-print("Done.")
+            resize_shape = (80, 80), crop_bounds = [34, 194, 15, 145], grayscale = True))
+    , 4)
+    env.seed(config['seed'])
+    print("Done.")
+
+    state_size = env.observation_space.shape[0]
+    action_size = env.action_space.n
+
+    # Logging
+    logger = rltorch.log.Logger()
+    logwriter = rltorch.log.LogWriter(SummaryWriter())
+
+    # Setting up the networks
+    device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
+    net = rn.Network(Value(state_size, action_size),
+                     torch.optim.Adam, config, device = device, name = "DQN")
+    target_net = rn.TargetNetwork(net, device = device)
+    net.model.share_memory()
+    target_net.model.share_memory()
+
+    # Actor takes a net and uses it to produce actions from given states
+    actor = ArgMaxSelector(net, action_size, device = device)
+    # Memory stores experiences for later training
+    memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
+    # memory = M.ReplayMemory(capacity = config['memory_size'])
+
+    # Runner performs a certain number of steps in the environment
+    runner = rltorch.mp.EnvironmentRun(env, actor, config, name = "Training", memory = memory, logwriter = logwriter)
+
+    # Agent is what performs the training
+    agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger)
 
-state_size = env.observation_space.shape[0]
-action_size = env.action_space.n
+    print("Training...")
 
-# Logging
-logger = rltorch.log.Logger()
-logwriter = rltorch.log.LogWriter(logger, SummaryWriter())
+    train(runner, agent, config, logger = logger, logwriter = logwriter)
 
-# Setting up the networks
-device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu")
-net = rn.Network(Value(state_size, action_size),
-                 torch.optim.Adam, config, device = device, logger = logger, name = "DQN")
-target_net = rn.TargetNetwork(net, device = device)
-net.model.share_memory()
-target_net.model.share_memory()
+    # For profiling...
+    # import cProfile
+    # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )')
+    # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
 
-# Actor takes a network and uses it to produce actions from given states
-actor = ArgMaxSelector(net, action_size, device = device)
-# Memory stores experiences for later training
-memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority'])
+    print("Training Finished.")
+    runner.terminate() # We don't need the extra process anymore
 
-# Runner performs a certain number of steps in the environment
-runner = rltorch.mp.EnvironmentRun(env, actor, config, logger = logger, name = "Training")
-runner.start()
+    print("Evaluating...")
+    rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
+    print("Evaluations Done.")
 
-# Agent is what performs the training
-agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger)
-
-print("Training...")
-train(runner, agent, config, logwriter = logwriter, memory = memory)
-
-# For profiling...
-# import cProfile
-# cProfile.run('train(runner, agent, config, logwriter = logwriter )')
-# python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution...
-
-print("Training Finished.")
-runner.terminate() # We don't need the extra process anymore
-
-print("Evaluating...")
-rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation")
-print("Evaulations Done.")
-
-logwriter.close() # We don't need to write anything out to disk anymore
+    logwriter.close() # We don't need to write anything out to disk anymore
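Both example scripts now call `net.model.share_memory()` (and the same for the target network) before handing the actor to `EnvironmentRun`. A small illustration of what that buys, written by me rather than taken from rltorch: once a module's parameters live in shared memory, in-place updates made in one process are visible in the other, which is what lets the learner's weight updates reach the environment process.

```python
import torch
import torch.nn as nn
import torch.multiprocessing as mp

def zero_weights(model):
    # Child process: overwrite the shared parameters in place.
    with torch.no_grad():
        for p in model.parameters():
            p.fill_(0.0)

if __name__ == "__main__":
    model = nn.Linear(4, 2)
    model.share_memory()  # move parameter storage into shared memory

    proc = mp.Process(target=zero_weights, args=(model,))
    proc.start()
    proc.join()

    # Because the storage is shared, the parent observes the child's update.
    print(next(model.parameters()))  # tensor of zeros
```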
- -print("Training Finished.") -runner.terminate() # We don't need the extra process anymore - -print("Evaluating...") -rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation") -print("Evaulations Done.") - -logwriter.close() # We don't need to write anything out to disk anymore + logwriter.close() # We don't need to write anything out to disk anymore diff --git a/rltorch/agents/DQNAgent.py b/rltorch/agents/DQNAgent.py index 8dd7233..8e665ab 100644 --- a/rltorch/agents/DQNAgent.py +++ b/rltorch/agents/DQNAgent.py @@ -13,7 +13,7 @@ class DQNAgent: self.config = deepcopy(config) self.logger = logger - def learn(self): + def learn(self, logger = None): if len(self.memory) < self.config['batch_size']: return diff --git a/rltorch/log.py b/rltorch/log.py index 37e5343..efb4ae2 100644 --- a/rltorch/log.py +++ b/rltorch/log.py @@ -9,6 +9,8 @@ class Logger: if tag not in self.log.keys(): self.log[tag] = [] self.log[tag].append(value) + def clear(self): + self.log.clear() def keys(self): return self.log.keys() def __len__(self): @@ -25,20 +27,37 @@ class Logger: return reversed(self.log) # Workaround since we can't use SummaryWriter in a different process +# class LogWriter: +# def __init__(self, logger, writer): +# self.logger = logger +# self.writer = writer +# self.steps = Counter() +# def write(self): +# for key in self.logger.keys(): +# for value in self.logger[key]: +# self.steps[key] += 1 +# if isinstance(value, int) or isinstance(value, float): +# self.writer.add_scalar(key, value, self.steps[key]) +# if isinstance(value, np.ndarray) or isinstance(value, torch.Tensor): +# self.writer.add_histogram(key, value, self.steps[key]) +# self.logger.log = {} +# def close(self): +# self.writer.close() + + class LogWriter: - def __init__(self, logger, writer): - self.logger = logger + def __init__(self, writer): self.writer = writer self.steps = Counter() - def write(self): - for key in self.logger.keys(): - for value in self.logger[key]: + def write(self, logger): + for key in logger.keys(): + for value in logger[key]: self.steps[key] += 1 if isinstance(value, int) or isinstance(value, float): self.writer.add_scalar(key, value, self.steps[key]) if isinstance(value, np.ndarray) or isinstance(value, torch.Tensor): self.writer.add_histogram(key, value, self.steps[key]) - self.logger.log = {} + logger.clear() def close(self): self.writer.close() diff --git a/rltorch/memory/PrioritizedReplayMemory.py b/rltorch/memory/PrioritizedReplayMemory.py index a3f860b..4f738bf 100644 --- a/rltorch/memory/PrioritizedReplayMemory.py +++ b/rltorch/memory/PrioritizedReplayMemory.py @@ -246,7 +246,8 @@ class PrioritizedReplayMemory(ReplayMemory): assert len(idxes) == len(priorities) priorities += np.finfo('float').eps for idx, priority in zip(idxes, priorities): - assert priority > 0 + if priority < 0: + priority = np.finfo('float').eps assert 0 <= idx < len(self.memory) self._it_sum[idx] = priority ** self._alpha self._it_min[idx] = priority ** self._alpha diff --git a/rltorch/mp/EnvironmentEpisode.py b/rltorch/mp/EnvironmentEpisode.py index 7be789b..c753e29 100644 --- a/rltorch/mp/EnvironmentEpisode.py +++ b/rltorch/mp/EnvironmentEpisode.py @@ -1,3 +1,6 @@ +# EnvironmentEpisode is currently under maintenance +# Feel free to use the old API, though it is scheduled to change soon. 
diff --git a/rltorch/mp/EnvironmentEpisode.py b/rltorch/mp/EnvironmentEpisode.py
index 7be789b..c753e29 100644
--- a/rltorch/mp/EnvironmentEpisode.py
+++ b/rltorch/mp/EnvironmentEpisode.py
@@ -1,3 +1,6 @@
+# EnvironmentEpisode is currently under maintenance
+# Feel free to use the old API, though it is scheduled to change soon.
+
 from copy import deepcopy
 import torch.multiprocessing as mp
 
@@ -32,3 +35,85 @@
 
         self.episode_num += 1
 
+
+
+
+
+
+
+
+# from copy import deepcopy
+# import torch.multiprocessing as mp
+# from ctypes import *
+# import rltorch.log
+
+# def envepisode(actor, env, episode_num, config, runcondition, memoryqueue = None, logqueue = None, name = ""):
+#     # Wait for signal to start running through the environment
+#     while runcondition.wait():
+#         # Start a logger to log the rewards
+#         logger = rltorch.log.Logger()
+#         state = env.reset()
+#         episode_reward = 0
+#         done = False
+#         while not done:
+#             action = actor.act(state)
+#             next_state, reward, done, _ = env.step(action)
+
+#             episode_reward += reward
+#             if memoryqueue is not None:
+#                 memoryqueue.put((state, action, reward, next_state, done))
+
+#             state = next_state
+
+#             if done:
+#                 with episode_num.get_lock():
+#                     if episode_num.value % config['print_stat_n_eps'] == 0:
+#                         print("episode: {}/{}, score: {}"
+#                             .format(episode_num.value, config['total_training_episodes'], episode_reward))
+
+#                 if logger is not None:
+#                     logger.append(name + '/EpisodeReward', episode_reward)
+#                 episode_reward = 0
+#                 state = env.reset()
+#                 with episode_num.get_lock():
+#                     episode_num.value += 1
+
+#         logqueue.put(logger)
+
+# class EnvironmentRun():
+#     def __init__(self, env_func, actor, config, memory = None, name = ""):
+#         self.config = deepcopy(config)
+#         self.memory = memory
+#         self.episode_num = mp.Value(c_uint)
+#         self.runcondition = mp.Event()
+#         # Interestingly enough, there isn't a good reliable way to know how many states an episode will have
+#         # Perhaps we can share a uint to keep track...
+#         self.memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1)
+#         self.logqueue = mp.Queue(maxsize = 1)
+#         with self.episode_num.get_lock():
+#             self.episode_num.value = 1
+#         self.runner = mp.Process(target=envrun,
+#             args=(actor, env_func, self.episode_num, config, self.runcondition),
+#             kwargs = {'iterations': config['replay_skip'] + 1,
+#                 'memoryqueue' : self.memory_queue, 'logqueue' : self.logqueue, 'name' : name})
+#         self.runner.start()
+
+#     def run(self):
+#         self.runcondition.set()
+
+#     def join(self):
+#         self._sync_memory()
+#         if self.logwriter is not None:
+#             self.logwriter.write(self._get_reward_logger())
+
+#     def sync_memory(self):
+#         if self.memory is not None:
+#             for i in range(self.config['replay_skip'] + 1):
+#                 self.memory.append(*self.memory_queue.get())
+
+#     def get_reward_logger(self):
+#         return self.logqueue.get()
+
+#     def terminate(self):
+#         self.runner.terminate()
+
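Both the commented-out draft above and the `EnvironmentRun` rewrite below coordinate the main process with the environment process through an `mp.Event` (`runcondition`) plus bounded queues. A stripped-down sketch of that handshake, using the standard library and invented names to stand in for `envrun`, `runcondition`, and the memory queue:

```python
import multiprocessing as mp

def env_worker(go, transitions, steps):
    # Invented stand-in for envrun(): block until signaled, then push a batch of transitions.
    go.wait()
    for i in range(steps):
        transitions.put(("state", "action", i))  # placeholder transition tuple

if __name__ == "__main__":
    steps = 4                                    # plays the role of replay_skip + 1
    go = mp.Event()
    transitions = mp.Queue(maxsize=steps)
    proc = mp.Process(target=env_worker, args=(go, transitions, steps))
    proc.start()

    go.set()                                     # like EnvironmentRun.run() setting runcondition
    batch = [transitions.get() for _ in range(steps)]  # like join()/_sync_memory() draining the queue
    print(batch)
    proc.join()
```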
diff --git a/rltorch/mp/EnvironmentRun.py b/rltorch/mp/EnvironmentRun.py
index cb091c3..e193cc7 100644
--- a/rltorch/mp/EnvironmentRun.py
+++ b/rltorch/mp/EnvironmentRun.py
@@ -1,38 +1,73 @@
 from copy import deepcopy
 import torch.multiprocessing as mp
+from ctypes import *
+import rltorch.log
 
-class EnvironmentRun(mp.Process):
-    def __init__(self, env, actor, config, logger = None, name = ""):
-        super(EnvironmentRun, self).__init__()
-        self.env = env
-        self.actor = actor
-        self.config = deepcopy(config)
-        self.logger = logger
-        self.name = name
-        self.episode_num = 1
-        self.episode_reward = 0
-        self.last_state = env.reset()
-
-    def run(self, iterations = 1, printstat = False, memory = None):
-        state = self.last_state
+def envrun(actor, env, episode_num, config, runcondition, iterations = 1, memoryqueue = None, logqueue = None, name = ""):
+    state = env.reset()
+    episode_reward = 0
+    # Wait for signal to start running through the environment
+    while runcondition.wait():
+        # Start a logger to log the rewards
+        logger = rltorch.log.Logger()
         for _ in range(iterations):
-            action = self.actor.act(state)
-            next_state, reward, done, _ = self.env.step(action)
-
-            self.episode_reward = self.episode_reward + reward
-            if memory is not None:
-                memory.put((state, action, reward, next_state, done))
+            action = actor.act(state)
+            next_state, reward, done, _ = env.step(action)
+
+            episode_reward += reward
+            if memoryqueue is not None:
+                memoryqueue.put((state, action, reward, next_state, done))
+
             state = next_state
             if done:
-                if printstat:
+                with episode_num.get_lock():
+                    if episode_num.value % config['print_stat_n_eps'] == 0:
                     print("episode: {}/{}, score: {}"
-                        .format(self.episode_num, self.config['total_training_episodes'], self.episode_reward))
-                if self.logger is not None:
-                    self.logger.append(self.name + '/EpisodeReward', self.episode_reward)
-                self.episode_num = self.episode_num + 1
-                self.episode_reward = 0
-                state = self.env.reset()
+                            .format(episode_num.value, config['total_training_episodes'], episode_reward))
+
+                if logger is not None:
+                    logger.append(name + '/EpisodeReward', episode_reward)
+                episode_reward = 0
+                state = env.reset()
+                with episode_num.get_lock():
+                    episode_num.value += 1
+
+        logqueue.put(logger)
 
-        self.last_state = state
+class EnvironmentRun():
+    def __init__(self, env, actor, config, memory = None, logwriter = None, name = ""):
+        self.config = deepcopy(config)
+        self.logwriter = logwriter
+        self.memory = memory
+        self.episode_num = mp.Value(c_uint)
+        self.runcondition = mp.Event()
+        self.memory_queue = mp.Queue(maxsize = config['replay_skip'] + 1)
+        self.logqueue = mp.Queue(maxsize = 1)
+        with self.episode_num.get_lock():
+            self.episode_num.value = 1
+        self.runner = mp.Process(target=envrun,
+            args=(actor, env, self.episode_num, config, self.runcondition),
+            kwargs = {'iterations': config['replay_skip'] + 1,
+                'memoryqueue' : self.memory_queue, 'logqueue' : self.logqueue, 'name' : name})
+        self.runner.start()
+
+    def run(self):
+        self.runcondition.set()
+
+    def join(self):
+        self._sync_memory()
+        if self.logwriter is not None:
+            self.logwriter.write(self._get_reward_logger())
+
+    def _sync_memory(self):
+        if self.memory is not None:
+            for i in range(self.config['replay_skip'] + 1):
+                self.memory.append(*self.memory_queue.get())
+
+    def _get_reward_logger(self):
+        return self.logqueue.get()
+
+    def terminate(self):
+        self.runner.terminate()
diff --git a/rltorch/network/Network.py b/rltorch/network/Network.py
index ea523e1..eeafae9 100644
--- a/rltorch/network/Network.py
+++ b/rltorch/network/Network.py
@@ -16,7 +16,7 @@ class Network:
 
     def __call__(self, *args):
        return self.model(*args)
-
+
    def clamp_gradients(self, x = 1):
        assert x > 0
        for param in self.model.parameters():
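Tying the log.py and EnvironmentRun.py changes together: the worker fills a plain `Logger` and ships it over `logqueue`; only the main process touches the TensorBoard `SummaryWriter`, through `LogWriter.write(logger)`, which now also clears the logger for reuse. A self-contained sketch of that flow, using simplified re-implementations and a print-based stand-in for `SummaryWriter`:

```python
from collections import Counter

class MiniLogger:
    """Simplified stand-in for rltorch.log.Logger: a dict of tag -> list of values."""
    def __init__(self):
        self.log = {}
    def append(self, tag, value):
        self.log.setdefault(tag, []).append(value)
    def keys(self):
        return self.log.keys()
    def __getitem__(self, tag):
        return self.log[tag]
    def clear(self):
        self.log.clear()

class MiniLogWriter:
    """Simplified stand-in for the patched LogWriter: it no longer owns a Logger;
    each write() call consumes whichever Logger it is handed, then clears it."""
    def __init__(self, writer):
        self.writer = writer
        self.steps = Counter()
    def write(self, logger):
        for key in logger.keys():
            for value in logger[key]:
                self.steps[key] += 1
                self.writer.add_scalar(key, value, self.steps[key])
        logger.clear()  # leave the logger empty for the next round of episodes

class PrintWriter:
    """Stand-in for SummaryWriter so the sketch has no TensorBoard dependency."""
    def add_scalar(self, tag, value, step):
        print(f"{tag} step={step}: {value}")

logger = MiniLogger()
logger.append("Training/EpisodeReward", -500.0)
logger.append("Training/EpisodeReward", -432.0)
MiniLogWriter(PrintWriter()).write(logger)   # flushed by the main process only
```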