From c6172f309d7290ff15c849717581b102df3a11df Mon Sep 17 00:00:00 2001
From: Brandon Rozek
Date: Tue, 14 Apr 2020 15:24:48 -0400
Subject: [PATCH] Made Logger global

---
 Readme.md                              | 7 +-
 examples/acrobot_a2c.py                | 89 ++++++-------
 examples/acrobot_es.py                 | 145 ++++++++++------
 examples/acrobot_ppo.py                | 128 +++++++++----------
 examples/acrobot_qep.py                | 157 +++++++++++------------
 examples/acrobot_reinforce.py          | 116 ++++++++---------
 examples/acrobot_single_process_dqn.py | 114 ++++++++---------
 examples/pong_mp_dqn.py                | 165 ++++++++++++++-----------
 rltorch/agents/A2CSingleAgent.py       | 10 +-
 rltorch/agents/DQNAgent.py             | 10 +-
 rltorch/agents/DQfDAgent.py            | 11 +-
 rltorch/agents/PPOAgent.py             | 10 +-
 rltorch/agents/QEPAgent.py             | 10 +-
 rltorch/agents/REINFORCEAgent.py       | 7 +-
 rltorch/env/simulate.py                | 19 ++-
 rltorch/log.py                         | 1 +
 rltorch/memory/__init__.py             | 1 -
 rltorch/mp/EnvironmentEpisode.py       | 8 +-
 rltorch/network/ESNetwork.py           | 12 +-
 rltorch/network/ESNetworkMP.py         | 9 +-
 rltorch/network/Network.py             | 11 +-
 21 files changed, 513 insertions(+), 527 deletions(-)

diff --git a/Readme.md b/Readme.md
index da5dc4b..49bf166 100644
--- a/Readme.md
+++ b/Readme.md
@@ -14,13 +14,8 @@ This is a dictionary that is shared around the different components. Contains hy
 ### Environment
 This component needs to support the standard openai functions reset and step.
 
-### Logger
-For Tensorboard to work, you need to define a logger that will (optionally) later go into the network, runner, and agent/trainer.
-
-Due to issues with multiprocessing, the Logger is a shared dictionary of lists that get appended to and the LogWriter writes on the main thread.
-
 ### Network
-A network takes a PyTorch nn.Module, PyTorch optimizer, configuration, and the optional logger.
+A network takes a PyTorch nn.Module, PyTorch optimizer, and configuration.
 
 ### Target Network
 Takes in a network and provides methods to sync a copy of the original network.
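The Readme change above reflects the point of this patch: instead of passing an optional logger object into every network, runner, and agent, logging state now lives in the rltorch.log module itself — a module-level Logger (a defaultdict of lists) guarded by an enabled flag, with LogWriter still flushing it on the main thread (see the rltorch/log.py and agent hunks further down). The snippet below is a minimal, dependency-free sketch of that pattern; MiniLogWriter and train_step are illustrative stand-ins rather than rltorch APIs, and only the enabled/Logger shape is taken from this diff.

# Minimal, self-contained sketch of the "global Logger" pattern introduced by this patch.
# Only the module-level `enabled` flag and `Logger` defaultdict mirror rltorch/log.py below;
# MiniLogWriter is a hypothetical stand-in for rltorch.log.LogWriter, which in the real
# package wraps a tensorboardX SummaryWriter.
from collections import defaultdict
from typing import Any, Dict, List

enabled: bool = False
Logger: Dict[Any, List[Any]] = defaultdict(list)

class MiniLogWriter:
    """Drains the shared Logger dictionary on the main thread."""
    def write(self, logger: Dict[Any, List[Any]]) -> None:
        for tag, values in logger.items():
            for step, value in enumerate(values):
                print(f"{tag}[{step}] = {value}")
        logger.clear()  # stand-in behaviour; the real LogWriter forwards entries to TensorBoard

def train_step(loss_value: float) -> None:
    # Components no longer receive a logger argument; they append to the
    # module-level Logger only when logging is switched on.
    if enabled:
        Logger["Loss"].append(loss_value)

if __name__ == "__main__":
    enabled = True
    train_step(0.75)
    train_step(0.42)
    MiniLogWriter().write(Logger)
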
diff --git a/examples/acrobot_a2c.py b/examples/acrobot_a2c.py index b59390c..0ec24cd 100644 --- a/examples/acrobot_a2c.py +++ b/examples/acrobot_a2c.py @@ -8,6 +8,7 @@ import rltorch.memory as M import rltorch.env as E from rltorch.action_selector import StochasticSelector from tensorboardX import SummaryWriter +from rltorch.log import Logger # ## Networks @@ -68,65 +69,55 @@ config['disable_cuda'] = False # ## Training Loop # -def train(runner, agent, config, logger = None, logwriter = None): +def train(runner, agent, config, logwriter=None): finished = False while not finished: runner.run() agent.learn() if logwriter is not None: - agent.value_net.log_named_parameters() - agent.policy_net.log_named_parameters() - logwriter.write(logger) + agent.value_net.log_named_parameters() + agent.policy_net.log_named_parameters() + logwriter.write(Logger) finished = runner.episode_num > config['total_training_episodes'] if __name__ == "__main__": - # Setting up the environment - rltorch.set_seed(config['seed']) - print("Setting up environment...", end = " ") - env = E.TorchWrap(gym.make(config['environment_name'])) - env.seed(config['seed']) - print("Done.") - - state_size = env.observation_space.shape[0] - action_size = env.action_space.n + # Setting up the environment + rltorch.set_seed(config['seed']) + print("Setting up environment...", end=" ") + env = E.TorchWrap(gym.make(config['environment_name'])) + env.seed(config['seed']) + print("Done.") - # Logging - logger = rltorch.log.Logger() - logwriter = rltorch.log.LogWriter(SummaryWriter()) - - # Setting up the networks - device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") - policy_net = rn.Network(Policy(state_size, action_size), - torch.optim.Adam, config, device = device, name = "Policy") - value_net = rn.Network(Value(state_size), - torch.optim.Adam, config, device = device, name = "DQN") - - - # Memory stores experiences for later training - memory = M.EpisodeMemory() - - # Actor takes a net and uses it to produce actions from given states - actor = StochasticSelector(policy_net, action_size, memory, device = device) - - # Agent is what performs the training - agent = rltorch.agents.A2CSingleAgent(policy_net, value_net, memory, config, logger = logger) - - # Runner performs one episode in the environment - runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter) + state_size = env.observation_space.shape[0] + action_size = env.action_space.n + # Logging + logwriter = rltorch.log.LogWriter(SummaryWriter()) + # Setting up the networks + device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") + policy_net = rn.Network(Policy(state_size, action_size), + torch.optim.Adam, config, device=device, name="Policy") + value_net = rn.Network(Value(state_size), + torch.optim.Adam, config, device=device, name="DQN") + # Memory stores experiences for later training + memory = M.EpisodeMemory() + # Actor takes a net and uses it to produce actions from given states + actor = StochasticSelector(policy_net, action_size, memory, device = device) + # Agent is what performs the training + agent = rltorch.agents.A2CSingleAgent(policy_net, value_net, memory, config) + # Runner performs one episode in the environment + runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name="Training", memory=memory, logwriter=logwriter) - print("Training...") - train(runner, agent, config, logger = logger, 
logwriter = logwriter) + print("Training...") + train(runner, agent, config, logwriter=logwriter) - # For profiling... - # import cProfile - # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )') - # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution... + # For profiling... + # import cProfile + # cProfile.run('train(runner, agent, config, logwriter = logwriter )') + # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution... - print("Training Finished.") - - print("Evaluating...") - rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation") - print("Evaulations Done.") - - logwriter.close() # We don't need to write anything out to disk anymore + print("Training Finished.") + print("Evaluating...") + rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], name="Evaluation") + print("Evaulations Done.") + logwriter.close() # We don't need to write anything out to disk anymore diff --git a/examples/acrobot_es.py b/examples/acrobot_es.py index c8be856..7c6d748 100644 --- a/examples/acrobot_es.py +++ b/examples/acrobot_es.py @@ -9,29 +9,28 @@ import rltorch.memory as M import rltorch.env as E from rltorch.action_selector import StochasticSelector from tensorboardX import SummaryWriter +from rltorch.log import Logger # ## Networks # class Policy(nn.Module): - def __init__(self, state_size, action_size): - super(Policy, self).__init__() - self.state_size = state_size - self.action_size = action_size + def __init__(self, state_size, action_size): + super(Policy, self).__init__() + self.state_size = state_size + self.action_size = action_size + self.fc1 = nn.Linear(state_size, 125) + self.fc_norm = nn.LayerNorm(125) - self.fc1 = nn.Linear(state_size, 125) - self.fc_norm = nn.LayerNorm(125) - - self.fc2 = nn.Linear(125, 125) - self.fc2_norm = nn.LayerNorm(125) + self.fc2 = nn.Linear(125, 125) + self.fc2_norm = nn.LayerNorm(125) + self.action_prob = nn.Linear(125, action_size) - self.action_prob = nn.Linear(125, action_size) - - def forward(self, x): - x = F.relu(self.fc_norm(self.fc1(x))) - x = F.relu(self.fc2_norm(self.fc2(x))) - x = F.softmax(self.action_prob(x), dim = 1) - return x + def forward(self, x): + x = F.relu(self.fc_norm(self.fc1(x))) + x = F.relu(self.fc2_norm(self.fc2(x))) + x = F.softmax(self.action_prob(x), dim = 1) + return x # ## Configuration @@ -50,75 +49,67 @@ config['disable_cuda'] = False # ## Training Loop # -def train(runner, net, config, logger = None, logwriter = None): - finished = False - while not finished: - runner.run() - net.calc_gradients() - net.step() - if logwriter is not None: - net.log_named_parameters() - logwriter.write(logger) - finished = runner.episode_num > config['total_training_episodes'] +def train(runner, net, config, logwriter=None): + finished = False + while not finished: + runner.run() + net.calc_gradients() + net.step() + if logwriter is not None: + net.log_named_parameters() + logwriter.write(Logger) + finished = runner.episode_num > config['total_training_episodes'] # ## Loss function # def fitness(model): - env = gym.make("Acrobot-v1") - state = torch.from_numpy(env.reset()).float().unsqueeze(0) - total_reward = 0 - done = False - while not done: - action_probabilities = model(state) - distribution = Categorical(action_probabilities) - action = distribution.sample().item() - next_state, reward, done, _ = 
env.step(action) - total_reward += reward - state = torch.from_numpy(next_state).float().unsqueeze(0) - return -total_reward + env = gym.make("Acrobot-v1") + state = torch.from_numpy(env.reset()).float().unsqueeze(0) + total_reward = 0 + done = False + while not done: + action_probabilities = model(state) + distribution = Categorical(action_probabilities) + action = distribution.sample().item() + next_state, reward, done, _ = env.step(action) + total_reward += reward + state = torch.from_numpy(next_state).float().unsqueeze(0) + return -total_reward if __name__ == "__main__": - # Hide internal gym warnings - gym.logger.set_level(40) + # Hide internal gym warnings + gym.logger.set_level(40) - # Setting up the environment - rltorch.set_seed(config['seed']) - print("Setting up environment...", end = " ") - env = E.TorchWrap(gym.make(config['environment_name'])) - env.seed(config['seed']) - print("Done.") - - state_size = env.observation_space.shape[0] - action_size = env.action_space.n + # Setting up the environment + rltorch.set_seed(config['seed']) + print("Setting up environment...", end=" ") + env = E.TorchWrap(gym.make(config['environment_name'])) + env.seed(config['seed']) + print("Done.") - # Logging - logger = rltorch.log.Logger() - logwriter = rltorch.log.LogWriter(SummaryWriter()) + state_size = env.observation_space.shape[0] + action_size = env.action_space.n - # Setting up the networks - device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") - net = rn.ESNetwork(Policy(state_size, action_size), - torch.optim.Adam, 100, fitness, config, device = device, name = "ES", logger = logger) + # Logging + logwriter = rltorch.log.LogWriter(SummaryWriter()) + # Setting up the networks + device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") + net = rn.ESNetwork(Policy(state_size, action_size), + torch.optim.Adam, 100, fitness, config, device=device, name="ES") + # Actor takes a net and uses it to produce actions from given states + actor = StochasticSelector(net, action_size, device=device) + # Runner performs an episode of the environment + runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name="Training", logwriter=logwriter) + print("Training...") + train(runner, net, config, logwriter=logwriter) + # For profiling... + # import cProfile + # cProfile.run('train(runner, agent, config, logwriter = logwriter )') + # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution... + print("Training Finished.") + print("Evaluating...") + rltorch.env.simulateEnvEps(env, actor, config, total_episodes=config['total_evaluation_episodes'], name="Evaluation") + print("Evaulations Done.") - # Actor takes a net and uses it to produce actions from given states - actor = StochasticSelector(net, action_size, device = device) - - # Runner performs an episode of the environment - runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name = "Training", logwriter = logwriter) - - print("Training...") - train(runner, net, config, logger = logger, logwriter = logwriter) - - # For profiling... - # import cProfile - # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )') - # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution... 
- - print("Training Finished.") - - print("Evaluating...") - rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation") - print("Evaulations Done.") - - logwriter.close() # We don't need to write anything out to disk anymore + logwriter.close() # We don't need to write anything out to disk anymore diff --git a/examples/acrobot_ppo.py b/examples/acrobot_ppo.py index d0f3bce..9d6921c 100644 --- a/examples/acrobot_ppo.py +++ b/examples/acrobot_ppo.py @@ -8,48 +8,49 @@ import rltorch.memory as M import rltorch.env as E from rltorch.action_selector import StochasticSelector from tensorboardX import SummaryWriter +from rltorch.log import Logger # ## Networks # class Value(nn.Module): - def __init__(self, state_size): - super(Value, self).__init__() - self.state_size = state_size + def __init__(self, state_size): + super(Value, self).__init__() + self.state_size = state_size - self.fc1 = rn.NoisyLinear(state_size, 64) - self.fc_norm = nn.LayerNorm(64) + self.fc1 = rn.NoisyLinear(state_size, 64) + self.fc_norm = nn.LayerNorm(64) - self.fc2 = rn.NoisyLinear(64, 64) - self.fc2_norm = nn.LayerNorm(64) + self.fc2 = rn.NoisyLinear(64, 64) + self.fc2_norm = nn.LayerNorm(64) - self.fc3 = rn.NoisyLinear(64, 1) + self.fc3 = rn.NoisyLinear(64, 1) - def forward(self, x): - x = F.relu(self.fc_norm(self.fc1(x))) - x = F.relu(self.fc2_norm(self.fc2(x))) - x = self.fc3(x) - return x + def forward(self, x): + x = F.relu(self.fc_norm(self.fc1(x))) + x = F.relu(self.fc2_norm(self.fc2(x))) + x = self.fc3(x) + return x class Policy(nn.Module): - def __init__(self, state_size, action_size): - super(Policy, self).__init__() - self.state_size = state_size - self.action_size = action_size + def __init__(self, state_size, action_size): + super(Policy, self).__init__() + self.state_size = state_size + self.action_size = action_size - self.fc1 = rn.NoisyLinear(state_size, 64) - self.fc_norm = nn.LayerNorm(64) + self.fc1 = rn.NoisyLinear(state_size, 64) + self.fc_norm = nn.LayerNorm(64) - self.fc2 = rn.NoisyLinear(64, 64) - self.fc2_norm = nn.LayerNorm(64) + self.fc2 = rn.NoisyLinear(64, 64) + self.fc2_norm = nn.LayerNorm(64) - self.fc3 = rn.NoisyLinear(64, action_size) + self.fc3 = rn.NoisyLinear(64, action_size) - def forward(self, x): - x = F.relu(self.fc_norm(self.fc1(x))) - x = F.relu(self.fc2_norm(self.fc2(x))) - x = F.softmax(self.fc3(x), dim = 1) - return x + def forward(self, x): + x = F.relu(self.fc_norm(self.fc1(x))) + x = F.relu(self.fc2_norm(self.fc2(x))) + x = F.softmax(self.fc3(x), dim = 1) + return x # ## Configuration @@ -68,64 +69,63 @@ config['disable_cuda'] = False # ## Training Loop # -def train(runner, agent, config, logger = None, logwriter = None): +def train(runner, agent, config, logwriter = None): finished = False while not finished: runner.run() agent.learn() if logwriter is not None: - agent.value_net.log_named_parameters() - agent.policy_net.log_named_parameters() - logwriter.write(logger) + agent.value_net.log_named_parameters() + agent.policy_net.log_named_parameters() + logwriter.write(Logger) finished = runner.episode_num > config['total_training_episodes'] if __name__ == "__main__": - # Setting up the environment - rltorch.set_seed(config['seed']) - print("Setting up environment...", end = " ") - env = E.TorchWrap(gym.make(config['environment_name'])) - env.seed(config['seed']) - print("Done.") + # Setting up the environment + rltorch.set_seed(config['seed']) + print("Setting up environment...", end=" ") + env = 
E.TorchWrap(gym.make(config['environment_name'])) + env.seed(config['seed']) + print("Done.") - state_size = env.observation_space.shape[0] - action_size = env.action_space.n + state_size = env.observation_space.shape[0] + action_size = env.action_space.n - # Logging - logger = rltorch.log.Logger() - logwriter = rltorch.log.LogWriter(SummaryWriter()) + # Logging + logwriter = rltorch.log.LogWriter(SummaryWriter()) - # Setting up the networks - device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") - policy_net = rn.Network(Policy(state_size, action_size), - torch.optim.Adam, config, device = device, name = "Policy") - value_net = rn.Network(Value(state_size), - torch.optim.Adam, config, device = device, name = "DQN") + # Setting up the networks + device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") + policy_net = rn.Network(Policy(state_size, action_size), + torch.optim.Adam, config, device=device, name="Policy") + value_net = rn.Network(Value(state_size), + torch.optim.Adam, config, device=device, name="DQN") - # Memory stores experiences for later training - memory = M.EpisodeMemory() + # Memory stores experiences for later training + memory = M.EpisodeMemory() - # Actor takes a net and uses it to produce actions from given states - actor = StochasticSelector(policy_net, action_size, memory, device = device) + # Actor takes a net and uses it to produce actions from given states + actor = StochasticSelector(policy_net, action_size, memory, device=device) - # Agent is what performs the training - agent = rltorch.agents.PPOAgent(policy_net, value_net, memory, config, logger = logger) + # Agent is what performs the training + agent = rltorch.agents.PPOAgent(policy_net, value_net, memory, config) - # Runner performs a certain number of steps in the environment - runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter) + # Runner performs a certain number of steps in the environment + runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name="Training", memory=memory, logwriter=logwriter) - print("Training...") - train(runner, agent, config, logger = logger, logwriter = logwriter) + print("Training...") + train(runner, agent, config, logwriter=logwriter) # For profiling... # import cProfile - # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )') + # cProfile.run('train(runner, agent, config, logwriter = logwriter )') # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution... 
- print("Training Finished.") + print("Training Finished.") - print("Evaluating...") - rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation") - print("Evaulations Done.") + print("Evaluating...") + rltorch.env.simulateEnvEps(env, actor, config, total_episodes=config['total_evaluation_episodes'], name="Evaluation") + print("Evaulations Done.") - logwriter.close() # We don't need to write anything out to disk anymore + logwriter.close() # We don't need to write anything out to disk anymore diff --git a/examples/acrobot_qep.py b/examples/acrobot_qep.py index 9643f11..169103b 100644 --- a/examples/acrobot_qep.py +++ b/examples/acrobot_qep.py @@ -7,61 +7,62 @@ import rltorch.network as rn import rltorch.memory as M import rltorch.env as E from rltorch.action_selector import StochasticSelector -from tensorboardX import SummaryWriter +# from tensorboardX import SummaryWriter from copy import deepcopy +from rltorch.log import Logger # ## Networks # class Value(nn.Module): - def __init__(self, state_size, action_size): - super(Value, self).__init__() - self.state_size = state_size - self.action_size = action_size + def __init__(self, state_size, action_size): + super(Value, self).__init__() + self.state_size = state_size + self.action_size = action_size - self.fc1 = rn.NoisyLinear(state_size, 255) - self.fc_norm = nn.LayerNorm(255) + self.fc1 = rn.NoisyLinear(state_size, 255) + self.fc_norm = nn.LayerNorm(255) - self.value_fc = rn.NoisyLinear(255, 255) - self.value_fc_norm = nn.LayerNorm(255) - self.value = rn.NoisyLinear(255, 1) - - self.advantage_fc = rn.NoisyLinear(255, 255) - self.advantage_fc_norm = nn.LayerNorm(255) - self.advantage = rn.NoisyLinear(255, action_size) + self.value_fc = rn.NoisyLinear(255, 255) + self.value_fc_norm = nn.LayerNorm(255) + self.value = rn.NoisyLinear(255, 1) - def forward(self, x): - x = F.relu(self.fc_norm(self.fc1(x))) + self.advantage_fc = rn.NoisyLinear(255, 255) + self.advantage_fc_norm = nn.LayerNorm(255) + self.advantage = rn.NoisyLinear(255, action_size) + + def forward(self, x): + x = F.relu(self.fc_norm(self.fc1(x))) - state_value = F.relu(self.value_fc_norm(self.value_fc(x))) - state_value = self.value(state_value) + state_value = F.relu(self.value_fc_norm(self.value_fc(x))) + state_value = self.value(state_value) - advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x))) - advantage = self.advantage(advantage) + advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x))) + advantage = self.advantage(advantage) - x = state_value + advantage - advantage.mean() - return x + x = state_value + advantage - advantage.mean() + return x class Policy(nn.Module): - def __init__(self, state_size, action_size): - super(Policy, self).__init__() - self.state_size = state_size - self.action_size = action_size + def __init__(self, state_size, action_size): + super(Policy, self).__init__() + self.state_size = state_size + self.action_size = action_size - self.fc1 = nn.Linear(state_size, 125) - self.fc_norm = nn.LayerNorm(125) + self.fc1 = nn.Linear(state_size, 125) + self.fc_norm = nn.LayerNorm(125) - self.fc2 = nn.Linear(125, 125) - self.fc2_norm = nn.LayerNorm(125) + self.fc2 = nn.Linear(125, 125) + self.fc2_norm = nn.LayerNorm(125) - self.action_prob = nn.Linear(125, action_size) + self.action_prob = nn.Linear(125, action_size) - def forward(self, x): - x = F.relu(self.fc_norm(self.fc1(x))) - x = F.relu(self.fc2_norm(self.fc2(x))) - x = F.softmax(self.action_prob(x), dim 
= 1) - return x + def forward(self, x): + x = F.relu(self.fc_norm(self.fc1(x))) + x = F.relu(self.fc2_norm(self.fc2(x))) + x = F.softmax(self.action_prob(x), dim = 1) + return x # ## Configuration @@ -94,70 +95,70 @@ config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialSc # ## Training Loop # -def train(runner, agent, config, logger = None, logwriter = None): +def train(runner, agent, config, logwriter=None): finished = False last_episode_num = 1 while not finished: runner.run(config['replay_skip'] + 1) agent.learn() if logwriter is not None: - if last_episode_num < runner.episode_num: - last_episode_num = runner.episode_num - agent.value_net.log_named_parameters() - agent.policy_net.log_named_parameters() - logwriter.write(logger) + if last_episode_num < runner.episode_num: + last_episode_num = runner.episode_num + agent.value_net.log_named_parameters() + agent.policy_net.log_named_parameters() + logwriter.write(Logger) finished = runner.episode_num > config['total_training_episodes'] if __name__ == "__main__": - # Setting up the environment - rltorch.set_seed(config['seed']) - print("Setting up environment...", end = " ") - env = E.TorchWrap(gym.make(config['environment_name'])) - env.seed(config['seed']) - print("Done.") + # Setting up the environment + rltorch.set_seed(config['seed']) + print("Setting up environment...", end = " ") + env = E.TorchWrap(gym.make(config['environment_name'])) + env.seed(config['seed']) + print("Done.") - state_size = env.observation_space.shape[0] - action_size = env.action_space.n + state_size = env.observation_space.shape[0] + action_size = env.action_space.n - # Logging - logger = rltorch.log.Logger() - logwriter = rltorch.log.LogWriter(SummaryWriter()) + # Logging + logwriter = None + # logwriter = rltorch.log.LogWriter(SummaryWriter()) - # Setting up the networks - device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") - config2 = deepcopy(config) - config2['learning_rate'] = 0.01 - policy_net = rn.ESNetwork(Policy(state_size, action_size), - torch.optim.Adam, 500, None, config2, sigma = 0.1, device = device, name = "ES", logger = logger) - value_net = rn.Network(Value(state_size, action_size), - torch.optim.Adam, config, device = device, name = "DQN", logger = logger) - target_net = rn.TargetNetwork(value_net, device = device) + # Setting up the networks + device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") + config2 = deepcopy(config) + config2['learning_rate'] = 0.01 + policy_net = rn.ESNetwork(Policy(state_size, action_size), + torch.optim.Adam, 500, None, config2, sigma=0.1, device=device, name="ES") + value_net = rn.Network(Value(state_size, action_size), + torch.optim.Adam, config, device=device, name="DQN") + target_net = rn.TargetNetwork(value_net, device=device) - # Actor takes a net and uses it to produce actions from given states - actor = StochasticSelector(policy_net, action_size, device = device) + # Actor takes a net and uses it to produce actions from given states + actor = StochasticSelector(policy_net, action_size, device=device) - # Memory stores experiences for later training - memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority']) + # Memory stores experiences for later training + memory = M.PrioritizedReplayMemory(capacity=config['memory_size'], alpha=config['prioritized_replay_sampling_priority']) - # Runner performs a certain number of 
steps in the environment - runner = rltorch.env.EnvironmentRunSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter) + # Runner performs a certain number of steps in the environment + runner = rltorch.env.EnvironmentRunSync(env, actor, config, name="Training", memory=memory, logwriter=logwriter) - # Agent is what performs the training - agent = rltorch.agents.QEPAgent(policy_net, value_net, memory, config, target_value_net = target_net, logger = logger) + # Agent is what performs the training + agent = rltorch.agents.QEPAgent(policy_net, value_net, memory, config, target_value_net=target_net) - print("Training...") - train(runner, agent, config, logger = logger, logwriter = logwriter) + print("Training...") + train(runner, agent, config, logwriter=logwriter) # For profiling... # import cProfile - # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )') + # cProfile.run('train(runner, agent, config, logwriter = logwriter )') # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution... - print("Training Finished.") + print("Training Finished.") - print("Evaluating...") - rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation") - print("Evaulations Done.") + print("Evaluating...") + rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], name="Evaluation") + print("Evaulations Done.") - logwriter.close() # We don't need to write anything out to disk anymore + # logwriter.close() # We don't need to write anything out to disk anymore diff --git a/examples/acrobot_reinforce.py b/examples/acrobot_reinforce.py index d0cd397..a7cd995 100644 --- a/examples/acrobot_reinforce.py +++ b/examples/acrobot_reinforce.py @@ -7,30 +7,30 @@ import rltorch.network as rn import rltorch.memory as M import rltorch.env as E from rltorch.action_selector import StochasticSelector -from tensorboardX import SummaryWriter +from rltorch.log import Logger # ## Networks # class Policy(nn.Module): - def __init__(self, state_size, action_size): - super(Policy, self).__init__() - self.state_size = state_size - self.action_size = action_size + def __init__(self, state_size, action_size): + super(Policy, self).__init__() + self.state_size = state_size + self.action_size = action_size - self.fc1 = rn.NoisyLinear(state_size, 64) - self.fc_norm = nn.LayerNorm(64) + self.fc1 = rn.NoisyLinear(state_size, 64) + self.fc_norm = nn.LayerNorm(64) - self.fc2 = rn.NoisyLinear(64, 64) - self.fc2_norm = nn.LayerNorm(64) + self.fc2 = rn.NoisyLinear(64, 64) + self.fc2_norm = nn.LayerNorm(64) - self.fc3 = rn.NoisyLinear(64, action_size) + self.fc3 = rn.NoisyLinear(64, action_size) - def forward(self, x): - x = F.relu(self.fc_norm(self.fc1(x))) - x = F.relu(self.fc2_norm(self.fc2(x))) - x = F.softmax(self.fc3(x), dim = 1) - return x + def forward(self, x): + x = F.relu(self.fc_norm(self.fc1(x))) + x = F.relu(self.fc2_norm(self.fc2(x))) + x = F.softmax(self.fc3(x), dim=1) + return x # ## Configuration @@ -49,65 +49,65 @@ config['disable_cuda'] = False # ## Training Loop # -def train(runner, agent, config, logger = None, logwriter = None): - finished = False - while not finished: - runner.run() - agent.learn() - # When the episode number changes, log network paramters - if logwriter is not None: - agent.net.log_named_parameters() - logwriter.write(logger) - finished = runner.episode_num > config['total_training_episodes'] +def 
train(runner, agent, config, logwriter=None): + finished = False + while not finished: + runner.run() + agent.learn() + # When the episode number changes, log network paramters + if logwriter is not None: + agent.net.log_named_parameters() + logwriter.write(Logger) + finished = runner.episode_num > config['total_training_episodes'] if __name__ == "__main__": - torch.multiprocessing.set_sharing_strategy('file_system') # To not hit file descriptor memory limit + torch.multiprocessing.set_sharing_strategy('file_system') # To not hit file descriptor memory limit - # Setting up the environment - rltorch.set_seed(config['seed']) - print("Setting up environment...", end = " ") - env = E.TorchWrap(gym.make(config['environment_name'])) - env.seed(config['seed']) - print("Done.") + # Setting up the environment + rltorch.set_seed(config['seed']) + print("Setting up environment...", end=" ") + env = E.TorchWrap(gym.make(config['environment_name'])) + env.seed(config['seed']) + print("Done.") - state_size = env.observation_space.shape[0] - action_size = env.action_space.n + state_size = env.observation_space.shape[0] + action_size = env.action_space.n - # Logging - logger = rltorch.log.Logger() - logwriter = rltorch.log.LogWriter(SummaryWriter()) + # Logging + logwriter = None + # logwriter = rltorch.log.LogWriter(SummaryWriter()) - # Setting up the networks - device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") - net = rn.Network(Policy(state_size, action_size), - torch.optim.Adam, config, device = device, name = "DQN") - target_net = rn.TargetNetwork(net, device = device) + # Setting up the networks + device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") + net = rn.Network(Policy(state_size, action_size), + torch.optim.Adam, config, device=device, name="DQN") + target_net = rn.TargetNetwork(net, device=device) - # Memory stores experiences for later training - memory = M.EpisodeMemory() + # Memory stores experiences for later training + memory = M.EpisodeMemory() - # Actor takes a net and uses it to produce actions from given states - actor = StochasticSelector(net, action_size, memory, device = device) + # Actor takes a net and uses it to produce actions from given states + actor = StochasticSelector(net, action_size, memory, device=device) - # Agent is what performs the training - agent = rltorch.agents.REINFORCEAgent(net, memory, config, target_net = target_net, logger = logger) + # Agent is what performs the training + agent = rltorch.agents.REINFORCEAgent(net, memory, config, target_net=target_net) - # Runner performs one episode in the environment - runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter) + # Runner performs one episode in the environment + runner = rltorch.env.EnvironmentEpisodeSync(env, actor, config, name="Training", memory=memory, logwriter=logwriter) - print("Training...") - train(runner, agent, config, logger = logger, logwriter = logwriter) + print("Training...") + train(runner, agent, config, logwriter=logwriter) # For profiling... # import cProfile - # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )') + # cProfile.run('train(runner, agent, config, logwriter = logwriter )') # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution... 
- print("Training Finished.") + print("Training Finished.") - print("Evaluating...") - rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation") - print("Evaulations Done.") + print("Evaluating...") + rltorch.env.simulateEnvEps(env, actor, config, total_episodes=config['total_evaluation_episodes'], name="Evaluation") + print("Evaulations Done.") - logwriter.close() # We don't need to write anything out to disk anymore + # logwriter.close() # We don't need to write anything out to disk anymore diff --git a/examples/acrobot_single_process_dqn.py b/examples/acrobot_single_process_dqn.py index 0990cb9..023ff19 100644 --- a/examples/acrobot_single_process_dqn.py +++ b/examples/acrobot_single_process_dqn.py @@ -7,39 +7,39 @@ import rltorch.network as rn import rltorch.memory as M import rltorch.env as E from rltorch.action_selector import ArgMaxSelector -from tensorboardX import SummaryWriter +from rltorch.log import Logger # ## Networks # class Value(nn.Module): - def __init__(self, state_size, action_size): - super(Value, self).__init__() - self.state_size = state_size - self.action_size = action_size + def __init__(self, state_size, action_size): + super(Value, self).__init__() + self.state_size = state_size + self.action_size = action_size - self.fc1 = rn.NoisyLinear(state_size, 255) - self.fc_norm = nn.LayerNorm(255) + self.fc1 = rn.NoisyLinear(state_size, 255) + self.fc_norm = nn.LayerNorm(255) - self.value_fc = rn.NoisyLinear(255, 255) - self.value_fc_norm = nn.LayerNorm(255) - self.value = rn.NoisyLinear(255, 1) + self.value_fc = rn.NoisyLinear(255, 255) + self.value_fc_norm = nn.LayerNorm(255) + self.value = rn.NoisyLinear(255, 1) - self.advantage_fc = rn.NoisyLinear(255, 255) - self.advantage_fc_norm = nn.LayerNorm(255) - self.advantage = rn.NoisyLinear(255, action_size) + self.advantage_fc = rn.NoisyLinear(255, 255) + self.advantage_fc_norm = nn.LayerNorm(255) + self.advantage = rn.NoisyLinear(255, action_size) - def forward(self, x): - x = F.relu(self.fc_norm(self.fc1(x))) + def forward(self, x): + x = F.relu(self.fc_norm(self.fc1(x))) - state_value = F.relu(self.value_fc_norm(self.value_fc(x))) - state_value = self.value(state_value) + state_value = F.relu(self.value_fc_norm(self.value_fc(x))) + state_value = self.value(state_value) - advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x))) - advantage = self.advantage(advantage) + advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x))) + advantage = self.advantage(advantage) - x = state_value + advantage - advantage.mean() - return x + x = state_value + advantage - advantage.mean() + return x # ## Configuration @@ -71,7 +71,7 @@ config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialSc # ## Training Loop # -def train(runner, agent, config, logger = None, logwriter = None): +def train(runner, agent, config, logwriter=None): finished = False last_episode_num = 1 while not finished: @@ -79,56 +79,56 @@ def train(runner, agent, config, logger = None, logwriter = None): agent.learn() if logwriter is not None: if last_episode_num < runner.episode_num: - last_episode_num = runner.episode_num - agent.net.log_named_parameters() - logwriter.write(logger) + last_episode_num = runner.episode_num + agent.net.log_named_parameters() + logwriter.write(Logger) finished = runner.episode_num > config['total_training_episodes'] if __name__ == "__main__": - # Setting up the environment - rltorch.set_seed(config['seed']) - 
print("Setting up environment...", end = " ") - env = E.TorchWrap(gym.make(config['environment_name'])) - env.seed(config['seed']) - print("Done.") + # Setting up the environment + rltorch.set_seed(config['seed']) + print("Setting up environment...", end=" ") + env = E.TorchWrap(gym.make(config['environment_name'])) + env.seed(config['seed']) + print("Done.") - state_size = env.observation_space.shape[0] - action_size = env.action_space.n + state_size = env.observation_space.shape[0] + action_size = env.action_space.n - # Logging - logger = rltorch.log.Logger() - logwriter = rltorch.log.LogWriter(SummaryWriter()) + # Logging + logwriter = None + # logwriter = rltorch.log.LogWriter(SummaryWriter()) - # Setting up the networks - device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") - net = rn.Network(Value(state_size, action_size), - torch.optim.Adam, config, device = device, name = "DQN", logger = logger) - target_net = rn.TargetNetwork(net, device = device) + # Setting up the networks + device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") + net = rn.Network(Value(state_size, action_size), + torch.optim.Adam, config, device=device, name="DQN") + target_net = rn.TargetNetwork(net, device=device) - # Actor takes a net and uses it to produce actions from given states - actor = ArgMaxSelector(net, action_size, device = device) - # Memory stores experiences for later training - memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority']) + # Actor takes a net and uses it to produce actions from given states + actor = ArgMaxSelector(net, action_size, device=device) + # Memory stores experiences for later training + memory = M.PrioritizedReplayMemory(capacity=config['memory_size'], alpha=config['prioritized_replay_sampling_priority']) - # Runner performs a certain number of steps in the environment - runner = rltorch.env.EnvironmentRunSync(env, actor, config, name = "Training", memory = memory, logwriter = logwriter) + # Runner performs a certain number of steps in the environment + runner = rltorch.env.EnvironmentRunSync(env, actor, config, name="Training", memory=memory, logwriter=logwriter) - # Agent is what performs the training - agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger) + # Agent is what performs the training + agent = rltorch.agents.DQNAgent(net, memory, config, target_net=target_net) - print("Training...") - train(runner, agent, config, logger = logger, logwriter = logwriter) + print("Training...") + train(runner, agent, config, logwriter=logwriter) # For profiling... # import cProfile - # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )') + # cProfile.run('train(runner, agent, config, logwriter = logwriter )') # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution... 
- print("Training Finished.") + print("Training Finished.") - print("Evaluating...") - rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation") - print("Evaulations Done.") + print("Evaluating...") + rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], name = "Evaluation") + print("Evaulations Done.") - logwriter.close() # We don't need to write anything out to disk anymore + logwriter.close() # We don't need to write anything out to disk anymore diff --git a/examples/pong_mp_dqn.py b/examples/pong_mp_dqn.py index 30ff21e..4315cbc 100644 --- a/examples/pong_mp_dqn.py +++ b/examples/pong_mp_dqn.py @@ -9,58 +9,59 @@ import rltorch.env as E from rltorch.action_selector import ArgMaxSelector from tensorboardX import SummaryWriter import torch.multiprocessing as mp +from rltorch.log import Logger # ## Networks # class Value(nn.Module): - def __init__(self, state_size, action_size): - super(Value, self).__init__() - self.state_size = state_size - self.action_size = action_size + def __init__(self, state_size, action_size): + super(Value, self).__init__() + self.state_size = state_size + self.action_size = action_size - self.conv1 = nn.Conv2d(4, 32, kernel_size = (8, 8), stride = (4, 4)) - self.conv_norm1 = nn.LayerNorm([32, 19, 19]) - self.conv2 = nn.Conv2d(32, 64, kernel_size = (4, 4), stride = (2, 2)) - self.conv_norm2 = nn.LayerNorm([64, 8, 8]) - self.conv3 = nn.Conv2d(64, 64, kernel_size = (3, 3), stride = (1, 1)) - self.conv_norm3 = nn.LayerNorm([64, 6, 6]) + self.conv1 = nn.Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4)) + self.conv_norm1 = nn.LayerNorm([32, 19, 19]) + self.conv2 = nn.Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2)) + self.conv_norm2 = nn.LayerNorm([64, 8, 8]) + self.conv3 = nn.Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1)) + self.conv_norm3 = nn.LayerNorm([64, 6, 6]) - self.fc1 = rn.NoisyLinear(64 * 6 * 6, 384) - self.fc_norm = nn.LayerNorm(384) + self.fc1 = rn.NoisyLinear(64 * 6 * 6, 384) + self.fc_norm = nn.LayerNorm(384) - self.value_fc = rn.NoisyLinear(384, 384) - self.value_fc_norm = nn.LayerNorm(384) - self.value = rn.NoisyLinear(384, 1) + self.value_fc = rn.NoisyLinear(384, 384) + self.value_fc_norm = nn.LayerNorm(384) + self.value = rn.NoisyLinear(384, 1) - self.advantage_fc = rn.NoisyLinear(384, 384) - self.advantage_fc_norm = nn.LayerNorm(384) - self.advantage = rn.NoisyLinear(384, action_size) + self.advantage_fc = rn.NoisyLinear(384, 384) + self.advantage_fc_norm = nn.LayerNorm(384) + self.advantage = rn.NoisyLinear(384, action_size) - def forward(self, x): - x = F.relu(self.conv_norm1(self.conv1(x))) - x = F.relu(self.conv_norm2(self.conv2(x))) - x = F.relu(self.conv_norm3(self.conv3(x))) + def forward(self, x): + x = F.relu(self.conv_norm1(self.conv1(x))) + x = F.relu(self.conv_norm2(self.conv2(x))) + x = F.relu(self.conv_norm3(self.conv3(x))) - # Makes batch_size dimension again - x = x.view(-1, 64 * 6 * 6) - x = F.relu(self.fc_norm(self.fc1(x))) + # Makes batch_size dimension again + x = x.view(-1, 64 * 6 * 6) + x = F.relu(self.fc_norm(self.fc1(x))) - state_value = F.relu(self.value_fc_norm(self.value_fc(x))) - state_value = self.value(state_value) + state_value = F.relu(self.value_fc_norm(self.value_fc(x))) + state_value = self.value(state_value) - advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x))) - advantage = self.advantage(advantage) + advantage = F.relu(self.advantage_fc_norm(self.advantage_fc(x))) + 
advantage = self.advantage(advantage) - x = state_value + advantage - advantage.mean() + x = state_value + advantage - advantage.mean() - # For debugging purposes... - if torch.isnan(x).any().item(): - print("WARNING NAN IN MODEL DETECTED") - - return x + # For debugging purposes... + if torch.isnan(x).any().item(): + print("WARNING NAN IN MODEL DETECTED") + return x + # ## Configuration # @@ -89,59 +90,73 @@ config['prioritized_replay_sampling_priority'] = 0.6 # Should ideally start from 0 and move your way to 1 to prevent overfitting config['prioritized_replay_weight_importance'] = rltorch.scheduler.ExponentialScheduler(initial_value = 0.4, end_value = 1, iterations = 5000) +# +## Training Loop +# +def train(runner, agent, config, logwriter = None): + finished = False + while not finished: + runner.run() + agent.learn() + if logwriter is not None: + agent.value_net.log_named_parameters() + agent.policy_net.log_named_parameters() + logwriter.write(Logger) + finished = runner.episode_num > config['total_training_episodes'] + + if __name__ == "__main__": - # To not hit file descriptor memory limit - torch.multiprocessing.set_sharing_strategy('file_system') + # To not hit file descriptor memory limit + torch.multiprocessing.set_sharing_strategy('file_system') - # Setting up the environment - rltorch.set_seed(config['seed']) - print("Setting up environment...", end = " ") - env = E.FrameStack(E.TorchWrap( - E.ProcessFrame(E.FireResetEnv(gym.make(config['environment_name'])), - resize_shape = (80, 80), crop_bounds = [34, 194, 15, 145], grayscale = True)) - , 4) - env.seed(config['seed']) - print("Done.") + # Setting up the environment + rltorch.set_seed(config['seed']) + print("Setting up environment...", end = " ") + env = E.FrameStack(E.TorchWrap( + E.ProcessFrame(E.FireResetEnv(gym.make(config['environment_name'])), + resize_shape=(80, 80), crop_bounds=[34, 194, 15, 145], grayscale=True)) + , 4) + env.seed(config['seed']) + print("Done.") - state_size = env.observation_space.shape[0] - action_size = env.action_space.n + state_size = env.observation_space.shape[0] + action_size = env.action_space.n - # Logging - logger = rltorch.log.Logger() - logwriter = rltorch.log.LogWriter(SummaryWriter()) + # Logging + logwriter = rltorch.log.LogWriter(SummaryWriter()) - # Setting up the networks - device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") - net = rn.Network(Value(state_size, action_size), - torch.optim.Adam, config, device = device, name = "DQN") - target_net = rn.TargetNetwork(net, device = device) - net.model.share_memory() - target_net.model.share_memory() + # Setting up the networks + device = torch.device("cuda:0" if torch.cuda.is_available() and not config['disable_cuda'] else "cpu") + net = rn.Network(Value(state_size, action_size), + torch.optim.Adam, config, device=device, name="DQN") + target_net = rn.TargetNetwork(net, device=device) + net.model.share_memory() + target_net.model.share_memory() - # Actor takes a net and uses it to produce actions from given states - actor = ArgMaxSelector(net, action_size, device = device) - # Memory stores experiences for later training - memory = M.PrioritizedReplayMemory(capacity = config['memory_size'], alpha = config['prioritized_replay_sampling_priority']) + # Actor takes a net and uses it to produce actions from given states + actor = ArgMaxSelector(net, action_size, device=device) + # Memory stores experiences for later training + memory = 
M.PrioritizedReplayMemory(capacity=config['memory_size'], alpha=config['prioritized_replay_sampling_priority']) - # Runner performs a certain number of steps in the environment - runner = rltorch.mp.EnvironmentRun(env, actor, config, name = "Training", memory = memory, logwriter = logwriter) + # Runner performs a certain number of steps in the environment + runner = rltorch.mp.EnvironmentRun(env, actor, config, name="Training", memory=memory, logwriter=logwriter) - # Agent is what performs the training - agent = rltorch.agents.DQNAgent(net, memory, config, target_net = target_net, logger = logger) + # Agent is what performs the training + agent = rltorch.agents.DQNAgent(net, memory, config, target_net=target_net) - print("Training...") - train(runner, agent, config, logger = logger, logwriter = logwriter) + print("Training...") + train(runner, agent, config, logwriter=logwriter) # For profiling... # import cProfile - # cProfile.run('train(runner, agent, config, logger = logger, logwriter = logwriter )') + # cProfile.run('train(runner, agent, config, logwriter = logwriter )') # python -m torch.utils.bottleneck /path/to/source/script.py [args] is also a good solution... - print("Training Finished.") - runner.terminate() # We don't need the extra process anymore + print("Training Finished.") + runner.terminate() # We don't need the extra process anymore - print("Evaluating...") - rltorch.env.simulateEnvEps(env, actor, config, total_episodes = config['total_evaluation_episodes'], logger = logger, name = "Evaluation") - print("Evaulations Done.") + print("Evaluating...") + rltorch.env.simulateEnvEps(env, actor, config, total_episodes=config['total_evaluation_episodes'], name="Evaluation") + print("Evaulations Done.") - logwriter.close() # We don't need to write anything out to disk anymore + logwriter.close() # We don't need to write anything out to disk anymore diff --git a/rltorch/agents/A2CSingleAgent.py b/rltorch/agents/A2CSingleAgent.py index 0c2541a..721857d 100644 --- a/rltorch/agents/A2CSingleAgent.py +++ b/rltorch/agents/A2CSingleAgent.py @@ -2,14 +2,14 @@ from copy import deepcopy import numpy as np import torch import torch.nn.functional as F +import rltorch.log as log class A2CSingleAgent: - def __init__(self, policy_net, value_net, memory, config, logger=None): + def __init__(self, policy_net, value_net, memory, config): self.policy_net = policy_net self.value_net = value_net self.memory = memory self.config = deepcopy(config) - self.logger = logger def _discount_rewards(self, rewards): gammas = torch.ones_like(rewards) @@ -79,9 +79,9 @@ class A2CSingleAgent: policy_loss = (-log_prob_batch * advantages).sum() - if self.logger is not None: - self.logger.append("Loss/Policy", policy_loss.item()) - self.logger.append("Loss/Value", value_loss.item()) + if log.enabled: + log.Logger["Loss/Policy"].append(policy_loss.item()) + log.Logger["Loss/Value"].append(value_loss.item()) self.policy_net.zero_grad() diff --git a/rltorch/agents/DQNAgent.py b/rltorch/agents/DQNAgent.py index 71f9015..d14162f 100644 --- a/rltorch/agents/DQNAgent.py +++ b/rltorch/agents/DQNAgent.py @@ -3,14 +3,14 @@ from copy import deepcopy import rltorch.memory as M import torch import torch.nn.functional as F +import rltorch.log as log class DQNAgent: - def __init__(self, net, memory, config, target_net=None, logger=None): + def __init__(self, net, memory, config, target_net=None): self.net = net self.target_net = target_net self.memory = memory self.config = deepcopy(config) - self.logger = logger def save(self, 
file_location): torch.save(self.net.model.state_dict(), file_location) def load(self, file_location): @@ -18,7 +18,7 @@ class DQNAgent: self.net.model.to(self.net.device) self.target_net.sync() - def learn(self, logger=None): + def learn(self): if len(self.memory) < self.config['batch_size']: return @@ -68,8 +68,8 @@ class DQNAgent: # loss = F.smooth_l1_loss(obtained_values, expected_values) loss = F.mse_loss(obtained_values, expected_values) - if self.logger is not None: - self.logger.append("Loss", loss.item()) + if log.enabled: + log.Logger["Loss"].append(loss.item()) self.net.zero_grad() loss.backward() diff --git a/rltorch/agents/DQfDAgent.py b/rltorch/agents/DQfDAgent.py index 076e821..f362e9a 100644 --- a/rltorch/agents/DQfDAgent.py +++ b/rltorch/agents/DQfDAgent.py @@ -3,15 +3,14 @@ from copy import deepcopy import rltorch.memory as M import torch import torch.nn.functional as F - +import rltorch.log as log class DQfDAgent: - def __init__(self, net, memory, config, target_net=None, logger=None): + def __init__(self, net, memory, config, target_net=None): self.net = net self.target_net = target_net self.memory = memory self.config = deepcopy(config) - self.logger = logger def save(self, file_location): torch.save(self.net.model.state_dict(), file_location) def load(self, file_location): @@ -19,7 +18,7 @@ class DQfDAgent: self.net.model.to(self.net.device) self.target_net.sync() - def learn(self, logger=None): + def learn(self): if len(self.memory) < self.config['batch_size']: return @@ -149,8 +148,8 @@ class DQfDAgent: demo_loss = 0 loss = td_importance * dqn_loss + td_importance * dqn_n_step_loss + demo_importance * demo_loss - if self.logger is not None: - self.logger.append("Loss", loss.item()) + if log.enabled: + log.Logger["Loss"].append(loss.item()) self.net.zero_grad() loss.backward() diff --git a/rltorch/agents/PPOAgent.py b/rltorch/agents/PPOAgent.py index 97a77e8..d89aaf3 100644 --- a/rltorch/agents/PPOAgent.py +++ b/rltorch/agents/PPOAgent.py @@ -3,15 +3,15 @@ import torch import torch.nn.functional as F from torch.distributions import Categorical import rltorch +import rltorch.log as log class PPOAgent: - def __init__(self, policy_net, value_net, memory, config, logger=None): + def __init__(self, policy_net, value_net, memory, config): self.policy_net = policy_net self.old_policy_net = rltorch.network.TargetNetwork(policy_net) self.value_net = value_net self.memory = memory self.config = deepcopy(config) - self.logger = logger def _discount_rewards(self, rewards): gammas = torch.ones_like(rewards) @@ -59,9 +59,9 @@ class PPOAgent: policy_loss2 = policy_ratio.clamp(min=0.8, max=1.2) * advantages # From original paper policy_loss = -torch.min(policy_loss1, policy_loss2).sum() - if self.logger is not None: - self.logger.append("Loss/Policy", policy_loss.item()) - self.logger.append("Loss/Value", value_loss.item()) + if log.enabled: + log.Logger["Loss/Policy"].append(policy_loss.item()) + log.Logger["Loss/Value"].append(value_loss.item()) self.old_policy_net.sync() self.policy_net.zero_grad() diff --git a/rltorch/agents/QEPAgent.py b/rltorch/agents/QEPAgent.py index d46fab0..713b4e0 100644 --- a/rltorch/agents/QEPAgent.py +++ b/rltorch/agents/QEPAgent.py @@ -6,13 +6,14 @@ import torch.nn.functional as F from torch.distributions import Categorical import rltorch import rltorch.memory as M +import rltorch.log as log # Q-Evolutionary Policy Agent # Maximizes the policy with respect to the Q-Value function. 
# Since function is non-differentiabile, depends on the Evolutionary Strategy algorithm class QEPAgent: - def __init__(self, policy_net, value_net, memory, config, target_value_net=None, logger=None, entropy_importance=0, policy_skip=4): + def __init__(self, policy_net, value_net, memory, config, target_value_net=None, entropy_importance=0, policy_skip=4): self.policy_net = policy_net assert isinstance(self.policy_net, rltorch.network.ESNetwork) or isinstance(self.policy_net, rltorch.network.ESNetworkMP) self.policy_net.fitness = self.fitness @@ -20,7 +21,6 @@ class QEPAgent: self.target_value_net = target_value_net self.memory = memory self.config = deepcopy(config) - self.logger = logger self.policy_skip = policy_skip self.entropy_importance = entropy_importance @@ -67,7 +67,7 @@ class QEPAgent: return (entropy_importance * entropy_loss - value_importance * obtained_values).mean().item() - def learn(self, logger=None): + def learn(self): if len(self.memory) < self.config['batch_size']: return @@ -114,8 +114,8 @@ class QEPAgent: else: value_loss = F.mse_loss(obtained_values, expected_values) - if self.logger is not None: - self.logger.append("Loss/Value", value_loss.item()) + if log.enabled: + log.Logger["Loss/Value"].append(value_loss.item()) self.value_net.zero_grad() value_loss.backward() diff --git a/rltorch/agents/REINFORCEAgent.py b/rltorch/agents/REINFORCEAgent.py index 12318dc..13d6439 100644 --- a/rltorch/agents/REINFORCEAgent.py +++ b/rltorch/agents/REINFORCEAgent.py @@ -4,14 +4,13 @@ import torch import rltorch class REINFORCEAgent: - def __init__(self, net, memory, config, target_net=None, logger=None): + def __init__(self, net, memory, config, target_net=None): self.net = net if not isinstance(memory, rltorch.memory.EpisodeMemory): raise ValueError("Memory must be of instance EpisodeMemory") self.memory = memory self.config = deepcopy(config) self.target_net = target_net - self.logger = logger # Shaped rewards implements three improvements to REINFORCE # 1) Discounted rewards, future rewards matter less than current @@ -42,8 +41,8 @@ class REINFORCEAgent: policy_loss = (-log_prob_batch * shaped_reward_batch).sum() - if self.logger is not None: - self.logger.append("Loss", policy_loss.item()) + if rltorch.log.enabled: + rltorch.log.Logger["Loss"].append(policy_loss.item()) self.net.zero_grad() policy_loss.backward() diff --git a/rltorch/env/simulate.py b/rltorch/env/simulate.py index d9b65ea..166f6a8 100644 --- a/rltorch/env/simulate.py +++ b/rltorch/env/simulate.py @@ -2,7 +2,7 @@ from copy import deepcopy import time import rltorch -def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=None, name="", render=False): +def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, name="", render=False): for episode in range(total_episodes): state = env.reset() done = False @@ -23,8 +23,8 @@ def simulateEnvEps(env, actor, config, total_episodes=1, memory=None, logger=Non print("episode: {}/{}, score: {}" .format(episode, total_episodes, episode_reward), flush=True) - if logger is not None: - logger.append(name + '/EpisodeReward', episode_reward) + if rltorch.log.enabled: + rltorch.log.Logger[name + '/EpisodeReward'].append(episode_reward) class EnvironmentRunSync: @@ -42,7 +42,6 @@ class EnvironmentRunSync: def run(self, iterations): state = self.last_state - logger = rltorch.log.Logger() if self.logwriter is not None else None for _ in range(iterations): action = self.actor.act(state) next_state, reward, done, _ = self.env.step(action) @@ -61,13 
@@ -61,13 +60,13 @@ class EnvironmentRunSync:
                           .format(self.episode_num, self.config['total_training_episodes'], self.episode_reward),
                           flush=True)
                 if self.logwriter is not None:
-                    logger.append(self.name + '/EpisodeReward', self.episode_reward)
+                    rltorch.log.Logger[self.name + '/EpisodeReward'].append(self.episode_reward)
                 self.episode_reward = 0
                 state = self.env.reset()
                 self.episode_num += 1
 
         if self.logwriter is not None:
-            self.logwriter.write(logger)
+            self.logwriter.write(rltorch.log.Logger)
 
         self.last_state = state
 
@@ -86,15 +85,14 @@ class EnvironmentEpisodeSync:
         state = self.env.reset()
         done = False
         episodeReward = 0
-        logger = rltorch.log.Logger() if self.logwriter is not None else None
 
         while not done:
             action = self.actor.act(state)
             next_state, reward, done, _ = self.env.step(action)
             episodeReward += reward
             if self.memory is not None:
                 self.memory.append(state, action, reward, next_state, done)
-
+
             state = next_state
 
         if self.episode_num % self.config['print_stat_n_eps'] == 0:
@@ -102,7 +100,7 @@ class EnvironmentEpisodeSync:
             print("episode: {}/{}, score: {}"
                   .format(self.episode_num, self.config['total_training_episodes'], episodeReward),
                   flush=True)
         if self.logwriter is not None:
-            logger.append(self.name + '/EpisodeReward', episodeReward)
-            self.logwriter.write(logger)
+            rltorch.log.Logger[self.name + '/EpisodeReward'].append(episodeReward)
+            self.logwriter.write(rltorch.log.Logger)
         self.episode_num += 1
diff --git a/rltorch/log.py b/rltorch/log.py
index ff3335f..13d6e7d 100644
--- a/rltorch/log.py
+++ b/rltorch/log.py
@@ -3,6 +3,7 @@ from typing import Dict, List, Any
 import numpy as np
 import torch
 
+enabled = False
 Logger: Dict[Any, List[Any]] = defaultdict(list)
 
 class LogWriter:
diff --git a/rltorch/memory/__init__.py b/rltorch/memory/__init__.py
index 05312d9..1efb2ad 100644
--- a/rltorch/memory/__init__.py
+++ b/rltorch/memory/__init__.py
@@ -2,4 +2,3 @@ from .EpisodeMemory import *
 from .ReplayMemory import *
 from .PrioritizedReplayMemory import *
 from .DQfDMemory import *
-from .iDQfDMemory import *
\ No newline at end of file
diff --git a/rltorch/mp/EnvironmentEpisode.py b/rltorch/mp/EnvironmentEpisode.py
index 87d3502..e141cd9 100644
--- a/rltorch/mp/EnvironmentEpisode.py
+++ b/rltorch/mp/EnvironmentEpisode.py
@@ -3,14 +3,14 @@ from copy import deepcopy
 import torch.multiprocessing as mp
 
+import rltorch.log as log
 
 class EnvironmentEpisode(mp.Process):
-    def __init__(self, env, actor, config, logger=None, name=""):
+    def __init__(self, env, actor, config, name=""):
         super(EnvironmentEpisode, self).__init__()
         self.env = env
         self.actor = actor
         self.config = deepcopy(config)
-        self.logger = logger
         self.name = name
 
         self.episode_num = 1
 
@@ -30,7 +30,7 @@ class EnvironmentEpisode(mp.Process):
         if printstat:
             print("episode: {}/{}, score: {}"
                   .format(self.episode_num, self.config['total_training_episodes'], episode_reward))
-        if self.logger is not None:
-            self.logger.append(self.name + '/EpisodeReward', episode_reward)
+        if log.enabled:
+            log.Logger[self.name + '/EpisodeReward'].append(episode_reward)
 
         self.episode_num += 1
diff --git a/rltorch/network/ESNetwork.py b/rltorch/network/ESNetwork.py
index f498806..687abcc 100644
--- a/rltorch/network/ESNetwork.py
+++ b/rltorch/network/ESNetwork.py
@@ -2,7 +2,7 @@ from copy import deepcopy
 import numpy as np
 import torch
 from .Network import Network
-
+import rltorch.log as log
 
 # [TODO] Should we torch.no_grad the __call__?
 # What if we want to sometimes do gradient descent as well?
@@ -34,13 +34,11 @@ class ESNetwork(Network):
         A dictionary of configuration items.
     device
         A device to send the weights to.
-    logger
-        Keeps track of historical weights
     name
         For use in logger to differentiate in analysis.
     """
-    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""):
-        super(ESNetwork, self).__init__(model, optimizer, config, device, logger, name)
+    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, name=""):
+        super(ESNetwork, self).__init__(model, optimizer, config, device, name)
         self.population_size = population_size
         self.fitness = fitness_fn
         self.sigma = sigma
@@ -105,8 +103,8 @@ class ESNetwork(Network):
             [self.fitness(x, *args) for x in candidate_solutions],
             device=self.device
         )
-        if self.logger is not None:
-            self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
+        if log.enabled:
+            log.Logger[self.name + "/" + "fitness_value"].append(fitness_values.mean().item())
 
         fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
         ## Insert adjustments into gradients slot
diff --git a/rltorch/network/ESNetworkMP.py b/rltorch/network/ESNetworkMP.py
index 85372b4..a69cdb8 100644
--- a/rltorch/network/ESNetworkMP.py
+++ b/rltorch/network/ESNetworkMP.py
@@ -3,6 +3,7 @@ import numpy as np
 import torch
 import torch.multiprocessing as mp
 from .Network import Network
+import rltorch.log as log
 
 class fn_copy:
     def __init__(self, fn, args):
@@ -19,8 +20,8 @@ class ESNetworkMP(Network):
     fitness_fun := model, *args -> fitness_value (float)
     We wish to find a model that maximizes the fitness function
     """
-    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, logger=None, name=""):
-        super(ESNetworkMP, self).__init__(model, optimizer, config, device, logger, name)
+    def __init__(self, model, optimizer, population_size, fitness_fn, config, sigma=0.05, device=None, name=""):
+        super(ESNetworkMP, self).__init__(model, optimizer, config, device, name)
         self.population_size = population_size
         self.fitness = fitness_fn
         self.sigma = sigma
@@ -76,8 +77,8 @@ class ESNetworkMP(Network):
             device=self.device
         )
 
-        if self.logger is not None:
-            self.logger.append(self.name + "/" + "fitness_value", fitness_values.mean().item())
+        if log.enabled:
+            log.Logger[self.name + "/" + "fitness_value"].append(fitness_values.mean().item())
 
         fitness_values = (fitness_values - fitness_values.mean()) / (fitness_values.std() + np.finfo('float').eps)
         ## Insert adjustments into gradients slot
diff --git a/rltorch/network/Network.py b/rltorch/network/Network.py
index db09d6a..1dde8ee 100644
--- a/rltorch/network/Network.py
+++ b/rltorch/network/Network.py
@@ -1,3 +1,5 @@
+import rltorch.log as log
+
 class Network:
     """
     Wrapper around model and optimizer in PyTorch to abstract away common use cases.
@@ -12,12 +14,10 @@ class Network:
         A dictionary of configuration items.
     device
         A device to send the weights to.
-    logger
-        Keeps track of historical weights
     name
         For use in logger to differentiate in analysis.
""" - def __init__(self, model, optimizer, config, device=None, logger=None, name=""): + def __init__(self, model, optimizer, config, device=None, name=""): self.model = model if 'weight_decay' in config: self.optimizer = optimizer( @@ -27,7 +27,6 @@ class Network: ) else: self.optimizer = optimizer(model.parameters(), lr=config['learning_rate']) - self.logger = logger self.name = name self.device = device if self.device is not None: @@ -63,8 +62,8 @@ class Network: self.optimizer.step() def log_named_parameters(self): - if self.logger is not None: + if log.enabled: for name, param in self.model.named_parameters(): - self.logger.append(self.name + "/" + name, param.cpu().detach().numpy()) + log.Logger[self.name + "/" + name].append(param.cpu().detach().numpy())