From ab40cb1b552cdfe09ecd387a79c406d12fcaaefa Mon Sep 17 00:00:00 2001 From: Chris Nota Date: Tue, 23 Jul 2019 19:10:55 -0400 Subject: [PATCH 1/9] Feature/experiment (#88) * add new experiment class * update SlurmExperiment to accept multiple agents at once * fix parens * re-add parallel runner * update Experiment tests * update scripts * update demos --- all/experiments/experiment.py | 143 +++++++---------------------- all/experiments/experiment_test.py | 52 ++++++----- all/experiments/runner.py | 126 +++++++++++++++++++++++++ all/experiments/slurm.py | 23 +++-- demos/slurm_atari.py | 5 +- demos/slurm_atari_full_suite.py | 2 +- scripts/atari.py | 26 +++--- scripts/classic.py | 32 +++---- scripts/continuous.py | 39 ++++---- scripts/release.py | 48 +++++----- 10 files changed, 276 insertions(+), 220 deletions(-) create mode 100644 all/experiments/runner.py diff --git a/all/experiments/experiment.py b/all/experiments/experiment.py index b46bff0d..0578463e 100644 --- a/all/experiments/experiment.py +++ b/all/experiments/experiment.py @@ -1,123 +1,42 @@ -from timeit import default_timer as timer import numpy as np -import torch -from all.environments import GymEnvironment, State +from .runner import SingleEnvRunner, ParallelEnvRunner from .writer import ExperimentWriter - class Experiment: - def __init__(self, env, frames=None, episodes=None): - if frames is None: - frames = np.inf - if episodes is None: - episodes = np.inf - if isinstance(env, str): - self.env = GymEnvironment(env) - else: - self.env = env - self._max_frames = frames - self._max_episodes = episodes - self._agent = None - self._episode = None - self._frames = None - self._writer = None - self._render = None - self._console = None - - def run( + def __init__( self, - make_agent, - label=None, + agents, + envs, + frames=np.inf, + episodes=np.inf, render=False, - console=True, - write_loss=True + quiet=False, + write_loss=True, ): - if isinstance(make_agent, tuple): - make, n_envs = make_agent - self._init_trial(make, label, render, console, write_loss) - self._run_multi(make, n_envs) - else: - self._init_trial(make_agent, label, render, console, write_loss) - self._run_single(make_agent) - - def _init_trial(self, make_agent, label, render, console, write_loss): - if label is None: - label = make_agent.__name__ - self._frames = 0 - self._episode = 1 - self._render = render - self._console = console - self._writer = self._make_writer(label, write_loss) - - def _run_single(self, make_agent): - self._agent = make_agent(self.env, writer=self._writer) - while not self._done(): - self._run_episode() - - def _run_episode(self): - # setup - env = self.env - agent = self._agent - start = timer() - start_frames = self._frames - returns = 0 + if not isinstance(agents, list): + agents = [agents] - # run episode - env.reset() - while not env.done: - if self._render: - env.render() - env.step(agent.act(env.state, env.reward)) - returns += env.reward - self._frames += 1 - self._writer.frames = self._frames - agent.act(env.state, env.reward) + if not isinstance(envs, list): + envs = [envs] - # cleanup and logging - end = timer() - fps = (self._frames - start_frames) / (end - start) - self._log(returns, fps) - self._episode += 1 - self._writer.episodes = self._episode - - def _run_multi(self, make_agent, n_envs): - envs = self.env.duplicate(n_envs) - agent = make_agent(envs, writer=self._writer) for env in envs: - env.reset() - returns = torch.zeros((n_envs)).float().to(self.env.device) - start = timer() - while not self._done(): - states = 
State.from_list([env.state for env in envs]) - rewards = torch.tensor([env.reward for env in envs]).float().to(self.env.device) - actions = agent.act(states, rewards) - for i, env in enumerate(envs): - if env.done: - end = timer() - fps = self._frames / (end - start) - returns[i] += rewards[i] - self._log(returns[i], fps) - env.reset() - returns[i] = 0 - self._episode += 1 - self._writer.episodes = self._episode + for agent in agents: + if isinstance(agent, tuple): + agent_name = agent[0].__name__ + runner = ParallelEnvRunner else: - if actions[i] is not None: - returns[i] += rewards[i] - env.step(actions[i]) - self._frames += 1 - self._writer.frames = self._frames - - def _done(self): - return self._frames > self._max_frames or self._episode > self._max_episodes - - def _log(self, returns, fps): - if self._console: - print("episode: %i, frames: %i, fps: %d, returns: %d" % - (self._episode, self._frames, fps, returns)) - self._writer.add_evaluation('returns-by-episode', returns, step="episode") - self._writer.add_evaluation('returns-by-frame', returns, step="frame") - self._writer.add_scalar('fps', fps, step="frame") - - def _make_writer(self, label, write_loss): - return ExperimentWriter(label, self.env.name, loss=write_loss) + agent_name = agent.__name__ + runner = SingleEnvRunner + + runner( + agent, + env, + frames=frames, + episodes=episodes, + render=render, + quiet=quiet, + writer=self._make_writer(agent_name, env.name, write_loss), + ) + + def _make_writer(self, agent_name, env_name, write_loss): + return ExperimentWriter(agent_name, env_name, write_loss) diff --git a/all/experiments/experiment_test.py b/all/experiments/experiment_test.py index 349a5759..0ecc7cbb 100644 --- a/all/experiments/experiment_test.py +++ b/all/experiments/experiment_test.py @@ -1,9 +1,12 @@ import unittest import numpy as np import torch -from all.presets.classic_control import dqn +from all.presets.classic_control import dqn, a2c +from all.environments import GymEnvironment from all.experiments import Experiment, Writer +# pylint: disable=protected-access + class MockWriter(Writer): def __init__(self, label, write_loss): @@ -15,10 +18,7 @@ def __init__(self, label, write_loss): def add_scalar(self, key, value, step="frame"): if not key in self.data: - self.data[key] = { - "values": [], - "steps": [] - } + self.data[key] = {"values": [], "steps": []} self.data[key]["values"].append(value) self.data[key]["steps"].append(self._get_step(step)) @@ -26,7 +26,7 @@ def add_loss(self, name, value, step="frame"): pass def add_evaluation(self, name, value, step="frame"): - self.add_scalar('evaluation/' + name, value, self._get_step(step)) + self.add_scalar("evaluation/" + name, value, self._get_step(step)) def _get_step(self, _type): if _type == "frame": @@ -37,39 +37,43 @@ def _get_step(self, _type): class MockExperiment(Experiment): - def _make_writer(self, label, write_loss=True): - return MockWriter(label, write_loss) - -# pylint: disable=protected-access + def _make_writer(self, agent_name, env_name, write_loss): + self._writer = MockWriter(agent_name + '_' + env_name, write_loss) + return self._writer class TestExperiment(unittest.TestCase): def setUp(self): np.random.seed(0) torch.manual_seed(0) - self.experiment = MockExperiment('CartPole-v0', episodes=3) - self.experiment.env.seed(0) + self.env = GymEnvironment('CartPole-v0') + self.env.seed(0) + self.experiment = None def test_adds_label(self): - self.experiment.run(dqn(), console=False) - self.assertEqual(self.experiment._writer.label, "_dqn") + 
experiment = MockExperiment(dqn(), self.env, quiet=True, episodes=3) + self.assertEqual(experiment._writer.label, "_dqn_CartPole-v0") def test_writes_returns_eps(self): - self.experiment.run(dqn(), console=False) + experiment = MockExperiment(dqn(), self.env, quiet=True, episodes=3) np.testing.assert_equal( - self.experiment._writer.data["evaluation/returns-by-episode"]["values"], - np.array([14., 19., 26.]) + experiment._writer.data["evaluation/returns-by-episode"]["values"], + np.array([14.0, 19.0, 26.0]), ) np.testing.assert_equal( - self.experiment._writer.data["evaluation/returns-by-episode"]["steps"], - np.array([1, 2, 3]) + experiment._writer.data["evaluation/returns-by-episode"]["steps"], + np.array([1, 2, 3]), ) def test_writes_loss(self): - self.experiment.run(dqn(), console=False) - self.assertTrue(self.experiment._writer.write_loss) - self.experiment.run(dqn(), console=False, write_loss=False) - self.assertFalse(self.experiment._writer.write_loss) + experiment = MockExperiment(dqn(), self.env, quiet=True, write_loss=True, episodes=3) + self.assertTrue(experiment._writer.write_loss) + experiment = MockExperiment(dqn(), self.env, quiet=True, write_loss=False, episodes=3) + self.assertFalse(experiment._writer.write_loss) + + def test_runs_multi_env(self): + experiment = MockExperiment(a2c(n_envs=3), self.env, quiet=True, episodes=3) + self.assertEqual(len(experiment._writer.data["evaluation/returns-by-episode"]["values"]), 3) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/all/experiments/runner.py b/all/experiments/runner.py new file mode 100644 index 00000000..841be35f --- /dev/null +++ b/all/experiments/runner.py @@ -0,0 +1,126 @@ +from abc import ABC, abstractmethod +from timeit import default_timer as timer +import numpy as np +import torch +from all.environments import State + +class EnvRunner(ABC): + def __init__( + self, + agent, + env, + writer, + frames=np.inf, + episodes=np.inf, + render=False, + quiet=False, + ): + self._agent = agent(env, writer) + self._env = env + self._writer = writer + self._max_frames = frames + self._max_episodes = episodes + self._render = render + self._quiet = quiet + self.run() + + @abstractmethod + def run(self): + pass + + def _done(self): + return ( + self._writer.frames > self._max_frames or + self._writer.episodes > self._max_episodes + ) + + def _log(self, returns, fps): + if not self._quiet: + print("episode: %i, frames: %i, fps: %d, returns: %d" % + (self._writer.episodes, self._writer.frames, fps, returns)) + self._writer.add_evaluation('returns-by-episode', returns, step="episode") + self._writer.add_evaluation('returns-by-frame', returns, step="frame") + self._writer.add_scalar('fps', fps, step="frame") + +class SingleEnvRunner(EnvRunner): + def run(self): + while not self._done(): + self._run_episode() + + def _run_episode(self): + start_time = timer() + start_frames = self._writer.frames + returns = self._run_until_terminal_state() + end_time = timer() + fps = (self._writer.frames - start_frames) / (end_time - start_time) + self._log(returns, fps) + self._writer.episodes += 1 + + def _run_until_terminal_state(self): + agent = self._agent + env = self._env + + env.reset() + returns = 0 + action = agent.act(env.state, env.reward) + + while not env.done: + self._writer.frames += 1 + if self._render: + env.render() + env.step(action) + returns += env.reward + action = agent.act(env.state, env.reward) + + return returns + +class ParallelEnvRunner(EnvRunner): + def __init__(self, agent, env, 
writer, **kwargs): + make_agent, n_envs = agent + envs = env.duplicate(n_envs) + self._n_envs = n_envs + self._returns = None + self._start_time = None + super().__init__(make_agent, envs, writer, **kwargs) + + def run(self): + self._reset() + while not self._done(): + self._step() + + def _reset(self): + for env in self._env: + env.reset() + self._returns = torch.zeros( + (self._n_envs), + dtype=torch.float, + device=self._env[0].device + ) + self._start_time = timer() + + def _step(self): + states = State.from_list([env.state for env in self._env]) + rewards = torch.tensor( + [env.reward for env in self._env], + dtype=torch.float, + device=self._env[0].device + ) + actions = self._agent.act(states, rewards) + + for i, env in enumerate(self._env): + self._step_env(i, env, actions[i]) + + def _step_env(self, i, env, action): + if env.done: + self._returns[i] += env.reward + end_time = timer() + fps = self._writer.frames / (end_time - self._start_time) + self._log(self._returns[i], fps) + env.reset() + self._returns[i] = 0 + self._writer.episodes += 1 + else: + if action is not None: + self._returns[i] += env.reward + env.step(action) + self._writer.frames += 1 diff --git a/all/experiments/slurm.py b/all/experiments/slurm.py index 06233ddc..952b042f 100644 --- a/all/experiments/slurm.py +++ b/all/experiments/slurm.py @@ -16,18 +16,22 @@ class SlurmExperiment: def __init__( self, - agent, + agents, envs, frames, job_name='autonomous-learning-library', - hyperparameters=None, sbatch_args=None, ): - self.agent = agent + if not isinstance(agents, list): + agents = [agents] + + if not isinstance(envs, list): + envs = [envs] + + self.agents = agents self.envs = envs self.frames = frames self.job_name = job_name - self.hyperparameters = hyperparameters or {} self.sbatch_args = sbatch_args or {} self.parse_args() @@ -51,10 +55,10 @@ def parse_args(self): self.args = parser.parse_args() def run_experiment(self): - index = int(os.environ['SLURM_ARRAY_TASK_ID']) - env = self.envs[index] - experiment = Experiment(env, frames=self.frames) - experiment.run(self.agent(**self.hyperparameters), write_loss=False) + task_id = int(os.environ['SLURM_ARRAY_TASK_ID']) + env = self.envs[int(task_id / len(self.agents))] + agent = self.agents[task_id % len(self.agents)] + Experiment(agent, env, frames=self.frames, write_loss=False) def queue_jobs(self): self.create_sbatch_script() @@ -64,12 +68,13 @@ def queue_jobs(self): def create_sbatch_script(self): script = open(SCRIPT_NAME, 'w') script.write('#!/bin/sh\n\n') + num_experiments = len(self.envs) * len(self.agents) sbatch_args = { 'job-name': self.job_name, 'output': 'out/all_%A_%a.out', 'error': 'out/all_%A_%a.err', - 'array': '0-' + str(len(self.envs) - 1), + 'array': '0-' + str(num_experiments - 1), 'partition': '1080ti-short', 'ntasks': 1, 'mem-per-cpu': 4000, diff --git a/demos/slurm_atari.py b/demos/slurm_atari.py index 4f610d19..68cbd638 100644 --- a/demos/slurm_atari.py +++ b/demos/slurm_atari.py @@ -8,11 +8,8 @@ from all.presets.atari import a2c from all.environments import AtariEnvironment -# Quick demo of a2c running on slurm. -# Note that it only runs for 1 million frames. -# For real experiments, you will surely need a modified version of this script. 
device = 'cuda' envs = [AtariEnvironment(env, device) for env in ['Pong', 'Breakout', 'SpaceInvaders']] -SlurmExperiment(a2c, envs, 1e6, hyperparameters={'device': device}, sbatch_args={ +SlurmExperiment(a2c(device=device), envs, 1e6, sbatch_args={ 'partition': '1080ti-short' }) diff --git a/demos/slurm_atari_full_suite.py b/demos/slurm_atari_full_suite.py index 1e8023fd..7a3840fb 100644 --- a/demos/slurm_atari_full_suite.py +++ b/demos/slurm_atari_full_suite.py @@ -18,6 +18,6 @@ if 'NoFrameskip-v4' in env and not '-ram' in env ] -SlurmExperiment(a2c, envs, 1e9, hyperparameters={'device': device}, sbatch_args={ +SlurmExperiment(a2c(device=device), envs, 1e9, sbatch_args={ 'partition': '1080ti-long' # long queue: run for a week }) diff --git a/scripts/atari.py b/scripts/atari.py index e090147e..10572da6 100644 --- a/scripts/atari.py +++ b/scripts/atari.py @@ -3,29 +3,29 @@ from all.experiments import Experiment from all.presets import atari + def run_atari(): - parser = argparse.ArgumentParser(description='Run an Atari benchmark.') - parser.add_argument('env', help='Name of the Atari game (e.g. Pong)') + parser = argparse.ArgumentParser(description="Run an Atari benchmark.") + parser.add_argument("env", help="Name of the Atari game (e.g. Pong)") + parser.add_argument( + "agent", help="Name of the agent (e.g. dqn). See presets for available agents." + ) parser.add_argument( - 'agent', - help="Name of the agent (e.g. dqn). See presets for available agents." + "--device", + default="cuda", + help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)", ) parser.add_argument( - '--device', default='cuda', - help='The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)' + "--frames", type=int, default=200e6, help="The number of training frames" ) - parser.add_argument('--frames', type=int, default=200e6, help='The number of training frames') args = parser.parse_args() env = AtariEnvironment(args.env, device=args.device) agent_name = args.agent agent = getattr(atari, agent_name) - experiment = Experiment( - env, - frames=args.frames - ) - experiment.run(agent(device=args.device), label=agent_name) + Experiment(agent(device=args.device), env, frames=args.frames) + -if __name__ == '__main__': +if __name__ == "__main__": run_atari() diff --git a/scripts/classic.py b/scripts/classic.py index 6a349af0..dcd4b6c9 100644 --- a/scripts/classic.py +++ b/scripts/classic.py @@ -3,17 +3,21 @@ from all.experiments import Experiment from all.presets import classic_control -def run_atari(): - parser = argparse.ArgumentParser( - description='Run a classic control benchmark.') - parser.add_argument('env', help='Name of the env (e.g. CartPole-v1)') + +def run_classic(): + parser = argparse.ArgumentParser(description="Run a classic control benchmark.") + parser.add_argument("env", help="Name of the env (e.g. CartPole-v1)") parser.add_argument( - 'agent', help="Name of the agent (e.g. sarsa). See presets for available agents.") - parser.add_argument('--episodes', type=int, default=2000, - help='The number of training frames') + "agent", + help="Name of the agent (e.g. sarsa). See presets for available agents.", + ) parser.add_argument( - '--device', default='cuda', - help='The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)' + "--episodes", type=int, default=2000, help="The number of training frames" + ) + parser.add_argument( + "--device", + default="cuda", + help="The name of the device to run the agent on (e.g. 
cpu, cuda, cuda:0)", ) args = parser.parse_args() @@ -21,12 +25,8 @@ def run_atari(): agent_name = args.agent agent = getattr(classic_control, agent_name) - experiment = Experiment( - env, - episodes=args.episodes - ) - experiment.run(agent(device=args.device), label=agent_name) + Experiment(agent(device=args.device), env, episodes=args.episodes) -if __name__ == '__main__': - run_atari() +if __name__ == "__main__": + run_classic() diff --git a/scripts/continuous.py b/scripts/continuous.py index 52737fd7..de066a47 100644 --- a/scripts/continuous.py +++ b/scripts/continuous.py @@ -7,28 +7,31 @@ # some example envs # can also enter ID directly envs = { - 'walker': 'BipedalWalker-v2', - 'mountaincar': 'MountainCarContinuous-v0', - 'lander': 'LunarLanderContinuous-v2', - 'hopper': 'RoboschoolHopper-v1', - 'cheetah': 'RoboschoolHalfCheetah-v1' + "walker": "BipedalWalker-v2", + "mountaincar": "MountainCarContinuous-v0", + "lander": "LunarLanderContinuous-v2", + "hopper": "RoboschoolHopper-v1", + "cheetah": "RoboschoolHalfCheetah-v1", } + def run_atari(): - parser = argparse.ArgumentParser( - description='Run a continuous actions benchmark.') - parser.add_argument('env', help='Name of the env (see envs)') + parser = argparse.ArgumentParser(description="Run a continuous actions benchmark.") + parser.add_argument("env", help="Name of the env (see envs)") + parser.add_argument( + "agent", + help="Name of the agent (e.g. actor_critic). See presets for available agents.", + ) parser.add_argument( - 'agent', help="Name of the agent (e.g. actor_critic). See presets for available agents.") - parser.add_argument('--frames', type=int, default=2e6, - help='The number of training frames') + "--frames", type=int, default=2e6, help="The number of training frames" + ) parser.add_argument( - '--device', default='cuda', - help='The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)' + "--device", + default="cuda", + help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)", ) parser.add_argument( - '--render', default=False, - help='Whether to render the environment.' + "--render", default=False, help="Whether to render the environment." 
) args = parser.parse_args() @@ -42,11 +45,9 @@ def run_atari(): agent = getattr(continuous, agent_name) experiment = Experiment( - env, - frames=args.frames + agent(device=args.device), env, frames=args.frames, render=args.render ) - experiment.run(agent(device=args.device), label=agent_name, render=args.render) -if __name__ == '__main__': +if __name__ == "__main__": run_atari() diff --git a/scripts/release.py b/scripts/release.py index 4faae17b..dc18df4c 100644 --- a/scripts/release.py +++ b/scripts/release.py @@ -7,29 +7,33 @@ # run on gpu device = 'cuda' -# create slurm tasks for running classic control agents -for agent_name in classic_control.__all__: - print('CartPole-v0,', agent_name) - agent = getattr(classic_control, agent_name) - envs = [GymEnvironment('CartPole-v0', device=device)] - SlurmExperiment(agent, envs, 100000, hyperparameters={'device': device}, sbatch_args={ - 'partition': '1080ti-short' - }) +def get_agents(preset): + agents = [getattr(preset, agent_name) for agent_name in classic_control.__all__] + return [agent(device=device) for agent in agents] -# create slurm tasks for running atari agents -for agent_name in atari.__all__: - print('Breakout', agent_name) - agent = getattr(atari, agent_name) - envs = [AtariEnvironment('Breakout', device=device)] - SlurmExperiment(agent, envs, 2e7, hyperparameters={'device': device}, sbatch_args={ +SlurmExperiment( + get_agents(atari), + AtariEnvironment('Breakout', device=device), + 2e7, + sbatch_args={ 'partition': '1080ti-long' - }) + } +) + +SlurmExperiment( + get_agents(classic_control), + GymEnvironment('CartPole-v0', device=device), + 100000, + sbatch_args={ + 'partition': '1080ti-short' + } +) -# create slurm tasks for running atari agents -for agent_name in continuous.__all__: - print('Lander', agent_name) - agent = getattr(continuous, agent_name) - envs = [GymEnvironment('LunarLanderContinuous-v2', device=device)] - SlurmExperiment(agent, envs, 500000, hyperparameters={'device': device}, sbatch_args={ +SlurmExperiment( + get_agents(continuous), + GymEnvironment('LunarLanderContinuous-v2', device=device), + 500000, + sbatch_args={ 'partition': '1080ti-short' - }) + } +) From 0fa0a4e94b80e5384b89517eb1b16eb1654e4ba4 Mon Sep 17 00:00:00 2001 From: Chris Nota Date: Wed, 24 Jul 2019 18:24:16 -0400 Subject: [PATCH 2/9] Feature/policy approximation (#89) * make all policies inherit from Approximation * remove unused function --- all/approximation/approximation.py | 10 +------ all/policies/__init__.py | 6 ++-- all/policies/deterministic.py | 34 +++++++-------------- all/policies/gaussian.py | 2 +- all/policies/gaussian_test.py | 2 +- all/policies/greedy.py | 6 +--- all/policies/policy.py | 11 ------- all/policies/soft_deterministic.py | 31 +++++--------------- all/policies/softmax.py | 4 +-- all/policies/stochastic.py | 47 ++++++++++++------------------ 10 files changed, 47 insertions(+), 106 deletions(-) delete mode 100644 all/policies/policy.py diff --git a/all/approximation/approximation.py b/all/approximation/approximation.py index b221e3d7..f82dfb06 100644 --- a/all/approximation/approximation.py +++ b/all/approximation/approximation.py @@ -2,7 +2,7 @@ from torch.nn import utils from torch.nn.functional import mse_loss from all.experiments import DummyWriter -from .target import FixedTarget, TrivialTarget +from .target import TrivialTarget class Approximation(): def __init__( @@ -72,11 +72,3 @@ def _dequeue(self, batch_size): items = torch.cat(self._cache[:i]) self._cache = self._cache[i:] return items - - def 
_init_target_model(self, target_update_frequency): - if target_update_frequency is not None: - self._target = FixedTarget(target_update_frequency) - self._target.init(self.model) - else: - self._target = TrivialTarget() - self._target.init(self.model) diff --git a/all/policies/__init__.py b/all/policies/__init__.py index 04e911a1..516fc862 100644 --- a/all/policies/__init__.py +++ b/all/policies/__init__.py @@ -1,15 +1,15 @@ -from .policy import Policy from .gaussian import GaussianPolicy from .greedy import GreedyPolicy from .softmax import SoftmaxPolicy from .stochastic import StochasticPolicy from .deterministic import DeterministicPolicy +from .soft_deterministic import SoftDeterministicPolicy __all__ = [ - "Policy", "GaussianPolicy", "GreedyPolicy", "SoftmaxPolicy", "StochasticPolicy", - "DeterministicPolicy" + "DeterministicPolicy", + "SoftDeterministicPolicy" ] diff --git a/all/policies/deterministic.py b/all/policies/deterministic.py index ad9b2b9f..5a8ee6b1 100644 --- a/all/policies/deterministic.py +++ b/all/policies/deterministic.py @@ -1,11 +1,9 @@ import torch -from all.approximation import TrivialTarget -from all.nn import ListNetwork, utils -from all.experiments import DummyWriter -from .policy import Policy +from all.approximation import Approximation +from all.nn import ListNetwork -class DeterministicPolicy(Policy): +class DeterministicPolicy(Approximation): def __init__( self, model, @@ -13,23 +11,20 @@ def __init__( space, noise, name='policy', - target=None, - clip_grad=0, - writer=DummyWriter() + **kwargs ): - self.model = ListNetwork(model, (space.shape[0],)) - self.optimizer = optimizer - self.name = name - self.device = next(model.parameters()).device + model = ListNetwork(model) + super().__init__( + model, + optimizer, + name=name, + **kwargs + ) self.noise = torch.distributions.normal.Normal(0, noise) - self._target = target or TrivialTarget() - self._target.init(self.model) self._low = torch.tensor(space.low, device=self.device) self._high = torch.tensor(space.high, device=self.device) - self._clip_grad = clip_grad self._log_probs = [] self._entropy = [] - self._writer = writer def __call__(self, state, action=None, prob=None): outputs = self.model(state).detach() @@ -50,10 +45,3 @@ def reinforce(self, _): 'Call backward() on a loss derived from the action' + 'and then call policy.step()' ) - - def step(self): - if self._clip_grad != 0: - utils.clip_grad_norm_(self.model.parameters(), self._clip_grad) - self.optimizer.step() - self.optimizer.zero_grad() - self._target.update() diff --git a/all/policies/gaussian.py b/all/policies/gaussian.py index b59d32de..2bfc4440 100644 --- a/all/policies/gaussian.py +++ b/all/policies/gaussian.py @@ -11,7 +11,7 @@ def __init__( action_dim, **kwargs ): - model = ListNetwork(model, (action_dim * 2,)) + model = ListNetwork(model) optimizer = optimizer def distribution(outputs): diff --git a/all/policies/gaussian_test.py b/all/policies/gaussian_test.py index 3c6fe528..8275a060 100644 --- a/all/policies/gaussian_test.py +++ b/all/policies/gaussian_test.py @@ -7,7 +7,7 @@ STATE_DIM = 2 ACTION_DIM = 3 -class TestSoftmax(unittest.TestCase): +class TestGaussian(unittest.TestCase): def setUp(self): torch.manual_seed(2) self.model = nn.Sequential( diff --git a/all/policies/greedy.py b/all/policies/greedy.py index 453f0f01..3dd4b429 100644 --- a/all/policies/greedy.py +++ b/all/policies/greedy.py @@ -1,8 +1,7 @@ import numpy as np import torch -from .policy import Policy -class GreedyPolicy(Policy): +class GreedyPolicy(): def __init__( 
self, q, @@ -30,9 +29,6 @@ def __call__(self, state, action=None, prob=False): action_scores = self.q.eval(state) return torch.argmax(action_scores, dim=1) - def reinforce(self, errors): - return # not possible - def anneal(self): self.executions += 1 if self.executions < self.annealing_start: diff --git a/all/policies/policy.py b/all/policies/policy.py deleted file mode 100644 index cdc7026d..00000000 --- a/all/policies/policy.py +++ /dev/null @@ -1,11 +0,0 @@ -from abc import abstractmethod - -# pylint: disable=arguments-differ -class Policy(): - @abstractmethod - def __call__(self, state, action=None, prob=False): - pass - - @abstractmethod - def reinforce(self, errors): - pass diff --git a/all/policies/soft_deterministic.py b/all/policies/soft_deterministic.py index 74ef80d5..e5967b0d 100644 --- a/all/policies/soft_deterministic.py +++ b/all/policies/soft_deterministic.py @@ -1,31 +1,23 @@ import torch -from all.approximation import TrivialTarget -from all.nn import ListNetwork, utils -from all.experiments import DummyWriter -from .policy import Policy +from all.approximation import Approximation +from all.nn import ListNetwork -class SoftDeterministicPolicy(Policy): +class SoftDeterministicPolicy(Approximation): def __init__( self, model, optimizer, space, name="policy", - target=None, - clip_grad=0, - writer=DummyWriter(), + **kwargs ): - self.model = ListNetwork(model) - self.optimizer = optimizer - self.name = name - self.device = next(model.parameters()).device + model = ListNetwork(model) + optimizer = optimizer + name = name + super().__init__(model, optimizer, name=name, **kwargs) self._action_dim = space.shape[0] - self._target = target or TrivialTarget() - self._target.init(self.model) - self._clip_grad = clip_grad self._entropy = [] - self._writer = writer self._last_dist = None self._raw_actions = None # parameters for squashing to tanh @@ -60,13 +52,6 @@ def reinforce(self, _): + "and then call policy.step()" ) - def step(self): - if self._clip_grad != 0: - utils.clip_grad_norm_(self.model.parameters(), self._clip_grad) - self.optimizer.step() - self.optimizer.zero_grad() - self._target.update() - def _distribution(self, outputs): means = outputs[:, 0 : self._action_dim] logvars = outputs[:, self._action_dim :] diff --git a/all/policies/softmax.py b/all/policies/softmax.py index cd508e1a..5042d6dd 100644 --- a/all/policies/softmax.py +++ b/all/policies/softmax.py @@ -9,10 +9,10 @@ def __init__( self, model, optimizer, - actions, + _, # deprecated **kwargs ): - model = ListNetwork(model, (actions,)) + model = ListNetwork(model) def distribution(outputs): probs = functional.softmax(outputs, dim=-1) diff --git a/all/policies/stochastic.py b/all/policies/stochastic.py index 16c68d50..5490c791 100644 --- a/all/policies/stochastic.py +++ b/all/policies/stochastic.py @@ -1,10 +1,8 @@ import torch -from torch.nn import utils -from all.experiments import DummyWriter -from .policy import Policy +from all.approximation import Approximation -class StochasticPolicy(Policy): +class StochasticPolicy(Approximation): def __init__( self, model, @@ -12,28 +10,27 @@ def __init__( distribution, name='policy', entropy_loss_scaling=0, - clip_grad=0, - writer=DummyWriter() + **kwargs ): - self.model = model - self.optimizer = optimizer + super().__init__( + model, + optimizer, + name=name, + **kwargs + ) self.distribution = distribution - self.name = name - self.device = next(model.parameters()).device self._entropy_loss_scaling = entropy_loss_scaling - self._clip_grad = clip_grad self._log_probs 
= [] self._entropy = [] - self._writer = writer def __call__(self, state, action=None, prob=None): outputs = self.model(state) distribution = self.distribution(outputs) if action is None: action = distribution.sample() - self._cache(distribution, action) + self._enqueue(distribution, action) return action - self._cache(distribution, action) + self._enqueue(distribution, action) return distribution.log_prob(action) def eval(self, state, action=None): @@ -47,36 +44,30 @@ def eval(self, state, action=None): def reinforce(self, loss, retain_graph=False): if callable(loss): - log_probs, entropy = self._decache_all() + log_probs, entropy = self._dequeue_all() policy_loss = loss(log_probs) else: # shape the data properly errors = loss.view(-1) batch_size = len(errors) - log_probs, entropy = self._decache(batch_size) + log_probs, entropy = self._dequeue(batch_size) policy_loss = (-log_probs.transpose(0, -1) * errors).mean() if log_probs.requires_grad: # compute losses entropy_loss = -entropy.mean() loss = policy_loss + self._entropy_loss_scaling * entropy_loss - self._writer.add_loss(self.name, loss) - self._writer.add_loss(self.name + '/pg', policy_loss) - self._writer.add_loss(self.name + '/entropy', entropy_loss) + self._writer.add_loss(self._name, loss) + self._writer.add_loss(self._name + '/pg', policy_loss) + self._writer.add_loss(self._name + '/entropy', entropy_loss) loss.backward(retain_graph=retain_graph) # take gradient step self.step() - def step(self): - if self._clip_grad != 0: - utils.clip_grad_norm_(self.model.parameters(), self._clip_grad) - self.optimizer.step() - self.optimizer.zero_grad() - - def _cache(self, distribution, action): + def _enqueue(self, distribution, action): self._log_probs.append(distribution.log_prob(action)) self._entropy.append(distribution.entropy()) - def _decache(self, batch_size): + def _dequeue(self, batch_size): i = 0 items = 0 while items < batch_size and i < len(self._log_probs): @@ -92,7 +83,7 @@ def _decache(self, batch_size): return log_probs, entropy - def _decache_all(self): + def _dequeue_all(self): log_probs = torch.cat(self._log_probs) self._log_probs = [] entropy = torch.cat(self._entropy) From 56b8a35e3954795e851253f32e02bb15526bfb45 Mon Sep 17 00:00:00 2001 From: Chris Nota Date: Fri, 26 Jul 2019 16:43:29 -0400 Subject: [PATCH 3/9] Feature/save (#90) * add Checkpointer * add load_and_watch functionality * make approximation default to saving model periodically * add watch scripts * fix cyclic import * make sure runs is created so that tests don't fail --- all/agents/evaluation/greedy_agent.py | 0 all/agents/sac.py | 2 +- all/approximation/__init__.py | 1 + all/approximation/approximation.py | 14 ++++ all/approximation/checkpointer/__init__.py | 36 ++++++++++ all/approximation/feature_network.py | 38 ++++++----- all/approximation/target/trivial.py | 6 +- all/experiments/__init__.py | 7 +- all/experiments/watch.py | 76 ++++++++++++++++++++++ all/experiments/writer.py | 12 +++- all/presets/atari/a2c.py | 11 +++- all/presets/atari/ppo.py | 7 +- all/presets/atari/vac.py | 7 +- all/presets/atari/vpg.py | 3 +- all/presets/classic_control/vpg.py | 6 +- all/presets/continuous/ddpg.py | 3 +- all/presets/continuous/sac.py | 3 +- scripts/watch_atari.py | 20 ++++++ scripts/watch_classic.py | 19 ++++++ scripts/watch_continuous.py | 5 ++ 20 files changed, 241 insertions(+), 35 deletions(-) create mode 100644 all/agents/evaluation/greedy_agent.py create mode 100644 all/approximation/checkpointer/__init__.py create mode 100644 all/experiments/watch.py 
create mode 100644 scripts/watch_atari.py create mode 100644 scripts/watch_classic.py create mode 100644 scripts/watch_continuous.py diff --git a/all/agents/evaluation/greedy_agent.py b/all/agents/evaluation/greedy_agent.py new file mode 100644 index 00000000..e69de29b diff --git a/all/agents/sac.py b/all/agents/sac.py index 4ce68732..31638b18 100644 --- a/all/agents/sac.py +++ b/all/agents/sac.py @@ -1,5 +1,5 @@ import torch -from all.experiments import DummyWriter +from all.experiments.writer import DummyWriter from ._agent import Agent class SAC(Agent): diff --git a/all/approximation/__init__.py b/all/approximation/__init__.py index ff309d79..db3f9e5d 100644 --- a/all/approximation/__init__.py +++ b/all/approximation/__init__.py @@ -4,3 +4,4 @@ from .v_network import VNetwork from .feature_network import FeatureNetwork from .target import TargetNetwork, FixedTarget, PolyakTarget, TrivialTarget +from .checkpointer import Checkpointer, DummyCheckpointer, PeriodicCheckpointer diff --git a/all/approximation/approximation.py b/all/approximation/approximation.py index f82dfb06..b5500d61 100644 --- a/all/approximation/approximation.py +++ b/all/approximation/approximation.py @@ -1,8 +1,12 @@ +import os import torch from torch.nn import utils from torch.nn.functional import mse_loss from all.experiments import DummyWriter from .target import TrivialTarget +from .checkpointer import PeriodicCheckpointer + +DEFAULT_CHECKPOINT_FREQUENCY = 200 class Approximation(): def __init__( @@ -15,6 +19,7 @@ def __init__( name='approximation', target=None, writer=DummyWriter(), + checkpointer=None ): self.model = model self.device = next(model.parameters()).device @@ -29,6 +34,14 @@ def __init__( self._writer = writer self._name = name + if checkpointer is None: + checkpointer = PeriodicCheckpointer(DEFAULT_CHECKPOINT_FREQUENCY) + self._checkpointer = checkpointer + self._checkpointer.init( + self.model, + os.path.join(writer.log_dir, name + '.pt') + ) + def __call__(self, *inputs, detach=True): result = self.model(*inputs) if detach: @@ -54,6 +67,7 @@ def step(self): self._optimizer.step() self._optimizer.zero_grad() self._target.update() + self._checkpointer() def zero_grad(self): self._optimizer.zero_grad() diff --git a/all/approximation/checkpointer/__init__.py b/all/approximation/checkpointer/__init__.py new file mode 100644 index 00000000..a7e64910 --- /dev/null +++ b/all/approximation/checkpointer/__init__.py @@ -0,0 +1,36 @@ +from abc import abstractmethod, ABC +import torch + +class Checkpointer(ABC): + @abstractmethod + def init(self, model, filename): + pass + + @abstractmethod + def __call__(self): + pass + + +class DummyCheckpointer(Checkpointer): + def init(self, *inputs): + pass + + def __call__(self): + pass + + +class PeriodicCheckpointer(Checkpointer): + def __init__(self, frequency): + self.frequency = frequency + self._updates = 1 + self._filename = None + self._model = None + + def init(self, model, filename): + self._model = model + self._filename = filename + + def __call__(self): + if self._updates % self.frequency == 0: + torch.save(self._model, self._filename) + self._updates += 1 diff --git a/all/approximation/feature_network.py b/all/approximation/feature_network.py index d147d606..c71a1995 100644 --- a/all/approximation/feature_network.py +++ b/all/approximation/feature_network.py @@ -2,30 +2,34 @@ from all.environments import State from .approximation import Approximation -class FeatureNetwork(Approximation): - def __init__(self, model, optimizer=None, **kwargs): - 
super().__init__(model, optimizer, **kwargs) - self._cache = [] - self._out = [] +class FeatureModule(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model - def __call__(self, states): + def forward(self, states): features = self.model(states.features.float()) - out = features.detach() - out.requires_grad = True - self._enqueue(features, out) return State( - out, + features, mask=states.mask, info=states.info ) - def eval(self, states): - result = self._target(states.features.float()) - return State( - result, - mask=states.mask, - info=states.info - ) +class FeatureNetwork(Approximation): + def __init__(self, model, optimizer=None, name='feature', **kwargs): + model = FeatureModule(model) + super().__init__(model, optimizer, name=name, **kwargs) + self._cache = [] + self._out = [] + + def __call__(self, states): + features = self.model(states) + graphs = features.raw + # pylint: disable=protected-access + features._raw = graphs.detach() + features._raw.requires_grad = True + self._enqueue(graphs, features._raw) + return features def reinforce(self): graphs, grads = self._dequeue() diff --git a/all/approximation/target/trivial.py b/all/approximation/target/trivial.py index 31d88240..716aea59 100644 --- a/all/approximation/target/trivial.py +++ b/all/approximation/target/trivial.py @@ -7,13 +7,11 @@ def __init__(self): def __call__(self, *inputs): with torch.no_grad(): - training = self._model.training - result = self._model(*inputs) - self._model.train(training) - return result + return self._model(*inputs) def init(self, model): self._model = model + self._model.training = False def update(self): pass diff --git a/all/experiments/__init__.py b/all/experiments/__init__.py index a4300f3f..50699b4c 100644 --- a/all/experiments/__init__.py +++ b/all/experiments/__init__.py @@ -1,10 +1,15 @@ from .experiment import Experiment from .slurm import SlurmExperiment from .writer import Writer, ExperimentWriter, DummyWriter +from .watch import GreedyAgent, watch, load_and_watch + __all__ = [ "Experiment", "Writer", "ExperimentWriter", "DummyWriter", - "SlurmExperiment" + "SlurmExperiment", + "GreedyAgent", + "watch", + "load_and_watch", ] diff --git a/all/experiments/watch.py b/all/experiments/watch.py new file mode 100644 index 00000000..cf9ca2b9 --- /dev/null +++ b/all/experiments/watch.py @@ -0,0 +1,76 @@ +import os +import torch +import gym +from all.agents import Agent +from all.bodies import DeepmindAtariBody +from all.environments import AtariEnvironment + +def watch(agent, env): + action = None + returns = 0 + while True: + if env.done: + print('returns:', returns) + env.reset() + returns = 0 + else: + env.step(action) + env.render() + action = agent.act(env.state, env.reward) + returns += env.reward + +def load_and_watch(dir, env): + watch(GreedyAgent.load(dir, env), env) + +class GreedyAgent(Agent): + def __init__( + self, + action_space, + feature=None, + q=None, + policy=None + ): + self.action_space = action_space + self.feature = feature + self.policy = None + if policy: + self.policy = policy + else: + self.policy = q + if not self.policy: + raise TypeError('GreedyAgent must have either policy or q function') + + def act(self, state, reward): + with torch.no_grad(): + if self.feature: + state = self.feature(state) + if isinstance(self.action_space, gym.spaces.Discrete): + return torch.argmax(self.policy(state), dim=1) + if isinstance(self.action_space, gym.spaces.Box): + return self.policy(state)[0, :self.action_space.shape[0]] + raise 
TypeError('Unknown action space') + + @staticmethod + def load(dirname, env): + feature = None + policy = None + q = None + for filename in os.listdir(dirname): + if filename == 'feature.pt': + feature = torch.load(os.path.join(dirname, filename)).to(env.device) + if filename == 'policy.pt': + policy = torch.load(os.path.join(dirname, filename)).to(env.device) + if filename == 'q.pt': + q = torch.load(os.path.join(dirname, filename)).to(env.device) + + agent = GreedyAgent( + env.action_space, + feature=feature, + policy=policy, + q=q, + ) + + if isinstance(env, AtariEnvironment): + agent = DeepmindAtariBody(agent, env) + + return agent diff --git a/all/experiments/writer.py b/all/experiments/writer.py index d3b83c8c..74dc6b4a 100644 --- a/all/experiments/writer.py +++ b/all/experiments/writer.py @@ -5,8 +5,9 @@ from datetime import datetime from tensorboardX import SummaryWriter - class Writer(ABC): + log_dir = 'runs' + @abstractmethod def add_loss(self, name, value, step="frame"): pass @@ -35,13 +36,13 @@ class ExperimentWriter(SummaryWriter, Writer): def __init__(self, agent_name, env_name, loss=True): self.env_name = env_name current_time = str(datetime.now()) - log_dir = os.path.join( + self.log_dir = os.path.join( 'runs', ("%s %s %s" % (agent_name, COMMIT_HASH, current_time)) ) self._frames = 0 self._episodes = 1 self._loss = loss - super().__init__(log_dir=log_dir) + super().__init__(log_dir=self.log_dir) def add_loss(self, name, value, step="frame"): if self._loss: @@ -84,3 +85,8 @@ def get_commit_hash(): COMMIT_HASH = get_commit_hash() + +try: + os.mkdir('runs') +except FileExistsError: + pass diff --git a/all/presets/atari/a2c.py b/all/presets/atari/a2c.py index 9a4217c8..22d025da 100644 --- a/all/presets/atari/a2c.py +++ b/all/presets/atari/a2c.py @@ -37,13 +37,18 @@ def _a2c(envs, writer=DummyWriter()): policy_model.parameters(), alpha=alpha, lr=lr, eps=eps ) - features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad) + features = FeatureNetwork( + feature_model, + feature_optimizer, + clip_grad=clip_grad, + writer=writer + ) v = VNetwork( value_model, value_optimizer, loss_scaling=value_loss_scaling, clip_grad=clip_grad, - writer=writer, + writer=writer ) policy = SoftmaxPolicy( policy_model, @@ -51,7 +56,7 @@ def _a2c(envs, writer=DummyWriter()): env.action_space.n, entropy_loss_scaling=entropy_loss_scaling, clip_grad=clip_grad, - writer=writer, + writer=writer ) return ParallelAtariBody( diff --git a/all/presets/atari/ppo.py b/all/presets/atari/ppo.py index b5c21671..209d01b1 100644 --- a/all/presets/atari/ppo.py +++ b/all/presets/atari/ppo.py @@ -39,7 +39,12 @@ def _ppo(envs, writer=DummyWriter()): value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps) policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps) - features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad) + features = FeatureNetwork( + feature_model, + feature_optimizer, + clip_grad=clip_grad, + writer=writer + ) v = VNetwork( value_model, value_optimizer, diff --git a/all/presets/atari/vac.py b/all/presets/atari/vac.py index e11d12f3..a0ac4aee 100644 --- a/all/presets/atari/vac.py +++ b/all/presets/atari/vac.py @@ -50,7 +50,12 @@ def _vac(envs, writer=DummyWriter()): clip_grad=clip_grad, writer=writer, ) - features = FeatureNetwork(feature_model, feature_optimizer, clip_grad=clip_grad) + features = FeatureNetwork( + feature_model, + feature_optimizer, + clip_grad=clip_grad, + writer=writer + ) return ParallelAtariBody( VAC(features, v, policy, 
gamma=discount_factor), diff --git a/all/presets/atari/vpg.py b/all/presets/atari/vpg.py index 44b3a5bf..6a38f649 100644 --- a/all/presets/atari/vpg.py +++ b/all/presets/atari/vpg.py @@ -48,7 +48,8 @@ def _vpg_atari(env, writer=DummyWriter()): features = FeatureNetwork( feature_model, feature_optimizer, - clip_grad=clip_grad + clip_grad=clip_grad, + writer=writer ) v = VNetwork( value_model, diff --git a/all/presets/classic_control/vpg.py b/all/presets/classic_control/vpg.py index 1edd8b1d..c464d6f6 100644 --- a/all/presets/classic_control/vpg.py +++ b/all/presets/classic_control/vpg.py @@ -26,7 +26,11 @@ def _vpg(env, writer=DummyWriter()): policy_optimizer = Adam(policy_model.parameters(), lr=lr) features = FeatureNetwork( - feature_model, feature_optimizer, clip_grad=clip_grad) + feature_model, + feature_optimizer, + clip_grad=clip_grad, + writer=writer + ) v = VNetwork( value_model, value_optimizer, diff --git a/all/presets/continuous/ddpg.py b/all/presets/continuous/ddpg.py index 9a835efc..d958f759 100644 --- a/all/presets/continuous/ddpg.py +++ b/all/presets/continuous/ddpg.py @@ -59,7 +59,8 @@ def _ddpg(env, writer=DummyWriter()): policy_optimizer, env.action_space, noise, - target=PolyakTarget(polyak_rate) + target=PolyakTarget(polyak_rate), + writer=writer ) replay_buffer = ExperienceReplayBuffer( diff --git a/all/presets/continuous/sac.py b/all/presets/continuous/sac.py index c0298a80..b7f555ff 100644 --- a/all/presets/continuous/sac.py +++ b/all/presets/continuous/sac.py @@ -86,7 +86,8 @@ def _sac(env, writer=DummyWriter()): policy = SoftDeterministicPolicy( policy_model, policy_optimizer, - env.action_space + env.action_space, + writer=writer ) replay_buffer = ExperienceReplayBuffer( diff --git a/scripts/watch_atari.py b/scripts/watch_atari.py new file mode 100644 index 00000000..c7eb0aae --- /dev/null +++ b/scripts/watch_atari.py @@ -0,0 +1,20 @@ +import argparse +from all.environments import AtariEnvironment +from all.experiments import load_and_watch + + +def watch_atari(): + parser = argparse.ArgumentParser(description="Run an Atari benchmark.") + parser.add_argument("env", help="Name of the Atari game (e.g. Pong)") + parser.add_argument("dir", help="Directory where the agent's model was saved.") + parser.add_argument( + "--device", + default="cpu", + help="The name of the device to run the agent on (e.g. cpu, cuda, cuda:0)", + ) + args = parser.parse_args() + env = AtariEnvironment(args.env, device=args.device) + load_and_watch(args.dir, env) + +if __name__ == "__main__": + watch_atari() diff --git a/scripts/watch_classic.py b/scripts/watch_classic.py new file mode 100644 index 00000000..f3140505 --- /dev/null +++ b/scripts/watch_classic.py @@ -0,0 +1,19 @@ +import argparse +from all.environments import GymEnvironment +from all.experiments import load_and_watch + +def watch(): + parser = argparse.ArgumentParser(description="Run an Atari benchmark.") + parser.add_argument("env", help="Name of the environment (e.g. RoboschoolHalfCheetah-v1") + parser.add_argument("dir", help="Directory where the agent's model was saved.") + parser.add_argument( + "--device", + default="cpu", + help="The name of the device to run the agent on (e.g. 
cpu, cuda, cuda:0)", + ) + args = parser.parse_args() + env = GymEnvironment(args.env, device=args.device) + load_and_watch(args.dir, env) + +if __name__ == "__main__": + watch() diff --git a/scripts/watch_continuous.py b/scripts/watch_continuous.py new file mode 100644 index 00000000..18bf0a7a --- /dev/null +++ b/scripts/watch_continuous.py @@ -0,0 +1,5 @@ +import roboschool # registers env +from watch_classic import watch + +if __name__ == "__main__": + watch() From a3b5dfd624100b081d7d41999b9de9e28eab473e Mon Sep 17 00:00:00 2001 From: Chris Nota Date: Mon, 29 Jul 2019 15:43:26 -0400 Subject: [PATCH 4/9] Feature/autotemp (#91) * automatically learn temperature in SAC * make replay buffer convert size to int --- all/agents/sac.py | 20 +++++++++++++++----- all/memory/replay_buffer.py | 2 +- all/presets/continuous/sac.py | 10 ++++++---- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/all/agents/sac.py b/all/agents/sac.py index 31638b18..5c05a095 100644 --- a/all/agents/sac.py +++ b/all/agents/sac.py @@ -9,7 +9,9 @@ def __init__(self, q_2, v, replay_buffer, - entropy_regularizer=0.01, + entropy_target=-2., # usually -action_space.size[0] + temperature_initial=0.1, + lr_temperature=1e-4, discount_factor=0.99, minibatch_size=32, replay_start_size=5000, @@ -28,7 +30,10 @@ def __init__(self, self.update_frequency = update_frequency self.minibatch_size = minibatch_size self.discount_factor = discount_factor - self.entropy_regularizer = entropy_regularizer + # vars for learning the temperature + self.entropy_target = entropy_target + self.temperature = temperature_initial + self.lr_temperature = lr_temperature # data self.env = None self.state = None @@ -62,10 +67,13 @@ def _train(self): v_targets = torch.min( self.q_1.eval(states, _actions), self.q_2.eval(states, _actions), - ) - self.entropy_regularizer * _log_probs + ) - self.temperature * _log_probs + temperature_loss = ((_log_probs + self.entropy_target).detach().mean()) self.writer.add_loss('entropy', -_log_probs.mean()) self.writer.add_loss('v_mean', v_targets.mean()) self.writer.add_loss('r_mean', rewards.mean()) + self.writer.add_loss('temperature_loss', temperature_loss) + self.writer.add_loss('temperature', self.temperature) # update Q-functions q_1_errors = q_targets - self.q_1(states, actions) @@ -79,15 +87,17 @@ def _train(self): # train policy _actions, _log_probs = self.policy(states, log_prob=True) - loss = -( self.q_1(states, _actions, detach=False) - - self.entropy_regularizer * _log_probs + - self.temperature * _log_probs ).mean() loss.backward() self.policy.step() self.q_1.zero_grad() + # adjust temperature + self.temperature += self.lr_temperature * temperature_loss + def _should_train(self): return (self.frames_seen > self.replay_start_size and self.frames_seen % self.update_frequency == 0) diff --git a/all/memory/replay_buffer.py b/all/memory/replay_buffer.py index bfe7b6e2..38848626 100644 --- a/all/memory/replay_buffer.py +++ b/all/memory/replay_buffer.py @@ -24,7 +24,7 @@ def update_priorities(self, indexes, td_errors): class ExperienceReplayBuffer(ReplayBuffer): def __init__(self, size, device=torch.device('cpu')): self.buffer = [] - self.capacity = size + self.capacity = int(size) self.pos = 0 self.device = device diff --git a/all/presets/continuous/sac.py b/all/presets/continuous/sac.py index b7f555ff..403720fa 100644 --- a/all/presets/continuous/sac.py +++ b/all/presets/continuous/sac.py @@ -42,10 +42,11 @@ def fc_policy(env): def sac( lr_q=3e-4, lr_v=3e-4, - lr_pi=1e-4, - entropy_regularizer=0.1, + 
lr_pi=3e-4, + lr_temperature=1e-5, + entropy_target_scaling=1, replay_start_size=5000, - replay_buffer_size=50000, + replay_buffer_size=1e6, minibatch_size=256, discount_factor=0.99, polyak_rate=0.005, @@ -101,7 +102,8 @@ def _sac(env, writer=DummyWriter()): q_2, v, replay_buffer, - entropy_regularizer=entropy_regularizer, + entropy_target=(-env.action_space.shape[0] * entropy_target_scaling), + lr_temperature=lr_temperature, replay_start_size=replay_start_size, discount_factor=discount_factor, update_frequency=update_frequency, From 02ec6c6b4e0b1187ab5967752f77bf2bef06915a Mon Sep 17 00:00:00 2001 From: Chris Nota Date: Tue, 30 Jul 2019 11:55:08 -0400 Subject: [PATCH 5/9] Feature/scheduler (#92) * add linear scheduler * add schedule to classic dqn * Make Agent schedulable * update e-greedy agent presets to use scheduler * add lrscheduler to approximation * fix load_and_watch scripts for pybullet * make watch fps configurable * add scheduler to ppo --- Makefile | 5 ++- all/agents/_agent.py | 3 +- all/agents/ppo.py | 3 +- all/agents/sac.py | 2 +- all/approximation/approximation.py | 12 +++++- all/environments/abstract.py | 2 +- all/environments/gym.py | 4 +- all/experiments/__init__.py | 4 -- all/experiments/experiment.py | 2 +- all/experiments/experiment_test.py | 6 ++- all/experiments/watch.py | 12 ++++-- .../writer.py => logging/__init__.py} | 11 +++++ all/optim/__init__.py | 1 + all/optim/scheduler.py | 42 +++++++++++++++++++ all/optim/scheduler_test.py | 18 ++++++++ all/policies/greedy.py | 27 ++---------- all/presets/atari/a2c.py | 2 +- all/presets/atari/dqn.py | 22 ++++++---- all/presets/atari/ppo.py | 41 +++++++++++++++--- all/presets/atari/rainbow.py | 23 ++++++---- all/presets/atari/vac.py | 2 +- all/presets/atari/vpg.py | 2 +- all/presets/atari/vqn.py | 21 ++++++---- all/presets/atari/vsarsa.py | 21 ++++++---- all/presets/classic_control/a2c.py | 2 +- all/presets/classic_control/dqn.py | 14 +++++-- all/presets/classic_control/ppo.py | 2 +- all/presets/classic_control/rainbow.py | 7 +--- all/presets/classic_control/vac.py | 2 +- all/presets/classic_control/vpg.py | 2 +- all/presets/classic_control/vqn.py | 4 +- all/presets/classic_control/vsarsa.py | 8 +++- all/presets/continuous/ddpg.py | 2 +- all/presets/continuous/sac.py | 2 +- all/presets/validate_agent.py | 2 +- scripts/watch_continuous.py | 4 +- 36 files changed, 237 insertions(+), 102 deletions(-) rename all/{experiments/writer.py => logging/__init__.py} (87%) create mode 100644 all/optim/__init__.py create mode 100644 all/optim/scheduler.py create mode 100644 all/optim/scheduler_test.py diff --git a/Makefile b/Makefile index a929b20e..bb1fea63 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,7 @@ install: - pip install https://download.pytorch.org/whl/cu100/torch-1.0.1.post2-cp37-cp37m-linux_x86_64.whl - pip install torchvision tensorflow + pip install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp37-cp37m-linux_x86_64.whl + pip install https://download.pytorch.org/whl/cu100/torchvision-0.3.0-cp37-cp37m-linux_x86_64.whl + pip install tensorflow pip install -e . lint: diff --git a/all/agents/_agent.py b/all/agents/_agent.py index 9d05280f..5882ca88 100644 --- a/all/agents/_agent.py +++ b/all/agents/_agent.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod +from all.optim import Schedulable -class Agent(ABC): +class Agent(ABC, Schedulable): """ A reinforcement learning agent. 
diff --git a/all/agents/ppo.py b/all/agents/ppo.py index 33a7212d..8115cf68 100644 --- a/all/agents/ppo.py +++ b/all/agents/ppo.py @@ -76,7 +76,8 @@ def _compute_policy_loss(self, pi_0, advantages): def _policy_loss(pi_i): ratios = torch.exp(pi_i - pi_0) surr1 = ratios * advantages - surr2 = torch.clamp(ratios, 1.0 - self._epsilon, 1.0 + self._epsilon) * advantages + epsilon = self._epsilon + surr2 = torch.clamp(ratios, 1.0 - epsilon, 1.0 + epsilon) * advantages return -torch.min(surr1, surr2).mean() return _policy_loss diff --git a/all/agents/sac.py b/all/agents/sac.py index 5c05a095..13723f59 100644 --- a/all/agents/sac.py +++ b/all/agents/sac.py @@ -1,5 +1,5 @@ import torch -from all.experiments.writer import DummyWriter +from all.logging import DummyWriter from ._agent import Agent class SAC(Agent): diff --git a/all/approximation/approximation.py b/all/approximation/approximation.py index b5500d61..1fa72a0d 100644 --- a/all/approximation/approximation.py +++ b/all/approximation/approximation.py @@ -2,7 +2,7 @@ import torch from torch.nn import utils from torch.nn.functional import mse_loss -from all.experiments import DummyWriter +from all.logging import DummyWriter from .target import TrivialTarget from .checkpointer import PeriodicCheckpointer @@ -17,6 +17,7 @@ def __init__( loss_scaling=1, loss=mse_loss, name='approximation', + scheduler=None, target=None, writer=DummyWriter(), checkpointer=None @@ -24,6 +25,7 @@ def __init__( self.model = model self.device = next(model.parameters()).device self._target = target or TrivialTarget() + self._scheduler = scheduler self._target.init(model) self._updates = 0 self._optimizer = optimizer @@ -67,6 +69,9 @@ def step(self): self._optimizer.step() self._optimizer.zero_grad() self._target.update() + if self._scheduler: + self._writer.add_schedule(self._name + '/lr', self._optimizer.param_groups[0]['lr']) + self._scheduler.step() self._checkpointer() def zero_grad(self): @@ -86,3 +91,8 @@ def _dequeue(self, batch_size): items = torch.cat(self._cache[:i]) self._cache = self._cache[i:] return items + +class ConstantLR(): + '''Dummy LRScheduler''' + def step(self): + pass diff --git a/all/environments/abstract.py b/all/environments/abstract.py index 96149823..71c80e1c 100644 --- a/all/environments/abstract.py +++ b/all/environments/abstract.py @@ -52,7 +52,7 @@ def step(self, action): """ @abstractmethod - def render(self): + def render(self, **kwargs): """ Render the current environment state. 
""" diff --git a/all/environments/gym.py b/all/environments/gym.py index 331caa4e..7262400e 100644 --- a/all/environments/gym.py +++ b/all/environments/gym.py @@ -40,8 +40,8 @@ def step(self, action): self._done = done return self._state, self._reward - def render(self): - return self._env.render() + def render(self, **kwargs): + return self._env.render(**kwargs) def close(self): return self._env.close() diff --git a/all/experiments/__init__.py b/all/experiments/__init__.py index 50699b4c..77193d70 100644 --- a/all/experiments/__init__.py +++ b/all/experiments/__init__.py @@ -1,13 +1,9 @@ from .experiment import Experiment from .slurm import SlurmExperiment -from .writer import Writer, ExperimentWriter, DummyWriter from .watch import GreedyAgent, watch, load_and_watch __all__ = [ "Experiment", - "Writer", - "ExperimentWriter", - "DummyWriter", "SlurmExperiment", "GreedyAgent", "watch", diff --git a/all/experiments/experiment.py b/all/experiments/experiment.py index 0578463e..a7419f64 100644 --- a/all/experiments/experiment.py +++ b/all/experiments/experiment.py @@ -1,6 +1,6 @@ import numpy as np +from all.logging import ExperimentWriter from .runner import SingleEnvRunner, ParallelEnvRunner -from .writer import ExperimentWriter class Experiment: def __init__( diff --git a/all/experiments/experiment_test.py b/all/experiments/experiment_test.py index 0ecc7cbb..c2832533 100644 --- a/all/experiments/experiment_test.py +++ b/all/experiments/experiment_test.py @@ -3,7 +3,8 @@ import torch from all.presets.classic_control import dqn, a2c from all.environments import GymEnvironment -from all.experiments import Experiment, Writer +from all.experiments import Experiment +from all.logging import Writer # pylint: disable=protected-access @@ -25,6 +26,9 @@ def add_scalar(self, key, value, step="frame"): def add_loss(self, name, value, step="frame"): pass + def add_schedule(self, name, value, step="frame"): + pass + def add_evaluation(self, name, value, step="frame"): self.add_scalar("evaluation/" + name, value, self._get_step(step)) diff --git a/all/experiments/watch.py b/all/experiments/watch.py index cf9ca2b9..0aeac890 100644 --- a/all/experiments/watch.py +++ b/all/experiments/watch.py @@ -1,14 +1,18 @@ import os +import time import torch import gym from all.agents import Agent from all.bodies import DeepmindAtariBody from all.environments import AtariEnvironment -def watch(agent, env): +def watch(agent, env, fps=60): action = None returns = 0 + # have to call this before initial reset for pybullet envs + env.render(mode="human") while True: + time.sleep(1 / fps) if env.done: print('returns:', returns) env.reset() @@ -19,8 +23,8 @@ def watch(agent, env): action = agent.act(env.state, env.reward) returns += env.reward -def load_and_watch(dir, env): - watch(GreedyAgent.load(dir, env), env) +def load_and_watch(dir, env, fps=60): + watch(GreedyAgent.load(dir, env), env, fps=fps) class GreedyAgent(Agent): def __init__( @@ -40,7 +44,7 @@ def __init__( if not self.policy: raise TypeError('GreedyAgent must have either policy or q function') - def act(self, state, reward): + def act(self, state, _): with torch.no_grad(): if self.feature: state = self.feature(state) diff --git a/all/experiments/writer.py b/all/logging/__init__.py similarity index 87% rename from all/experiments/writer.py rename to all/logging/__init__.py index 74dc6b4a..2467a90f 100644 --- a/all/experiments/writer.py +++ b/all/logging/__init__.py @@ -20,6 +20,10 @@ def add_evaluation(self, name, value, step="frame"): def add_scalar(self, 
name, value, step="frame"): pass + @abstractmethod + def add_schedule(self, name, value, step="frame"): + pass + class DummyWriter(Writer): def add_loss(self, name, value, step="frame"): @@ -31,6 +35,9 @@ def add_evaluation(self, name, value, step="frame"): def add_scalar(self, name, value, step="frame"): pass + def add_schedule(self, name, value, step="frame"): + pass + class ExperimentWriter(SummaryWriter, Writer): def __init__(self, agent_name, env_name, loss=True): @@ -51,6 +58,10 @@ def add_loss(self, name, value, step="frame"): def add_evaluation(self, name, value, step="frame"): self.add_scalar('evaluation/' + name, value, self._get_step(step)) + def add_schedule(self, name, value, step="frame"): + if self._loss: + self.add_scalar('schedule' + '/' + name, value, self._get_step(step)) + def add_scalar(self, name, value, step="frame"): super().add_scalar(self.env_name + "/" + name, value, self._get_step(step)) diff --git a/all/optim/__init__.py b/all/optim/__init__.py new file mode 100644 index 00000000..45e161d3 --- /dev/null +++ b/all/optim/__init__.py @@ -0,0 +1 @@ +from .scheduler import LinearScheduler, Schedulable diff --git a/all/optim/scheduler.py b/all/optim/scheduler.py new file mode 100644 index 00000000..eebb916f --- /dev/null +++ b/all/optim/scheduler.py @@ -0,0 +1,42 @@ +from all.logging import DummyWriter + +class Schedulable: + '''Allow "instance" descriptors to implement parameter scheduling.''' + def __getattribute__(self, name): + value = object.__getattribute__(self, name) + if hasattr(value, '__get__'): + value = value.__get__(self, self.__class__) + return value + + +class LinearScheduler: + def __init__( + self, + initial_value, + final_value, + decay_start, + decay_end, + name='variable', + writer=DummyWriter(), + ): + self._initial_value = initial_value + self._final_value = final_value + self._decay_start = decay_start + self._decay_end = decay_end + self._i = -1 + self._name = name + self._writer = writer + + def __get__(self, instance, owner=None): + result = self._get_value() + self._writer.add_schedule(self._name, result) + return result + + def _get_value(self): + self._i += 1 + if self._i < self._decay_start: + return self._initial_value + if self._i >= self._decay_end: + return self._final_value + alpha = (self._i - self._decay_start) / (self._decay_end - self._decay_start) + return alpha * self._final_value + (1 - alpha) * self._initial_value diff --git a/all/optim/scheduler_test.py b/all/optim/scheduler_test.py new file mode 100644 index 00000000..844d30ee --- /dev/null +++ b/all/optim/scheduler_test.py @@ -0,0 +1,18 @@ +import unittest +import numpy as np +from all.optim import Schedulable, LinearScheduler + +class Obj(Schedulable): + def __init__(self): + self.attr = 0 + +class TestScheduler(unittest.TestCase): + def test_linear_scheduler(self): + obj = Obj() + obj.attr = LinearScheduler(10, 0, 3, 13) + expected = [10, 10, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0] + actual = [obj.attr for _ in expected] + np.testing.assert_allclose(actual, expected) + +if __name__ == '__main__': + unittest.main() diff --git a/all/policies/greedy.py b/all/policies/greedy.py index 3dd4b429..2e694725 100644 --- a/all/policies/greedy.py +++ b/all/policies/greedy.py @@ -1,40 +1,21 @@ import numpy as np import torch +from all.optim import Schedulable -class GreedyPolicy(): +class GreedyPolicy(Schedulable): def __init__( self, q, num_actions, - initial_epsilon=1., - final_epsilon=0.1, - annealing_start=0, - annealing_time=1 + epsilon=0., ): self.q = q self.num_actions = 
num_actions - self.epsilon = initial_epsilon - self.initial_epsilon = initial_epsilon - self.final_epsilon = final_epsilon - self.executions = 0 - self.annealing_start = annealing_start - self.annealing_end = annealing_start + annealing_time - self.annealing_time = annealing_time + self.epsilon = epsilon def __call__(self, state, action=None, prob=False): - self.epsilon = self.anneal() if np.random.rand() < self.epsilon: return torch.randint(self.num_actions, (len(state),), device=self.q.device) with torch.no_grad(): action_scores = self.q.eval(state) return torch.argmax(action_scores, dim=1) - - def anneal(self): - self.executions += 1 - if self.executions < self.annealing_start: - return self.initial_epsilon - if self.executions < self.annealing_end: - alpha = (self.executions - self.annealing_start) / \ - (self.annealing_end - self.annealing_start) - return (1 - alpha) * self.initial_epsilon + alpha * self.final_epsilon - return self.final_epsilon diff --git a/all/presets/atari/a2c.py b/all/presets/atari/a2c.py index 22d025da..7aa1b306 100644 --- a/all/presets/atari/a2c.py +++ b/all/presets/atari/a2c.py @@ -4,7 +4,7 @@ from all.agents import A2C from all.bodies import ParallelAtariBody from all.approximation import VNetwork, FeatureNetwork -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.policies import SoftmaxPolicy from .models import nature_cnn, nature_value_head, nature_policy_head diff --git a/all/presets/atari/dqn.py b/all/presets/atari/dqn.py index a9dc3feb..b9c78d18 100644 --- a/all/presets/atari/dqn.py +++ b/all/presets/atari/dqn.py @@ -5,9 +5,10 @@ from all.approximation import QNetwork, FixedTarget from all.agents import DQN from all.bodies import DeepmindAtariBody -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.policies import GreedyPolicy from all.memory import ExperienceReplayBuffer +from all.optim import LinearScheduler from .models import nature_dqn @@ -50,13 +51,18 @@ def _dqn(env, writer=DummyWriter()): loss=smooth_l1_loss, writer=writer ) - policy = GreedyPolicy(q, - env.action_space.n, - annealing_start=replay_start_size, - annealing_time=final_exploration_frame - replay_start_size, - initial_epsilon=initial_exploration, - final_epsilon=final_exploration - ) + policy = GreedyPolicy( + q, + env.action_space.n, + epsilon=LinearScheduler( + initial_exploration, + final_exploration, + replay_start_size, + final_exploration_frame, + name="epsilon", + writer=writer + ) + ) replay_buffer = ExperienceReplayBuffer( replay_buffer_size, device=device diff --git a/all/presets/atari/ppo.py b/all/presets/atari/ppo.py index 209d01b1..aaa06e7c 100644 --- a/all/presets/atari/ppo.py +++ b/all/presets/atari/ppo.py @@ -1,10 +1,12 @@ # /Users/cpnota/repos/autonomous-learning-library/all/approximation/value/action/torch.py import torch from torch.optim import Adam +from torch.optim.lr_scheduler import CosineAnnealingLR from all.agents import PPO from all.bodies import ParallelAtariBody from all.approximation import VNetwork, FeatureNetwork -from all.experiments import DummyWriter +from all.logging import DummyWriter +from all.optim import LinearScheduler from all.policies import SoftmaxPolicy from .models import nature_cnn, nature_value_head, nature_policy_head @@ -18,14 +20,21 @@ def ppo( eps=1e-5, # Adam stability entropy_loss_scaling=0.01, value_loss_scaling=0.5, - feature_lr_scaling=1, + min_lr_scale=0.1, # Maximum amount to anneal the lr + clip_initial=0.1, + clip_final=0.01, + final_anneal_frame=40e6, # 
Anneal LR and clip until here epochs=4, minibatches=4, - epsilon=0.1, n_envs=8, n_steps=128, device=torch.device("cpu"), ): + # Update epoch * minibatches times per update, + # but we only update once per n_steps, + # with n_envs and 4 frames per step + final_anneal_step = final_anneal_frame * epochs * minibatches / (n_steps * n_envs * 4) + def _ppo(envs, writer=DummyWriter()): env = envs[0] @@ -34,7 +43,7 @@ def _ppo(envs, writer=DummyWriter()): feature_model = nature_cnn().to(device) feature_optimizer = Adam( - feature_model.parameters(), lr=lr * feature_lr_scaling, eps=eps + feature_model.parameters(), lr=lr, eps=eps ) value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps) policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps) @@ -43,6 +52,11 @@ def _ppo(envs, writer=DummyWriter()): feature_model, feature_optimizer, clip_grad=clip_grad, + scheduler=CosineAnnealingLR( + feature_optimizer, + final_anneal_step, + eta_min=lr * min_lr_scale + ), writer=writer ) v = VNetwork( @@ -51,6 +65,11 @@ def _ppo(envs, writer=DummyWriter()): loss_scaling=value_loss_scaling, clip_grad=clip_grad, writer=writer, + scheduler=CosineAnnealingLR( + value_optimizer, + final_anneal_step, + eta_min=lr * min_lr_scale + ), ) policy = SoftmaxPolicy( policy_model, @@ -59,6 +78,11 @@ def _ppo(envs, writer=DummyWriter()): entropy_loss_scaling=entropy_loss_scaling, clip_grad=clip_grad, writer=writer, + scheduler=CosineAnnealingLR( + policy_optimizer, + final_anneal_step, + eta_min=lr * min_lr_scale + ), ) return ParallelAtariBody( @@ -66,7 +90,14 @@ def _ppo(envs, writer=DummyWriter()): features, v, policy, - epsilon=epsilon, + epsilon=LinearScheduler( + clip_initial, + clip_final, + 0, + final_anneal_step, + name='clip', + writer=writer + ), epochs=epochs, minibatches=minibatches, n_envs=n_envs, diff --git a/all/presets/atari/rainbow.py b/all/presets/atari/rainbow.py index 4facde9a..c2961c18 100644 --- a/all/presets/atari/rainbow.py +++ b/all/presets/atari/rainbow.py @@ -5,8 +5,9 @@ from all.approximation import QNetwork, FixedTarget from all.agents import DQN from all.bodies import DeepmindAtariBody -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.memory import PrioritizedReplayBuffer +from all.optim import LinearScheduler from all.policies import GreedyPolicy from .models import nature_ddqn @@ -44,7 +45,6 @@ def rainbow( 4. NoisyNets 5. Multi-step Learning 6. Distributional RL - 7. 
Double Q-Learning ''' # counted by number of updates rather than number of frame final_exploration_frame /= action_repeat @@ -66,13 +66,18 @@ def _rainbow(env, writer=DummyWriter()): loss=smooth_l1_loss, writer=writer ) - policy = GreedyPolicy(q, - env.action_space.n, - annealing_start=replay_start_size, - annealing_time=final_exploration_frame - replay_start_size, - initial_epsilon=initial_exploration, - final_epsilon=final_exploration - ) + policy = GreedyPolicy( + q, + env.action_space.n, + epsilon=LinearScheduler( + initial_exploration, + final_exploration, + replay_start_size, + final_exploration_frame, + name="epsilon", + writer=writer + ) + ) replay_buffer = PrioritizedReplayBuffer( replay_buffer_size, alpha=alpha, diff --git a/all/presets/atari/vac.py b/all/presets/atari/vac.py index a0ac4aee..3b5c7299 100644 --- a/all/presets/atari/vac.py +++ b/all/presets/atari/vac.py @@ -3,7 +3,7 @@ from all.agents import VAC from all.approximation import VNetwork, FeatureNetwork from all.bodies import ParallelAtariBody -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.policies import SoftmaxPolicy from .models import nature_cnn, nature_value_head, nature_policy_head diff --git a/all/presets/atari/vpg.py b/all/presets/atari/vpg.py index 6a38f649..4b6a4fa0 100644 --- a/all/presets/atari/vpg.py +++ b/all/presets/atari/vpg.py @@ -3,7 +3,7 @@ from all.agents import VPG from all.approximation import VNetwork, FeatureNetwork from all.bodies import DeepmindAtariBody -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.policies import SoftmaxPolicy from .models import nature_cnn, nature_value_head, nature_policy_head diff --git a/all/presets/atari/vqn.py b/all/presets/atari/vqn.py index b25006bd..00f62fdd 100644 --- a/all/presets/atari/vqn.py +++ b/all/presets/atari/vqn.py @@ -5,7 +5,8 @@ from all.approximation import QNetwork from all.agents import VQN from all.bodies import ParallelAtariBody -from all.experiments import DummyWriter +from all.logging import DummyWriter +from all.optim import LinearScheduler from all.policies import GreedyPolicy from .models import nature_ddqn @@ -35,12 +36,18 @@ def _vqn(envs, writer=DummyWriter()): loss=smooth_l1_loss, writer=writer ) - policy = GreedyPolicy(q, - env.action_space.n, - annealing_time=final_exploration_frame, - initial_epsilon=initial_exploration, - final_epsilon=final_exploration - ) + policy = GreedyPolicy( + q, + env.action_space.n, + epsilon=LinearScheduler( + initial_exploration, + final_exploration, + 0, + final_exploration_frame, + name="epsilon", + writer=writer + ) + ) return ParallelAtariBody( VQN(q, policy, gamma=discount_factor), envs, diff --git a/all/presets/atari/vsarsa.py b/all/presets/atari/vsarsa.py index 7bd51de8..c9881693 100644 --- a/all/presets/atari/vsarsa.py +++ b/all/presets/atari/vsarsa.py @@ -5,7 +5,8 @@ from all.approximation import QNetwork from all.agents import VSarsa from all.bodies import ParallelAtariBody -from all.experiments import DummyWriter +from all.logging import DummyWriter +from all.optim import LinearScheduler from all.policies import GreedyPolicy from .models import nature_ddqn @@ -35,12 +36,18 @@ def _vsarsa(envs, writer=DummyWriter()): loss=smooth_l1_loss, writer=writer ) - policy = GreedyPolicy(q, - env.action_space.n, - annealing_time=final_exploration_frame, - initial_epsilon=initial_exploration, - final_epsilon=final_exploration - ) + policy = GreedyPolicy( + q, + env.action_space.n, + epsilon=LinearScheduler( + initial_exploration, 
+ final_exploration, + 0, + final_exploration_frame, + name="epsilon", + writer=writer + ) + ) return ParallelAtariBody( VSarsa(q, policy, gamma=discount_factor), envs, diff --git a/all/presets/classic_control/a2c.py b/all/presets/classic_control/a2c.py index b9d6562c..1be8454e 100644 --- a/all/presets/classic_control/a2c.py +++ b/all/presets/classic_control/a2c.py @@ -3,7 +3,7 @@ from torch.optim import Adam from all.agents import A2C from all.approximation import VNetwork, FeatureNetwork -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.policies import SoftmaxPolicy from .models import fc_relu_features, fc_policy_head, fc_value_head diff --git a/all/presets/classic_control/dqn.py b/all/presets/classic_control/dqn.py index a9265ed6..42cc4a55 100644 --- a/all/presets/classic_control/dqn.py +++ b/all/presets/classic_control/dqn.py @@ -4,8 +4,9 @@ from torch.nn.functional import mse_loss from all.agents import DQN from all.approximation import QNetwork, FixedTarget -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.memory import ExperienceReplayBuffer +from all.optim import LinearScheduler from all.policies import GreedyPolicy from .models import fc_relu_q @@ -36,9 +37,14 @@ def _dqn(env, writer=DummyWriter()): policy = GreedyPolicy( q, env.action_space.n, - initial_epsilon=initial_exploration, - final_epsilon=final_exploration, - annealing_time=final_exploration_frame + epsilon=LinearScheduler( + initial_exploration, + final_exploration, + replay_start_size, + final_exploration_frame, + name="epsilon", + writer=writer + ) ) replay_buffer = ExperienceReplayBuffer( replay_buffer_size, device=device) diff --git a/all/presets/classic_control/ppo.py b/all/presets/classic_control/ppo.py index d65d87d0..f80eb864 100644 --- a/all/presets/classic_control/ppo.py +++ b/all/presets/classic_control/ppo.py @@ -3,7 +3,7 @@ from torch.optim import Adam from all.agents import PPO from all.approximation import VNetwork, FeatureNetwork -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.policies import SoftmaxPolicy from .models import fc_relu_features, fc_policy_head, fc_value_head diff --git a/all/presets/classic_control/rainbow.py b/all/presets/classic_control/rainbow.py index 45cbaeab..fcfda5a8 100644 --- a/all/presets/classic_control/rainbow.py +++ b/all/presets/classic_control/rainbow.py @@ -5,7 +5,7 @@ from all import nn from all.agents import DQN from all.approximation import QNetwork, FixedTarget -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.memory import PrioritizedReplayBuffer from all.policies import GreedyPolicy @@ -74,10 +74,7 @@ def _rainbow(env, writer=DummyWriter()): policy = GreedyPolicy( q, env.action_space.n, - initial_epsilon=1, - final_epsilon=0, - annealing_start=replay_start_size, - annealing_time=1 + epsilon=0 ) # replay_buffer = ExperienceReplayBuffer(replay_buffer_size) replay_buffer = PrioritizedReplayBuffer( diff --git a/all/presets/classic_control/vac.py b/all/presets/classic_control/vac.py index 3c8741f7..b8f716d8 100644 --- a/all/presets/classic_control/vac.py +++ b/all/presets/classic_control/vac.py @@ -3,7 +3,7 @@ from torch.optim import RMSprop from all.agents import VAC from all.approximation import VNetwork, FeatureNetwork -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.policies import SoftmaxPolicy from .models import fc_relu_features, fc_policy_head, fc_value_head diff --git 
a/all/presets/classic_control/vpg.py b/all/presets/classic_control/vpg.py index c464d6f6..475c8653 100644 --- a/all/presets/classic_control/vpg.py +++ b/all/presets/classic_control/vpg.py @@ -3,7 +3,7 @@ from torch.optim import Adam from all.agents import VPG from all.approximation import VNetwork, FeatureNetwork -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.policies import SoftmaxPolicy from .models import fc_relu_features, fc_policy_head, fc_value_head diff --git a/all/presets/classic_control/vqn.py b/all/presets/classic_control/vqn.py index b12631bb..c6591b14 100644 --- a/all/presets/classic_control/vqn.py +++ b/all/presets/classic_control/vqn.py @@ -4,7 +4,7 @@ from all.agents import VQN from all.approximation import QNetwork from all.policies import GreedyPolicy -from all.experiments import DummyWriter +from all.logging import DummyWriter from .models import fc_relu_q def vqn( @@ -21,7 +21,7 @@ def _vqn(envs, writer=DummyWriter()): model = fc_relu_q(env).to(device) optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps) q = QNetwork(model, optimizer, env.action_space.n, writer=writer) - policy = GreedyPolicy(q, env.action_space.n, annealing_time=1, final_epsilon=epsilon) + policy = GreedyPolicy(q, env.action_space.n, epsilon=epsilon) return VQN(q, policy, gamma=gamma) return _vqn, n_envs \ No newline at end of file diff --git a/all/presets/classic_control/vsarsa.py b/all/presets/classic_control/vsarsa.py index 206e0ae1..399708cb 100644 --- a/all/presets/classic_control/vsarsa.py +++ b/all/presets/classic_control/vsarsa.py @@ -4,7 +4,7 @@ from all.agents import VSarsa from all.approximation import QNetwork from all.policies import GreedyPolicy -from all.experiments import DummyWriter +from all.logging import DummyWriter from .models import fc_relu_q def vsarsa( @@ -21,7 +21,11 @@ def _vsarsa(envs, writer=DummyWriter()): model = fc_relu_q(env).to(device) optimizer = RMSprop(model.parameters(), lr=lr, alpha=alpha, eps=eps) q = QNetwork(model, optimizer, env.action_space.n, writer=writer) - policy = GreedyPolicy(q, env.action_space.n, annealing_time=1, final_epsilon=epsilon) + policy = GreedyPolicy( + q, + env.action_space.n, + epsilon=epsilon + ) return VSarsa(q, policy, gamma=gamma) return _vsarsa, n_envs \ No newline at end of file diff --git a/all/presets/continuous/ddpg.py b/all/presets/continuous/ddpg.py index d958f759..37673b3f 100644 --- a/all/presets/continuous/ddpg.py +++ b/all/presets/continuous/ddpg.py @@ -4,7 +4,7 @@ from all import nn from all.agents import DDPG from all.approximation import QContinuous, PolyakTarget -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.policies import DeterministicPolicy from all.memory import ExperienceReplayBuffer diff --git a/all/presets/continuous/sac.py b/all/presets/continuous/sac.py index 403720fa..3cd59ead 100644 --- a/all/presets/continuous/sac.py +++ b/all/presets/continuous/sac.py @@ -4,7 +4,7 @@ from all import nn from all.agents import SAC from all.approximation import QContinuous, PolyakTarget, VNetwork -from all.experiments import DummyWriter +from all.logging import DummyWriter from all.policies.soft_deterministic import SoftDeterministicPolicy from all.memory import ExperienceReplayBuffer diff --git a/all/presets/validate_agent.py b/all/presets/validate_agent.py index 5a6b5399..8bda66d6 100644 --- a/all/presets/validate_agent.py +++ b/all/presets/validate_agent.py @@ -1,6 +1,6 @@ import torch from all.environments import State -from 
all.experiments import DummyWriter +from all.logging import DummyWriter def validate_agent(make_agent, env): if isinstance(make_agent, tuple): diff --git a/scripts/watch_continuous.py b/scripts/watch_continuous.py index 18bf0a7a..40fdb3ab 100644 --- a/scripts/watch_continuous.py +++ b/scripts/watch_continuous.py @@ -1,4 +1,6 @@ -import roboschool # registers env +import gym +import pybullet_envs +import roboschool from watch_classic import watch if __name__ == "__main__": From 38c83dc9eb4e7485ae8964f0e303ecd783684265 Mon Sep 17 00:00:00 2001 From: Chris Nota Date: Tue, 30 Jul 2019 17:45:49 -0400 Subject: [PATCH 6/9] Feature/target (#94) --- all/agents/ddpg.py | 2 +- all/agents/dqn.py | 2 +- all/agents/sac.py | 9 ++++----- all/agents/vac.py | 2 +- all/agents/vqn.py | 2 +- all/agents/vsarsa.py | 2 +- all/approximation/approximation.py | 14 ++++++++++++++ all/approximation/q_network_test.py | 10 +++++----- all/policies/deterministic.py | 3 --- all/policies/deterministic_test.py | 4 ++-- all/policies/soft_deterministic.py | 6 +++++- scripts/watch_continuous.py | 1 + 12 files changed, 36 insertions(+), 21 deletions(-) diff --git a/all/agents/ddpg.py b/all/agents/ddpg.py index 00587f2b..c6f38091 100644 --- a/all/agents/ddpg.py +++ b/all/agents/ddpg.py @@ -46,7 +46,7 @@ def _train(self): # train q function td_errors = ( rewards + - self.discount_factor * self.q.eval(next_states, self.policy.eval(next_states)) - + self.discount_factor * self.q.target(next_states, self.policy.target(next_states)) - self.q(states, torch.cat(actions)) ) self.q.reinforce(weights * td_errors) diff --git a/all/agents/dqn.py b/all/agents/dqn.py index 65c49c58..1347407a 100644 --- a/all/agents/dqn.py +++ b/all/agents/dqn.py @@ -45,7 +45,7 @@ def _train(self): self.minibatch_size) td_errors = ( rewards + - self.discount_factor * torch.max(self.q.eval(next_states), dim=1)[0] - + self.discount_factor * torch.max(self.q.target(next_states), dim=1)[0] - self.q(states, actions) ) self.q.reinforce(weights * td_errors) diff --git a/all/agents/sac.py b/all/agents/sac.py index 13723f59..0973d909 100644 --- a/all/agents/sac.py +++ b/all/agents/sac.py @@ -44,8 +44,7 @@ def act(self, state, reward): self._store_transition(state, reward) self._train() self.state = state - with torch.no_grad(): - self.action = self.policy(state) + self.action = self.policy.eval(state) return self.action def _store_transition(self, state, reward): @@ -63,10 +62,10 @@ def _train(self): # compute targets for Q and V with torch.no_grad(): _actions, _log_probs = self.policy(states, log_prob=True) - q_targets = rewards + self.discount_factor * self.v.eval(next_states) + q_targets = rewards + self.discount_factor * self.v.target(next_states) v_targets = torch.min( - self.q_1.eval(states, _actions), - self.q_2.eval(states, _actions), + self.q_1.target(states, _actions), + self.q_2.target(states, _actions), ) - self.temperature * _log_probs temperature_loss = ((_log_probs + self.entropy_target).detach().mean()) self.writer.add_loss('entropy', -_log_probs.mean()) diff --git a/all/agents/vac.py b/all/agents/vac.py index 43bc4d86..7e1258bd 100644 --- a/all/agents/vac.py +++ b/all/agents/vac.py @@ -13,7 +13,7 @@ def act(self, state, reward): if self._previous_features: td_error = ( reward - + self.gamma * self.v.eval(self.features.eval(state)) + + self.gamma * self.v.target(self.features.eval(state)) - self.v(self._previous_features) ) self.v.reinforce(td_error) diff --git a/all/agents/vqn.py b/all/agents/vqn.py index e9d5b190..f6417353 100644 --- 
a/all/agents/vqn.py +++ b/all/agents/vqn.py @@ -17,7 +17,7 @@ def act(self, state, reward): if self.previous_state: td_error = ( reward - + self.gamma * torch.max(self.q.eval(state), dim=1)[0] + + self.gamma * torch.max(self.q.target(state), dim=1)[0] - self.q(self.previous_state, self.previous_action) ) self.q.reinforce(td_error) diff --git a/all/agents/vsarsa.py b/all/agents/vsarsa.py index 1c7b934d..1e162398 100644 --- a/all/agents/vsarsa.py +++ b/all/agents/vsarsa.py @@ -16,7 +16,7 @@ def act(self, state, reward): if self.previous_state: td_error = ( reward - + self.gamma * self.q.eval(state, action) + + self.gamma * self.q.target(state, action) - self.q(self.previous_state, self.previous_action) ) self.q.reinforce(td_error) diff --git a/all/approximation/approximation.py b/all/approximation/approximation.py index 1fa72a0d..2825195c 100644 --- a/all/approximation/approximation.py +++ b/all/approximation/approximation.py @@ -45,6 +45,12 @@ def __init__( ) def __call__(self, *inputs, detach=True): + ''' + Run a forward pass of the model. + + If detach=True, the computation graph is cached and the result is detached. + If detach=False, nothing is cached and instead returns the attached result. + ''' result = self.model(*inputs) if detach: self._enqueue(result) @@ -52,9 +58,16 @@ def __call__(self, *inputs, detach=True): return result def eval(self, *inputs): + '''Run a forward pass of the model in no_grad mode.''' + with torch.no_grad(): + return self.model(*inputs) + + def target(self, *inputs): + '''Run a forward pass of the target network.''' return self._target(*inputs) def reinforce(self, errors, retain_graph=False): + '''Update the model using the cache and the errors passed in.''' batch_size = len(errors) cache = self._dequeue(batch_size) if cache.requires_grad: @@ -64,6 +77,7 @@ def reinforce(self, errors, retain_graph=False): self.step() def step(self): + '''Given that a bakcward pass has been made, run an optimization step.''' if self._clip_grad != 0: utils.clip_grad_norm_(self.model.parameters(), self._clip_grad) self._optimizer.step() diff --git a/all/approximation/q_network_test.py b/all/approximation/q_network_test.py index 0b060d31..c7df0320 100644 --- a/all/approximation/q_network_test.py +++ b/all/approximation/q_network_test.py @@ -63,31 +63,31 @@ def test_target_net(self): errors = torch.tensor([-1.]) policy_value = q(inputs).item() - target_value = q.eval(inputs).item() + target_value = q.target(inputs).item() np.testing.assert_equal(policy_value, -0.008584141731262207) np.testing.assert_equal(target_value, -0.008584141731262207) q.reinforce(errors) policy_value = q(inputs).item() - target_value = q.eval(inputs).item() + target_value = q.target(inputs).item() np.testing.assert_equal(policy_value, -0.20858412981033325) np.testing.assert_equal(target_value, -0.008584141731262207) q.reinforce(errors) policy_value = q(inputs).item() - target_value = q.eval(inputs).item() + target_value = q.target(inputs).item() np.testing.assert_equal(policy_value, -0.4085841178894043) np.testing.assert_equal(target_value, -0.008584141731262207) q.reinforce(errors) policy_value = q(inputs).item() - target_value = q.eval(inputs).item() + target_value = q.target(inputs).item() np.testing.assert_equal(policy_value, -0.6085841655731201) np.testing.assert_equal(target_value, -0.6085841655731201) q.reinforce(errors) policy_value = q(inputs).item() - target_value = q.eval(inputs).item() + target_value = q.target(inputs).item() np.testing.assert_equal(policy_value, -0.8085841536521912) 
np.testing.assert_equal(target_value, -0.6085841655731201) diff --git a/all/policies/deterministic.py b/all/policies/deterministic.py index 5a8ee6b1..6f0db85b 100644 --- a/all/policies/deterministic.py +++ b/all/policies/deterministic.py @@ -36,9 +36,6 @@ def __call__(self, state, action=None, prob=None): def greedy(self, state): return self.model(state) - def eval(self, state): - return self._target(state) - def reinforce(self, _): raise NotImplementedError( 'Deterministic policies are trainted through backpropagation.' + diff --git a/all/policies/deterministic_test.py b/all/policies/deterministic_test.py index 7d138614..d1843dcb 100644 --- a/all/policies/deterministic_test.py +++ b/all/policies/deterministic_test.py @@ -80,12 +80,12 @@ def test_target(self): # run update step, make sure target network doesn't change action.sum().backward(retain_graph=True) self.policy.step() - tt.assert_equal(self.policy.eval(state), torch.zeros(1, ACTION_DIM)) + tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM)) # again... action.sum().backward(retain_graph=True) self.policy.step() - tt.assert_equal(self.policy.eval(state), torch.zeros(1, ACTION_DIM)) + tt.assert_equal(self.policy.target(state), torch.zeros(1, ACTION_DIM)) # third time, target should be updated action.sum().backward(retain_graph=True) diff --git a/all/policies/soft_deterministic.py b/all/policies/soft_deterministic.py index 98060ffe..ea22f282 100644 --- a/all/policies/soft_deterministic.py +++ b/all/policies/soft_deterministic.py @@ -43,7 +43,11 @@ def greedy(self, state): return self._squash(self.model(state)[:, 0:self._action_dim]) def eval(self, state): - return self._squash(self._target(state)[:, 0:self._action_dim]) + with torch.no_grad(): + return self(state) + + def target(self, state): + self._target(state) def reinforce(self, _): raise NotImplementedError( diff --git a/scripts/watch_continuous.py b/scripts/watch_continuous.py index 40fdb3ab..0e7e2483 100644 --- a/scripts/watch_continuous.py +++ b/scripts/watch_continuous.py @@ -1,6 +1,7 @@ import gym import pybullet_envs import roboschool +import pybulletgym from watch_classic import watch if __name__ == "__main__": From 65a7f73e4f714482badcaebfb57f45ed8fdaf1a3 Mon Sep 17 00:00:00 2001 From: Chris Nota Date: Tue, 30 Jul 2019 21:03:07 -0400 Subject: [PATCH 7/9] Feature/advantage (#95) * make calls to target network explicit * tweak a2c classic control preset * reimplement advantage buffer and a2c * update generalized buffer * remove rewards from buffer.advantages(states) * fix way a2c buffers states and actions * update ppo to compute advantage correctly --- all/agents/a2c.py | 30 ++++++++++------- all/agents/ppo.py | 26 +++++++------- all/memory/advantage.py | 27 ++++++++------- all/memory/advantage_test.py | 13 +++---- all/memory/generalized_advantage.py | 43 +++++------------------- all/memory/generalized_advantage_test.py | 11 +++--- all/presets/classic_control/a2c.py | 6 ++-- 7 files changed, 67 insertions(+), 89 deletions(-) diff --git a/all/agents/a2c.py b/all/agents/a2c.py index 77532477..74d8eafd 100644 --- a/all/agents/a2c.py +++ b/all/agents/a2c.py @@ -1,5 +1,3 @@ -import torch -from all.environments import State from all.memory import NStepAdvantageBuffer from ._agent import Agent @@ -22,26 +20,34 @@ def __init__( self.n_envs = n_envs self.n_steps = n_steps self.discount_factor = discount_factor + self._states = None + self._actions = None self._batch_size = n_envs * n_steps self._buffer = self._make_buffer() self._features = [] def act(self, 
states, rewards): - self._buffer.store(states, torch.zeros(self.n_envs), rewards) - self._train() - features = self.features(states) - self._features.append(features) - return self.policy(features) + self._store_transitions(rewards) + self._train(states) + self._states = states + self._actions = self.policy.eval(self.features.eval(states)) + return self._actions - def _train(self): + def _store_transitions(self, rewards): + if self._states: + self._buffer.store(self._states, self._actions, rewards) + + def _train(self, states): if len(self._buffer) >= self._batch_size: - states = State.from_list(self._features) - _, _, advantages = self._buffer.sample(self._batch_size) - self.v(states) + states, actions, advantages = self._buffer.advantages(states) + # forward pass + features = self.features(states) + self.v(features) + self.policy(features, actions) + # backward pass self.v.reinforce(advantages) self.policy.reinforce(advantages) self.features.reinforce() - self._features = [] def _make_buffer(self): return NStepAdvantageBuffer( diff --git a/all/agents/ppo.py b/all/agents/ppo.py index 8115cf68..5fc11481 100644 --- a/all/agents/ppo.py +++ b/all/agents/ppo.py @@ -26,6 +26,8 @@ def __init__( self.n_steps = n_steps self.discount_factor = discount_factor self.lam = lam + self._states = None + self._actions = None self._epsilon = epsilon self._epochs = epochs self._batch_size = n_envs * n_steps @@ -34,14 +36,19 @@ def __init__( self._features = [] def act(self, states, rewards): - self._train() - actions = self.policy.eval(self.features.eval(states)) - self._buffer.store(states, actions, rewards) - return actions + self._store_transitions(rewards) + self._train(states) + self._states = states + self._actions = self.policy.eval(self.features.eval(states)) + return self._actions - def _train(self): + def _store_transitions(self, rewards): + if self._states: + self._buffer.store(self._states, self._actions, rewards) + + def _train(self, _states): if len(self._buffer) >= self._batch_size: - states, actions, advantages = self._buffer.sample(self._batch_size) + states, actions, advantages = self._buffer.advantages(_states) with torch.no_grad(): features = self.features.eval(states) pi_0 = self.policy.eval(features, actions) @@ -65,13 +72,6 @@ def _train_minibatch(self, states, actions, pi_0, advantages, targets): self.v.reinforce(targets - self.v(features)) self.features.reinforce() - def _compute_targets(self, returns, next_states, lengths): - return ( - returns + - (self.discount_factor ** lengths) - * self.v.eval(self.features.eval(next_states)) - ) - def _compute_policy_loss(self, pi_0, advantages): def _policy_loss(pi_i): ratios = torch.exp(pi_i - pi_0) diff --git a/all/memory/advantage.py b/all/memory/advantage.py index 2aa07ecd..097b9002 100644 --- a/all/memory/advantage.py +++ b/all/memory/advantage.py @@ -13,30 +13,29 @@ def __init__(self, v, features, n_steps, n_envs, discount_factor=1): self._rewards = [] def __len__(self): - if not self._states: - return 0 - return (len(self._states) - 1) * self.n_envs + return len(self._states) * self.n_envs def store(self, states, actions, rewards): if not self._states: self._states = [states] self._actions = [actions] self._rewards = [rewards] - elif len(self._states) <= self.n_steps: + elif len(self._states) < self.n_steps: self._states.append(states) self._actions.append(actions) self._rewards.append(rewards) else: raise Exception("Buffer length exceeded: " + str(self.n_steps)) - def sample(self, _): + def advantages(self, states): if len(self) < 
self.n_steps * self.n_envs: raise Exception("Not enough states received!") + self._states.append(states) rewards, lengths = self._compute_returns() states, actions, next_states = self._summarize_transitions() advantages = self._compute_advantages(states, rewards, next_states, lengths) - self._update_buffers() + self._clear_buffers() return ( states, @@ -90,17 +89,21 @@ def _summarize_transitions(self): if not state.mask: next_state = state - return State.from_list(sample_states), sample_actions, State.from_list(sample_next_states) + return ( + State.from_list(sample_states), + torch.stack(sample_actions), + State.from_list(sample_next_states) + ) def _compute_advantages(self, states, rewards, next_states, lengths): return ( rewards.view(-1) + (self.gamma ** lengths.view(-1)) - * self.v.eval(self.features.eval(next_states)) + * self.v.target(self.features.target(next_states)) - self.v.eval(self.features.eval(states)) ) - def _update_buffers(self): - self._states = self._states[self.n_steps:] - self._actions = self._actions[self.n_steps:] - self._rewards = self._rewards[self.n_steps:] + def _clear_buffers(self): + self._states = [] + self._actions = [] + self._rewards = [] diff --git a/all/memory/advantage_test.py b/all/memory/advantage_test.py index 06fddb65..9dafbecd 100644 --- a/all/memory/advantage_test.py +++ b/all/memory/advantage_test.py @@ -26,8 +26,7 @@ def test_rollout(self): states = State(torch.arange(0, 12).unsqueeze(1)) buffer.store(states[0:3], actions, torch.zeros(3)) buffer.store(states[3:6], actions, torch.ones(3)) - buffer.store(states[6:9], actions, 4 * torch.ones(3)) - states, _, advantages = buffer.sample(-1) + states, _, advantages = buffer.advantages(states[6:9]) expected_states = State(torch.arange(0, 6).unsqueeze(1)) expected_next_states = State( @@ -59,8 +58,7 @@ def test_rollout_with_nones(self): buffer.store(states[0:3], actions, torch.zeros(3)) buffer.store(states[3:6], actions, torch.ones(3)) buffer.store(states[6:9], actions, 2 * torch.ones(3)) - buffer.store(states[9:12], actions, 4 * torch.ones(3)) - states, actions, advantages = buffer.sample(-1) + states, actions, advantages = buffer.advantages(states[9:12]) expected_states = State(torch.arange(0, 9).unsqueeze(1), done[0:9]) expected_next_done = torch.zeros(9) @@ -94,9 +92,8 @@ def test_multi_rollout(self): actions = torch.ones((2)) buffer.store(raw_states[0:2], actions, torch.ones(2)) buffer.store(raw_states[2:4], actions, torch.ones(2)) - buffer.store(raw_states[4:6], actions, torch.ones(2)) - states, actions, advantages = buffer.sample(-1) + states, actions, advantages = buffer.advantages(raw_states[4:6]) expected_states = State(torch.arange(0, 4).unsqueeze(1)) expected_returns = torch.tensor([1.5, 1.5, 1, 1]) expected_next_states = State(torch.tensor([4, 5, 4, 5]).unsqueeze(1)) @@ -109,10 +106,10 @@ def test_multi_rollout(self): expected_lengths )) + buffer.store(raw_states[4:6], actions, torch.ones(2)) buffer.store(raw_states[6:8], actions, torch.ones(2)) - buffer.store(raw_states[8:10], actions, torch.ones(2)) - states, actions, advantages = buffer.sample(-1) + states, actions, advantages = buffer.advantages(raw_states[8:10]) expected_states = State(torch.arange(4, 8).unsqueeze(1)) self.assert_states_equal(states, expected_states) tt.assert_allclose(advantages, self._compute_expected_advantages( diff --git a/all/memory/generalized_advantage.py b/all/memory/generalized_advantage.py index 402027ff..0da7eca7 100644 --- a/all/memory/generalized_advantage.py +++ b/all/memory/generalized_advantage.py @@ 
-23,9 +23,7 @@ def __init__( self._rewards = [] def __len__(self): - if not self._states: - return 0 - return (len(self._states) - 1) * self.n_envs + return len(self._states) * self.n_envs def store(self, states, actions, rewards): if not self._states: @@ -39,20 +37,20 @@ def store(self, states, actions, rewards): else: raise Exception("Buffer length exceeded: " + str(self.n_steps)) - def sample(self, _): + def advantages(self, states): if len(self) < self._batch_size: raise Exception("Not enough states received!") - # states, actions, = self._summarize_transitions() + self._states.append(states) states = State.from_list(self._states[0:self.n_steps + 1]) actions = torch.cat(self._actions[:self.n_steps], dim=0) rewards = torch.stack(self._rewards[:self.n_steps]).view(self.n_steps, -1) - _values = self.v.eval(self.features.eval(states)).view((self.n_steps + 1, -1)) + _values = self.v.target(self.features.target(states)).view((self.n_steps + 1, -1)) values = _values[0:self.n_steps] next_values = _values[1:self.n_steps + 1] td_errors = rewards + self.gamma * next_values - values advantages = self._compute_advantages(td_errors) - self._update_buffers() + self._clear_buffers() return ( states[0:self._batch_size], @@ -74,30 +72,7 @@ def _compute_advantages(self, td_errors): return advantages.view(-1) - def _summarize_transitions(self): - sample_n = self.n_envs * self.n_steps - sample_states = [None] * sample_n - sample_actions = [None] * sample_n - sample_next_states = [None] * sample_n - - for e in range(self.n_envs): - next_state = self._states[self.n_steps][e] - for i in range(self.n_steps): - t = self.n_steps - 1 - i - idx = t * self.n_envs + e - state = self._states[t][e] - action = self._actions[t][e] - - sample_states[idx] = state - sample_actions[idx] = action - sample_next_states[idx] = next_state - - if not state.mask: - next_state = state - - return State.from_list(sample_states), sample_actions, State.from_list(sample_next_states) - - def _update_buffers(self): - self._states = self._states[self.n_steps:] - self._actions = self._actions[self.n_steps:] - self._rewards = self._rewards[self.n_steps:] + def _clear_buffers(self): + self._states = [] + self._actions = [] + self._rewards = [] diff --git a/all/memory/generalized_advantage_test.py b/all/memory/generalized_advantage_test.py index b4bc88d2..bbad2eeb 100644 --- a/all/memory/generalized_advantage_test.py +++ b/all/memory/generalized_advantage_test.py @@ -34,7 +34,6 @@ def test_simple(self): rewards = torch.tensor([1., 2, 4]) buffer.store(states[0], actions, rewards[0]) buffer.store(states[1], actions, rewards[1]) - buffer.store(states[2], actions, rewards[2]) values = self.v.eval(self.features.eval(states)) tt.assert_almost_equal(values, torch.tensor([0.1826, -0.3476, -0.8777]), decimal=3) @@ -49,11 +48,11 @@ def test_simple(self): advantages[1] = td_errors[1] tt.assert_almost_equal(advantages, torch.tensor([1.121, 1.909]), decimal=3) - _states, _actions, _advantages = buffer.sample(-1) + _states, _actions, _advantages = buffer.advantages(states[2]) tt.assert_almost_equal(_advantages, advantages) tt.assert_equal(_actions, torch.tensor([1, 1])) - def test_multi(self): + def test_parallel(self): buffer = GeneralizedAdvantageBuffer( self.v, self.features, @@ -71,10 +70,8 @@ def test_multi(self): rewards = torch.tensor([[1., 1], [2, 1], [4, 1]]) buffer.store(states[0], actions, rewards[0]) buffer.store(states[1], actions, rewards[1]) - buffer.store(states[2], actions, rewards[2]) - states = State.from_list(states) - values = 
self.v.eval(self.features.eval(states)).view(3, -1) + values = self.v.eval(self.features.eval(State.from_list(states))).view(3, -1) tt.assert_almost_equal(values, torch.tensor([ [0.183, -1.408], [-0.348, -1.938], @@ -97,7 +94,7 @@ def test_multi(self): [1.909, 1.704] ]), decimal=3) - _states, _actions, _advantages = buffer.sample(-1) + _states, _actions, _advantages = buffer.advantages(states[2]) tt.assert_almost_equal(_advantages, advantages.view(-1)) def assert_array_equal(self, actual, expected): diff --git a/all/presets/classic_control/a2c.py b/all/presets/classic_control/a2c.py index 1be8454e..1e11832e 100644 --- a/all/presets/classic_control/a2c.py +++ b/all/presets/classic_control/a2c.py @@ -12,9 +12,9 @@ def a2c( clip_grad=0.1, discount_factor=0.99, entropy_loss_scaling=0.001, - lr=1e-3, - n_envs=8, - n_steps=8, + lr=3e-3, + n_envs=4, + n_steps=32, device=torch.device('cpu') ): def _a2c(envs, writer=DummyWriter()): From 1acb74b9d267b80ea8e0f66395a2593a67f01230 Mon Sep 17 00:00:00 2001 From: Chris Nota Date: Fri, 2 Aug 2019 16:31:07 -0400 Subject: [PATCH 8/9] Feature/max (#96) * add returns/max metric * update test --- all/experiments/experiment_test.py | 6 +++--- all/experiments/runner.py | 10 +++++++--- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/all/experiments/experiment_test.py b/all/experiments/experiment_test.py index c2832533..8c743b79 100644 --- a/all/experiments/experiment_test.py +++ b/all/experiments/experiment_test.py @@ -61,11 +61,11 @@ def test_adds_label(self): def test_writes_returns_eps(self): experiment = MockExperiment(dqn(), self.env, quiet=True, episodes=3) np.testing.assert_equal( - experiment._writer.data["evaluation/returns-by-episode"]["values"], + experiment._writer.data["evaluation/returns/episode"]["values"], np.array([14.0, 19.0, 26.0]), ) np.testing.assert_equal( - experiment._writer.data["evaluation/returns-by-episode"]["steps"], + experiment._writer.data["evaluation/returns/episode"]["steps"], np.array([1, 2, 3]), ) @@ -77,7 +77,7 @@ def test_writes_loss(self): def test_runs_multi_env(self): experiment = MockExperiment(a2c(n_envs=3), self.env, quiet=True, episodes=3) - self.assertEqual(len(experiment._writer.data["evaluation/returns-by-episode"]["values"]), 3) + self.assertEqual(len(experiment._writer.data["evaluation/returns/episode"]["values"]), 3) if __name__ == "__main__": unittest.main() diff --git a/all/experiments/runner.py b/all/experiments/runner.py index 841be35f..9053b01c 100644 --- a/all/experiments/runner.py +++ b/all/experiments/runner.py @@ -22,6 +22,7 @@ def __init__( self._max_episodes = episodes self._render = render self._quiet = quiet + self._best_returns = -np.inf self.run() @abstractmethod @@ -38,8 +39,11 @@ def _log(self, returns, fps): if not self._quiet: print("episode: %i, frames: %i, fps: %d, returns: %d" % (self._writer.episodes, self._writer.frames, fps, returns)) - self._writer.add_evaluation('returns-by-episode', returns, step="episode") - self._writer.add_evaluation('returns-by-frame', returns, step="frame") + if returns > self._best_returns: + self._best_returns = returns + self._writer.add_evaluation('returns/episode', returns, step="episode") + self._writer.add_evaluation('returns/frame', returns, step="frame") + self._writer.add_evaluation("returns/max", self._best_returns, step="frame") self._writer.add_scalar('fps', fps, step="frame") class SingleEnvRunner(EnvRunner): @@ -115,7 +119,7 @@ def _step_env(self, i, env, action): self._returns[i] += env.reward end_time = timer() fps = 
self._writer.frames / (end_time - self._start_time) - self._log(self._returns[i], fps) + self._log(self._returns[i].item(), fps) env.reset() self._returns[i] = 0 self._writer.episodes += 1 From 28f1e8188dd095fbe151f5341cc95f8eaa35c902 Mon Sep 17 00:00:00 2001 From: Chris Nota Date: Fri, 2 Aug 2019 16:34:48 -0400 Subject: [PATCH 9/9] bump version number and update readme --- README.md | 1 + setup.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2feb0517..84326719 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ We provide out-of-the-box modules for: - [x] Generalized Advantage Estimation (GAE) - [x] Target networks - [x] Polyak averaging +- [x] Easy parameter and learning rate scheduling - [x] An enhanced `nn` module (includes dueling layers, noisy layers, action bounds, and the coveted `nn.Flatten`) - [x] `gym` to `pytorch` wrappers - [x] Atari wrappers diff --git a/setup.py b/setup.py index 35b01f61..c213a0b3 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="all", - version="0.2.4", + version="0.3.0", description=("A reinforcement learning library in python"), packages=find_packages(), url="https://github.com/cpnota/autonomous-learning-library.git",
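
Usage note (an illustrative sketch, not part of the patch series): the Schedulable / LinearScheduler pattern introduced in PATCH 5/9 lets any hyperparameter attribute be driven by an instance-level descriptor. The example below uses a toy EpsilonGreedy class standing in for GreedyPolicy (which in the patch additionally takes a q function and num_actions); only the all.optim import path and the LinearScheduler signature are taken from the patch, the rest is hypothetical.

from all.optim import LinearScheduler, Schedulable

class EpsilonGreedy(Schedulable):
    # Toy stand-in for GreedyPolicy: only the epsilon attribute matters here.
    def __init__(self, epsilon):
        self.epsilon = epsilon  # either a plain float or a LinearScheduler

policy = EpsilonGreedy(
    epsilon=LinearScheduler(
        1.0,     # initial_value
        0.02,    # final_value
        1000,    # decay_start: steps before annealing begins
        11000,   # decay_end: step at which final_value is reached
        name="epsilon",
    )
)

# Every read of the attribute advances the schedule by one step, because
# Schedulable.__getattribute__ calls the scheduler's __get__.
print(policy.epsilon)   # 1.0 while still before decay_start
for _ in range(10999):
    policy.epsilon      # consume schedule steps
print(policy.epsilon)   # 0.02 once the schedule has reached decay_end

Because the scheduler is stored as an instance attribute, the ordinary descriptor protocol would never fire; overriding __getattribute__ in Schedulable is what makes instance-level descriptors work, which is why agents and policies can accept either a constant or a scheduler for the same argument (as the updated dqn, rainbow, vqn, and vsarsa presets do with epsilon, and the atari ppo preset does with its clip parameter).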