diff --git a/examples/atari_dqn_async_cpu.py b/examples/atari_dqn_async_cpu.py index 4d4799ab..58ae0bc1 100644 --- a/examples/atari_dqn_async_cpu.py +++ b/examples/atari_dqn_async_cpu.py @@ -7,11 +7,16 @@ from rlpyt.utils.launching.affinity import make_affinity from rlpyt.samplers.async_.cpu_sampler import AsyncCpuSampler from rlpyt.envs.atari.atari_env import AtariEnv, AtariTrajInfo +from rlpyt.envs.gym import make as gym_make from rlpyt.algos.dqn.dqn import DQN from rlpyt.agents.dqn.atari.atari_dqn_agent import AtariDqnAgent from rlpyt.runners.async_rl import AsyncRlEval from rlpyt.utils.logging.context import logger_context +import sys +sys.path.append('~/home/isaac/codes/dd-zero/rlpyt/examples') +from examples.example_9 import ResizeFrame, make_env_custom, CustomDqnAgent + def build_and_train(game="pong", run_ID=0): # Change these inputs to match local machine and desired parallelism. @@ -44,7 +49,9 @@ def build_and_train(game="pong", run_ID=0): min_steps_learn=1e4, replay_size=int(1e5) ) - agent = AtariDqnAgent() + # agent = AtariDqnAgent() + agent = CustomDqnAgent() + runner = AsyncRlEval( algo=algo, agent=agent, diff --git a/examples/example_2.py b/examples/example_2.py index 718d4960..59109c8d 100644 --- a/examples/example_2.py +++ b/examples/example_2.py @@ -59,3 +59,5 @@ def build_and_train(env_id="Hopper-v3", run_ID=0, cuda_idx=None): run_ID=args.run_ID, cuda_idx=args.cuda_idx, ) + + diff --git a/examples/example_8.py b/examples/example_8.py new file mode 100644 index 00000000..348f698e --- /dev/null +++ b/examples/example_8.py @@ -0,0 +1,64 @@ + +""" +Runs one instance of the environment and optimizes using the Soft Actor +Critic algorithm. Can use a GPU for the agent (applies to both sample and +train). No parallelism employed, everything happens in one python process; can +be easier to debug. + +Requires OpenAI gym (and maybe mujoco). If not installed, move on to next +example. + +""" + +from rlpyt.samplers.serial.sampler import SerialSampler +from rlpyt.envs.gym import make as gym_make +from rlpyt.algos.qpg.sac import SAC +from rlpyt.algos.qpg.ddpg import DDPG +from rlpyt.agents.qpg.sac_agent import SacAgent +from rlpyt.agents.qpg.ddpg_agent import DdpgAgent +from rlpyt.runners.minibatch_rl import MinibatchRlEval +from rlpyt.utils.logging.context import logger_context + + +def build_and_train(env_id="LunarLanderContinuous-v2", run_ID=0, cuda_idx=None): + sampler = SerialSampler( + EnvCls=gym_make, + env_kwargs=dict(id=env_id), + eval_env_kwargs=dict(id=env_id), + batch_T=1, # One time-step per sampler iteration. + batch_B=1, # One environment (i.e. sampler Batch dimension). + max_decorrelation_steps=0, + eval_n_envs=10, + eval_max_steps=int(51e3), + eval_max_trajectories=50, + ) + algo = DDPG() #SAC() # Run with defaults. + agent = DdpgAgent() #SacAgent() + runner = MinibatchRlEval( + algo=algo, + agent=agent, + sampler=sampler, + n_steps=1e6, + log_interval_steps=1e4, + affinity=dict(cuda_idx=cuda_idx), + ) + config = dict(env_id=env_id) + name = "ddpg_" + env_id + log_dir = "example_8" + with logger_context(log_dir, run_ID, name, config): + runner.train() + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + # for gym_make you have to have an env with Box observation and action space. It doesn't work for Discrete. Only work for continues action spaces ->deepdrive-zero: should be ok1 + parser.add_argument('--env_id', help='environment ID', default='LunarLanderContinuous-v2') # 'BipedalWalkerHardcore-v3':ok, 'LunarLanderContinuous-v2':ok + parser.add_argument('--run_ID', help='run identifier (logging)', type=int, default=0) + parser.add_argument('--cuda_idx', help='gpu to use ', type=int, default=None) + args = parser.parse_args() + build_and_train( + env_id=args.env_id, + run_ID=args.run_ID, + cuda_idx=args.cuda_idx, + ) diff --git a/examples/example_9.py b/examples/example_9.py new file mode 100644 index 00000000..c7463664 --- /dev/null +++ b/examples/example_9.py @@ -0,0 +1,195 @@ + +from rlpyt.utils.launching.affinity import make_affinity +from rlpyt.samplers.parallel.cpu.sampler import CpuSampler +from rlpyt.samplers.parallel.gpu.sampler import GpuSampler +from rlpyt.samplers.serial.sampler import SerialSampler +from rlpyt.samplers.async_.cpu_sampler import AsyncCpuSampler +from rlpyt.algos.dqn.dqn import DQN +from rlpyt.agents.dqn.dqn_agent import DqnAgent +from rlpyt.agents.dqn.atari.atari_dqn_agent import AtariDqnAgent +from rlpyt.runners.minibatch_rl import MinibatchRlEval, MinibatchRl +from rlpyt.utils.logging.context import logger_context +from rlpyt.envs.gym import GymEnvWrapper +from rlpyt.runners.async_rl import AsyncRlEval +from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims +from rlpyt.utils.wrappers import * +from rlpyt.envs.gym import make as make_env +from rlpyt.replays.non_sequence.uniform import UniformReplayBuffer +from rlpyt.envs.base import EnvSpaces +from rlpyt.utils.buffer import buffer_to + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import numpy as np +import gym + + + +############################# classes and functions ############################# + +class CustomMixin: + def make_env_to_model_kwargs(self, env_spaces): + return dict(observation_shape=env_spaces.observation.shape, + output_size=env_spaces.action.n) + + +class CustomDqnModel(torch.nn.Module): + def __init__( + self, + observation_shape, + output_size, + fc_sizes=64 + ): + super().__init__() + self._obs_ndim = len(observation_shape) + input_shape = observation_shape[0] + + self.base_net = torch.nn.Sequential( + torch.nn.Linear(input_shape, fc_sizes), + torch.nn.ReLU(), + torch.nn.Linear(fc_sizes, fc_sizes), + torch.nn.ReLU(), + torch.nn.Linear(fc_sizes, output_size), + ) + # self.base_net.apply(self.init_weights) + + def forward(self, observation, prev_action, prev_reward): + observation = observation.type(torch.float) + lead_dim, T, B, obs_shape = infer_leading_dims(observation, self._obs_ndim) + obs = observation.view(T * B, -1) + q = self.base_net(obs) + q = restore_leading_dims(q, lead_dim, T, B) + return q + + def init_weights(self, m): + if type(m) == torch.nn.Linear: + torch.nn.init.normal_(m.weight) + torch.nn.init.zeros_(m.bias) + + +# class CustomDqnAgent(CustomMixin, DqnAgent): +class CustomDqnAgent(CustomMixin, DqnAgent): + def __init__(self, ModelCls=CustomDqnModel, **kwargs): + super().__init__(ModelCls=ModelCls, **kwargs) + + @torch.no_grad() + def eval_step(self, observation, prev_action, prev_reward): + """Computes Q-values for states/observations and selects actions by + epsilon-greedy. (no grad)""" + # prev_action = self.distribution.to_onehot(prev_action) + model_inputs = buffer_to((observation, prev_action, prev_reward), + device=self.device) + q = self.model(*model_inputs) + q = q.cpu() + action = torch.argmax(q) + return action + + +def make_env_custom(*args, **kwargs): + env = gym.make('CartPole-v0') + env = GymEnvWrapper(env) + return env + + +def build_and_train(run_ID=0, cuda_idx=None): + env_id = 'CartPole-v0' + + sampler = CpuSampler( + EnvCls=make_env, + env_kwargs=dict(id=env_id), #env_config, + eval_env_kwargs=dict(id=env_id), #env_config, + batch_T=4, # One time-step per sampler iteration. + batch_B=8, # One environment (i.e. sampler Batch dimension). + max_decorrelation_steps=100, + eval_n_envs=2, + eval_max_steps=int(10e3), + eval_max_trajectories=4, + ) + + algo = DQN( + learning_rate=1e-3, + replay_ratio=8, + batch_size=32, + min_steps_learn=32, + eps_steps=10e3, + replay_size=int(1e3), + # double_dqn=True, + # target_update_interval=1, + # prioritized_replay=True, + ReplayBufferCls=UniformReplayBuffer, + ) + + agent = CustomDqnAgent() + + runner = MinibatchRl( + algo=algo, + agent=agent, + sampler=sampler, + n_steps=1e6, + log_interval_steps=1e2, + affinity=dict(cuda_idx=cuda_idx, workers_cpus=[0, 1, 2, 4, 5, 6]) + ) + + config = dict(env_id=env_id) + algo_name = 'dqn_' + name = algo_name + env_id + log_dir = algo_name + env_id + + with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'): + runner.train() + + +def evaluate(): + import time + pre_trained_model = '/home/isaac/codes/dd-zero/rlpyt/data/local/2020_04-04_06-52.20/dqn_CartPole-v0/run_0/itr_24713.pkl' + data = torch.load(pre_trained_model) + agent_state_dict = data['agent_state_dict'] + + # for loading pre-trained models see: https://github.com/astooke/rlpyt/issues/69 + env = gym.make('CartPole-v0') + + agent = CustomDqnAgent(initial_model_state_dict=agent_state_dict['model']) + + env_spaces = EnvSpaces( + observation=env.observation_space, + action=env.action_space, + ) + agent.initialize(env_spaces) + agent.load_state_dict(agent_state_dict['model']) + + obs = env.reset() + tot_reward = 0 + while True: + # action = agent.step(torch.tensor(obs, dtype=torch.float32), torch.tensor(0), torch.tensor(0)) + action = agent.eval_step(torch.tensor(obs, dtype=torch.float32), None, None) + a = np.array(action) + obs, reward, done, info = env.step(a) + tot_reward += reward + env.render() + time.sleep(0.001) + if done: + break + + print('reward: ', tot_reward) + env.close() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--run_ID', help='run identifier (logging)', type=int, default=0) + parser.add_argument('--cuda_idx', help='gpu to use ', type=int, default=0) + parser.add_argument('--mode', help='train or eval', default='eval') + + args = parser.parse_args() + + if args.mode == 'train': + build_and_train( + run_ID=args.run_ID, + cuda_idx=args.cuda_idx, + ) + else: + evaluate() diff --git a/rlpyt/agents/base.py b/rlpyt/agents/base.py index 1786f5f9..25a627e2 100644 --- a/rlpyt/agents/base.py +++ b/rlpyt/agents/base.py @@ -9,6 +9,9 @@ from rlpyt.utils.logging import logger from rlpyt.models.utils import strip_ddp_state_dict +from rlpyt.spaces.int_box import IntBox +from rlpyt.spaces.float_box import FloatBox + AgentInputs = namedarraytuple("AgentInputs", ["observation", "prev_action", "prev_reward"]) AgentStep = namedarraytuple("AgentStep", ["action", "agent_info"]) @@ -56,6 +59,9 @@ def __init__(self, ModelCls=None, model_kwargs=None, initial_model_state_dict=No self._send_count = mp.RawValue("l", 0) self._recv_count = 0 + ## TODO: add fram idx counter for e-greedy action selection + self.frame_idx = 0 + def __call__(self, observation, prev_action, prev_reward): """Returns values from model forward pass on training data (i.e. used in algorithm).""" @@ -79,6 +85,10 @@ def initialize(self, env_spaces, share_memory=False, **kwargs): env_spaces: passed to ``make_env_to_model_kwargs()``, typically namedtuple of 'observation' and 'action'. share_memory (bool): whether to use shared memory for model parameters. """ + + # FloatBox(env_spaces.observation.low, env_spaces.observation.high) + # FloatBox(env_spaces.observation.low, env_spaces.observation.high) + self.env_model_kwargs = self.make_env_to_model_kwargs(env_spaces) self.model = self.ModelCls(**self.env_model_kwargs, **self.model_kwargs) diff --git a/rlpyt/agents/dqn/atari/atari_dqn_agent.py b/rlpyt/agents/dqn/atari/atari_dqn_agent.py index 2be7cee6..bd9bf55c 100644 --- a/rlpyt/agents/dqn/atari/atari_dqn_agent.py +++ b/rlpyt/agents/dqn/atari/atari_dqn_agent.py @@ -3,7 +3,6 @@ from rlpyt.models.dqn.atari_dqn_model import AtariDqnModel from rlpyt.agents.dqn.atari.mixin import AtariMixin - class AtariDqnAgent(AtariMixin, DqnAgent): def __init__(self, ModelCls=AtariDqnModel, **kwargs): diff --git a/rlpyt/agents/dqn/deepdrive/__init__.py b/rlpyt/agents/dqn/deepdrive/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rlpyt/agents/dqn/deepdrive/deepdrive_dqn_agent.py b/rlpyt/agents/dqn/deepdrive/deepdrive_dqn_agent.py new file mode 100644 index 00000000..a12cf4cd --- /dev/null +++ b/rlpyt/agents/dqn/deepdrive/deepdrive_dqn_agent.py @@ -0,0 +1,27 @@ +from rlpyt.agents.dqn.dqn_agent import DqnAgent +from rlpyt.models.dqn.deepdrive_dqn_model import DeepDriveDqnModel +from rlpyt.utils.buffer import buffer_to +from rlpyt.utils.collections import namedarraytuple +from rlpyt.agents.dqn.deepdrive.mixin import DeepDriveMixin + +import torch + + +AgentInfo = namedarraytuple("AgentInfo", "q") + + +class DeepDriveDqnAgent(DeepDriveMixin, DqnAgent): + def __init__(self, ModelCls=DeepDriveDqnModel, **kwargs): + super().__init__(ModelCls=ModelCls, **kwargs) + + @torch.no_grad() + def eval_step(self, observation, prev_action, prev_reward): + """Computes Q-values for states/observations and selects actions by + epsilon-greedy. (no grad)""" + # prev_action = self.distribution.to_onehot(prev_action) + model_inputs = buffer_to((observation, prev_action, prev_reward), + device=self.device) + q = self.model(*model_inputs) + q = q.cpu() + action = torch.argmax(q) + return action \ No newline at end of file diff --git a/rlpyt/agents/dqn/deepdrive/deepdrive_r2d1_agent.py b/rlpyt/agents/dqn/deepdrive/deepdrive_r2d1_agent.py new file mode 100644 index 00000000..d9e69397 --- /dev/null +++ b/rlpyt/agents/dqn/deepdrive/deepdrive_r2d1_agent.py @@ -0,0 +1,40 @@ + +from rlpyt.agents.dqn.r2d1_agent import R2d1Agent +from rlpyt.models.dqn.deepdrive_r2d1_model import DeepdriveR2d1Model +from rlpyt.utils.buffer import buffer_to, buffer_func, buffer_method +from rlpyt.utils.collections import namedarraytuple +from rlpyt.agents.base import AgentStep + +import torch + +AgentInfo = namedarraytuple("AgentInfo", ["q", "prev_rnn_state"]) + + +class DeepDriveR2d1Agent(R2d1Agent): + + def __init__(self, ModelCls=DeepdriveR2d1Model, **kwargs): + super().__init__(ModelCls=ModelCls, **kwargs) + + def make_env_to_model_kwargs(self, env_spaces): + return dict(observation_shape=env_spaces.observation.shape, + output_size=env_spaces.action.n) + + @torch.no_grad() + def eval_step(self, observation, prev_action, prev_reward): + """Computes Q-values for states/observations and selects actions by + epsilon-greedy (no grad). Advances RNN state.""" + prev_action = self.distribution.to_onehot(prev_action) + # prev_reward = prev_reward.float() #model expects float tensor + agent_inputs = buffer_to((observation, prev_action, prev_reward), + device=self.device) + q, rnn_state = self.model(*agent_inputs, self.prev_rnn_state) # Model handles None. + q = q.cpu() + action = torch.argmax(q) + prev_rnn_state = self.prev_rnn_state or buffer_func(rnn_state, torch.zeros_like) + # Transpose the rnn_state from [N,B,H] --> [B,N,H] for storage. + # (Special case, model should always leave B dimension in.) + prev_rnn_state = buffer_method(prev_rnn_state, "transpose", 0, 1) + prev_rnn_state = buffer_to(prev_rnn_state, device="cpu") + agent_info = AgentInfo(q=q, prev_rnn_state=prev_rnn_state) + self.advance_rnn_state(rnn_state) # Keep on device. + return AgentStep(action=action, agent_info=agent_info) diff --git a/rlpyt/agents/dqn/deepdrive/mixin.py b/rlpyt/agents/dqn/deepdrive/mixin.py new file mode 100644 index 00000000..4e841076 --- /dev/null +++ b/rlpyt/agents/dqn/deepdrive/mixin.py @@ -0,0 +1,6 @@ + +class DeepDriveMixin: + + def make_env_to_model_kwargs(self, env_spaces): + return dict(observation_shape=env_spaces.observation.shape, + output_size=env_spaces.action.n) \ No newline at end of file diff --git a/rlpyt/agents/dqn/dqn_agent.py b/rlpyt/agents/dqn/dqn_agent.py index 92aa80f3..6a9ec5e2 100644 --- a/rlpyt/agents/dqn/dqn_agent.py +++ b/rlpyt/agents/dqn/dqn_agent.py @@ -3,6 +3,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallelCPU as DDPC + from rlpyt.agents.base import BaseAgent, AgentStep from rlpyt.agents.dqn.epsilon_greedy import EpsilonGreedyAgentMixin from rlpyt.distributions.epsilon_greedy import EpsilonGreedy @@ -42,6 +43,8 @@ def initialize(self, env_spaces, share_memory=False, if env_ranks is not None: self.make_vec_eps(global_B, env_ranks) + self.env_spaces = env_spaces + def to_device(self, cuda_idx=None): super().to_device(cuda_idx) self.target_model.to(self.device) @@ -60,6 +63,7 @@ def step(self, observation, prev_action, prev_reward): q = self.model(*model_inputs) q = q.cpu() action = self.distribution.sample(q) + agent_info = AgentInfo(q=q) # action, agent_info = buffer_to((action, agent_info), device="cpu") return AgentStep(action=action, agent_info=agent_info) @@ -75,3 +79,4 @@ def target(self, observation, prev_action, prev_reward): def update_target(self, tau=1): """Copies the model parameters into the target model.""" update_state_dict(self.target_model, self.model.state_dict(), tau) + diff --git a/rlpyt/agents/dqn/epsilon_greedy.py b/rlpyt/agents/dqn/epsilon_greedy.py index f60cdc24..9699bef3 100644 --- a/rlpyt/agents/dqn/epsilon_greedy.py +++ b/rlpyt/agents/dqn/epsilon_greedy.py @@ -38,6 +38,8 @@ def __init__( self._eps_itr_min_max[0] = eps_itr_min self._eps_itr_min_max[1] = eps_itr_max + #TODO: counter for time-steps + def collector_initialize(self, global_B=1, env_ranks=None): """For vector-valued epsilon, the agent inside the sampler worker process must initialize with its own epsilon values.""" diff --git a/rlpyt/algos/dqn/dqn.py b/rlpyt/algos/dqn/dqn.py index 2f3f8969..d2979498 100644 --- a/rlpyt/algos/dqn/dqn.py +++ b/rlpyt/algos/dqn/dqn.py @@ -8,6 +8,9 @@ from rlpyt.replays.non_sequence.frame import (UniformReplayFrameBuffer, PrioritizedReplayFrameBuffer, AsyncUniformReplayFrameBuffer, AsyncPrioritizedReplayFrameBuffer) +from rlpyt.replays.non_sequence.uniform import UniformReplayBuffer, AsyncUniformReplayBuffer +from rlpyt.replays.non_sequence.prioritized import PrioritizedReplayBuffer, AsyncPrioritizedReplayBuffer +from rlpyt.replays.non_sequence.uniform import UniformReplayBuffer from rlpyt.utils.collections import namedarraytuple from rlpyt.utils.tensor import select_at_indexes, valid_mean from rlpyt.algos.utils import valid_from_done @@ -53,8 +56,9 @@ def __init__( pri_beta_final=1., pri_beta_steps=int(50e6), default_priority=None, - ReplayBufferCls=None, # Leave None to select by above options. + replay_buffer_class=None, # Leave None to select by above options. updates_per_sync=1, # For async mode only. + frame_state_space=True, # set True for atari-like games and False for envs like CartPole-v0 ): """Saves input arguments. @@ -92,6 +96,7 @@ def initialize(self, agent, n_itr, batch_spec, mid_batch_reset, examples, f"updates per iteration.") self.min_itr_learn = int(self.min_steps_learn // sampler_bs) eps_itr_max = max(1, int(self.eps_steps // sampler_bs)) + agent.set_epsilon_itr_min_max(self.min_itr_learn, eps_itr_max) self.initialize_replay_buffer(examples, batch_spec) self.optim_initialize(rank) @@ -105,11 +110,13 @@ def async_initialize(self, agent, sampler_n_itr, batch_spec, mid_batch_reset, self.initialize_replay_buffer(examples, batch_spec, async_=True) self.mid_batch_reset = mid_batch_reset self.sampler_bs = sampler_bs = batch_spec.size + self.updates_per_optimize = self.updates_per_sync self.min_itr_learn = int(self.min_steps_learn // sampler_bs) eps_itr_max = max(1, int(self.eps_steps // sampler_bs)) # Before any forking so all sub processes have epsilon schedule: agent.set_epsilon_itr_min_max(self.min_itr_learn, eps_itr_max) + return self.replay_buffer def optim_initialize(self, rank=0): @@ -142,18 +149,31 @@ def initialize_replay_buffer(self, examples, batch_spec, async_=False): discount=self.discount, n_step_return=self.n_step_return, ) - if self.prioritized_replay: - replay_kwargs.update(dict( - alpha=self.pri_alpha, - beta=self.pri_beta_init, - default_priority=self.default_priority, - )) - ReplayCls = (AsyncPrioritizedReplayFrameBuffer if async_ else - PrioritizedReplayFrameBuffer) + + if self.replay_buffer_class is None: + if self.prioritized_replay: + replay_kwargs.update(dict( + alpha=self.pri_alpha, + beta=self.pri_beta_init, + default_priority=self.default_priority, + )) + if self.frame_state_space: + ReplayCls = (AsyncPrioritizedReplayFrameBuffer if async_ else + PrioritizedReplayFrameBuffer) + else: + ReplayCls = (AsyncPrioritizedReplayBuffer if async_ else + PrioritizedReplayBuffer) + else: + if self.frame_state_space: + ReplayCls = (AsyncUniformReplayFrameBuffer if async_ else + UniformReplayFrameBuffer) + else: + ReplayCls = (AsyncUniformReplayBuffer if async_ else + UniformReplayBuffer) else: - ReplayCls = (AsyncUniformReplayFrameBuffer if async_ else - UniformReplayFrameBuffer) - self.replay_buffer = ReplayCls(**replay_kwargs) + ReplayCls = self.replay_buffer_class + + self.replay_buffer = ReplayCls(**replay_kwargs) #UniformReplayBuffer(**replay_kwargs) # def optimize_agent(self, itr, samples=None, sampler_itr=None): """ diff --git a/rlpyt/algos/dqn/r2d1.py b/rlpyt/algos/dqn/r2d1.py index 64b761d4..0748e6da 100644 --- a/rlpyt/algos/dqn/r2d1.py +++ b/rlpyt/algos/dqn/r2d1.py @@ -10,6 +10,10 @@ from rlpyt.replays.sequence.frame import (UniformSequenceReplayFrameBuffer, PrioritizedSequenceReplayFrameBuffer, AsyncUniformSequenceReplayFrameBuffer, AsyncPrioritizedSequenceReplayFrameBuffer) +from rlpyt.replays.sequence.prioritized import AsyncPrioritizedSequenceReplayBuffer, PrioritizedSequenceReplayBuffer +from rlpyt.replays.sequence.uniform import AsyncUniformSequenceReplayBuffer, UniformSequenceReplayBuffer + + from rlpyt.utils.tensor import select_at_indexes, valid_mean from rlpyt.algos.utils import valid_from_done, discount_return_n_step from rlpyt.utils.buffer import buffer_to, buffer_method, torchify_buffer @@ -61,7 +65,9 @@ def __init__( input_priorities=True, input_priority_shift=None, value_scale_eps=1e-3, # 1e-3 (Steven). + replay_buffer_class=None, # set None to select automatically updates_per_sync=1, # For async mode only. + frame_state_space=True, # set True for atari-like games and False for envs like CartPole-v0 ): """Saves input arguments. @@ -79,6 +85,10 @@ def __init__( default_priority = delta_clip or 1. if input_priority_shift is None: input_priority_shift = warmup_T // store_rnn_state_interval + + # self.replaybuffercls = ReplayBufferCls + # self.frame_state_space = frame_state_space + save__init__args(locals()) self._batch_size = (self.batch_T + self.warmup_T) * self.batch_B @@ -105,19 +115,32 @@ def initialize_replay_buffer(self, examples, batch_spec, async_=False): # batch_T fixed for prioritized, (relax if rnn_state_interval=1 or 0). batch_T=self.batch_T + self.warmup_T, ) - if self.prioritized_replay: - replay_kwargs.update(dict( - alpha=self.pri_alpha, - beta=self.pri_beta_init, - default_priority=self.default_priority, - input_priorities=self.input_priorities, # True/False. - input_priority_shift=self.input_priority_shift, - )) - ReplayCls = (AsyncPrioritizedSequenceReplayFrameBuffer if async_ - else PrioritizedSequenceReplayFrameBuffer) + + if self.replay_buffer_class is None: + if self.prioritized_replay: + replay_kwargs.update(dict( + alpha=self.pri_alpha, + beta=self.pri_beta_init, + default_priority=self.default_priority, + input_priorities=self.input_priorities, # True/False. + input_priority_shift=self.input_priority_shift, + )) + if self.frame_state_space: # if state space is fram -> like atari + ReplayCls = (AsyncPrioritizedSequenceReplayFrameBuffer if async_ + else PrioritizedSequenceReplayFrameBuffer) + else: #non-frame state space -> vector state space + ReplayCls = (AsyncPrioritizedSequenceReplayBuffer if async_ + else PrioritizedSequenceReplayBuffer) + else: + if self.frame_state_space: + ReplayCls = (AsyncUniformSequenceReplayFrameBuffer if async_ + else UniformSequenceReplayFrameBuffer) + else: + ReplayCls = (AsyncUniformSequenceReplayBuffer if async_ + else UniformSequenceReplayBuffer) else: - ReplayCls = (AsyncUniformSequenceReplayFrameBuffer if async_ - else UniformSequenceReplayFrameBuffer) + ReplayCls = self.replay_buffer_class + self.replay_buffer = ReplayCls(**replay_kwargs) return self.replay_buffer diff --git a/rlpyt/envs/gym.py b/rlpyt/envs/gym.py index e628bdc6..e0994bfc 100644 --- a/rlpyt/envs/gym.py +++ b/rlpyt/envs/gym.py @@ -5,11 +5,15 @@ from gym.wrappers.time_limit import TimeLimit from collections import namedtuple -from rlpyt.envs.base import EnvSpaces, EnvStep +from rlpyt.envs.base import EnvSpaces, EnvStep, Env from rlpyt.spaces.gym_wrapper import GymSpaceWrapper from rlpyt.utils.collections import is_namedtuple_class +from gym import ActionWrapper +from rlpyt.spaces.int_box import IntBox + + class GymEnvWrapper(Wrapper): """Gym-style wrapper for converting the Openai Gym interface to the rlpyt interface. Action and observation spaces are wrapped by rlpyt's @@ -170,3 +174,4 @@ def make(*args, info_example=None, **kwargs): else: return GymEnvWrapper(EnvInfoWrapper( gym.make(*args, **kwargs), info_example)) + diff --git a/rlpyt/experiments/configs/deepdrive_zero/dqn/dd0_r2d1_configs.py b/rlpyt/experiments/configs/deepdrive_zero/dqn/dd0_r2d1_configs.py new file mode 100644 index 00000000..5e49bbb6 --- /dev/null +++ b/rlpyt/experiments/configs/deepdrive_zero/dqn/dd0_r2d1_configs.py @@ -0,0 +1,89 @@ + +import copy +from rlpyt.replays.sequence.uniform import UniformSequenceReplayBuffer +from rlpyt.replays.sequence.n_step import SequenceNStepReturnBuffer +from rlpyt.replays.sequence.prioritized import PrioritizedSequenceReplayBuffer + +configs = dict() + +config = dict( + agent=dict(), + model=dict(dueling=True), + algo=dict( + discount=0.997, + batch_T=80, + batch_B=32, # In the paper, 64. + warmup_T=40, + store_rnn_state_interval=40, + replay_ratio=1, # In the paper, more like 0.8. + replay_size=int(5e5), + learning_rate=8e-5, + clip_grad_norm=80., # 80 (Steven.) + min_steps_learn=int(1e4), + eps_steps=int(1e6), + target_update_interval=2500, #2500 + double_dqn=True, + frame_state_space=False, + prioritized_replay=True, + input_priorities=False, ## True + n_step_return=5, #5 #in the prioritization formula, r2d1 uses n-step return td-error -> I think we have to use n_step if we want to use prioritized replay + pri_alpha=0.6, # Fixed on 20190813 + pri_beta_init=0.9, # I think had these backwards before. + pri_beta_final=0.9, + replay_buffer_class=None, #UniformSequenceReplayBuffer, + input_priority_shift=1, # Added 20190826 (used to default to 1) + ), + optim=dict(), + env = dict( + id='deepdrive-2d-intersection-w-gs-allow-decel-v0', + is_intersection_map=True, + is_one_waypoint_map=False, + expect_normalized_actions=True, + expect_normalized_action_deltas=False, + jerk_penalty_coeff=3.3e-6, + gforce_penalty_coeff=0.006, + lane_penalty_coeff=0.1, #0.02, + collision_penalty_coeff=4, + speed_reward_coeff=0.50, + end_on_harmful_gs=False, + incent_win=True, + incent_yield_to_oncoming_traffic=True, + constrain_controls=False, + physics_steps_per_observation=12, + contain_prev_actions_in_obs=False, + dummy_accel_agent_indices=[1] #for opponent + ), + eval_env=dict( + id='deepdrive-2d-intersection-w-gs-allow-decel-v0', + is_intersection_map=True, + is_one_waypoint_map=False, + expect_normalized_actions=True, + expect_normalized_action_deltas=False, + jerk_penalty_coeff=3.3e-6, + gforce_penalty_coeff=0.006, + lane_penalty_coeff=0.02, + collision_penalty_coeff=4, + speed_reward_coeff=0.50, + end_on_harmful_gs=False, + incent_win=True, + incent_yield_to_oncoming_traffic=True, + constrain_controls=False, + physics_steps_per_observation=12, + contain_prev_actions_in_obs=False, + dummy_accel_agent_indices=[1] + ), + runner=dict( + n_steps=10e6, + log_interval_steps=1e1, + ), + sampler=dict( + batch_T=30, # Match the algo / replay_ratio. + batch_B=32, + max_decorrelation_steps=1000, + eval_n_envs=4, + eval_max_steps=int(51e3), + eval_max_trajectories=100, + ), +) + +configs["r2d1"] = config diff --git a/rlpyt/experiments/scripts/atari/dqn/launch/launch_atari_r2d1_gpu_basic.py b/rlpyt/experiments/scripts/atari/dqn/launch/launch_atari_r2d1_gpu_basic.py index 59b094b4..92b19dc1 100644 --- a/rlpyt/experiments/scripts/atari/dqn/launch/launch_atari_r2d1_gpu_basic.py +++ b/rlpyt/experiments/scripts/atari/dqn/launch/launch_atari_r2d1_gpu_basic.py @@ -7,7 +7,7 @@ affinity_code = encode_affinity( n_cpu_core=4, n_gpu=1, - hyperthread_offset=8, + hyperthread_offset=None, n_socket=1, # cpu_per_run=2, ) diff --git a/rlpyt/experiments/scripts/atari/dqn/launch/pabti/launch_atari_r2d1_async_alt_gravitar.py b/rlpyt/experiments/scripts/atari/dqn/launch/pabti/launch_atari_r2d1_async_alt_gravitar.py index a3c5dc32..8e219c64 100644 --- a/rlpyt/experiments/scripts/atari/dqn/launch/pabti/launch_atari_r2d1_async_alt_gravitar.py +++ b/rlpyt/experiments/scripts/atari/dqn/launch/pabti/launch_atari_r2d1_async_alt_gravitar.py @@ -5,13 +5,13 @@ script = "rlpyt/experiments/scripts/atari/dqn/train/atari_r2d1_async_alt.py" affinity_code = encode_affinity( - n_cpu_core=24, - n_gpu=4, + n_cpu_core=4, + n_gpu=1, async_sample=True, - gpu_per_run=1, - sample_gpu_per_run=2, + # gpu_per_run=1, + # sample_gpu_per_run=0, # hyperthread_offset=24, - # optim_sample_share_gpu=True, + optim_sample_share_gpu=True, n_socket=1, # Force this. alternating=True, ) @@ -19,7 +19,7 @@ experiment_title = "atari_r2d1_async_alt" variant_levels = list() -games = ["gravitar"] +games = ["pong"] values = list(zip(games)) dir_names = ["{}".format(*v) for v in values] keys = [("env", "game")] diff --git a/rlpyt/experiments/scripts/atari/dqn/train/atari_r2d1_async_alt.py b/rlpyt/experiments/scripts/atari/dqn/train/atari_r2d1_async_alt.py index 1ed66c15..10ae9352 100644 --- a/rlpyt/experiments/scripts/atari/dqn/train/atari_r2d1_async_alt.py +++ b/rlpyt/experiments/scripts/atari/dqn/train/atari_r2d1_async_alt.py @@ -1,4 +1,4 @@ - +import json import sys from rlpyt.utils.launching.affinity import affinity_from_code @@ -20,6 +20,8 @@ def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): variant = load_variant(log_dir) config = update_config(config, variant) config["eval_env"]["game"] = config["env"]["game"] + config["runner"]["n_steps"] = 20e6 # 20e3 + config["runner"]["log_interval_steps"] = 1e3 sampler = AsyncAlternatingSampler( EnvCls=AtariEnv, @@ -39,6 +41,8 @@ def build_and_train(slot_affinity_code, log_dir, run_ID, config_key): affinity=affinity, **config["runner"] ) + + print(f'my config:\n{json.dumps(config, indent=2)}') name = "async_alt_" + config["env"]["game"] with logger_context(log_dir, run_ID, name, config): runner.train() diff --git a/rlpyt/experiments/scripts/deepdrive_zero/dd0_ddpg.py b/rlpyt/experiments/scripts/deepdrive_zero/dd0_ddpg.py new file mode 100644 index 00000000..603037e0 --- /dev/null +++ b/rlpyt/experiments/scripts/deepdrive_zero/dd0_ddpg.py @@ -0,0 +1,127 @@ +""" +Runs one instance of the environment and optimizes using the Soft Actor +Critic algorithm. Can use a GPU for the agent (applies to both sample and +train). No parallelism employed, everything happens in one python process; can +be easier to debug. + +Requires OpenAI gym (and maybe mujoco). If not installed, move on to next +example. + +""" + +from deepdrive_zero.envs.env import Deepdrive2DEnv +from rlpyt.samplers.parallel.cpu.sampler import CpuSampler +from rlpyt.algos.qpg.ddpg import DDPG +from rlpyt.agents.qpg.ddpg_agent import DdpgAgent +from rlpyt.runners.minibatch_rl import MinibatchRlEval +from rlpyt.utils.logging.context import logger_context +from rlpyt.envs.gym import GymEnvWrapper +from rlpyt.envs.base import EnvSpaces + +import torch +import numpy as np + + +env_config = dict( + id='deepdrive-2d-intersection-w-gs-allow-decel-v0', + is_intersection_map=True, + expect_normalized_action_deltas=False, + jerk_penalty_coeff=0.1, + gforce_penalty_coeff=0.01, + end_on_harmful_gs=False, + incent_win=True, + constrain_controls=False, + dummy_accel_agent_indices=[1] +) + + +def make_env(*args, **kwargs): + env = Deepdrive2DEnv() + env.configure_env(kwargs) + return GymEnvWrapper(env) + + +def build_and_train(run_ID=0, cuda_idx=None): + sampler = CpuSampler( + EnvCls=make_env, + env_kwargs=env_config, + eval_env_kwargs=env_config, + batch_T=4, # One time-step per sampler iteration. + batch_B=8, # One environment (i.e. sampler Batch dimension). + max_decorrelation_steps=0, + eval_n_envs=10, + eval_max_steps=int(51e3), + eval_max_trajectories=50, + ) + + + algo = DDPG( + batch_size=64, + replay_size=100000, + bootstrap_timelimit=False, + ) + agent = DdpgAgent() + + runner = MinibatchRlEval( + algo=algo, + agent=agent, + sampler=sampler, + n_steps=1e6, + log_interval_steps=10, + affinity=dict(cuda_idx=cuda_idx, workers_cpus=[0,1,2,3,4,5,6]), + ) + + config = dict(env_id=env_config['id']) + name = "ddpg_" + env_config['id'] + log_dir = "dd2d" + + with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'): + runner.train() + + +def evaluate(pre_trained_model): + data = torch.load(pre_trained_model) + agent_state_dict = data['agent_state_dict'] + + # for loading pre-trained models see: https://github.com/astooke/rlpyt/issues/69 + env = Deepdrive2DEnv() + env.configure_env(env_config) + + agent = DdpgAgent() + env_spaces = EnvSpaces( + observation=env.observation_space, + action=env.action_space, + ) + agent.initialize(env_spaces) + agent.load_state_dict(agent_state_dict) + + obs = env.reset() + while True: + action = agent.step(torch.tensor(obs, dtype=torch.float32), None, None) + a = np.array(action.action) + obs, reward, done, info = env.step(a) + env.render() + if done: + break + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--run_ID', help='run identifier (logging)', type=int, default=0) + parser.add_argument('--cuda_idx', help='gpu to use ', type=int, default=0) + parser.add_argument('--mode', help='train or eval', default='train') + parser.add_argument('--pre_trained_model', + help='path to the pre-trained model.', + default='/home/isaac/codes/dd-zero/rlpyt/data/local/2020_03-23_20-18.59/dd2d/run_0/params.pkl') + args = parser.parse_args() + + if args.mode == 'train': + build_and_train( + run_ID=args.run_ID, + cuda_idx=args.cuda_idx + ) + else: + evaluate(args.pre_trained_model) + diff --git a/rlpyt/experiments/scripts/deepdrive_zero/dd0_dqn.py b/rlpyt/experiments/scripts/deepdrive_zero/dd0_dqn.py new file mode 100644 index 00000000..7b48718e --- /dev/null +++ b/rlpyt/experiments/scripts/deepdrive_zero/dd0_dqn.py @@ -0,0 +1,208 @@ +""" +Runs one instance of the environment and optimizes using the Soft Actor +Critic algorithm. Can use a GPU for the agent (applies to both sample and +train). No parallelism employed, everything happens in one python process; can +be easier to debug. + +Requires OpenAI gym (and maybe mujoco). If not installed, move on to next +example. +""" + +from deepdrive_zero.envs.env import Deepdrive2DEnv +from deepdrive_zero.envs.variants import OneWaypointEnv + +from rlpyt.samplers.parallel.cpu.sampler import CpuSampler +from rlpyt.samplers.parallel.gpu.sampler import GpuSampler +from rlpyt.samplers.serial.sampler import SerialSampler +from rlpyt.algos.dqn.dqn import DQN +from rlpyt.agents.dqn.deepdrive.deepdrive_dqn_agent import DeepDriveDqnAgent +from rlpyt.agents.dqn.dqn_agent import DqnAgent +from rlpyt.runners.minibatch_rl import MinibatchRlEval, MinibatchRl +from rlpyt.utils.logging.context import logger_context +from rlpyt.envs.gym import GymEnvWrapper +from rlpyt.envs.base import EnvSpaces +from rlpyt.utils.wrappers import DeepDriveDiscretizeActionWrapper +from rlpyt.utils.launching.affinity import make_affinity +from rlpyt.samplers.async_.cpu_sampler import AsyncCpuSampler +from rlpyt.runners.async_rl import AsyncRlEval +from rlpyt.replays.non_sequence.uniform import UniformReplayBuffer +from rlpyt.utils.seed import set_seed +from rlpyt.utils.logging import logger + +import torch +import numpy as np + + +env_config = dict( + id='deepdrive-2d-intersection-w-gs-allow-decel-v0', + # id='deepdrive-2d-one-waypoint-v0', + is_intersection_map=True, + is_one_waypoint_map=False, + expect_normalized_actions=True, + expect_normalized_action_deltas=False, + jerk_penalty_coeff=0.0, + gforce_penalty_coeff=0.0, + lane_penalty_coeff=0.02, #0.02 + collision_penalty_coeff=0.31, + speed_reward_coeff=0.50, + end_on_harmful_gs=False, + incent_win=True, + constrain_controls=False, + physics_steps_per_observation=12, + dummy_accel_agent_indices=[1], +) + + +def make_env(*args, **kwargs): + env = Deepdrive2DEnv() + env.configure_env(kwargs) + env = DeepDriveDiscretizeActionWrapper(env) + env = GymEnvWrapper(env) + return env + + +def build_and_train(run_ID=0, cuda_idx=None): + resume_chkpnt = None #'/home/isaac/codes/dd-zero/rlpyt/data/local/2020_04-03_09-06.57/dqn_dd0/run_0/params.pkl' + + sampler = CpuSampler( + EnvCls=make_env, + env_kwargs=env_config, + eval_env_kwargs=env_config, + batch_T=32, # One time-step per sampler iteration. + batch_B=64, # One environment (i.e. sampler Batch dimension). + max_decorrelation_steps=0, + eval_n_envs=2, + eval_max_steps=int(51e3), + eval_max_trajectories=50, + ) + + # for loading pre-trained models see: https://github.com/astooke/rlpyt/issues/69 + if resume_chkpnt is not None: + print('Continue from previous checkpoint ...') + data = torch.load(resume_chkpnt) + agent_state_dict = data['agent_state_dict']['model'] + optimizer_state_dict = data['optimizer_state_dict'] + else: + print('start training from scratch ...') + agent_state_dict = None + optimizer_state_dict = None + + algo = DQN( + initial_optim_state_dict=optimizer_state_dict, + learning_rate=5e-4, + replay_ratio=8, + batch_size=32, + min_steps_learn=1e3, + eps_steps=10e3, + replay_size=int(5e4), + double_dqn=True, + target_update_interval=100, #20 + target_update_tau=1, + # prioritized_replay=True, + ReplayBufferCls=UniformReplayBuffer, + ) + + agent = DeepDriveDqnAgent( + eps_final=0.02, + initial_model_state_dict=agent_state_dict + ) + + runner = MinibatchRlEval( + algo=algo, + agent=agent, + sampler=sampler, + n_steps=2e6, + log_interval_steps=1, + affinity=dict(cuda_idx=cuda_idx, workers_cpus=[0, 1, 2, 3, 4, 5, 6]) + ) + + config = dict(env_id=env_config['id']) + algo_name = 'dqn_' + name = algo_name + env_config['id'] + log_dir = algo_name + "dd0" + + with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'): + runner.train() + + +def evaluate(pre_trained_model): + data = torch.load(pre_trained_model) + agent_state_dict = data['agent_state_dict'] + + # for loading pre-trained models see: https://github.com/astooke/rlpyt/issues/69 + env = Deepdrive2DEnv() + env.configure_env(env_config) + env = DeepDriveDiscretizeActionWrapper(env) + + agent = DeepDriveDqnAgent(initial_model_state_dict=agent_state_dict['model']) + env_spaces = EnvSpaces( + observation=env.observation_space, + action=env.action_space, + ) + agent.initialize(env_spaces) + + obs = env.reset() + while True: + action = agent.eval_step(torch.tensor(obs, dtype=torch.float32), None, None) + a = np.array(action) + obs, reward, done, info = env.step(a) + env.render() + if done: + obs = env.reset() + + +def test(): + env_config = dict( + id='deepdrive-2d-intersection-w-gs-allow-decel-v0', + # id='deepdrive-2d-one-waypoint-v0', + is_intersection_map=True, + is_one_waypoint_map=False, + expect_normalized_actions=True, + expect_normalized_action_deltas=False, + jerk_penalty_coeff=0.0, + gforce_penalty_coeff=0.0, + lane_penalty_coeff=0.02, # 0.02 + collision_penalty_coeff=0.31, + speed_reward_coeff=0.50, + end_on_harmful_gs=False, + incent_win=True, + constrain_controls=False, + physics_steps_per_observation=12, + contain_prev_actions_in_obs=False, + dummy_accel_agent_indices=[1] + ) + env = Deepdrive2DEnv() + env.configure_env(env_config) + + obs = env.reset() + while True: + a = np.array([0, 1, -1]) + obs, reward, done, info = env.step(a) + env.render() + if done: + obs = env.reset() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--run_ID', help='run identifier (logging)', type=int, default=0) + parser.add_argument('--cuda_idx', help='gpu to use ', type=int, default=0) + parser.add_argument('--mode', help='train or eval', default='eval') + parser.add_argument('--pre_trained_model', + help='path to the pre-trained model.', + default='/home/isaac/codes/dd-zero/rlpyt/data/local/2020_04-03_11-26.59/dqn_dd0/run_0/params.pkl' + ) + + args = parser.parse_args() + + # if args.mode == 'train': + # build_and_train( + # run_ID=args.run_ID, + # cuda_idx=args.cuda_idx, + # ) + # else: + # evaluate(args.pre_trained_model) + + test() diff --git a/rlpyt/experiments/scripts/deepdrive_zero/dd0_r2d1.py b/rlpyt/experiments/scripts/deepdrive_zero/dd0_r2d1.py new file mode 100644 index 00000000..4e3057cc --- /dev/null +++ b/rlpyt/experiments/scripts/deepdrive_zero/dd0_r2d1.py @@ -0,0 +1,127 @@ +from deepdrive_zero.envs.env import Deepdrive2DEnv +from rlpyt.samplers.parallel.gpu.sampler import GpuSampler +from rlpyt.samplers.parallel.cpu.sampler import CpuSampler +from rlpyt.samplers.parallel.gpu.collectors import GpuWaitResetCollector +from rlpyt.algos.dqn.r2d1 import R2D1 +from rlpyt.agents.dqn.deepdrive.deepdrive_r2d1_agent import DeepDriveR2d1Agent +from rlpyt.runners.minibatch_rl import MinibatchRlEval +from rlpyt.utils.logging.context import logger_context +from rlpyt.experiments.configs.deepdrive_zero.dqn.dd0_r2d1_configs import configs +from rlpyt.envs.gym import GymEnvWrapper +from rlpyt.envs.base import EnvSpaces +from rlpyt.utils.wrappers import DeepDriveDiscretizeActionWrapper + +import torch +import numpy as np +import gym + +def make_env(*args, **kwargs): + env = Deepdrive2DEnv() + env.configure_env(kwargs) + env = DeepDriveDiscretizeActionWrapper(env) + env = GymEnvWrapper(env) + return env + + +def build_and_train(pre_trained_model=None, run_ID=0): + # for loading pre-trained models see: https://github.com/astooke/rlpyt/issues/69 + if pre_trained_model is not None: + print('Continue from previous checkpoint ...') + data = torch.load(pre_trained_model) + agent_state_dict = data['agent_state_dict']['model'] + optimizer_state_dict = data['optimizer_state_dict'] + else: + print('start training from scratch ...') + agent_state_dict = None + optimizer_state_dict = None + + affinity = dict(cuda_idx=0, workers_cpus=[0,1,2,3,4,5,6]) + config = configs['r2d1'] + + cfg = dict(env_id=config['env']['id'], **config) + algo_name = 'r2d1_' + name = algo_name + config['env']['id'] + log_dir = algo_name + "dd0" + + # TODO: doesn't work with CpuSampler. Check why? + sampler = GpuSampler( + EnvCls=make_env, + env_kwargs=config['env'], + CollectorCls=GpuWaitResetCollector, + eval_env_kwargs=config['eval_env'], + **config["sampler"] + ) + + algo = R2D1( + optim_kwargs=config["optim"], + **config["algo"] + ) + + agent = DeepDriveR2d1Agent( + initial_model_state_dict=agent_state_dict, + **config["agent"] + ) + + runner = MinibatchRlEval( + algo=algo, + agent=agent, + sampler=sampler, + affinity=affinity, + **config["runner"] + ) + + with logger_context(log_dir, run_ID, name, cfg, snapshot_mode='last'): + runner.train() + + +def evaluate(pre_trained_model): + data = torch.load(pre_trained_model) + agent_state_dict = data['agent_state_dict'] + + # for loading pre-trained models see: https://github.com/astooke/rlpyt/issues/69 + config = configs['r2d1'] + env_config = config['eval_env'] + env = Deepdrive2DEnv() + env.configure_env(env_config) + env = DeepDriveDiscretizeActionWrapper(env) + + agent = DeepDriveR2d1Agent(initial_model_state_dict=agent_state_dict['model']) + env_spaces = EnvSpaces( + observation=env.observation_space, + action=env.action_space, + ) + agent.initialize(env_spaces) + + obs = env.reset() + prev_action = torch.tensor(0.0, dtype=torch.float) #None + prev_reward = torch.tensor(0.0, dtype=torch.float) #None + while True: + #TODO: feed prev_action and reward for eval_step() + #TODO: do we need warm-up for evaluation too? + action = agent.eval_step(torch.tensor(obs, dtype=torch.float32), prev_action, prev_reward) + action = np.array(action.action) + obs, reward, done, info = env.step(action) + prev_action = torch.tensor(action, dtype=torch.float) + prev_reward = torch.tensor(reward, dtype=torch.float) + env.render() + if done: + obs = env.reset() + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--mode', help='train or eval', default='train') + parser.add_argument('--pre_trained_model', + help='path to the pre-trained model.', + default='/home/isaac/codes/dd-zero/rlpyt/data/local/2020_04-06_15-42.03/r2d1_dd0/run_0/params.pkl' + ) + + args = parser.parse_args() + + if args.mode == 'train': + build_and_train(args.pre_trained_model) + else: + evaluate(args.pre_trained_model) + diff --git a/rlpyt/experiments/scripts/deepdrive_zero/dd0_sac.py b/rlpyt/experiments/scripts/deepdrive_zero/dd0_sac.py new file mode 100644 index 00000000..aac70622 --- /dev/null +++ b/rlpyt/experiments/scripts/deepdrive_zero/dd0_sac.py @@ -0,0 +1,140 @@ +""" +Runs one instance of the environment and optimizes using the Soft Actor +Critic algorithm. Can use a GPU for the agent (applies to both sample and +train). No parallelism employed, everything happens in one python process; can +be easier to debug. + +Requires OpenAI gym (and maybe mujoco). If not installed, move on to next +example. + +""" + +from deepdrive_zero.envs.env import Deepdrive2DEnv + +from rlpyt.samplers.parallel.cpu.sampler import CpuSampler +from rlpyt.algos.qpg.sac import SAC +from rlpyt.agents.qpg.sac_agent import SacAgent +from rlpyt.runners.minibatch_rl import MinibatchRlEval +from rlpyt.utils.logging.context import logger_context +from rlpyt.envs.gym import GymEnvWrapper +from rlpyt.envs.base import EnvSpaces +from rlpyt.samplers.serial.sampler import SerialSampler +from rlpyt.utils.logging import logger + +import torch +import numpy as np + + +env_config = dict( + id='deepdrive-2d-intersection-w-gs-allow-decel-v0', + is_intersection_map=True, + is_one_waypoint_map=False, + expect_normalized_actions=True, + expect_normalized_action_deltas=False, + jerk_penalty_coeff=0.0, + gforce_penalty_coeff=0.0, + lane_penalty_coeff=0.02, + collision_penalty_coeff=0.31, + speed_reward_coeff=0.50, + end_on_harmful_gs=False, + incent_win=True, + constrain_controls=False, + physics_steps_per_observation=6, +) + + +def make_env(*args, **kwargs): + env = Deepdrive2DEnv() + env.configure_env(kwargs) + return GymEnvWrapper(env) + + +def build_and_train(run_ID=0, cuda_idx=None): + sampler = SerialSampler( + EnvCls=make_env, + env_kwargs=env_config, + eval_env_kwargs=env_config, + batch_T=1, # One time-step per sampler iteration. + batch_B=1, # One environment (i.e. sampler Batch dimension). + max_decorrelation_steps=0, + eval_n_envs=2, + eval_max_steps=int(51e3), + eval_max_trajectories=50, + ) + + # for loading pre-trained models see: https://github.com/astooke/rlpyt/issues/69 + algo = SAC( + batch_size=64, + replay_size=100000, + bootstrap_timelimit=False, + ) + agent = SacAgent() + + runner = MinibatchRlEval( + algo=algo, + agent=agent, + sampler=sampler, + n_steps=1e6, + log_interval_steps=10, + affinity=dict(cuda_idx=cuda_idx, workers_cpus=[0,1,2,3,4,5,6]), + ) + + config = dict(env_id=env_config['id']) + + algo_name = 'sac_' + name = algo_name + env_config['id'] + log_dir = algo_name + "dd0" + + with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'): + runner.train() + + +def evaluate(pre_trained_model): + data = torch.load(pre_trained_model) + agent_state_dict = data['agent_state_dict'] + + # for loading pre-trained models see: https://github.com/astooke/rlpyt/issues/69 + env = Deepdrive2DEnv() + env.configure_env(env_config) + + agent = SacAgent(initial_model_state_dict=agent_state_dict) + env_spaces = EnvSpaces( + observation=env.observation_space, + action=env.action_space, + ) + agent.initialize(env_spaces) + # agent.load_state_dict(agent_state_dict) + + obs = env.reset() + while True: + action = agent.step(torch.tensor(obs, dtype=torch.float32), None, None) + a = np.array(action.action) + logger.log(f"action: {a}") + obs, reward, done, info = env.step(a) + env.render() + if done: + break + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--run_ID', help='run identifier (logging)', type=int, default=0) + parser.add_argument('--cuda_idx', help='gpu to use ', type=int, default=0) + parser.add_argument('--mode', help='train or eval', default='train') + parser.add_argument('--pre_trained_model', + help='path to the pre-trained model.', + default='/home/isaac/codes/dd-zero/rlpyt/data/local/2020_03-28_18-46.54/sac_ddzero/run_0/params.pkl') + + args = parser.parse_args() + + if args.mode == 'train': + build_and_train( + run_ID=args.run_ID, + cuda_idx=args.cuda_idx, + ) + else: + evaluate(args.pre_trained_model) + + diff --git a/rlpyt/models/dqn/deepdrive_dqn_model.py b/rlpyt/models/dqn/deepdrive_dqn_model.py new file mode 100644 index 00000000..d2185920 --- /dev/null +++ b/rlpyt/models/dqn/deepdrive_dqn_model.py @@ -0,0 +1,97 @@ + +import torch + +from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims +from rlpyt.models.mlp import MlpModel +from rlpyt.models.dqn.dueling import DuelingHeadModel +from rlpyt.models.running_mean_std import RunningMeanStdModel + + +class DeepDriveDqnModel(torch.nn.Module): + """ + Mlp network for DQN. + """ + def __init__( + self, + observation_shape, + output_size, + fc_sizes=[128, 128], + dueling=False, + normalize_observation=False, + norm_obs_clip=2, + norm_obs_var_clip=1e-6, + ): + """Instantiates the neural network according to arguments; network defaults + stored within this method.""" + super().__init__() + self.dueling = dueling + self._obs_ndim = len(observation_shape) + input_shape = observation_shape[0] + + self.base_net = MlpModel(input_size=input_shape, + hidden_sizes=fc_sizes, + output_size=output_size, + nonlinearity=torch.nn.ReLU) + + # normalize obs + if normalize_observation: + self.obs_rms = RunningMeanStdModel(observation_shape) + self.norm_obs_clip = norm_obs_clip + self.norm_obs_var_clip = norm_obs_var_clip + self.normalize_observation = normalize_observation + + # head network + if dueling: + self.head = DuelingHeadModel(fc_sizes, fc_sizes, output_size) + else: + self.head = MlpModel(input_size=fc_sizes[0], + hidden_sizes=fc_sizes, + output_size=output_size, + nonlinearity=torch.nn.ReLU) + + # self.tot_mean = 0 + # self.tot_std = 0 + # self.n_data = 0 + + def forward(self, observation, prev_action, prev_reward): + """ + Compute action Q-value estimates from input state. + Infers leading dimensions of input: can be [T,B], [B], or []; provides + returns with same leading dims. Used in both sampler and in algorithm (both + via the agent). + """ + observation = observation.type(torch.float) + lead_dim, T, B, obs_shape = infer_leading_dims(observation, self._obs_ndim) + + if self.normalize_observation: + obs_var = self.obs_rms.var + if self.norm_obs_var_clip is not None: + obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip) + observation = torch.clamp((observation - self.obs_rms.mean) / + obs_var.sqrt(), -self.norm_obs_clip, self.norm_obs_clip) + + ## TODO: test this way for normalization too + # obs = observation.view(T * B, -1) + # new_mean = obs.mean(dim=0) + # new_std = obs.std(dim=0) + # self.tot_mean = (new_mean * obs.shape[1] + self.tot_mean * self.n_data) / (obs.shape[1] + self.n_data) + # self.tot_std = (new_std * obs.shape[1] + self.tot_std * self.n_data) / (obs.shape[1] + self.n_data) + # obs = (obs - self.tot_mean) / (self.tot_std + 1e-8) + # self.n_data += obs.shape[1] + + obs = observation.view(T * B, -1) + q = self.base_net(obs) + # q = torch.relu(q) + # q = self.head(x) + q = restore_leading_dims(q, lead_dim, T, B) + return q + + def update_obs_rms(self, observation): + if self.normalize_observation: + self.obs_rms.update(observation) + + def init_weights(self, m): + if type(m) == torch.nn.Linear: + torch.nn.init.xavier_normal_(m.weight.data) + torch.nn.init.uniform_(m.bias.data) + diff --git a/rlpyt/models/dqn/deepdrive_r2d1_model.py b/rlpyt/models/dqn/deepdrive_r2d1_model.py new file mode 100644 index 00000000..8c36d8cd --- /dev/null +++ b/rlpyt/models/dqn/deepdrive_r2d1_model.py @@ -0,0 +1,92 @@ + +import torch + +from rlpyt.utils.tensor import infer_leading_dims, restore_leading_dims +from rlpyt.utils.collections import namedarraytuple +from rlpyt.models.mlp import MlpModel +from rlpyt.models.dqn.dueling import DuelingHeadModel +from rlpyt.models.running_mean_std import RunningMeanStdModel + +RnnState = namedarraytuple("RnnState", ["h", "c"]) + + +class DeepdriveR2d1Model(torch.nn.Module): + """MLP network feeding into an LSTM and MLP output for Q-value outputs for + the action set.""" + def __init__( + self, + observation_shape, + output_size, + fc_size=128, # Between mlp and lstm. + lstm_size=128, + head_size=128, + dueling=True, + normalize_observation=False, + norm_obs_clip = 10, + norm_obs_var_clip = 1e-6, + ): + """Instantiates the neural network according to arguments; network defaults + stored within this method.""" + super().__init__() + self._obs_n_dim = len(observation_shape) + self.normalize_observation=normalize_observation + self.dueling = dueling + input_shape = observation_shape[0] + + if self.normalize_observation: + self.obs_rms = RunningMeanStdModel(observation_shape) + self.norm_obs_clip = norm_obs_clip + self.norm_obs_var_clip = norm_obs_var_clip + + self.mlp = MlpModel(input_size=input_shape, + hidden_sizes=[256], + output_size=fc_size, + nonlinearity=torch.nn.Tanh # Match spinningup + ) + self.lstm = torch.nn.LSTM(fc_size + output_size + 1, lstm_size) + if dueling: + self.head = DuelingHeadModel(lstm_size, head_size, output_size) + else: + self.head = MlpModel(input_size=lstm_size, + hidden_sizes=head_size, + output_size=output_size, + nonlinearity=torch.nn.Tanh) #TODO: test with Tanh + + def forward(self, observation, prev_action, prev_reward, init_rnn_state): + """Feedforward layers process as [T*B,H]. Return same leading dims as + input, can be [T,B], [B], or [].""" + obz = observation.type(torch.float) # Expect torch.uint8 inputs + + # Infer (presence of) leading dimensions: [T,B], [B], or []. + lead_dim, T, B, _ = infer_leading_dims(obz, self._obs_n_dim) + + if self.normalize_observation: + obs_var = self.obs_rms.var + if self.norm_obs_var_clip is not None: + obs_var = torch.clamp(obs_var, min=self.norm_obs_var_clip) + observation = torch.clamp((observation - self.obs_rms.mean) / + obs_var.sqrt(), -self.norm_obs_clip, self.norm_obs_clip) + + mlp_out = self.mlp(observation.view(T * B, -1)) + + lstm_input = torch.cat([ + mlp_out.view(T, B, -1), + prev_action.view(T, B, -1), + prev_reward.view(T, B, 1), + ], dim=2) + + init_rnn_state = None if init_rnn_state is None else tuple(init_rnn_state) + lstm_out, (hn, cn) = self.lstm(lstm_input, init_rnn_state) + + q = self.head(lstm_out.view(T * B, -1)) + + # Restore leading dimensions: [T,B], [B], or [], as input. + q = restore_leading_dims(q, lead_dim, T, B) + # Model should always leave B-dimension in rnn state: [N,B,H]. + next_rnn_state = RnnState(h=hn, c=cn) + + return q, next_rnn_state + + def update_obs_rms(self, observation): + if self.normalize_observation: + self.obs_rms.update(observation) \ No newline at end of file diff --git a/rlpyt/runners/async_rl.py b/rlpyt/runners/async_rl.py index bdad26ea..628dbba1 100644 --- a/rlpyt/runners/async_rl.py +++ b/rlpyt/runners/async_rl.py @@ -87,6 +87,10 @@ def train(self): throttle_itr, delta_throttle_itr = self.startup() throttle_time = 0. sampler_itr = itr = 0 + + # TODO: train mode + self.agent.model.train() + if self._eval: while self.ctrl.sampler_itr.value < 1: # Sampler does eval first. time.sleep(THROTTLE_WAIT) diff --git a/rlpyt/runners/minibatch_rl.py b/rlpyt/runners/minibatch_rl.py index f27a7e6c..e5b57606 100644 --- a/rlpyt/runners/minibatch_rl.py +++ b/rlpyt/runners/minibatch_rl.py @@ -256,10 +256,14 @@ def train(self): self.agent.sample_mode(itr) # Might not be this agent sampling. samples, traj_infos = self.sampler.obtain_samples(itr) self.agent.train_mode(itr) + # sampling_eps = self.agent.distribution._epsilon opt_info = self.algo.optimize_agent(itr, samples) self.store_diagnostics(itr, traj_infos, opt_info) if (itr + 1) % self.log_interval_itrs == 0: self.log_diagnostics(itr) + + # logger.log(f"epsilon value: {self.agent.distribution._epsilon}") + self.shutdown() def initialize_logging(self): @@ -278,6 +282,13 @@ def log_diagnostics(self, itr): logger.record_tabular('NewCompletedTrajs', self._new_completed_trajs) logger.record_tabular('StepsInTrajWindow', sum(info["Length"] for info in self._traj_infos)) + + ## TODO: log epsilon for dqn agents + # try: + # logger.record_tabular('Epsilon', sampling_eps) + # except: + # pass + super().log_diagnostics(itr) self._new_completed_trajs = 0 @@ -307,10 +318,12 @@ def train(self): self.agent.sample_mode(itr) samples, traj_infos = self.sampler.obtain_samples(itr) self.agent.train_mode(itr) + # sampling_eps = self.agent.distribution._epsilon opt_info = self.algo.optimize_agent(itr, samples) self.store_diagnostics(itr, traj_infos, opt_info) if (itr + 1) % self.log_interval_itrs == 0: eval_traj_infos, eval_time = self.evaluate_agent(itr) + #TODO: save best model self.log_diagnostics(itr, eval_traj_infos, eval_time) self.shutdown() @@ -345,4 +358,11 @@ def log_diagnostics(self, itr, eval_traj_infos, eval_time): logger.record_tabular('TrajsInEval', len(eval_traj_infos)) self._cum_eval_time += eval_time logger.record_tabular('CumEvalTime', self._cum_eval_time) + + ## TODO: log epsilon for dqn agents + # try: + # logger.record_tabular('Epsilon', sampling_eps) + # except: + # pass + super().log_diagnostics(itr, eval_traj_infos, eval_time) diff --git a/rlpyt/utils/logging/context.py b/rlpyt/utils/logging/context.py index 2d0b0532..6e11e353 100644 --- a/rlpyt/utils/logging/context.py +++ b/rlpyt/utils/logging/context.py @@ -4,7 +4,8 @@ import os.path as osp from contextlib import contextmanager try: - from torch.utils.tensorboard.writer import SummaryWriter + # from torch.utils.tensorboard.writer import SummaryWriter + from torch.utils.tensorboard import SummaryWriter except ImportError: print("Unable to import tensorboard SummaryWriter, proceeding without.") @@ -14,8 +15,8 @@ def get_log_dir(experiment_name): - yyyymmdd = datetime.datetime.today().strftime("%Y%m%d") - log_dir = osp.join(LOG_DIR, "local", yyyymmdd, experiment_name) + date_str = datetime.datetime.now().strftime('%Y_%m-%d_%H-%M.%S') + log_dir = osp.join(LOG_DIR, "local", date_str, experiment_name) return log_dir @@ -23,7 +24,7 @@ def get_log_dir(experiment_name): def logger_context( log_dir, run_ID, name, log_params=None, snapshot_mode="none", override_prefix=False, use_summary_writer=False, -): + ): """Use as context manager around calls to the runner's ``train()`` method. Sets up the logger directory and filenames. Unless override_prefix is True, this function automatically prepends ``log_dir`` with the rlpyt logging @@ -52,7 +53,7 @@ def logger_context( exp_dir = osp.abspath(log_dir) if LOG_DIR != osp.commonpath([exp_dir, LOG_DIR]) and not override_prefix: print(f"logger_context received log_dir outside of {LOG_DIR}: " - f"prepending by {LOG_DIR}/local//") + f"prepending by {LOG_DIR}/local//") exp_dir = get_log_dir(log_dir) tabular_log_file = osp.join(exp_dir, "progress.csv") text_log_file = osp.join(exp_dir, "debug.log") diff --git a/rlpyt/utils/wrappers.py b/rlpyt/utils/wrappers.py new file mode 100644 index 00000000..78dfbff9 --- /dev/null +++ b/rlpyt/utils/wrappers.py @@ -0,0 +1,43 @@ +from rlpyt.envs.base import Env +from rlpyt.envs.gym import IntBox +import gym + +class DeepDriveDiscretizeActionWrapper(gym.ActionWrapper, Env): + """ Discretizes the action space of deepdrive_zero env. + """ + def __init__(self, env): + super(DeepDriveDiscretizeActionWrapper, self).__init__(env) + discrete_steer = [-.3, -.2, -.1, 0, 0.1, .2, .3] #list(np.arange(-0.45, 0.451, 0.15)) #list(np.arange(-1, 1.01, 0.08)) + discrete_acc = [-1, 0.5, 1] + # discrete_brake = [-1, 0, 1] + self.discrete_act = [discrete_steer, discrete_acc] # acc, steer + self.n_steer = len(self.discrete_act[0]) + self.n_acc = len(self.discrete_act[1]) + # self.n_brake = len(self.discrete_act[2]) + self.action_space = gym.spaces.Discrete(self.n_steer * self.n_acc) + # self.action_space = IntBox(low=0, high=self.n_acc * self.n_steer) + + self.action_items = [] + for s in discrete_steer: + for a in discrete_acc: + # for b in discrete_brake: + # self.action_items.append([s, a, b]) + if a >= 0: + self.action_items.append([s, a, 0]) + else: + self.action_items.append([s, 0, -a]) + + def step(self, action): + # action input is continues: + # **steer** + # > Heading angle of the ego + # + # **accel** + # > m/s/s of the ego, positive for forward, negative for reverse + # + # **brake** + # > From 0g at -1 to 1g at 1 of brake force + # [steer, accel, brake] + + act = self.action_items[action] + return self.env.step(act)