diff --git a/ding/envs/env/ding_env_wrapper.py b/ding/envs/env/ding_env_wrapper.py index 83fab06048..dc67e826bd 100644 --- a/ding/envs/env/ding_env_wrapper.py +++ b/ding/envs/env/ding_env_wrapper.py @@ -26,16 +26,16 @@ class DingEnvWrapper(BaseEnv): def __init__(self, env: gym.Env = None, cfg: dict = None, seed_api: bool = True, caller: str = 'collector') -> None: """ Overview: - Initialize the DingEnvWrapper. Either an environment instance or a config to create the environment - instance should be passed in: - - An environment instance: The `env` parameter must not be `None`, but should be the instance. - It does not support subprocess environment manager. Thus, it is usually used in simple environments. - - A config to create an environment instance: The `cfg` parameter must contain `env_id`. + Initialize the DingEnvWrapper. Either an environment instance or a config to create the environment \ + instance should be passed in. For the former, i.e., an environment instance: The `env` parameter must not \ + be `None`, but should be the instance. It does not support subprocess environment manager. Thus, it is \ + usually used in simple environments. For the latter, i.e., a config to create an environment instance: \ + The `cfg` parameter must contain `env_id`. Arguments: - env (:obj:`gym.Env`): An environment instance to be wrapped. - cfg (:obj:`dict`): The configuration dictionary to create an environment instance. - seed_api (:obj:`bool`): Whether to use seed API. Defaults to True. - - caller (:obj:`str`): A string representing the caller of this method, including ``collector`` or + - caller (:obj:`str`): A string representing the caller of this method, including ``collector`` or \ ``evaluator``. Different caller may need different wrappers. Default is 'collector'. 
""" self._env = None @@ -44,7 +44,7 @@ def __init__(self, env: gym.Env = None, cfg: dict = None, seed_api: bool = True, self._seed_api = seed_api # some env may disable `env.seed` api self._caller = caller if self._cfg is None: - self._cfg = dict() + self._cfg = {} self._cfg = EasyDict(self._cfg) if 'act_scale' not in self._cfg: self._cfg.act_scale = False diff --git a/ding/envs/env_manager/base_env_manager.py b/ding/envs/env_manager/base_env_manager.py index 96aa43f817..529d5b0225 100644 --- a/ding/envs/env_manager/base_env_manager.py +++ b/ding/envs/env_manager/base_env_manager.py @@ -562,6 +562,9 @@ def closed(self) -> bool: """ return self._closed + def random_action(self) -> Dict: + return {env_id: self._env_ref.action_space.sample() for env_id in self.ready_obs_id} + @ENV_MANAGER_REGISTRY.register('base_v2') class BaseEnvManagerV2(BaseEnvManager): @@ -577,7 +580,8 @@ class BaseEnvManagerV2(BaseEnvManager): .. note:: For more details about new task pipeline, please refer to the system document of DI-engine \ - (`en link <../03_system/index.html>`_). + (`system en link <../03_system/index.html>`_). + Interfaces: reset, step, seed, close, enable_save_replay, launch, default_config, reward_shaping, enable_save_figure Properties: diff --git a/ding/envs/env_wrappers/env_wrappers.py b/ding/envs/env_wrappers/env_wrappers.py index 76c0880d59..f62de83352 100644 --- a/ding/envs/env_wrappers/env_wrappers.py +++ b/ding/envs/env_wrappers/env_wrappers.py @@ -39,6 +39,8 @@ - GymToGymnasiumWrapper: Adapts environments from the Gym library to be compatible with the Gymnasium library. - AllinObsWrapper: Consolidates all information into the observation, useful for environments where the agent's observation should include additional information such as the current score or time remaining. +- ObsPlusPrevActRewWrapper: This wrapper is used in policy NGU. It sets a dict as the new wrapped observation, + which includes the current observation, previous action and previous reward. 
""" import copy diff --git a/ding/model/common/head.py b/ding/model/common/head.py index 30f5b58d98..99e94a85b1 100755 --- a/ding/model/common/head.py +++ b/ding/model/common/head.py @@ -1293,6 +1293,7 @@ def forward(self, key: torch.Tensor, query: torch.Tensor) -> torch.Tensor: >>> query = torch.randn(4, 64) >>> logit = head(key, query) >>> assert logit.shape == torch.Size([4, 5]) + .. note:: In this head, we assume that the ``key`` and ``query`` tensor are both normalized. """ diff --git a/ding/model/common/utils.py b/ding/model/common/utils.py index 0ca8df7fb5..f74a179962 100644 --- a/ding/model/common/utils.py +++ b/ding/model/common/utils.py @@ -21,6 +21,7 @@ def create_model(cfg: EasyDict) -> torch.nn.Module: >>> 'action_shape': 2, >>> }) >>> model = create_model(cfg) + .. tip:: This method will not modify the ``cfg`` , it will deepcopy the ``cfg`` and then modify it. """ diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index b2dd815287..4a63c3dcc6 100755 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -26,3 +26,4 @@ from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS from .bcq import BCQ from .edac import EDAC +from .ebm import EBM, AutoregressiveEBM diff --git a/ding/model/template/acer.py b/ding/model/template/acer.py index bb46b22bec..44bb386cba 100644 --- a/ding/model/template/acer.py +++ b/ding/model/template/acer.py @@ -85,40 +85,15 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: Use observation to predict output. Parameter updates with ACER's MLPs forward setup. Arguments: - Forward with ``'compute_actor'``: - - inputs (:obj:`torch.Tensor`): - The encoded embedding tensor, determined with given ``hidden_size``, i.e. ``(B, N=hidden_size)``. - Whether ``actor_head_hidden_size`` or ``critic_head_hidden_size`` depend on ``mode``. - - Forward with ``'compute_critic'``, inputs:`torch.Tensor` Necessary Keys: - - ``obs`` encoded tensors. 
- - mode (:obj:`str`): Name of the forward mode. Returns: - outputs (:obj:`Dict`): Outputs of network forward. - - Forward with ``'compute_actor'``, Necessary Keys (either): - - logit (:obj:`torch.Tensor`): - - logit (:obj:`torch.Tensor`): Logit encoding tensor. - - Forward with ``'compute_critic'``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): Q value tensor. - Actor Shapes: + Shapes (Actor): - obs (:obj:`torch.Tensor`): :math:`(B, N1)`, where B is batch size and N1 is ``obs_shape`` - logit (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` - Critic Shapes: + Shapes (Critic): - inputs (:obj:`torch.Tensor`): :math:`(B, N1)`, B is batch size and N1 corresponds to ``obs_shape`` - q_value (:obj:`torch.FloatTensor`): :math:`(B, N2)`, where B is batch size and N2 is ``action_shape`` - Actor Examples: - >>> # Regression mode - >>> model = ACER(64, 64) - >>> inputs = torch.randn(4, 64) - >>> actor_outputs = model(inputs,'compute_actor') - >>> assert actor_outputs['logit'].shape == torch.Size([4, 64]) - Critic Examples: - >>> inputs = torch.randn(4,N) - >>> model = ACER(obs_shape=(N, ),action_shape=5) - >>> model(inputs, mode='compute_critic')['q_value'] """ assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) return getattr(self, mode)(inputs) @@ -127,7 +102,7 @@ def compute_actor(self, inputs: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to predict output. - Execute parameter updates with ``'compute_actor'`` mode + Execute parameter updates with ``compute_actor`` mode Use encoded embedding tensor to predict output. Arguments: - inputs (:obj:`torch.Tensor`): @@ -156,7 +131,7 @@ def compute_actor(self, inputs: torch.Tensor) -> Dict: def compute_critic(self, inputs: torch.Tensor) -> Dict: """ Overview: - Execute parameter updates with ``'compute_critic'`` mode + Execute parameter updates with ``compute_critic`` mode Use encoded embedding tensor to predict output. 
Arguments: - ``obs``, ``action`` encoded tensors. diff --git a/ding/model/template/maqac.py b/ding/model/template/maqac.py index ba74b97573..2d72e43d53 100644 --- a/ding/model/template/maqac.py +++ b/ding/model/template/maqac.py @@ -1,6 +1,5 @@ from typing import Union, Dict, Optional from easydict import EasyDict -import numpy as np import torch import torch.nn as nn @@ -96,7 +95,7 @@ def __init__( def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: """ Overview: - Use observation tensor to predict output, with ``'compute_actor'`` or ``'compute_critic'`` mode. + Use observation tensor to predict output, with ``compute_actor`` or ``compute_critic`` mode. Arguments: - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: @@ -109,41 +108,11 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ N2 corresponds to ``action_shape``. + - mode (:obj:`str`): The forward mode, all the modes are defined in the beginning of this class. Returns: - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - Forward with ``'compute_actor'``, Necessary Keys (either): - - logit (:obj:`torch.Tensor`): Action's probabilities. - - action_mask (:obj:`torch.Tensor`): Action mask tensor with same size as ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): Q value tensor is the shape of :math:`(B, A, N2)`, where B is batch size \ - and A is agent num. N2 corresponds to ``action_shape``. 
- Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ - A is agent num. N2 corresponds to ``action_shape``. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ - N1 corresponds to ``global_obs_shape``. - - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - Forward with ``'compute_actor'``, Necessary Keys (either): - - logit (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - action_mask (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ - A is agent num. N2 corresponds to ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, \ + whose key-values vary in different forward modes. 
Examples: >>> B = 32 >>> agent_obs_shape = 216 @@ -181,25 +150,11 @@ def compute_actor(self, inputs: Dict) -> Dict: with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ N2 corresponds to ``action_shape``. Returns: - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - - logit (:obj:`torch.Tensor`): Action's probabilities. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, \ + whose key-values vary in different forward modes. + - logit (:obj:`torch.Tensor`): Action's output logit (real value range), whose shape is \ + :math:`(B, A, N2)`, where N2 corresponds to ``action_shape``. - action_mask (:obj:`torch.Tensor`): Action mask tensor with same size as ``action_shape``. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ - N1 corresponds to ``global_obs_shape``. - - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - - logit (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - action_mask (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. 
Examples: >>> B = 32 >>> agent_obs_shape = 216 @@ -237,31 +192,11 @@ def compute_critic(self, inputs: Dict) -> Dict: with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ N2 corresponds to ``action_shape``. Returns: - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): Q value tensor is the shape of :math:`(B, A, N2)`, where B is batch size \ - and A is agent num. N2 corresponds to ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ - A is agent num. N2 corresponds to ``action_shape``. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ - N1 corresponds to ``global_obs_shape``. - - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, whose \ - key-values vary in different forward modes. - if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N2)`, where B is batch size and \ - A is agent num. N2 corresponds to ``action_shape``. 
- if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. + - output (:obj:`Dict[str, torch.Tensor]`): The output dict of DiscreteMAQAC forward computation graph, \ + whose key-values vary in different values of ``twin_critic``. + - q_value (:obj:`list`): If ``twin_critic=True``, q_value should be 2 elements, each is the shape of \ + :math:`(B, A, N2)`, where B is batch size and A is agent num. N2 corresponds to ``action_shape``. \ + Otherwise, q_value should be ``torch.Tensor``. Examples: >>> B = 32 >>> agent_obs_shape = 216 @@ -397,7 +332,7 @@ def __init__( def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: """ Overview: - Use observation and action tensor to predict output in ``'compute_actor'`` or ``'compute_critic'`` mode. + Use observation and action tensor to predict output in ``compute_actor`` or ``compute_critic`` mode. Arguments: - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: @@ -410,54 +345,21 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ N2 corresponds to ``action_shape``. + - ``action`` (:obj:`torch.Tensor`): The action tensor data, \ with shape :math:`(B, A, N3)`, where B is batch size and A is agent num. \ N3 corresponds to ``action_shape``. - mode (:obj:`str`): Name of the forward mode. Returns: - - outputs (:obj:`Dict`): Outputs of network forward. - Forward with ``'compute_actor'``, if action_space == 'regression', Necessary Keys: - - action (:obj:`torch.Tensor`): Action tensor with same size as ``action_shape``. 
- Forward with ``'compute_actor'``, if action_space == 'reparameterization', Necessary Keys: - - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ - A is agent num. N3 corresponds to ``action_shape``. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ - A is agent num. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ - N1 corresponds to ``global_obs_shape``. - - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - ``action`` (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ - N3 corresponds to ``action_shape``. - - outputs (:obj:`Dict`): Outputs of network forward. - Forward with ``'compute_actor'``, if action_space == 'regression', Necessary Keys: - - action (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ - N3 corresponds to ``action_shape``. - Forward with ``'compute_actor'``, if action_space == 'reparameterization', Necessary Keys: - - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ - A is agent num. N3 corresponds to ``action_shape``. 
- Forward with ``'compute_critic'``, if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ - A is agent num. - Forward with ``'compute_critic'``, if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. + - outputs (:obj:`Dict`): Outputs of network forward, whose key-values will be different for different \ + ``mode``, ``twin_critic``, ``action_space``. Examples: >>> B = 32 >>> agent_obs_shape = 216 >>> global_obs_shape = 264 >>> agent_num = 8 >>> action_shape = 14 - >>> action_space = 'regression' - >>> # or - >>> action_space = 'reparameterization' + >>> act_space = 'reparameterization' # regression >>> data = { >>> 'obs': { >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), @@ -466,7 +368,7 @@ def forward(self, inputs: Union[torch.Tensor, Dict], mode: str) -> Dict: >>> }, >>> 'action': torch.randn(B, agent_num, squeeze(action_shape)) >>> } - >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, action_space, twin_critic=False) + >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, act_space, twin_critic=False) >>> if action_space == 'regression': >>> action = model(data['obs'], mode='compute_actor')['action'] >>> elif action_space == 'reparameterization': @@ -485,37 +387,25 @@ def compute_actor(self, inputs: Dict) -> Dict: - ``agent_state`` (:obj:`torch.Tensor`): The agent's observation tensor data, \ with shape :math:`(B, A, N0)`, where B is batch size and A is agent num. \ N0 corresponds to ``agent_obs_shape``. + Returns: - outputs (:obj:`Dict`): Outputs of network forward. - if action_space == 'regression', Necessary Keys: - - action (:obj:`torch.Tensor`): Action tensor with same size as ``action_shape``. 
- if action_space == 'reparameterization', Necessary Keys: - - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ - A is agent num. N3 corresponds to ``action_shape``. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - outputs (:obj:`Dict`): Outputs of network forward. - if action_space == 'regression', Necessary Keys: - - action (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ - N3 corresponds to ``action_shape``. - if action_space == 'reparameterization', Necessary Keys: - - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ - A is agent num. N3 corresponds to ``action_shape``. + ReturnKeys (``action_space == 'regression'``): + - action (:obj:`torch.Tensor`): Action tensor with same size as ``action_shape``. + ReturnKeys (``action_space == 'reparameterization'``): + - logit (:obj:`list`): 2 elements, each is the shape of :math:`(B, A, N3)`, where B is batch size and \ + A is agent num. N3 corresponds to ``action_shape``. 
Examples: >>> B = 32 >>> agent_obs_shape = 216 >>> global_obs_shape = 264 >>> agent_num = 8 >>> action_shape = 14 - >>> action_space = 'regression' - >>> # or - >>> action_space = 'reparameterization' + >>> act_space = 'reparameterization' # 'regression' >>> data = { >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), >>> } - >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, action_space, twin_critic=False) + >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, act_space, twin_critic=False) >>> if action_space == 'regression': >>> action = model.compute_actor(data)['action'] >>> elif action_space == 'reparameterization': @@ -545,42 +435,25 @@ def compute_critic(self, inputs: Dict) -> Dict: - ``action_mask`` (:obj:`torch.Tensor`): The action mask tensor data, \ with shape :math:`(B, A, N2)`, where B is batch size and A is agent num. \ N2 corresponds to ``action_shape``. + - ``action`` (:obj:`torch.Tensor`): The action tensor data, \ with shape :math:`(B, A, N3)`, where B is batch size and A is agent num. \ N3 corresponds to ``action_shape``. + Returns: - outputs (:obj:`Dict`): Outputs of network forward. - if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ - A is agent num. - if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. - Shapes: - - inputs (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``obs`` (:obj:`Dict[str, torch.Tensor]`): The input dict tensor data, has keys: - - ``agent_state`` (:obj:`torch.Tensor`): :math:`(B, A, N0)`, where B is batch size and A is agent num. \ - N0 corresponds to ``agent_obs_shape``. - - ``global_state`` (:obj:`torch.Tensor`): :math:`(B, A, N1)`, where B is batch size and A is agent num. \ - N1 corresponds to ``global_obs_shape``. 
- - ``action_mask`` (:obj:`torch.Tensor`): :math:`(B, A, N2)`, where B is batch size and A is agent num. \ - N2 corresponds to ``action_shape``. - - ``action`` (:obj:`torch.Tensor`): :math:`(B, A, N3)`, where B is batch size and A is agent num. \ - N3 corresponds to ``action_shape``. - - outputs (:obj:`Dict`): Outputs of network forward. - if ``twin_critic`` is ``True``, Necessary Keys: - - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ - A is agent num. - if ``twin_critic`` is ``False``, Necessary Keys: - - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. + ReturnKeys (``twin_critic=True``): + - q_value (:obj:`list`): 2 elements, each is the shape of :math:`(B, A)`, where B is batch size and \ + A is agent num. + ReturnKeys (``twin_critic=False``): + - q_value (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size and A is agent num. Examples: >>> B = 32 >>> agent_obs_shape = 216 >>> global_obs_shape = 264 >>> agent_num = 8 >>> action_shape = 14 - >>> action_space = 'regression' - >>> # or - >>> action_space = 'reparameterization' + >>> act_space = 'reparameterization' # 'regression' >>> data = { >>> 'obs': { >>> 'agent_state': torch.randn(B, agent_num, agent_obs_shape), @@ -589,7 +462,7 @@ def compute_critic(self, inputs: Dict) -> Dict: >>> }, >>> 'action': torch.randn(B, agent_num, squeeze(action_shape)) >>> } - >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, action_space, twin_critic=False) + >>> model = ContinuousMAQAC(agent_obs_shape, global_obs_shape, action_shape, act_space, twin_critic=False) >>> value = model.compute_critic(data)['q_value'] """ diff --git a/ding/model/template/mavac.py b/ding/model/template/mavac.py index cdd521f2b1..78071e6783 100644 --- a/ding/model/template/mavac.py +++ b/ding/model/template/mavac.py @@ -52,8 +52,8 @@ def __init__( - actor_head_layer_num (:obj:`int`): The num of layers used in the 
``actor_head`` network to compute action. - critic_head_hidden_size (:obj:`Optional[int]`): The ``hidden_size`` of ``critic_head`` network, defaults \ to 512, it must match the last element of ``global_obs_shape``. - - critic_head_layer_num (:obj:`int`): - The num of layers used in the network to compute Q value output for critic's nn. + - critic_head_layer_num (:obj:`int`): The num of layers used in the network to compute Q value output for \ + critic's nn. - action_space (:obj:`Union[int, SequenceType]`): The type of different action spaces, including \ ['discrete', 'continuous'], then will instantiate corresponding head, including ``DiscreteHead`` \ and ``ReparameterizationHead``. @@ -180,8 +180,7 @@ def compute_actor(self, x: Dict) -> Dict: - action_mask(optional): (:obj:`torch.Tensor`): When ``action_space`` is discrete, action_mask needs \ to be provided to mask illegal actions. Returns: - - outputs (:obj:`Dict`): - The output dict of MAVAC's forward computation graph for actor, including ``logit``. + - outputs (:obj:`Dict`): The output dict of the forward computation graph for actor, including ``logit``. ReturnsKeys: - logit (:obj:`torch.Tensor`): The predicted action logit tensor, for discrete action space, it will be \ the same dimension real-value ranged tensor of possible action choices, and for continuous action \ @@ -253,7 +252,7 @@ def compute_actor_critic(self, x: Dict) -> Dict: MAVAC forward computation graph for both actor and critic part, input observation to predict action \ logit and state value. Arguments: - - x (:obj:Dict): The input dict contains ``agent_state``, ``global_state`` and other related info. + - x (:obj:`Dict`): The input dict contains ``agent_state``, ``global_state`` and other related info. Returns: - outputs (:obj:`Dict`): The output dict of MAVAC's forward computation graph for both actor and critic, \ including ``logit`` and ``value``. 
diff --git a/ding/model/template/vae.py b/ding/model/template/vae.py index 9839f0e905..f3181361c7 100644 --- a/ding/model/template/vae.py +++ b/ding/model/template/vae.py @@ -184,22 +184,17 @@ def forward(self, input: Dict[str, Tensor], **kwargs) -> dict: 'z': z } - def loss_function(self, args: Dict[str, Tensor], **kwargs) -> dict: + def loss_function(self, args: Dict[str, Tensor], **kwargs) -> Dict[str, Tensor]: """ Overview: Computes the VAE loss function. - KL(N(\mu, \sigma), N(0, 1)) = \log \frac{1}{\sigma} + \frac{\sigma^2 + \mu^2}{2} - \frac{1}{2} Arguments: - - args (:obj:`Dict`): Dict containing keywords `recons_action` (:obj:`torch.Tensor`) \ - and `prediction_residual` (:obj:`torch.Tensor`), `original_action` (:obj:`torch.Tensor`), \ - `mu` (:obj:`torch.Tensor`), `log_var` (:obj:`torch.Tensor`) and \ - `true_residual` (:obj:`torch.Tensor`). - - kwargs (:obj:`Dict`): Dict containing keywords `kld_weight` (:obj:`torch.Tensor`) \ - and `predict_weight` (:obj:`torch.Tensor`). + - args (:obj:`Dict[str, Tensor]`): Dict containing keywords ``recons_action``, ``prediction_residual``, \ + ``original_action``, ``mu``, ``log_var`` and ``true_residual``. + - kwargs (:obj:`Dict`): Dict containing keywords ``kld_weight`` and ``predict_weight``. Returns: - - outputs (:obj: `Dict`): Dict containing keywords `loss` \ - (`obj`:`torch.Tensor`), `reconstruction_loss` (:obj: `torch.Tensor`), \ - `kld_loss` (:obj: `torch.Tensor`) and `predict_loss` (:obj: `torch.Tensor`). + - outputs (:obj:`Dict[str, Tensor]`): Dict containing different ``loss`` results, including ``loss``, \ + ``reconstruction_loss``, ``kld_loss``, ``predict_loss``. Shapes: - recons_action (:obj:`torch.Tensor`): :math:`(B, A)`, where B is batch size \ and A is ``action dim``. 
diff --git a/ding/policy/mbpolicy/mbsac.py b/ding/policy/mbpolicy/mbsac.py index 7af83021e2..1918e161db 100644 --- a/ding/policy/mbpolicy/mbsac.py +++ b/ding/policy/mbpolicy/mbsac.py @@ -36,7 +36,6 @@ class MBSACPolicy(SACPolicy): == ==================== ======== ============= ================================== .. note:: - For other configs, please refer to ding.policy.sac.SACPolicy. """ diff --git a/ding/reward_model/pwil_irl_model.py b/ding/reward_model/pwil_irl_model.py index 5fec46b821..8738ee2d81 100644 --- a/ding/reward_model/pwil_irl_model.py +++ b/ding/reward_model/pwil_irl_model.py @@ -45,10 +45,7 @@ class PwilRewardModel(BaseRewardModel): | ``path`` .pkl | | file 3 | ``sample_size`` int 1000 | sample data from expert dataset | | with fixed size | - 4 | ``alpha`` int 5 | factor alpha | r = alpha * exp( - | (-beta*T/sqrt( - | |s_size|+ |a_size|) - | )*c_i) + 4 | ``alpha`` int 5 | factor alpha | 5 | ``beta`` int 5 | factor beta | 6 | ``s_size`` int 4 | state size | 7 | ``a_size`` int 2 | action size | diff --git a/ding/rl_utils/td.py b/ding/rl_utils/td.py index 7c5b995eaa..4dd2df6c4b 100644 --- a/ding/rl_utils/td.py +++ b/ding/rl_utils/td.py @@ -722,8 +722,6 @@ def bdq_nstep_td_error( Deep Reinforcement Learning", link: https://arxiv.org/pdf/1711.08946. In fact, the original paper only provides the 1-step TD-error calculation method, and here we extend the \ calculation method of n-step, i.e., TD-error: - :math:`y_d = \sigma_{t=0}^{nstep} \gamma^t * r_t + \gamma^{nstep} * Q_d'(s', argmax Q_d(s', a_d))` - :math:`TD-error = \frac{1}{D} * (y_d - Q_d(s, a_d))^2` Arguments: - data (:obj:`q_nstep_td_data`): The input data, q_nstep_td_data to calculate loss - gamma (:obj:`float`): Discount factor