s2t2 · s2t2 · Jun 19, 2025
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,6 +29,7 @@ typing-extensions = "^4.12.2"
 ipython = "^8.27.0"
 importlib-resources = { version = "*", python = "<3.11" }
 python-dotenv = "^1.1.0"
+tqdm = "^4.67.1"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.5"
@@ -69,7 +70,6 @@ force-exclude = '''
 )
 '''
 
-
 [tool.isort]
 lexicographical = true
 group_by_package = true

diff --git a/smart_control/environment/environment.py b/smart_control/environment/environment.py
@@ -374,7 +374,6 @@ def __init__(
       image_generator: (
           building_image_generator.BuildingImageGenerator | None
       ) = None,
-      step_interval: pd.Timedelta = pd.Timedelta(5, unit="minutes"),
       writer_factory: writer_lib.BaseWriterFactory | None = None,
   ) -> None:
     """Environment constructor.
@@ -427,7 +426,9 @@ def __init__(
     self._end_timestamp: pd.Timestamp = self._start_timestamp + pd.Timedelta(
         num_days_in_episode, unit="days"
     )
-    self._step_interval = step_interval
+    self._step_interval = self.building.time_step_sec * pd.Timedelta(
+        1, unit="seconds"
+    )
     self._num_timesteps_in_episode = int(
         (self._end_timestamp - self._start_timestamp) / self._step_interval
     )

diff --git a/smart_control/environment/environment_test.py b/smart_control/environment/environment_test.py
@@ -720,7 +720,7 @@ def test_step(self):
       (pd.Timedelta(1, unit="minute")),
       (pd.Timedelta(1, unit="hour")),
   )
-  def test_validate_environment(self, step_interval):
+  def test_validate_environment(self):
     class TerminatingEnv(environment.Environment):
       """Environment that terminates after a fixed number of steps.
 
@@ -734,15 +734,13 @@ def __init__(
           obs_normalizer,
           action_config,
           discount_factor: float = 1,
-          step_interval: pd.Timedelta = pd.Timedelta(1, unit="minute"),
       ):
         super().__init__(
             building,
             reward_function,
             obs_normalizer,
             action_config,
             discount_factor,
-            step_interval=step_interval,
         )
         self.counter = 0
 
@@ -762,7 +760,6 @@ def _step(self, action) -> ts.TimeStep:
         reward_function,
         obs_normalizer,
         action_config,
-        step_interval=step_interval,
     )
 
     utils.validate_py_environment(env, episodes=5)

diff --git a/smart_control/reinforcement_learning/agents/ddpg_agent.py b/smart_control/reinforcement_learning/agents/ddpg_agent.py
@@ -0,0 +1,143 @@
+"""DDPG Agent implementation.
+
+This module provides a function to create a DDPG agent with customizable
+parameters.
+"""
+
+from typing import Optional, Sequence
+
+import tensorflow as tf
+from tf_agents.agents import tf_agent
+from tf_agents.agents.ddpg import ddpg_agent
+from tf_agents.networks import network
+from tf_agents.typing import types
+
+from smart_control.reinforcement_learning.agents.networks.ddpg_networks import create_sequential_actor_network
+from smart_control.reinforcement_learning.agents.networks.ddpg_networks import create_sequential_critic_network
+
+
+def create_ddpg_agent(
+    time_step_spec: types.TimeStep,
+    action_spec: types.NestedTensorSpec,
+    # Actor network parameters
+    actor_fc_layers: Sequence[int] = (128, 128),
+    actor_network: Optional[network.Network] = None,
+    # Critic network parameters
+    critic_obs_fc_layers: Sequence[int] = (128, 64),
+    critic_action_fc_layers: Sequence[int] = (128, 64),
+    critic_joint_fc_layers: Sequence[int] = (128, 64),
+    critic_network: Optional[network.Network] = None,
+    # Optimizer parameters
+    actor_learning_rate: float = 3e-4,
+    critic_learning_rate: float = 3e-4,
+    # Agent parameters
+    ou_stddev: float = 1.0,
+    ou_damping: float = 1.0,
+    gamma: float = 0.99,
+    target_update_tau: float = 0.005,
+    target_update_period: int = 1,
+    reward_scale_factor: float = 1.0,
+    # Training parameters
+    gradient_clipping: Optional[float] = None,
+    debug_summaries: bool = False,
+    summarize_grads_and_vars: bool = False,
+    train_step_counter: Optional[tf.Variable] = None,
+) -> tf_agent.TFAgent:
+  """Builds and initializes a TF-Agents DDPG agent.
+
+  Actor and critic networks are built from the layer-size arguments unless
+  ready-made networks are passed in. Each network gets its own Adam optimizer,
+  `tf.math.squared_difference` is passed as the TD-error loss function, and
+  the agent is initialized before it is returned.
+
+  Args:
+      time_step_spec: A `TimeStep` spec of the expected time_steps.
+      action_spec: A nest of BoundedTensorSpec representing the actions.
+      actor_fc_layers: Units of the fully connected layers of the default
+          actor network. Unused when `actor_network` is given.
+      actor_network: Optional custom actor network. When None, one is built
+          from `actor_fc_layers` and `action_spec`.
+      critic_obs_fc_layers: Units of the fully connected layers of the default
+          critic's observation branch. Unused when `critic_network` is given.
+      critic_action_fc_layers: Units of the fully connected layers of the
+          default critic's action branch. Unused when `critic_network` is
+          given.
+      critic_joint_fc_layers: Units of the fully connected layers of the
+          default critic's joint branch. Unused when `critic_network` is given.
+      critic_network: Optional custom critic network. When None, one is built
+          from the three `critic_*_fc_layers` arguments.
+      actor_learning_rate: Learning rate of the actor's Adam optimizer.
+      critic_learning_rate: Learning rate of the critic's Adam optimizer.
+      ou_stddev: Standard deviation for the Ornstein-Uhlenbeck (OU) noise added
+          for exploration.
+      ou_damping: Damping factor for the OU noise.
+      gamma: Discount factor for future rewards.
+      target_update_tau: Factor for soft update of target networks.
+      target_update_period: Period for soft update of target networks.
+      reward_scale_factor: Multiplicative scale for the reward.
+      gradient_clipping: Norm length to clip gradients.
+      debug_summaries: Whether to emit debug summaries.
+      summarize_grads_and_vars: Whether to summarize gradients and variables.
+      train_step_counter: Optional counter variable passed to the agent. When
+          None, a new non-trainable int64 `tf.Variable` starting at 0 is
+          created here; the global step is not looked up.
+
+  Returns:
+      The DDPG agent, with `initialize()` already called on it.
+  """
+  # Fall back to a fresh step counter when the caller did not supply one.
+  if train_step_counter is None:
+    train_step_counter = tf.Variable(0, trainable=False, dtype=tf.int64)
+
+  # Default actor: fully connected layers sized by `actor_fc_layers`.
+  if actor_network is None:
+    actor_network = create_sequential_actor_network(
+        actor_fc_layers=actor_fc_layers,
+        action_tensor_spec=action_spec,
+    )
+
+  # Default critic: observation, action and joint branches.
+  if critic_network is None:
+    critic_network = create_sequential_critic_network(
+        obs_fc_layer_units=critic_obs_fc_layers,
+        action_fc_layer_units=critic_action_fc_layers,
+        joint_fc_layer_units=critic_joint_fc_layers,
+    )
+
+  # One Adam optimizer per network.
+  actor_optimizer = tf.keras.optimizers.Adam(
+      learning_rate=actor_learning_rate
+  )
+  critic_optimizer = tf.keras.optimizers.Adam(
+      learning_rate=critic_learning_rate
+  )
+
+  agent = ddpg_agent.DdpgAgent(
+      # Specs and networks.
+      time_step_spec=time_step_spec,
+      action_spec=action_spec,
+      actor_network=actor_network,
+      critic_network=critic_network,
+      # Optimization.
+      actor_optimizer=actor_optimizer,
+      critic_optimizer=critic_optimizer,
+      gradient_clipping=gradient_clipping,
+      td_errors_loss_fn=tf.math.squared_difference,
+      # Exploration noise.
+      ou_stddev=ou_stddev,
+      ou_damping=ou_damping,
+      # Target networks and returns.
+      target_update_tau=target_update_tau,
+      target_update_period=target_update_period,
+      gamma=gamma,
+      reward_scale_factor=reward_scale_factor,
+      # Summaries and step counting.
+      debug_summaries=debug_summaries,
+      summarize_grads_and_vars=summarize_grads_and_vars,
+      train_step_counter=train_step_counter,
+  )
+
+  # Initialize the agent before handing it back to the caller.
+  agent.initialize()
+
+  return agent
diff --git a/smart_control/reinforcement_learning/agents/networks/ddpg_networks.py b/smart_control/reinforcement_learning/agents/networks/ddpg_networks.py
@@ -0,0 +1,132 @@
+"""Network architectures for DDPG agent.
+
+This module provides functions to create actor and critic networks for
+DDPG agents.
+"""
+
+import functools
+from typing import Sequence
+
+import tensorflow as tf
+from tf_agents.keras_layers import inner_reshape
+from tf_agents.networks import nest_map
+from tf_agents.networks import sequential
+from tf_agents.typing import types
+from tf_agents.utils import common
+
+# Utility to create dense layers with consistent initialization and activation
+dense = functools.partial(
+    tf.keras.layers.Dense,
+    activation=tf.keras.activations.relu,
+    kernel_initializer=tf.compat.v1.variance_scaling_initializer(
+        scale=1.0 / 3.0, mode='fan_in', distribution='uniform'
+    ),
+)
+
+
+def create_identity_layer() -> tf.keras.layers.Layer:
+  """Builds a pass-through Keras layer.
+
+  Returns:
+      A `Lambda` layer whose function returns its input unchanged.
+  """
+  return tf.keras.layers.Lambda(lambda inputs: inputs)
+
+
+def create_fc_network(layer_units: Sequence[int]) -> sequential.Sequential:
+  """Stacks dense layers into a sequential network.
+
+  Args:
+      layer_units: Number of units of each dense layer, in stacking order.
+
+  Returns:
+      A `sequential.Sequential` holding one `dense` layer per entry.
+  """
+  return sequential.Sequential([dense(num_units) for num_units in layer_units])
+
+
+def create_sequential_actor_network(
+    actor_fc_layers: Sequence[int],
+    action_tensor_spec: types.NestedTensorSpec,
+) -> sequential.Sequential:
+  """Create a sequential actor network for DDPG.
+
+  Args:
+      actor_fc_layers: Units for actor network fully connected layers.
+      action_tensor_spec: The action spec; must flatten to exactly one spec.
+
+  Returns:
+      A sequential actor network whose tanh output is scaled to the spec.
+
+  Raises:
+      ValueError: If the spec is an empty nest or holds more than one spec.
+  """
+  flat_action_specs = tf.nest.flatten(action_tensor_spec)
+  if len(flat_action_specs) != 1:
+    raise ValueError('Only a single action tensor is supported by this network')
+  (single_action_spec,) = flat_action_specs
+  hidden_layers = [dense(num_units) for num_units in actor_fc_layers]
+  action_layer = tf.keras.layers.Dense(
+      single_action_spec.shape.num_elements(),
+      activation=tf.keras.activations.tanh,
+      kernel_initializer=tf.keras.initializers.RandomUniform(
+          minval=-0.003, maxval=0.003
+      ),
+  )
+  scaling_layer = tf.keras.layers.Lambda(
+      lambda x: common.scale_to_spec(x, single_action_spec)
+  )
+  return sequential.Sequential(hidden_layers + [action_layer, scaling_layer])
+
+
+def create_sequential_critic_network(
+    obs_fc_layer_units: Sequence[int],
+    action_fc_layer_units: Sequence[int],
+    joint_fc_layer_units: Sequence[int],
+) -> sequential.Sequential:
+  """Builds the DDPG critic as a sequential network.
+
+  The `(observation, action)` input pair goes through separate observation and
+  action towers; their outputs are concatenated, run through a joint tower and
+  reduced to a scalar value by one linear unit. A tower with no layer units
+  is replaced by an identity layer.
+
+  Args:
+      obs_fc_layer_units: Dense layer units of the observation tower.
+      action_fc_layer_units: Dense layer units of the action tower.
+      joint_fc_layer_units: Dense layer units of the joint tower.
+
+  Returns:
+      The assembled `sequential.Sequential` critic.
+  """
+
+  def split_inputs(inputs):
+    return {'observation': inputs[0], 'action': inputs[1]}
+
+  def build_tower(units):
+    # A falsy unit sequence means "no layers": pass the tensor through.
+    if not units:
+      return create_identity_layer()
+    return create_fc_network(units)
+
+  obs_tower = build_tower(obs_fc_layer_units)
+  action_tower = build_tower(action_fc_layer_units)
+  joint_tower = build_tower(joint_fc_layer_units)
+
+  small_uniform = tf.keras.initializers.RandomUniform(
+      minval=-0.003, maxval=0.003
+  )
+  value_head = tf.keras.layers.Dense(
+      1, activation=None, kernel_initializer=small_uniform
+  )
+
+  critic_layers = [
+      tf.keras.layers.Lambda(split_inputs),
+      nest_map.NestMap({'observation': obs_tower, 'action': action_tower}),
+      nest_map.NestFlatten(),
+      tf.keras.layers.Concatenate(),
+      joint_tower,
+      value_head,
+      inner_reshape.InnerReshape([1], []),
+  ]
+  return sequential.Sequential(critic_layers)
diff --git a/smart_control/reinforcement_learning/agents/networks/sac_networks.py b/smart_control/reinforcement_learning/agents/networks/sac_networks.py
@@ -116,6 +116,10 @@ def call(self, inputs, **kwargs):
     kwargs['outer_rank'] = self.predefined_outer_rank
     if 'step_type' in kwargs:
       del kwargs['step_type']
+    # The parent `call` below was reported to reject an unexpected
+    # `network_state` argument. Drop it only when present (like `step_type`
+    # above) so that calls made without it do not raise KeyError.
+    kwargs.pop('network_state', None)
     return super(_TanhNormalProjectionNetworkWrapper, self).call(
         inputs, **kwargs
     )

diff --git a/smart_control/reinforcement_learning/agents/networks/td3_networks.py b/smart_control/reinforcement_learning/agents/networks/td3_networks.py
diff --git a/...ing_run_1day_2025_06_19-21:10:54/collect/events.out.tfevents.1750367497.devbox.26123.0.v2 b/...ing_run_1day_2025_06_19-21:10:54/collect/events.out.tfevents.1750367497.devbox.26123.0.v2
diff --git a/.../experiment_results/test_training_run_1day_2025_06_19-21:10:54/experiment_parameters.json b/.../experiment_results/test_training_run_1day_2025_06_19-21:10:54/experiment_parameters.json
@@ -0,0 +1,15 @@
+{
+    "starter_buffer_path": "/app/smart_control/utils/../../smart_control/reinforcement_learning/replay_buffer_data/initial_exploration_buffer",
+    "experiment_name": "test_training_run_1day",
+    "agent_type": "sac",
+    "train_iterations": 10,
+    "collect_steps_per_iteration": 50,
+    "batch_size": 256,
+    "log_interval": 1,
+    "eval_interval": 10,
+    "num_eval_episodes": 1,
+    "checkpoint_interval": 10,
+    "learner_iterations": 200,
+    "scenario_config_path": "/tmp/gin_configs/config_timestepsec-300_numdaysinepisode-1_starttimestamp-2023-07-06.gin",
+    "timestamp": "2025_06_19-21:10:54"
+}
diff --git a/...g/experiment_results/test_training_run_1day_2025_06_19-21:10:54/experiment_parameters.txt b/...g/experiment_results/test_training_run_1day_2025_06_19-21:10:54/experiment_parameters.txt
@@ -0,0 +1,16 @@
+Experiment Parameters:
+=====================
+
+starter_buffer_path: /app/smart_control/utils/../../smart_control/reinforcement_learning/replay_buffer_data/initial_exploration_buffer
+experiment_name: test_training_run_1day
+agent_type: sac
+train_iterations: 10
+collect_steps_per_iteration: 50
+batch_size: 256
+log_interval: 1
+eval_interval: 10
+num_eval_episodes: 1
+checkpoint_interval: 10
+learner_iterations: 200
+scenario_config_path: /tmp/gin_configs/config_timestepsec-300_numdaysinepisode-1_starttimestamp-2023-07-06.gin
+timestamp: 2025_06_19-21:10:54
diff --git a/...ining_run_1day_2025_06_19-21:10:54/replay_buffer/2025-06-19T21:10:42.103748744+00:00/DONE b/...ining_run_1day_2025_06_19-21:10:54/replay_buffer/2025-06-19T21:10:42.103748744+00:00/DONE
diff --git a/...day_2025_06_19-21:10:54/replay_buffer/2025-06-19T21:10:42.103748744+00:00/chunks.tfrecord b/...day_2025_06_19-21:10:54/replay_buffer/2025-06-19T21:10:42.103748744+00:00/chunks.tfrecord
diff --git a/...1day_2025_06_19-21:10:54/replay_buffer/2025-06-19T21:10:42.103748744+00:00/items.tfrecord b/...1day_2025_06_19-21:10:54/replay_buffer/2025-06-19T21:10:42.103748744+00:00/items.tfrecord
diff --git a/...day_2025_06_19-21:10:54/replay_buffer/2025-06-19T21:10:42.103748744+00:00/tables.tfrecord b/...day_2025_06_19-21:10:54/replay_buffer/2025-06-19T21:10:42.103748744+00:00/tables.tfrecord