[Feature] Add async policy-server collector benchmark prototype #3872
Draft
vmoens wants to merge 3 commits into
Draft
[Feature] Add async policy-server collector benchmark prototype #3872 — vmoens wants to merge 3 commits into
vmoens wants to merge 3 commits into
Conversation
Convert seq_length to tensor before torch.where operations to avoid torch.compile inductor C++ codegen bugs with mixed scalar/tensor int64 in blendv operations. ghstack-source-id: 8e546ee Pull-Request: pytorch#3298
…/decoders - Add _Contiguous helper module for torch.compile inductor compatibility - Refactor ObsEncoder.forward and ObsDecoder.forward to use flatten/unflatten with contiguous() - Add _maybe_record_function_decorator for profiling ghstack-source-id: a92513f Pull-Request: pytorch#3306
🔗 Helpful Links: 🧪 See artifacts and rendered test results at hud.pytorch.org/pr/pytorch/rl/3872
Note: Links to docs will display an error until the docs builds have been completed. ❌ 10 New Failures. As of commit 7e61859 with merge base d7ef78b: NEW FAILURES - The following jobs have failed:
This comment was automatically generated by Dr. CI and updates every 15 minutes. |
Contributor
Benchmark Results: PR
|
| Benchmark | main ops | PR ops | Change |
|---|---|---|---|
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictReplayBuffer-ListStorage-RandomSampler-400] |
38.99 | 197.95 | +407.71% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictReplayBuffer-ListStorage-SamplerWithoutReplacement-400] |
185.24 | 38.35 | -79.30% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyTensorStorage-SamplerWithoutReplacement-10000] |
2,768 | 3,609 | +30.41% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictReplayBuffer-LazyTensorStorage-RandomSampler-400] |
1,074 | 779.66 | -27.41% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictPrioritizedReplayBuffer-LazyTensorStorage-None-400] |
736.91 | 880.56 | +19.49% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyMemmapStorage-RandomSampler-10000] |
2,814 | 3,315 | +17.80% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictReplayBuffer-LazyTensorStorage-SamplerWithoutReplacement-10000] |
2,756 | 3,243 | +17.65% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyTensorStorage-RandomSampler-10000] |
2,970 | 3,491 | +17.53% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictReplayBuffer-LazyTensorStorage-RandomSampler-10000] |
2,738 | 3,148 | +14.99% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictPrioritizedReplayBuffer-LazyMemmapStorage-None-10000] |
1,897 | 2,173 | +14.57% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictReplayBuffer-LazyMemmapStorage-SamplerWithoutReplacement-10000] |
2,774 | 3,150 | +13.55% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictPrioritizedReplayBuffer-LazyTensorStorage-None-10000] |
1,935 | 2,186 | +12.96% |
benchmarks/test_objectives_benchmarks.py::test_dqn_speed[True-backward] |
880.30 | 988.77 | +12.32% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictReplayBuffer-LazyMemmapStorage-RandomSampler-10000] |
2,602 | 2,891 | +11.08% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictPrioritizedReplayBuffer-LazyMemmapStorage-None-10000] |
1,854 | 2,030 | +9.49% |
benchmarks/test_objectives_benchmarks.py::test_sac_speed[True-backward] |
228.84 | 247.71 | +8.25% |
benchmarks/test_objectives_benchmarks.py::test_gae_speed[vec_generalized_advantage_estimate-True-32-512] |
29.06 | 31.44 | +8.20% |
benchmarks/test_objectives_benchmarks.py::test_ddpg_speed[reduce-overhead-None] |
670.05 | 713.59 | +6.50% |
benchmarks/test_objectives_benchmarks.py::test_a2c_speed[True-backward] |
112.65 | 119.62 | +6.19% |
benchmarks/test_replaybuffer_benchmark.py::TestPrioritizedReplayBufferBenchmark::test_sampler_sample_scale[10000000-cpu] |
53.74 | 50.56 | -5.92% |
benchmarks/test_rnn_reset_backends_benchmark.py::test_rnn_rollout_with_intermediate_resets[b256-t128-i32-h512-scan-True-0-gru] |
4.0714 | 4.2954 | +5.50% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-True-False-False] |
56,708 | 59,778 | +5.41% |
benchmarks/test_objectives_benchmarks.py::test_redq_speed[True-None] |
215.27 | 226.04 | +5.00% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyMemmapStorage-SamplerWithoutReplacement-10000] |
2,649 | 2,776 | +4.79% |
benchmarks/test_replaybuffer_benchmark.py::TestPrioritizedReplayBufferBenchmark::test_sampler_sample_scale[1000000-cpu] |
99.04 | 94.80 | -4.28% |
benchmarks/test_objectives_benchmarks.py::test_dqn_speed[reduce-overhead-None] |
1,780 | 1,855 | +4.25% |
benchmarks/test_compressed_storage_benchmark.py::TestCompressedStorageBenchmark::test_tensor_to_bytestream_speed[safetensors] |
24,052 | 23,033 | -4.23% |
benchmarks/test_non_tensor_env_benchmark.py::test_non_tensor_env_rollout_speed[1000-parallel-buffers-False] |
0.5916 | 0.6157 | +4.07% |
benchmarks/test_objectives_benchmarks.py::test_redq_deprec_speed[reduce-overhead-None] |
270.98 | 281.92 | +4.04% |
benchmarks/test_objectives_benchmarks.py::test_cql_speed[True-backward] |
55.58 | 57.80 | +3.99% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-False-False-True] |
30,412 | 31,582 | +3.85% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-False-False-True] |
28,238 | 29,311 | +3.80% |
benchmarks/test_objectives_benchmarks.py::test_reinforce_speed[True-backward] |
120.41 | 124.98 | +3.80% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_lazystack[50-img_shape0-small] |
4,325 | 4,489 | +3.79% |
benchmarks/test_objectives_benchmarks.py::test_dqn_speed[True-None] |
1,729 | 1,794 | +3.78% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-False-False-True] |
37,360 | 38,737 | +3.69% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-False-True-True] |
19,849 | 20,562 | +3.59% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-True-True-True] |
18,932 | 19,538 | +3.20% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-False-True-True] |
22,060 | 22,756 | +3.15% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-True-False-False] |
51,177 | 52,678 | +2.93% |
benchmarks/test_collectors_benchmark.py::test_async |
17.49 | 17.99 | +2.85% |
benchmarks/test_storage_write_benchmark.py::TestCollectorIntegrationBenchmark::test_collector_with_rb[100-img_shape0-atari] |
25.48 | 26.18 | +2.72% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-True-False-True] |
32,656 | 33,541 | +2.71% |
benchmarks/test_envs_benchmark.py::test_transformed |
0.8863 | 0.9102 | +2.69% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-False-True-True] |
20,039 | 20,558 | +2.59% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_contiguous[200-img_shape3-large_batch] |
783.83 | 764.06 | -2.52% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictReplayBuffer-LazyTensorStorage-SamplerWithoutReplacement-400] |
1,080 | 1,053 | -2.50% |
benchmarks/test_compressed_storage_benchmark.py::TestCompressedStorageBenchmark::test_tensor_to_bytestream_speed[numpy] |
336,314 | 328,307 | -2.38% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_stack_then_write[100-img_shape1-atari] |
270.53 | 276.93 | +2.37% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_stack_then_write[100-img_shape2-large_img] |
173.17 | 169.11 | -2.34% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-True-False-True] |
37,304 | 38,169 | +2.32% |
benchmarks/test_envs_benchmark.py::test_parallel |
0.9723 | 0.9498 | -2.31% |
benchmarks/test_objectives_benchmarks.py::test_redq_speed[reduce-overhead-None] |
224.86 | 230.02 | +2.29% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_contiguous[100-img_shape2-large_img] |
572.05 | 559.05 | -2.27% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictPrioritizedReplayBuffer-LazyTensorStorage-None-10000] |
2,294 | 2,242 | -2.24% |
benchmarks/test_objectives_benchmarks.py::test_sac_speed[True-None] |
462.88 | 473.05 | +2.20% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-False-False-False] |
63,778 | 65,174 | +2.19% |
benchmarks/test_rnn_reset_backends_benchmark.py::test_rnn_rollout_with_intermediate_resets[b256-t128-i32-h512-scan-False-0-gru] |
3.0027 | 3.0681 | +2.18% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-True-True-True] |
23,972 | 24,487 | +2.15% |
benchmarks/test_objectives_benchmarks.py::test_redq_speed[False-None] |
96.38 | 94.31 | -2.15% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_lazystack_then_write[100-img_shape1-atari] |
640.82 | 654.58 | +2.15% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-False-True-False] |
27,775 | 28,368 | +2.13% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_lazystack[100-img_shape1-atari] |
695.16 | 709.52 | +2.07% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_lazystack_then_write[200-img_shape3-large_batch] |
307.89 | 314.21 | +2.05% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-False-False-False] |
44,903 | 45,805 | +2.01% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-True-True-True] |
21,224 | 21,645 | +1.98% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_extend_sample[ReplayBuffer-LazyTensorStorage-RandomSampler-10000-10000-100-False] |
52.91 | 53.94 | +1.93% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_stack_then_write[200-img_shape3-large_batch] |
137.42 | 139.97 | +1.85% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-False-False-False] |
51,009 | 51,952 | +1.85% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-False-False-False] |
55,403 | 56,424 | +1.84% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-True-True-True] |
20,990 | 21,356 | +1.74% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyMemmapStorage-sampler6-10000] |
679.37 | 690.91 | +1.70% |
benchmarks/test_objectives_benchmarks.py::test_ddpg_speed[False-None] |
340.17 | 345.82 | +1.66% |
benchmarks/test_compressed_storage_benchmark.py::TestCompressedStorageBenchmark::test_tensor_to_bytestream_speed[untyped_storage] |
8.1783 | 8.3143 | +1.66% |
benchmarks/test_non_tensor_env_benchmark.py::test_non_tensor_env_rollout_speed[1000-parallel-no-buffers-True] |
0.2175 | 0.2139 | -1.66% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_lazystack_then_write[50-img_shape0-small] |
3,512 | 3,570 | +1.64% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_extend_sample[ReplayBuffer-LazyTensorStorage-RandomSampler-100000-10000-100-False] |
52.44 | 53.29 | +1.63% |
benchmarks/test_non_tensor_env_benchmark.py::test_non_tensor_env_rollout_speed[1000-serial-buffers-False] |
0.5928 | 0.6024 | +1.62% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-True-True-False] |
35,267 | 35,815 | +1.55% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-True-False-False] |
64,234 | 65,216 | +1.53% |
benchmarks/test_non_tensor_env_benchmark.py::test_non_tensor_env_rollout_speed[1000-serial-no-buffers-True] |
0.5978 | 0.5891 | -1.46% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-True-True-False] |
42,718 | 43,341 | +1.46% |
benchmarks/test_rnn_reset_backends_benchmark.py::test_rnn_rollout_with_intermediate_resets[b256-t128-i32-h512-cudnn-False-0-gru] |
1.3238 | 1.3045 | -1.46% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-False-True-False] |
32,547 | 33,008 | +1.42% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_extend_sample[ReplayBuffer-LazyTensorStorage-RandomSampler-10000-10000-100-True] |
24.99 | 25.33 | +1.37% |
benchmarks/test_non_tensor_env_benchmark.py::test_non_tensor_env_rollout_speed[1000-parallel-buffers-True] |
0.5354 | 0.5427 | +1.35% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyTensorStorage-sampler7-10000] |
725.96 | 735.44 | +1.31% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-False-True-False] |
32,717 | 32,290 | -1.30% |
benchmarks/test_objectives_benchmarks.py::test_ddpg_speed[False-backward] |
240.41 | 243.53 | +1.30% |
benchmarks/test_non_tensor_env_benchmark.py::test_non_tensor_env_rollout_speed[1000-parallel-no-buffers-False] |
0.2226 | 0.2255 | +1.30% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-True-False-False] |
76,820 | 77,815 | +1.30% |
benchmarks/test_non_tensor_env_benchmark.py::test_non_tensor_env_rollout_speed[1000-serial-no-buffers-False] |
0.6881 | 0.6792 | -1.30% |
benchmarks/test_objectives_benchmarks.py::test_ddpg_speed[True-backward] |
405.19 | 410.24 | +1.25% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_extend_sample[ReplayBuffer-LazyTensorStorage-RandomSampler-1000000-10000-100-False] |
48.26 | 48.86 | +1.24% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_contiguous[100-img_shape1-atari] |
5,019 | 5,081 | +1.24% |
benchmarks/test_objectives_benchmarks.py::test_cql_speed[True-None] |
83.50 | 84.52 | +1.23% |
benchmarks/test_objectives_benchmarks.py::test_redq_deprec_speed[True-backward] |
136.77 | 135.12 | -1.21% |
benchmarks/test_objectives_benchmarks.py::test_td3_speed[True-None] |
548.39 | 554.83 | +1.17% |
benchmarks/test_objectives_benchmarks.py::test_gae_speed[vec_generalized_advantage_estimate-True-1-512] |
631.97 | 639.28 | +1.16% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictReplayBuffer-LazyMemmapStorage-RandomSampler-400] |
517.70 | 511.72 | -1.15% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-True-False-True] |
42,090 | 42,551 | +1.09% |
benchmarks/test_objectives_benchmarks.py::test_iql_speed[reduce-overhead-None] |
117.61 | 116.37 | -1.05% |
benchmarks/test_objectives_benchmarks.py::test_redq_speed[False-backward] |
55.82 | 55.24 | -1.05% |
benchmarks/test_storage_write_benchmark.py::TestCollectorIntegrationBenchmark::test_collector_without_rb[200-img_shape1-large_batch] |
15.06 | 15.22 | +1.03% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-True-True-False] |
30,055 | 30,359 | +1.01% |
benchmarks/test_storage_write_benchmark.py::TestCollectorIntegrationBenchmark::test_collector_without_rb[100-img_shape0-atari] |
29.68 | 29.98 | +1.00% |
benchmarks/test_objectives_benchmarks.py::test_a2c_speed[reduce-overhead-None] |
288.79 | 285.91 | -1.00% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-False-False-True] |
34,623 | 34,959 | +0.97% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictReplayBuffer-ListStorage-RandomSampler-4000] |
169.69 | 168.05 | -0.96% |
benchmarks/test_objectives_benchmarks.py::test_redq_deprec_speed[False-backward] |
61.87 | 62.41 | +0.87% |
benchmarks/test_objectives_benchmarks.py::test_gae_speed[vec_generalized_advantage_estimate-False-32-512] |
545.60 | 541.04 | -0.84% |
benchmarks/test_objectives_benchmarks.py::test_dqn_speed[False-backward] |
513.69 | 517.96 | +0.83% |
benchmarks/test_rnn_reset_backends_benchmark.py::test_rnn_rollout_with_intermediate_resets[b256-t128-i32-h512-cudnn-True-0-lstm] |
0.9429 | 0.9507 | +0.82% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-True-False-True] |
30,541 | 30,789 | +0.81% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_extend_sample[ReplayBuffer-LazyTensorStorage-RandomSampler-1000000-10000-100-True] |
23.23 | 23.41 | +0.80% |
benchmarks/test_storage_write_benchmark.py::TestCollectorIntegrationBenchmark::test_collector_with_rb[200-img_shape1-large_batch] |
13.19 | 13.29 | +0.74% |
benchmarks/test_objectives_benchmarks.py::test_td3_speed[False-None] |
121.91 | 122.79 | +0.72% |
benchmarks/test_objectives_benchmarks.py::test_td3_speed[False-backward] |
90.02 | 90.65 | +0.70% |
benchmarks/test_rnn_reset_backends_benchmark.py::test_rnn_rollout_with_intermediate_resets[b256-t128-i32-h512-cudnn-False-0-lstm] |
0.8552 | 0.8611 | +0.69% |
benchmarks/test_objectives_benchmarks.py::test_iql_speed[False-None] |
49.68 | 49.34 | -0.67% |
| ... | ... | ... | Showing 120 of 187 comparisons, sorted by absolute change. |
GPU
Compared 197 benchmarks. Regressions over 5%: 17. Improvements over 5%: 13.
| Benchmark | main ops | PR ops | Change |
|---|---|---|---|
benchmarks/test_objectives_benchmarks.py::test_iql_speed[reduce-overhead-None] |
76.16 | 103.65 | +36.09% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictReplayBuffer-LazyMemmapStorage-SamplerWithoutReplacement-10000] |
2,768 | 3,447 | +24.53% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictReplayBuffer-LazyTensorStorage-SamplerWithoutReplacement-10000] |
3,056 | 3,677 | +20.34% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyMemmapStorage-SamplerWithoutReplacement-10000] |
3,172 | 2,615 | -17.57% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictReplayBuffer-LazyMemmapStorage-SamplerWithoutReplacement-400] |
454.43 | 530.54 | +16.75% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyTensorStorage-sampler7-10000] |
736.70 | 852.97 | +15.78% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictReplayBuffer-LazyMemmapStorage-RandomSampler-10000] |
3,220 | 2,715 | -15.68% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyMemmapStorage-RandomSampler-10000] |
3,253 | 2,783 | -14.44% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictPrioritizedReplayBuffer-LazyMemmapStorage-None-400] |
416.39 | 475.23 | +14.13% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictPrioritizedReplayBuffer-LazyTensorStorage-None-10000] |
2,291 | 1,979 | -13.62% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyTensorStorage-RandomSampler-10000] |
3,465 | 2,997 | -13.51% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictReplayBuffer-ListStorage-RandomSampler-400] |
44.14 | 38.78 | -12.14% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictReplayBuffer-LazyTensorStorage-SamplerWithoutReplacement-400] |
916.05 | 1,025 | +11.89% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_contiguous[100-img_shape2-large_img] |
626.31 | 553.70 | -11.59% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_lazystack[100-img_shape2-large_img] |
452.98 | 401.11 | -11.45% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictReplayBuffer-LazyTensorStorage-RandomSampler-10000] |
3,302 | 3,662 | +10.90% |
benchmarks/test_collectors_benchmark.py::test_single_with_rb_pixels |
5.3598 | 4.7795 | -10.83% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictPrioritizedReplayBuffer-LazyMemmapStorage-None-10000] |
2,248 | 2,011 | -10.52% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictPrioritizedReplayBuffer-LazyMemmapStorage-None-10000] |
2,167 | 1,955 | -9.77% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_lazystack_then_write[100-img_shape2-large_img] |
428.86 | 391.68 | -8.67% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyTensorStorage-SamplerWithoutReplacement-10000] |
2,994 | 2,747 | -8.27% |
benchmarks/test_objectives_benchmarks.py::test_dqn_speed[True-backward] |
893.15 | 966.85 | +8.25% |
benchmarks/test_objectives_benchmarks.py::test_ppo_speed[True-backward] |
328.88 | 352.58 | +7.21% |
benchmarks/test_compressed_storage_benchmark.py::TestCompressedStorageBenchmark::test_tensor_to_bytestream_speed[untyped_storage] |
8.9510 | 8.3064 | -7.20% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-True-True-True] |
22,043 | 20,478 | -7.10% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_contiguous[200-img_shape3-large_batch] |
778.78 | 724.51 | -6.97% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictPrioritizedReplayBuffer-LazyTensorStorage-None-10000] |
2,280 | 2,122 | -6.96% |
benchmarks/test_objectives_benchmarks.py::test_ddpg_speed[True-backward] |
444.87 | 475.11 | +6.80% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_stack_then_write[200-img_shape3-large_batch] |
133.37 | 141.92 | +6.41% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_lazystack[50-img_shape0-small] |
4,164 | 4,384 | +5.29% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-True-False-True] |
31,311 | 29,952 | -4.34% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-LazyMemmapStorage-sampler6-10000] |
727.68 | 696.30 | -4.31% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-False-True-False] |
33,138 | 31,712 | -4.30% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-False-False-False] |
56,895 | 54,483 | -4.24% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_contiguous[100-img_shape1-atari] |
4,050 | 4,220 | +4.21% |
benchmarks/test_objectives_benchmarks.py::test_sac_speed[True-backward] |
319.53 | 332.90 | +4.18% |
benchmarks/test_objectives_benchmarks.py::test_cql_speed[True-backward] |
215.92 | 224.88 | +4.15% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-True-True-True] |
19,390 | 18,589 | -4.13% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-False-True-True] |
18,715 | 17,948 | -4.10% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_extend_sample[ReplayBuffer-LazyTensorStorage-RandomSampler-100000-10000-100-False] |
52.78 | 50.63 | -4.09% |
benchmarks/test_replaybuffer_benchmark.py::TestPrioritizedReplayBufferBenchmark::test_sample_mixed_devices[1000000-cuda_storage_cuda_samp... |
1,508 | 1,448 | -3.97% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-False-False-False] |
45,543 | 43,813 | -3.80% |
benchmarks/test_objectives_benchmarks.py::test_gae_speed[vec_generalized_advantage_estimate-False-1-512] |
1,354 | 1,303 | -3.78% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-True-True-True] |
24,380 | 23,586 | -3.26% |
benchmarks/test_compressed_storage_benchmark.py::TestCompressedStorageBenchmark::test_tensor_to_bytestream_speed[pickle] |
11,809 | 12,193 | +3.25% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_contiguous[50-img_shape0-small] |
5,919 | 6,110 | +3.23% |
benchmarks/test_objectives_benchmarks.py::test_ppo_speed[reduce-overhead-None] |
787.72 | 812.72 | +3.17% |
benchmarks/test_objectives_benchmarks.py::test_sac_speed[False-backward] |
77.32 | 79.75 | +3.14% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-False-True-False] |
27,930 | 27,054 | -3.14% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-False-True-True] |
20,203 | 19,571 | -3.13% |
benchmarks/test_envs_benchmark.py::test_simple |
1.2343 | 1.1966 | -3.06% |
benchmarks/test_objectives_benchmarks.py::test_a2c_speed[True-None] |
741.90 | 719.28 | -3.05% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictPrioritizedReplayBuffer-ListStorage-None-400] |
182.96 | 188.28 | +2.91% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_lazystack_then_write[50-img_shape0-small] |
3,454 | 3,552 | +2.86% |
benchmarks/test_objectives_benchmarks.py::test_redq_deprec_speed[reduce-overhead-None] |
110.10 | 107.09 | -2.73% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictReplayBuffer-LazyMemmapStorage-RandomSampler-400] |
481.49 | 494.58 | +2.72% |
benchmarks/test_objectives_benchmarks.py::test_redq_deprec_speed[True-backward] |
266.65 | 273.88 | +2.71% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_stack_then_write[100-img_shape2-large_img] |
172.16 | 176.78 | +2.68% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictPrioritizedReplayBuffer-LazyTensorStorage-None-400] |
802.06 | 823.40 | +2.66% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_lazystack_then_write[100-img_shape1-atari] |
643.90 | 660.91 | +2.64% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-False-False-True] |
37,463 | 38,440 | +2.61% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-True-True-False] |
30,038 | 29,266 | -2.57% |
benchmarks/test_objectives_benchmarks.py::test_values[vec_generalized_advantage_estimate-True-True] |
289.27 | 296.62 | +2.54% |
benchmarks/test_envs_benchmark.py::test_transformed |
0.7112 | 0.6937 | -2.46% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-True-False-True] |
38,456 | 37,519 | -2.44% |
benchmarks/test_objectives_benchmarks.py::test_gae_speed[generalized_advantage_estimate-False-1-512] |
48.87 | 47.71 | -2.38% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-False-False-True] |
29,297 | 28,600 | -2.38% |
benchmarks/test_envs_benchmark.py::test_parallel |
0.5380 | 0.5254 | -2.34% |
benchmarks/test_objectives_benchmarks.py::test_ddpg_speed[False-backward] |
232.80 | 238.22 | +2.33% |
benchmarks/test_objectives_benchmarks.py::test_gae_speed[vec_generalized_advantage_estimate-False-32-512] |
1,320 | 1,290 | -2.26% |
benchmarks/test_objectives_benchmarks.py::test_cql_speed[reduce-overhead-None] |
91.08 | 89.06 | -2.21% |
benchmarks/test_objectives_benchmarks.py::test_a2c_speed[True-backward] |
361.48 | 369.42 | +2.20% |
benchmarks/test_objectives_benchmarks.py::test_reinforce_speed[reduce-overhead-None] |
121.68 | 119.05 | -2.16% |
benchmarks/test_objectives_benchmarks.py::test_reinforce_speed[False-None] |
392.01 | 383.68 | -2.13% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_extend_sample[ReplayBuffer-LazyTensorStorage-RandomSampler-1000000-10000-100-False] |
48.27 | 47.26 | -2.09% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-True-True-True] |
21,024 | 20,601 | -2.01% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-False-True-False] |
32,517 | 31,866 | -2.00% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-True-False-False] |
64,901 | 63,610 | -1.99% |
benchmarks/test_objectives_benchmarks.py::test_reinforce_speed[True-backward] |
345.51 | 352.37 | +1.99% |
benchmarks/test_objectives_benchmarks.py::test_td3_speed[True-backward] |
389.65 | 396.80 | +1.84% |
benchmarks/test_objectives_benchmarks.py::test_td3_speed[reduce-overhead-None] |
44.49 | 43.69 | -1.79% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-True-False-True] |
32,789 | 32,204 | -1.79% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-False-True-False-False] |
50,904 | 50,000 | -1.78% |
benchmarks/test_objectives_benchmarks.py::test_ddpg_speed[True-None] |
816.74 | 831.04 | +1.75% |
benchmarks/test_storage_write_benchmark.py::TestCollectorIntegrationBenchmark::test_collector_without_rb_cuda[100-img_shape0-atari] |
17.09 | 17.38 | +1.74% |
benchmarks/test_objectives_benchmarks.py::test_dqn_speed[True-None] |
1,919 | 1,952 | +1.73% |
benchmarks/test_objectives_benchmarks.py::test_gae_speed[vec_generalized_advantage_estimate-True-1-512] |
1,261 | 1,240 | -1.70% |
benchmarks/test_objectives_benchmarks.py::test_values[generalized_advantage_estimate-True-True] |
48.21 | 49.01 | +1.68% |
benchmarks/test_objectives_benchmarks.py::test_iql_speed[False-backward] |
68.31 | 69.41 | +1.60% |
benchmarks/test_objectives_benchmarks.py::test_dqn_speed[reduce-overhead-None] |
1,905 | 1,935 | +1.56% |
benchmarks/test_replaybuffer_benchmark.py::TestPrioritizedReplayBufferBenchmark::test_sampler_sample_scale[1000000-cuda] |
2,255 | 2,221 | -1.52% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_sample[TensorDictReplayBuffer-ListStorage-SamplerWithoutReplacement-4000] |
167.28 | 169.71 | +1.45% |
benchmarks/test_objectives_benchmarks.py::test_values[td1_return_estimate-False-False] |
20.10 | 20.39 | +1.42% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_storage_write_lazystack[100-img_shape1-atari] |
703.88 | 694.09 | -1.39% |
benchmarks/test_objectives_benchmarks.py::test_ddpg_speed[reduce-overhead-None] |
824.44 | 835.89 | +1.39% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-False-False-False-True] |
35,198 | 34,711 | -1.38% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-False-False-True] |
31,308 | 30,891 | -1.33% |
benchmarks/test_objectives_benchmarks.py::test_a2c_speed[False-None] |
274.36 | 270.73 | -1.32% |
benchmarks/test_envs_benchmark.py::test_serial |
0.4222 | 0.4277 | +1.32% |
benchmarks/test_objectives_benchmarks.py::test_iql_speed[True-backward] |
236.83 | 239.94 | +1.31% |
benchmarks/test_objectives_benchmarks.py::test_cql_speed[False-None] |
53.30 | 53.98 | +1.27% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-True-True-False] |
34,748 | 34,314 | -1.25% |
benchmarks/test_compressed_storage_benchmark.py::TestCompressedStorageBenchmark::test_tensor_to_bytestream_speed[torch.save] |
7,207 | 7,121 | -1.19% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_iterate[TensorDictReplayBuffer-ListStorage-RandomSampler-4000] |
166.15 | 168.09 | +1.17% |
benchmarks/test_objectives_benchmarks.py::test_a2c_speed[False-backward] |
147.30 | 148.99 | +1.15% |
benchmarks/test_non_tensor_env_benchmark.py::test_non_tensor_env_rollout_speed[1000-serial-no-buffers-True] |
0.5988 | 0.5919 | -1.14% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-False-True-False] |
39,115 | 38,671 | -1.14% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[True-True-False-True-True] |
22,367 | 22,119 | -1.11% |
benchmarks/test_envs_benchmark.py::test_step_mdp_speed[False-True-False-True-True] |
20,032 | 19,823 | -1.05% |
benchmarks/test_replaybuffer_benchmark.py::TestPrioritizedReplayBufferBenchmark::test_sample_mixed_devices[1000000-cuda_storage_cpu_sampler] |
87.80 | 88.72 | +1.05% |
benchmarks/test_objectives_benchmarks.py::test_values[td0_return_estimate-False-False] |
11,522 | 11,401 | -1.05% |
benchmarks/test_objectives_benchmarks.py::test_values[vec_td1_return_estimate-False-False] |
848.69 | 839.90 | -1.04% |
benchmarks/test_objectives_benchmarks.py::test_sac_speed[True-None] |
612.92 | 619.15 | +1.02% |
benchmarks/test_storage_write_benchmark.py::TestCollectorIntegrationBenchmark::test_collector_with_rb[200-img_shape1-large_batch] |
13.28 | 13.41 | +0.99% |
benchmarks/test_objectives_benchmarks.py::test_redq_deprec_speed[False-backward] |
71.19 | 71.88 | +0.97% |
benchmarks/test_objectives_benchmarks.py::test_reinforce_speed[True-None] |
768.88 | 761.55 | -0.95% |
benchmarks/test_storage_write_benchmark.py::TestStorageWriteBenchmark::test_collector_stack_then_write[50-img_shape0-small] |
864.48 | 872.68 | +0.95% |
benchmarks/test_objectives_benchmarks.py::test_reinforce_speed[False-backward] |
266.65 | 264.28 | -0.89% |
benchmarks/test_objectives_benchmarks.py::test_values[vec_td_lambda_return_estimate-True-False] |
853.61 | 846.11 | -0.88% |
benchmarks/test_replaybuffer_benchmark.py::test_rb_extend_sample[ReplayBuffer-LazyTensorStorage-RandomSampler-10000-10000-100-True] |
23.59 | 23.38 | -0.87% |
| ... | ... | ... | Showing 120 of 197 comparisons, sorted by absolute change. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
Summary
- `InferenceServer` with `policy_device`/`output_device` handoff and lightweight batching/latency stats
- `ProcessInferenceServer` and wire `AsyncBatchedCollector(server_backend=...)`
- `benchmarks/bench_collectors.py` to compare regular `Collector + ParallelEnv` against async policy-server collection across env counts, env backends, process/thread server mode, and batching rules

Local benchmark notes
- `gpu` and skips without CUDA
- `ParallelEnv` and slightly ahead at 8 envs in the fake-env sweep
- `min_bs=1` often under-batches, while full-batch async nearly matches the synchronous barrier in the homogeneous fake-env benchmark

Test plan
- `uv run pytest -q test/test_inference_server.py`
- `uv run python benchmarks/bench_collectors.py --num-envs 1,2,4,8 --total-frames 600 --frames-per-batch 100 --warmup-batches 1 --env-step-latency-ms 5 --policy-delay-ms 0 --backends parallel,async-thread,async-env-mp,async-process --batching-rules auto --policy-device cpu --jsonl /tmp/async_collector_bench/no_policy_delay_fixed.jsonl --csv /tmp/async_collector_bench/no_policy_delay_fixed.csv`
- `uv run python benchmarks/bench_collectors.py --num-envs 1,2,4,8 --total-frames 200 --frames-per-batch 100 --warmup-batches 0 --env-step-latency-ms 5 --policy-delay-ms 20 --backends parallel,async-thread --batching-rules no-batch,auto,min4,full --policy-device cpu --jsonl /tmp/async_collector_bench/thread_batching_20ms_fixed.jsonl --csv /tmp/async_collector_bench/thread_batching_20ms_fixed.csv`
- `uv run python benchmarks/bench_collectors.py --num-envs 1,2,4,8 --total-frames 200 --frames-per-batch 100 --warmup-batches 0 --env-step-latency-ms 5 --policy-delay-ms 20 --backends async-env-mp --batching-rules auto --policy-device cpu --jsonl /tmp/async_collector_bench/mp_auto_20ms_fixed.jsonl --csv /tmp/async_collector_bench/mp_auto_20ms_fixed.csv`

Follow-ups
- `from_pixels=True` with EGL rendering