diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0381722bd..897c23bae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,8 @@ option(TGFX_BUILD_LAYERS "Enable building the layers module" OFF)
 option(TGFX_BUILD_HELLO2D "Enable building the tgfx-hello2d library for testing" OFF)
 option(TGFX_USE_OPENGL "Use OpenGL as the GPU backend" ON)
 option(TGFX_USE_METAL "Use Metal as the GPU backend on Apple platforms" OFF)
+option(TGFX_USE_D3D12 "Use D3D12 as the GPU backend on Windows" OFF)
+option(TGFX_D3D12_USE_WARP "Force the D3D12 backend onto the WARP software rasterizer (CI only)" OFF)
 option(TGFX_USE_QT "Enable building with the Qt framework." OFF)
 option(TGFX_USE_SWIFTSHADER "Enable building with the SwiftShader library" OFF)
 option(TGFX_USE_ANGLE "Enable building with the ANGLE library" OFF)
@@ -97,6 +99,16 @@ if (TGFX_USE_VULKAN)
     endif ()
     set(TGFX_USE_OPENGL OFF)
     set(TGFX_USE_METAL OFF)
+    set(TGFX_USE_D3D12 OFF)
+    set(TGFX_USE_QT OFF)
+    set(TGFX_USE_SWIFTSHADER OFF)
+    set(TGFX_USE_ANGLE OFF)
+elseif (TGFX_USE_D3D12)
+    if (NOT WIN32)
+        message(FATAL_ERROR "TGFX_USE_D3D12 is only supported on Windows.")
+    endif ()
+    set(TGFX_USE_OPENGL OFF)
+    set(TGFX_USE_METAL OFF)
     set(TGFX_USE_QT OFF)
     set(TGFX_USE_SWIFTSHADER OFF)
     set(TGFX_USE_ANGLE OFF)
@@ -124,8 +136,8 @@ else ()
     endif ()
 endif ()
 
-if (NOT TGFX_USE_METAL AND NOT TGFX_USE_OPENGL AND NOT TGFX_USE_VULKAN)
-    message(FATAL_ERROR "At least one GPU backend (TGFX_USE_METAL, TGFX_USE_OPENGL, or TGFX_USE_VULKAN) must be enabled.")
+if (NOT TGFX_USE_METAL AND NOT TGFX_USE_OPENGL AND NOT TGFX_USE_VULKAN AND NOT TGFX_USE_D3D12)
+    message(FATAL_ERROR "At least one GPU backend (TGFX_USE_METAL, TGFX_USE_OPENGL, TGFX_USE_VULKAN, or TGFX_USE_D3D12) must be enabled.")
 endif ()
 
 message("TGFX_VERSION: ${TGFX_VERSION}")
@@ -135,6 +147,8 @@ message("TGFX_BUILD_LAYERS: ${TGFX_BUILD_LAYERS}")
 message("TGFX_USE_OPENGL: ${TGFX_USE_OPENGL}")
 message("TGFX_USE_METAL: ${TGFX_USE_METAL}")
 message("TGFX_USE_VULKAN: ${TGFX_USE_VULKAN}")
+message("TGFX_USE_D3D12: ${TGFX_USE_D3D12}")
+message("TGFX_D3D12_USE_WARP: ${TGFX_D3D12_USE_WARP}")
 message("TGFX_USE_QT: ${TGFX_USE_QT}")
 message("TGFX_USE_SWIFTSHADER: ${TGFX_USE_SWIFTSHADER}")
 message("TGFX_USE_ANGLE: ${TGFX_USE_ANGLE}")
@@ -192,7 +206,7 @@ file(GLOB PLATFORM_COMMON_FILES src/platform/*.*)
 list(APPEND TGFX_FILES ${PLATFORM_COMMON_FILES})
 
 
-if (NOT TGFX_USE_METAL AND NOT TGFX_USE_VULKAN)
+if (NOT TGFX_USE_METAL AND NOT TGFX_USE_VULKAN AND NOT TGFX_USE_D3D12)
     file(GLOB SHADER_COMPILER_FILES src/gpu/ShaderCompiler.*)
     if (SHADER_COMPILER_FILES)
         list(REMOVE_ITEM TGFX_FILES ${SHADER_COMPILER_FILES})
@@ -371,6 +385,22 @@ elseif (APPLE)
     endif ()
 endif ()
 
+if (TGFX_USE_D3D12)
+    file(GLOB_RECURSE GFX_PLATFORM_FILES src/gpu/d3d12/*.*)
+    list(APPEND TGFX_FILES ${GFX_PLATFORM_FILES})
+    list(APPEND TGFX_DEFINES TGFX_USE_D3D12)
+    if (TGFX_D3D12_USE_WARP)
+        # CI / headless opt-in: route DevicePool::Make() onto D3D12Device::MakeWarp() so the
+        # tests run on the WARP software rasterizer instead of expecting a hardware GPU.
+        list(APPEND TGFX_DEFINES TGFX_D3D12_USE_WARP)
+    endif ()
+    # shaderc (GLSL -> SPIR-V) and SPIRV-Cross (SPIR-V -> HLSL) provide the GLSL to HLSL conversion.
+    list(APPEND TGFX_STATIC_VENDORS shaderc SPIRV-Cross)
+    list(APPEND TGFX_INCLUDES third_party/shaderc/libshaderc/include)
+    list(APPEND TGFX_INCLUDES third_party/SPIRV-Cross)
+    list(APPEND TGFX_DEFINES SPIRV_CROSS_EXCEPTIONS_TO_ASSERTIONS)
+endif ()
+
 # Auto-sync shaderc sub-dependencies (glslang, SPIRV-Tools, etc.) if any backend needs shaderc.
 # This runs at configure time; for offline builds, run manually before cmake:
 #   python third_party/shaderc/utils/git-sync-deps
@@ -596,7 +626,14 @@ elseif (WIN32)
     file(GLOB_RECURSE PLATFORM_FILES src/platform/win/*.*)
     list(APPEND TGFX_FILES ${PLATFORM_FILES})
 
-    if (TGFX_USE_NATIVE_GL)
+    if (TGFX_USE_D3D12)
+        find_library(D3D12_LIB d3d12)
+        list(APPEND TGFX_STATIC_LIBS ${D3D12_LIB})
+        find_library(DXGI_LIB dxgi)
+        list(APPEND TGFX_STATIC_LIBS ${DXGI_LIB})
+        find_library(D3DCOMPILER_LIB d3dcompiler)
+        list(APPEND TGFX_STATIC_LIBS ${D3DCOMPILER_LIB})
+    elseif (TGFX_USE_NATIVE_GL)
         file(GLOB_RECURSE GPU_PLATFORM_FILES src/gpu/opengl/wgl/*.*)
         find_library(OPENGL_LIB opengl32)
         list(APPEND TGFX_STATIC_LIBS ${OPENGL_LIB})
@@ -715,6 +752,10 @@ if (TGFX_BUILD_TESTS)
         if (TGFX_TEST_VULKAN_FILES)
             list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_VULKAN_FILES})
         endif ()
+        file(GLOB_RECURSE TGFX_TEST_D3D12_FILES test/src/d3d12/*.*)
+        if (TGFX_TEST_D3D12_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_D3D12_FILES})
+        endif ()
     elseif (TGFX_USE_VULKAN)
         file(GLOB_RECURSE TGFX_TEST_OPENGL_FILES test/src/opengl/*.*)
         if (TGFX_TEST_OPENGL_FILES)
@@ -724,6 +765,23 @@ if (TGFX_BUILD_TESTS)
         if (TGFX_TEST_METAL_FILES)
             list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_METAL_FILES})
         endif ()
+        file(GLOB_RECURSE TGFX_TEST_D3D12_FILES test/src/d3d12/*.*)
+        if (TGFX_TEST_D3D12_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_D3D12_FILES})
+        endif ()
+    elseif (TGFX_USE_D3D12)
+        file(GLOB_RECURSE TGFX_TEST_OPENGL_FILES test/src/opengl/*.*)
+        if (TGFX_TEST_OPENGL_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_OPENGL_FILES})
+        endif ()
+        file(GLOB_RECURSE TGFX_TEST_METAL_FILES test/src/metal/*.*)
+        if (TGFX_TEST_METAL_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_METAL_FILES})
+        endif ()
+        file(GLOB_RECURSE TGFX_TEST_VULKAN_FILES test/src/vulkan/*.*)
+        if (TGFX_TEST_VULKAN_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_VULKAN_FILES})
+        endif ()
     else ()
         file(GLOB_RECURSE TGFX_TEST_METAL_FILES test/src/metal/*.*)
         list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_METAL_FILES})
@@ -731,6 +789,10 @@ if (TGFX_BUILD_TESTS)
         if (TGFX_TEST_VULKAN_FILES)
             list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_VULKAN_FILES})
         endif ()
+        file(GLOB_RECURSE TGFX_TEST_D3D12_FILES test/src/d3d12/*.*)
+        if (TGFX_TEST_D3D12_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_D3D12_FILES})
+        endif ()
     endif ()
 
     file(GLOB_RECURSE TGFX_TEST_WEBGL_FILES test/src/webgl/*.*)
diff --git a/include/tgfx/gpu/Backend.h b/include/tgfx/gpu/Backend.h
index 72c5e05e1..99b80a9c4 100644
--- a/include/tgfx/gpu/Backend.h
+++ b/include/tgfx/gpu/Backend.h
@@ -19,6 +19,7 @@
 #pragma once
 
 #include "tgfx/gpu/PixelFormat.h"
+#include "tgfx/gpu/d3d12/D3D12Types.h"
 #include "tgfx/gpu/metal/MetalTypes.h"
 #include "tgfx/gpu/opengl/GLTypes.h"
 #include "tgfx/gpu/vulkan/VulkanTypes.h"
@@ -27,7 +28,7 @@ namespace tgfx {
 /**
  * Possible GPU backend APIs that may be used by TGFX.
  */
-enum class Backend { Unknown, OpenGL, Metal, Vulkan, WebGPU };
+enum class Backend { Unknown, OpenGL, Metal, Vulkan, WebGPU, D3D12 };
 
 /**
  * Wrapper class for passing into and receiving data from TGFX about a backend texture object.
@@ -61,6 +62,13 @@ class BackendTexture {
       : _backend(Backend::Vulkan), _width(width), _height(height), vulkanInfo(vulkanInfo) {
   }
 
+  /**
+   * Creates a D3D12 backend texture.
+   */
+  explicit BackendTexture(const D3D12TextureInfo& d3d12Info, int width, int height)
+      : _backend(Backend::D3D12), _width(width), _height(height), d3d12Info(d3d12Info) {
+  }
+
   BackendTexture(const BackendTexture& that) {
     *this = that;
   }
@@ -118,6 +126,12 @@ class BackendTexture {
    */
   bool getVulkanImageInfo(VulkanImageInfo* vulkanImageInfo) const;
 
+  /**
+   * If the backend API is D3D12, copies a snapshot of the D3D12TextureInfo struct into the passed
+   * in pointer and returns true. Returns false if this object is invalid or uses another backend.
+   */
+  bool getD3D12TextureInfo(D3D12TextureInfo* d3d12TextureInfo) const;
+
  private:
   Backend _backend = Backend::Unknown;
   int _width = 0;
@@ -127,6 +141,7 @@ class BackendTexture {
     GLTextureInfo glInfo;
     MetalTextureInfo metalInfo;
     VulkanImageInfo vulkanInfo;
+    D3D12TextureInfo d3d12Info;
   };
 };
 
@@ -162,6 +177,13 @@ class BackendRenderTarget {
       : _backend(Backend::Vulkan), _width(width), _height(height), vulkanInfo(vulkanInfo) {
   }
 
+  /**
+   * Creates a D3D12 backend render target.
+   */
+  explicit BackendRenderTarget(const D3D12TextureInfo& d3d12Info, int width, int height)
+      : _backend(Backend::D3D12), _width(width), _height(height), d3d12Info(d3d12Info) {
+  }
+
   BackendRenderTarget(const BackendRenderTarget& that) {
     *this = that;
   }
@@ -219,6 +241,12 @@ class BackendRenderTarget {
    */
   bool getVulkanImageInfo(VulkanImageInfo* vulkanImageInfo) const;
 
+  /**
+   * If the backend API is D3D12, copies a snapshot of the D3D12TextureInfo struct into the passed
+   * in pointer and returns true. Returns false if this object is invalid or uses another backend.
+   */
+  bool getD3D12TextureInfo(D3D12TextureInfo* d3d12TextureInfo) const;
+
  private:
   Backend _backend = Backend::Unknown;
   int _width = 0;
@@ -227,6 +255,7 @@ class BackendRenderTarget {
     GLFrameBufferInfo glInfo;
     MetalTextureInfo metalInfo;
     VulkanImageInfo vulkanInfo;
+    D3D12TextureInfo d3d12Info;
   };
 };
 
@@ -262,6 +291,13 @@ class BackendSemaphore {
       : _backend(Backend::Vulkan), vulkanSyncInfo(vulkanInfo) {
   }
 
+  /**
+   * Creates a D3D12 backend semaphore.
+   */
+  explicit BackendSemaphore(const D3D12SyncInfo& d3d12Info)
+      : _backend(Backend::D3D12), d3d12SyncInfo(d3d12Info) {
+  }
+
   BackendSemaphore(const BackendSemaphore& that) {
     *this = that;
   }
@@ -298,12 +334,19 @@ class BackendSemaphore {
    */
   bool getVulkanSync(VulkanSyncInfo* vulkanSyncInfo) const;
 
+  /**
+   * If the backend API is D3D12, copies a snapshot of the D3D12SyncInfo struct into the passed in
+   * pointer and returns true. Returns false if the backend is not D3D12 or the fence is null.
+   */
+  bool getD3D12Sync(D3D12SyncInfo* d3d12Info) const;
+
  private:
   Backend _backend = Backend::Unknown;
   union {
     GLSyncInfo glSyncInfo;
     MetalSyncInfo metalSyncInfo;
     VulkanSyncInfo vulkanSyncInfo;
+    D3D12SyncInfo d3d12SyncInfo;
   };
 };
 }  // namespace tgfx
diff --git a/include/tgfx/gpu/d3d12/D3D12Device.h b/include/tgfx/gpu/d3d12/D3D12Device.h
new file mode 100644
index 000000000..3f29e9b9d
--- /dev/null
+++ b/include/tgfx/gpu/d3d12/D3D12Device.h
@@ -0,0 +1,66 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "tgfx/gpu/Device.h"
+
+namespace tgfx {
+
+/**
+ * The D3D12 interface for drawing graphics.
+ */
+class D3D12Device : public Device {
+ public:
+  /**
+   * Creates a new D3D12Device using the default hardware adapter. Returns nullptr if D3D12 is not
+   * available.
+   */
+  static std::shared_ptr Make();
+
+  /**
+   * Creates a new D3D12Device backed by the WARP software rasterizer. WARP is a CPU-based D3D12
+   * implementation that ships with Windows; it is functionally complete (feature level 12_1) but
+   * orders of magnitude slower than a real GPU. Intended for headless CI runners and other
+   * environments without a usable hardware adapter — do not rely on it for performance work.
+   * Returns nullptr if WARP is unavailable on the current system.
+   */
+  static std::shared_ptr MakeWarp();
+
+  /**
+   * Creates a new D3D12Device from an existing ID3D12Device. The device parameter is a pointer to
+   * an ID3D12Device object. Returns nullptr if the device is invalid.
+   */
+  static std::shared_ptr MakeFrom(void* device);
+
+  ~D3D12Device() override;
+
+  /**
+   * Returns the underlying ID3D12Device as a raw pointer.
+   */
+  void* d3d12Device() const;
+
+ protected:
+  bool onLockContext() override;
+  void onUnlockContext() override;
+
+ private:
+  explicit D3D12Device(std::unique_ptr gpu);
+};
+
+}  // namespace tgfx
diff --git a/include/tgfx/gpu/d3d12/D3D12Types.h b/include/tgfx/gpu/d3d12/D3D12Types.h
new file mode 100644
index 000000000..d66d96981
--- /dev/null
+++ b/include/tgfx/gpu/d3d12/D3D12Types.h
@@ -0,0 +1,60 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include
+#include
+
+namespace tgfx {
+/**
+ * Describes a D3D12 texture that was created externally to TGFX.
+ */
+struct D3D12TextureInfo {
+  /**
+   * Pointer to an ID3D12Resource object representing a texture.
+   */
+  const void* resource = nullptr;
+
+  /**
+   * The pixel format of this texture (DXGI_FORMAT value).
+   */
+  unsigned format = 0;  // DXGI_FORMAT_UNKNOWN
+};
+
+/**
+ * Describes a D3D12 synchronization object (fence plus value) created externally to TGFX.
+ */
+struct D3D12SyncInfo {
+  /**
+   * Pointer to an ID3D12Fence object.
+   */
+  const void* fence = nullptr;
+
+  /**
+   * The signal value for the fence.
+   */
+  uint64_t value = 0;
+};
+
+static_assert(std::is_trivially_copyable_v);  // both structs are union members in Backend.h
+static_assert(std::is_trivially_copyable_v);
+static_assert(std::is_standard_layout_v);
+static_assert(std::is_standard_layout_v);
+
+}  // namespace tgfx
diff --git a/include/tgfx/gpu/d3d12/D3D12Window.h b/include/tgfx/gpu/d3d12/D3D12Window.h
new file mode 100644
index 000000000..8e61a490d
--- /dev/null
+++ b/include/tgfx/gpu/d3d12/D3D12Window.h
@@ -0,0 +1,67 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include
+#include "tgfx/core/ColorSpace.h"
+#include "tgfx/gpu/Window.h"
+#include "tgfx/gpu/d3d12/D3D12Device.h"
+
+#ifdef _WIN32
+struct HWND__;
+typedef HWND__* HWND;
+#endif
+
+namespace tgfx {
+
+/**
+ * D3D12Window manages an IDXGISwapChain3 and its backbuffer textures for presenting rendered
+ * content to a Win32 window. Each frame the current backbuffer is exposed as a RenderTarget
+ * through the standard Window/Surface API; on present the swap chain flips to the next buffer.
+ */
+class D3D12Window : public Window {
+ public:
+#ifdef _WIN32
+  /**
+   * Creates a D3D12Window from a Win32 window handle. Returns nullptr if the swap chain cannot
+   * be created. Note: only sRGB output is currently supported. The colorSpace parameter is
+   * accepted for forward compatibility but non-sRGB values are ignored with a warning.
+   */
+  static std::shared_ptr MakeFrom(HWND hwnd, std::shared_ptr device,
+                                  std::shared_ptr colorSpace = nullptr);
+#endif
+
+  ~D3D12Window() override;
+
+ protected:
+  std::shared_ptr onCreateRenderTarget(Context* context) override;
+  void onPresent(Context* context) override;
+
+ private:
+  // PImpl: all DXGI / D3D12 handles and per-backbuffer state live in PlatformState (defined in
+  // the .cpp) so this header pulls in neither dxgi.h nor d3d12.h.
+  struct PlatformState;
+
+  explicit D3D12Window(std::shared_ptr device, std::unique_ptr state,
+                       std::shared_ptr colorSpace);
+
+  std::unique_ptr _platformState;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/Backend.cpp b/src/gpu/Backend.cpp
index 45e4c9d5b..15f903fe8 100644
--- a/src/gpu/Backend.cpp
+++ b/src/gpu/Backend.cpp
@@ -17,6 +17,7 @@
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 #include "tgfx/gpu/Backend.h"
+#include "d3d12/D3D12Defines.h"
 #include "metal/MetalDefines.h"
 #include "opengl/GLDefines.h"
 #include "vulkan/VulkanDefines.h"
@@ -63,6 +64,9 @@ BackendTexture& BackendTexture::operator=(const BackendTexture& that) {
     case Backend::Vulkan:
       vulkanInfo = that.vulkanInfo;
       break;
+    case Backend::D3D12:
+      d3d12Info = that.d3d12Info;
+      break;
     default:
       break;
   }
@@ -80,6 +84,8 @@ PixelFormat BackendTexture::format() const {
       return MetalPixelFormatToPixelFormat(metalInfo.format);
     case Backend::Vulkan:
       return VulkanFormatToPixelFormat(vulkanInfo.format);
+    case Backend::D3D12:
+      return DXGIFormatToPixelFormat(d3d12Info.format);
     default:
       break;
   }
@@ -110,6 +116,14 @@ bool BackendTexture::getVulkanImageInfo(VulkanImageInfo* vulkanImageInfo) const
   return true;
 }
 
+bool BackendTexture::getD3D12TextureInfo(D3D12TextureInfo* d3d12TextureInfo) const {
+  if (!isValid() || _backend != Backend::D3D12) {
+    return false;
+  }
+  *d3d12TextureInfo = d3d12Info;  // d3d12TextureInfo must not be null
+  return true;
+}
+
 BackendRenderTarget& BackendRenderTarget::operator=(const BackendRenderTarget& that) {
   if (!that.isValid()) {
     _width = _height = 0;
@@ -128,6 +142,9 @@ BackendRenderTarget& BackendRenderTarget::operator=(const BackendRenderTarget& t
     case Backend::Vulkan:
       vulkanInfo = that.vulkanInfo;
       break;
+    case Backend::D3D12:
+      d3d12Info = that.d3d12Info;
+      break;
     default:
       break;
   }
@@ -145,6 +162,8 @@ PixelFormat BackendRenderTarget::format() const {
       return MetalPixelFormatToPixelFormat(metalInfo.format);
     case Backend::Vulkan:
       return VulkanFormatToPixelFormat(vulkanInfo.format);
+    case Backend::D3D12:
+      return DXGIFormatToPixelFormat(d3d12Info.format);
     default:
       break;
   }
@@ -175,6 +194,14 @@ bool BackendRenderTarget::getVulkanImageInfo(VulkanImageInfo* vulkanImageInfo) c
   return true;
 }
 
+bool BackendRenderTarget::getD3D12TextureInfo(D3D12TextureInfo* d3d12TextureInfo) const {
+  if (!isValid() || _backend != Backend::D3D12) {
+    return false;
+  }
+  *d3d12TextureInfo = d3d12Info;  // d3d12TextureInfo must not be null
+  return true;
+}
+
 BackendSemaphore& BackendSemaphore::operator=(const BackendSemaphore& that) {
   _backend = that._backend;
   switch (that._backend) {
@@ -187,6 +214,9 @@ BackendSemaphore& BackendSemaphore::operator=(const BackendSemaphore& that) {
     case Backend::Vulkan:
       vulkanSyncInfo = that.vulkanSyncInfo;
       break;
+    case Backend::D3D12:
+      d3d12SyncInfo = that.d3d12SyncInfo;
+      break;
     default:
       break;
   }
@@ -201,6 +231,8 @@ bool BackendSemaphore::isInitialized() const {
       return metalSyncInfo.event != nullptr;
     case Backend::Vulkan:
       return vulkanSyncInfo.semaphore != 0;
+    case Backend::D3D12:
+      return d3d12SyncInfo.fence != nullptr;
     default:
       break;
   }
@@ -231,4 +263,12 @@ bool BackendSemaphore::getVulkanSync(VulkanSyncInfo* vulkanSyncInfo) const {
   return true;
 }
 
+bool BackendSemaphore::getD3D12Sync(D3D12SyncInfo* d3d12Info) const {
+  if (_backend != Backend::D3D12 || d3d12SyncInfo.fence == nullptr) {
+    return false;
+  }
+  *d3d12Info = d3d12SyncInfo;  // d3d12Info must not be null
+  return true;
+}
+
 }  // namespace tgfx
diff --git a/src/gpu/ShaderCaps.cpp b/src/gpu/ShaderCaps.cpp
index 91fafb6b0..2b1a179ee 100644
--- a/src/gpu/ShaderCaps.cpp
+++ b/src/gpu/ShaderCaps.cpp
@@ -40,6 +40,9 @@ static void PrintGPUInfo(const GPUInfo* info) {
     case Backend::WebGPU:
       backend = "WebGPU";
       break;
+    case Backend::D3D12:
+      backend = "D3D12";
+      break;
     case Backend::Unknown:
       backend = "Unknown";
       break;
diff --git a/src/gpu/ShaderCompiler.cpp b/src/gpu/ShaderCompiler.cpp
index 780573bb7..1f3f144b6 100644
--- a/src/gpu/ShaderCompiler.cpp
+++ b/src/gpu/ShaderCompiler.cpp
@@ -139,12 +139,17 @@ std::string PreprocessGLSL(const std::string& glslCode) {
 }
 
 std::vector CompileGLSLToSPIRV(const shaderc::Compiler* compiler,
-                               const std::string& glslCode, ShaderStage stage) {
+                               const std::string& glslCode, ShaderStage stage,
+                               bool preserveInterfaceVariables) {
   if (compiler == nullptr) {
     return {};
   }
   shaderc::CompileOptions options;
-  options.SetOptimizationLevel(shaderc_optimization_level_performance);
+  // See header doc on `preserveInterfaceVariables` for the rationale; D3D12 requires zero so the
+  // optimiser cannot dead-strip fragment inputs that have no body uses, while Vulkan/Metal stay
+  // on the performance preset for tighter SPIR-V.
+  options.SetOptimizationLevel(preserveInterfaceVariables ? shaderc_optimization_level_zero
+                                                          : shaderc_optimization_level_performance);
   options.SetTargetEnvironment(shaderc_target_env_vulkan, shaderc_env_version_vulkan_1_0);
 
   shaderc_shader_kind shaderKind =
diff --git a/src/gpu/ShaderCompiler.h b/src/gpu/ShaderCompiler.h
index b149d591f..ba1c97011 100644
--- a/src/gpu/ShaderCompiler.h
+++ b/src/gpu/ShaderCompiler.h
@@ -36,7 +36,17 @@ std::string PreprocessGLSL(const std::string& glslCode);
 
 /// Compiles preprocessed GLSL 450 source to SPIR-V binary using shaderc. Returns an empty vector
 /// on failure.
+///
+/// `preserveInterfaceVariables` controls the optimisation level:
+///   - false (default): runs `shaderc_optimization_level_performance`, which is what Vulkan and
+///     Metal want — both bind interface variables by name/location, so dead-stripping unused
+///     fragment inputs is harmless and yields better generated code.
+///   - true: runs `shaderc_optimization_level_zero` so every declared vertex output / fragment
+///     input survives. D3D12 needs this because the SPIR-V → HLSL pass turns SPIR-V locations
+///     into TEXCOORDn semantics; if the optimiser drops a fragment input, the resulting HLSL
+///     mismatches the vertex shader's output signature and PSO creation fails.
 std::vector CompileGLSLToSPIRV(const shaderc::Compiler* compiler,
-                               const std::string& vulkanGLSL, ShaderStage stage);
+                               const std::string& vulkanGLSL, ShaderStage stage,
+                               bool preserveInterfaceVariables = false);
 
 }  // namespace tgfx
diff --git a/src/gpu/VaryingHandler.cpp b/src/gpu/VaryingHandler.cpp
index 2ae27c06b..dbdd2aaa6 100644
--- a/src/gpu/VaryingHandler.cpp
+++ b/src/gpu/VaryingHandler.cpp
@@ -17,6 +17,7 @@
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 #include "VaryingHandler.h"
+#include
 #include "ProgramBuilder.h"
 
 namespace tgfx {
@@ -61,6 +62,14 @@ void VaryingHandler::finalize() {
 
 void VaryingHandler::appendDecls(const std::vector& vars, std::string* out,
                                  ShaderStage stage) const {
+  // Do not emit explicit "layout(location=N)" qualifiers for varyings here. Desktop OpenGL
+  // sticks to "#version 150" which rejects layout(location) on varyings without the
+  // GL_ARB_separate_shader_objects extension, so emitting them breaks shader compilation on
+  // that backend. The SPIR-V cross-compiled backends (D3D12 / Vulkan / Metal) reassign
+  // locations later in ShaderCompiler::PreprocessGLSL via the in/out regex passes, which walk
+  // the GLSL source in textual order. Vertex outputs and fragment inputs come from the same
+  // VaryingHandler::varyings sequence, so the per-stage numbering they end up with already
+  // matches across the boundary.
   for (const auto& var : vars) {
     out->append(programBuilder->getShaderVarDeclarations(var, stage));
     out->append(";\n");
diff --git a/src/gpu/d3d12/D3D12BarrierBatch.cpp b/src/gpu/d3d12/D3D12BarrierBatch.cpp
new file mode 100644
index 000000000..5666450a7
--- /dev/null
+++ b/src/gpu/d3d12/D3D12BarrierBatch.cpp
@@ -0,0 +1,47 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12BarrierBatch.h"
+
+namespace tgfx {
+
+void D3D12BarrierBatch::addTransition(ID3D12Resource* resource, D3D12_RESOURCE_STATES before,
+                                      D3D12_RESOURCE_STATES after, UINT subresource) {
+  if (resource == nullptr || before == after) {  // nothing to transition
+    return;
+  }
+  D3D12_RESOURCE_BARRIER barrier = {};
+  barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+  barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+  barrier.Transition.pResource = resource;
+  barrier.Transition.StateBefore = before;
+  barrier.Transition.StateAfter = after;
+  barrier.Transition.Subresource = subresource;
+  barriers.push_back(barrier);
+}
+
+void D3D12BarrierBatch::flush(ID3D12GraphicsCommandList* commandList) {
+  if (barriers.empty() || commandList == nullptr) {
+    barriers.clear();  // queued barriers are discarded when there is no command list
+    return;
+  }
+  commandList->ResourceBarrier(static_cast(barriers.size()), barriers.data());
+  barriers.clear();
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12BarrierBatch.h b/src/gpu/d3d12/D3D12BarrierBatch.h
new file mode 100644
index 000000000..fbaef6456
--- /dev/null
+++ b/src/gpu/d3d12/D3D12BarrierBatch.h
@@ -0,0 +1,87 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include
+#include "D3D12Util.h"
+
+namespace tgfx {
+
+/**
+ * Accumulates D3D12 resource transitions and flushes them in a single ResourceBarrier call.
+ *
+ * Why batching matters:
+ *   - Each ID3D12GraphicsCommandList::ResourceBarrier call has fixed CPU-side runtime/driver
+ *     cost (state-machine bookkeeping, debug-layer validation, and on some GPUs an implicit
+ *     pipeline stall to flush caches).
+ *   - When N independent barriers are submitted in a single call, the driver can collapse
+ *     redundant cache flushes — for example five textures all moving from RENDER_TARGET to
+ *     PIXEL_SHADER_RESOURCE only require one RT-cache flush + one PS-cache invalidate, not
+ *     five of each. NVIDIA / AMD / Intel guidance lists barrier batching as one of the top
+ *     three D3D12 performance pitfalls.
+ *
+ * Usage pattern:
+ *   D3D12BarrierBatch batch;
+ *   batch.addTransition(rt, COMMON, RENDER_TARGET);
+ *   batch.addTransition(dsv, COMMON, DEPTH_WRITE);
+ *   batch.flush(commandList);  // one ResourceBarrier(2, ...)
+ *
+ * Thread safety: not thread-safe. Each batch is local to one command-list recording context.
+ */
+class D3D12BarrierBatch {
+ public:
+  D3D12BarrierBatch() {
+    // Most batches in the backend hold 1–8 transitions (e.g. one per color attachment, or one
+    // per sampled texture in a draw). Pre-reserve to avoid the small-buffer reallocation
+    // sequence that plain push_back() incurs.
+    barriers.reserve(8);
+  }
+
+  D3D12BarrierBatch(const D3D12BarrierBatch&) = delete;
+  D3D12BarrierBatch& operator=(const D3D12BarrierBatch&) = delete;
+
+  /**
+   * Queues a transition barrier. No-op if `resource` is null or `before == after`, so callers can
+   * pass through already-correct states without polluting the batch. `subresource` defaults to
+   * "all subresources", which is what most call sites need; callers manipulating individual mips
+   * should pass the explicit subresource index.
+   */
+  void addTransition(ID3D12Resource* resource, D3D12_RESOURCE_STATES before,
+                     D3D12_RESOURCE_STATES after,
+                     UINT subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES);
+
+  /**
+   * Submits all queued barriers to the command list as a single ResourceBarrier(N, ...) call
+   * and clears the batch. Safe to call when the batch is empty (no-op).
+   */
+  void flush(ID3D12GraphicsCommandList* commandList);
+
+  bool empty() const {
+    return barriers.empty();
+  }
+
+  size_t size() const {
+    return barriers.size();
+  }
+
+ private:
+  std::vector barriers;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Buffer.cpp b/src/gpu/d3d12/D3D12Buffer.cpp
new file mode 100644
index 000000000..a658eea20
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Buffer.cpp
@@ -0,0 +1,149 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12Buffer.h"
+#include "D3D12GPU.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+std::shared_ptr D3D12Buffer::Make(D3D12GPU* gpu, size_t size, uint32_t usage) {
+  if (gpu == nullptr || size == 0) {
+    return nullptr;
+  }
+
+  D3D12_HEAP_PROPERTIES heapProperties = {};
+  if (usage & GPUBufferUsage::READBACK) {
+    heapProperties.Type = D3D12_HEAP_TYPE_READBACK;
+  } else {
+    // TODO: Place static (write-once) vertex/index buffers in D3D12_HEAP_TYPE_DEFAULT and stage
+    // their initial contents through an UPLOAD heap copy, leaving only streamed (per-frame)
+    // buffers in D3D12_HEAP_TYPE_UPLOAD. The Vulkan and Metal backends share the same shortcut
+    // today (every non-readback buffer is host-visible) so any improvement here should be paired
+    // with the matching VMA/MTLResourceStorageModePrivate work and a STATIC/STREAM hint on
+    // GPUBufferUsage. Until then keep UPLOAD so vertex/index/uniform buffers remain mappable.
+    heapProperties.Type = D3D12_HEAP_TYPE_UPLOAD;
+  }
+
+  D3D12_RESOURCE_DESC resourceDesc = {};
+  resourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+  resourceDesc.Width = static_cast(size);
+  resourceDesc.Height = 1;
+  resourceDesc.DepthOrArraySize = 1;
+  resourceDesc.MipLevels = 1;
+  resourceDesc.Format = static_cast(DXGI_FORMAT_UNKNOWN);
+  resourceDesc.SampleDesc.Count = 1;
+  resourceDesc.SampleDesc.Quality = 0;
+  resourceDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+  resourceDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+
+  D3D12_RESOURCE_STATES initialState = D3D12_RESOURCE_STATE_GENERIC_READ;
+  if (usage & GPUBufferUsage::READBACK) {
+    initialState = D3D12_RESOURCE_STATE_COPY_DEST;
+  }
+
+  ComPtr d3d12Resource = nullptr;
+  auto hr =
+      gpu->device()->CreateCommittedResource(&heapProperties, D3D12_HEAP_FLAG_NONE, &resourceDesc,
+                                             initialState, nullptr, IID_PPV_ARGS(&d3d12Resource));
+  if (FAILED(hr)) {
+    LOGE("D3D12Buffer::Make() CreateCommittedResource failed, HRESULT=0x%08X",
+         static_cast(hr));
+    return nullptr;
+  }
+
+  return gpu->makeResource(size, usage, std::move(d3d12Resource));
+}
+
+D3D12Buffer::D3D12Buffer(size_t size, uint32_t usage, ComPtr d3d12Resource)
+    : GPUBuffer(size, usage), resource(std::move(d3d12Resource)) {
+}
+
+void D3D12Buffer::onRelease(D3D12GPU*) {
+  if (mappedPointer != nullptr) {
+    resource->Unmap(0, nullptr);
+    mappedPointer = nullptr;
+  }
+  resource = nullptr;
+}
+
+// Maps `size` bytes starting at `offset` for CPU access and returns a pointer to the first mapped
+// byte. Passing GPU_BUFFER_WHOLE_SIZE maps everything from `offset` to the end of the buffer.
+// Returns nullptr if the buffer is released or already mapped, if the range is empty or out of
+// bounds, or if ID3D12Resource::Map fails.
+void* D3D12Buffer::map(size_t offset, size_t size) {
+  if (resource == nullptr || mappedPointer != nullptr) {
+    return nullptr;
+  }
+  if (size == 0) {
+    LOGE("D3D12Buffer::map() size cannot be 0!");
+    return nullptr;
+  }
+  // Reject a bad offset before doing arithmetic with it. `_size - offset` and `offset + size` are
+  // unsigned, so an offset past the end would wrap around and slip through the range check.
+  if (offset >= _size) {
+    LOGE("D3D12Buffer::map() range out of bounds!");
+    return nullptr;
+  }
+  auto available = _size - offset;
+  if (size == GPU_BUFFER_WHOLE_SIZE) {
+    size = available;
+  }
+  if (size > available) {
+    LOGE("D3D12Buffer::map() range out of bounds!");
+    return nullptr;
+  }
+
+  // Only readback buffers are read by the CPU. For every other buffer the range stays empty
+  // (End <= Begin), which tells D3D12 that the CPU will not read the mapped memory.
+  D3D12_RANGE readRange = {};
+  if (_usage & GPUBufferUsage::READBACK) {
+    readRange.Begin = offset;
+    readRange.End = offset + size;
+  }
+
+  void* data = nullptr;
+  auto hr = resource->Map(0, &readRange, &data);
+  if (FAILED(hr) || data == nullptr) {
+    LOGE("D3D12Buffer::map() Map failed, HRESULT=0x%08X", static_cast(hr));
+    return nullptr;
+  }
+
+  // Map() returns the start of the resource regardless of the read range, so apply the offset.
+  mappedPointer = static_cast(data) + offset;
+  return mappedPointer;
+}
+
+void D3D12Buffer::unmap() {
+  if (resource == nullptr || mappedPointer == nullptr) {
+    return;
+  }
+  D3D12_RANGE writtenRange = {};
+  if (!(_usage & GPUBufferUsage::READBACK)) {
+    writtenRange.Begin = 0;
+    writtenRange.End = _size;
+  }
+  resource->Unmap(0, &writtenRange);
+  mappedPointer = nullptr;
+}
+
+bool D3D12Buffer::isReady() const {
+  return resource != nullptr;
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Buffer.h b/src/gpu/d3d12/D3D12Buffer.h
new file mode 100644
index 000000000..6748884db
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Buffer.h
@@ -0,0 +1,60 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+ *
+ * Wraps a committed ID3D12Resource buffer. Buffers created with GPUBufferUsage::READBACK live
+ * in a READBACK heap; every other buffer lives in an UPLOAD heap so it can be mapped by the CPU.
+ */
+class D3D12Buffer : public GPUBuffer, public D3D12Resource {
+ public:
+  /**
+   * Creates a buffer of the given size and usage on the device owned by gpu. Returns nullptr if
+   * gpu is null, size is 0, or the resource cannot be created.
+   */
+  static std::shared_ptr<D3D12Buffer> Make(D3D12GPU* gpu, size_t size, uint32_t usage);
+
+  /**
+   * Returns the underlying D3D12 resource, or nullptr after the buffer has been released.
+   */
+  ID3D12Resource* d3d12Resource() const {
+    return resource.Get();
+  }
+
+  /**
+   * Maps size bytes starting at offset and returns a pointer to the first mapped byte. Pass
+   * GPU_BUFFER_WHOLE_SIZE to map up to the end of the buffer. Returns nullptr if the buffer is
+   * released or already mapped, if size is 0, or if the range exceeds the buffer.
+   */
+  void* map(size_t offset, size_t size) override;
+
+  /**
+   * Ends the current mapping. Has no effect when the buffer is not mapped.
+   */
+  void unmap() override;
+
+  /**
+   * Returns true while the underlying resource is alive.
+   */
+  bool isReady() const override;
+
+ protected:
+  // Unmaps the buffer if needed and drops the reference to the D3D12 resource.
+  void onRelease(D3D12GPU* gpu) override;
+
+ private:
+  D3D12Buffer(size_t size, uint32_t usage, ComPtr<ID3D12Resource> resource);
+  ~D3D12Buffer() override = default;
+
+  ComPtr<ID3D12Resource> resource = nullptr;
+  // Pointer handed out by the last successful map(); nullptr while the buffer is not mapped.
+  void* mappedPointer = nullptr;
+
+  friend class D3D12GPU;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandBuffer.cpp b/src/gpu/d3d12/D3D12CommandBuffer.cpp
new file mode 100644
index 000000000..18b0c248e
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandBuffer.cpp
@@ -0,0 +1,34 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12CommandBuffer.h"
+#include "D3D12GPU.h"
+
+namespace tgfx {
+
+D3D12CommandBuffer::~D3D12CommandBuffer() {
+  // A null allocator means the session has been moved out of this object, so nothing is left to
+  // clean up. A live allocator means the session was never handed on: return it to the GPU so
+  // it is reclaimed through D3D12GPU::reclaimAbandonedSession().
+  const bool stillOwnsSession = session.commandAllocator != nullptr;
+  if (stillOwnsSession) {
+    _gpu->reclaimAbandonedSession(std::move(session));
+  }
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandBuffer.h b/src/gpu/d3d12/D3D12CommandBuffer.h
new file mode 100644
index 000000000..4d2769bcc
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandBuffer.h
@@ -0,0 +1,66 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "gpu/d3d12/D3D12FrameSession.h"
+#include "tgfx/gpu/CommandBuffer.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * Transport container that carries a D3D12FrameSession from encoding to submission.
+ *
+ * Created by D3D12CommandEncoder::onFinish() which moves its D3D12FrameSession here. Consumed by
+ * D3D12CommandQueue::submit() which moves the session into D3D12GPU's InflightSubmission. After
+ * submit(), this object is empty and may be discarded.
+ *
+ * If the CommandBuffer is abandoned (created but never submitted), the destructor reclaims all
+ * session resources via D3D12GPU::reclaimAbandonedSession(). This matches the abandon safety
+ * guarantee provided by D3D12CommandEncoder::onRelease() — both use the same unified cleanup path
+ * in D3D12GPU, ensuring no D3D12 objects are leaked regardless of where the pipeline is
+ * interrupted.
+ */
+class D3D12CommandBuffer : public CommandBuffer {
+ public:
+  /**
+   * Takes ownership of the given session. The gpu pointer is stored as-is and is dereferenced by
+   * the destructor when the session is still held at that point.
+   */
+  D3D12CommandBuffer(D3D12GPU* gpu, D3D12FrameSession session)
+      : _gpu(gpu), session(std::move(session)) {
+  }
+
+  /**
+   * Hands the session back to the GPU if its command allocator is still set; otherwise does
+   * nothing.
+   */
+  ~D3D12CommandBuffer() override;
+
+  /**
+   * Returns a mutable reference to the carried session so the consumer can move it out.
+   */
+  D3D12FrameSession& frameSession() {
+    return session;
+  }
+
+  /**
+   * Returns the session's command list, or nullptr once the session has been moved out.
+   */
+  ID3D12GraphicsCommandList* d3d12CommandList() const {
+    return session.commandList.Get();
+  }
+
+  /**
+   * Returns the session's command allocator, or nullptr once the session has been moved out.
+   */
+  ID3D12CommandAllocator* d3d12CommandAllocator() const {
+    return session.commandAllocator.Get();
+  }
+
+ private:
+  // Non-owning back-pointer used only for the abandon path in the destructor.
+  D3D12GPU* _gpu = nullptr;
+  // The recorded frame session. Declared after _gpu, matching the constructor's initializer
+  // order.
+  D3D12FrameSession session;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandEncoder.cpp b/src/gpu/d3d12/D3D12CommandEncoder.cpp
new file mode 100644
index 000000000..e7db252bf
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandEncoder.cpp
@@ -0,0 +1,456 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License.
You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12CommandEncoder.h"
+#include "D3D12BarrierBatch.h"
+#include "D3D12Buffer.h"
+#include "D3D12CommandBuffer.h"
+#include "D3D12Defines.h"
+#include "D3D12GPU.h"
+#include "D3D12MipmapGenerator.h"
+#include "D3D12RenderPass.h"
+#include "D3D12Texture.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+namespace {
+
+// Integer texel rectangle produced by ClipCopyRegion().
+struct CopyRegion {
+  uint32_t x;
+  uint32_t y;
+  uint32_t width;
+  uint32_t height;
+};
+
+// Truncates srcRect to whole texels and clips its extent against a texWidth x texHeight texture.
+// Returns false when nothing is left to copy. A negative origin or extent is treated as "nothing
+// to copy" as well, because it would wrap around once stored in the unsigned D3D12_BOX fields.
+bool ClipCopyRegion(const Rect& srcRect, uint32_t texWidth, uint32_t texHeight,
+                    CopyRegion* region) {
+  if (srcRect.x() < 0 || srcRect.y() < 0 || srcRect.width() <= 0 || srcRect.height() <= 0) {
+    return false;
+  }
+  auto x = static_cast<uint32_t>(srcRect.x());
+  auto y = static_cast<uint32_t>(srcRect.y());
+  auto width = static_cast<uint32_t>(srcRect.width());
+  auto height = static_cast<uint32_t>(srcRect.height());
+  if (x >= texWidth || y >= texHeight) {
+    return false;
+  }
+  // Compare against the remaining extent rather than x + width so the sum cannot overflow.
+  if (width > texWidth - x) {
+    width = texWidth - x;
+  }
+  if (height > texHeight - y) {
+    height = texHeight - y;
+  }
+  if (width == 0 || height == 0) {
+    return false;
+  }
+  *region = {x, y, width, height};
+  return true;
+}
+
+// Builds the D3D12_BOX that selects `region` in a 2D texture subresource.
+D3D12_BOX MakeSourceBox(const CopyRegion& region) {
+  D3D12_BOX box = {};
+  box.left = region.x;
+  box.top = region.y;
+  box.front = 0;
+  box.right = region.x + region.width;
+  box.bottom = region.y + region.height;
+  box.back = 1;
+  return box;
+}
+
+}  // namespace
+
+/**
+ * Creates an encoder around a recording-state allocator/list pair taken from the GPU's
+ * command-list pool. Returns nullptr when gpu is null or the pool cannot provide a pair.
+ */
+std::shared_ptr<D3D12CommandEncoder> D3D12CommandEncoder::Make(D3D12GPU* gpu) {
+  if (gpu == nullptr) {
+    return nullptr;
+  }
+  // Pull a recording-state allocator/list pair from the pool. On a hit, it has just been Reset();
+  // on a miss, the pool creates a fresh pair internally. Either way, the list arrives in the
+  // same recording state CreateCommandList would have produced.
+  auto entry = gpu->commandListPool().acquire(gpu->device());
+  if (!entry.valid()) {
+    LOGE("D3D12CommandEncoder: command-list pool acquire failed.");
+    return nullptr;
+  }
+
+  // Bind the process-wide shader-visible CBV/SRV/UAV ring and Sampler heap once for the entire
+  // life of this command list. Reset() clears any previous SetDescriptorHeaps state, so the
+  // bind has to happen on every reuse, not just on first creation. D3D12 documents repeated
+  // SetDescriptorHeaps as a potential stall on some drivers, and our render passes always
+  // sub-allocate descriptors into these two heaps, so a single bind is both correct and optimal.
+  ID3D12DescriptorHeap* heaps[] = {gpu->srvRing().heap(), gpu->samplerHeap()};
+  entry.commandList->SetDescriptorHeaps(2, heaps);
+
+  return gpu->makeResource<D3D12CommandEncoder>(gpu, std::move(entry.allocator),
+                                                std::move(entry.commandList));
+}
+
+D3D12CommandEncoder::D3D12CommandEncoder(D3D12GPU* gpu, ComPtr<ID3D12CommandAllocator> allocator,
+                                         ComPtr<ID3D12GraphicsCommandList> commandList)
+    : _gpu(gpu) {
+  session.commandAllocator = std::move(allocator);
+  session.commandList = std::move(commandList);
+}
+
+GPU* D3D12CommandEncoder::gpu() const {
+  return _gpu;
+}
+
+std::shared_ptr<RenderPass> D3D12CommandEncoder::onBeginRenderPass(
+    const RenderPassDescriptor& descriptor) {
+  return D3D12RenderPass::Make(this, descriptor);
+}
+
+/**
+ * Records a copy of srcRect (clipped to the source texture) from mip 0 of srcTexture to mip 0 of
+ * dstTexture at dstOffset. Both textures are left in the COMMON state afterwards. The call is a
+ * no-op when either texture is null, the clipped region is empty, or dstOffset is negative.
+ */
+void D3D12CommandEncoder::copyTextureToTexture(std::shared_ptr<Texture> srcTexture,
+                                               const Rect& srcRect,
+                                               std::shared_ptr<Texture> dstTexture,
+                                               const Point& dstOffset) {
+  if (!srcTexture || !dstTexture) {
+    return;
+  }
+  CopyRegion region = {};
+  if (!ClipCopyRegion(srcRect, static_cast<uint32_t>(srcTexture->width()),
+                      static_cast<uint32_t>(srcTexture->height()), &region)) {
+    return;
+  }
+  // CopyTextureRegion takes unsigned destination coordinates; a negative offset would wrap.
+  if (dstOffset.x < 0 || dstOffset.y < 0) {
+    return;
+  }
+
+  // Retain only once we know commands will actually be recorded.
+  auto d3d12Src = std::static_pointer_cast<D3D12Texture>(srcTexture);
+  auto d3d12Dst = std::static_pointer_cast<D3D12Texture>(dstTexture);
+  retainResource(d3d12Src);
+  retainResource(d3d12Dst);
+
+  auto cmd = session.commandList.Get();
+
+  // Combine the two pre-copy transitions (src -> COPY_SOURCE, dst -> COPY_DEST) into a single
+  // ResourceBarrier(2, ...) call. addTransition() collapses no-op transitions, so callers that
+  // are already in the requested state simply skip ahead. recordTextureStateChange snapshots
+  // the original state into session.initialTextureStates on first touch so the abandoned-
+  // session path can roll _currentState back if this copy never reaches the GPU.
+  D3D12BarrierBatch enterBatch;
+  enterBatch.addTransition(d3d12Src->d3d12Resource(), d3d12Src->currentState(),
+                           D3D12_RESOURCE_STATE_COPY_SOURCE);
+  recordTextureStateChange(d3d12Src.get(), D3D12_RESOURCE_STATE_COPY_SOURCE);
+  enterBatch.addTransition(d3d12Dst->d3d12Resource(), d3d12Dst->currentState(),
+                           D3D12_RESOURCE_STATE_COPY_DEST);
+  recordTextureStateChange(d3d12Dst.get(), D3D12_RESOURCE_STATE_COPY_DEST);
+  enterBatch.flush(cmd);
+
+  D3D12_TEXTURE_COPY_LOCATION dst = {};
+  dst.pResource = d3d12Dst->d3d12Resource();
+  dst.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+  dst.SubresourceIndex = 0;
+
+  D3D12_TEXTURE_COPY_LOCATION src = {};
+  src.pResource = d3d12Src->d3d12Resource();
+  src.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+  // Both SubresourceIndex 0 values intentionally target (mip 0, array slice 0, plane 0). The
+  // CommandEncoder::copyTextureToTexture contract does not expose mip / slice arguments and the
+  // Vulkan/Metal backends do exactly the same thing. A per-mip copy would require an API
+  // extension across every backend, not just D3D12.
+  src.SubresourceIndex = 0;
+
+  D3D12_BOX srcBox = MakeSourceBox(region);
+  cmd->CopyTextureRegion(&dst, static_cast<UINT>(dstOffset.x), static_cast<UINT>(dstOffset.y), 0,
+                         &src, &srcBox);
+
+  // Transition both resources back to COMMON in a single barrier call. D3D12 will then promote
+  // them implicitly to PIXEL_SHADER_RESOURCE on the next sample — matching the "promote on
+  // demand" behaviour the rest of the backend assumes.
+  D3D12BarrierBatch exitBatch;
+  exitBatch.addTransition(d3d12Src->d3d12Resource(), D3D12_RESOURCE_STATE_COPY_SOURCE,
+                          D3D12_RESOURCE_STATE_COMMON);
+  exitBatch.addTransition(d3d12Dst->d3d12Resource(), D3D12_RESOURCE_STATE_COPY_DEST,
+                          D3D12_RESOURCE_STATE_COMMON);
+  exitBatch.flush(cmd);
+  recordTextureStateChange(d3d12Src.get(), D3D12_RESOURCE_STATE_COMMON);
+  recordTextureStateChange(d3d12Dst.get(), D3D12_RESOURCE_STATE_COMMON);
+}
+
+/**
+ * Records a copy of srcRect (clipped to the source texture) from mip 0 of srcTexture into
+ * dstBuffer, starting at dstOffset with a row stride of dstRowBytes. A dstRowBytes of 0 means
+ * tightly packed rows. The source texture is left in the COMMON state afterwards. The call is a
+ * no-op when an argument is null, the clipped region is empty, or a required staging buffer
+ * cannot be created.
+ */
+void D3D12CommandEncoder::copyTextureToBuffer(std::shared_ptr<Texture> srcTexture,
+                                              const Rect& srcRect,
+                                              std::shared_ptr<GPUBuffer> dstBuffer,
+                                              size_t dstOffset, size_t dstRowBytes) {
+  if (!srcTexture || !dstBuffer) {
+    return;
+  }
+  CopyRegion region = {};
+  if (!ClipCopyRegion(srcRect, static_cast<uint32_t>(srcTexture->width()),
+                      static_cast<uint32_t>(srcTexture->height()), &region)) {
+    return;
+  }
+
+  auto d3d12Src = std::static_pointer_cast<D3D12Texture>(srcTexture);
+  auto d3d12Dst = std::static_pointer_cast<D3D12Buffer>(dstBuffer);
+
+  auto bytesPerPixel = static_cast<uint32_t>(DXGIFormatBytesPerPixel(d3d12Src->dxgiFormat()));
+  uint32_t tightRowBytes =
+      dstRowBytes > 0 ? static_cast<uint32_t>(dstRowBytes) : region.width * bytesPerPixel;
+
+  // A placed footprint must use a row pitch that is a multiple of
+  // D3D12_TEXTURE_DATA_PITCH_ALIGNMENT (256) and an offset that is a multiple of
+  // D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT (512). When the caller's layout violates either rule
+  // the copy is routed through a transient default-heap staging buffer with a conforming layout,
+  // and the rows are then repacked into the caller's buffer with CopyBufferRegion, which has no
+  // such alignment requirements.
+  uint32_t alignedRowPitch = (tightRowBytes + D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - 1) &
+                             ~(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - 1);
+  const bool needsRepack = alignedRowPitch != tightRowBytes ||
+                           (dstOffset % D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT) != 0;
+
+  // Create the staging buffer before recording anything. If creation fails the copy is dropped
+  // as a whole: recording the footprint copy with a non-conforming layout would be an invalid
+  // command rather than a degraded one.
+  ComPtr<ID3D12Resource> stagingBuffer = nullptr;
+  if (needsRepack) {
+    D3D12_HEAP_PROPERTIES heapProps = {};
+    heapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
+    D3D12_RESOURCE_DESC bufferDesc = {};
+    bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+    bufferDesc.Width = static_cast<UINT64>(alignedRowPitch) * region.height;
+    bufferDesc.Height = 1;
+    bufferDesc.DepthOrArraySize = 1;
+    bufferDesc.MipLevels = 1;
+    bufferDesc.Format = static_cast<DXGI_FORMAT>(0);  // DXGI_FORMAT_UNKNOWN
+    bufferDesc.SampleDesc.Count = 1;
+    bufferDesc.SampleDesc.Quality = 0;
+    bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+    bufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+    // Created in COPY_DEST so CopyTextureRegion can write to it directly.
+    auto hr = _gpu->device()->CreateCommittedResource(
+        &heapProps, D3D12_HEAP_FLAG_NONE, &bufferDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
+        IID_PPV_ARGS(&stagingBuffer));
+    if (FAILED(hr)) {
+      LOGE(
+          "D3D12CommandEncoder::copyTextureToBuffer: staging buffer creation failed, "
+          "HRESULT=0x%08X",
+          static_cast<uint32_t>(hr));
+      return;
+    }
+  }
+
+  retainResource(d3d12Src);
+  retainResource(d3d12Dst);
+
+  auto cmd = session.commandList.Get();
+  TransitionResourceState(cmd, d3d12Src->d3d12Resource(), d3d12Src->currentState(),
+                          D3D12_RESOURCE_STATE_COPY_SOURCE);
+  recordTextureStateChange(d3d12Src.get(), D3D12_RESOURCE_STATE_COPY_SOURCE);
+
+  D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprint = {};
+  footprint.Offset = needsRepack ? 0 : static_cast<UINT64>(dstOffset);
+  footprint.Footprint.Format = static_cast<DXGI_FORMAT>(d3d12Src->dxgiFormat());
+  footprint.Footprint.Width = region.width;
+  footprint.Footprint.Height = region.height;
+  footprint.Footprint.Depth = 1;
+  footprint.Footprint.RowPitch = alignedRowPitch;  // equals tightRowBytes on the direct path
+
+  D3D12_TEXTURE_COPY_LOCATION dstLoc = {};
+  dstLoc.pResource = needsRepack ? stagingBuffer.Get() : d3d12Dst->d3d12Resource();
+  dstLoc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
+  dstLoc.PlacedFootprint = footprint;
+
+  D3D12_TEXTURE_COPY_LOCATION srcLoc = {};
+  srcLoc.pResource = d3d12Src->d3d12Resource();
+  srcLoc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+  srcLoc.SubresourceIndex = 0;
+
+  D3D12_BOX srcBox = MakeSourceBox(region);
+  cmd->CopyTextureRegion(&dstLoc, 0, 0, 0, &srcLoc, &srcBox);
+
+  if (needsRepack) {
+    // Transition the staging buffer to COPY_SOURCE and repack each row into the caller's buffer.
+    TransitionResourceState(cmd, stagingBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST,
+                            D3D12_RESOURCE_STATE_COPY_SOURCE);
+    for (uint32_t row = 0; row < region.height; row++) {
+      // Widen before multiplying so large images cannot overflow 32-bit offsets.
+      auto dstRowOffset =
+          static_cast<UINT64>(dstOffset) + static_cast<UINT64>(row) * tightRowBytes;
+      auto srcRowOffset = static_cast<UINT64>(row) * alignedRowPitch;
+      cmd->CopyBufferRegion(d3d12Dst->d3d12Resource(), dstRowOffset, stagingBuffer.Get(),
+                            srcRowOffset, tightRowBytes);
+    }
+    // The session keeps the staging buffer alive until it is reclaimed.
+    session.auxBuffers.push_back(std::move(stagingBuffer));
+  }
+
+  TransitionResourceState(cmd, d3d12Src->d3d12Resource(), D3D12_RESOURCE_STATE_COPY_SOURCE,
+                          D3D12_RESOURCE_STATE_COMMON);
+  recordTextureStateChange(d3d12Src.get(), D3D12_RESOURCE_STATE_COMMON);
+}
+
+/**
+ * Records compute dispatches that fill mip levels 1..N-1 of the texture, each one downsampled
+ * from the level above it. The texture is left in the COMMON state afterwards. The call is a
+ * no-op when the texture is null, has a single mip level, the mipmap generator is unavailable,
+ * or the descriptor ring cannot provide enough slots.
+ */
+void D3D12CommandEncoder::generateMipmapsForTexture(std::shared_ptr<Texture> texture) {
+  if (!texture) {
+    return;
+  }
+  auto d3d12Tex = std::static_pointer_cast<D3D12Texture>(texture);
+  auto mipCount = static_cast<uint32_t>(d3d12Tex->mipLevelCount());
+  if (mipCount <= 1) {
+    return;
+  }
+
+  auto* generator = _gpu->mipmapGenerator();
+  if (generator == nullptr || !generator->isReady()) {
+    static bool warned = false;
+    if (!warned) {
+      LOGE(
+          "D3D12CommandEncoder::generateMipmapsForTexture: mipmap generator unavailable, "
+          "skipping (subsequent calls silently no-op).");
+      warned = true;
+    }
+    return;
+  }
+
+  auto device = _gpu->device();
+  auto cmd = session.commandList.Get();
+  auto resource = d3d12Tex->d3d12Resource();
+  auto dxgiFormat = static_cast<DXGI_FORMAT>(d3d12Tex->dxgiFormat());
+
+  retainResource(d3d12Tex);
+
+  // Sub-allocate a contiguous (mipCount-1)*2-slot range out of the GPU's process-wide
+  // CBV/SRV/UAV ring. Each mip level pair occupies two consecutive slots (SRV at 2i, UAV at
+  // 2i+1) so the compute shader's two single-descriptor tables can be bound by GPU handle. The
+  // ring is already bound to this command list; no SetDescriptorHeaps needed.
+  uint32_t descriptorsPerLevel = 2;
+  uint32_t totalDescriptors = (mipCount - 1) * descriptorsPerLevel;
+  auto range = _gpu->srvRing().allocate(totalDescriptors);
+  if (!range.valid()) {
+    LOGE("D3D12CommandEncoder::generateMipmapsForTexture: SRV ring allocation failed (count=%u).",
+         totalDescriptors);
+    return;
+  }
+  auto descSize = _gpu->srvRing().descriptorSize();
+
+  cmd->SetComputeRootSignature(generator->rootSignature());
+  cmd->SetPipelineState(generator->pipelineState());
+
+  // Move every subresource into NON_PIXEL_SHADER_RESOURCE so the SRV reads in the first iteration
+  // are valid. The current() state is the per-resource state set by previous code (typically
+  // COMMON after a writeTexture / RenderPass end).
+  auto previousState = d3d12Tex->currentState();
+  if (previousState != D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE) {
+    TransitionResourceState(cmd, resource, previousState,
+                            D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
+    recordTextureStateChange(d3d12Tex.get(), D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
+  }
+
+  // Layout of the four 32-bit root constants consumed by the mipmap compute shader. A typed
+  // struct replaces writing floats through a reinterpret_cast'ed UINT pointer, which violates
+  // the aliasing rules.
+  struct MipConstants {
+    UINT width;
+    UINT height;
+    float invWidth;
+    float invHeight;
+  };
+  static_assert(sizeof(MipConstants) == 4 * sizeof(UINT), "MipConstants must be 4 DWORDs");
+
+  uint32_t mipWidth = static_cast<uint32_t>(d3d12Tex->width());
+  uint32_t mipHeight = static_cast<uint32_t>(d3d12Tex->height());
+  for (uint32_t i = 0; i < mipCount - 1; i++) {
+    uint32_t outWidth = (mipWidth > 1) ? mipWidth / 2 : 1;
+    uint32_t outHeight = (mipHeight > 1) ? mipHeight / 2 : 1;
+
+    // Transition the destination subresource (mip[i+1]) from NON_PIXEL_SHADER_RESOURCE to
+    // UNORDERED_ACCESS so the compute shader can write to it. Source mip[i] stays in
+    // NON_PIXEL_SHADER_RESOURCE.
+    D3D12_RESOURCE_BARRIER barrier = {};
+    barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+    barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+    barrier.Transition.pResource = resource;
+    barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
+    barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+    barrier.Transition.Subresource = i + 1;
+    cmd->ResourceBarrier(1, &barrier);
+
+    // Compute the descriptor handles for this iteration's SRV (slot 2i within range) and UAV
+    // (slot 2i+1).
+    D3D12_CPU_DESCRIPTOR_HANDLE srvCpu = range.cpuStart;
+    srvCpu.ptr += static_cast<SIZE_T>(2 * i) * descSize;
+    D3D12_CPU_DESCRIPTOR_HANDLE uavCpu = range.cpuStart;
+    uavCpu.ptr += static_cast<SIZE_T>(2 * i + 1) * descSize;
+    D3D12_GPU_DESCRIPTOR_HANDLE srvGpu = range.gpuStart;
+    srvGpu.ptr += static_cast<UINT64>(2 * i) * descSize;
+    D3D12_GPU_DESCRIPTOR_HANDLE uavGpu = range.gpuStart;
+    uavGpu.ptr += static_cast<UINT64>(2 * i + 1) * descSize;
+
+    D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
+    srvDesc.Format = dxgiFormat;
+    srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
+    srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+    srvDesc.Texture2D.MostDetailedMip = i;
+    srvDesc.Texture2D.MipLevels = 1;
+    srvDesc.Texture2D.PlaneSlice = 0;
+    srvDesc.Texture2D.ResourceMinLODClamp = 0.0f;
+    device->CreateShaderResourceView(resource, &srvDesc, srvCpu);
+
+    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
+    uavDesc.Format = dxgiFormat;
+    uavDesc.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE2D;
+    uavDesc.Texture2D.MipSlice = i + 1;
+    uavDesc.Texture2D.PlaneSlice = 0;
+    device->CreateUnorderedAccessView(resource, nullptr, &uavDesc, uavCpu);
+
+    // Bind 4 root constants (output mip width, height, 1/width, 1/height) plus the SRV and UAV
+    // tables, then dispatch enough thread groups to cover the destination mip.
+    MipConstants mipConstants = {outWidth, outHeight, 1.0f / static_cast<float>(outWidth),
+                                 1.0f / static_cast<float>(outHeight)};
+    cmd->SetComputeRoot32BitConstants(0, 4, &mipConstants, 0);
+    cmd->SetComputeRootDescriptorTable(1, srvGpu);
+    cmd->SetComputeRootDescriptorTable(2, uavGpu);
+
+    UINT groupsX = (outWidth + D3D12_MIPMAP_THREAD_GROUP_SIZE - 1) / D3D12_MIPMAP_THREAD_GROUP_SIZE;
+    UINT groupsY =
+        (outHeight + D3D12_MIPMAP_THREAD_GROUP_SIZE - 1) / D3D12_MIPMAP_THREAD_GROUP_SIZE;
+    cmd->Dispatch(groupsX, groupsY, 1);
+
+    // Transition mip[i+1] from UNORDERED_ACCESS back to NON_PIXEL_SHADER_RESOURCE so the next
+    // iteration can use it as the SRV source.
+    barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+    barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
+    barrier.Transition.Subresource = i + 1;
+    cmd->ResourceBarrier(1, &barrier);
+
+    mipWidth = outWidth;
+    mipHeight = outHeight;
+  }
+
+  // Final transition: every subresource is currently NON_PIXEL_SHADER_RESOURCE. Move the whole
+  // resource back to COMMON so subsequent samplers / RTVs can pick a fresh state on demand,
+  // matching the convention every other code path uses.
+  TransitionResourceState(cmd, resource, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE,
+                          D3D12_RESOURCE_STATE_COMMON);
+  recordTextureStateChange(d3d12Tex.get(), D3D12_RESOURCE_STATE_COMMON);
+}
+
+/**
+ * Closes the command list and wraps the session in a D3D12CommandBuffer. If Close() fails the
+ * session is handed back to the GPU and nullptr is returned.
+ */
+std::shared_ptr<CommandBuffer> D3D12CommandEncoder::onFinish() {
+  auto hr = session.commandList->Close();
+  if (FAILED(hr)) {
+    LOGE("D3D12CommandEncoder: ID3D12GraphicsCommandList::Close failed, HRESULT=0x%08X",
+         static_cast<uint32_t>(hr));
+    _gpu->reclaimAbandonedSession(std::move(session));
+    return nullptr;
+  }
+  return std::make_shared<D3D12CommandBuffer>(_gpu, std::move(session));
+}
+
+void D3D12CommandEncoder::recordTextureStateChange(D3D12Texture* texture,
+                                                   D3D12_RESOURCE_STATES newState) {
+  if (texture == nullptr) {
+    return;
+  }
+  // Snapshot the original state on the first call for this texture inside the current session.
+  // unordered_map::emplace inserts only when the key is not present, leaving subsequent calls
+  // for the same texture as cheap O(1) lookups that do not overwrite the saved value.
+  session.initialTextureStates.emplace(texture, texture->currentState());
+  texture->setCurrentState(newState);
+}
+
+void D3D12CommandEncoder::onRelease(D3D12GPU* gpu) {
+  // If onFinish() was called, the session has already been moved to D3D12CommandBuffer.
+  // This path only handles abandoned encoders (encoding was started but never finished).
+  if (session.commandList == nullptr) {
+    return;
+  }
+  gpu->reclaimAbandonedSession(std::move(session));
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandEncoder.h b/src/gpu/d3d12/D3D12CommandEncoder.h
new file mode 100644
index 000000000..4c24059b9
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandEncoder.h
@@ -0,0 +1,106 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+// NOTE(review): the header name of the following include, and the template arguments on
+// std::shared_ptr / ComPtr / static_cast further down, are missing in this copy of the patch.
+// Confirm them against the original commit.
+#include
+#include "D3D12FrameSession.h"
+#include "D3D12Resource.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/CommandEncoder.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+class D3D12Texture;
+
+/**
+ * Records GPU commands into an ID3D12GraphicsCommandList and collects resource references into a
+ * D3D12FrameSession.
+ *
+ * Lifecycle mirrors VulkanCommandEncoder:
+ *   - Make() obtains an ID3D12CommandAllocator + ID3D12GraphicsCommandList pair in recording
+ *     state from the GPU's command-list pool and binds the GPU's process-wide shader-visible
+ *     CBV/SRV/UAV ring and Sampler heap to the list.
+ *   - Resource binding (RenderPass) and copy commands populate retainedResources so the GPU
+ *     keeps live references until the fence signals. Descriptor slots used during the session
+ *     are sub-allocated from the GPU's descriptor ring and reclaimed by fence directly.
+ *   - onFinish() Closes the command list and moves the entire session to D3D12CommandBuffer.
+ *   - onRelease() (abandon path) reclaims the session via D3D12GPU::reclaimAbandonedSession().
+ */
+class D3D12CommandEncoder : public CommandEncoder, public D3D12Resource {
+ public:
+  /**
+   * Creates an encoder for the given GPU. Returns nullptr if gpu is null or no command list
+   * could be acquired.
+   */
+  static std::shared_ptr Make(D3D12GPU* gpu);
+
+  /**
+   * Returns the command list being recorded, or nullptr once the session has been moved out.
+   */
+  ID3D12GraphicsCommandList* d3d12CommandList() const {
+    return session.commandList.Get();
+  }
+
+  /**
+   * Returns the allocator backing the command list, or nullptr once the session has been moved
+   * out.
+   */
+  ID3D12CommandAllocator* d3d12CommandAllocator() const {
+    return session.commandAllocator.Get();
+  }
+
+  GPU* gpu() const override;
+
+  std::shared_ptr onBeginRenderPass(const RenderPassDescriptor& descriptor) override;
+
+  /**
+   * Copies srcRect from mip 0 of srcTexture to mip 0 of dstTexture at dstOffset. Both textures
+   * are transitioned back to COMMON after the copy.
+   */
+  void copyTextureToTexture(std::shared_ptr srcTexture, const Rect& srcRect,
+                            std::shared_ptr dstTexture, const Point& dstOffset) override;
+
+  /**
+   * Copies srcRect from mip 0 of srcTexture into dstBuffer at dstOffset. A dstRowBytes of 0
+   * means rows are tightly packed (copy width times bytes per pixel).
+   */
+  void copyTextureToBuffer(std::shared_ptr srcTexture, const Rect& srcRect,
+                           std::shared_ptr dstBuffer, size_t dstOffset = 0,
+                           size_t dstRowBytes = 0) override;
+
+  /**
+   * Fills mip levels 1..N-1 of the texture with compute dispatches. Does nothing for textures
+   * with a single mip level.
+   */
+  void generateMipmapsForTexture(std::shared_ptr texture) override;
+
+ protected:
+  // Closes the command list and returns a D3D12CommandBuffer owning the session, or nullptr if
+  // Close() fails.
+  std::shared_ptr onFinish() override;
+  // Reclaims the session when the encoder is released without having been finished.
+  void onRelease(D3D12GPU* gpu) override;
+
+ private:
+  D3D12CommandEncoder(D3D12GPU* gpu, ComPtr allocator,
+                      ComPtr commandList);
+  ~D3D12CommandEncoder() override = default;
+
+  D3D12GPU* _gpu = nullptr;
+  D3D12FrameSession session;
+
+  // Used by D3D12RenderPass to register attachments / pipelines / textures for deferred release.
+  void retainResource(std::shared_ptr resource) {
+    session.retainedResources.push_back(std::move(resource));
+  }
+
+  // Keeps a descriptor heap referenced by the session.
+  void retainDescriptorHeap(ComPtr heap) {
+    session.retainedDescriptorHeaps.push_back(std::move(heap));
+  }
+
+  /**
+   * Updates a D3D12Texture's CPU-tracked _currentState and, on the first call for this texture
+   * within the current session, snapshots the original state into session.initialTextureStates
+   * so reclaimAbandonedSession() can roll it back if the encoder is destroyed before submit.
+   * Every D3D12 backend call site that previously did `tex->setCurrentState(newState)` must go
+   * through this helper instead, otherwise an aborted encoder would leave _currentState ahead
+   * of the GPU's real state and the next pass would emit a "Before state mismatch" barrier.
+   */
+  void recordTextureStateChange(D3D12Texture* texture, D3D12_RESOURCE_STATES newState);
+
+  friend class D3D12GPU;
+  friend class D3D12RenderPass;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandListPool.cpp b/src/gpu/d3d12/D3D12CommandListPool.cpp
new file mode 100644
index 000000000..5ac9cfaad
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandListPool.cpp
@@ -0,0 +1,92 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12CommandListPool.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+/**
+ * Returns an allocator/command-list pair in recording state. A pooled pair is reused when both
+ * of its Reset() calls succeed; pooled pairs that fail to reset are discarded. When no pooled
+ * pair is usable a new DIRECT allocator and list are created. The returned Entry is empty when
+ * device is null or creation fails.
+ */
+D3D12CommandListPool::Entry D3D12CommandListPool::acquire(ID3D12Device* device) {
+  Entry result = {};
+  if (device == nullptr) {
+    return result;
+  }
+  // Reuse a pooled pair if available. Reset returns the allocator to its initial empty state
+  // and rewinds the command list to a fresh recording state — both are cheap (microseconds),
+  // unlike CreateCommandAllocator/CreateCommandList which trigger driver-internal allocation.
+  while (!freeList.empty()) {
+    auto entry = std::move(freeList.back());
+    freeList.pop_back();
+    // Skip incomplete entries; dropping them here releases whatever half they still hold.
+    if (entry.allocator == nullptr || entry.commandList == nullptr) {
+      continue;
+    }
+    auto hr = entry.allocator->Reset();
+    if (FAILED(hr)) {
+      LOGE("D3D12CommandListPool::acquire: allocator Reset failed (HRESULT=0x%08X), discarding.",
+           static_cast(hr));
+      continue;
+    }
+    hr = entry.commandList->Reset(entry.allocator.Get(), nullptr);
+    if (FAILED(hr)) {
+      LOGE(
+          "D3D12CommandListPool::acquire: command-list Reset failed (HRESULT=0x%08X), "
+          "discarding.",
+          static_cast(hr));
+      continue;
+    }
+    return entry;
+  }
+  // Cold path: nothing pooled (or every pooled entry failed to Reset). Build a fresh pair.
+  ComPtr allocator = nullptr;
+  auto hr =
+      device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&allocator));
+  if (FAILED(hr)) {
+    LOGE("D3D12CommandListPool::acquire: CreateCommandAllocator failed (HRESULT=0x%08X).",
+         static_cast(hr));
+    return result;
+  }
+  ComPtr commandList = nullptr;
+  hr = device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, allocator.Get(), nullptr,
+                                 IID_PPV_ARGS(&commandList));
+  if (FAILED(hr)) {
+    LOGE("D3D12CommandListPool::acquire: CreateCommandList failed (HRESULT=0x%08X).",
+         static_cast(hr));
+    return result;
+  }
+  result.allocator = std::move(allocator);
+  result.commandList = std::move(commandList);
+  return result;
+}
+
+/**
+ * Returns a pair to the pool for later reuse. Invalid entries are ignored, and entries are
+ * dropped instead of pooled once the pool already holds MAX_POOLED pairs.
+ */
+void D3D12CommandListPool::release(Entry entry) {
+  if (!entry.valid()) {
+    return;
+  }
+  if (freeList.size() >= MAX_POOLED) {
+    // Cap the pool so a long-running app with bursty submission patterns doesn't keep an
+    // unbounded number of allocators alive. ComPtr destructors release the driver references.
+    return;
+  }
+  freeList.push_back(std::move(entry));
+}
+
+/**
+ * Drops every pooled pair.
+ */
+void D3D12CommandListPool::clear() {
+  freeList.clear();
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandListPool.h b/src/gpu/d3d12/D3D12CommandListPool.h
new file mode 100644
index 000000000..57f1a9c79
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandListPool.h
@@ -0,0 +1,95 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include
+#include "D3D12Util.h"
+
+namespace tgfx {
+
+/**
+ * Pool of (ID3D12CommandAllocator, ID3D12GraphicsCommandList) pairs for D3D12_COMMAND_LIST_TYPE_DIRECT.
+ *
+ * Why pooling matters:
+ *   - CreateCommandAllocator and CreateCommandList are among the slowest D3D12 APIs (the runtime
+ *     pre-patches command-list metadata against the device). Microsoft's guidance is "create
+ *     once, reset many".
+ *   - Backends that submit multiple command lists per frame (Tiled rendering, BackgroundBlur
+ *     offscreen passes, transient upload lists) compound the cost.
+ *
+ * Lifecycle invariants:
+ *   - acquire() returns a recording-state list.
On a hit, both objects have been Reset(); on a + * miss, freshly-created (D3D12 returns lists in recording state by default). + * - release() is called by D3D12GPU::reclaimSubmission only after the GPU fence has confirmed + * execution completed. That guarantees ID3D12CommandAllocator::Reset is safe at the next + * acquire(). + * - Abandoned sessions (encoder destroyed before submit) do not return to the pool; their + * ComPtrs simply destruct. Avoids any ambiguity about "is the list closed yet" at recycle + * time. + * + * Thread safety: not thread-safe. Caller serialises access (matches the rest of the D3D12 + * backend's single-threaded usage). + */ +class D3D12CommandListPool { + public: + // Soft cap on idle pairs kept around. Anything released past the cap is destroyed instead of + // pooled, preventing unbounded growth in long-running applications. 16 is comfortably above + // MAX_FRAMES_IN_FLIGHT (2) plus typical per-frame transient upload lists, so steady-state + // workloads won't churn the cap. + static constexpr size_t MAX_POOLED = 16; + + struct Entry { + ComPtr allocator; + ComPtr commandList; + bool valid() const { + return allocator != nullptr && commandList != nullptr; + } + }; + + D3D12CommandListPool() = default; + ~D3D12CommandListPool() = default; + + D3D12CommandListPool(const D3D12CommandListPool&) = delete; + D3D12CommandListPool& operator=(const D3D12CommandListPool&) = delete; + + /** + * Acquires a pair ready for recording. On a cache hit, both objects are Reset() before being + * returned. On a miss (or if Reset fails), a fresh pair is created. Returns an Entry with both + * fields null only if the underlying CreateXxx call fails, which is logged. + */ + Entry acquire(ID3D12Device* device); + + /** + * Returns a pair to the pool for future reuse. Caller must guarantee the GPU has finished + * executing every command list that was recorded with this pair (i.e. the fence value + * associated with the submission has signalled). 
Entries beyond MAX_POOLED are destroyed. + */ + void release(Entry entry); + + /** + * Drops every pooled pair. Used by D3D12GPU::releaseAll on shutdown to release driver + * references before the device disappears. + */ + void clear(); + + private: + std::vector freeList; +}; + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12CommandQueue.cpp b/src/gpu/d3d12/D3D12CommandQueue.cpp new file mode 100644 index 000000000..6d08de28d --- /dev/null +++ b/src/gpu/d3d12/D3D12CommandQueue.cpp @@ -0,0 +1,329 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. 
+// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12CommandQueue.h" +#include "D3D12Buffer.h" +#include "D3D12CommandBuffer.h" +#include "D3D12Defines.h" +#include "D3D12Semaphore.h" +#include "D3D12Texture.h" +#include "core/utils/Log.h" + +namespace tgfx { + +template +static T AlignUp(T x, T alignment) { + return (x + alignment - 1) & ~(alignment - 1); +} + +D3D12CommandQueue::D3D12CommandQueue(D3D12GPU* d3d12GPU) : gpu(d3d12GPU) { + D3D12_COMMAND_QUEUE_DESC desc = {}; + desc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT; + desc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE; + auto hr = gpu->device()->CreateCommandQueue(&desc, IID_PPV_ARGS(&commandQueue)); + if (FAILED(hr)) { + LOGE("D3D12CommandQueue: Failed to create command queue, HRESULT=0x%08X", + static_cast(hr)); + } +} + +D3D12CommandQueue::~D3D12CommandQueue() { + // Pending uploads (staging buffers + footprints) and pending semaphores will be released by + // the field destructors. There is no command list to flush them with at this point — the + // application must call waitUntilCompleted() before destruction if it cares about durability. 
+} + +std::chrono::steady_clock::time_point D3D12CommandQueue::completedFrameTime() const { + return gpu->lastFenceSignalTime(); +} + +void D3D12CommandQueue::writeBuffer(std::shared_ptr buffer, size_t bufferOffset, + const void* data, size_t dataSize) { + if (!buffer || !data || dataSize == 0) { + return; + } + void* mappedData = buffer->map(bufferOffset, dataSize); + if (mappedData) { + memcpy(mappedData, data, dataSize); + buffer->unmap(); + } +} + +void D3D12CommandQueue::writeTexture(std::shared_ptr texture, const Rect& rect, + const void* pixels, size_t rowBytes) { + if (!texture || !pixels) { + return; + } + auto d3d12Tex = std::static_pointer_cast(texture); + + auto width = static_cast(rect.width()); + auto height = static_cast(rect.height()); + auto bytesPerPixel = static_cast(DXGIFormatBytesPerPixel(d3d12Tex->dxgiFormat())); + if (width == 0 || height == 0 || bytesPerPixel == 0) { + return; + } + + // D3D12 requires the row pitch of a placed footprint to be a multiple of + // D3D12_TEXTURE_DATA_PITCH_ALIGNMENT (256). Caller-supplied stride may be larger or smaller. + uint32_t srcRowBytes = rowBytes > 0 ? static_cast(rowBytes) : width * bytesPerPixel; + uint32_t alignedRowPitch = AlignUp( + width * bytesPerPixel, static_cast(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT)); + uint64_t stagingSize = static_cast(alignedRowPitch) * height; + + // Fast path: sub-allocate from the GPU's process-wide UPLOAD ring. The ring resource is kept + // alive by D3D12GPU and the bytes are reclaimed automatically once the owning fence signals, + // so we do not need to add anything to PendingUpload to keep the resource live. 
+ ID3D12Resource* stagingResource = nullptr; + uint64_t stagingOffset = 0; + uint8_t* stagingCpu = nullptr; + ComPtr fallbackResource = nullptr; + auto allocation = + gpu->uploadHeap().allocate(static_cast(stagingSize), + static_cast(D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT)); + if (allocation.valid()) { + stagingResource = allocation.resource; + stagingOffset = allocation.offsetInResource; + stagingCpu = static_cast(allocation.cpu); + } else { + // Slow path (oversize allocation or saturated ring): create a one-off UPLOAD buffer. Its + // ComPtr is parked in PendingUpload so the resource outlives GPU execution. + D3D12_HEAP_PROPERTIES heapProps = {}; + heapProps.Type = D3D12_HEAP_TYPE_UPLOAD; + D3D12_RESOURCE_DESC bufferDesc = {}; + bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + bufferDesc.Width = stagingSize; + bufferDesc.Height = 1; + bufferDesc.DepthOrArraySize = 1; + bufferDesc.MipLevels = 1; + bufferDesc.Format = static_cast(DXGI_FORMAT_UNKNOWN); + bufferDesc.SampleDesc.Count = 1; + bufferDesc.SampleDesc.Quality = 0; + bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; + bufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE; + auto hr = gpu->device()->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &bufferDesc, + D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, + IID_PPV_ARGS(&fallbackResource)); + if (FAILED(hr)) { + LOGE( + "D3D12CommandQueue::writeTexture: fallback CreateCommittedResource failed, " + "HRESULT=0x%08X", + static_cast(hr)); + return; + } + void* mapped = nullptr; + D3D12_RANGE readRange = {0, 0}; + hr = fallbackResource->Map(0, &readRange, &mapped); + if (FAILED(hr) || mapped == nullptr) { + LOGE("D3D12CommandQueue::writeTexture: fallback Map failed, HRESULT=0x%08X", + static_cast(hr)); + return; + } + stagingResource = fallbackResource.Get(); + stagingOffset = 0; + stagingCpu = static_cast(mapped); + } + + auto src = static_cast(pixels); + uint32_t tightRowBytes = width * bytesPerPixel; + for (uint32_t row = 0; row < height; row++) { + 
memcpy(stagingCpu + row * alignedRowPitch, src + row * srcRowBytes, tightRowBytes); + } + if (fallbackResource != nullptr) { + // Mapping a one-off UPLOAD buffer for the duration of GPU execution is allowed but we Unmap + // here for symmetry with the previous (pre-ring) implementation; this also lets the runtime + // page out the buffer if memory pressure allows. + fallbackResource->Unmap(0, nullptr); + } + + D3D12GPU::PendingUpload upload = {}; + // Only the slow path needs to retain the staging buffer: the ring resource lives on the GPU + // instance and is reclaimed by fence directly. + upload.stagingBuffer = std::move(fallbackResource); + upload.texture = d3d12Tex; + pendingUploads.push_back(std::move(upload)); + + UploadFootprint fp = {}; + fp.stagingResource = stagingResource; + fp.footprint.Offset = stagingOffset; + fp.footprint.Footprint.Format = static_cast(d3d12Tex->dxgiFormat()); + fp.footprint.Footprint.Width = width; + fp.footprint.Footprint.Height = height; + fp.footprint.Footprint.Depth = 1; + fp.footprint.Footprint.RowPitch = alignedRowPitch; + fp.dstX = static_cast(rect.x()); + fp.dstY = static_cast(rect.y()); + fp.srcWidth = width; + fp.srcHeight = height; + pendingFootprints.push_back(fp); +} + +void D3D12CommandQueue::flushUploads(ID3D12GraphicsCommandList* commandList, + D3D12FrameSession& session) { + if (pendingUploads.empty() || commandList == nullptr) { + return; + } + for (size_t i = 0; i < pendingUploads.size(); i++) { + auto& up = pendingUploads[i]; + auto& fp = pendingFootprints[i]; + + auto current = up.texture->currentState(); + if (current != D3D12_RESOURCE_STATE_COPY_DEST) { + TransitionResourceState(commandList, up.texture->d3d12Resource(), current, + D3D12_RESOURCE_STATE_COPY_DEST); + // Snapshot the original CPU-tracked state on the first touch of this texture during the + // session so reclaimAbandonedSession() can roll _currentState back if the upload is + // never executed by the GPU. 
emplace() preserves the earliest snapshot on subsequent + // updates, exactly like D3D12CommandEncoder::recordTextureStateChange(). + session.initialTextureStates.emplace(up.texture.get(), current); + up.texture->setCurrentState(D3D12_RESOURCE_STATE_COPY_DEST); + } + + D3D12_TEXTURE_COPY_LOCATION dstLoc = {}; + dstLoc.pResource = up.texture->d3d12Resource(); + dstLoc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX; + // SubresourceIndex 0 = (mip level 0, array slice 0, plane 0). This matches the public + // writeTexture contract in CommandQueue.h: "If the texture has mipmaps, you should call + // CommandEncoder's generateMipmapsForTexture() method after writing the pixels, as mipmaps + // will not be generated automatically." VulkanCommandQueue / MetalCommandQueue make the + // same assumption (Vulkan even DEBUG_ASSERTs imageSubresource.mipLevel == 0 in its upload + // batcher). If tgfx ever adds array textures or a per-mip writeTexture overload, every + // backend must extend together — this is not a D3D12-only TODO. + dstLoc.SubresourceIndex = 0; + + D3D12_TEXTURE_COPY_LOCATION srcLoc = {}; + // The staging source is either a slot inside the GPU's UPLOAD ring (kept alive by the GPU + // instance, with offsetInResource embedded in fp.footprint.Offset) or a one-off staging + // buffer parked in PendingUpload::stagingBuffer. Either way fp.stagingResource is the raw + // pointer to use at copy time. 
+ srcLoc.pResource = fp.stagingResource; + srcLoc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT; + srcLoc.PlacedFootprint = fp.footprint; + + D3D12_BOX srcBox = {}; + srcBox.left = 0; + srcBox.top = 0; + srcBox.front = 0; + srcBox.right = fp.srcWidth; + srcBox.bottom = fp.srcHeight; + srcBox.back = 1; + + commandList->CopyTextureRegion(&dstLoc, fp.dstX, fp.dstY, 0, &srcLoc, &srcBox); + + TransitionResourceState(commandList, up.texture->d3d12Resource(), + D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COMMON); + // Already snapshotted at the COPY_DEST entry above; the second emplace is a no-op when the + // texture appears for the first time in this branch and a no-op-on-collision otherwise. + session.initialTextureStates.emplace(up.texture.get(), up.texture->currentState()); + up.texture->setCurrentState(D3D12_RESOURCE_STATE_COMMON); + } + pendingFootprints.clear(); + // pendingUploads is moved into the SubmitRequest by the caller so its staging buffers outlive + // GPU execution. +} + +void D3D12CommandQueue::submit(std::shared_ptr commandBuffer) { + if (!commandBuffer) { + return; + } + auto d3d12Cmd = std::static_pointer_cast(commandBuffer); + auto session = std::move(d3d12Cmd->frameSession()); + if (session.commandList == nullptr) { + return; + } + + // If pixel uploads were recorded since the last submit, splice them onto the front of the + // submission as an auxiliary upload command list. The GPU executes auxCommandLists before the + // session.commandList, ensuring textures are populated before the render list samples them. 
+ if (!pendingUploads.empty()) { + auto entry = gpu->commandListPool().acquire(gpu->device()); + if (!entry.valid()) { + LOGE( + "D3D12CommandQueue::submit: failed to acquire transient upload list, dropping " + "uploads."); + pendingUploads.clear(); + pendingFootprints.clear(); + } else { + flushUploads(entry.commandList.Get(), session); + entry.commandList->Close(); + session.auxAllocators.push_back(std::move(entry.allocator)); + session.auxCommandLists.push_back(std::move(entry.commandList)); + } + } + + D3D12GPU::SubmitRequest request = {}; + request.session = std::move(session); + request.uploads = std::move(pendingUploads); + request.signalSemaphore = std::move(pendingSignalSemaphore); + request.waitSemaphore = std::move(pendingWaitSemaphore); + // Capture _frameTime here (CommandQueue base class member). The GPU stamps the inflight + // submission with this value and later publishes it as _lastFenceSignalTime so the resource + // cache can decide which scratch resources the GPU is done reading. + request.frameTime = _frameTime; + pendingUploads.clear(); + pendingFootprints.clear(); + pendingSignalSemaphore = nullptr; + pendingWaitSemaphore = nullptr; + + gpu->executeSubmission(std::move(request)); +} + +std::shared_ptr D3D12CommandQueue::insertSemaphore() { + auto semaphore = D3D12Semaphore::Make(gpu); + if (semaphore == nullptr) { + return nullptr; + } + pendingSignalSemaphore = semaphore; + return semaphore; +} + +void D3D12CommandQueue::waitSemaphore(std::shared_ptr semaphore) { + if (semaphore == nullptr) { + return; + } + pendingWaitSemaphore = std::static_pointer_cast(semaphore); +} + +void D3D12CommandQueue::waitUntilCompleted() { + // Flush any pending uploads even if the application did not submit a command buffer between + // writeTexture() and waitUntilCompleted(). 
+ if (!pendingUploads.empty()) { + auto entry = gpu->commandListPool().acquire(gpu->device()); + if (entry.valid()) { + D3D12GPU::SubmitRequest request = {}; + flushUploads(entry.commandList.Get(), request.session); + entry.commandList->Close(); + + request.session.auxAllocators.push_back(std::move(entry.allocator)); + request.session.auxCommandLists.push_back(std::move(entry.commandList)); + request.uploads = std::move(pendingUploads); + request.frameTime = _frameTime; + pendingUploads.clear(); + pendingFootprints.clear(); + gpu->executeSubmission(std::move(request)); + } else { + LOGE( + "D3D12CommandQueue::waitUntilCompleted: failed to acquire upload list, dropping " + "uploads."); + pendingUploads.clear(); + pendingFootprints.clear(); + } + } + gpu->waitAllInflightSubmissions(); +} + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12CommandQueue.h b/src/gpu/d3d12/D3D12CommandQueue.h new file mode 100644 index 000000000..33b8e8b06 --- /dev/null +++ b/src/gpu/d3d12/D3D12CommandQueue.h @@ -0,0 +1,105 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. 
+// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include "D3D12GPU.h" +#include "D3D12Util.h" +#include "tgfx/gpu/CommandQueue.h" + +namespace tgfx { + +class D3D12GPU; +class D3D12Semaphore; + +/** + * Thin coordination layer satisfying the public CommandQueue interface. Mirrors VulkanCommandQueue: + * holds only the data accumulated between two consecutive submit() calls and delegates submission + * timing / inflight tracking to D3D12GPU::executeSubmission(). + * + * Pending state held here: + * - pendingUploads: staging UPLOAD buffers from writeTexture(), consumed by submit(). + * - pendingSignal/WaitSemaphore: from insertSemaphore()/waitSemaphore(), consumed by submit(). + */ +class D3D12CommandQueue : public CommandQueue { + public: + explicit D3D12CommandQueue(D3D12GPU* gpu); + ~D3D12CommandQueue() override; + + ID3D12CommandQueue* d3d12CommandQueue() const { + return commandQueue.Get(); + } + + void submit(std::shared_ptr commandBuffer) override; + + void writeBuffer(std::shared_ptr buffer, size_t bufferOffset, const void* data, + size_t dataSize) override; + + void writeTexture(std::shared_ptr texture, const Rect& rect, const void* pixels, + size_t rowBytes) override; + + std::shared_ptr insertSemaphore() override; + + void waitSemaphore(std::shared_ptr semaphore) override; + + void waitUntilCompleted() override; + + protected: + // Report the steady-clock timestamp of the most recently completed inflight submission so + // ResourceCache::findScratchResource can correctly skip scratch buffers/textures that the GPU + // is still reading. The base class default returns the *current* frame time, which lets a + // second flush() reuse a vertex buffer the first flush()'s GPU work is still reading + // (see RecordingTest.MultipleRecordingsInOrder). 
+ std::chrono::steady_clock::time_point completedFrameTime() const override; + + private: + // Records the upload command list, retaining the staging buffer references inside `session` so + // it can carry them through to the inflight submission and so an abandoned submit can roll + // back any texture state changes the upload introduced. + void flushUploads(ID3D12GraphicsCommandList* commandList, D3D12FrameSession& session); + + D3D12GPU* gpu = nullptr; + ComPtr commandQueue = nullptr; + + // Produced by writeTexture(), consumed by the next submit() (or waitUntilCompleted()) which + // records CopyTextureRegion commands and then moves the staging buffers into the inflight + // submission so they can be safely released after the GPU fence signals. + std::vector pendingUploads; + + // Per-upload metadata kept alongside pendingUploads so flushUploads can record the GPU copy + // without re-deriving the row pitch / pixel dimensions. + struct UploadFootprint { + // Source ID3D12Resource for CopyTextureRegion. Lifetime is owned either by D3D12GPU's + // UPLOAD ring (fast path, no extra retention required) or by the matching PendingUpload's + // stagingBuffer ComPtr (slow / fallback path). + ID3D12Resource* stagingResource = nullptr; + D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprint = {}; + UINT dstX = 0; + UINT dstY = 0; + UINT srcWidth = 0; + UINT srcHeight = 0; + }; + std::vector pendingFootprints; + + std::shared_ptr pendingSignalSemaphore; + std::shared_ptr pendingWaitSemaphore; +}; + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Defines.h b/src/gpu/d3d12/D3D12Defines.h new file mode 100644 index 000000000..6bee1d903 --- /dev/null +++ b/src/gpu/d3d12/D3D12Defines.h @@ -0,0 +1,98 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include "tgfx/gpu/PixelFormat.h" + +namespace tgfx { + +// DXGI_FORMAT values (from dxgiformat.h). Defined as constexpr constants instead of #define macros +// to avoid conflicts with the Windows SDK's DXGI_FORMAT enum when both headers are included. +static constexpr unsigned DXGI_FORMAT_UNKNOWN = 0; +static constexpr unsigned DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20; +static constexpr unsigned DXGI_FORMAT_R8G8B8A8_UNORM = 28; +static constexpr unsigned DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29; +static constexpr unsigned DXGI_FORMAT_D24_UNORM_S8_UINT = 45; +static constexpr unsigned DXGI_FORMAT_R8G8_UNORM = 49; +static constexpr unsigned DXGI_FORMAT_R8_UNORM = 61; +static constexpr unsigned DXGI_FORMAT_A8_UNORM = 65; +static constexpr unsigned DXGI_FORMAT_B8G8R8A8_UNORM = 87; +static constexpr unsigned DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91; + +inline PixelFormat DXGIFormatToPixelFormat(unsigned dxgiFormat) { + switch (dxgiFormat) { + case DXGI_FORMAT_R8_UNORM: + case DXGI_FORMAT_A8_UNORM: + return PixelFormat::ALPHA_8; + case DXGI_FORMAT_R8G8_UNORM: + return PixelFormat::RG_88; + case DXGI_FORMAT_B8G8R8A8_UNORM: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + return PixelFormat::BGRA_8888; + case DXGI_FORMAT_D24_UNORM_S8_UINT: + case DXGI_FORMAT_D32_FLOAT_S8X24_UINT: + return 
PixelFormat::DEPTH24_STENCIL8; + case DXGI_FORMAT_R8G8B8A8_UNORM: + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + default: + return PixelFormat::RGBA_8888; + } +} + +inline unsigned PixelFormatToDXGIFormat(PixelFormat format) { + switch (format) { + case PixelFormat::ALPHA_8: + return DXGI_FORMAT_R8_UNORM; + case PixelFormat::GRAY_8: + return DXGI_FORMAT_R8_UNORM; + case PixelFormat::RG_88: + return DXGI_FORMAT_R8G8_UNORM; + case PixelFormat::BGRA_8888: + return DXGI_FORMAT_B8G8R8A8_UNORM; + case PixelFormat::DEPTH24_STENCIL8: + return DXGI_FORMAT_D24_UNORM_S8_UINT; + case PixelFormat::RGBA_8888: + return DXGI_FORMAT_R8G8B8A8_UNORM; + default: + return DXGI_FORMAT_UNKNOWN; + } +} + +inline size_t DXGIFormatBytesPerPixel(unsigned dxgiFormat) { + switch (dxgiFormat) { + case DXGI_FORMAT_R8_UNORM: + case DXGI_FORMAT_A8_UNORM: + return 1; + case DXGI_FORMAT_R8G8_UNORM: + return 2; + case DXGI_FORMAT_R8G8B8A8_UNORM: + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + case DXGI_FORMAT_B8G8R8A8_UNORM: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + case DXGI_FORMAT_D24_UNORM_S8_UINT: + return 4; + case DXGI_FORMAT_D32_FLOAT_S8X24_UINT: + return 8; + default: + return 4; + } +} + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12DescriptorRing.cpp b/src/gpu/d3d12/D3D12DescriptorRing.cpp new file mode 100644 index 000000000..4b56e5bcf --- /dev/null +++ b/src/gpu/d3d12/D3D12DescriptorRing.cpp @@ -0,0 +1,142 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// unless required by applicable law or agreed to in writing, software distributed under the
+// license is distributed on an "as is" basis, without warranties or conditions of any kind,
+// either express or implied. see the license for the specific language governing permissions
+// and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12DescriptorRing.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+// Creates the descriptor heap backing the ring and resets all bookkeeping.
+// Returns false when `device` is null, `capacity` is zero, or heap creation fails.
+bool D3D12DescriptorRing::init(ID3D12Device* device, D3D12_DESCRIPTOR_HEAP_TYPE type,
+                               uint32_t capacity, bool shaderVisible) {
+  if (device == nullptr || capacity == 0) {
+    return false;
+  }
+  D3D12_DESCRIPTOR_HEAP_DESC desc = {};
+  desc.Type = type;
+  desc.NumDescriptors = capacity;
+  // SHADER_VISIBLE is illegal on RTV/DSV heaps; callers that build those rings pass
+  // shaderVisible=false and skip the gpuBase initialisation below so the ring still works for
+  // CPU-side allocation but never hands out a GPU handle.
+  desc.Flags =
+      shaderVisible ? D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE : D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
+  if (FAILED(device->CreateDescriptorHeap(&desc, IID_PPV_ARGS(&_heap)))) {
+    LOGE("D3D12DescriptorRing::init() CreateDescriptorHeap failed (type=%d capacity=%u).",
+         static_cast(type), capacity);
+    return false;
+  }
+  _capacity = capacity;
+  _descriptorSize = device->GetDescriptorHandleIncrementSize(type);
+  cpuBase = _heap->GetCPUDescriptorHandleForHeapStart();
+  gpuBase =
+      shaderVisible ? _heap->GetGPUDescriptorHandleForHeapStart() : D3D12_GPU_DESCRIPTOR_HANDLE{};
+  head = 0;
+  committedHead = 0;
+  outstandingSlots = 0;
+  // Drop any inflight entries left over from a previous init() so the post-init state really is
+  // "fresh", matching the resetForContextLost() invariant. There is no current re-init path,
+  // but if one is added later (device-lost recovery, test teardown) those inflight entries
+  // would otherwise reference the previous, just-released heap.
+  inflight.clear();
+  return true;
+}
+
+// Hands out `count` contiguous slots starting at head. Returns an invalid (count == 0) Range
+// when the ring is uninitialised, `count` is zero or larger than the capacity, or the request
+// (plus any wrap-around padding) does not fit into the slots that are currently free.
+D3D12DescriptorRing::Range D3D12DescriptorRing::allocate(uint32_t count) {
+  if (_heap == nullptr || count == 0 || count > _capacity) {
+    return {};
+  }
+  // Use the explicit outstandingSlots counter to know how many slots are still in use.
+  // Head-position arithmetic alone cannot disambiguate "empty" from "full" once an allocation
+  // pushes head all the way around the ring.
+  uint32_t freeSlots = _capacity - outstandingSlots;
+  uint32_t needed = count;
+  uint32_t startSlot = head;
+  uint32_t skipped = 0;
+  if (head + count > _capacity) {
+    // Avoid splitting an allocation across the wrap-around boundary so callers can pass a
+    // single contiguous CPU/GPU descriptor range to D3D12 APIs (CreateShaderResourceView,
+    // SetGraphicsRootDescriptorTable, etc.). The discarded slots between head and the end of
+    // the ring are billed against the same free pool so the ring stays accounting-consistent.
+    skipped = _capacity - head;
+    needed = count + skipped;
+    startSlot = 0;
+  }
+  if (needed > freeSlots) {
+    LOGE(
+        "D3D12DescriptorRing::allocate() out of slots: requested=%u free=%u capacity=%u "
+        "skipped=%u.",
+        count, freeSlots, _capacity, skipped);
+    return {};
+  }
+  Range range = {};
+  range.cpuStart = cpuBase;
+  range.cpuStart.ptr += static_cast(startSlot) * _descriptorSize;
+  range.gpuStart = gpuBase;
+  if (gpuBase.ptr != 0) {
+    // Only shader-visible rings have a GPU base address. Leave gpuStart zeroed for RTV/DSV
+    // rings so a slot offset is never mistaken for a usable GPU descriptor handle.
+    range.gpuStart.ptr += static_cast(startSlot) * _descriptorSize;
+  }
+  range.startSlot = startSlot;
+  range.count = count;
+  head = startSlot + count;
+  if (head == _capacity) {
+    head = 0;
+  }
+  outstandingSlots += needed;
+  return range;
+}
+
+// Associates every slot allocated since the previous commit() (including wrap-around padding)
+// with `fenceValue`. Does nothing when no slots were allocated since the previous commit.
+void D3D12DescriptorRing::commit(uint64_t fenceValue) {
+  // outstandingSlots is the sum of (a) slots already owned by inflight entries and (b) slots
+  // allocated since the previous commit. Deriving (b) from that identity is exact. Comparing
+  // head with committedHead is not: they are equal both when nothing was allocated and when
+  // exactly one full capacity was allocated, and outstandingSlots != 0 cannot tell those apart
+  // while earlier commits are still in flight. Guessing "full" there would enqueue a phantom
+  // capacity-sized entry whose retirement zeroes outstandingSlots and releases slots the GPU
+  // may still be reading.
+  uint32_t inflightSlots = 0;
+  for (const auto& pending : inflight) {
+    inflightSlots += pending.slots;
+  }
+  committedHead = head;
+  if (outstandingSlots <= inflightSlots) {
+    // Nothing was allocated since the last commit; skip enqueuing an empty fence record.
+    return;
+  }
+  InflightRange entry = {};
+  entry.fenceValue = fenceValue;
+  entry.slots = outstandingSlots - inflightSlots;
+  inflight.push_back(entry);
+}
+
+// Returns the slots of every inflight entry whose fence value is <= `completedFenceValue` to
+// the free pool, oldest entry first.
+void D3D12DescriptorRing::retire(uint64_t completedFenceValue) {
+  while (!inflight.empty() && inflight.front().fenceValue <= completedFenceValue) {
+    if (outstandingSlots >= inflight.front().slots) {
+      outstandingSlots -= inflight.front().slots;
+    } else {
+      // Defensive: bookkeeping should never drop below zero. If it does we reset rather than
+      // wrap to UINT32_MAX and stop accepting allocations forever.
+      outstandingSlots = 0;
+    }
+    inflight.pop_front();
+  }
+}
+
+// Forgets every inflight entry and rewinds the ring to its empty state; the heap is kept.
+void D3D12DescriptorRing::resetForContextLost() {
+  inflight.clear();
+  head = 0;
+  committedHead = 0;
+  outstandingSlots = 0;
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12DescriptorRing.h b/src/gpu/d3d12/D3D12DescriptorRing.h
new file mode 100644
index 000000000..efd6c4289
--- /dev/null
+++ b/src/gpu/d3d12/D3D12DescriptorRing.h
@@ -0,0 +1,144 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tencent is pleased to support the open source community by making tgfx available.
+//
+// Copyright (C) 2026 Tencent. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// unless required by applicable law or agreed to in writing, software distributed under the
+// license is distributed on an "as is" basis, without warranties or conditions of any kind,
+// either express or implied. see the license for the specific language governing permissions
+// and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include
+#include "D3D12Util.h"
+
+namespace tgfx {
+
+/**
+ * Single descriptor heap used as a fence-tracked ring buffer. The heap is shader-visible by
+ * default; init() also accepts shaderVisible=false for RTV / DSV heaps.
+ *
+ * Rationale:
+ * - D3D12 shader-visible descriptor heaps are expensive to create (they reserve a GPU virtual
+ *   address range and the runtime caps total live shader-visible descriptors per heap type).
+ * - The naive "create one heap per render pass" pattern hits driver limits and burns CPU on
+ *   every submission. The standard D3D12 idiom is one large heap per heap type, sub-allocated
+ *   linearly with fence-based reclamation.
+ *
+ * Allocation model:
+ * - allocate(count) hands out a contiguous slot range starting at the head index, which wraps
+ *   to 0 at capacity. A request that would straddle the end of the heap starts at slot 0
+ *   instead and the skipped tail slots are billed to the same allocation.
+ * - commit(fenceValue) snapshots head: every slot allocated since the last commit is now
+ *   "owned" by `fenceValue` and will be reclaimed once the GPU signals it.
+ * - retire(completedFenceValue) pops every committed range whose fence has completed and
+ *   subtracts its slot count from outstandingSlots, freeing those slots for re-allocation.
+ * - When a request does not fit into (capacity - outstandingSlots), allocate() returns an
+ *   invalid Range; the caller must treat that as a hard failure (capacity should be sized to
+ *   make this unreachable in normal use).
+ *
+ * Thread safety: not thread-safe. Caller must serialise all access. Matches tgfx's overall
+ * single-threaded D3D12 backend usage.
+ */
+class D3D12DescriptorRing {
+ public:
+  // Result of allocate(): handles of the first slot plus the slot index and slot count.
+  struct Range {
+    D3D12_CPU_DESCRIPTOR_HANDLE cpuStart = {};
+    D3D12_GPU_DESCRIPTOR_HANDLE gpuStart = {};
+    uint32_t startSlot = 0;
+    uint32_t count = 0;
+    // A default-constructed Range (count == 0) is what allocate() returns on failure.
+    bool valid() const {
+      return count > 0;
+    }
+  };
+
+  D3D12DescriptorRing() = default;
+
+  /**
+   * Creates the underlying descriptor heap. Pass D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV /
+   * SAMPLER for shader-visible rings (the default), or RTV / DSV with shaderVisible=false for
+   * the non-shader-visible variants used by render targets. D3D12 rejects SHADER_VISIBLE on
+   * RTV/DSV heaps, so the flag must follow the heap type.
+   */
+  bool init(ID3D12Device* device, D3D12_DESCRIPTOR_HEAP_TYPE type, uint32_t capacity,
+            bool shaderVisible = true);
+
+  /**
+   * Sub-allocates `count` consecutive slots. Returns an invalid Range if the ring cannot satisfy
+   * the request without overrunning still-in-flight slots.
+   */
+  Range allocate(uint32_t count);
+
+  /**
+   * Marks every slot allocated since the previous commit() as belonging to `fenceValue`. Those
+   * slots become reclaimable only after the GPU advances the fence past `fenceValue`.
+   */
+  void commit(uint64_t fenceValue);
+
+  /**
+   * Reclaims slots whose owning fence value is at or below `completedFenceValue`. Cheap;
+   * intended to be called from the same place as the existing inflight-submission polling.
+   */
+  void retire(uint64_t completedFenceValue);
+
+  /**
+   * Drops every inflight range and resets the head / committedHead / outstandingSlots
+   * bookkeeping while keeping the underlying ID3D12DescriptorHeap allocated. Intended for the
+   * context-lost recovery path: once D3D12GPU has decided the device is gone, the fences
+   * associated with those inflight ranges will never advance, so retire() would never reclaim
+   * them. Without this reset their slots stay billed against outstandingSlots and the ring
+   * would refuse every subsequent allocation forever, even if the application keeps the GPU
+   * instance around for diagnostics.
+   */
+  void resetForContextLost();
+
+  // Raw heap pointer; null until init() has succeeded.
+  ID3D12DescriptorHeap* heap() const {
+    return _heap.Get();
+  }
+
+  // Handle increment size reported by the device for this heap type during init().
+  uint32_t descriptorSize() const {
+    return _descriptorSize;
+  }
+
+  // Number of slots in the heap; zero until init() has succeeded.
+  uint32_t capacity() const {
+    return _capacity;
+  }
+
+ private:
+  // head is the slot index modulo capacity at which the next allocate() will start writing.
+  // A classic (head, tail) pair could also answer "how full is the ring?", but that pair
+  // cannot disambiguate "empty" from "full" once an allocation pushes head all the way round;
+  // we maintain an explicit outstandingSlots counter instead — allocate() bumps it, retire()
+  // drains it. Without that counter an allocation that spans the entire capacity (or a
+  // sequence whose head wraps right back onto the oldest in-flight slot) would convince the
+  // next allocate() that the ring is empty and hand back slots the GPU is still reading.
+  uint32_t head = 0;
+  // Snapshot of head at the last commit() call. Slots in [committedHead, head) are part of the
+  // current pending submission; slots before that have already been associated with a fence.
+  uint32_t committedHead = 0;
+  // Slots currently held by either an as-yet-uncommitted allocation or an inflight commit
+  // waiting on its fence. allocate() rejects the request when needed > capacity - outstanding.
+  uint32_t outstandingSlots = 0;
+
+  struct InflightRange {
+    uint64_t fenceValue = 0;
+    // Slots consumed between the previous commit() and this one (including any wrap-around
+    // skip), subtracted from outstandingSlots when retire() reaches this entry.
+    uint32_t slots = 0;
+  };
+  // Committed ranges in commit order; retire() pops from the front.
+  std::deque inflight;
+
+  ComPtr _heap = nullptr;
+  uint32_t _capacity = 0;
+  uint32_t _descriptorSize = 0;
+  // Heap-start handles captured by init(); gpuBase stays zeroed for non-shader-visible heaps.
+  D3D12_CPU_DESCRIPTOR_HANDLE cpuBase = {};
+  D3D12_GPU_DESCRIPTOR_HANDLE gpuBase = {};
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Device.cpp b/src/gpu/d3d12/D3D12Device.cpp
new file mode 100644
index 000000000..27715395a
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Device.cpp
@@ -0,0 +1,177 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tencent is pleased to support the open source community by making tgfx available.
+//
+// Copyright (C) 2026 Tencent. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// unless required by applicable law or agreed to in writing, software distributed under the
+// license is distributed on an "as is" basis, without warranties or conditions of any kind,
+// either express or implied. see the license for the specific language governing permissions
+// and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12Device.h"
+#include
+#include "D3D12GPU.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+// Initialise the optional D3D12 debug layer and DRED settings shared by both Make() and
+// MakeWarp(). Idempotent — calling EnableDebugLayer / DRED setup twice is a no-op past the
+// first invocation, so the duplication does not cost anything in practice.
+static void EnableD3D12DebugFeatures() {
+#if !defined(NDEBUG) || defined(TGFX_D3D12_DEBUG_LAYER)
+  // Enable the D3D12 debug layer in every build without NDEBUG, and in any other build that
+  // defines TGFX_D3D12_DEBUG_LAYER. Must be called before
+  // D3D12CreateDevice.
Validation messages surface as readable text instead of
+  // generic E_INVALIDARG return codes.
+  {
+    ComPtr debugController = nullptr;
+    if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)))) {
+      debugController->EnableDebugLayer();
+    }
+  }
+#endif
+#if !defined(NDEBUG) || defined(TGFX_D3D12_DEBUG_LAYER) || defined(TGFX_D3D12_DRED)
+  // Enable Device Removed Extended Data so that, on a TDR/hang, we can ask the driver which
+  // command was the last one the GPU started and which one it was about to execute next. This
+  // is the cheapest way to localise a hang without attaching PIX. Must be requested before
+  // D3D12CreateDevice; queried later via D3D12GPU when GetDeviceRemovedReason() reports a fault.
+  {
+    ComPtr dredSettings = nullptr;
+    auto dredHr = D3D12GetDebugInterface(IID_PPV_ARGS(&dredSettings));
+    if (SUCCEEDED(dredHr)) {
+      dredSettings->SetAutoBreadcrumbsEnablement(D3D12_DRED_ENABLEMENT_FORCED_ON);
+      dredSettings->SetPageFaultEnablement(D3D12_DRED_ENABLEMENT_FORCED_ON);
+      // Successful setup is informational, not an error; keep the error log for real failures.
+      LOGI("[DRED setup] Auto-breadcrumbs and page-fault tracking enabled.");
+    } else {
+      LOGE(
+          "[DRED setup] D3D12GetDebugInterface(ID3D12DeviceRemovedExtendedDataSettings) "
+          "returned HRESULT=0x%08X; DRED unavailable.",
+          static_cast(dredHr));
+    }
+  }
+#endif
+}
+
+// Creates a device on the first adapter that is not flagged DXGI_ADAPTER_FLAG_SOFTWARE and
+// accepts D3D_FEATURE_LEVEL_11_0. Returns nullptr when the DXGI factory cannot be created or
+// no such adapter exists.
+std::shared_ptr D3D12Device::Make() {
+  EnableD3D12DebugFeatures();
+  ComPtr factory = nullptr;
+  if (FAILED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)))) {
+    LOGE("D3D12Device::Make() Failed to create DXGI factory.");
+    return nullptr;
+  }
+  ComPtr adapter = nullptr;
+  for (UINT i = 0; factory->EnumAdapters1(i, &adapter) != DXGI_ERROR_NOT_FOUND; ++i) {
+    DXGI_ADAPTER_DESC1 desc = {};
+    adapter->GetDesc1(&desc);
+    if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) {
+      // Software adapters are skipped here; MakeWarp() is the explicit opt-in for those.
+      adapter = nullptr;
+      continue;
+    }
+    ComPtr d3d12Device = nullptr;
+    if (SUCCEEDED(
+            D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&d3d12Device)))) {
+      return MakeFrom(d3d12Device.Get());
+    }
+    adapter = nullptr;
+  }
+  LOGE("D3D12Device::Make() No suitable D3D12 hardware adapter found.");
+  return nullptr;
+}
+
+// Creates a device on the WARP software adapter. Returns nullptr when the factory, the WARP
+// adapter or the device cannot be created.
+std::shared_ptr D3D12Device::MakeWarp() {
+  // WARP is the Windows Advanced Rasterization Platform — Microsoft's CPU-based reference
+  // implementation of D3D12 that ships with every modern Windows install. It is feature
+  // complete (FL12_1) but very slow; the only sensible callers are CI runners and offline
+  // tools that do not have a usable hardware adapter. CreateDXGIFactory1 +
+  // EnumWarpAdapter is the documented entry point.
+  EnableD3D12DebugFeatures();
+  ComPtr factory = nullptr;
+  if (FAILED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)))) {
+    LOGE("D3D12Device::MakeWarp() Failed to create DXGI factory.");
+    return nullptr;
+  }
+  ComPtr warpAdapter = nullptr;
+  if (FAILED(factory->EnumWarpAdapter(IID_PPV_ARGS(&warpAdapter)))) {
+    LOGE("D3D12Device::MakeWarp() EnumWarpAdapter failed; WARP unavailable on this system.");
+    return nullptr;
+  }
+  ComPtr d3d12Device = nullptr;
+  if (FAILED(D3D12CreateDevice(warpAdapter.Get(), D3D_FEATURE_LEVEL_11_0,
+                               IID_PPV_ARGS(&d3d12Device)))) {
+    LOGE("D3D12Device::MakeWarp() D3D12CreateDevice on WARP adapter failed.");
+    return nullptr;
+  }
+  return MakeFrom(d3d12Device.Get());
+}
+
+// Wraps an existing device pointer. Returns nullptr when `device` is null, when the
+// QueryInterface below yields no interface, or when D3D12GPU::Make() fails.
+std::shared_ptr D3D12Device::MakeFrom(void* device) {
+  if (device == nullptr) {
+    return nullptr;
+  }
+  auto d3d12Device = static_cast(device);
+  ComPtr devicePtr = nullptr;
+  d3d12Device->QueryInterface(IID_PPV_ARGS(&devicePtr));
+  if (devicePtr == nullptr) {
+    return nullptr;
+  }
+#if !defined(NDEBUG) || defined(TGFX_D3D12_DEBUG_LAYER)
+  // Configure the debug-layer info queue so subsequent CreateDescriptorHeap / ResourceBarrier /
+  // DrawX failures get logged with the underlying validation message instead of just an opaque
+  // HRESULT. Messages are drained per-submission by D3D12GPU::executeSubmission().
+  ComPtr infoQueue = nullptr;
+  if (SUCCEEDED(devicePtr->QueryInterface(IID_PPV_ARGS(&infoQueue)))) {
+    infoQueue->SetMuteDebugOutput(FALSE);
+    // Default storage limit is 1024; expand it so we don't lose messages between drains.
+    infoQueue->SetMessageCountLimit(8192);
+    // Break into the debugger on corruption only; errors are left to queue up so that
+    // DrainDebugMessages() can log them without interrupting execution.
+    infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_CORRUPTION, TRUE);
+    infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_ERROR, FALSE);
+  }
+#endif
+  auto gpu = D3D12GPU::Make(std::move(devicePtr));
+  if (gpu == nullptr) {
+    return nullptr;
+  }
+  auto result = std::shared_ptr(new D3D12Device(std::move(gpu)));
+  result->weakThis = result;
+  return result;
+}
+
+D3D12Device::D3D12Device(std::unique_ptr gpu) : Device(std::move(gpu)) {
+}
+
+D3D12Device::~D3D12Device() {
+  static_cast(_gpu)->releaseAll(true);
+}
+
+// Returns the pointer reported by the GPU object's device() accessor.
+void* D3D12Device::d3d12Device() const {
+  return static_cast(_gpu)->device();
+}
+
+bool D3D12Device::onLockContext() {
+  // The base Device::lockContext() acquires the device mutex before calling us. If the GPU has
+  // been removed (e.g. DXGI_ERROR_DEVICE_REMOVED on a previous Signal), every subsequent
+  // operation on the device would either return failure immediately or, worse, leave the
+  // application waiting on a mutex it owns from a path that already encountered the loss but
+  // did not unwind cleanly. We surface the loss here so the base class unlocks the mutex and
+  // returns nullptr, matching the OpenGL backend's CONTEXT_LOST handling.
+  auto* gpu = static_cast(_gpu);
+  return !gpu->isContextLost();
+}
+
+void D3D12Device::onUnlockContext() {
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Device.h b/src/gpu/d3d12/D3D12Device.h
new file mode 100644
index 000000000..2780def34
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Device.h
@@ -0,0 +1,29 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tencent is pleased to support the open source community by making tgfx available.
+//
+// Copyright (C) 2026 Tencent. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// unless required by applicable law or agreed to in writing, software distributed under the
+// license is distributed on an "as is" basis, without warranties or conditions of any kind,
+// either express or implied. see the license for the specific language governing permissions
+// and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "D3D12GPU.h"
+#include "tgfx/gpu/d3d12/D3D12Device.h"
+
+namespace tgfx {
+
+// Private implementation details for D3D12Device.
+// The public interface is in include/tgfx/gpu/d3d12/D3D12Device.h
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12FrameSession.h b/src/gpu/d3d12/D3D12FrameSession.h
new file mode 100644
index 000000000..6736edf22
--- /dev/null
+++ b/src/gpu/d3d12/D3D12FrameSession.h
@@ -0,0 +1,99 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tencent is pleased to support the open source community by making tgfx available.
+//
+// Copyright (C) 2026 Tencent. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// unless required by applicable law or agreed to in writing, software distributed under the
+// license is distributed on an "as is" basis, without warranties or conditions of any kind,
+// either express or implied. see the license for the specific language governing permissions
+// and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include
+#include
+#include
+#include "gpu/d3d12/D3D12Resource.h"
+#include "gpu/d3d12/D3D12Util.h"
+
+namespace tgfx {
+
+class D3D12Texture;
+
+/**
+ * Value-type aggregate of all per-frame GPU resources produced during one encoding session.
+ *
+ * Why D3D12 needs this (mirrors VulkanFrameSession's role):
+ * - D3D12, like Vulkan, provides NO automatic resource tracking. ID3D12GraphicsCommandList is
+ *   deferred: it executes on the GPU long after recording finishes. If an ID3D12Resource backing
+ *   a buffer or texture is released before the GPU finishes reading it, behaviour is undefined.
+ *   Likewise, ID3D12CommandAllocator must outlive every command list it produced until the GPU
+ *   consumes them, otherwise Reset() will fail or recorded commands will be corrupted.
+ * - The application must explicitly keep allocator, command list, and any referenced resources
+ *   alive until the associated ID3D12Fence value signals on the queue.
+ *
+ * D3D12FrameSession is the single place where "everything this frame needs" is defined. It is
+ * moved (not copied) through the pipeline: Encoder -> CommandBuffer -> InflightSubmission. Cleanup
+ * happens exclusively after the queue's fence confirms GPU completion.
+ *
+ * Differences from Vulkan's FrameSession:
+ * - No descriptor pools: D3D12 binds via descriptor heaps, which are managed independently and
+ *   do not require per-frame pool churn.
+ * - No render passes / framebuffers: D3D12 has no equivalent persistent objects; render targets
+ *   are bound through RTV/DSV descriptors at record time.
+ * - retainedResources holds D3D12Resource subclasses (buffers, textures, samplers, pipelines).
+ *
+ * Adding a new per-frame resource type requires only two changes:
+ * 1. Add a field here.
+ * 2. Add cleanup logic in D3D12GPU::reclaimSubmission() (introduced in a later step).
+ */
+struct D3D12FrameSession {
+  // Allocator and command list of the session's main recording.
+  ComPtr commandAllocator;
+  ComPtr commandList;
+  // Strong references preventing D3D12Resource destruction while GPU is still executing.
+  // When cleared after the fence signals, refcounts decrement; resources reaching zero enter the
+  // ReturnQueue and are safely destroyed during processUnreferencedResources().
+  std::vector> retainedResources;
+  // Shader-visible descriptor heaps (CBV/SRV/UAV and Sampler) created per render pass to back
+  // SetGraphicsRootDescriptorTable. They must outlive GPU execution because the GPU keeps reading
+  // their contents until the fence signals. Released after the fence signals.
+  // NOTE(review): the struct comment above says descriptor heaps are managed independently of
+  // the frame session; confirm whether any caller still creates per-render-pass heaps and
+  // pushes them here, or whether this field is now unused.
+  std::vector> retainedDescriptorHeaps;
+  // Auxiliary command allocators/lists used to record one-off work (texture uploads) outside the
+  // main command list. Captured here so they outlive GPU execution; freed after the fence signals.
+  std::vector> auxAllocators;
+  std::vector> auxCommandLists;
+  // Auxiliary ID3D12Resource buffers (e.g. transient staging buffers used by
+  // copyTextureToBuffer for row-pitch alignment) that must live until the fence signals.
+  std::vector> auxBuffers;
+
+  // Original D3D12_RESOURCE_STATES of every texture this session has been about to mutate, keyed
+  // by raw D3D12Texture* (raw is fine because the matching shared_ptr is held in
+  // retainedResources for the session's lifetime). Populated by the helper
+  // recordTextureStateChange(): the first call for a given texture saves its current state,
+  // subsequent calls do nothing. If the session is abandoned (~D3D12CommandBuffer or
+  // ~D3D12CommandEncoder before submit), reclaimAbandonedSession walks this map to roll
+  // D3D12Texture::_currentState back to what the GPU still sees, preventing the next render
+  // pass from emitting transitions whose StateBefore disagrees with reality.
+  std::unordered_map initialTextureStates;
+
+  D3D12FrameSession() = default;
+
+  // ComPtr is move-aware and zeroes the source on move; std::vector moves are also clean. The
+  // explicit move/copy declarations match VulkanFrameSession for consistency and to make the
+  // copy-deletion intent obvious at the call site.
+  D3D12FrameSession(D3D12FrameSession&& other) noexcept = default;
+  D3D12FrameSession& operator=(D3D12FrameSession&& other) noexcept = default;
+
+  D3D12FrameSession(const D3D12FrameSession&) = delete;
+  D3D12FrameSession& operator=(const D3D12FrameSession&) = delete;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12GPU.cpp b/src/gpu/d3d12/D3D12GPU.cpp
new file mode 100644
index 000000000..2592529ad
--- /dev/null
+++ b/src/gpu/d3d12/D3D12GPU.cpp
@@ -0,0 +1,956 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tencent is pleased to support the open source community by making tgfx available.
+//
+// Copyright (C) 2026 Tencent. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License.
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12GPU.h" +#include +#include +#include +#include +#include +#include "D3D12Buffer.h" +#include "D3D12CommandEncoder.h" +#include "D3D12CommandQueue.h" +#include "D3D12MipmapGenerator.h" +#include "D3D12RenderPipeline.h" +#include "D3D12Resource.h" +#include "D3D12Sampler.h" +#include "D3D12Semaphore.h" +#include "D3D12ShaderModule.h" +#include "D3D12Texture.h" +#include "core/utils/Log.h" + +namespace tgfx { + +bool HardwareBufferAvailable() { + return false; +} + +#ifdef TGFX_D3D12_DEBUG_LAYER +void D3D12GPU::drainDebugMessages(const char* tag) { + if (d3d12Device == nullptr) { + return; + } + ComPtr infoQueue = nullptr; + if (FAILED(d3d12Device->QueryInterface(IID_PPV_ARGS(&infoQueue)))) { + return; + } + auto count = infoQueue->GetNumStoredMessages(); + for (UINT64 i = 0; i < count; i++) { + SIZE_T msgLen = 0; + infoQueue->GetMessage(i, nullptr, &msgLen); + std::vector buf(msgLen); + auto* msg = reinterpret_cast(buf.data()); + if (SUCCEEDED(infoQueue->GetMessage(i, msg, &msgLen))) { + LOGE("[D3D12 debug @ %s] %.*s", tag, static_cast(msg->DescriptionByteLength), + msg->pDescription); + } + } + if (count > 0) { + infoQueue->ClearStoredMessages(); + } +} +#else +void D3D12GPU::drainDebugMessages(const char*) { +} +#endif + +static ComPtr FindAdapter(ID3D12Device* device) { + ComPtr factory = nullptr; + if (FAILED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)))) { + return nullptr; + } + LUID luid = device->GetAdapterLuid(); + ComPtr adapter 
= nullptr; + for (UINT i = 0; factory->EnumAdapters1(i, &adapter) != DXGI_ERROR_NOT_FOUND; ++i) { + DXGI_ADAPTER_DESC1 desc = {}; + adapter->GetDesc1(&desc); + if (desc.AdapterLuid.LowPart == luid.LowPart && desc.AdapterLuid.HighPart == luid.HighPart) { + return adapter; + } + adapter = nullptr; + } + return nullptr; +} + +std::unique_ptr D3D12GPU::Make(ComPtr device) { + if (device == nullptr) { + return nullptr; + } + auto adapter = FindAdapter(device.Get()); + auto gpu = std::unique_ptr(new D3D12GPU(std::move(device), std::move(adapter))); + if (gpu->commandQueue == nullptr || gpu->_frameFence == nullptr || + gpu->_frameFenceEvent == nullptr || gpu->_srvRing.heap() == nullptr || + gpu->_samplerHeap == nullptr || gpu->_uploadHeap.capacity() == 0) { + return nullptr; + } + return gpu; +} + +D3D12GPU::D3D12GPU(ComPtr device, ComPtr adapter) + : d3d12Device(std::move(device)), dxgiAdapter(std::move(adapter)) { + initInfo(); + initFeatures(); + initLimits(); + // Create the per-frame fence and its waitable event before the command queue, since the queue + // uses these handles when scheduling Signal/Wait operations. + if (FAILED(d3d12Device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&_frameFence)))) { + LOGE("D3D12GPU: failed to create frame fence."); + return; + } + _frameFenceEvent = CreateEventW(nullptr, FALSE, FALSE, nullptr); + if (_frameFenceEvent == nullptr) { + LOGE("D3D12GPU: failed to create frame fence event."); + _frameFence = nullptr; + return; + } + // Allocate the process-wide shader-visible CBV/SRV/UAV ring up front. Failure here means we + // cannot satisfy any subsequent render-pass binding, so propagate it through to Make() via the + // null-heap check rather than letting the GPU come up half-initialised. 
+ if (!_srvRing.init(d3d12Device.Get(), D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV, + SRV_RING_CAPACITY)) { + LOGE("D3D12GPU: failed to initialise CBV/SRV/UAV descriptor ring."); + return; + } + // Allocate the process-wide shader-visible Sampler heap. Append-only, capped at D3D12's hard + // 2048-descriptor limit. Each unique SamplerDescriptor consumes one slot for the lifetime of + // the GPU instance. + D3D12_DESCRIPTOR_HEAP_DESC samplerDesc = {}; + samplerDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER; + samplerDesc.NumDescriptors = SAMPLER_HEAP_CAPACITY; + samplerDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; + if (FAILED(d3d12Device->CreateDescriptorHeap(&samplerDesc, IID_PPV_ARGS(&_samplerHeap)))) { + LOGE("D3D12GPU: failed to create shader-visible Sampler heap."); + return; + } + _samplerHeapCapacity = SAMPLER_HEAP_CAPACITY; + _samplerDescriptorIncrement = + d3d12Device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER); + // Allocate the process-wide UPLOAD ring used to stage CPU-to-GPU pixel/buffer data. Failure + // is surfaced through the capacity-zero check in Make() so the GPU does not come up with a + // partially-functional upload path. + if (!_uploadHeap.init(d3d12Device.Get(), UPLOAD_HEAP_CAPACITY)) { + LOGE("D3D12GPU: failed to initialise UPLOAD heap."); + return; + } + // Non-shader-visible RTV / DSV rings replace the per-render-pass CreateDescriptorHeap calls + // that the old D3D12RenderPass::initialise made. The OMSetRenderTargets command consumes + // CPU descriptor handles only, so a single shared heap is enough; we ring-buffer the slots + // by fence value just like the SRV/Sampler ring so descriptors stay valid until the GPU + // command list referencing them has retired. 
+ if (!_rtvRing.init(d3d12Device.Get(), D3D12_DESCRIPTOR_HEAP_TYPE_RTV, RTV_RING_CAPACITY, false)) { + LOGE("D3D12GPU: failed to initialise RTV descriptor ring."); + return; + } + if (!_dsvRing.init(d3d12Device.Get(), D3D12_DESCRIPTOR_HEAP_TYPE_DSV, DSV_RING_CAPACITY, false)) { + LOGE("D3D12GPU: failed to initialise DSV descriptor ring."); + return; + } + commandQueue = std::make_unique(this); + compiler = std::make_unique(); +} + +D3D12_GPU_DESCRIPTOR_HANDLE D3D12GPU::allocatePermanentSamplerSlot(const D3D12_SAMPLER_DESC& desc) { + D3D12_GPU_DESCRIPTOR_HANDLE invalid = {}; + if (_samplerHeap == nullptr || _samplerHeapSize >= _samplerHeapCapacity) { + LOGE("D3D12GPU::allocatePermanentSamplerSlot: sampler heap exhausted (%u/%u).", + _samplerHeapSize, _samplerHeapCapacity); + return invalid; + } + auto slot = _samplerHeapSize++; + D3D12_CPU_DESCRIPTOR_HANDLE cpuHandle = _samplerHeap->GetCPUDescriptorHandleForHeapStart(); + cpuHandle.ptr += static_cast(slot) * _samplerDescriptorIncrement; + d3d12Device->CreateSampler(&desc, cpuHandle); + D3D12_GPU_DESCRIPTOR_HANDLE gpuHandle = _samplerHeap->GetGPUDescriptorHandleForHeapStart(); + gpuHandle.ptr += static_cast(slot) * _samplerDescriptorIncrement; + return gpuHandle; +} + +D3D12GPU::~D3D12GPU() { + DEBUG_ASSERT(returnQueue == nullptr); + DEBUG_ASSERT(resources.empty()); + if (_frameFenceEvent != nullptr) { + CloseHandle(_frameFenceEvent); + _frameFenceEvent = nullptr; + } +} + +void D3D12GPU::initInfo() { + _info.backend = Backend::D3D12; + _info.version = "Direct3D 12"; + if (dxgiAdapter != nullptr) { + DXGI_ADAPTER_DESC1 desc = {}; + dxgiAdapter->GetDesc1(&desc); + std::wstring wRenderer(desc.Description); + int sizeNeeded = + WideCharToMultiByte(CP_UTF8, 0, wRenderer.data(), static_cast(wRenderer.size()), + nullptr, 0, nullptr, nullptr); + _info.renderer.resize(static_cast(sizeNeeded)); + WideCharToMultiByte(CP_UTF8, 0, wRenderer.data(), static_cast(wRenderer.size()), + _info.renderer.data(), sizeNeeded, nullptr, 
nullptr); + if (desc.VendorId == 0x10DE) { + _info.vendor = "NVIDIA"; + } else if (desc.VendorId == 0x1002) { + _info.vendor = "AMD"; + } else if (desc.VendorId == 0x8086) { + _info.vendor = "Intel"; + } else if (desc.VendorId == 0x1414) { + _info.vendor = "Microsoft"; + } else { + _info.vendor = "Unknown"; + } + } else { + _info.renderer = "Unknown D3D12 Device"; + _info.vendor = "Unknown"; + } +} + +void D3D12GPU::initFeatures() { + _features.semaphore = true; + _features.clampToBorder = true; + // D3D12 has no glTextureBarrier() equivalent: a resource cannot be in RENDER_TARGET and + // PIXEL_SHADER_RESOURCE state simultaneously, so the renderer cannot bind the current RTV + // as an SRV inside the same render pass. Mirror VulkanCaps and disable this feature so + // OpsCompositor::makeDstTextureInfo falls back to the copy-to-temp-texture path whenever an + // advanced blend mode (Lighten / Darken / etc.) needs to read the destination. + _features.textureBarrier = false; +} + +void D3D12GPU::initLimits() { + D3D12_FEATURE_DATA_D3D12_OPTIONS options = {}; + if (SUCCEEDED(d3d12Device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS, &options, + sizeof(options)))) { + // D3D12 resource binding tier determines sampler limits. 
+ switch (options.ResourceBindingTier) { + case D3D12_RESOURCE_BINDING_TIER_1: + _limits.maxSamplersPerShaderStage = 16; + break; + case D3D12_RESOURCE_BINDING_TIER_2: + case D3D12_RESOURCE_BINDING_TIER_3: + default: + _limits.maxSamplersPerShaderStage = 2048; + break; + } + } else { + _limits.maxSamplersPerShaderStage = 16; + } + _limits.maxTextureDimension2D = D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION; + _limits.maxUniformBufferBindingSize = D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 16; + _limits.minUniformBufferOffsetAlignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT; +} + +CommandQueue* D3D12GPU::queue() const { + return commandQueue.get(); +} + +const shaderc::Compiler* D3D12GPU::shaderCompiler() const { + return compiler.get(); +} + +bool D3D12GPU::isFormatRenderable(PixelFormat format) const { + auto dxgiFormat = PixelFormatToDXGIFormat(format); + if (dxgiFormat == DXGI_FORMAT_UNKNOWN) { + return false; + } + D3D12_FEATURE_DATA_FORMAT_SUPPORT formatSupport = {}; + formatSupport.Format = static_cast(dxgiFormat); + if (FAILED(d3d12Device->CheckFeatureSupport(D3D12_FEATURE_FORMAT_SUPPORT, &formatSupport, + sizeof(formatSupport)))) { + return false; + } + // Vulkan/Metal back-ends report any colour or depth-stencil attachment format as renderable. + // Match that language so callers using a backend-agnostic gate (e.g. createTexture's + // RENDER_ATTACHMENT pre-check) work uniformly when the format happens to be depth-stencil. 
+  constexpr UINT attachmentMask =
+      D3D12_FORMAT_SUPPORT1_RENDER_TARGET | D3D12_FORMAT_SUPPORT1_DEPTH_STENCIL;
+  return (formatSupport.Support1 & attachmentMask) != 0;
+}
+
+// Creates a buffer through D3D12Buffer::Make(). A zero size yields nullptr without calling Make.
+std::shared_ptr D3D12GPU::createBuffer(size_t size, uint32_t usage) {
+  if (size == 0) {
+    return nullptr;
+  }
+  return D3D12Buffer::Make(this, size, usage);
+}
+
+// Validates the descriptor and delegates to D3D12Texture::Make(). Returns nullptr, after logging
+// the reason, when a dimension is not positive, when RENDER_ATTACHMENT usage is requested for a
+// format that is not renderable, or when Make() itself fails.
+std::shared_ptr D3D12GPU::createTexture(const TextureDescriptor& descriptor) {
+  if (descriptor.width <= 0 || descriptor.height <= 0) {
+    LOGE("D3D12GPU::createTexture() invalid dimensions: %dx%d", descriptor.width,
+         descriptor.height);
+    return nullptr;
+  }
+  // NOTE(review): isFormatRenderable() is evaluated before the usage flag is tested, so the
+  // device query runs even for textures that do not request RENDER_ATTACHMENT. Testing the usage
+  // flag first would skip the query for those textures.
+  if (!isFormatRenderable(descriptor.format) &&
+      (descriptor.usage & TextureUsage::RENDER_ATTACHMENT)) {
+    LOGE("D3D12GPU::createTexture() format not renderable for render attachment");
+    return nullptr;
+  }
+  auto texture = D3D12Texture::Make(this, descriptor);
+  if (texture == nullptr) {
+    LOGE("D3D12GPU::createTexture() D3D12Texture::Make failed for %dx%d format=%d",
+         descriptor.width, descriptor.height, static_cast(descriptor.format));
+  }
+  return texture;
+}
+
+// Returns the sampler cached under the descriptor's packed key (see MakeSamplerKey), creating
+// and caching it on a miss. A cached nullptr from an earlier failed creation is returned as-is.
+std::shared_ptr D3D12GPU::createSampler(const SamplerDescriptor& descriptor) {
+  auto key = MakeSamplerKey(descriptor);
+  auto iter = samplerCache.find(key);
+  if (iter != samplerCache.end()) {
+    return iter->second;
+  }
+  auto sampler = D3D12Sampler::Make(this, descriptor);
+  // Cache the result even when Make returned nullptr. The total number of distinct
+  // SamplerDescriptor values is bounded by the AddressMode/FilterMode/MipmapMode enums
+  // (currently 4 x 4 x 2 x 2 x 3 = 192), so the sampler heap (2048 slots) should never run
+  // out, but memoising the failure protects the hot path: a repeated failing key would
+  // otherwise retry CreateSampler + log the error on every draw call that needs a sampler.
+  samplerCache[key] = sampler;
+  return sampler;
+}
+
+// Looks up a cached root signature by its shape key. Returns nullptr on a cache miss.
+ComPtr D3D12GPU::findRootSignature(const std::vector& shapeKey) {
+  auto iter = rootSignatureCache.find(shapeKey);
+  if (iter == rootSignatureCache.end()) {
+    return nullptr;
+  }
+  return iter->second;
+}
+
+// Stores a root signature under shapeKey. Null signatures are ignored. The entry is added with
+// emplace(); NOTE(review): for the standard map containers that keeps an entry already stored
+// under the same key — confirm the type of rootSignatureCache.
+void D3D12GPU::cacheRootSignature(std::vector shapeKey,
+                                  ComPtr rootSignature) {
+  if (rootSignature == nullptr) {
+    return;
+  }
+  rootSignatureCache.emplace(std::move(shapeKey), std::move(rootSignature));
+}
+
+// Packs the descriptor's enum fields into one 32-bit cache key: addressModeX starts at bit 0,
+// addressModeY at bit 3, minFilter at bit 6, magFilter at bit 8 and mipmapMode at bit 10.
+// NOTE(review): the fields only stay disjoint if address modes fit in 3 bits and filter modes in
+// 2 bits — confirm against the enum definitions.
+uint32_t D3D12GPU::MakeSamplerKey(const SamplerDescriptor& descriptor) {
+  uint32_t key = 0;
+  key |= static_cast(descriptor.addressModeX);
+  key |= static_cast(descriptor.addressModeY) << 3;
+  key |= static_cast(descriptor.minFilter) << 6;
+  key |= static_cast(descriptor.magFilter) << 8;
+  key |= static_cast(descriptor.mipmapMode) << 10;
+  return key;
+}
+
+// Returns a shader module for the descriptor, reusing a previously compiled module when one is
+// cached for the same stage and source hash. Failed compilations are not cached.
+std::shared_ptr D3D12GPU::createShaderModule(
+    const ShaderModuleDescriptor& descriptor) {
+  // Cache compiled DXBC blobs by (stage, hash(GLSL source)). The upper layer's program cache
+  // works at the (vertex+fragment) tuple level, so two distinct programs sharing one of the two
+  // sources still hit our backend twice. Caching here lets the second hit skip the full
+  // GLSL -> SPIR-V -> HLSL -> DXBC chain.
+  // NOTE(review): the key stores only the hash of the source, not the source text. Two different
+  // sources for the same stage whose hashes collide would resolve to the same cached module —
+  // confirm whether ShaderCacheKey compares anything beyond (stage, sourceHash).
+  ShaderCacheKey key = {};
+  key.stage = static_cast(descriptor.stage);
+  key.sourceHash = std::hash{}(descriptor.code);
+  if (auto it = shaderModuleCache.find(key); it != shaderModuleCache.end()) {
+    return it->second;
+  }
+  auto module = D3D12ShaderModule::Make(this, descriptor);
+  // Only successful compilations are cached; a failing source is recompiled on every request.
+  if (module != nullptr) {
+    shaderModuleCache.emplace(key, module);
+  }
+  return module;
+}
+
+// Creates a render pipeline; a thin forwarder to D3D12RenderPipeline::Make().
+std::shared_ptr D3D12GPU::createRenderPipeline(
+    const RenderPipelineDescriptor& descriptor) {
+  return D3D12RenderPipeline::Make(this, descriptor);
+}
+
+// Frees resources that have already been returned to the return queue, then creates a new
+// command encoder.
+std::shared_ptr D3D12GPU::createCommandEncoder() {
+  processUnreferencedResources();
+  return D3D12CommandEncoder::Make(this);
+}
+
+// Returns the mipmap generator, creating it on first use. A generator that reports !isReady()
+// is discarded and nullptr is returned; because the failure is not recorded, creation is
+// attempted again on the next call.
+D3D12MipmapGenerator* D3D12GPU::mipmapGenerator() {
+  if (_mipmapGenerator == nullptr) {
+    _mipmapGenerator = std::unique_ptr(new D3D12MipmapGenerator(this));
+    if (!_mipmapGenerator->isReady()) {
+      _mipmapGenerator = nullptr;
+    }
+  }
+  return _mipmapGenerator.get();
+}
+
+// Returns the smallest sample count in [requestedCount, D3D12_MAX_MULTISAMPLE_SAMPLE_COUNT] for
+// which the device reports at least one quality level for the given format. Returns 1 when
+// requestedCount <= 1, when the pixel format has no DXGI mapping, or when no count in that range
+// is supported. Every integer count in the range is probed, not only powers of two.
+int D3D12GPU::getSampleCount(int requestedCount, PixelFormat pixelFormat) const {
+  if (requestedCount <= 1) {
+    return 1;
+  }
+  auto dxgiFormat = PixelFormatToDXGIFormat(pixelFormat);
+  if (dxgiFormat == DXGI_FORMAT_UNKNOWN) {
+    return 1;
+  }
+  for (int sampleCount = requestedCount; sampleCount <= D3D12_MAX_MULTISAMPLE_SAMPLE_COUNT;
+       sampleCount++) {
+    D3D12_FEATURE_DATA_MULTISAMPLE_QUALITY_LEVELS qualityLevels = {};
+    qualityLevels.Format = static_cast(dxgiFormat);
+    qualityLevels.SampleCount = static_cast(sampleCount);
+    if (SUCCEEDED(d3d12Device->CheckFeatureSupport(D3D12_FEATURE_MULTISAMPLE_QUALITY_LEVELS,
+                                                   &qualityLevels, sizeof(qualityLevels))) &&
+        qualityLevels.NumQualityLevels > 0) {
+      return sampleCount;
+    }
+  }
+  return 1;
+}
+
+// Always returns an empty list; both parameters are ignored.
+std::vector> D3D12GPU::importHardwareTextures(HardwareBufferRef,
+                                              uint32_t) {
+  // D3D12 hardware buffer import is not supported yet.
+  return {};
+}
+
+// Wraps an externally created D3D12 resource in a texture object. Returns nullptr when the
+// BackendTexture is not a D3D12 texture, carries no resource pointer, or QueryInterface() does
+// not produce the requested interface.
+std::shared_ptr D3D12GPU::importBackendTexture(const BackendTexture& backendTexture,
+                                               uint32_t usage, bool adopted) {
+  if (backendTexture.backend() != Backend::D3D12) {
+    return nullptr;
+  }
+  D3D12TextureInfo d3d12Info = {};
+  if (!backendTexture.getD3D12TextureInfo(&d3d12Info) || d3d12Info.resource == nullptr) {
+    return nullptr;
+  }
+  auto d3d12Resource =
+      const_cast(static_cast(d3d12Info.resource));
+  ComPtr resource = nullptr;
+  // The HRESULT is not inspected; a failed query is detected through the null check below.
+  d3d12Resource->QueryInterface(IID_PPV_ARGS(&resource));
+  if (resource == nullptr) {
+    return nullptr;
+  }
+  return D3D12Texture::MakeFrom(this, std::move(resource), d3d12Info.format, usage, adopted);
+}
+
+// Wraps an externally created D3D12 resource as a render-attachment texture (adopted = false).
+// Renderability is checked against backendRenderTarget.format(), while the texture itself is
+// created with d3d12Info.format. Returns nullptr for non-D3D12 targets, a missing resource, a
+// non-renderable format, or a failed QueryInterface().
+std::shared_ptr D3D12GPU::importBackendRenderTarget(
+    const BackendRenderTarget& backendRenderTarget) {
+  if (backendRenderTarget.backend() != Backend::D3D12) {
+    return nullptr;
+  }
+  D3D12TextureInfo d3d12Info = {};
+  if (!backendRenderTarget.getD3D12TextureInfo(&d3d12Info) || d3d12Info.resource == nullptr) {
+    return nullptr;
+  }
+  auto format = backendRenderTarget.format();
+  if (!isFormatRenderable(format)) {
+    return nullptr;
+  }
+  auto d3d12Resource =
+      const_cast(static_cast(d3d12Info.resource));
+  ComPtr resource = nullptr;
+  // The HRESULT is not inspected; a failed query is detected through the null check below.
+  d3d12Resource->QueryInterface(IID_PPV_ARGS(&resource));
+  if (resource == nullptr) {
+    return nullptr;
+  }
+  return D3D12Texture::MakeFrom(this, std::move(resource), d3d12Info.format,
+                                TextureUsage::RENDER_ATTACHMENT, false);
+}
+
+// Wraps an external fence plus target value as a semaphore. Returns nullptr when the
+// BackendSemaphore is not a D3D12 one, carries no fence, or QueryInterface() fails.
+std::shared_ptr D3D12GPU::importBackendSemaphore(const BackendSemaphore& semaphore) {
+  if (semaphore.backend() != Backend::D3D12) {
+    return nullptr;
+  }
+  D3D12SyncInfo info = {};
+  if (!semaphore.getD3D12Sync(&info) || info.fence == nullptr) {
+    return nullptr;
+  }
+  auto rawFence = const_cast(static_cast(info.fence));
+  ComPtr fence = nullptr;
+  // The HRESULT is not inspected; a failed query is detected through the null check below.
+  rawFence->QueryInterface(IID_PPV_ARGS(&fence));
+  if (fence == nullptr) {
+    return nullptr;
+  }
+  return D3D12Semaphore::MakeFrom(this, std::move(fence), info.value);
+}
+
+BackendSemaphore D3D12GPU::stealBackendSemaphore(std::shared_ptr semaphore) {
+  // Hand out the backend handle only when the semaphore has at most two owners. Presumably those
+  // are this by-value parameter and the caller's own copy, so a higher count means someone else
+  // still uses the semaphore — TODO confirm against the other backends.
+  if (semaphore == nullptr || semaphore.use_count() > 2) {
+    return {};
+  }
+  return semaphore->getBackendSemaphore();
+}
+
+// Registers a newly created resource: appends it to `resources`, stores its position in the
+// resource so processUnreferencedResources() can erase it later, and returns the shared_ptr
+// produced by returnQueue->makeShared().
+std::shared_ptr D3D12GPU::addResource(D3D12Resource* resource) {
+  DEBUG_ASSERT(resource != nullptr);
+  resources.push_back(resource);
+  resource->cachedPosition = --resources.end();
+  return std::static_pointer_cast(returnQueue->makeShared(resource));
+}
+
+// Drains the return queue: every dequeued resource is removed from `resources`, released through
+// onRelease() and deleted. Requires returnQueue to be non-null (asserted in debug builds).
+void D3D12GPU::processUnreferencedResources() {
+  DEBUG_ASSERT(returnQueue != nullptr);
+  while (auto resource = static_cast(returnQueue->dequeue())) {
+    resources.erase(resource->cachedPosition);
+    resource->onRelease(this);
+    delete resource;
+  }
+}
+
+// Map a D3D12_AUTO_BREADCRUMB_OP enum value to a short readable string. Not exhaustive — only the
+// ops the TGFX backend actually emits are listed; everything else falls through to "".
+static const char* AutoBreadcrumbOpName(D3D12_AUTO_BREADCRUMB_OP op) {
+  switch (op) {
+    case D3D12_AUTO_BREADCRUMB_OP_SETMARKER:
+      return "SetMarker";
+    case D3D12_AUTO_BREADCRUMB_OP_BEGINEVENT:
+      return "BeginEvent";
+    case D3D12_AUTO_BREADCRUMB_OP_ENDEVENT:
+      return "EndEvent";
+    case D3D12_AUTO_BREADCRUMB_OP_DRAWINSTANCED:
+      return "DrawInstanced";
+    case D3D12_AUTO_BREADCRUMB_OP_DRAWINDEXEDINSTANCED:
+      return "DrawIndexedInstanced";
+    case D3D12_AUTO_BREADCRUMB_OP_EXECUTEINDIRECT:
+      return "ExecuteIndirect";
+    case D3D12_AUTO_BREADCRUMB_OP_DISPATCH:
+      return "Dispatch";
+    case D3D12_AUTO_BREADCRUMB_OP_COPYBUFFERREGION:
+      return "CopyBufferRegion";
+    case D3D12_AUTO_BREADCRUMB_OP_COPYTEXTUREREGION:
+      return "CopyTextureRegion";
+    case D3D12_AUTO_BREADCRUMB_OP_COPYRESOURCE:
+      return "CopyResource";
+    case D3D12_AUTO_BREADCRUMB_OP_RESOLVESUBRESOURCE:
+      return "ResolveSubresource";
+    case D3D12_AUTO_BREADCRUMB_OP_CLEARRENDERTARGETVIEW:
+      return "ClearRenderTargetView";
+    case D3D12_AUTO_BREADCRUMB_OP_CLEARDEPTHSTENCILVIEW:
+      return "ClearDepthStencilView";
+    case D3D12_AUTO_BREADCRUMB_OP_CLEARUNORDEREDACCESSVIEW:
+      return "ClearUAV";
+    case D3D12_AUTO_BREADCRUMB_OP_RESOURCEBARRIER:
+      return "ResourceBarrier";
+    case D3D12_AUTO_BREADCRUMB_OP_PRESENT:
+      return "Present";
+    default:
+      return "";
+  }
+}
+
+// Marks the context as lost, resets the descriptor-ring and upload-heap bookkeeping, and dumps
+// DRED diagnostics tagged with `tag`. Only the first call has any effect; later calls return
+// immediately because contextLost is already set.
+void D3D12GPU::markContextLost(const char* tag) {
+  if (contextLost) {
+    return;
+  }
+  contextLost = true;
+  // Once the GPU is gone every fence value associated with previously-committed inflight ring
+  // ranges will stay unsignalled forever, so retire() can never reclaim them and the per-ring
+  // outstandingSlots / outstandingBytes counters would saturate. Reset the bookkeeping in
+  // place — the underlying ID3D12DescriptorHeap / UPLOAD ID3D12Resource stay allocated so any
+  // diagnostics path that keeps the GPU instance around (DRED dump etc.) can still query the
+  // device — just stop the rings from rejecting future allocations forever.
+  _srvRing.resetForContextLost();
+  _rtvRing.resetForContextLost();
+  _dsvRing.resetForContextLost();
+  _uploadHeap.resetForContextLost();
+  dumpDeviceRemovedExtendedData(tag);
+}
+
+// Logs the device-removed reason plus the DRED auto-breadcrumb and page-fault data, prefixing
+// every message with `tag`. Does nothing when the device pointer is null or when
+// GetDeviceRemovedReason() reports success.
+void D3D12GPU::dumpDeviceRemovedExtendedData(const char* tag) {
+  if (d3d12Device == nullptr) {
+    return;
+  }
+  auto reason = d3d12Device->GetDeviceRemovedReason();
+  if (SUCCEEDED(reason)) {
+    return;
+  }
+  // Some drivers populate DRED breadcrumb buffers asynchronously after the device transitions to
+  // a removed state. Sleep briefly so the breadcrumb / page-fault output is fully formed before
+  // we query it. Diagnostic-only path; the cost is the fixed 50 ms sleep below, paid only when a
+  // device removal is being reported.
+  Sleep(50);
+  LOGE("[DRED %s] device removed, reason=0x%08X", tag, static_cast(reason));
+
+  ComPtr dred = nullptr;
+  if (FAILED(d3d12Device.As(&dred))) {
+    LOGE("[DRED %s] DRED interface unavailable on this device.", tag);
+    return;
+  }
+
+  // Part 1: auto-breadcrumbs. Walk the linked list of breadcrumb nodes and log each one.
+  D3D12_DRED_AUTO_BREADCRUMBS_OUTPUT breadcrumbsOutput = {};
+  if (SUCCEEDED(dred->GetAutoBreadcrumbsOutput(&breadcrumbsOutput))) {
+    auto* node = breadcrumbsOutput.pHeadAutoBreadcrumbNode;
+    if (node == nullptr) {
+      LOGE(
+          "[DRED %s] auto-breadcrumb list is empty — either no command list executed before "
+          "the fault, or the driver did not record breadcrumbs (verify "
+          "SetAutoBreadcrumbsEnablement(FORCED_ON) was called before D3D12CreateDevice).",
+          tag);
+    }
+    int nodeIndex = 0;
+    while (node != nullptr) {
+      // Debug names and the last-breadcrumb pointer may be null; substitute defaults.
+      const char* listName =
+          node->pCommandListDebugNameA ? node->pCommandListDebugNameA : "";
+      const char* queueName =
+          node->pCommandQueueDebugNameA ? node->pCommandQueueDebugNameA : "";
+      auto last = node->pLastBreadcrumbValue ? *node->pLastBreadcrumbValue : 0u;
+      auto count = node->BreadcrumbCount;
+      LOGE("[DRED %s] node %d: list='%s' queue='%s' completed=%u/%u", tag, nodeIndex, listName,
+           queueName, last, count);
+      // Print a small window around the last completed op so we can see the failing call. The
+      // window covers up to 4 ops before `last` and up to 8 ops from `last` onwards, clamped to
+      // [0, count); the op at index `last` is flagged with ">>>".
+      uint32_t windowStart = last >= 4 ? last - 4 : 0;
+      uint32_t windowEnd = (last + 8 < count) ? (last + 8) : count;
+      for (uint32_t i = windowStart; i < windowEnd; i++) {
+        const char* marker = (i == last) ? " >>>" : " ";
+        LOGE("[DRED %s] %s op[%u] = %s", tag, marker, i,
+             AutoBreadcrumbOpName(node->pCommandHistory[i]));
+      }
+      node = node->pNext;
+      nodeIndex++;
+    }
+  } else {
+    LOGE("[DRED %s] GetAutoBreadcrumbsOutput failed.", tag);
+  }
+
+  // Part 2: page-fault data. Only logged when a fault address was recorded; a failed query is
+  // silently ignored.
+  D3D12_DRED_PAGE_FAULT_OUTPUT pageFaultOutput = {};
+  if (SUCCEEDED(dred->GetPageFaultAllocationOutput(&pageFaultOutput))) {
+    if (pageFaultOutput.PageFaultVA != 0) {
+      LOGE("[DRED %s] page fault VA = 0x%llx", tag,
+           static_cast(pageFaultOutput.PageFaultVA));
+      auto* existing = pageFaultOutput.pHeadExistingAllocationNode;
+      while (existing != nullptr) {
+        LOGE("[DRED %s] existing alloc near fault: '%s' type=%d", tag,
+             existing->ObjectNameA ? existing->ObjectNameA : "",
+             static_cast(existing->AllocationType));
+        existing = existing->pNext;
+      }
+      auto* recent = pageFaultOutput.pHeadRecentFreedAllocationNode;
+      while (recent != nullptr) {
+        LOGE("[DRED %s] recently freed: '%s' type=%d", tag,
+             recent->ObjectNameA ? recent->ObjectNameA : "",
+             static_cast(recent->AllocationType));
+        recent = recent->pNext;
+      }
+    }
+  }
+}
+
+// Shuts the backend down: waits for (or abandons) inflight work, clears every cache and pool,
+// and drops the resource list and return queue. onRelease() is invoked on the still-tracked
+// resources only when releaseGPU is true.
+void D3D12GPU::releaseAll(bool releaseGPU) {
+  // Shutdown ordering must wait for all GPU work to complete before destroying anything that
+  // the GPU may still be reading. waitAllInflightSubmissions() also handles the device-removed
+  // case internally (it short-circuits to a synchronous reclaim instead of waiting on a fence
+  // that will never advance), so it is safe to call regardless of the releaseGPU flag and
+  // mirrors what VulkanGPU::releaseAll does. The earlier "releaseGPU == false skips the wait"
+  // path could leave inflight FrameSessions holding command lists / allocators the GPU was
+  // still reading, tripping OBJECT_DELETED_WHILE_STILL_IN_USE.
+  waitAllInflightSubmissions();
+  samplerCache.clear();
+  // Drop cached shader-module shared_ptrs so D3D12Resource cleanup can run. Without this the
+  // shaderModuleCache would hold strong refs through the resources-list walk below and prevent
+  // the modules from ever being released.
+  shaderModuleCache.clear();
+  rootSignatureCache.clear();
+  // Drop pooled command allocator/list pairs. Done after wait so the GPU is no longer using
+  // them; doing it before would still be safe (ComPtrs hold the only references) but ordering
+  // matches the rest of the shutdown path.
+  _commandListPool.clear();
+  // Drop the UPLOAD ring's underlying resource similarly. Inflight allocations have either been
+  // drained by the wait above or are tracked against fences that will never advance, in which
+  // case dropping the resource is the only safe action.
+  _uploadHeap.clear();
+  // Drop the cached mipmap generator's root signature + PSO before tearing the device down.
+  // Resources retained by inflight submissions have already been released above.
+  _mipmapGenerator = nullptr;
+  if (releaseGPU) {
+    for (auto& resource : resources) {
+      resource->onRelease(this);
+    }
+  }
+  resources.clear();
+  returnQueue = nullptr;
+}
+
+// Cleans up a session that will not be tracked as an inflight submission: restores the
+// snapshotted texture states and lets the by-value `session` release everything else when it
+// goes out of scope.
+void D3D12GPU::reclaimAbandonedSession(D3D12FrameSession session) {
+  // Letting the local go out of scope releases the command list and allocator via ComPtr, and
+  // drops every shared_ptr in retainedResources / retainedDescriptorHeaps. RTV / DSV descriptor
+  // slots used by this session live in D3D12GPU::_rtvRing / _dsvRing and will be reclaimed by
+  // their fence-based retire path; nothing is held here for them.
+  // Resources whose refcount reaches zero enter the ReturnQueue and will be destroyed by the
+  // next processUnreferencedResources() call.
+  //
+  // The work that *does* need active rollback here is CPU-side D3D12 resource state tracking.
+  // Render-pass / copy / generate-mipmaps codepaths bumped D3D12Texture::_currentState while
+  // recording transition commands. If those commands never make it to the GPU (this abandon
+  // path), the GPU is still in the original state. Walking session.initialTextureStates and
+  // restoring the snapshotted state keeps CPU and GPU views aligned so the next encoder does
+  // not emit a barrier whose StateBefore disagrees with reality (which the D3D12 debug layer
+  // flags as "Before state mismatch").
+  for (auto& [texture, originalState] : session.initialTextureStates) {
+    if (texture != nullptr) {
+      texture->setCurrentState(originalState);
+    }
+  }
+  // NOTE(review): redundant — `session` is already used by the loop above.
+  (void)session;
+}
+
+// Submits one request to the command queue and, on success, moves its session and uploads into
+// inflightSubmissions tagged with a new frame-fence value. On every failure path the session is
+// handed to reclaimAbandonedSession() and nothing is added to the inflight queue.
+void D3D12GPU::executeSubmission(SubmitRequest request) {
+  // If the GPU has already reported a fatal error, drop the submission without waiting on the
+  // fence — DXGI_ERROR_DEVICE_REMOVED is sticky and the fence will never advance. Local cleanup
+  // still runs because session/uploads destructors release their D3D12 references.
+  if (contextLost) {
+    reclaimAbandonedSession(std::move(request.session));
+    request.uploads.clear();
+    return;
+  }
+
+  // Step 1: Non-blocking reclaim of any submissions whose fence values have already signalled.
+  pollCompletedSubmissions();
+
+  // Step 2: Backpressure — block until the oldest inflight submission completes if we have
+  // already filled the in-flight pipeline. A bounded timeout protects against TDR scenarios
+  // where the GPU never advances; on timeout we mark the context lost and stop tracking it.
+  if (inflightSubmissions.size() >= MAX_FRAMES_IN_FLIGHT) {
+    auto& oldest = inflightSubmissions.front();
+    if (_frameFence->GetCompletedValue() < oldest.fenceValue) {
+      // NOTE(review): the result of SetEventOnCompletion() is not checked. Any wait result other
+      // than WAIT_OBJECT_0 (a timeout or a failed wait) takes the "timed out" path below.
+      _frameFence->SetEventOnCompletion(oldest.fenceValue, _frameFenceEvent);
+      auto waitResult = WaitForSingleObject(_frameFenceEvent, 5000);
+      if (waitResult != WAIT_OBJECT_0) {
+        LOGE(
+            "D3D12GPU::executeSubmission: backpressure wait timed out (target=%llu), "
+            "marking context lost.",
+            static_cast(oldest.fenceValue));
+        markContextLost("executeSubmission backpressure timeout");
+        reclaimAbandonedSession(std::move(request.session));
+        request.uploads.clear();
+        while (!inflightSubmissions.empty()) {
+          reclaimSubmission(inflightSubmissions.front());
+          inflightSubmissions.pop_front();
+        }
+        return;
+      }
+    }
+    pollCompletedSubmissions();
+    // Re-check device removal after the wait — if the GPU TDR'd while we were blocked, the
+    // event would have been signalled but no work has actually completed. Detect this, reclaim
+    // everything that is still tracked, and return without submitting.
+    if (FAILED(d3d12Device->GetDeviceRemovedReason())) {
+      markContextLost("executeSubmission backpressure post-check");
+      reclaimAbandonedSession(std::move(request.session));
+      request.uploads.clear();
+      // Drop every still-tracked submission since the GPU will never signal again.
+      while (!inflightSubmissions.empty()) {
+        reclaimSubmission(inflightSubmissions.front());
+        inflightSubmissions.pop_front();
+      }
+      return;
+    }
+  }
+
+  auto cmdQueue = commandQueue->d3d12CommandQueue();
+  if (cmdQueue == nullptr) {
+    LOGE("D3D12GPU::executeSubmission: command queue is null, abandoning session.");
+    // request.uploads is not cleared explicitly here; it is destroyed together with the by-value
+    // `request` when this function returns.
+    reclaimAbandonedSession(std::move(request.session));
+    return;
+  }
+
+  // Step 3: Optional cross-queue wait. D3D12Semaphore stores its target value as a host-readable
+  // member; the GPU side simply re-uses ID3D12CommandQueue::Wait().
+  if (request.waitSemaphore != nullptr) {
+    auto fence = request.waitSemaphore->d3d12Fence();
+    if (fence != nullptr) {
+      cmdQueue->Wait(fence, request.waitSemaphore->signalValue());
+    }
+  }
+
+  // Step 4: Execute auxiliary command lists (e.g. texture upload lists recorded by the queue
+  // outside the main render command list) followed by the main render command list. Order
+  // matters: uploads must complete before the render list can sample the destination textures.
+  std::vector lists;
+  lists.reserve(request.session.auxCommandLists.size() + 1);
+  for (auto& aux : request.session.auxCommandLists) {
+    if (aux != nullptr) {
+      lists.push_back(aux.Get());
+    }
+  }
+  if (request.session.commandList != nullptr) {
+    lists.push_back(request.session.commandList.Get());
+  }
+  if (!lists.empty()) {
+    cmdQueue->ExecuteCommandLists(static_cast(lists.size()), lists.data());
+  }
+#ifdef TGFX_D3D12_DEBUG_LAYER
+  drainDebugMessages("executeSubmission");
+#endif
+
+  // Step 5: Optional signal of an external semaphore. The semaphore exposes a fixed fence value
+  // assigned at creation time — we just signal that value on this queue. After signalling, bump
+  // the semaphore's internal value so a subsequent insertSemaphore() call would see a fresh
+  // generation if the same fence object is re-used.
+  if (request.signalSemaphore != nullptr) {
+    auto fence = request.signalSemaphore->d3d12Fence();
+    if (fence != nullptr) {
+      auto target = request.signalSemaphore->nextSignalValue();
+      cmdQueue->Signal(fence, target);
+      request.signalSemaphore->commitSignalValue();
+    }
+    // Keep the semaphore alive until the GPU has consumed the Signal command. Without this
+    // retention the application could drop its last reference and the underlying ID3D12Fence
+    // would be released before the GPU is done with it.
+    request.session.retainedResources.push_back(std::move(request.signalSemaphore));
+  }
+  // The wait semaphore is retained by the session in the same way.
+  if (request.waitSemaphore != nullptr) {
+    request.session.retainedResources.push_back(std::move(request.waitSemaphore));
+  }
+
+  // Step 6: Signal the GPU's internal frame fence so we can later detect completion of this
+  // submission and reclaim its session. If the Signal call itself fails (it can return
+  // DXGI_ERROR_DEVICE_REMOVED if the GPU TDR'd while the previous ExecuteCommandLists was
+  // executing), trip the contextLost flag so subsequent calls don't block on a fence that will
+  // never advance.
+  ++_lastSignalledFenceValue;
+  // Tag every descriptor-ring slot allocated during this submission (the CBV/SRV/UAV ring as
+  // well as the RTV and DSV rings) with the about-to-be-signalled fence value. Once that fence
+  // value completes, the slots become reclaimable in retire(); see pollCompletedSubmissions().
+  // The Sampler heap is append-only (slots persist for the GPU's lifetime) and does not need
+  // fence tracking. The upload ring's sub-allocations are tracked the same way so the staging
+  // bytes outlive their CopyTextureRegion commands.
+  _srvRing.commit(_lastSignalledFenceValue);
+  _rtvRing.commit(_lastSignalledFenceValue);
+  _dsvRing.commit(_lastSignalledFenceValue);
+  _uploadHeap.commit(_lastSignalledFenceValue);
+  auto signalHr = cmdQueue->Signal(_frameFence.Get(), _lastSignalledFenceValue);
+  if (FAILED(signalHr) || FAILED(d3d12Device->GetDeviceRemovedReason())) {
+    LOGE(
+        "D3D12GPU::executeSubmission: Signal failed (HRESULT=0x%08X) or device removed; "
+        "marking context lost.",
+        static_cast(signalHr));
+    markContextLost("executeSubmission Signal");
+    reclaimAbandonedSession(std::move(request.session));
+    request.uploads.clear();
+    while (!inflightSubmissions.empty()) {
+      reclaimSubmission(inflightSubmissions.front());
+      inflightSubmissions.pop_front();
+    }
+    return;
+  }
+
+  InflightSubmission inflight = {};
+  inflight.fenceValue = _lastSignalledFenceValue;
+  // Capture the submission timestamp so pollCompletedSubmissions() can later publish it as the
+  // "GPU completed up to this point" marker that ResourceCache uses to gate scratch reuse.
+  inflight.frameTime = request.frameTime;
+  inflight.session = std::move(request.session);
+  inflight.uploads = std::move(request.uploads);
+  inflightSubmissions.push_back(std::move(inflight));
+}
+
+// Waits, for at most 5 seconds, until the newest inflight submission's fence value completes and
+// then reclaims the completed submissions. If the context is already lost, the device reports
+// removal, or the wait does not succeed, every tracked submission is reclaimed immediately
+// without waiting any further. No-op when the frame fence does not exist.
+void D3D12GPU::waitAllInflightSubmissions() {
+  if (_frameFence == nullptr) {
+    return;
+  }
+  if (contextLost || FAILED(d3d12Device->GetDeviceRemovedReason())) {
+    // Device is gone — fence will never advance again. Drop everything synchronously.
+    markContextLost("waitAllInflightSubmissions entry");
+    while (!inflightSubmissions.empty()) {
+      reclaimSubmission(inflightSubmissions.front());
+      inflightSubmissions.pop_front();
+    }
+    processUnreferencedResources();
+    return;
+  }
+  if (!inflightSubmissions.empty()) {
+    auto& last = inflightSubmissions.back();
+    if (_frameFence->GetCompletedValue() < last.fenceValue) {
+      // Use a finite timeout instead of INFINITE so we never hang the application even if some
+      // earlier submission had a corrupted command list that prevents the GPU from advancing.
+      // 5 seconds is well past any sensible draw frame budget; if it expires we mark the
+      // context lost, reclaim every tracked submission synchronously and return.
+      _frameFence->SetEventOnCompletion(last.fenceValue, _frameFenceEvent);
+      auto waitResult = WaitForSingleObject(_frameFenceEvent, 5000);
+      if (waitResult != WAIT_OBJECT_0) {
+        LOGE(
+            "D3D12GPU::waitAllInflightSubmissions: fence wait timed out (target=%llu), "
+            "marking context lost.",
+            static_cast(last.fenceValue));
+        markContextLost("waitAllInflightSubmissions timeout");
+        while (!inflightSubmissions.empty()) {
+          reclaimSubmission(inflightSubmissions.front());
+          inflightSubmissions.pop_front();
+        }
+        processUnreferencedResources();
+        return;
+      }
+    }
+  }
+  pollCompletedSubmissions();
+}
+
+// Returns the frame fence's completed value, or 0 when the fence does not exist.
+uint64_t D3D12GPU::completedFenceValue() const {
+  return _frameFence != nullptr ? _frameFence->GetCompletedValue() : 0;
+}
+
+// Non-blocking reclaim: pops, in submission order, every inflight submission whose fence value
+// is not greater than the fence's completed value, then retires ring / upload-heap ranges up to
+// that value and frees resources waiting in the return queue. No-op without a frame fence.
+void D3D12GPU::pollCompletedSubmissions() {
+  if (_frameFence == nullptr) {
+    return;
+  }
+  auto completed = _frameFence->GetCompletedValue();
+  while (!inflightSubmissions.empty() && inflightSubmissions.front().fenceValue <= completed) {
+    auto& front = inflightSubmissions.front();
+    // Publish the just-completed submission's submit-time stamp so ResourceCache can decide
+    // which scratch resources the GPU has finished reading. Without this update, the default
+    // CommandQueue::completedFrameTime() returns the *current* frame time, telling the cache
+    // every resource is reusable immediately — that lets a second flush() steal a vertex
+    // buffer the first flush()'s GPU work is still reading (see RecordingTest race).
+    auto ticks = front.frameTime.time_since_epoch().count();
+    _lastFenceSignalTime.store(ticks, std::memory_order_release);
+    reclaimSubmission(front);
+    inflightSubmissions.pop_front();
+  }
+  // Free descriptor slots (CBV/SRV/UAV, RTV and DSV rings) whose owning submissions have
+  // signalled. Sampler slots persist for the GPU's lifetime so they need no per-fence
+  // retirement. The upload ring sheds reclaimable byte ranges on the same schedule.
+  _srvRing.retire(completed);
+  _rtvRing.retire(completed);
+  _dsvRing.retire(completed);
+  _uploadHeap.retire(completed);
+  // Releasing retained shared_ptrs may have moved D3D12Resource instances into the return queue;
+  // free them now so the caller sees up-to-date memory accounting.
+  processUnreferencedResources();
+}
+
+// Rebuilds a steady_clock time_point from the tick count that pollCompletedSubmissions() last
+// stored in _lastFenceSignalTime.
+std::chrono::steady_clock::time_point D3D12GPU::lastFenceSignalTime() const {
+  auto ticks = _lastFenceSignalTime.load(std::memory_order_acquire);
+  return std::chrono::steady_clock::time_point(std::chrono::steady_clock::duration(ticks));
+}
+
+// Returns the submission's command allocator/list pairs to the pool and resets its session and
+// upload list.
+void D3D12GPU::reclaimSubmission(InflightSubmission& submission) {
+  // The GPU has signalled the fence value associated with this submission, so every command
+  // allocator/list pair it referenced is safe to reuse. Return them to the pool before tearing
+  // down the rest of the session — once the FrameSession destructor runs, the ComPtrs are gone.
+  // NOTE(review): the context-lost paths in executeSubmission() and
+  // waitAllInflightSubmissions() also call this for submissions whose fence never signalled.
+  if (submission.session.commandAllocator != nullptr && submission.session.commandList != nullptr) {
+    D3D12CommandListPool::Entry entry = {};
+    entry.allocator = std::move(submission.session.commandAllocator);
+    entry.commandList = std::move(submission.session.commandList);
+    _commandListPool.release(std::move(entry));
+  }
+  // Auxiliary command lists (e.g. transient upload lists recorded by D3D12CommandQueue::submit)
+  // are paired one-to-one with their auxAllocators by index; recycle them together so the pool
+  // sees consistent (allocator, list) pairs. std::min() bounds the walk when the two vectors
+  // differ in length; entries without a partner, or with a null member, are not pooled and are
+  // dropped by the session reset below.
+  size_t auxCount =
+      std::min(submission.session.auxAllocators.size(), submission.session.auxCommandLists.size());
+  for (size_t i = 0; i < auxCount; i++) {
+    if (submission.session.auxAllocators[i] != nullptr &&
+        submission.session.auxCommandLists[i] != nullptr) {
+      D3D12CommandListPool::Entry entry = {};
+      entry.allocator = std::move(submission.session.auxAllocators[i]);
+      entry.commandList = std::move(submission.session.auxCommandLists[i]);
+      _commandListPool.release(std::move(entry));
+    }
+  }
+  // The remaining ComPtr / shared_ptr destructors in FrameSession handle descriptor heaps,
+  // retained resources, and staging UPLOAD buffers.
+  submission.session = D3D12FrameSession{};
+  submission.uploads.clear();
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12GPU.h b/src/gpu/d3d12/D3D12GPU.h
new file mode 100644
index 000000000..9fdf04136
--- /dev/null
+++ b/src/gpu/d3d12/D3D12GPU.h
@@ -0,0 +1,444 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tencent is pleased to support the open source community by making tgfx available.
+//
+// Copyright (C) 2026 Tencent. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License.
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "D3D12CommandListPool.h" +#include "D3D12Defines.h" +#include "D3D12DescriptorRing.h" +#include "D3D12FrameSession.h" +#include "D3D12UploadHeap.h" +#include "D3D12Util.h" +#include "core/utils/ReturnQueue.h" +#include "tgfx/gpu/GPU.h" + +namespace shaderc { +class Compiler; +}; + +namespace tgfx { + +class D3D12CommandQueue; +class D3D12MipmapGenerator; +class D3D12Resource; +class D3D12Semaphore; +class D3D12ShaderModule; +class D3D12Texture; + +/** + * D3D12 GPU implementation. 
+ */ +class D3D12GPU : public GPU { + public: + static std::unique_ptr Make(ComPtr device); + + ~D3D12GPU(); + + ID3D12Device* device() const { + return d3d12Device.Get(); + } + + IDXGIAdapter1* adapter() const { + return dxgiAdapter.Get(); + } + + const GPUInfo* info() const override { + return &_info; + } + + const GPUFeatures* features() const override { + return &_features; + } + + const GPULimits* limits() const override { + return &_limits; + } + + CommandQueue* queue() const override; + + const shaderc::Compiler* shaderCompiler() const; + + unsigned getDXGIFormat(PixelFormat format) const { + return PixelFormatToDXGIFormat(format); + } + + bool isFormatRenderable(PixelFormat format) const override; + + std::shared_ptr createBuffer(size_t size, uint32_t usage) override; + + std::shared_ptr createTexture(const TextureDescriptor& descriptor) override; + + std::shared_ptr createSampler(const SamplerDescriptor& descriptor) override; + + std::shared_ptr createShaderModule( + const ShaderModuleDescriptor& descriptor) override; + + std::shared_ptr createRenderPipeline( + const RenderPipelineDescriptor& descriptor) override; + + std::shared_ptr createCommandEncoder() override; + + int getSampleCount(int requestedCount, PixelFormat pixelFormat) const override; + + std::vector> importHardwareTextures(HardwareBufferRef hardwareBuffer, + uint32_t usage) override; + + std::shared_ptr importBackendTexture(const BackendTexture& backendTexture, + uint32_t usage, bool adopted = false) override; + + std::shared_ptr importBackendRenderTarget( + const BackendRenderTarget& backendRenderTarget) override; + + std::shared_ptr importBackendSemaphore(const BackendSemaphore& semaphore) override; + + BackendSemaphore stealBackendSemaphore(std::shared_ptr semaphore) override; + + template + std::shared_ptr makeResource(Args&&... 
args) { + static_assert(std::is_base_of_v, "T must be a subclass of D3D12Resource!"); + auto resource = new T(std::forward(args)...); + return std::static_pointer_cast(addResource(resource)); + } + + void processUnreferencedResources(); + + void releaseAll(bool releaseGPU); + + /** + * Reclaims resources from a D3D12FrameSession that was created but never submitted (abandon + * path). Invoked by D3D12CommandBuffer's destructor and by D3D12CommandEncoder::onRelease(). This + * is the same unified cleanup path used after the GPU fence signals successful completion. + */ + void reclaimAbandonedSession(D3D12FrameSession session); + + // -- Submission lifecycle ------------------------------------------------------------------- + // The D3D12 backend mirrors the Vulkan FrameSession + InflightSubmission ownership model: + // each successful submit() moves a session into the inflight queue, where it is held alive + // until its fence value is signalled by the GPU. A staging upload buffer used by writeTexture + // is also tracked here so it can be safely released after the same fence signals. + + struct PendingUpload { + ComPtr stagingBuffer; + std::shared_ptr texture; + }; + + struct SubmitRequest { + D3D12FrameSession session; + std::vector uploads; + std::shared_ptr signalSemaphore; + std::shared_ptr waitSemaphore; + // Timestamp captured by D3D12CommandQueue::submit at the moment it hands the request to + // executeSubmission(). The GPU stamps the matching InflightSubmission with this value and, + // when the fence later signals, publishes it as _lastFenceSignalTime so the resource cache + // can identify scratch resources that are safe to reuse. + std::chrono::steady_clock::time_point frameTime = {}; + }; + + /** + * Executes a complete submission: optional cross-queue waitSemaphore, ExecuteCommandLists for + * the recorded command list, optional signalSemaphore, then the internal frame fence. Moves + * session/uploads into the inflight queue. 
Polls completed submissions before submitting and + * applies backpressure if more than MAX_FRAMES_IN_FLIGHT submissions are outstanding. + */ + void executeSubmission(SubmitRequest request); + + /** + * Blocks until every outstanding submission's fence value has signalled, reclaiming each + * session along the way. Used by waitUntilCompleted() and releaseAll(). + */ + void waitAllInflightSubmissions(); + + /** + * Returns the latest fence value that the GPU has signalled. Used by D3D12Buffer::isReady() to + * answer "is the GPU done with this buffer?" without blocking. + */ + uint64_t completedFenceValue() const; + + /** + * Returns the steady_clock::time_point at which the most recently completed inflight + * submission was first submitted. Mirrors VulkanGPU::lastFenceSignalTime() and is used by + * D3D12CommandQueue::completedFrameTime() to gate scratch-resource reuse — without this, + * ResourceCache::findScratchResource() would happily hand back vertex/uniform buffers that + * the GPU is still reading, producing torn frames when CPU recording races ahead of GPU + * execution (see RecordingTest.MultipleRecordingsInOrder). + */ + std::chrono::steady_clock::time_point lastFenceSignalTime() const; + + ID3D12Fence* frameFence() const { + return _frameFence.Get(); + } + + uint64_t lastSignalledFenceValue() const { + return _lastSignalledFenceValue; + } + + /** + * Returns true once the GPU has reported a fatal error (e.g. DXGI_ERROR_DEVICE_REMOVED) or a + * fence wait has timed out. After the flag is set, every executeSubmission() / wait* call + * short-circuits and the device is considered unusable for the remainder of the process. 
+ */ + bool isContextLost() const { + return contextLost; + } + + /** + * If the device has been removed and DRED was enabled at device creation, queries + * ID3D12DeviceRemovedExtendedData and logs the auto-breadcrumb history (the command list + * operations the GPU had completed and was about to execute when it died) plus any page-fault + * information. Each (HRESULT-failing) call site that detects context loss should invoke this + * helper exactly once so the diagnostic appears next to the failure that triggered it. No-op on + * builds without DRED enabled or when GetDeviceRemovedReason() reports success. + */ + void dumpDeviceRemovedExtendedData(const char* tag); + + /** + * Drains the D3D12 debug-layer ID3D12InfoQueue and forwards every queued message to LOGE + * tagged with `tag`. Call sites should invoke this whenever a D3D12 API returns a failure + * (especially DEVICE_REMOVED) so the underlying validation error appears next to the failing + * call instead of being lost inside the runtime queue. No-op on builds without + * TGFX_D3D12_DEBUG_LAYER (and outside Debug builds), where the InfoQueue is not populated. + */ + void drainDebugMessages(const char* tag); + + /** + * Returns the singleton compute-shader mipmap generator, creating it on first use. The + * generator is owned by the GPU because its root signature and pipeline state can be reused + * across every D3D12CommandEncoder that asks to generate mipmaps. Returns nullptr if compute + * shader compilation or pipeline creation failed. + */ + D3D12MipmapGenerator* mipmapGenerator(); + + /** + * Process-wide shader-visible CBV/SRV/UAV ring backing every D3D12RenderPass binding. One large + * heap is created once at GPU construction; render passes sub-allocate slots out of it and the + * descriptors are reclaimed as soon as the owning fence signals. Avoids the per-pass + * CreateDescriptorHeap call and the associated GPU-VA churn. 
+ */ + D3D12DescriptorRing& srvRing() { + return _srvRing; + } + + /** + * Non-shader-visible RTV ring used by D3D12RenderPass to publish OMSetRenderTargets handles + * without paying for a CreateDescriptorHeap on every pass. Slots are reclaimed once the + * fence value committed at submit() signals. + */ + D3D12DescriptorRing& rtvRing() { + return _rtvRing; + } + + /** + * Non-shader-visible DSV ring counterpart of rtvRing(), shared across all render passes. + */ + D3D12DescriptorRing& dsvRing() { + return _dsvRing; + } + + /** + * Allocates a GPU descriptor handle in the process-wide shader-visible Sampler heap and writes + * `desc` into it. The slot is never freed; the sampler heap is bounded by D3D12's hard 2048 + * limit and every distinct SamplerDescriptor is created at most once via D3D12GPU's sampler + * cache, so the heap effectively acts as an append-only descriptor table. Returns an + * uninitialised handle (.ptr == 0) if the heap is exhausted. + */ + D3D12_GPU_DESCRIPTOR_HANDLE allocatePermanentSamplerSlot(const D3D12_SAMPLER_DESC& desc); + + /** + * Returns the underlying shader-visible Sampler heap. Used by D3D12CommandEncoder when binding + * heaps onto a fresh command list (D3D12 requires SetDescriptorHeaps once per list). + */ + ID3D12DescriptorHeap* samplerHeap() const { + return _samplerHeap.Get(); + } + + /** + * Pool of (ID3D12CommandAllocator, ID3D12GraphicsCommandList) pairs reused across + * encoders/queue uploads. Avoids the per-submission CreateCommandAllocator/CreateCommandList + * overhead. Pairs are returned to the pool by reclaimSubmission once their fence signals. + */ + D3D12CommandListPool& commandListPool() { + return _commandListPool; + } + + /** + * Returns a shared root signature matching `shapeKey`, or null if not cached. 
Pipelines query + * the cache before serialising/creating their own root signature; on a hit they reuse the + * existing object (incrementing its ComPtr refcount), avoiding the cost of + * D3D12SerializeRootSignature + ID3D12Device::CreateRootSignature on every PSO build. + */ + ComPtr findRootSignature(const std::vector& shapeKey); + + /** + * Inserts a freshly-built root signature under `shapeKey`. Subsequent pipelines with the same + * binding shape will hit the cache. The shape key is a compact serialisation of the binding + * layout (uniform-block count + visibilities, sampler count) generated by D3D12RenderPipeline. + */ + void cacheRootSignature(std::vector shapeKey, ComPtr rootSignature); + + /** + * Process-wide UPLOAD-heap ring used to stage texture pixel uploads (and other CPU-to-GPU + * data). Sub-allocations live until the owning fence signals; callers fall back to a one-off + * CreateCommittedResource only if a single allocation exceeds the ring's capacity or the + * ring is fully in flight. + */ + D3D12UploadHeap& uploadHeap() { + return _uploadHeap; + } + + private: + /// Single entry point for marking the context lost. Sets the flag, dumps DRED diagnostics on + /// the first transition (subsequent calls are silent), and short-circuits all wait paths. 
+ void markContextLost(const char* tag); + explicit D3D12GPU(ComPtr device, ComPtr adapter); + + std::shared_ptr addResource(D3D12Resource* resource); + + static uint32_t MakeSamplerKey(const SamplerDescriptor& descriptor); + + void initInfo(); + void initFeatures(); + void initLimits(); + + ComPtr d3d12Device = nullptr; + ComPtr dxgiAdapter = nullptr; + GPUInfo _info = {}; + GPUFeatures _features = {}; + GPULimits _limits = {}; + std::unique_ptr commandQueue = nullptr; + std::unique_ptr compiler = nullptr; + std::list resources = {}; + std::shared_ptr returnQueue = ReturnQueue::Make(); + std::unordered_map> samplerCache = {}; + + // Process-wide cache of compiled shader modules keyed by (stage, hash(GLSL source)). The + // upper layer caches Programs by ProgramKey, but two unrelated programs frequently share + // the same vertex shader (or the same fragment shader template). Without this cache every + // program build re-runs GLSL -> SPIR-V -> HLSL -> DXBC even though the bytecode would be + // byte-identical. Empirical measurement on the test suite: 700 createShaderModule calls + // produce only 340 distinct sources (~51% redundancy). + struct ShaderCacheKey { + uint32_t stage = 0; + size_t sourceHash = 0; + bool operator==(const ShaderCacheKey& other) const { + return stage == other.stage && sourceHash == other.sourceHash; + } + }; + struct ShaderCacheKeyHash { + size_t operator()(const ShaderCacheKey& k) const noexcept { + return k.sourceHash ^ (static_cast(k.stage) * 0x9E3779B97F4A7C15ull); + } + }; + std::unordered_map, ShaderCacheKeyHash> + shaderModuleCache; + + // Process-wide cache of root signatures keyed by their binding-layout shape (uniform-block + // count + visibilities, sampler count). Almost every pipeline in tgfx falls into one of a + // handful of shapes, so this turns the SerializeRootSignature + CreateRootSignature pair + // (combined ~30-100 us per call on first hit) into an unordered_map lookup for the steady + // state. 
Keys are arbitrary-length byte strings rather than a fixed integer to leave room + // for additional layout metadata without introducing hash collisions. + struct ShapeKeyHash { + size_t operator()(const std::vector& key) const noexcept { + // FNV-1a 64-bit, sufficient for the small distinct-shape population we see in practice. + size_t hash = 1469598103934665603ull; + for (auto byte : key) { + hash ^= static_cast(byte); + hash *= 1099511628211ull; + } + return hash; + } + }; + std::unordered_map, ComPtr, ShapeKeyHash> + rootSignatureCache; + // Lazily-initialised compute pipeline used by D3D12CommandEncoder::generateMipmapsForTexture. + // Built on first use so backends that never request mipmaps don't pay the shader-compile cost. + std::unique_ptr _mipmapGenerator = nullptr; + + // Process-wide shader-visible descriptor heaps used by render passes. The CBV/SRV/UAV ring is + // sized for thousands of unique bindings per frame and is recycled per-fence. The Sampler heap + // is append-only (capped at D3D12's 2048 limit; one entry per unique SamplerDescriptor for the + // life of the GPU instance) and therefore does not need a ring tail pointer. + static constexpr uint32_t SRV_RING_CAPACITY = 64 * 1024; + static constexpr uint32_t SAMPLER_HEAP_CAPACITY = 2048; + // RTV / DSV rings replace the per-render-pass CreateDescriptorHeap calls. Sized to handle a + // few dozen passes per frame with MAX_FRAMES_IN_FLIGHT outstanding; well under D3D12's hard + // 1024-RTV / 1024-DSV per-heap caps. + static constexpr uint32_t RTV_RING_CAPACITY = 512; + static constexpr uint32_t DSV_RING_CAPACITY = 64; + D3D12DescriptorRing _srvRing; + D3D12DescriptorRing _rtvRing; + D3D12DescriptorRing _dsvRing; + ComPtr _samplerHeap = nullptr; + uint32_t _samplerHeapSize = 0; + uint32_t _samplerHeapCapacity = 0; + uint32_t _samplerDescriptorIncrement = 0; + + // Pool of recycled command allocator + graphics command list pairs. 
Populated by + // reclaimSubmission once a submission's fence signals; consumed by D3D12CommandEncoder::Make + // and the transient upload-list paths inside D3D12CommandQueue. + D3D12CommandListPool _commandListPool; + + // Process-wide UPLOAD ring used by D3D12CommandQueue::writeTexture / writeBuffer. Initial + // capacity is sized for typical glyph atlas / blur seed traffic; oversize allocations or a + // saturated ring fall back to per-call CreateCommittedResource so behaviour stays correct + // even when the steady-state fast path can't satisfy the request. + static constexpr size_t UPLOAD_HEAP_CAPACITY = 64 * 1024 * 1024; + D3D12UploadHeap _uploadHeap; + + // Submission state. Following the Vulkan model, the GPU owns the frame fence and the inflight + // queue; D3D12CommandQueue is a thin coordination layer that builds a SubmitRequest and hands + // it to executeSubmission(). Frame-fence values are monotonically increasing. + static constexpr size_t MAX_FRAMES_IN_FLIGHT = 2; + + struct InflightSubmission { + uint64_t fenceValue = 0; + // Steady_clock timestamp recorded at submit() time. When the matching fence value is + // reached, this value gets published to _lastFenceSignalTime so the ResourceCache can + // tell which scratch resources the GPU is definitely done reading. + std::chrono::steady_clock::time_point frameTime = {}; + D3D12FrameSession session; + std::vector uploads; + }; + + void reclaimSubmission(InflightSubmission& submission); + void pollCompletedSubmissions(); + + ComPtr _frameFence = nullptr; + HANDLE _frameFenceEvent = nullptr; + uint64_t _lastSignalledFenceValue = 0; + // Timestamp of the most recently completed (fence-signalled) inflight submission. Stored as + // int64 ticks so it can be loaded/stored atomically. Updated from the same thread that owns + // the GPU, but exposed via std::atomic so a future readback thread (if any) sees a coherent + // value without taking the GPU lock. 
+ std::atomic _lastFenceSignalTime = {0}; + std::deque inflightSubmissions; + // Sticky flag set when the device returns DXGI_ERROR_DEVICE_REMOVED or another fatal error. + // Once set, executeSubmission and waitAllInflightSubmissions stop blocking on the fence — the + // GPU will never signal again, and waiting INFINITE would hang the process. Submissions + // continue to clean up their resources locally so destruction terminates promptly. + bool contextLost = false; +}; + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12MipmapGenerator.cpp b/src/gpu/d3d12/D3D12MipmapGenerator.cpp new file mode 100644 index 000000000..25d0b3250 --- /dev/null +++ b/src/gpu/d3d12/D3D12MipmapGenerator.cpp @@ -0,0 +1,188 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12MipmapGenerator.h" +#include +#include "D3D12GPU.h" +#include "core/utils/Log.h" + +namespace tgfx { + +// Box-filter compute shader: each thread samples 2x2 source texels and writes one destination +// texel. 
The 2x2 average is produced by one bilinear SampleLevel per output texel through the static +// LinearClamp sampler (D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT in createRootSignature()), not by +// four separate point samples. The HLSL is intentionally inline and tiny so D3DCompile +// finishes in well under a millisecond. +// +// Layout matches the root signature in createRootSignature(): +// register(b0) — four 32-bit root constants: output mip width/height (uint) followed by +// their reciprocals (float) +// register(t0) — input mip (mip[i]) +// register(s0) — static linear-filter sampler with clamp address mode +// register(u0) — output mip (mip[i+1]) +// Hardware linear sampling at the destination texel center performs a 2x2 weighted average of +// the source texels with the same weights MTLBlitCommandEncoder generateMipmapsForTexture and +// vkCmdBlitImage(VK_FILTER_LINEAR) use, so a single SampleLevel matches the Metal/Vulkan output +// bit-for-bit on even-divided mip levels and follows GPU-driver edge handling on odd ones. The +// older quincunx (four 0.25-texel offsets) effectively did a 16-tap blur and produced softer +// mips than the other backends. 
+static constexpr const char* kHLSLSource = R"( +cbuffer MipmapCB : register(b0) +{ + uint OutMipWidth; + uint OutMipHeight; + float InvOutMipWidth; + float InvOutMipHeight; +}; + +Texture2D InputMip : register(t0); +SamplerState LinearClamp : register(s0); +RWTexture2D OutputMip : register(u0); + +[numthreads(8, 8, 1)] +void main(uint3 dtID : SV_DispatchThreadID) +{ + if (dtID.x >= OutMipWidth || dtID.y >= OutMipHeight) { + return; + } + float2 uv = (float2(dtID.xy) + 0.5f) * float2(InvOutMipWidth, InvOutMipHeight); + OutputMip[dtID.xy] = InputMip.SampleLevel(LinearClamp, uv, 0); +} +)"; + +// Builds the root signature first, then the compute pipeline state. If the pipeline state cannot +// be created the root signature is released as well, so rootSignature() never hands out a +// signature that has no matching pipeline. +D3D12MipmapGenerator::D3D12MipmapGenerator(D3D12GPU* gpu) { + if (!createRootSignature(gpu)) { + return; + } + if (!createPipelineState(gpu)) { + _rootSignature = nullptr; + } +} + +// Thin forwarder: the generator instance is owned by the GPU and obtained via mipmapGenerator(). +D3D12MipmapGenerator* D3D12MipmapGenerator::Get(D3D12GPU* gpu) { + return gpu->mipmapGenerator(); +} + +bool D3D12MipmapGenerator::createRootSignature(D3D12GPU* gpu) { + // Constants + SRV table + UAV table. A single static sampler (linear/clamp) means we don't have + // to thread a sampler descriptor heap through generateMipmapsForTexture(). 
+ D3D12_DESCRIPTOR_RANGE srvRange = {}; + srvRange.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV; + srvRange.NumDescriptors = 1; + srvRange.BaseShaderRegister = 0; + srvRange.RegisterSpace = 0; + srvRange.OffsetInDescriptorsFromTableStart = 0; + + D3D12_DESCRIPTOR_RANGE uavRange = {}; + uavRange.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV; + uavRange.NumDescriptors = 1; + uavRange.BaseShaderRegister = 0; + uavRange.RegisterSpace = 0; + uavRange.OffsetInDescriptorsFromTableStart = 0; + + D3D12_ROOT_PARAMETER params[3] = {}; + params[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS; + params[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + params[0].Constants.ShaderRegister = 0; + params[0].Constants.RegisterSpace = 0; + params[0].Constants.Num32BitValues = 4; + + params[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; + params[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + params[1].DescriptorTable.NumDescriptorRanges = 1; + params[1].DescriptorTable.pDescriptorRanges = &srvRange; + + params[2].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; + params[2].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + params[2].DescriptorTable.NumDescriptorRanges = 1; + params[2].DescriptorTable.pDescriptorRanges = &uavRange; + + D3D12_STATIC_SAMPLER_DESC samplerDesc = {}; + samplerDesc.Filter = D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT; + samplerDesc.AddressU = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; + samplerDesc.AddressV = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; + samplerDesc.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; + samplerDesc.MipLODBias = 0.0f; + samplerDesc.MaxAnisotropy = 1; + samplerDesc.ComparisonFunc = D3D12_COMPARISON_FUNC_NEVER; + samplerDesc.BorderColor = D3D12_STATIC_BORDER_COLOR_TRANSPARENT_BLACK; + samplerDesc.MinLOD = 0.0f; + samplerDesc.MaxLOD = D3D12_FLOAT32_MAX; + samplerDesc.ShaderRegister = 0; + samplerDesc.RegisterSpace = 0; + samplerDesc.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL; + + D3D12_ROOT_SIGNATURE_DESC 
rootSigDesc = {}; + rootSigDesc.NumParameters = 3; + rootSigDesc.pParameters = params; + rootSigDesc.NumStaticSamplers = 1; + rootSigDesc.pStaticSamplers = &samplerDesc; + rootSigDesc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE; + + ComPtr blob = nullptr; + ComPtr errorBlob = nullptr; + auto hr = + D3D12SerializeRootSignature(&rootSigDesc, D3D_ROOT_SIGNATURE_VERSION_1, &blob, &errorBlob); + if (FAILED(hr)) { + LOGE("D3D12MipmapGenerator: D3D12SerializeRootSignature failed (HRESULT=0x%08X).", + static_cast(hr)); + return false; + } + hr = gpu->device()->CreateRootSignature(0, blob->GetBufferPointer(), blob->GetBufferSize(), + IID_PPV_ARGS(&_rootSignature)); + if (FAILED(hr)) { + LOGE("D3D12MipmapGenerator: CreateRootSignature failed (HRESULT=0x%08X).", + static_cast(hr)); + return false; + } + return true; +} + +bool D3D12MipmapGenerator::createPipelineState(D3D12GPU* gpu) { + ComPtr csBlob = nullptr; + ComPtr errorBlob = nullptr; + UINT compileFlags = D3DCOMPILE_ENABLE_STRICTNESS; +#ifdef _DEBUG + compileFlags |= D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION; +#else + compileFlags |= D3DCOMPILE_OPTIMIZATION_LEVEL3; +#endif + auto hr = D3DCompile(kHLSLSource, strlen(kHLSLSource), nullptr, nullptr, nullptr, "main", + "cs_5_0", compileFlags, 0, &csBlob, &errorBlob); + if (FAILED(hr)) { + LOGE("D3D12MipmapGenerator: D3DCompile failed (HRESULT=0x%08X): %s", static_cast(hr), + errorBlob ? 
static_cast(errorBlob->GetBufferPointer()) : ""); + return false; + } + + D3D12_COMPUTE_PIPELINE_STATE_DESC desc = {}; + desc.pRootSignature = _rootSignature.Get(); + desc.CS.pShaderBytecode = csBlob->GetBufferPointer(); + desc.CS.BytecodeLength = csBlob->GetBufferSize(); + desc.NodeMask = 0; + desc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE; + + hr = gpu->device()->CreateComputePipelineState(&desc, IID_PPV_ARGS(&_pipelineState)); + if (FAILED(hr)) { + LOGE("D3D12MipmapGenerator: CreateComputePipelineState failed (HRESULT=0x%08X).", + static_cast(hr)); + return false; + } + return true; +} + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12MipmapGenerator.h b/src/gpu/d3d12/D3D12MipmapGenerator.h new file mode 100644 index 000000000..a53cfe7ce --- /dev/null +++ b/src/gpu/d3d12/D3D12MipmapGenerator.h @@ -0,0 +1,83 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "D3D12Util.h" + +namespace tgfx { + +class D3D12GPU; +class D3D12Texture; + +/// Number of threads per group on each axis. 
The compute shader is `[numthreads(8, 8, 1)]`, so +/// dispatching encoders need to round the output mip dimensions up to a multiple of this value. +static constexpr unsigned D3D12_MIPMAP_THREAD_GROUP_SIZE = 8; + +/** + * Lazily-initialised compute pipeline that downsamples mip[i] into mip[i+1] for a 2D texture. + * + * D3D12 has no built-in equivalent to Metal's [blitEncoder generateMipmapsForTexture] or + * Vulkan's vkCmdBlitImage chain, so we ship a tiny box-filter compute shader and dispatch it + * per mip level. The PSO and root signature are cached on the GPU so repeated mipmap generation + * doesn't repeatedly recompile the shader. + * + * Root signature layout: + * slot 0: 4 32-bit root constants (output mip width, height, inv_width, inv_height) + * slot 1: SRV descriptor table referencing mip[i] + * slot 2: UAV descriptor table referencing mip[i+1] + * static sampler at register(s0): linear filter, clamp addressing (no descriptor needed) + * + * The encoder is expected to: + * - Place the SRV / UAV pair into a shader-visible heap that lives long enough to cover the + * dispatch (added to the FrameSession's retainedDescriptorHeaps). + * - Issue ResourceBarriers transitioning the parent texture's individual subresources between + * UNORDERED_ACCESS and PIXEL_SHADER_RESOURCE as the chain walks up. + * + * NOTE(review): the SRV is read by a compute shader, and D3D12 documents + * D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE as the state for non-pixel-stage reads. Confirm + * the encoder includes that state on the source subresource for the dispatch. + */ +class D3D12MipmapGenerator { + public: + static D3D12MipmapGenerator* Get(D3D12GPU* gpu); + + ID3D12RootSignature* rootSignature() const { + return _rootSignature.Get(); + } + + ID3D12PipelineState* pipelineState() const { + return _pipelineState.Get(); + } + + /** + * Returns true once both the root signature and the pipeline state are ready. Callers should + * skip mipmap generation when this is false (and emit a one-time log). 
+ */ + bool isReady() const { + return _rootSignature != nullptr && _pipelineState != nullptr; + } + + private: + explicit D3D12MipmapGenerator(D3D12GPU* gpu); + + bool createRootSignature(D3D12GPU* gpu); + bool createPipelineState(D3D12GPU* gpu); + + ComPtr _rootSignature; + ComPtr _pipelineState; + + friend class D3D12GPU; +}; + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12RenderPass.cpp b/src/gpu/d3d12/D3D12RenderPass.cpp new file mode 100644 index 000000000..a1284e768 --- /dev/null +++ b/src/gpu/d3d12/D3D12RenderPass.cpp @@ -0,0 +1,600 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12RenderPass.h" +#include "D3D12Buffer.h" +#include "D3D12CommandEncoder.h" +#include "D3D12GPU.h" +#include "D3D12RenderPipeline.h" +#include "D3D12Sampler.h" +#include "D3D12Texture.h" +#include "core/utils/Log.h" + +namespace tgfx { + +static D3D12_PRIMITIVE_TOPOLOGY ToD3D12Topology(PrimitiveType type) { + return type == PrimitiveType::TriangleStrip ? 
D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP + : D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST; +} + +std::shared_ptr D3D12RenderPass::Make(D3D12CommandEncoder* encoder, + const RenderPassDescriptor& descriptor) { + if (encoder == nullptr) { + return nullptr; + } + auto gpu = static_cast(encoder->gpu()); + auto pass = std::shared_ptr(new D3D12RenderPass(encoder, gpu, descriptor)); + if (!pass->initialise(descriptor)) { + return nullptr; + } + return pass; +} + +D3D12RenderPass::D3D12RenderPass(D3D12CommandEncoder* encoder, D3D12GPU* gpu, + const RenderPassDescriptor& passDescriptor) + : RenderPass(passDescriptor), encoder(encoder), d3d12GPU(gpu), + commandList(encoder->d3d12CommandList()) { +} + +bool D3D12RenderPass::initialise(const RenderPassDescriptor& passDescriptor) { + if (commandList == nullptr) { + return false; + } + auto device = d3d12GPU->device(); + + // Step 1: Allocate RTV / DSV descriptor slots from the GPU-wide non-shader-visible rings. + // Each render pass used to call CreateDescriptorHeap for these every time it was begun; we + // now sub-allocate from a shared ring that is reclaimed by fence value, removing a kernel + // round-trip per pass on the hot path. Ring slots stay valid until the next pollCompleted- + // Submissions() retires them, so OMSetRenderTargets does not need the heap pinned in + // FrameSession::retainedRTVDSVHeaps any more. 
+ uint32_t numColorAttachments = 0; + for (auto& ca : passDescriptor.colorAttachments) { + if (ca.texture != nullptr) { + numColorAttachments++; + } + } + bool hasDepth = (passDescriptor.depthStencilAttachment.texture != nullptr); + + D3D12DescriptorRing::Range rtvRange = {}; + if (numColorAttachments > 0) { + rtvRange = d3d12GPU->rtvRing().allocate(numColorAttachments); + if (!rtvRange.valid()) { + LOGE("D3D12RenderPass: RTV ring exhausted (requested %u slots).", numColorAttachments); + return false; + } + } + + D3D12DescriptorRing::Range dsvRange = {}; + if (hasDepth) { + dsvRange = d3d12GPU->dsvRing().allocate(1); + if (!dsvRange.valid()) { + LOGE("D3D12RenderPass: DSV ring exhausted."); + return false; + } + } + + auto rtvDescriptorSize = d3d12GPU->rtvRing().descriptorSize(); + + std::vector rtvHandles; + rtvHandles.reserve(numColorAttachments); + uint32_t fbWidth = 0; + uint32_t fbHeight = 0; + + // First pass over color attachments: create RTV descriptors and accumulate transitions. + // We deliberately separate this from the Clear* calls below so we can issue a single + // ResourceBarrier for every attachment (color + depth) instead of N+1 individual calls. + D3D12BarrierBatch entryBatch; + for (auto& ca : passDescriptor.colorAttachments) { + if (ca.texture == nullptr) { + continue; + } + auto d3d12Tex = std::static_pointer_cast(ca.texture); + encoder->retainResource(d3d12Tex); + colorAttachments.push_back(d3d12Tex); + + // Queue the transition to RENDER_TARGET. The "current" state is either COMMON (newly + // created or coming back from sampling) or RENDER_TARGET from a preceding render pass that + // already left it there; addTransition() collapses the latter into a no-op. 
+ entryBatch.addTransition(d3d12Tex->d3d12Resource(), d3d12Tex->currentState(), + D3D12_RESOURCE_STATE_RENDER_TARGET); + encoder->recordTextureStateChange(d3d12Tex.get(), D3D12_RESOURCE_STATE_RENDER_TARGET); + + D3D12_CPU_DESCRIPTOR_HANDLE rtvHandle = rtvRange.cpuStart; + rtvHandle.ptr += static_cast(rtvHandles.size()) * rtvDescriptorSize; + + D3D12_RENDER_TARGET_VIEW_DESC rtvDesc = {}; + rtvDesc.Format = static_cast(d3d12Tex->dxgiFormat()); + rtvDesc.ViewDimension = (d3d12Tex->sampleCount() > 1) ? D3D12_RTV_DIMENSION_TEXTURE2DMS + : D3D12_RTV_DIMENSION_TEXTURE2D; + if (rtvDesc.ViewDimension == D3D12_RTV_DIMENSION_TEXTURE2D) { + rtvDesc.Texture2D.MipSlice = 0; + rtvDesc.Texture2D.PlaneSlice = 0; + } + device->CreateRenderTargetView(d3d12Tex->d3d12Resource(), &rtvDesc, rtvHandle); + rtvHandles.push_back(rtvHandle); + + fbWidth = static_cast(d3d12Tex->width()); + fbHeight = static_cast(d3d12Tex->height()); + + // Capture the optional resolve target for this attachment. A null entry keeps the parallel + // vector aligned with colorAttachments so the onEnd() loop can match them by index. + if (ca.resolveTexture != nullptr) { + auto resolveTex = std::static_pointer_cast(ca.resolveTexture); + encoder->retainResource(resolveTex); + resolveTextures.push_back(std::move(resolveTex)); + } else { + resolveTextures.push_back(nullptr); + } + } + + D3D12_CPU_DESCRIPTOR_HANDLE dsvHandle = {}; + if (hasDepth) { + auto d3d12Tex = + std::static_pointer_cast(passDescriptor.depthStencilAttachment.texture); + encoder->retainResource(d3d12Tex); + depthStencilAttachment = d3d12Tex; + + entryBatch.addTransition(d3d12Tex->d3d12Resource(), d3d12Tex->currentState(), + D3D12_RESOURCE_STATE_DEPTH_WRITE); + encoder->recordTextureStateChange(d3d12Tex.get(), D3D12_RESOURCE_STATE_DEPTH_WRITE); + + dsvHandle = dsvRange.cpuStart; + D3D12_DEPTH_STENCIL_VIEW_DESC dsvDesc = {}; + dsvDesc.Format = static_cast(d3d12Tex->dxgiFormat()); + dsvDesc.ViewDimension = (d3d12Tex->sampleCount() > 1) ? 
D3D12_DSV_DIMENSION_TEXTURE2DMS + : D3D12_DSV_DIMENSION_TEXTURE2D; + if (dsvDesc.ViewDimension == D3D12_DSV_DIMENSION_TEXTURE2D) { + dsvDesc.Texture2D.MipSlice = 0; + } + device->CreateDepthStencilView(d3d12Tex->d3d12Resource(), &dsvDesc, dsvHandle); + + if (fbWidth == 0) { + fbWidth = static_cast(d3d12Tex->width()); + fbHeight = static_cast(d3d12Tex->height()); + } + } + + // Flush every pre-pass transition in one ResourceBarrier(N, ...) call. After this point all + // attachments are in their target state and clears are safe to issue. + entryBatch.flush(commandList); + + // Second pass: ClearRenderTargetView / ClearDepthStencilView for any attachment with + // LoadAction::Clear. These run after the barrier flush so the resource state is correct. + // rtvHandles[] was populated in the order non-null colorAttachments are visited above, so we + // walk passDescriptor.colorAttachments again with a parallel running counter — keeping this + // O(N) instead of paying an inner search per attachment. + size_t rtvIndex = 0; + for (const auto& ca : passDescriptor.colorAttachments) { + if (ca.texture == nullptr) { + continue; + } + if (ca.loadAction == LoadAction::Clear) { + const float clear[4] = {ca.clearValue.red, ca.clearValue.green, ca.clearValue.blue, + ca.clearValue.alpha}; + commandList->ClearRenderTargetView(rtvHandles[rtvIndex], clear, 0, nullptr); + } + rtvIndex++; + } + if (hasDepth && passDescriptor.depthStencilAttachment.loadAction == LoadAction::Clear) { + D3D12_CLEAR_FLAGS clearFlags = D3D12_CLEAR_FLAG_DEPTH | D3D12_CLEAR_FLAG_STENCIL; + commandList->ClearDepthStencilView( + dsvHandle, clearFlags, passDescriptor.depthStencilAttachment.depthClearValue, + static_cast(passDescriptor.depthStencilAttachment.stencilClearValue), 0, nullptr); + } + + // Step 2: Bind render targets. + commandList->OMSetRenderTargets(static_cast(rtvHandles.size()), + rtvHandles.empty() ? nullptr : rtvHandles.data(), FALSE, + hasDepth ? 
&dsvHandle : nullptr); + + // Step 3: Default viewport / scissor covering the entire framebuffer. + D3D12_VIEWPORT viewport = {}; + viewport.TopLeftX = 0.0f; + viewport.TopLeftY = 0.0f; + viewport.Width = static_cast(fbWidth); + viewport.Height = static_cast(fbHeight); + viewport.MinDepth = 0.0f; + viewport.MaxDepth = 1.0f; + commandList->RSSetViewports(1, &viewport); + + D3D12_RECT scissor = {}; + scissor.left = 0; + scissor.top = 0; + scissor.right = static_cast(fbWidth); + scissor.bottom = static_cast(fbHeight); + commandList->RSSetScissorRects(1, &scissor); + + // No per-pass descriptor heaps to retain any more: the RTV / DSV slots we allocated above + // live in D3D12GPU::_rtvRing / _dsvRing, whose underlying ID3D12DescriptorHeap is owned by + // the GPU instance and reclaimed by fence value (see pollCompletedSubmissions). Shader- + // visible heaps were already bound to this command list in D3D12CommandEncoder::Make(). + return true; +} + +GPU* D3D12RenderPass::gpu() const { + return d3d12GPU; +} + +void D3D12RenderPass::setViewport(int x, int y, int width, int height) { + D3D12_VIEWPORT viewport = {}; + viewport.TopLeftX = static_cast(x); + viewport.TopLeftY = static_cast(y); + viewport.Width = static_cast(width); + viewport.Height = static_cast(height); + viewport.MinDepth = 0.0f; + viewport.MaxDepth = 1.0f; + commandList->RSSetViewports(1, &viewport); +} + +void D3D12RenderPass::setScissorRect(int x, int y, int width, int height) { + D3D12_RECT scissor = {}; + scissor.left = x; + scissor.top = y; + scissor.right = x + width; + scissor.bottom = y + height; + commandList->RSSetScissorRects(1, &scissor); +} + +void D3D12RenderPass::setPipeline(std::shared_ptr pipeline) { + if (!pipeline) { + return; + } + auto d3d12Pipeline = std::static_pointer_cast(pipeline); + if (currentPipeline == d3d12Pipeline) { + return; + } + if (d3d12Pipeline->d3d12PipelineState() == nullptr || + d3d12Pipeline->d3d12RootSignature() == nullptr) { + return; + } + currentPipeline = 
d3d12Pipeline; + encoder->retainResource(d3d12Pipeline); + commandList->SetPipelineState(d3d12Pipeline->d3d12PipelineState()); + commandList->SetGraphicsRootSignature(d3d12Pipeline->d3d12RootSignature()); + + // Switching pipelines invalidates root parameter state, so re-flag every binding as dirty. + for (auto& ub : uniformBindings) { + if (ub.gpuAddress != 0) { + ub.dirty = true; + } + } + for (auto& tb : textureBindings) { + if (tb.srvTableStart.ptr != 0) { + tb.dirty = true; + } + } +} + +void D3D12RenderPass::setUniformBuffer(unsigned binding, std::shared_ptr buffer, + size_t offset, size_t /*size*/) { + if (!buffer || binding >= MaxUniformBindings) { + return; + } + auto d3d12Buffer = std::static_pointer_cast(buffer); + encoder->retainResource(d3d12Buffer); + auto gpuAddr = d3d12Buffer->d3d12Resource()->GetGPUVirtualAddress() + offset; + auto& ub = uniformBindings[binding]; + if (ub.gpuAddress != gpuAddr) { + ub.gpuAddress = gpuAddr; + ub.dirty = true; + } +} + +void D3D12RenderPass::setTexture(unsigned binding, std::shared_ptr texture, + std::shared_ptr sampler) { + if (!texture || !sampler || !currentPipeline || binding >= MaxTextureBindings) { + return; + } + auto d3d12Tex = std::static_pointer_cast(texture); + auto d3d12Samp = std::static_pointer_cast(sampler); + encoder->retainResource(d3d12Tex); + encoder->retainResource(d3d12Samp); + + // Color render targets and write-back textures may currently be in RENDER_TARGET / COPY_DEST. + // Queue a transition to PIXEL_SHADER_RESOURCE so the SRV will be valid by the time the next + // draw fires. The barrier is not issued immediately; pendingBarriers accumulates every state + // change recorded by setTexture() in this pass and flushBindingsIfNeeded() emits them all in + // a single ResourceBarrier(N, ...) call right before the draw. The CPU-side _currentState is + // updated immediately so a second setTexture() with the same texture sees the correct state + // and does not enqueue a redundant barrier. 
+ auto current = d3d12Tex->currentState(); + if (current != D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE) { + pendingBarriers.addTransition(d3d12Tex->d3d12Resource(), current, + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); + encoder->recordTextureStateChange(d3d12Tex.get(), D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); + // Track this texture so onEnd() can transition it back to COMMON. Without that step, D3D12 + // automatic state decay after ExecuteCommandLists drops the resource to COMMON, but our CPU + // tracker still believes it is in PIXEL_SHADER_RESOURCE — every subsequent transition then + // fails "Before state mismatch" validation and (on some drivers) destabilises the device. + shaderResourceTextures.push_back(d3d12Tex); + } + + auto device = d3d12GPU->device(); + + // SRV slot: dedup by (resource, format, mipLevels). Repeated bindings of the same texture + // within one render pass share a single descriptor sub-allocated from the GPU's CBV/SRV/UAV + // ring. + SrvCacheKey srvKey = {}; + srvKey.resource = d3d12Tex->d3d12Resource(); + srvKey.format = static_cast(d3d12Tex->dxgiFormat()); + srvKey.mipLevels = static_cast(d3d12Tex->mipLevelCount()); + D3D12_GPU_DESCRIPTOR_HANDLE srvGpu = {}; + auto srvIt = srvSlotCache.find(srvKey); + if (srvIt != srvSlotCache.end()) { + srvGpu = srvIt->second; + } else { + auto range = d3d12GPU->srvRing().allocate(1); + if (!range.valid()) { + LOGE("D3D12RenderPass::setTexture: SRV ring exhausted."); + return; + } + D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {}; + srvDesc.Format = srvKey.format; + srvDesc.ViewDimension = (d3d12Tex->sampleCount() > 1) ? 
D3D12_SRV_DIMENSION_TEXTURE2DMS + : D3D12_SRV_DIMENSION_TEXTURE2D; + srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING; + if (srvDesc.ViewDimension == D3D12_SRV_DIMENSION_TEXTURE2D) { + srvDesc.Texture2D.MostDetailedMip = 0; + srvDesc.Texture2D.MipLevels = srvKey.mipLevels; + srvDesc.Texture2D.PlaneSlice = 0; + srvDesc.Texture2D.ResourceMinLODClamp = 0.0f; + } + device->CreateShaderResourceView(d3d12Tex->d3d12Resource(), &srvDesc, range.cpuStart); + srvGpu = range.gpuStart; + srvSlotCache.emplace(srvKey, srvGpu); + } + + // Sampler GPU descriptor handle is owned by the D3D12Sampler instance and is permanent for the + // lifetime of the GPU. setTexture() never has to allocate or write a sampler descriptor. + D3D12_GPU_DESCRIPTOR_HANDLE sampGpu = d3d12Samp->gpuDescriptorHandle(); + + auto& tb = textureBindings[binding]; + tb.srvTableStart = srvGpu; + tb.samplerTableStart = sampGpu; + tb.dirty = true; +} + +void D3D12RenderPass::flushBindingsIfNeeded() { + if (!currentPipeline) { + return; + } + // Issue every queued state transition in a single ResourceBarrier(N, ...) call right before + // the draw. setTexture() may have added several transitions (one per unique sampled texture + // entering this pass); flushing them together lets the driver collapse redundant cache + // operations and avoids per-barrier API overhead. + pendingBarriers.flush(commandList); + + // Apply uniform CBVs — one root constant buffer view per dirty uniform binding. + for (unsigned i = 0; i < MaxUniformBindings; i++) { + auto& ub = uniformBindings[i]; + if (!ub.dirty) { + continue; + } + auto rootIndex = currentPipeline->getUniformRootParameterIndex(i); + if (rootIndex == UINT32_MAX) { + ub.dirty = false; + continue; + } + commandList->SetGraphicsRootConstantBufferView(rootIndex, ub.gpuAddress); + ub.dirty = false; + } + + // Apply texture/sampler descriptor tables. 
Each texture binding occupies two consecutive root + // parameters in our root signature: an SRV table (in the CBV/SRV/UAV heap) and a Sampler table + // (in the Sampler heap). We bind both with separate SetGraphicsRootDescriptorTable calls. + for (unsigned i = 0; i < MaxTextureBindings; i++) { + auto& tb = textureBindings[i]; + if (!tb.dirty) { + continue; + } + auto srvRoot = currentPipeline->getTextureRootParameterIndex(i); + auto samplerRoot = currentPipeline->getSamplerRootParameterIndex(i); + if (srvRoot != UINT32_MAX) { + commandList->SetGraphicsRootDescriptorTable(srvRoot, tb.srvTableStart); + } + if (samplerRoot != UINT32_MAX) { + commandList->SetGraphicsRootDescriptorTable(samplerRoot, tb.samplerTableStart); + } + tb.dirty = false; + } +} + +void D3D12RenderPass::setVertexBuffer(unsigned slot, std::shared_ptr buffer, + size_t offset) { + if (!buffer || !currentPipeline) { + return; + } + auto d3d12Buffer = std::static_pointer_cast(buffer); + // Guard against the size_t subtraction below underflowing. Unlike Vulkan/Metal where the + // backing API consumes (buffer, offset) directly, D3D12 expects us to compute SizeInBytes + // ourselves; an offset at or past the buffer end would wrap to ~0 and the UINT cast would + // then publish a 4 GB range to the GPU. + auto bufferSize = d3d12Buffer->size(); + if (offset >= bufferSize) { + LOGE("D3D12RenderPass::setVertexBuffer: offset %zu is out of range (buffer size=%zu).", offset, + bufferSize); + return; + } + encoder->retainResource(d3d12Buffer); + + D3D12_VERTEX_BUFFER_VIEW view = {}; + view.BufferLocation = d3d12Buffer->d3d12Resource()->GetGPUVirtualAddress() + offset; + view.SizeInBytes = static_cast(bufferSize - offset); + // D3D12 requires the per-vertex stride at draw time. We sourced it from the bound pipeline's + // VertexBufferLayout when the pipeline was built. The Vulkan backend keeps stride implicit in + // the VkPipeline's vertex input description; for D3D12 we must echo it back here. 
+ view.StrideInBytes = currentPipeline->getVertexStride(slot); + commandList->IASetVertexBuffers(slot, 1, &view); +} + +void D3D12RenderPass::setIndexBuffer(std::shared_ptr buffer, IndexFormat format) { + if (!buffer) { + return; + } + auto d3d12Buffer = std::static_pointer_cast(buffer); + // Reject empty buffers up front: passing SizeInBytes=0 to IASetIndexBuffer leaves the index + // buffer effectively unset and later draws would silently produce no primitives. Mirrors the + // defensive offset check in setVertexBuffer. + auto bufferSize = d3d12Buffer->size(); + if (bufferSize == 0) { + LOGE("D3D12RenderPass::setIndexBuffer: buffer has zero size."); + return; + } + encoder->retainResource(d3d12Buffer); + + D3D12_INDEX_BUFFER_VIEW view = {}; + view.BufferLocation = d3d12Buffer->d3d12Resource()->GetGPUVirtualAddress(); + view.SizeInBytes = static_cast(bufferSize); + view.Format = (format == IndexFormat::UInt32) ? DXGI_FORMAT_R32_UINT : DXGI_FORMAT_R16_UINT; + commandList->IASetIndexBuffer(&view); +} + +void D3D12RenderPass::setStencilReference(uint32_t reference) { + commandList->OMSetStencilRef(reference); +} + +void D3D12RenderPass::draw(PrimitiveType primitiveType, uint32_t vertexCount, + uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance) { + if (!currentPipeline) { + return; + } + auto topology = ToD3D12Topology(primitiveType); + if (!primitiveTopologySet || topology != currentTopology) { + commandList->IASetPrimitiveTopology(topology); + currentTopology = topology; + primitiveTopologySet = true; + } + flushBindingsIfNeeded(); + commandList->DrawInstanced(vertexCount, instanceCount, firstVertex, firstInstance); +} + +void D3D12RenderPass::drawIndexed(PrimitiveType primitiveType, uint32_t indexCount, + uint32_t instanceCount, uint32_t firstIndex, int32_t baseVertex, + uint32_t firstInstance) { + if (!currentPipeline) { + return; + } + auto topology = ToD3D12Topology(primitiveType); + if (!primitiveTopologySet || topology != currentTopology) { 
+ commandList->IASetPrimitiveTopology(topology); + currentTopology = topology; + primitiveTopologySet = true; + } + flushBindingsIfNeeded(); + commandList->DrawIndexedInstanced(indexCount, instanceCount, firstIndex, baseVertex, + firstInstance); +} + +void D3D12RenderPass::onEnd() { + // Should not happen in normal flow (every draw flushes pendingBarriers), but guard against + // a render pass that ends without any draws ever recorded. + pendingBarriers.flush(commandList); + + // Step 1: collect the pre-resolve transitions for every MSAA color attachment that has a + // resolveTexture, then issue them in a single ResourceBarrier(N, ...) before the actual + // ResolveSubresource calls. Driver state requirements are RESOLVE_SOURCE for the multi-sample + // source and RESOLVE_DEST for the single-sample destination; both are restored to COMMON + // afterwards in step 3, mirroring Vulkan's pResolveAttachments behaviour. + D3D12BarrierBatch resolveBatch; + for (size_t i = 0; i < colorAttachments.size(); i++) { + if (i >= resolveTextures.size() || resolveTextures[i] == nullptr) { + continue; + } + auto& src = colorAttachments[i]; + auto& resolveDst = resolveTextures[i]; + if (src == nullptr) { + continue; + } + auto srcState = src->currentState(); + if (srcState != D3D12_RESOURCE_STATE_RESOLVE_SOURCE) { + resolveBatch.addTransition(src->d3d12Resource(), srcState, + D3D12_RESOURCE_STATE_RESOLVE_SOURCE); + encoder->recordTextureStateChange(src.get(), D3D12_RESOURCE_STATE_RESOLVE_SOURCE); + } + auto dstState = resolveDst->currentState(); + if (dstState != D3D12_RESOURCE_STATE_RESOLVE_DEST) { + resolveBatch.addTransition(resolveDst->d3d12Resource(), dstState, + D3D12_RESOURCE_STATE_RESOLVE_DEST); + encoder->recordTextureStateChange(resolveDst.get(), D3D12_RESOURCE_STATE_RESOLVE_DEST); + } + } + resolveBatch.flush(commandList); + + // Step 2: do the actual MSAA resolves now that every source/destination is in the right state. 
+ for (size_t i = 0; i < colorAttachments.size(); i++) { + if (i >= resolveTextures.size() || resolveTextures[i] == nullptr) { + continue; + } + auto& src = colorAttachments[i]; + auto& resolveDst = resolveTextures[i]; + if (src == nullptr) { + continue; + } + commandList->ResolveSubresource(resolveDst->d3d12Resource(), 0, src->d3d12Resource(), 0, + static_cast(src->dxgiFormat())); + // The two zeros above are dst / src subresource indices. tgfx render targets are flat 2D + // textures (no array, MSAA targets are mip-locked to 1), so subresource 0 is the only valid + // index. Vulkan's pResolveAttachments and Metal's resolve attachments make the same + // assumption. + } + + // Step 3: collapse every "back to COMMON" transition (color attachments, resolve targets, + // depth stencil, sampled textures) into a single ResourceBarrier. The next consumer + // (sample, copy, present) will issue its own transition from COMMON to whatever state it + // needs. D3D12 implicitly decays buffers and simultaneous-access textures to COMMON after + // the command list executes; explicitly issuing the matching CPU-side transition keeps our + // tracker aligned with the runtime so subsequent passes don't trip "Before state mismatch" + // barriers. 
+ D3D12BarrierBatch finalBatch; + for (auto& tex : colorAttachments) { + if (tex == nullptr) continue; + auto current = tex->currentState(); + if (current != D3D12_RESOURCE_STATE_COMMON) { + finalBatch.addTransition(tex->d3d12Resource(), current, D3D12_RESOURCE_STATE_COMMON); + encoder->recordTextureStateChange(tex.get(), D3D12_RESOURCE_STATE_COMMON); + } + } + for (auto& tex : resolveTextures) { + if (tex == nullptr) continue; + auto current = tex->currentState(); + if (current != D3D12_RESOURCE_STATE_COMMON) { + finalBatch.addTransition(tex->d3d12Resource(), current, D3D12_RESOURCE_STATE_COMMON); + encoder->recordTextureStateChange(tex.get(), D3D12_RESOURCE_STATE_COMMON); + } + } + if (depthStencilAttachment != nullptr) { + auto current = depthStencilAttachment->currentState(); + if (current != D3D12_RESOURCE_STATE_COMMON) { + finalBatch.addTransition(depthStencilAttachment->d3d12Resource(), current, + D3D12_RESOURCE_STATE_COMMON); + encoder->recordTextureStateChange(depthStencilAttachment.get(), D3D12_RESOURCE_STATE_COMMON); + } + } + for (auto& tex : shaderResourceTextures) { + if (tex == nullptr) continue; + auto current = tex->currentState(); + if (current != D3D12_RESOURCE_STATE_COMMON) { + finalBatch.addTransition(tex->d3d12Resource(), current, D3D12_RESOURCE_STATE_COMMON); + encoder->recordTextureStateChange(tex.get(), D3D12_RESOURCE_STATE_COMMON); + } + } + finalBatch.flush(commandList); +} + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12RenderPass.h b/src/gpu/d3d12/D3D12RenderPass.h new file mode 100644 index 000000000..9528163b1 --- /dev/null +++ b/src/gpu/d3d12/D3D12RenderPass.h @@ -0,0 +1,161 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include +#include "D3D12BarrierBatch.h" +#include "D3D12Util.h" +#include "tgfx/gpu/RenderPass.h" + +namespace tgfx { + +class D3D12CommandEncoder; +class D3D12GPU; +class D3D12RenderPipeline; +class D3D12Texture; + +/** + * D3D12 render pass implementation. + * + * On construction: + * - Creates one RTV/DSV descriptor per attachment in slots taken from the GPU-owned descriptor + * rings (no per-pass heaps). Issues ResourceBarrier transitions and OMSetRenderTargets. + * - Performs ClearRenderTargetView / ClearDepthStencilView for any attachment with + * LoadAction::Clear. + * + * Texture/sampler bindings: + * - Sub-allocates SRV slots out of the GPU's process-wide D3D12DescriptorRing (committed and + * fence-retired around each submission). No CreateDescriptorHeap call per render pass. + * - Reuses each D3D12Sampler's stable GPU descriptor handle from the GPU's append-only + * shader-visible Sampler heap; no per-binding CreateSampler is issued either. + * - SetDescriptorHeaps was already issued once on the encoder's command list, so render passes + * never need to call it. + * + * On end: + * - Resolves MSAA color attachments into their resolve textures, then transitions attachments, + * resolve targets and sampled textures back to COMMON. No per-pass RTV/DSV heap is retained.
+ */ +class D3D12RenderPass : public RenderPass { + public: + static std::shared_ptr Make(D3D12CommandEncoder* encoder, + const RenderPassDescriptor& descriptor); + + ~D3D12RenderPass() override = default; + + GPU* gpu() const override; + void setViewport(int x, int y, int width, int height) override; + void setScissorRect(int x, int y, int width, int height) override; + void setPipeline(std::shared_ptr pipeline) override; + void setUniformBuffer(unsigned binding, std::shared_ptr buffer, size_t offset, + size_t size) override; + void setTexture(unsigned binding, std::shared_ptr texture, + std::shared_ptr sampler) override; + void setVertexBuffer(unsigned slot, std::shared_ptr buffer, + size_t offset = 0) override; + void setIndexBuffer(std::shared_ptr buffer, + IndexFormat format = IndexFormat::UInt16) override; + void setStencilReference(uint32_t reference) override; + void draw(PrimitiveType primitiveType, uint32_t vertexCount, uint32_t instanceCount = 1, + uint32_t firstVertex = 0, uint32_t firstInstance = 0) override; + void drawIndexed(PrimitiveType primitiveType, uint32_t indexCount, uint32_t instanceCount = 1, + uint32_t firstIndex = 0, int32_t baseVertex = 0, + uint32_t firstInstance = 0) override; + + protected: + void onEnd() override; + + private: + D3D12RenderPass(D3D12CommandEncoder* encoder, D3D12GPU* gpu, + const RenderPassDescriptor& descriptor); + + bool initialise(const RenderPassDescriptor& descriptor); + + // Flushes the barriers queued by setTexture(), then applies every dirty uniform/texture binding + // via SetGraphicsRootConstantBufferView / SetGraphicsRootDescriptorTable. No-op without a bound + // pipeline. Called from draw() / drawIndexed() before the actual draw command. + void flushBindingsIfNeeded(); + + D3D12CommandEncoder* encoder = nullptr; + D3D12GPU* d3d12GPU = nullptr; + ID3D12GraphicsCommandList* commandList = nullptr; + + // Accumulator for resource state transitions queued by setTexture().
Flushed in + // flushBindingsIfNeeded() just before the actual draw, so a draw that touches N sampled + // textures issues a single ResourceBarrier(N, ...) call instead of N single-barrier calls. + D3D12BarrierBatch pendingBarriers; + + // Per-render-pass dedup cache for SRV slots in the GPU's shader-visible CBV/SRV/UAV ring. + // Repeated bindings of the same (resource, format, mipLevels) within a single pass share one + // sub-allocated descriptor; the cache is cleared at the next pass start because ring slots may + // have been retired by then. + struct SrvCacheKey { + ID3D12Resource* resource = nullptr; + DXGI_FORMAT format = static_cast(0); // DXGI_FORMAT_UNKNOWN + UINT mipLevels = 0; + bool operator==(const SrvCacheKey& other) const { + return resource == other.resource && format == other.format && mipLevels == other.mipLevels; + } + }; + struct SrvCacheKeyHash { + size_t operator()(const SrvCacheKey& k) const noexcept { + auto h1 = std::hash{}(static_cast(k.resource)); + auto h2 = std::hash{}(static_cast(k.format)); + auto h3 = std::hash{}(k.mipLevels); + return h1 ^ (h2 * 0x9E3779B97F4A7C15ull) ^ (h3 * 0xBF58476D1CE4E5B9ull); + } + }; + std::unordered_map srvSlotCache; + + // Per-binding deferred state. setUniformBuffer / setTexture record the argument; the actual + // RootCBV / RootDescriptorTable bind happens at flushBindingsIfNeeded() (just before draw).
+ struct UniformBinding { + D3D12_GPU_VIRTUAL_ADDRESS gpuAddress = 0; + bool dirty = false; + }; + struct TextureBinding { + D3D12_GPU_DESCRIPTOR_HANDLE srvTableStart = {}; + D3D12_GPU_DESCRIPTOR_HANDLE samplerTableStart = {}; + bool dirty = false; + }; + static constexpr unsigned MaxUniformBindings = 8; + static constexpr unsigned MaxTextureBindings = 32; + UniformBinding uniformBindings[MaxUniformBindings] = {}; + TextureBinding textureBindings[MaxTextureBindings] = {}; + + std::shared_ptr currentPipeline = nullptr; + D3D12_PRIMITIVE_TOPOLOGY currentTopology = D3D_PRIMITIVE_TOPOLOGY_UNDEFINED; + bool primitiveTopologySet = false; + + // Color attachments retained for state-restore at onEnd() time. + std::vector> colorAttachments; + // Per-color-attachment MSAA resolve target. Index N corresponds to colorAttachments[N]; an + // entry is nullptr when the matching color attachment has no resolve texture. At onEnd() time + // colorAttachments[N] (source) is resolved into resolveTextures[N] (destination) with + // ResolveSubresource for every non-null pair, mirroring Vulkan's pResolveAttachments behaviour. + std::vector> resolveTextures; + std::shared_ptr depthStencilAttachment; + // Textures that were transitioned to PIXEL_SHADER_RESOURCE inside this pass via setTexture(). + // We keep them tracked so that onEnd() can transition them back to COMMON, avoiding mismatches + // with D3D12's automatic state-decay rules between ExecuteCommandLists calls. + std::vector> shaderResourceTextures; +}; + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12RenderPipeline.cpp b/src/gpu/d3d12/D3D12RenderPipeline.cpp new file mode 100644 index 000000000..cd42ef642 --- /dev/null +++ b/src/gpu/d3d12/D3D12RenderPipeline.cpp @@ -0,0 +1,482 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved.
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12RenderPipeline.h" +#include +#include +#include "D3D12GPU.h" +#include "D3D12ShaderModule.h" +#include "core/utils/Log.h" +#include "gpu/UniformData.h" +#include "tgfx/gpu/ColorWriteMask.h" +#include "tgfx/gpu/ShaderVisibility.h" + +namespace tgfx { + +// Map a TGFX ShaderVisibility bitmask to the D3D12 single-stage enum used by root parameters. +// D3D12 only allows a single visibility per root parameter; combinations fall back to ALL. 
+static D3D12_SHADER_VISIBILITY ToD3D12ShaderVisibility(uint32_t visibility) { + if (visibility == ShaderVisibility::Vertex) { + return D3D12_SHADER_VISIBILITY_VERTEX; + } + if (visibility == ShaderVisibility::Fragment) { + return D3D12_SHADER_VISIBILITY_PIXEL; + } + return D3D12_SHADER_VISIBILITY_ALL; +} + +static UINT8 ToD3D12RenderTargetWriteMask(uint32_t mask) { + UINT8 result = 0; + if (mask & ColorWriteMask::RED) result |= D3D12_COLOR_WRITE_ENABLE_RED; + if (mask & ColorWriteMask::GREEN) result |= D3D12_COLOR_WRITE_ENABLE_GREEN; + if (mask & ColorWriteMask::BLUE) result |= D3D12_COLOR_WRITE_ENABLE_BLUE; + if (mask & ColorWriteMask::ALPHA) result |= D3D12_COLOR_WRITE_ENABLE_ALPHA; + return result; +} + +// True when the descriptor declares any non-default stencil state, matching the same predicate +// used by the Vulkan backend so that pipeline state is consistent across backends. +static bool HasNonTrivialStencilState(const DepthStencilDescriptor& ds) { + return ds.stencilFront.compare != CompareFunction::Always || + ds.stencilBack.compare != CompareFunction::Always || + ds.stencilFront.failOp != StencilOperation::Keep || + ds.stencilFront.passOp != StencilOperation::Keep || + ds.stencilFront.depthFailOp != StencilOperation::Keep || + ds.stencilBack.failOp != StencilOperation::Keep || + ds.stencilBack.passOp != StencilOperation::Keep || + ds.stencilBack.depthFailOp != StencilOperation::Keep; +} + +std::shared_ptr D3D12RenderPipeline::Make( + D3D12GPU* gpu, const RenderPipelineDescriptor& descriptor) { + if (gpu == nullptr) { + return nullptr; + } + auto pipeline = gpu->makeResource(gpu, descriptor); + if (pipeline->pipelineState == nullptr) { + return nullptr; + } + return pipeline; +} + +D3D12RenderPipeline::D3D12RenderPipeline(D3D12GPU* gpu, + const RenderPipelineDescriptor& descriptor) { + if (!createRootSignature(gpu, descriptor)) { + return; + } + if (!createPipelineState(gpu, descriptor)) { + return; + } +} + +void 
D3D12RenderPipeline::onRelease(D3D12GPU*) { + pipelineState = nullptr; + rootSignature = nullptr; +} + +uint32_t D3D12RenderPipeline::getUniformRootParameterIndex(unsigned binding) const { + auto it = uniformRootParameterIndex.find(binding); + return it != uniformRootParameterIndex.end() ? it->second : UINT32_MAX; +} + +uint32_t D3D12RenderPipeline::getTextureRootParameterIndex(unsigned binding) const { + auto it = textureRootParameterIndex.find(binding); + return it != textureRootParameterIndex.end() ? it->second : UINT32_MAX; +} + +uint32_t D3D12RenderPipeline::getSamplerRootParameterIndex(unsigned binding) const { + auto it = samplerRootParameterIndex.find(binding); + return it != samplerRootParameterIndex.end() ? it->second : UINT32_MAX; +} + +unsigned D3D12RenderPipeline::getTextureIndex(unsigned binding) const { + auto it = textureUnits.find(binding); + return it != textureUnits.end() ? it->second : binding; +} + +uint32_t D3D12RenderPipeline::getUniformBlockVisibility(unsigned binding) const { + auto it = uniformBlockVisibility.find(binding); + return it != uniformBlockVisibility.end() ? it->second : ShaderVisibility::VertexFragment; +} + +bool D3D12RenderPipeline::createRootSignature(D3D12GPU* gpu, + const RenderPipelineDescriptor& descriptor) { + // First, populate the per-binding index maps — those are needed for every pipeline regardless + // of whether the underlying ID3D12RootSignature is cached. Walk uniform blocks first, then + // texture samplers, so the parameter indices line up with the order used when serialising. + std::vector shapeKey; + // Reserve roughly: 1 byte UBO count + 4 bytes per UBO (2 visibility + 1 vertex register + + // 1 fragment register) + 1 byte sampler count + 2 bytes per sampler (visibility). 
+ shapeKey.reserve(2 + descriptor.layout.uniformBlocks.size() * 4 + + descriptor.layout.textureSamplers.size() * 2); + + // Pre-scan uniform blocks to compute, for every entry, its 0-based register index inside the + // vertex and fragment stages. SPIR-V binding K is mapped to HLSL register b{idx} where idx is + // the entry's position among same-stage entries in BindingLayout.uniformBlocks. Two entries + // visible to the same stage must therefore yield different register indices, and the indices + // must agree with what D3D12ShaderModule produces when it walks the SPIR-V resources of that + // stage in declaration order. + std::vector ubVertexRegister(descriptor.layout.uniformBlocks.size(), 0xFF); + std::vector ubFragmentRegister(descriptor.layout.uniformBlocks.size(), 0xFF); + uint32_t nextVertexRegister = 0; + uint32_t nextFragmentRegister = 0; + for (size_t i = 0; i < descriptor.layout.uniformBlocks.size(); i++) { + const auto& entry = descriptor.layout.uniformBlocks[i]; + if (entry.visibility & ShaderVisibility::Vertex) { + ubVertexRegister[i] = static_cast(nextVertexRegister++); + } + if (entry.visibility & ShaderVisibility::Fragment) { + ubFragmentRegister[i] = static_cast(nextFragmentRegister++); + } + } + + uint32_t paramCursor = 0; + shapeKey.push_back(static_cast(descriptor.layout.uniformBlocks.size())); + for (size_t i = 0; i < descriptor.layout.uniformBlocks.size(); i++) { + const auto& entry = descriptor.layout.uniformBlocks[i]; + uniformRootParameterIndex[entry.binding] = paramCursor++; + uniformBlockVisibility[entry.binding] = entry.visibility; + uniformBindingSet.insert(entry.binding); + // Encode visibility plus per-stage register indices in the shape key. Different stage-local + // register layouts must hit different cached root signatures; otherwise a pipeline whose + // fragment UBO ends up at b1 (because it has a sibling at b0) would reuse another + // pipeline's root signature that still places it at b0. 
+ shapeKey.push_back(static_cast(entry.visibility & 0xFF)); + shapeKey.push_back(static_cast((entry.visibility >> 8) & 0xFF)); + shapeKey.push_back(ubVertexRegister[i]); + shapeKey.push_back(ubFragmentRegister[i]); + } + + shapeKey.push_back(static_cast(descriptor.layout.textureSamplers.size())); + unsigned textureUnit = 0; + for (const auto& entry : descriptor.layout.textureSamplers) { + uint32_t srvParamIndex = paramCursor++; + uint32_t samplerParamIndex = paramCursor++; + textureRootParameterIndex[entry.binding] = srvParamIndex; + samplerRootParameterIndex[entry.binding] = samplerParamIndex; + textureUnits[entry.binding] = textureUnit++; + textureBindingSet.insert(entry.binding); + // Encode each sampler binding's visibility into the shape key. Without this two pipelines + // that differ only in vertex/fragment-only sampler visibility would collide on the cached + // root signature once the SRV/Sampler root parameters below honour entry.visibility. + shapeKey.push_back(static_cast(entry.visibility & 0xFF)); + shapeKey.push_back(static_cast((entry.visibility >> 8) & 0xFF)); + } + + // Cache hit: reuse the existing D3D12 root signature object. Different pipelines sharing the + // same binding shape (e.g. all single-texture fragment-only shaders) end up referencing one + // ID3D12RootSignature, saving SerializeRootSignature + CreateRootSignature on every PSO. + if (auto cached = gpu->findRootSignature(shapeKey); cached != nullptr) { + rootSignature = std::move(cached); + return true; + } + + // Cache miss: build the root signature description from scratch and serialise it. + std::vector rootParameters; + // Each texture binding contributes two descriptor tables (SRV + Sampler) that live in different + // descriptor heap types. They cannot share one D3D12_ROOT_PARAMETER because each table can only + // reference a single heap. 
We therefore store each range in its own array entry; the + // D3D12_ROOT_PARAMETER references the array by pointer, so the storage must outlive the + // SerializeRootSignature call. reserve() keeps pointers stable across emplace_back(). + std::vector srvRanges; + std::vector samplerRanges; + srvRanges.reserve(descriptor.layout.textureSamplers.size()); + samplerRanges.reserve(descriptor.layout.textureSamplers.size()); + + for (size_t i = 0; i < descriptor.layout.uniformBlocks.size(); i++) { + const auto& entry = descriptor.layout.uniformBlocks[i]; + D3D12_ROOT_PARAMETER param = {}; + param.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV; + param.ShaderVisibility = ToD3D12ShaderVisibility(entry.visibility); + // ShaderRegister is per-stage in HLSL. Pick the register index from whichever stage the + // entry is visible to; for VertexFragment-visible UBOs the two stage-local indices must + // match, otherwise the single CBV root parameter cannot satisfy both stages with one + // register number. Such a configuration is rejected here so the mismatch surfaces early + // instead of producing silently broken bindings. + uint8_t vsReg = ubVertexRegister[i]; + uint8_t fsReg = ubFragmentRegister[i]; + if (vsReg != 0xFF && fsReg != 0xFF && vsReg != fsReg) { + LOGE( + "D3D12RenderPipeline: VertexFragment-visible UBO binding %u cannot share a single CBV " + "root parameter when its vertex-stage register (b%u) and fragment-stage register (b%u) " + "differ. Either split it into vertex-only and fragment-only entries, or extend the " + "root signature to emit two CBV root parameters for this binding.", + entry.binding, static_cast(vsReg), static_cast(fsReg)); + return false; + } + param.Descriptor.ShaderRegister = (vsReg != 0xFF) ? 
vsReg : fsReg; + param.Descriptor.RegisterSpace = 0; + rootParameters.push_back(param); + } + + unsigned rangeRegister = 0; + for (const auto& entry : descriptor.layout.textureSamplers) { + auto& srvRange = srvRanges.emplace_back(); + srvRange = {}; + srvRange.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV; + srvRange.NumDescriptors = 1; + srvRange.BaseShaderRegister = rangeRegister; + srvRange.RegisterSpace = 0; + srvRange.OffsetInDescriptorsFromTableStart = 0; + + D3D12_ROOT_PARAMETER srvParam = {}; + srvParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; + // Honour the caller-declared visibility instead of forcing pixel-only. Vertex texture + // sampling (noise / displacement / geometry LOD lookups) needs SRVs visible to the vertex + // stage; the per-entry shapeKey above already partitions the cache so different visibility + // shapes do not collide. + srvParam.ShaderVisibility = ToD3D12ShaderVisibility(entry.visibility); + srvParam.DescriptorTable.NumDescriptorRanges = 1; + srvParam.DescriptorTable.pDescriptorRanges = &srvRange; + rootParameters.push_back(srvParam); + + auto& samplerRange = samplerRanges.emplace_back(); + samplerRange = {}; + samplerRange.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER; + samplerRange.NumDescriptors = 1; + samplerRange.BaseShaderRegister = rangeRegister; + samplerRange.RegisterSpace = 0; + samplerRange.OffsetInDescriptorsFromTableStart = 0; + + D3D12_ROOT_PARAMETER samplerParam = {}; + samplerParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE; + samplerParam.ShaderVisibility = ToD3D12ShaderVisibility(entry.visibility); + samplerParam.DescriptorTable.NumDescriptorRanges = 1; + samplerParam.DescriptorTable.pDescriptorRanges = &samplerRange; + rootParameters.push_back(samplerParam); + + rangeRegister++; + } + + D3D12_ROOT_SIGNATURE_DESC rootSigDesc = {}; + rootSigDesc.NumParameters = static_cast(rootParameters.size()); + rootSigDesc.pParameters = rootParameters.empty() ? 
nullptr : rootParameters.data(); + rootSigDesc.NumStaticSamplers = 0; + rootSigDesc.pStaticSamplers = nullptr; + rootSigDesc.Flags = D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT; + + ComPtr blob = nullptr; + ComPtr errorBlob = nullptr; + auto hr = + D3D12SerializeRootSignature(&rootSigDesc, D3D_ROOT_SIGNATURE_VERSION_1, &blob, &errorBlob); + if (FAILED(hr)) { + if (errorBlob != nullptr) { + LOGE("D3D12RenderPipeline: D3D12SerializeRootSignature failed (HRESULT=0x%08X): %s", + static_cast(hr), static_cast(errorBlob->GetBufferPointer())); + } else { + LOGE("D3D12RenderPipeline: D3D12SerializeRootSignature failed (HRESULT=0x%08X).", + static_cast(hr)); + } + return false; + } + + hr = gpu->device()->CreateRootSignature(0, blob->GetBufferPointer(), blob->GetBufferSize(), + IID_PPV_ARGS(&rootSignature)); + if (FAILED(hr)) { + LOGE("D3D12RenderPipeline: CreateRootSignature failed (HRESULT=0x%08X).", + static_cast(hr)); + rootSignature = nullptr; + return false; + } + // Publish the freshly-built root signature so subsequent pipelines with the same shape hit + // the cache. The map keeps an additional ComPtr reference; the pipeline retains its own + // reference via the rootSignature member, so the object outlives whichever owner is dropped + // first. 
+ gpu->cacheRootSignature(std::move(shapeKey), rootSignature); + return true; +} + +bool D3D12RenderPipeline::createPipelineState(D3D12GPU* gpu, + const RenderPipelineDescriptor& descriptor) { + if (!descriptor.vertex.module || !descriptor.fragment.module) { + LOGE("D3D12RenderPipeline: vertex or fragment shader module is missing."); + return false; + } + auto vertexShader = std::static_pointer_cast(descriptor.vertex.module); + auto fragmentShader = std::static_pointer_cast(descriptor.fragment.module); + auto vsBytecode = vertexShader->shaderBytecode(); + auto psBytecode = fragmentShader->shaderBytecode(); + if (vsBytecode.pShaderBytecode == nullptr || psBytecode.pShaderBytecode == nullptr) { + LOGE("D3D12RenderPipeline: shader module produced empty bytecode."); + return false; + } + + // Vertex input layout. Semantic names match the SPIRV-Cross HLSL convention of TEXCOORD{N}, + // where N is the SPIR-V input location assigned by ShaderCompiler::PreprocessGLSL(). + std::vector inputElements; + uint32_t globalLocation = 0; + vertexStrides.assign(descriptor.vertex.bufferLayouts.size(), 0); + for (uint32_t i = 0; i < static_cast(descriptor.vertex.bufferLayouts.size()); i++) { + const auto& layout = descriptor.vertex.bufferLayouts[i]; + uint32_t offset = 0; + for (const auto& attr : layout.attributes) { + D3D12_INPUT_ELEMENT_DESC element = {}; + element.SemanticName = "TEXCOORD"; + element.SemanticIndex = globalLocation++; + element.Format = ToD3D12VertexFormat(attr.format()); + element.InputSlot = i; + element.AlignedByteOffset = offset; + element.InputSlotClass = (layout.stepMode == VertexStepMode::Instance) + ? D3D12_INPUT_CLASSIFICATION_PER_INSTANCE_DATA + : D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA; + element.InstanceDataStepRate = (layout.stepMode == VertexStepMode::Instance) ? 
1 : 0; + inputElements.push_back(element); + offset += static_cast(attr.size()); + } + // Fall back to the computed attribute total when the descriptor leaves stride at zero, which + // is the same convention the Vulkan/Metal backends use. + vertexStrides[i] = static_cast(layout.stride > 0 ? layout.stride : offset); + } + + D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {}; + psoDesc.pRootSignature = rootSignature.Get(); + psoDesc.VS = vsBytecode; + psoDesc.PS = psBytecode; + + // Blend state — one entry per color attachment. + psoDesc.BlendState.AlphaToCoverageEnable = + descriptor.multisample.alphaToCoverageEnabled ? TRUE : FALSE; + psoDesc.BlendState.IndependentBlendEnable = + (descriptor.fragment.colorAttachments.size() > 1) ? TRUE : FALSE; + for (size_t i = 0; i < descriptor.fragment.colorAttachments.size() && + i < D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT; + i++) { + const auto& ca = descriptor.fragment.colorAttachments[i]; + auto& rt = psoDesc.BlendState.RenderTarget[i]; + rt.BlendEnable = ca.blendEnable ? TRUE : FALSE; + rt.LogicOpEnable = FALSE; + rt.SrcBlend = ToD3D12BlendFactor(ca.srcColorBlendFactor); + rt.DestBlend = ToD3D12BlendFactor(ca.dstColorBlendFactor); + rt.BlendOp = ToD3D12BlendOperation(ca.colorBlendOp); + rt.SrcBlendAlpha = ToD3D12BlendFactorAlpha(ca.srcAlphaBlendFactor); + rt.DestBlendAlpha = ToD3D12BlendFactorAlpha(ca.dstAlphaBlendFactor); + rt.BlendOpAlpha = ToD3D12BlendOperation(ca.alphaBlendOp); + rt.LogicOp = D3D12_LOGIC_OP_NOOP; + rt.RenderTargetWriteMask = ToD3D12RenderTargetWriteMask(ca.colorWriteMask); + } + + psoDesc.SampleMask = descriptor.multisample.mask; + + // Rasterizer. + psoDesc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID; + psoDesc.RasterizerState.CullMode = ToD3D12CullMode(descriptor.primitive.cullMode); + psoDesc.RasterizerState.FrontCounterClockwise = + ToD3D12FrontCounterClockwise(descriptor.primitive.frontFace) ? 
TRUE : FALSE; + psoDesc.RasterizerState.DepthBias = D3D12_DEFAULT_DEPTH_BIAS; + psoDesc.RasterizerState.DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP; + psoDesc.RasterizerState.SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS; + psoDesc.RasterizerState.DepthClipEnable = TRUE; + psoDesc.RasterizerState.MultisampleEnable = (descriptor.multisample.count > 1) ? TRUE : FALSE; + psoDesc.RasterizerState.AntialiasedLineEnable = FALSE; + psoDesc.RasterizerState.ForcedSampleCount = 0; + psoDesc.RasterizerState.ConservativeRaster = D3D12_CONSERVATIVE_RASTERIZATION_MODE_OFF; + + // Depth-stencil. Depth test follows the same enable predicate as VulkanRenderPipeline. + bool depthTestEnable = (descriptor.depthStencil.depthCompare != CompareFunction::Always) || + descriptor.depthStencil.depthWriteEnabled; + psoDesc.DepthStencilState.DepthEnable = depthTestEnable ? TRUE : FALSE; + psoDesc.DepthStencilState.DepthWriteMask = descriptor.depthStencil.depthWriteEnabled + ? D3D12_DEPTH_WRITE_MASK_ALL + : D3D12_DEPTH_WRITE_MASK_ZERO; + psoDesc.DepthStencilState.DepthFunc = + ToD3D12CompareFunction(descriptor.depthStencil.depthCompare); + psoDesc.DepthStencilState.StencilEnable = + HasNonTrivialStencilState(descriptor.depthStencil) ? 
TRUE : FALSE; + psoDesc.DepthStencilState.StencilReadMask = + static_cast(descriptor.depthStencil.stencilReadMask); + psoDesc.DepthStencilState.StencilWriteMask = + static_cast(descriptor.depthStencil.stencilWriteMask); + psoDesc.DepthStencilState.FrontFace.StencilFailOp = + ToD3D12StencilOperation(descriptor.depthStencil.stencilFront.failOp); + psoDesc.DepthStencilState.FrontFace.StencilDepthFailOp = + ToD3D12StencilOperation(descriptor.depthStencil.stencilFront.depthFailOp); + psoDesc.DepthStencilState.FrontFace.StencilPassOp = + ToD3D12StencilOperation(descriptor.depthStencil.stencilFront.passOp); + psoDesc.DepthStencilState.FrontFace.StencilFunc = + ToD3D12CompareFunction(descriptor.depthStencil.stencilFront.compare); + psoDesc.DepthStencilState.BackFace.StencilFailOp = + ToD3D12StencilOperation(descriptor.depthStencil.stencilBack.failOp); + psoDesc.DepthStencilState.BackFace.StencilDepthFailOp = + ToD3D12StencilOperation(descriptor.depthStencil.stencilBack.depthFailOp); + psoDesc.DepthStencilState.BackFace.StencilPassOp = + ToD3D12StencilOperation(descriptor.depthStencil.stencilBack.passOp); + psoDesc.DepthStencilState.BackFace.StencilFunc = + ToD3D12CompareFunction(descriptor.depthStencil.stencilBack.compare); + + psoDesc.InputLayout.pInputElementDescs = inputElements.empty() ? nullptr : inputElements.data(); + psoDesc.InputLayout.NumElements = static_cast(inputElements.size()); + // Strip cut and topology type live on the PSO in D3D12, but tgfx exposes IndexFormat and + // PrimitiveType as per-draw-call state (RenderPass::setIndexBuffer / RenderPass::draw) + // rather than fields on RenderPipelineDescriptor. The two values below therefore must be + // chosen at PSO creation time without knowing what the eventual draws look like, so we hard + // code them to the only combination tgfx ever uses: + // * IBStripCutValue=DISABLED — matches Vulkan, which sets primitiveRestartEnable=false on + // its PSOs. 
No tgfx draw op relies on 0xFFFF/0xFFFFFFFF restarting a strip. + // * PrimitiveTopologyType=TRIANGLE — tgfx's PrimitiveType only carries Triangles and + // TriangleStrip today and ToD3D12PrimitiveTopologyType already collapses both onto + // TRIANGLE. Once tgfx adds LINE/POINT (or moves these fields onto PrimitiveDescriptor) + // this branch must be revisited together with the matching IASetPrimitiveTopology call + // in D3D12RenderPass; until then a single PSO topology type covers every draw call. + psoDesc.IBStripCutValue = D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_DISABLED; + psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + psoDesc.NumRenderTargets = static_cast(descriptor.fragment.colorAttachments.size()); + for (size_t i = 0; i < descriptor.fragment.colorAttachments.size() && + i < D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT; + i++) { + psoDesc.RTVFormats[i] = static_cast( + gpu->getDXGIFormat(descriptor.fragment.colorAttachments[i].format)); + } + psoDesc.DSVFormat = + (descriptor.depthStencil.format != PixelFormat::Unknown) + ? static_cast(gpu->getDXGIFormat(descriptor.depthStencil.format)) + : static_cast(DXGI_FORMAT_UNKNOWN); + psoDesc.SampleDesc.Count = static_cast(descriptor.multisample.count); + psoDesc.SampleDesc.Quality = 0; + psoDesc.NodeMask = 0; + psoDesc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE; + + auto hr = gpu->device()->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&pipelineState)); + if (FAILED(hr)) { + LOGE("D3D12RenderPipeline: CreateGraphicsPipelineState failed (HRESULT=0x%08X).", + static_cast(hr)); +#ifdef TGFX_D3D12_DEBUG_LAYER + // Surface debug-layer messages so the underlying validation issue is visible. These are + // queued by the runtime when EnableDebugLayer was called before device creation. 
+ ComPtr infoQueue = nullptr; + if (SUCCEEDED(gpu->device()->QueryInterface(IID_PPV_ARGS(&infoQueue)))) { + auto count = infoQueue->GetNumStoredMessages(); + for (UINT64 i = 0; i < count; i++) { + SIZE_T msgLen = 0; + infoQueue->GetMessage(i, nullptr, &msgLen); + std::vector buf(msgLen); + auto* msg = reinterpret_cast(buf.data()); + if (SUCCEEDED(infoQueue->GetMessage(i, msg, &msgLen))) { + LOGE(" D3D12 message: %.*s", static_cast(msg->DescriptionByteLength), + msg->pDescription); + } + } + infoQueue->ClearStoredMessages(); + } +#endif + pipelineState = nullptr; + return false; + } + return true; +} + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12RenderPipeline.h b/src/gpu/d3d12/D3D12RenderPipeline.h new file mode 100644 index 000000000..f1df24633 --- /dev/null +++ b/src/gpu/d3d12/D3D12RenderPipeline.h @@ -0,0 +1,146 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include +#include "D3D12Resource.h" +#include "D3D12Util.h" +#include "tgfx/gpu/RenderPipeline.h" + +namespace tgfx { + +class D3D12GPU; + +/** + * D3D12 render pipeline implementation. 
Owns three D3D12 objects produced from a single + * RenderPipelineDescriptor: + * + * 1. Root signature — equivalent to Vulkan's pipeline layout + descriptor set layout. Lays + * out where uniform buffers, textures, and samplers bind in the root + * argument table consumed by the command list. + * 2. Pipeline state object (PSO) — fixed-function + shader configuration; bound once via + * SetPipelineState() at the start of a draw sequence. + * 3. Binding metadata — lookup tables that translate user-facing binding numbers (the same + * numbers passed by GLSL programs) to root-parameter indices and texture + * unit ordinals consumed by D3D12RenderPass. + * + * Root signature layout produced for every pipeline (matches the SPIR-V -> HLSL register + * convention used by D3D12ShaderModule): + * + * root parameter 0 : CBV (b0, visibility = Vertex) [VertexUniformBlock, optional] + * root parameter 1 : CBV (b0, visibility = Pixel) [FragmentUniformBlock, optional] + * root parameter 2..N+1 : DescriptorTable {SRV t{i}, Sampler s{i}, visibility = Pixel} + * [one per texture sampler binding] + * + * UBO root parameters are CBVs with raw GPU virtual addresses, allowing the command queue to + * dynamically supply per-draw uniform data without re-allocating descriptor heaps. + */ +class D3D12RenderPipeline : public RenderPipeline, public D3D12Resource { + public: + static std::shared_ptr Make(D3D12GPU* gpu, + const RenderPipelineDescriptor& descriptor); + + ID3D12RootSignature* d3d12RootSignature() const { + return rootSignature.Get(); + } + + ID3D12PipelineState* d3d12PipelineState() const { + return pipelineState.Get(); + } + + /** + * Returns the root-parameter index that holds the CBV for the given uniform-block binding, + * or UINT32_MAX if the binding is not present in the pipeline. 
+ */ + uint32_t getUniformRootParameterIndex(unsigned binding) const; + + /** + * Returns the root-parameter index of the descriptor table holding the SRV for the given + * texture-sampler binding, or UINT32_MAX if the binding is not present. The Sampler descriptor + * table for the same binding is stored at the next consecutive root parameter and can be + * obtained with getSamplerRootParameterIndex(). + */ + uint32_t getTextureRootParameterIndex(unsigned binding) const; + + /** + * Returns the root-parameter index of the descriptor table holding the Sampler for the given + * texture-sampler binding, or UINT32_MAX if the binding is not present. + */ + uint32_t getSamplerRootParameterIndex(unsigned binding) const; + + /** + * Returns the dense 0-based texture unit index for a texture-sampler binding. Mirrors the + * VulkanRenderPipeline accessor used by RenderPass to map binding -> shader register. + */ + unsigned getTextureIndex(unsigned binding) const; + + /** + * Returns the visibility bitmask (ShaderVisibility::*) declared by the user for a uniform-block + * binding, or ShaderVisibility::VertexFragment if unspecified. + */ + uint32_t getUniformBlockVisibility(unsigned binding) const; + + /** + * Returns the byte stride for the vertex buffer slot at the given index, as declared by the + * pipeline's VertexBufferLayout. Returns 0 for slots that the pipeline does not consume. + */ + uint32_t getVertexStride(unsigned slot) const { + return slot < vertexStrides.size() ? 
vertexStrides[slot] : 0; + } + + bool hasUniformBinding(unsigned binding) const { + return uniformBindingSet.count(binding) > 0; + } + + bool hasTextureBinding(unsigned binding) const { + return textureBindingSet.count(binding) > 0; + } + + const std::unordered_set& getTextureBindings() const { + return textureBindingSet; + } + + protected: + void onRelease(D3D12GPU* gpu) override; + + private: + D3D12RenderPipeline(D3D12GPU* gpu, const RenderPipelineDescriptor& descriptor); + ~D3D12RenderPipeline() override = default; + + bool createRootSignature(D3D12GPU* gpu, const RenderPipelineDescriptor& descriptor); + bool createPipelineState(D3D12GPU* gpu, const RenderPipelineDescriptor& descriptor); + + ComPtr rootSignature = nullptr; + ComPtr pipelineState = nullptr; + + std::unordered_map uniformRootParameterIndex = {}; + std::unordered_map textureRootParameterIndex = {}; + std::unordered_map samplerRootParameterIndex = {}; + std::unordered_map textureUnits = {}; + std::unordered_map uniformBlockVisibility = {}; + std::unordered_set uniformBindingSet = {}; + std::unordered_set textureBindingSet = {}; + std::vector vertexStrides = {}; + + friend class D3D12GPU; +}; + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Resource.cpp b/src/gpu/d3d12/D3D12Resource.cpp new file mode 100644 index 000000000..b33e6236b --- /dev/null +++ b/src/gpu/d3d12/D3D12Resource.cpp @@ -0,0 +1,21 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12Resource.h" + +namespace tgfx {} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Resource.h b/src/gpu/d3d12/D3D12Resource.h new file mode 100644 index 000000000..707abf10f --- /dev/null +++ b/src/gpu/d3d12/D3D12Resource.h @@ -0,0 +1,47 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include "core/utils/ReturnQueue.h" + +namespace tgfx { + +class D3D12GPU; + +/** + * Base class for D3D12 GPU resources. Subclasses must implement the onRelease() method to free all + * underlying GPU resources. 
No D3D12 API calls should be made during destruction since the resource + * may be destroyed on any thread. + */ +class D3D12Resource : public ReturnNode { + protected: + /** + * Overridden to free the underlying D3D12 resources. After calling this method, the D3D12Resource + * must not be used, as doing so may lead to undefined behavior. + */ + virtual void onRelease(D3D12GPU* gpu) = 0; + + private: + std::list::iterator cachedPosition; + + friend class D3D12GPU; +}; + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Sampler.cpp b/src/gpu/d3d12/D3D12Sampler.cpp new file mode 100644 index 000000000..0391ffa98 --- /dev/null +++ b/src/gpu/d3d12/D3D12Sampler.cpp @@ -0,0 +1,78 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. 
+// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12Sampler.h" +#include "D3D12GPU.h" + +namespace tgfx { + +std::shared_ptr D3D12Sampler::Make(D3D12GPU* gpu, + const SamplerDescriptor& descriptor) { + if (gpu == nullptr) { + return nullptr; + } + + D3D12_SAMPLER_DESC samplerDesc = {}; + samplerDesc.Filter = + ToD3D12Filter(descriptor.minFilter, descriptor.magFilter, descriptor.mipmapMode); + samplerDesc.AddressU = ToD3D12AddressMode(descriptor.addressModeX); + samplerDesc.AddressV = ToD3D12AddressMode(descriptor.addressModeY); + samplerDesc.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP; + samplerDesc.MipLODBias = 0.0f; + samplerDesc.MaxAnisotropy = 1; + samplerDesc.ComparisonFunc = D3D12_COMPARISON_FUNC_NEVER; + // tgfx's public SamplerDescriptor does not expose a border colour today, so all three GPU + // backends hardcode transparent black: VulkanSampler picks VK_BORDER_COLOR_FLOAT_TRANSPARENT_ + // BLACK and MetalSampler picks MTLSamplerBorderColorTransparentBlack. If a borderColor field + // is ever added to SamplerDescriptor, this branch must thread it through and D3D12GPU:: + // MakeSamplerKey must include it in the cache key (otherwise two samplers differing only in + // border colour would collide in samplerCache). Keep the three backends in sync. + samplerDesc.BorderColor[0] = 0.0f; + samplerDesc.BorderColor[1] = 0.0f; + samplerDesc.BorderColor[2] = 0.0f; + samplerDesc.BorderColor[3] = 0.0f; + samplerDesc.MinLOD = 0.0f; + // When mipmap is disabled, clamp MaxLOD to 0 so the hardware always samples mip 0. Picking a + // D3D12_FILTER_*_MIP_POINT alone is not enough: the driver still walks the mip chain and may + // pick a smaller level (the "mipmap-disabled" filters in D3D12 only describe the filter shape + // used between mips, not whether mip selection happens). 
Mirror VulkanSampler's maxLod clamp + // so a SamplerDescriptor with mipmapMode=None produces the same result that + // RenderContext::drawImageRect (and other Strict-constraint paths) intend. + samplerDesc.MaxLOD = (descriptor.mipmapMode == MipmapMode::None) ? 0.0f : D3D12_FLOAT32_MAX; + + // Permanently reserve a slot in the process-wide shader-visible Sampler heap and write the + // descriptor there. The slot lives for the rest of the GPU's lifetime, mirroring the cache + // semantics already enforced by D3D12GPU::createSampler. + auto gpuHandle = gpu->allocatePermanentSamplerSlot(samplerDesc); + if (gpuHandle.ptr == 0) { + return nullptr; + } + + return gpu->makeResource(samplerDesc, gpuHandle); +} + +D3D12Sampler::D3D12Sampler(const D3D12_SAMPLER_DESC& samplerDesc, + D3D12_GPU_DESCRIPTOR_HANDLE gpuHandle) + : _samplerDesc(samplerDesc), _gpuHandle(gpuHandle) { +} + +void D3D12Sampler::onRelease(D3D12GPU*) { + // D3D12 samplers are pure descriptors. No GPU resource to release. +} + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Sampler.h b/src/gpu/d3d12/D3D12Sampler.h new file mode 100644 index 000000000..33f06c7ab --- /dev/null +++ b/src/gpu/d3d12/D3D12Sampler.h @@ -0,0 +1,69 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. 
see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include "D3D12Resource.h" +#include "D3D12Util.h" +#include "tgfx/gpu/Sampler.h" + +namespace tgfx { + +class D3D12GPU; + +/** + * D3D12 sampler implementation. + * + * The sampler descriptor is written into the GPU's process-wide shader-visible Sampler heap at + * construction time and the resulting GPU descriptor handle is cached on the instance. That + * handle is what render passes bind via SetGraphicsRootDescriptorTable: there is no per-pass + * Sampler heap allocation or per-binding CreateSampler call. + */ +class D3D12Sampler : public Sampler, public D3D12Resource { + public: + static std::shared_ptr Make(D3D12GPU* gpu, const SamplerDescriptor& descriptor); + + /** + * Returns the D3D12 sampler description. + */ + const D3D12_SAMPLER_DESC& samplerDesc() const { + return _samplerDesc; + } + + /** + * GPU descriptor handle pointing at this sampler's slot in the process-wide shader-visible + * Sampler heap. Stable for the lifetime of the GPU instance. 
+ */ + D3D12_GPU_DESCRIPTOR_HANDLE gpuDescriptorHandle() const { + return _gpuHandle; + } + + protected: + void onRelease(D3D12GPU* gpu) override; + + private: + D3D12Sampler(const D3D12_SAMPLER_DESC& samplerDesc, D3D12_GPU_DESCRIPTOR_HANDLE gpuHandle); + ~D3D12Sampler() override = default; + + D3D12_SAMPLER_DESC _samplerDesc = {}; + D3D12_GPU_DESCRIPTOR_HANDLE _gpuHandle = {}; + + friend class D3D12GPU; +}; + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Semaphore.cpp b/src/gpu/d3d12/D3D12Semaphore.cpp new file mode 100644 index 000000000..a134fffce --- /dev/null +++ b/src/gpu/d3d12/D3D12Semaphore.cpp @@ -0,0 +1,66 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. 
+// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12Semaphore.h" +#include "D3D12GPU.h" +#include "core/utils/Log.h" + +namespace tgfx { + +std::shared_ptr D3D12Semaphore::Make(D3D12GPU* gpu) { + if (gpu == nullptr) { + return nullptr; + } + ComPtr fence = nullptr; + auto hr = gpu->device()->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&fence)); + if (FAILED(hr)) { + LOGE("D3D12Semaphore::Make() CreateFence failed: HRESULT=0x%08X", static_cast(hr)); + return nullptr; + } + return gpu->makeResource(std::move(fence), static_cast(0), false); +} + +std::shared_ptr D3D12Semaphore::MakeFrom(D3D12GPU* gpu, ComPtr fence, + uint64_t value) { + if (gpu == nullptr || fence == nullptr) { + return nullptr; + } + return gpu->makeResource(std::move(fence), value, true); +} + +D3D12Semaphore::D3D12Semaphore(ComPtr fence, uint64_t value, bool adopted) + : _fence(std::move(fence)), _value(value), _adopted(adopted) { +} + +BackendSemaphore D3D12Semaphore::getBackendSemaphore() const { + if (_fence == nullptr) { + return {}; + } + D3D12SyncInfo info = {}; + info.fence = _fence.Get(); + info.value = _value; + return BackendSemaphore(info); +} + +void D3D12Semaphore::onRelease(D3D12GPU*) { + if (!_adopted) { + _fence = nullptr; + } +} + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Semaphore.h b/src/gpu/d3d12/D3D12Semaphore.h new file mode 100644 index 000000000..2174583c5 --- /dev/null +++ b/src/gpu/d3d12/D3D12Semaphore.h @@ -0,0 +1,73 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// unless required by applicable law or agreed to in writing, software distributed under the
+// license is distributed on an "as is" basis, without warranties or conditions of any kind,
+// either express or implied. see the license for the specific language governing permissions
+// and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "D3D12Resource.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/Semaphore.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * D3D12 semaphore implementation backed by an ID3D12Fence and a target signal value. Once the
+ * fence reaches the target value, all GPU work submitted before the signal is guaranteed to have
+ * completed, providing the same semantics as a Vulkan timeline semaphore.
+ */
+class D3D12Semaphore : public Semaphore, public D3D12Resource {
+ public:
+  /**
+   * Creates a semaphore that owns a new fence starting at value 0. Returns nullptr if gpu is
+   * null or the fence cannot be created.
+   */
+  static std::shared_ptr<D3D12Semaphore> Make(D3D12GPU* gpu);
+
+  /**
+   * Wraps an existing fence and its associated value. Returns nullptr if gpu or fence is null.
+   */
+  static std::shared_ptr<D3D12Semaphore> MakeFrom(D3D12GPU* gpu, ComPtr<ID3D12Fence> fence,
+                                                  uint64_t value);
+
+  D3D12Semaphore(ComPtr<ID3D12Fence> fence, uint64_t value, bool adopted);
+  ~D3D12Semaphore() override = default;
+
+  /**
+   * Returns the underlying fence, or nullptr after it has been released.
+   */
+  ID3D12Fence* d3d12Fence() const {
+    return _fence.Get();
+  }
+
+  /**
+   * Returns the value currently stored for this semaphore.
+   */
+  uint64_t signalValue() const {
+    return _value;
+  }
+
+  /**
+   * Returns signalValue() + 1 without modifying the stored value.
+   */
+  uint64_t nextSignalValue() const {
+    return _value + 1;
+  }
+
+  /**
+   * Advances the stored value by one, so signalValue() then equals the previous
+   * nextSignalValue().
+   */
+  void commitSignalValue() {
+    ++_value;
+  }
+
+  BackendSemaphore getBackendSemaphore() const override;
+
+ protected:
+  void onRelease(D3D12GPU* gpu) override;
+
+ private:
+  ComPtr<ID3D12Fence> _fence = nullptr;
+  uint64_t _value = 0;
+  // True for semaphores created through MakeFrom(); onRelease() then leaves _fence untouched.
+  bool _adopted = false;
+
+  friend class D3D12GPU;
+};
+
+} // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12ShaderModule.cpp b/src/gpu/d3d12/D3D12ShaderModule.cpp
new file mode 100644
index 000000000..dbd59c344
--- /dev/null
+++ b/src/gpu/d3d12/D3D12ShaderModule.cpp
@@ -0,0 +1,174 @@
+///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12ShaderModule.h" +#include +#include "D3D12GPU.h" +#include "core/utils/Log.h" +#include "gpu/ShaderCompiler.h" +#include "gpu/UniformData.h" +// Suppress warnings from SPIRV-Cross headers +#pragma warning(push) +#pragma warning(disable : 4100 4458 4245 4127 4244) +#include +#include +#pragma warning(pop) + +namespace tgfx { + +// Convert a SPIR-V binary to HLSL source code suitable for D3DCompile with profile vs_5_0/ps_5_0. +// +// Binding strategy: +// - UBOs are walked in the order SPIRV-Cross returns them — which is GLSL declaration order, +// and therefore matches the BindingLayout::uniformBlocks order seen by the pipeline side — +// and assigned consecutive CBV registers b0, b1, ... within this single stage. HLSL register +// namespaces are per-stage so the vertex stage and the pixel stage have independent b0 +// packings; D3D12RenderPipeline's root signature mirrors this by giving each entry a +// stage-local register index. +// - Sampled images at SPIR-V bindings 2..N are mapped to (t{N-2}, s{N-2}). 
Shifting by +// TEXTURE_BINDING_POINT_START keeps the t/s register space dense starting at zero, which +// simplifies root-signature construction. +static std::string convertSPIRVToHLSL(const std::vector& spirvBinary, ShaderStage stage) { + spirv_cross::Parser spvParser(spirvBinary.data(), spirvBinary.size()); + spvParser.parse(); + spirv_cross::CompilerHLSL hlslCompiler(std::move(spvParser.get_parsed_ir())); + + spirv_cross::CompilerHLSL::Options hlslOptions; + hlslOptions.shader_model = 50; + hlslCompiler.set_hlsl_options(hlslOptions); + + auto commonOptions = hlslCompiler.get_common_options(); + // Compensate for HLSL's clip-space Y direction matching Vulkan/GL after our standard flip. + commonOptions.vertex.flip_vert_y = true; + hlslCompiler.set_common_options(commonOptions); + + auto executionModel = + (stage == ShaderStage::Vertex) ? spv::ExecutionModelVertex : spv::ExecutionModelFragment; + + auto resources = hlslCompiler.get_shader_resources(); + + // Map UBOs: for the current stage, walk the SPIR-V uniform buffers in the order produced by + // SPIRV-Cross (which matches GLSL declaration order, identical to BindingLayout's + // uniformBlocks order on the pipeline side) and assign them HLSL CBV registers b0, b1, ... + // sequentially. HLSL register namespaces are per-stage, so this gives each stage a dense + // packing that matches D3D12RenderPipeline::createRootSignature, which assigns the same + // stage-local index to each entry's CBV root parameter. 
+ uint32_t cbvRegister = 0; + for (auto& ubo : resources.uniform_buffers) { + uint32_t spvBinding = hlslCompiler.get_decoration(ubo.id, spv::DecorationBinding); + uint32_t spvDescSet = hlslCompiler.get_decoration(ubo.id, spv::DecorationDescriptorSet); + spirv_cross::HLSLResourceBinding resourceBinding = {}; + resourceBinding.stage = executionModel; + resourceBinding.desc_set = spvDescSet; + resourceBinding.binding = spvBinding; + resourceBinding.cbv.register_binding = cbvRegister++; + resourceBinding.cbv.register_space = 0; + hlslCompiler.add_hlsl_resource_binding(resourceBinding); + } + + // Map combined samplers: SPIR-V binding N -> (t{N - TEXTURE_BINDING_POINT_START}, + // s{N - TEXTURE_BINDING_POINT_START}). + for (auto& image : resources.sampled_images) { + uint32_t spvBinding = hlslCompiler.get_decoration(image.id, spv::DecorationBinding); + uint32_t spvDescSet = hlslCompiler.get_decoration(image.id, spv::DecorationDescriptorSet); + uint32_t hlslSlot = (spvBinding >= static_cast(TEXTURE_BINDING_POINT_START)) + ? spvBinding - static_cast(TEXTURE_BINDING_POINT_START) + : spvBinding; + spirv_cross::HLSLResourceBinding resourceBinding = {}; + resourceBinding.stage = executionModel; + resourceBinding.desc_set = spvDescSet; + resourceBinding.binding = spvBinding; + resourceBinding.srv.register_binding = hlslSlot; + resourceBinding.srv.register_space = 0; + resourceBinding.sampler.register_binding = hlslSlot; + resourceBinding.sampler.register_space = 0; + hlslCompiler.add_hlsl_resource_binding(resourceBinding); + } + + std::string hlsl = hlslCompiler.compile(); + if (hlsl.empty()) { + LOGE("D3D12ShaderModule: SPIR-V to HLSL conversion produced empty source."); + } + return hlsl; +} + +// Compile HLSL source to a DXBC bytecode blob using D3DCompile with the appropriate stage profile. +static ComPtr compileHLSLToDXBC(const std::string& hlsl, ShaderStage stage) { + const char* target = (stage == ShaderStage::Vertex) ? 
"vs_5_0" : "ps_5_0"; + UINT flags = D3DCOMPILE_ENABLE_STRICTNESS; +#ifdef _DEBUG + flags |= D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION; +#else + flags |= D3DCOMPILE_OPTIMIZATION_LEVEL3; +#endif + + ComPtr codeBlob = nullptr; + ComPtr errorBlob = nullptr; + auto hr = D3DCompile(hlsl.data(), hlsl.size(), nullptr, nullptr, nullptr, "main", target, flags, + 0, &codeBlob, &errorBlob); + if (FAILED(hr)) { + if (errorBlob != nullptr) { + LOGE("D3D12ShaderModule: D3DCompile failed (HRESULT=0x%08X): %s", static_cast(hr), + static_cast(errorBlob->GetBufferPointer())); + } else { + LOGE("D3D12ShaderModule: D3DCompile failed (HRESULT=0x%08X) with no error message.", + static_cast(hr)); + } + LOGE("D3D12ShaderModule: HLSL source (first 1024 chars):\n%.1024s", hlsl.c_str()); + return nullptr; + } + return codeBlob; +} + +std::shared_ptr D3D12ShaderModule::Make( + D3D12GPU* gpu, const ShaderModuleDescriptor& descriptor) { + if (gpu == nullptr) { + return nullptr; + } + auto module = gpu->makeResource(gpu, descriptor); + if (module->bytecode == nullptr) { + return nullptr; + } + return module; +} + +D3D12ShaderModule::D3D12ShaderModule(D3D12GPU* gpu, const ShaderModuleDescriptor& descriptor) + : _stage(descriptor.stage) { + std::string vulkanGLSL = PreprocessGLSL(descriptor.code); + // D3D12 needs every declared interface variable to survive — see ShaderCompiler.h. + auto spirvBinary = CompileGLSLToSPIRV(gpu->shaderCompiler(), vulkanGLSL, descriptor.stage, true); + if (spirvBinary.empty()) { + LOGE("D3D12ShaderModule: GLSL to SPIR-V compilation failed."); + return; + } + std::string hlsl = convertSPIRVToHLSL(spirvBinary, descriptor.stage); + if (hlsl.empty()) { + return; + } + bytecode = compileHLSLToDXBC(hlsl, descriptor.stage); +#ifdef TGFX_D3D12_DEBUG_LAYER + _hlslSource = std::move(hlsl); +#endif +} + +void D3D12ShaderModule::onRelease(D3D12GPU*) { + // ID3DBlob is reference counted via ComPtr; releasing the ComPtr frees the bytecode. 
+ bytecode = nullptr; +} + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12ShaderModule.h b/src/gpu/d3d12/D3D12ShaderModule.h new file mode 100644 index 000000000..f674d6a74 --- /dev/null +++ b/src/gpu/d3d12/D3D12ShaderModule.h @@ -0,0 +1,100 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include "D3D12Resource.h" +#include "D3D12Util.h" +#include "tgfx/gpu/ShaderModule.h" +#include "tgfx/gpu/ShaderStage.h" + +namespace tgfx { + +class D3D12GPU; + +/** + * D3D12 shader module implementation. + * + * Compilation pipeline (matching the GLSL-first design used by Vulkan and Metal backends): + * 1. PreprocessGLSL — assigns explicit binding/location qualifiers (shared with Vulkan/Metal). + * 2. CompileGLSLToSPIRV — uses shaderc to produce SPIR-V (shared with Vulkan/Metal). + * 3. SPIR-V -> HLSL — uses spirv_cross::CompilerHLSL targeting shader model 5.0. + * 4. HLSL -> DXBC — uses D3DCompile with profile vs_5_0 / ps_5_0. + * + * The resulting DXBC blob is consumed by D3D12RenderPipeline via shaderBytecode(). 
+ * + * Resource binding mapping (SPIR-V binding -> HLSL register): + * - VertexUniformBlock (binding 0) -> b0 + * - FragmentUniformBlock (binding 1) -> b0 (HLSL b/t/s registers are per shader stage; both + * stages can use b0 without colliding because the + * D3D12 root signature distinguishes them via + * ShaderVisibility.) + * - sampler bindings (binding N >= 2) -> t{N-2} + s{N-2} + * + * SPIRV-Cross's default behaviour already matches CBV/SRV/Sampler register classes derived from + * the SPIR-V resource type, so the only customisation we need is shifting samplers to register 0. + */ +class D3D12ShaderModule : public ShaderModule, public D3D12Resource { + public: + static std::shared_ptr Make(D3D12GPU* gpu, + const ShaderModuleDescriptor& descriptor); + + /** + * Returns the compiled DXBC bytecode in the form expected by D3D12 pipeline state descriptors. + * The returned struct references memory owned by this object; its lifetime is bound to the + * lifetime of the D3D12ShaderModule. + */ + D3D12_SHADER_BYTECODE shaderBytecode() const { + if (bytecode == nullptr) { + return {nullptr, 0}; + } + return {bytecode->GetBufferPointer(), bytecode->GetBufferSize()}; + } + + ShaderStage stage() const { + return _stage; + } + +#ifdef TGFX_D3D12_DEBUG_LAYER + /// Returns the cross-compiled HLSL source captured during construction. Diagnostic-only: + /// available only when TGFX_D3D12_DEBUG_LAYER is defined so production builds don't pay the + /// memory cost of holding HLSL strings. 
+ const std::string& hlslSource() const { + return _hlslSource; + } +#endif + + protected: + void onRelease(D3D12GPU* gpu) override; + + private: + D3D12ShaderModule(D3D12GPU* gpu, const ShaderModuleDescriptor& descriptor); + ~D3D12ShaderModule() override = default; + + ShaderStage _stage = ShaderStage::Vertex; + ComPtr bytecode = nullptr; +#ifdef TGFX_D3D12_DEBUG_LAYER + std::string _hlslSource; +#endif + + friend class D3D12GPU; +}; + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Texture.cpp b/src/gpu/d3d12/D3D12Texture.cpp new file mode 100644 index 000000000..54550ecbe --- /dev/null +++ b/src/gpu/d3d12/D3D12Texture.cpp @@ -0,0 +1,194 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. 
+// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12Texture.h" +#include "D3D12Defines.h" +#include "D3D12GPU.h" +#include "core/utils/Log.h" + +namespace tgfx { + +static D3D12_RESOURCE_FLAGS ToD3D12ResourceFlags(uint32_t usage) { + D3D12_RESOURCE_FLAGS flags = D3D12_RESOURCE_FLAG_NONE; + if (usage & TextureUsage::RENDER_ATTACHMENT) { + flags |= D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET; + } + return flags; +} + +std::shared_ptr D3D12Texture::Make(D3D12GPU* gpu, + const TextureDescriptor& descriptor) { + if (gpu == nullptr || descriptor.width <= 0 || descriptor.height <= 0) { + return nullptr; + } + + // D3D12 disallows D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS on MSAA resources, and we need + // UAV access on every mipmapped texture so the compute mipmap generator can write each + // downsampled level. Rather than create the resource and let CreateCommittedResource fail + // with a runtime debug-layer error, reject the combination up front. The GL backend already + // enforces the same contract in GLMultisampleTexture::MakeFrom; Metal's MTLTextureType2D- + // Multisample type cannot carry mip levels at all and so doesn't need an explicit check. 
+ if (descriptor.mipLevelCount > 1 && descriptor.sampleCount > 1) { + LOGE( + "D3D12Texture::Make() multisample textures cannot have mip levels (mipLevelCount=%d, " + "sampleCount=%d).", + descriptor.mipLevelCount, descriptor.sampleCount); + return nullptr; + } + + auto dxgiFormat = static_cast(gpu->getDXGIFormat(descriptor.format)); + if (dxgiFormat == static_cast(DXGI_FORMAT_UNKNOWN)) { + LOGE("D3D12Texture::Make() unsupported pixel format: %d", static_cast(descriptor.format)); + return nullptr; + } + + bool isDepthStencil = (descriptor.format == PixelFormat::DEPTH24_STENCIL8); + + D3D12_RESOURCE_FLAGS resourceFlags = ToD3D12ResourceFlags(descriptor.usage); + if (isDepthStencil) { + resourceFlags |= D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL; + resourceFlags &= ~D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET; + } else { + // D3D12 requires ALLOW_RENDER_TARGET to be set at resource creation time before any + // CreateRenderTargetView call against the resource is legal. Other backends (Vulkan/Metal) + // can derive render-target capability lazily, so callers across the codebase commonly + // create textures with the default usage (TEXTURE_BINDING) and later wrap them via + // Surface::MakeFrom(context, backendTexture, ...). To keep that path working on D3D12 we + // unconditionally enable the flag for any non-depth, renderable colour format. The cost is + // marginal (some drivers skip a sampling-only compression path) and it avoids hard + // device-removal when a sampled texture is later asked to act as a render target. + if (gpu->isFormatRenderable(descriptor.format)) { + resourceFlags |= D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET; + } + if (descriptor.mipLevelCount > 1) { + // Mipmapped textures need to be writable from a compute shader so that + // generateMipmapsForTexture() can downsample mip[i] -> mip[i+1] via UAV writes. The flag is + // a no-op for the basic sampling path and only adds a small driver-internal alignment + // overhead. 
+ resourceFlags |= D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; + } + } + + D3D12_HEAP_PROPERTIES heapProperties = {}; + heapProperties.Type = D3D12_HEAP_TYPE_DEFAULT; + + D3D12_RESOURCE_DESC resourceDesc = {}; + resourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D; + resourceDesc.Width = static_cast(descriptor.width); + resourceDesc.Height = static_cast(descriptor.height); + resourceDesc.DepthOrArraySize = 1; + resourceDesc.MipLevels = static_cast(descriptor.mipLevelCount); + resourceDesc.Format = dxgiFormat; + resourceDesc.SampleDesc.Count = static_cast(descriptor.sampleCount); + resourceDesc.SampleDesc.Quality = 0; + resourceDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN; + resourceDesc.Flags = resourceFlags; + + // Optimised clear values let D3D12 fast-path ClearRenderTargetView / ClearDepthStencilView + // when the runtime-supplied clear matches. We don't know the clear colour at creation time + // (callers vary, e.g. RGBA transparent for offscreen surfaces, white for blur seed), so for + // colour render targets we pass nullptr — forcing the slow-but-deterministic clear path is + // preferable to a perpetual "clear values do not match" debug-layer warning that some drivers + // also turn into a stalled GPU clear. Depth-stencil keeps an optimised value because the test + // suite uses a single canonical (0.0 depth, 0 stencil) clear. 
+ D3D12_CLEAR_VALUE* clearValue = nullptr; + D3D12_CLEAR_VALUE clearValueStorage = {}; + if (isDepthStencil) { + clearValueStorage.Format = dxgiFormat; + clearValueStorage.DepthStencil.Depth = 0.0f; + clearValueStorage.DepthStencil.Stencil = 0; + clearValue = &clearValueStorage; + } + + D3D12_RESOURCE_STATES initialState = D3D12_RESOURCE_STATE_COMMON; + + ComPtr d3d12Resource = nullptr; + auto hr = gpu->device()->CreateCommittedResource(&heapProperties, D3D12_HEAP_FLAG_NONE, + &resourceDesc, initialState, clearValue, + IID_PPV_ARGS(&d3d12Resource)); + if (FAILED(hr)) { + LOGE("D3D12Texture::Make() CreateCommittedResource failed, HRESULT=0x%08X", + static_cast(hr)); + return nullptr; + } + + return gpu->makeResource(descriptor, std::move(d3d12Resource), + static_cast(dxgiFormat)); +} + +std::shared_ptr D3D12Texture::MakeFrom(D3D12GPU* gpu, ComPtr resource, + unsigned dxgiFormat, uint32_t usage, + bool /*adopted*/) { + if (gpu == nullptr || resource == nullptr) { + return nullptr; + } + + auto desc = resource->GetDesc(); + TextureDescriptor descriptor = {}; + descriptor.width = static_cast(desc.Width); + descriptor.height = static_cast(desc.Height); + descriptor.format = DXGIFormatToPixelFormat(dxgiFormat); + descriptor.mipLevelCount = static_cast(desc.MipLevels); + descriptor.sampleCount = static_cast(desc.SampleDesc.Count); + descriptor.usage = usage; + // The `adopted` flag is intentionally ignored on D3D12: COM reference counting makes the + // distinction Vulkan/Metal draw — "tgfx must explicitly destroy" vs "caller keeps owning it" — + // meaningless here. ComPtr always carries its own AddRef/Release pair, so: + // * adopted == true : caller hands its reference to us; on D3D12Texture destruction the + // ComPtr Release brings the refcount to zero and the runtime destroys the resource. + // * adopted == false : caller keeps its reference; we hold an additional one. 
The resource + // stays alive at least until both refs are released, satisfying the GPU::importBackendTexture + // contract that the backend texture remain valid for the wrapped Texture's lifetime. + // Either way the cleanup logic is identical, so a single code path is enough. + return gpu->makeResource(descriptor, std::move(resource), dxgiFormat); +} + +D3D12Texture::D3D12Texture(const TextureDescriptor& descriptor, + ComPtr d3d12Resource, unsigned dxgiFormat) + : Texture(descriptor), resource(std::move(d3d12Resource)), _dxgiFormat(dxgiFormat) { +} + +void D3D12Texture::onRelease(D3D12GPU*) { + onReleaseTexture(); +} + +void D3D12Texture::onReleaseTexture() { + resource = nullptr; +} + +BackendTexture D3D12Texture::getBackendTexture() const { + if (resource == nullptr || !(descriptor.usage & TextureUsage::TEXTURE_BINDING)) { + return {}; + } + D3D12TextureInfo d3d12Info = {}; + d3d12Info.resource = resource.Get(); + d3d12Info.format = _dxgiFormat; + return BackendTexture(d3d12Info, descriptor.width, descriptor.height); +} + +BackendRenderTarget D3D12Texture::getBackendRenderTarget() const { + if (resource == nullptr || !(descriptor.usage & TextureUsage::RENDER_ATTACHMENT)) { + return {}; + } + D3D12TextureInfo d3d12Info = {}; + d3d12Info.resource = resource.Get(); + d3d12Info.format = _dxgiFormat; + return BackendRenderTarget(d3d12Info, descriptor.width, descriptor.height); +} + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Texture.h b/src/gpu/d3d12/D3D12Texture.h new file mode 100644 index 000000000..a9f47fd3d --- /dev/null +++ b/src/gpu/d3d12/D3D12Texture.h @@ -0,0 +1,93 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// unless required by applicable law or agreed to in writing, software distributed under the
+// license is distributed on an "as is" basis, without warranties or conditions of any kind,
+// either express or implied. see the license for the specific language governing permissions
+// and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "D3D12Resource.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/Texture.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * D3D12 texture implementation.
+ */
+class D3D12Texture : public Texture, public D3D12Resource {
+ public:
+  /**
+   * Creates a texture backed by a new committed D3D12 resource described by `descriptor`.
+   * Returns nullptr on failure.
+   */
+  static std::shared_ptr<D3D12Texture> Make(D3D12GPU* gpu, const TextureDescriptor& descriptor);
+
+  /**
+   * Creates a D3D12Texture wrapper from an external D3D12 resource. The wrapper holds its own
+   * COM reference to the resource; the `adopted` flag is accepted for interface parity but is
+   * ignored by the implementation. Returns nullptr if gpu or resource is null.
+   */
+  static std::shared_ptr<D3D12Texture> MakeFrom(D3D12GPU* gpu, ComPtr<ID3D12Resource> resource,
+                                                unsigned dxgiFormat, uint32_t usage, bool adopted);
+
+  /**
+   * Returns the underlying D3D12 resource.
+   */
+  ID3D12Resource* d3d12Resource() const {
+    return resource.Get();
+  }
+
+  /**
+   * Returns the DXGI format of this texture.
+   */
+  unsigned dxgiFormat() const {
+    return _dxgiFormat;
+  }
+
+  /**
+   * Returns the resource state currently tracked on the CPU. D3D12 requires the application to
+   * issue explicit ResourceBarrier calls to transition a resource between read and write
+   * states. We track the most recently announced state per texture so that subsequent bindings
+   * can construct the correct transition barrier.
+   *
+   * Note: on textures imported from external D3D12 resources we initialise the state to COMMON
+   * (the value the application is required to leave the resource in when handing it off — see
+   * D3D12 SDK common-state promotion rules). This is conservative but correct.
+   */
+  D3D12_RESOURCE_STATES currentState() const {
+    return _currentState;
+  }
+
+  /**
+   * Records `state` as the tracked state. This only updates the CPU-side bookkeeping; it does
+   * not record a barrier.
+   */
+  void setCurrentState(D3D12_RESOURCE_STATES state) {
+    _currentState = state;
+  }
+
+  BackendTexture getBackendTexture() const override;
+  BackendRenderTarget getBackendRenderTarget() const override;
+
+ protected:
+  D3D12Texture(const TextureDescriptor& descriptor, ComPtr<ID3D12Resource> resource,
+               unsigned dxgiFormat);
+  ~D3D12Texture() override = default;
+
+  void onRelease(D3D12GPU* gpu) override;
+
+  // Hook invoked by onRelease(); the base implementation drops the resource reference.
+  virtual void onReleaseTexture();
+
+  ComPtr<ID3D12Resource> resource = nullptr;
+  unsigned _dxgiFormat = 0;
+  D3D12_RESOURCE_STATES _currentState = D3D12_RESOURCE_STATE_COMMON;
+
+  friend class D3D12GPU;
+};
+
+} // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12UploadHeap.cpp b/src/gpu/d3d12/D3D12UploadHeap.cpp
new file mode 100644
index 000000000..641db358b
--- /dev/null
+++ b/src/gpu/d3d12/D3D12UploadHeap.cpp
@@ -0,0 +1,186 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tencent is pleased to support the open source community by making tgfx available.
+//
+// Copyright (C) 2026 Tencent. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// unless required by applicable law or agreed to in writing, software distributed under the
+// license is distributed on an "as is" basis, without warranties or conditions of any kind,
+// either express or implied. see the license for the specific language governing permissions
+// and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12UploadHeap.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+// Rounds `value` up to the next multiple of `alignment`. An alignment of 0 or 1 leaves the value
+// unchanged. Powers of two use the mask form; any other alignment falls back to division so the
+// result is still a true multiple.
+static size_t AlignUpSize(size_t value, size_t alignment) {
+  if (alignment <= 1) {
+    return value;
+  }
+  if ((alignment & (alignment - 1)) == 0) {
+    return (value + alignment - 1) & ~(alignment - 1);
+  }
+  return ((value + alignment - 1) / alignment) * alignment;
+}
+
+bool D3D12UploadHeap::init(ID3D12Device* device, size_t capacity) {
+  if (device == nullptr || capacity == 0) {
+    return false;
+  }
+  D3D12_HEAP_PROPERTIES heapProps = {};
+  heapProps.Type = D3D12_HEAP_TYPE_UPLOAD;
+
+  D3D12_RESOURCE_DESC desc = {};
+  desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+  desc.Width = static_cast<UINT64>(capacity);
+  desc.Height = 1;
+  desc.DepthOrArraySize = 1;
+  desc.MipLevels = 1;
+  desc.Format = static_cast<DXGI_FORMAT>(DXGI_FORMAT_UNKNOWN);
+  desc.SampleDesc.Count = 1;
+  desc.SampleDesc.Quality = 0;
+  desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+  desc.Flags = D3D12_RESOURCE_FLAG_NONE;
+
+  ComPtr<ID3D12Resource> resource = nullptr;
+  auto hr = device->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &desc,
+                                            D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+                                            IID_PPV_ARGS(&resource));
+  if (FAILED(hr)) {
+    LOGE("D3D12UploadHeap::init(): CreateCommittedResource failed (HRESULT=0x%08X size=%zu).",
+         static_cast<unsigned>(hr), capacity);
+    return false;
+  }
+
+  void* mapped = nullptr;
+  // An empty read range tells the runtime that the CPU will not read from the mapping.
+  D3D12_RANGE readRange = {0, 0};
+  hr = resource->Map(0, &readRange, &mapped);
+  if (FAILED(hr) || mapped == nullptr) {
+    LOGE("D3D12UploadHeap::init(): Map failed (HRESULT=0x%08X).", static_cast<unsigned>(hr));
+    return false;
+  }
+
+  _resource = std::move(resource);
+  mappedCpu = mapped;
+  gpuVA = _resource->GetGPUVirtualAddress();
+  _capacity = capacity;
+  head = 0;
+  committedHead = 0;
+  outstandingBytes = 0;
+  // Drop any inflight entries left over from a previous init() so the post-init state really is
+  // "fresh", matching the resetForContextLost() invariant. There is no current re-init path,
+  // but if one is added later (device-lost recovery, test teardown) those inflight entries
+  // would otherwise reference the previous, just-released resource.
+  inflight.clear();
+  return true;
+}
+
+D3D12UploadHeap::Allocation D3D12UploadHeap::allocate(size_t size, size_t alignment) {
+  if (_resource == nullptr || size == 0 || size > _capacity) {
+    return {};
+  }
+  if (outstandingBytes == 0) {
+    // Nothing is allocated or in flight, so the whole ring is free. Rewind to offset 0 so that a
+    // request which fits the ring is not rejected merely because head happens to sit near the
+    // end and the wrap would have to pay for the skipped tail.
+    head = 0;
+    committedHead = 0;
+  }
+  size_t alignedHead = AlignUpSize(head, alignment);
+  // Track how much of the ring is currently occupied by outstanding allocations. head alone
+  // cannot disambiguate "ring empty" from "ring full" because both produce the same value once
+  // an allocation wraps it; we add an explicit byte counter that is incremented on every
+  // allocate() and decremented on retire(). This is what stops a wrap from silently overwriting
+  // staging bytes that were allocated but not yet committed (see RecordingTest race details
+  // captured in commit notes).
+  size_t free = _capacity - outstandingBytes;
+  size_t needed = size;
+  size_t startOffset = alignedHead;
+  size_t skipped = 0;
+  if (alignedHead + size > _capacity) {
+    // Splitting the range across the wrap boundary is not supported (CopyTextureRegion needs a
+    // single contiguous PLACED_SUBRESOURCE_FOOTPRINT), so jump back to offset 0 and pay for the
+    // discarded bytes between head and the end of the ring out of the same free pool.
+    skipped = _capacity - head;
+    needed = size + skipped;
+    startOffset = 0;
+  } else {
+    // Alignment padding between head and alignedHead is billed to this allocation as well.
+    needed = size + (alignedHead - head);
+  }
+  if (needed > free) {
+    return {};
+  }
+
+  Allocation result = {};
+  result.resource = _resource.Get();
+  result.cpu = static_cast<uint8_t*>(mappedCpu) + startOffset;
+  result.gpuVirtualAddress = gpuVA + startOffset;
+  result.offsetInResource = startOffset;
+  result.size = size;
+  head = startOffset + size;
+  if (head == _capacity) {
+    head = 0;
+  }
+  outstandingBytes += needed;
+  return result;
+}
+
+void D3D12UploadHeap::commit(uint64_t fenceValue) {
+  // Pair the about-to-be-signalled fence with the bytes consumed since the last commit so
+  // retire() can give those bytes back when the GPU finishes with them. The distance between
+  // head and committedHead cannot provide that number: it is zero both when nothing was
+  // allocated and when allocations wrapped head exactly back onto committedHead.
+  // outstandingBytes, however, is always the sum of every inflight entry plus the bytes that
+  // have not been committed yet, so subtracting the inflight total is unambiguous.
+  size_t committedBytes = 0;
+  for (const auto& range : inflight) {
+    committedBytes += range.bytes;
+  }
+  committedHead = head;
+  if (outstandingBytes <= committedBytes) {
+    // Nothing was allocated since the previous commit. No entry is recorded: billing bytes that
+    // were never allocated to this fence would make retire() subtract too much and report the
+    // ring as free while later allocations are still in flight.
+    return;
+  }
+  InflightRange entry = {};
+  entry.fenceValue = fenceValue;
+  entry.bytes = outstandingBytes - committedBytes;
+  inflight.push_back(entry);
+}
+
+void D3D12UploadHeap::retire(uint64_t completedFenceValue) {
+  // Entries are appended in commit order, so retiring stops at the first fence that has not
+  // completed yet.
+  while (!inflight.empty() && inflight.front().fenceValue <= completedFenceValue) {
+    if (outstandingBytes >= inflight.front().bytes) {
+      outstandingBytes -= inflight.front().bytes;
+    } else {
+      // Defensive: bookkeeping should never drop below zero, but if it does we reset rather
+      // than wrap to ~0 and stop accepting allocations forever.
+      outstandingBytes = 0;
+    }
+    inflight.pop_front();
+  }
+}
+
+void D3D12UploadHeap::clear() {
+  if (_resource != nullptr) {
+    _resource->Unmap(0, nullptr);
+  }
+  _resource = nullptr;
+  mappedCpu = nullptr;
+  gpuVA = 0;
+  _capacity = 0;
+  head = 0;
+  committedHead = 0;
+  outstandingBytes = 0;
+  inflight.clear();
+}
+
+void D3D12UploadHeap::resetForContextLost() {
+  // Keep _resource / mappedCpu / gpuVA / _capacity intact; the ring stays usable. Just drop
+  // every accounting entry that is waiting on a fence that is never going to advance.
+  head = 0;
+  committedHead = 0;
+  outstandingBytes = 0;
+  inflight.clear();
+}
+
+} // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12UploadHeap.h b/src/gpu/d3d12/D3D12UploadHeap.h
new file mode 100644
index 000000000..e2f5533b3
--- /dev/null
+++ b/src/gpu/d3d12/D3D12UploadHeap.h
@@ -0,0 +1,138 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Tencent is pleased to support the open source community by making tgfx available.
+//
+// Copyright (C) 2026 Tencent. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License.
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include "D3D12Util.h" + +namespace tgfx { + +/** + * Single permanently-mapped UPLOAD heap used as a byte-level fence-tracked ring for staging + * data on its way to GPU resources (texture pixel uploads, buffer writes, etc). + * + * Why not per-call CreateCommittedResource: + * - Every D3D12 CreateCommittedResource(UPLOAD) call goes through the kernel allocator and + * reserves a fresh GPU virtual address range. For high-frequency uploads (writeTexture() + * called once per glyph atlas update, per render pass blur seed, etc) this accumulates real + * CPU and driver memory cost. + * - The standard solution is a single big UPLOAD buffer that stays Mapped() for its entire + * lifetime; sub-allocations are pure pointer arithmetic. + * + * Allocation model: + * - allocate(size, alignment) returns an Allocation that names a contiguous byte range inside + * the underlying ID3D12Resource. The caller writes pixel data via the cpu pointer and uses + * resource() + offsetInResource as the source of CopyTextureRegion / CopyBufferRegion. + * - commit(fenceValue) snapshots head: every byte allocated since the last commit is now + * "owned" by `fenceValue` and must outlive its signal. + * - retire(completedFenceValue) reclaims byte ranges whose fence has signalled. + * + * Capacity management: + * - A starting capacity is allocated up front. 
If a single allocation exceeds capacity OR the + * ring is fully in flight, allocate() returns an invalid Allocation; the caller must fall + * back to a one-off CreateCommittedResource. This avoids the complications of mid-frame + * resource recreation while still letting the steady-state path skip the slow allocator. + * + * Thread safety: not thread-safe. Caller serialises access (matches the rest of the D3D12 + * backend). + */ +class D3D12UploadHeap { + public: + struct Allocation { + // Lifetime-stable raw pointer; the heap's underlying ID3D12Resource is kept alive by the + // D3D12UploadHeap instance (which lives on D3D12GPU). Callers must not extend its lifetime + // beyond the next D3D12GPU shutdown. + ID3D12Resource* resource = nullptr; + void* cpu = nullptr; + uint64_t gpuVirtualAddress = 0; + uint64_t offsetInResource = 0; + size_t size = 0; + bool valid() const { + return resource != nullptr && cpu != nullptr && size > 0; + } + }; + + D3D12UploadHeap() = default; + + /** + * Creates the underlying UPLOAD-heap committed resource and Map()s it permanently. Returns + * false on failure (logged inside). + */ + bool init(ID3D12Device* device, size_t capacity); + + /** + * Sub-allocates `size` bytes aligned to `alignment` within the ring. Returns an invalid + * Allocation if the ring cannot satisfy the request without overrunning still-in-flight + * bytes; callers must then fall back to a per-call upload buffer. + */ + Allocation allocate(size_t size, size_t alignment); + + /** + * Tags every allocation since the last commit() with `fenceValue`. The bytes become + * reclaimable in retire() once the GPU advances past `fenceValue`. + */ + void commit(uint64_t fenceValue); + + /** + * Reclaims byte ranges whose fence has signalled. Cheap to call. + */ + void retire(uint64_t completedFenceValue); + + /** + * Drops the underlying resource. Used by D3D12GPU::releaseAll on shutdown. After this call + * allocate() returns invalid until init() is invoked again. 
+ */ + void clear(); + + /** + * Drops every inflight byte range and zeroes the head / outstanding bookkeeping while + * keeping the mapped UPLOAD ID3D12Resource alive. Used by the context-lost recovery path so + * the ring stops accumulating inflight bytes whose fences will never advance, which would + * otherwise saturate outstandingBytes and reject every future allocation even though the + * GPU is no longer touching anything. + */ + void resetForContextLost(); + + size_t capacity() const { + return _capacity; + } + + private: + ComPtr _resource = nullptr; + void* mappedCpu = nullptr; + uint64_t gpuVA = 0; + size_t _capacity = 0; + size_t head = 0; + size_t committedHead = 0; + // Bytes currently held by either a still-uncommitted allocation or an inflight commit waiting + // for its fence to signal. Tracked explicitly so allocate() can know how many bytes are still + // in use — head alone cannot distinguish "ring empty" from "ring full" when an allocation + // wraps head right back to where it started. + size_t outstandingBytes = 0; + + struct InflightRange { + uint64_t fenceValue = 0; + // Bytes consumed between the previous commit() and this one; returned to outstandingBytes + // when retire() reaches this entry. + size_t bytes = 0; + }; + std::deque inflight; +}; + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Util.cpp b/src/gpu/d3d12/D3D12Util.cpp new file mode 100644 index 000000000..c992dd29f --- /dev/null +++ b/src/gpu/d3d12/D3D12Util.cpp @@ -0,0 +1,313 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "D3D12Util.h" +#include "tgfx/gpu/GPUBuffer.h" +#include "tgfx/gpu/Texture.h" + +namespace tgfx { + +D3D12_PRIMITIVE_TOPOLOGY ToD3D12PrimitiveTopology(PrimitiveType primitiveType) { + switch (primitiveType) { + case PrimitiveType::Triangles: + return D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + case PrimitiveType::TriangleStrip: + return D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP; + default: + return D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + } +} + +D3D12_PRIMITIVE_TOPOLOGY_TYPE ToD3D12PrimitiveTopologyType(PrimitiveType primitiveType) { + switch (primitiveType) { + case PrimitiveType::Triangles: + case PrimitiveType::TriangleStrip: + return D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + default: + return D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + } +} + +DXGI_FORMAT ToD3D12VertexFormat(VertexFormat format) { + switch (format) { + case VertexFormat::Float: + return DXGI_FORMAT_R32_FLOAT; + case VertexFormat::Float2: + return DXGI_FORMAT_R32G32_FLOAT; + case VertexFormat::Float3: + return DXGI_FORMAT_R32G32B32_FLOAT; + case VertexFormat::Float4: + return DXGI_FORMAT_R32G32B32A32_FLOAT; + case VertexFormat::Half: + return DXGI_FORMAT_R16_FLOAT; + case VertexFormat::Half2: + return DXGI_FORMAT_R16G16_FLOAT; + case VertexFormat::Half3: + // D3D12 does not have a native R16G16B16_FLOAT format. Use R16G16B16A16_FLOAT as fallback. 
+ return DXGI_FORMAT_R16G16B16A16_FLOAT; + case VertexFormat::Half4: + return DXGI_FORMAT_R16G16B16A16_FLOAT; + case VertexFormat::Int: + return DXGI_FORMAT_R32_SINT; + case VertexFormat::Int2: + return DXGI_FORMAT_R32G32_SINT; + case VertexFormat::Int3: + return DXGI_FORMAT_R32G32B32_SINT; + case VertexFormat::Int4: + return DXGI_FORMAT_R32G32B32A32_SINT; + case VertexFormat::UByteNormalized: + return DXGI_FORMAT_R8_UNORM; + case VertexFormat::UByte2Normalized: + return DXGI_FORMAT_R8G8_UNORM; + case VertexFormat::UByte3Normalized: + // D3D12 does not have a native R8G8B8_UNORM vertex format. Use R8G8B8A8_UNORM as fallback. + return DXGI_FORMAT_R8G8B8A8_UNORM; + case VertexFormat::UByte4Normalized: + return DXGI_FORMAT_R8G8B8A8_UNORM; + default: + return DXGI_FORMAT_R32_FLOAT; + } +} + +D3D12_COMPARISON_FUNC ToD3D12CompareFunction(CompareFunction compareFunction) { + switch (compareFunction) { + case CompareFunction::Never: + return D3D12_COMPARISON_FUNC_NEVER; + case CompareFunction::Less: + return D3D12_COMPARISON_FUNC_LESS; + case CompareFunction::Equal: + return D3D12_COMPARISON_FUNC_EQUAL; + case CompareFunction::LessEqual: + return D3D12_COMPARISON_FUNC_LESS_EQUAL; + case CompareFunction::Greater: + return D3D12_COMPARISON_FUNC_GREATER; + case CompareFunction::NotEqual: + return D3D12_COMPARISON_FUNC_NOT_EQUAL; + case CompareFunction::GreaterEqual: + return D3D12_COMPARISON_FUNC_GREATER_EQUAL; + case CompareFunction::Always: + return D3D12_COMPARISON_FUNC_ALWAYS; + default: + return D3D12_COMPARISON_FUNC_ALWAYS; + } +} + +D3D12_STENCIL_OP ToD3D12StencilOperation(StencilOperation stencilOp) { + switch (stencilOp) { + case StencilOperation::Keep: + return D3D12_STENCIL_OP_KEEP; + case StencilOperation::Zero: + return D3D12_STENCIL_OP_ZERO; + case StencilOperation::Replace: + return D3D12_STENCIL_OP_REPLACE; + case StencilOperation::Invert: + return D3D12_STENCIL_OP_INVERT; + case StencilOperation::IncrementClamp: + return D3D12_STENCIL_OP_INCR_SAT; + case 
StencilOperation::DecrementClamp: + return D3D12_STENCIL_OP_DECR_SAT; + case StencilOperation::IncrementWrap: + return D3D12_STENCIL_OP_INCR; + case StencilOperation::DecrementWrap: + return D3D12_STENCIL_OP_DECR; + default: + return D3D12_STENCIL_OP_KEEP; + } +} + +D3D12_BLEND ToD3D12BlendFactor(BlendFactor blendFactor) { + switch (blendFactor) { + case BlendFactor::Zero: + return D3D12_BLEND_ZERO; + case BlendFactor::One: + return D3D12_BLEND_ONE; + case BlendFactor::Src: + return D3D12_BLEND_SRC_COLOR; + case BlendFactor::OneMinusSrc: + return D3D12_BLEND_INV_SRC_COLOR; + case BlendFactor::Dst: + return D3D12_BLEND_DEST_COLOR; + case BlendFactor::OneMinusDst: + return D3D12_BLEND_INV_DEST_COLOR; + case BlendFactor::SrcAlpha: + return D3D12_BLEND_SRC_ALPHA; + case BlendFactor::OneMinusSrcAlpha: + return D3D12_BLEND_INV_SRC_ALPHA; + case BlendFactor::DstAlpha: + return D3D12_BLEND_DEST_ALPHA; + case BlendFactor::OneMinusDstAlpha: + return D3D12_BLEND_INV_DEST_ALPHA; + case BlendFactor::Src1: + return D3D12_BLEND_SRC1_COLOR; + case BlendFactor::OneMinusSrc1: + return D3D12_BLEND_INV_SRC1_COLOR; + case BlendFactor::Src1Alpha: + return D3D12_BLEND_SRC1_ALPHA; + case BlendFactor::OneMinusSrc1Alpha: + return D3D12_BLEND_INV_SRC1_ALPHA; + default: + return D3D12_BLEND_ONE; + } +} + +D3D12_BLEND ToD3D12BlendFactorAlpha(BlendFactor blendFactor) { + // Alpha blend factors must use the *_ALPHA variants. D3D11/12 validation rejects color factors + // (SRC_COLOR / INV_SRC_COLOR / DEST_COLOR / INV_DEST_COLOR / SRC1_COLOR / INV_SRC1_COLOR) when + // they appear in SrcBlendAlpha or DestBlendAlpha — color and alpha are independent channels. 
+ switch (blendFactor) { + case BlendFactor::Src: + return D3D12_BLEND_SRC_ALPHA; + case BlendFactor::OneMinusSrc: + return D3D12_BLEND_INV_SRC_ALPHA; + case BlendFactor::Dst: + return D3D12_BLEND_DEST_ALPHA; + case BlendFactor::OneMinusDst: + return D3D12_BLEND_INV_DEST_ALPHA; + case BlendFactor::Src1: + return D3D12_BLEND_SRC1_ALPHA; + case BlendFactor::OneMinusSrc1: + return D3D12_BLEND_INV_SRC1_ALPHA; + default: + return ToD3D12BlendFactor(blendFactor); + } +} + +D3D12_BLEND_OP ToD3D12BlendOperation(BlendOperation blendOp) { + switch (blendOp) { + case BlendOperation::Add: + return D3D12_BLEND_OP_ADD; + case BlendOperation::Subtract: + return D3D12_BLEND_OP_SUBTRACT; + case BlendOperation::ReverseSubtract: + return D3D12_BLEND_OP_REV_SUBTRACT; + case BlendOperation::Min: + return D3D12_BLEND_OP_MIN; + case BlendOperation::Max: + return D3D12_BLEND_OP_MAX; + default: + return D3D12_BLEND_OP_ADD; + } +} + +D3D12_TEXTURE_ADDRESS_MODE ToD3D12AddressMode(AddressMode addressMode) { + switch (addressMode) { + case AddressMode::ClampToEdge: + return D3D12_TEXTURE_ADDRESS_MODE_CLAMP; + case AddressMode::Repeat: + return D3D12_TEXTURE_ADDRESS_MODE_WRAP; + case AddressMode::MirrorRepeat: + return D3D12_TEXTURE_ADDRESS_MODE_MIRROR; + case AddressMode::ClampToBorder: + return D3D12_TEXTURE_ADDRESS_MODE_BORDER; + default: + return D3D12_TEXTURE_ADDRESS_MODE_CLAMP; + } +} + +D3D12_FILTER ToD3D12Filter(FilterMode minFilter, FilterMode magFilter, MipmapMode mipmapMode) { + bool minLinear = (minFilter == FilterMode::Linear); + bool magLinear = (magFilter == FilterMode::Linear); + bool mipLinear = (mipmapMode == MipmapMode::Linear); + bool mipEnabled = (mipmapMode != MipmapMode::None); + + if (!mipEnabled) { + if (minLinear && magLinear) { + return D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT; + } + if (minLinear) { + return D3D12_FILTER_MIN_LINEAR_MAG_MIP_POINT; + } + if (magLinear) { + return D3D12_FILTER_MIN_POINT_MAG_LINEAR_MIP_POINT; + } + return D3D12_FILTER_MIN_MAG_MIP_POINT; + } 
+ + if (minLinear && magLinear && mipLinear) { + return D3D12_FILTER_MIN_MAG_MIP_LINEAR; + } + if (minLinear && magLinear) { + return D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT; + } + if (minLinear && mipLinear) { + return D3D12_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR; + } + if (magLinear && mipLinear) { + return D3D12_FILTER_MIN_POINT_MAG_MIP_LINEAR; + } + if (minLinear) { + return D3D12_FILTER_MIN_LINEAR_MAG_MIP_POINT; + } + if (magLinear) { + return D3D12_FILTER_MIN_POINT_MAG_LINEAR_MIP_POINT; + } + if (mipLinear) { + return D3D12_FILTER_MIN_MAG_POINT_MIP_LINEAR; + } + return D3D12_FILTER_MIN_MAG_MIP_POINT; +} + +D3D12_CULL_MODE ToD3D12CullMode(CullMode cullMode) { + switch (cullMode) { + case CullMode::None: + return D3D12_CULL_MODE_NONE; + case CullMode::Front: + return D3D12_CULL_MODE_FRONT; + case CullMode::Back: + return D3D12_CULL_MODE_BACK; + } + return D3D12_CULL_MODE_NONE; +} + +bool ToD3D12FrontCounterClockwise(FrontFace frontFace) { + switch (frontFace) { + case FrontFace::CCW: + return true; + case FrontFace::CW: + return false; + } + return true; +} + +D3D12_INDEX_BUFFER_STRIP_CUT_VALUE ToD3D12StripCutValue(IndexFormat indexFormat) { + switch (indexFormat) { + case IndexFormat::UInt16: + return D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFF; + case IndexFormat::UInt32: + return D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFFFFFF; + } + return D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_DISABLED; +} + +void TransitionResourceState(ID3D12GraphicsCommandList* commandList, ID3D12Resource* resource, + D3D12_RESOURCE_STATES oldState, D3D12_RESOURCE_STATES newState, + UINT subresource) { + if (commandList == nullptr || resource == nullptr || oldState == newState) { + return; + } + D3D12_RESOURCE_BARRIER barrier = {}; + barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION; + barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE; + barrier.Transition.pResource = resource; + barrier.Transition.StateBefore = oldState; + barrier.Transition.StateAfter = newState; + 
barrier.Transition.Subresource = subresource; + commandList->ResourceBarrier(1, &barrier); +} + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Util.h b/src/gpu/d3d12/D3D12Util.h new file mode 100644 index 000000000..3d663338b --- /dev/null +++ b/src/gpu/d3d12/D3D12Util.h @@ -0,0 +1,80 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. 
+// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include +#include "tgfx/gpu/Attribute.h" +#include "tgfx/gpu/BlendFactor.h" +#include "tgfx/gpu/BlendOperation.h" +#include "tgfx/gpu/CompareFunction.h" +#include "tgfx/gpu/FilterMode.h" +#include "tgfx/gpu/MipmapMode.h" +#include "tgfx/gpu/RenderPass.h" +#include "tgfx/gpu/Sampler.h" +#include "tgfx/gpu/StencilOperation.h" + +namespace tgfx { + +template +using ComPtr = Microsoft::WRL::ComPtr; + +D3D12_PRIMITIVE_TOPOLOGY ToD3D12PrimitiveTopology(PrimitiveType primitiveType); + +D3D12_PRIMITIVE_TOPOLOGY_TYPE ToD3D12PrimitiveTopologyType(PrimitiveType primitiveType); + +DXGI_FORMAT ToD3D12VertexFormat(VertexFormat format); + +D3D12_COMPARISON_FUNC ToD3D12CompareFunction(CompareFunction compareFunction); + +D3D12_STENCIL_OP ToD3D12StencilOperation(StencilOperation stencilOp); + +D3D12_BLEND ToD3D12BlendFactor(BlendFactor blendFactor); + +/** + * Like ToD3D12BlendFactor() but rewrites the four COLOR-only D3D12 blend factors (SRC_COLOR, + * INV_SRC_COLOR, DEST_COLOR, INV_DEST_COLOR, plus their dual-source variants) into their ALPHA + * counterparts. CreateBlendState validation rejects color factors when applied to the alpha + * channel, so this helper must be used for {Src,Dest}BlendAlpha. + */ +D3D12_BLEND ToD3D12BlendFactorAlpha(BlendFactor blendFactor); + +D3D12_BLEND_OP ToD3D12BlendOperation(BlendOperation blendOp); + +D3D12_TEXTURE_ADDRESS_MODE ToD3D12AddressMode(AddressMode addressMode); + +D3D12_FILTER ToD3D12Filter(FilterMode minFilter, FilterMode magFilter, MipmapMode mipmapMode); + +D3D12_CULL_MODE ToD3D12CullMode(CullMode cullMode); + +bool ToD3D12FrontCounterClockwise(FrontFace frontFace); + +D3D12_INDEX_BUFFER_STRIP_CUT_VALUE ToD3D12StripCutValue(IndexFormat indexFormat); + +/** + * Records a single ID3D12GraphicsCommandList::ResourceBarrier(TRANSITION) for `resource` on the + * given command list. No-op when commandList or resource is null, or when oldState == newState. 
Used by every code path that needs to flip an ID3D12Resource between + * read- and write-only states (RTV/DSV setup, copy commands, shader sampling, etc.). + */ +void TransitionResourceState(ID3D12GraphicsCommandList* commandList, ID3D12Resource* resource, + D3D12_RESOURCE_STATES oldState, D3D12_RESOURCE_STATES newState, + UINT subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES); + +} // namespace tgfx diff --git a/src/gpu/d3d12/D3D12Window.cpp b/src/gpu/d3d12/D3D12Window.cpp new file mode 100644 index 000000000..069d03e30 --- /dev/null +++ b/src/gpu/d3d12/D3D12Window.cpp @@ -0,0 +1,398 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "tgfx/gpu/d3d12/D3D12Window.h" +#include +#include +#include +#include +#include "D3D12CommandQueue.h" +#include "D3D12Defines.h" +#include "D3D12GPU.h" +#include "core/utils/Log.h" +#include "gpu/proxies/RenderTargetProxy.h" +#include "gpu/resources/RenderTarget.h" +#include "tgfx/gpu/Backend.h" +#include "tgfx/gpu/Context.h" +#include "tgfx/gpu/d3d12/D3D12Types.h" + +namespace tgfx { + +// Number of backbuffers in the swap chain. 
Two is the minimum allowed by FLIP_DISCARD; using three +// gives the OS one extra frame to compose, smoothing latency spikes under heavy GPU load. We pick +// two for parity with Vulkan's MAX_FRAMES_IN_FLIGHT and to keep peak VRAM low for typical 4K +// windows. The presentation engine still queues a small number of frames internally. +static constexpr UINT BACKBUFFER_COUNT = 2; + +// Private RenderTargetProxy that exposes the swap chain's current backbuffer as an external +// D3D12 render target. The proxy is created once when the application calls Surface::MakeFrom() +// and is then reused for every subsequent frame: Surface caches it for its entire lifetime +// rather than re-acquiring on each render. To keep that pattern working with FLIP_DISCARD — +// which rotates between BACKBUFFER_COUNT distinct ID3D12Resources — getRenderTarget() must +// re-query GetCurrentBackBufferIndex every call and invalidate the cached RenderTarget when +// the index changes. Otherwise every frame would be drawn into the same backbuffer slot and +// the other slot would never get updated, manifesting as "no visible change" on user input. +// +// Defined at file scope (not in an anonymous namespace) so D3D12Window::PlatformState can store +// a typed raw pointer to it; the .h does not expose this class, so it remains private to this +// translation unit even without anonymous-namespace internal linkage. 
+class D3D12SwapchainProxy : public RenderTargetProxy { + public: + D3D12SwapchainProxy(Context* context, IDXGISwapChain3* swapChain, + const std::vector>* backBuffers, unsigned format, + int width, int height) + : _context(context), _swapChain(swapChain), _backBuffers(backBuffers), _format(format), + _width(width), _height(height) { + } + + Context* getContext() const override { + return _context; + } + int width() const override { + return _width; + } + int height() const override { + return _height; + } + PixelFormat format() const override { + return DXGIFormatToPixelFormat(_format); + } + int sampleCount() const override { + return 1; + } + ImageOrigin origin() const override { + return ImageOrigin::TopLeft; + } + bool externallyOwned() const override { + return true; + } + std::shared_ptr getTextureView() const override { + return nullptr; + } + + std::shared_ptr getRenderTarget() const override { + if (_swapChain == nullptr || _backBuffers == nullptr || _backBuffers->empty()) { + return nullptr; + } + UINT index = _swapChain->GetCurrentBackBufferIndex(); + if (index >= _backBuffers->size()) { + return nullptr; + } + auto* currentBuffer = (*_backBuffers)[index].Get(); + if (_renderTarget != nullptr && currentBuffer == _cachedBackBuffer) { + return _renderTarget; + } + D3D12TextureInfo info = {}; + info.resource = currentBuffer; + info.format = _format; + BackendRenderTarget backendRT(info, _width, _height); + _renderTarget = RenderTarget::MakeFrom(_context, backendRT, ImageOrigin::TopLeft); + _cachedBackBuffer = currentBuffer; + return _renderTarget; + } + + /// Drops the cached RenderTarget so the next getRenderTarget() call goes through MakeFrom + /// again. Invoked by D3D12Window::onPresent — after Present() the swap chain promotes a new + /// backbuffer to "current", so the next acquisition must wrap that buffer instead of the one + /// the GPU just submitted to. 
+ void releaseFrame() { + _renderTarget = nullptr; + _cachedBackBuffer = nullptr; + } + + private: + Context* _context = nullptr; + IDXGISwapChain3* _swapChain = nullptr; + const std::vector>* _backBuffers = nullptr; + unsigned _format = DXGI_FORMAT_R8G8B8A8_UNORM; + int _width = 0; + int _height = 0; + mutable std::shared_ptr _renderTarget = nullptr; + mutable ID3D12Resource* _cachedBackBuffer = nullptr; +}; + +// Hidden state shared between D3D12Window and its private RenderTargetProxy. Stored as PImpl so +// the public header doesn't need the D3D12 / DXGI SDK headers. The DXGI format is kept as `unsigned` to +// match the rest of the D3D12 backend (D3D12Defines.h shadows the SDK enum with constexpr +// integers so an unqualified DXGI_FORMAT_R8G8B8A8_UNORM here is `unsigned`, not the enum type). +struct D3D12Window::PlatformState { + ComPtr swapChain; + std::vector> backBuffers; + unsigned format = DXGI_FORMAT_R8G8B8A8_UNORM; + HWND hwnd = nullptr; + int width = 0; + int height = 0; + + // Proxy handed out by the most recent onCreateRenderTarget(). onPresent() does not replace it; + // it calls releaseFrame() on it so the next getRenderTarget() wraps the backbuffer that just + // became current. Held as a shared_ptr because the caller (Surface) also holds a strong + // reference to it; currentProxyRaw mirrors the + // underlying D3D12SwapchainProxy* so onPresent() can call releaseFrame() without a static_cast + // from the base RenderTargetProxy*. The two pointers are written and cleared together so the + // raw view never outlives the shared owner. 
+ std::shared_ptr currentProxy; + D3D12SwapchainProxy* currentProxyRaw = nullptr; + + bool buildBackBuffers(); + bool rebuild(int newWidth, int newHeight); +}; + +bool D3D12Window::PlatformState::buildBackBuffers() { + backBuffers.clear(); + backBuffers.resize(BACKBUFFER_COUNT); + for (UINT i = 0; i < BACKBUFFER_COUNT; i++) { + auto hr = swapChain->GetBuffer(i, IID_PPV_ARGS(&backBuffers[i])); + if (FAILED(hr)) { + LOGE("D3D12Window: GetBuffer(%u) failed, HRESULT=0x%08X", i, static_cast(hr)); + backBuffers.clear(); + return false; + } + } + return true; +} + +bool D3D12Window::PlatformState::rebuild(int newWidth, int newHeight) { + // Releasing every backbuffer reference is mandatory before ResizeBuffers; otherwise the call + // returns DXGI_ERROR_INVALID_CALL because the swapchain still owns outstanding references. + backBuffers.clear(); + currentProxy = nullptr; + currentProxyRaw = nullptr; + auto hr = + swapChain->ResizeBuffers(BACKBUFFER_COUNT, static_cast(newWidth), + static_cast(newHeight), static_cast(format), 0); + if (FAILED(hr)) { + LOGE("D3D12Window: ResizeBuffers failed, HRESULT=0x%08X", static_cast(hr)); + return false; + } + width = newWidth; + height = newHeight; + return buildBackBuffers(); +} + +#ifdef _WIN32 + +std::shared_ptr D3D12Window::MakeFrom(HWND hwnd, std::shared_ptr device, + std::shared_ptr colorSpace) { + if (hwnd == nullptr || device == nullptr) { + return nullptr; + } + if (colorSpace && !colorSpace->isSRGB()) { + LOGI( + "D3D12Window::MakeFrom(): non-sRGB colorSpace is not yet supported and will be ignored. 
" + "Only sRGB output is currently available."); + } + + auto context = device->lockContext(); + if (context == nullptr) { + return nullptr; + } + auto* gpu = static_cast(context->gpu()); + auto* d3d12CommandQueue = static_cast(gpu->queue())->d3d12CommandQueue(); + + RECT clientRect = {}; + GetClientRect(hwnd, &clientRect); + int width = static_cast(clientRect.right - clientRect.left); + int height = static_cast(clientRect.bottom - clientRect.top); + if (width <= 0 || height <= 0) { + width = std::max(width, 1); + height = std::max(height, 1); + } + + ComPtr factory; + auto hr = CreateDXGIFactory1(IID_PPV_ARGS(&factory)); + if (FAILED(hr)) { + LOGE("D3D12Window: CreateDXGIFactory1 failed, HRESULT=0x%08X", static_cast(hr)); + device->unlock(); + return nullptr; + } + + DXGI_SWAP_CHAIN_DESC1 desc = {}; + desc.BufferCount = BACKBUFFER_COUNT; + desc.Width = static_cast(width); + desc.Height = static_cast(height); + // DXGI_FORMAT_R8G8B8A8_UNORM in this TU resolves to the D3D12Defines.h `unsigned` constant + // (= 28) rather than the SDK enum, so cast back here for DXGI_SWAP_CHAIN_DESC1::Format which + // does want the real enum. + desc.Format = static_cast(DXGI_FORMAT_R8G8B8A8_UNORM); + desc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT; + desc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD; + desc.SampleDesc.Count = 1; + desc.SampleDesc.Quality = 0; + desc.AlphaMode = DXGI_ALPHA_MODE_IGNORE; + desc.Scaling = DXGI_SCALING_STRETCH; + desc.Flags = 0; + + ComPtr swapChain1; + hr = factory->CreateSwapChainForHwnd(d3d12CommandQueue, hwnd, &desc, nullptr, nullptr, + &swapChain1); + if (FAILED(hr)) { + LOGE("D3D12Window: CreateSwapChainForHwnd failed, HRESULT=0x%08X", static_cast(hr)); + device->unlock(); + return nullptr; + } + // FLIP_DISCARD requires IDXGISwapChain3 for GetCurrentBackBufferIndex; QI is mandatory here. 
+ ComPtr swapChain3; + hr = swapChain1.As(&swapChain3); + if (FAILED(hr) || swapChain3 == nullptr) { + LOGE("D3D12Window: failed to QI IDXGISwapChain3, HRESULT=0x%08X", static_cast(hr)); + device->unlock(); + return nullptr; + } + // Disable DXGI's default Alt+Enter fullscreen handling. tgfx callers manage that themselves. + factory->MakeWindowAssociation(hwnd, DXGI_MWA_NO_ALT_ENTER); + + auto state = std::make_unique(); + state->swapChain = std::move(swapChain3); + state->format = desc.Format; + state->hwnd = hwnd; + state->width = width; + state->height = height; + if (!state->buildBackBuffers()) { + device->unlock(); + return nullptr; + } + + device->unlock(); + return std::shared_ptr(new D3D12Window(device, std::move(state), colorSpace)); +} + +#endif + +D3D12Window::D3D12Window(std::shared_ptr device, std::unique_ptr state, + std::shared_ptr colorSpace) + : Window(std::move(device), std::move(colorSpace)), _platformState(std::move(state)) { +} + +D3D12Window::~D3D12Window() { + // Tear-down ordering is delicate. After the last frame, swap-chain Present() schedules its + // own GPU work on our command queue (the GPU-side flip), but that work is *not* tracked by + // any tgfx fence — D3D12CommandQueue::waitUntilCompleted() only waits on submissions we + // submitted via executeSubmission. If we release the swap chain (or its backbuffers) while + // that Present work is still pending, the runtime fires + // OBJECT_DELETED_WHILE_STILL_IN_USE (#921) and the debug layer asserts. + // + // To make sure the queue really is idle, we Signal a fresh fence on the queue and wait for + // it: that flushes everything previously enqueued, Present included. 
+ // + // Then we still have to release the in-tgfx owners of each backbuffer before destroying + // the swap chain itself: + // - the cached ExternalRenderTarget / ExternalTexture pair (drained via ResourceCache and + // D3D12GPU return queues) + // - recycled command lists in D3D12CommandListPool (each list still pins the resources it + // was last recorded against until its next Reset()) + auto context = device->lockContext(); + if (context != nullptr) { + auto* d3d12GPU = static_cast(context->gpu()); + auto* d3d12CmdQueue = static_cast(d3d12GPU->queue())->d3d12CommandQueue(); + + // 1. Wait for all tgfx-managed submissions to complete. + d3d12GPU->queue()->waitUntilCompleted(); + + // 2. Wait for any Present-driven work the queue still has queued up. Without this the + // swap-chain release path below trips OBJECT_DELETED_WHILE_STILL_IN_USE because DXGI's + // internal flip operation is still in flight on the queue. + ComPtr drainFence; + if (SUCCEEDED( + d3d12GPU->device()->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&drainFence)))) { + const UINT64 targetValue = 1; + if (SUCCEEDED(d3d12CmdQueue->Signal(drainFence.Get(), targetValue))) { + if (drainFence->GetCompletedValue() < targetValue) { + HANDLE evt = CreateEventW(nullptr, FALSE, FALSE, nullptr); + if (evt != nullptr) { + if (SUCCEEDED(drainFence->SetEventOnCompletion(targetValue, evt))) { + WaitForSingleObject(evt, 5000); + } + CloseHandle(evt); + } + } + } + } + + // 3. Drop tgfx-side owners of the backbuffers. + _platformState->currentProxy = nullptr; + _platformState->currentProxyRaw = nullptr; + context->purgeResourcesNotUsedSince(std::chrono::steady_clock::now()); + d3d12GPU->processUnreferencedResources(); + d3d12GPU->commandListPool().clear(); + + // 4. Release the swap chain and our own backbuffer ComPtrs. The order between these two + // is not important once the queue is idle and no tgfx object pins the backbuffers. 
+ _platformState->backBuffers.clear(); + _platformState->swapChain = nullptr; + device->unlock(); + } else { + _platformState->currentProxy = nullptr; + _platformState->currentProxyRaw = nullptr; + _platformState->backBuffers.clear(); + _platformState->swapChain = nullptr; + } +} + +std::shared_ptr D3D12Window::onCreateRenderTarget(Context* context) { + if (_platformState->swapChain == nullptr) { + return nullptr; + } + // Detect resize. The application's WM_SIZE handler is expected to reset the cached Surface, + // which in turn drops references to our previous proxy/backbuffer; only then is it safe to + // call ResizeBuffers (which requires zero outstanding backbuffer references). + RECT rect = {}; + GetClientRect(_platformState->hwnd, &rect); + int width = static_cast(rect.right - rect.left); + int height = static_cast(rect.bottom - rect.top); + if (width <= 0 || height <= 0) { + return nullptr; + } + if (width != _platformState->width || height != _platformState->height) { + // Wait for the GPU to finish reading old backbuffers; ResizeBuffers cannot proceed while + // any reference is outstanding, including in-flight command lists. + context->gpu()->queue()->waitUntilCompleted(); + if (!_platformState->rebuild(width, height)) { + return nullptr; + } + } + + // Build one proxy per Surface and let it pull the current backbuffer index out of the swap + // chain on every getRenderTarget() call. Surface caches the proxy for its whole lifetime, so + // a per-frame allocation here would leak the freshly-created proxy and never reach the + // backbuffer-rotation code path. 
+ auto proxy = std::make_shared( + context, _platformState->swapChain.Get(), &_platformState->backBuffers, + _platformState->format, _platformState->width, _platformState->height); + _platformState->currentProxyRaw = proxy.get(); + _platformState->currentProxy = std::move(proxy); + return _platformState->currentProxy; +} + +void D3D12Window::onPresent(Context* /*context*/) { + if (_platformState->swapChain == nullptr) { + return; + } + // SyncInterval=1 mirrors VK_PRESENT_MODE_FIFO_KHR: wait for the next vertical blank. Apps that + // need uncapped framerate can replace this with a FRAME_LATENCY_WAITABLE_OBJECT path later. + auto hr = _platformState->swapChain->Present(1, 0); + if (FAILED(hr)) { + LOGE("D3D12Window: Present failed, HRESULT=0x%08X", static_cast(hr)); + } + // Tell the proxy to drop its cached RenderTarget so the next getRenderTarget() picks up the + // backbuffer the swap chain just rotated in. Without this Surface keeps drawing into the + // same slot forever and the user sees a frozen frame regardless of input. + if (_platformState->currentProxyRaw != nullptr) { + _platformState->currentProxyRaw->releaseFrame(); + } +} + +} // namespace tgfx diff --git a/test/src/d3d12/DevicePool.cpp b/test/src/d3d12/DevicePool.cpp new file mode 100644 index 000000000..faf7c40f8 --- /dev/null +++ b/test/src/d3d12/DevicePool.cpp @@ -0,0 +1,38 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "utils/DevicePool.h" +#include "tgfx/gpu/d3d12/D3D12Device.h" + +namespace tgfx { +thread_local std::shared_ptr cachedDevice = nullptr; + +std::shared_ptr DevicePool::Make() { + if (cachedDevice == nullptr) { +#ifdef TGFX_D3D12_USE_WARP + // CI opt-in (-DTGFX_D3D12_USE_WARP=ON): force the test suite onto WARP so headless runners + // without a hardware adapter can still exercise the D3D12 backend. WARP is functionally + // complete but very slow — never enable this for performance baselines. 
+ cachedDevice = D3D12Device::MakeWarp(); +#else + cachedDevice = D3D12Device::Make(); +#endif + } + return cachedDevice; +} +} // namespace tgfx diff --git a/vcpkg/ports/tgfx/tgfx-functions.cmake b/vcpkg/ports/tgfx/tgfx-functions.cmake index 0672f233b..1f925657a 100644 --- a/vcpkg/ports/tgfx/tgfx-functions.cmake +++ b/vcpkg/ports/tgfx/tgfx-functions.cmake @@ -74,6 +74,12 @@ function(build_tgfx_single_config SOURCE_PATH NODEJS OUTPUT_DIR IS_DEBUG) list(APPEND BUILD_ARGS -DTGFX_USE_OPENGL=OFF) endif() + if("d3d12" IN_LIST FEATURES) + list(APPEND BUILD_ARGS -DTGFX_USE_D3D12=ON) + else() + list(APPEND BUILD_ARGS -DTGFX_USE_D3D12=OFF) + endif() + if("threads" IN_LIST FEATURES) list(APPEND BUILD_ARGS -DTGFX_USE_THREADS=ON) if(VCPKG_TARGET_IS_EMSCRIPTEN) diff --git a/vcpkg/ports/tgfx/vcpkg.json b/vcpkg/ports/tgfx/vcpkg.json index db9c42d22..30be2fd62 100644 --- a/vcpkg/ports/tgfx/vcpkg.json +++ b/vcpkg/ports/tgfx/vcpkg.json @@ -89,6 +89,10 @@ "opengl": { "description": "Enable OpenGL as the GPU backend" }, + "d3d12": { + "description": "Enable D3D12 as the GPU backend", + "supports": "windows" + }, "threads": { "description": "Enable support for multithreaded rendering" }, diff --git a/vendor.json b/vendor.json index c94abebcb..9f65e6153 100644 --- a/vendor.json +++ b/vendor.json @@ -347,10 +347,12 @@ "cmake": { "targets": [ "spirv-cross-core", - "spirv-cross-glsl" + "spirv-cross-glsl", + "spirv-cross-hlsl" ], "arguments": [ - "-DSPIRV_CROSS_ENABLE_TESTS=OFF" + "-DSPIRV_CROSS_ENABLE_TESTS=OFF", + "-DCMAKE_CXX_FLAGS=\"-w\"" ], "platforms": [ "win" diff --git a/win/CMakeLists.txt b/win/CMakeLists.txt index 20fe3c897..85899dfa7 100644 --- a/win/CMakeLists.txt +++ b/win/CMakeLists.txt @@ -47,6 +47,9 @@ file(GLOB HELLO_2D_COMMON_FILES src/*.*) if (TGFX_USE_VULKAN) file(GLOB HELLO_2D_BACKEND_FILES vulkan/*.*) list(APPEND HELLO_2D_INCLUDES vulkan) +elseif (TGFX_USE_D3D12) + file(GLOB HELLO_2D_BACKEND_FILES d3d12/*.*) + list(APPEND HELLO_2D_INCLUDES d3d12) else () file(GLOB 
HELLO_2D_BACKEND_FILES wgl/*.*) list(APPEND HELLO_2D_INCLUDES wgl) diff --git a/win/d3d12/TGFXWindow.cpp b/win/d3d12/TGFXWindow.cpp new file mode 100644 index 000000000..f73e8a493 --- /dev/null +++ b/win/d3d12/TGFXWindow.cpp @@ -0,0 +1,399 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. 
+// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "TGFXWindow.h" +#include +#include +#include +#if WINVER >= 0x0603 // Windows 8.1 +#include +#endif +#include "hello2d/AppHost.h" +#include "hello2d/LayerBuilder.h" + +namespace hello2d { +static constexpr LPCWSTR ClassName = L"TGFXWindow"; +static constexpr float MAX_ZOOM = 1000.0f; +static constexpr float MIN_ZOOM = 0.001f; +static constexpr float WHEEL_RATIO = 400.0f; + +TGFXWindow::TGFXWindow() { + createAppHost(); +} + +TGFXWindow::~TGFXWindow() { + destroy(); +} + +bool TGFXWindow::open() { + destroy(); + WNDCLASS windowClass = RegisterWindowClass(); + auto pixelRatio = getPixelRatio(); + int initWidth = static_cast(pixelRatio * 800); + int initHeight = static_cast(pixelRatio * 600); + windowHandle = + CreateWindowEx(WS_EX_APPWINDOW, windowClass.lpszClassName, L"Hello2D", WS_OVERLAPPEDWINDOW, 0, + 0, initWidth, initHeight, nullptr, nullptr, windowClass.hInstance, this); + + if (windowHandle == nullptr) { + return false; + } + RegisterTouchWindow(windowHandle, 0); + SetWindowLongPtr(windowHandle, GWLP_USERDATA, reinterpret_cast(this)); + centerAndShow(); + ShowWindow(windowHandle, SW_SHOW); + UpdateWindow(windowHandle); + RECT rect = {}; + GetClientRect(windowHandle, &rect); + lastSurfaceWidth = static_cast(rect.right - rect.left); + lastSurfaceHeight = static_cast(rect.bottom - rect.top); + updateZoomScaleAndOffset(); + updateLayerTree(); + ::InvalidateRect(windowHandle, nullptr, FALSE); + return true; +} + +WNDCLASS TGFXWindow::RegisterWindowClass() { + auto hInstance = GetModuleHandle(nullptr); + WNDCLASS windowClass{}; + windowClass.hCursor = LoadCursor(nullptr, IDC_ARROW); + windowClass.lpszClassName = ClassName; + windowClass.style = CS_HREDRAW | CS_VREDRAW; + windowClass.cbClsExtra = 0; + windowClass.cbWndExtra = 0; + windowClass.hInstance = hInstance; + windowClass.hIcon = LoadIcon(hInstance, L"IDI_ICON1"); + windowClass.hbrBackground = 
nullptr; + windowClass.lpszMenuName = nullptr; + windowClass.lpfnWndProc = WndProc; + RegisterClass(&windowClass); + return windowClass; +} + +LRESULT CALLBACK TGFXWindow::WndProc(HWND window, UINT message, WPARAM wparam, + LPARAM lparam) noexcept { + auto tgfxWindow = reinterpret_cast(GetWindowLongPtr(window, GWLP_USERDATA)); + if (tgfxWindow != nullptr) { + return tgfxWindow->handleMessage(window, message, wparam, lparam); + } + return DefWindowProc(window, message, wparam, lparam); +} + +LRESULT TGFXWindow::handleMessage(HWND hwnd, UINT message, WPARAM wparam, LPARAM lparam) noexcept { + switch (message) { + case WM_ACTIVATE: + isDrawing = (LOWORD(wparam) != WA_INACTIVE); + break; + case WM_DESTROY: + destroy(); + PostQuitMessage(0); + break; + case WM_SIZE: { + RECT rect; + GetClientRect(windowHandle, &rect); + lastSurfaceWidth = static_cast(rect.right - rect.left); + lastSurfaceHeight = static_cast(rect.bottom - rect.top); + applyCenteringTransform(); + if (tgfxWindow) { + surface = nullptr; + presentImmediately = true; + } + ::InvalidateRect(windowHandle, nullptr, FALSE); + break; + } + case WM_PAINT: { + PAINTSTRUCT ps; + BeginPaint(hwnd, &ps); + if (isDrawing) { + draw(); + ::InvalidateRect(windowHandle, nullptr, FALSE); + } + EndPaint(hwnd, &ps); + break; + } + case WM_LBUTTONUP: { + int count = hello2d::LayerBuilder::Count(); + if (count > 0) { + currentDrawerIndex = (currentDrawerIndex + 1) % count; + zoomScale = 1.0f; + contentOffset = {0.0f, 0.0f}; + updateLayerTree(); + updateZoomScaleAndOffset(); + ::InvalidateRect(windowHandle, nullptr, FALSE); + } + break; + } + case WM_MOUSEWHEEL: { + POINT mousePoint = {GET_X_LPARAM(lparam), GET_Y_LPARAM(lparam)}; + ScreenToClient(hwnd, &mousePoint); + float pixelX = static_cast(mousePoint.x); + float pixelY = static_cast(mousePoint.y); + bool isCtrlPressed = (GetKeyState(VK_CONTROL) & 0x8000) != 0; + bool isShiftPressed = (GetKeyState(VK_SHIFT) & 0x8000) != 0; + + if (isCtrlPressed) { + float zoomStep = 
std::exp(GET_WHEEL_DELTA_WPARAM(wparam) / WHEEL_RATIO); + float newZoom = std::clamp(zoomScale * zoomStep, MIN_ZOOM, MAX_ZOOM); + float oldZoom = zoomScale; + contentOffset.x = pixelX - ((pixelX - contentOffset.x) / oldZoom) * newZoom; + contentOffset.y = pixelY - ((pixelY - contentOffset.y) / oldZoom) * newZoom; + zoomScale = newZoom; + } else { + float wheelDelta = static_cast(GET_WHEEL_DELTA_WPARAM(wparam)); + if (isShiftPressed) { + contentOffset.x += wheelDelta; + } else { + contentOffset.y -= wheelDelta; + } + } + updateZoomScaleAndOffset(); + ::InvalidateRect(windowHandle, nullptr, FALSE); + break; + } + case WM_GESTURE: { + GESTUREINFO gestureInfo{}; + gestureInfo.cbSize = sizeof(GESTUREINFO); + if (GetGestureInfo(reinterpret_cast(lparam), &gestureInfo)) { + if (gestureInfo.dwID == GID_ZOOM) { + double currentArgument = static_cast(gestureInfo.ullArguments); + if (lastZoomArgument != 0.0) { + double zoomFactor = currentArgument / lastZoomArgument; + POINT mousePoint = {GET_X_LPARAM(lparam), GET_Y_LPARAM(lparam)}; + ScreenToClient(hwnd, &mousePoint); + float pixelX = static_cast(mousePoint.x); + float pixelY = static_cast(mousePoint.y); + float newZoom = + std::clamp(zoomScale * static_cast(zoomFactor), MIN_ZOOM, MAX_ZOOM); + float oldZoom = zoomScale; + contentOffset.x = pixelX - ((pixelX - contentOffset.x) / oldZoom) * newZoom; + contentOffset.y = pixelY - ((pixelY - contentOffset.y) / oldZoom) * newZoom; + zoomScale = newZoom; + } + lastZoomArgument = currentArgument; + } + if (gestureInfo.dwFlags & GF_END) { + lastZoomArgument = 0.0; + } + CloseGestureInfoHandle(reinterpret_cast(lparam)); + updateZoomScaleAndOffset(); + ::InvalidateRect(windowHandle, nullptr, FALSE); + } + break; + } + default: + return DefWindowProc(windowHandle, message, wparam, lparam); + } + return 0; +} + +void TGFXWindow::destroy() { + if (windowHandle) { + DestroyWindow(windowHandle); + windowHandle = nullptr; + UnregisterClass(ClassName, nullptr); + } +} + +void 
TGFXWindow::centerAndShow() { + if ((GetWindowStyle(windowHandle) & WS_CHILD) != 0) { + return; + } + RECT rcDlg = {0}; + ::GetWindowRect(windowHandle, &rcDlg); + RECT rcArea = {0}; + RECT rcCenter = {0}; + HWND hWnd = windowHandle; + HWND hWndCenter = ::GetWindowOwner(windowHandle); + if (hWndCenter != nullptr) { + hWnd = hWndCenter; + } + + MONITORINFO oMonitor = {}; + oMonitor.cbSize = sizeof(oMonitor); + ::GetMonitorInfo(::MonitorFromWindow(hWnd, MONITOR_DEFAULTTONEAREST), &oMonitor); + rcArea = oMonitor.rcWork; + + if (hWndCenter == nullptr) { + rcCenter = rcArea; + } else { + ::GetWindowRect(hWndCenter, &rcCenter); + } + + int DlgWidth = rcDlg.right - rcDlg.left; + int DlgHeight = rcDlg.bottom - rcDlg.top; + + int xLeft = (rcCenter.left + rcCenter.right) / 2 - DlgWidth / 2; + int yTop = (rcCenter.top + rcCenter.bottom) / 2 - DlgHeight / 2; + + if (xLeft < rcArea.left) { + if (xLeft < 0) { + xLeft = GetSystemMetrics(SM_CXSCREEN) / 2 - DlgWidth / 2; + } else { + xLeft = rcArea.left; + } + } else if (xLeft + DlgWidth > rcArea.right) { + xLeft = rcArea.right - DlgWidth; + } + + if (yTop < rcArea.top) { + if (yTop < 0) { + yTop = GetSystemMetrics(SM_CYSCREEN) / 2 - DlgHeight / 2; + } else { + yTop = rcArea.top; + } + + } else if (yTop + DlgHeight > rcArea.bottom) { + yTop = rcArea.bottom - DlgHeight; + } + ::SetWindowPos(windowHandle, nullptr, xLeft, yTop, -1, -1, + SWP_NOSIZE | SWP_NOZORDER | SWP_NOACTIVATE | SWP_SHOWWINDOW); +} + +float TGFXWindow::getPixelRatio() { +#if WINVER >= 0x0603 + HMONITOR monitor = nullptr; + if (windowHandle != nullptr) { + monitor = ::MonitorFromWindow(windowHandle, MONITOR_DEFAULTTONEAREST); + } else { + monitor = ::MonitorFromPoint(POINT{0, 0}, MONITOR_DEFAULTTOPRIMARY); + } + UINT dpiX = 96; + UINT dpiY = 96; + GetDpiForMonitor(monitor, MDT_EFFECTIVE_DPI, &dpiX, &dpiY); + return static_cast(dpiX) / 96.0f; +#else + return 1.0f; +#endif +} + +void TGFXWindow::createAppHost() { + appHost = std::make_unique(); + + 
displayList.setRenderMode(tgfx::RenderMode::Tiled); + displayList.setAllowZoomBlur(true); + displayList.setMaxTileCount(512); + + std::filesystem::path filePath = __FILE__; + auto rootPath = filePath.parent_path().parent_path().parent_path().string(); + auto imagePath = rootPath + R"(\resources\assets\bridge.jpg)"; + auto image = tgfx::Image::MakeFromFile(imagePath); + appHost->addImage("bridge", image); + imagePath = rootPath + R"(\resources\assets\tgfx.png)"; + appHost->addImage("TGFX", tgfx::Image::MakeFromFile(imagePath)); + auto typeface = tgfx::Typeface::MakeFromName("Microsoft YaHei", ""); + appHost->addTypeface("default", typeface); + auto emojiPath = rootPath + R"(\resources\font\NotoColorEmoji.ttf)"; + typeface = tgfx::Typeface::MakeFromPath(emojiPath); + appHost->addTypeface("emoji", typeface); +} + +void TGFXWindow::updateLayerTree() { + int count = hello2d::LayerBuilder::Count(); + int index = (count > 0) ? (currentDrawerIndex % count) : 0; + if (index != lastDrawIndex || !contentLayer) { + auto builder = hello2d::LayerBuilder::GetByIndex(index); + if (builder) { + contentLayer = builder->buildLayerTree(appHost.get()); + if (contentLayer) { + displayList.root()->removeChildren(); + displayList.root()->addChild(contentLayer); + applyCenteringTransform(); + } + } + lastDrawIndex = index; + } +} + +void TGFXWindow::updateZoomScaleAndOffset() { + displayList.setZoomScale(zoomScale); + displayList.setContentOffset(contentOffset.x, contentOffset.y); +} + +void TGFXWindow::applyCenteringTransform() { + if (lastSurfaceWidth > 0 && lastSurfaceHeight > 0 && contentLayer) { + hello2d::LayerBuilder::ApplyCenteringTransform( + contentLayer, static_cast(lastSurfaceWidth), static_cast(lastSurfaceHeight)); + } +} + +void TGFXWindow::draw() { + if (!tgfxWindow) { + auto device = tgfx::D3D12Device::Make(); + if (device) { + tgfxWindow = tgfx::D3D12Window::MakeFrom(windowHandle, device); + } + } + if (tgfxWindow == nullptr) { + return; + } + RECT rect; + 
GetClientRect(windowHandle, &rect); + auto width = static_cast(rect.right - rect.left); + auto height = static_cast(rect.bottom - rect.top); + if (width <= 0 || height <= 0) { + return; + } + auto pixelRatio = getPixelRatio(); + + if (!displayList.hasContentChanged() && lastRecording == nullptr) { + return; + } + + auto device = tgfxWindow->getDevice(); + if (device == nullptr) { + return; + } + auto context = device->lockContext(); + if (context == nullptr) { + return; + } + if (surface == nullptr) { + if (lastRecording) { + context->submit(std::move(lastRecording)); + } + surface = tgfx::Surface::MakeFrom(context, tgfxWindow); + } + if (surface == nullptr) { + device->unlock(); + return; + } + + auto canvas = surface->getCanvas(); + canvas->clear(); + hello2d::DrawBackground(canvas, surface->width(), surface->height(), pixelRatio); + + displayList.render(surface.get(), false); + + auto recording = context->flush(); + + if (presentImmediately) { + presentImmediately = false; + if (recording) { + context->submit(std::move(recording)); + } + } else { + std::swap(lastRecording, recording); + + if (recording) { + context->submit(std::move(recording)); + } + } + + device->unlock(); +} +} // namespace hello2d diff --git a/win/d3d12/TGFXWindow.h b/win/d3d12/TGFXWindow.h new file mode 100644 index 000000000..c24cc6dc1 --- /dev/null +++ b/win/d3d12/TGFXWindow.h @@ -0,0 +1,81 @@ +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Tencent is pleased to support the open source community by making tgfx available. +// +// Copyright (C) 2026 Tencent. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// unless required by applicable law or agreed to in writing, software distributed under the +// license is distributed on an "as is" basis, without warranties or conditions of any kind, +// either express or implied. see the license for the specific language governing permissions +// and limitations under the license. +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#ifndef UNICODE +#define UNICODE +#endif + +#include +#include +#include +#include +#include +#include +#include "hello2d/AppHost.h" +#include "hello2d/LayerBuilder.h" +#include "tgfx/core/Point.h" +#include "tgfx/core/Surface.h" +#include "tgfx/gpu/Recording.h" +#include "tgfx/gpu/d3d12/D3D12Device.h" +#include "tgfx/gpu/d3d12/D3D12Window.h" +#include "tgfx/layers/DisplayList.h" + +namespace hello2d { +class TGFXWindow { + public: + TGFXWindow(); + virtual ~TGFXWindow(); + + bool open(); + + private: + HWND windowHandle = nullptr; + int currentDrawerIndex = 0; + int lastDrawIndex = -1; + double lastZoomArgument = 0.0; + float zoomScale = 1.0f; + tgfx::Point contentOffset = {0.0f, 0.0f}; + std::shared_ptr tgfxWindow = nullptr; + std::shared_ptr surface = nullptr; + std::shared_ptr appHost = nullptr; + tgfx::DisplayList displayList = {}; + std::shared_ptr contentLayer = nullptr; + std::unique_ptr lastRecording = nullptr; + int lastSurfaceWidth = 0; + int lastSurfaceHeight = 0; + bool presentImmediately = true; + + static WNDCLASS RegisterWindowClass(); + static LRESULT CALLBACK WndProc(HWND window, UINT message, WPARAM wparam, LPARAM lparam) noexcept; + + LRESULT handleMessage(HWND window, UINT message, WPARAM wparam, LPARAM lparam) noexcept; + + void destroy(); + void centerAndShow(); + float getPixelRatio(); + void createAppHost(); + void updateLayerTree(); + void updateZoomScaleAndOffset(); + void applyCenteringTransform(); + void 
draw(); + + bool isDrawing = true; +}; +} // namespace hello2d