diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0381722bd..897c23bae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,8 @@ option(TGFX_BUILD_LAYERS "Enable building the layers module" OFF)
 option(TGFX_BUILD_HELLO2D "Enable building the tgfx-hello2d library for testing" OFF)
 option(TGFX_USE_OPENGL "Use OpenGL as the GPU backend" ON)
 option(TGFX_USE_METAL "Use Metal as the GPU backend on Apple platforms" OFF)
+option(TGFX_USE_D3D12 "Use D3D12 as the GPU backend on Windows" OFF)
+option(TGFX_D3D12_USE_WARP "Force the D3D12 backend onto the WARP software rasterizer (CI only)" OFF)
 option(TGFX_USE_QT "Enable building with the Qt framework." OFF)
 option(TGFX_USE_SWIFTSHADER "Enable building with the SwiftShader library" OFF)
 option(TGFX_USE_ANGLE "Enable building with the ANGLE library" OFF)
@@ -97,6 +99,16 @@ if (TGFX_USE_VULKAN)
     endif ()
     set(TGFX_USE_OPENGL OFF)
     set(TGFX_USE_METAL OFF)
+    set(TGFX_USE_D3D12 OFF)
+    set(TGFX_USE_QT OFF)
+    set(TGFX_USE_SWIFTSHADER OFF)
+    set(TGFX_USE_ANGLE OFF)
+elseif (TGFX_USE_D3D12)
+    if (NOT WIN32)
+        message(FATAL_ERROR "TGFX_USE_D3D12 is only supported on Windows.")
+    endif ()
+    set(TGFX_USE_OPENGL OFF)
+    set(TGFX_USE_METAL OFF)
     set(TGFX_USE_QT OFF)
     set(TGFX_USE_SWIFTSHADER OFF)
     set(TGFX_USE_ANGLE OFF)
@@ -124,8 +136,8 @@ else ()
     endif ()
 endif ()
 
-if (NOT TGFX_USE_METAL AND NOT TGFX_USE_OPENGL AND NOT TGFX_USE_VULKAN)
-    message(FATAL_ERROR "At least one GPU backend (TGFX_USE_METAL, TGFX_USE_OPENGL, or TGFX_USE_VULKAN) must be enabled.")
+if (NOT TGFX_USE_METAL AND NOT TGFX_USE_OPENGL AND NOT TGFX_USE_VULKAN AND NOT TGFX_USE_D3D12)
+    message(FATAL_ERROR "At least one GPU backend (TGFX_USE_METAL, TGFX_USE_OPENGL, TGFX_USE_VULKAN, or TGFX_USE_D3D12) must be enabled.")
 endif ()
 
 message("TGFX_VERSION: ${TGFX_VERSION}")
@@ -135,6 +147,8 @@ message("TGFX_BUILD_LAYERS: ${TGFX_BUILD_LAYERS}")
 message("TGFX_USE_OPENGL: ${TGFX_USE_OPENGL}")
 message("TGFX_USE_METAL: ${TGFX_USE_METAL}")
 message("TGFX_USE_VULKAN: ${TGFX_USE_VULKAN}")
+message("TGFX_USE_D3D12: ${TGFX_USE_D3D12}")
+message("TGFX_D3D12_USE_WARP: ${TGFX_D3D12_USE_WARP}")
 message("TGFX_USE_QT: ${TGFX_USE_QT}")
 message("TGFX_USE_SWIFTSHADER: ${TGFX_USE_SWIFTSHADER}")
 message("TGFX_USE_ANGLE: ${TGFX_USE_ANGLE}")
@@ -192,7 +206,7 @@ file(GLOB PLATFORM_COMMON_FILES
         src/platform/*.*)
 list(APPEND TGFX_FILES ${PLATFORM_COMMON_FILES})
 
-if (NOT TGFX_USE_METAL AND NOT TGFX_USE_VULKAN)
+if (NOT TGFX_USE_METAL AND NOT TGFX_USE_VULKAN AND NOT TGFX_USE_D3D12)
     file(GLOB SHADER_COMPILER_FILES src/gpu/ShaderCompiler.*)
     if (SHADER_COMPILER_FILES)
         list(REMOVE_ITEM TGFX_FILES ${SHADER_COMPILER_FILES})
@@ -371,6 +385,22 @@ elseif (APPLE)
     endif ()
 endif ()
 
+if (TGFX_USE_D3D12)
+    # Define the backend macro and compile every source found under src/gpu/d3d12.
+    list(APPEND TGFX_DEFINES TGFX_USE_D3D12)
+    file(GLOB_RECURSE GFX_PLATFORM_FILES src/gpu/d3d12/*.*)
+    list(APPEND TGFX_FILES ${GFX_PLATFORM_FILES})
+    # shaderc and SPIRV-Cross provide the GLSL to HLSL conversion.
+    list(APPEND TGFX_STATIC_VENDORS shaderc SPIRV-Cross)
+    list(APPEND TGFX_INCLUDES third_party/shaderc/libshaderc/include third_party/SPIRV-Cross)
+    if (TGFX_D3D12_USE_WARP)
+        # Opt-in for CI / headless runs: DevicePool::Make() is routed to D3D12Device::MakeWarp()
+        # so tests use the WARP software rasterizer rather than requiring a hardware GPU.
+        list(APPEND TGFX_DEFINES TGFX_D3D12_USE_WARP)
+    endif ()
+    list(APPEND TGFX_DEFINES SPIRV_CROSS_EXCEPTIONS_TO_ASSERTIONS)
+endif ()
+
 # Auto-sync shaderc sub-dependencies (glslang, SPIRV-Tools, etc.) if any backend needs shaderc.
 # This runs at configure time; for offline builds, run manually before cmake:
 #   python third_party/shaderc/utils/git-sync-deps
@@ -596,7 +626,14 @@ elseif (WIN32)
     file(GLOB_RECURSE PLATFORM_FILES src/platform/win/*.*)
     list(APPEND TGFX_FILES ${PLATFORM_FILES})
 
-    if (TGFX_USE_NATIVE_GL)
+    if (TGFX_USE_D3D12)
+        # Look up the libraries the D3D12 backend links against, then append them to
+        # TGFX_STATIC_LIBS in a fixed order: d3d12, dxgi, d3dcompiler.
+        find_library(D3D12_LIB d3d12)
+        find_library(DXGI_LIB dxgi)
+        find_library(D3DCOMPILER_LIB d3dcompiler)
+        list(APPEND TGFX_STATIC_LIBS ${D3D12_LIB} ${DXGI_LIB} ${D3DCOMPILER_LIB})
+    elseif (TGFX_USE_NATIVE_GL)
         file(GLOB_RECURSE GPU_PLATFORM_FILES src/gpu/opengl/wgl/*.*)
         find_library(OPENGL_LIB opengl32)
         list(APPEND TGFX_STATIC_LIBS ${OPENGL_LIB})
@@ -715,6 +752,10 @@ if (TGFX_BUILD_TESTS)
         if (TGFX_TEST_VULKAN_FILES)
             list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_VULKAN_FILES})
         endif ()
+        file(GLOB_RECURSE TGFX_TEST_D3D12_FILES test/src/d3d12/*.*)
+        if (TGFX_TEST_D3D12_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_D3D12_FILES})
+        endif ()
     elseif (TGFX_USE_VULKAN)
         file(GLOB_RECURSE TGFX_TEST_OPENGL_FILES test/src/opengl/*.*)
         if (TGFX_TEST_OPENGL_FILES)
@@ -724,6 +765,23 @@ if (TGFX_BUILD_TESTS)
         if (TGFX_TEST_METAL_FILES)
             list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_METAL_FILES})
         endif ()
+        file(GLOB_RECURSE TGFX_TEST_D3D12_FILES test/src/d3d12/*.*)
+        if (TGFX_TEST_D3D12_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_D3D12_FILES})
+        endif ()
+    elseif (TGFX_USE_D3D12)
+        file(GLOB_RECURSE TGFX_TEST_OPENGL_FILES test/src/opengl/*.*)
+        if (TGFX_TEST_OPENGL_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_OPENGL_FILES})
+        endif ()
+        file(GLOB_RECURSE TGFX_TEST_METAL_FILES test/src/metal/*.*)
+        if (TGFX_TEST_METAL_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_METAL_FILES})
+        endif ()
+        file(GLOB_RECURSE TGFX_TEST_VULKAN_FILES test/src/vulkan/*.*)
+        if (TGFX_TEST_VULKAN_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_VULKAN_FILES})
+        endif ()
     else ()
         file(GLOB_RECURSE TGFX_TEST_METAL_FILES test/src/metal/*.*)
         list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_METAL_FILES})
@@ -731,6 +789,10 @@ if (TGFX_BUILD_TESTS)
         if (TGFX_TEST_VULKAN_FILES)
             list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_VULKAN_FILES})
         endif ()
+        file(GLOB_RECURSE TGFX_TEST_D3D12_FILES test/src/d3d12/*.*)
+        if (TGFX_TEST_D3D12_FILES)
+            list(REMOVE_ITEM TGFX_TEST_FILES ${TGFX_TEST_D3D12_FILES})
+        endif ()
     endif ()
 
     file(GLOB_RECURSE TGFX_TEST_WEBGL_FILES test/src/webgl/*.*)
diff --git a/include/tgfx/gpu/Backend.h b/include/tgfx/gpu/Backend.h
index 72c5e05e1..99b80a9c4 100644
--- a/include/tgfx/gpu/Backend.h
+++ b/include/tgfx/gpu/Backend.h
@@ -19,6 +19,7 @@
 #pragma once
 
 #include "tgfx/gpu/PixelFormat.h"
+#include "tgfx/gpu/d3d12/D3D12Types.h"
 #include "tgfx/gpu/metal/MetalTypes.h"
 #include "tgfx/gpu/opengl/GLTypes.h"
 #include "tgfx/gpu/vulkan/VulkanTypes.h"
@@ -27,7 +28,7 @@ namespace tgfx {
 /**
  * Possible GPU backend APIs that may be used by TGFX.
  */
-enum class Backend { Unknown, OpenGL, Metal, Vulkan, WebGPU };
+enum class Backend { Unknown, OpenGL, Metal, Vulkan, WebGPU, D3D12 };
 
 /**
  * Wrapper class for passing into and receiving data from TGFX about a backend texture object.
@@ -61,6 +62,13 @@ class BackendTexture {
       : _backend(Backend::Vulkan), _width(width), _height(height), vulkanInfo(vulkanInfo) {
   }
 
+  /**
+   * Creates a D3D12 backend texture.
+   */
+  explicit BackendTexture(const D3D12TextureInfo& d3d12Info, int width, int height)
+      : _backend(Backend::D3D12), _width(width), _height(height), d3d12Info(d3d12Info) {
+  }
+
   BackendTexture(const BackendTexture& that) {
     *this = that;
   }
@@ -118,6 +126,12 @@ class BackendTexture {
    */
   bool getVulkanImageInfo(VulkanImageInfo* vulkanImageInfo) const;
 
+  /**
+   * If this texture is valid and its backend API is D3D12, copies a snapshot of the
+   * D3D12TextureInfo struct into the passed in pointer and returns true. Otherwise returns false.
+   */
+  bool getD3D12TextureInfo(D3D12TextureInfo* d3d12TextureInfo) const;
+
  private:
   Backend _backend = Backend::Unknown;
   int _width = 0;
@@ -127,6 +141,7 @@ class BackendTexture {
     GLTextureInfo glInfo;
     MetalTextureInfo metalInfo;
     VulkanImageInfo vulkanInfo;
+    D3D12TextureInfo d3d12Info;
   };
 };
 
@@ -162,6 +177,13 @@ class BackendRenderTarget {
       : _backend(Backend::Vulkan), _width(width), _height(height), vulkanInfo(vulkanInfo) {
   }
 
+  /**
+   * Creates a D3D12 backend render target.
+   */
+  explicit BackendRenderTarget(const D3D12TextureInfo& d3d12Info, int width, int height)
+      : _backend(Backend::D3D12), _width(width), _height(height), d3d12Info(d3d12Info) {
+  }
+
   BackendRenderTarget(const BackendRenderTarget& that) {
     *this = that;
   }
@@ -219,6 +241,12 @@ class BackendRenderTarget {
    */
   bool getVulkanImageInfo(VulkanImageInfo* vulkanImageInfo) const;
 
+  /**
+   * If this render target is valid and its backend API is D3D12, copies a snapshot of the
+   * D3D12TextureInfo struct into the passed in pointer and returns true. Otherwise returns false.
+   */
+  bool getD3D12TextureInfo(D3D12TextureInfo* d3d12TextureInfo) const;
+
  private:
   Backend _backend = Backend::Unknown;
   int _width = 0;
@@ -227,6 +255,7 @@ class BackendRenderTarget {
     GLFrameBufferInfo glInfo;
     MetalTextureInfo metalInfo;
     VulkanImageInfo vulkanInfo;
+    D3D12TextureInfo d3d12Info;
   };
 };
 
@@ -262,6 +291,13 @@ class BackendSemaphore {
       : _backend(Backend::Vulkan), vulkanSyncInfo(vulkanInfo) {
   }
 
+  /**
+   * Creates a D3D12 backend semaphore.
+   */
+  explicit BackendSemaphore(const D3D12SyncInfo& d3d12Info)
+      : _backend(Backend::D3D12), d3d12SyncInfo(d3d12Info) {
+  }
+
   BackendSemaphore(const BackendSemaphore& that) {
     *this = that;
   }
@@ -298,12 +334,19 @@ class BackendSemaphore {
    */
   bool getVulkanSync(VulkanSyncInfo* vulkanSyncInfo) const;
 
+  /**
+   * If the backend API is D3D12 and a fence is set, copies a snapshot of the D3D12SyncInfo struct
+   * into the passed in pointer and returns true. Otherwise returns false.
+   */
+  bool getD3D12Sync(D3D12SyncInfo* d3d12Info) const;
+
  private:
   Backend _backend = Backend::Unknown;
   union {
     GLSyncInfo glSyncInfo;
     MetalSyncInfo metalSyncInfo;
     VulkanSyncInfo vulkanSyncInfo;
+    D3D12SyncInfo d3d12SyncInfo;
   };
 };
 }  // namespace tgfx
diff --git a/include/tgfx/gpu/d3d12/D3D12Device.h b/include/tgfx/gpu/d3d12/D3D12Device.h
new file mode 100644
index 000000000..3f29e9b9d
--- /dev/null
+++ b/include/tgfx/gpu/d3d12/D3D12Device.h
@@ -0,0 +1,66 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "tgfx/gpu/Device.h"
+
+namespace tgfx {
+
+/**
+ * The Device implementation that draws graphics through the Direct3D 12 API.
+ */
+class D3D12Device : public Device {
+ public:
+  /**
+   * Creates a new D3D12Device using the default hardware adapter. Returns nullptr if D3D12 is not
+   * available.
+   */
+  static std::shared_ptr<D3D12Device> Make();
+
+  /**
+   * Creates a new D3D12Device backed by the WARP software rasterizer. WARP is a CPU-based D3D12
+   * implementation that ships with Windows; it is functionally complete (feature level 12_1) but
+   * orders of magnitude slower than a real GPU. Intended for headless CI runners and other
+   * environments without a usable hardware adapter — do not rely on it for performance work.
+   * Returns nullptr if WARP is unavailable on the current system.
+   */
+  static std::shared_ptr<D3D12Device> MakeWarp();
+
+  /**
+   * Creates a new D3D12Device from an existing ID3D12Device. The device parameter is a pointer to
+   * an ID3D12Device object. Returns nullptr if the device is invalid.
+   */
+  static std::shared_ptr<D3D12Device> MakeFrom(void* device);
+
+  ~D3D12Device() override;
+
+  /**
+   * Returns the underlying ID3D12Device as an untyped raw pointer.
+   */
+  void* d3d12Device() const;
+
+ protected:
+  bool onLockContext() override;
+  void onUnlockContext() override;
+
+ private:
+  explicit D3D12Device(std::unique_ptr<class D3D12GPU> gpu);  // built via the Make* factories
+};
+
+}  // namespace tgfx
diff --git a/include/tgfx/gpu/d3d12/D3D12Types.h b/include/tgfx/gpu/d3d12/D3D12Types.h
new file mode 100644
index 000000000..d66d96981
--- /dev/null
+++ b/include/tgfx/gpu/d3d12/D3D12Types.h
@@ -0,0 +1,60 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+namespace tgfx {
+/**
+ * Describes a D3D12 texture created externally to TGFX.
+ */
+struct D3D12TextureInfo {
+  /**
+   * Pointer to an ID3D12Resource object representing a texture. Defaults to nullptr.
+   */
+  const void* resource = nullptr;
+
+  /**
+   * The pixel format of this texture, stored as a raw DXGI_FORMAT value.
+   */
+  unsigned format = 0;  // DXGI_FORMAT_UNKNOWN
+};
+
+/**
+ * Describes a D3D12 synchronization object created externally to TGFX.
+ */
+struct D3D12SyncInfo {
+  /**
+   * Pointer to an ID3D12Fence object. A null fence marks the sync info as uninitialized.
+   */
+  const void* fence = nullptr;
+
+  /**
+   * The signal value for the fence.
+   */
+  uint64_t value = 0;
+};
+
+static_assert(std::is_trivially_copyable_v<D3D12TextureInfo>);
+static_assert(std::is_trivially_copyable_v<D3D12SyncInfo>);
+static_assert(std::is_standard_layout_v<D3D12TextureInfo>);
+static_assert(std::is_standard_layout_v<D3D12SyncInfo>);
+
+}  // namespace tgfx
diff --git a/include/tgfx/gpu/d3d12/D3D12Window.h b/include/tgfx/gpu/d3d12/D3D12Window.h
new file mode 100644
index 000000000..8e61a490d
--- /dev/null
+++ b/include/tgfx/gpu/d3d12/D3D12Window.h
@@ -0,0 +1,67 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <memory>
+#include "tgfx/core/ColorSpace.h"
+#include "tgfx/gpu/Window.h"
+#include "tgfx/gpu/d3d12/D3D12Device.h"
+
+#ifdef _WIN32
+struct HWND__;
+typedef HWND__* HWND;
+#endif
+
+namespace tgfx {
+
+/**
+ * D3D12Window manages an IDXGISwapChain3 and its backbuffer textures for presenting rendered
+ * content to a Win32 window. Each frame the current backbuffer is exposed as a RenderTarget
+ * through the standard Window/Surface API; on present the swap chain flips to the next buffer.
+ */
+class D3D12Window : public Window {
+ public:
+#ifdef _WIN32
+  /**
+   * Creates a D3D12Window from a Win32 window handle. Returns nullptr if the swap chain cannot
+   * be created. Note: only sRGB output is currently supported. The colorSpace parameter is
+   * accepted for forward compatibility but non-sRGB values are ignored with a warning.
+   */
+  static std::shared_ptr<D3D12Window> MakeFrom(HWND hwnd, std::shared_ptr<D3D12Device> device,
+                                               std::shared_ptr<ColorSpace> colorSpace = nullptr);
+#endif
+
+  ~D3D12Window() override;
+
+ protected:
+  std::shared_ptr<RenderTargetProxy> onCreateRenderTarget(Context* context) override;
+  void onPresent(Context* context) override;
+
+ private:
+  // PImpl: all DXGI / D3D12 handles and per-backbuffer state live in PlatformState (defined in
+  // the .cpp) so this header pulls in neither dxgi.h nor d3d12.h.
+  struct PlatformState;
+
+  explicit D3D12Window(std::shared_ptr<Device> device, std::unique_ptr<PlatformState> state,
+                       std::shared_ptr<ColorSpace> colorSpace);
+
+  std::unique_ptr<PlatformState> _platformState;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/Backend.cpp b/src/gpu/Backend.cpp
index 45e4c9d5b..15f903fe8 100644
--- a/src/gpu/Backend.cpp
+++ b/src/gpu/Backend.cpp
@@ -17,6 +17,7 @@
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 #include "tgfx/gpu/Backend.h"
+#include "d3d12/D3D12Defines.h"
 #include "metal/MetalDefines.h"
 #include "opengl/GLDefines.h"
 #include "vulkan/VulkanDefines.h"
@@ -63,6 +64,9 @@ BackendTexture& BackendTexture::operator=(const BackendTexture& that) {
     case Backend::Vulkan:
       vulkanInfo = that.vulkanInfo;
       break;
+    case Backend::D3D12:
+      d3d12Info = that.d3d12Info;
+      break;
     default:
       break;
   }
@@ -80,6 +84,8 @@ PixelFormat BackendTexture::format() const {
       return MetalPixelFormatToPixelFormat(metalInfo.format);
     case Backend::Vulkan:
       return VulkanFormatToPixelFormat(vulkanInfo.format);
+    case Backend::D3D12:
+      return DXGIFormatToPixelFormat(d3d12Info.format);
     default:
       break;
   }
@@ -110,6 +116,14 @@ bool BackendTexture::getVulkanImageInfo(VulkanImageInfo* vulkanImageInfo) const
   return true;
 }
 
+bool BackendTexture::getD3D12TextureInfo(D3D12TextureInfo* d3d12TextureInfo) const {
+  if (d3d12TextureInfo == nullptr || !isValid() || _backend != Backend::D3D12) {
+    return false;  // null output pointer, invalid texture, or a non-D3D12 backend
+  }
+  *d3d12TextureInfo = d3d12Info;
+  return true;
+}
+
 BackendRenderTarget& BackendRenderTarget::operator=(const BackendRenderTarget& that) {
   if (!that.isValid()) {
     _width = _height = 0;
@@ -128,6 +142,9 @@ BackendRenderTarget& BackendRenderTarget::operator=(const BackendRenderTarget& t
     case Backend::Vulkan:
       vulkanInfo = that.vulkanInfo;
       break;
+    case Backend::D3D12:
+      d3d12Info = that.d3d12Info;
+      break;
     default:
       break;
   }
@@ -145,6 +162,8 @@ PixelFormat BackendRenderTarget::format() const {
       return MetalPixelFormatToPixelFormat(metalInfo.format);
     case Backend::Vulkan:
       return VulkanFormatToPixelFormat(vulkanInfo.format);
+    case Backend::D3D12:
+      return DXGIFormatToPixelFormat(d3d12Info.format);
     default:
       break;
   }
@@ -175,6 +194,14 @@ bool BackendRenderTarget::getVulkanImageInfo(VulkanImageInfo* vulkanImageInfo) c
   return true;
 }
 
+bool BackendRenderTarget::getD3D12TextureInfo(D3D12TextureInfo* d3d12TextureInfo) const {
+  if (d3d12TextureInfo == nullptr || !isValid() || _backend != Backend::D3D12) {
+    return false;  // null output pointer, invalid render target, or a non-D3D12 backend
+  }
+  *d3d12TextureInfo = d3d12Info;
+  return true;
+}
+
 BackendSemaphore& BackendSemaphore::operator=(const BackendSemaphore& that) {
   _backend = that._backend;
   switch (that._backend) {
@@ -187,6 +214,9 @@ BackendSemaphore& BackendSemaphore::operator=(const BackendSemaphore& that) {
     case Backend::Vulkan:
       vulkanSyncInfo = that.vulkanSyncInfo;
       break;
+    case Backend::D3D12:
+      d3d12SyncInfo = that.d3d12SyncInfo;
+      break;
     default:
       break;
   }
@@ -201,6 +231,8 @@ bool BackendSemaphore::isInitialized() const {
       return metalSyncInfo.event != nullptr;
     case Backend::Vulkan:
       return vulkanSyncInfo.semaphore != 0;
+    case Backend::D3D12:
+      return d3d12SyncInfo.fence != nullptr;
     default:
       break;
   }
@@ -231,4 +263,12 @@ bool BackendSemaphore::getVulkanSync(VulkanSyncInfo* vulkanSyncInfo) const {
   return true;
 }
 
+bool BackendSemaphore::getD3D12Sync(D3D12SyncInfo* d3d12Info) const {
+  if (d3d12Info == nullptr || _backend != Backend::D3D12 || d3d12SyncInfo.fence == nullptr) {
+    return false;  // null output pointer, a non-D3D12 backend, or no fence set
+  }
+  *d3d12Info = d3d12SyncInfo;
+  return true;
+}
+
 }  // namespace tgfx
diff --git a/src/gpu/ShaderCaps.cpp b/src/gpu/ShaderCaps.cpp
index 91fafb6b0..2b1a179ee 100644
--- a/src/gpu/ShaderCaps.cpp
+++ b/src/gpu/ShaderCaps.cpp
@@ -40,6 +40,9 @@ static void PrintGPUInfo(const GPUInfo* info) {
     case Backend::WebGPU:
       backend = "WebGPU";
       break;
+    case Backend::D3D12:
+      backend = "D3D12";
+      break;
     case Backend::Unknown:
       backend = "Unknown";
       break;
diff --git a/src/gpu/ShaderCompiler.cpp b/src/gpu/ShaderCompiler.cpp
index 780573bb7..1f3f144b6 100644
--- a/src/gpu/ShaderCompiler.cpp
+++ b/src/gpu/ShaderCompiler.cpp
@@ -139,12 +139,17 @@ std::string PreprocessGLSL(const std::string& glslCode) {
 }
 
 std::vector<uint32_t> CompileGLSLToSPIRV(const shaderc::Compiler* compiler,
-                                         const std::string& glslCode, ShaderStage stage) {
+                                         const std::string& glslCode, ShaderStage stage,
+                                         bool preserveInterfaceVariables) {
   if (compiler == nullptr) {
     return {};
   }
   shaderc::CompileOptions options;
-  options.SetOptimizationLevel(shaderc_optimization_level_performance);
+  // See header doc on `preserveInterfaceVariables` for the rationale; D3D12 requires zero so the
+  // optimiser cannot dead-strip fragment inputs that have no body uses, while Vulkan/Metal stay
+  // on the performance preset for tighter SPIR-V.
+  options.SetOptimizationLevel(preserveInterfaceVariables ? shaderc_optimization_level_zero
+                                                          : shaderc_optimization_level_performance);
   options.SetTargetEnvironment(shaderc_target_env_vulkan, shaderc_env_version_vulkan_1_0);
 
   shaderc_shader_kind shaderKind =
diff --git a/src/gpu/ShaderCompiler.h b/src/gpu/ShaderCompiler.h
index b149d591f..ba1c97011 100644
--- a/src/gpu/ShaderCompiler.h
+++ b/src/gpu/ShaderCompiler.h
@@ -36,7 +36,17 @@ std::string PreprocessGLSL(const std::string& glslCode);
 
 /// Compiles preprocessed GLSL 450 source to SPIR-V binary using shaderc. Returns an empty vector
 /// on failure.
+///
+/// `preserveInterfaceVariables` controls the optimisation level:
+///   - false (default): runs `shaderc_optimization_level_performance`, which is what Vulkan and
+///     Metal want — both bind interface variables by name/location, so dead-stripping unused
+///     fragment inputs is harmless and yields better generated code.
+///   - true: runs `shaderc_optimization_level_zero` so every declared vertex output / fragment
+///     input survives. D3D12 needs this because the SPIR-V → HLSL pass turns SPIR-V locations
+///     into TEXCOORDn semantics; if the optimiser drops a fragment input, the resulting HLSL
+///     mismatches the vertex shader's output signature and PSO creation fails.
 std::vector<uint32_t> CompileGLSLToSPIRV(const shaderc::Compiler* compiler,
-                                         const std::string& vulkanGLSL, ShaderStage stage);
+                                         const std::string& vulkanGLSL, ShaderStage stage,
+                                         bool preserveInterfaceVariables = false);
 
 }  // namespace tgfx
diff --git a/src/gpu/VaryingHandler.cpp b/src/gpu/VaryingHandler.cpp
index 2ae27c06b..dbdd2aaa6 100644
--- a/src/gpu/VaryingHandler.cpp
+++ b/src/gpu/VaryingHandler.cpp
@@ -17,6 +17,7 @@
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 #include "VaryingHandler.h"
+#include <string>
 #include "ProgramBuilder.h"
 
 namespace tgfx {
@@ -61,6 +62,14 @@ void VaryingHandler::finalize() {
 
 void VaryingHandler::appendDecls(const std::vector<ShaderVar>& vars, std::string* out,
                                  ShaderStage stage) const {
+  // Do not emit explicit "layout(location=N)" qualifiers for varyings here. Desktop OpenGL
+  // sticks to "#version 150" which rejects layout(location) on varyings without the
+  // GL_ARB_separate_shader_objects extension, so emitting them breaks shader compilation on
+  // that backend. The SPIR-V cross-compiled backends (D3D12 / Vulkan / Metal) reassign
+  // locations later in ShaderCompiler::PreprocessGLSL via the in/out regex passes, which walk
+  // the GLSL source in textual order. Vertex outputs and fragment inputs come from the same
+  // VaryingHandler::varyings sequence, so the per-stage numbering they end up with already
+  // matches across the boundary.
   for (const auto& var : vars) {
     out->append(programBuilder->getShaderVarDeclarations(var, stage));
     out->append(";\n");
diff --git a/src/gpu/d3d12/D3D12BarrierBatch.cpp b/src/gpu/d3d12/D3D12BarrierBatch.cpp
new file mode 100644
index 000000000..5666450a7
--- /dev/null
+++ b/src/gpu/d3d12/D3D12BarrierBatch.cpp
@@ -0,0 +1,47 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12BarrierBatch.h"
+
+namespace tgfx {
+
+void D3D12BarrierBatch::addTransition(ID3D12Resource* resource, D3D12_RESOURCE_STATES before,
+                                      D3D12_RESOURCE_STATES after, UINT subresource) {
+  // Queue a barrier only when there is a resource and its state actually changes.
+  if (resource != nullptr && before != after) {
+    D3D12_RESOURCE_BARRIER entry = {};
+    entry.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+    entry.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+    entry.Transition.pResource = resource;
+    entry.Transition.Subresource = subresource;
+    entry.Transition.StateBefore = before;
+    entry.Transition.StateAfter = after;
+    barriers.push_back(entry);
+  }
+}
+
+void D3D12BarrierBatch::flush(ID3D12GraphicsCommandList* commandList) {
+  // Without a command list the queued barriers are discarded rather than submitted.
+  const auto count = static_cast<UINT>(barriers.size());
+  if (count > 0 && commandList != nullptr) {
+    commandList->ResourceBarrier(count, barriers.data());
+  }
+  barriers.clear();
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12BarrierBatch.h b/src/gpu/d3d12/D3D12BarrierBatch.h
new file mode 100644
index 000000000..fbaef6456
--- /dev/null
+++ b/src/gpu/d3d12/D3D12BarrierBatch.h
@@ -0,0 +1,87 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <vector>
+#include "D3D12Util.h"
+
+namespace tgfx {
+
+/**
+ * Accumulates D3D12 resource transitions and flushes them in a single ResourceBarrier call.
+ *
+ * Why batching matters:
+ *   - Each ID3D12GraphicsCommandList::ResourceBarrier call has fixed CPU-side runtime/driver
+ *     cost (state-machine bookkeeping, debug-layer validation, and on some GPUs an implicit
+ *     pipeline stall to flush caches).
+ *   - When N independent barriers are submitted in a single call, the driver can collapse
+ *     redundant cache flushes — for example five textures all moving from RENDER_TARGET to
+ *     PIXEL_SHADER_RESOURCE only require one RT-cache flush + one PS-cache invalidate, not
+ *     five of each. NVIDIA / AMD / Intel guidance reportedly ranks unbatched barriers among the
+ *     top D3D12 performance pitfalls (NOTE(review): cite the source).
+ *
+ * Usage pattern:
+ *   D3D12BarrierBatch batch;
+ *   batch.addTransition(rt, COMMON, RENDER_TARGET);
+ *   batch.addTransition(dsv, COMMON, DEPTH_WRITE);
+ *   batch.flush(commandList);   // one ResourceBarrier(2, ...)
+ *
+ * Thread safety: not thread-safe. Each batch is local to one command-list recording context.
+ */
+class D3D12BarrierBatch {
+ public:
+  D3D12BarrierBatch() {
+    // Most batches in the backend hold 1–8 transitions (e.g. one per color attachment, or one
+    // per sampled texture in a draw). Pre-reserve to avoid the small-buffer reallocation
+    // sequence that plain push_back() incurs.
+    barriers.reserve(8);
+  }
+
+  D3D12BarrierBatch(const D3D12BarrierBatch&) = delete;
+  D3D12BarrierBatch& operator=(const D3D12BarrierBatch&) = delete;
+
+  /**
+   * Queues a transition barrier. No-op if `resource` is null or `before == after`, so callers
+   * can pass through already-correct states without polluting the batch. `subresource` defaults
+   * to "all subresources" which is what most call sites need; callers manipulating individual
+   * mips should pass the explicit subresource index.
+   */
+  void addTransition(ID3D12Resource* resource, D3D12_RESOURCE_STATES before,
+                     D3D12_RESOURCE_STATES after,
+                     UINT subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES);
+
+  /**
+   * Submits all queued barriers to the command list as a single ResourceBarrier(N, ...) call
+   * and clears the batch. A no-op when the batch is empty; a null `commandList` only clears it.
+   */
+  void flush(ID3D12GraphicsCommandList* commandList);
+
+  bool empty() const {
+    return barriers.empty();
+  }
+
+  size_t size() const {
+    return barriers.size();
+  }
+
+ private:
+  std::vector<D3D12_RESOURCE_BARRIER> barriers;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Buffer.cpp b/src/gpu/d3d12/D3D12Buffer.cpp
new file mode 100644
index 000000000..a658eea20
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Buffer.cpp
@@ -0,0 +1,135 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12Buffer.h"
+#include "D3D12GPU.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+/**
+ * Creates a committed D3D12 buffer of `size` bytes for the given GPUBufferUsage flags. Returns
+ * nullptr when `gpu` is null, `size` is zero, or CreateCommittedResource fails.
+ */
+std::shared_ptr<D3D12Buffer> D3D12Buffer::Make(D3D12GPU* gpu, size_t size, uint32_t usage) {
+  if (gpu == nullptr || size == 0) {
+    return nullptr;
+  }
+
+  const bool isReadback = (usage & GPUBufferUsage::READBACK) != 0;
+
+  // Readback buffers live in the READBACK heap and start out as a copy destination. Everything
+  // else goes to the UPLOAD heap and starts in GENERIC_READ.
+  //
+  // TODO: Place static (write-once) vertex/index buffers in D3D12_HEAP_TYPE_DEFAULT and stage
+  // their initial contents through an UPLOAD heap copy, leaving only streamed (per-frame)
+  // buffers in D3D12_HEAP_TYPE_UPLOAD. The Vulkan and Metal backends share the same shortcut
+  // today (every non-readback buffer is host-visible) so any improvement here should be paired
+  // with the matching VMA/MTLResourceStorageModePrivate work and a STATIC/STREAM hint on
+  // GPUBufferUsage. Until then keep UPLOAD so vertex/index/uniform buffers remain mappable.
+  D3D12_HEAP_PROPERTIES heapProps = {};
+  heapProps.Type = isReadback ? D3D12_HEAP_TYPE_READBACK : D3D12_HEAP_TYPE_UPLOAD;
+  const D3D12_RESOURCE_STATES startState =
+      isReadback ? D3D12_RESOURCE_STATE_COPY_DEST : D3D12_RESOURCE_STATE_GENERIC_READ;
+
+  // Plain one-dimensional buffer description: `size` bytes, row-major, no format, no flags.
+  D3D12_RESOURCE_DESC desc = {};
+  desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+  desc.Width = static_cast<UINT64>(size);
+  desc.Height = 1;
+  desc.DepthOrArraySize = 1;
+  desc.MipLevels = 1;
+  desc.Format = static_cast<DXGI_FORMAT>(DXGI_FORMAT_UNKNOWN);
+  desc.SampleDesc.Count = 1;
+  desc.SampleDesc.Quality = 0;
+  desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+  desc.Flags = D3D12_RESOURCE_FLAG_NONE;
+
+  ComPtr<ID3D12Resource> buffer = nullptr;
+  const auto result = gpu->device()->CreateCommittedResource(
+      &heapProps, D3D12_HEAP_FLAG_NONE, &desc, startState, nullptr, IID_PPV_ARGS(&buffer));
+  if (FAILED(result)) {
+    LOGE("D3D12Buffer::Make() CreateCommittedResource failed, HRESULT=0x%08X",
+         static_cast<unsigned>(result));
+    return nullptr;
+  }
+
+  return gpu->makeResource<D3D12Buffer>(size, usage, std::move(buffer));
+}
+
+/**
+ * Wraps an already-created D3D12 resource. `size` and `usage` are forwarded to the GPUBuffer
+ * base; the buffer takes over the reference held by `d3d12Resource`.
+ */
+D3D12Buffer::D3D12Buffer(size_t size, uint32_t usage, ComPtr<ID3D12Resource> d3d12Resource)
+    : GPUBuffer(size, usage), resource(std::move(d3d12Resource)) {
+  // Nothing else to set up: the buffer starts unmapped.
+}
+
+/**
+ * Drops the underlying D3D12 resource. A mapping that is still open is closed first so the
+ * resource is not released while mapped.
+ */
+void D3D12Buffer::onRelease(D3D12GPU*) {
+  const bool stillMapped = (mappedPointer != nullptr);
+  if (stillMapped) {
+    // A null written-range pointer is passed because the touched range is not tracked here.
+    resource->Unmap(0, nullptr);
+    mappedPointer = nullptr;
+  }
+  resource = nullptr;
+}
+
+/**
+ * Maps `size` bytes of the buffer starting at `offset` and returns a CPU pointer to the first
+ * mapped byte. Passing GPU_BUFFER_WHOLE_SIZE as `size` maps everything from `offset` to the end
+ * of the buffer.
+ *
+ * Returns nullptr when the buffer has been released, is already mapped, `size` is zero, the
+ * requested range does not lie entirely inside the buffer, or ID3D12Resource::Map fails.
+ */
+void* D3D12Buffer::map(size_t offset, size_t size) {
+  if (resource == nullptr || mappedPointer != nullptr) {
+    return nullptr;
+  }
+  if (size == 0) {
+    LOGE("D3D12Buffer::map() size cannot be 0!");
+    return nullptr;
+  }
+  // Validate the offset before doing any arithmetic with it. With an offset past the end,
+  // `_size - offset` would wrap around and `offset + size` would wrap back to `_size`, so a
+  // sum-based bounds check would accept an out-of-range mapping.
+  if (offset >= _size) {
+    LOGE("D3D12Buffer::map() range out of bounds!");
+    return nullptr;
+  }
+  const size_t available = _size - offset;
+  if (size == GPU_BUFFER_WHOLE_SIZE) {
+    size = available;
+  } else if (size > available) {
+    // Compared against the remaining byte count rather than `offset + size` so that a huge
+    // `size` cannot overflow the check.
+    LOGE("D3D12Buffer::map() range out of bounds!");
+    return nullptr;
+  }
+
+  // The read range tells D3D12 which bytes the CPU may read. Only readback buffers are read on
+  // the CPU; for every other buffer the range stays empty (Begin == End == 0).
+  D3D12_RANGE readRange = {};
+  if (_usage & GPUBufferUsage::READBACK) {
+    readRange.Begin = offset;
+    readRange.End = offset + size;
+  }
+
+  void* data = nullptr;
+  auto hr = resource->Map(0, &readRange, &data);
+  if (FAILED(hr) || data == nullptr) {
+    LOGE("D3D12Buffer::map() Map failed, HRESULT=0x%08X", static_cast<unsigned>(hr));
+    return nullptr;
+  }
+
+  // Map() returns the address of the start of the resource; apply the caller's offset here.
+  mappedPointer = static_cast<uint8_t*>(data) + offset;
+  return mappedPointer;
+}
+
+/**
+ * Closes the mapping opened by map(). Does nothing when the buffer has been released or is not
+ * currently mapped.
+ */
+void D3D12Buffer::unmap() {
+  const bool isMapped = (resource != nullptr) && (mappedPointer != nullptr);
+  if (!isMapped) {
+    return;
+  }
+  // Readback buffers report an empty written range (Begin == End == 0). Every other buffer
+  // reports the whole buffer as written, since the mapped sub-range is not remembered.
+  const bool isReadback = (_usage & GPUBufferUsage::READBACK) != 0;
+  D3D12_RANGE writtenRange = {};
+  writtenRange.End = isReadback ? 0 : _size;
+  resource->Unmap(0, &writtenRange);
+  mappedPointer = nullptr;
+}
+
+/**
+ * Reports whether the buffer still owns a D3D12 resource, i.e. it has not been released.
+ */
+bool D3D12Buffer::isReady() const {
+  const bool hasResource = (resource != nullptr);
+  return hasResource;
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Buffer.h b/src/gpu/d3d12/D3D12Buffer.h
new file mode 100644
index 000000000..6748884db
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Buffer.h
@@ -0,0 +1,60 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "D3D12Resource.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/GPUBuffer.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * D3D12 buffer implementation. Wraps a committed ID3D12Resource and exposes it through the
+ * backend-neutral GPUBuffer interface (map / unmap / isReady).
+ */
+class D3D12Buffer : public GPUBuffer, public D3D12Resource {
+ public:
+  /**
+   * Creates a buffer of `size` bytes for the given GPUBufferUsage flags. Returns nullptr when
+   * `gpu` is null, `size` is zero, or the D3D12 resource cannot be created.
+   */
+  static std::shared_ptr<D3D12Buffer> Make(D3D12GPU* gpu, size_t size, uint32_t usage);
+
+  /**
+   * Returns the underlying D3D12 resource, or nullptr once the buffer has been released.
+   */
+  ID3D12Resource* d3d12Resource() const {
+    return resource.Get();
+  }
+
+  /**
+   * Maps `size` bytes starting at `offset` for CPU access and returns a pointer to the first
+   * mapped byte, or nullptr if the buffer is already mapped, released, or the range is invalid.
+   */
+  void* map(size_t offset, size_t size) override;
+
+  /**
+   * Closes the mapping opened by map(). No-op when the buffer is not mapped.
+   */
+  void unmap() override;
+
+  /**
+   * Returns true while the buffer still owns its D3D12 resource.
+   */
+  bool isReady() const override;
+
+ protected:
+  /**
+   * Unmaps the buffer if it is still mapped and drops the D3D12 resource.
+   */
+  void onRelease(D3D12GPU* gpu) override;
+
+ private:
+  // Construction and destruction are private; instances are created through Make().
+  D3D12Buffer(size_t size, uint32_t usage, ComPtr<ID3D12Resource> resource);
+  ~D3D12Buffer() override = default;
+
+  // The committed resource backing this buffer; null after onRelease().
+  ComPtr<ID3D12Resource> resource = nullptr;
+  // Pointer returned by the active map() call (already offset); null while unmapped.
+  void* mappedPointer = nullptr;
+
+  friend class D3D12GPU;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandBuffer.cpp b/src/gpu/d3d12/D3D12CommandBuffer.cpp
new file mode 100644
index 000000000..18b0c248e
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandBuffer.cpp
@@ -0,0 +1,34 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12CommandBuffer.h"
+#include "D3D12GPU.h"
+
+namespace tgfx {
+
+/**
+ * Reclaims the frame session if this command buffer is destroyed without ever having been
+ * submitted.
+ */
+D3D12CommandBuffer::~D3D12CommandBuffer() {
+  // A null allocator means submit() already moved the session out: the normal path, with
+  // nothing left to clean up. A non-null allocator means finish() succeeded but the command
+  // buffer was abandoned, so the session goes through the same unified reclaim path that
+  // reclaimSubmission() uses.
+  const bool abandoned = (session.commandAllocator != nullptr);
+  if (abandoned) {
+    _gpu->reclaimAbandonedSession(std::move(session));
+  }
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandBuffer.h b/src/gpu/d3d12/D3D12CommandBuffer.h
new file mode 100644
index 000000000..4d2769bcc
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandBuffer.h
@@ -0,0 +1,66 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "gpu/d3d12/D3D12FrameSession.h"
+#include "tgfx/gpu/CommandBuffer.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * Transport container that carries a D3D12FrameSession from encoding to submission.
+ *
+ * Created by D3D12CommandEncoder::onFinish() which moves its D3D12FrameSession here. Consumed by
+ * D3D12CommandQueue::submit() which moves the session into D3D12GPU's InflightSubmission. After
+ * submit(), this object is empty and may be discarded.
+ *
+ * If the CommandBuffer is abandoned (created but never submitted), the destructor reclaims all
+ * session resources via D3D12GPU::reclaimAbandonedSession(). This matches the abandon safety
+ * guarantee provided by D3D12CommandEncoder::onRelease() — both use the same unified cleanup path
+ * in D3D12GPU, ensuring no D3D12 objects are leaked regardless of where the pipeline is
+ * interrupted.
+ */
+class D3D12CommandBuffer : public CommandBuffer {
+ public:
+  /**
+   * Takes ownership of `session`. `gpu` is stored as a raw pointer and is used by the destructor
+   * to reclaim an abandoned session.
+   */
+  D3D12CommandBuffer(D3D12GPU* gpu, D3D12FrameSession session)
+      : _gpu(gpu), session(std::move(session)) {
+  }
+
+  /**
+   * Reclaims the session through the GPU if it still holds a command allocator, i.e. if the
+   * command buffer was never submitted.
+   */
+  ~D3D12CommandBuffer() override;
+
+  /**
+   * Returns a mutable reference to the carried session so the submitter can move it out.
+   */
+  D3D12FrameSession& frameSession() {
+    return session;
+  }
+
+  /**
+   * Returns the session's command list, or nullptr if the session holds none.
+   */
+  ID3D12GraphicsCommandList* d3d12CommandList() const {
+    return session.commandList.Get();
+  }
+
+  /**
+   * Returns the session's command allocator, or nullptr if the session holds none.
+   */
+  ID3D12CommandAllocator* d3d12CommandAllocator() const {
+    return session.commandAllocator.Get();
+  }
+
+ private:
+  // GPU used by the destructor's abandon path; not owned.
+  D3D12GPU* _gpu = nullptr;
+  // The recorded frame session being transported to submission.
+  D3D12FrameSession session;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandEncoder.cpp b/src/gpu/d3d12/D3D12CommandEncoder.cpp
new file mode 100644
index 000000000..e7db252bf
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandEncoder.cpp
@@ -0,0 +1,456 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12CommandEncoder.h"
+#include "D3D12BarrierBatch.h"
+#include "D3D12Buffer.h"
+#include "D3D12CommandBuffer.h"
+#include "D3D12Defines.h"
+#include "D3D12GPU.h"
+#include "D3D12MipmapGenerator.h"
+#include "D3D12RenderPass.h"
+#include "D3D12Texture.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+/**
+ * Creates an encoder backed by an allocator/command-list pair taken from the GPU's command-list
+ * pool. Returns nullptr when `gpu` is null or the pool cannot provide a pair.
+ */
+std::shared_ptr<D3D12CommandEncoder> D3D12CommandEncoder::Make(D3D12GPU* gpu) {
+  if (!gpu) {
+    return nullptr;
+  }
+
+  // The pool hands back a pair that is ready for recording: a recycled pair has just been
+  // Reset(), and a miss makes the pool create a fresh pair internally. Both cases leave the
+  // list in the recording state CreateCommandList would have produced.
+  auto pooled = gpu->commandListPool().acquire(gpu->device());
+  if (!pooled.valid()) {
+    LOGE("D3D12CommandEncoder: command-list pool acquire failed.");
+    return nullptr;
+  }
+
+  // Bind the process-wide shader-visible CBV/SRV/UAV ring and the Sampler heap a single time for
+  // the lifetime of this command list. Because Reset() wipes earlier SetDescriptorHeaps state,
+  // the bind is needed on every reuse and not only on first creation. Repeated
+  // SetDescriptorHeaps calls are documented as a possible stall on some drivers, and the render
+  // passes always sub-allocate from these two heaps, so one bind is both correct and optimal.
+  ID3D12DescriptorHeap* shaderVisibleHeaps[] = {gpu->srvRing().heap(), gpu->samplerHeap()};
+  pooled.commandList->SetDescriptorHeaps(2, shaderVisibleHeaps);
+
+  return gpu->makeResource<D3D12CommandEncoder>(gpu, std::move(pooled.allocator),
+                                                std::move(pooled.commandList));
+}
+
+/**
+ * Stores the GPU and moves the recording-state allocator/command-list pair into the encoder's
+ * frame session.
+ */
+D3D12CommandEncoder::D3D12CommandEncoder(D3D12GPU* gpu, ComPtr<ID3D12CommandAllocator> allocator,
+                                         ComPtr<ID3D12GraphicsCommandList> commandList)
+    : _gpu(gpu) {
+  // The session owns both objects from here on; the two moves are independent of each other.
+  session.commandList = std::move(commandList);
+  session.commandAllocator = std::move(allocator);
+}
+
+/**
+ * Returns the GPU this encoder was created for, as the backend-neutral GPU type.
+ */
+GPU* D3D12CommandEncoder::gpu() const {
+  GPU* owner = _gpu;
+  return owner;
+}
+
+/**
+ * Begins a render pass on this encoder by delegating to D3D12RenderPass::Make() and returns
+ * whatever that factory produces.
+ */
+std::shared_ptr<RenderPass> D3D12CommandEncoder::onBeginRenderPass(
+    const RenderPassDescriptor& descriptor) {
+  auto renderPass = D3D12RenderPass::Make(this, descriptor);
+  return renderPass;
+}
+
+/**
+ * Records a copy of `srcRect` from `srcTexture` into `dstTexture` at `dstOffset`. Only
+ * subresource 0 (mip 0, array slice 0, plane 0) of each texture takes part.
+ *
+ * The source rectangle is clamped to the source texture's bounds. The call records nothing when
+ * either texture is null, when an origin is negative, or when the (clamped) rectangle is empty.
+ * Both textures are left in D3D12_RESOURCE_STATE_COMMON afterwards.
+ *
+ * NOTE(review): the destination region is not validated against dstTexture's size — confirm
+ * that callers guarantee it fits.
+ */
+void D3D12CommandEncoder::copyTextureToTexture(std::shared_ptr<Texture> srcTexture,
+                                               const Rect& srcRect,
+                                               std::shared_ptr<Texture> dstTexture,
+                                               const Point& dstOffset) {
+  if (!srcTexture || !dstTexture) {
+    return;
+  }
+  // D3D12_BOX and the DstX/DstY arguments are unsigned, so a negative origin cannot be
+  // expressed. Reject it before any float-to-unsigned cast, which would otherwise yield an
+  // inverted source box.
+  if (srcRect.x() < 0 || srcRect.y() < 0 || dstOffset.x < 0 || dstOffset.y < 0) {
+    LOGE("D3D12CommandEncoder::copyTextureToTexture: negative origin is not supported.");
+    return;
+  }
+  if (srcRect.width() <= 0 || srcRect.height() <= 0) {
+    return;
+  }
+
+  // Clamp the copy region to the source bounds. The subtraction form cannot overflow, unlike
+  // comparing `srcX + copyWidth` against the texture width.
+  auto srcX = static_cast<uint32_t>(srcRect.x());
+  auto srcY = static_cast<uint32_t>(srcRect.y());
+  auto copyWidth = static_cast<uint32_t>(srcRect.width());
+  auto copyHeight = static_cast<uint32_t>(srcRect.height());
+  auto srcW = static_cast<uint32_t>(srcTexture->width());
+  auto srcH = static_cast<uint32_t>(srcTexture->height());
+  if (srcX >= srcW || srcY >= srcH) {
+    return;
+  }
+  if (copyWidth > srcW - srcX) {
+    copyWidth = srcW - srcX;
+  }
+  if (copyHeight > srcH - srcY) {
+    copyHeight = srcH - srcY;
+  }
+  // Bail out before retaining anything: an empty copy records no commands, so the session has
+  // no reason to keep the textures alive (same ordering as copyTextureToBuffer).
+  if (copyWidth == 0 || copyHeight == 0) {
+    return;
+  }
+
+  auto d3d12Src = std::static_pointer_cast<D3D12Texture>(srcTexture);
+  auto d3d12Dst = std::static_pointer_cast<D3D12Texture>(dstTexture);
+  retainResource(d3d12Src);
+  retainResource(d3d12Dst);
+
+  auto cmd = session.commandList.Get();
+
+  // Combine the two pre-copy transitions (src -> COPY_SOURCE, dst -> COPY_DEST) into a single
+  // ResourceBarrier(2, ...) call. addTransition() collapses no-op transitions, so callers that
+  // are already in the requested state simply skip ahead. recordTextureStateChange snapshots
+  // the original state into session.initialTextureStates on first touch so the abandoned-
+  // session path can roll _currentState back if this copy never reaches the GPU.
+  D3D12BarrierBatch enterBatch;
+  enterBatch.addTransition(d3d12Src->d3d12Resource(), d3d12Src->currentState(),
+                           D3D12_RESOURCE_STATE_COPY_SOURCE);
+  recordTextureStateChange(d3d12Src.get(), D3D12_RESOURCE_STATE_COPY_SOURCE);
+  enterBatch.addTransition(d3d12Dst->d3d12Resource(), d3d12Dst->currentState(),
+                           D3D12_RESOURCE_STATE_COPY_DEST);
+  recordTextureStateChange(d3d12Dst.get(), D3D12_RESOURCE_STATE_COPY_DEST);
+  enterBatch.flush(cmd);
+
+  D3D12_TEXTURE_COPY_LOCATION dst = {};
+  dst.pResource = d3d12Dst->d3d12Resource();
+  dst.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+  dst.SubresourceIndex = 0;
+
+  D3D12_TEXTURE_COPY_LOCATION src = {};
+  src.pResource = d3d12Src->d3d12Resource();
+  src.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+  // Both SubresourceIndex 0 values intentionally target (mip 0, array slice 0, plane 0). The
+  // CommandEncoder::copyTextureToTexture contract does not expose mip / slice arguments and the
+  // Vulkan/Metal backends do exactly the same thing. A per-mip copy would require an API
+  // extension across every backend, not just D3D12.
+  src.SubresourceIndex = 0;
+
+  D3D12_BOX srcBox = {};
+  srcBox.left = srcX;
+  srcBox.top = srcY;
+  srcBox.front = 0;
+  srcBox.right = srcX + copyWidth;
+  srcBox.bottom = srcY + copyHeight;
+  srcBox.back = 1;
+
+  cmd->CopyTextureRegion(&dst, static_cast<UINT>(dstOffset.x), static_cast<UINT>(dstOffset.y), 0,
+                         &src, &srcBox);
+
+  // Transition both resources back to COMMON in a single barrier call. D3D12 will then promote
+  // them implicitly to PIXEL_SHADER_RESOURCE on the next sample — matching the "promote on
+  // demand" behaviour the rest of the backend assumes.
+  D3D12BarrierBatch exitBatch;
+  exitBatch.addTransition(d3d12Src->d3d12Resource(), D3D12_RESOURCE_STATE_COPY_SOURCE,
+                          D3D12_RESOURCE_STATE_COMMON);
+  exitBatch.addTransition(d3d12Dst->d3d12Resource(), D3D12_RESOURCE_STATE_COPY_DEST,
+                          D3D12_RESOURCE_STATE_COMMON);
+  exitBatch.flush(cmd);
+  recordTextureStateChange(d3d12Src.get(), D3D12_RESOURCE_STATE_COMMON);
+  recordTextureStateChange(d3d12Dst.get(), D3D12_RESOURCE_STATE_COMMON);
+}
+
+/**
+ * Records a copy of `srcRect` from subresource 0 of `srcTexture` into `dstBuffer`.
+ *
+ * `dstOffset` is the byte offset of the first destination row and `dstRowBytes` the byte
+ * distance between consecutive destination rows; zero means tightly packed (width * bytes per
+ * pixel). The source rectangle is clamped to the texture bounds.
+ *
+ * Nothing is recorded when an argument is null, the source origin is negative, the clamped
+ * rectangle is empty, the row pitch is smaller than one row of pixels, or the staging buffer
+ * needed for an unaligned layout cannot be created. The texture is left in
+ * D3D12_RESOURCE_STATE_COMMON afterwards.
+ */
+void D3D12CommandEncoder::copyTextureToBuffer(std::shared_ptr<Texture> srcTexture,
+                                              const Rect& srcRect,
+                                              std::shared_ptr<GPUBuffer> dstBuffer,
+                                              size_t dstOffset, size_t dstRowBytes) {
+  if (!srcTexture || !dstBuffer) {
+    return;
+  }
+  // D3D12_BOX is unsigned, so a negative origin cannot be expressed. Reject it before any
+  // float-to-unsigned cast, which would otherwise yield an inverted source box.
+  if (srcRect.x() < 0 || srcRect.y() < 0) {
+    LOGE("D3D12CommandEncoder::copyTextureToBuffer: negative origin is not supported.");
+    return;
+  }
+  if (srcRect.width() <= 0 || srcRect.height() <= 0) {
+    return;
+  }
+
+  // Clamp the copy region to the source bounds. The subtraction form cannot overflow, unlike
+  // comparing `srcX + copyWidth` against the texture width.
+  auto srcX = static_cast<uint32_t>(srcRect.x());
+  auto srcY = static_cast<uint32_t>(srcRect.y());
+  auto copyWidth = static_cast<uint32_t>(srcRect.width());
+  auto copyHeight = static_cast<uint32_t>(srcRect.height());
+  auto srcW = static_cast<uint32_t>(srcTexture->width());
+  auto srcH = static_cast<uint32_t>(srcTexture->height());
+  if (srcX >= srcW || srcY >= srcH) {
+    return;
+  }
+  if (copyWidth > srcW - srcX) {
+    copyWidth = srcW - srcX;
+  }
+  if (copyHeight > srcH - srcY) {
+    copyHeight = srcH - srcY;
+  }
+  if (copyWidth == 0 || copyHeight == 0) {
+    return;
+  }
+
+  auto d3d12Src = std::static_pointer_cast<D3D12Texture>(srcTexture);
+  auto d3d12Dst = std::static_pointer_cast<D3D12Buffer>(dstBuffer);
+
+  // pixelRowBytes is the payload of one row; tightRowBytes is the caller's destination stride.
+  auto bytesPerPixel = static_cast<uint32_t>(DXGIFormatBytesPerPixel(d3d12Src->dxgiFormat()));
+  uint32_t pixelRowBytes = copyWidth * bytesPerPixel;
+  uint32_t tightRowBytes = dstRowBytes > 0 ? static_cast<uint32_t>(dstRowBytes) : pixelRowBytes;
+  if (pixelRowBytes == 0 || tightRowBytes < pixelRowBytes) {
+    LOGE("D3D12CommandEncoder::copyTextureToBuffer: invalid row pitch %u for a %u-byte row.",
+         tightRowBytes, pixelRowBytes);
+    return;
+  }
+
+  // A placed footprint must satisfy two alignment rules: its row pitch has to be a multiple of
+  // D3D12_TEXTURE_DATA_PITCH_ALIGNMENT (256) and its offset a multiple of
+  // D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT (512). When the caller's layout meets both, the
+  // texture is copied straight into the caller's buffer. Otherwise the copy goes through a
+  // transient default-heap staging buffer with an aligned pitch, and per-row CopyBufferRegion
+  // calls repack the rows into the caller's layout.
+  bool pitchAligned = (tightRowBytes % D3D12_TEXTURE_DATA_PITCH_ALIGNMENT) == 0;
+  bool offsetAligned = (dstOffset % D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT) == 0;
+  bool needsRepack = !(pitchAligned && offsetAligned);
+  uint32_t stagingRowPitch = (pixelRowBytes + D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - 1) &
+                             ~(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT - 1);
+
+  // Create the staging buffer before anything is recorded. If creation fails the copy is
+  // dropped with a log message; recording a CopyTextureRegion with an unaligned footprint
+  // instead would be invalid API usage and put the whole command list at risk.
+  ComPtr<ID3D12Resource> stagingBuffer = nullptr;
+  if (needsRepack) {
+    // The buffer is created in COPY_DEST state so CopyTextureRegion can write to it directly.
+    // It is later transitioned to COPY_SOURCE for the row-by-row repack, and the session keeps
+    // it alive through auxBuffers.
+    auto device = _gpu->device();
+    D3D12_HEAP_PROPERTIES heapProps = {};
+    heapProps.Type = D3D12_HEAP_TYPE_DEFAULT;
+    D3D12_RESOURCE_DESC bufferDesc = {};
+    bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+    bufferDesc.Width = static_cast<UINT64>(stagingRowPitch) * copyHeight;
+    bufferDesc.Height = 1;
+    bufferDesc.DepthOrArraySize = 1;
+    bufferDesc.MipLevels = 1;
+    bufferDesc.Format = static_cast<DXGI_FORMAT>(0);  // DXGI_FORMAT_UNKNOWN
+    bufferDesc.SampleDesc.Count = 1;
+    bufferDesc.SampleDesc.Quality = 0;
+    bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+    bufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+    auto hr = device->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &bufferDesc,
+                                              D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
+                                              IID_PPV_ARGS(&stagingBuffer));
+    if (FAILED(hr)) {
+      LOGE(
+          "D3D12CommandEncoder::copyTextureToBuffer: staging buffer creation failed, "
+          "HRESULT=0x%08X",
+          static_cast<unsigned>(hr));
+      return;
+    }
+  }
+
+  retainResource(d3d12Src);
+  retainResource(d3d12Dst);
+
+  auto cmd = session.commandList.Get();
+  TransitionResourceState(cmd, d3d12Src->d3d12Resource(), d3d12Src->currentState(),
+                          D3D12_RESOURCE_STATE_COPY_SOURCE);
+  recordTextureStateChange(d3d12Src.get(), D3D12_RESOURCE_STATE_COPY_SOURCE);
+
+  // The footprint describes where the texel rows land: at offset 0 of the staging buffer with
+  // the aligned pitch, or directly at the caller's offset with the caller's pitch.
+  D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprint = {};
+  footprint.Offset = needsRepack ? 0 : static_cast<UINT64>(dstOffset);
+  footprint.Footprint.Format = static_cast<DXGI_FORMAT>(d3d12Src->dxgiFormat());
+  footprint.Footprint.Width = copyWidth;
+  footprint.Footprint.Height = copyHeight;
+  footprint.Footprint.Depth = 1;
+  footprint.Footprint.RowPitch = needsRepack ? stagingRowPitch : tightRowBytes;
+
+  D3D12_TEXTURE_COPY_LOCATION dstLoc = {};
+  dstLoc.pResource = needsRepack ? stagingBuffer.Get() : d3d12Dst->d3d12Resource();
+  dstLoc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
+  dstLoc.PlacedFootprint = footprint;
+
+  D3D12_TEXTURE_COPY_LOCATION srcLoc = {};
+  srcLoc.pResource = d3d12Src->d3d12Resource();
+  srcLoc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+  srcLoc.SubresourceIndex = 0;
+
+  D3D12_BOX srcBox = {};
+  srcBox.left = srcX;
+  srcBox.top = srcY;
+  srcBox.front = 0;
+  srcBox.right = srcX + copyWidth;
+  srcBox.bottom = srcY + copyHeight;
+  srcBox.back = 1;
+
+  cmd->CopyTextureRegion(&dstLoc, 0, 0, 0, &srcLoc, &srcBox);
+
+  if (needsRepack) {
+    // Transition the staging buffer to COPY_SOURCE and repack each row into the caller's buffer.
+    // Only the pixel payload of a row is copied, so the caller's row padding is left untouched
+    // and the last row does not run past `pixelRowBytes`. Offsets are computed in 64 bits so
+    // that `row * pitch` cannot wrap.
+    TransitionResourceState(cmd, stagingBuffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST,
+                            D3D12_RESOURCE_STATE_COPY_SOURCE);
+    for (uint32_t row = 0; row < copyHeight; row++) {
+      UINT64 dstRowOffset = static_cast<UINT64>(dstOffset) + static_cast<UINT64>(row) * tightRowBytes;
+      UINT64 srcRowOffset = static_cast<UINT64>(row) * stagingRowPitch;
+      cmd->CopyBufferRegion(d3d12Dst->d3d12Resource(), dstRowOffset, stagingBuffer.Get(),
+                            srcRowOffset, pixelRowBytes);
+    }
+    session.auxBuffers.push_back(std::move(stagingBuffer));
+  }
+
+  TransitionResourceState(cmd, d3d12Src->d3d12Resource(), D3D12_RESOURCE_STATE_COPY_SOURCE,
+                          D3D12_RESOURCE_STATE_COMMON);
+  recordTextureStateChange(d3d12Src.get(), D3D12_RESOURCE_STATE_COMMON);
+}
+
+/**
+ * Records compute dispatches that fill mip levels 1..N-1 of `texture`, each one computed from
+ * the level directly above it.
+ *
+ * Does nothing when `texture` is null, has a single mip level, the GPU's mipmap generator is
+ * unavailable (logged once), or the descriptor ring cannot provide the required slots. The
+ * texture is left in D3D12_RESOURCE_STATE_COMMON after a successful recording.
+ */
+void D3D12CommandEncoder::generateMipmapsForTexture(std::shared_ptr<Texture> texture) {
+  if (!texture) {
+    return;
+  }
+  auto d3d12Tex = std::static_pointer_cast<D3D12Texture>(texture);
+  auto mipCount = static_cast<uint32_t>(d3d12Tex->mipLevelCount());
+  if (mipCount <= 1) {
+    return;
+  }
+
+  auto* generator = _gpu->mipmapGenerator();
+  if (generator == nullptr || !generator->isReady()) {
+    // Log the failure a single time; later calls return silently.
+    static bool warned = false;
+    if (!warned) {
+      LOGE(
+          "D3D12CommandEncoder::generateMipmapsForTexture: mipmap generator unavailable, "
+          "skipping (subsequent calls silently no-op).");
+      warned = true;
+    }
+    return;
+  }
+
+  auto device = _gpu->device();
+  auto cmd = session.commandList.Get();
+  auto resource = d3d12Tex->d3d12Resource();
+  auto dxgiFormat = static_cast<DXGI_FORMAT>(d3d12Tex->dxgiFormat());
+
+  retainResource(d3d12Tex);
+
+  // Sub-allocate a contiguous (mipCount-1)*2-slot range out of the GPU's process-wide
+  // CBV/SRV/UAV ring. Each mip level pair occupies two consecutive slots (SRV at 2i, UAV at
+  // 2i+1) so the compute shader's two single-descriptor tables can be bound by GPU handle. The
+  // ring is already bound to this command list; no SetDescriptorHeaps needed.
+  uint32_t descriptorsPerLevel = 2;
+  uint32_t totalDescriptors = (mipCount - 1) * descriptorsPerLevel;
+  auto range = _gpu->srvRing().allocate(totalDescriptors);
+  if (!range.valid()) {
+    LOGE("D3D12CommandEncoder::generateMipmapsForTexture: SRV ring allocation failed (count=%u).",
+         totalDescriptors);
+    return;
+  }
+  auto descSize = _gpu->srvRing().descriptorSize();
+
+  cmd->SetComputeRootSignature(generator->rootSignature());
+  cmd->SetPipelineState(generator->pipelineState());
+
+  // Move every subresource into NON_PIXEL_SHADER_RESOURCE so the SRV reads in the first iteration
+  // are valid. The current() state is the per-resource state set by previous code (typically
+  // COMMON after a writeTexture / RenderPass end).
+  auto previousState = d3d12Tex->currentState();
+  if (previousState != D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE) {
+    TransitionResourceState(cmd, resource, previousState,
+                            D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
+    recordTextureStateChange(d3d12Tex.get(), D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE);
+  }
+
+  // Root constants consumed by the mipmap compute shader, laid out as four consecutive 32-bit
+  // values. A typed struct is used so the two floats are stored as floats; writing them into a
+  // UINT array through a reinterpret_cast'ed pointer would violate strict aliasing.
+  struct MipConstants {
+    UINT width;
+    UINT height;
+    float invWidth;
+    float invHeight;
+  };
+  static_assert(sizeof(MipConstants) == 4 * sizeof(UINT),
+                "MipConstants must pack into exactly four 32-bit root constants");
+
+  uint32_t mipWidth = static_cast<uint32_t>(d3d12Tex->width());
+  uint32_t mipHeight = static_cast<uint32_t>(d3d12Tex->height());
+  for (uint32_t i = 0; i < mipCount - 1; i++) {
+    // Each level halves the previous one, never dropping below one texel per axis.
+    uint32_t outWidth = (mipWidth > 1) ? mipWidth / 2 : 1;
+    uint32_t outHeight = (mipHeight > 1) ? mipHeight / 2 : 1;
+
+    // Transition the destination subresource (mip[i+1]) from NON_PIXEL_SHADER_RESOURCE to
+    // UNORDERED_ACCESS so the compute shader can write to it. Source mip[i] stays in
+    // NON_PIXEL_SHADER_RESOURCE.
+    D3D12_RESOURCE_BARRIER barrier = {};
+    barrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+    barrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+    barrier.Transition.pResource = resource;
+    barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
+    barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+    barrier.Transition.Subresource = i + 1;
+    cmd->ResourceBarrier(1, &barrier);
+
+    // Compute the descriptor handles for this iteration's SRV (slot 2i within range) and UAV
+    // (slot 2i+1).
+    D3D12_CPU_DESCRIPTOR_HANDLE srvCpu = range.cpuStart;
+    srvCpu.ptr += static_cast<SIZE_T>(2 * i) * descSize;
+    D3D12_CPU_DESCRIPTOR_HANDLE uavCpu = range.cpuStart;
+    uavCpu.ptr += static_cast<SIZE_T>(2 * i + 1) * descSize;
+    D3D12_GPU_DESCRIPTOR_HANDLE srvGpu = range.gpuStart;
+    srvGpu.ptr += static_cast<UINT64>(2 * i) * descSize;
+    D3D12_GPU_DESCRIPTOR_HANDLE uavGpu = range.gpuStart;
+    uavGpu.ptr += static_cast<UINT64>(2 * i + 1) * descSize;
+
+    // SRV over the single source level mip[i].
+    D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
+    srvDesc.Format = dxgiFormat;
+    srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
+    srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+    srvDesc.Texture2D.MostDetailedMip = i;
+    srvDesc.Texture2D.MipLevels = 1;
+    srvDesc.Texture2D.PlaneSlice = 0;
+    srvDesc.Texture2D.ResourceMinLODClamp = 0.0f;
+    device->CreateShaderResourceView(resource, &srvDesc, srvCpu);
+
+    // UAV over the destination level mip[i+1].
+    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
+    uavDesc.Format = dxgiFormat;
+    uavDesc.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE2D;
+    uavDesc.Texture2D.MipSlice = i + 1;
+    uavDesc.Texture2D.PlaneSlice = 0;
+    device->CreateUnorderedAccessView(resource, nullptr, &uavDesc, uavCpu);
+
+    // Bind 4 root constants (output mip width, height, 1/width, 1/height) plus the SRV and UAV
+    // tables, then dispatch enough thread groups to cover the destination mip.
+    MipConstants mipConstants = {};
+    mipConstants.width = outWidth;
+    mipConstants.height = outHeight;
+    mipConstants.invWidth = 1.0f / static_cast<float>(outWidth);
+    mipConstants.invHeight = 1.0f / static_cast<float>(outHeight);
+    cmd->SetComputeRoot32BitConstants(0, 4, &mipConstants, 0);
+    cmd->SetComputeRootDescriptorTable(1, srvGpu);
+    cmd->SetComputeRootDescriptorTable(2, uavGpu);
+
+    UINT groupsX = (outWidth + D3D12_MIPMAP_THREAD_GROUP_SIZE - 1) / D3D12_MIPMAP_THREAD_GROUP_SIZE;
+    UINT groupsY =
+        (outHeight + D3D12_MIPMAP_THREAD_GROUP_SIZE - 1) / D3D12_MIPMAP_THREAD_GROUP_SIZE;
+    cmd->Dispatch(groupsX, groupsY, 1);
+
+    // Transition mip[i+1] from UNORDERED_ACCESS back to NON_PIXEL_SHADER_RESOURCE so the next
+    // iteration can use it as the SRV source.
+    barrier.Transition.StateBefore = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;
+    barrier.Transition.StateAfter = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE;
+    barrier.Transition.Subresource = i + 1;
+    cmd->ResourceBarrier(1, &barrier);
+
+    mipWidth = outWidth;
+    mipHeight = outHeight;
+  }
+
+  // Final transition: every subresource is currently NON_PIXEL_SHADER_RESOURCE. Move the whole
+  // resource back to COMMON so subsequent samplers / RTVs can pick a fresh state on demand,
+  // matching the convention every other code path uses.
+  TransitionResourceState(cmd, resource, D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE,
+                          D3D12_RESOURCE_STATE_COMMON);
+  recordTextureStateChange(d3d12Tex.get(), D3D12_RESOURCE_STATE_COMMON);
+}
+
+/**
+ * Closes the command list and hands the whole frame session over to a new D3D12CommandBuffer.
+ *
+ * Returns nullptr when the session has already been moved out of this encoder, or when Close()
+ * fails; in the latter case the session is reclaimed through the GPU's abandoned-session path.
+ */
+std::shared_ptr<CommandBuffer> D3D12CommandEncoder::onFinish() {
+  // The session is moved out on every exit path below, so a second call would find a null
+  // command list. Guard against it the same way onRelease() does instead of dereferencing null.
+  if (session.commandList == nullptr) {
+    LOGE("D3D12CommandEncoder: onFinish() called without an active session.");
+    return nullptr;
+  }
+  auto hr = session.commandList->Close();
+  if (FAILED(hr)) {
+    LOGE("D3D12CommandEncoder: ID3D12GraphicsCommandList::Close failed, HRESULT=0x%08X",
+         static_cast<unsigned>(hr));
+    _gpu->reclaimAbandonedSession(std::move(session));
+    return nullptr;
+  }
+  return std::make_shared<D3D12CommandBuffer>(_gpu, std::move(session));
+}
+
+/**
+ * Updates the tracked state of `texture` to `newState`, remembering the state it had when the
+ * current session first touched it. A null `texture` is ignored.
+ */
+void D3D12CommandEncoder::recordTextureStateChange(D3D12Texture* texture,
+                                                   D3D12_RESOURCE_STATES newState) {
+  if (texture != nullptr) {
+    // emplace() only inserts when the key is absent, so the entry keeps the state observed on
+    // the first call for this texture in the session; repeat calls leave the snapshot untouched.
+    const auto stateBeforeChange = texture->currentState();
+    session.initialTextureStates.emplace(texture, stateBeforeChange);
+    texture->setCurrentState(newState);
+  }
+}
+
+/**
+ * Release hook for the encoder: reclaims the frame session if the encoder still owns one.
+ */
+void D3D12CommandEncoder::onRelease(D3D12GPU* gpu) {
+  // After onFinish() the session lives in the D3D12CommandBuffer and the command list here is
+  // null. A non-null list therefore marks an abandoned encoder (encoding started but never
+  // finished), which is the only case that needs cleanup.
+  const bool ownsSession = (session.commandList != nullptr);
+  if (ownsSession) {
+    gpu->reclaimAbandonedSession(std::move(session));
+  }
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandEncoder.h b/src/gpu/d3d12/D3D12CommandEncoder.h
new file mode 100644
index 000000000..4c24059b9
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandEncoder.h
@@ -0,0 +1,106 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <memory>
+#include "D3D12FrameSession.h"
+#include "D3D12Resource.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/CommandEncoder.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+class D3D12Texture;
+
+/**
+ * Records GPU commands into an ID3D12GraphicsCommandList and collects resource references into a
+ * D3D12FrameSession.
+ *
+ * Lifecycle mirrors VulkanCommandEncoder:
+ *   - Make() allocates an ID3D12CommandAllocator + ID3D12GraphicsCommandList (already in
+ *     recording state per the D3D12 API) and binds the GPU's process-wide shader-visible
+ *     CBV/SRV/UAV ring and Sampler heap to the list.
+ *   - Resource binding (RenderPass) and copy commands populate retainedResources so the GPU
+ *     keeps live references until the fence signals. Descriptor slots used during the session
+ *     are sub-allocated from the GPU's descriptor ring and reclaimed by fence directly.
+ *   - onFinish() Closes the command list and moves the entire session to D3D12CommandBuffer.
+ *   - onRelease() (abandon path) reclaims the session via D3D12GPU::reclaimAbandonedSession().
+ */
+class D3D12CommandEncoder : public CommandEncoder, public D3D12Resource {
+ public:
+  static std::shared_ptr<D3D12CommandEncoder> Make(D3D12GPU* gpu);
+
+  ID3D12GraphicsCommandList* d3d12CommandList() const {
+    return session.commandList.Get();  // Raw pointer; the session keeps the reference.
+  }
+
+  ID3D12CommandAllocator* d3d12CommandAllocator() const {
+    return session.commandAllocator.Get();  // Raw pointer; the session keeps the reference.
+  }
+
+  GPU* gpu() const override;
+
+  std::shared_ptr<RenderPass> onBeginRenderPass(const RenderPassDescriptor& descriptor) override;
+
+  void copyTextureToTexture(std::shared_ptr<Texture> srcTexture, const Rect& srcRect,
+                            std::shared_ptr<Texture> dstTexture, const Point& dstOffset) override;
+
+  void copyTextureToBuffer(std::shared_ptr<Texture> srcTexture, const Rect& srcRect,
+                           std::shared_ptr<GPUBuffer> dstBuffer, size_t dstOffset = 0,
+                           size_t dstRowBytes = 0) override;
+
+  void generateMipmapsForTexture(std::shared_ptr<Texture> texture) override;
+
+ protected:
+  std::shared_ptr<CommandBuffer> onFinish() override;
+  void onRelease(D3D12GPU* gpu) override;
+
+ private:
+  D3D12CommandEncoder(D3D12GPU* gpu, ComPtr<ID3D12CommandAllocator> allocator,
+                      ComPtr<ID3D12GraphicsCommandList> commandList);
+  ~D3D12CommandEncoder() override = default;
+
+  D3D12GPU* _gpu = nullptr;   // Non-owning back-pointer to the GPU this encoder records for.
+  D3D12FrameSession session;  // Moved out by onFinish() and by the abandon path in onRelease().
+
+  // Used by D3D12RenderPass to register attachments / pipelines / textures for deferred release.
+  void retainResource(std::shared_ptr<D3D12Resource> resource) {
+    session.retainedResources.push_back(std::move(resource));
+  }
+
+  void retainDescriptorHeap(ComPtr<ID3D12DescriptorHeap> heap) {
+    session.retainedDescriptorHeaps.push_back(std::move(heap));  // Held by the session.
+  }
+
+  /**
+   * Updates a D3D12Texture's CPU-tracked _currentState and, on the first call for this texture
+   * within the current session, snapshots the original state into session.initialTextureStates
+   * so reclaimAbandonedSession() can roll it back if the encoder is destroyed before submit.
+   * Every D3D12 backend call site that previously did `tex->setCurrentState(newState)` must go
+   * through this helper instead, otherwise an aborted encoder would leave _currentState ahead
+   * of the GPU's real state and the next pass would emit a "Before state mismatch" barrier.
+   */
+  void recordTextureStateChange(D3D12Texture* texture, D3D12_RESOURCE_STATES newState);
+
+  friend class D3D12GPU;
+  friend class D3D12RenderPass;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandListPool.cpp b/src/gpu/d3d12/D3D12CommandListPool.cpp
new file mode 100644
index 000000000..5ac9cfaad
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandListPool.cpp
@@ -0,0 +1,92 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12CommandListPool.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+D3D12CommandListPool::Entry D3D12CommandListPool::acquire(ID3D12Device* device) {
+  if (device == nullptr) {
+    return Entry{};
+  }
+  // Warm path: pop idle pairs until one can be Reset() back into the recording state, which is
+  // far cheaper than CreateCommandAllocator/CreateCommandList. A pair that is incomplete or
+  // fails either Reset is simply dropped and the search moves on to the next one.
+  while (!freeList.empty()) {
+    Entry pooled = std::move(freeList.back());
+    freeList.pop_back();
+    if (!pooled.valid()) {
+      continue;
+    }
+    const HRESULT allocatorReset = pooled.allocator->Reset();
+    if (FAILED(allocatorReset)) {
+      LOGE("D3D12CommandListPool::acquire: allocator Reset failed (HRESULT=0x%08X), discarding.",
+           static_cast<unsigned>(allocatorReset));
+      continue;
+    }
+    const HRESULT listReset = pooled.commandList->Reset(pooled.allocator.Get(), nullptr);
+    if (FAILED(listReset)) {
+      LOGE(
+          "D3D12CommandListPool::acquire: command-list Reset failed (HRESULT=0x%08X), "
+          "discarding.",
+          static_cast<unsigned>(listReset));
+      continue;
+    }
+    return pooled;
+  }
+  // Cold path: no pooled pair was usable, so build a fresh DIRECT allocator + command list.
+  ComPtr<ID3D12CommandAllocator> newAllocator = nullptr;
+  const HRESULT allocatorCreated = device->CreateCommandAllocator(
+      D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&newAllocator));
+  if (FAILED(allocatorCreated)) {
+    LOGE("D3D12CommandListPool::acquire: CreateCommandAllocator failed (HRESULT=0x%08X).",
+         static_cast<unsigned>(allocatorCreated));
+    return Entry{};
+  }
+  ComPtr<ID3D12GraphicsCommandList> newList = nullptr;
+  const HRESULT listCreated = device->CreateCommandList(
+      0, D3D12_COMMAND_LIST_TYPE_DIRECT, newAllocator.Get(), nullptr, IID_PPV_ARGS(&newList));
+  if (FAILED(listCreated)) {
+    LOGE("D3D12CommandListPool::acquire: CreateCommandList failed (HRESULT=0x%08X).",
+         static_cast<unsigned>(listCreated));
+    return Entry{};
+  }
+  Entry fresh = {};
+  fresh.allocator = std::move(newAllocator);
+  fresh.commandList = std::move(newList);
+  return fresh;
+}
+
+void D3D12CommandListPool::release(Entry entry) {
+  // Only complete pairs are worth keeping, and only while the pool is below MAX_POOLED. In
+  // every other case `entry` just goes out of scope here and its ComPtr members release the
+  // driver objects, so bursty submission patterns cannot grow the pool without bound.
+  const bool complete = entry.valid();
+  const bool hasRoom = freeList.size() < MAX_POOLED;
+  if (!complete || !hasRoom) {
+    return;
+  }
+  freeList.push_back(std::move(entry));
+}
+
+void D3D12CommandListPool::clear() {
+  freeList.clear();  // Destroys every pooled Entry, releasing its allocator/list ComPtrs.
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandListPool.h b/src/gpu/d3d12/D3D12CommandListPool.h
new file mode 100644
index 000000000..57f1a9c79
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandListPool.h
@@ -0,0 +1,95 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <vector>
+#include "D3D12Util.h"
+
+namespace tgfx {
+
+/**
+ * Pool of (ID3D12CommandAllocator, ID3D12GraphicsCommandList) pairs for D3D12_COMMAND_LIST_TYPE_DIRECT.
+ *
+ * Why pooling matters:
+ *   - CreateCommandAllocator and CreateCommandList are among the slowest D3D12 APIs (the runtime
+ *     pre-patches command-list metadata against the device). Microsoft's guidance is "create
+ *     once, reset many".
+ *   - Backends that submit multiple command lists per frame (Tiled rendering, BackgroundBlur
+ *     offscreen passes, transient upload lists) compound the cost.
+ *
+ * Lifecycle invariants:
+ *   - acquire() returns a recording-state list. On a hit, both objects have been Reset(); on a
+ *     miss, freshly-created (D3D12 returns lists in recording state by default).
+ *   - release() is called by D3D12GPU::reclaimSubmission only after the GPU fence has confirmed
+ *     execution completed. That guarantees ID3D12CommandAllocator::Reset is safe at the next
+ *     acquire().
+ *   - Abandoned sessions (encoder destroyed before submit) do not return to the pool; their
+ *     ComPtrs simply destruct. Avoids any ambiguity about "is the list closed yet" at recycle
+ *     time.
+ *
+ * Thread safety: not thread-safe. Caller serialises access (matches the rest of the D3D12
+ * backend's single-threaded usage).
+ */
+class D3D12CommandListPool {
+ public:
+  // Soft cap on idle pairs kept around. Anything released past the cap is destroyed instead of
+  // pooled, preventing unbounded growth in long-running applications. 16 is comfortably above
+  // MAX_FRAMES_IN_FLIGHT (2) plus typical per-frame transient upload lists, so steady-state
+  // workloads won't churn the cap.
+  static constexpr size_t MAX_POOLED = 16;
+
+  struct Entry {
+    ComPtr<ID3D12CommandAllocator> allocator;
+    ComPtr<ID3D12GraphicsCommandList> commandList;
+    bool valid() const {  // True only when both halves of the pair are present.
+      return allocator != nullptr && commandList != nullptr;
+    }
+  };
+
+  D3D12CommandListPool() = default;
+  ~D3D12CommandListPool() = default;
+
+  D3D12CommandListPool(const D3D12CommandListPool&) = delete;
+  D3D12CommandListPool& operator=(const D3D12CommandListPool&) = delete;
+
+  /**
+   * Acquires a pair ready for recording. On a cache hit, both objects are Reset() before being
+   * returned. On a miss (or if Reset fails), a fresh pair is created. Returns an Entry with both
+   * fields null when `device` is null or a CreateXxx call fails (the latter is logged).
+   */
+  Entry acquire(ID3D12Device* device);
+
+  /**
+   * Returns a pair to the pool for future reuse. Caller must guarantee the GPU has finished
+   * executing every command list that was recorded with this pair (i.e. the fence value
+   * associated with the submission has signalled). Entries beyond MAX_POOLED are destroyed.
+   */
+  void release(Entry entry);
+
+  /**
+   * Drops every pooled pair. Used by D3D12GPU::releaseAll on shutdown to release driver
+   * references before the device disappears.
+   */
+  void clear();
+
+ private:
+  std::vector<Entry> freeList;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandQueue.cpp b/src/gpu/d3d12/D3D12CommandQueue.cpp
new file mode 100644
index 000000000..6d08de28d
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandQueue.cpp
@@ -0,0 +1,329 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12CommandQueue.h"
+#include "D3D12Buffer.h"
+#include "D3D12CommandBuffer.h"
+#include "D3D12Defines.h"
+#include "D3D12Semaphore.h"
+#include "D3D12Texture.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+template <typename T> static T AlignUp(T x, T alignment) {
+  const auto mask = alignment - 1;  // Valid as a bitmask only for power-of-two alignments.
+  return (x + mask) & ~mask;
+}
+
+D3D12CommandQueue::D3D12CommandQueue(D3D12GPU* d3d12GPU) : gpu(d3d12GPU) {
+  D3D12_COMMAND_QUEUE_DESC queueDesc = {};  // Fields not set below stay zero-initialised.
+  queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_NONE;
+  queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
+  auto result = gpu->device()->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&commandQueue));
+  if (FAILED(result)) {  // Only logged here; nothing else in the constructor reacts to it.
+    LOGE("D3D12CommandQueue: Failed to create command queue, HRESULT=0x%08X",
+         static_cast<unsigned>(result));
+  }
+}
+
+D3D12CommandQueue::~D3D12CommandQueue() {
+  // Nothing is flushed here: uploads and semaphores still pending at this point are dropped
+  // when the member fields are destroyed. Callers that need queued uploads to reach the GPU
+  // must call waitUntilCompleted() (or submit()) before destroying the queue.
+}
+
+std::chrono::steady_clock::time_point D3D12CommandQueue::completedFrameTime() const {
+  return gpu->lastFenceSignalTime();  // Forwarded unchanged from D3D12GPU.
+}
+
+void D3D12CommandQueue::writeBuffer(std::shared_ptr<GPUBuffer> buffer, size_t bufferOffset,
+                                    const void* data, size_t dataSize) {
+  if (buffer == nullptr || data == nullptr || dataSize == 0) {
+    return;
+  }
+  // Copy straight through a CPU mapping of the target range; nothing is written if map() fails.
+  if (void* destination = buffer->map(bufferOffset, dataSize)) {
+    memcpy(destination, data, dataSize);
+    buffer->unmap();
+  }
+}
+
+void D3D12CommandQueue::writeTexture(std::shared_ptr<Texture> texture, const Rect& rect,
+                                     const void* pixels, size_t rowBytes) {
+  // Stages `pixels` in UPLOAD memory now; the GPU copy itself is recorded by flushUploads().
+  if (!texture || !pixels) {
+    return;
+  }
+  auto d3d12Tex = std::static_pointer_cast<D3D12Texture>(texture);
+
+  auto width = static_cast<uint32_t>(rect.width());
+  auto height = static_cast<uint32_t>(rect.height());
+  auto bytesPerPixel = static_cast<uint32_t>(DXGIFormatBytesPerPixel(d3d12Tex->dxgiFormat()));
+  if (width == 0 || height == 0 || bytesPerPixel == 0) {
+    return;
+  }
+
+  // D3D12 requires the row pitch of a placed footprint to be a multiple of
+  // D3D12_TEXTURE_DATA_PITCH_ALIGNMENT (256). Caller-supplied stride may be larger or smaller.
+  size_t srcRowBytes = rowBytes > 0 ? rowBytes : static_cast<size_t>(width) * bytesPerPixel;
+  uint32_t alignedRowPitch = AlignUp<uint32_t>(
+      width * bytesPerPixel, static_cast<uint32_t>(D3D12_TEXTURE_DATA_PITCH_ALIGNMENT));
+  uint64_t stagingSize = static_cast<uint64_t>(alignedRowPitch) * height;
+
+  // Fast path: sub-allocate from the GPU's process-wide UPLOAD ring. The ring resource is kept
+  // alive by D3D12GPU and the bytes are reclaimed automatically once the owning fence signals,
+  // so we do not need to add anything to PendingUpload to keep the resource live.
+  ID3D12Resource* stagingResource = nullptr;
+  uint64_t stagingOffset = 0;
+  uint8_t* stagingCpu = nullptr;
+  ComPtr<ID3D12Resource> fallbackResource = nullptr;
+  auto allocation =
+      gpu->uploadHeap().allocate(static_cast<size_t>(stagingSize),
+                                 static_cast<size_t>(D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT));
+  if (allocation.valid()) {
+    stagingResource = allocation.resource;
+    stagingOffset = allocation.offsetInResource;
+    stagingCpu = static_cast<uint8_t*>(allocation.cpu);
+  } else {
+    // Slow path (oversize allocation or saturated ring): create a one-off UPLOAD buffer. Its
+    // ComPtr is parked in PendingUpload so the resource outlives GPU execution.
+    D3D12_HEAP_PROPERTIES heapProps = {};
+    heapProps.Type = D3D12_HEAP_TYPE_UPLOAD;
+    D3D12_RESOURCE_DESC bufferDesc = {};
+    bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+    bufferDesc.Width = stagingSize;
+    bufferDesc.Height = 1;
+    bufferDesc.DepthOrArraySize = 1;
+    bufferDesc.MipLevels = 1;
+    bufferDesc.Format = static_cast<DXGI_FORMAT>(DXGI_FORMAT_UNKNOWN);
+    bufferDesc.SampleDesc.Count = 1;
+    bufferDesc.SampleDesc.Quality = 0;
+    bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+    bufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+    auto hr = gpu->device()->CreateCommittedResource(&heapProps, D3D12_HEAP_FLAG_NONE, &bufferDesc,
+                                                     D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+                                                     IID_PPV_ARGS(&fallbackResource));
+    if (FAILED(hr)) {
+      LOGE(
+          "D3D12CommandQueue::writeTexture: fallback CreateCommittedResource failed, "
+          "HRESULT=0x%08X",
+          static_cast<unsigned>(hr));
+      return;
+    }
+    void* mapped = nullptr;
+    D3D12_RANGE readRange = {0, 0};
+    hr = fallbackResource->Map(0, &readRange, &mapped);
+    if (FAILED(hr) || mapped == nullptr) {
+      LOGE("D3D12CommandQueue::writeTexture: fallback Map failed, HRESULT=0x%08X",
+           static_cast<unsigned>(hr));
+      return;
+    }
+    stagingResource = fallbackResource.Get();
+    stagingOffset = 0;
+    stagingCpu = static_cast<uint8_t*>(mapped);
+  }
+
+  auto src = static_cast<const uint8_t*>(pixels);
+  uint32_t tightRowBytes = width * bytesPerPixel;
+  for (size_t row = 0; row < height; row++) {  // size_t: byte offsets may exceed 32 bits.
+    memcpy(stagingCpu + row * alignedRowPitch, src + row * srcRowBytes, tightRowBytes);
+  }
+  if (fallbackResource != nullptr) {
+    // Keeping a one-off UPLOAD buffer mapped during GPU execution is allowed, but unmapping
+    // here lets the runtime page the buffer out if memory pressure allows.
+    fallbackResource->Unmap(0, nullptr);
+  }
+
+  D3D12GPU::PendingUpload upload = {};
+  // Only the slow path needs to retain the staging buffer: the ring resource lives on the GPU
+  // instance and is reclaimed by fence directly.
+  upload.stagingBuffer = std::move(fallbackResource);
+  upload.texture = d3d12Tex;
+  pendingUploads.push_back(std::move(upload));
+
+  UploadFootprint fp = {};
+  fp.stagingResource = stagingResource;
+  fp.footprint.Offset = stagingOffset;
+  fp.footprint.Footprint.Format = static_cast<DXGI_FORMAT>(d3d12Tex->dxgiFormat());
+  fp.footprint.Footprint.Width = width;
+  fp.footprint.Footprint.Height = height;
+  fp.footprint.Footprint.Depth = 1;
+  fp.footprint.Footprint.RowPitch = alignedRowPitch;
+  fp.dstX = static_cast<UINT>(rect.x());
+  fp.dstY = static_cast<UINT>(rect.y());
+  fp.srcWidth = width;
+  fp.srcHeight = height;
+  pendingFootprints.push_back(fp);
+}
+
+void D3D12CommandQueue::flushUploads(ID3D12GraphicsCommandList* commandList,
+                                     D3D12FrameSession& session) {
+  if (pendingUploads.empty() || commandList == nullptr) {
+    return;
+  }
+  for (size_t i = 0; i < pendingUploads.size(); i++) {
+    auto& up = pendingUploads[i];
+    auto& fp = pendingFootprints[i];
+
+    auto current = up.texture->currentState();
+    if (current != D3D12_RESOURCE_STATE_COPY_DEST) {
+      TransitionResourceState(commandList, up.texture->d3d12Resource(), current,
+                              D3D12_RESOURCE_STATE_COPY_DEST);
+      // Snapshot the original CPU-tracked state on the first touch of this texture during the
+      // session so reclaimAbandonedSession() can roll _currentState back if the upload is
+      // never executed by the GPU. emplace() preserves the earliest snapshot on subsequent
+      // updates, exactly like D3D12CommandEncoder::recordTextureStateChange().
+      session.initialTextureStates.emplace(up.texture.get(), current);
+      up.texture->setCurrentState(D3D12_RESOURCE_STATE_COPY_DEST);
+    }
+
+    D3D12_TEXTURE_COPY_LOCATION dstLoc = {};
+    dstLoc.pResource = up.texture->d3d12Resource();
+    dstLoc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+    // SubresourceIndex 0 = (mip level 0, array slice 0, plane 0). This matches the public
+    // writeTexture contract in CommandQueue.h: "If the texture has mipmaps, you should call
+    // CommandEncoder's generateMipmapsForTexture() method after writing the pixels, as mipmaps
+    // will not be generated automatically." VulkanCommandQueue / MetalCommandQueue make the
+    // same assumption (Vulkan even DEBUG_ASSERTs imageSubresource.mipLevel == 0 in its upload
+    // batcher). If tgfx ever adds array textures or a per-mip writeTexture overload, every
+    // backend must extend together — this is not a D3D12-only TODO.
+    dstLoc.SubresourceIndex = 0;
+
+    D3D12_TEXTURE_COPY_LOCATION srcLoc = {};
+    // The staging source is either a slot inside the GPU's UPLOAD ring (kept alive by the GPU
+    // instance, with offsetInResource embedded in fp.footprint.Offset) or a one-off staging
+    // buffer parked in PendingUpload::stagingBuffer. Either way fp.stagingResource is the raw
+    // pointer to use at copy time.
+    srcLoc.pResource = fp.stagingResource;
+    srcLoc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
+    srcLoc.PlacedFootprint = fp.footprint;
+
+    D3D12_BOX srcBox = {};
+    srcBox.left = 0;
+    srcBox.top = 0;
+    srcBox.front = 0;
+    srcBox.right = fp.srcWidth;
+    srcBox.bottom = fp.srcHeight;
+    srcBox.back = 1;
+
+    commandList->CopyTextureRegion(&dstLoc, fp.dstX, fp.dstY, 0, &srcLoc, &srcBox);
+
+    TransitionResourceState(commandList, up.texture->d3d12Resource(),
+                            D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_COMMON);
+    // No-op when this texture already has a snapshot. It only inserts when the texture entered
+    // this iteration already in COPY_DEST, recording that state before it becomes COMMON.
+    session.initialTextureStates.emplace(up.texture.get(), up.texture->currentState());
+    up.texture->setCurrentState(D3D12_RESOURCE_STATE_COMMON);
+  }
+  pendingFootprints.clear();
+  // pendingUploads is moved into the SubmitRequest by the caller so its staging buffers outlive
+  // GPU execution.
+}
+
+void D3D12CommandQueue::submit(std::shared_ptr<CommandBuffer> commandBuffer) {
+  if (!commandBuffer) {
+    return;
+  }
+  auto d3d12Cmd = std::static_pointer_cast<D3D12CommandBuffer>(commandBuffer);
+  auto session = std::move(d3d12Cmd->frameSession());
+  if (session.commandList == nullptr) {
+    return;
+  }
+  // Pixel uploads recorded since the last submit go into an auxiliary command list that the GPU
+  // executes before session.commandList, so textures are populated before they are sampled.
+  if (!pendingUploads.empty()) {
+    auto entry = gpu->commandListPool().acquire(gpu->device());
+    if (!entry.valid()) {
+      LOGE("D3D12CommandQueue::submit: failed to acquire transient upload list, dropping "
+           "uploads.");
+      pendingUploads.clear();
+      pendingFootprints.clear();
+    } else {
+      flushUploads(entry.commandList.Get(), session);
+      auto hr = entry.commandList->Close();
+      if (SUCCEEDED(hr)) {
+        session.auxAllocators.push_back(std::move(entry.allocator));
+        session.auxCommandLists.push_back(std::move(entry.commandList));
+      } else {  // Never queue a list that failed to close; its recorded copies are dropped.
+        LOGE("D3D12CommandQueue::submit: upload list Close failed, HRESULT=0x%08X",
+             static_cast<unsigned>(hr));
+      }
+    }
+  }
+
+  D3D12GPU::SubmitRequest request = {};
+  request.session = std::move(session);
+  request.uploads = std::move(pendingUploads);
+  request.signalSemaphore = std::move(pendingSignalSemaphore);
+  request.waitSemaphore = std::move(pendingWaitSemaphore);
+  // _frameTime is a CommandQueue base class member. The GPU stamps the inflight submission with
+  // it and later publishes it as _lastFenceSignalTime for the resource cache.
+  request.frameTime = _frameTime;
+  pendingUploads.clear();
+  pendingFootprints.clear();
+  pendingSignalSemaphore = nullptr;
+  pendingWaitSemaphore = nullptr;
+  gpu->executeSubmission(std::move(request));
+}
+
+std::shared_ptr<Semaphore> D3D12CommandQueue::insertSemaphore() {
+  // On success the semaphore is parked as the pending signal; a failed Make() changes nothing.
+  auto created = D3D12Semaphore::Make(gpu);
+  if (created != nullptr) {
+    pendingSignalSemaphore = created;
+  }
+  return created;
+}
+
+void D3D12CommandQueue::waitSemaphore(std::shared_ptr<Semaphore> semaphore) {
+  if (semaphore != nullptr) {
+    // Parked until the next submit(), which moves it into the SubmitRequest.
+    pendingWaitSemaphore = std::static_pointer_cast<D3D12Semaphore>(semaphore);
+  }
+}
+
+void D3D12CommandQueue::waitUntilCompleted() {
+  // Flush uploads that no submit() has picked up yet, then wait for all inflight submissions.
+  if (!pendingUploads.empty()) {
+    auto entry = gpu->commandListPool().acquire(gpu->device());
+    if (!entry.valid()) {
+      LOGE("D3D12CommandQueue::waitUntilCompleted: failed to acquire upload list, dropping "
+           "uploads.");
+    } else {
+      D3D12GPU::SubmitRequest request = {};
+      flushUploads(entry.commandList.Get(), request.session);
+      auto hr = entry.commandList->Close();
+      if (SUCCEEDED(hr)) {
+        request.session.auxAllocators.push_back(std::move(entry.allocator));
+        request.session.auxCommandLists.push_back(std::move(entry.commandList));
+        request.uploads = std::move(pendingUploads);
+        request.frameTime = _frameTime;
+        gpu->executeSubmission(std::move(request));
+      } else {  // Never execute a list that failed to close; the uploads are dropped below.
+        LOGE("D3D12CommandQueue::waitUntilCompleted: upload list Close failed, HRESULT=0x%08X",
+             static_cast<unsigned>(hr));
+      }
+    }
+    pendingUploads.clear();
+    pendingFootprints.clear();
+  }
+  gpu->waitAllInflightSubmissions();
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12CommandQueue.h b/src/gpu/d3d12/D3D12CommandQueue.h
new file mode 100644
index 000000000..33b8e8b06
--- /dev/null
+++ b/src/gpu/d3d12/D3D12CommandQueue.h
@@ -0,0 +1,105 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "D3D12GPU.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/CommandQueue.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+class D3D12Semaphore;
+
+/**
+ * Thin coordination layer satisfying the public CommandQueue interface. Mirrors VulkanCommandQueue:
+ * holds only the data accumulated between two consecutive submit() calls and delegates submission
+ * timing / inflight tracking to D3D12GPU::executeSubmission().
+ *
+ * Pending state held here:
+ *   - pendingUploads: staging UPLOAD buffers from writeTexture(), consumed by submit().
+ *   - pendingSignal/WaitSemaphore: from insertSemaphore()/waitSemaphore(), consumed by submit().
+ */
+class D3D12CommandQueue : public CommandQueue {
+ public:
+  explicit D3D12CommandQueue(D3D12GPU* gpu);
+  ~D3D12CommandQueue() override;
+
+  ID3D12CommandQueue* d3d12CommandQueue() const {
+    return commandQueue.Get();
+  }
+
+  void submit(std::shared_ptr<CommandBuffer> commandBuffer) override;
+
+  void writeBuffer(std::shared_ptr<GPUBuffer> buffer, size_t bufferOffset, const void* data,
+                   size_t dataSize) override;
+
+  void writeTexture(std::shared_ptr<Texture> texture, const Rect& rect, const void* pixels,
+                    size_t rowBytes) override;
+
+  std::shared_ptr<Semaphore> insertSemaphore() override;
+
+  void waitSemaphore(std::shared_ptr<Semaphore> semaphore) override;
+
+  void waitUntilCompleted() override;
+
+ protected:
+  // Report the steady-clock timestamp of the most recently completed inflight submission so
+  // ResourceCache::findScratchResource can correctly skip scratch buffers/textures that the GPU
+  // is still reading. The base class default returns the *current* frame time, which lets a
+  // second flush() reuse a vertex buffer the first flush()'s GPU work is still reading
+  // (see RecordingTest.MultipleRecordingsInOrder).
+  std::chrono::steady_clock::time_point completedFrameTime() const override;
+
+ private:
+  // Records a CopyTextureRegion for every pending upload into `commandList`. Each texture's
+  // pre-upload state is snapshotted into `session` so an abandoned submit can roll it back;
+  // the staging buffers are not stored in `session`; the caller moves pendingUploads onward.
+  void flushUploads(ID3D12GraphicsCommandList* commandList, D3D12FrameSession& session);
+
+  D3D12GPU* gpu = nullptr;
+  ComPtr<ID3D12CommandQueue> commandQueue = nullptr;
+
+  // Produced by writeTexture(), consumed by the next submit() (or waitUntilCompleted()) which
+  // records CopyTextureRegion commands and then moves the staging buffers into the inflight
+  // submission so they can be safely released after the GPU fence signals.
+  std::vector<D3D12GPU::PendingUpload> pendingUploads;
+
+  // Per-upload metadata kept alongside pendingUploads so flushUploads can record the GPU copy
+  // without re-deriving the row pitch / pixel dimensions.
+  struct UploadFootprint {
+    // Source ID3D12Resource for CopyTextureRegion. Lifetime is owned either by D3D12GPU's
+    // UPLOAD ring (fast path, no extra retention required) or by the matching PendingUpload's
+    // stagingBuffer ComPtr (slow / fallback path).
+    ID3D12Resource* stagingResource = nullptr;
+    D3D12_PLACED_SUBRESOURCE_FOOTPRINT footprint = {};
+    UINT dstX = 0;
+    UINT dstY = 0;
+    UINT srcWidth = 0;
+    UINT srcHeight = 0;
+  };
+  std::vector<UploadFootprint> pendingFootprints;  // Index-aligned with pendingUploads.
+
+  std::shared_ptr<D3D12Semaphore> pendingSignalSemaphore;  // Moved out by the next submit().
+  std::shared_ptr<D3D12Semaphore> pendingWaitSemaphore;    // Moved out by the next submit().
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Defines.h b/src/gpu/d3d12/D3D12Defines.h
new file mode 100644
index 000000000..6bee1d903
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Defines.h
@@ -0,0 +1,98 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <cstddef>
+#include "tgfx/gpu/PixelFormat.h"
+
+namespace tgfx {
+
+// Numeric DXGI_FORMAT values mirrored from dxgiformat.h as namespace-scoped constants, so this
+// header needs no Windows SDK include. Listed in ascending numeric order.
+constexpr unsigned DXGI_FORMAT_UNKNOWN = 0;
+constexpr unsigned DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20;
+constexpr unsigned DXGI_FORMAT_R8G8B8A8_UNORM = 28;
+constexpr unsigned DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29;
+constexpr unsigned DXGI_FORMAT_D24_UNORM_S8_UINT = 45;
+constexpr unsigned DXGI_FORMAT_R8G8_UNORM = 49;
+constexpr unsigned DXGI_FORMAT_R8_UNORM = 61;
+constexpr unsigned DXGI_FORMAT_A8_UNORM = 65;
+constexpr unsigned DXGI_FORMAT_B8G8R8A8_UNORM = 87;
+constexpr unsigned DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91;
+
+// Maps a DXGI_FORMAT value to a tgfx PixelFormat. sRGB variants collapse onto the same
+// PixelFormat as their UNORM counterpart; any value without a mapping yields RGBA_8888.
+inline PixelFormat DXGIFormatToPixelFormat(unsigned dxgiFormat) {
+  if (dxgiFormat == DXGI_FORMAT_R8_UNORM || dxgiFormat == DXGI_FORMAT_A8_UNORM) {
+    return PixelFormat::ALPHA_8;
+  }
+  if (dxgiFormat == DXGI_FORMAT_R8G8_UNORM) {
+    return PixelFormat::RG_88;
+  }
+  if (dxgiFormat == DXGI_FORMAT_B8G8R8A8_UNORM || dxgiFormat == DXGI_FORMAT_B8G8R8A8_UNORM_SRGB) {
+    return PixelFormat::BGRA_8888;
+  }
+  if (dxgiFormat == DXGI_FORMAT_D24_UNORM_S8_UINT ||
+      dxgiFormat == DXGI_FORMAT_D32_FLOAT_S8X24_UINT) {
+    return PixelFormat::DEPTH24_STENCIL8;
+  }
+  // RGBA8 (UNORM / sRGB) and every unrecognised value.
+  return PixelFormat::RGBA_8888;
+}
+
+// Maps a tgfx PixelFormat to its DXGI_FORMAT value, or DXGI_FORMAT_UNKNOWN when unmapped.
+inline unsigned PixelFormatToDXGIFormat(PixelFormat format) {
+  switch (format) {
+    case PixelFormat::RGBA_8888:
+      return DXGI_FORMAT_R8G8B8A8_UNORM;
+    case PixelFormat::BGRA_8888:
+      return DXGI_FORMAT_B8G8R8A8_UNORM;
+    case PixelFormat::RG_88:
+      return DXGI_FORMAT_R8G8_UNORM;
+    case PixelFormat::ALPHA_8:
+    case PixelFormat::GRAY_8:
+      return DXGI_FORMAT_R8_UNORM;
+    case PixelFormat::DEPTH24_STENCIL8:
+      return DXGI_FORMAT_D24_UNORM_S8_UINT;
+    default:
+      return DXGI_FORMAT_UNKNOWN;
+  }
+}
+
+// Returns the size in bytes of one texel of `dxgiFormat`.
+inline size_t DXGIFormatBytesPerPixel(unsigned dxgiFormat) {
+  size_t bytes = 4;  // 32-bit colour formats, D24S8, and any unrecognised value.
+  switch (dxgiFormat) {
+    case DXGI_FORMAT_R8_UNORM:
+    case DXGI_FORMAT_A8_UNORM:
+      bytes = 1;
+      break;
+    case DXGI_FORMAT_R8G8_UNORM:
+      bytes = 2;
+      break;
+    case DXGI_FORMAT_D32_FLOAT_S8X24_UINT:
+      bytes = 8;
+      break;
+    default:
+      break;
+  }
+  return bytes;
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12DescriptorRing.cpp b/src/gpu/d3d12/D3D12DescriptorRing.cpp
new file mode 100644
index 000000000..4b56e5bcf
--- /dev/null
+++ b/src/gpu/d3d12/D3D12DescriptorRing.cpp
@@ -0,0 +1,142 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12DescriptorRing.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+bool D3D12DescriptorRing::init(ID3D12Device* device, D3D12_DESCRIPTOR_HEAP_TYPE type,
+                               uint32_t capacity, bool shaderVisible) {
+  if (device == nullptr || capacity == 0) {
+    return false;
+  }
+  // SHADER_VISIBLE is illegal on RTV/DSV heaps, so the flag follows the caller's request.
+  D3D12_DESCRIPTOR_HEAP_DESC heapDesc = {};
+  heapDesc.Type = type;
+  heapDesc.NumDescriptors = capacity;
+  heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE;
+  if (shaderVisible) {
+    heapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+  }
+  if (FAILED(device->CreateDescriptorHeap(&heapDesc, IID_PPV_ARGS(&_heap)))) {
+    LOGE("D3D12DescriptorRing::init() CreateDescriptorHeap failed (type=%d capacity=%u).",
+         static_cast<int>(type), capacity);
+    return false;
+  }
+  _capacity = capacity;
+  _descriptorSize = device->GetDescriptorHandleIncrementSize(type);
+  cpuBase = _heap->GetCPUDescriptorHandleForHeapStart();
+  gpuBase = D3D12_GPU_DESCRIPTOR_HANDLE{};  // Stays zeroed for non-shader-visible rings.
+  if (shaderVisible) {
+    gpuBase = _heap->GetGPUDescriptorHandleForHeapStart();
+  }
+  // Start from a pristine ring: no allocations, nothing pending, and no inflight records left
+  // over from an earlier init() that would refer to the heap just replaced above.
+  inflight.clear();
+  head = 0;
+  committedHead = 0;
+  outstandingSlots = 0;
+  return true;
+}
+
+D3D12DescriptorRing::Range D3D12DescriptorRing::allocate(uint32_t count) {
+  if (_heap == nullptr || count == 0 || count > _capacity) {
+    return {};
+  }
+  if (outstandingSlots == 0) {
+    // Nothing is pending or inflight, so no slot is still referenced: rewind to slot 0.
+    // Otherwise a stale head makes the wrap-around skip below reject a request that an empty
+    // ring can always satisfy (e.g. head=8, capacity=10, count=9 would need 11 slots).
+    head = 0;
+    committedHead = 0;
+  }
+  uint32_t available = _capacity - outstandingSlots;
+  uint32_t needed = count;
+  uint32_t startSlot = head;
+  uint32_t skipped = 0;
+  if (head + count > _capacity) {
+    // Never split a range across the wrap boundary: callers pass one contiguous CPU/GPU
+    // descriptor range to D3D12. The slots skipped at the end are billed to this allocation.
+    skipped = _capacity - head;
+    needed = count + skipped;
+    startSlot = 0;
+  }
+  if (needed > available) {
+    LOGE(
+        "D3D12DescriptorRing::allocate() out of slots: requested=%u free=%u capacity=%u "
+        "skipped=%u.",
+        count, available, _capacity, skipped);
+    return {};
+  }
+  Range range = {};
+  range.cpuStart = cpuBase;
+  range.cpuStart.ptr += static_cast<SIZE_T>(startSlot) * _descriptorSize;
+  range.gpuStart = gpuBase;
+  range.gpuStart.ptr += static_cast<UINT64>(startSlot) * _descriptorSize;
+  range.startSlot = startSlot;
+  range.count = count;
+  // startSlot + count never exceeds _capacity here, so the modulo only maps "end" back to 0.
+  head = (startSlot + count) % _capacity;
+  outstandingSlots += needed;
+  return range;
+}
+
+void D3D12DescriptorRing::commit(uint64_t fenceValue) {
+  // outstandingSlots counts every slot not yet retired: those already attributed to an inflight
+  // fence plus those allocated (wrap-around skips included) since the previous commit().
+  // Subtracting the inflight total therefore yields exactly what this fence must own. The
+  // (head, committedHead) distance is not usable for this: it is 0 both when nothing was
+  // allocated and when a full capacity was, so an empty commit() behind inflight work would be
+  // billed a whole capacity and retire() would later zero the counter under live ranges.
+  uint32_t inflightSlots = 0;
+  for (const auto& entry : inflight) {
+    inflightSlots += entry.slots;
+  }
+  committedHead = head;
+  if (outstandingSlots <= inflightSlots) {
+    // Nothing was allocated since the last commit; skip enqueuing an empty fence record.
+    return;
+  }
+  InflightRange pending = {};
+  pending.fenceValue = fenceValue;
+  pending.slots = outstandingSlots - inflightSlots;
+  inflight.push_back(pending);
+}
+
+void D3D12DescriptorRing::retire(uint64_t completedFenceValue) {
+  // Records are queued in commit() order, so draining stops at the first unfinished fence.
+  while (!inflight.empty()) {
+    const InflightRange& oldest = inflight.front();
+    if (oldest.fenceValue > completedFenceValue) {
+      break;
+    }
+    // Clamp at zero rather than letting the unsigned counter wrap and block every allocation.
+    outstandingSlots = (outstandingSlots >= oldest.slots) ? (outstandingSlots - oldest.slots) : 0;
+    inflight.pop_front();
+  }
+}
+
+void D3D12DescriptorRing::resetForContextLost() {
+  // The descriptor heap itself is kept; only the allocation bookkeeping is wiped, so the next
+  // allocate() starts again from slot 0 with the full capacity free.
+  inflight.clear();
+  head = committedHead = outstandingSlots = 0;
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12DescriptorRing.h b/src/gpu/d3d12/D3D12DescriptorRing.h
new file mode 100644
index 000000000..efd6c4289
--- /dev/null
+++ b/src/gpu/d3d12/D3D12DescriptorRing.h
@@ -0,0 +1,144 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <deque>
+#include "D3D12Util.h"
+
+namespace tgfx {
+
+/**
+ * Single descriptor heap (shader-visible by default) used as a fence-tracked ring buffer.
+ *
+ * Rationale:
+ *   - D3D12 shader-visible descriptor heaps are expensive to create (they reserve a GPU virtual
+ *     address range and the runtime caps total live shader-visible descriptors per heap type).
+ *   - The naive "create one heap per render pass" pattern hits driver limits and burns CPU on
+ *     every submission. The standard D3D12 idiom is one large heap per heap type, sub-allocated
+ *     linearly with fence-based reclamation.
+ *
+ * Allocation model:
+ *   - allocate(count) hands out a contiguous slot range at head; a range that would straddle
+ *     the end of the heap restarts at slot 0 and the skipped slots are billed with it.
+ *   - commit(fenceValue) attributes every slot allocated since the last commit to `fenceValue`.
+ *   - retire(completedFenceValue) releases every committed range whose fence has completed,
+ *     returning its slots to the free pool. There is no tail index: occupancy is tracked by
+ *     the outstandingSlots counter.
+ *   - When a request does not fit in the free slots, allocate() returns an invalid Range; the
+ *     caller must treat that as a hard failure (capacity should be sized to make this
+ *     unreachable in normal use).
+ *
+ * Thread safety: not thread-safe. Caller must serialise all access. Matches tgfx's overall
+ * single-threaded D3D12 backend usage.
+ */
+class D3D12DescriptorRing {
+ public:
+  struct Range {
+    D3D12_CPU_DESCRIPTOR_HANDLE cpuStart = {};
+    D3D12_GPU_DESCRIPTOR_HANDLE gpuStart = {};
+    uint32_t startSlot = 0;
+    uint32_t count = 0;
+    bool valid() const {
+      return count > 0;
+    }
+  };
+
+  D3D12DescriptorRing() = default;
+
+  /**
+   * Creates the underlying descriptor heap. Pass D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV /
+   * SAMPLER for shader-visible rings (the default), or RTV / DSV with shaderVisible=false for
+   * the non-shader-visible variants used by render targets. D3D12 rejects SHADER_VISIBLE on
+   * RTV/DSV heaps, so the flag must follow the heap type.
+   */
+  bool init(ID3D12Device* device, D3D12_DESCRIPTOR_HEAP_TYPE type, uint32_t capacity,
+            bool shaderVisible = true);
+
+  /**
+   * Sub-allocates `count` consecutive slots. Returns an invalid Range if the ring cannot satisfy
+   * the request without overrunning still-in-flight slots.
+   */
+  Range allocate(uint32_t count);
+
+  /**
+   * Marks every slot allocated since the previous commit() as belonging to `fenceValue`. Those
+   * slots become reclaimable only after the GPU advances the fence past `fenceValue`.
+   */
+  void commit(uint64_t fenceValue);
+
+  /**
+   * Reclaims slots whose owning fence value is at or below `completedFenceValue`. Cheap;
+   * intended to be called from the same place as the existing inflight-submission polling.
+   */
+  void retire(uint64_t completedFenceValue);
+
+  /**
+   * Drops every inflight range and resets head, committedHead and outstandingSlots while
+   * keeping the underlying ID3D12DescriptorHeap allocated. Intended for the context-lost
+   * recovery path: once D3D12GPU has decided the device is gone, the fences associated with
+   * those inflight ranges will never advance, so retire() would never reclaim them. Without
+   * this reset their slots stay billed against outstandingSlots and the ring would refuse
+   * every subsequent allocation forever, even if the application keeps the GPU instance
+   * around for diagnostics.
+   */
+  void resetForContextLost();
+
+  ID3D12DescriptorHeap* heap() const {
+    return _heap.Get();
+  }
+
+  uint32_t descriptorSize() const {
+    return _descriptorSize;
+  }
+
+  uint32_t capacity() const {
+    return _capacity;
+  }
+
+ private:
+  // head is the slot index modulo capacity at which the next allocate() will start writing.
+  // The classic (head, tail) pair would also be needed to ask "how full is the ring?", but
+  // that pair cannot disambiguate "empty" from "full" once an allocation pushes head back onto
+  // tail; we maintain an explicit outstandingSlots counter instead — allocate() bumps it,
+  // retire() drains it. Without that counter an allocation that spans the entire capacity (or
+  // a sequence whose head wraps right onto where tail used to be) would convince the next
+  // allocate() that the ring is empty and hand back slots the GPU is still reading.
+  uint32_t head = 0;
+  // Snapshot of head at the last commit() call. Slots in [committedHead, head) are part of the
+  // current pending submission; slots before that have already been associated with a fence.
+  uint32_t committedHead = 0;
+  // Slots currently held by either an as-yet-uncommitted allocation or an inflight commit
+  // waiting on its fence. allocate() rejects the request when needed > capacity - outstanding.
+  uint32_t outstandingSlots = 0;
+
+  struct InflightRange {
+    uint64_t fenceValue = 0;
+    // Slots consumed between the previous commit() and this one (including any wrap-around
+    // skip), returned to outstandingSlots when retire() reaches this entry.
+    uint32_t slots = 0;
+  };
+  std::deque<InflightRange> inflight;
+
+  ComPtr<ID3D12DescriptorHeap> _heap = nullptr;
+  uint32_t _capacity = 0;
+  uint32_t _descriptorSize = 0;
+  D3D12_CPU_DESCRIPTOR_HANDLE cpuBase = {};
+  D3D12_GPU_DESCRIPTOR_HANDLE gpuBase = {};
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Device.cpp b/src/gpu/d3d12/D3D12Device.cpp
new file mode 100644
index 000000000..27715395a
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Device.cpp
@@ -0,0 +1,177 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12Device.h"
+#include <dxgi1_4.h>
+#include "D3D12GPU.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+// Initialise the optional D3D12 debug layer and DRED settings shared by both Make() and
+// MakeWarp(). Idempotent — calling EnableDebugLayer / DRED setup twice is a no-op past the
+// first invocation, so the duplication does not cost anything in practice.
+static void EnableD3D12DebugFeatures() {
+#if !defined(NDEBUG) || defined(TGFX_D3D12_DEBUG_LAYER)
+  // Enable the D3D12 debug layer in debug builds, or in any build that defines
+  // TGFX_D3D12_DEBUG_LAYER. Must be called before D3D12CreateDevice. Validation messages
+  // surface as readable text instead of generic E_INVALIDARG return codes.
+  {
+    ComPtr<ID3D12Debug> debugController = nullptr;
+    if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController)))) {
+      debugController->EnableDebugLayer();
+    }
+  }
+#endif
+#if !defined(NDEBUG) || defined(TGFX_D3D12_DEBUG_LAYER) || defined(TGFX_D3D12_DRED)
+  // Enable Device Removed Extended Data so that, on a TDR/hang, we can ask the driver which
+  // command was the last one the GPU started and which one it was about to execute next. This
+  // is the cheapest way to localise a hang without attaching PIX. Must be requested before
+  // D3D12CreateDevice; queried later via D3D12GPU when GetDeviceRemovedReason() reports a fault.
+  {
+    ComPtr<ID3D12DeviceRemovedExtendedDataSettings> dredSettings = nullptr;
+    auto dredHr = D3D12GetDebugInterface(IID_PPV_ARGS(&dredSettings));
+    if (SUCCEEDED(dredHr)) {
+      dredSettings->SetAutoBreadcrumbsEnablement(D3D12_DRED_ENABLEMENT_FORCED_ON);
+      dredSettings->SetPageFaultEnablement(D3D12_DRED_ENABLEMENT_FORCED_ON);
+      LOGI("[DRED setup] Auto-breadcrumbs and page-fault tracking enabled.");
+    } else {
+      LOGE(
+          "[DRED setup] D3D12GetDebugInterface(ID3D12DeviceRemovedExtendedDataSettings) "
+          "returned HRESULT=0x%08X; DRED unavailable.",
+          static_cast<unsigned>(dredHr));
+    }
+  }
+#endif
+}
+
+std::shared_ptr<D3D12Device> D3D12Device::Make() {
+  EnableD3D12DebugFeatures();
+  ComPtr<IDXGIFactory4> factory = nullptr;
+  if (FAILED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)))) {
+    LOGE("D3D12Device::Make() Failed to create DXGI factory.");
+    return nullptr;
+  }
+  ComPtr<IDXGIAdapter1> adapter = nullptr;
+  // Stop on ANY enumeration failure rather than only DXGI_ERROR_NOT_FOUND: after a different
+  // error `adapter` holds no adapter, so the body would dereference null and never terminate.
+  for (UINT i = 0; SUCCEEDED(factory->EnumAdapters1(i, &adapter)); ++i) {
+    DXGI_ADAPTER_DESC1 desc = {};
+    adapter->GetDesc1(&desc);
+    // Software adapters are skipped here; MakeWarp() is the entry point for those.
+    const bool isSoftware = (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE) != 0;
+    ComPtr<ID3D12Device> d3d12Device = nullptr;
+    if (!isSoftware && SUCCEEDED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0,
+                                                   IID_PPV_ARGS(&d3d12Device)))) {
+      return MakeFrom(d3d12Device.Get());
+    }
+    adapter = nullptr;
+  }
+  LOGE("D3D12Device::Make() No suitable D3D12 hardware adapter found.");
+  return nullptr;
+}
+
+std::shared_ptr<D3D12Device> D3D12Device::MakeWarp() {
+  // WARP (Windows Advanced Rasterization Platform) is Microsoft's CPU-based D3D12 adapter. It
+  // is reached through IDXGIFactory4::EnumWarpAdapter instead of normal adapter enumeration
+  // and is meant for environments without a usable hardware adapter, such as CI runners.
+  EnableD3D12DebugFeatures();
+  ComPtr<IDXGIFactory4> dxgiFactory = nullptr;
+  auto hr = CreateDXGIFactory1(IID_PPV_ARGS(&dxgiFactory));
+  if (FAILED(hr)) {
+    LOGE("D3D12Device::MakeWarp() Failed to create DXGI factory.");
+    return nullptr;
+  }
+  ComPtr<IDXGIAdapter1> adapter = nullptr;
+  hr = dxgiFactory->EnumWarpAdapter(IID_PPV_ARGS(&adapter));
+  if (FAILED(hr)) {
+    LOGE("D3D12Device::MakeWarp() EnumWarpAdapter failed; WARP unavailable on this system.");
+    return nullptr;
+  }
+  ComPtr<ID3D12Device> warpDevice = nullptr;
+  hr = D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&warpDevice));
+  if (FAILED(hr)) {
+    LOGE("D3D12Device::MakeWarp() D3D12CreateDevice on WARP adapter failed.");
+    return nullptr;
+  }
+  return MakeFrom(warpDevice.Get());
+}
+
+// Wraps an existing ID3D12Device, passed as an opaque pointer, in a D3D12Device. Returns
+// nullptr when `device` is null, when the ID3D12Device interface cannot be obtained from it,
+// or when D3D12GPU::Make() fails.
+std::shared_ptr<D3D12Device> D3D12Device::MakeFrom(void* device) {
+  if (device == nullptr) {
+    return nullptr;
+  }
+  // QueryInterface hands back its own reference, leaving the caller's reference untouched.
+  ComPtr<ID3D12Device> ownedDevice = nullptr;
+  static_cast<ID3D12Device*>(device)->QueryInterface(IID_PPV_ARGS(&ownedDevice));
+  if (ownedDevice == nullptr) {
+    return nullptr;
+  }
+#if !defined(NDEBUG) || defined(TGFX_D3D12_DEBUG_LAYER)
+  // When the device exposes ID3D12InfoQueue, keep validation messages flowing to the debug
+  // output and raise the number of messages the queue retains to 8192.
+  ComPtr<ID3D12InfoQueue> debugQueue = nullptr;
+  if (SUCCEEDED(ownedDevice->QueryInterface(IID_PPV_ARGS(&debugQueue)))) {
+    debugQueue->SetMuteDebugOutput(FALSE);
+    debugQueue->SetMessageCountLimit(8192);
+    // Break into the debugger on CORRUPTION only; ERROR messages are queued without breaking.
+    debugQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_CORRUPTION, TRUE);
+    debugQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_ERROR, FALSE);
+  }
+#endif
+  auto gpu = D3D12GPU::Make(std::move(ownedDevice));
+  if (gpu == nullptr) {
+    return nullptr;
+  }
+  auto result = std::shared_ptr<D3D12Device>(new D3D12Device(std::move(gpu)));
+  result->weakThis = result;
+  return result;
+}
+
+// Hands ownership of the backend GPU object to the base Device.
+D3D12Device::D3D12Device(std::unique_ptr<D3D12GPU> gpu) : Device(std::move(gpu)) {
+}
+
+D3D12Device::~D3D12Device() {
+  // NOTE(review): `true` presumably tells the GPU to release its resources for good —
+  // confirm against D3D12GPU::releaseAll().
+  auto* gpu = static_cast<D3D12GPU*>(_gpu);
+  gpu->releaseAll(true);
+}
+
+// Returns the D3D12GPU's device() as an opaque pointer.
+void* D3D12Device::d3d12Device() const {
+  auto* gpu = static_cast<D3D12GPU*>(_gpu);
+  return gpu->device();
+}
+
+bool D3D12Device::onLockContext() {
+  // Refuse the lock once the GPU reports a lost context (for example after
+  // DXGI_ERROR_DEVICE_REMOVED). NOTE(review): the base Device::lockContext() is expected to
+  // release its mutex and return nullptr when this returns false — confirm against Device.
+  return !static_cast<D3D12GPU*>(_gpu)->isContextLost();
+}
+
+void D3D12Device::onUnlockContext() {
+  // Nothing to undo: onLockContext() only inspects the context-lost flag.
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Device.h b/src/gpu/d3d12/D3D12Device.h
new file mode 100644
index 000000000..2780def34
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Device.h
@@ -0,0 +1,29 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "D3D12GPU.h"
+#include "tgfx/gpu/d3d12/D3D12Device.h"
+
+namespace tgfx {
+
+// Private implementation details for D3D12Device.
+// The public interface is in include/tgfx/gpu/d3d12/D3D12Device.h
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12FrameSession.h b/src/gpu/d3d12/D3D12FrameSession.h
new file mode 100644
index 000000000..6736edf22
--- /dev/null
+++ b/src/gpu/d3d12/D3D12FrameSession.h
@@ -0,0 +1,99 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "gpu/d3d12/D3D12Resource.h"
+#include "gpu/d3d12/D3D12Util.h"
+
+namespace tgfx {
+
+class D3D12Texture;
+
+/**
+ * Value-type aggregate of all per-frame GPU resources produced during one encoding session.
+ *
+ * Why D3D12 needs this (mirrors VulkanFrameSession's role):
+ *   - D3D12, like Vulkan, provides NO automatic resource tracking. ID3D12GraphicsCommandList is
+ *     deferred: it executes on the GPU long after recording finishes. If an ID3D12Resource backing
+ *     a buffer or texture is released before the GPU finishes reading it, behaviour is undefined.
+ *     Likewise, ID3D12CommandAllocator must outlive every command list it produced until the GPU
+ *     consumes them, otherwise Reset() will fail or recorded commands will be corrupted.
+ *   - The application must explicitly keep allocator, command list, and any referenced resources
+ *     alive until the associated ID3D12Fence value signals on the queue.
+ *
+ * D3D12FrameSession is the single place where "everything this frame needs" is defined. It is
+ * moved (not copied) through the pipeline: Encoder -> CommandBuffer -> InflightSubmission. Cleanup
+ * happens exclusively after the queue's fence confirms GPU completion.
+ *
+ * Differences from Vulkan's FrameSession:
+ *   - No descriptor pools: D3D12 binds through descriptor heaps. Any heap that must outlive
+ *     this frame's GPU work is kept alive through retainedDescriptorHeaps below.
+ *   - No render passes / framebuffers: D3D12 has no equivalent persistent objects; render targets
+ *     are bound through RTV/DSV descriptors at record time.
+ *   - retainedResources holds D3D12Resource subclasses (buffers, textures, samplers, pipelines).
+ *
+ * Adding a new per-frame resource type requires only two changes:
+ *   1. Add a field here.
+ *   2. Add cleanup logic in D3D12GPU::reclaimSubmission() (introduced in a later step).
+ */
+struct D3D12FrameSession {
+  ComPtr<ID3D12CommandAllocator> commandAllocator;
+  ComPtr<ID3D12GraphicsCommandList> commandList;
+  // Strong references preventing D3D12Resource destruction while GPU is still executing.
+  // When cleared after the fence signals, refcounts decrement; resources reaching zero enter the
+  // ReturnQueue and are safely destroyed during processUnreferencedResources().
+  std::vector<std::shared_ptr<D3D12Resource>> retainedResources;
+  // Shader-visible descriptor heaps (CBV/SRV/UAV and Sampler) backing
+  // SetGraphicsRootDescriptorTable; the GPU reads them until the fence signals, so they are
+  // released only afterwards. NOTE(review): confirm how these coexist with D3D12DescriptorRing.
+  std::vector<ComPtr<ID3D12DescriptorHeap>> retainedDescriptorHeaps;
+  // Auxiliary command allocators/lists used to record one-off work (texture uploads) outside the
+  // main command list. Captured here so they outlive GPU execution; freed after the fence signals.
+  std::vector<ComPtr<ID3D12CommandAllocator>> auxAllocators;
+  std::vector<ComPtr<ID3D12GraphicsCommandList>> auxCommandLists;
+  // Auxiliary ID3D12Resource buffers (e.g. transient staging buffers used by
+  // copyTextureToBuffer for row-pitch alignment) that must live until the fence signals.
+  std::vector<ComPtr<ID3D12Resource>> auxBuffers;
+
+  // Original D3D12_RESOURCE_STATES of every texture this session is about to mutate, keyed by
+  // raw D3D12Texture* (raw is fine because the matching shared_ptr is held in
+  // retainedResources for the session's lifetime). Populated by the helper
+  // recordTextureStateChange(): the first call for a given texture saves its current state,
+  // subsequent calls do nothing. If the session is abandoned (~D3D12CommandBuffer or
+  // ~D3D12CommandEncoder before submit), reclaimAbandonedSession walks this map to roll
+  // D3D12Texture::_currentState back to what the GPU still sees, preventing the next render
+  // pass from emitting transitions whose StateBefore disagrees with reality.
+  std::unordered_map<D3D12Texture*, D3D12_RESOURCE_STATES> initialTextureStates;
+
+  D3D12FrameSession() = default;
+
+  // ComPtr is move-aware and zeroes the source on move; std::vector moves are also clean. The
+  // explicit move/copy declarations match VulkanFrameSession for consistency and to make the
+  // copy-deletion intent obvious at the call site.
+  D3D12FrameSession(D3D12FrameSession&& other) noexcept = default;
+  D3D12FrameSession& operator=(D3D12FrameSession&& other) noexcept = default;
+
+  D3D12FrameSession(const D3D12FrameSession&) = delete;
+  D3D12FrameSession& operator=(const D3D12FrameSession&) = delete;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12GPU.cpp b/src/gpu/d3d12/D3D12GPU.cpp
new file mode 100644
index 000000000..2592529ad
--- /dev/null
+++ b/src/gpu/d3d12/D3D12GPU.cpp
@@ -0,0 +1,956 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12GPU.h"
+#include <dxgi1_4.h>
+#include <algorithm>
+#include <shaderc/shaderc.hpp>
+#include <string>
+#include <vector>
+#include "D3D12Buffer.h"
+#include "D3D12CommandEncoder.h"
+#include "D3D12CommandQueue.h"
+#include "D3D12MipmapGenerator.h"
+#include "D3D12RenderPipeline.h"
+#include "D3D12Resource.h"
+#include "D3D12Sampler.h"
+#include "D3D12Semaphore.h"
+#include "D3D12ShaderModule.h"
+#include "D3D12Texture.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+bool HardwareBufferAvailable() {
+  return false;  // Hardware buffers are reported as unavailable when building this backend.
+}
+
+#ifdef TGFX_D3D12_DEBUG_LAYER
+// Logs each stored ID3D12InfoQueue message, skipping entries with no valid size, then clears them.
+void D3D12GPU::drainDebugMessages(const char* tag) {
+  ComPtr<ID3D12InfoQueue> infoQueue = nullptr;
+  if (d3d12Device == nullptr || FAILED(d3d12Device->QueryInterface(IID_PPV_ARGS(&infoQueue)))) {
+    return;
+  }
+  auto count = infoQueue->GetNumStoredMessages();
+  for (UINT64 i = 0; i < count; i++) {
+    SIZE_T msgLen = 0;
+    if (FAILED(infoQueue->GetMessage(i, nullptr, &msgLen)) || msgLen < sizeof(D3D12_MESSAGE)) {
+      continue;
+    }
+    std::vector<char> buf(msgLen);
+    auto* msg = reinterpret_cast<D3D12_MESSAGE*>(buf.data());
+    if (SUCCEEDED(infoQueue->GetMessage(i, msg, &msgLen))) {
+      LOGE("[D3D12 debug @ %s] %.*s", tag, static_cast<int>(msg->DescriptionByteLength),
+           msg->pDescription);
+    }
+  }
+  if (count > 0) {
+    infoQueue->ClearStoredMessages();
+  }
+}
+#else
+void D3D12GPU::drainDebugMessages(const char*) {
+}
+#endif
+
+// Returns the DXGI adapter whose LUID matches the device's; nullptr if enumeration ends or fails.
+static ComPtr<IDXGIAdapter1> FindAdapter(ID3D12Device* device) {
+  ComPtr<IDXGIFactory4> factory = nullptr;
+  if (FAILED(CreateDXGIFactory1(IID_PPV_ARGS(&factory)))) {
+    return nullptr;
+  }
+  LUID luid = device->GetAdapterLuid();
+  ComPtr<IDXGIAdapter1> adapter = nullptr;
+  for (UINT i = 0; SUCCEEDED(factory->EnumAdapters1(i, &adapter)); ++i) {
+    DXGI_ADAPTER_DESC1 desc = {};
+    adapter->GetDesc1(&desc);
+    if (desc.AdapterLuid.LowPart == luid.LowPart && desc.AdapterLuid.HighPart == luid.HighPart) {
+      return adapter;
+    }
+  }
+  return nullptr;
+}
+
+// Builds a D3D12GPU around device; returns nullptr if the device is null or setup was incomplete.
+std::unique_ptr<D3D12GPU> D3D12GPU::Make(ComPtr<ID3D12Device> device) {
+  if (device == nullptr) {
+    return nullptr;
+  }
+  auto matchedAdapter = FindAdapter(device.Get());
+  std::unique_ptr<D3D12GPU> result(new D3D12GPU(std::move(device), std::move(matchedAdapter)));
+  // The constructor returns early on failure, so verify each piece it was supposed to create.
+  bool initialised = result->commandQueue != nullptr && result->_frameFence != nullptr &&
+                     result->_frameFenceEvent != nullptr && result->_srvRing.heap() != nullptr &&
+                     result->_samplerHeap != nullptr && result->_uploadHeap.capacity() != 0;
+  return initialised ? std::move(result) : nullptr;
+}
+
+// Creates the per-GPU D3D12 objects in sequence, logging and returning early on any failure.
+D3D12GPU::D3D12GPU(ComPtr<ID3D12Device> device, ComPtr<IDXGIAdapter1> adapter)
+    : d3d12Device(std::move(device)), dxgiAdapter(std::move(adapter)) {
+  initInfo();
+  initFeatures();
+  initLimits();
+  // Create the per-frame fence and its waitable event before the command queue, since the queue
+  // uses these handles when scheduling Signal/Wait operations.
+  if (FAILED(d3d12Device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&_frameFence)))) {
+    LOGE("D3D12GPU: failed to create frame fence.");
+    return;
+  }
+  _frameFenceEvent = CreateEventW(nullptr, FALSE, FALSE, nullptr);
+  if (_frameFenceEvent == nullptr) {
+    LOGE("D3D12GPU: failed to create frame fence event.");
+    _frameFence = nullptr;
+    return;
+  }
+  // Allocate the process-wide shader-visible CBV/SRV/UAV ring up front. Failure here means we
+  // cannot satisfy any subsequent render-pass binding, so propagate it through to Make() via the
+  // null-heap check rather than letting the GPU come up half-initialised.
+  if (!_srvRing.init(d3d12Device.Get(), D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV,
+                     SRV_RING_CAPACITY)) {
+    LOGE("D3D12GPU: failed to initialise CBV/SRV/UAV descriptor ring.");
+    return;
+  }
+  // Allocate the process-wide shader-visible Sampler heap. Append-only, capped at D3D12's hard
+  // 2048-descriptor limit. Each unique SamplerDescriptor consumes one slot for the lifetime of
+  // the GPU instance.
+  D3D12_DESCRIPTOR_HEAP_DESC samplerDesc = {};
+  samplerDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER;
+  samplerDesc.NumDescriptors = SAMPLER_HEAP_CAPACITY;
+  samplerDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
+  if (FAILED(d3d12Device->CreateDescriptorHeap(&samplerDesc, IID_PPV_ARGS(&_samplerHeap)))) {
+    LOGE("D3D12GPU: failed to create shader-visible Sampler heap.");
+    return;
+  }
+  _samplerHeapCapacity = SAMPLER_HEAP_CAPACITY;
+  _samplerDescriptorIncrement =
+      d3d12Device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_SAMPLER);
+  // Allocate the process-wide UPLOAD ring used to stage CPU-to-GPU pixel/buffer data. Failure
+  // is surfaced through the capacity-zero check in Make() so the GPU does not come up with a
+  // partially-functional upload path.
+  if (!_uploadHeap.init(d3d12Device.Get(), UPLOAD_HEAP_CAPACITY)) {
+    LOGE("D3D12GPU: failed to initialise UPLOAD heap.");
+    return;
+  }
+  // Non-shader-visible RTV / DSV rings avoid a CreateDescriptorHeap call per render pass.
+  // OMSetRenderTargets consumes CPU descriptor handles only, so one shared heap per type is
+  // enough; slots are ring-buffered by fence value like the SRV ring (the sampler heap is
+  // append-only) so descriptors stay valid until the command list using them has retired.
+  if (!_rtvRing.init(d3d12Device.Get(), D3D12_DESCRIPTOR_HEAP_TYPE_RTV, RTV_RING_CAPACITY, false)) {
+    LOGE("D3D12GPU: failed to initialise RTV descriptor ring.");
+    return;
+  }
+  if (!_dsvRing.init(d3d12Device.Get(), D3D12_DESCRIPTOR_HEAP_TYPE_DSV, DSV_RING_CAPACITY, false)) {
+    LOGE("D3D12GPU: failed to initialise DSV descriptor ring.");
+    return;
+  }
+  commandQueue = std::make_unique<D3D12CommandQueue>(this);
+  compiler = std::make_unique<shaderc::Compiler>();
+}
+
+// Fills the next unused sampler-heap slot with desc and returns its GPU handle (zero on failure).
+D3D12_GPU_DESCRIPTOR_HANDLE D3D12GPU::allocatePermanentSamplerSlot(const D3D12_SAMPLER_DESC& desc) {
+  if (_samplerHeap == nullptr || _samplerHeapSize >= _samplerHeapCapacity) {
+    LOGE("D3D12GPU::allocatePermanentSamplerSlot: sampler heap exhausted (%u/%u).",
+         _samplerHeapSize, _samplerHeapCapacity);
+    return D3D12_GPU_DESCRIPTOR_HANDLE{};
+  }
+  const auto index = _samplerHeapSize++;
+  auto cpuSlot = _samplerHeap->GetCPUDescriptorHandleForHeapStart();
+  cpuSlot.ptr += static_cast<SIZE_T>(index) * _samplerDescriptorIncrement;
+  d3d12Device->CreateSampler(&desc, cpuSlot);
+  auto gpuSlot = _samplerHeap->GetGPUDescriptorHandleForHeapStart();
+  gpuSlot.ptr += static_cast<UINT64>(index) * _samplerDescriptorIncrement;
+  return gpuSlot;
+}
+
+D3D12GPU::~D3D12GPU() {
+  DEBUG_ASSERT(returnQueue == nullptr);  // releaseAll() resets this; it must have run first.
+  DEBUG_ASSERT(resources.empty());       // releaseAll() also empties the tracked-resource list.
+  if (_frameFenceEvent != nullptr) {
+    CloseHandle(_frameFenceEvent);  // Event handle created by CreateEventW in the constructor.
+    _frameFenceEvent = nullptr;
+  }
+}
+
+// Populates _info with the backend id, a fixed version string, and the renderer/vendor names taken
+// from the DXGI adapter description; placeholder names are used when no adapter was found.
+void D3D12GPU::initInfo() {
+  _info.backend = Backend::D3D12;
+  _info.version = "Direct3D 12";
+  if (dxgiAdapter == nullptr) {
+    _info.renderer = "Unknown D3D12 Device";
+    _info.vendor = "Unknown";
+    return;
+  }
+  DXGI_ADAPTER_DESC1 adapterDesc = {};
+  dxgiAdapter->GetDesc1(&adapterDesc);
+  // Convert the UTF-16 adapter description to UTF-8: the first call sizes the buffer, the second
+  // one fills it.
+  const std::wstring wideName(adapterDesc.Description);
+  const int wideLength = static_cast<int>(wideName.size());
+  const int utf8Length =
+      WideCharToMultiByte(CP_UTF8, 0, wideName.data(), wideLength, nullptr, 0, nullptr, nullptr);
+  _info.renderer.resize(static_cast<size_t>(utf8Length));
+  WideCharToMultiByte(CP_UTF8, 0, wideName.data(), wideLength, _info.renderer.data(), utf8Length,
+                      nullptr, nullptr);
+  // Translate the adapter's VendorId into a display name; unrecognised ids become "Unknown".
+  const UINT vendorId = adapterDesc.VendorId;
+  _info.vendor = vendorId == 0x10DE   ? "NVIDIA"
+                 : vendorId == 0x1002 ? "AMD"
+                 : vendorId == 0x8086 ? "Intel"
+                 : vendorId == 0x1414 ? "Microsoft"
+                                      : "Unknown";
+}
+
+// Declares which optional GPU features this backend exposes.
+void D3D12GPU::initFeatures() {
+  // A D3D12 resource cannot be in RENDER_TARGET and PIXEL_SHADER_RESOURCE state at once, so the
+  // current RTV cannot be sampled inside its own render pass (no glTextureBarrier() analogue).
+  // Leaving this off, as VulkanCaps does, makes OpsCompositor::makeDstTextureInfo copy the
+  // destination to a temp texture for blend modes that read it (Lighten / Darken / etc.).
+  _features.textureBarrier = false;
+  _features.semaphore = true;
+  _features.clampToBorder = true;
+}
+
+void D3D12GPU::initLimits() {
+  D3D12_FEATURE_DATA_D3D12_OPTIONS options = {};
+  if (SUCCEEDED(d3d12Device->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS, &options,
+                                                 sizeof(options)))) {
+    // The resource binding tier selects the per-stage sampler limit: 16 for TIER_1, else 2048.
+    switch (options.ResourceBindingTier) {
+      case D3D12_RESOURCE_BINDING_TIER_1:
+        _limits.maxSamplersPerShaderStage = 16;
+        break;
+      case D3D12_RESOURCE_BINDING_TIER_2:
+      case D3D12_RESOURCE_BINDING_TIER_3:
+      default:
+        _limits.maxSamplersPerShaderStage = 2048;
+        break;
+    }
+  } else {
+    _limits.maxSamplersPerShaderStage = 16;  // Query failed: use the TIER_1 value.
+  }
+  _limits.maxTextureDimension2D = D3D12_REQ_TEXTURE2D_U_OR_V_DIMENSION;
+  _limits.maxUniformBufferBindingSize = D3D12_REQ_CONSTANT_BUFFER_ELEMENT_COUNT * 16;
+  _limits.minUniformBufferOffsetAlignment = D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT;
+}
+
+CommandQueue* D3D12GPU::queue() const {
+  return commandQueue.get();  // Non-owning pointer; null if the constructor bailed out early.
+}
+
+const shaderc::Compiler* D3D12GPU::shaderCompiler() const {
+  return compiler.get();  // Non-owning pointer to the compiler created in the constructor.
+}
+
+// Reports whether the device supports format as a render-target or depth-stencil attachment.
+// Unknown formats and failed capability queries are reported as not renderable.
+bool D3D12GPU::isFormatRenderable(PixelFormat format) const {
+  const auto nativeFormat = PixelFormatToDXGIFormat(format);
+  if (nativeFormat == DXGI_FORMAT_UNKNOWN) {
+    return false;
+  }
+  D3D12_FEATURE_DATA_FORMAT_SUPPORT query = {};
+  query.Format = static_cast<DXGI_FORMAT>(nativeFormat);
+  const auto hr =
+      d3d12Device->CheckFeatureSupport(D3D12_FEATURE_FORMAT_SUPPORT, &query, sizeof(query));
+  // Depth-stencil support counts as renderable too, matching what the Vulkan/Metal back-ends
+  // report, so backend-agnostic gates such as createTexture's RENDER_ATTACHMENT pre-check treat
+  // depth-stencil formats uniformly.
+  constexpr UINT attachmentBits =
+      D3D12_FORMAT_SUPPORT1_RENDER_TARGET | D3D12_FORMAT_SUPPORT1_DEPTH_STENCIL;
+  return SUCCEEDED(hr) && (query.Support1 & attachmentBits) != 0;
+}
+
+std::shared_ptr<GPUBuffer> D3D12GPU::createBuffer(size_t size, uint32_t usage) {
+  if (size > 0) {
+    return D3D12Buffer::Make(this, size, usage);
+  }
+  return nullptr;  // Zero-sized buffers are rejected.
+}
+
+std::shared_ptr<Texture> D3D12GPU::createTexture(const TextureDescriptor& descriptor) {
+  if (descriptor.width <= 0 || descriptor.height <= 0) {
+    LOGE("D3D12GPU::createTexture() invalid dimensions: %dx%d", descriptor.width,
+         descriptor.height);
+    return nullptr;
+  }
+  if ((descriptor.usage & TextureUsage::RENDER_ATTACHMENT) &&  // Cheap flag test goes first.
+      !isFormatRenderable(descriptor.format)) {  // Device query; render attachments only.
+    LOGE("D3D12GPU::createTexture() format not renderable for render attachment");
+    return nullptr;
+  }
+  auto texture = D3D12Texture::Make(this, descriptor);
+  if (texture == nullptr) {
+    LOGE("D3D12GPU::createTexture() D3D12Texture::Make failed for %dx%d format=%d",
+         descriptor.width, descriptor.height, static_cast<int>(descriptor.format));
+  }
+  return texture;
+}
+
+// Returns the sampler for descriptor, creating it on first use. Results (including failures) are
+// cached under the key built by MakeSamplerKey.
+std::shared_ptr<Sampler> D3D12GPU::createSampler(const SamplerDescriptor& descriptor) {
+  const auto cacheKey = MakeSamplerKey(descriptor);
+  if (auto cached = samplerCache.find(cacheKey); cached != samplerCache.end()) {
+    return cached->second;
+  }
+  auto created = D3D12Sampler::Make(this, descriptor);
+  // A null result is cached on purpose. The set of keys is bounded by the enum ranges packed by
+  // MakeSamplerKey, so the 2048-slot sampler heap is not expected to fill up; remembering a
+  // failure still keeps a repeatedly failing key from re-running D3D12Sampler::Make (and its
+  // error log) on every draw call that needs that sampler.
+  samplerCache[cacheKey] = created;
+  return created;
+}
+
+// Looks up a cached root signature by its shape key; returns nullptr on a miss.
+ComPtr<ID3D12RootSignature> D3D12GPU::findRootSignature(const std::vector<uint8_t>& shapeKey) {
+  if (auto hit = rootSignatureCache.find(shapeKey); hit != rootSignatureCache.end()) {
+    return hit->second;
+  }
+  return nullptr;
+}
+
+// Stores rootSignature in the cache under shapeKey; null signatures are ignored.
+void D3D12GPU::cacheRootSignature(std::vector<uint8_t> shapeKey,
+                                  ComPtr<ID3D12RootSignature> rootSignature) {
+  if (rootSignature != nullptr) {
+    rootSignatureCache.emplace(std::move(shapeKey), std::move(rootSignature));
+  }
+}
+
+// Packs the descriptor's enum fields into one integer: addressModeX at bit 0, addressModeY at
+// bit 3, minFilter at bit 6, magFilter at bit 8 and mipmapMode at bit 10.
+uint32_t D3D12GPU::MakeSamplerKey(const SamplerDescriptor& descriptor) {
+  return static_cast<uint32_t>(descriptor.addressModeX) |
+         (static_cast<uint32_t>(descriptor.addressModeY) << 3) |
+         (static_cast<uint32_t>(descriptor.minFilter) << 6) |
+         (static_cast<uint32_t>(descriptor.magFilter) << 8) |
+         (static_cast<uint32_t>(descriptor.mipmapMode) << 10);
+}
+
+std::shared_ptr<ShaderModule> D3D12GPU::createShaderModule(
+    const ShaderModuleDescriptor& descriptor) {
+  // Memoise successfully built modules by (stage, std::hash of the source text) so a source
+  // shared by several programs skips the GLSL -> SPIR-V -> HLSL -> DXBC chain after its first
+  // use; failed builds are not cached. NOTE(review): the key stores only the hash, not the
+  // source, so two sources whose hashes collide would share one module - confirm acceptable.
+  ShaderCacheKey key = {};
+  key.stage = static_cast<uint32_t>(descriptor.stage);
+  key.sourceHash = std::hash<std::string>{}(descriptor.code);
+  if (auto it = shaderModuleCache.find(key); it != shaderModuleCache.end()) {
+    return it->second;
+  }
+  auto module = D3D12ShaderModule::Make(this, descriptor);
+  if (module != nullptr) {
+    shaderModuleCache.emplace(key, module);
+  }
+  return module;
+}
+
+std::shared_ptr<RenderPipeline> D3D12GPU::createRenderPipeline(
+    const RenderPipelineDescriptor& descriptor) {
+  return D3D12RenderPipeline::Make(this, descriptor);  // No caching is done at this level.
+}
+
+std::shared_ptr<CommandEncoder> D3D12GPU::createCommandEncoder() {
+  processUnreferencedResources();  // First destroy resources queued on the return queue.
+  return D3D12CommandEncoder::Make(this);
+}
+
+D3D12MipmapGenerator* D3D12GPU::mipmapGenerator() {
+  if (_mipmapGenerator == nullptr) {  // Created lazily on first request.
+    _mipmapGenerator = std::unique_ptr<D3D12MipmapGenerator>(new D3D12MipmapGenerator(this));
+    if (!_mipmapGenerator->isReady()) {
+      _mipmapGenerator = nullptr;  // Discard an unusable generator; the next call retries.
+    }
+  }
+  return _mipmapGenerator.get();  // Null when the generator could not be made ready.
+}
+
+// Returns the first supported sample count at or above requestedCount, or 1 if there is none.
+int D3D12GPU::getSampleCount(int requestedCount, PixelFormat pixelFormat) const {
+  if (requestedCount <= 1) {
+    return 1;
+  }
+  const auto nativeFormat = PixelFormatToDXGIFormat(pixelFormat);
+  if (nativeFormat == DXGI_FORMAT_UNKNOWN) {
+    return 1;
+  }
+  for (int count = requestedCount; count <= D3D12_MAX_MULTISAMPLE_SAMPLE_COUNT; count++) {
+    D3D12_FEATURE_DATA_MULTISAMPLE_QUALITY_LEVELS query = {};
+    query.Format = static_cast<DXGI_FORMAT>(nativeFormat);
+    query.SampleCount = static_cast<UINT>(count);
+    const auto hr = d3d12Device->CheckFeatureSupport(D3D12_FEATURE_MULTISAMPLE_QUALITY_LEVELS,
+                                                     &query, sizeof(query));
+    if (SUCCEEDED(hr) && query.NumQualityLevels > 0) {
+      return count;
+    }
+  }
+  return 1;
+}
+
+std::vector<std::shared_ptr<Texture>> D3D12GPU::importHardwareTextures(HardwareBufferRef,
+                                                                       uint32_t) {
+  // Hardware-buffer import is not implemented for D3D12 yet; both arguments are ignored.
+  return {};
+}
+
+// Wraps an externally created D3D12 resource in a Texture. Returns nullptr when the handle is not
+// a D3D12 one, carries no resource, or the resource does not expose ID3D12Resource.
+std::shared_ptr<Texture> D3D12GPU::importBackendTexture(const BackendTexture& backendTexture,
+                                                        uint32_t usage, bool adopted) {
+  D3D12TextureInfo info = {};
+  if (backendTexture.backend() != Backend::D3D12 ||
+      !backendTexture.getD3D12TextureInfo(&info) || info.resource == nullptr) {
+    return nullptr;
+  }
+  auto rawResource =
+      const_cast<ID3D12Resource*>(static_cast<const ID3D12Resource*>(info.resource));
+  ComPtr<ID3D12Resource> comResource = nullptr;
+  rawResource->QueryInterface(IID_PPV_ARGS(&comResource));
+  if (comResource == nullptr) {
+    return nullptr;
+  }
+  return D3D12Texture::MakeFrom(this, std::move(comResource), info.format, usage, adopted);
+}
+
+// Wraps an externally created D3D12 render target in a Texture with RENDER_ATTACHMENT usage and
+// adopted == false. Returns nullptr when the handle is not a D3D12 one, carries no resource, has
+// a non-renderable format, or the resource does not expose ID3D12Resource.
+std::shared_ptr<Texture> D3D12GPU::importBackendRenderTarget(
+    const BackendRenderTarget& backendRenderTarget) {
+  D3D12TextureInfo info = {};
+  if (backendRenderTarget.backend() != Backend::D3D12 ||
+      !backendRenderTarget.getD3D12TextureInfo(&info) || info.resource == nullptr) {
+    return nullptr;
+  }
+  if (!isFormatRenderable(backendRenderTarget.format())) {
+    return nullptr;
+  }
+  auto rawResource =
+      const_cast<ID3D12Resource*>(static_cast<const ID3D12Resource*>(info.resource));
+  ComPtr<ID3D12Resource> comResource = nullptr;
+  rawResource->QueryInterface(IID_PPV_ARGS(&comResource));
+  if (comResource == nullptr) {
+    return nullptr;
+  }
+  return D3D12Texture::MakeFrom(this, std::move(comResource), info.format,
+                                TextureUsage::RENDER_ATTACHMENT, false);
+}
+
+// Wraps an external D3D12 fence and its value in a Semaphore. Returns nullptr when the handle is
+// not a D3D12 one, carries no fence, or the fence does not expose ID3D12Fence.
+std::shared_ptr<Semaphore> D3D12GPU::importBackendSemaphore(const BackendSemaphore& semaphore) {
+  D3D12SyncInfo syncInfo = {};
+  if (semaphore.backend() != Backend::D3D12 || !semaphore.getD3D12Sync(&syncInfo) ||
+      syncInfo.fence == nullptr) {
+    return nullptr;
+  }
+  auto externalFence = const_cast<ID3D12Fence*>(static_cast<const ID3D12Fence*>(syncInfo.fence));
+  ComPtr<ID3D12Fence> comFence = nullptr;
+  externalFence->QueryInterface(IID_PPV_ARGS(&comFence));
+  if (comFence == nullptr) {
+    return nullptr;
+  }
+  return D3D12Semaphore::MakeFrom(this, std::move(comFence), syncInfo.value);
+}
+
+// Returns the backend handle only when at most two references to the semaphore exist (this
+// by-value parameter is one of them); otherwise returns an empty BackendSemaphore.
+BackendSemaphore D3D12GPU::stealBackendSemaphore(std::shared_ptr<Semaphore> semaphore) {
+  const bool exclusive = semaphore != nullptr && semaphore.use_count() <= 2;
+  return exclusive ? semaphore->getBackendSemaphore() : BackendSemaphore{};
+}
+
+std::shared_ptr<D3D12Resource> D3D12GPU::addResource(D3D12Resource* resource) {
+  DEBUG_ASSERT(resource != nullptr);
+  resources.push_back(resource);
+  resource->cachedPosition = --resources.end();  // Iterator to the entry appended just above.
+  return std::static_pointer_cast<D3D12Resource>(returnQueue->makeShared(resource));
+}
+
+void D3D12GPU::processUnreferencedResources() {
+  DEBUG_ASSERT(returnQueue != nullptr);
+  while (auto resource = static_cast<D3D12Resource*>(returnQueue->dequeue())) {
+    resources.erase(resource->cachedPosition);  // Position was recorded by addResource().
+    resource->onRelease(this);                  // Run the release hook before deleting.
+    delete resource;
+  }
+}
+
+// Maps a D3D12_AUTO_BREADCRUMB_OP value to a short display name for the DRED breadcrumb dump.
+// Only a subset of the enum is named here; every other value is reported as "<other>".
+static const char* AutoBreadcrumbOpName(D3D12_AUTO_BREADCRUMB_OP op) {
+  switch (op) {
+    case D3D12_AUTO_BREADCRUMB_OP_SETMARKER:
+      return "SetMarker";
+    case D3D12_AUTO_BREADCRUMB_OP_BEGINEVENT:
+      return "BeginEvent";
+    case D3D12_AUTO_BREADCRUMB_OP_ENDEVENT:
+      return "EndEvent";
+    case D3D12_AUTO_BREADCRUMB_OP_DRAWINSTANCED:
+      return "DrawInstanced";
+    case D3D12_AUTO_BREADCRUMB_OP_DRAWINDEXEDINSTANCED:
+      return "DrawIndexedInstanced";
+    case D3D12_AUTO_BREADCRUMB_OP_EXECUTEINDIRECT:
+      return "ExecuteIndirect";
+    case D3D12_AUTO_BREADCRUMB_OP_DISPATCH:
+      return "Dispatch";
+    case D3D12_AUTO_BREADCRUMB_OP_COPYBUFFERREGION:
+      return "CopyBufferRegion";
+    case D3D12_AUTO_BREADCRUMB_OP_COPYTEXTUREREGION:
+      return "CopyTextureRegion";
+    case D3D12_AUTO_BREADCRUMB_OP_COPYRESOURCE:
+      return "CopyResource";
+    case D3D12_AUTO_BREADCRUMB_OP_RESOLVESUBRESOURCE:
+      return "ResolveSubresource";
+    case D3D12_AUTO_BREADCRUMB_OP_CLEARRENDERTARGETVIEW:
+      return "ClearRenderTargetView";
+    case D3D12_AUTO_BREADCRUMB_OP_CLEARDEPTHSTENCILVIEW:
+      return "ClearDepthStencilView";
+    case D3D12_AUTO_BREADCRUMB_OP_CLEARUNORDEREDACCESSVIEW:
+      return "ClearUAV";
+    case D3D12_AUTO_BREADCRUMB_OP_RESOURCEBARRIER:
+      return "ResourceBarrier";
+    case D3D12_AUTO_BREADCRUMB_OP_PRESENT:
+      return "Present";
+    default:
+      return "<other>";
+  }
+}
+
+// Marks the context as lost (first call only), resets the ring allocators, then dumps DRED data.
+void D3D12GPU::markContextLost(const char* tag) {
+  if (contextLost) {
+    return;
+  }
+  contextLost = true;
+  // Once the GPU is gone, fence values tied to already-committed ring ranges never signal, so
+  // retire() could not reclaim them and the rings' outstandingSlots / outstandingBytes counters
+  // would saturate. Reset only the bookkeeping: the ID3D12DescriptorHeap / UPLOAD ID3D12Resource
+  // objects stay allocated so diagnostics that keep the GPU instance around (DRED dump etc.)
+  // can still query the device, while future allocations are no longer rejected forever.
+  _srvRing.resetForContextLost();
+  _rtvRing.resetForContextLost();
+  _dsvRing.resetForContextLost();
+  _uploadHeap.resetForContextLost();
+  dumpDeviceRemovedExtendedData(tag);
+}
+
+// Logs the device-removed reason plus DRED auto-breadcrumb and page-fault data, if available.
+void D3D12GPU::dumpDeviceRemovedExtendedData(const char* tag) {
+  if (d3d12Device == nullptr) {
+    return;
+  }
+  auto reason = d3d12Device->GetDeviceRemovedReason();
+  if (SUCCEEDED(reason)) {
+    return;
+  }
+  // Some drivers fill the DRED buffers asynchronously after device removal, so wait a fixed
+  // 50 ms for the breadcrumb / page-fault output to settle. Diagnostic-only path.
+  Sleep(50);
+  LOGE("[DRED %s] device removed, reason=0x%08X", tag, static_cast<unsigned>(reason));
+
+  ComPtr<ID3D12DeviceRemovedExtendedData> dred = nullptr;
+  if (FAILED(d3d12Device.As(&dred))) {
+    LOGE("[DRED %s] DRED interface unavailable on this device.", tag);
+    return;
+  }
+
+  D3D12_DRED_AUTO_BREADCRUMBS_OUTPUT breadcrumbsOutput = {};
+  if (SUCCEEDED(dred->GetAutoBreadcrumbsOutput(&breadcrumbsOutput))) {
+    auto* node = breadcrumbsOutput.pHeadAutoBreadcrumbNode;
+    if (node == nullptr) {
+      LOGE(
+          "[DRED %s] auto-breadcrumb list is empty — either no command list executed before "
+          "the fault, or the driver did not record breadcrumbs (verify "
+          "SetAutoBreadcrumbsEnablement(FORCED_ON) was called before D3D12CreateDevice).",
+          tag);
+    }
+    int nodeIndex = 0;
+    while (node != nullptr) {
+      const char* listName =
+          node->pCommandListDebugNameA ? node->pCommandListDebugNameA : "<unnamed>";
+      const char* queueName =
+          node->pCommandQueueDebugNameA ? node->pCommandQueueDebugNameA : "<unnamed>";
+      auto last = node->pLastBreadcrumbValue ? *node->pLastBreadcrumbValue : 0u;
+      auto count = node->BreadcrumbCount;
+      LOGE("[DRED %s] node %d: list='%s' queue='%s' completed=%u/%u", tag, nodeIndex, listName,
+           queueName, last, count);
+      // Print ops in [last - 4, last + 8), clamped to the history, marking op[last] with ">>>".
+      uint32_t windowStart = last >= 4 ? last - 4 : 0;
+      uint32_t windowEnd = (last + 8 < count) ? (last + 8) : count;
+      for (uint32_t i = windowStart; i < windowEnd; i++) {
+        const char* marker = (i == last) ? "  >>>" : "     ";
+        LOGE("[DRED %s] %s op[%u] = %s", tag, marker, i,
+             AutoBreadcrumbOpName(node->pCommandHistory[i]));
+      }
+      node = node->pNext;
+      nodeIndex++;
+    }
+  } else {
+    LOGE("[DRED %s] GetAutoBreadcrumbsOutput failed.", tag);
+  }
+
+  D3D12_DRED_PAGE_FAULT_OUTPUT pageFaultOutput = {};
+  if (SUCCEEDED(dred->GetPageFaultAllocationOutput(&pageFaultOutput))) {
+    if (pageFaultOutput.PageFaultVA != 0) {
+      LOGE("[DRED %s] page fault VA = 0x%llx", tag,
+           static_cast<unsigned long long>(pageFaultOutput.PageFaultVA));
+      auto* existing = pageFaultOutput.pHeadExistingAllocationNode;
+      while (existing != nullptr) {
+        LOGE("[DRED %s] existing alloc near fault: '%s' type=%d", tag,
+             existing->ObjectNameA ? existing->ObjectNameA : "<unnamed>",
+             static_cast<int>(existing->AllocationType));
+        existing = existing->pNext;
+      }
+      auto* recent = pageFaultOutput.pHeadRecentFreedAllocationNode;
+      while (recent != nullptr) {
+        LOGE("[DRED %s] recently freed: '%s' type=%d", tag,
+             recent->ObjectNameA ? recent->ObjectNameA : "<unnamed>",
+             static_cast<int>(recent->AllocationType));
+        recent = recent->pNext;
+      }
+    }
+  }
+}
+
+// Tears down every cache, pool and tracked resource; onRelease hooks run only if releaseGPU.
+void D3D12GPU::releaseAll(bool releaseGPU) {
+  // Wait for all GPU work to finish before destroying anything the GPU may still be reading.
+  // waitAllInflightSubmissions() also handles the device-removed case internally (it
+  // short-circuits to a synchronous reclaim instead of waiting on a fence that will never
+  // advance), so it is safe to call regardless of releaseGPU and mirrors VulkanGPU::releaseAll.
+  // Skipping the wait when releaseGPU == false could leave inflight FrameSessions holding command
+  // lists / allocators the GPU was still reading (OBJECT_DELETED_WHILE_STILL_IN_USE).
+  waitAllInflightSubmissions();
+  samplerCache.clear();
+  // Drop cached shader-module shared_ptrs so D3D12Resource cleanup can run. Without this the
+  // shaderModuleCache would hold strong refs through the resources-list walk below and prevent
+  // the modules from ever being released.
+  shaderModuleCache.clear();
+  rootSignatureCache.clear();
+  // Drop pooled command allocator/list pairs. Done after wait so the GPU is no longer using
+  // them; doing it before would still be safe (ComPtrs hold the only references) but ordering
+  // matches the rest of the shutdown path.
+  _commandListPool.clear();
+  // Drop the UPLOAD ring's underlying resource similarly. Inflight allocations have either been
+  // drained by the wait above or are tracked against fences that will never advance, in which
+  // case dropping the resource is the only safe action.
+  _uploadHeap.clear();
+  // Drop the cached mipmap generator's root signature + PSO before tearing the device down.
+  // Resources retained by inflight submissions have already been released above.
+  _mipmapGenerator = nullptr;
+  if (releaseGPU) {
+    for (auto& resource : resources) {
+      resource->onRelease(this);
+    }
+  }
+  resources.clear();
+  returnQueue = nullptr;
+}
+
+// Disposes of a session that will never reach the GPU and undoes its CPU-side state tracking.
+void D3D12GPU::reclaimAbandonedSession(D3D12FrameSession session) {
+  // Recording render-pass / copy / generate-mipmaps work advanced D3D12Texture::_currentState
+  // for each touched texture, but the recorded transitions never execute on this path, so the
+  // GPU still sees the states captured in session.initialTextureStates. Restore them so the
+  // CPU and GPU views agree again and the next encoder does not emit a barrier whose
+  // StateBefore is wrong (the D3D12 debug layer reports that as "Before state mismatch").
+  for (const auto& entry : session.initialTextureStates) {
+    D3D12Texture* texture = entry.first;
+    if (texture != nullptr) {
+      texture->setCurrentState(entry.second);
+    }
+  }
+  // Nothing else needs explicit work here. When the by-value session is destroyed at the end
+  // of this function, its ComPtr members release the command list and allocator, and every
+  // shared_ptr in retainedResources / retainedDescriptorHeaps is dropped. Resources whose
+  // refcount reaches zero enter the ReturnQueue and are destroyed by the next
+  // processUnreferencedResources() call. RTV / DSV descriptor slots used by the session live
+  // in D3D12GPU::_rtvRing / _dsvRing and are reclaimed by their fence-based retire path, so
+  // nothing is held here for them.
+}
+
+void D3D12GPU::executeSubmission(SubmitRequest request) {
+  // If the GPU has already reported a fatal error, drop the submission without waiting on the
+  // fence — DXGI_ERROR_DEVICE_REMOVED is sticky and the fence will never advance. Local cleanup
+  // still runs because session/uploads destructors release their D3D12 references.
+  if (contextLost) {
+    reclaimAbandonedSession(std::move(request.session));
+    request.uploads.clear();
+    return;
+  }
+
+  // Step 1: Non-blocking reclaim of any submissions whose fence values have already signalled.
+  pollCompletedSubmissions();
+
+  // Step 2: Backpressure — block until the oldest inflight submission completes if we have
+  // already filled the in-flight pipeline. A bounded timeout protects against TDR scenarios
+  // where the GPU never advances; on timeout we mark the context lost and stop tracking it.
+  if (inflightSubmissions.size() >= MAX_FRAMES_IN_FLIGHT) {
+    auto& oldest = inflightSubmissions.front();
+    if (_frameFence->GetCompletedValue() < oldest.fenceValue) {
+      _frameFence->SetEventOnCompletion(oldest.fenceValue, _frameFenceEvent);
+      auto waitResult = WaitForSingleObject(_frameFenceEvent, 5000);
+      if (waitResult != WAIT_OBJECT_0) {
+        LOGE(
+            "D3D12GPU::executeSubmission: backpressure wait timed out (target=%llu), "
+            "marking context lost.",
+            static_cast<unsigned long long>(oldest.fenceValue));
+        markContextLost("executeSubmission backpressure timeout");
+        reclaimAbandonedSession(std::move(request.session));
+        request.uploads.clear();
+        while (!inflightSubmissions.empty()) {
+          reclaimSubmission(inflightSubmissions.front());
+          inflightSubmissions.pop_front();
+        }
+        return;
+      }
+    }
+    pollCompletedSubmissions();
+    // Re-check device removal after the wait — if the GPU TDR'd while we were blocked, the
+    // event would have been signalled but no work has actually completed. Detect this and exit
+    // the inflight queue cleanup path immediately.
+    if (FAILED(d3d12Device->GetDeviceRemovedReason())) {
+      markContextLost("executeSubmission backpressure post-check");
+      reclaimAbandonedSession(std::move(request.session));
+      request.uploads.clear();
+      // Drop every still-tracked submission since the GPU will never signal again.
+      while (!inflightSubmissions.empty()) {
+        reclaimSubmission(inflightSubmissions.front());
+        inflightSubmissions.pop_front();
+      }
+      return;
+    }
+  }
+
+  auto cmdQueue = commandQueue->d3d12CommandQueue();
+  if (cmdQueue == nullptr) {
+    LOGE("D3D12GPU::executeSubmission: command queue is null, abandoning session.");
+    reclaimAbandonedSession(std::move(request.session));
+    return;
+  }
+
+  // Step 3: Optional cross-queue wait. D3D12Semaphore stores its target value as a host-readable
+  // member; the GPU side simply re-uses ID3D12CommandQueue::Wait().
+  if (request.waitSemaphore != nullptr) {
+    auto fence = request.waitSemaphore->d3d12Fence();
+    if (fence != nullptr) {
+      cmdQueue->Wait(fence, request.waitSemaphore->signalValue());
+    }
+  }
+
+  // Step 4: Execute auxiliary command lists (e.g. texture upload lists recorded by the queue
+  // outside the main render command list) followed by the main render command list. Order
+  // matters: uploads must complete before the render list can sample the destination textures.
+  std::vector<ID3D12CommandList*> lists;
+  lists.reserve(request.session.auxCommandLists.size() + 1);
+  for (auto& aux : request.session.auxCommandLists) {
+    if (aux != nullptr) {
+      lists.push_back(aux.Get());
+    }
+  }
+  if (request.session.commandList != nullptr) {
+    lists.push_back(request.session.commandList.Get());
+  }
+  if (!lists.empty()) {
+    cmdQueue->ExecuteCommandLists(static_cast<UINT>(lists.size()), lists.data());
+  }
+#ifdef TGFX_D3D12_DEBUG_LAYER
+  drainDebugMessages("executeSubmission");
+#endif
+
+  // Step 5: Optional signal of an external semaphore. The semaphore exposes a fixed fence value
+  // assigned at creation time — we just signal that value on this queue. After signalling, bump
+  // the semaphore's internal value so a subsequent insertSemaphore() call would see a fresh
+  // generation if the same fence object is re-used.
+  if (request.signalSemaphore != nullptr) {
+    auto fence = request.signalSemaphore->d3d12Fence();
+    if (fence != nullptr) {
+      auto target = request.signalSemaphore->nextSignalValue();
+      cmdQueue->Signal(fence, target);
+      request.signalSemaphore->commitSignalValue();
+    }
+    // Keep the semaphore alive until the GPU has consumed the Signal command. Without this
+    // retention the application could drop its last reference and the underlying ID3D12Fence
+    // would be released before the GPU is done with it.
+    request.session.retainedResources.push_back(std::move(request.signalSemaphore));
+  }
+  if (request.waitSemaphore != nullptr) {
+    request.session.retainedResources.push_back(std::move(request.waitSemaphore));
+  }
+
+  // Step 6: Signal the GPU's internal frame fence so we can later detect completion of this
+  // submission and reclaim its session. If the Signal call itself fails (it can return
+  // DXGI_ERROR_DEVICE_REMOVED if the GPU TDR'd while the previous ExecuteCommandLists was
+  // executing), trip the contextLost flag so subsequent calls don't block on a fence that will
+  // never advance.
+  ++_lastSignalledFenceValue;
+  // Tag every CBV/SRV/UAV descriptor-ring slot allocated during this submission with the
+  // about-to-be-signalled fence value. Once that fence value completes, the slots become
+  // reclaimable in retire(); see pollCompletedSubmissions(). The Sampler heap is append-only
+  // (slots persist for the GPU's lifetime) and does not need fence tracking. The upload ring's
+  // sub-allocations are tracked the same way so the staging bytes outlive their CopyTextureRegion
+  // commands.
+  _srvRing.commit(_lastSignalledFenceValue);
+  _rtvRing.commit(_lastSignalledFenceValue);
+  _dsvRing.commit(_lastSignalledFenceValue);
+  _uploadHeap.commit(_lastSignalledFenceValue);
+  auto signalHr = cmdQueue->Signal(_frameFence.Get(), _lastSignalledFenceValue);
+  if (FAILED(signalHr) || FAILED(d3d12Device->GetDeviceRemovedReason())) {
+    LOGE(
+        "D3D12GPU::executeSubmission: Signal failed (HRESULT=0x%08X) or device removed; "
+        "marking context lost.",
+        static_cast<unsigned>(signalHr));
+    markContextLost("executeSubmission Signal");
+    reclaimAbandonedSession(std::move(request.session));
+    request.uploads.clear();
+    while (!inflightSubmissions.empty()) {
+      reclaimSubmission(inflightSubmissions.front());
+      inflightSubmissions.pop_front();
+    }
+    return;
+  }
+
+  InflightSubmission inflight = {};
+  inflight.fenceValue = _lastSignalledFenceValue;
+  // Capture the submission timestamp so pollCompletedSubmissions() can later publish it as the
+  // "GPU completed up to this point" marker that ResourceCache uses to gate scratch reuse.
+  inflight.frameTime = request.frameTime;
+  inflight.session = std::move(request.session);
+  inflight.uploads = std::move(request.uploads);
+  inflightSubmissions.push_back(std::move(inflight));
+}
+
+void D3D12GPU::waitAllInflightSubmissions() {
+  if (_frameFence == nullptr) {
+    return;
+  }
+  // Shared bail-out: mark the context lost and reclaim every tracked submission on the CPU.
+  auto abandonAll = [this](const char* tag) {
+    markContextLost(tag);
+    while (!inflightSubmissions.empty()) {
+      reclaimSubmission(inflightSubmissions.front());
+      inflightSubmissions.pop_front();
+    }
+    processUnreferencedResources();
+  };
+  if (contextLost || FAILED(d3d12Device->GetDeviceRemovedReason())) {
+    abandonAll("waitAllInflightSubmissions entry");
+    return;
+  }
+  if (!inflightSubmissions.empty() &&
+      _frameFence->GetCompletedValue() < inflightSubmissions.back().fenceValue) {
+    auto target = inflightSubmissions.back().fenceValue;
+    // An event that failed to arm is never signalled, so bail out right away instead of
+    // sleeping through the whole timeout and then reporting a misleading "timed out" error.
+    auto armHr = _frameFence->SetEventOnCompletion(target, _frameFenceEvent);
+    if (FAILED(armHr)) {
+      LOGE("D3D12GPU::waitAllInflightSubmissions: SetEventOnCompletion failed (HRESULT=0x%08X).",
+           static_cast<unsigned>(armHr));
+      abandonAll("waitAllInflightSubmissions SetEventOnCompletion");
+      return;
+    }
+    // The wait is bounded (5 s) rather than INFINITE so a stalled GPU cannot hang the caller.
+    if (WaitForSingleObject(_frameFenceEvent, 5000) != WAIT_OBJECT_0) {
+      LOGE("D3D12GPU::waitAllInflightSubmissions: fence wait timed out (target=%llu), "
+           "marking context lost.",
+           static_cast<unsigned long long>(target));
+      abandonAll("waitAllInflightSubmissions timeout");
+      return;
+    }
+  }
+  pollCompletedSubmissions();
+}
+
+uint64_t D3D12GPU::completedFenceValue() const {
+  return _frameFence == nullptr ? 0 : _frameFence->GetCompletedValue();
+}
+
+void D3D12GPU::pollCompletedSubmissions() {
+  if (_frameFence == nullptr) {
+    return;
+  }
+  const auto reached = _frameFence->GetCompletedValue();
+  // Submissions are queued in fence order, so retire from the front until one is still pending.
+  while (!inflightSubmissions.empty()) {
+    auto& oldest = inflightSubmissions.front();
+    if (oldest.fenceValue > reached) {
+      break;
+    }
+    // Publish the submit-time stamp of the submission that just finished before reclaiming it,
+    // so the resource cache can tell which scratch resources the GPU is done reading.
+    _lastFenceSignalTime.store(oldest.frameTime.time_since_epoch().count(),
+                               std::memory_order_release);
+    reclaimSubmission(oldest);
+    inflightSubmissions.pop_front();
+  }
+  // Let the three descriptor rings and the upload ring retire whatever they tagged with fence
+  // values up to `reached`.
+  _srvRing.retire(reached);
+  _rtvRing.retire(reached);
+  _dsvRing.retire(reached);
+  _uploadHeap.retire(reached);
+  // Reclaiming sessions may have dropped the last references to some resources; process them
+  // right away so the caller sees up-to-date memory accounting.
+  processUnreferencedResources();
+}
+
+std::chrono::steady_clock::time_point D3D12GPU::lastFenceSignalTime() const {
+  using Clock = std::chrono::steady_clock;
+  return Clock::time_point(Clock::duration(_lastFenceSignalTime.load(std::memory_order_acquire)));
+}
+
+void D3D12GPU::reclaimSubmission(InflightSubmission& submission) {
+  auto& session = submission.session;
+  // Hand the main allocator/list pair back to the pool first: resetting the session below
+  // releases whatever it still holds. A lone allocator or a lone list is not pooled.
+  if (session.commandAllocator != nullptr && session.commandList != nullptr) {
+    D3D12CommandListPool::Entry mainPair = {};
+    mainPair.allocator = std::move(session.commandAllocator);
+    mainPair.commandList = std::move(session.commandList);
+    _commandListPool.release(std::move(mainPair));
+  }
+  // Auxiliary lists are matched with auxAllocators by index. Only indices present in both
+  // vectors, and only slots where both halves are non-null, go back to the pool as a pair.
+  auto& allocators = session.auxAllocators;
+  auto& lists = session.auxCommandLists;
+  const size_t pairCount = std::min(allocators.size(), lists.size());
+  for (size_t index = 0; index < pairCount; ++index) {
+    if (allocators[index] == nullptr || lists[index] == nullptr) {
+      continue;
+    }
+    D3D12CommandListPool::Entry auxPair = {};
+    auxPair.allocator = std::move(allocators[index]);
+    auxPair.commandList = std::move(lists[index]);
+    _commandListPool.release(std::move(auxPair));
+  }
+  // Replacing the session with an empty one releases everything else it owned; the staging
+  // uploads tracked next to it are dropped as well.
+  submission.session = D3D12FrameSession{};
+  submission.uploads.clear();
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12GPU.h b/src/gpu/d3d12/D3D12GPU.h
new file mode 100644
index 000000000..9fdf04136
--- /dev/null
+++ b/src/gpu/d3d12/D3D12GPU.h
@@ -0,0 +1,444 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <atomic>
+#include <chrono>
+#include <deque>
+#include <list>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "D3D12CommandListPool.h"
+#include "D3D12Defines.h"
+#include "D3D12DescriptorRing.h"
+#include "D3D12FrameSession.h"
+#include "D3D12UploadHeap.h"
+#include "D3D12Util.h"
+#include "core/utils/ReturnQueue.h"
+#include "tgfx/gpu/GPU.h"
+
+namespace shaderc {
+class Compiler;
+}  // namespace shaderc
+
+namespace tgfx {
+
+class D3D12CommandQueue;
+class D3D12MipmapGenerator;
+class D3D12Resource;
+class D3D12Semaphore;
+class D3D12ShaderModule;
+class D3D12Texture;
+
+/**
+ * D3D12 implementation of GPU; owns the device, frame fence and inflight submission queue.
+ */
+class D3D12GPU : public GPU {
+ public:
+  static std::unique_ptr<D3D12GPU> Make(ComPtr<ID3D12Device> device);
+
+  ~D3D12GPU();
+
+  ID3D12Device* device() const {
+    return d3d12Device.Get();
+  }
+
+  IDXGIAdapter1* adapter() const {
+    return dxgiAdapter.Get();
+  }
+
+  const GPUInfo* info() const override {
+    return &_info;
+  }
+
+  const GPUFeatures* features() const override {
+    return &_features;
+  }
+
+  const GPULimits* limits() const override {
+    return &_limits;
+  }
+
+  CommandQueue* queue() const override;
+
+  const shaderc::Compiler* shaderCompiler() const;
+
+  unsigned getDXGIFormat(PixelFormat format) const {
+    return PixelFormatToDXGIFormat(format);
+  }
+
+  bool isFormatRenderable(PixelFormat format) const override;
+
+  std::shared_ptr<GPUBuffer> createBuffer(size_t size, uint32_t usage) override;
+
+  std::shared_ptr<Texture> createTexture(const TextureDescriptor& descriptor) override;
+
+  std::shared_ptr<Sampler> createSampler(const SamplerDescriptor& descriptor) override;
+
+  std::shared_ptr<ShaderModule> createShaderModule(
+      const ShaderModuleDescriptor& descriptor) override;
+
+  std::shared_ptr<RenderPipeline> createRenderPipeline(
+      const RenderPipelineDescriptor& descriptor) override;
+
+  std::shared_ptr<CommandEncoder> createCommandEncoder() override;
+
+  int getSampleCount(int requestedCount, PixelFormat pixelFormat) const override;
+
+  std::vector<std::shared_ptr<Texture>> importHardwareTextures(HardwareBufferRef hardwareBuffer,
+                                                               uint32_t usage) override;
+
+  std::shared_ptr<Texture> importBackendTexture(const BackendTexture& backendTexture,
+                                                uint32_t usage, bool adopted = false) override;
+
+  std::shared_ptr<Texture> importBackendRenderTarget(
+      const BackendRenderTarget& backendRenderTarget) override;
+
+  std::shared_ptr<Semaphore> importBackendSemaphore(const BackendSemaphore& semaphore) override;
+
+  BackendSemaphore stealBackendSemaphore(std::shared_ptr<Semaphore> semaphore) override;
+
+  template <typename T, typename... Args>
+  std::shared_ptr<T> makeResource(Args&&... args) {
+    static_assert(std::is_base_of_v<D3D12Resource, T>, "T must be a subclass of D3D12Resource!");
+    auto resource = new T(std::forward<Args>(args)...);
+    return std::static_pointer_cast<T>(addResource(resource));
+  }
+
+  void processUnreferencedResources();
+
+  void releaseAll(bool releaseGPU);
+
+  /**
+   * Reclaims resources from a D3D12FrameSession that was created but never submitted (abandon
+   * path). Invoked by D3D12CommandBuffer's destructor and by D3D12CommandEncoder::onRelease(). This
+   * is the same unified cleanup path used after the GPU fence signals successful completion.
+   */
+  void reclaimAbandonedSession(D3D12FrameSession session);
+
+  // -- Submission lifecycle -------------------------------------------------------------------
+  // The D3D12 backend mirrors the Vulkan FrameSession + InflightSubmission ownership model:
+  // each successful submit() moves a session into the inflight queue, where it is held alive
+  // until its fence value is signalled by the GPU. A staging upload buffer used by writeTexture
+  // is also tracked here so it can be safely released after the same fence signals.
+
+  struct PendingUpload {
+    ComPtr<ID3D12Resource> stagingBuffer;
+    std::shared_ptr<D3D12Texture> texture;
+  };
+
+  struct SubmitRequest {
+    D3D12FrameSession session;
+    std::vector<PendingUpload> uploads;
+    std::shared_ptr<D3D12Semaphore> signalSemaphore;
+    std::shared_ptr<D3D12Semaphore> waitSemaphore;
+    // Timestamp captured by D3D12CommandQueue::submit at the moment it hands the request to
+    // executeSubmission(). The GPU stamps the matching InflightSubmission with this value and,
+    // when the fence later signals, publishes it as _lastFenceSignalTime so the resource cache
+    // can identify scratch resources that are safe to reuse.
+    std::chrono::steady_clock::time_point frameTime = {};
+  };
+
+  /**
+   * Executes a complete submission: optional cross-queue waitSemaphore, ExecuteCommandLists for
+   * the recorded command list, optional signalSemaphore, then the internal frame fence. Moves
+   * session/uploads into the inflight queue. Polls completed submissions before submitting and
+   * applies backpressure once MAX_FRAMES_IN_FLIGHT or more submissions are still outstanding.
+   */
+  void executeSubmission(SubmitRequest request);
+
+  /**
+   * Waits (with a timeout that marks the context lost) for all outstanding submissions, then
+   * reclaims their sessions. Used by waitUntilCompleted() and releaseAll().
+   */
+  void waitAllInflightSubmissions();
+
+  /**
+   * Returns the latest fence value that the GPU has signalled. Used by D3D12Buffer::isReady() to
+   * answer "is the GPU done with this buffer?" without blocking.
+   */
+  uint64_t completedFenceValue() const;
+
+  /**
+   * Returns the steady_clock::time_point at which the most recently completed inflight
+   * submission was first submitted. Mirrors VulkanGPU::lastFenceSignalTime() and is used by
+   * D3D12CommandQueue::completedFrameTime() to gate scratch-resource reuse — without this,
+   * ResourceCache::findScratchResource() would happily hand back vertex/uniform buffers that
+   * the GPU is still reading, producing torn frames when CPU recording races ahead of GPU
+   * execution (see RecordingTest.MultipleRecordingsInOrder).
+   */
+  std::chrono::steady_clock::time_point lastFenceSignalTime() const;
+
+  ID3D12Fence* frameFence() const {
+    return _frameFence.Get();
+  }
+
+  uint64_t lastSignalledFenceValue() const {
+    return _lastSignalledFenceValue;
+  }
+
+  /**
+   * Returns true once the GPU has reported a fatal error (e.g. DXGI_ERROR_DEVICE_REMOVED) or a
+   * fence wait has timed out. After the flag is set, every executeSubmission() / wait* call
+   * short-circuits and the device is considered unusable for the remainder of the process.
+   */
+  bool isContextLost() const {
+    return contextLost;
+  }
+
+  /**
+   * If the device has been removed and DRED was enabled at device creation, queries
+   * ID3D12DeviceRemovedExtendedData and logs the auto-breadcrumb history (the command list
+   * operations the GPU had completed and was about to execute when it died) plus any page-fault
+   * information. Each (HRESULT-failing) call site that detects context loss should invoke this
+   * helper exactly once so the diagnostic appears next to the failure that triggered it. No-op on
+   * builds without DRED enabled or when GetDeviceRemovedReason() reports success.
+   */
+  void dumpDeviceRemovedExtendedData(const char* tag);
+
+  /**
+   * Drains the D3D12 debug-layer ID3D12InfoQueue and forwards every queued message to LOGE
+   * tagged with `tag`. Call sites should invoke this whenever a D3D12 API returns a failure
+   * (especially DEVICE_REMOVED) so the underlying validation error appears next to the failing
+   * call instead of being lost inside the runtime queue. No-op on builds without
+   * TGFX_D3D12_DEBUG_LAYER (and outside Debug builds), where the InfoQueue is not populated.
+   */
+  void drainDebugMessages(const char* tag);
+
+  /**
+   * Returns the singleton compute-shader mipmap generator, creating it on first use. The
+   * generator is owned by the GPU because its root signature and pipeline state can be reused
+   * across every D3D12CommandEncoder that asks to generate mipmaps. Returns nullptr if compute
+   * shader compilation or pipeline creation failed.
+   */
+  D3D12MipmapGenerator* mipmapGenerator();
+
+  /**
+   * Per-GPU shader-visible CBV/SRV/UAV ring backing every D3D12RenderPass binding. One large
+   * heap is created once at GPU construction; render passes sub-allocate slots out of it and the
+   * descriptors are reclaimed as soon as the owning fence signals. Avoids the per-pass
+   * CreateDescriptorHeap call and the associated GPU-VA churn.
+   */
+  D3D12DescriptorRing& srvRing() {
+    return _srvRing;
+  }
+
+  /**
+   * Non-shader-visible RTV ring used by D3D12RenderPass to publish OMSetRenderTargets handles
+   * without paying for a CreateDescriptorHeap on every pass. Slots are reclaimed once the
+   * fence value committed at submit() signals.
+   */
+  D3D12DescriptorRing& rtvRing() {
+    return _rtvRing;
+  }
+
+  /**
+   * Non-shader-visible DSV ring counterpart of rtvRing(), shared across all render passes.
+   */
+  D3D12DescriptorRing& dsvRing() {
+    return _dsvRing;
+  }
+
+  /**
+   * Allocates a GPU descriptor handle in the per-GPU shader-visible Sampler heap and writes
+   * `desc` into it. The slot is never freed; the sampler heap is bounded by D3D12's hard 2048
+   * limit and every distinct SamplerDescriptor is created at most once via D3D12GPU's sampler
+   * cache, so the heap effectively acts as an append-only descriptor table. Returns an
+   * uninitialised handle (.ptr == 0) if the heap is exhausted.
+   */
+  D3D12_GPU_DESCRIPTOR_HANDLE allocatePermanentSamplerSlot(const D3D12_SAMPLER_DESC& desc);
+
+  /**
+   * Returns the underlying shader-visible Sampler heap. Used by D3D12CommandEncoder when binding
+   * heaps onto a fresh command list (D3D12 requires SetDescriptorHeaps once per list).
+   */
+  ID3D12DescriptorHeap* samplerHeap() const {
+    return _samplerHeap.Get();
+  }
+
+  /**
+   * Pool of (ID3D12CommandAllocator, ID3D12GraphicsCommandList) pairs reused across
+   * encoders/queue uploads. Avoids the per-submission CreateCommandAllocator/CreateCommandList
+   * overhead. Pairs are returned to the pool by reclaimSubmission once their fence signals.
+   */
+  D3D12CommandListPool& commandListPool() {
+    return _commandListPool;
+  }
+
+  /**
+   * Returns a shared root signature matching `shapeKey`, or null if not cached. Pipelines query
+   * the cache before serialising/creating their own root signature; on a hit they reuse the
+   * existing object (incrementing its ComPtr refcount), avoiding the cost of
+   * D3D12SerializeRootSignature + ID3D12Device::CreateRootSignature on every PSO build.
+   */
+  ComPtr<ID3D12RootSignature> findRootSignature(const std::vector<uint8_t>& shapeKey);
+
+  /**
+   * Inserts a freshly-built root signature under `shapeKey`. Subsequent pipelines with the same
+   * binding shape will hit the cache. The shape key is a compact serialisation of the binding
+   * layout (uniform-block count + visibilities, sampler count) generated by D3D12RenderPipeline.
+   */
+  void cacheRootSignature(std::vector<uint8_t> shapeKey, ComPtr<ID3D12RootSignature> rootSignature);
+
+  /**
+   * Per-GPU UPLOAD-heap ring used to stage texture pixel uploads (and other CPU-to-GPU
+   * data). Sub-allocations live until the owning fence signals; callers fall back to a one-off
+   * CreateCommittedResource only if a single allocation exceeds the ring's capacity or the
+   * ring is fully in flight.
+   */
+  D3D12UploadHeap& uploadHeap() {
+    return _uploadHeap;
+  }
+
+ private:
+  /// Single entry point for marking the context lost. Sets the flag, dumps DRED diagnostics on
+  /// the first transition (subsequent calls are silent), and short-circuits all wait paths.
+  void markContextLost(const char* tag);
+  explicit D3D12GPU(ComPtr<ID3D12Device> device, ComPtr<IDXGIAdapter1> adapter);
+
+  std::shared_ptr<D3D12Resource> addResource(D3D12Resource* resource);
+
+  static uint32_t MakeSamplerKey(const SamplerDescriptor& descriptor);
+
+  void initInfo();
+  void initFeatures();
+  void initLimits();
+
+  ComPtr<ID3D12Device> d3d12Device = nullptr;
+  ComPtr<IDXGIAdapter1> dxgiAdapter = nullptr;
+  GPUInfo _info = {};
+  GPUFeatures _features = {};
+  GPULimits _limits = {};
+  std::unique_ptr<D3D12CommandQueue> commandQueue = nullptr;
+  std::unique_ptr<shaderc::Compiler> compiler = nullptr;
+  std::list<D3D12Resource*> resources = {};
+  std::shared_ptr<ReturnQueue> returnQueue = ReturnQueue::Make();
+  std::unordered_map<uint32_t, std::shared_ptr<Sampler>> samplerCache = {};
+
+  // Per-GPU cache of compiled shader modules keyed by (stage, hash(GLSL source)). Programs are
+  // cached by ProgramKey upstream, yet unrelated programs often share a vertex shader or
+  // fragment template, so without this cache identical GLSL -> SPIR-V -> HLSL -> DXBC work is
+  // repeated (test suite: 700 createShaderModule calls, 340 distinct sources, ~51% redundant).
+  // NOTE(review): the key keeps only the source hash, so two sources with colliding hashes
+  // would compare equal here; confirm the lookup re-checks the source or accepts that risk.
+  struct ShaderCacheKey {
+    uint32_t stage = 0;
+    size_t sourceHash = 0;
+    bool operator==(const ShaderCacheKey& other) const {
+      return stage == other.stage && sourceHash == other.sourceHash;
+    }
+  };
+  struct ShaderCacheKeyHash {
+    size_t operator()(const ShaderCacheKey& k) const noexcept {
+      return k.sourceHash ^ (static_cast<size_t>(k.stage) * 0x9E3779B97F4A7C15ull);
+    }
+  };
+  std::unordered_map<ShaderCacheKey, std::shared_ptr<D3D12ShaderModule>, ShaderCacheKeyHash>
+      shaderModuleCache;
+
+  // Per-GPU cache of root signatures keyed by their binding-layout shape (uniform-block
+  // count + visibilities, sampler count). Almost every pipeline in tgfx falls into one of a
+  // handful of shapes, so this turns the SerializeRootSignature + CreateRootSignature pair
+  // (combined ~30-100 us per call on first hit) into an unordered_map lookup for the steady
+  // state. Keys are arbitrary-length byte strings rather than a fixed integer to leave room
+  // for additional layout metadata without introducing hash collisions.
+  struct ShapeKeyHash {
+    size_t operator()(const std::vector<uint8_t>& key) const noexcept {
+      // FNV-1a 64-bit, sufficient for the small distinct-shape population we see in practice.
+      size_t hash = 1469598103934665603ull;
+      for (auto byte : key) {
+        hash ^= static_cast<size_t>(byte);
+        hash *= 1099511628211ull;
+      }
+      return hash;
+    }
+  };
+  std::unordered_map<std::vector<uint8_t>, ComPtr<ID3D12RootSignature>, ShapeKeyHash>
+      rootSignatureCache;
+  // Lazily-initialised compute pipeline used by D3D12CommandEncoder::generateMipmapsForTexture.
+  // Built on first use so backends that never request mipmaps don't pay the shader-compile cost.
+  std::unique_ptr<D3D12MipmapGenerator> _mipmapGenerator = nullptr;
+
+  // Per-GPU shader-visible descriptor heaps used by render passes. The CBV/SRV/UAV ring is
+  // sized for thousands of unique bindings per frame and is recycled per-fence. The Sampler heap
+  // is append-only (capped at D3D12's 2048 limit; one entry per unique SamplerDescriptor for the
+  // life of the GPU instance) and therefore does not need a ring tail pointer.
+  static constexpr uint32_t SRV_RING_CAPACITY = 64 * 1024;
+  static constexpr uint32_t SAMPLER_HEAP_CAPACITY = 2048;
+  // RTV / DSV rings replace the per-render-pass CreateDescriptorHeap calls. Sized to handle a
+  // few dozen passes per frame with MAX_FRAMES_IN_FLIGHT outstanding; well under D3D12's hard
+  // 1024-RTV / 1024-DSV per-heap caps.
+  static constexpr uint32_t RTV_RING_CAPACITY = 512;
+  static constexpr uint32_t DSV_RING_CAPACITY = 64;
+  D3D12DescriptorRing _srvRing;
+  D3D12DescriptorRing _rtvRing;
+  D3D12DescriptorRing _dsvRing;
+  ComPtr<ID3D12DescriptorHeap> _samplerHeap = nullptr;
+  uint32_t _samplerHeapSize = 0;
+  uint32_t _samplerHeapCapacity = 0;
+  uint32_t _samplerDescriptorIncrement = 0;
+
+  // Pool of recycled command allocator + graphics command list pairs. Populated by
+  // reclaimSubmission once a submission's fence signals; consumed by D3D12CommandEncoder::Make
+  // and the transient upload-list paths inside D3D12CommandQueue.
+  D3D12CommandListPool _commandListPool;
+
+  // Per-GPU UPLOAD ring used by D3D12CommandQueue::writeTexture / writeBuffer. Initial
+  // capacity is sized for typical glyph atlas / blur seed traffic; oversize allocations or a
+  // saturated ring fall back to per-call CreateCommittedResource so behaviour stays correct
+  // even when the steady-state fast path can't satisfy the request.
+  static constexpr size_t UPLOAD_HEAP_CAPACITY = 64 * 1024 * 1024;
+  D3D12UploadHeap _uploadHeap;
+
+  // Submission state. Following the Vulkan model, the GPU owns the frame fence and the inflight
+  // queue; D3D12CommandQueue is a thin coordination layer that builds a SubmitRequest and hands
+  // it to executeSubmission(). Frame-fence values are monotonically increasing.
+  static constexpr size_t MAX_FRAMES_IN_FLIGHT = 2;
+
+  struct InflightSubmission {
+    uint64_t fenceValue = 0;
+    // Steady_clock timestamp recorded at submit() time. When the matching fence value is
+    // reached, this value gets published to _lastFenceSignalTime so the ResourceCache can
+    // tell which scratch resources the GPU is definitely done reading.
+    std::chrono::steady_clock::time_point frameTime = {};
+    D3D12FrameSession session;
+    std::vector<PendingUpload> uploads;
+  };
+
+  void reclaimSubmission(InflightSubmission& submission);
+  void pollCompletedSubmissions();
+
+  ComPtr<ID3D12Fence> _frameFence = nullptr;
+  HANDLE _frameFenceEvent = nullptr;
+  uint64_t _lastSignalledFenceValue = 0;
+  // Timestamp of the most recently completed (fence-signalled) inflight submission. Stored as
+  // int64 ticks so it can be loaded/stored atomically. Updated from the same thread that owns
+  // the GPU, but exposed via std::atomic so a future readback thread (if any) sees a coherent
+  // value without taking the GPU lock.
+  std::atomic<int64_t> _lastFenceSignalTime = {0};
+  std::deque<InflightSubmission> inflightSubmissions;
+  // Sticky flag set when the device returns DXGI_ERROR_DEVICE_REMOVED or another fatal error.
+  // Once set, executeSubmission and waitAllInflightSubmissions stop blocking on the fence — the
+  // GPU will never signal again, and waiting INFINITE would hang the process. Submissions
+  // continue to clean up their resources locally so destruction terminates promptly.
+  bool contextLost = false;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12MipmapGenerator.cpp b/src/gpu/d3d12/D3D12MipmapGenerator.cpp
new file mode 100644
index 000000000..25d0b3250
--- /dev/null
+++ b/src/gpu/d3d12/D3D12MipmapGenerator.cpp
@@ -0,0 +1,188 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12MipmapGenerator.h"
+#include <d3dcompiler.h>
+#include "D3D12GPU.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+// Box-filter compute shader: each thread takes one bilinear sample at the centre of its
+// destination texel, which blends the underlying 2x2 source texels in hardware, and writes the
+// result to that texel. The sampler is the static linear/clamp sampler baked into the root
+// signature, so the sampling state of the texture being processed is irrelevant. The HLSL is
+// intentionally inline and tiny so that compiling it with D3DCompile stays cheap.
+//
+// Layout matches the root signature in createRootSignature():
+//   register(b0) — four 32-bit root constants: uint width/height, then float 1/width, 1/height
+//   register(t0) — input mip (mip[i])
+//   register(s0) — static linear sampler with clamp address mode
+//   register(u0) — output mip (mip[i+1])
+// Hardware linear sampling at the destination texel center performs a 2x2 weighted average of
+// the source texels with the same weights MTLBlitCommandEncoder generateMipmapsForTexture and
+// vkCmdBlitImage(VK_FILTER_LINEAR) use, so a single SampleLevel matches the Metal/Vulkan output
+// bit-for-bit on even-divided mip levels and follows GPU-driver edge handling on odd ones. The
+// older quincunx (four 0.25-texel offsets) effectively did a 16-tap blur and produced softer
+// mips than the other backends.
+static constexpr const char* kHLSLSource = R"(
+cbuffer MipmapCB : register(b0)
+{
+    uint  OutMipWidth;
+    uint  OutMipHeight;
+    float InvOutMipWidth;
+    float InvOutMipHeight;
+};
+
+Texture2D<float4>   InputMip   : register(t0);
+SamplerState        LinearClamp : register(s0);
+RWTexture2D<float4> OutputMip  : register(u0);
+
+[numthreads(8, 8, 1)]
+void main(uint3 dtID : SV_DispatchThreadID)
+{
+    if (dtID.x >= OutMipWidth || dtID.y >= OutMipHeight) {
+        return;
+    }
+    float2 uv = (float2(dtID.xy) + 0.5f) * float2(InvOutMipWidth, InvOutMipHeight);
+    OutputMip[dtID.xy] = InputMip.SampleLevel(LinearClamp, uv, 0);
+}
+)";
+
+// Creates the root signature first, then the PSO that depends on it.
+D3D12MipmapGenerator::D3D12MipmapGenerator(D3D12GPU* gpu) {
+  const bool rootSignatureReady = createRootSignature(gpu);
+  if (rootSignatureReady && !createPipelineState(gpu)) {
+    // Without a PSO the root signature is useless; release it as well.
+    _rootSignature = nullptr;
+  }
+}
+
+D3D12MipmapGenerator* D3D12MipmapGenerator::Get(D3D12GPU* gpu) {
+  return gpu->mipmapGenerator();  // NOTE(review): gpu is dereferenced unchecked; must be non-null.
+}
+
+bool D3D12MipmapGenerator::createRootSignature(D3D12GPU* gpu) {
+  // Root constants (b0) + SRV table (t0) + UAV table (u0), plus a static linear/clamp sampler
+  // (s0) so no sampler heap is threaded through mip generation. Returns false on failure.
+  D3D12_DESCRIPTOR_RANGE srvRange = {};
+  srvRange.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV;
+  srvRange.NumDescriptors = 1;
+  srvRange.BaseShaderRegister = 0;
+  srvRange.RegisterSpace = 0;
+  srvRange.OffsetInDescriptorsFromTableStart = 0;
+
+  D3D12_DESCRIPTOR_RANGE uavRange = {};
+  uavRange.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV;
+  uavRange.NumDescriptors = 1;
+  uavRange.BaseShaderRegister = 0;
+  uavRange.RegisterSpace = 0;
+  uavRange.OffsetInDescriptorsFromTableStart = 0;
+
+  D3D12_ROOT_PARAMETER params[3] = {};
+  params[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
+  params[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
+  params[0].Constants.ShaderRegister = 0;
+  params[0].Constants.RegisterSpace = 0;
+  params[0].Constants.Num32BitValues = 4;  // out width, out height, 1/width, 1/height
+
+  params[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
+  params[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
+  params[1].DescriptorTable.NumDescriptorRanges = 1;
+  params[1].DescriptorTable.pDescriptorRanges = &srvRange;
+
+  params[2].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
+  params[2].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
+  params[2].DescriptorTable.NumDescriptorRanges = 1;
+  params[2].DescriptorTable.pDescriptorRanges = &uavRange;
+
+  D3D12_STATIC_SAMPLER_DESC samplerDesc = {};
+  samplerDesc.Filter = D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT;  // bilinear within one mip
+  samplerDesc.AddressU = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
+  samplerDesc.AddressV = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
+  samplerDesc.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
+  samplerDesc.MipLODBias = 0.0f;
+  samplerDesc.MaxAnisotropy = 1;
+  samplerDesc.ComparisonFunc = D3D12_COMPARISON_FUNC_NEVER;
+  samplerDesc.BorderColor = D3D12_STATIC_BORDER_COLOR_TRANSPARENT_BLACK;
+  samplerDesc.MinLOD = 0.0f;
+  samplerDesc.MaxLOD = D3D12_FLOAT32_MAX;
+  samplerDesc.ShaderRegister = 0;
+  samplerDesc.RegisterSpace = 0;
+  samplerDesc.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
+
+  D3D12_ROOT_SIGNATURE_DESC desc = {};
+  desc.NumParameters = 3;
+  desc.pParameters = params;
+  desc.NumStaticSamplers = 1;
+  desc.pStaticSamplers = &samplerDesc;
+  desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_NONE;
+
+  ComPtr<ID3DBlob> blob = nullptr;
+  ComPtr<ID3DBlob> errorBlob = nullptr;
+  auto hr = D3D12SerializeRootSignature(&desc, D3D_ROOT_SIGNATURE_VERSION_1, &blob, &errorBlob);
+  if (FAILED(hr)) {
+    LOGE("D3D12MipmapGenerator: D3D12SerializeRootSignature failed (HRESULT=0x%08X): %s",
+         static_cast<unsigned>(hr),
+         errorBlob ? static_cast<const char*>(errorBlob->GetBufferPointer()) : "<no message>");
+    return false;
+  }
+  hr = gpu->device()->CreateRootSignature(0, blob->GetBufferPointer(), blob->GetBufferSize(),
+                                          IID_PPV_ARGS(&_rootSignature));
+  if (FAILED(hr)) {
+    LOGE("D3D12MipmapGenerator: CreateRootSignature failed (HRESULT=0x%08X).",
+         static_cast<unsigned>(hr));
+    return false;
+  }
+  return true;
+}
+
+bool D3D12MipmapGenerator::createPipelineState(D3D12GPU* gpu) {
+  // Debug builds keep shader debug info and skip optimisation; other builds use level 3.
+#ifdef _DEBUG
+  const UINT flags =
+      D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION;
+#else
+  const UINT flags = D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_OPTIMIZATION_LEVEL3;
+#endif
+  ComPtr<ID3DBlob> bytecode = nullptr;
+  ComPtr<ID3DBlob> diagnostics = nullptr;
+  auto result = D3DCompile(kHLSLSource, strlen(kHLSLSource), nullptr, nullptr, nullptr, "main",
+                           "cs_5_0", flags, 0, &bytecode, &diagnostics);
+  if (FAILED(result)) {
+    LOGE("D3D12MipmapGenerator: D3DCompile failed (HRESULT=0x%08X): %s",
+         static_cast<unsigned>(result),
+         diagnostics ? static_cast<const char*>(diagnostics->GetBufferPointer()) : "<no message>");
+    return false;
+  }
+  // Pair the compiled compute shader with the root signature created earlier.
+  D3D12_COMPUTE_PIPELINE_STATE_DESC psoDesc = {};
+  psoDesc.pRootSignature = _rootSignature.Get();
+  psoDesc.CS = {bytecode->GetBufferPointer(), bytecode->GetBufferSize()};
+  psoDesc.NodeMask = 0;
+  psoDesc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE;
+  result = gpu->device()->CreateComputePipelineState(&psoDesc, IID_PPV_ARGS(&_pipelineState));
+  if (SUCCEEDED(result)) {
+    return true;
+  }
+  LOGE("D3D12MipmapGenerator: CreateComputePipelineState failed (HRESULT=0x%08X).",
+       static_cast<unsigned>(result));
+  return false;
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12MipmapGenerator.h b/src/gpu/d3d12/D3D12MipmapGenerator.h
new file mode 100644
index 000000000..a53cfe7ce
--- /dev/null
+++ b/src/gpu/d3d12/D3D12MipmapGenerator.h
@@ -0,0 +1,83 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "D3D12Util.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+class D3D12Texture;
+
+/// Number of threads per group on each axis. Must match the shader's `[numthreads(8, 8, 1)]`;
+/// dispatching encoders round each output mip dimension up to a multiple of this value.
+static constexpr unsigned D3D12_MIPMAP_THREAD_GROUP_SIZE = 8;
+
+/**
+ * Lazily-initialised compute pipeline that downsamples mip[i] into mip[i+1] for a 2D texture.
+ *
+ * D3D12 has no built-in equivalent to Metal's [blitEncoder generateMipmapsForTexture] or
+ * Vulkan's vkCmdBlitImage chain, so we ship a tiny box-filter compute shader and dispatch it
+ * per mip level. The PSO and root signature are cached on the GPU so repeated mipmap generation
+ * doesn't repeatedly recompile the shader.
+ *
+ * Root signature layout:
+ *   slot 0: 4 32-bit root constants (output mip width, height, inv_width, inv_height)
+ *   slot 1: SRV descriptor table referencing mip[i]
+ *   slot 2: UAV descriptor table referencing mip[i+1]
+ *
+ * The encoder is expected to:
+ *   - Place the SRV / UAV pair into a shader-visible heap that lives long enough to cover the
+ *     dispatch (added to the FrameSession's retainedDescriptorHeaps).
+ *   - Issue per-subresource ResourceBarriers between UNORDERED_ACCESS and PIXEL_SHADER_RESOURCE
+ *     as the chain walks up. NOTE(review): compute reads need NON_PIXEL_SHADER_RESOURCE; confirm.
+ */
+class D3D12MipmapGenerator {
+ public:
+  static D3D12MipmapGenerator* Get(D3D12GPU* gpu);
+
+  ID3D12RootSignature* rootSignature() const {
+    return _rootSignature.Get();
+  }
+
+  ID3D12PipelineState* pipelineState() const {
+    return _pipelineState.Get();
+  }
+
+  /**
+   * Returns true once both the root signature and the pipeline state are ready. Callers should
+   * skip mipmap generation when this is false (and emit a one-time log).
+   */
+  bool isReady() const {
+    return _rootSignature != nullptr && _pipelineState != nullptr;
+  }
+
+ private:
+  explicit D3D12MipmapGenerator(D3D12GPU* gpu);
+
+  bool createRootSignature(D3D12GPU* gpu);
+  bool createPipelineState(D3D12GPU* gpu);
+
+  ComPtr<ID3D12RootSignature> _rootSignature;
+  ComPtr<ID3D12PipelineState> _pipelineState;
+
+  friend class D3D12GPU;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12RenderPass.cpp b/src/gpu/d3d12/D3D12RenderPass.cpp
new file mode 100644
index 000000000..a1284e768
--- /dev/null
+++ b/src/gpu/d3d12/D3D12RenderPass.cpp
@@ -0,0 +1,600 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12RenderPass.h"
+#include "D3D12Buffer.h"
+#include "D3D12CommandEncoder.h"
+#include "D3D12GPU.h"
+#include "D3D12RenderPipeline.h"
+#include "D3D12Sampler.h"
+#include "D3D12Texture.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+static D3D12_PRIMITIVE_TOPOLOGY ToD3D12Topology(PrimitiveType type) {
+  const bool isStrip = (type == PrimitiveType::TriangleStrip);  // Anything else maps to a list.
+  return isStrip ? D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP : D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
+}
+
+// Builds a render pass on `encoder`; yields nullptr if it is null or initialisation fails.
+std::shared_ptr<D3D12RenderPass> D3D12RenderPass::Make(D3D12CommandEncoder* encoder,
+                                                       const RenderPassDescriptor& descriptor) {
+  if (!encoder) {
+    return nullptr;
+  }
+  auto* backend = static_cast<D3D12GPU*>(encoder->gpu());
+  std::shared_ptr<D3D12RenderPass> renderPass(new D3D12RenderPass(encoder, backend, descriptor));
+  // A pass whose attachments could not be set up is discarded rather than handed out.
+  const bool initialised = renderPass->initialise(descriptor);
+  return initialised ? renderPass : nullptr;
+}
+
+D3D12RenderPass::D3D12RenderPass(D3D12CommandEncoder* encoder, D3D12GPU* gpu,
+                                 const RenderPassDescriptor& passDescriptor)
+    : RenderPass(passDescriptor), encoder(encoder), d3d12GPU(gpu),
+      commandList(encoder->d3d12CommandList()) {  // Cached; initialise() rejects a null list.
+}
+
+bool D3D12RenderPass::initialise(const RenderPassDescriptor& passDescriptor) {
+  if (commandList == nullptr) {
+    return false;
+  }
+  auto device = d3d12GPU->device();
+
+  // Sets up attachments for recording: allocates and writes RTV / DSV descriptors, transitions
+  // and clears the attachments, binds them, and installs a full-size viewport and scissor.
+  // Step 1: allocate RTV / DSV descriptor slots from the GPU-wide non-shader-visible rings
+  // rather than creating a descriptor heap per pass. Ring slots are reclaimed by fence value
+  // (see pollCompletedSubmissions()), so OMSetRenderTargets does not need a heap pinned in
+  // FrameSession::retainedRTVDSVHeaps.
+  uint32_t numColorAttachments = 0;
+  for (auto& ca : passDescriptor.colorAttachments) {
+    if (ca.texture != nullptr) {
+      numColorAttachments++;
+    }
+  }
+  bool hasDepth = (passDescriptor.depthStencilAttachment.texture != nullptr);
+
+  D3D12DescriptorRing::Range rtvRange = {};
+  if (numColorAttachments > 0) {
+    rtvRange = d3d12GPU->rtvRing().allocate(numColorAttachments);
+    if (!rtvRange.valid()) {
+      LOGE("D3D12RenderPass: RTV ring exhausted (requested %u slots).", numColorAttachments);
+      return false;
+    }
+  }
+
+  D3D12DescriptorRing::Range dsvRange = {};
+  if (hasDepth) {
+    dsvRange = d3d12GPU->dsvRing().allocate(1);
+    if (!dsvRange.valid()) {
+      LOGE("D3D12RenderPass: DSV ring exhausted.");
+      return false;
+    }
+  }
+
+  auto rtvDescriptorSize = d3d12GPU->rtvRing().descriptorSize();
+
+  std::vector<D3D12_CPU_DESCRIPTOR_HANDLE> rtvHandles;
+  rtvHandles.reserve(numColorAttachments);
+  uint32_t fbWidth = 0;
+  uint32_t fbHeight = 0;
+
+  // First pass over color attachments: create RTV descriptors and accumulate transitions.
+  // We deliberately separate this from the Clear* calls below so we can issue a single
+  // ResourceBarrier for every attachment (color + depth) instead of N+1 individual calls.
+  D3D12BarrierBatch entryBatch;
+  for (auto& ca : passDescriptor.colorAttachments) {
+    if (ca.texture == nullptr) {
+      continue;
+    }
+    auto d3d12Tex = std::static_pointer_cast<D3D12Texture>(ca.texture);
+    encoder->retainResource(d3d12Tex);
+    colorAttachments.push_back(d3d12Tex);
+
+    // Queue the transition to RENDER_TARGET. The "current" state is either COMMON (newly
+    // created or coming back from sampling) or RENDER_TARGET from a preceding render pass that
+    // already left it there; addTransition() collapses the latter into a no-op.
+    entryBatch.addTransition(d3d12Tex->d3d12Resource(), d3d12Tex->currentState(),
+                             D3D12_RESOURCE_STATE_RENDER_TARGET);
+    encoder->recordTextureStateChange(d3d12Tex.get(), D3D12_RESOURCE_STATE_RENDER_TARGET);
+
+    D3D12_CPU_DESCRIPTOR_HANDLE rtvHandle = rtvRange.cpuStart;
+    rtvHandle.ptr += static_cast<SIZE_T>(rtvHandles.size()) * rtvDescriptorSize;
+
+    D3D12_RENDER_TARGET_VIEW_DESC rtvDesc = {};
+    rtvDesc.Format = static_cast<DXGI_FORMAT>(d3d12Tex->dxgiFormat());
+    rtvDesc.ViewDimension = (d3d12Tex->sampleCount() > 1) ? D3D12_RTV_DIMENSION_TEXTURE2DMS
+                                                          : D3D12_RTV_DIMENSION_TEXTURE2D;
+    if (rtvDesc.ViewDimension == D3D12_RTV_DIMENSION_TEXTURE2D) {
+      rtvDesc.Texture2D.MipSlice = 0;
+      rtvDesc.Texture2D.PlaneSlice = 0;
+    }
+    device->CreateRenderTargetView(d3d12Tex->d3d12Resource(), &rtvDesc, rtvHandle);
+    rtvHandles.push_back(rtvHandle);
+
+    fbWidth = static_cast<uint32_t>(d3d12Tex->width());  // last non-null attachment wins
+    fbHeight = static_cast<uint32_t>(d3d12Tex->height());
+
+    // Capture the optional resolve target for this attachment. A null entry keeps the parallel
+    // vector aligned with colorAttachments so the onEnd() loop can match them by index.
+    if (ca.resolveTexture != nullptr) {
+      auto resolveTex = std::static_pointer_cast<D3D12Texture>(ca.resolveTexture);
+      encoder->retainResource(resolveTex);
+      resolveTextures.push_back(std::move(resolveTex));
+    } else {
+      resolveTextures.push_back(nullptr);
+    }
+  }
+
+  D3D12_CPU_DESCRIPTOR_HANDLE dsvHandle = {};
+  if (hasDepth) {
+    auto d3d12Tex =
+        std::static_pointer_cast<D3D12Texture>(passDescriptor.depthStencilAttachment.texture);
+    encoder->retainResource(d3d12Tex);
+    depthStencilAttachment = d3d12Tex;
+
+    entryBatch.addTransition(d3d12Tex->d3d12Resource(), d3d12Tex->currentState(),
+                             D3D12_RESOURCE_STATE_DEPTH_WRITE);
+    encoder->recordTextureStateChange(d3d12Tex.get(), D3D12_RESOURCE_STATE_DEPTH_WRITE);
+
+    dsvHandle = dsvRange.cpuStart;
+    D3D12_DEPTH_STENCIL_VIEW_DESC dsvDesc = {};
+    dsvDesc.Format = static_cast<DXGI_FORMAT>(d3d12Tex->dxgiFormat());
+    dsvDesc.ViewDimension = (d3d12Tex->sampleCount() > 1) ? D3D12_DSV_DIMENSION_TEXTURE2DMS
+                                                          : D3D12_DSV_DIMENSION_TEXTURE2D;
+    if (dsvDesc.ViewDimension == D3D12_DSV_DIMENSION_TEXTURE2D) {
+      dsvDesc.Texture2D.MipSlice = 0;
+    }
+    device->CreateDepthStencilView(d3d12Tex->d3d12Resource(), &dsvDesc, dsvHandle);
+
+    if (fbWidth == 0) {
+      fbWidth = static_cast<uint32_t>(d3d12Tex->width());
+      fbHeight = static_cast<uint32_t>(d3d12Tex->height());
+    }
+  }
+
+  // Flush every pre-pass transition in one ResourceBarrier(N, ...) call. After this point all
+  // attachments are in their target state and clears are safe to issue.
+  entryBatch.flush(commandList);
+
+  // Second pass: ClearRenderTargetView / ClearDepthStencilView for any attachment with
+  // LoadAction::Clear. These run after the barrier flush so the resource state is correct.
+  // rtvHandles[] holds one entry per non-null color attachment, in visiting order, so a running
+  // counter keeps this walk O(N). NOTE(review): the depth clear always passes
+  // D3D12_CLEAR_FLAG_STENCIL — confirm the attachment format always has a stencil plane.
+  size_t rtvIndex = 0;
+  for (const auto& ca : passDescriptor.colorAttachments) {
+    if (ca.texture == nullptr) {
+      continue;
+    }
+    if (ca.loadAction == LoadAction::Clear) {
+      const float clear[4] = {ca.clearValue.red, ca.clearValue.green, ca.clearValue.blue,
+                              ca.clearValue.alpha};
+      commandList->ClearRenderTargetView(rtvHandles[rtvIndex], clear, 0, nullptr);
+    }
+    rtvIndex++;
+  }
+  if (hasDepth && passDescriptor.depthStencilAttachment.loadAction == LoadAction::Clear) {
+    D3D12_CLEAR_FLAGS clearFlags = D3D12_CLEAR_FLAG_DEPTH | D3D12_CLEAR_FLAG_STENCIL;
+    commandList->ClearDepthStencilView(
+        dsvHandle, clearFlags, passDescriptor.depthStencilAttachment.depthClearValue,
+        static_cast<UINT8>(passDescriptor.depthStencilAttachment.stencilClearValue), 0, nullptr);
+  }
+
+  // Step 2: Bind render targets.
+  commandList->OMSetRenderTargets(static_cast<UINT>(rtvHandles.size()),
+                                  rtvHandles.empty() ? nullptr : rtvHandles.data(), FALSE,
+                                  hasDepth ? &dsvHandle : nullptr);
+
+  // Step 3: Default viewport / scissor covering the entire framebuffer.
+  D3D12_VIEWPORT viewport = {};
+  viewport.TopLeftX = 0.0f;
+  viewport.TopLeftY = 0.0f;
+  viewport.Width = static_cast<float>(fbWidth);
+  viewport.Height = static_cast<float>(fbHeight);
+  viewport.MinDepth = 0.0f;
+  viewport.MaxDepth = 1.0f;
+  commandList->RSSetViewports(1, &viewport);
+
+  D3D12_RECT scissor = {};
+  scissor.left = 0;
+  scissor.top = 0;
+  scissor.right = static_cast<LONG>(fbWidth);
+  scissor.bottom = static_cast<LONG>(fbHeight);
+  commandList->RSSetScissorRects(1, &scissor);
+
+  // No per-pass descriptor heaps to retain any more: the RTV / DSV slots we allocated above
+  // live in D3D12GPU::_rtvRing / _dsvRing, whose underlying ID3D12DescriptorHeap is owned by
+  // the GPU instance and reclaimed by fence value (see pollCompletedSubmissions). Shader-
+  // visible heaps were already bound to this command list in D3D12CommandEncoder::Make().
+  return true;
+}
+
+GPU* D3D12RenderPass::gpu() const {
+  return d3d12GPU;  // The D3D12GPU passed to the constructor, returned as the base GPU type.
+}
+
+void D3D12RenderPass::setViewport(int x, int y, int width, int height) {
+  // Replaces the single viewport with the given rectangle; the depth range is always [0, 1].
+  const D3D12_VIEWPORT rect = {
+      static_cast<float>(x),       // TopLeftX
+      static_cast<float>(y),       // TopLeftY
+      static_cast<float>(width),   // Width
+      static_cast<float>(height),  // Height
+      0.0f, 1.0f};                 // MinDepth, MaxDepth
+  commandList->RSSetViewports(1, &rect);
+}
+
+// Replaces the single scissor rectangle. D3D12_RECT stores edges rather than an origin plus a
+// size, so the right and bottom edges are derived from x + width and y + height.
+void D3D12RenderPass::setScissorRect(int x, int y, int width, int height) {
+  const LONG rightEdge = x + width;
+  const LONG bottomEdge = y + height;
+  const D3D12_RECT clip = {x, y, rightEdge, bottomEdge};  // left, top, right, bottom
+  commandList->RSSetScissorRects(1, &clip);
+}
+
+// Makes `pipeline` current. Null, already-bound and incomplete pipelines are ignored.
+void D3D12RenderPass::setPipeline(std::shared_ptr<RenderPipeline> pipeline) {
+  auto incoming = std::static_pointer_cast<D3D12RenderPipeline>(pipeline);
+  if (incoming == nullptr || incoming == currentPipeline) {
+    return;
+  }
+
+  auto pipelineState = incoming->d3d12PipelineState();
+  auto rootSignature = incoming->d3d12RootSignature();
+  if (pipelineState == nullptr || rootSignature == nullptr) {
+    return;  // The previously bound pipeline, if any, stays in effect.
+  }
+
+  currentPipeline = incoming;
+  encoder->retainResource(incoming);
+  commandList->SetPipelineState(pipelineState);
+  commandList->SetGraphicsRootSignature(rootSignature);
+
+  // Switching pipelines invalidates root parameter state: every slot that has been populated
+  // is flagged dirty so the next draw re-applies it. Never-bound slots are left alone.
+  for (auto& uniform : uniformBindings) {
+    const bool populated = (uniform.gpuAddress != 0);
+    uniform.dirty = uniform.dirty || populated;
+  }
+  for (auto& sampled : textureBindings) {
+    const bool populated = (sampled.srvTableStart.ptr != 0);
+    sampled.dirty = sampled.dirty || populated;
+  }
+}
+
+void D3D12RenderPass::setUniformBuffer(unsigned binding, std::shared_ptr<GPUBuffer> buffer,
+                                       size_t offset, size_t /*size*/) {
+  // Records the buffer address for `binding`; the root CBV itself is set at the next draw.
+  if (buffer == nullptr || binding >= MaxUniformBindings) {
+    return;
+  }
+  auto uniformBuffer = std::static_pointer_cast<D3D12Buffer>(buffer);
+  encoder->retainResource(uniformBuffer);
+  const auto address = uniformBuffer->d3d12Resource()->GetGPUVirtualAddress() + offset;
+  auto& slot = uniformBindings[binding];
+  const bool moved = (slot.gpuAddress != address);
+  slot.gpuAddress = address;
+  slot.dirty = slot.dirty || moved;
+}
+
+void D3D12RenderPass::setTexture(unsigned binding, std::shared_ptr<Texture> texture,
+                                 std::shared_ptr<Sampler> sampler) {
+  if (!texture || !sampler || !currentPipeline || binding >= MaxTextureBindings) {
+    return;  // A pipeline must already be bound; otherwise the call is dropped.
+  }
+  auto d3d12Tex = std::static_pointer_cast<D3D12Texture>(texture);
+  auto d3d12Samp = std::static_pointer_cast<D3D12Sampler>(sampler);
+  encoder->retainResource(d3d12Tex);
+  encoder->retainResource(d3d12Samp);
+
+  // Color render targets and write-back textures may currently be in RENDER_TARGET / COPY_DEST.
+  // Queue a transition to PIXEL_SHADER_RESOURCE so the SRV will be valid by the time the next
+  // draw fires. The barrier is not issued immediately; pendingBarriers accumulates every state
+  // change recorded by setTexture() in this pass and flushBindingsIfNeeded() emits them all in
+  // a single ResourceBarrier(N, ...) call right before the draw. The CPU-side _currentState is
+  // updated immediately so a second setTexture() with the same texture sees the correct state
+  // and does not enqueue a redundant barrier.
+  auto current = d3d12Tex->currentState();
+  if (current != D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE) {
+    pendingBarriers.addTransition(d3d12Tex->d3d12Resource(), current,
+                                  D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
+    encoder->recordTextureStateChange(d3d12Tex.get(), D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
+    // Track this texture so onEnd() can transition it back to COMMON. Without that step, D3D12
+    // automatic state decay after ExecuteCommandLists drops the resource to COMMON, but our CPU
+    // tracker still believes it is in PIXEL_SHADER_RESOURCE — every subsequent transition then
+    // fails "Before state mismatch" validation and (on some drivers) destabilises the device.
+    shaderResourceTextures.push_back(d3d12Tex);
+  }
+
+  auto device = d3d12GPU->device();
+
+  // SRV slot: dedup by (resource, format, mipLevels). Repeated bindings of the same texture
+  // within one render pass share a single descriptor sub-allocated from the GPU's CBV/SRV/UAV
+  // ring.
+  SrvCacheKey srvKey = {};
+  srvKey.resource = d3d12Tex->d3d12Resource();
+  srvKey.format = static_cast<DXGI_FORMAT>(d3d12Tex->dxgiFormat());
+  srvKey.mipLevels = static_cast<UINT>(d3d12Tex->mipLevelCount());
+  D3D12_GPU_DESCRIPTOR_HANDLE srvGpu = {};
+  auto srvIt = srvSlotCache.find(srvKey);
+  if (srvIt != srvSlotCache.end()) {
+    srvGpu = srvIt->second;
+  } else {
+    auto range = d3d12GPU->srvRing().allocate(1);
+    if (!range.valid()) {
+      LOGE("D3D12RenderPass::setTexture: SRV ring exhausted.");
+      return;  // Binding slot is left untouched; any barrier queued above stays pending.
+    }
+    D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
+    srvDesc.Format = srvKey.format;
+    srvDesc.ViewDimension = (d3d12Tex->sampleCount() > 1) ? D3D12_SRV_DIMENSION_TEXTURE2DMS
+                                                          : D3D12_SRV_DIMENSION_TEXTURE2D;
+    srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
+    if (srvDesc.ViewDimension == D3D12_SRV_DIMENSION_TEXTURE2D) {
+      srvDesc.Texture2D.MostDetailedMip = 0;
+      srvDesc.Texture2D.MipLevels = srvKey.mipLevels;
+      srvDesc.Texture2D.PlaneSlice = 0;
+      srvDesc.Texture2D.ResourceMinLODClamp = 0.0f;
+    }
+    device->CreateShaderResourceView(d3d12Tex->d3d12Resource(), &srvDesc, range.cpuStart);
+    srvGpu = range.gpuStart;
+    srvSlotCache.emplace(srvKey, srvGpu);
+  }
+
+  // Sampler GPU descriptor handle is owned by the D3D12Sampler instance and is permanent for the
+  // lifetime of the GPU. setTexture() never has to allocate or write a sampler descriptor.
+  D3D12_GPU_DESCRIPTOR_HANDLE sampGpu = d3d12Samp->gpuDescriptorHandle();
+
+  auto& tb = textureBindings[binding];
+  tb.srvTableStart = srvGpu;
+  tb.samplerTableStart = sampGpu;
+  tb.dirty = true;  // Always re-applied at the next draw, even if the handles are unchanged.
+}
+
+// Applies all state deferred by setTexture() / setUniformBuffer() right before a draw.
+void D3D12RenderPass::flushBindingsIfNeeded() {
+  // Nothing can be bound until a pipeline provides the root-parameter mapping.
+  if (currentPipeline == nullptr) {
+    return;
+  }
+
+  // Emit every transition queued by setTexture() as one ResourceBarrier(N, ...) call ahead of
+  // the draw, rather than one API call per sampled texture. Batching lets the driver collapse
+  // redundant cache operations.
+  pendingBarriers.flush(commandList);
+
+  // Uniforms: each dirty slot becomes a root constant buffer view. When the pipeline has no
+  // root parameter for a slot (index UINT32_MAX) nothing is bound, but the slot is still
+  // marked clean.
+  for (unsigned slot = 0; slot < MaxUniformBindings; ++slot) {
+    auto& uniform = uniformBindings[slot];
+    if (uniform.dirty) {
+      const auto rootParam = currentPipeline->getUniformRootParameterIndex(slot);
+      if (rootParam != UINT32_MAX) {
+        commandList->SetGraphicsRootConstantBufferView(rootParam, uniform.gpuAddress);
+      }
+      uniform.dirty = false;
+    }
+  }
+
+  // Textures: each binding maps to two root parameters in our root signature, an SRV table
+  // (in the CBV/SRV/UAV heap) and a sampler table (in the sampler heap). Both are bound with
+  // separate SetGraphicsRootDescriptorTable calls.
+  for (unsigned slot = 0; slot < MaxTextureBindings; ++slot) {
+    auto& sampled = textureBindings[slot];
+    if (!sampled.dirty) {
+      continue;
+    }
+    sampled.dirty = false;
+    const auto srvParam = currentPipeline->getTextureRootParameterIndex(slot);
+    if (srvParam != UINT32_MAX) {
+      commandList->SetGraphicsRootDescriptorTable(srvParam, sampled.srvTableStart);
+    }
+    const auto samplerParam = currentPipeline->getSamplerRootParameterIndex(slot);
+    if (samplerParam != UINT32_MAX) {
+      commandList->SetGraphicsRootDescriptorTable(samplerParam, sampled.samplerTableStart);
+    }
+  }
+}
+
+void D3D12RenderPass::setVertexBuffer(unsigned slot, std::shared_ptr<GPUBuffer> buffer,
+                                      size_t offset) {
+  // The stride comes from the bound pipeline, so a pipeline must be set before any buffer.
+  if (buffer == nullptr || currentPipeline == nullptr) {
+    return;
+  }
+  auto vertexBuffer = std::static_pointer_cast<D3D12Buffer>(buffer);
+  const auto capacity = vertexBuffer->size();
+  // D3D12 takes an explicit byte range instead of (buffer, offset). Reject offsets at or past
+  // the end: `capacity - offset` would otherwise wrap around and the UINT cast would publish
+  // a bogus multi-gigabyte range to the GPU.
+  if (offset >= capacity) {
+    LOGE("D3D12RenderPass::setVertexBuffer: offset %zu is out of range (buffer size=%zu).", offset,
+         capacity);
+    return;
+  }
+  encoder->retainResource(vertexBuffer);
+
+  const auto base = vertexBuffer->d3d12Resource()->GetGPUVirtualAddress();
+  D3D12_VERTEX_BUFFER_VIEW binding = {};
+  binding.BufferLocation = base + offset;
+  binding.SizeInBytes = static_cast<UINT>(capacity - offset);
+  // Unlike Vulkan, where the stride lives inside the pipeline object, D3D12 wants it repeated
+  // in every vertex buffer view; read it back from the pipeline's layout for this slot.
+  binding.StrideInBytes = currentPipeline->getVertexStride(slot);
+  commandList->IASetVertexBuffers(slot, 1, &binding);
+}
+
+void D3D12RenderPass::setIndexBuffer(std::shared_ptr<GPUBuffer> buffer, IndexFormat format) {
+  auto indexBuffer = std::static_pointer_cast<D3D12Buffer>(buffer);
+  if (indexBuffer == nullptr) {
+    return;
+  }
+  // An index buffer view with SizeInBytes=0 leaves the index buffer effectively unset, and
+  // later indexed draws would silently emit nothing, so refuse empty buffers loudly instead.
+  const auto byteCount = indexBuffer->size();
+  if (byteCount == 0) {
+    LOGE("D3D12RenderPass::setIndexBuffer: buffer has zero size.");
+    return;
+  }
+  encoder->retainResource(indexBuffer);
+  // Binds the whole buffer; any format other than UInt32 is treated as 16-bit indices.
+  const bool wideIndices = (format == IndexFormat::UInt32);
+  D3D12_INDEX_BUFFER_VIEW binding = {};
+  binding.BufferLocation = indexBuffer->d3d12Resource()->GetGPUVirtualAddress();
+  binding.SizeInBytes = static_cast<UINT>(byteCount);
+  binding.Format = wideIndices ? DXGI_FORMAT_R32_UINT : DXGI_FORMAT_R16_UINT;
+  commandList->IASetIndexBuffer(&binding);
+}
+
+void D3D12RenderPass::setStencilReference(uint32_t reference) {
+  commandList->OMSetStencilRef(reference);  // Recorded immediately; not deferred to draw time.
+}
+
+void D3D12RenderPass::draw(PrimitiveType primitiveType, uint32_t vertexCount,
+                           uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance) {
+  if (currentPipeline == nullptr) {
+    return;  // No pipeline bound: the draw is dropped.
+  }
+  const auto wantedTopology = ToD3D12Topology(primitiveType);
+  if (!(primitiveTopologySet && wantedTopology == currentTopology)) {
+    commandList->IASetPrimitiveTopology(wantedTopology);
+    primitiveTopologySet = true;
+    currentTopology = wantedTopology;
+  }
+  flushBindingsIfNeeded();  // Pending bindings are committed before the draw is recorded.
+  commandList->DrawInstanced(vertexCount, instanceCount, firstVertex, firstInstance);
+}
+
+void D3D12RenderPass::drawIndexed(PrimitiveType primitiveType, uint32_t indexCount,
+                                  uint32_t instanceCount, uint32_t firstIndex, int32_t baseVertex,
+                                  uint32_t firstInstance) {
+  if (currentPipeline == nullptr) {
+    return;  // No pipeline bound: the draw is dropped.
+  }
+  const auto wantedTopology = ToD3D12Topology(primitiveType);
+  if (!(primitiveTopologySet && wantedTopology == currentTopology)) {
+    commandList->IASetPrimitiveTopology(wantedTopology);
+    primitiveTopologySet = true;
+    currentTopology = wantedTopology;
+  }
+  flushBindingsIfNeeded();  // Pending bindings are committed before the draw is recorded.
+  commandList->DrawIndexedInstanced(indexCount, instanceCount, firstIndex, baseVertex,
+                                    firstInstance);
+}
+
+void D3D12RenderPass::onEnd() {
+  // Every draw flushes pendingBarriers, so the batch is normally empty here; flush anyway in case
+  // transitions were queued by setTexture() and no draw was recorded after them.
+  pendingBarriers.flush(commandList);
+
+  // Step 1: collect the pre-resolve transitions for every MSAA color attachment that has a
+  // resolveTexture, then issue them in a single ResourceBarrier(N, ...) before the actual
+  // ResolveSubresource calls. Driver state requirements are RESOLVE_SOURCE for the multi-sample
+  // source and RESOLVE_DEST for the single-sample destination; both are restored to COMMON
+  // afterwards in step 3, mirroring Vulkan's pResolveAttachments behaviour.
+  D3D12BarrierBatch resolveBatch;
+  for (size_t i = 0; i < colorAttachments.size(); i++) {
+    if (i >= resolveTextures.size() || resolveTextures[i] == nullptr) {
+      continue;
+    }
+    auto& src = colorAttachments[i];
+    auto& resolveDst = resolveTextures[i];
+    if (src == nullptr) {
+      continue;
+    }
+    auto srcState = src->currentState();
+    if (srcState != D3D12_RESOURCE_STATE_RESOLVE_SOURCE) {
+      resolveBatch.addTransition(src->d3d12Resource(), srcState,
+                                 D3D12_RESOURCE_STATE_RESOLVE_SOURCE);
+      encoder->recordTextureStateChange(src.get(), D3D12_RESOURCE_STATE_RESOLVE_SOURCE);
+    }
+    auto dstState = resolveDst->currentState();
+    if (dstState != D3D12_RESOURCE_STATE_RESOLVE_DEST) {
+      resolveBatch.addTransition(resolveDst->d3d12Resource(), dstState,
+                                 D3D12_RESOURCE_STATE_RESOLVE_DEST);
+      encoder->recordTextureStateChange(resolveDst.get(), D3D12_RESOURCE_STATE_RESOLVE_DEST);
+    }
+  }
+  resolveBatch.flush(commandList);
+
+  // Step 2: do the actual MSAA resolves now that every source/destination is in the right state.
+  for (size_t i = 0; i < colorAttachments.size(); i++) {
+    if (i >= resolveTextures.size() || resolveTextures[i] == nullptr) {
+      continue;
+    }
+    auto& src = colorAttachments[i];
+    auto& resolveDst = resolveTextures[i];
+    if (src == nullptr) {
+      continue;
+    }
+    commandList->ResolveSubresource(resolveDst->d3d12Resource(), 0, src->d3d12Resource(), 0,
+                                    static_cast<DXGI_FORMAT>(src->dxgiFormat()));
+    // The two zeros above are dst / src subresource indices. tgfx render targets are flat 2D
+    // textures (no array, MSAA targets are mip-locked to 1), so subresource 0 is the only valid
+    // index. Vulkan's pResolveAttachments and Metal's resolve attachments make the same
+    // assumption.
+  }
+
+  // Step 3: collapse every "back to COMMON" transition (color attachments, resolve targets,
+  // depth stencil, sampled textures) into a single ResourceBarrier. The next consumer
+  // (sample, copy, present) will issue its own transition from COMMON to whatever state it
+  // needs. D3D12 implicitly decays buffers and simultaneous-access textures to COMMON after
+  // the command list executes; explicitly issuing the matching CPU-side transition keeps our
+  // tracker aligned with the runtime so subsequent passes don't trip "Before state mismatch"
+  // barriers.
+  D3D12BarrierBatch finalBatch;
+  for (auto& tex : colorAttachments) {
+    if (tex == nullptr) continue;
+    auto current = tex->currentState();
+    if (current != D3D12_RESOURCE_STATE_COMMON) {
+      finalBatch.addTransition(tex->d3d12Resource(), current, D3D12_RESOURCE_STATE_COMMON);
+      encoder->recordTextureStateChange(tex.get(), D3D12_RESOURCE_STATE_COMMON);
+    }
+  }
+  for (auto& tex : resolveTextures) {
+    if (tex == nullptr) continue;
+    auto current = tex->currentState();
+    if (current != D3D12_RESOURCE_STATE_COMMON) {
+      finalBatch.addTransition(tex->d3d12Resource(), current, D3D12_RESOURCE_STATE_COMMON);
+      encoder->recordTextureStateChange(tex.get(), D3D12_RESOURCE_STATE_COMMON);
+    }
+  }
+  if (depthStencilAttachment != nullptr) {
+    auto current = depthStencilAttachment->currentState();
+    if (current != D3D12_RESOURCE_STATE_COMMON) {
+      finalBatch.addTransition(depthStencilAttachment->d3d12Resource(), current,
+                               D3D12_RESOURCE_STATE_COMMON);
+      encoder->recordTextureStateChange(depthStencilAttachment.get(), D3D12_RESOURCE_STATE_COMMON);
+    }
+  }
+  for (auto& tex : shaderResourceTextures) {
+    if (tex == nullptr) continue;
+    auto current = tex->currentState();
+    if (current != D3D12_RESOURCE_STATE_COMMON) {
+      finalBatch.addTransition(tex->d3d12Resource(), current, D3D12_RESOURCE_STATE_COMMON);
+      encoder->recordTextureStateChange(tex.get(), D3D12_RESOURCE_STATE_COMMON);
+    }
+  }
+  finalBatch.flush(commandList);
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12RenderPass.h b/src/gpu/d3d12/D3D12RenderPass.h
new file mode 100644
index 000000000..9528163b1
--- /dev/null
+++ b/src/gpu/d3d12/D3D12RenderPass.h
@@ -0,0 +1,161 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "D3D12BarrierBatch.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/RenderPass.h"
+
+namespace tgfx {
+
+class D3D12CommandEncoder;
+class D3D12GPU;
+class D3D12RenderPipeline;
+class D3D12Texture;
+
+/**
+ * D3D12 render pass implementation.
+ *
+ * On construction:
+ *   - Allocates per-pass non-shader-visible RTV/DSV heaps and creates one descriptor per
+ *     attachment. Issues ResourceBarrier transitions and OMSetRenderTargets.
+ *   - Performs ClearRenderTargetView / ClearDepthStencilView for any attachment with
+ *     LoadAction::Clear.
+ *
+ * Texture/sampler bindings:
+ *   - Sub-allocates SRV slots out of the GPU's process-wide D3D12DescriptorRing (committed and
+ *     fence-retired around each submission). No CreateDescriptorHeap call per render pass.
+ *   - Reuses each D3D12Sampler's stable GPU descriptor handle from the GPU's append-only
+ *     shader-visible Sampler heap; no per-binding CreateSampler is issued either.
+ *   - SetDescriptorHeaps was already issued once on the encoder's command list, so render passes
+ *     never need to call it.
+ *
+ * On end:
+ *   - Resolves MSAA color attachments, then transitions every attachment and sampled texture
+ *     back to COMMON. RTV/DSV heaps remain alive in the FrameSession until the fence signals.
+ */
+class D3D12RenderPass : public RenderPass {
+ public:
+  static std::shared_ptr<D3D12RenderPass> Make(D3D12CommandEncoder* encoder,
+                                               const RenderPassDescriptor& descriptor);
+
+  ~D3D12RenderPass() override = default;
+
+  GPU* gpu() const override;
+  void setViewport(int x, int y, int width, int height) override;
+  void setScissorRect(int x, int y, int width, int height) override;
+  void setPipeline(std::shared_ptr<RenderPipeline> pipeline) override;
+  void setUniformBuffer(unsigned binding, std::shared_ptr<GPUBuffer> buffer, size_t offset,
+                        size_t size) override;
+  void setTexture(unsigned binding, std::shared_ptr<Texture> texture,
+                  std::shared_ptr<Sampler> sampler) override;
+  void setVertexBuffer(unsigned slot, std::shared_ptr<GPUBuffer> buffer,
+                       size_t offset = 0) override;
+  void setIndexBuffer(std::shared_ptr<GPUBuffer> buffer,
+                      IndexFormat format = IndexFormat::UInt16) override;
+  void setStencilReference(uint32_t reference) override;
+  void draw(PrimitiveType primitiveType, uint32_t vertexCount, uint32_t instanceCount = 1,
+            uint32_t firstVertex = 0, uint32_t firstInstance = 0) override;
+  void drawIndexed(PrimitiveType primitiveType, uint32_t indexCount, uint32_t instanceCount = 1,
+                   uint32_t firstIndex = 0, int32_t baseVertex = 0,
+                   uint32_t firstInstance = 0) override;
+
+ protected:
+  void onEnd() override;
+
+ private:
+  D3D12RenderPass(D3D12CommandEncoder* encoder, D3D12GPU* gpu,
+                  const RenderPassDescriptor& descriptor);
+
+  bool initialise(const RenderPassDescriptor& descriptor);
+
+  // Lazily writes any pending texture/uniform bindings to the shader-visible descriptor heaps and
+  // calls SetGraphicsRootConstantBufferView / SetGraphicsRootDescriptorTable as appropriate.
+  // Called from draw() / drawIndexed() before the actual draw command.
+  void flushBindingsIfNeeded();
+
+  D3D12CommandEncoder* encoder = nullptr;
+  D3D12GPU* d3d12GPU = nullptr;
+  ID3D12GraphicsCommandList* commandList = nullptr;
+
+  // Accumulator for resource state transitions queued by setTexture(). Flushed in
+  // flushBindingsIfNeeded() just before the actual draw, so a draw that touches N sampled
+  // textures issues a single ResourceBarrier(N, ...) call instead of N single-barrier calls.
+  D3D12BarrierBatch pendingBarriers;
+
+  // Per-render-pass dedup cache for SRV slots in the GPU's shader-visible CBV/SRV/UAV ring.
+  // Repeated bindings of the same (resource, format, mipLevels) within a single pass share one
+  // sub-allocated descriptor; the cache is cleared at the next pass start because ring slots may
+  // have been retired by then.
+  struct SrvCacheKey {
+    ID3D12Resource* resource = nullptr;
+    DXGI_FORMAT format = static_cast<DXGI_FORMAT>(0);  // DXGI_FORMAT_UNKNOWN
+    UINT mipLevels = 0;
+    bool operator==(const SrvCacheKey& other) const {
+      return resource == other.resource && format == other.format && mipLevels == other.mipLevels;
+    }
+  };
+  struct SrvCacheKeyHash {
+    size_t operator()(const SrvCacheKey& k) const noexcept {
+      auto h1 = std::hash<void*>{}(static_cast<void*>(k.resource));
+      auto h2 = std::hash<UINT>{}(static_cast<UINT>(k.format));
+      auto h3 = std::hash<UINT>{}(k.mipLevels);
+      return h1 ^ (h2 * 0x9E3779B97F4A7C15ull) ^ (h3 * 0xBF58476D1CE4E5B9ull);
+    }
+  };
+  std::unordered_map<SrvCacheKey, D3D12_GPU_DESCRIPTOR_HANDLE, SrvCacheKeyHash> srvSlotCache;
+
+  // Per-binding deferred state. setUniformBuffer / setTexture record the argument; the actual
+  // RootCBV / RootDescriptorTable bind happens at flushBindingsIfNeeded() (just before draw).
+  struct UniformBinding {
+    D3D12_GPU_VIRTUAL_ADDRESS gpuAddress = 0;
+    bool dirty = false;
+  };
+  struct TextureBinding {
+    D3D12_GPU_DESCRIPTOR_HANDLE srvTableStart = {};
+    D3D12_GPU_DESCRIPTOR_HANDLE samplerTableStart = {};
+    bool dirty = false;
+  };
+  static constexpr unsigned MaxUniformBindings = 8;
+  static constexpr unsigned MaxTextureBindings = 32;
+  UniformBinding uniformBindings[MaxUniformBindings] = {};
+  TextureBinding textureBindings[MaxTextureBindings] = {};
+
+  std::shared_ptr<D3D12RenderPipeline> currentPipeline = nullptr;
+  D3D12_PRIMITIVE_TOPOLOGY currentTopology = D3D_PRIMITIVE_TOPOLOGY_UNDEFINED;
+  bool primitiveTopologySet = false;
+
+  // Color attachments retained for the MSAA resolve and state-restore at onEnd() time.
+  std::vector<std::shared_ptr<D3D12Texture>> colorAttachments;
+  // Per-color-attachment MSAA resolve target. Index N corresponds to colorAttachments[N]; an
+  // entry is nullptr when the matching color attachment has no resolve texture (i.e. sampleCount
+  // == 1). At onEnd() time every non-null pair is resolved with ResolveSubresource, reading from
+  // colorAttachments[N] and writing to resolveTextures[N], mirroring Vulkan's pResolveAttachments.
+  std::vector<std::shared_ptr<D3D12Texture>> resolveTextures;
+  std::shared_ptr<D3D12Texture> depthStencilAttachment;
+  // Textures that were transitioned to PIXEL_SHADER_RESOURCE inside this pass via setTexture().
+  // We keep them tracked so that onEnd() can transition them back to COMMON, avoiding mismatches
+  // with D3D12's automatic state-decay rules between ExecuteCommandLists calls.
+  std::vector<std::shared_ptr<D3D12Texture>> shaderResourceTextures;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12RenderPipeline.cpp b/src/gpu/d3d12/D3D12RenderPipeline.cpp
new file mode 100644
index 000000000..cd42ef642
--- /dev/null
+++ b/src/gpu/d3d12/D3D12RenderPipeline.cpp
@@ -0,0 +1,482 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12RenderPipeline.h"
+#include <d3dcompiler.h>
+#include <vector>
+#include "D3D12GPU.h"
+#include "D3D12ShaderModule.h"
+#include "core/utils/Log.h"
+#include "gpu/UniformData.h"
+#include "tgfx/gpu/ColorWriteMask.h"
+#include "tgfx/gpu/ShaderVisibility.h"
+
+namespace tgfx {
+
+// Converts a TGFX ShaderVisibility bitmask into the single-stage visibility enum that D3D12 root
+// parameters take. A root parameter carries exactly one visibility value, so only the exact
+// vertex-only and fragment-only masks map to a stage; anything else is exposed to ALL stages.
+static D3D12_SHADER_VISIBILITY ToD3D12ShaderVisibility(uint32_t visibility) {
+  const bool vertexOnly = (visibility == ShaderVisibility::Vertex);
+  const bool fragmentOnly = (visibility == ShaderVisibility::Fragment);
+  if (!vertexOnly && !fragmentOnly) {
+    return D3D12_SHADER_VISIBILITY_ALL;
+  }
+  return vertexOnly ? D3D12_SHADER_VISIBILITY_VERTEX : D3D12_SHADER_VISIBILITY_PIXEL;
+}
+
+static UINT8 ToD3D12RenderTargetWriteMask(uint32_t mask) {
+  // Every tgfx channel bit maps independently onto the matching D3D12 write-enable bit.
+  int enabled = (mask & ColorWriteMask::RED) ? D3D12_COLOR_WRITE_ENABLE_RED : 0;
+  enabled |= (mask & ColorWriteMask::GREEN) ? D3D12_COLOR_WRITE_ENABLE_GREEN : 0;
+  enabled |= (mask & ColorWriteMask::BLUE) ? D3D12_COLOR_WRITE_ENABLE_BLUE : 0;
+  enabled |= (mask & ColorWriteMask::ALPHA) ? D3D12_COLOR_WRITE_ENABLE_ALPHA : 0;
+  return static_cast<UINT8>(enabled);
+}
+
+// Reports whether the descriptor requests anything beyond the default stencil behaviour.
+// A face counts as default when its compare function is Always and its fail, pass and
+// depth-fail operations are all Keep; the state is non-trivial as soon as either face deviates.
+// The Vulkan backend is meant to use the same predicate, which keeps pipeline state consistent
+// across backends.
+static bool HasNonTrivialStencilState(const DepthStencilDescriptor& ds) {
+  const auto isDefaultFace = [](const auto& face) {
+    return face.compare == CompareFunction::Always && face.failOp == StencilOperation::Keep &&
+           face.passOp == StencilOperation::Keep && face.depthFailOp == StencilOperation::Keep;
+  };
+  return !isDefaultFace(ds.stencilFront) || !isDefaultFace(ds.stencilBack);
+}
+
+std::shared_ptr<D3D12RenderPipeline> D3D12RenderPipeline::Make(
+    D3D12GPU* gpu, const RenderPipelineDescriptor& descriptor) {
+  if (gpu == nullptr) {
+    return nullptr;
+  }
+  // A pipeline whose pipelineState is still null after construction is treated as a failed
+  // build and is not handed out to the caller.
+  auto created = gpu->makeResource<D3D12RenderPipeline>(gpu, descriptor);
+  const bool usable = (created->pipelineState != nullptr);
+  return usable ? created : nullptr;
+}
+
+D3D12RenderPipeline::D3D12RenderPipeline(D3D12GPU* gpu,
+                                         const RenderPipelineDescriptor& descriptor) {
+  // Build order matters: the pipeline state description points at the root signature, so the
+  // PSO is only attempted once the root signature exists. Make() checks pipelineState afterwards
+  // to find out whether construction succeeded.
+  if (createRootSignature(gpu, descriptor)) {
+    createPipelineState(gpu, descriptor);
+  }
+}
+
+void D3D12RenderPipeline::onRelease(D3D12GPU*) {
+  pipelineState = nullptr;
+  rootSignature = nullptr;  // Only this pipeline's reference; the GPU cache may hold another.
+}
+
+uint32_t D3D12RenderPipeline::getUniformRootParameterIndex(unsigned binding) const {
+  const auto hit = uniformRootParameterIndex.find(binding);
+  return hit == uniformRootParameterIndex.end() ? UINT32_MAX : hit->second;
+}
+
+uint32_t D3D12RenderPipeline::getTextureRootParameterIndex(unsigned binding) const {
+  const auto hit = textureRootParameterIndex.find(binding);
+  return hit == textureRootParameterIndex.end() ? UINT32_MAX : hit->second;
+}
+
+uint32_t D3D12RenderPipeline::getSamplerRootParameterIndex(unsigned binding) const {
+  const auto hit = samplerRootParameterIndex.find(binding);
+  return hit == samplerRootParameterIndex.end() ? UINT32_MAX : hit->second;
+}
+
+unsigned D3D12RenderPipeline::getTextureIndex(unsigned binding) const {
+  const auto hit = textureUnits.find(binding);
+  return hit == textureUnits.end() ? binding : hit->second;
+}
+
+uint32_t D3D12RenderPipeline::getUniformBlockVisibility(unsigned binding) const {
+  const auto hit = uniformBlockVisibility.find(binding);
+  return hit == uniformBlockVisibility.end() ? ShaderVisibility::VertexFragment : hit->second;
+}
+
+bool D3D12RenderPipeline::createRootSignature(D3D12GPU* gpu,
+                                              const RenderPipelineDescriptor& descriptor) {
+  // Fills the per-binding index maps, then fetches the root signature from the GPU cache or
+  // builds a new one. The maps come first because every pipeline needs them, cache hit or not.
+  // Uniform blocks are walked before texture samplers, matching the root-parameter order below.
+  std::vector<uint8_t> shapeKey;
+  // Reserve roughly: 1 byte UBO count + 4 bytes per UBO (2 visibility + 1 vertex register +
+  // 1 fragment register) + 1 byte sampler count + 2 bytes per sampler (visibility).
+  shapeKey.reserve(2 + descriptor.layout.uniformBlocks.size() * 4 +
+                   descriptor.layout.textureSamplers.size() * 2);
+
+  // Pre-scan uniform blocks to compute, for every entry, its 0-based register index inside the
+  // vertex and fragment stages. SPIR-V binding K is mapped to HLSL register b{idx} where idx is
+  // the entry's position among same-stage entries in BindingLayout.uniformBlocks. Two entries
+  // visible to the same stage must therefore yield different register indices, and the indices
+  // must agree with what D3D12ShaderModule produces when it walks the SPIR-V resources of that
+  // stage in declaration order. 0xFF marks an entry that is not visible to that stage.
+  std::vector<uint8_t> ubVertexRegister(descriptor.layout.uniformBlocks.size(), 0xFF);
+  std::vector<uint8_t> ubFragmentRegister(descriptor.layout.uniformBlocks.size(), 0xFF);
+  uint32_t nextVertexRegister = 0;
+  uint32_t nextFragmentRegister = 0;
+  for (size_t i = 0; i < descriptor.layout.uniformBlocks.size(); i++) {
+    const auto& entry = descriptor.layout.uniformBlocks[i];
+    if (entry.visibility & ShaderVisibility::Vertex) {
+      ubVertexRegister[i] = static_cast<uint8_t>(nextVertexRegister++);
+    }
+    if (entry.visibility & ShaderVisibility::Fragment) {
+      ubFragmentRegister[i] = static_cast<uint8_t>(nextFragmentRegister++);
+    }
+  }
+
+  uint32_t paramCursor = 0;
+  shapeKey.push_back(static_cast<uint8_t>(descriptor.layout.uniformBlocks.size()));
+  for (size_t i = 0; i < descriptor.layout.uniformBlocks.size(); i++) {
+    const auto& entry = descriptor.layout.uniformBlocks[i];
+    uniformRootParameterIndex[entry.binding] = paramCursor++;
+    uniformBlockVisibility[entry.binding] = entry.visibility;
+    uniformBindingSet.insert(entry.binding);
+    // Encode visibility plus per-stage register indices in the shape key. Different stage-local
+    // register layouts must hit different cached root signatures; otherwise a pipeline whose
+    // fragment UBO ends up at b1 (because it has a sibling at b0) would reuse another
+    // pipeline's root signature that still places it at b0.
+    shapeKey.push_back(static_cast<uint8_t>(entry.visibility & 0xFF));
+    shapeKey.push_back(static_cast<uint8_t>((entry.visibility >> 8) & 0xFF));
+    shapeKey.push_back(ubVertexRegister[i]);
+    shapeKey.push_back(ubFragmentRegister[i]);
+  }
+
+  shapeKey.push_back(static_cast<uint8_t>(descriptor.layout.textureSamplers.size()));
+  unsigned textureUnit = 0;
+  for (const auto& entry : descriptor.layout.textureSamplers) {
+    uint32_t srvParamIndex = paramCursor++;
+    uint32_t samplerParamIndex = paramCursor++;
+    textureRootParameterIndex[entry.binding] = srvParamIndex;
+    samplerRootParameterIndex[entry.binding] = samplerParamIndex;
+    textureUnits[entry.binding] = textureUnit++;
+    textureBindingSet.insert(entry.binding);
+    // Encode each sampler binding's visibility into the shape key. Without this two pipelines
+    // that differ only in vertex/fragment-only sampler visibility would collide on the cached
+    // root signature once the SRV/Sampler root parameters below honour entry.visibility.
+    shapeKey.push_back(static_cast<uint8_t>(entry.visibility & 0xFF));
+    shapeKey.push_back(static_cast<uint8_t>((entry.visibility >> 8) & 0xFF));
+  }
+
+  // Cache hit: reuse the existing D3D12 root signature object. Different pipelines sharing the
+  // same binding shape (e.g. all single-texture fragment-only shaders) end up referencing one
+  // ID3D12RootSignature, saving SerializeRootSignature + CreateRootSignature on every PSO.
+  if (auto cached = gpu->findRootSignature(shapeKey); cached != nullptr) {
+    rootSignature = std::move(cached);
+    return true;
+  }
+
+  // Cache miss: build the root signature description from scratch and serialise it.
+  std::vector<D3D12_ROOT_PARAMETER> rootParameters;
+  // Each texture binding contributes two descriptor tables (SRV + Sampler) that live in different
+  // descriptor heap types. They cannot share one D3D12_ROOT_PARAMETER because each table can only
+  // reference a single heap. We therefore store each range in its own array entry; the
+  // D3D12_ROOT_PARAMETER references the array by pointer, so the storage must outlive the
+  // SerializeRootSignature call. reserve() keeps pointers stable across emplace_back().
+  std::vector<D3D12_DESCRIPTOR_RANGE> srvRanges;
+  std::vector<D3D12_DESCRIPTOR_RANGE> samplerRanges;
+  srvRanges.reserve(descriptor.layout.textureSamplers.size());
+  samplerRanges.reserve(descriptor.layout.textureSamplers.size());
+
+  for (size_t i = 0; i < descriptor.layout.uniformBlocks.size(); i++) {
+    const auto& entry = descriptor.layout.uniformBlocks[i];
+    D3D12_ROOT_PARAMETER param = {};
+    param.ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV;
+    param.ShaderVisibility = ToD3D12ShaderVisibility(entry.visibility);
+    // ShaderRegister is per-stage in HLSL. Pick the register index from whichever stage the
+    // entry is visible to; for VertexFragment-visible UBOs the two stage-local indices must
+    // match, otherwise the single CBV root parameter cannot satisfy both stages with one
+    // register number. Such a configuration is rejected here so the mismatch surfaces early
+    // instead of producing silently broken bindings.
+    uint8_t vsReg = ubVertexRegister[i];
+    uint8_t fsReg = ubFragmentRegister[i];
+    if (vsReg != 0xFF && fsReg != 0xFF && vsReg != fsReg) {
+      LOGE(
+          "D3D12RenderPipeline: VertexFragment-visible UBO binding %u cannot share a single CBV "
+          "root parameter when its vertex-stage register (b%u) and fragment-stage register (b%u) "
+          "differ. Either split it into vertex-only and fragment-only entries, or extend the "
+          "root signature to emit two CBV root parameters for this binding.",
+          entry.binding, static_cast<unsigned>(vsReg), static_cast<unsigned>(fsReg));
+      return false;
+    }
+    param.Descriptor.ShaderRegister = (vsReg != 0xFF) ? vsReg : fsReg;
+    param.Descriptor.RegisterSpace = 0;
+    rootParameters.push_back(param);
+  }
+
+  unsigned rangeRegister = 0;
+  for (const auto& entry : descriptor.layout.textureSamplers) {
+    auto& srvRange = srvRanges.emplace_back();
+    srvRange = {};
+    srvRange.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV;
+    srvRange.NumDescriptors = 1;
+    srvRange.BaseShaderRegister = rangeRegister;
+    srvRange.RegisterSpace = 0;
+    srvRange.OffsetInDescriptorsFromTableStart = 0;
+
+    D3D12_ROOT_PARAMETER srvParam = {};
+    srvParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
+    // Honour the caller-declared visibility instead of forcing pixel-only. Vertex texture
+    // sampling (noise / displacement / geometry LOD lookups) needs SRVs visible to the vertex
+    // stage; the per-entry shapeKey above already partitions the cache so different visibility
+    // shapes do not collide.
+    srvParam.ShaderVisibility = ToD3D12ShaderVisibility(entry.visibility);
+    srvParam.DescriptorTable.NumDescriptorRanges = 1;
+    srvParam.DescriptorTable.pDescriptorRanges = &srvRange;
+    rootParameters.push_back(srvParam);
+
+    auto& samplerRange = samplerRanges.emplace_back();
+    samplerRange = {};
+    samplerRange.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER;
+    samplerRange.NumDescriptors = 1;
+    samplerRange.BaseShaderRegister = rangeRegister;
+    samplerRange.RegisterSpace = 0;
+    samplerRange.OffsetInDescriptorsFromTableStart = 0;
+
+    D3D12_ROOT_PARAMETER samplerParam = {};
+    samplerParam.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
+    samplerParam.ShaderVisibility = ToD3D12ShaderVisibility(entry.visibility);
+    samplerParam.DescriptorTable.NumDescriptorRanges = 1;
+    samplerParam.DescriptorTable.pDescriptorRanges = &samplerRange;
+    rootParameters.push_back(samplerParam);
+
+    rangeRegister++;
+  }
+
+  D3D12_ROOT_SIGNATURE_DESC rootSigDesc = {};
+  rootSigDesc.NumParameters = static_cast<UINT>(rootParameters.size());
+  rootSigDesc.pParameters = rootParameters.empty() ? nullptr : rootParameters.data();
+  rootSigDesc.NumStaticSamplers = 0;
+  rootSigDesc.pStaticSamplers = nullptr;
+  rootSigDesc.Flags = D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT;
+
+  ComPtr<ID3DBlob> blob = nullptr;
+  ComPtr<ID3DBlob> errorBlob = nullptr;
+  auto hr =
+      D3D12SerializeRootSignature(&rootSigDesc, D3D_ROOT_SIGNATURE_VERSION_1, &blob, &errorBlob);
+  if (FAILED(hr)) {
+    if (errorBlob != nullptr) {
+      LOGE("D3D12RenderPipeline: D3D12SerializeRootSignature failed (HRESULT=0x%08X): %s",
+           static_cast<unsigned>(hr), static_cast<const char*>(errorBlob->GetBufferPointer()));
+    } else {
+      LOGE("D3D12RenderPipeline: D3D12SerializeRootSignature failed (HRESULT=0x%08X).",
+           static_cast<unsigned>(hr));
+    }
+    return false;
+  }
+
+  hr = gpu->device()->CreateRootSignature(0, blob->GetBufferPointer(), blob->GetBufferSize(),
+                                          IID_PPV_ARGS(&rootSignature));
+  if (FAILED(hr)) {
+    LOGE("D3D12RenderPipeline: CreateRootSignature failed (HRESULT=0x%08X).",
+         static_cast<unsigned>(hr));
+    rootSignature = nullptr;
+    return false;
+  }
+  // Publish the freshly-built root signature so subsequent pipelines with the same shape hit
+  // the cache. The map keeps an additional ComPtr reference; the pipeline retains its own
+  // reference via the rootSignature member, so the object outlives whichever owner is dropped
+  // first.
+  gpu->cacheRootSignature(std::move(shapeKey), rootSignature);
+  return true;
+}
+
+bool D3D12RenderPipeline::createPipelineState(D3D12GPU* gpu,
+                                              const RenderPipelineDescriptor& descriptor) {
+  if (!descriptor.vertex.module || !descriptor.fragment.module) {
+    LOGE("D3D12RenderPipeline: vertex or fragment shader module is missing.");
+    return false;
+  }
+  auto vertexShader = std::static_pointer_cast<D3D12ShaderModule>(descriptor.vertex.module);
+  auto fragmentShader = std::static_pointer_cast<D3D12ShaderModule>(descriptor.fragment.module);
+  auto vsBytecode = vertexShader->shaderBytecode();
+  auto psBytecode = fragmentShader->shaderBytecode();
+  if (vsBytecode.pShaderBytecode == nullptr || psBytecode.pShaderBytecode == nullptr) {
+    LOGE("D3D12RenderPipeline: shader module produced empty bytecode.");
+    return false;
+  }
+
+  // Vertex input layout. Semantic names match the SPIRV-Cross HLSL convention of TEXCOORD{N},
+  // where N is the SPIR-V input location assigned by ShaderCompiler::PreprocessGLSL().
+  std::vector<D3D12_INPUT_ELEMENT_DESC> inputElements;
+  uint32_t globalLocation = 0;
+  vertexStrides.assign(descriptor.vertex.bufferLayouts.size(), 0);
+  for (uint32_t i = 0; i < static_cast<uint32_t>(descriptor.vertex.bufferLayouts.size()); i++) {
+    const auto& layout = descriptor.vertex.bufferLayouts[i];
+    uint32_t offset = 0;
+    for (const auto& attr : layout.attributes) {
+      D3D12_INPUT_ELEMENT_DESC element = {};
+      element.SemanticName = "TEXCOORD";
+      element.SemanticIndex = globalLocation++;
+      element.Format = ToD3D12VertexFormat(attr.format());
+      element.InputSlot = i;
+      element.AlignedByteOffset = offset;
+      element.InputSlotClass = (layout.stepMode == VertexStepMode::Instance)
+                                   ? D3D12_INPUT_CLASSIFICATION_PER_INSTANCE_DATA
+                                   : D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA;
+      element.InstanceDataStepRate = (layout.stepMode == VertexStepMode::Instance) ? 1 : 0;
+      inputElements.push_back(element);
+      offset += static_cast<uint32_t>(attr.size());
+    }
+    // Fall back to the computed attribute total when the descriptor leaves stride at zero, which
+    // is the same convention the Vulkan/Metal backends use.
+    vertexStrides[i] = static_cast<uint32_t>(layout.stride > 0 ? layout.stride : offset);
+  }
+
+  D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {};
+  psoDesc.pRootSignature = rootSignature.Get();
+  psoDesc.VS = vsBytecode;
+  psoDesc.PS = psBytecode;
+
+  // Blend state — one entry per color attachment.
+  psoDesc.BlendState.AlphaToCoverageEnable =
+      descriptor.multisample.alphaToCoverageEnabled ? TRUE : FALSE;
+  psoDesc.BlendState.IndependentBlendEnable =
+      (descriptor.fragment.colorAttachments.size() > 1) ? TRUE : FALSE;
+  for (size_t i = 0; i < descriptor.fragment.colorAttachments.size() &&
+                     i < D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT;
+       i++) {
+    const auto& ca = descriptor.fragment.colorAttachments[i];
+    auto& rt = psoDesc.BlendState.RenderTarget[i];
+    rt.BlendEnable = ca.blendEnable ? TRUE : FALSE;
+    rt.LogicOpEnable = FALSE;
+    rt.SrcBlend = ToD3D12BlendFactor(ca.srcColorBlendFactor);
+    rt.DestBlend = ToD3D12BlendFactor(ca.dstColorBlendFactor);
+    rt.BlendOp = ToD3D12BlendOperation(ca.colorBlendOp);
+    rt.SrcBlendAlpha = ToD3D12BlendFactorAlpha(ca.srcAlphaBlendFactor);
+    rt.DestBlendAlpha = ToD3D12BlendFactorAlpha(ca.dstAlphaBlendFactor);
+    rt.BlendOpAlpha = ToD3D12BlendOperation(ca.alphaBlendOp);
+    rt.LogicOp = D3D12_LOGIC_OP_NOOP;
+    rt.RenderTargetWriteMask = ToD3D12RenderTargetWriteMask(ca.colorWriteMask);
+  }
+
+  psoDesc.SampleMask = descriptor.multisample.mask;
+
+  // Rasterizer.
+  psoDesc.RasterizerState.FillMode = D3D12_FILL_MODE_SOLID;
+  psoDesc.RasterizerState.CullMode = ToD3D12CullMode(descriptor.primitive.cullMode);
+  psoDesc.RasterizerState.FrontCounterClockwise =
+      ToD3D12FrontCounterClockwise(descriptor.primitive.frontFace) ? TRUE : FALSE;
+  psoDesc.RasterizerState.DepthBias = D3D12_DEFAULT_DEPTH_BIAS;
+  psoDesc.RasterizerState.DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP;
+  psoDesc.RasterizerState.SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS;
+  psoDesc.RasterizerState.DepthClipEnable = TRUE;
+  psoDesc.RasterizerState.MultisampleEnable = (descriptor.multisample.count > 1) ? TRUE : FALSE;
+  psoDesc.RasterizerState.AntialiasedLineEnable = FALSE;
+  psoDesc.RasterizerState.ForcedSampleCount = 0;
+  psoDesc.RasterizerState.ConservativeRaster = D3D12_CONSERVATIVE_RASTERIZATION_MODE_OFF;
+
+  // Depth-stencil. Depth test follows the same enable predicate as VulkanRenderPipeline.
+  bool depthTestEnable = (descriptor.depthStencil.depthCompare != CompareFunction::Always) ||
+                         descriptor.depthStencil.depthWriteEnabled;
+  psoDesc.DepthStencilState.DepthEnable = depthTestEnable ? TRUE : FALSE;
+  psoDesc.DepthStencilState.DepthWriteMask = descriptor.depthStencil.depthWriteEnabled
+                                                 ? D3D12_DEPTH_WRITE_MASK_ALL
+                                                 : D3D12_DEPTH_WRITE_MASK_ZERO;
+  psoDesc.DepthStencilState.DepthFunc =
+      ToD3D12CompareFunction(descriptor.depthStencil.depthCompare);
+  psoDesc.DepthStencilState.StencilEnable =
+      HasNonTrivialStencilState(descriptor.depthStencil) ? TRUE : FALSE;
+  psoDesc.DepthStencilState.StencilReadMask =
+      static_cast<UINT8>(descriptor.depthStencil.stencilReadMask);
+  psoDesc.DepthStencilState.StencilWriteMask =
+      static_cast<UINT8>(descriptor.depthStencil.stencilWriteMask);
+  psoDesc.DepthStencilState.FrontFace.StencilFailOp =
+      ToD3D12StencilOperation(descriptor.depthStencil.stencilFront.failOp);
+  psoDesc.DepthStencilState.FrontFace.StencilDepthFailOp =
+      ToD3D12StencilOperation(descriptor.depthStencil.stencilFront.depthFailOp);
+  psoDesc.DepthStencilState.FrontFace.StencilPassOp =
+      ToD3D12StencilOperation(descriptor.depthStencil.stencilFront.passOp);
+  psoDesc.DepthStencilState.FrontFace.StencilFunc =
+      ToD3D12CompareFunction(descriptor.depthStencil.stencilFront.compare);
+  psoDesc.DepthStencilState.BackFace.StencilFailOp =
+      ToD3D12StencilOperation(descriptor.depthStencil.stencilBack.failOp);
+  psoDesc.DepthStencilState.BackFace.StencilDepthFailOp =
+      ToD3D12StencilOperation(descriptor.depthStencil.stencilBack.depthFailOp);
+  psoDesc.DepthStencilState.BackFace.StencilPassOp =
+      ToD3D12StencilOperation(descriptor.depthStencil.stencilBack.passOp);
+  psoDesc.DepthStencilState.BackFace.StencilFunc =
+      ToD3D12CompareFunction(descriptor.depthStencil.stencilBack.compare);
+
+  psoDesc.InputLayout.pInputElementDescs = inputElements.empty() ? nullptr : inputElements.data();
+  psoDesc.InputLayout.NumElements = static_cast<UINT>(inputElements.size());
+  // Strip cut and topology type live on the PSO in D3D12, but tgfx exposes IndexFormat and
+  // PrimitiveType as per-draw-call state (RenderPass::setIndexBuffer / RenderPass::draw)
+  // rather than fields on RenderPipelineDescriptor. The two values below therefore must be
+  // chosen at PSO creation time without knowing what the eventual draws look like, so we hard
+  // code them to the only combination tgfx ever uses:
+  //   * IBStripCutValue=DISABLED — matches Vulkan, which sets primitiveRestartEnable=false on
+  //     its PSOs. No tgfx draw op relies on 0xFFFF/0xFFFFFFFF restarting a strip.
+  //   * PrimitiveTopologyType=TRIANGLE — tgfx's PrimitiveType only carries Triangles and
+  //     TriangleStrip today and ToD3D12PrimitiveTopologyType already collapses both onto
+  //     TRIANGLE. Once tgfx adds LINE/POINT (or moves these fields onto PrimitiveDescriptor)
+  //     this branch must be revisited together with the matching IASetPrimitiveTopology call
+  //     in D3D12RenderPass; until then a single PSO topology type covers every draw call.
+  psoDesc.IBStripCutValue = D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_DISABLED;
+  psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
+  psoDesc.NumRenderTargets = static_cast<UINT>(descriptor.fragment.colorAttachments.size());
+  for (size_t i = 0; i < descriptor.fragment.colorAttachments.size() &&
+                     i < D3D12_SIMULTANEOUS_RENDER_TARGET_COUNT;
+       i++) {
+    psoDesc.RTVFormats[i] = static_cast<DXGI_FORMAT>(
+        gpu->getDXGIFormat(descriptor.fragment.colorAttachments[i].format));
+  }
+  psoDesc.DSVFormat =
+      (descriptor.depthStencil.format != PixelFormat::Unknown)
+          ? static_cast<DXGI_FORMAT>(gpu->getDXGIFormat(descriptor.depthStencil.format))
+          : static_cast<DXGI_FORMAT>(DXGI_FORMAT_UNKNOWN);
+  psoDesc.SampleDesc.Count = static_cast<UINT>(descriptor.multisample.count);
+  psoDesc.SampleDesc.Quality = 0;
+  psoDesc.NodeMask = 0;
+  psoDesc.Flags = D3D12_PIPELINE_STATE_FLAG_NONE;
+
+  auto hr = gpu->device()->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&pipelineState));
+  if (FAILED(hr)) {
+    LOGE("D3D12RenderPipeline: CreateGraphicsPipelineState failed (HRESULT=0x%08X).",
+         static_cast<unsigned>(hr));
+#ifdef TGFX_D3D12_DEBUG_LAYER
+    // Surface debug-layer messages so the underlying validation issue is visible. These are
+    // queued by the runtime when EnableDebugLayer was called before device creation.
+    ComPtr<ID3D12InfoQueue> infoQueue = nullptr;
+    if (SUCCEEDED(gpu->device()->QueryInterface(IID_PPV_ARGS(&infoQueue)))) {
+      auto count = infoQueue->GetNumStoredMessages();
+      for (UINT64 i = 0; i < count; i++) {
+        SIZE_T msgLen = 0;
+        infoQueue->GetMessage(i, nullptr, &msgLen);
+        std::vector<char> buf(msgLen);
+        auto* msg = reinterpret_cast<D3D12_MESSAGE*>(buf.data());
+        if (SUCCEEDED(infoQueue->GetMessage(i, msg, &msgLen))) {
+          LOGE("  D3D12 message: %.*s", static_cast<int>(msg->DescriptionByteLength),
+               msg->pDescription);
+        }
+      }
+      infoQueue->ClearStoredMessages();
+    }
+#endif
+    pipelineState = nullptr;
+    return false;
+  }
+  return true;
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12RenderPipeline.h b/src/gpu/d3d12/D3D12RenderPipeline.h
new file mode 100644
index 000000000..f1df24633
--- /dev/null
+++ b/src/gpu/d3d12/D3D12RenderPipeline.h
@@ -0,0 +1,146 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "D3D12Resource.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/RenderPipeline.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * D3D12 render pipeline implementation. Owns three D3D12 objects produced from a single
+ * RenderPipelineDescriptor:
+ *
+ *   1. Root signature  — equivalent to Vulkan's pipeline layout + descriptor set layout. Lays
+ *                        out where uniform buffers, textures, and samplers bind in the root
+ *                        argument table consumed by the command list.
+ *   2. Pipeline state object (PSO) — fixed-function + shader configuration; bound once via
+ *                        SetPipelineState() at the start of a draw sequence.
+ *   3. Binding metadata — lookup tables that translate user-facing binding numbers (the same
+ *                        numbers passed by GLSL programs) to root-parameter indices and texture
+ *                        unit ordinals consumed by D3D12RenderPass.
+ *
+ * Root signature layout produced for every pipeline (matches the SPIR-V -> HLSL register
+ * convention used by D3D12ShaderModule):
+ *
+ *   root parameter 0 : CBV  (b0, visibility = Vertex)        [VertexUniformBlock, optional]
+ *   root parameter 1 : CBV  (b0, visibility = Pixel)         [FragmentUniformBlock, optional]
+ *   root parameters 2.. : two consecutive DescriptorTables per texture sampler binding i,
+ *                         {SRV t{i}} then {Sampler s{i}}, both with visibility = Pixel
+ *
+ * UBO root parameters are CBVs with raw GPU virtual addresses, allowing the command queue to
+ * dynamically supply per-draw uniform data without re-allocating descriptor heaps.
+ */
+class D3D12RenderPipeline : public RenderPipeline, public D3D12Resource {
+ public:
+  static std::shared_ptr<D3D12RenderPipeline> Make(D3D12GPU* gpu,
+                                                   const RenderPipelineDescriptor& descriptor);
+
+  ID3D12RootSignature* d3d12RootSignature() const {
+    return rootSignature.Get();
+  }
+
+  ID3D12PipelineState* d3d12PipelineState() const {
+    return pipelineState.Get();
+  }
+
+  /**
+   * Returns the root-parameter index that holds the CBV for the given uniform-block binding,
+   * or UINT32_MAX if the binding is not present in the pipeline.
+   */
+  uint32_t getUniformRootParameterIndex(unsigned binding) const;
+
+  /**
+   * Returns the root-parameter index of the descriptor table holding the SRV for the given
+   * texture-sampler binding, or UINT32_MAX if the binding is not present. The Sampler descriptor
+   * table for the same binding is stored at the next consecutive root parameter and can be
+   * obtained with getSamplerRootParameterIndex().
+   */
+  uint32_t getTextureRootParameterIndex(unsigned binding) const;
+
+  /**
+   * Returns the root-parameter index of the descriptor table holding the Sampler for the given
+   * texture-sampler binding, or UINT32_MAX if the binding is not present.
+   */
+  uint32_t getSamplerRootParameterIndex(unsigned binding) const;
+
+  /**
+   * Returns the dense 0-based texture unit index for a texture-sampler binding. Mirrors the
+   * VulkanRenderPipeline accessor used by RenderPass to map binding -> shader register.
+   */
+  unsigned getTextureIndex(unsigned binding) const;
+
+  /**
+   * Returns the visibility bitmask (ShaderVisibility::*) declared by the user for a uniform-block
+   * binding, or ShaderVisibility::VertexFragment if unspecified.
+   */
+  uint32_t getUniformBlockVisibility(unsigned binding) const;
+
+  /**
+   * Returns the byte stride for the vertex buffer slot at the given index: the layout's declared
+   * stride, or the summed attribute sizes when that is 0. Returns 0 for out-of-range slots.
+   */
+  uint32_t getVertexStride(unsigned slot) const {
+    return slot < vertexStrides.size() ? vertexStrides[slot] : 0;
+  }
+
+  bool hasUniformBinding(unsigned binding) const {
+    return uniformBindingSet.count(binding) > 0;
+  }
+
+  bool hasTextureBinding(unsigned binding) const {
+    return textureBindingSet.count(binding) > 0;
+  }
+
+  const std::unordered_set<unsigned>& getTextureBindings() const {
+    return textureBindingSet;
+  }
+
+ protected:
+  void onRelease(D3D12GPU* gpu) override;
+
+ private:
+  D3D12RenderPipeline(D3D12GPU* gpu, const RenderPipelineDescriptor& descriptor);
+  ~D3D12RenderPipeline() override = default;
+
+  bool createRootSignature(D3D12GPU* gpu, const RenderPipelineDescriptor& descriptor);
+  bool createPipelineState(D3D12GPU* gpu, const RenderPipelineDescriptor& descriptor);
+
+  ComPtr<ID3D12RootSignature> rootSignature = nullptr;
+  ComPtr<ID3D12PipelineState> pipelineState = nullptr;
+
+  std::unordered_map<unsigned, uint32_t> uniformRootParameterIndex = {};
+  std::unordered_map<unsigned, uint32_t> textureRootParameterIndex = {};
+  std::unordered_map<unsigned, uint32_t> samplerRootParameterIndex = {};
+  std::unordered_map<unsigned, unsigned> textureUnits = {};
+  std::unordered_map<unsigned, uint32_t> uniformBlockVisibility = {};
+  std::unordered_set<unsigned> uniformBindingSet = {};
+  std::unordered_set<unsigned> textureBindingSet = {};
+  std::vector<uint32_t> vertexStrides = {};
+
+  friend class D3D12GPU;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Resource.cpp b/src/gpu/d3d12/D3D12Resource.cpp
new file mode 100644
index 000000000..b33e6236b
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Resource.cpp
@@ -0,0 +1,21 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12Resource.h"
+
+namespace tgfx {}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Resource.h b/src/gpu/d3d12/D3D12Resource.h
new file mode 100644
index 000000000..707abf10f
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Resource.h
@@ -0,0 +1,47 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <list>
+#include "core/utils/ReturnQueue.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * Base class for D3D12 GPU resources. Subclasses must implement the onRelease() method to free all
+ * underlying GPU resources. No D3D12 API calls should be made during destruction since the resource
+ * may be destroyed on any thread.
+ */
+class D3D12Resource : public ReturnNode {
+ protected:
+  /**
+   * Overridden to free the underlying D3D12 resources. After calling this method, the D3D12Resource
+   * must not be used, as doing so may lead to undefined behavior.
+   */
+  virtual void onRelease(D3D12GPU* gpu) = 0;
+
+ private:
+  std::list<D3D12Resource*>::iterator cachedPosition;  // TODO confirm: set by friend D3D12GPU?
+
+  friend class D3D12GPU;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Sampler.cpp b/src/gpu/d3d12/D3D12Sampler.cpp
new file mode 100644
index 000000000..0391ffa98
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Sampler.cpp
@@ -0,0 +1,78 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12Sampler.h"
+#include "D3D12GPU.h"
+
+namespace tgfx {
+
+std::shared_ptr<D3D12Sampler> D3D12Sampler::Make(D3D12GPU* gpu,
+                                                 const SamplerDescriptor& descriptor) {
+  if (!gpu) {
+    return nullptr;
+  }
+
+  // With mipmapping disabled, MaxLOD is clamped to 0 so that only mip 0 is ever sampled. A
+  // *_MIP_POINT filter on its own only describes the filter shape between mips and does not
+  // stop mip selection. VulkanSampler clamps maxLod the same way, which is what Strict-constraint
+  // paths such as RenderContext::drawImageRect intend when they pass mipmapMode=None.
+  const bool mipmapsDisabled = descriptor.mipmapMode == MipmapMode::None;
+
+  D3D12_SAMPLER_DESC desc = {};
+  desc.Filter = ToD3D12Filter(descriptor.minFilter, descriptor.magFilter, descriptor.mipmapMode);
+  desc.AddressU = ToD3D12AddressMode(descriptor.addressModeX);
+  desc.AddressV = ToD3D12AddressMode(descriptor.addressModeY);
+  desc.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
+  desc.MipLODBias = 0.0f;
+  desc.MaxAnisotropy = 1;
+  desc.ComparisonFunc = D3D12_COMPARISON_FUNC_NEVER;
+  desc.MinLOD = 0.0f;
+  desc.MaxLOD = D3D12_FLOAT32_MAX;
+  if (mipmapsDisabled) {
+    desc.MaxLOD = 0.0f;
+  }
+  // SamplerDescriptor has no border colour field today, so transparent black is hardcoded, just
+  // as VulkanSampler (VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK) and MetalSampler
+  // (MTLSamplerBorderColorTransparentBlack) do. If such a field is ever added, thread it through
+  // here and into D3D12GPU::MakeSamplerKey, otherwise samplers differing only in border colour
+  // would collide in samplerCache. Keep the three backends in sync.
+  for (auto& channel : desc.BorderColor) {
+    channel = 0.0f;
+  }
+
+  // The descriptor is written into a permanently reserved slot of the shader-visible Sampler
+  // heap; the slot is kept for the rest of the GPU's lifetime, mirroring the cache semantics of
+  // D3D12GPU::createSampler. A handle whose ptr is 0 is treated as failure.
+  auto slotHandle = gpu->allocatePermanentSamplerSlot(desc);
+  if (slotHandle.ptr == 0) {
+    return nullptr;
+  }
+
+  return gpu->makeResource<D3D12Sampler>(desc, slotHandle);
+}
+
+// Stores a copy of the descriptor and the GPU handle of its heap slot; makes no D3D12 calls.
+D3D12Sampler::D3D12Sampler(const D3D12_SAMPLER_DESC& desc, D3D12_GPU_DESCRIPTOR_HANDLE handle)
+    : _samplerDesc(desc), _gpuHandle(handle) {
+}
+
+void D3D12Sampler::onRelease(D3D12GPU*) {
+  // Nothing to free: a sampler is only a descriptor, and Make() reserves its heap slot for good.
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Sampler.h b/src/gpu/d3d12/D3D12Sampler.h
new file mode 100644
index 000000000..33f06c7ab
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Sampler.h
@@ -0,0 +1,69 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "D3D12Resource.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/Sampler.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * D3D12 sampler implementation.
+ *
+ * The sampler descriptor is written into the GPU's process-wide shader-visible Sampler heap at
+ * construction time and the resulting GPU descriptor handle is cached on the instance. That
+ * handle is what render passes bind via SetGraphicsRootDescriptorTable: there is no per-pass
+ * Sampler heap allocation or per-binding CreateSampler call.
+ */
+class D3D12Sampler : public Sampler, public D3D12Resource {
+ public:
+  static std::shared_ptr<D3D12Sampler> Make(D3D12GPU* gpu, const SamplerDescriptor& descriptor);
+
+  /**
+   * Returns the D3D12_SAMPLER_DESC that was passed to the constructor.
+   */
+  const D3D12_SAMPLER_DESC& samplerDesc() const {
+    return _samplerDesc;
+  }
+
+  /**
+   * GPU descriptor handle pointing at this sampler's slot in the process-wide shader-visible
+   * Sampler heap. Stable for the lifetime of the GPU instance.
+   */
+  D3D12_GPU_DESCRIPTOR_HANDLE gpuDescriptorHandle() const {
+    return _gpuHandle;
+  }
+
+ protected:
+  void onRelease(D3D12GPU* gpu) override;
+
+ private:
+  D3D12Sampler(const D3D12_SAMPLER_DESC& samplerDesc, D3D12_GPU_DESCRIPTOR_HANDLE gpuHandle);
+  ~D3D12Sampler() override = default;
+
+  D3D12_SAMPLER_DESC _samplerDesc = {};
+  D3D12_GPU_DESCRIPTOR_HANDLE _gpuHandle = {};
+
+  friend class D3D12GPU;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Semaphore.cpp b/src/gpu/d3d12/D3D12Semaphore.cpp
new file mode 100644
index 000000000..a134fffce
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Semaphore.cpp
@@ -0,0 +1,66 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12Semaphore.h"
+#include "D3D12GPU.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+std::shared_ptr<D3D12Semaphore> D3D12Semaphore::Make(D3D12GPU* gpu) {
+  if (!gpu) {
+    return nullptr;
+  }
+  ComPtr<ID3D12Fence> created;  // starts null; filled in by CreateFence with initial value 0
+  auto status = gpu->device()->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&created));
+  if (SUCCEEDED(status)) {
+    return gpu->makeResource<D3D12Semaphore>(std::move(created), static_cast<uint64_t>(0), false);
+  }
+  LOGE("D3D12Semaphore::Make() CreateFence failed: HRESULT=0x%08X", static_cast<unsigned>(status));
+  return nullptr;
+}
+
+std::shared_ptr<D3D12Semaphore> D3D12Semaphore::MakeFrom(D3D12GPU* gpu, ComPtr<ID3D12Fence> fence,
+                                                         uint64_t value) {
+  if (!(gpu != nullptr && fence != nullptr)) {  // both the GPU and the fence are mandatory
+    return nullptr;
+  }
+  return gpu->makeResource<D3D12Semaphore>(std::move(fence), value, /*adopted=*/true);
+}
+
+D3D12Semaphore::D3D12Semaphore(ComPtr<ID3D12Fence> fence, uint64_t value, bool adopted)
+    : _fence(std::move(fence)), _value(value), _adopted(adopted) {
+}
+
+BackendSemaphore D3D12Semaphore::getBackendSemaphore() const {
+  if (_fence.Get() == nullptr) {
+    return {};  // no fence held: hand back a default-constructed BackendSemaphore
+  }
+  D3D12SyncInfo syncInfo = {};
+  syncInfo.value = _value;
+  syncInfo.fence = _fence.Get();
+  return BackendSemaphore(syncInfo);
+}
+
+void D3D12Semaphore::onRelease(D3D12GPU*) {
+  // Adopted or not, _fence is our own COM reference (MakeFrom takes the ComPtr by value). Drop
+  // it here so Release() never runs from the destructor, which D3D12Resource forbids.
+  _fence = nullptr;
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Semaphore.h b/src/gpu/d3d12/D3D12Semaphore.h
new file mode 100644
index 000000000..2174583c5
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Semaphore.h
@@ -0,0 +1,73 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "D3D12Resource.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/Semaphore.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * D3D12 semaphore implementation backed by an ID3D12Fence and a target signal value. Once the
+ * fence reaches the target value, all GPU work submitted before the signal is guaranteed to have
+ * completed, providing the same semantics as a Vulkan timeline semaphore.
+ */
+class D3D12Semaphore : public Semaphore, public D3D12Resource {
+ public:
+  static std::shared_ptr<D3D12Semaphore> Make(D3D12GPU* gpu);
+
+  static std::shared_ptr<D3D12Semaphore> MakeFrom(D3D12GPU* gpu, ComPtr<ID3D12Fence> fence,
+                                                  uint64_t value);
+
+  D3D12Semaphore(ComPtr<ID3D12Fence> fence, uint64_t value, bool adopted);
+  ~D3D12Semaphore() override = default;
+
+  ID3D12Fence* d3d12Fence() const {
+    return _fence.Get();
+  }
+
+  uint64_t signalValue() const {
+    return _value;
+  }
+
+  uint64_t nextSignalValue() const {  // signalValue() + 1, without changing the stored value
+    return _value + 1;
+  }
+
+  void commitSignalValue() {  // bumps the stored value to what nextSignalValue() returned
+    ++_value;
+  }
+
+  BackendSemaphore getBackendSemaphore() const override;
+
+ protected:
+  void onRelease(D3D12GPU* gpu) override;
+
+ private:
+  ComPtr<ID3D12Fence> _fence = nullptr;
+  uint64_t _value = 0;
+  bool _adopted = false;
+
+  friend class D3D12GPU;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12ShaderModule.cpp b/src/gpu/d3d12/D3D12ShaderModule.cpp
new file mode 100644
index 000000000..dbd59c344
--- /dev/null
+++ b/src/gpu/d3d12/D3D12ShaderModule.cpp
@@ -0,0 +1,174 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12ShaderModule.h"
+#include <shaderc/shaderc.hpp>
+#include "D3D12GPU.h"
+#include "core/utils/Log.h"
+#include "gpu/ShaderCompiler.h"
+#include "gpu/UniformData.h"
+// Suppress warnings from SPIRV-Cross headers
+#pragma warning(push)
+#pragma warning(disable : 4100 4458 4245 4127 4244)
+#include <spirv_hlsl.hpp>
+#include <spirv_parser.hpp>
+#pragma warning(pop)
+
+namespace tgfx {
+
+// Convert a SPIR-V binary to HLSL source code suitable for D3DCompile with profile vs_5_0/ps_5_0.
+//
+// Binding strategy:
+//   - UBOs are walked in the order SPIRV-Cross returns them — which is GLSL declaration order,
+//     and therefore matches the BindingLayout::uniformBlocks order seen by the pipeline side —
+//     and assigned consecutive CBV registers b0, b1, ... within this single stage. HLSL register
+//     namespaces are per-stage so the vertex stage and the pixel stage have independent b0
+//     packings; D3D12RenderPipeline's root signature mirrors this by giving each entry a
+//     stage-local register index.
+//   - Sampled images at SPIR-V bindings 2..N are mapped to (t{N-2}, s{N-2}). Shifting by
+//     TEXTURE_BINDING_POINT_START keeps the t/s register space dense starting at zero, which
+//     simplifies root-signature construction.
+static std::string convertSPIRVToHLSL(const std::vector<uint32_t>& spirvBinary, ShaderStage stage) {
+  spirv_cross::Parser parser(spirvBinary.data(), spirvBinary.size());
+  parser.parse();
+  spirv_cross::CompilerHLSL compiler(std::move(parser.get_parsed_ir()));
+
+  // Target shader model 5.0 so the generated HLSL fits the vs_5_0 / ps_5_0 compile profiles.
+  spirv_cross::CompilerHLSL::Options hlslOptions;
+  hlslOptions.shader_model = 50;
+  compiler.set_hlsl_options(hlslOptions);
+
+  // Compensate for HLSL's clip-space Y direction matching Vulkan/GL after our standard flip.
+  auto commonOptions = compiler.get_common_options();
+  commonOptions.vertex.flip_vert_y = true;
+  compiler.set_common_options(commonOptions);
+
+  const bool isVertex = (stage == ShaderStage::Vertex);
+  const auto executionModel = isVertex ? spv::ExecutionModelVertex : spv::ExecutionModelFragment;
+  const auto textureBase = static_cast<uint32_t>(TEXTURE_BINDING_POINT_START);
+
+  // Pre-fills an entry with the stage and the resource's SPIR-V (set, binding) decorations.
+  auto describe = [&](const spirv_cross::Resource& resource) {
+    spirv_cross::HLSLResourceBinding entry = {};
+    entry.stage = executionModel;
+    entry.binding = compiler.get_decoration(resource.id, spv::DecorationBinding);
+    entry.desc_set = compiler.get_decoration(resource.id, spv::DecorationDescriptorSet);
+    return entry;
+  };
+
+  auto shaderResources = compiler.get_shader_resources();
+
+  // Uniform buffers: SPIRV-Cross reports them in GLSL declaration order (the same order as
+  // BindingLayout's uniformBlocks on the pipeline side), so handing out b0, b1, ... in that
+  // order gives this stage the dense, stage-local CBV packing that
+  // D3D12RenderPipeline::createRootSignature assigns to its CBV root parameters.
+  uint32_t nextCbvRegister = 0;
+  for (const auto& uniformBuffer : shaderResources.uniform_buffers) {
+    auto entry = describe(uniformBuffer);
+    entry.cbv.register_binding = nextCbvRegister++;
+    entry.cbv.register_space = 0;
+    compiler.add_hlsl_resource_binding(entry);
+  }
+
+  // Combined samplers: SPIR-V binding N maps to (t{N - TEXTURE_BINDING_POINT_START},
+  // s{N - TEXTURE_BINDING_POINT_START}); bindings below the start point keep their own index.
+  for (const auto& sampledImage : shaderResources.sampled_images) {
+    auto entry = describe(sampledImage);
+    uint32_t slot = entry.binding;
+    if (slot >= textureBase) {
+      slot -= textureBase;
+    }
+    entry.srv.register_binding = slot;
+    entry.srv.register_space = 0;
+    entry.sampler.register_binding = slot;
+    entry.sampler.register_space = 0;
+    compiler.add_hlsl_resource_binding(entry);
+  }
+
+  std::string hlslSource = compiler.compile();
+  if (hlslSource.empty()) {
+    LOGE("D3D12ShaderModule: SPIR-V to HLSL conversion produced empty source.");
+  }
+  return hlslSource;
+}
+
+// Compile HLSL source to a DXBC bytecode blob using D3DCompile with the appropriate stage profile.
+static ComPtr<ID3DBlob> compileHLSLToDXBC(const std::string& hlsl, ShaderStage stage) {
+  // Debug builds keep debug info and skip optimisation; other builds optimise at level 3.
+#ifdef _DEBUG
+  const UINT compileFlags =
+      D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_DEBUG | D3DCOMPILE_SKIP_OPTIMIZATION;
+#else
+  const UINT compileFlags = D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_OPTIMIZATION_LEVEL3;
+#endif
+  const char* profile = (stage == ShaderStage::Vertex) ? "vs_5_0" : "ps_5_0";
+  ComPtr<ID3DBlob> dxbc = nullptr;
+  ComPtr<ID3DBlob> messages = nullptr;
+  const HRESULT result = D3DCompile(hlsl.data(), hlsl.size(), nullptr, nullptr, nullptr, "main",
+                                    profile, compileFlags, 0, &dxbc, &messages);
+  if (SUCCEEDED(result)) {
+    return dxbc;
+  }
+  const auto code = static_cast<unsigned>(result);
+  if (messages == nullptr) {
+    LOGE("D3D12ShaderModule: D3DCompile failed (HRESULT=0x%08X) with no error message.", code);
+  } else {
+    LOGE("D3D12ShaderModule: D3DCompile failed (HRESULT=0x%08X): %s", code,
+         static_cast<const char*>(messages->GetBufferPointer()));
+  }
+  LOGE("D3D12ShaderModule: HLSL source (first 1024 chars):\n%.1024s", hlsl.c_str());
+  return nullptr;
+}
+
+std::shared_ptr<D3D12ShaderModule> D3D12ShaderModule::Make(
+    D3D12GPU* gpu, const ShaderModuleDescriptor& descriptor) {
+  if (gpu == nullptr) {
+    return nullptr;
+  }
+  // The constructor runs the whole GLSL -> SPIR-V -> HLSL -> DXBC chain and leaves bytecode
+  // null when any step fails, so a module without bytecode is reported to the caller as a
+  // failed creation.
+  auto shaderModule = gpu->makeResource<D3D12ShaderModule>(gpu, descriptor);
+  return (shaderModule->bytecode != nullptr) ? shaderModule : nullptr;
+}
+
+D3D12ShaderModule::D3D12ShaderModule(D3D12GPU* gpu, const ShaderModuleDescriptor& descriptor)
+    : _stage(descriptor.stage) {
+  // D3D12 needs every declared interface variable to survive — see ShaderCompiler.h.
+  std::string preprocessed = PreprocessGLSL(descriptor.code);
+  auto spirv = CompileGLSLToSPIRV(gpu->shaderCompiler(), preprocessed, _stage, true);
+  if (spirv.empty()) {
+    LOGE("D3D12ShaderModule: GLSL to SPIR-V compilation failed.");
+    return;
+  }
+  // Cross-compile to HLSL, then to DXBC; bytecode stays null if either step fails.
+  std::string hlslText = convertSPIRVToHLSL(spirv, _stage);
+  if (!hlslText.empty()) {
+    bytecode = compileHLSLToDXBC(hlslText, _stage);
+#ifdef TGFX_D3D12_DEBUG_LAYER
+    _hlslSource = std::move(hlslText);
+#endif
+  }
+}
+
+void D3D12ShaderModule::onRelease(D3D12GPU*) {
+  // Drop our ComPtr reference to the ID3DBlob holding the bytecode; the GPU argument is unused.
+  bytecode = nullptr;
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12ShaderModule.h b/src/gpu/d3d12/D3D12ShaderModule.h
new file mode 100644
index 000000000..f674d6a74
--- /dev/null
+++ b/src/gpu/d3d12/D3D12ShaderModule.h
@@ -0,0 +1,100 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <d3dcompiler.h>
+#include <string>
+#include "D3D12Resource.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/ShaderModule.h"
+#include "tgfx/gpu/ShaderStage.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * D3D12 shader module implementation.
+ *
+ * Compilation pipeline (matching the GLSL-first design used by Vulkan and Metal backends):
+ *   1. PreprocessGLSL — assigns explicit binding/location qualifiers (shared with Vulkan/Metal).
+ *   2. CompileGLSLToSPIRV — uses shaderc to produce SPIR-V (shared with Vulkan/Metal).
+ *   3. SPIR-V -> HLSL — uses spirv_cross::CompilerHLSL targeting shader model 5.0.
+ *   4. HLSL -> DXBC — uses D3DCompile with profile vs_5_0 / ps_5_0.
+ *
+ * The resulting DXBC blob is consumed by D3D12RenderPipeline via shaderBytecode().
+ *
+ * Resource binding mapping (SPIR-V binding -> HLSL register), applied explicitly per stage:
+ *   - uniform blocks -> b0, b1, ... in the order SPIRV-Cross reports them for that stage, so
+ *     e.g. VertexUniformBlock (binding 0) and FragmentUniformBlock (binding 1) both land on b0.
+ *     HLSL b/t/s registers are per shader stage; both stages can use b0 without colliding
+ *     because the D3D12 root signature distinguishes them via ShaderVisibility.
+ *   - sampler bindings (binding N >= 2) -> t{N-2} + s{N-2}
+ *
+ * Both remappings are registered through add_hlsl_resource_binding() in D3D12ShaderModule.cpp;
+ * SPIRV-Cross still picks the CBV/SRV/Sampler register class from the SPIR-V resource type, so
+ * only the register indices are customised.
+ */
+class D3D12ShaderModule : public ShaderModule, public D3D12Resource {
+ public:
+  static std::shared_ptr<D3D12ShaderModule> Make(D3D12GPU* gpu,
+                                                 const ShaderModuleDescriptor& descriptor);
+
+  /**
+   * Returns the compiled DXBC bytecode in the form expected by D3D12 pipeline state descriptors.
+   * The returned struct references memory owned by this object; its lifetime is bound to the
+   * lifetime of the D3D12ShaderModule.
+   */
+  D3D12_SHADER_BYTECODE shaderBytecode() const {
+    if (bytecode == nullptr) {
+      return {nullptr, 0};
+    }
+    return {bytecode->GetBufferPointer(), bytecode->GetBufferSize()};
+  }
+
+  ShaderStage stage() const {
+    return _stage;
+  }
+
+#ifdef TGFX_D3D12_DEBUG_LAYER
+  /// Returns the cross-compiled HLSL source captured during construction. Diagnostic-only:
+  /// available only when TGFX_D3D12_DEBUG_LAYER is defined so production builds don't pay the
+  /// memory cost of holding HLSL strings.
+  const std::string& hlslSource() const {
+    return _hlslSource;
+  }
+#endif
+
+ protected:
+  void onRelease(D3D12GPU* gpu) override;
+
+ private:
+  D3D12ShaderModule(D3D12GPU* gpu, const ShaderModuleDescriptor& descriptor);
+  ~D3D12ShaderModule() override = default;
+
+  ShaderStage _stage = ShaderStage::Vertex;
+  ComPtr<ID3DBlob> bytecode = nullptr;
+#ifdef TGFX_D3D12_DEBUG_LAYER
+  std::string _hlslSource;
+#endif
+
+  friend class D3D12GPU;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Texture.cpp b/src/gpu/d3d12/D3D12Texture.cpp
new file mode 100644
index 000000000..54550ecbe
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Texture.cpp
@@ -0,0 +1,194 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12Texture.h"
+#include "D3D12Defines.h"
+#include "D3D12GPU.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+// Maps TextureUsage bits to creation flags; only RENDER_ATTACHMENT has a counterpart here.
+static D3D12_RESOURCE_FLAGS ToD3D12ResourceFlags(uint32_t usage) {
+  if (usage & TextureUsage::RENDER_ATTACHMENT) {
+    return D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET;
+  }
+  return D3D12_RESOURCE_FLAG_NONE;
+}
+
+std::shared_ptr<D3D12Texture> D3D12Texture::Make(D3D12GPU* gpu,
+                                                 const TextureDescriptor& descriptor) {
+  const bool hasValidSize = descriptor.width > 0 && descriptor.height > 0;
+  if (gpu == nullptr || !hasValidSize) {
+    return nullptr;
+  }
+
+  // Multisampled resources may not carry D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, yet every
+  // mipmapped texture needs that flag so the compute mipmap generator can write each
+  // downsampled level. Refuse the combination here instead of letting CreateCommittedResource
+  // fail with a runtime debug-layer error. The GL backend enforces the same contract in
+  // GLMultisampleTexture::MakeFrom; Metal's MTLTextureType2DMultisample type cannot carry mip
+  // levels at all and so needs no explicit check.
+  const bool isMipmapped = descriptor.mipLevelCount > 1;
+  if (isMipmapped && descriptor.sampleCount > 1) {
+    LOGE(
+        "D3D12Texture::Make() multisample textures cannot have mip levels (mipLevelCount=%d, "
+        "sampleCount=%d).",
+        descriptor.mipLevelCount, descriptor.sampleCount);
+    return nullptr;
+  }
+
+  // Ask the GPU for the DXGI equivalent; DXGI_FORMAT_UNKNOWN marks an unsupported format.
+  const auto dxgiFormat = static_cast<DXGI_FORMAT>(gpu->getDXGIFormat(descriptor.format));
+  if (dxgiFormat == static_cast<DXGI_FORMAT>(DXGI_FORMAT_UNKNOWN)) {
+    LOGE("D3D12Texture::Make() unsupported pixel format: %d", static_cast<int>(descriptor.format));
+    return nullptr;
+  }
+
+  const bool isDepthStencil = (descriptor.format == PixelFormat::DEPTH24_STENCIL8);
+  D3D12_RESOURCE_FLAGS creationFlags = ToD3D12ResourceFlags(descriptor.usage);
+  if (isDepthStencil) {
+    // Depth-stencil resources trade the render-target capability for the depth-stencil one.
+    creationFlags &= ~D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET;
+    creationFlags |= D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL;
+  } else {
+    // ALLOW_RENDER_TARGET has to be present at creation time before any CreateRenderTargetView
+    // call on the resource is legal. Vulkan and Metal can derive render-target capability
+    // lazily, so callers commonly create textures with the default usage (TEXTURE_BINDING) and
+    // wrap them later via Surface::MakeFrom(context, backendTexture, ...). To keep that path
+    // working here, the flag is enabled for every non-depth, renderable colour format. The
+    // cost is marginal (some drivers skip a sampling-only compression path) and it avoids hard
+    // device-removal when a sampled texture is later asked to act as a render target.
+    if (gpu->isFormatRenderable(descriptor.format)) {
+      creationFlags |= D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET;
+    }
+    // generateMipmapsForTexture() downsamples mip[i] -> mip[i+1] with compute-shader UAV
+    // writes, so mipmapped textures must allow unordered access. The flag is a no-op for the
+    // basic sampling path and only adds a small driver-internal alignment overhead.
+    if (isMipmapped) {
+      creationFlags |= D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
+    }
+  }
+
+  // Describe a single 2D texture (array size 1) with the requested mips, samples and flags.
+  D3D12_RESOURCE_DESC textureDesc = {};
+  textureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
+  textureDesc.Format = dxgiFormat;
+  textureDesc.Width = static_cast<UINT64>(descriptor.width);
+  textureDesc.Height = static_cast<UINT>(descriptor.height);
+  textureDesc.DepthOrArraySize = 1;
+  textureDesc.MipLevels = static_cast<UINT16>(descriptor.mipLevelCount);
+  textureDesc.SampleDesc.Count = static_cast<UINT>(descriptor.sampleCount);
+  textureDesc.SampleDesc.Quality = 0;
+  textureDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
+  textureDesc.Flags = creationFlags;
+
+  // An optimised clear value lets D3D12 fast-path ClearRenderTargetView /
+  // ClearDepthStencilView when the runtime-supplied clear matches it. The clear colour is not
+  // known at creation time (callers vary, e.g. RGBA transparent for offscreen surfaces, white
+  // for blur seed), so colour render targets pass nullptr: the slow-but-deterministic clear
+  // path is preferable to a perpetual "clear values do not match" debug-layer warning that
+  // some drivers also turn into a stalled GPU clear. Depth-stencil keeps an optimised value
+  // because the test suite uses a single canonical (0.0 depth, 0 stencil) clear.
+  D3D12_CLEAR_VALUE depthStencilClear = {};
+  depthStencilClear.Format = dxgiFormat;
+  depthStencilClear.DepthStencil.Depth = 0.0f;
+  depthStencilClear.DepthStencil.Stencil = 0;
+  const D3D12_CLEAR_VALUE* optimizedClear = isDepthStencil ? &depthStencilClear : nullptr;
+
+  D3D12_HEAP_PROPERTIES defaultHeap = {};
+  defaultHeap.Type = D3D12_HEAP_TYPE_DEFAULT;
+
+  // The resource is created in the COMMON state, which matches the default value of
+  // D3D12Texture::_currentState.
+  const D3D12_RESOURCE_STATES initialState = D3D12_RESOURCE_STATE_COMMON;
+  ComPtr<ID3D12Resource> createdResource = nullptr;
+  const HRESULT result = gpu->device()->CreateCommittedResource(
+      &defaultHeap, D3D12_HEAP_FLAG_NONE, &textureDesc, initialState, optimizedClear,
+      IID_PPV_ARGS(&createdResource));
+  if (FAILED(result)) {
+    LOGE("D3D12Texture::Make() CreateCommittedResource failed, HRESULT=0x%08X",
+         static_cast<unsigned>(result));
+    return nullptr;
+  }
+
+  return gpu->makeResource<D3D12Texture>(descriptor, std::move(createdResource),
+                                         static_cast<unsigned>(dxgiFormat));
+}
+
+std::shared_ptr<D3D12Texture> D3D12Texture::MakeFrom(D3D12GPU* gpu, ComPtr<ID3D12Resource> resource,
+                                                     unsigned dxgiFormat, uint32_t usage,
+                                                     bool /*adopted*/) {
+  // `adopted` is deliberately unused. COM reference counting removes the distinction that
+  // Vulkan/Metal draw between "tgfx must explicitly destroy" and "caller keeps owning it":
+  // the ComPtr<ID3D12Resource> parameter always carries its own AddRef/Release pair.
+  //   * adopted == true  : the caller hands over its reference, so releasing the ComPtr when
+  //     the D3D12Texture goes away drops the count to zero and the runtime destroys the resource.
+  //   * adopted == false : the caller keeps its reference and we hold one more, so the resource
+  //     stays valid for the wrapper's whole lifetime, as GPU::importBackendTexture requires.
+  // The cleanup is the same in both cases, hence the single code path.
+  if (gpu == nullptr || resource == nullptr) {
+    return nullptr;
+  }
+
+  // Mirror the native resource description into the backend-agnostic descriptor.
+  const auto nativeDesc = resource->GetDesc();
+  TextureDescriptor wrapped = {};
+  wrapped.usage = usage;
+  wrapped.format = DXGIFormatToPixelFormat(dxgiFormat);
+  wrapped.width = static_cast<int>(nativeDesc.Width);
+  wrapped.height = static_cast<int>(nativeDesc.Height);
+  wrapped.mipLevelCount = static_cast<int>(nativeDesc.MipLevels);
+  wrapped.sampleCount = static_cast<int>(nativeDesc.SampleDesc.Count);
+  return gpu->makeResource<D3D12Texture>(wrapped, std::move(resource), dxgiFormat);
+}
+
+// Stores the resource reference and DXGI format; _currentState keeps its in-class default.
+D3D12Texture::D3D12Texture(const TextureDescriptor& descriptor,
+                           ComPtr<ID3D12Resource> d3d12Resource, unsigned dxgiFormat)
+    : Texture(descriptor), resource(std::move(d3d12Resource)), _dxgiFormat(dxgiFormat) {}
+
+void D3D12Texture::onRelease(D3D12GPU*) {
+  onReleaseTexture();
+}
+// Virtual release hook; this base implementation just drops the ComPtr reference.
+void D3D12Texture::onReleaseTexture() {
+  resource = nullptr;
+}
+
+BackendTexture D3D12Texture::getBackendTexture() const {
+  if (resource == nullptr || (descriptor.usage & TextureUsage::TEXTURE_BINDING) == 0) {
+    return {};
+  }
+  D3D12TextureInfo textureInfo = {};
+  textureInfo.format = _dxgiFormat;
+  textureInfo.resource = resource.Get();
+  return BackendTexture(textureInfo, descriptor.width, descriptor.height);
+}
+
+BackendRenderTarget D3D12Texture::getBackendRenderTarget() const {
+  if (resource == nullptr || (descriptor.usage & TextureUsage::RENDER_ATTACHMENT) == 0) {
+    return {};
+  }
+  D3D12TextureInfo targetInfo = {};
+  targetInfo.format = _dxgiFormat;
+  targetInfo.resource = resource.Get();
+  return BackendRenderTarget(targetInfo, descriptor.width, descriptor.height);
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Texture.h b/src/gpu/d3d12/D3D12Texture.h
new file mode 100644
index 000000000..a9f47fd3d
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Texture.h
@@ -0,0 +1,93 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include "D3D12Resource.h"
+#include "D3D12Util.h"
+#include "tgfx/gpu/Texture.h"
+
+namespace tgfx {
+
+class D3D12GPU;
+
+/**
+ * D3D12 texture backed by an ID3D12Resource, plus a CPU-side record of its resource state.
+ */
+class D3D12Texture : public Texture, public D3D12Resource {
+ public:
+  static std::shared_ptr<D3D12Texture> Make(D3D12GPU* gpu, const TextureDescriptor& descriptor);
+
+  /**
+   * Wraps an external D3D12 resource. `adopted` is accepted but ignored (see D3D12Texture.cpp).
+   */
+  static std::shared_ptr<D3D12Texture> MakeFrom(D3D12GPU* gpu, ComPtr<ID3D12Resource> resource,
+                                                unsigned dxgiFormat, uint32_t usage, bool adopted);
+
+  /**
+   * Returns the underlying D3D12 resource.
+   */
+  ID3D12Resource* d3d12Resource() const {
+    return resource.Get();
+  }
+
+  /**
+   * Returns the DXGI format of this texture.
+   */
+  unsigned dxgiFormat() const {
+    return _dxgiFormat;
+  }
+
+  /**
+   * Returns the resource state currently tracked on the CPU. D3D12, unlike Vulkan, requires the
+   * application to issue explicit ResourceBarrier calls to transition between read and write
+   * states. We track the most recently announced state per texture so that subsequent bindings
+   * can construct the correct transition barrier.
+   *
+   * Note: on textures imported from external D3D12 resources we initialise the state to COMMON
+   * (the value the application is required to leave the resource in when handing it off — see
+   * D3D12 SDK common-state promotion rules). This is conservative but correct.
+   */
+  D3D12_RESOURCE_STATES currentState() const {
+    return _currentState;
+  }
+
+  void setCurrentState(D3D12_RESOURCE_STATES state) {
+    _currentState = state;
+  }
+
+  BackendTexture getBackendTexture() const override;
+  BackendRenderTarget getBackendRenderTarget() const override;
+
+ protected:
+  D3D12Texture(const TextureDescriptor& descriptor, ComPtr<ID3D12Resource> resource,
+               unsigned dxgiFormat);
+  ~D3D12Texture() override = default;
+
+  void onRelease(D3D12GPU* gpu) override;
+
+  virtual void onReleaseTexture();
+
+  ComPtr<ID3D12Resource> resource = nullptr;
+  unsigned _dxgiFormat = 0;
+  D3D12_RESOURCE_STATES _currentState = D3D12_RESOURCE_STATE_COMMON;
+
+  friend class D3D12GPU;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12UploadHeap.cpp b/src/gpu/d3d12/D3D12UploadHeap.cpp
new file mode 100644
index 000000000..641db358b
--- /dev/null
+++ b/src/gpu/d3d12/D3D12UploadHeap.cpp
@@ -0,0 +1,186 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12UploadHeap.h"
+#include "core/utils/Log.h"
+
+namespace tgfx {
+
+// Rounds value up to the next multiple of alignment (0 and 1 mean "no alignment"). Works for
+// any alignment, not only powers of two, and matches the bit-mask form for powers of two.
+static size_t AlignUpSize(size_t value, size_t alignment) {
+  size_t remainder = (alignment <= 1) ? 0 : value % alignment;
+  return (remainder == 0) ? value : value + (alignment - remainder);
+}
+
+bool D3D12UploadHeap::init(ID3D12Device* device, size_t capacity) {
+  if (device == nullptr || capacity == 0) {
+    return false;
+  }
+  // A plain row-major buffer of `capacity` bytes living on the UPLOAD heap.
+  D3D12_RESOURCE_DESC bufferDesc = {};
+  bufferDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+  bufferDesc.Format = static_cast<DXGI_FORMAT>(DXGI_FORMAT_UNKNOWN);
+  bufferDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+  bufferDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+  bufferDesc.Width = static_cast<UINT64>(capacity);
+  bufferDesc.Height = 1;
+  bufferDesc.DepthOrArraySize = 1;
+  bufferDesc.MipLevels = 1;
+  bufferDesc.SampleDesc.Count = 1;
+  bufferDesc.SampleDesc.Quality = 0;
+
+  D3D12_HEAP_PROPERTIES uploadHeap = {};
+  uploadHeap.Type = D3D12_HEAP_TYPE_UPLOAD;
+
+  ComPtr<ID3D12Resource> buffer = nullptr;
+  HRESULT result = device->CreateCommittedResource(&uploadHeap, D3D12_HEAP_FLAG_NONE, &bufferDesc,
+                                                   D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+                                                   IID_PPV_ARGS(&buffer));
+  if (FAILED(result)) {
+    LOGE("D3D12UploadHeap::init(): CreateCommittedResource failed (HRESULT=0x%08X size=%zu).",
+         static_cast<unsigned>(result), capacity);
+    return false;
+  }
+
+  // Map subresource 0 with an empty read range; the pointer is kept in mappedCpu afterwards.
+  const D3D12_RANGE noCpuReads = {0, 0};
+  void* cpuAddress = nullptr;
+  result = buffer->Map(0, &noCpuReads, &cpuAddress);
+  if (FAILED(result) || cpuAddress == nullptr) {
+    LOGE("D3D12UploadHeap::init(): Map failed (HRESULT=0x%08X).", static_cast<unsigned>(result));
+    return false;
+  }
+
+  // Publish the buffer and start from a fresh ring. Inflight entries left by an earlier init()
+  // are dropped too, since they would refer to the previous, just-released resource.
+  _resource = std::move(buffer);
+  gpuVA = _resource->GetGPUVirtualAddress();
+  mappedCpu = cpuAddress;
+  _capacity = capacity;
+  head = 0;
+  committedHead = 0;
+  outstandingBytes = 0;
+  inflight.clear();
+  return true;
+}
+
+D3D12UploadHeap::Allocation D3D12UploadHeap::allocate(size_t size, size_t alignment) {
+  if (_resource == nullptr || size == 0 || size > _capacity) {
+    return {};
+  }
+  if (outstandingBytes == 0 && inflight.empty()) {
+    // Nothing is allocated or in flight, so rewind to the start of the ring. Otherwise a
+    // request that does not fit between head and the end of the ring takes the wrap path below,
+    // is billed for the skipped tail too, and can be rejected although the whole ring is free.
+    head = 0;
+    committedHead = 0;
+  }
+  size_t alignedHead = AlignUpSize(head, alignment);
+  // head alone cannot distinguish "ring empty" from "ring full" (both leave it at the same
+  // offset once allocations wrap), so occupancy is tracked by an explicit byte counter that
+  // allocate() increments and retire() decrements. That counter is what stops a wrap from
+  // silently overwriting staging bytes that were allocated but not yet retired.
+  size_t free = _capacity - outstandingBytes;
+  size_t needed = size;
+  size_t startOffset = alignedHead;
+  if (alignedHead + size > _capacity) {
+    // Splitting the range across the wrap boundary is not supported (CopyTextureRegion needs a
+    // single contiguous PLACED_SUBRESOURCE_FOOTPRINT), so jump back to offset 0 and pay for the
+    // discarded bytes between head and the end of the ring out of the same free pool.
+    needed = size + (_capacity - head);
+    startOffset = 0;
+  } else {
+    needed = size + (alignedHead - head);
+  }
+  if (needed > free) {
+    return {};
+  }
+
+  Allocation result = {};
+  result.resource = _resource.Get();
+  result.cpu = static_cast<uint8_t*>(mappedCpu) + startOffset;
+  result.gpuVirtualAddress = gpuVA + startOffset;
+  result.offsetInResource = startOffset;
+  result.size = size;
+  head = (startOffset + size == _capacity) ? 0 : startOffset + size;
+  outstandingBytes += needed;
+  return result;
+}
+
+void D3D12UploadHeap::commit(uint64_t fenceValue) {
+  // Pair the about-to-be-signalled fence with the bytes allocated since the last commit so
+  // retire() can give them back once the GPU is done. The amount comes from the byte counters,
+  // not from head/committedHead: equal cursors cannot tell "nothing allocated" from "a whole
+  // lap allocated", and treating that case as a whole lap whenever outstandingBytes != 0 would
+  // bill a phantom full-capacity range if an empty commit ran while earlier ranges were still
+  // in flight; retiring it would zero outstandingBytes under ranges the GPU is still reading.
+  size_t committedBytes = 0;
+  for (const auto& range : inflight) {
+    committedBytes += range.bytes;
+  }
+  // outstandingBytes = bytes billed to in-flight fences + bytes allocated since the last commit.
+  size_t bytesSinceCommit =
+      (outstandingBytes > committedBytes) ? (outstandingBytes - committedBytes) : 0;
+  committedHead = head;
+  if (bytesSinceCommit == 0) {
+    return;
+  }
+  InflightRange entry = {};
+  entry.fenceValue = fenceValue;
+  entry.bytes = bytesSinceCommit;
+  inflight.push_back(entry);
+}
+
+void D3D12UploadHeap::retire(uint64_t completedFenceValue) {
+  // Entries are appended in commit order, so stop at the first fence that has not completed.
+  while (!inflight.empty()) {
+    const InflightRange& oldest = inflight.front();
+    if (oldest.fenceValue > completedFenceValue) {
+      break;
+    }
+    // Saturate at zero instead of wrapping to ~0, which would block every later allocation.
+    outstandingBytes = (outstandingBytes >= oldest.bytes) ? outstandingBytes - oldest.bytes : 0;
+    inflight.pop_front();
+  }
+}
+
+void D3D12UploadHeap::clear() {
+  // Undo the persistent Map() from init() before releasing our reference to the buffer.
+  if (_resource != nullptr) {
+    _resource->Unmap(0, nullptr);
+    _resource = nullptr;
+  }
+  gpuVA = 0;
+  mappedCpu = nullptr;
+  _capacity = 0;
+  inflight.clear();
+  outstandingBytes = 0;
+  committedHead = head = 0;
+}
+
+void D3D12UploadHeap::resetForContextLost() {
+  // The buffer, its mapping, GPU address and capacity are kept so the ring stays usable; only
+  // the accounting that waits on a fence which will never advance is discarded.
+  inflight.clear();
+  outstandingBytes = 0;
+  committedHead = 0;
+  head = 0;
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12UploadHeap.h b/src/gpu/d3d12/D3D12UploadHeap.h
new file mode 100644
index 000000000..e2f5533b3
--- /dev/null
+++ b/src/gpu/d3d12/D3D12UploadHeap.h
@@ -0,0 +1,138 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <deque>
+#include "D3D12Util.h"
+
+namespace tgfx {
+
+/**
+ * Single permanently-mapped UPLOAD heap used as a byte-level fence-tracked ring for staging
+ * data on its way to GPU resources (texture pixel uploads, buffer writes, etc).
+ *
+ * Why not per-call CreateCommittedResource:
+ *   - Every D3D12 CreateCommittedResource(UPLOAD) call goes through the kernel allocator and
+ *     reserves a fresh GPU virtual address range. For high-frequency uploads (writeTexture()
+ *     called once per glyph atlas update, per render pass blur seed, etc) this accumulates real
+ *     CPU and driver memory cost.
+ *   - The standard solution is a single big UPLOAD buffer that stays Mapped() for its entire
+ *     lifetime; sub-allocations are pure pointer arithmetic.
+ *
+ * Allocation model:
+ *   - allocate(size, alignment) returns an Allocation that names a contiguous byte range inside
+ *     the underlying ID3D12Resource. The caller writes pixel data via the cpu pointer and uses
+ *     resource + offsetInResource as the source of CopyTextureRegion / CopyBufferRegion.
+ *   - commit(fenceValue) snapshots head: every byte allocated since the last commit is now
+ *     "owned" by `fenceValue` and must outlive its signal.
+ *   - retire(completedFenceValue) reclaims byte ranges whose fence has signalled.
+ *
+ * Capacity management:
+ *   - A starting capacity is allocated up front. If a single allocation exceeds capacity OR the
+ *     ring is fully in flight, allocate() returns an invalid Allocation; the caller must fall
+ *     back to a one-off CreateCommittedResource. This avoids the complications of mid-frame
+ *     resource recreation while still letting the steady-state path skip the slow allocator.
+ *
+ * Thread safety: not thread-safe. Caller serialises access (matches the rest of the D3D12
+ * backend).
+ */
+class D3D12UploadHeap {
+ public:
+  struct Allocation {
+    // Lifetime-stable raw pointer; the heap's underlying ID3D12Resource is kept alive by the
+    // D3D12UploadHeap instance (which lives on D3D12GPU). Callers must not extend its lifetime
+    // beyond the next D3D12GPU shutdown.
+    ID3D12Resource* resource = nullptr;
+    void* cpu = nullptr;
+    uint64_t gpuVirtualAddress = 0;
+    uint64_t offsetInResource = 0;
+    size_t size = 0;
+    bool valid() const {
+      return resource != nullptr && cpu != nullptr && size > 0;
+    }
+  };
+
+  D3D12UploadHeap() = default;
+
+  /**
+   * Creates the underlying UPLOAD-heap committed resource and Map()s it permanently. Returns
+   * false on failure (logged inside).
+   */
+  bool init(ID3D12Device* device, size_t capacity);
+
+  /**
+   * Sub-allocates `size` bytes aligned to `alignment` within the ring. Returns an invalid
+   * Allocation if the ring cannot satisfy the request without overrunning still-in-flight
+   * bytes; callers must then fall back to a per-call upload buffer.
+   */
+  Allocation allocate(size_t size, size_t alignment);
+
+  /**
+   * Tags every allocation since the last commit() with `fenceValue`. The bytes become
+   * reclaimable in retire() once the GPU advances past `fenceValue`.
+   */
+  void commit(uint64_t fenceValue);
+
+  /**
+   * Reclaims byte ranges whose fence has signalled. Cheap to call.
+   */
+  void retire(uint64_t completedFenceValue);
+
+  /**
+   * Drops the underlying resource. Used by D3D12GPU::releaseAll on shutdown. After this call
+   * allocate() returns invalid until init() is invoked again.
+   */
+  void clear();
+
+  /**
+   * Drops every inflight byte range and zeroes the head / outstanding bookkeeping while
+   * keeping the mapped UPLOAD ID3D12Resource alive. Used by the context-lost recovery path so
+   * the ring stops accumulating inflight bytes whose fences will never advance, which would
+   * otherwise saturate outstandingBytes and reject every future allocation even though the
+   * GPU is no longer touching anything.
+   */
+  void resetForContextLost();
+
+  size_t capacity() const {
+    return _capacity;
+  }
+
+ private:
+  ComPtr<ID3D12Resource> _resource = nullptr;
+  void* mappedCpu = nullptr;
+  uint64_t gpuVA = 0;
+  size_t _capacity = 0;
+  size_t head = 0;
+  size_t committedHead = 0;
+  // Bytes currently held by either a still-uncommitted allocation or an inflight commit waiting
+  // for its fence to signal. Tracked explicitly so allocate() can know how many bytes are still
+  // in use — head alone cannot distinguish "ring empty" from "ring full" when an allocation
+  // wraps head right back to where it started.
+  size_t outstandingBytes = 0;
+
+  struct InflightRange {
+    uint64_t fenceValue = 0;
+    // Bytes consumed between the previous commit() and this one; subtracted from
+    // outstandingBytes when retire() reaches this entry.
+    size_t bytes = 0;
+  };
+  std::deque<InflightRange> inflight;
+};
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Util.cpp b/src/gpu/d3d12/D3D12Util.cpp
new file mode 100644
index 000000000..c992dd29f
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Util.cpp
@@ -0,0 +1,313 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "D3D12Util.h"
+#include "tgfx/gpu/GPUBuffer.h"
+#include "tgfx/gpu/Texture.h"
+
+namespace tgfx {
+
+D3D12_PRIMITIVE_TOPOLOGY ToD3D12PrimitiveTopology(PrimitiveType primitiveType) {
+  // Maps a tgfx PrimitiveType to the matching D3D primitive topology.
+  // TriangleStrip is the only value that selects a strip topology.
+  // Triangles, and any value not recognised here, are drawn as an independent
+  // triangle list.
+  if (primitiveType == PrimitiveType::TriangleStrip) {
+    return D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP;
+  }
+  return D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
+}
+
+D3D12_PRIMITIVE_TOPOLOGY_TYPE ToD3D12PrimitiveTopologyType(PrimitiveType primitiveType) {
+  switch (primitiveType) {
+    case PrimitiveType::TriangleStrip:
+    case PrimitiveType::Triangles:
+    default:
+      break;  // Lists, strips and unrecognised values all share one topology type.
+  }
+  return D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
+}
+
+DXGI_FORMAT ToD3D12VertexFormat(VertexFormat format) {
+  switch (format) {
+    case VertexFormat::Float2:
+      return DXGI_FORMAT_R32G32_FLOAT;
+    case VertexFormat::Float3:
+      return DXGI_FORMAT_R32G32B32_FLOAT;
+    case VertexFormat::Float4:
+      return DXGI_FORMAT_R32G32B32A32_FLOAT;
+    case VertexFormat::Half:
+      return DXGI_FORMAT_R16_FLOAT;
+    case VertexFormat::Half2:
+      return DXGI_FORMAT_R16G16_FLOAT;
+    // D3D12 does not have a native R16G16B16_FLOAT format, so Half3 borrows the
+    // four-component format that Half4 uses.
+    case VertexFormat::Half3:
+    case VertexFormat::Half4:
+      return DXGI_FORMAT_R16G16B16A16_FLOAT;
+    case VertexFormat::Int:
+      return DXGI_FORMAT_R32_SINT;
+    case VertexFormat::Int2:
+      return DXGI_FORMAT_R32G32_SINT;
+    case VertexFormat::Int3:
+      return DXGI_FORMAT_R32G32B32_SINT;
+    case VertexFormat::Int4:
+      return DXGI_FORMAT_R32G32B32A32_SINT;
+    case VertexFormat::UByteNormalized:
+      return DXGI_FORMAT_R8_UNORM;
+    case VertexFormat::UByte2Normalized:
+      return DXGI_FORMAT_R8G8_UNORM;
+    // D3D12 does not have a native R8G8B8_UNORM vertex format, so UByte3Normalized borrows
+    // the four-component format that UByte4Normalized uses.
+    case VertexFormat::UByte3Normalized:
+    case VertexFormat::UByte4Normalized:
+      return DXGI_FORMAT_R8G8B8A8_UNORM;
+    // A single float is also the answer for any value not listed above.
+    case VertexFormat::Float:
+    default:
+      return DXGI_FORMAT_R32_FLOAT;
+  }
+}
+
+D3D12_COMPARISON_FUNC ToD3D12CompareFunction(CompareFunction compareFunction) {
+  switch (compareFunction) {
+    case CompareFunction::Never:
+      return D3D12_COMPARISON_FUNC_NEVER;
+    case CompareFunction::Less:
+      return D3D12_COMPARISON_FUNC_LESS;
+    case CompareFunction::LessEqual:
+      return D3D12_COMPARISON_FUNC_LESS_EQUAL;
+    case CompareFunction::Equal:
+      return D3D12_COMPARISON_FUNC_EQUAL;
+    case CompareFunction::NotEqual:
+      return D3D12_COMPARISON_FUNC_NOT_EQUAL;
+    case CompareFunction::GreaterEqual:
+      return D3D12_COMPARISON_FUNC_GREATER_EQUAL;
+    case CompareFunction::Greater:
+      return D3D12_COMPARISON_FUNC_GREATER;
+    // Always doubles as the answer for any value not listed above.
+    case CompareFunction::Always:
+    default:
+      return D3D12_COMPARISON_FUNC_ALWAYS;
+  }
+}
+
+D3D12_STENCIL_OP ToD3D12StencilOperation(StencilOperation stencilOp) {
+  switch (stencilOp) {
+    case StencilOperation::Zero:
+      return D3D12_STENCIL_OP_ZERO;
+    case StencilOperation::Replace:
+      return D3D12_STENCIL_OP_REPLACE;
+    case StencilOperation::Invert:
+      return D3D12_STENCIL_OP_INVERT;
+    case StencilOperation::IncrementClamp:
+      return D3D12_STENCIL_OP_INCR_SAT;
+    case StencilOperation::DecrementClamp:
+      return D3D12_STENCIL_OP_DECR_SAT;
+    case StencilOperation::IncrementWrap:
+      return D3D12_STENCIL_OP_INCR;
+    case StencilOperation::DecrementWrap:
+      return D3D12_STENCIL_OP_DECR;
+    // Keep doubles as the answer for any value not listed above.
+    case StencilOperation::Keep:
+    default:
+      return D3D12_STENCIL_OP_KEEP;
+  }
+}
+
+D3D12_BLEND ToD3D12BlendFactor(BlendFactor blendFactor) {
+  switch (blendFactor) {
+    case BlendFactor::Zero:
+      return D3D12_BLEND_ZERO;
+    case BlendFactor::Src:
+      return D3D12_BLEND_SRC_COLOR;
+    case BlendFactor::OneMinusSrc:
+      return D3D12_BLEND_INV_SRC_COLOR;
+    case BlendFactor::SrcAlpha:
+      return D3D12_BLEND_SRC_ALPHA;
+    case BlendFactor::OneMinusSrcAlpha:
+      return D3D12_BLEND_INV_SRC_ALPHA;
+    case BlendFactor::Dst:
+      return D3D12_BLEND_DEST_COLOR;
+    case BlendFactor::OneMinusDst:
+      return D3D12_BLEND_INV_DEST_COLOR;
+    case BlendFactor::DstAlpha:
+      return D3D12_BLEND_DEST_ALPHA;
+    case BlendFactor::OneMinusDstAlpha:
+      return D3D12_BLEND_INV_DEST_ALPHA;
+    case BlendFactor::Src1:
+      return D3D12_BLEND_SRC1_COLOR;
+    case BlendFactor::OneMinusSrc1:
+      return D3D12_BLEND_INV_SRC1_COLOR;
+    case BlendFactor::Src1Alpha:
+      return D3D12_BLEND_SRC1_ALPHA;
+    case BlendFactor::OneMinusSrc1Alpha:
+      return D3D12_BLEND_INV_SRC1_ALPHA;
+    // One doubles as the answer for any value not listed above.
+    case BlendFactor::One:
+    default:
+      return D3D12_BLEND_ONE;
+  }
+}
+
+D3D12_BLEND ToD3D12BlendFactorAlpha(BlendFactor blendFactor) {
+  // D3D11/12 validation rejects color factors in SrcBlendAlpha / DestBlendAlpha, so start
+  // from the color mapping and swap each *_COLOR result for its *_ALPHA counterpart.
+  const D3D12_BLEND colorFactor = ToD3D12BlendFactor(blendFactor);
+  switch (colorFactor) {
+    case D3D12_BLEND_SRC_COLOR:
+      return D3D12_BLEND_SRC_ALPHA;
+    case D3D12_BLEND_INV_SRC_COLOR:
+      return D3D12_BLEND_INV_SRC_ALPHA;
+    case D3D12_BLEND_DEST_COLOR:
+      return D3D12_BLEND_DEST_ALPHA;
+    case D3D12_BLEND_INV_DEST_COLOR:
+      return D3D12_BLEND_INV_DEST_ALPHA;
+    case D3D12_BLEND_SRC1_COLOR:
+      return D3D12_BLEND_SRC1_ALPHA;
+    case D3D12_BLEND_INV_SRC1_COLOR:
+      return D3D12_BLEND_INV_SRC1_ALPHA;
+    default:
+      return colorFactor;
+  }
+}
+
+D3D12_BLEND_OP ToD3D12BlendOperation(BlendOperation blendOp) {
+  switch (blendOp) {
+    case BlendOperation::Subtract:
+      return D3D12_BLEND_OP_SUBTRACT;
+    case BlendOperation::ReverseSubtract:
+      return D3D12_BLEND_OP_REV_SUBTRACT;
+    case BlendOperation::Min:
+      return D3D12_BLEND_OP_MIN;
+    case BlendOperation::Max:
+      return D3D12_BLEND_OP_MAX;
+    // Add doubles as the answer for any value not listed above.
+    case BlendOperation::Add:
+    default:
+      return D3D12_BLEND_OP_ADD;
+  }
+}
+
+D3D12_TEXTURE_ADDRESS_MODE ToD3D12AddressMode(AddressMode addressMode) {
+  switch (addressMode) {
+    case AddressMode::Repeat:
+      return D3D12_TEXTURE_ADDRESS_MODE_WRAP;
+    case AddressMode::MirrorRepeat:
+      return D3D12_TEXTURE_ADDRESS_MODE_MIRROR;
+    case AddressMode::ClampToBorder:
+      return D3D12_TEXTURE_ADDRESS_MODE_BORDER;
+    // ClampToEdge doubles as the answer for any value not listed above.
+    case AddressMode::ClampToEdge:
+    default:
+      return D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
+  }
+}
+
+D3D12_FILTER ToD3D12Filter(FilterMode minFilter, FilterMode magFilter, MipmapMode mipmapMode) {
+  // Each of the three stages is either linear or point. MipmapMode::None is treated like
+  // any other non-linear mipmap mode and lands on the point-mip filters. The three choices
+  // are packed into a key (bit 2 = min, bit 1 = mag, bit 0 = mip; a set bit means linear)
+  // and the key selects the filter.
+  unsigned key = 0;
+  if (minFilter == FilterMode::Linear) {
+    key |= 4u;
+  }
+  if (magFilter == FilterMode::Linear) {
+    key |= 2u;
+  }
+  if (mipmapMode == MipmapMode::Linear) {
+    key |= 1u;
+  }
+
+  switch (key) {
+    // min linear, mag linear, mip linear
+    case 7u:
+      return D3D12_FILTER_MIN_MAG_MIP_LINEAR;
+    // min linear, mag linear, mip point
+    case 6u:
+      return D3D12_FILTER_MIN_MAG_LINEAR_MIP_POINT;
+    // min linear, mag point, mip linear
+    case 5u:
+      return D3D12_FILTER_MIN_LINEAR_MAG_POINT_MIP_LINEAR;
+    // min linear, mag point, mip point
+    case 4u:
+      return D3D12_FILTER_MIN_LINEAR_MAG_MIP_POINT;
+    // min point, mag linear, mip linear
+    case 3u:
+      return D3D12_FILTER_MIN_POINT_MAG_MIP_LINEAR;
+    // min point, mag linear, mip point
+    case 2u:
+      return D3D12_FILTER_MIN_POINT_MAG_LINEAR_MIP_POINT;
+    // min point, mag point, mip linear
+    case 1u:
+      return D3D12_FILTER_MIN_MAG_POINT_MIP_LINEAR;
+    default:  // key == 0: point sampling at every stage
+      return D3D12_FILTER_MIN_MAG_MIP_POINT;
+  }
+}
+
+D3D12_CULL_MODE ToD3D12CullMode(CullMode cullMode) {
+  switch (cullMode) {
+    case CullMode::Front:
+      return D3D12_CULL_MODE_FRONT;
+    case CullMode::Back:
+      return D3D12_CULL_MODE_BACK;
+    case CullMode::None:
+      break;  // Shares the fallback below with any out-of-range value.
+  }
+  return D3D12_CULL_MODE_NONE;
+}
+
+bool ToD3D12FrontCounterClockwise(FrontFace frontFace) {
+  switch (frontFace) {
+    case FrontFace::CW:
+      return false;
+    case FrontFace::CCW:
+      break;  // Shares the fallback below with any out-of-range value.
+  }
+  return true;
+}
+
+D3D12_INDEX_BUFFER_STRIP_CUT_VALUE ToD3D12StripCutValue(IndexFormat indexFormat) {
+  if (indexFormat == IndexFormat::UInt16) {
+    return D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFF;
+  }
+  if (indexFormat == IndexFormat::UInt32) {
+    return D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_0xFFFFFFFF;
+  }
+  return D3D12_INDEX_BUFFER_STRIP_CUT_VALUE_DISABLED;  // Any other index format.
+}
+
+void TransitionResourceState(ID3D12GraphicsCommandList* commandList, ID3D12Resource* resource,
+                             D3D12_RESOURCE_STATES oldState, D3D12_RESOURCE_STATES newState,
+                             UINT subresource) {
+  // A barrier is recorded only when both pointers are valid and the state actually changes.
+  if (commandList != nullptr && resource != nullptr && oldState != newState) {
+    D3D12_RESOURCE_BARRIER transition = {};
+    transition.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+    transition.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+    transition.Transition.pResource = resource;
+    transition.Transition.Subresource = subresource;
+    transition.Transition.StateBefore = oldState;
+    transition.Transition.StateAfter = newState;
+    commandList->ResourceBarrier(1, &transition);
+  }
+}
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Util.h b/src/gpu/d3d12/D3D12Util.h
new file mode 100644
index 000000000..3d663338b
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Util.h
@@ -0,0 +1,80 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+#include <wrl/client.h>
+#include "tgfx/gpu/Attribute.h"
+#include "tgfx/gpu/BlendFactor.h"
+#include "tgfx/gpu/BlendOperation.h"
+#include "tgfx/gpu/CompareFunction.h"
+#include "tgfx/gpu/FilterMode.h"
+#include "tgfx/gpu/MipmapMode.h"
+#include "tgfx/gpu/RenderPass.h"
+#include "tgfx/gpu/Sampler.h"
+#include "tgfx/gpu/StencilOperation.h"
+
+namespace tgfx {
+
+template <typename T>
+using ComPtr = Microsoft::WRL::ComPtr<T>;
+
+D3D12_PRIMITIVE_TOPOLOGY ToD3D12PrimitiveTopology(PrimitiveType primitiveType);
+
+D3D12_PRIMITIVE_TOPOLOGY_TYPE ToD3D12PrimitiveTopologyType(PrimitiveType primitiveType);
+
+DXGI_FORMAT ToD3D12VertexFormat(VertexFormat format);
+
+D3D12_COMPARISON_FUNC ToD3D12CompareFunction(CompareFunction compareFunction);
+
+D3D12_STENCIL_OP ToD3D12StencilOperation(StencilOperation stencilOp);
+
+D3D12_BLEND ToD3D12BlendFactor(BlendFactor blendFactor);
+
+/**
+ * Like ToD3D12BlendFactor() but rewrites the COLOR-only D3D12 blend factors (SRC_COLOR,
+ * INV_SRC_COLOR, DEST_COLOR, INV_DEST_COLOR, and the dual-source SRC1_COLOR / INV_SRC1_COLOR)
+ * into their ALPHA counterparts. D3D12 validation rejects color factors when applied to the
+ * alpha channel, so this helper must be used for {Src,Dest}BlendAlpha.
+ */
+D3D12_BLEND ToD3D12BlendFactorAlpha(BlendFactor blendFactor);
+
+D3D12_BLEND_OP ToD3D12BlendOperation(BlendOperation blendOp);
+
+D3D12_TEXTURE_ADDRESS_MODE ToD3D12AddressMode(AddressMode addressMode);
+
+D3D12_FILTER ToD3D12Filter(FilterMode minFilter, FilterMode magFilter, MipmapMode mipmapMode);
+
+D3D12_CULL_MODE ToD3D12CullMode(CullMode cullMode);
+
+bool ToD3D12FrontCounterClockwise(FrontFace frontFace);
+
+D3D12_INDEX_BUFFER_STRIP_CUT_VALUE ToD3D12StripCutValue(IndexFormat indexFormat);
+
+/**
+ * Records one transition barrier via ID3D12GraphicsCommandList::ResourceBarrier. No-op when
+ * commandList or resource is null or oldState == newState. Used by code paths that move an
+ * ID3D12Resource between states (RTV/DSV setup, copy commands, shader sampling, etc.).
+ */
+void TransitionResourceState(ID3D12GraphicsCommandList* commandList, ID3D12Resource* resource,
+                             D3D12_RESOURCE_STATES oldState, D3D12_RESOURCE_STATES newState,
+                             UINT subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES);
+
+}  // namespace tgfx
diff --git a/src/gpu/d3d12/D3D12Window.cpp b/src/gpu/d3d12/D3D12Window.cpp
new file mode 100644
index 000000000..069d03e30
--- /dev/null
+++ b/src/gpu/d3d12/D3D12Window.cpp
@@ -0,0 +1,398 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "tgfx/gpu/d3d12/D3D12Window.h"
+#include <windows.h>
+#include <algorithm>
+#include <chrono>
+#include <vector>
+#include "D3D12CommandQueue.h"
+#include "D3D12Defines.h"
+#include "D3D12GPU.h"
+#include "core/utils/Log.h"
+#include "gpu/proxies/RenderTargetProxy.h"
+#include "gpu/resources/RenderTarget.h"
+#include "tgfx/gpu/Backend.h"
+#include "tgfx/gpu/Context.h"
+#include "tgfx/gpu/d3d12/D3D12Types.h"
+
+namespace tgfx {
+
+// Number of backbuffers in the swap chain. Two is the minimum allowed by FLIP_DISCARD. Three
+// would give the OS one extra frame to compose, smoothing latency spikes under heavy GPU load,
+// but we pick two for parity with Vulkan's MAX_FRAMES_IN_FLIGHT and to keep peak VRAM low for
+// typical 4K windows. The presentation engine still queues a small number of frames internally.
+static constexpr UINT BACKBUFFER_COUNT = 2;
+
+// Private RenderTargetProxy that exposes the swap chain's current backbuffer as an external
+// D3D12 render target. The proxy is created once when the application calls Surface::MakeFrom()
+// and is then reused for every subsequent frame: Surface caches it for its entire lifetime
+// rather than re-acquiring on each render. To keep that pattern working with FLIP_DISCARD —
+// which rotates between BACKBUFFER_COUNT distinct ID3D12Resources — getRenderTarget() must
+// re-query GetCurrentBackBufferIndex every call and invalidate the cached RenderTarget when
+// the index changes. Otherwise every frame would be drawn into the same backbuffer slot and
+// the other slot would never get updated, manifesting as "no visible change" on user input.
+//
+// Defined at file scope (not in an anonymous namespace) so D3D12Window::PlatformState can store
+// a typed raw pointer to it; the .h does not expose this class, so it remains private to this
+// translation unit even without anonymous-namespace internal linkage.
+class D3D12SwapchainProxy : public RenderTargetProxy {
+ public:
+  D3D12SwapchainProxy(Context* context, IDXGISwapChain3* swapChain,
+                      const std::vector<ComPtr<ID3D12Resource>>* backBuffers, unsigned format,
+                      int width, int height)
+      : _context(context), _swapChain(swapChain), _backBuffers(backBuffers), _format(format),
+        _width(width), _height(height) {
+  }
+
+  Context* getContext() const override {
+    return _context;
+  }
+  int width() const override {
+    return _width;
+  }
+  int height() const override {
+    return _height;
+  }
+  PixelFormat format() const override {
+    return DXGIFormatToPixelFormat(_format);
+  }
+  int sampleCount() const override {
+    return 1;
+  }
+  ImageOrigin origin() const override {
+    return ImageOrigin::TopLeft;
+  }
+  bool externallyOwned() const override {
+    return true;
+  }
+  std::shared_ptr<TextureView> getTextureView() const override {
+    return nullptr;
+  }
+
+  std::shared_ptr<RenderTarget> getRenderTarget() const override {
+    if (_swapChain == nullptr || _backBuffers == nullptr || _backBuffers->empty()) {
+      return nullptr;
+    }
+    const UINT slot = _swapChain->GetCurrentBackBufferIndex();
+    if (slot >= _backBuffers->size()) {
+      return nullptr;
+    }
+    ID3D12Resource* backBuffer = (*_backBuffers)[slot].Get();
+    // Rewrap only when nothing is cached or the swap chain has moved on to another buffer.
+    if (_renderTarget == nullptr || backBuffer != _cachedBackBuffer) {
+      D3D12TextureInfo textureInfo = {};
+      textureInfo.format = _format;
+      textureInfo.resource = backBuffer;
+      BackendRenderTarget wrapped(textureInfo, _width, _height);
+      _renderTarget = RenderTarget::MakeFrom(_context, wrapped, ImageOrigin::TopLeft);
+      _cachedBackBuffer = backBuffer;
+    }
+    return _renderTarget;
+  }
+
+  // Forgets the cached RenderTarget together with the buffer it wrapped, which forces the
+  // next getRenderTarget() call back through RenderTarget::MakeFrom. Invoked by
+  // D3D12Window::onPresent: after Present() the swap chain promotes a new backbuffer to
+  // "current", so the next acquisition must wrap that buffer instead of the submitted one.
+  void releaseFrame() {
+    _cachedBackBuffer = nullptr;
+    _renderTarget = nullptr;
+  }
+
+ private:
+  Context* _context = nullptr;
+  IDXGISwapChain3* _swapChain = nullptr;
+  const std::vector<ComPtr<ID3D12Resource>>* _backBuffers = nullptr;
+  unsigned _format = DXGI_FORMAT_R8G8B8A8_UNORM;
+  int _width = 0;
+  int _height = 0;
+  mutable std::shared_ptr<RenderTarget> _renderTarget = nullptr;
+  mutable ID3D12Resource* _cachedBackBuffer = nullptr;
+};
+
+// Hidden state shared between D3D12Window and its private RenderTargetProxy. Stored as PImpl so
+// the public header doesn't need <dxgi.h> / <d3d12.h>. The DXGI format is kept as `unsigned` to
+// match the rest of the D3D12 backend (D3D12Defines.h shadows the SDK enum with constexpr
+// integers so an unqualified DXGI_FORMAT_R8G8B8A8_UNORM here is `unsigned`, not the enum type).
+struct D3D12Window::PlatformState {
+  ComPtr<IDXGISwapChain3> swapChain;  // Swap chain the backbuffers are fetched from.
+  std::vector<ComPtr<ID3D12Resource>> backBuffers;  // Filled by buildBackBuffers().
+  unsigned format = DXGI_FORMAT_R8G8B8A8_UNORM;  // Cast back to DXGI_FORMAT for DXGI calls.
+  HWND hwnd = nullptr;  // Window handle recorded when the state is set up.
+  int width = 0;   // Swap-chain buffer width; updated by rebuild().
+  int height = 0;  // Swap-chain buffer height; updated by rebuild().
+
+  // Cached proxy for the currently-acquired backbuffer. Reset by onPresent() so the next
+  // onCreateRenderTarget() picks up the new frame's index. Held as a shared_ptr because tgfx's
+  // surface code may keep a strong reference for a single frame; currentProxyRaw mirrors the
+  // underlying D3D12SwapchainProxy* so onPresent() can call releaseFrame() without a static_cast
+  // from the base RenderTargetProxy*. The two pointers are written and cleared together so the
+  // raw view never outlives the shared owner.
+  std::shared_ptr<RenderTargetProxy> currentProxy;
+  D3D12SwapchainProxy* currentProxyRaw = nullptr;
+
+  bool buildBackBuffers();  // Fetches every swap-chain buffer; returns false on failure.
+  bool rebuild(int newWidth, int newHeight);  // Resizes the swap chain, then refetches buffers.
+};
+
+bool D3D12Window::PlatformState::buildBackBuffers() {
+  backBuffers.assign(BACKBUFFER_COUNT, ComPtr<ID3D12Resource>());  // Old references drop here.
+  for (UINT slot = 0; slot < BACKBUFFER_COUNT; ++slot) {
+    const HRESULT result = swapChain->GetBuffer(slot, IID_PPV_ARGS(&backBuffers[slot]));
+    if (SUCCEEDED(result)) {
+      continue;
+    }
+    LOGE("D3D12Window: GetBuffer(%u) failed, HRESULT=0x%08X", slot, static_cast<unsigned>(result));
+    backBuffers.clear();
+    return false;
+  }
+  return true;
+}
+
+bool D3D12Window::PlatformState::rebuild(int newWidth, int newHeight) {
+  // ResizeBuffers returns DXGI_ERROR_INVALID_CALL while any backbuffer reference is still
+  // outstanding, so every reference held here has to be released before the call.
+  backBuffers.clear();
+  currentProxy = nullptr;
+  currentProxyRaw = nullptr;
+  const auto dxgiFormat = static_cast<DXGI_FORMAT>(format);
+  const HRESULT result = swapChain->ResizeBuffers(
+      BACKBUFFER_COUNT, static_cast<UINT>(newWidth), static_cast<UINT>(newHeight), dxgiFormat, 0);
+  if (SUCCEEDED(result)) {
+    width = newWidth;
+    height = newHeight;
+    return buildBackBuffers();
+  }
+  LOGE("D3D12Window: ResizeBuffers failed, HRESULT=0x%08X", static_cast<unsigned>(result));
+  return false;
+}
+
+#ifdef _WIN32
+
+std::shared_ptr<D3D12Window> D3D12Window::MakeFrom(HWND hwnd, std::shared_ptr<D3D12Device> device,
+                                                   std::shared_ptr<ColorSpace> colorSpace) {
+  if (!hwnd || !device) {
+    return nullptr;
+  }
+  if (colorSpace != nullptr && !colorSpace->isSRGB()) {
+    LOGI(
+        "D3D12Window::MakeFrom(): non-sRGB colorSpace is not yet supported and will be ignored. "
+        "Only sRGB output is currently available.");
+  }
+
+  auto context = device->lockContext();
+  if (context == nullptr) {
+    return nullptr;
+  }
+  auto* d3d12GPU = static_cast<D3D12GPU*>(context->gpu());
+  auto* nativeQueue = static_cast<D3D12CommandQueue*>(d3d12GPU->queue())->d3d12CommandQueue();
+
+  RECT clientRect = {};
+  GetClientRect(hwnd, &clientRect);
+  const int clientWidth = static_cast<int>(clientRect.right - clientRect.left);
+  const int clientHeight = static_cast<int>(clientRect.bottom - clientRect.top);
+  // Each dimension is clamped to at least one pixel so an empty client area still yields a
+  // swap chain with a non-zero size.
+  const int width = std::max(clientWidth, 1);
+  const int height = std::max(clientHeight, 1);
+
+  ComPtr<IDXGIFactory4> dxgiFactory;
+  HRESULT hr = CreateDXGIFactory1(IID_PPV_ARGS(&dxgiFactory));
+  if (FAILED(hr)) {
+    LOGE("D3D12Window: CreateDXGIFactory1 failed, HRESULT=0x%08X", static_cast<unsigned>(hr));
+    device->unlock();
+    return nullptr;
+  }
+
+  DXGI_SWAP_CHAIN_DESC1 chainDesc = {};
+  // DXGI_FORMAT_R8G8B8A8_UNORM in this TU resolves to the D3D12Defines.h `unsigned` constant
+  // (= 28) rather than the SDK enum, so cast back here for DXGI_SWAP_CHAIN_DESC1::Format which
+  // does want the real enum.
+  chainDesc.Format = static_cast<DXGI_FORMAT>(DXGI_FORMAT_R8G8B8A8_UNORM);
+  chainDesc.Width = static_cast<UINT>(width);
+  chainDesc.Height = static_cast<UINT>(height);
+  chainDesc.BufferCount = BACKBUFFER_COUNT;
+  chainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
+  chainDesc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD;
+  chainDesc.Scaling = DXGI_SCALING_STRETCH;
+  chainDesc.AlphaMode = DXGI_ALPHA_MODE_IGNORE;
+  chainDesc.SampleDesc.Count = 1;
+  chainDesc.SampleDesc.Quality = 0;
+  chainDesc.Flags = 0;
+
+  ComPtr<IDXGISwapChain1> baseChain;
+  hr = dxgiFactory->CreateSwapChainForHwnd(nativeQueue, hwnd, &chainDesc, nullptr, nullptr,
+                                           &baseChain);
+  if (FAILED(hr)) {
+    LOGE("D3D12Window: CreateSwapChainForHwnd failed, HRESULT=0x%08X", static_cast<unsigned>(hr));
+    device->unlock();
+    return nullptr;
+  }
+  // GetCurrentBackBufferIndex lives on IDXGISwapChain3, so the QueryInterface must succeed.
+  ComPtr<IDXGISwapChain3> flipChain;
+  hr = baseChain.As(&flipChain);
+  if (FAILED(hr) || flipChain == nullptr) {
+    LOGE("D3D12Window: failed to QI IDXGISwapChain3, HRESULT=0x%08X", static_cast<unsigned>(hr));
+    device->unlock();
+    return nullptr;
+  }
+  // Turn off DXGI's built-in Alt+Enter fullscreen toggle; tgfx callers handle that themselves.
+  dxgiFactory->MakeWindowAssociation(hwnd, DXGI_MWA_NO_ALT_ENTER);
+
+  auto state = std::make_unique<PlatformState>();
+  state->swapChain = std::move(flipChain);
+  state->format = chainDesc.Format;
+  state->hwnd = hwnd;
+  state->width = width;
+  state->height = height;
+  if (!state->buildBackBuffers()) {
+    device->unlock();
+    return nullptr;
+  }
+
+  device->unlock();
+  return std::shared_ptr<D3D12Window>(new D3D12Window(device, std::move(state), colorSpace));
+}
+
+#endif
+
+D3D12Window::D3D12Window(std::shared_ptr<Device> device, std::unique_ptr<PlatformState> state,
+                         std::shared_ptr<ColorSpace> colorSpace)
+    : Window(std::move(device), std::move(colorSpace)), _platformState(std::move(state)) {
+}  // Every member is set in the initialiser list; the body is intentionally empty.
+
+D3D12Window::~D3D12Window() {
+  // Tear-down ordering is delicate. After the last frame, swap-chain Present() schedules its
+  // own GPU work on our command queue (the GPU-side flip), but that work is *not* tracked by
+  // any tgfx fence — D3D12CommandQueue::waitUntilCompleted() only waits on submissions we
+  // submitted via executeSubmission. If we release the swap chain (or its backbuffers) while
+  // that Present work is still pending, the runtime fires
+  // OBJECT_DELETED_WHILE_STILL_IN_USE (#921) and the debug layer asserts.
+  //
+  // To make sure the queue really is idle, we Signal a fresh fence on the queue and wait for
+  // it: that flushes everything previously enqueued, Present included.
+  //
+  // Then we still have to release the in-tgfx owners of each backbuffer before destroying
+  // the swap chain itself:
+  //   - the cached ExternalRenderTarget / ExternalTexture pair (drained via ResourceCache and
+  //     D3D12GPU return queues)
+  //   - recycled command lists in D3D12CommandListPool (each list still pins the resources it
+  //     was last recorded against until its next Reset())
+  auto context = device->lockContext();
+  if (context != nullptr) {
+    auto* d3d12GPU = static_cast<D3D12GPU*>(context->gpu());
+    auto* d3d12CmdQueue = static_cast<D3D12CommandQueue*>(d3d12GPU->queue())->d3d12CommandQueue();
+
+    // 1. Wait for all tgfx-managed submissions to complete.
+    d3d12GPU->queue()->waitUntilCompleted();
+
+    // 2. Wait for any Present-driven work the queue still has queued up. Without this the
+    //    swap-chain release path below trips OBJECT_DELETED_WHILE_STILL_IN_USE because DXGI's
+    //    internal flip operation is still in flight on the queue.
+    //    The wait is bounded so a hung GPU cannot block destruction forever; every way the
+    //    drain can fail (fence creation, Signal, event creation, timeout) is funnelled into
+    //    `queueDrained` so the failure is at least reported instead of passing silently.
+    const DWORD drainTimeoutMs = 5000;
+    const UINT64 targetValue = 1;
+    bool queueDrained = false;
+    ComPtr<ID3D12Fence> drainFence;
+    if (SUCCEEDED(
+            d3d12GPU->device()->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&drainFence))) &&
+        SUCCEEDED(d3d12CmdQueue->Signal(drainFence.Get(), targetValue))) {
+      if (drainFence->GetCompletedValue() >= targetValue) {
+        // The queue was already idle; no need to block.
+        queueDrained = true;
+      } else {
+        HANDLE evt = CreateEventW(nullptr, FALSE, FALSE, nullptr);
+        if (evt != nullptr) {
+          if (SUCCEEDED(drainFence->SetEventOnCompletion(targetValue, evt))) {
+            queueDrained = WaitForSingleObject(evt, drainTimeoutMs) == WAIT_OBJECT_0;
+          }
+          CloseHandle(evt);
+        }
+      }
+    }
+    if (!queueDrained) {
+      LOGE(
+          "D3D12Window: could not confirm the command queue is idle (timeout %u ms); releasing "
+          "the swap chain anyway.",
+          static_cast<unsigned>(drainTimeoutMs));
+    }
+
+    // 3. Drop tgfx-side owners of the backbuffers.
+    _platformState->currentProxy = nullptr;
+    _platformState->currentProxyRaw = nullptr;
+    context->purgeResourcesNotUsedSince(std::chrono::steady_clock::now());
+    d3d12GPU->processUnreferencedResources();
+    d3d12GPU->commandListPool().clear();
+
+    // 4. Release the swap chain and our own backbuffer ComPtrs. The order between these two
+    //    is not important once the queue is idle and no tgfx object pins the backbuffers.
+    _platformState->backBuffers.clear();
+    _platformState->swapChain = nullptr;
+    device->unlock();
+  } else {
+    // No context could be locked, so nothing can be drained or purged here; just drop our own
+    // references in the same order as above.
+    _platformState->currentProxy = nullptr;
+    _platformState->currentProxyRaw = nullptr;
+    _platformState->backBuffers.clear();
+    _platformState->swapChain = nullptr;
+  }
+}
+
+// Creates the render-target proxy that a Surface uses to draw into this window's swap chain.
+// Returns nullptr when the swap chain is gone, the client area is empty, or the swap chain could
+// not be rebuilt after a size change.
+std::shared_ptr<RenderTargetProxy> D3D12Window::onCreateRenderTarget(Context* context) {
+  auto* state = _platformState.get();
+  if (state->swapChain == nullptr) {
+    return nullptr;
+  }
+
+  // Resize detection. The application's WM_SIZE handler is expected to have reset its cached
+  // Surface already, which drops the references to the previous proxy/backbuffer; only then is
+  // it safe to call ResizeBuffers (it requires zero outstanding backbuffer references).
+  RECT clientRect = {};
+  GetClientRect(state->hwnd, &clientRect);
+  const int clientWidth = static_cast<int>(clientRect.right - clientRect.left);
+  const int clientHeight = static_cast<int>(clientRect.bottom - clientRect.top);
+  if (clientWidth <= 0 || clientHeight <= 0) {
+    return nullptr;
+  }
+
+  const bool sizeChanged = clientWidth != state->width || clientHeight != state->height;
+  if (sizeChanged) {
+    // The GPU must be done with the old backbuffers first: ResizeBuffers cannot proceed while
+    // any reference is outstanding, and in-flight command lists count as references.
+    context->gpu()->queue()->waitUntilCompleted();
+    if (!state->rebuild(clientWidth, clientHeight)) {
+      return nullptr;
+    }
+  }
+
+  // One proxy per Surface: the proxy asks the swap chain for the current backbuffer index on
+  // every getRenderTarget() call. Surface caches the proxy for its whole lifetime, so allocating
+  // a proxy per frame here would leak the new proxy and never reach the backbuffer-rotation
+  // code path.
+  auto swapchainProxy = std::make_shared<D3D12SwapchainProxy>(
+      context, state->swapChain.Get(), &state->backBuffers, state->format, state->width,
+      state->height);
+  state->currentProxyRaw = swapchainProxy.get();
+  state->currentProxy = std::move(swapchainProxy);
+  return state->currentProxy;
+}
+
+// Presents the current backbuffer and tells the active proxy to forget its cached frame.
+// Does nothing once the swap chain has been released.
+void D3D12Window::onPresent(Context* /*context*/) {
+  auto& swapChain = _platformState->swapChain;
+  if (swapChain == nullptr) {
+    return;
+  }
+  // SyncInterval=1 waits for the next vertical blank, mirroring VK_PRESENT_MODE_FIFO_KHR. Apps
+  // that need an uncapped framerate can switch to a FRAME_LATENCY_WAITABLE_OBJECT path later.
+  const auto presentResult = swapChain->Present(1, 0);
+  if (FAILED(presentResult)) {
+    LOGE("D3D12Window: Present failed, HRESULT=0x%08X", static_cast<unsigned>(presentResult));
+  }
+  // The swap chain has just rotated in a new backbuffer. The proxy must drop its cached
+  // RenderTarget so the next getRenderTarget() picks that backbuffer up; otherwise Surface keeps
+  // drawing into the same slot and the user sees a frozen frame regardless of input.
+  auto* activeProxy = _platformState->currentProxyRaw;
+  if (activeProxy != nullptr) {
+    activeProxy->releaseFrame();
+  }
+}
+
+}  // namespace tgfx
diff --git a/test/src/d3d12/DevicePool.cpp b/test/src/d3d12/DevicePool.cpp
new file mode 100644
index 000000000..faf7c40f8
--- /dev/null
+++ b/test/src/d3d12/DevicePool.cpp
@@ -0,0 +1,38 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "utils/DevicePool.h"
+#include "tgfx/gpu/d3d12/D3D12Device.h"
+
+namespace tgfx {
+thread_local std::shared_ptr<Device> cachedDevice = nullptr;
+
+// Returns the D3D12 device cached for the calling thread, creating it on first use. The result of
+// the factory call is cached as-is, so a failed creation (nullptr) is retried on the next call.
+std::shared_ptr<Device> DevicePool::Make() {
+  if (cachedDevice != nullptr) {
+    return cachedDevice;
+  }
+#ifdef TGFX_D3D12_USE_WARP
+  // CI opt-in (-DTGFX_D3D12_USE_WARP=ON): run the test suite on WARP so headless runners without
+  // a hardware adapter can still exercise the D3D12 backend. WARP is functionally complete but
+  // very slow — never enable this for performance baselines.
+  cachedDevice = D3D12Device::MakeWarp();
+#else
+  cachedDevice = D3D12Device::Make();
+#endif
+  return cachedDevice;
+}
+}  // namespace tgfx
diff --git a/vcpkg/ports/tgfx/tgfx-functions.cmake b/vcpkg/ports/tgfx/tgfx-functions.cmake
index 0672f233b..1f925657a 100644
--- a/vcpkg/ports/tgfx/tgfx-functions.cmake
+++ b/vcpkg/ports/tgfx/tgfx-functions.cmake
@@ -74,6 +74,12 @@ function(build_tgfx_single_config SOURCE_PATH NODEJS OUTPUT_DIR IS_DEBUG)
         list(APPEND BUILD_ARGS -DTGFX_USE_OPENGL=OFF)
     endif()
 
+    # Map the vcpkg "d3d12" feature onto the TGFX_USE_D3D12 build option. The value is passed
+    # explicitly in both cases so the result does not depend on the option's default.
+    if("d3d12" IN_LIST FEATURES)
+        list(APPEND BUILD_ARGS -DTGFX_USE_D3D12=ON)
+    else()
+        list(APPEND BUILD_ARGS -DTGFX_USE_D3D12=OFF)
+    endif()
+
     if("threads" IN_LIST FEATURES)
         list(APPEND BUILD_ARGS -DTGFX_USE_THREADS=ON)
         if(VCPKG_TARGET_IS_EMSCRIPTEN)
diff --git a/vcpkg/ports/tgfx/vcpkg.json b/vcpkg/ports/tgfx/vcpkg.json
index db9c42d22..30be2fd62 100644
--- a/vcpkg/ports/tgfx/vcpkg.json
+++ b/vcpkg/ports/tgfx/vcpkg.json
@@ -89,6 +89,10 @@
     "opengl": {
       "description": "Enable OpenGL as the GPU backend"
     },
+    "d3d12": {
+      "description": "Enable D3D12 as the GPU backend",
+      "supports": "windows"
+    },
     "threads": {
       "description": "Enable support for multithreaded rendering"
     },
diff --git a/vendor.json b/vendor.json
index c94abebcb..9f65e6153 100644
--- a/vendor.json
+++ b/vendor.json
@@ -347,10 +347,12 @@
       "cmake": {
         "targets": [
           "spirv-cross-core",
-          "spirv-cross-glsl"
+          "spirv-cross-glsl",
+          "spirv-cross-hlsl"
         ],
         "arguments": [
-          "-DSPIRV_CROSS_ENABLE_TESTS=OFF"
+          "-DSPIRV_CROSS_ENABLE_TESTS=OFF",
+          "-DCMAKE_CXX_FLAGS=\"-w\""
         ],
         "platforms": [
           "win"
diff --git a/win/CMakeLists.txt b/win/CMakeLists.txt
index 20fe3c897..85899dfa7 100644
--- a/win/CMakeLists.txt
+++ b/win/CMakeLists.txt
@@ -47,6 +47,9 @@ file(GLOB HELLO_2D_COMMON_FILES src/*.*)
 if (TGFX_USE_VULKAN)
     file(GLOB HELLO_2D_BACKEND_FILES vulkan/*.*)
     list(APPEND HELLO_2D_INCLUDES vulkan)
+elseif (TGFX_USE_D3D12)
+    file(GLOB HELLO_2D_BACKEND_FILES d3d12/*.*)
+    list(APPEND HELLO_2D_INCLUDES d3d12)
 else ()
     file(GLOB HELLO_2D_BACKEND_FILES wgl/*.*)
     list(APPEND HELLO_2D_INCLUDES wgl)
diff --git a/win/d3d12/TGFXWindow.cpp b/win/d3d12/TGFXWindow.cpp
new file mode 100644
index 000000000..f73e8a493
--- /dev/null
+++ b/win/d3d12/TGFXWindow.cpp
@@ -0,0 +1,399 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include "TGFXWindow.h"
+#include <algorithm>
+#include <cmath>
+#include <filesystem>
+#if WINVER >= 0x0603  // Windows 8.1
+#include <shellscalingapi.h>
+#endif
+#include "hello2d/AppHost.h"
+#include "hello2d/LayerBuilder.h"
+
+namespace hello2d {
+// Win32 window class name used when registering and unregistering the window class.
+static constexpr LPCWSTR ClassName = L"TGFXWindow";
+// Upper and lower bounds that zoomScale is clamped to when zooming by wheel or pinch gesture.
+static constexpr float MAX_ZOOM = 1000.0f;
+static constexpr float MIN_ZOOM = 0.001f;
+// Divisor applied to the mouse-wheel delta before it is exponentiated into a zoom step.
+static constexpr float WHEEL_RATIO = 400.0f;
+
+// Prepares the app host and display list; the native window itself is created later by open().
+TGFXWindow::TGFXWindow() {
+  createAppHost();
+}
+
+// Destroys the native window if it is still open.
+TGFXWindow::~TGFXWindow() {
+  destroy();
+}
+
+// Creates and shows the native window (800x600 logical pixels scaled by the pixel ratio),
+// records the initial client size, builds the layer tree and requests the first paint.
+// Any previously opened window is destroyed first. Returns false if window creation fails.
+bool TGFXWindow::open() {
+  destroy();
+  const WNDCLASS wndClass = RegisterWindowClass();
+  const auto scale = getPixelRatio();
+  const int windowWidth = static_cast<int>(scale * 800);
+  const int windowHeight = static_cast<int>(scale * 600);
+  windowHandle = CreateWindowEx(WS_EX_APPWINDOW, wndClass.lpszClassName, L"Hello2D",
+                                WS_OVERLAPPEDWINDOW, 0, 0, windowWidth, windowHeight, nullptr,
+                                nullptr, wndClass.hInstance, this);
+  if (windowHandle == nullptr) {
+    return false;
+  }
+
+  // NOTE(review): Win32 documentation states that a window registered for WM_TOUCH stops
+  // receiving WM_GESTURE, yet handleMessage() only handles WM_GESTURE — confirm this
+  // registration is intended.
+  RegisterTouchWindow(windowHandle, 0);
+  // WndProc looks the instance up through GWLP_USERDATA, so messages are routed to
+  // handleMessage() only from this point on.
+  SetWindowLongPtr(windowHandle, GWLP_USERDATA, reinterpret_cast<LONG_PTR>(this));
+  centerAndShow();
+  ShowWindow(windowHandle, SW_SHOW);
+  UpdateWindow(windowHandle);
+
+  RECT clientRect = {};
+  GetClientRect(windowHandle, &clientRect);
+  lastSurfaceWidth = static_cast<int>(clientRect.right - clientRect.left);
+  lastSurfaceHeight = static_cast<int>(clientRect.bottom - clientRect.top);
+
+  updateZoomScaleAndOffset();
+  updateLayerTree();
+  ::InvalidateRect(windowHandle, nullptr, FALSE);
+  return true;
+}
+
+// Fills in and registers the window class used by open(), returning the descriptor so the caller
+// can reuse its class name and module instance. The RegisterClass result is not checked.
+WNDCLASS TGFXWindow::RegisterWindowClass() {
+  const auto moduleInstance = GetModuleHandle(nullptr);
+  WNDCLASS wc{};
+  wc.style = CS_HREDRAW | CS_VREDRAW;
+  wc.lpfnWndProc = WndProc;
+  wc.cbClsExtra = 0;
+  wc.cbWndExtra = 0;
+  wc.hInstance = moduleInstance;
+  wc.hIcon = LoadIcon(moduleInstance, L"IDI_ICON1");
+  wc.hCursor = LoadCursor(nullptr, IDC_ARROW);
+  // No background brush: the window content is painted entirely by draw().
+  wc.hbrBackground = nullptr;
+  wc.lpszMenuName = nullptr;
+  wc.lpszClassName = ClassName;
+  RegisterClass(&wc);
+  return wc;
+}
+
+// Static window procedure. Routes messages to the TGFXWindow instance stored in GWLP_USERDATA;
+// messages that arrive before that pointer is set fall through to DefWindowProc.
+LRESULT CALLBACK TGFXWindow::WndProc(HWND window, UINT message, WPARAM wparam,
+                                     LPARAM lparam) noexcept {
+  auto* self = reinterpret_cast<TGFXWindow*>(GetWindowLongPtr(window, GWLP_USERDATA));
+  if (self == nullptr) {
+    return DefWindowProc(window, message, wparam, lparam);
+  }
+  return self->handleMessage(window, message, wparam, lparam);
+}
+
+// Per-instance message handler.
+//
+// Handles activation, destruction, resizing, painting, left-click (switch to the next sample),
+// mouse-wheel scrolling/zooming and pinch-zoom gestures. Returns 0 for every message handled
+// here and the DefWindowProc result for everything else.
+LRESULT TGFXWindow::handleMessage(HWND hwnd, UINT message, WPARAM wparam, LPARAM lparam) noexcept {
+  switch (message) {
+    case WM_ACTIVATE:
+      // Rendering is paused while the window is inactive.
+      isDrawing = (LOWORD(wparam) != WA_INACTIVE);
+      break;
+    case WM_DESTROY:
+      destroy();
+      PostQuitMessage(0);
+      break;
+    case WM_SIZE: {
+      RECT rect;
+      GetClientRect(windowHandle, &rect);
+      lastSurfaceWidth = static_cast<int>(rect.right - rect.left);
+      lastSurfaceHeight = static_cast<int>(rect.bottom - rect.top);
+      applyCenteringTransform();
+      if (tgfxWindow) {
+        // Drop the cached surface so draw() recreates it for the new size, and submit the next
+        // frame right away instead of deferring it by one frame.
+        surface = nullptr;
+        presentImmediately = true;
+      }
+      ::InvalidateRect(windowHandle, nullptr, FALSE);
+      break;
+    }
+    case WM_PAINT: {
+      PAINTSTRUCT ps;
+      BeginPaint(hwnd, &ps);
+      if (isDrawing) {
+        draw();
+        // Invalidate again so another WM_PAINT follows, keeping the render loop running.
+        ::InvalidateRect(windowHandle, nullptr, FALSE);
+      }
+      EndPaint(hwnd, &ps);
+      break;
+    }
+    case WM_LBUTTONUP: {
+      // A click advances to the next sample and resets zoom and scroll.
+      int count = hello2d::LayerBuilder::Count();
+      if (count > 0) {
+        currentDrawerIndex = (currentDrawerIndex + 1) % count;
+        zoomScale = 1.0f;
+        contentOffset = {0.0f, 0.0f};
+        updateLayerTree();
+        updateZoomScaleAndOffset();
+        ::InvalidateRect(windowHandle, nullptr, FALSE);
+      }
+      break;
+    }
+    case WM_MOUSEWHEEL: {
+      // WM_MOUSEWHEEL reports the cursor position in screen coordinates.
+      POINT mousePoint = {GET_X_LPARAM(lparam), GET_Y_LPARAM(lparam)};
+      ScreenToClient(hwnd, &mousePoint);
+      float pixelX = static_cast<float>(mousePoint.x);
+      float pixelY = static_cast<float>(mousePoint.y);
+      bool isCtrlPressed = (GetKeyState(VK_CONTROL) & 0x8000) != 0;
+      bool isShiftPressed = (GetKeyState(VK_SHIFT) & 0x8000) != 0;
+
+      if (isCtrlPressed) {
+        // Ctrl + wheel zooms while keeping the content under the cursor fixed.
+        float zoomStep = std::exp(GET_WHEEL_DELTA_WPARAM(wparam) / WHEEL_RATIO);
+        float newZoom = std::clamp(zoomScale * zoomStep, MIN_ZOOM, MAX_ZOOM);
+        float oldZoom = zoomScale;
+        contentOffset.x = pixelX - ((pixelX - contentOffset.x) / oldZoom) * newZoom;
+        contentOffset.y = pixelY - ((pixelY - contentOffset.y) / oldZoom) * newZoom;
+        zoomScale = newZoom;
+      } else {
+        // Plain wheel scrolls vertically, Shift + wheel scrolls horizontally.
+        float wheelDelta = static_cast<float>(GET_WHEEL_DELTA_WPARAM(wparam));
+        if (isShiftPressed) {
+          contentOffset.x += wheelDelta;
+        } else {
+          contentOffset.y -= wheelDelta;
+        }
+      }
+      updateZoomScaleAndOffset();
+      ::InvalidateRect(windowHandle, nullptr, FALSE);
+      break;
+    }
+    case WM_GESTURE: {
+      GESTUREINFO gestureInfo{};
+      gestureInfo.cbSize = sizeof(GESTUREINFO);
+      if (GetGestureInfo(reinterpret_cast<HGESTUREINFO>(lparam), &gestureInfo)) {
+        if (gestureInfo.dwID == GID_ZOOM) {
+          double currentArgument = static_cast<double>(gestureInfo.ullArguments);
+          // lastZoomArgument is 0 until the first zoom message of a gesture has been seen, so
+          // the first message only records the baseline.
+          if (lastZoomArgument != 0.0) {
+            double zoomFactor = currentArgument / lastZoomArgument;
+            // For WM_GESTURE, lparam is the HGESTUREINFO handle rather than packed coordinates;
+            // the gesture position (in screen coordinates) is GESTUREINFO::ptsLocation.
+            POINT gesturePoint = {gestureInfo.ptsLocation.x, gestureInfo.ptsLocation.y};
+            ScreenToClient(hwnd, &gesturePoint);
+            float pixelX = static_cast<float>(gesturePoint.x);
+            float pixelY = static_cast<float>(gesturePoint.y);
+            float newZoom =
+                std::clamp(zoomScale * static_cast<float>(zoomFactor), MIN_ZOOM, MAX_ZOOM);
+            float oldZoom = zoomScale;
+            contentOffset.x = pixelX - ((pixelX - contentOffset.x) / oldZoom) * newZoom;
+            contentOffset.y = pixelY - ((pixelY - contentOffset.y) / oldZoom) * newZoom;
+            zoomScale = newZoom;
+          }
+          lastZoomArgument = currentArgument;
+        }
+        if (gestureInfo.dwFlags & GF_END) {
+          lastZoomArgument = 0.0;
+        }
+        // The message is handled here rather than passed to DefWindowProc, so the gesture handle
+        // has to be closed explicitly.
+        CloseGestureInfoHandle(reinterpret_cast<HGESTUREINFO>(lparam));
+        updateZoomScaleAndOffset();
+        ::InvalidateRect(windowHandle, nullptr, FALSE);
+      }
+      break;
+    }
+    default:
+      return DefWindowProc(windowHandle, message, wparam, lparam);
+  }
+  return 0;
+}
+
+// Destroys the native window (if any) and unregisters the window class. Safe to call repeatedly:
+// it does nothing once windowHandle has been cleared.
+// NOTE(review): this is also called from the WM_DESTROY handler, i.e. while the window is already
+// being destroyed, so DestroyWindow can be entered for a handle that is mid-teardown — confirm
+// this is intended. tgfxWindow and surface are not released here.
+void TGFXWindow::destroy() {
+  if (windowHandle) {
+    DestroyWindow(windowHandle);
+    windowHandle = nullptr;
+    UnregisterClass(ClassName, nullptr);
+  }
+}
+
+// Centers the window over its owner (or over the monitor work area when it has no owner), keeps
+// it inside the work area, and shows it without resizing or activating it. Child windows are
+// left untouched.
+void TGFXWindow::centerAndShow() {
+  const bool isChildWindow = (GetWindowStyle(windowHandle) & WS_CHILD) != 0;
+  if (isChildWindow) {
+    return;
+  }
+  RECT windowRect = {0};
+  ::GetWindowRect(windowHandle, &windowRect);
+
+  // The monitor is chosen from the owner when there is one, otherwise from the window itself.
+  const HWND owner = ::GetWindowOwner(windowHandle);
+  const HWND monitorSource = (owner != nullptr) ? owner : windowHandle;
+
+  MONITORINFO monitorInfo = {};
+  monitorInfo.cbSize = sizeof(monitorInfo);
+  ::GetMonitorInfo(::MonitorFromWindow(monitorSource, MONITOR_DEFAULTTONEAREST), &monitorInfo);
+  const RECT workArea = monitorInfo.rcWork;
+
+  // Rectangle to center over: the owner's bounds, or the whole work area.
+  RECT anchorRect = {0};
+  if (owner != nullptr) {
+    ::GetWindowRect(owner, &anchorRect);
+  } else {
+    anchorRect = workArea;
+  }
+
+  const int windowWidth = windowRect.right - windowRect.left;
+  const int windowHeight = windowRect.bottom - windowRect.top;
+  int left = (anchorRect.left + anchorRect.right) / 2 - windowWidth / 2;
+  int top = (anchorRect.top + anchorRect.bottom) / 2 - windowHeight / 2;
+
+  // Keep the window inside the work area horizontally. A negative coordinate falls back to
+  // centering on the primary screen.
+  if (left < workArea.left) {
+    left = (left < 0) ? GetSystemMetrics(SM_CXSCREEN) / 2 - windowWidth / 2 : workArea.left;
+  } else if (left + windowWidth > workArea.right) {
+    left = workArea.right - windowWidth;
+  }
+
+  // Same rule vertically.
+  if (top < workArea.top) {
+    top = (top < 0) ? GetSystemMetrics(SM_CYSCREEN) / 2 - windowHeight / 2 : workArea.top;
+  } else if (top + windowHeight > workArea.bottom) {
+    top = workArea.bottom - windowHeight;
+  }
+
+  // SWP_NOSIZE makes the -1 width/height arguments irrelevant.
+  ::SetWindowPos(windowHandle, nullptr, left, top, -1, -1,
+                 SWP_NOSIZE | SWP_NOZORDER | SWP_NOACTIVATE | SWP_SHOWWINDOW);
+}
+
+// Returns the horizontal effective DPI of the window's monitor divided by 96. The monitor at the
+// origin (defaulting to the primary one) is used before the window exists. Builds targeting a
+// Windows version older than 8.1 always return 1.0.
+float TGFXWindow::getPixelRatio() {
+#if WINVER >= 0x0603
+  HMONITOR targetMonitor = (windowHandle != nullptr)
+                               ? ::MonitorFromWindow(windowHandle, MONITOR_DEFAULTTONEAREST)
+                               : ::MonitorFromPoint(POINT{0, 0}, MONITOR_DEFAULTTOPRIMARY);
+  // Both values start at 96 so a failed query yields a ratio of 1.0.
+  UINT horizontalDpi = 96;
+  UINT verticalDpi = 96;
+  GetDpiForMonitor(targetMonitor, MDT_EFFECTIVE_DPI, &horizontalDpi, &verticalDpi);
+  return static_cast<float>(horizontalDpi) / 96.0f;
+#else
+  return 1.0f;
+#endif
+}
+
+// Creates the AppHost, configures the display list for tiled rendering, and registers the sample
+// images and typefaces. Asset paths are resolved relative to this source file's location (three
+// directory levels up from __FILE__).
+void TGFXWindow::createAppHost() {
+  appHost = std::make_unique<hello2d::AppHost>();
+
+  displayList.setRenderMode(tgfx::RenderMode::Tiled);
+  displayList.setAllowZoomBlur(true);
+  displayList.setMaxTileCount(512);
+
+  const std::filesystem::path sourceFile = __FILE__;
+  const auto rootPath = sourceFile.parent_path().parent_path().parent_path().string();
+
+  // Sample images.
+  const auto bridgePath = rootPath + R"(\resources\assets\bridge.jpg)";
+  appHost->addImage("bridge", tgfx::Image::MakeFromFile(bridgePath));
+  const auto logoPath = rootPath + R"(\resources\assets\tgfx.png)";
+  appHost->addImage("TGFX", tgfx::Image::MakeFromFile(logoPath));
+
+  // Typefaces: a system font for regular text and a bundled font for emoji.
+  auto defaultTypeface = tgfx::Typeface::MakeFromName("Microsoft YaHei", "");
+  appHost->addTypeface("default", defaultTypeface);
+  const auto emojiFontPath = rootPath + R"(\resources\font\NotoColorEmoji.ttf)";
+  auto emojiTypeface = tgfx::Typeface::MakeFromPath(emojiFontPath);
+  appHost->addTypeface("emoji", emojiTypeface);
+}
+
+// Rebuilds the content layer when the selected sample changed or no layer has been built yet,
+// then installs it as the only child of the display list root.
+void TGFXWindow::updateLayerTree() {
+  const int builderCount = hello2d::LayerBuilder::Count();
+  const int selectedIndex = (builderCount > 0) ? (currentDrawerIndex % builderCount) : 0;
+  if (selectedIndex == lastDrawIndex && contentLayer) {
+    // Already showing the requested sample.
+    return;
+  }
+  auto builder = hello2d::LayerBuilder::GetByIndex(selectedIndex);
+  if (builder) {
+    contentLayer = builder->buildLayerTree(appHost.get());
+    if (contentLayer) {
+      displayList.root()->removeChildren();
+      displayList.root()->addChild(contentLayer);
+      applyCenteringTransform();
+    }
+  }
+  // The index is recorded even when no builder or layer was produced.
+  lastDrawIndex = selectedIndex;
+}
+
+// Pushes the current zoom scale and content offset to the display list.
+void TGFXWindow::updateZoomScaleAndOffset() {
+  displayList.setZoomScale(zoomScale);
+  displayList.setContentOffset(contentOffset.x, contentOffset.y);
+}
+
+// Re-applies the centering transform to the content layer for the last known surface size. Does
+// nothing until both a valid size and a content layer are available.
+void TGFXWindow::applyCenteringTransform() {
+  if (lastSurfaceWidth <= 0 || lastSurfaceHeight <= 0 || !contentLayer) {
+    return;
+  }
+  const auto surfaceWidth = static_cast<float>(lastSurfaceWidth);
+  const auto surfaceHeight = static_cast<float>(lastSurfaceHeight);
+  hello2d::LayerBuilder::ApplyCenteringTransform(contentLayer, surfaceWidth, surfaceHeight);
+}
+
+// Renders one frame.
+//
+// Lazily creates the D3D12 device/window pair, recreates the surface when it has been reset, and
+// renders the background plus the display list. The resulting recording is normally held back
+// and submitted on the following call (while the previous one is submitted now); when
+// presentImmediately is set, the fresh recording is submitted right away instead.
+void TGFXWindow::draw() {
+  if (tgfxWindow == nullptr) {
+    auto newDevice = tgfx::D3D12Device::Make();
+    if (newDevice) {
+      tgfxWindow = tgfx::D3D12Window::MakeFrom(windowHandle, newDevice);
+    }
+  }
+  if (tgfxWindow == nullptr) {
+    return;
+  }
+
+  // Nothing to render into while the client area is empty (e.g. minimized).
+  RECT clientRect;
+  GetClientRect(windowHandle, &clientRect);
+  const auto clientWidth = static_cast<int>(clientRect.right - clientRect.left);
+  const auto clientHeight = static_cast<int>(clientRect.bottom - clientRect.top);
+  if (clientWidth <= 0 || clientHeight <= 0) {
+    return;
+  }
+  const auto scale = getPixelRatio();
+
+  // Skip the frame when the content is unchanged and no deferred recording is waiting.
+  const bool nothingToDo = !displayList.hasContentChanged() && lastRecording == nullptr;
+  if (nothingToDo) {
+    return;
+  }
+
+  auto device = tgfxWindow->getDevice();
+  if (device == nullptr) {
+    return;
+  }
+  auto context = device->lockContext();
+  if (context == nullptr) {
+    return;
+  }
+
+  if (surface == nullptr) {
+    // Submit the pending recording before a new surface is created.
+    if (lastRecording) {
+      context->submit(std::move(lastRecording));
+    }
+    surface = tgfx::Surface::MakeFrom(context, tgfxWindow);
+    if (surface == nullptr) {
+      device->unlock();
+      return;
+    }
+  }
+
+  auto canvas = surface->getCanvas();
+  canvas->clear();
+  hello2d::DrawBackground(canvas, surface->width(), surface->height(), scale);
+  displayList.render(surface.get(), false);
+
+  auto pending = context->flush();
+  if (presentImmediately) {
+    presentImmediately = false;
+  } else {
+    // Hold the fresh recording back for one frame; `pending` now refers to the previous one.
+    std::swap(lastRecording, pending);
+  }
+  if (pending) {
+    context->submit(std::move(pending));
+  }
+
+  device->unlock();
+}
+}  // namespace hello2d
diff --git a/win/d3d12/TGFXWindow.h b/win/d3d12/TGFXWindow.h
new file mode 100644
index 000000000..c24cc6dc1
--- /dev/null
+++ b/win/d3d12/TGFXWindow.h
@@ -0,0 +1,81 @@
+/////////////////////////////////////////////////////////////////////////////////////////////////
+//
+//  Tencent is pleased to support the open source community by making tgfx available.
+//
+//  Copyright (C) 2026 Tencent. All rights reserved.
+//
+//  Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+//  in compliance with the License. You may obtain a copy of the License at
+//
+//      https://opensource.org/licenses/BSD-3-Clause
+//
+//  unless required by applicable law or agreed to in writing, software distributed under the
+//  license is distributed on an "as is" basis, without warranties or conditions of any kind,
+//  either express or implied. see the license for the specific language governing permissions
+//  and limitations under the license.
+//
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#ifndef UNICODE
+#define UNICODE
+#endif
+
+#include <Windows.h>
+#include <Windowsx.h>
+#include <Winuser.h>
+#include <functional>
+#include <memory>
+#include <string>
+#include "hello2d/AppHost.h"
+#include "hello2d/LayerBuilder.h"
+#include "tgfx/core/Point.h"
+#include "tgfx/core/Surface.h"
+#include "tgfx/gpu/Recording.h"
+#include "tgfx/gpu/d3d12/D3D12Device.h"
+#include "tgfx/gpu/d3d12/D3D12Window.h"
+#include "tgfx/layers/DisplayList.h"
+
+namespace hello2d {
+// Native Win32 host window for the hello2d samples, rendered through the tgfx D3D12 backend.
+class TGFXWindow {
+ public:
+  // Creates the app host and loads the sample resources; no native window exists until open().
+  TGFXWindow();
+  // Destroys the native window if it is still open.
+  virtual ~TGFXWindow();
+
+  // Creates, centers and shows the native window. Returns false if the window cannot be created.
+  bool open();
+
+ private:
+  // NOTE(review): members are destroyed in reverse declaration order (e.g. surface before
+  // tgfxWindow); keep that in mind before reordering them.
+
+  // Native window handle; set by open(), cleared by destroy().
+  HWND windowHandle = nullptr;
+  // Index of the requested sample; advanced on every left-button release.
+  int currentDrawerIndex = 0;
+  // Sample index the layer tree was last built for (-1 before the first build).
+  int lastDrawIndex = -1;
+  // ullArguments of the previous GID_ZOOM message; 0 when no zoom gesture is in progress.
+  double lastZoomArgument = 0.0;
+  // Current zoom factor and scroll offset, pushed to displayList by updateZoomScaleAndOffset().
+  float zoomScale = 1.0f;
+  tgfx::Point contentOffset = {0.0f, 0.0f};
+  // D3D12 window wrapper; created lazily by draw().
+  std::shared_ptr<tgfx::D3D12Window> tgfxWindow = nullptr;
+  // Surface created from tgfxWindow; reset on WM_SIZE and recreated by draw().
+  std::shared_ptr<tgfx::Surface> surface = nullptr;
+  // Holds the images and typefaces handed to the layer builders.
+  std::shared_ptr<hello2d::AppHost> appHost = nullptr;
+  // Display list that renders contentLayer.
+  tgfx::DisplayList displayList = {};
+  // Layer tree built for the current sample.
+  std::shared_ptr<tgfx::Layer> contentLayer = nullptr;
+  // Recording produced by the previous draw(), submitted on the following draw().
+  std::unique_ptr<tgfx::Recording> lastRecording = nullptr;
+  // Client-area size captured in open() and on WM_SIZE.
+  int lastSurfaceWidth = 0;
+  int lastSurfaceHeight = 0;
+  // When true, the next draw() submits its recording right away instead of deferring it by one
+  // frame; set again after every resize.
+  bool presentImmediately = true;
+
+  // Registers the Win32 window class and returns its descriptor.
+  static WNDCLASS RegisterWindowClass();
+  // Static window procedure; forwards to handleMessage() of the instance stored on the window.
+  static LRESULT CALLBACK WndProc(HWND window, UINT message, WPARAM wparam, LPARAM lparam) noexcept;
+
+  // Per-instance message handler.
+  LRESULT handleMessage(HWND window, UINT message, WPARAM wparam, LPARAM lparam) noexcept;
+
+  // Destroys the native window and unregisters the window class.
+  void destroy();
+  // Centers the window over its owner or the monitor work area and shows it.
+  void centerAndShow();
+  // Returns the monitor's effective DPI divided by 96.
+  float getPixelRatio();
+  // Creates the app host and loads sample images and typefaces.
+  void createAppHost();
+  // Rebuilds the content layer when the selected sample changes.
+  void updateLayerTree();
+  // Pushes zoomScale and contentOffset to the display list.
+  void updateZoomScaleAndOffset();
+  // Re-centers the content layer for the last known surface size.
+  void applyCenteringTransform();
+  // Renders one frame.
+  void draw();
+
+  // False while the window is inactive; WM_PAINT only draws when this is true.
+  bool isDrawing = true;
+};
+}  // namespace hello2d