From 8c84bb29659bdec2a842d0c066d3952d4f5c6791 Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 15:00:04 +0200
Subject: [PATCH 01/12] fix(examples): batch performance-overlay sprites via a
 Container

The overlay rendered 1600 sprites with one context.render() per sprite,
emitting one draw call each. Adding them to a Container and rendering it
once batches them into a single draw call (7.8ms -> 1.8ms on the spike).
---
 examples/debug-layer/performance-overlay.js | 12 +++++++++---
 examples/debug-layer/performance-overlay.ts | 11 +++++++++--
 2 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/examples/debug-layer/performance-overlay.js b/examples/debug-layer/performance-overlay.js
index 275375b1..7534cc11 100644
--- a/examples/debug-layer/performance-overlay.js
+++ b/examples/debug-layer/performance-overlay.js
@@ -1,5 +1,5 @@
 // Auto-generated from performance-overlay.ts — edit the .ts source, not this file.
-import { Application, Color, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs';
+import { Application, Color, Container, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs';
 import { DebugOverlay } from '@codexo/exojs/debug';
 const app = new Application({
     canvas: {
@@ -17,14 +17,21 @@ const debug = new DebugOverlay(app);
 debug.layers.performance.visible = true;
 class PerformanceOverlayScene extends Scene {
     sprites;
+    layer;
     async load(loader) {
         await loader.load(Texture, { bunny: 'image/ship-a.png' });
     }
     init(loader) {
         const { width, height } = this.app.canvas;
+        // All sprites share one texture, so adding them to a single container and
+        // rendering it once lets the renderer batch them into a single draw call.
+        // Rendering each sprite with its own `context.render(sprite)` call would
+        // instead emit one draw call per sprite and tank the frame rate.
+        this.layer = new Container();
         this.sprites = Array.from({ length: 1600 }, () => {
             const sprite = new Sprite(loader.get(Texture, 'bunny')).setAnchor(0.5).setScale(0.25);
             sprite.setPosition(Math.random() * width, Math.random() * height);
+            this.layer.addChild(sprite);
             return {
                 sprite,
                 vx: (Math.random() - 0.5) * 120,
@@ -47,8 +54,7 @@ class PerformanceOverlayScene extends Scene {
     }
     draw(context) {
         context.backend.clear();
-        for (const { sprite } of this.sprites)
-            context.render(sprite);
+        context.render(this.layer);
     }
 }
 app.start(new PerformanceOverlayScene());
diff --git a/examples/debug-layer/performance-overlay.ts b/examples/debug-layer/performance-overlay.ts
index 7ff4b2b8..b5b1d415 100644
--- a/examples/debug-layer/performance-overlay.ts
+++ b/examples/debug-layer/performance-overlay.ts
@@ -1,4 +1,4 @@
-import { Application, Color, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs';
+import { Application, Color, Container, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs';
 import { DebugOverlay } from '@codexo/exojs/debug';
 
 const app = new Application({
@@ -19,6 +19,7 @@ debug.layers.performance.visible = true;
 
 class PerformanceOverlayScene extends Scene {
     private sprites!: { sprite: Sprite; vx: number; vy: number }[];
+    private layer!: Container;
 
     override async load(loader): Promise<void> {
         await loader.load(Texture, { bunny: 'image/ship-a.png' });
@@ -27,9 +28,15 @@ class PerformanceOverlayScene extends Scene {
     override init(loader): void {
         const { width, height } = this.app.canvas;
 
+        // All sprites share one texture, so adding them to a single container and
+        // rendering it once lets the renderer batch them into a single draw call.
+        // Rendering each sprite with its own `context.render(sprite)` call would
+        // instead emit one draw call per sprite and tank the frame rate.
+        this.layer = new Container();
         this.sprites = Array.from({ length: 1600 }, () => {
             const sprite = new Sprite(loader.get(Texture, 'bunny')).setAnchor(0.5).setScale(0.25);
             sprite.setPosition(Math.random() * width, Math.random() * height);
+            this.layer.addChild(sprite);
             return {
                 sprite,
                 vx: (Math.random() - 0.5) * 120,
@@ -53,7 +60,7 @@ class PerformanceOverlayScene extends Scene {
 
     override draw(context): void {
         context.backend.clear();
-        for (const { sprite } of this.sprites) context.render(sprite);
+        context.render(this.layer);
     }
 }
 

From cea0c6dc438375dc2ec1841256d92cd9f0439a2d Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 15:03:12 +0200
Subject: [PATCH 02/12] test(rendering): add cross-call sprite batching
 regression test (red)

---
 test/perf/rendering/harness.ts                | 47 +++++++++++++++++++
 test/perf/rendering/structural-sprite.test.ts | 21 ++++++++-
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/test/perf/rendering/harness.ts b/test/perf/rendering/harness.ts
index cd21b523..67da7aae 100644
--- a/test/perf/rendering/harness.ts
+++ b/test/perf/rendering/harness.ts
@@ -9,6 +9,7 @@
  */
 import type { RenderNode } from '#rendering/RenderNode';
 import type { View } from '#rendering/View';
+import { playRenderTree } from '#rendering/plan/playRenderTree';
 import { WebGl2Backend } from '#rendering/webgl2/WebGl2Backend';
 
 import { wireCoreRenderers } from '../../rendering/browser/_coreRenderers';
@@ -161,3 +162,49 @@ export const measureSteadyFrame = (harness: WebGl2Harness, root: RenderNode, war
 
   return metrics!;
 };
+
+/**
+ * Render each node via its own setView + playRenderTree (exactly what
+ * RenderingContext.render does per call), then flush once — i.e. the
+ * "one context.render() per drawable in a loop" pattern. Returns the metrics of
+ * the final warmed frame.
+ */
+export const measureCrossCallFrame = (harness: WebGl2Harness, nodes: readonly RenderNode[], warmupFrames = 2): FrameMetrics => {
+  const { backend, recorder } = harness;
+  let metrics: FrameMetrics | null = null;
+
+  for (let i = 0; i <= warmupFrames; i++) {
+    backend.resetStats();
+    recorder.reset();
+    backend.clear();
+
+    const view = backend.view;
+    for (const node of nodes) {
+      backend.setView(view);
+      playRenderTree(node, backend);
+    }
+    backend.flush();
+
+    const stats = backend.stats;
+    metrics = {
+      drawCalls: stats.drawCalls,
+      batches: stats.batches,
+      instances: recorder.instances,
+      visibleNodes: stats.submittedNodes,
+      culledNodes: stats.culledNodes,
+      renderPasses: stats.renderPasses,
+      textureBinds: recorder.textureBinds,
+      samplerBinds: recorder.samplerBinds,
+      programChanges: recorder.programChanges,
+      blendChanges: recorder.blendChanges,
+      bufferUploads: recorder.bufferUploads,
+      bufferReallocations: recorder.bufferReallocations,
+      uploadedBufferBytes: recorder.bufferUploadBytes,
+      transformRows: recorder.transformRows,
+      transformUploads: recorder.transformUploads,
+      transformUploadBytes: recorder.transformUploadBytes,
+    };
+  }
+
+  return metrics!;
+};
diff --git a/test/perf/rendering/structural-sprite.test.ts b/test/perf/rendering/structural-sprite.test.ts
index 00fd1cb9..9a00c99d 100644
--- a/test/perf/rendering/structural-sprite.test.ts
+++ b/test/perf/rendering/structural-sprite.test.ts
@@ -14,7 +14,7 @@ import { Sprite } from '#rendering/sprite/Sprite';
 import type { BlendModes } from '#rendering/types';
 
 import { buildSpriteScene, makeTextures } from './fixtures';
-import { createWebGl2Harness, measureSteadyFrame, type WebGl2Harness } from './harness';
+import { createWebGl2Harness, measureCrossCallFrame, measureSteadyFrame, type WebGl2Harness } from './harness';
 
 const withHarness = (fn: (harness: WebGl2Harness) => void): void => {
   const harness = createWebGl2Harness();
@@ -138,6 +138,25 @@ describe('structural — Sprite', () => {
     });
   });
 
+  it('1000 per-call renders / 1 texture → one draw (cross-call batching)', () => {
+    withHarness(harness => {
+      const [texture] = makeTextures(1);
+      const sprites = Array.from({ length: 1000 }, (_, i) => {
+        const sprite = new Sprite(texture);
+        sprite.setPosition(i % 100, Math.floor(i / 100));
+        return sprite;
+      });
+
+      const m = measureCrossCallFrame(harness, sprites, 2);
+
+      expect(m.drawCalls).toBe(1);
+      expect(m.instances).toBe(1000);
+      expect(m.visibleNodes).toBe(1000);
+
+      for (const sprite of sprites) sprite.destroy();
+    });
+  });
+
   it('static transforms skip re-upload; moving transforms re-upload all rows', () => {
     withHarness(harness => {
       const staticScene = buildSpriteScene({ count: 500, textures: makeTextures(1) });

From 0775036f732f55b88fe91d8a6eecac6daf8c3bd9 Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 15:08:42 +0200
Subject: [PATCH 03/12] perf(rendering): frame-scoped draw-plan lifecycle for
 cross-call batching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

setView flushes only on real view change; the transform buffer resets once
per frame; the plan builder bases node indices at the frame buffer count;
nested plans isolate their rows. Per-call renders now batch (1000 -> 1 draw).
Leaves the barrier-path allocation gate red — fixed in the next commit.
---
 src/rendering/TransformBuffer.ts        | 24 ++++++++++++++
 src/rendering/plan/RenderPlanBuilder.ts |  8 +++--
 src/rendering/webgl2/WebGl2Backend.ts   | 43 +++++++++++++++++++++----
 3 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/src/rendering/TransformBuffer.ts b/src/rendering/TransformBuffer.ts
index 8482f134..4b7da7f7 100644
--- a/src/rendering/TransformBuffer.ts
+++ b/src/rendering/TransformBuffer.ts
@@ -94,6 +94,11 @@ export class TransformBuffer {
     return this._version;
   }
 
+  /** Running content hash of the rows written since begin(). @internal */
+  public get frameHash(): number {
+    return this._frameHash;
+  }
+
   public begin(expectedCount = 0): this {
     if (expectedCount > 0) {
       this._ensureCapacity(expectedCount);
@@ -117,6 +122,25 @@ export class TransformBuffer {
     return slot;
   }
 
+  /**
+   * Rewind the write cursor to `count`, freeing the rows above it for reuse, and
+   * (optionally) reset the running content hash to the given `frameHash` so the
+   * freed rows' writes don't linger in the hash and trigger spurious re-uploads.
+   * Used by nested draw plans (filters / cacheAsBitmap) to isolate their slots.
+   * @internal
+   */
+  public rewindTo(count: number, frameHash?: number): this {
+    if (count >= 0 && count < this._count) {
+      this._count = count;
+
+      if (frameHash !== undefined) {
+        this._frameHash = frameHash >>> 0;
+      }
+    }
+
+    return this;
+  }
+
   public write(slot: number, transform: Matrix, tint: Color): this {
     if (!Number.isInteger(slot) || slot < 0) {
       throw new Error(`TransformBuffer slot must be a non-negative integer (got ${slot}).`);
diff --git a/src/rendering/plan/RenderPlanBuilder.ts b/src/rendering/plan/RenderPlanBuilder.ts
index db9cbe41..58053f25 100644
--- a/src/rendering/plan/RenderPlanBuilder.ts
+++ b/src/rendering/plan/RenderPlanBuilder.ts
@@ -93,7 +93,11 @@ export class RenderPlanBuilder {
     this._barrierEntryPoolCursor = 0;
     this._scopeStack.length = 0;
     this._hasPending = false;
-    this._nodeIndex = 0;
+    // Base this plan's node indices after whatever earlier render() calls already
+    // wrote into the frame-scoped transform buffer, so every draw across all
+    // render() calls in the frame references a distinct slot and can batch.
+    const frameBase = (backend as { transformBufferCount?: number }).transformBufferCount ?? 0;
+    this._nodeIndex = frameBase;
 
     const rootScope = this._acquireGroupScope(false);
 
@@ -110,7 +114,7 @@ export class RenderPlanBuilder {
       });
     }
 
-    this._plan.nodeCount = this._nodeIndex;
+    this._plan.nodeCount = this._nodeIndex - frameBase;
 
     return this._plan;
   }
diff --git a/src/rendering/webgl2/WebGl2Backend.ts b/src/rendering/webgl2/WebGl2Backend.ts
index e58d6a3a..d25f7e42 100644
--- a/src/rendering/webgl2/WebGl2Backend.ts
+++ b/src/rendering/webgl2/WebGl2Backend.ts
@@ -181,6 +181,8 @@ export class WebGl2Backend implements RenderBackend {
   private _transformTextureCount = -1;
   private _activeDrawCommand: DrawCommand | null = null;
   private _drawPlanDepth = 0;
+  private readonly _planBaseStack: number[] = [];
+  private readonly _planHashStack: number[] = [];
 
   public constructor(app: Application) {
     const canvasOptions = app.options.canvas ?? {};
@@ -279,13 +281,27 @@ export class WebGl2Backend implements RenderBackend {
 
   public resetStats(): this {
     resetRenderStats(this._stats);
+    // The transform buffer is frame-scoped: reset it once per frame here (was
+    // previously reset per render() call in _beginDrawPlan).
+    this._transformBuffer.begin();
 
     return this;
   }
 
+  /** Frame-global slot base the plan builder indexes from. @internal */
+  public get transformBufferCount(): number {
+    return this._transformBuffer.count;
+  }
+
   /** @internal */
-  public _beginDrawPlan(nodeCount: number): void {
-    this._transformBuffer.begin(nodeCount);
+  public _beginDrawPlan(_nodeCount: number): void {
+    // Do NOT reset the transform buffer here — it is frame-scoped (reset in
+    // resetStats). The builder already based this plan's node indices at the
+    // current buffer count, so writes land in fresh frame-global slots and
+    // batches survive across render() calls. Remember this plan's base so a
+    // nested plan can free its rows on end.
+    this._planBaseStack.push(this._transformBuffer.count);
+    this._planHashStack.push(this._transformBuffer.frameHash);
     this._activeDrawCommand = null;
     this._drawPlanDepth++;
   }
@@ -395,13 +411,23 @@ export class WebGl2Backend implements RenderBackend {
   public _endDrawPlan(): void {
     this._activeDrawCommand = null;
 
+    const planBase = this._planBaseStack.pop() ?? 0;
+    const planHash = this._planHashStack.pop() ?? 0;
+
     if (this._drawPlanDepth > 0) {
       this._drawPlanDepth--;
     }
 
-    // Only assert balance at the outermost plan: cacheAsBitmap draws a cache
-    // sprite via a nested render(), whose inner _endDrawPlan sees the still-open
-    // outer clips — those are not leaks.
+    // A nested plan (filter / cacheAsBitmap) just ended: flush its draws, then
+    // free its transform rows so the frame-scoped buffer only grows with
+    // top-level render() calls. Top-level plans (depth back to 0) keep their rows
+    // so cross-call batching survives to the frame-end flush.
+    if (this._drawPlanDepth > 0) {
+      this._flushActiveRenderer();
+      this._transformBuffer.rewindTo(planBase, planHash);
+    }
+
+    // Only assert balance at the outermost plan.
     if (this._drawPlanDepth === 0) {
       this._assertBalancedStencil();
     }
@@ -715,7 +741,12 @@ export class WebGl2Backend implements RenderBackend {
   }
 
   public setView(view: View | null): this {
-    this._flushActiveRenderer();
+    // Only flush the open batch when the view actually changes. The unconditional
+    // flush forced one draw call per render() call (each render() re-applies the
+    // same camera view), defeating cross-call batching.
+    if (this._renderTarget.view !== view) {
+      this._flushActiveRenderer();
+    }
     this._renderTarget.setView(view);
     this._bindRenderTarget(this._renderTarget);
 

From fc4c10ce0d4db3396936e1fed4a4d2cf93368c46 Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 15:22:52 +0200
Subject: [PATCH 04/12] perf(rendering): delta-upload transform texture rows
 per flush

A frame-scoped buffer made barrier flushes re-upload a growing buffer
(O(N^2)). Uploading only [uploadedRows, count) per flush via commitRect makes
it O(N) while keeping the cross-frame hash-guard skip. Fixes the effect-barrier
gate.
---
 src/rendering/webgl2/WebGl2Backend.ts | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/rendering/webgl2/WebGl2Backend.ts b/src/rendering/webgl2/WebGl2Backend.ts
index d25f7e42..9fc23e1b 100644
--- a/src/rendering/webgl2/WebGl2Backend.ts
+++ b/src/rendering/webgl2/WebGl2Backend.ts
@@ -183,6 +183,8 @@ export class WebGl2Backend implements RenderBackend {
   private _drawPlanDepth = 0;
   private readonly _planBaseStack: number[] = [];
   private readonly _planHashStack: number[] = [];
+  /** Rows of the transform texture already uploaded this frame (delta-upload guard). */
+  private _uploadedRows = 0;
 
   public constructor(app: Application) {
     const canvasOptions = app.options.canvas ?? {};
@@ -284,6 +286,7 @@ export class WebGl2Backend implements RenderBackend {
     // The transform buffer is frame-scoped: reset it once per frame here (was
     // previously reset per render() call in _beginDrawPlan).
     this._transformBuffer.begin();
+    this._uploadedRows = 0;
 
     return this;
   }
@@ -425,6 +428,10 @@ export class WebGl2Backend implements RenderBackend {
     if (this._drawPlanDepth > 0) {
       this._flushActiveRenderer();
       this._transformBuffer.rewindTo(planBase, planHash);
+
+      if (planBase < this._uploadedRows) {
+        this._uploadedRows = planBase;
+      }
     }
 
     // Only assert balance at the outermost plan.
@@ -826,6 +833,7 @@ export class WebGl2Backend implements RenderBackend {
       });
       this._transformTextureHash = 0;
       this._transformTextureCount = -1;
+      this._uploadedRows = 0;
     }
 
     const snapshot = this._transformBuffer.commitSnapshot(requiredCount);
@@ -836,10 +844,17 @@ export class WebGl2Backend implements RenderBackend {
     }
 
     if (snapshot.changed || snapshot.count !== this._transformTextureCount || snapshot.hash !== this._transformTextureHash) {
-      nextTransformTexture.commitRect(0, 0, 3, snapshot.count);
-      this._transformBuffer.recordUpload(snapshot.count);
-      this._transformTextureHash = snapshot.hash;
+      const firstRow = Math.min(this._uploadedRows, snapshot.count);
+      const rowCount = snapshot.count - firstRow;
+
+      if (rowCount > 0) {
+        nextTransformTexture.commitRect(0, firstRow, 3, rowCount);
+        this._transformBuffer.recordUpload(rowCount);
+      }
+
+      this._uploadedRows = snapshot.count;
       this._transformTextureCount = snapshot.count;
+      this._transformTextureHash = snapshot.hash;
     }
 
     return this.bindTexture(nextTransformTexture, unit);

From f10e36c5a52a7511ea3a32ba2f9f84e7898f2956 Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 15:28:12 +0200
Subject: [PATCH 05/12] test(rendering): per-call render output matches
 Container render

---
 test/perf/rendering/structural-sprite.test.ts | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/test/perf/rendering/structural-sprite.test.ts b/test/perf/rendering/structural-sprite.test.ts
index 9a00c99d..2ca4440c 100644
--- a/test/perf/rendering/structural-sprite.test.ts
+++ b/test/perf/rendering/structural-sprite.test.ts
@@ -183,4 +183,26 @@ describe('structural — Sprite', () => {
       root.destroy();
     });
   });
+
+  it('per-call renders match a Container render (same draws, instances, transform rows)', () => {
+    withHarness(harness => {
+      const [texture] = makeTextures(1);
+
+      const loose = Array.from({ length: 500 }, (_, i) => {
+        const sprite = new Sprite(texture);
+        sprite.setPosition((i * 7) % 640, (i * 13) % 480);
+        return sprite;
+      });
+      const crossCall = measureCrossCallFrame(harness, loose, 2);
+      for (const sprite of loose) sprite.destroy();
+
+      const { root } = buildSpriteScene({ count: 500, textures: makeTextures(1) });
+      const container = measureSteadyFrame(harness, root, 2);
+      root.destroy();
+
+      expect(crossCall.drawCalls).toBe(container.drawCalls);
+      expect(crossCall.instances).toBe(container.instances);
+      expect(crossCall.transformRows).toBe(container.transformRows);
+    });
+  });
 });

From fc413efb4ecfcda1357ea5cc5f47ddfd01d28fae Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 15:42:22 +0200
Subject: [PATCH 06/12] fix(rendering): upload exact dirty transform-row range,
 not a high-water mark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Task 3's delta upload tracked only the highest uploaded row, so a slot reused
below that mark (a filter composite reusing a row a nested plan had rewound)
was never re-uploaded, leaving stale transform data — the filter-boundary
browser test rendered the wrong color. Track the exact written-slot range
[dirtyMin, dirtyMax] in TransformBuffer instead; the delta upload pushes
precisely the changed rows regardless of reuse. Restores filter-boundary
(browser 149/149), keeps effect-barrier under budget and cross-call batching.
---
 src/rendering/TransformBuffer.ts      | 40 +++++++++++++++++++++++++++
 src/rendering/webgl2/WebGl2Backend.ts | 15 +++-------
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/src/rendering/TransformBuffer.ts b/src/rendering/TransformBuffer.ts
index 4b7da7f7..f63b0181 100644
--- a/src/rendering/TransformBuffer.ts
+++ b/src/rendering/TransformBuffer.ts
@@ -38,6 +38,12 @@ export class TransformBuffer {
   private _skippedWriteCount = 0;
   private _uploadCount = 0;
   private _uploadedRecordCount = 0;
+  // Dirty row range [_dirtyMin, _dirtyMax] written since the last upload — the
+  // exact rows a delta upload must push. Empty when `_dirtyMax < _dirtyMin`.
+  // Tracked by slot (not a high-water mark) so a reused slot (nested-plan
+  // rewind, filter composite) is correctly re-uploaded.
+  private _dirtyMin = 0;
+  private _dirtyMax = -1;
 
   public get count(): number {
     return this._count;
@@ -110,6 +116,8 @@ export class TransformBuffer {
     this._skippedWriteCount = 0;
     this._uploadCount = 0;
     this._uploadedRecordCount = 0;
+    this._dirtyMin = 0;
+    this._dirtyMax = -1;
 
     return this;
   }
@@ -141,6 +149,28 @@ export class TransformBuffer {
     return this;
   }
 
+  /**
+   * Consume the dirty row range written since the last upload, clamped to
+   * `[0, maxCount)`, and clear it. Returns the contiguous `[firstRow, firstRow +
+   * rowCount)` a delta upload should push (`rowCount === 0` when nothing is
+   * dirty). The backend calls this at its upload boundary.
+   * @internal
+   */
+  public consumeDirtyRange(maxCount: number): { firstRow: number; rowCount: number } {
+    if (this._dirtyMax < this._dirtyMin) {
+      return { firstRow: 0, rowCount: 0 };
+    }
+
+    const firstRow = Math.max(0, this._dirtyMin);
+    const lastRow = Math.min(this._dirtyMax, maxCount - 1);
+    const rowCount = lastRow >= firstRow ? lastRow - firstRow + 1 : 0;
+
+    this._dirtyMin = 0;
+    this._dirtyMax = -1;
+
+    return { firstRow, rowCount };
+  }
+
   public write(slot: number, transform: Matrix, tint: Color): this {
     if (!Number.isInteger(slot) || slot < 0) {
       throw new Error(`TransformBuffer slot must be a non-negative integer (got ${slot}).`);
@@ -168,6 +198,16 @@ export class TransformBuffer {
       this._count = slot + 1;
     }
 
+    // Track the exact written-slot range so a delta upload pushes precisely the
+    // changed rows — including a slot reused below the high-water mark.
+    if (this._dirtyMax < this._dirtyMin) {
+      this._dirtyMin = slot;
+      this._dirtyMax = slot;
+    } else {
+      if (slot < this._dirtyMin) this._dirtyMin = slot;
+      if (slot > this._dirtyMax) this._dirtyMax = slot;
+    }
+
     this._frameHash = this._mix(this._frameHash, slot);
 
     for (let i = 0; i < floatsPerSlot; i++) {
diff --git a/src/rendering/webgl2/WebGl2Backend.ts b/src/rendering/webgl2/WebGl2Backend.ts
index 9fc23e1b..87c1a5c5 100644
--- a/src/rendering/webgl2/WebGl2Backend.ts
+++ b/src/rendering/webgl2/WebGl2Backend.ts
@@ -183,8 +183,6 @@ export class WebGl2Backend implements RenderBackend {
   private _drawPlanDepth = 0;
   private readonly _planBaseStack: number[] = [];
   private readonly _planHashStack: number[] = [];
-  /** Rows of the transform texture already uploaded this frame (delta-upload guard). */
-  private _uploadedRows = 0;
 
   public constructor(app: Application) {
     const canvasOptions = app.options.canvas ?? {};
@@ -286,7 +284,6 @@ export class WebGl2Backend implements RenderBackend {
     // The transform buffer is frame-scoped: reset it once per frame here (was
     // previously reset per render() call in _beginDrawPlan).
     this._transformBuffer.begin();
-    this._uploadedRows = 0;
 
     return this;
   }
@@ -428,10 +425,6 @@ export class WebGl2Backend implements RenderBackend {
     if (this._drawPlanDepth > 0) {
       this._flushActiveRenderer();
       this._transformBuffer.rewindTo(planBase, planHash);
-
-      if (planBase < this._uploadedRows) {
-        this._uploadedRows = planBase;
-      }
     }
 
     // Only assert balance at the outermost plan.
@@ -833,7 +826,6 @@ export class WebGl2Backend implements RenderBackend {
       });
       this._transformTextureHash = 0;
       this._transformTextureCount = -1;
-      this._uploadedRows = 0;
     }
 
     const snapshot = this._transformBuffer.commitSnapshot(requiredCount);
@@ -844,15 +836,16 @@ export class WebGl2Backend implements RenderBackend {
     }
 
     if (snapshot.changed || snapshot.count !== this._transformTextureCount || snapshot.hash !== this._transformTextureHash) {
-      const firstRow = Math.min(this._uploadedRows, snapshot.count);
-      const rowCount = snapshot.count - firstRow;
+      // Upload only the rows actually written since the last upload (delta), so
+      // barrier-heavy frames don't re-upload the whole growing buffer. A reused
+      // slot below the high-water mark is in the dirty range, so it re-uploads.
+      const { firstRow, rowCount } = this._transformBuffer.consumeDirtyRange(snapshot.count);
 
       if (rowCount > 0) {
         nextTransformTexture.commitRect(0, firstRow, 3, rowCount);
         this._transformBuffer.recordUpload(rowCount);
       }
 
-      this._uploadedRows = snapshot.count;
       this._transformTextureCount = snapshot.count;
       this._transformTextureHash = snapshot.hash;
     }

From 52d943906eb99ffef11427eebec9a5583d97aec4 Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 16:12:33 +0200
Subject: [PATCH 07/12] perf(rendering): WebGPU parity for frame-scoped
 cross-call batching

Mirror the WebGl2 backend's Tasks 2-4 lifecycle changes onto WebGPU:

- TransformBuffer is now frame-scoped (reset in resetStats, not per plan)
- Add transformBufferCount getter so RenderPlanBuilder offsets node indices
  correctly for WebGPU (previously fell back to 0 -> no cross-call batching)
- _beginDrawPlan: push base/hash stacks instead of resetting; reserve is
  based on frame-global count + plan nodes to avoid mid-frame reallocations
- _endDrawPlan: pop stacks; nested plans flush + rewindTo to free their rows
- setView: conditional flush (only on real view change) to stop breaking
  batches on every render() call that re-applies the same camera view
- WebGpuTransformStorage.getBuffer: delta upload via consumeDirtyRange
  instead of full-buffer writeBuffer on every flush boundary
---
 src/rendering/webgpu/WebGpuBackend.ts         | 48 ++++++++++++++++---
 .../webgpu/WebGpuTransformStorage.ts          | 22 +++++++--
 2 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/src/rendering/webgpu/WebGpuBackend.ts b/src/rendering/webgpu/WebGpuBackend.ts
index 5d91790e..2a3f187a 100644
--- a/src/rendering/webgpu/WebGpuBackend.ts
+++ b/src/rendering/webgpu/WebGpuBackend.ts
@@ -127,6 +127,8 @@ export class WebGpuBackend implements RenderBackend {
   private _activeDrawCommand: DrawCommand | null = null;
   private _passCoordinatorInstance: WebGpuPassCoordinator | null = null;
   private _drawPlanDepth = 0;
+  private readonly _planBaseStack: number[] = [];
+  private readonly _planHashStack: number[] = [];
 
   public constructor(app: Application) {
     const canvasOptions = app.options.canvas ?? {};
@@ -243,22 +245,37 @@ export class WebGpuBackend implements RenderBackend {
 
   public resetStats(): this {
     resetRenderStats(this._stats);
+    // The transform buffer is frame-scoped: reset it once per frame here (was
+    // previously reset per render() call in _beginDrawPlan).
+    this._getTransformStorage().buffer.begin();
 
     return this;
   }
 
+  /** Frame-global slot base the plan builder indexes from. @internal */
+  public get transformBufferCount(): number {
+    return this._getTransformStorage().buffer.count;
+  }
+
   /** @internal */
   public _beginDrawPlan(nodeCount: number): void {
     const storage = this._getTransformStorage();
 
-    storage.begin(nodeCount);
+    // Do NOT reset the transform buffer here — it is frame-scoped (reset in
+    // resetStats). The builder already based this plan's node indices at the
+    // current buffer count, so writes land in fresh frame-global slots and
+    // batches survive across render() calls. Remember this plan's base so a
+    // nested plan can free its rows on end.
+    this._planBaseStack.push(storage.buffer.count);
+    this._planHashStack.push(storage.buffer.frameHash);
 
     // Pre-allocate the GPU storage buffer for the full plan before any group
-    // flush runs. Without this, a later flush with a higher maxNodeIndex would
-    // destroy and replace the buffer mid-frame while earlier command buffers
-    // may still reference the old allocation.
-    if (nodeCount > 0 && this._device !== null && !this._deviceLost) {
-      storage.reserve(this._device, nodeCount, this._accountant);
+    // flush runs. Base the reservation on the frame-global count + this plan's
+    // nodes so the buffer grows to cover both pre-existing frame rows and new rows.
+    const reserveCount = storage.buffer.count + nodeCount;
+
+    if (reserveCount > 0 && this._device !== null && !this._deviceLost) {
+      storage.reserve(this._device, reserveCount, this._accountant);
     }
 
     this._activeDrawCommand = null;
@@ -311,10 +328,22 @@ export class WebGpuBackend implements RenderBackend {
   public _endDrawPlan(): void {
     this._activeDrawCommand = null;
 
+    const planBase = this._planBaseStack.pop() ?? 0;
+    const planHash = this._planHashStack.pop() ?? 0;
+
     if (this._drawPlanDepth > 0) {
       this._drawPlanDepth--;
     }
 
+    // A nested plan (filter / cacheAsBitmap) just ended: flush its draws, then
+    // free its transform rows so the frame-scoped buffer only grows with
+    // top-level render() calls. Top-level plans (depth back to 0) keep their rows
+    // so cross-call batching survives to the frame-end flush.
+    if (this._drawPlanDepth > 0) {
+      this._flushActiveRenderer();
+      this._getTransformStorage().buffer.rewindTo(planBase, planHash);
+    }
+
     // Only assert balance at the outermost plan: a nested render() (e.g.
     // cacheAsBitmap drawing its cache sprite) sees the still-open outer clips,
     // which are not leaks.
@@ -594,7 +623,12 @@ export class WebGpuBackend implements RenderBackend {
   }
 
   public setView(view: View | null): this {
-    this._flushActiveRenderer();
+    // Only flush the open batch when the view actually changes. The unconditional
+    // flush forced one draw call per render() call (each render() re-applies the
+    // same camera view), defeating cross-call batching.
+    if (this._renderTarget.view !== view) {
+      this._flushActiveRenderer();
+    }
     this._renderTarget.setView(view);
 
     return this;
diff --git a/src/rendering/webgpu/WebGpuTransformStorage.ts b/src/rendering/webgpu/WebGpuTransformStorage.ts
index 72a3c9d8..837e2704 100644
--- a/src/rendering/webgpu/WebGpuTransformStorage.ts
+++ b/src/rendering/webgpu/WebGpuTransformStorage.ts
@@ -99,11 +99,25 @@ export class WebGpuTransformStorage {
     }
 
     if (snapshot.changed || snapshot.hash !== this._storageHash || snapshot.count !== this._storageCount) {
-      const bytes = snapshot.count * slotFloatCount * Float32Array.BYTES_PER_ELEMENT;
+      // Upload only the rows actually written since the last upload (delta), so
+      // barrier-heavy frames don't re-upload the whole growing buffer. A reused
+      // slot below the high-water mark is in the dirty range, so it re-uploads.
+      const { firstRow, rowCount } = this._buffer.consumeDirtyRange(snapshot.count);
+
+      if (rowCount > 0) {
+        const slotBytes = slotFloatCount * Float32Array.BYTES_PER_ELEMENT;
+
+        device.queue.writeBuffer(
+          this._storageBuffer!,
+          firstRow * slotBytes,
+          this._buffer.data.buffer,
+          this._buffer.data.byteOffset + firstRow * slotBytes,
+          rowCount * slotBytes,
+        );
+        this._buffer.recordUpload(rowCount);
+        this._accountant?.recordBufferUpload(rowCount * slotBytes);
+      }
 
-      device.queue.writeBuffer(this._storageBuffer!, 0, this._buffer.data.buffer, this._buffer.data.byteOffset, bytes);
-      this._buffer.recordUpload(snapshot.count);
-      this._accountant?.recordBufferUpload(bytes);
       this._storageHash = snapshot.hash;
       this._storageCount = snapshot.count;
     }

From 4d69cee54ce39837c32eef75cf8386f1e110627f Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 16:29:56 +0200
Subject: [PATCH 08/12] fix(webgpu): full re-upload after storage-buffer grow;
 remove dead begin()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After _growBuffer creates a new, empty GPUBuffer, set _needsFullUpload = true.
In getBuffer, always call consumeDirtyRange first (this clears any stale
range), then branch: upload the full [0, count) range when _needsFullUpload is
set; otherwise upload only the dirty delta when rowCount > 0. This mirrors
WebGl2's full-upload-on-grow, so slots reallocated mid-frame are never read by
the shader as uninitialized transforms.

Also removes the dead begin(nodeCount) wrapper — callers use buffer.begin() directly.
---
 .../webgpu/WebGpuTransformStorage.ts          | 30 +++++++++++++------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/rendering/webgpu/WebGpuTransformStorage.ts b/src/rendering/webgpu/WebGpuTransformStorage.ts
index 837e2704..957061b9 100644
--- a/src/rendering/webgpu/WebGpuTransformStorage.ts
+++ b/src/rendering/webgpu/WebGpuTransformStorage.ts
@@ -14,6 +14,7 @@ export class WebGpuTransformStorage {
   private _storageCapacity = 0;
   private _storageHash = 0;
   private _storageCount = -1;
+  private _needsFullUpload = false;
   private _accountant: GpuResourceAccountant | null = null;
   /** GPU bytes currently booked for the storage buffer with the resource accountant. */
   private _accountedBytes = 0;
@@ -27,10 +28,6 @@ export class WebGpuTransformStorage {
     return this._buffer;
   }
 
-  public begin(nodeCount: number): void {
-    this._buffer.begin(nodeCount);
-  }
-
   public writeCommand(command: DrawCommand, transform?: Matrix): void {
     const drawable = command.drawable;
 
@@ -99,14 +96,28 @@ export class WebGpuTransformStorage {
     }
 
     if (snapshot.changed || snapshot.hash !== this._storageHash || snapshot.count !== this._storageCount) {
-      // Upload only the rows actually written since the last upload (delta), so
-      // barrier-heavy frames don't re-upload the whole growing buffer. A reused
-      // slot below the high-water mark is in the dirty range, so it re-uploads.
+      // Always consume the dirty range first to clear it, regardless of which upload
+      // path runs — a stale dirty range must never leak into the next flush.
       const { firstRow, rowCount } = this._buffer.consumeDirtyRange(snapshot.count);
 
-      if (rowCount > 0) {
-        const slotBytes = slotFloatCount * Float32Array.BYTES_PER_ELEMENT;
+      const slotBytes = slotFloatCount * Float32Array.BYTES_PER_ELEMENT;
 
+      if (this._needsFullUpload) {
+        // Post-grow: the new GPUBuffer is empty; upload the full [0, snapshot.count)
+        // range so rows already consumed by earlier flushes this frame are present.
+        device.queue.writeBuffer(
+          this._storageBuffer!,
+          0,
+          this._buffer.data.buffer,
+          this._buffer.data.byteOffset,
+          snapshot.count * slotBytes,
+        );
+        this._buffer.recordUpload(snapshot.count);
+        this._accountant?.recordBufferUpload(snapshot.count * slotBytes);
+        this._needsFullUpload = false;
+      } else if (rowCount > 0) {
+        // Normal delta path: upload only the rows written since the last upload.
+        // A reused slot below the high-water mark is in the dirty range, so it re-uploads.
         device.queue.writeBuffer(
           this._storageBuffer!,
           firstRow * slotBytes,
@@ -156,6 +167,7 @@ export class WebGpuTransformStorage {
     this._storageCapacity = nextCapacity;
     this._storageHash = 0;
     this._storageCount = -1;
+    this._needsFullUpload = true;
     // Re-book the storage footprint (free the prior buffer's bytes, allocate the new).
     this._accountedBytes = this._accountant?.reallocate(this._accountedBytes, nextCapacity) ?? this._accountedBytes;
   }

From ac08b290726b4eb9e82569b1e3d89d860a40faf3 Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 16:47:58 +0200
Subject: [PATCH 09/12] fix(webgpu): restore begin() wrapper + fix RT-display
 test for frame-scoped slots
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The prior commit removed WebGpuTransformStorage.begin() as dead code, but it has
~25 test call sites (30 tests broke). Restore it. Also the webgpu-backend
RenderTexture+Sprite test asserted the sprite transform in slot 0, but with
frame-scoped batching the graphics-into-RT is slot 0 and the sprite lands in
slot 1 — read slot 1 (transform verified: tx=24, ty=18 present after the
full-upload-on-grow). Full exojs project green (2510); no other regressions.

Process note: the exojs unit project (test/**) was not run during the earlier
tasks — only rendering-perf + browser-webgl were; running it now surfaced both
issues.
---
 src/rendering/webgpu/WebGpuTransformStorage.ts |  5 +++++
 test/rendering/webgpu-backend.test.ts          | 17 +++++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/rendering/webgpu/WebGpuTransformStorage.ts b/src/rendering/webgpu/WebGpuTransformStorage.ts
index 957061b9..f03a59fe 100644
--- a/src/rendering/webgpu/WebGpuTransformStorage.ts
+++ b/src/rendering/webgpu/WebGpuTransformStorage.ts
@@ -28,6 +28,11 @@ export class WebGpuTransformStorage {
     return this._buffer;
   }
 
+  /** Reset the underlying frame-scoped buffer. Used directly by tests. @internal */
+  public begin(nodeCount = 0): void {
+    this._buffer.begin(nodeCount);
+  }
+
   public writeCommand(command: DrawCommand, transform?: Matrix): void {
     const drawable = command.drawable;
 
diff --git a/test/rendering/webgpu-backend.test.ts b/test/rendering/webgpu-backend.test.ts
index d0e7c284..0e057969 100644
--- a/test/rendering/webgpu-backend.test.ts
+++ b/test/rendering/webgpu-backend.test.ts
@@ -1790,10 +1790,15 @@ describe('WebGpuBackend', () => {
       manager.flush();
       manager.destroy();
 
-      // The sprite's world transform now lives in the shared transform storage
-      // buffer (uploaded as the last writeBuffer of the sprite flush), not inline
-      // in the instance buffer. Slot 0 = (a, b, c, d, tx, ty, 0, 0, tint…); an
+      // The sprite's world transform lives in the shared transform storage buffer
+      // (the last writeBuffer of the sprite flush carries the whole buffer's
+      // ArrayBuffer), not inline in the instance buffer. The buffer is frame-scoped
+      // (cross-call batching): the graphics rendered into the RenderTexture is the
+      // first shared-buffer write (slot 0), so the sprite is the second and lands
+      // in slot 1. Each slot is 12 floats (a, b, c, d, tx, ty, 0, 0, tint…); an
       // unrotated sprite at (24, 18) has b == 0 and carries that translation.
+      const slotFloats = 12;
+      const spriteBase = 1 * slotFloats; // slot 1
       const transformWrite = environment.queue.writeBuffer.mock.calls[environment.queue.writeBuffer.mock.calls.length - 1];
       const data = new Float32Array(transformWrite[2] as ArrayBuffer);
 
@@ -1801,9 +1806,9 @@ describe('WebGpuBackend', () => {
       expect(environment.pass.drawIndexed).toHaveBeenCalled();
       expect(environment.queue.submit.mock.calls.length).toBeGreaterThanOrEqual(2);
       expect(environment.textures.length).toBeGreaterThan(0);
-      expect(data[1]).toBe(0);
-      expect(data[4]).toBe(24);
-      expect(data[5]).toBe(18);
+      expect(data[spriteBase + 1]).toBe(0);
+      expect(data[spriteBase + 4]).toBe(24);
+      expect(data[spriteBase + 5]).toBe(18);
     } finally {
       environment.restore();
     }

From afb0ec71598ed91ec2a0b76fb90222fb6c562af6 Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 16:54:53 +0200
Subject: [PATCH 10/12] chore(rendering): fix import order + prettier
 formatting

Autofix: sort the playRenderTree import in the perf harness; prettier-format
WebGpuTransformStorage after the begin() restore. verify:quick green.
---
 src/rendering/webgpu/WebGpuTransformStorage.ts | 8 +-------
 test/perf/rendering/harness.ts                 | 2 +-
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/rendering/webgpu/WebGpuTransformStorage.ts b/src/rendering/webgpu/WebGpuTransformStorage.ts
index f03a59fe..5726bfc2 100644
--- a/src/rendering/webgpu/WebGpuTransformStorage.ts
+++ b/src/rendering/webgpu/WebGpuTransformStorage.ts
@@ -110,13 +110,7 @@ export class WebGpuTransformStorage {
       if (this._needsFullUpload) {
         // Post-grow: the new GPUBuffer is empty; upload the full [0, snapshot.count)
         // range so rows already consumed by earlier flushes this frame are present.
-        device.queue.writeBuffer(
-          this._storageBuffer!,
-          0,
-          this._buffer.data.buffer,
-          this._buffer.data.byteOffset,
-          snapshot.count * slotBytes,
-        );
+        device.queue.writeBuffer(this._storageBuffer!, 0, this._buffer.data.buffer, this._buffer.data.byteOffset, snapshot.count * slotBytes);
         this._buffer.recordUpload(snapshot.count);
         this._accountant?.recordBufferUpload(snapshot.count * slotBytes);
         this._needsFullUpload = false;
diff --git a/test/perf/rendering/harness.ts b/test/perf/rendering/harness.ts
index 67da7aae..2af75114 100644
--- a/test/perf/rendering/harness.ts
+++ b/test/perf/rendering/harness.ts
@@ -7,9 +7,9 @@
  *
  * @internal Test/perf-only.
  */
+import { playRenderTree } from '#rendering/plan/playRenderTree';
 import type { RenderNode } from '#rendering/RenderNode';
 import type { View } from '#rendering/View';
-import { playRenderTree } from '#rendering/plan/playRenderTree';
 import { WebGl2Backend } from '#rendering/webgl2/WebGl2Backend';
 
 import { wireCoreRenderers } from '../../rendering/browser/_coreRenderers';

From c9b34ff380e24a01fdad15f524a3d02395afd8d8 Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 17:00:40 +0200
Subject: [PATCH 11/12] test(particles): seed frame-scoped batching stacks in
 WebGpuBackend mock

The particle GPU-injection test mocks the backend via Object.create(prototype),
bypassing the constructor that initializes _planBaseStack/_planHashStack (used by
_beginDrawPlan/_endDrawPlan since the cross-call batching work). Seed them like
the existing device mock. Full test suite green (3609).
---
 packages/exojs-particles/test/particle-gpu.test.ts | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/packages/exojs-particles/test/particle-gpu.test.ts b/packages/exojs-particles/test/particle-gpu.test.ts
index 333945d0..93296074 100644
--- a/packages/exojs-particles/test/particle-gpu.test.ts
+++ b/packages/exojs-particles/test/particle-gpu.test.ts
@@ -384,6 +384,10 @@ describe('ParticleSystem render-inject backend detection', () => {
     const env = makeMockDevice();
     const fakeBackend = Object.create(WebGpuBackend.prototype) as object;
     Object.defineProperty(fakeBackend, 'device', { value: env.device, configurable: true });
+    // Frame-scoped batching uses these instance stacks in _beginDrawPlan/_endDrawPlan;
+    // Object.create bypasses the constructor that initializes them, so seed them here.
+    Object.defineProperty(fakeBackend, '_planBaseStack', { value: [], configurable: true });
+    Object.defineProperty(fakeBackend, '_planHashStack', { value: [], configurable: true });
 
     const system = new ParticleSystem(makeTexture(), { capacity: 4 });
     system.addUpdateModule(new ApplyForce(0, 0));

From 9fe903a7bf680165f0919e9caa7dbb9aaea50a2f Mon Sep 17 00:00:00 2001
From: Exoridus <github@codexo.de>
Date: Sun, 28 Jun 2026 17:34:36 +0200
Subject: [PATCH 12/12] docs(rendering): update stale comments + add
 TransformBuffer dirty-range tests

- RenderingContext: setView flushes only on view change (not unconditionally);
  correctness rests on trailing flush() and renderer-switch flushes
- RenderInstruction: nodeIndex is frame-global [frameBase, frameBase+nodeCount),
  not plan-local [0, nodeCount)
- WebGpuTransformStorage: clarify consumeDirtyRange is inside the upload branch only;
  add upload-guard note explaining why a skipped flush is safe
- WebGl2Backend: same upload-guard safety note as WebGpu counterpart
- test: 6 new TransformBuffer dirty-range cases (consumeDirtyRange sentinel,
  coverage+self-clearing, below-HWM reuse, clamping, rewindTo, begin reset)
---
 src/rendering/RenderingContext.ts             | 17 ++--
 src/rendering/plan/RenderInstruction.ts       |  7 +-
 src/rendering/webgl2/WebGl2Backend.ts         |  4 +
 .../webgpu/WebGpuTransformStorage.ts          | 10 ++-
 test/rendering/transform-buffer.test.ts       | 87 +++++++++++++++++++
 5 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/src/rendering/RenderingContext.ts b/src/rendering/RenderingContext.ts
index 7e9b8a1b..2c66b99c 100644
--- a/src/rendering/RenderingContext.ts
+++ b/src/rendering/RenderingContext.ts
@@ -261,11 +261,10 @@ export class RenderingContext implements System {
     const view = options.view ?? this._camera;
     const mesh = (this._immediateMesh ??= new ImmediateMesh());
 
-    // Set the view first: this flushes whatever renderer a prior render() /
-    // drawGeometry left pending, so the shared transform buffer is free for this
-    // draw's synthetic slot and the pooled mesh is safe to reconfigure. The
-    // immediate flush below then keeps a later drawGeometry from observing this
-    // pooled mesh through a still-deferred draw.
+    // Set the view first. setView flushes the open batch only when the view
+    // actually changes, so correctness here rests on (a) the trailing flush()
+    // below — so a later drawGeometry cannot observe this pooled mesh through a
+    // still-deferred draw — and (b) any renderer switch flushing its pending batch.
     this._backend.setView(view);
     mesh.configure(geometry, transform, material, options.tint ?? null);
     this._backend.draw(mesh);
@@ -302,9 +301,11 @@ export class RenderingContext implements System {
     const view = options.view ?? this._camera;
     const mesh = (this._batchMesh ??= new ImmediateMesh());
 
-    // Set the view first (flushing any renderer left pending), configure the
-    // pooled geometry/look source, then submit a single instanced draw over the
-    // batch's per-instance transforms/tints and flush it immediately.
+    // Set the view first (setView only flushes when the view actually changes;
+    // correctness rests on the trailing flush() below and on any renderer switch
+    // flushing its pending batch), configure the pooled geometry/look source,
+    // then submit a single instanced draw over the batch's per-instance
+    // transforms/tints and flush it immediately.
     this._backend.setView(view);
     mesh.configureBatchSource(batch.geometry, batch.material);
     this._backend.drawInstanced(mesh, batch._instanceTransforms, batch._instanceTints, batch.count);
diff --git a/src/rendering/plan/RenderInstruction.ts b/src/rendering/plan/RenderInstruction.ts
index 27132776..91161945 100644
--- a/src/rendering/plan/RenderInstruction.ts
+++ b/src/rendering/plan/RenderInstruction.ts
@@ -9,8 +9,11 @@ import type { GroupScope } from './RenderScope';
  * names the concept the plan player consumes and that the batching layer
  * reorders, independent of how the draw happens to be stored in the scope
  * tree. Future {@link TransformBuffer} slotting keys on each instruction's
- * stable {@link DrawCommand.nodeIndex} (within the `[0, plan.nodeCount)`
- * slot space).
+ * stable {@link DrawCommand.nodeIndex}. Each index is frame-global —
+ * `[frameBase, frameBase + plan.nodeCount)` — because the transform buffer
+ * is frame-scoped and the builder bases node indices at the current buffer
+ * slot count (`frameBase`) so every plan in the frame occupies distinct
+ * slots and can batch cross-call.
  *
  * Batch units (maximal runs of consecutive instructions in a {@link GroupScope}
  * sharing GPU pipeline/bind state) are not materialized: the plan player walks
diff --git a/src/rendering/webgl2/WebGl2Backend.ts b/src/rendering/webgl2/WebGl2Backend.ts
index 87c1a5c5..c9e660ba 100644
--- a/src/rendering/webgl2/WebGl2Backend.ts
+++ b/src/rendering/webgl2/WebGl2Backend.ts
@@ -835,6 +835,10 @@ export class WebGl2Backend implements RenderBackend {
       throw new Error('Transform texture must be initialized before binding.');
     }
 
+    // A skipped flush (all three guards false) leaves the dirty range uncleared
+    // until the next begin(). Safe: every write() mixes its slot into _frameHash,
+    // so a non-empty dirty range always coincides with snapshot.changed = true —
+    // the upload branch is always taken before any dirty rows could be stale.
     if (snapshot.changed || snapshot.count !== this._transformTextureCount || snapshot.hash !== this._transformTextureHash) {
       // Upload only the rows actually written since the last upload (delta), so
       // barrier-heavy frames don't re-upload the whole growing buffer. A reused
diff --git a/src/rendering/webgpu/WebGpuTransformStorage.ts b/src/rendering/webgpu/WebGpuTransformStorage.ts
index 5726bfc2..529e3e0c 100644
--- a/src/rendering/webgpu/WebGpuTransformStorage.ts
+++ b/src/rendering/webgpu/WebGpuTransformStorage.ts
@@ -100,9 +100,15 @@ export class WebGpuTransformStorage {
       this._growBuffer(device, requiredBytes);
     }
 
+    // A skipped flush (all three guards false) leaves the dirty range uncleared
+    // until the next begin(). Safe: every write() mixes its slot into _frameHash,
+    // so a non-empty dirty range always coincides with snapshot.changed = true —
+    // the upload branch is always taken before any dirty rows could be stale.
     if (snapshot.changed || snapshot.hash !== this._storageHash || snapshot.count !== this._storageCount) {
-      // Always consume the dirty range first to clear it, regardless of which upload
-      // path runs — a stale dirty range must never leak into the next flush.
+      // Always consume the dirty range first to clear it — regardless of whether
+      // the full-upload path (post-grow) or the delta path runs below. Both paths
+      // are inside this if-branch; the skip case (snapshot unchanged) never reaches
+      // here, so the dirty range is only consumed once the upload guard has passed.
       const { firstRow, rowCount } = this._buffer.consumeDirtyRange(snapshot.count);
 
       const slotBytes = slotFloatCount * Float32Array.BYTES_PER_ELEMENT;
diff --git a/test/rendering/transform-buffer.test.ts b/test/rendering/transform-buffer.test.ts
index 5d5d5d30..ed74a847 100644
--- a/test/rendering/transform-buffer.test.ts
+++ b/test/rendering/transform-buffer.test.ts
@@ -168,4 +168,91 @@ describe('TransformBuffer', () => {
 
     parent.destroy();
   });
+
+  test('consumeDirtyRange returns empty sentinel on a fresh buffer after begin()', () => {
+    const buffer = new TransformBuffer();
+
+    buffer.begin();
+    const result = buffer.consumeDirtyRange(10);
+
+    expect(result.rowCount).toBe(0);
+    expect(result.firstRow).toBe(0);
+  });
+
+  test('consumeDirtyRange covers all written slots and clears itself on second call', () => {
+    const buffer = new TransformBuffer();
+    const identity = new Matrix();
+
+    buffer.begin();
+    buffer.write(0, identity, Color.white);
+    buffer.write(1, identity, Color.white);
+    buffer.write(2, identity, Color.white);
+
+    const first = buffer.consumeDirtyRange(3);
+
+    expect(first).toEqual({ firstRow: 0, rowCount: 3 });
+
+    const second = buffer.consumeDirtyRange(3);
+
+    expect(second.rowCount).toBe(0);
+  });
+
+  test('consumeDirtyRange tracks reuse below the high-water mark', () => {
+    const buffer = new TransformBuffer();
+    const identity = new Matrix();
+
+    buffer.begin();
+    buffer.write(0, identity, Color.white);
+    buffer.write(1, identity, Color.white);
+    buffer.write(2, identity, Color.white);
+    buffer.consumeDirtyRange(3); // clear after first writes
+
+    buffer.write(1, identity, Color.white); // reuse slot 1 below high-water mark
+
+    const result = buffer.consumeDirtyRange(3);
+
+    expect(result).toEqual({ firstRow: 1, rowCount: 1 });
+  });
+
+  test('consumeDirtyRange clamps to maxCount — a write above the limit is excluded', () => {
+    const buffer = new TransformBuffer();
+    const identity = new Matrix();
+
+    buffer.begin();
+    buffer.write(5, identity, Color.white); // slot 5 is above maxCount = 3
+
+    const result = buffer.consumeDirtyRange(3);
+
+    expect(result.rowCount).toBe(0);
+  });
+
+  test('rewindTo restores the write cursor and optionally the frame hash', () => {
+    const buffer = new TransformBuffer();
+    const identity = new Matrix();
+
+    buffer.begin();
+    buffer.write(0, identity, Color.white);
+    const savedHash = buffer.frameHash;
+
+    buffer.write(1, identity, Color.white);
+    buffer.rewindTo(1, savedHash);
+
+    expect(buffer.count).toBe(1);
+    expect(buffer.frameHash).toBe(savedHash);
+  });
+
+  test('begin() resets the dirty range so consumeDirtyRange returns empty', () => {
+    const buffer = new TransformBuffer();
+    const identity = new Matrix();
+
+    buffer.begin();
+    buffer.write(0, identity, Color.white);
+    buffer.write(1, identity, Color.white);
+
+    buffer.begin(); // should reset dirty range
+
+    const result = buffer.consumeDirtyRange(10);
+
+    expect(result.rowCount).toBe(0);
+  });
 });