diff --git a/examples/debug-layer/performance-overlay.js b/examples/debug-layer/performance-overlay.js
index 275375b1..7534cc11 100644
--- a/examples/debug-layer/performance-overlay.js
+++ b/examples/debug-layer/performance-overlay.js
@@ -1,5 +1,5 @@
 // Auto-generated from performance-overlay.ts — edit the .ts source, not this file.
-import { Application, Color, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs';
+import { Application, Color, Container, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs';
 import { DebugOverlay } from '@codexo/exojs/debug';
 const app = new Application({
     canvas: {
@@ -17,14 +17,21 @@ const debug = new DebugOverlay(app);
 debug.layers.performance.visible = true;
 class PerformanceOverlayScene extends Scene {
     sprites;
+    layer;
     async load(loader) {
         await loader.load(Texture, { bunny: 'image/ship-a.png' });
     }
     init(loader) {
         const { width, height } = this.app.canvas;
+        // All sprites share one texture, so adding them to a single container and
+        // rendering it once lets the renderer batch them into a single draw call.
+        // Per-sprite `context.render(sprite)` calls can also batch (cross-call
+        // batching), but each call still builds its own render plan every frame.
+        this.layer = new Container();
         this.sprites = Array.from({ length: 1600 }, () => {
             const sprite = new Sprite(loader.get(Texture, 'bunny')).setAnchor(0.5).setScale(0.25);
             sprite.setPosition(Math.random() * width, Math.random() * height);
+            this.layer.addChild(sprite);
             return {
                 sprite,
                 vx: (Math.random() - 0.5) * 120,
@@ -47,8 +54,7 @@ class PerformanceOverlayScene extends Scene {
     }
     draw(context) {
         context.backend.clear();
-        for (const { sprite } of this.sprites)
-            context.render(sprite);
+        context.render(this.layer);
     }
 }
 app.start(new PerformanceOverlayScene());
diff --git a/examples/debug-layer/performance-overlay.ts b/examples/debug-layer/performance-overlay.ts
index 7ff4b2b8..b5b1d415 100644
--- a/examples/debug-layer/performance-overlay.ts
+++ b/examples/debug-layer/performance-overlay.ts
@@ -1,4 +1,4 @@
-import { Application, Color, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs';
+import { Application, Color, Container, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs';
 import { DebugOverlay } from '@codexo/exojs/debug';
 
 const app = new Application({
@@ -19,6 +19,7 @@ debug.layers.performance.visible = true;
 
 class PerformanceOverlayScene extends Scene {
     private sprites!: { sprite: Sprite; vx: number; vy: number }[];
+    private layer!: Container;
 
     override async load(loader): Promise<void> {
         await loader.load(Texture, { bunny: 'image/ship-a.png' });
@@ -27,9 +28,15 @@ class PerformanceOverlayScene extends Scene {
     override init(loader): void {
         const { width, height } = this.app.canvas;
 
+        // All sprites share one texture, so adding them to a single container and
+        // rendering it once lets the renderer batch them into a single draw call.
+        // Per-sprite `context.render(sprite)` calls can also batch (cross-call
+        // batching), but each call still builds its own render plan every frame.
+        this.layer = new Container();
         this.sprites = Array.from({ length: 1600 }, () => {
             const sprite = new Sprite(loader.get(Texture, 'bunny')).setAnchor(0.5).setScale(0.25);
             sprite.setPosition(Math.random() * width, Math.random() * height);
+            this.layer.addChild(sprite);
             return {
                 sprite,
                 vx: (Math.random() - 0.5) * 120,
@@ -53,7 +60,7 @@ class PerformanceOverlayScene extends Scene {
 
     override draw(context): void {
         context.backend.clear();
-        for (const { sprite } of this.sprites) context.render(sprite);
+        context.render(this.layer);
     }
 }
 
diff --git a/packages/exojs-particles/test/particle-gpu.test.ts b/packages/exojs-particles/test/particle-gpu.test.ts
index 333945d0..93296074 100644
--- a/packages/exojs-particles/test/particle-gpu.test.ts
+++ b/packages/exojs-particles/test/particle-gpu.test.ts
@@ -384,6 +384,10 @@ describe('ParticleSystem render-inject backend detection', () => {
     const env = makeMockDevice();
     const fakeBackend = Object.create(WebGpuBackend.prototype) as object;
     Object.defineProperty(fakeBackend, 'device', { value: env.device, configurable: true });
+    // Frame-scoped batching uses these instance stacks in _beginDrawPlan/_endDrawPlan;
+    // Object.create bypasses the constructor that initializes them, so seed them here.
+    Object.defineProperty(fakeBackend, '_planBaseStack', { value: [], configurable: true });
+    Object.defineProperty(fakeBackend, '_planHashStack', { value: [], configurable: true });
 
     const system = new ParticleSystem(makeTexture(), { capacity: 4 });
     system.addUpdateModule(new ApplyForce(0, 0));
diff --git a/src/rendering/RenderingContext.ts b/src/rendering/RenderingContext.ts
index 7e9b8a1b..2c66b99c 100644
--- a/src/rendering/RenderingContext.ts
+++ b/src/rendering/RenderingContext.ts
@@ -261,11 +261,10 @@ export class RenderingContext implements System {
     const view = options.view ?? this._camera;
     const mesh = (this._immediateMesh ??= new ImmediateMesh());
 
-    // Set the view first: this flushes whatever renderer a prior render() /
-    // drawGeometry left pending, so the shared transform buffer is free for this
-    // draw's synthetic slot and the pooled mesh is safe to reconfigure. The
-    // immediate flush below then keeps a later drawGeometry from observing this
-    // pooled mesh through a still-deferred draw.
+    // Set the view first: setView now only flushes when the view actually changes
+    // (not unconditionally). Correctness here rests on (a) the trailing flush()
+    // below — so a later drawGeometry cannot observe this pooled mesh through a
+    // still-deferred draw — and (b) any renderer switch flushing its pending batch.
     this._backend.setView(view);
     mesh.configure(geometry, transform, material, options.tint ?? null);
     this._backend.draw(mesh);
@@ -302,9 +301,11 @@ export class RenderingContext implements System {
     const view = options.view ?? this._camera;
     const mesh = (this._batchMesh ??= new ImmediateMesh());
 
-    // Set the view first (flushing any renderer left pending), configure the
-    // pooled geometry/look source, then submit a single instanced draw over the
-    // batch's per-instance transforms/tints and flush it immediately.
+    // Set the view first (setView only flushes when the view actually changes;
+    // correctness rests on the trailing flush() below and on any renderer switch
+    // flushing its pending batch), configure the pooled geometry/look source,
+    // then submit a single instanced draw over the batch's per-instance
+    // transforms/tints and flush it immediately.
     this._backend.setView(view);
     mesh.configureBatchSource(batch.geometry, batch.material);
     this._backend.drawInstanced(mesh, batch._instanceTransforms, batch._instanceTints, batch.count);
diff --git a/src/rendering/TransformBuffer.ts b/src/rendering/TransformBuffer.ts
index 8482f134..f63b0181 100644
--- a/src/rendering/TransformBuffer.ts
+++ b/src/rendering/TransformBuffer.ts
@@ -38,6 +38,12 @@ export class TransformBuffer {
   private _skippedWriteCount = 0;
   private _uploadCount = 0;
   private _uploadedRecordCount = 0;
+  // Dirty row range [_dirtyMin, _dirtyMax] written since the last upload — the
+  // exact rows a delta upload must push. Empty when `_dirtyMax < _dirtyMin`.
+  // Tracked by slot (not a high-water mark) so a reused slot (nested-plan
+  // rewind, filter composite) is correctly re-uploaded.
+  private _dirtyMin = 0;
+  private _dirtyMax = -1;
 
   public get count(): number {
     return this._count;
@@ -94,6 +100,11 @@ export class TransformBuffer {
     return this._version;
   }
 
+  /** Running content hash of the rows written since begin(). @internal */
+  public get frameHash(): number {
+    return this._frameHash;
+  }
+
   public begin(expectedCount = 0): this {
     if (expectedCount > 0) {
       this._ensureCapacity(expectedCount);
@@ -105,6 +116,8 @@ export class TransformBuffer {
     this._skippedWriteCount = 0;
     this._uploadCount = 0;
     this._uploadedRecordCount = 0;
+    this._dirtyMin = 0;
+    this._dirtyMax = -1;
 
     return this;
   }
@@ -117,6 +130,47 @@ export class TransformBuffer {
     return slot;
   }
 
+  /**
+   * Rewind the write cursor to `count`, freeing the rows above it for reuse, and
+   * (optionally) reset the running content hash to `frameHash` (the value saved
+   * before they were written) so their writes don't cause spurious re-uploads.
+   * Used by nested draw plans (filters / cacheAsBitmap) to isolate their slots.
+   * @internal
+   */
+  public rewindTo(count: number, frameHash?: number): this {
+    if (count >= 0 && count < this._count) {
+      this._count = count;
+
+      if (frameHash !== undefined) {
+        this._frameHash = frameHash >>> 0;
+      }
+    }
+
+    return this;
+  }
+
+  /**
+   * Consume the dirty row range written since the last upload, clamped to
+   * `[0, maxCount)`, and clear it. Returns the contiguous `[firstRow, firstRow +
+   * rowCount)` a delta upload should push (`rowCount === 0` when nothing is
+   * dirty). The backend calls this at its upload boundary.
+   * @internal
+   */
+  public consumeDirtyRange(maxCount: number): { firstRow: number; rowCount: number } {
+    if (this._dirtyMax < this._dirtyMin) {
+      return { firstRow: 0, rowCount: 0 };
+    }
+
+    const firstRow = Math.max(0, this._dirtyMin);
+    const lastRow = Math.min(this._dirtyMax, maxCount - 1);
+    const rowCount = lastRow >= firstRow ? lastRow - firstRow + 1 : 0;
+
+    this._dirtyMin = 0;
+    this._dirtyMax = -1;
+
+    return { firstRow, rowCount };
+  }
+
   public write(slot: number, transform: Matrix, tint: Color): this {
     if (!Number.isInteger(slot) || slot < 0) {
       throw new Error(`TransformBuffer slot must be a non-negative integer (got ${slot}).`);
@@ -144,6 +198,16 @@ export class TransformBuffer {
       this._count = slot + 1;
     }
 
+    // Track the exact written-slot range so a delta upload pushes precisely the
+    // changed rows — including a slot reused below the high-water mark.
+    if (this._dirtyMax < this._dirtyMin) {
+      this._dirtyMin = slot;
+      this._dirtyMax = slot;
+    } else {
+      if (slot < this._dirtyMin) this._dirtyMin = slot;
+      if (slot > this._dirtyMax) this._dirtyMax = slot;
+    }
+
     this._frameHash = this._mix(this._frameHash, slot);
 
     for (let i = 0; i < floatsPerSlot; i++) {
diff --git a/src/rendering/plan/RenderInstruction.ts b/src/rendering/plan/RenderInstruction.ts
index 27132776..91161945 100644
--- a/src/rendering/plan/RenderInstruction.ts
+++ b/src/rendering/plan/RenderInstruction.ts
@@ -9,8 +9,11 @@ import type { GroupScope } from './RenderScope';
  * names the concept the plan player consumes and that the batching layer
  * reorders, independent of how the draw happens to be stored in the scope
  * tree. Future {@link TransformBuffer} slotting keys on each instruction's
- * stable {@link DrawCommand.nodeIndex} (within the `[0, plan.nodeCount)`
- * slot space).
+ * stable {@link DrawCommand.nodeIndex}. Each index is frame-global —
+ * `[frameBase, frameBase + plan.nodeCount)` — because the transform buffer
+ * is frame-scoped and the builder bases node indices at the current buffer
+ * slot count (`frameBase`) so every top-level plan in the frame occupies
+ * distinct slots and can batch cross-call (nested plans' slots are reused).
  *
  * Batch units (maximal runs of consecutive instructions in a {@link GroupScope}
  * sharing GPU pipeline/bind state) are not materialized: the plan player walks
diff --git a/src/rendering/plan/RenderPlanBuilder.ts b/src/rendering/plan/RenderPlanBuilder.ts
index db9cbe41..58053f25 100644
--- a/src/rendering/plan/RenderPlanBuilder.ts
+++ b/src/rendering/plan/RenderPlanBuilder.ts
@@ -93,7 +93,11 @@ export class RenderPlanBuilder {
     this._barrierEntryPoolCursor = 0;
     this._scopeStack.length = 0;
     this._hasPending = false;
-    this._nodeIndex = 0;
+    // Base this plan's node indices after whatever earlier render() calls already
+    // wrote into the frame-scoped transform buffer, so every draw across all
+    // render() calls in the frame references a distinct slot and can batch.
+    const frameBase = (backend as { transformBufferCount?: number }).transformBufferCount ?? 0;
+    this._nodeIndex = frameBase;
 
     const rootScope = this._acquireGroupScope(false);
 
@@ -110,7 +114,7 @@ export class RenderPlanBuilder {
       });
     }
 
-    this._plan.nodeCount = this._nodeIndex;
+    this._plan.nodeCount = this._nodeIndex - frameBase;
 
     return this._plan;
   }
diff --git a/src/rendering/webgl2/WebGl2Backend.ts b/src/rendering/webgl2/WebGl2Backend.ts
index e58d6a3a..c9e660ba 100644
--- a/src/rendering/webgl2/WebGl2Backend.ts
+++ b/src/rendering/webgl2/WebGl2Backend.ts
@@ -181,6 +181,8 @@ export class WebGl2Backend implements RenderBackend {
   private _transformTextureCount = -1;
   private _activeDrawCommand: DrawCommand | null = null;
   private _drawPlanDepth = 0;
+  private readonly _planBaseStack: number[] = [];
+  private readonly _planHashStack: number[] = [];
 
   public constructor(app: Application) {
     const canvasOptions = app.options.canvas ?? {};
@@ -279,13 +281,27 @@ export class WebGl2Backend implements RenderBackend {
 
   public resetStats(): this {
     resetRenderStats(this._stats);
+    // The transform buffer is frame-scoped: reset it once per frame here (was
+    // previously reset per render() call in _beginDrawPlan).
+    this._transformBuffer.begin();
 
     return this;
   }
 
+  /** Frame-global slot base the plan builder indexes from. @internal */
+  public get transformBufferCount(): number {
+    return this._transformBuffer.count;
+  }
+
   /** @internal */
-  public _beginDrawPlan(nodeCount: number): void {
-    this._transformBuffer.begin(nodeCount);
+  public _beginDrawPlan(_nodeCount: number): void {
+    // Do NOT reset the transform buffer here — it is frame-scoped (reset in
+    // resetStats). The builder already based this plan's node indices at the
+    // current buffer count, so writes land in fresh frame-global slots and
+    // batches survive across render() calls. Remember this plan's base so a
+    // nested plan can free its rows on end.
+    this._planBaseStack.push(this._transformBuffer.count);
+    this._planHashStack.push(this._transformBuffer.frameHash);
     this._activeDrawCommand = null;
     this._drawPlanDepth++;
   }
@@ -395,13 +411,23 @@ export class WebGl2Backend implements RenderBackend {
   public _endDrawPlan(): void {
     this._activeDrawCommand = null;
 
+    const planBase = this._planBaseStack.pop() ?? 0;
+    const planHash = this._planHashStack.pop() ?? 0;
+
     if (this._drawPlanDepth > 0) {
       this._drawPlanDepth--;
     }
 
-    // Only assert balance at the outermost plan: cacheAsBitmap draws a cache
-    // sprite via a nested render(), whose inner _endDrawPlan sees the still-open
-    // outer clips — those are not leaks.
+    // A nested plan (filter / cacheAsBitmap) just ended: flush its draws, then
+    // free its transform rows so the frame-scoped buffer only grows with
+    // top-level render() calls. Top-level plans (depth back to 0) keep their rows
+    // so cross-call batching survives to the frame-end flush.
+    if (this._drawPlanDepth > 0) {
+      this._flushActiveRenderer();
+      this._transformBuffer.rewindTo(planBase, planHash);
+    }
+
+    // Only assert balance at the outermost plan; a nested render() sees still-open outer clips.
     if (this._drawPlanDepth === 0) {
       this._assertBalancedStencil();
     }
@@ -715,7 +741,12 @@ export class WebGl2Backend implements RenderBackend {
   }
 
   public setView(view: View | null): this {
-    this._flushActiveRenderer();
+    // Only flush the open batch when the view actually changes. The unconditional
+    // flush forced one draw call per render() call (each render() re-applies the
+    // same camera view), defeating cross-call batching.
+    if (this._renderTarget.view !== view) {
+      this._flushActiveRenderer();
+    }
     this._renderTarget.setView(view);
     this._bindRenderTarget(this._renderTarget);
 
@@ -804,11 +835,23 @@ export class WebGl2Backend implements RenderBackend {
       throw new Error('Transform texture must be initialized before binding.');
     }
 
+    // A skipped flush (all three guards false) leaves the dirty range uncleared
+    // until the next begin(). Safe: every write() mixes its slot into _frameHash,
+    // so a non-empty dirty range always coincides with snapshot.changed = true —
+    // the upload branch is always taken before any dirty rows could be stale.
     if (snapshot.changed || snapshot.count !== this._transformTextureCount || snapshot.hash !== this._transformTextureHash) {
-      nextTransformTexture.commitRect(0, 0, 3, snapshot.count);
-      this._transformBuffer.recordUpload(snapshot.count);
-      this._transformTextureHash = snapshot.hash;
+      // Upload only the rows actually written since the last upload (delta), so
+      // barrier-heavy frames don't re-upload the whole growing buffer. A reused
+      // slot below the high-water mark is in the dirty range, so it re-uploads.
+      const { firstRow, rowCount } = this._transformBuffer.consumeDirtyRange(snapshot.count);
+
+      if (rowCount > 0) {
+        nextTransformTexture.commitRect(0, firstRow, 3, rowCount);
+        this._transformBuffer.recordUpload(rowCount);
+      }
+
       this._transformTextureCount = snapshot.count;
+      this._transformTextureHash = snapshot.hash;
     }
 
     return this.bindTexture(nextTransformTexture, unit);
diff --git a/src/rendering/webgpu/WebGpuBackend.ts b/src/rendering/webgpu/WebGpuBackend.ts
index 5d91790e..2a3f187a 100644
--- a/src/rendering/webgpu/WebGpuBackend.ts
+++ b/src/rendering/webgpu/WebGpuBackend.ts
@@ -127,6 +127,8 @@ export class WebGpuBackend implements RenderBackend {
   private _activeDrawCommand: DrawCommand | null = null;
   private _passCoordinatorInstance: WebGpuPassCoordinator | null = null;
   private _drawPlanDepth = 0;
+  private readonly _planBaseStack: number[] = [];
+  private readonly _planHashStack: number[] = [];
 
   public constructor(app: Application) {
     const canvasOptions = app.options.canvas ?? {};
@@ -243,22 +245,37 @@ export class WebGpuBackend implements RenderBackend {
 
   public resetStats(): this {
     resetRenderStats(this._stats);
+    // The transform buffer is frame-scoped: reset it once per frame here (was
+    // previously reset per render() call in _beginDrawPlan).
+    this._getTransformStorage().buffer.begin();
 
     return this;
   }
 
+  /** Frame-global slot base the plan builder indexes from. @internal */
+  public get transformBufferCount(): number {
+    return this._getTransformStorage().buffer.count;
+  }
+
   /** @internal */
   public _beginDrawPlan(nodeCount: number): void {
     const storage = this._getTransformStorage();
 
-    storage.begin(nodeCount);
+    // Do NOT reset the transform buffer here — it is frame-scoped (reset in
+    // resetStats). The builder already based this plan's node indices at the
+    // current buffer count, so writes land in fresh frame-global slots and
+    // batches survive across render() calls. Remember this plan's base so a
+    // nested plan can free its rows on end.
+    this._planBaseStack.push(storage.buffer.count);
+    this._planHashStack.push(storage.buffer.frameHash);
 
     // Pre-allocate the GPU storage buffer for the full plan before any group
-    // flush runs. Without this, a later flush with a higher maxNodeIndex would
-    // destroy and replace the buffer mid-frame while earlier command buffers
-    // may still reference the old allocation.
-    if (nodeCount > 0 && this._device !== null && !this._deviceLost) {
-      storage.reserve(this._device, nodeCount, this._accountant);
+    // flush runs. Base the reservation on the frame-global count + this plan's
+    // nodes so the buffer grows to cover both pre-existing frame rows and new rows.
+    const reserveCount = storage.buffer.count + nodeCount;
+
+    if (reserveCount > 0 && this._device !== null && !this._deviceLost) {
+      storage.reserve(this._device, reserveCount, this._accountant);
     }
 
     this._activeDrawCommand = null;
@@ -311,10 +328,22 @@ export class WebGpuBackend implements RenderBackend {
   public _endDrawPlan(): void {
     this._activeDrawCommand = null;
 
+    const planBase = this._planBaseStack.pop() ?? 0;
+    const planHash = this._planHashStack.pop() ?? 0;
+
     if (this._drawPlanDepth > 0) {
       this._drawPlanDepth--;
     }
 
+    // A nested plan (filter / cacheAsBitmap) just ended: flush its draws, then
+    // free its transform rows so the frame-scoped buffer only grows with
+    // top-level render() calls. Top-level plans (depth back to 0) keep their rows
+    // so cross-call batching survives to the frame-end flush.
+    if (this._drawPlanDepth > 0) {
+      this._flushActiveRenderer();
+      this._getTransformStorage().buffer.rewindTo(planBase, planHash);
+    }
+
     // Only assert balance at the outermost plan: a nested render() (e.g.
     // cacheAsBitmap drawing its cache sprite) sees the still-open outer clips,
     // which are not leaks.
@@ -594,7 +623,12 @@ export class WebGpuBackend implements RenderBackend {
   }
 
   public setView(view: View | null): this {
-    this._flushActiveRenderer();
+    // Only flush the open batch when the view actually changes. The unconditional
+    // flush forced one draw call per render() call (each render() re-applies the
+    // same camera view), defeating cross-call batching.
+    if (this._renderTarget.view !== view) {
+      this._flushActiveRenderer();
+    }
     this._renderTarget.setView(view);
 
     return this;
diff --git a/src/rendering/webgpu/WebGpuTransformStorage.ts b/src/rendering/webgpu/WebGpuTransformStorage.ts
index 72a3c9d8..529e3e0c 100644
--- a/src/rendering/webgpu/WebGpuTransformStorage.ts
+++ b/src/rendering/webgpu/WebGpuTransformStorage.ts
@@ -14,6 +14,7 @@ export class WebGpuTransformStorage {
   private _storageCapacity = 0;
   private _storageHash = 0;
   private _storageCount = -1;
+  private _needsFullUpload = false;
   private _accountant: GpuResourceAccountant | null = null;
   /** GPU bytes currently booked for the storage buffer with the resource accountant. */
   private _accountedBytes = 0;
@@ -27,7 +28,8 @@ export class WebGpuTransformStorage {
     return this._buffer;
   }
 
-  public begin(nodeCount: number): void {
+  /** Reset the underlying frame-scoped buffer. Used directly by tests. @internal */
+  public begin(nodeCount = 0): void {
     this._buffer.begin(nodeCount);
   }
 
@@ -98,12 +100,40 @@ export class WebGpuTransformStorage {
       this._growBuffer(device, requiredBytes);
     }
 
+    // A skipped flush (all three guards false) leaves the dirty range uncleared
+    // until the next begin(). Safe: every write() mixes its slot into _frameHash,
+    // so a non-empty dirty range always coincides with snapshot.changed = true —
+    // the upload branch is always taken before any dirty rows could be stale.
     if (snapshot.changed || snapshot.hash !== this._storageHash || snapshot.count !== this._storageCount) {
-      const bytes = snapshot.count * slotFloatCount * Float32Array.BYTES_PER_ELEMENT;
+      // Always consume the dirty range first to clear it — regardless of whether
+      // the full-upload path (post-grow) or the delta path runs below. Both paths
+      // are inside this if-branch; the skip case (snapshot unchanged) never reaches
+      // here, so the dirty range is only ever consumed inside the upload branch.
+      const { firstRow, rowCount } = this._buffer.consumeDirtyRange(snapshot.count);
+
+      const slotBytes = slotFloatCount * Float32Array.BYTES_PER_ELEMENT;
+
+      if (this._needsFullUpload) {
+        // Post-grow: the new GPUBuffer is empty; upload the full [0, snapshot.count)
+        // range so rows already consumed by earlier flushes this frame are present.
+        device.queue.writeBuffer(this._storageBuffer!, 0, this._buffer.data.buffer, this._buffer.data.byteOffset, snapshot.count * slotBytes);
+        this._buffer.recordUpload(snapshot.count);
+        this._accountant?.recordBufferUpload(snapshot.count * slotBytes);
+        this._needsFullUpload = false;
+      } else if (rowCount > 0) {
+        // Normal delta path: upload only the rows written since the last upload.
+        // A reused slot below the high-water mark is in the dirty range, so it re-uploads.
+        device.queue.writeBuffer(
+          this._storageBuffer!,
+          firstRow * slotBytes,
+          this._buffer.data.buffer,
+          this._buffer.data.byteOffset + firstRow * slotBytes,
+          rowCount * slotBytes,
+        );
+        this._buffer.recordUpload(rowCount);
+        this._accountant?.recordBufferUpload(rowCount * slotBytes);
+      }
 
-      device.queue.writeBuffer(this._storageBuffer!, 0, this._buffer.data.buffer, this._buffer.data.byteOffset, bytes);
-      this._buffer.recordUpload(snapshot.count);
-      this._accountant?.recordBufferUpload(bytes);
       this._storageHash = snapshot.hash;
       this._storageCount = snapshot.count;
     }
@@ -142,6 +172,7 @@ export class WebGpuTransformStorage {
     this._storageCapacity = nextCapacity;
     this._storageHash = 0;
     this._storageCount = -1;
+    this._needsFullUpload = true;
     // Re-book the storage footprint (free the prior buffer's bytes, allocate the new).
     this._accountedBytes = this._accountant?.reallocate(this._accountedBytes, nextCapacity) ?? this._accountedBytes;
   }
diff --git a/test/perf/rendering/harness.ts b/test/perf/rendering/harness.ts
index cd21b523..2af75114 100644
--- a/test/perf/rendering/harness.ts
+++ b/test/perf/rendering/harness.ts
@@ -7,6 +7,7 @@
  *
  * @internal Test/perf-only.
  */
+import { playRenderTree } from '#rendering/plan/playRenderTree';
 import type { RenderNode } from '#rendering/RenderNode';
 import type { View } from '#rendering/View';
 import { WebGl2Backend } from '#rendering/webgl2/WebGl2Backend';
@@ -161,3 +162,49 @@ export const measureSteadyFrame = (harness: WebGl2Harness, root: RenderNode, war
 
   return metrics!;
 };
+
+/**
+ * Render each node via its own setView + playRenderTree (exactly what
+ * RenderingContext.render does per call), then flush once — i.e. the
+ * "one context.render() per drawable in a loop" pattern. Returns the metrics of
+ * the final warmed frame.
+ */
+export const measureCrossCallFrame = (harness: WebGl2Harness, nodes: readonly RenderNode[], warmupFrames = 2): FrameMetrics => {
+  const { backend, recorder } = harness;
+  let metrics: FrameMetrics | null = null;
+
+  for (let i = 0; i <= warmupFrames; i++) {
+    backend.resetStats();
+    recorder.reset();
+    backend.clear();
+
+    const view = backend.view;
+    for (const node of nodes) {
+      backend.setView(view);
+      playRenderTree(node, backend);
+    }
+    backend.flush();
+
+    const stats = backend.stats;
+    metrics = {
+      drawCalls: stats.drawCalls,
+      batches: stats.batches,
+      instances: recorder.instances,
+      visibleNodes: stats.submittedNodes,
+      culledNodes: stats.culledNodes,
+      renderPasses: stats.renderPasses,
+      textureBinds: recorder.textureBinds,
+      samplerBinds: recorder.samplerBinds,
+      programChanges: recorder.programChanges,
+      blendChanges: recorder.blendChanges,
+      bufferUploads: recorder.bufferUploads,
+      bufferReallocations: recorder.bufferReallocations,
+      uploadedBufferBytes: recorder.bufferUploadBytes,
+      transformRows: recorder.transformRows,
+      transformUploads: recorder.transformUploads,
+      transformUploadBytes: recorder.transformUploadBytes,
+    };
+  }
+
+  return metrics!;
+};
diff --git a/test/perf/rendering/structural-sprite.test.ts b/test/perf/rendering/structural-sprite.test.ts
index 00fd1cb9..2ca4440c 100644
--- a/test/perf/rendering/structural-sprite.test.ts
+++ b/test/perf/rendering/structural-sprite.test.ts
@@ -14,7 +14,7 @@ import { Sprite } from '#rendering/sprite/Sprite';
 import type { BlendModes } from '#rendering/types';
 
 import { buildSpriteScene, makeTextures } from './fixtures';
-import { createWebGl2Harness, measureSteadyFrame, type WebGl2Harness } from './harness';
+import { createWebGl2Harness, measureCrossCallFrame, measureSteadyFrame, type WebGl2Harness } from './harness';
 
 const withHarness = (fn: (harness: WebGl2Harness) => void): void => {
   const harness = createWebGl2Harness();
@@ -138,6 +138,25 @@ describe('structural — Sprite', () => {
     });
   });
 
+  it('1000 per-call renders / 1 texture → one draw (cross-call batching)', () => {
+    withHarness(harness => {
+      const [texture] = makeTextures(1);
+      const sprites = Array.from({ length: 1000 }, (_, i) => {
+        const sprite = new Sprite(texture);
+        sprite.setPosition(i % 100, Math.floor(i / 100));
+        return sprite;
+      });
+
+      const m = measureCrossCallFrame(harness, sprites, 2);
+
+      expect(m.drawCalls).toBe(1);
+      expect(m.instances).toBe(1000);
+      expect(m.visibleNodes).toBe(1000);
+
+      for (const sprite of sprites) sprite.destroy();
+    });
+  });
+
   it('static transforms skip re-upload; moving transforms re-upload all rows', () => {
     withHarness(harness => {
       const staticScene = buildSpriteScene({ count: 500, textures: makeTextures(1) });
@@ -164,4 +183,26 @@ describe('structural — Sprite', () => {
       root.destroy();
     });
   });
+
+  it('per-call renders match a Container render (same draws, instances, transform rows)', () => {
+    withHarness(harness => {
+      const [texture] = makeTextures(1);
+
+      const loose = Array.from({ length: 500 }, (_, i) => {
+        const sprite = new Sprite(texture);
+        sprite.setPosition((i * 7) % 640, (i * 13) % 480);
+        return sprite;
+      });
+      const crossCall = measureCrossCallFrame(harness, loose, 2);
+      for (const sprite of loose) sprite.destroy();
+
+      const { root } = buildSpriteScene({ count: 500, textures: makeTextures(1) });
+      const container = measureSteadyFrame(harness, root, 2);
+      root.destroy();
+
+      expect(crossCall.drawCalls).toBe(container.drawCalls);
+      expect(crossCall.instances).toBe(container.instances);
+      expect(crossCall.transformRows).toBe(container.transformRows);
+    });
+  });
 });
diff --git a/test/rendering/transform-buffer.test.ts b/test/rendering/transform-buffer.test.ts
index 5d5d5d30..ed74a847 100644
--- a/test/rendering/transform-buffer.test.ts
+++ b/test/rendering/transform-buffer.test.ts
@@ -168,4 +168,91 @@ describe('TransformBuffer', () => {
 
     parent.destroy();
   });
+
+  test('consumeDirtyRange returns empty sentinel on a fresh buffer after begin()', () => {
+    const buffer = new TransformBuffer();
+
+    buffer.begin();
+    const result = buffer.consumeDirtyRange(10);
+
+    expect(result.rowCount).toBe(0);
+    expect(result.firstRow).toBe(0);
+  });
+
+  test('consumeDirtyRange covers all written slots and clears itself on second call', () => {
+    const buffer = new TransformBuffer();
+    const identity = new Matrix();
+
+    buffer.begin();
+    buffer.write(0, identity, Color.white);
+    buffer.write(1, identity, Color.white);
+    buffer.write(2, identity, Color.white);
+
+    const first = buffer.consumeDirtyRange(3);
+
+    expect(first).toEqual({ firstRow: 0, rowCount: 3 });
+
+    const second = buffer.consumeDirtyRange(3);
+
+    expect(second.rowCount).toBe(0);
+  });
+
+  test('consumeDirtyRange tracks reuse below the high-water mark', () => {
+    const buffer = new TransformBuffer();
+    const identity = new Matrix();
+
+    buffer.begin();
+    buffer.write(0, identity, Color.white);
+    buffer.write(1, identity, Color.white);
+    buffer.write(2, identity, Color.white);
+    buffer.consumeDirtyRange(3); // clear after first writes
+
+    buffer.write(1, identity, Color.white); // reuse slot 1 below high-water mark
+
+    const result = buffer.consumeDirtyRange(3);
+
+    expect(result).toEqual({ firstRow: 1, rowCount: 1 });
+  });
+
+  test('consumeDirtyRange clamps to maxCount — a write above the limit is excluded', () => {
+    const buffer = new TransformBuffer();
+    const identity = new Matrix();
+
+    buffer.begin();
+    buffer.write(5, identity, Color.white); // slot 5 is above maxCount = 3
+
+    const result = buffer.consumeDirtyRange(3);
+
+    expect(result.rowCount).toBe(0);
+  });
+
+  test('rewindTo restores the write cursor and optionally the frame hash', () => {
+    const buffer = new TransformBuffer();
+    const identity = new Matrix();
+
+    buffer.begin();
+    buffer.write(0, identity, Color.white);
+    const savedHash = buffer.frameHash;
+
+    buffer.write(1, identity, Color.white);
+    buffer.rewindTo(1, savedHash);
+
+    expect(buffer.count).toBe(1);
+    expect(buffer.frameHash).toBe(savedHash);
+  });
+
+  test('begin() resets the dirty range so consumeDirtyRange returns empty', () => {
+    const buffer = new TransformBuffer();
+    const identity = new Matrix();
+
+    buffer.begin();
+    buffer.write(0, identity, Color.white);
+    buffer.write(1, identity, Color.white);
+
+    buffer.begin(); // should reset dirty range
+
+    const result = buffer.consumeDirtyRange(10);
+
+    expect(result.rowCount).toBe(0);
+  });
 });
diff --git a/test/rendering/webgpu-backend.test.ts b/test/rendering/webgpu-backend.test.ts
index d0e7c284..0e057969 100644
--- a/test/rendering/webgpu-backend.test.ts
+++ b/test/rendering/webgpu-backend.test.ts
@@ -1790,10 +1790,15 @@ describe('WebGpuBackend', () => {
       manager.flush();
       manager.destroy();
 
-      // The sprite's world transform now lives in the shared transform storage
-      // buffer (uploaded as the last writeBuffer of the sprite flush), not inline
-      // in the instance buffer. Slot 0 = (a, b, c, d, tx, ty, 0, 0, tint…); an
+      // The sprite's world transform lives in the shared transform storage buffer
+      // (the last writeBuffer of the sprite flush carries the whole buffer's
+      // ArrayBuffer), not inline in the instance buffer. The buffer is frame-scoped
+      // (cross-call batching): the graphics rendered into the RenderTexture is the
+      // first shared-buffer write (slot 0), so the sprite is the second and lands
+      // in slot 1. Each slot is 12 floats (a, b, c, d, tx, ty, 0, 0, tint…); an
       // unrotated sprite at (24, 18) has b == 0 and carries that translation.
+      const slotFloats = 12;
+      const spriteBase = 1 * slotFloats; // slot 1
       const transformWrite = environment.queue.writeBuffer.mock.calls[environment.queue.writeBuffer.mock.calls.length - 1];
       const data = new Float32Array(transformWrite[2] as ArrayBuffer);
 
@@ -1801,9 +1806,9 @@ describe('WebGpuBackend', () => {
       expect(environment.pass.drawIndexed).toHaveBeenCalled();
       expect(environment.queue.submit.mock.calls.length).toBeGreaterThanOrEqual(2);
       expect(environment.textures.length).toBeGreaterThan(0);
-      expect(data[1]).toBe(0);
-      expect(data[4]).toBe(24);
-      expect(data[5]).toBe(18);
+      expect(data[spriteBase + 1]).toBe(0);
+      expect(data[spriteBase + 4]).toBe(24);
+      expect(data[spriteBase + 5]).toBe(18);
     } finally {
       environment.restore();
     }