diff --git a/examples/debug-layer/performance-overlay.js b/examples/debug-layer/performance-overlay.js index 275375b1..7534cc11 100644 --- a/examples/debug-layer/performance-overlay.js +++ b/examples/debug-layer/performance-overlay.js @@ -1,5 +1,5 @@ // Auto-generated from performance-overlay.ts — edit the .ts source, not this file. -import { Application, Color, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs'; +import { Application, Color, Container, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs'; import { DebugOverlay } from '@codexo/exojs/debug'; const app = new Application({ canvas: { @@ -17,14 +17,21 @@ const debug = new DebugOverlay(app); debug.layers.performance.visible = true; class PerformanceOverlayScene extends Scene { sprites; + layer; async load(loader) { await loader.load(Texture, { bunny: 'image/ship-a.png' }); } init(loader) { const { width, height } = this.app.canvas; + // All sprites share one texture, so adding them to a single container and + // rendering it once submits the whole layer as one plan and one draw call. + // Per-sprite `context.render(sprite)` calls now batch across calls as well, + // but each call still re-applies the view and plays its own plan.
+ this.layer = new Container(); this.sprites = Array.from({ length: 1600 }, () => { const sprite = new Sprite(loader.get(Texture, 'bunny')).setAnchor(0.5).setScale(0.25); sprite.setPosition(Math.random() * width, Math.random() * height); + this.layer.addChild(sprite); return { sprite, vx: (Math.random() - 0.5) * 120, @@ -47,8 +54,7 @@ class PerformanceOverlayScene extends Scene { } draw(context) { context.backend.clear(); - for (const { sprite } of this.sprites) - context.render(sprite); + context.render(this.layer); } } app.start(new PerformanceOverlayScene()); diff --git a/examples/debug-layer/performance-overlay.ts b/examples/debug-layer/performance-overlay.ts index 7ff4b2b8..b5b1d415 100644 --- a/examples/debug-layer/performance-overlay.ts +++ b/examples/debug-layer/performance-overlay.ts @@ -1,4 +1,4 @@ -import { Application, Color, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs'; +import { Application, Color, Container, Keyboard, Scene, Sprite, Texture } from '@codexo/exojs'; import { DebugOverlay } from '@codexo/exojs/debug'; const app = new Application({ @@ -19,6 +19,7 @@ debug.layers.performance.visible = true; class PerformanceOverlayScene extends Scene { private sprites!: { sprite: Sprite; vx: number; vy: number }[]; + private layer!: Container; override async load(loader): Promise { await loader.load(Texture, { bunny: 'image/ship-a.png' }); @@ -27,9 +28,15 @@ class PerformanceOverlayScene extends Scene { override init(loader): void { const { width, height } = this.app.canvas; + // All sprites share one texture, so adding them to a single container and + // rendering it once submits the whole layer as one plan and one draw call. + // Per-sprite `context.render(sprite)` calls now batch across calls as well, + // but each call still re-applies the view and plays its own plan.
+ this.layer = new Container(); this.sprites = Array.from({ length: 1600 }, () => { const sprite = new Sprite(loader.get(Texture, 'bunny')).setAnchor(0.5).setScale(0.25); sprite.setPosition(Math.random() * width, Math.random() * height); + this.layer.addChild(sprite); return { sprite, vx: (Math.random() - 0.5) * 120, @@ -53,7 +60,7 @@ class PerformanceOverlayScene extends Scene { override draw(context): void { context.backend.clear(); - for (const { sprite } of this.sprites) context.render(sprite); + context.render(this.layer); } } diff --git a/packages/exojs-particles/test/particle-gpu.test.ts b/packages/exojs-particles/test/particle-gpu.test.ts index 333945d0..93296074 100644 --- a/packages/exojs-particles/test/particle-gpu.test.ts +++ b/packages/exojs-particles/test/particle-gpu.test.ts @@ -384,6 +384,10 @@ describe('ParticleSystem render-inject backend detection', () => { const env = makeMockDevice(); const fakeBackend = Object.create(WebGpuBackend.prototype) as object; Object.defineProperty(fakeBackend, 'device', { value: env.device, configurable: true }); + // Frame-scoped batching uses these instance stacks in _beginDrawPlan/_endDrawPlan; + // Object.create bypasses the constructor that initializes them, so seed them here. + Object.defineProperty(fakeBackend, '_planBaseStack', { value: [], configurable: true }); + Object.defineProperty(fakeBackend, '_planHashStack', { value: [], configurable: true }); const system = new ParticleSystem(makeTexture(), { capacity: 4 }); system.addUpdateModule(new ApplyForce(0, 0)); diff --git a/src/rendering/RenderingContext.ts b/src/rendering/RenderingContext.ts index 7e9b8a1b..2c66b99c 100644 --- a/src/rendering/RenderingContext.ts +++ b/src/rendering/RenderingContext.ts @@ -261,11 +261,10 @@ export class RenderingContext implements System { const view = options.view ?? 
this._camera; const mesh = (this._immediateMesh ??= new ImmediateMesh()); - // Set the view first: this flushes whatever renderer a prior render() / - // drawGeometry left pending, so the shared transform buffer is free for this - // draw's synthetic slot and the pooled mesh is safe to reconfigure. The - // immediate flush below then keeps a later drawGeometry from observing this - // pooled mesh through a still-deferred draw. + // Set the view first: setView now only flushes when the view actually changes + // (not unconditionally). Correctness here rests on (a) the trailing flush() + // below — so a later drawGeometry cannot observe this pooled mesh through a + // still-deferred draw — and (b) any renderer switch flushing its pending batch. this._backend.setView(view); mesh.configure(geometry, transform, material, options.tint ?? null); this._backend.draw(mesh); @@ -302,9 +301,11 @@ export class RenderingContext implements System { const view = options.view ?? this._camera; const mesh = (this._batchMesh ??= new ImmediateMesh()); - // Set the view first (flushing any renderer left pending), configure the - // pooled geometry/look source, then submit a single instanced draw over the - // batch's per-instance transforms/tints and flush it immediately. + // Set the view first (setView only flushes when the view actually changes; + // correctness rests on the trailing flush() below and on any renderer switch + // flushing its pending batch), configure the pooled geometry/look source, + // then submit a single instanced draw over the batch's per-instance + // transforms/tints and flush it immediately. 
this._backend.setView(view); mesh.configureBatchSource(batch.geometry, batch.material); this._backend.drawInstanced(mesh, batch._instanceTransforms, batch._instanceTints, batch.count); diff --git a/src/rendering/TransformBuffer.ts b/src/rendering/TransformBuffer.ts index 8482f134..f63b0181 100644 --- a/src/rendering/TransformBuffer.ts +++ b/src/rendering/TransformBuffer.ts @@ -38,6 +38,12 @@ export class TransformBuffer { private _skippedWriteCount = 0; private _uploadCount = 0; private _uploadedRecordCount = 0; + // Dirty row range [_dirtyMin, _dirtyMax] written since the last upload — the + // exact rows a delta upload must push. Empty when `_dirtyMax < _dirtyMin`. + // Tracked by slot (not a high-water mark) so a reused slot (nested-plan + // rewind, filter composite) is correctly re-uploaded. + private _dirtyMin = 0; + private _dirtyMax = -1; public get count(): number { return this._count; @@ -94,6 +100,11 @@ export class TransformBuffer { return this._version; } + /** Running content hash of the rows written since begin(). @internal */ + public get frameHash(): number { + return this._frameHash; + } + public begin(expectedCount = 0): this { if (expectedCount > 0) { this._ensureCapacity(expectedCount); @@ -105,6 +116,8 @@ export class TransformBuffer { this._skippedWriteCount = 0; this._uploadCount = 0; this._uploadedRecordCount = 0; + this._dirtyMin = 0; + this._dirtyMax = -1; return this; } @@ -117,6 +130,47 @@ export class TransformBuffer { return slot; } + /** + * Rewind the write cursor to `count`, freeing the rows above it for reuse, and + * (optionally) restore the running content hash to its pre-rewind value so the + * freed rows' writes don't linger in the hash and trigger spurious re-uploads. + * Used by nested draw plans (filters / cacheAsBitmap) to isolate their slots. 
+ * @internal + */ + public rewindTo(count: number, frameHash?: number): this { + if (count >= 0 && count < this._count) { + this._count = count; + + if (frameHash !== undefined) { + this._frameHash = frameHash >>> 0; + } + } + + return this; + } + + /** + * Consume the dirty row range written since the last upload, clamped to + * `[0, maxCount)`, and clear it. Returns the contiguous `[firstRow, firstRow + + * rowCount)` a delta upload should push, or `{ firstRow: 0, rowCount: 0 }` when + * no row below `maxCount` is dirty. The backend calls this at its upload boundary. + * @internal + */ + public consumeDirtyRange(maxCount: number): { firstRow: number; rowCount: number } { + if (this._dirtyMax < this._dirtyMin) { + return { firstRow: 0, rowCount: 0 }; + } + + const firstRow = Math.max(0, this._dirtyMin); + const lastRow = Math.min(this._dirtyMax, maxCount - 1); + const rowCount = lastRow >= firstRow ? lastRow - firstRow + 1 : 0; + + this._dirtyMin = 0; + this._dirtyMax = -1; + + return rowCount > 0 ? { firstRow, rowCount } : { firstRow: 0, rowCount: 0 }; + } + public write(slot: number, transform: Matrix, tint: Color): this { if (!Number.isInteger(slot) || slot < 0) { throw new Error(`TransformBuffer slot must be a non-negative integer (got ${slot}).`); @@ -144,6 +198,16 @@ export class TransformBuffer { this._count = slot + 1; } + // Track the exact written-slot range so a delta upload pushes precisely the + // changed rows — including a slot reused below the high-water mark.
+ if (this._dirtyMax < this._dirtyMin) { + this._dirtyMin = slot; + this._dirtyMax = slot; + } else { + if (slot < this._dirtyMin) this._dirtyMin = slot; + if (slot > this._dirtyMax) this._dirtyMax = slot; + } + this._frameHash = this._mix(this._frameHash, slot); for (let i = 0; i < floatsPerSlot; i++) { diff --git a/src/rendering/plan/RenderInstruction.ts b/src/rendering/plan/RenderInstruction.ts index 27132776..91161945 100644 --- a/src/rendering/plan/RenderInstruction.ts +++ b/src/rendering/plan/RenderInstruction.ts @@ -9,8 +9,11 @@ import type { GroupScope } from './RenderScope'; * names the concept the plan player consumes and that the batching layer * reorders, independent of how the draw happens to be stored in the scope * tree. Future {@link TransformBuffer} slotting keys on each instruction's - * stable {@link DrawCommand.nodeIndex} (within the `[0, plan.nodeCount)` - * slot space). + * stable {@link DrawCommand.nodeIndex}. Each index is frame-global — + * `[frameBase, frameBase + plan.nodeCount)` — because the transform buffer + * is frame-scoped and the builder bases node indices at the current buffer + * slot count (`frameBase`) so every plan in the frame occupies distinct + * slots and can batch cross-call. 
* * Batch units (maximal runs of consecutive instructions in a {@link GroupScope} * sharing GPU pipeline/bind state) are not materialized: the plan player walks diff --git a/src/rendering/plan/RenderPlanBuilder.ts b/src/rendering/plan/RenderPlanBuilder.ts index db9cbe41..58053f25 100644 --- a/src/rendering/plan/RenderPlanBuilder.ts +++ b/src/rendering/plan/RenderPlanBuilder.ts @@ -93,7 +93,11 @@ export class RenderPlanBuilder { this._barrierEntryPoolCursor = 0; this._scopeStack.length = 0; this._hasPending = false; - this._nodeIndex = 0; + // Base this plan's node indices after whatever earlier render() calls already + // wrote into the frame-scoped transform buffer, so every draw across all + // render() calls in the frame references a distinct slot and can batch. + const frameBase = (backend as { transformBufferCount?: number }).transformBufferCount ?? 0; + this._nodeIndex = frameBase; const rootScope = this._acquireGroupScope(false); @@ -110,7 +114,7 @@ export class RenderPlanBuilder { }); } - this._plan.nodeCount = this._nodeIndex; + this._plan.nodeCount = this._nodeIndex - frameBase; return this._plan; } diff --git a/src/rendering/webgl2/WebGl2Backend.ts b/src/rendering/webgl2/WebGl2Backend.ts index e58d6a3a..c9e660ba 100644 --- a/src/rendering/webgl2/WebGl2Backend.ts +++ b/src/rendering/webgl2/WebGl2Backend.ts @@ -181,6 +181,8 @@ export class WebGl2Backend implements RenderBackend { private _transformTextureCount = -1; private _activeDrawCommand: DrawCommand | null = null; private _drawPlanDepth = 0; + private readonly _planBaseStack: number[] = []; + private readonly _planHashStack: number[] = []; public constructor(app: Application) { const canvasOptions = app.options.canvas ?? {}; @@ -279,13 +281,27 @@ export class WebGl2Backend implements RenderBackend { public resetStats(): this { resetRenderStats(this._stats); + // The transform buffer is frame-scoped: reset it once per frame here (was + // previously reset per render() call in _beginDrawPlan). 
+ this._transformBuffer.begin(); return this; } + /** Frame-global slot base the plan builder indexes from. @internal */ + public get transformBufferCount(): number { + return this._transformBuffer.count; + } + /** @internal */ - public _beginDrawPlan(nodeCount: number): void { - this._transformBuffer.begin(nodeCount); + public _beginDrawPlan(_nodeCount: number): void { + // Do NOT reset the transform buffer here — it is frame-scoped (reset in + // resetStats). The builder already based this plan's node indices at the + // current buffer count, so writes land in fresh frame-global slots and + // batches survive across render() calls. Remember this plan's base so a + // nested plan can free its rows on end. + this._planBaseStack.push(this._transformBuffer.count); + this._planHashStack.push(this._transformBuffer.frameHash); this._activeDrawCommand = null; this._drawPlanDepth++; } @@ -395,13 +411,23 @@ export class WebGl2Backend implements RenderBackend { public _endDrawPlan(): void { this._activeDrawCommand = null; + const planBase = this._planBaseStack.pop() ?? 0; + const planHash = this._planHashStack.pop() ?? 0; + if (this._drawPlanDepth > 0) { this._drawPlanDepth--; } - // Only assert balance at the outermost plan: cacheAsBitmap draws a cache - // sprite via a nested render(), whose inner _endDrawPlan sees the still-open - // outer clips — those are not leaks. + // A nested plan (filter / cacheAsBitmap) just ended: flush its draws, then + // free its transform rows so the frame-scoped buffer only grows with + // top-level render() calls. Top-level plans (depth back to 0) keep their rows + // so cross-call batching survives to the frame-end flush. + if (this._drawPlanDepth > 0) { + this._flushActiveRenderer(); + this._transformBuffer.rewindTo(planBase, planHash); + } + + // Only assert balance at the outermost plan. 
if (this._drawPlanDepth === 0) { this._assertBalancedStencil(); } @@ -715,7 +741,12 @@ export class WebGl2Backend implements RenderBackend { } public setView(view: View | null): this { - this._flushActiveRenderer(); + // Only flush the open batch when a different View instance is bound (identity + // check — in-place edits to the same View are not detected here). render() + // re-applies the same camera view per call, so always flushing broke batching. + if (this._renderTarget.view !== view) { + this._flushActiveRenderer(); + } this._renderTarget.setView(view); this._bindRenderTarget(this._renderTarget); @@ -804,11 +835,23 @@ export class WebGl2Backend implements RenderBackend { throw new Error('Transform texture must be initialized before binding.'); } + // A skipped flush (all three guards false) leaves the dirty range uncleared + // until the next begin(). Safe: every write() mixes its slot into _frameHash, + // so a non-empty dirty range always coincides with snapshot.changed = true — + // the upload branch is always taken before any dirty rows could be stale. if (snapshot.changed || snapshot.count !== this._transformTextureCount || snapshot.hash !== this._transformTextureHash) { - nextTransformTexture.commitRect(0, 0, 3, snapshot.count); - this._transformBuffer.recordUpload(snapshot.count); - this._transformTextureHash = snapshot.hash; + // Upload only the rows actually written since the last upload (delta), so + // barrier-heavy frames don't re-upload the whole growing buffer. A reused + // slot below the high-water mark is in the dirty range, so it re-uploads.
+ const { firstRow, rowCount } = this._transformBuffer.consumeDirtyRange(snapshot.count); + + if (rowCount > 0) { + nextTransformTexture.commitRect(0, firstRow, 3, rowCount); + this._transformBuffer.recordUpload(rowCount); + } + this._transformTextureCount = snapshot.count; + this._transformTextureHash = snapshot.hash; } return this.bindTexture(nextTransformTexture, unit); diff --git a/src/rendering/webgpu/WebGpuBackend.ts b/src/rendering/webgpu/WebGpuBackend.ts index 5d91790e..2a3f187a 100644 --- a/src/rendering/webgpu/WebGpuBackend.ts +++ b/src/rendering/webgpu/WebGpuBackend.ts @@ -127,6 +127,8 @@ export class WebGpuBackend implements RenderBackend { private _activeDrawCommand: DrawCommand | null = null; private _passCoordinatorInstance: WebGpuPassCoordinator | null = null; private _drawPlanDepth = 0; + private readonly _planBaseStack: number[] = []; + private readonly _planHashStack: number[] = []; public constructor(app: Application) { const canvasOptions = app.options.canvas ?? {}; @@ -243,22 +245,37 @@ export class WebGpuBackend implements RenderBackend { public resetStats(): this { resetRenderStats(this._stats); + // The transform buffer is frame-scoped: reset it once per frame here (was + // previously reset per render() call in _beginDrawPlan). + this._getTransformStorage().buffer.begin(); return this; } + /** Frame-global slot base the plan builder indexes from. @internal */ + public get transformBufferCount(): number { + return this._getTransformStorage().buffer.count; + } + /** @internal */ public _beginDrawPlan(nodeCount: number): void { const storage = this._getTransformStorage(); - storage.begin(nodeCount); + // Do NOT reset the transform buffer here — it is frame-scoped (reset in + // resetStats). The builder already based this plan's node indices at the + // current buffer count, so writes land in fresh frame-global slots and + // batches survive across render() calls. Remember this plan's base so a + // nested plan can free its rows on end. 
+ this._planBaseStack.push(storage.buffer.count); + this._planHashStack.push(storage.buffer.frameHash); // Pre-allocate the GPU storage buffer for the full plan before any group - // flush runs. Without this, a later flush with a higher maxNodeIndex would - // destroy and replace the buffer mid-frame while earlier command buffers - // may still reference the old allocation. - if (nodeCount > 0 && this._device !== null && !this._deviceLost) { - storage.reserve(this._device, nodeCount, this._accountant); + // flush runs. Base the reservation on the frame-global count + this plan's + // nodes so the buffer grows to cover both pre-existing frame rows and new rows. + const reserveCount = storage.buffer.count + nodeCount; + + if (reserveCount > 0 && this._device !== null && !this._deviceLost) { + storage.reserve(this._device, reserveCount, this._accountant); } this._activeDrawCommand = null; @@ -311,10 +328,22 @@ export class WebGpuBackend implements RenderBackend { public _endDrawPlan(): void { this._activeDrawCommand = null; + const planBase = this._planBaseStack.pop() ?? 0; + const planHash = this._planHashStack.pop() ?? 0; + if (this._drawPlanDepth > 0) { this._drawPlanDepth--; } + // A nested plan (filter / cacheAsBitmap) just ended: flush its draws, then + // free its transform rows so the frame-scoped buffer only grows with + // top-level render() calls. Top-level plans (depth back to 0) keep their rows + // so cross-call batching survives to the frame-end flush. + if (this._drawPlanDepth > 0) { + this._flushActiveRenderer(); + this._getTransformStorage().buffer.rewindTo(planBase, planHash); + } + // Only assert balance at the outermost plan: a nested render() (e.g. // cacheAsBitmap drawing its cache sprite) sees the still-open outer clips, // which are not leaks. 
@@ -594,7 +623,12 @@ export class WebGpuBackend implements RenderBackend { } public setView(view: View | null): this { - this._flushActiveRenderer(); + // Only flush the open batch when a different View instance is bound (identity + // check — in-place edits to the same View are not detected here). render() + // re-applies the same camera view per call, so always flushing broke batching. + if (this._renderTarget.view !== view) { + this._flushActiveRenderer(); + } this._renderTarget.setView(view); return this; diff --git a/src/rendering/webgpu/WebGpuTransformStorage.ts b/src/rendering/webgpu/WebGpuTransformStorage.ts index 72a3c9d8..529e3e0c 100644 --- a/src/rendering/webgpu/WebGpuTransformStorage.ts +++ b/src/rendering/webgpu/WebGpuTransformStorage.ts @@ -14,6 +14,7 @@ export class WebGpuTransformStorage { private _storageCapacity = 0; private _storageHash = 0; private _storageCount = -1; + private _needsFullUpload = false; private _accountant: GpuResourceAccountant | null = null; /** GPU bytes currently booked for the storage buffer with the resource accountant. */ private _accountedBytes = 0; @@ -27,7 +28,8 @@ export class WebGpuTransformStorage { return this._buffer; } - public begin(nodeCount: number): void { + /** Reset the underlying frame-scoped buffer. Used directly by tests. @internal */ + public begin(nodeCount = 0): void { this._buffer.begin(nodeCount); } @@ -98,12 +100,40 @@ export class WebGpuTransformStorage { this._growBuffer(device, requiredBytes); } + // A skipped flush (all three guards false) leaves the dirty range uncleared + // until the next begin(). Safe: every write() mixes its slot into _frameHash, + // so a non-empty dirty range always coincides with snapshot.changed = true — + // the upload branch is always taken before any dirty rows could be stale.
if (snapshot.changed || snapshot.hash !== this._storageHash || snapshot.count !== this._storageCount) { - const bytes = snapshot.count * slotFloatCount * Float32Array.BYTES_PER_ELEMENT; + // Always consume the dirty range first to clear it — regardless of whether + // the full-upload path (post-grow) or the delta path runs below. Both paths + // are inside this if-branch; the skip case (snapshot unchanged) never reaches + // here, so the dirty range is only consumed when an upload is actually issued. + const { firstRow, rowCount } = this._buffer.consumeDirtyRange(snapshot.count); + + const slotBytes = slotFloatCount * Float32Array.BYTES_PER_ELEMENT; + + if (this._needsFullUpload) { + // Post-grow: the new GPUBuffer is empty; upload the full [0, snapshot.count) + // range so rows already consumed by earlier flushes this frame are present. + device.queue.writeBuffer(this._storageBuffer!, 0, this._buffer.data.buffer, this._buffer.data.byteOffset, snapshot.count * slotBytes); + this._buffer.recordUpload(snapshot.count); + this._accountant?.recordBufferUpload(snapshot.count * slotBytes); + this._needsFullUpload = false; + } else if (rowCount > 0) { + // Normal delta path: upload only the rows written since the last upload. + // A reused slot below the high-water mark is in the dirty range, so it re-uploads. 
+ device.queue.writeBuffer( + this._storageBuffer!, + firstRow * slotBytes, + this._buffer.data.buffer, + this._buffer.data.byteOffset + firstRow * slotBytes, + rowCount * slotBytes, + ); + this._buffer.recordUpload(rowCount); + this._accountant?.recordBufferUpload(rowCount * slotBytes); + } - device.queue.writeBuffer(this._storageBuffer!, 0, this._buffer.data.buffer, this._buffer.data.byteOffset, bytes); - this._buffer.recordUpload(snapshot.count); - this._accountant?.recordBufferUpload(bytes); this._storageHash = snapshot.hash; this._storageCount = snapshot.count; } @@ -142,6 +172,7 @@ export class WebGpuTransformStorage { this._storageCapacity = nextCapacity; this._storageHash = 0; this._storageCount = -1; + this._needsFullUpload = true; // Re-book the storage footprint (free the prior buffer's bytes, allocate the new). this._accountedBytes = this._accountant?.reallocate(this._accountedBytes, nextCapacity) ?? this._accountedBytes; } diff --git a/test/perf/rendering/harness.ts b/test/perf/rendering/harness.ts index cd21b523..2af75114 100644 --- a/test/perf/rendering/harness.ts +++ b/test/perf/rendering/harness.ts @@ -7,6 +7,7 @@ * * @internal Test/perf-only. */ +import { playRenderTree } from '#rendering/plan/playRenderTree'; import type { RenderNode } from '#rendering/RenderNode'; import type { View } from '#rendering/View'; import { WebGl2Backend } from '#rendering/webgl2/WebGl2Backend'; @@ -161,3 +162,49 @@ export const measureSteadyFrame = (harness: WebGl2Harness, root: RenderNode, war return metrics!; }; + +/** + * Render each node via its own setView + playRenderTree (exactly what + * RenderingContext.render does per call), then flush once — i.e. the + * "one context.render() per drawable in a loop" pattern. Returns the metrics of + * the final warmed frame. 
+ */ +export const measureCrossCallFrame = (harness: WebGl2Harness, nodes: readonly RenderNode[], warmupFrames = 2): FrameMetrics => { + const { backend, recorder } = harness; + let metrics: FrameMetrics | null = null; + + for (let i = 0; i <= warmupFrames; i++) { + backend.resetStats(); + recorder.reset(); + backend.clear(); + + const view = backend.view; + for (const node of nodes) { + backend.setView(view); + playRenderTree(node, backend); + } + backend.flush(); + + const stats = backend.stats; + metrics = { + drawCalls: stats.drawCalls, + batches: stats.batches, + instances: recorder.instances, + visibleNodes: stats.submittedNodes, + culledNodes: stats.culledNodes, + renderPasses: stats.renderPasses, + textureBinds: recorder.textureBinds, + samplerBinds: recorder.samplerBinds, + programChanges: recorder.programChanges, + blendChanges: recorder.blendChanges, + bufferUploads: recorder.bufferUploads, + bufferReallocations: recorder.bufferReallocations, + uploadedBufferBytes: recorder.bufferUploadBytes, + transformRows: recorder.transformRows, + transformUploads: recorder.transformUploads, + transformUploadBytes: recorder.transformUploadBytes, + }; + } + + return metrics!; +}; diff --git a/test/perf/rendering/structural-sprite.test.ts b/test/perf/rendering/structural-sprite.test.ts index 00fd1cb9..2ca4440c 100644 --- a/test/perf/rendering/structural-sprite.test.ts +++ b/test/perf/rendering/structural-sprite.test.ts @@ -14,7 +14,7 @@ import { Sprite } from '#rendering/sprite/Sprite'; import type { BlendModes } from '#rendering/types'; import { buildSpriteScene, makeTextures } from './fixtures'; -import { createWebGl2Harness, measureSteadyFrame, type WebGl2Harness } from './harness'; +import { createWebGl2Harness, measureCrossCallFrame, measureSteadyFrame, type WebGl2Harness } from './harness'; const withHarness = (fn: (harness: WebGl2Harness) => void): void => { const harness = createWebGl2Harness(); @@ -138,6 +138,25 @@ describe('structural — Sprite', () => { }); 
}); + it('1000 per-call renders / 1 texture → one draw (cross-call batching)', () => { + withHarness(harness => { + const [texture] = makeTextures(1); + const sprites = Array.from({ length: 1000 }, (_, i) => { + const sprite = new Sprite(texture); + sprite.setPosition(i % 100, Math.floor(i / 100)); + return sprite; + }); + + const m = measureCrossCallFrame(harness, sprites, 2); + + expect(m.drawCalls).toBe(1); + expect(m.instances).toBe(1000); + expect(m.visibleNodes).toBe(1000); + + for (const sprite of sprites) sprite.destroy(); + }); + }); + it('static transforms skip re-upload; moving transforms re-upload all rows', () => { withHarness(harness => { const staticScene = buildSpriteScene({ count: 500, textures: makeTextures(1) }); @@ -164,4 +183,26 @@ describe('structural — Sprite', () => { root.destroy(); }); }); + + it('per-call renders match a Container render (same draws, instances, transform rows)', () => { + withHarness(harness => { + const [texture] = makeTextures(1); + + const loose = Array.from({ length: 500 }, (_, i) => { + const sprite = new Sprite(texture); + sprite.setPosition((i * 7) % 640, (i * 13) % 480); + return sprite; + }); + const crossCall = measureCrossCallFrame(harness, loose, 2); + for (const sprite of loose) sprite.destroy(); + + const { root } = buildSpriteScene({ count: 500, textures: makeTextures(1) }); + const container = measureSteadyFrame(harness, root, 2); + root.destroy(); + + expect(crossCall.drawCalls).toBe(container.drawCalls); + expect(crossCall.instances).toBe(container.instances); + expect(crossCall.transformRows).toBe(container.transformRows); + }); + }); }); diff --git a/test/rendering/transform-buffer.test.ts b/test/rendering/transform-buffer.test.ts index 5d5d5d30..ed74a847 100644 --- a/test/rendering/transform-buffer.test.ts +++ b/test/rendering/transform-buffer.test.ts @@ -168,4 +168,91 @@ describe('TransformBuffer', () => { parent.destroy(); }); + + test('consumeDirtyRange returns empty sentinel on a fresh buffer 
after begin()', () => { + const buffer = new TransformBuffer(); + + buffer.begin(); + const result = buffer.consumeDirtyRange(10); + + expect(result.rowCount).toBe(0); + expect(result.firstRow).toBe(0); + }); + + test('consumeDirtyRange covers all written slots and clears itself on second call', () => { + const buffer = new TransformBuffer(); + const identity = new Matrix(); + + buffer.begin(); + buffer.write(0, identity, Color.white); + buffer.write(1, identity, Color.white); + buffer.write(2, identity, Color.white); + + const first = buffer.consumeDirtyRange(3); + + expect(first).toEqual({ firstRow: 0, rowCount: 3 }); + + const second = buffer.consumeDirtyRange(3); + + expect(second.rowCount).toBe(0); + }); + + test('consumeDirtyRange tracks reuse below the high-water mark', () => { + const buffer = new TransformBuffer(); + const identity = new Matrix(); + + buffer.begin(); + buffer.write(0, identity, Color.white); + buffer.write(1, identity, Color.white); + buffer.write(2, identity, Color.white); + buffer.consumeDirtyRange(3); // clear after first writes + + buffer.write(1, identity, Color.white); // reuse slot 1 below high-water mark + + const result = buffer.consumeDirtyRange(3); + + expect(result).toEqual({ firstRow: 1, rowCount: 1 }); + }); + + test('consumeDirtyRange clamps to maxCount — a write above the limit is excluded', () => { + const buffer = new TransformBuffer(); + const identity = new Matrix(); + + buffer.begin(); + buffer.write(5, identity, Color.white); // slot 5 is above maxCount = 3 + + const result = buffer.consumeDirtyRange(3); + + expect(result.rowCount).toBe(0); + }); + + test('rewindTo restores the write cursor and optionally the frame hash', () => { + const buffer = new TransformBuffer(); + const identity = new Matrix(); + + buffer.begin(); + buffer.write(0, identity, Color.white); + const savedHash = buffer.frameHash; + + buffer.write(1, identity, Color.white); + buffer.rewindTo(1, savedHash); + + expect(buffer.count).toBe(1); + 
expect(buffer.frameHash).toBe(savedHash); + }); + + test('begin() resets the dirty range so consumeDirtyRange returns empty', () => { + const buffer = new TransformBuffer(); + const identity = new Matrix(); + + buffer.begin(); + buffer.write(0, identity, Color.white); + buffer.write(1, identity, Color.white); + + buffer.begin(); // should reset dirty range + + const result = buffer.consumeDirtyRange(10); + + expect(result.rowCount).toBe(0); + }); }); diff --git a/test/rendering/webgpu-backend.test.ts b/test/rendering/webgpu-backend.test.ts index d0e7c284..0e057969 100644 --- a/test/rendering/webgpu-backend.test.ts +++ b/test/rendering/webgpu-backend.test.ts @@ -1790,10 +1790,15 @@ describe('WebGpuBackend', () => { manager.flush(); manager.destroy(); - // The sprite's world transform now lives in the shared transform storage - // buffer (uploaded as the last writeBuffer of the sprite flush), not inline - // in the instance buffer. Slot 0 = (a, b, c, d, tx, ty, 0, 0, tint…); an + // The sprite's world transform lives in the shared transform storage buffer + // (the last writeBuffer of the sprite flush carries the whole buffer's + // ArrayBuffer), not inline in the instance buffer. The buffer is frame-scoped + // (cross-call batching): the graphics rendered into the RenderTexture is the + // first shared-buffer write (slot 0), so the sprite is the second and lands + // in slot 1. Each slot is 12 floats (a, b, c, d, tx, ty, 0, 0, tint…); an // unrotated sprite at (24, 18) has b == 0 and carries that translation. 
+ const slotFloats = 12; + const spriteBase = 1 * slotFloats; // slot 1 const transformWrite = environment.queue.writeBuffer.mock.calls[environment.queue.writeBuffer.mock.calls.length - 1]; const data = new Float32Array(transformWrite[2] as ArrayBuffer); @@ -1801,9 +1806,9 @@ describe('WebGpuBackend', () => { expect(environment.pass.drawIndexed).toHaveBeenCalled(); expect(environment.queue.submit.mock.calls.length).toBeGreaterThanOrEqual(2); expect(environment.textures.length).toBeGreaterThan(0); - expect(data[1]).toBe(0); - expect(data[4]).toBe(24); - expect(data[5]).toBe(18); + expect(data[spriteBase + 1]).toBe(0); + expect(data[spriteBase + 4]).toBe(24); + expect(data[spriteBase + 5]).toBe(18); } finally { environment.restore(); }