Skip to content

Commit

Permalink
Combine draw commands to improve rendering performance (#2421)
Browse files Browse the repository at this point in the history
  • Loading branch information
douira authored Feb 11, 2025
1 parent aef6c33 commit c83c3fb
Show file tree
Hide file tree
Showing 20 changed files with 470 additions and 245 deletions.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package net.caffeinemc.mods.sodium.client.render.chunk;

import net.caffeinemc.mods.sodium.client.SodiumClientMod;
import net.caffeinemc.mods.sodium.client.gl.attribute.GlVertexAttributeBinding;
import net.caffeinemc.mods.sodium.client.gl.device.CommandList;
import net.caffeinemc.mods.sodium.client.gl.device.DrawCommandList;
import net.caffeinemc.mods.sodium.client.gl.device.MultiDrawBatch;
Expand All @@ -16,7 +15,6 @@
import net.caffeinemc.mods.sodium.client.render.chunk.lists.ChunkRenderList;
import net.caffeinemc.mods.sodium.client.render.chunk.lists.ChunkRenderListIterable;
import net.caffeinemc.mods.sodium.client.render.chunk.region.RenderRegion;
import net.caffeinemc.mods.sodium.client.render.chunk.shader.ChunkShaderBindingPoints;
import net.caffeinemc.mods.sodium.client.render.chunk.shader.ChunkShaderInterface;
import net.caffeinemc.mods.sodium.client.render.chunk.terrain.TerrainRenderPass;
import net.caffeinemc.mods.sodium.client.render.chunk.translucent_sorting.SortBehavior;
Expand Down Expand Up @@ -73,7 +71,7 @@ public void render(ChunkRenderMatrices matrices,
continue;
}

fillCommandBuffer(this.batch, region, storage, renderList, camera, renderPass, useBlockFaceCulling);
fillCommandBuffer(this.batch, region, storage, renderList, camera, renderPass, useBlockFaceCulling, useIndexedTessellation);

if (this.batch.isEmpty()) {
continue;
Expand Down Expand Up @@ -110,7 +108,8 @@ private static void fillCommandBuffer(MultiDrawBatch batch,
ChunkRenderList renderList,
CameraTransform camera,
TerrainRenderPass pass,
boolean useBlockFaceCulling) {
boolean useBlockFaceCulling,
boolean useIndexedTessellation) {
batch.clear();

var iterator = renderList.sectionsWithGeometryIterator(pass.isTranslucent());
Expand Down Expand Up @@ -150,30 +149,48 @@ private static void fillCommandBuffer(MultiDrawBatch batch,
continue;
}

if (pass.isTranslucent()) {
addIndexedDrawCommands(batch, pMeshData, slices);
// it's sometimes necessary to not use the locally-indexed command generator even for indexed tessellations, since
// sometimes the index buffer is shared, but not globally shared. This means that translucent sections that
// are sharing an index buffer amongst them need to use the shared index command generator, since it sets the
// same element offset for each draw command and doesn't increment it. Recall that in each draw command the indexing
// of the elements needs to start at 0, and thus starting somewhere further into the shared index buffer is invalid.
// There's also the optimization that draw commands can be combined when using a shared index buffer, be it
// globally shared or just shared within the region, which isn't possible with the locally-indexed command generator.
if (useIndexedTessellation && SectionRenderDataUnsafe.isLocalIndex(pMeshData)) {
addLocalIndexedDrawCommands(batch, pMeshData, slices);
} else {
addNonIndexedDrawCommands(batch, pMeshData, slices);
addSharedIndexedDrawCommands(batch, pMeshData, slices);
}
}
}

/**
* Generates the draw commands for a chunk's meshes using the shared index buffer.
* Generates the draw commands for a chunk's meshes, where each mesh has a separate index buffer. This is used
* when rendering translucent geometry, as each geometry set needs a sorted index buffer.
*/
@SuppressWarnings("IntegerMultiplicationImplicitCastToLong")
private static void addNonIndexedDrawCommands(MultiDrawBatch batch, long pMeshData, int mask) {
private static void addLocalIndexedDrawCommands(MultiDrawBatch batch, long pMeshData, int mask) {
final var pElementPointer = batch.pElementPointer;
final var pBaseVertex = batch.pBaseVertex;
final var pElementCount = batch.pElementCount;

int size = batch.size;

long elementOffset = SectionRenderDataUnsafe.getBaseElement(pMeshData);
long baseVertex = SectionRenderDataUnsafe.getBaseVertex(pMeshData);

for (int facing = 0; facing < ModelQuadFacing.COUNT; facing++) {
// Uint32 -> Int32 cast is always safe and should be optimized away
MemoryUtil.memPutInt(pBaseVertex + (size << 2), (int) SectionRenderDataUnsafe.getVertexOffset(pMeshData, facing));
MemoryUtil.memPutInt(pElementCount + (size << 2), (int) SectionRenderDataUnsafe.getElementCount(pMeshData, facing));
MemoryUtil.memPutAddress(pElementPointer + (size << Pointer.POINTER_SHIFT), 0 /* using a shared index buffer */);
final long vertexCount = SectionRenderDataUnsafe.getVertexCount(pMeshData, facing);
final long elementCount = (vertexCount >> 2) * 6;

MemoryUtil.memPutInt(pElementCount + (size << 2), UInt32.uncheckedDowncast(elementCount));
MemoryUtil.memPutInt(pBaseVertex + (size << 2), UInt32.uncheckedDowncast(baseVertex));

// * 4 to convert to bytes (the index buffer contains integers)
MemoryUtil.memPutAddress(pElementPointer + (size << Pointer.POINTER_SHIFT), elementOffset << 2);

baseVertex += vertexCount;
elementOffset += elementCount;

size += (mask >> facing) & 1;
}
Expand All @@ -182,34 +199,57 @@ private static void addNonIndexedDrawCommands(MultiDrawBatch batch, long pMeshDa
}

/**
* Generates the draw commands for a chunk's meshes, where each mesh has a separate index buffer. This is used
* when rendering translucent geometry, as each geometry set needs a sorted index buffer.
* Generates the draw commands for a chunk's meshes using the shared index buffer.
*/
@SuppressWarnings("IntegerMultiplicationImplicitCastToLong")
private static void addIndexedDrawCommands(MultiDrawBatch batch, long pMeshData, int mask) {
private static void addSharedIndexedDrawCommands(MultiDrawBatch batch, long pMeshData, int mask) {
final var pElementPointer = batch.pElementPointer;
final var pBaseVertex = batch.pBaseVertex;
final var pElementCount = batch.pElementCount;

int size = batch.size;

long elementOffset = SectionRenderDataUnsafe.getBaseElement(pMeshData);
// this is either zero (global shared index buffer) or the offset to the location of the shared element buffer (region shared index buffer)
final var elementOffsetBytes = SectionRenderDataUnsafe.getBaseElement(pMeshData) << 2;
final var facingList = SectionRenderDataUnsafe.getFacingList(pMeshData);

for (int facing = 0; facing < ModelQuadFacing.COUNT; facing++) {
final long vertexOffset = SectionRenderDataUnsafe.getVertexOffset(pMeshData, facing);
final long elementCount = SectionRenderDataUnsafe.getElementCount(pMeshData, facing);

// Uint32 -> Int32 cast is always safe and should be optimized away
MemoryUtil.memPutInt(pBaseVertex + (size << 2), UInt32.uncheckedDowncast(vertexOffset));
MemoryUtil.memPutInt(pElementCount + (size << 2), UInt32.uncheckedDowncast(elementCount));
int size = batch.size;
long groupVertexCount = 0;
long baseVertex = SectionRenderDataUnsafe.getBaseVertex(pMeshData);
int lastMaskBit = 0;

for (int i = 0; i <= ModelQuadFacing.COUNT; i++) {
var maskBit = 0;
long vertexCount = 0;
if (i < ModelQuadFacing.COUNT) {
vertexCount = SectionRenderDataUnsafe.getVertexCount(pMeshData, i);

// if there are no vertices, the mask bit is just 0
if (vertexCount != 0) {
var facing = (facingList >>> (i * 8)) & 0xFF;
maskBit = (mask >>> facing) & 1;
}
}

// * 4 to convert to bytes (the index buffer contains integers)
// the section render data storage for the indices stores the offset in indices (also called elements)
MemoryUtil.memPutAddress(pElementPointer + (size << Pointer.POINTER_SHIFT), elementOffset << 2);
if (maskBit == 0) {
if (lastMaskBit == 1) {
// delay writing out draw command if there's a zero-size group
if (i < ModelQuadFacing.COUNT && vertexCount == 0) {
continue;
}

MemoryUtil.memPutInt(pElementCount + (size << 2), UInt32.uncheckedDowncast((groupVertexCount >> 2) * 6));
MemoryUtil.memPutInt(pBaseVertex + (size << 2), UInt32.uncheckedDowncast(baseVertex));
MemoryUtil.memPutAddress(pElementPointer + (size << Pointer.POINTER_SHIFT), elementOffsetBytes);
size++;
baseVertex += groupVertexCount;
groupVertexCount = 0;
}

baseVertex += vertexCount;
} else {
groupVertexCount += vertexCount;
}

// adding the number of elements works because the index data has one index per element (which are the indices)
elementOffset += elementCount;
size += (mask >> facing) & 1;
lastMaskBit = maskBit;
}

batch.size = size;
Expand All @@ -224,7 +264,7 @@ private static void addIndexedDrawCommands(MultiDrawBatch batch, long pMeshData,
private static final int MODEL_NEG_Y = ModelQuadFacing.NEG_Y.ordinal();
private static final int MODEL_NEG_Z = ModelQuadFacing.NEG_Z.ordinal();

private static int getVisibleFaces(int originX, int originY, int originZ, int chunkX, int chunkY, int chunkZ) {
public static int getVisibleFaces(int originX, int originY, int originZ, int chunkX, int chunkY, int chunkZ) {
// This is carefully written so that we can keep everything branch-less.
//
// Normally, this would be a ridiculous way to handle the problem. But the Hotspot VM's
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -328,9 +328,9 @@ private boolean processChunkBuildResults(ArrayList<BuilderTaskOutput> results) {
result.render.setTranslucentData(chunkBuildOutput.translucentData);
}
} else if (result instanceof ChunkSortOutput sortOutput
&& sortOutput.getTopoSorter() != null
&& sortOutput.getDynamicSorter() != null
&& result.render.getTranslucentData() instanceof DynamicTopoData data) {
this.sortTriggering.applyTriggerChanges(data, sortOutput.getTopoSorter(), result.render.getPosition(), this.cameraPosition);
this.sortTriggering.applyTriggerChanges(data, sortOutput.getDynamicSorter(), result.render.getPosition(), this.cameraPosition);
}

var job = result.render.getTaskCancellationToken();
Expand Down Expand Up @@ -678,8 +678,10 @@ public Collection<String> getDebugStrings() {

int count = 0;

long deviceUsed = 0;
long deviceAllocated = 0;
long geometryDeviceUsed = 0;
long geometryDeviceAllocated = 0;
long indexDeviceUsed = 0;
long indexDeviceAllocated = 0;

for (var region : this.regions.getLoadedRegions()) {
var resources = region.getResources();
Expand All @@ -688,15 +690,20 @@ public Collection<String> getDebugStrings() {
continue;
}

var buffer = resources.getGeometryArena();
var geometryArena = resources.getGeometryArena();
geometryDeviceUsed += geometryArena.getDeviceUsedMemory();
geometryDeviceAllocated += geometryArena.getDeviceAllocatedMemory();

deviceUsed += buffer.getDeviceUsedMemory();
deviceAllocated += buffer.getDeviceAllocatedMemory();
var indexArena = resources.getIndexArena();
indexDeviceUsed += indexArena.getDeviceUsedMemory();
indexDeviceAllocated += indexArena.getDeviceAllocatedMemory();

count++;
}

list.add(String.format("Geometry Pool: %d/%d MiB (%d buffers)", MathUtil.toMib(deviceUsed), MathUtil.toMib(deviceAllocated), count));
list.add(String.format("Pools: Geometry %d/%d MiB, Index %d/%d MiB (%d buffers)",
MathUtil.toMib(geometryDeviceUsed), MathUtil.toMib(geometryDeviceAllocated),
MathUtil.toMib(indexDeviceUsed), MathUtil.toMib(indexDeviceAllocated), count));
list.add(String.format("Transfer Queue: %s", this.regions.getStagingBuffer().toString()));

list.add(String.format("Chunk Builder: Permits=%02d (E %03d) | Busy=%02d | Total=%02d",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import net.caffeinemc.mods.sodium.client.gl.device.CommandList;
import net.caffeinemc.mods.sodium.client.gl.tessellation.GlIndexType;
import net.caffeinemc.mods.sodium.client.gl.util.EnumBitField;
import net.caffeinemc.mods.sodium.client.util.NativeBuffer;

import java.nio.ByteBuffer;
import java.nio.IntBuffer;
Expand Down Expand Up @@ -55,6 +56,14 @@ private void grow(CommandList commandList, int primitiveCount) {
this.maxPrimitives = primitiveCount;
}

/**
 * Allocates a new {@link NativeBuffer} and asks the given index type to write index data for
 * {@code primitiveCount} primitives into it.
 *
 * <p>The buffer capacity is {@code primitiveCount * indexType.getBytesPerElement() * ELEMENTS_PER_PRIMITIVE}.
 *
 * @param indexType the index type that determines the bytes per element and fills the buffer
 * @param primitiveCount the number of primitives passed to the index type when filling the buffer
 * @return the newly allocated buffer after the index type has written into it
 */
public static NativeBuffer createIndexBuffer(IndexType indexType, int primitiveCount) {
    // size the allocation up front: one element per index, ELEMENTS_PER_PRIMITIVE indices per primitive
    final var indexData = new NativeBuffer(primitiveCount * indexType.getBytesPerElement() * ELEMENTS_PER_PRIMITIVE);
    indexType.createIndexBuffer(indexData.getDirectBuffer(), primitiveCount);
    return indexData;
}

public GlBuffer getBufferObject() {
return this.buffer;
Expand All @@ -64,14 +73,6 @@ public void delete(CommandList commandList) {
commandList.deleteBuffer(this.buffer);
}

public GlIndexType getIndexFormat() {
return this.indexType.getFormat();
}

public IndexType getIndexType() {
return this.indexType;
}

public enum IndexType {
SHORT(GlIndexType.UNSIGNED_SHORT, 64 * 1024) {
@Override
Expand Down
Loading

0 comments on commit c83c3fb

Please sign in to comment.