Merge pull request #111652 from DarioSamo/opaque-list-key

Organize render surface sorting key for optimizing API performance.
This commit is contained in:
Thaddeus Crews 2025-10-15 16:31:01 -05:00
commit 710a6e0303
No known key found for this signature in database
GPG key ID: 8C6E5FEB5FC03CCC
5 changed files with 44 additions and 12 deletions

View file

@ -844,7 +844,7 @@ void RenderForwardClustered::_fill_instance_data(RenderListType p_render_list, i
RenderElementInfo &element_info = rl->element_info[p_offset + i]; RenderElementInfo &element_info = rl->element_info[p_offset + i];
element_info.value = uint32_t((surface->sort.sort_key2 & 0x0FFF00000000) >> 32u); element_info.value = uint32_t(surface->sort.sort_key1 & 0xFFF);
if (cant_repeat) { if (cant_repeat) {
prev_surface = nullptr; prev_surface = nullptr;
@ -4075,7 +4075,8 @@ void RenderForwardClustered::_geometry_instance_add_surface_with_material(Geomet
sdcache->sort.sort_key2 = 0; sdcache->sort.sort_key2 = 0;
sdcache->sort.surface_index = p_surface; sdcache->sort.surface_index = p_surface;
sdcache->sort.material_id = p_material_id; sdcache->sort.material_id_hi = (p_material_id & 0xFF000000) >> 24;
sdcache->sort.material_id_lo = (p_material_id & 0x00FFFFFF);
sdcache->sort.shader_id = p_shader_id; sdcache->sort.shader_id = p_shader_id;
sdcache->sort.geometry_id = p_mesh.get_local_index(); //only meshes can repeat anyway sdcache->sort.geometry_id = p_mesh.get_local_index(); //only meshes can repeat anyway
sdcache->sort.uses_forward_gi = ginstance->can_sdfgi; sdcache->sort.uses_forward_gi = ginstance->can_sdfgi;

View file

@ -498,17 +498,23 @@ private:
uint64_t sort_key2; uint64_t sort_key2;
}; };
struct { struct {
uint64_t geometry_id : 32; // Needs to be grouped together to be used in RenderElementInfo, as the value is masked directly.
uint64_t material_id : 32;
uint64_t shader_id : 32;
uint64_t lod_index : 8; uint64_t lod_index : 8;
uint64_t uses_softshadow : 1; uint64_t uses_softshadow : 1;
uint64_t uses_projector : 1; uint64_t uses_projector : 1;
uint64_t uses_forward_gi : 1; uint64_t uses_forward_gi : 1;
uint64_t uses_lightmap : 1; uint64_t uses_lightmap : 1;
// Fields are ordered to respect priority while reducing the amount of rebinding of shaders, materials,
// and geometry. This order was found to perform best in large projects. If you wish to measure
// differences, refer to RenderingDeviceGraph and its options for printing draw list statistics (e.g. PRINT_DRAW_LIST_STATS).
uint64_t depth_layer : 4; uint64_t depth_layer : 4;
uint64_t surface_index : 8; uint64_t surface_index : 8;
uint64_t geometry_id : 32;
uint64_t material_id_hi : 8;
uint64_t material_id_lo : 24;
uint64_t shader_id : 32;
uint64_t priority : 8; uint64_t priority : 8;
}; };
} sort; } sort;

View file

@ -1981,7 +1981,7 @@ void RenderForwardMobile::_fill_instance_data(RenderListType p_render_list, uint
RenderElementInfo &element_info = rl->element_info[p_offset + i]; RenderElementInfo &element_info = rl->element_info[p_offset + i];
// Sets lod_index and uses_lightmap at once. // Sets lod_index and uses_lightmap at once.
element_info.value = uint32_t((surface->sort.sort_key2 & 0x01FF00000000) >> 32u); element_info.value = uint32_t(surface->sort.sort_key1 & 0x1FF);
} }
if (p_update_buffer) { if (p_update_buffer) {
@ -2764,7 +2764,8 @@ void RenderForwardMobile::_geometry_instance_add_surface_with_material(GeometryI
sdcache->sort.sort_key2 = 0; sdcache->sort.sort_key2 = 0;
sdcache->sort.surface_index = p_surface; sdcache->sort.surface_index = p_surface;
sdcache->sort.material_id = p_material_id; sdcache->sort.material_id_hi = (p_material_id & 0xFF000000) >> 24;
sdcache->sort.material_id_lo = (p_material_id & 0x00FFFFFF);
sdcache->sort.shader_id = p_shader_id; sdcache->sort.shader_id = p_shader_id;
sdcache->sort.geometry_id = p_mesh.get_local_index(); sdcache->sort.geometry_id = p_mesh.get_local_index();
sdcache->sort.priority = p_material->priority; sdcache->sort.priority = p_material->priority;

View file

@ -478,15 +478,21 @@ protected:
uint64_t sort_key2; uint64_t sort_key2;
}; };
struct { struct {
uint64_t geometry_id : 32; // Needs to be grouped together to be used in RenderElementInfo, as the value is masked directly.
uint64_t material_id : 32;
uint64_t shader_id : 32;
uint64_t lod_index : 8; uint64_t lod_index : 8;
uint64_t uses_lightmap : 1; uint64_t uses_lightmap : 1;
uint64_t pad : 3; uint64_t pad : 3;
// Fields are ordered to respect priority while reducing the amount of rebinding of shaders, materials,
// and geometry. This order was found to perform best in large projects. If you wish to measure
// differences, refer to RenderingDeviceGraph and its options for printing draw list statistics (e.g. PRINT_DRAW_LIST_STATS).
uint64_t depth_layer : 4; uint64_t depth_layer : 4;
uint64_t surface_index : 8; uint64_t surface_index : 8;
uint64_t geometry_id : 32;
uint64_t material_id_hi : 8;
uint64_t material_id_lo : 24;
uint64_t shader_id : 32;
uint64_t priority : 8; uint64_t priority : 8;
}; };
} sort; } sort;

View file

@ -35,6 +35,9 @@
#define PRINT_RESOURCE_TRACKER_TOTAL 0 #define PRINT_RESOURCE_TRACKER_TOTAL 0
#define PRINT_COMMAND_RECORDING 0 #define PRINT_COMMAND_RECORDING 0
// Prints the total number of bytes used for draw lists in a frame.
#define PRINT_DRAW_LIST_STATS 0
RenderingDeviceGraph::RenderingDeviceGraph() { RenderingDeviceGraph::RenderingDeviceGraph() {
driver_honors_barriers = false; driver_honors_barriers = false;
driver_clears_with_copy_engine = false; driver_clears_with_copy_engine = false;
@ -835,7 +838,15 @@ void RenderingDeviceGraph::_get_draw_list_render_pass_and_framebuffer(const Reco
r_framebuffer = it->value.framebuffer; r_framebuffer = it->value.framebuffer;
} }
#if PRINT_DRAW_LIST_STATS
static uint32_t draw_list_total_size = 0;
#endif
void RenderingDeviceGraph::_run_draw_list_command(RDD::CommandBufferID p_command_buffer, const uint8_t *p_instruction_data, uint32_t p_instruction_data_size) { void RenderingDeviceGraph::_run_draw_list_command(RDD::CommandBufferID p_command_buffer, const uint8_t *p_instruction_data, uint32_t p_instruction_data_size) {
#if PRINT_DRAW_LIST_STATS
draw_list_total_size += p_instruction_data_size;
#endif
uint32_t instruction_data_cursor = 0; uint32_t instruction_data_cursor = 0;
while (instruction_data_cursor < p_instruction_data_size) { while (instruction_data_cursor < p_instruction_data_size) {
DEV_ASSERT((instruction_data_cursor + sizeof(DrawListInstruction)) <= p_instruction_data_size); DEV_ASSERT((instruction_data_cursor + sizeof(DrawListInstruction)) <= p_instruction_data_size);
@ -2366,6 +2377,10 @@ void RenderingDeviceGraph::end(bool p_reorder_commands, bool p_full_barriers, RD
workarounds_state.draw_list_found = false; workarounds_state.draw_list_found = false;
} }
#if PRINT_DRAW_LIST_STATS
draw_list_total_size = 0;
#endif
if (p_reorder_commands) { if (p_reorder_commands) {
#if PRINT_RENDER_GRAPH #if PRINT_RENDER_GRAPH
print_line("BEFORE SORT"); print_line("BEFORE SORT");
@ -2416,6 +2431,9 @@ void RenderingDeviceGraph::end(bool p_reorder_commands, bool p_full_barriers, RD
_run_label_command_change(r_command_buffer, -1, -1, false, false, nullptr, 0, current_label_index, current_label_level); _run_label_command_change(r_command_buffer, -1, -1, false, false, nullptr, 0, current_label_index, current_label_level);
#if PRINT_DRAW_LIST_STATS
print_line(vformat("Draw list %d bytes", draw_list_total_size));
#endif
#if PRINT_COMMAND_RECORDING #if PRINT_COMMAND_RECORDING
print_line(vformat("Recorded %d commands", command_count)); print_line(vformat("Recorded %d commands", command_count));
#endif #endif