From b49aea89d066071272edd0d4e7bb7765a7425dbe Mon Sep 17 00:00:00 2001 From: Dario Date: Tue, 14 Oct 2025 15:54:22 -0300 Subject: [PATCH] Organize render surface sorting key for optimizing API performance. --- .../render_forward_clustered.cpp | 5 +++-- .../render_forward_clustered.h | 14 ++++++++++---- .../forward_mobile/render_forward_mobile.cpp | 5 +++-- .../forward_mobile/render_forward_mobile.h | 14 ++++++++++---- servers/rendering/rendering_device_graph.cpp | 18 ++++++++++++++++++ 5 files changed, 44 insertions(+), 12 deletions(-) diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp index 6779def309e..2a122cafdc9 100644 --- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp +++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.cpp @@ -844,7 +844,7 @@ void RenderForwardClustered::_fill_instance_data(RenderListType p_render_list, i RenderElementInfo &element_info = rl->element_info[p_offset + i]; - element_info.value = uint32_t((surface->sort.sort_key2 & 0x0FFF00000000) >> 32u); + element_info.value = uint32_t(surface->sort.sort_key1 & 0xFFF); if (cant_repeat) { prev_surface = nullptr; @@ -4075,7 +4075,8 @@ void RenderForwardClustered::_geometry_instance_add_surface_with_material(Geomet sdcache->sort.sort_key2 = 0; sdcache->sort.surface_index = p_surface; - sdcache->sort.material_id = p_material_id; + sdcache->sort.material_id_hi = (p_material_id & 0xFF000000) >> 24; + sdcache->sort.material_id_lo = (p_material_id & 0x00FFFFFF); sdcache->sort.shader_id = p_shader_id; sdcache->sort.geometry_id = p_mesh.get_local_index(); //only meshes can repeat anyway sdcache->sort.uses_forward_gi = ginstance->can_sdfgi; diff --git a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h index 847a55a290f..d3f8d8efdfd 100644 --- a/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h +++ b/servers/rendering/renderer_rd/forward_clustered/render_forward_clustered.h @@ -498,17 +498,23 @@ private: uint64_t sort_key2; }; struct { - uint64_t geometry_id : 32; - uint64_t material_id : 32; - - uint64_t shader_id : 32; + // Needs to be grouped together to be used in RenderElementInfo, as the value is masked directly. uint64_t lod_index : 8; uint64_t uses_softshadow : 1; uint64_t uses_projector : 1; uint64_t uses_forward_gi : 1; uint64_t uses_lightmap : 1; + + // Sorted based on optimal order for respecting priority and reducing the amount of rebinding of shaders, materials, + // and geometry. This current order was found to be the most optimal in large projects. If you wish to measure + // differences, refer to RenderingDeviceGraph and the methods available to print statistics for draw lists. uint64_t depth_layer : 4; uint64_t surface_index : 8; + uint64_t geometry_id : 32; + uint64_t material_id_hi : 8; + + uint64_t material_id_lo : 24; + uint64_t shader_id : 32; uint64_t priority : 8; }; } sort; diff --git a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp index 8e9718756b0..8b63e0e443b 100644 --- a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp +++ b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.cpp @@ -1981,7 +1981,7 @@ void RenderForwardMobile::_fill_instance_data(RenderListType p_render_list, uint RenderElementInfo &element_info = rl->element_info[p_offset + i]; // Sets lod_index and uses_lightmap at once. - element_info.value = uint32_t((surface->sort.sort_key2 & 0x01FF00000000) >> 32u); + element_info.value = uint32_t(surface->sort.sort_key1 & 0x1FF); } if (p_update_buffer) { @@ -2764,7 +2764,8 @@ void RenderForwardMobile::_geometry_instance_add_surface_with_material(GeometryI sdcache->sort.sort_key2 = 0; sdcache->sort.surface_index = p_surface; - sdcache->sort.material_id = p_material_id; + sdcache->sort.material_id_hi = (p_material_id & 0xFF000000) >> 24; + sdcache->sort.material_id_lo = (p_material_id & 0x00FFFFFF); sdcache->sort.shader_id = p_shader_id; sdcache->sort.geometry_id = p_mesh.get_local_index(); sdcache->sort.priority = p_material->priority; diff --git a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h index 5ef75d353c5..938e47df757 100644 --- a/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h +++ b/servers/rendering/renderer_rd/forward_mobile/render_forward_mobile.h @@ -478,15 +478,21 @@ protected: uint64_t sort_key2; }; struct { - uint64_t geometry_id : 32; - uint64_t material_id : 32; - - uint64_t shader_id : 32; + // Needs to be grouped together to be used in RenderElementInfo, as the value is masked directly. uint64_t lod_index : 8; uint64_t uses_lightmap : 1; uint64_t pad : 3; + + // Sorted based on optimal order for respecting priority and reducing the amount of rebinding of shaders, materials, + // and geometry. This current order was found to be the most optimal in large projects. If you wish to measure + // differences, refer to RenderingDeviceGraph and the methods available to print statistics for draw lists. uint64_t depth_layer : 4; uint64_t surface_index : 8; + uint64_t geometry_id : 32; + uint64_t material_id_hi : 8; + + uint64_t material_id_lo : 24; + uint64_t shader_id : 32; uint64_t priority : 8; }; } sort; diff --git a/servers/rendering/rendering_device_graph.cpp b/servers/rendering/rendering_device_graph.cpp index f71bea094aa..9e419ed6dc6 100644 --- a/servers/rendering/rendering_device_graph.cpp +++ b/servers/rendering/rendering_device_graph.cpp @@ -35,6 +35,9 @@ #define PRINT_RESOURCE_TRACKER_TOTAL 0 #define PRINT_COMMAND_RECORDING 0 +// Prints the total number of bytes used for draw lists in a frame. +#define PRINT_DRAW_LIST_STATS 0 + RenderingDeviceGraph::RenderingDeviceGraph() { driver_honors_barriers = false; driver_clears_with_copy_engine = false; @@ -835,7 +838,15 @@ void RenderingDeviceGraph::_get_draw_list_render_pass_and_framebuffer(const Reco r_framebuffer = it->value.framebuffer; } +#if PRINT_DRAW_LIST_STATS +static uint32_t draw_list_total_size = 0; +#endif + void RenderingDeviceGraph::_run_draw_list_command(RDD::CommandBufferID p_command_buffer, const uint8_t *p_instruction_data, uint32_t p_instruction_data_size) { +#if PRINT_DRAW_LIST_STATS + draw_list_total_size += p_instruction_data_size; +#endif + uint32_t instruction_data_cursor = 0; while (instruction_data_cursor < p_instruction_data_size) { DEV_ASSERT((instruction_data_cursor + sizeof(DrawListInstruction)) <= p_instruction_data_size); @@ -2366,6 +2377,10 @@ void RenderingDeviceGraph::end(bool p_reorder_commands, bool p_full_barriers, RD workarounds_state.draw_list_found = false; } +#if PRINT_DRAW_LIST_STATS + draw_list_total_size = 0; +#endif + if (p_reorder_commands) { #if PRINT_RENDER_GRAPH print_line("BEFORE SORT"); @@ -2416,6 +2431,9 @@ void RenderingDeviceGraph::end(bool p_reorder_commands, bool p_full_barriers, RD _run_label_command_change(r_command_buffer, -1, -1, false, false, nullptr, 0, current_label_index, current_label_level); +#if PRINT_DRAW_LIST_STATS + print_line(vformat("Draw list %d bytes", draw_list_total_size)); +#endif #if PRINT_COMMAND_RECORDING print_line(vformat("Recorded %d commands", command_count)); #endif