godot/servers/rendering/renderer_rd/shader_rd.cpp

1048 lines
34 KiB
C++
Raw Normal View History

/**************************************************************************/
/* shader_rd.cpp */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
#include "shader_rd.h"
#include "core/io/dir_access.h"
#include "core/io/file_access.h"
#include "core/object/worker_thread_pool.h"
#include "core/version.h"
#include "servers/rendering/rendering_device.h"
#include "servers/rendering/shader_include_db.h"
#define ENABLE_SHADER_CACHE 1
void ShaderRD::_add_stage(const char *p_code, StageType p_stage_type) {
Vector<String> lines = String(p_code).split("\n");
String text;
int line_count = lines.size();
for (int i = 0; i < line_count; i++) {
const String &l = lines[i];
bool push_chunk = false;
StageTemplate::Chunk chunk;
if (l.begins_with("#VERSION_DEFINES")) {
chunk.type = StageTemplate::Chunk::TYPE_VERSION_DEFINES;
push_chunk = true;
} else if (l.begins_with("#GLOBALS")) {
switch (p_stage_type) {
case STAGE_TYPE_VERTEX:
chunk.type = StageTemplate::Chunk::TYPE_VERTEX_GLOBALS;
break;
case STAGE_TYPE_FRAGMENT:
chunk.type = StageTemplate::Chunk::TYPE_FRAGMENT_GLOBALS;
break;
case STAGE_TYPE_COMPUTE:
chunk.type = StageTemplate::Chunk::TYPE_COMPUTE_GLOBALS;
break;
default: {
}
}
push_chunk = true;
} else if (l.begins_with("#MATERIAL_UNIFORMS")) {
chunk.type = StageTemplate::Chunk::TYPE_MATERIAL_UNIFORMS;
push_chunk = true;
} else if (l.begins_with("#CODE")) {
chunk.type = StageTemplate::Chunk::TYPE_CODE;
push_chunk = true;
chunk.code = l.replace_first("#CODE", String()).remove_char(':').strip_edges().to_upper();
} else if (l.begins_with("#include ")) {
String include_file = l.replace("#include ", "").strip_edges();
if (include_file[0] == '"') {
int end_pos = include_file.find_char('"', 1);
if (end_pos >= 0) {
include_file = include_file.substr(1, end_pos - 1);
String include_code = ShaderIncludeDB::get_built_in_include_file(include_file);
if (!include_code.is_empty()) {
// Add these lines into our parse list so we parse them as well.
Vector<String> include_lines = include_code.split("\n");
for (int j = include_lines.size() - 1; j >= 0; j--) {
lines.insert(i + 1, include_lines[j]);
}
line_count = lines.size();
} else {
// Add it in as is.
text += l + "\n";
}
} else {
// Add it in as is.
text += l + "\n";
}
} else {
// Add it in as is.
text += l + "\n";
}
} else {
text += l + "\n";
}
if (push_chunk) {
if (!text.is_empty()) {
StageTemplate::Chunk text_chunk;
text_chunk.type = StageTemplate::Chunk::TYPE_TEXT;
text_chunk.text = text.utf8();
stage_templates[p_stage_type].chunks.push_back(text_chunk);
text = String();
}
stage_templates[p_stage_type].chunks.push_back(chunk);
}
}
2019-09-25 16:44:44 -03:00
if (!text.is_empty()) {
StageTemplate::Chunk text_chunk;
text_chunk.type = StageTemplate::Chunk::TYPE_TEXT;
text_chunk.text = text.utf8();
stage_templates[p_stage_type].chunks.push_back(text_chunk);
text = String();
}
}
void ShaderRD::setup(const char *p_vertex_code, const char *p_fragment_code, const char *p_compute_code, const char *p_name) {
name = p_name;
2019-09-25 16:44:44 -03:00
if (p_compute_code) {
_add_stage(p_compute_code, STAGE_TYPE_COMPUTE);
2019-09-25 16:44:44 -03:00
is_compute = true;
} else {
is_compute = false;
if (p_vertex_code) {
_add_stage(p_vertex_code, STAGE_TYPE_VERTEX);
2019-09-25 16:44:44 -03:00
}
if (p_fragment_code) {
_add_stage(p_fragment_code, STAGE_TYPE_FRAGMENT);
2019-09-25 16:44:44 -03:00
}
}
StringBuilder tohash;
tohash.append("[GodotVersionNumber]");
tohash.append(GODOT_VERSION_NUMBER);
tohash.append("[GodotVersionHash]");
tohash.append(GODOT_VERSION_HASH);
tohash.append("[Vertex]");
tohash.append(p_vertex_code ? p_vertex_code : "");
tohash.append("[Fragment]");
tohash.append(p_fragment_code ? p_fragment_code : "");
tohash.append("[Compute]");
tohash.append(p_compute_code ? p_compute_code : "");
base_sha256 = tohash.as_string().sha256_text();
}
RID ShaderRD::version_create(bool p_embedded) {
//initialize() was never called
ERR_FAIL_COND_V(group_to_variant_map.is_empty(), RID());
Version version;
version.dirty = true;
version.valid = false;
version.initialize_needed = true;
version.embedded = p_embedded;
version.variants.clear();
version.variant_data.clear();
version.mutex = memnew(Mutex);
RID rid = version_owner.make_rid(version);
{
MutexLock lock(versions_mutex);
version_mutexes.insert(rid, version.mutex);
}
if (p_embedded) {
MutexLock lock(shader_versions_embedded_set_mutex);
shader_versions_embedded_set.insert({ this, rid });
}
return rid;
}
void ShaderRD::_initialize_version(Version *p_version) {
_clear_version(p_version);
p_version->valid = false;
p_version->dirty = false;
p_version->variants.resize_initialized(variant_defines.size());
p_version->variant_data.resize(variant_defines.size());
p_version->group_compilation_tasks.resize_initialized(group_enabled.size());
}
void ShaderRD::_clear_version(Version *p_version) {
_compile_ensure_finished(p_version);
// Clear versions if they exist.
if (!p_version->variants.is_empty()) {
for (int i = 0; i < variant_defines.size(); i++) {
if (p_version->variants[i].is_valid()) {
RD::get_singleton()->free(p_version->variants[i]);
}
}
p_version->variants.clear();
p_version->variant_data.clear();
}
}
void ShaderRD::_build_variant_code(StringBuilder &builder, uint32_t p_variant, const Version *p_version, const StageTemplate &p_template) {
for (const StageTemplate::Chunk &chunk : p_template.chunks) {
switch (chunk.type) {
case StageTemplate::Chunk::TYPE_VERSION_DEFINES: {
builder.append("\n"); //make sure defines begin at newline
builder.append(general_defines.get_data());
builder.append(variant_defines[p_variant].text.get_data());
for (int j = 0; j < p_version->custom_defines.size(); j++) {
builder.append(p_version->custom_defines[j].get_data());
}
builder.append("\n"); //make sure defines begin at newline
if (p_version->uniforms.size()) {
builder.append("#define MATERIAL_UNIFORMS_USED\n");
}
2021-08-09 14:13:42 -06:00
for (const KeyValue<StringName, CharString> &E : p_version->code_sections) {
builder.append(String("#define ") + String(E.key) + "_CODE_USED\n");
}
#if (defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED))
if (RD::get_singleton()->get_device_capabilities().device_family == RDD::DEVICE_VULKAN) {
builder.append("#define MOLTENVK_USED\n");
}
// Image atomics are supported on Metal 3.1 but no support in MoltenVK or SPIRV-Cross yet.
builder.append("#define NO_IMAGE_ATOMICS\n");
#endif
builder.append(String("#define RENDER_DRIVER_") + OS::get_singleton()->get_current_rendering_driver_name().to_upper() + "\n");
builder.append("#define samplerExternalOES sampler2D\n");
builder.append("#define textureExternalOES texture2D\n");
} break;
case StageTemplate::Chunk::TYPE_MATERIAL_UNIFORMS: {
builder.append(p_version->uniforms.get_data()); //uniforms (same for vertex and fragment)
} break;
case StageTemplate::Chunk::TYPE_VERTEX_GLOBALS: {
builder.append(p_version->vertex_globals.get_data()); // vertex globals
} break;
case StageTemplate::Chunk::TYPE_FRAGMENT_GLOBALS: {
builder.append(p_version->fragment_globals.get_data()); // fragment globals
} break;
case StageTemplate::Chunk::TYPE_COMPUTE_GLOBALS: {
builder.append(p_version->compute_globals.get_data()); // compute globals
} break;
case StageTemplate::Chunk::TYPE_CODE: {
if (p_version->code_sections.has(chunk.code)) {
builder.append(p_version->code_sections[chunk.code].get_data());
}
} break;
case StageTemplate::Chunk::TYPE_TEXT: {
builder.append(chunk.text.get_data());
} break;
}
}
}
Vector<String> ShaderRD::_build_variant_stage_sources(uint32_t p_variant, CompileData p_data) {
if (!variants_enabled[p_variant]) {
return Vector<String>(); // Variant is disabled, return.
}
Vector<String> stage_sources;
stage_sources.resize(RD::SHADER_STAGE_MAX);
2019-09-25 16:44:44 -03:00
if (is_compute) {
// Compute stage.
2019-09-25 16:44:44 -03:00
StringBuilder builder;
_build_variant_code(builder, p_variant, p_data.version, stage_templates[STAGE_TYPE_COMPUTE]);
stage_sources.write[RD::SHADER_STAGE_COMPUTE] = builder.as_string();
} else {
{
// Vertex stage.
StringBuilder builder;
_build_variant_code(builder, p_variant, p_data.version, stage_templates[STAGE_TYPE_VERTEX]);
stage_sources.write[RD::SHADER_STAGE_VERTEX] = builder.as_string();
}
{
// Fragment stage.
StringBuilder builder;
_build_variant_code(builder, p_variant, p_data.version, stage_templates[STAGE_TYPE_FRAGMENT]);
stage_sources.write[RD::SHADER_STAGE_FRAGMENT] = builder.as_string();
2019-09-25 16:44:44 -03:00
}
}
return stage_sources;
}
void ShaderRD::_compile_variant(uint32_t p_variant, CompileData p_data) {
uint32_t variant = group_to_variant_map[p_data.group][p_variant];
if (!variants_enabled[variant]) {
return; // Variant is disabled, return.
}
Vector<String> variant_stage_sources = _build_variant_stage_sources(variant, p_data);
Vector<RD::ShaderStageSPIRVData> variant_stages = compile_stages(variant_stage_sources);
ERR_FAIL_COND(variant_stages.is_empty());
Vector<uint8_t> shader_data = RD::get_singleton()->shader_compile_binary_from_spirv(variant_stages, name + ":" + itos(variant));
ERR_FAIL_COND(shader_data.is_empty());
{
Improvements from TheForge (see description) The work was performed by collaboration of TheForge and Google. I am merely splitting it up into smaller PRs and cleaning it up. This is the most "risky" PR so far because the previous ones have been miscellaneous stuff aimed at either [improve debugging](https://github.com/godotengine/godot/pull/90993) (e.g. device lost), [improve Android experience](https://github.com/godotengine/godot/pull/96439) (add Swappy for better Frame Pacing + Pre-Transformed Swapchains for slightly better performance), or harmless [ASTC improvements](https://github.com/godotengine/godot/pull/96045) (better performance by simply toggling a feature when available). However this PR contains larger modifications aimed at improving performance or reducing memory fragmentation. With greater modifications, come greater risks of bugs or breakage. Changes introduced by this PR: TBDR GPUs (e.g. most of Android + iOS + M1 Apple) support rendering to Render Targets that are not backed by actual GPU memory (everything stays in cache). This works as long as load action isn't `LOAD`, and store action must be `DONT_CARE`. This saves VRAM (it also makes painfully obvious when a mistake introduces a performance regression). Of particular usefulness is when doing MSAA and keeping the raw MSAA content is not necessary. Some GPUs get faster when the sampler settings are hard-coded into the GLSL shaders (instead of being dynamically bound at runtime). This required changes to the GLSL shaders, PSO creation routines, Descriptor creation routines, and Descriptor binding routines. - `bool immutable_samplers_enabled = true` Setting it to false enforces the old behavior. Useful for debugging bugs and regressions. Immutable samplers requires that the samplers stay... immutable, hence this boolean is useful if the promise gets broken. We might want to turn this into a `GLOBAL_DEF` setting. Instead of creating dozen/hundreds/thousands of `VkDescriptorSet` every frame that need to be freed individually when they are no longer needed, they all get freed at once by resetting the whole pool. Once the whole pool is no longer in use by the GPU, it gets reset and its memory recycled. Descriptor sets that are created to be kept around for longer or forever (i.e. not created and freed within the same frame) **must not** use linear pools. There may be more than one pool per frame. How many pools per frame Godot ends up with depends on its capacity, and that is controlled by `rendering/rendering_device/vulkan/max_descriptors_per_pool`. - **Possible improvement for later:** It should be possible for Godot to adapt to how many descriptors per pool are needed on a per-key basis (i.e. grow their capacity like `std::vector` does) after rendering a few frames; which would be better than the current solution of having a single global value for all pools (`max_descriptors_per_pool`) that the user needs to tweak. - `bool linear_descriptor_pools_enabled = true` Setting it to false enforces the old behavior. Useful for debugging bugs and regressions. Setting it to false is required when workarounding driver bugs (e.g. Adreno 730). A ridiculous optimization. Ridiculous because the original code should've done this in the first place. Previously Godot was doing the following: 1. Create a command buffer **pool**. One per frame. 2. Create multiple command buffers from the pool in point 1. 3. Call `vkBeginCommandBuffer` on the cmd buffer in point 2. This resets the cmd buffer because Godot requests the `VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT` flag. 4. Add commands to the cmd buffers from point 2. 5. Submit those commands. 6. On frame N + 2, recycle the buffer pool and cmd buffers from pt 1 & 2, and repeat from step 3. The problem here is that step 3 resets each command buffer individually. Initially Godot used to have 1 cmd buffer per pool, thus the impact is very low. But not anymore (specially with Adreno workarounds to force splitting compute dispatches into a new cmd buffer, more on this later). However Godot keeps around a very low amount of command buffers per frame. The recommended method is to reset the whole pool, to reset all cmd buffers at once. Hence the new steps would be: 1. Create a command buffer **pool**. One per frame. 2. Create multiple command buffers from the pool in point 1. 3. Call `vkBeginCommandBuffer` on the cmd buffer in point 2, which is already reset/empty (see step 6). 4. Add commands to the cmd buffers from point 2. 5. Submit those commands. 6. On frame N + 2, recycle the buffer pool and cmd buffers from pt 1 & 2, call `vkResetCommandPool` and repeat from step 3. **Possible issues:** @dariosamo added `transfer_worker` which creates a command buffer pool: ```cpp transfer_worker->command_pool = driver->command_pool_create(transfer_queue_family, RDD::COMMAND_BUFFER_TYPE_PRIMARY); ``` As expected, validation was complaining that command buffers were being reused without being reset (that's good, we now know Validation Layers will warn us of wrong use). I fixed it by adding: ```cpp void RenderingDevice::_wait_for_transfer_worker(TransferWorker *p_transfer_worker) { driver->fence_wait(p_transfer_worker->command_fence); driver->command_pool_reset(p_transfer_worker->command_pool); // ! New line ! ``` **Secondary cmd buffers are subject to the same issue but I didn't alter them. I talked this with Dario and he is aware of this.** Secondary cmd buffers are currently disabled due to other issues (it's disabled on master). - `bool RenderingDeviceCommons::command_pool_reset_enabled` Setting it to false enforces the old behavior. Useful for debugging bugs and regressions. There's no other reason for this boolean. Possibly once it becomes well tested, the boolean could be removed entirely. Adds `command_bind_render_uniform_sets` and `add_draw_list_bind_uniform_sets` (+ compute variants). It performs the same as `add_draw_list_bind_uniform_set` (notice singular vs plural), but on multiple consecutive uniform sets, thus reducing graph and draw call overhead. - `bool descriptor_set_batching = true;` Setting it to false enforces the old behavior. Useful for debugging bugs and regressions. There's no other reason for this boolean. Possibly once it becomes well tested, the boolean could be removed entirely. Godot currently does the following: 1. Fill the entire cmd buffer with commands. 2. `submit()` - Wait with a semaphore for the swapchain. - Trigger a semaphore to indicate when we're done (so the swapchain can submit). 3. `present()` The optimization opportunity here is that 95% of Godot's rendering is done offscreen. Then a fullscreen pass copies everything to the swapchain. Godot doesn't practically render directly to the swapchain. The problem with this is that the GPU has to wait for the swapchain to be released **to start anything**, when we could start *much earlier*. Only the final blit pass must wait for the swapchain. TheForge changed it to the following (more complicated, I'm simplifying the idea): 1. Fill the entire cmd buffer with commands. 2. In `screen_prepare_for_drawing` do `submit()` - There are no semaphore waits for the swapchain. - Trigger a semaphore to indicate when we're done. 3. Fill a new cmd buffer that only does the final blit to the swapchain. 4. `submit()` - Wait with a semaphore for the submit() from step 2. - Wait with a semaphore for the swapchain (so the swapchain can submit). - Trigger a semaphore to indicate when we're done (so the swapchain can submit). 5. `present()` Dario discovered this problem independently while working on a different platform. **However TheForge's solution had to be rewritten from scratch:** The complexity to achieve the solution was high and quite difficult to maintain with the way Godot works now (after Übershaders PR). But on the other hand, re-implementing the solution became much simpler because Dario already had to do something similar: To fix an Adreno 730 driver bug, he had to implement splitting command buffers. **This is exactly what we need!**. Thus it was re-written using this existing functionality for a new purpose. To achieve this, I added a new argument, `bool p_split_cmd_buffer`, to `RenderingDeviceGraph::add_draw_list_begin`, which is only set to true by `RenderingDevice::draw_list_begin_for_screen`. The graph will split the draw list into its own command buffer. - `bool split_swapchain_into_its_own_cmd_buffer = true;` Setting it to false enforces the old behavior. This might be necessary for consoles which follow an alternate solution to the same problem. If not, then we should consider removing it. PR #90993 added `shader_destroy_modules()` but it was not actually in use. This PR adds several places where `shader_destroy_modules()` is called after initialization to free up memory of SPIR-V structures that are no longer needed.
2024-11-14 13:03:14 -03:00
p_data.version->variants.write[variant] = RD::get_singleton()->shader_create_from_bytecode_with_samplers(shader_data, p_data.version->variants[variant], immutable_samplers);
p_data.version->variant_data.write[variant] = shader_data;
}
}
Vector<String> ShaderRD::version_build_variant_stage_sources(RID p_version, int p_variant) {
Version *version = version_owner.get_or_null(p_version);
ERR_FAIL_NULL_V(version, Vector<String>());
if (version->dirty) {
_initialize_version(version);
}
CompileData compile_data;
compile_data.version = version;
compile_data.group = variant_to_group[p_variant];
return _build_variant_stage_sources(p_variant, compile_data);
}
RS::ShaderNativeSourceCode ShaderRD::version_get_native_source_code(RID p_version) {
Version *version = version_owner.get_or_null(p_version);
RS::ShaderNativeSourceCode source_code;
ERR_FAIL_NULL_V(version, source_code);
MutexLock lock(*version->mutex);
source_code.versions.resize(variant_defines.size());
for (int i = 0; i < source_code.versions.size(); i++) {
if (!is_compute) {
//vertex stage
StringBuilder builder;
_build_variant_code(builder, i, version, stage_templates[STAGE_TYPE_VERTEX]);
RS::ShaderNativeSourceCode::Version::Stage stage;
stage.name = "vertex";
stage.code = builder.as_string();
source_code.versions.write[i].stages.push_back(stage);
}
if (!is_compute) {
//fragment stage
StringBuilder builder;
_build_variant_code(builder, i, version, stage_templates[STAGE_TYPE_FRAGMENT]);
RS::ShaderNativeSourceCode::Version::Stage stage;
stage.name = "fragment";
stage.code = builder.as_string();
source_code.versions.write[i].stages.push_back(stage);
}
if (is_compute) {
//compute stage
StringBuilder builder;
_build_variant_code(builder, i, version, stage_templates[STAGE_TYPE_COMPUTE]);
RS::ShaderNativeSourceCode::Version::Stage stage;
stage.name = "compute";
stage.code = builder.as_string();
source_code.versions.write[i].stages.push_back(stage);
}
}
return source_code;
}
String ShaderRD::version_get_cache_file_relative_path(RID p_version, int p_group, const String &p_api_name) {
Version *version = version_owner.get_or_null(p_version);
ERR_FAIL_NULL_V(version, String());
return _get_cache_file_relative_path(version, p_group, p_api_name);
}
String ShaderRD::_version_get_sha1(Version *p_version) const {
StringBuilder hash_build;
hash_build.append("[uniforms]");
hash_build.append(p_version->uniforms.get_data());
hash_build.append("[vertex_globals]");
hash_build.append(p_version->vertex_globals.get_data());
hash_build.append("[fragment_globals]");
hash_build.append(p_version->fragment_globals.get_data());
hash_build.append("[compute_globals]");
hash_build.append(p_version->compute_globals.get_data());
Vector<StringName> code_sections;
2021-08-09 14:13:42 -06:00
for (const KeyValue<StringName, CharString> &E : p_version->code_sections) {
code_sections.push_back(E.key);
}
code_sections.sort_custom<StringName::AlphCompare>();
for (int i = 0; i < code_sections.size(); i++) {
hash_build.append(String("[code:") + String(code_sections[i]) + "]");
hash_build.append(p_version->code_sections[code_sections[i]].get_data());
}
for (int i = 0; i < p_version->custom_defines.size(); i++) {
hash_build.append("[custom_defines:" + itos(i) + "]");
hash_build.append(p_version->custom_defines[i].get_data());
}
return hash_build.as_string().sha1_text();
}
static const char *shader_file_header = "GDSC";
static const uint32_t cache_file_version = 4;
String ShaderRD::_get_cache_file_relative_path(Version *p_version, int p_group, const String &p_api_name) {
String sha1 = _version_get_sha1(p_version);
return name.path_join(group_sha256[p_group]).path_join(sha1) + "." + p_api_name + ".cache";
}
String ShaderRD::_get_cache_file_path(Version *p_version, int p_group, const String &p_api_name, bool p_user_dir) {
const String &shader_cache_dir = p_user_dir ? shader_cache_user_dir : shader_cache_res_dir;
String relative_path = _get_cache_file_relative_path(p_version, p_group, p_api_name);
return shader_cache_dir.path_join(relative_path);
}
bool ShaderRD::_load_from_cache(Version *p_version, int p_group) {
String api_safe_name = String(RD::get_singleton()->get_device_api_name()).validate_filename().to_lower();
Ref<FileAccess> f;
if (shader_cache_user_dir_valid) {
f = FileAccess::open(_get_cache_file_path(p_version, p_group, api_safe_name, true), FileAccess::READ);
}
if (f.is_null()) {
f = FileAccess::open(_get_cache_file_path(p_version, p_group, api_safe_name, false), FileAccess::READ);
}
if (f.is_null()) {
const String &sha1 = _version_get_sha1(p_version);
print_verbose(vformat("Shader cache miss for %s", name.path_join(group_sha256[p_group]).path_join(sha1)));
return false;
}
char header[5] = { 0, 0, 0, 0, 0 };
f->get_buffer((uint8_t *)header, 4);
ERR_FAIL_COND_V(header != String(shader_file_header), false);
uint32_t file_version = f->get_32();
if (file_version != cache_file_version) {
return false; // wrong version
}
uint32_t variant_count = f->get_32();
ERR_FAIL_COND_V(variant_count != (uint32_t)group_to_variant_map[p_group].size(), false); //should not happen but check
for (uint32_t i = 0; i < variant_count; i++) {
int variant_id = group_to_variant_map[p_group][i];
uint32_t variant_size = f->get_32();
ERR_FAIL_COND_V(variant_size == 0 && variants_enabled[variant_id], false);
if (!variants_enabled[variant_id]) {
continue;
}
Vector<uint8_t> variant_bytes;
variant_bytes.resize(variant_size);
uint32_t br = f->get_buffer(variant_bytes.ptrw(), variant_size);
ERR_FAIL_COND_V(br != variant_size, false);
p_version->variant_data.write[variant_id] = variant_bytes;
}
for (uint32_t i = 0; i < variant_count; i++) {
int variant_id = group_to_variant_map[p_group][i];
if (!variants_enabled[variant_id]) {
p_version->variants.write[variant_id] = RID();
continue;
}
{
Improvements from TheForge (see description) The work was performed by collaboration of TheForge and Google. I am merely splitting it up into smaller PRs and cleaning it up. This is the most "risky" PR so far because the previous ones have been miscellaneous stuff aimed at either [improve debugging](https://github.com/godotengine/godot/pull/90993) (e.g. device lost), [improve Android experience](https://github.com/godotengine/godot/pull/96439) (add Swappy for better Frame Pacing + Pre-Transformed Swapchains for slightly better performance), or harmless [ASTC improvements](https://github.com/godotengine/godot/pull/96045) (better performance by simply toggling a feature when available). However this PR contains larger modifications aimed at improving performance or reducing memory fragmentation. With greater modifications, come greater risks of bugs or breakage. Changes introduced by this PR: TBDR GPUs (e.g. most of Android + iOS + M1 Apple) support rendering to Render Targets that are not backed by actual GPU memory (everything stays in cache). This works as long as load action isn't `LOAD`, and store action must be `DONT_CARE`. This saves VRAM (it also makes painfully obvious when a mistake introduces a performance regression). Of particular usefulness is when doing MSAA and keeping the raw MSAA content is not necessary. Some GPUs get faster when the sampler settings are hard-coded into the GLSL shaders (instead of being dynamically bound at runtime). This required changes to the GLSL shaders, PSO creation routines, Descriptor creation routines, and Descriptor binding routines. - `bool immutable_samplers_enabled = true` Setting it to false enforces the old behavior. Useful for debugging bugs and regressions. Immutable samplers requires that the samplers stay... immutable, hence this boolean is useful if the promise gets broken. We might want to turn this into a `GLOBAL_DEF` setting. Instead of creating dozen/hundreds/thousands of `VkDescriptorSet` every frame that need to be freed individually when they are no longer needed, they all get freed at once by resetting the whole pool. Once the whole pool is no longer in use by the GPU, it gets reset and its memory recycled. Descriptor sets that are created to be kept around for longer or forever (i.e. not created and freed within the same frame) **must not** use linear pools. There may be more than one pool per frame. How many pools per frame Godot ends up with depends on its capacity, and that is controlled by `rendering/rendering_device/vulkan/max_descriptors_per_pool`. - **Possible improvement for later:** It should be possible for Godot to adapt to how many descriptors per pool are needed on a per-key basis (i.e. grow their capacity like `std::vector` does) after rendering a few frames; which would be better than the current solution of having a single global value for all pools (`max_descriptors_per_pool`) that the user needs to tweak. - `bool linear_descriptor_pools_enabled = true` Setting it to false enforces the old behavior. Useful for debugging bugs and regressions. Setting it to false is required when workarounding driver bugs (e.g. Adreno 730). A ridiculous optimization. Ridiculous because the original code should've done this in the first place. Previously Godot was doing the following: 1. Create a command buffer **pool**. One per frame. 2. Create multiple command buffers from the pool in point 1. 3. Call `vkBeginCommandBuffer` on the cmd buffer in point 2. This resets the cmd buffer because Godot requests the `VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT` flag. 4. Add commands to the cmd buffers from point 2. 5. Submit those commands. 6. On frame N + 2, recycle the buffer pool and cmd buffers from pt 1 & 2, and repeat from step 3. The problem here is that step 3 resets each command buffer individually. Initially Godot used to have 1 cmd buffer per pool, thus the impact is very low. But not anymore (specially with Adreno workarounds to force splitting compute dispatches into a new cmd buffer, more on this later). However Godot keeps around a very low amount of command buffers per frame. The recommended method is to reset the whole pool, to reset all cmd buffers at once. Hence the new steps would be: 1. Create a command buffer **pool**. One per frame. 2. Create multiple command buffers from the pool in point 1. 3. Call `vkBeginCommandBuffer` on the cmd buffer in point 2, which is already reset/empty (see step 6). 4. Add commands to the cmd buffers from point 2. 5. Submit those commands. 6. On frame N + 2, recycle the buffer pool and cmd buffers from pt 1 & 2, call `vkResetCommandPool` and repeat from step 3. **Possible issues:** @dariosamo added `transfer_worker` which creates a command buffer pool: ```cpp transfer_worker->command_pool = driver->command_pool_create(transfer_queue_family, RDD::COMMAND_BUFFER_TYPE_PRIMARY); ``` As expected, validation was complaining that command buffers were being reused without being reset (that's good, we now know Validation Layers will warn us of wrong use). I fixed it by adding: ```cpp void RenderingDevice::_wait_for_transfer_worker(TransferWorker *p_transfer_worker) { driver->fence_wait(p_transfer_worker->command_fence); driver->command_pool_reset(p_transfer_worker->command_pool); // ! New line ! ``` **Secondary cmd buffers are subject to the same issue but I didn't alter them. I talked this with Dario and he is aware of this.** Secondary cmd buffers are currently disabled due to other issues (it's disabled on master). - `bool RenderingDeviceCommons::command_pool_reset_enabled` Setting it to false enforces the old behavior. Useful for debugging bugs and regressions. There's no other reason for this boolean. Possibly once it becomes well tested, the boolean could be removed entirely. Adds `command_bind_render_uniform_sets` and `add_draw_list_bind_uniform_sets` (+ compute variants). It performs the same as `add_draw_list_bind_uniform_set` (notice singular vs plural), but on multiple consecutive uniform sets, thus reducing graph and draw call overhead. - `bool descriptor_set_batching = true;` Setting it to false enforces the old behavior. Useful for debugging bugs and regressions. There's no other reason for this boolean. Possibly once it becomes well tested, the boolean could be removed entirely. Godot currently does the following: 1. Fill the entire cmd buffer with commands. 2. `submit()` - Wait with a semaphore for the swapchain. - Trigger a semaphore to indicate when we're done (so the swapchain can submit). 3. `present()` The optimization opportunity here is that 95% of Godot's rendering is done offscreen. Then a fullscreen pass copies everything to the swapchain. Godot doesn't practically render directly to the swapchain. The problem with this is that the GPU has to wait for the swapchain to be released **to start anything**, when we could start *much earlier*. Only the final blit pass must wait for the swapchain. TheForge changed it to the following (more complicated, I'm simplifying the idea): 1. Fill the entire cmd buffer with commands. 2. In `screen_prepare_for_drawing` do `submit()` - There are no semaphore waits for the swapchain. - Trigger a semaphore to indicate when we're done. 3. Fill a new cmd buffer that only does the final blit to the swapchain. 4. `submit()` - Wait with a semaphore for the submit() from step 2. - Wait with a semaphore for the swapchain (so the swapchain can submit). - Trigger a semaphore to indicate when we're done (so the swapchain can submit). 5. `present()` Dario discovered this problem independently while working on a different platform. **However TheForge's solution had to be rewritten from scratch:** The complexity to achieve the solution was high and quite difficult to maintain with the way Godot works now (after Übershaders PR). But on the other hand, re-implementing the solution became much simpler because Dario already had to do something similar: To fix an Adreno 730 driver bug, he had to implement splitting command buffers. **This is exactly what we need!**. Thus it was re-written using this existing functionality for a new purpose. To achieve this, I added a new argument, `bool p_split_cmd_buffer`, to `RenderingDeviceGraph::add_draw_list_begin`, which is only set to true by `RenderingDevice::draw_list_begin_for_screen`. The graph will split the draw list into its own command buffer. - `bool split_swapchain_into_its_own_cmd_buffer = true;` Setting it to false enforces the old behavior. This might be necessary for consoles which follow an alternate solution to the same problem. If not, then we should consider removing it. PR #90993 added `shader_destroy_modules()` but it was not actually in use. This PR adds several places where `shader_destroy_modules()` is called after initialization to free up memory of SPIR-V structures that are no longer needed.
2024-11-14 13:03:14 -03:00
RID shader = RD::get_singleton()->shader_create_from_bytecode_with_samplers(p_version->variant_data[variant_id], p_version->variants[variant_id], immutable_samplers);
if (shader.is_null()) {
for (uint32_t j = 0; j < i; j++) {
int variant_free_id = group_to_variant_map[p_group][j];
RD::get_singleton()->free(p_version->variants[variant_free_id]);
}
ERR_FAIL_COND_V(shader.is_null(), false);
}
p_version->variants.write[variant_id] = shader;
}
}
p_version->valid = true;
return true;
}
void ShaderRD::_save_to_cache(Version *p_version, int p_group) {
ERR_FAIL_COND(!shader_cache_user_dir_valid);
String api_safe_name = String(RD::get_singleton()->get_device_api_name()).validate_filename().to_lower();
const String &path = _get_cache_file_path(p_version, p_group, api_safe_name, true);
Ref<FileAccess> f = FileAccess::open(path, FileAccess::WRITE);
ERR_FAIL_COND(f.is_null());
PackedByteArray shader_cache_bytes = ShaderRD::save_shader_cache_bytes(group_to_variant_map[p_group], p_version->variant_data);
f->store_buffer(shader_cache_bytes);
}
void ShaderRD::_allocate_placeholders(Version *p_version, int p_group) {
ERR_FAIL_COND(p_version->variants.is_empty());
for (uint32_t i = 0; i < group_to_variant_map[p_group].size(); i++) {
int variant_id = group_to_variant_map[p_group][i];
RID shader = RD::get_singleton()->shader_create_placeholder();
{
p_version->variants.write[variant_id] = shader;
}
}
}
// Try to compile all variants for a given group.
// Will skip variants that are disabled.
void ShaderRD::_compile_version_start(Version *p_version, int p_group) {
if (!group_enabled[p_group]) {
return;
}
p_version->dirty = false;
#if ENABLE_SHADER_CACHE
if (_load_from_cache(p_version, p_group)) {
return;
}
#endif
CompileData compile_data;
compile_data.version = p_version;
compile_data.group = p_group;
WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &ShaderRD::_compile_variant, compile_data, group_to_variant_map[p_group].size(), -1, true, SNAME("ShaderCompilation"));
p_version->group_compilation_tasks.write[p_group] = group_task;
}
void ShaderRD::_compile_version_end(Version *p_version, int p_group) {
if (p_version->group_compilation_tasks.size() <= p_group || p_version->group_compilation_tasks[p_group] == 0) {
return;
}
WorkerThreadPool::GroupID group_task = p_version->group_compilation_tasks[p_group];
WorkerThreadPool::get_singleton()->wait_for_group_task_completion(group_task);
p_version->group_compilation_tasks.write[p_group] = 0;
bool all_valid = true;
for (uint32_t i = 0; i < group_to_variant_map[p_group].size(); i++) {
int variant_id = group_to_variant_map[p_group][i];
if (!variants_enabled[variant_id]) {
continue; // Disabled.
}
if (p_version->variants[variant_id].is_null()) {
all_valid = false;
break;
}
}
if (!all_valid) {
// Clear versions if they exist.
for (int i = 0; i < variant_defines.size(); i++) {
if (!variants_enabled[i] || !group_enabled[variant_defines[i].group]) {
continue; // Disabled.
}
if (!p_version->variants[i].is_null()) {
RD::get_singleton()->free(p_version->variants[i]);
}
}
p_version->variants.clear();
p_version->variant_data.clear();
return;
}
#if ENABLE_SHADER_CACHE
else if (shader_cache_user_dir_valid) {
_save_to_cache(p_version, p_group);
}
#endif
p_version->valid = true;
}
void ShaderRD::_compile_ensure_finished(Version *p_version) {
// Wait for compilation of existing groups if necessary.
for (int i = 0; i < group_enabled.size(); i++) {
_compile_version_end(p_version, i);
}
}
void ShaderRD::version_set_code(RID p_version, const HashMap<String, String> &p_code, const String &p_uniforms, const String &p_vertex_globals, const String &p_fragment_globals, const Vector<String> &p_custom_defines) {
2019-09-25 16:44:44 -03:00
ERR_FAIL_COND(is_compute);
Version *version = version_owner.get_or_null(p_version);
ERR_FAIL_NULL(version);
MutexLock lock(*version->mutex);
_compile_ensure_finished(version);
version->vertex_globals = p_vertex_globals.utf8();
version->fragment_globals = p_fragment_globals.utf8();
version->uniforms = p_uniforms.utf8();
version->code_sections.clear();
2021-08-09 14:13:42 -06:00
for (const KeyValue<String, String> &E : p_code) {
version->code_sections[StringName(E.key.to_upper())] = E.value.utf8();
}
version->custom_defines.clear();
for (int i = 0; i < p_custom_defines.size(); i++) {
version->custom_defines.push_back(p_custom_defines[i].utf8());
}
version->dirty = true;
if (version->initialize_needed) {
_initialize_version(version);
for (int i = 0; i < group_enabled.size(); i++) {
if (!group_enabled[i]) {
_allocate_placeholders(version, i);
continue;
}
_compile_version_start(version, i);
}
version->initialize_needed = false;
}
}
void ShaderRD::version_set_compute_code(RID p_version, const HashMap<String, String> &p_code, const String &p_uniforms, const String &p_compute_globals, const Vector<String> &p_custom_defines) {
2019-09-25 16:44:44 -03:00
ERR_FAIL_COND(!is_compute);
Version *version = version_owner.get_or_null(p_version);
ERR_FAIL_NULL(version);
MutexLock lock(*version->mutex);
_compile_ensure_finished(version);
2019-09-25 16:44:44 -03:00
version->compute_globals = p_compute_globals.utf8();
version->uniforms = p_uniforms.utf8();
version->code_sections.clear();
2021-08-09 14:13:42 -06:00
for (const KeyValue<String, String> &E : p_code) {
version->code_sections[StringName(E.key.to_upper())] = E.value.utf8();
}
2019-09-25 16:44:44 -03:00
version->custom_defines.clear();
for (int i = 0; i < p_custom_defines.size(); i++) {
version->custom_defines.push_back(p_custom_defines[i].utf8());
}
version->dirty = true;
if (version->initialize_needed) {
_initialize_version(version);
for (int i = 0; i < group_enabled.size(); i++) {
if (!group_enabled[i]) {
_allocate_placeholders(version, i);
continue;
}
_compile_version_start(version, i);
}
2019-09-25 16:44:44 -03:00
version->initialize_needed = false;
}
}
bool ShaderRD::version_is_valid(RID p_version) {
Version *version = version_owner.get_or_null(p_version);
ERR_FAIL_NULL_V(version, false);
MutexLock lock(*version->mutex);
if (version->dirty) {
_initialize_version(version);
for (int i = 0; i < group_enabled.size(); i++) {
if (!group_enabled[i]) {
_allocate_placeholders(version, i);
continue;
}
_compile_version_start(version, i);
}
}
_compile_ensure_finished(version);
return version->valid;
}
bool ShaderRD::version_free(RID p_version) {
if (version_owner.owns(p_version)) {
{
MutexLock lock(versions_mutex);
version_mutexes.erase(p_version);
}
Version *version = version_owner.get_or_null(p_version);
if (version->embedded) {
MutexLock lock(shader_versions_embedded_set_mutex);
shader_versions_embedded_set.erase({ this, p_version });
}
version->mutex->lock();
_clear_version(version);
version_owner.free(p_version);
version->mutex->unlock();
memdelete(version->mutex);
} else {
return false;
}
return true;
}
void ShaderRD::set_variant_enabled(int p_variant, bool p_enabled) {
ERR_FAIL_COND(version_owner.get_rid_count() > 0); //versions exist
ERR_FAIL_INDEX(p_variant, variants_enabled.size());
variants_enabled.write[p_variant] = p_enabled;
}
bool ShaderRD::is_variant_enabled(int p_variant) const {
ERR_FAIL_INDEX_V(p_variant, variants_enabled.size(), false);
return variants_enabled[p_variant];
}
int64_t ShaderRD::get_variant_count() const {
return variants_enabled.size();
}
int ShaderRD::get_variant_to_group(int p_variant) const {
return variant_to_group[p_variant];
}
void ShaderRD::enable_group(int p_group) {
ERR_FAIL_INDEX(p_group, group_enabled.size());
if (group_enabled[p_group]) {
// Group already enabled, do nothing.
return;
}
group_enabled.write[p_group] = true;
// Compile all versions again to include the new group.
2025-01-21 21:23:46 +08:00
for (const RID &version_rid : version_owner.get_owned_list()) {
Version *version = version_owner.get_or_null(version_rid);
version->mutex->lock();
_compile_version_start(version, p_group);
version->mutex->unlock();
}
}
bool ShaderRD::is_group_enabled(int p_group) const {
return group_enabled[p_group];
}
int64_t ShaderRD::get_group_count() const {
return group_enabled.size();
}
const LocalVector<int> &ShaderRD::get_group_to_variants(int p_group) const {
return group_to_variant_map[p_group];
}
const String &ShaderRD::get_name() const {
return name;
}
bool ShaderRD::shader_cache_cleanup_on_start = false;
ShaderRD::ShaderRD() {
// Do not feel forced to use this, in most cases it makes little to no difference.
bool use_32_threads = false;
if (RD::get_singleton()->get_device_vendor_name() == "NVIDIA") {
use_32_threads = true;
}
String base_compute_define_text;
if (use_32_threads) {
base_compute_define_text = "\n#define NATIVE_LOCAL_GROUP_SIZE 32\n#define NATIVE_LOCAL_SIZE_2D_X 8\n#define NATIVE_LOCAL_SIZE_2D_Y 4\n";
} else {
base_compute_define_text = "\n#define NATIVE_LOCAL_GROUP_SIZE 64\n#define NATIVE_LOCAL_SIZE_2D_X 8\n#define NATIVE_LOCAL_SIZE_2D_Y 8\n";
}
base_compute_defines = base_compute_define_text.ascii();
}
void ShaderRD::initialize(const Vector<String> &p_variant_defines, const String &p_general_defines, const Vector<RD::PipelineImmutableSampler> &p_immutable_samplers) {
ERR_FAIL_COND(variant_defines.size());
ERR_FAIL_COND(p_variant_defines.is_empty());
general_defines = p_general_defines.utf8();
immutable_samplers = p_immutable_samplers;
// When initialized this way, there is just one group and its always enabled.
group_to_variant_map.insert(0, LocalVector<int>{});
group_enabled.push_back(true);
for (int i = 0; i < p_variant_defines.size(); i++) {
variant_defines.push_back(VariantDefine(0, p_variant_defines[i], true));
variants_enabled.push_back(true);
variant_to_group.push_back(0);
group_to_variant_map[0].push_back(i);
}
if (!shader_cache_user_dir.is_empty() || !shader_cache_res_dir.is_empty()) {
group_sha256.resize(1);
_initialize_cache();
}
}
void ShaderRD::_initialize_cache() {
shader_cache_user_dir_valid = !shader_cache_user_dir.is_empty();
if (!shader_cache_user_dir_valid) {
return;
}
for (const KeyValue<int, LocalVector<int>> &E : group_to_variant_map) {
StringBuilder hash_build;
hash_build.append("[base_hash]");
hash_build.append(base_sha256);
hash_build.append("[general_defines]");
hash_build.append(general_defines.get_data());
hash_build.append("[group_id]");
hash_build.append(itos(E.key));
for (uint32_t i = 0; i < E.value.size(); i++) {
hash_build.append("[variant_defines:" + itos(E.value[i]) + "]");
hash_build.append(variant_defines[E.value[i]].text.get_data());
}
group_sha256[E.key] = hash_build.as_string().sha256_text();
if (!shader_cache_user_dir.is_empty()) {
// Validate if it's possible to write to all the directories required by in the user directory.
Ref<DirAccess> d = DirAccess::open(shader_cache_user_dir);
if (d.is_null()) {
shader_cache_user_dir_valid = false;
ERR_FAIL_MSG(vformat("Unable to open shader cache directory at %s.", shader_cache_user_dir));
}
if (d->change_dir(name) != OK) {
Error err = d->make_dir(name);
if (err != OK) {
shader_cache_user_dir_valid = false;
ERR_FAIL_MSG(vformat("Unable to create shader cache directory %s at %s.", name, shader_cache_user_dir));
}
d->change_dir(name);
}
if (d->change_dir(group_sha256[E.key]) != OK) {
Error err = d->make_dir(group_sha256[E.key]);
if (err != OK) {
shader_cache_user_dir_valid = false;
ERR_FAIL_MSG(vformat("Unable to create shader cache directory %s/%s at %s.", name, group_sha256[E.key], shader_cache_user_dir));
}
}
}
print_verbose("Shader '" + name + "' (group " + itos(E.key) + ") SHA256: " + group_sha256[E.key]);
}
}
// Same as above, but allows specifying shader compilation groups.
void ShaderRD::initialize(const Vector<VariantDefine> &p_variant_defines, const String &p_general_defines, const Vector<RD::PipelineImmutableSampler> &p_immutable_samplers) {
ERR_FAIL_COND(variant_defines.size());
ERR_FAIL_COND(p_variant_defines.is_empty());
general_defines = p_general_defines.utf8();
immutable_samplers = p_immutable_samplers;
int max_group_id = 0;
for (int i = 0; i < p_variant_defines.size(); i++) {
// Fill variant array.
variant_defines.push_back(p_variant_defines[i]);
variants_enabled.push_back(true);
variant_to_group.push_back(p_variant_defines[i].group);
// Map variant array index to group id, so we can iterate over groups later.
if (!group_to_variant_map.has(p_variant_defines[i].group)) {
group_to_variant_map.insert(p_variant_defines[i].group, LocalVector<int>{});
}
group_to_variant_map[p_variant_defines[i].group].push_back(i);
// Track max size.
if (p_variant_defines[i].group > max_group_id) {
max_group_id = p_variant_defines[i].group;
}
}
// Set all to groups to false, then enable those that should be default.
group_enabled.resize_initialized(max_group_id + 1);
bool *enabled_ptr = group_enabled.ptrw();
for (int i = 0; i < p_variant_defines.size(); i++) {
if (p_variant_defines[i].default_enabled) {
enabled_ptr[p_variant_defines[i].group] = true;
}
}
if (!shader_cache_user_dir.is_empty()) {
group_sha256.resize(max_group_id + 1);
_initialize_cache();
}
}
void ShaderRD::shaders_embedded_set_lock() {
shader_versions_embedded_set_mutex.lock();
}
const ShaderRD::ShaderVersionPairSet &ShaderRD::shaders_embedded_set_get() {
return shader_versions_embedded_set;
}
void ShaderRD::shaders_embedded_set_unlock() {
shader_versions_embedded_set_mutex.unlock();
}
void ShaderRD::set_shader_cache_user_dir(const String &p_dir) {
shader_cache_user_dir = p_dir;
}
const String &ShaderRD::get_shader_cache_user_dir() {
return shader_cache_user_dir;
}
void ShaderRD::set_shader_cache_res_dir(const String &p_dir) {
shader_cache_res_dir = p_dir;
}
const String &ShaderRD::get_shader_cache_res_dir() {
return shader_cache_res_dir;
}
void ShaderRD::set_shader_cache_save_compressed(bool p_enable) {
shader_cache_save_compressed = p_enable;
}
void ShaderRD::set_shader_cache_save_compressed_zstd(bool p_enable) {
shader_cache_save_compressed_zstd = p_enable;
}
void ShaderRD::set_shader_cache_save_debug(bool p_enable) {
shader_cache_save_debug = p_enable;
}
Vector<RD::ShaderStageSPIRVData> ShaderRD::compile_stages(const Vector<String> &p_stage_sources) {
RD::ShaderStageSPIRVData stage;
Vector<RD::ShaderStageSPIRVData> stages;
String error;
RD::ShaderStage compilation_failed_stage = RD::SHADER_STAGE_MAX;
bool compilation_failed = false;
for (int64_t i = 0; i < p_stage_sources.size() && !compilation_failed; i++) {
if (p_stage_sources[i].is_empty()) {
continue;
}
stage.spirv = RD::get_singleton()->shader_compile_spirv_from_source(RD::ShaderStage(i), p_stage_sources[i], RD::SHADER_LANGUAGE_GLSL, &error);
stage.shader_stage = RD::ShaderStage(i);
if (!stage.spirv.is_empty()) {
stages.push_back(stage);
} else {
compilation_failed_stage = RD::ShaderStage(i);
compilation_failed = true;
}
}
if (compilation_failed) {
ERR_PRINT("Error compiling " + String(compilation_failed_stage == RD::SHADER_STAGE_COMPUTE ? "Compute " : (compilation_failed_stage == RD::SHADER_STAGE_VERTEX ? "Vertex" : "Fragment")) + " shader.");
ERR_PRINT(error);
#ifdef DEBUG_ENABLED
ERR_PRINT("code:\n" + p_stage_sources[compilation_failed_stage].get_with_code_lines());
#endif
return Vector<RD::ShaderStageSPIRVData>();
} else {
return stages;
}
}
PackedByteArray ShaderRD::save_shader_cache_bytes(const LocalVector<int> &p_variants, const Vector<Vector<uint8_t>> &p_variant_data) {
uint32_t variant_count = p_variants.size();
PackedByteArray bytes;
int64_t total_size = 0;
total_size += 4 + sizeof(uint32_t) * 2;
for (uint32_t i = 0; i < variant_count; i++) {
total_size += sizeof(uint32_t) + p_variant_data[p_variants[i]].size();
}
bytes.resize(total_size);
uint8_t *bytes_ptr = bytes.ptrw();
memcpy(bytes_ptr, shader_file_header, 4);
bytes_ptr += 4;
*(uint32_t *)(bytes_ptr) = cache_file_version;
bytes_ptr += sizeof(uint32_t);
*(uint32_t *)(bytes_ptr) = variant_count;
bytes_ptr += sizeof(uint32_t);
for (uint32_t i = 0; i < variant_count; i++) {
int variant_id = p_variants[i];
*(uint32_t *)(bytes_ptr) = uint32_t(p_variant_data[variant_id].size());
bytes_ptr += sizeof(uint32_t);
memcpy(bytes_ptr, p_variant_data[variant_id].ptr(), p_variant_data[variant_id].size());
bytes_ptr += p_variant_data[variant_id].size();
}
DEV_ASSERT((bytes.ptrw() + bytes.size()) == bytes_ptr);
return bytes;
}
String ShaderRD::shader_cache_user_dir;
String ShaderRD::shader_cache_res_dir;
bool ShaderRD::shader_cache_save_compressed = true;
bool ShaderRD::shader_cache_save_compressed_zstd = true;
bool ShaderRD::shader_cache_save_debug = true;
ShaderRD::~ShaderRD() {
2025-01-21 21:23:46 +08:00
LocalVector<RID> remaining = version_owner.get_owned_list();
if (remaining.size()) {
ERR_PRINT(itos(remaining.size()) + " shaders of type " + name + " were never freed");
2025-01-21 21:23:46 +08:00
for (const RID &version_rid : remaining) {
version_free(version_rid);
}
}
}