2023-01-05 13:25:55 +01:00
|
|
|
/**************************************************************************/
|
|
|
|
/* shader_rd.cpp */
|
|
|
|
/**************************************************************************/
|
|
|
|
/* This file is part of: */
|
|
|
|
/* GODOT ENGINE */
|
|
|
|
/* https://godotengine.org */
|
|
|
|
/**************************************************************************/
|
|
|
|
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
|
|
|
|
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
|
|
|
|
/* */
|
|
|
|
/* Permission is hereby granted, free of charge, to any person obtaining */
|
|
|
|
/* a copy of this software and associated documentation files (the */
|
|
|
|
/* "Software"), to deal in the Software without restriction, including */
|
|
|
|
/* without limitation the rights to use, copy, modify, merge, publish, */
|
|
|
|
/* distribute, sublicense, and/or sell copies of the Software, and to */
|
|
|
|
/* permit persons to whom the Software is furnished to do so, subject to */
|
|
|
|
/* the following conditions: */
|
|
|
|
/* */
|
|
|
|
/* The above copyright notice and this permission notice shall be */
|
|
|
|
/* included in all copies or substantial portions of the Software. */
|
|
|
|
/* */
|
|
|
|
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
|
|
|
|
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
|
|
|
|
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
|
|
|
|
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
|
|
|
|
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
|
|
|
|
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
|
|
|
|
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
|
|
|
|
/**************************************************************************/
|
2019-06-15 23:45:24 -03:00
|
|
|
|
|
|
|
#include "shader_rd.h"
|
2020-03-24 09:50:51 +01:00
|
|
|
|
2021-06-11 14:51:48 +02:00
|
|
|
#include "core/io/dir_access.h"
|
|
|
|
#include "core/io/file_access.h"
|
2023-02-13 13:45:06 -05:00
|
|
|
#include "core/object/worker_thread_pool.h"
|
2023-01-09 17:44:29 +01:00
|
|
|
#include "core/version.h"
|
2020-03-27 15:21:27 -03:00
|
|
|
#include "servers/rendering/rendering_device.h"
|
2023-09-22 13:58:02 +10:00
|
|
|
#include "servers/rendering/shader_include_db.h"
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2024-03-15 14:13:31 -03:00
|
|
|
#define ENABLE_SHADER_CACHE 1
|
|
|
|
|
2021-04-13 17:01:43 -03:00
|
|
|
void ShaderRD::_add_stage(const char *p_code, StageType p_stage_type) {
|
|
|
|
Vector<String> lines = String(p_code).split("\n");
|
|
|
|
|
|
|
|
String text;
|
|
|
|
|
2023-09-22 13:58:02 +10:00
|
|
|
int line_count = lines.size();
|
|
|
|
for (int i = 0; i < line_count; i++) {
|
2023-11-18 17:40:56 -05:00
|
|
|
const String &l = lines[i];
|
2021-04-13 17:01:43 -03:00
|
|
|
bool push_chunk = false;
|
|
|
|
|
|
|
|
StageTemplate::Chunk chunk;
|
|
|
|
|
|
|
|
if (l.begins_with("#VERSION_DEFINES")) {
|
|
|
|
chunk.type = StageTemplate::Chunk::TYPE_VERSION_DEFINES;
|
|
|
|
push_chunk = true;
|
|
|
|
} else if (l.begins_with("#GLOBALS")) {
|
|
|
|
switch (p_stage_type) {
|
|
|
|
case STAGE_TYPE_VERTEX:
|
|
|
|
chunk.type = StageTemplate::Chunk::TYPE_VERTEX_GLOBALS;
|
|
|
|
break;
|
|
|
|
case STAGE_TYPE_FRAGMENT:
|
|
|
|
chunk.type = StageTemplate::Chunk::TYPE_FRAGMENT_GLOBALS;
|
|
|
|
break;
|
|
|
|
case STAGE_TYPE_COMPUTE:
|
|
|
|
chunk.type = StageTemplate::Chunk::TYPE_COMPUTE_GLOBALS;
|
|
|
|
break;
|
|
|
|
default: {
|
2019-06-15 23:45:24 -03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-13 17:01:43 -03:00
|
|
|
push_chunk = true;
|
|
|
|
} else if (l.begins_with("#MATERIAL_UNIFORMS")) {
|
|
|
|
chunk.type = StageTemplate::Chunk::TYPE_MATERIAL_UNIFORMS;
|
|
|
|
push_chunk = true;
|
|
|
|
} else if (l.begins_with("#CODE")) {
|
|
|
|
chunk.type = StageTemplate::Chunk::TYPE_CODE;
|
|
|
|
push_chunk = true;
|
2024-05-28 12:55:07 +02:00
|
|
|
chunk.code = l.replace_first("#CODE", String()).remove_char(':').strip_edges().to_upper();
|
2023-09-22 13:58:02 +10:00
|
|
|
} else if (l.begins_with("#include ")) {
|
|
|
|
String include_file = l.replace("#include ", "").strip_edges();
|
|
|
|
if (include_file[0] == '"') {
|
|
|
|
int end_pos = include_file.find_char('"', 1);
|
|
|
|
if (end_pos >= 0) {
|
|
|
|
include_file = include_file.substr(1, end_pos - 1);
|
|
|
|
|
|
|
|
String include_code = ShaderIncludeDB::get_built_in_include_file(include_file);
|
|
|
|
if (!include_code.is_empty()) {
|
|
|
|
// Add these lines into our parse list so we parse them as well.
|
|
|
|
Vector<String> include_lines = include_code.split("\n");
|
|
|
|
|
|
|
|
for (int j = include_lines.size() - 1; j >= 0; j--) {
|
|
|
|
lines.insert(i + 1, include_lines[j]);
|
|
|
|
}
|
|
|
|
|
|
|
|
line_count = lines.size();
|
|
|
|
} else {
|
|
|
|
// Add it in as is.
|
|
|
|
text += l + "\n";
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Add it in as is.
|
|
|
|
text += l + "\n";
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Add it in as is.
|
|
|
|
text += l + "\n";
|
|
|
|
}
|
2021-04-13 17:01:43 -03:00
|
|
|
} else {
|
|
|
|
text += l + "\n";
|
2019-06-15 23:45:24 -03:00
|
|
|
}
|
|
|
|
|
2021-04-13 17:01:43 -03:00
|
|
|
if (push_chunk) {
|
2021-12-09 03:42:46 -06:00
|
|
|
if (!text.is_empty()) {
|
2021-04-13 17:01:43 -03:00
|
|
|
StageTemplate::Chunk text_chunk;
|
|
|
|
text_chunk.type = StageTemplate::Chunk::TYPE_TEXT;
|
|
|
|
text_chunk.text = text.utf8();
|
|
|
|
stage_templates[p_stage_type].chunks.push_back(text_chunk);
|
|
|
|
text = String();
|
2019-06-15 23:45:24 -03:00
|
|
|
}
|
2021-04-13 17:01:43 -03:00
|
|
|
stage_templates[p_stage_type].chunks.push_back(chunk);
|
2019-06-15 23:45:24 -03:00
|
|
|
}
|
|
|
|
}
|
2019-09-25 16:44:44 -03:00
|
|
|
|
2021-12-09 03:42:46 -06:00
|
|
|
if (!text.is_empty()) {
|
2021-04-13 17:01:43 -03:00
|
|
|
StageTemplate::Chunk text_chunk;
|
|
|
|
text_chunk.type = StageTemplate::Chunk::TYPE_TEXT;
|
|
|
|
text_chunk.text = text.utf8();
|
|
|
|
stage_templates[p_stage_type].chunks.push_back(text_chunk);
|
|
|
|
text = String();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void ShaderRD::setup(const char *p_vertex_code, const char *p_fragment_code, const char *p_compute_code, const char *p_name) {
|
|
|
|
name = p_name;
|
2021-05-24 21:25:11 -03:00
|
|
|
|
2019-09-25 16:44:44 -03:00
|
|
|
if (p_compute_code) {
|
2021-04-13 17:01:43 -03:00
|
|
|
_add_stage(p_compute_code, STAGE_TYPE_COMPUTE);
|
2019-09-25 16:44:44 -03:00
|
|
|
is_compute = true;
|
2021-04-13 17:01:43 -03:00
|
|
|
} else {
|
|
|
|
is_compute = false;
|
|
|
|
if (p_vertex_code) {
|
|
|
|
_add_stage(p_vertex_code, STAGE_TYPE_VERTEX);
|
2019-09-25 16:44:44 -03:00
|
|
|
}
|
2021-04-13 17:01:43 -03:00
|
|
|
if (p_fragment_code) {
|
|
|
|
_add_stage(p_fragment_code, STAGE_TYPE_FRAGMENT);
|
2019-09-25 16:44:44 -03:00
|
|
|
}
|
|
|
|
}
|
2021-05-24 21:25:11 -03:00
|
|
|
|
|
|
|
StringBuilder tohash;
|
2023-01-09 17:44:29 +01:00
|
|
|
tohash.append("[GodotVersionNumber]");
|
2025-03-03 22:27:29 -08:00
|
|
|
tohash.append(GODOT_VERSION_NUMBER);
|
2023-01-09 17:44:29 +01:00
|
|
|
tohash.append("[GodotVersionHash]");
|
2025-03-03 22:27:29 -08:00
|
|
|
tohash.append(GODOT_VERSION_HASH);
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
tohash.append("[SpirvCacheKey]");
|
|
|
|
tohash.append(RenderingDevice::get_singleton()->shader_get_spirv_cache_key());
|
|
|
|
tohash.append("[BinaryCacheKey]");
|
|
|
|
tohash.append(RenderingDevice::get_singleton()->shader_get_binary_cache_key());
|
2021-05-24 21:25:11 -03:00
|
|
|
tohash.append("[Vertex]");
|
|
|
|
tohash.append(p_vertex_code ? p_vertex_code : "");
|
|
|
|
tohash.append("[Fragment]");
|
|
|
|
tohash.append(p_fragment_code ? p_fragment_code : "");
|
|
|
|
tohash.append("[Compute]");
|
|
|
|
tohash.append(p_compute_code ? p_compute_code : "");
|
|
|
|
|
|
|
|
base_sha256 = tohash.as_string().sha256_text();
|
2019-06-15 23:45:24 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
// Creates a new, not yet compiled shader version and returns its RID.
//
// The version starts out dirty, invalid and flagged as needing initialization,
// with no variants. A dedicated mutex is allocated for it and recorded in
// version_mutexes under the new RID.
//
// Fails with an empty RID if group_to_variant_map is empty.
RID ShaderRD::version_create() {
	// An empty group map means initialize() was never called.
	ERR_FAIL_COND_V(group_to_variant_map.is_empty(), RID());

	Version new_version;
	new_version.initialize_needed = true;
	new_version.valid = false;
	new_version.dirty = true;
	new_version.variants.clear();
	new_version.variant_data.clear();
	new_version.mutex = memnew(Mutex);

	RID version_rid = version_owner.make_rid(new_version);

	// The mutex table is guarded by versions_mutex.
	MutexLock lock(versions_mutex);
	version_mutexes.insert(version_rid, new_version.mutex);

	return version_rid;
}
|
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
// Resets a version to an empty, uncompiled state sized for the current variants.
//
// Anything the version previously held is released through _clear_version().
// Afterwards the version is marked neither valid nor dirty, its variant arrays
// have one slot per entry of variant_defines, and its compilation task array
// has one zeroed slot per entry of group_enabled.
void ShaderRD::_initialize_version(Version *p_version) {
	_clear_version(p_version);

	Version &version = *p_version;

	version.dirty = false;
	version.valid = false;

	version.variants.resize_zeroed(variant_defines.size());
	version.variant_data.resize(variant_defines.size());
	version.group_compilation_tasks.resize_zeroed(group_enabled.size());
}
|
|
|
|
|
2019-06-15 23:45:24 -03:00
|
|
|
// Releases every compiled variant held by a version.
//
// _compile_ensure_finished() is called on the version first. Each valid variant
// RID is then freed on the rendering device, and the variant arrays are emptied.
// Versions that hold no variants are left untouched after that first call.
void ShaderRD::_clear_version(Version *p_version) {
	_compile_ensure_finished(p_version);

	if (p_version->variants.is_empty()) {
		// Nothing was ever created for this version.
		return;
	}

	for (int variant_idx = 0; variant_idx < variant_defines.size(); variant_idx++) {
		const RID variant_rid = p_version->variants[variant_idx];
		if (!variant_rid.is_valid()) {
			continue;
		}
		RD::get_singleton()->free(variant_rid);
	}

	p_version->variants.clear();
	p_version->variant_data.clear();
}
|
|
|
|
|
2021-04-13 17:01:43 -03:00
|
|
|
// Assembles the final source of one stage for a given variant and version.
//
// The chunks of p_template are visited in order and each one appends to
// builder: text chunks are copied verbatim, while marker chunks are replaced
// by the matching data of the version (defines, uniforms, globals or a named
// code section).
//
// builder: Receives the generated source.
// p_variant: Index into variant_defines of the variant being built.
// p_version: Version providing custom defines, uniforms, globals and code sections.
// p_template: Parsed stage template to expand.
void ShaderRD::_build_variant_code(StringBuilder &builder, uint32_t p_variant, const Version *p_version, const StageTemplate &p_template) {
	for (const StageTemplate::Chunk &section : p_template.chunks) {
		switch (section.type) {
			case StageTemplate::Chunk::TYPE_VERSION_DEFINES: {
				// Make sure the defines begin at a new line.
				builder.append("\n");
				builder.append(general_defines.get_data());
				builder.append(variant_defines[p_variant].text.get_data());
				for (int define_idx = 0; define_idx < p_version->custom_defines.size(); define_idx++) {
					builder.append(p_version->custom_defines[define_idx].get_data());
				}
				// Make sure whatever follows begins at a new line as well.
				builder.append("\n");

				if (p_version->uniforms.size()) {
					builder.append("#define MATERIAL_UNIFORMS_USED\n");
				}

				// One define per code section present in the version.
				for (const KeyValue<StringName, CharString> &E : p_version->code_sections) {
					const String section_define = String("#define ") + String(E.key) + "_CODE_USED\n";
					builder.append(section_define);
				}

#if (defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED))
				if (RD::get_singleton()->get_device_capabilities().device_family == RDD::DEVICE_VULKAN) {
					builder.append("#define MOLTENVK_USED\n");
				}
				// Image atomics are supported on Metal 3.1 but no support in MoltenVK or SPIRV-Cross yet.
				builder.append("#define NO_IMAGE_ATOMICS\n");
#endif

				// Define named after the upper-cased name of the current rendering driver.
				const String driver_define = String("#define RENDER_DRIVER_") + OS::get_singleton()->get_current_rendering_driver_name().to_upper() + "\n";
				builder.append(driver_define);

				builder.append("#define samplerExternalOES sampler2D\n");
				builder.append("#define textureExternalOES texture2D\n");
			} break;

			case StageTemplate::Chunk::TYPE_MATERIAL_UNIFORMS: {
				// Uniforms are taken from the same version field regardless of stage.
				builder.append(p_version->uniforms.get_data());
			} break;

			case StageTemplate::Chunk::TYPE_VERTEX_GLOBALS: {
				builder.append(p_version->vertex_globals.get_data());
			} break;

			case StageTemplate::Chunk::TYPE_FRAGMENT_GLOBALS: {
				builder.append(p_version->fragment_globals.get_data());
			} break;

			case StageTemplate::Chunk::TYPE_COMPUTE_GLOBALS: {
				builder.append(p_version->compute_globals.get_data());
			} break;

			case StageTemplate::Chunk::TYPE_CODE: {
				// Sections the version does not define expand to nothing.
				if (p_version->code_sections.has(section.code)) {
					builder.append(p_version->code_sections[section.code].get_data());
				}
			} break;

			case StageTemplate::Chunk::TYPE_TEXT: {
				builder.append(section.text.get_data());
			} break;
		}
	}
}
|
|
|
|
|
2024-03-15 14:13:31 -03:00
|
|
|
void ShaderRD::_compile_variant(uint32_t p_variant, CompileData p_data) {
|
|
|
|
uint32_t variant = group_to_variant_map[p_data.group][p_variant];
|
2023-07-18 11:21:27 +02:00
|
|
|
|
|
|
|
if (!variants_enabled[variant]) {
|
|
|
|
return; // Variant is disabled, return.
|
2020-12-07 18:27:38 -03:00
|
|
|
}
|
|
|
|
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
Vector<RD::ShaderStageSPIRVData> stages;
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2019-07-29 12:59:18 -03:00
|
|
|
String error;
|
|
|
|
String current_source;
|
|
|
|
RD::ShaderStage current_stage = RD::SHADER_STAGE_VERTEX;
|
|
|
|
bool build_ok = true;
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2019-09-25 16:44:44 -03:00
|
|
|
if (!is_compute) {
|
2019-07-29 12:59:18 -03:00
|
|
|
//vertex stage
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2019-07-29 12:59:18 -03:00
|
|
|
StringBuilder builder;
|
2024-03-15 14:13:31 -03:00
|
|
|
_build_variant_code(builder, variant, p_data.version, stage_templates[STAGE_TYPE_VERTEX]);
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2019-07-29 12:59:18 -03:00
|
|
|
current_source = builder.as_string();
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
RD::ShaderStageSPIRVData stage;
|
2023-12-19 12:48:02 +01:00
|
|
|
stage.spirv = RD::get_singleton()->shader_compile_spirv_from_source(RD::SHADER_STAGE_VERTEX, current_source, RD::SHADER_LANGUAGE_GLSL, &error);
|
2025-03-20 00:07:31 +08:00
|
|
|
if (stage.spirv.is_empty()) {
|
2019-07-29 12:59:18 -03:00
|
|
|
build_ok = false;
|
|
|
|
} else {
|
|
|
|
stage.shader_stage = RD::SHADER_STAGE_VERTEX;
|
|
|
|
stages.push_back(stage);
|
|
|
|
}
|
|
|
|
}
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2019-09-25 16:44:44 -03:00
|
|
|
if (!is_compute && build_ok) {
|
2019-07-29 12:59:18 -03:00
|
|
|
//fragment stage
|
|
|
|
current_stage = RD::SHADER_STAGE_FRAGMENT;
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2019-07-29 12:59:18 -03:00
|
|
|
StringBuilder builder;
|
2024-03-15 14:13:31 -03:00
|
|
|
_build_variant_code(builder, variant, p_data.version, stage_templates[STAGE_TYPE_FRAGMENT]);
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2019-07-29 12:59:18 -03:00
|
|
|
current_source = builder.as_string();
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
RD::ShaderStageSPIRVData stage;
|
2023-12-19 12:48:02 +01:00
|
|
|
stage.spirv = RD::get_singleton()->shader_compile_spirv_from_source(RD::SHADER_STAGE_FRAGMENT, current_source, RD::SHADER_LANGUAGE_GLSL, &error);
|
2025-03-20 00:07:31 +08:00
|
|
|
if (stage.spirv.is_empty()) {
|
2019-07-29 12:59:18 -03:00
|
|
|
build_ok = false;
|
|
|
|
} else {
|
|
|
|
stage.shader_stage = RD::SHADER_STAGE_FRAGMENT;
|
|
|
|
stages.push_back(stage);
|
|
|
|
}
|
|
|
|
}
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2019-09-25 16:44:44 -03:00
|
|
|
if (is_compute) {
|
|
|
|
//compute stage
|
|
|
|
current_stage = RD::SHADER_STAGE_COMPUTE;
|
|
|
|
|
|
|
|
StringBuilder builder;
|
2024-03-15 14:13:31 -03:00
|
|
|
_build_variant_code(builder, variant, p_data.version, stage_templates[STAGE_TYPE_COMPUTE]);
|
2019-09-25 16:44:44 -03:00
|
|
|
|
|
|
|
current_source = builder.as_string();
|
2021-04-17 12:21:03 -03:00
|
|
|
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
RD::ShaderStageSPIRVData stage;
|
2023-12-19 12:48:02 +01:00
|
|
|
stage.spirv = RD::get_singleton()->shader_compile_spirv_from_source(RD::SHADER_STAGE_COMPUTE, current_source, RD::SHADER_LANGUAGE_GLSL, &error);
|
2025-03-20 00:07:31 +08:00
|
|
|
if (stage.spirv.is_empty()) {
|
2019-09-25 16:44:44 -03:00
|
|
|
build_ok = false;
|
|
|
|
} else {
|
|
|
|
stage.shader_stage = RD::SHADER_STAGE_COMPUTE;
|
|
|
|
stages.push_back(stage);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-07-29 12:59:18 -03:00
|
|
|
if (!build_ok) {
|
2023-07-18 11:21:27 +02:00
|
|
|
ERR_PRINT("Error compiling " + String(current_stage == RD::SHADER_STAGE_COMPUTE ? "Compute " : (current_stage == RD::SHADER_STAGE_VERTEX ? "Vertex" : "Fragment")) + " shader, variant #" + itos(variant) + " (" + variant_defines[variant].text.get_data() + ").");
|
2019-07-29 12:59:18 -03:00
|
|
|
ERR_PRINT(error);
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2019-07-29 12:59:18 -03:00
|
|
|
#ifdef DEBUG_ENABLED
|
|
|
|
ERR_PRINT("code:\n" + current_source.get_with_code_lines());
|
|
|
|
#endif
|
|
|
|
return;
|
|
|
|
}
|
2019-07-28 19:58:32 -03:00
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
Vector<uint8_t> shader_data = RD::get_singleton()->shader_compile_binary_from_spirv(stages, name + ":" + itos(variant));
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
|
2024-01-19 13:21:39 +01:00
|
|
|
ERR_FAIL_COND(shader_data.is_empty());
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
|
2020-02-26 11:28:13 +01:00
|
|
|
{
|
Improvements from TheForge (see description)
The work was performed by collaboration of TheForge and Google. I am
merely splitting it up into smaller PRs and cleaning it up.
This is the most "risky" PR so far because the previous ones have been
miscellaneous stuff aimed at either [improve
debugging](https://github.com/godotengine/godot/pull/90993) (e.g. device
lost), [improve Android
experience](https://github.com/godotengine/godot/pull/96439) (add Swappy
for better Frame Pacing + Pre-Transformed Swapchains for slightly better
performance), or harmless [ASTC
improvements](https://github.com/godotengine/godot/pull/96045) (better
performance by simply toggling a feature when available).
However this PR contains larger modifications aimed at improving
performance or reducing memory fragmentation. With greater
modifications, come greater risks of bugs or breakage.
Changes introduced by this PR:
TBDR GPUs (e.g. most of Android + iOS + M1 Apple) support rendering to
Render Targets that are not backed by actual GPU memory (everything
stays in cache). This works as long as load action isn't `LOAD`, and
store action must be `DONT_CARE`. This saves VRAM (it also makes
painfully obvious when a mistake introduces a performance regression).
Of particular usefulness is when doing MSAA and keeping the raw MSAA
content is not necessary.
Some GPUs get faster when the sampler settings are hard-coded into the
GLSL shaders (instead of being dynamically bound at runtime). This
required changes to the GLSL shaders, PSO creation routines, Descriptor
creation routines, and Descriptor binding routines.
- `bool immutable_samplers_enabled = true`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
Immutable samplers requires that the samplers stay... immutable, hence
this boolean is useful if the promise gets broken. We might want to turn
this into a `GLOBAL_DEF` setting.
Instead of creating dozen/hundreds/thousands of `VkDescriptorSet` every
frame that need to be freed individually when they are no longer needed,
they all get freed at once by resetting the whole pool. Once the whole
pool is no longer in use by the GPU, it gets reset and its memory
recycled. Descriptor sets that are created to be kept around for longer
or forever (i.e. not created and freed within the same frame) **must
not** use linear pools. There may be more than one pool per frame. How
many pools per frame Godot ends up with depends on its capacity, and
that is controlled by
`rendering/rendering_device/vulkan/max_descriptors_per_pool`.
- **Possible improvement for later:** It should be possible for Godot
to adapt to how many descriptors per pool are needed on a per-key basis
(i.e. grow their capacity like `std::vector` does) after rendering a few
frames; which would be better than the current solution of having a
single global value for all pools (`max_descriptors_per_pool`) that the
user needs to tweak.
- `bool linear_descriptor_pools_enabled = true`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
Setting it to false is required when working around driver bugs (e.g.
Adreno 730).
A ridiculous optimization. Ridiculous because the original code
should've done this in the first place. Previously Godot was doing the
following:
1. Create a command buffer **pool**. One per frame.
2. Create multiple command buffers from the pool in point 1.
3. Call `vkBeginCommandBuffer` on the cmd buffer in point 2. This
resets the cmd buffer because Godot requests the
`VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT` flag.
4. Add commands to the cmd buffers from point 2.
5. Submit those commands.
6. On frame N + 2, recycle the buffer pool and cmd buffers from pt 1 &
2, and repeat from step 3.
The problem here is that step 3 resets each command buffer individually.
Initially Godot used to have 1 cmd buffer per pool, thus the impact is
very low.
But not anymore (especially with Adreno workarounds to force splitting
compute dispatches into a new cmd buffer, more on this later). However
Godot keeps around a very low amount of command buffers per frame.
The recommended method is to reset the whole pool, to reset all cmd
buffers at once. Hence the new steps would be:
1. Create a command buffer **pool**. One per frame.
2. Create multiple command buffers from the pool in point 1.
3. Call `vkBeginCommandBuffer` on the cmd buffer in point 2, which is
already reset/empty (see step 6).
4. Add commands to the cmd buffers from point 2.
5. Submit those commands.
6. On frame N + 2, recycle the buffer pool and cmd buffers from pt 1 &
2, call `vkResetCommandPool` and repeat from step 3.
**Possible issues:** @dariosamo added `transfer_worker` which creates a
command buffer pool:
```cpp
transfer_worker->command_pool =
driver->command_pool_create(transfer_queue_family,
RDD::COMMAND_BUFFER_TYPE_PRIMARY);
```
As expected, validation was complaining that command buffers were being
reused without being reset (that's good, we now know Validation Layers
will warn us of wrong use).
I fixed it by adding:
```cpp
void RenderingDevice::_wait_for_transfer_worker(TransferWorker
*p_transfer_worker) {
driver->fence_wait(p_transfer_worker->command_fence);
driver->command_pool_reset(p_transfer_worker->command_pool); //
! New line !
```
**Secondary cmd buffers are subject to the same issue but I didn't alter
them. I discussed this with Dario and he is aware of it.**
Secondary cmd buffers are currently disabled due to other issues (it's
disabled on master).
- `bool RenderingDeviceCommons::command_pool_reset_enabled`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
There's no other reason for this boolean. Possibly once it becomes well
tested, the boolean could be removed entirely.
Adds `command_bind_render_uniform_sets` and
`add_draw_list_bind_uniform_sets` (+ compute variants).
It performs the same as `add_draw_list_bind_uniform_set` (notice
singular vs plural), but on multiple consecutive uniform sets, thus
reducing graph and draw call overhead.
- `bool descriptor_set_batching = true;`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
There's no other reason for this boolean. Possibly once it becomes well
tested, the boolean could be removed entirely.
Godot currently does the following:
1. Fill the entire cmd buffer with commands.
2. `submit()`
- Wait with a semaphore for the swapchain.
- Trigger a semaphore to indicate when we're done (so the swapchain
can submit).
3. `present()`
The optimization opportunity here is that 95% of Godot's rendering is
done offscreen.
Then a fullscreen pass copies everything to the swapchain. Godot doesn't
practically render directly to the swapchain.
The problem with this is that the GPU has to wait for the swapchain to
be released **to start anything**, when we could start *much earlier*.
Only the final blit pass must wait for the swapchain.
TheForge changed it to the following (more complicated, I'm simplifying
the idea):
1. Fill the entire cmd buffer with commands.
2. In `screen_prepare_for_drawing` do `submit()`
- There are no semaphore waits for the swapchain.
- Trigger a semaphore to indicate when we're done.
3. Fill a new cmd buffer that only does the final blit to the
swapchain.
4. `submit()`
- Wait with a semaphore for the submit() from step 2.
- Wait with a semaphore for the swapchain (so the swapchain can
submit).
- Trigger a semaphore to indicate when we're done (so the swapchain
can submit).
5. `present()`
Dario discovered this problem independently while working on a different
platform.
**However TheForge's solution had to be rewritten from scratch:** The
complexity to achieve the solution was high and quite difficult to
maintain with the way Godot works now (after Übershaders PR).
But on the other hand, re-implementing the solution became much simpler
because Dario already had to do something similar: To fix an Adreno 730
driver bug, he had to implement splitting command buffers. **This is
exactly what we need!**. Thus it was re-written using this existing
functionality for a new purpose.
To achieve this, I added a new argument, `bool p_split_cmd_buffer`, to
`RenderingDeviceGraph::add_draw_list_begin`, which is only set to true
by `RenderingDevice::draw_list_begin_for_screen`.
The graph will split the draw list into its own command buffer.
- `bool split_swapchain_into_its_own_cmd_buffer = true;`
Setting it to false enforces the old behavior. This might be necessary
for consoles which follow an alternate solution to the same problem.
If not, then we should consider removing it.
PR #90993 added `shader_destroy_modules()` but it was not actually in
use.
This PR adds several places where `shader_destroy_modules()` is called
after initialization to free up memory of SPIR-V structures that are no
longer needed.
2024-11-14 13:03:14 -03:00
|
|
|
p_data.version->variants.write[variant] = RD::get_singleton()->shader_create_from_bytecode_with_samplers(shader_data, p_data.version->variants[variant], immutable_samplers);
|
2024-03-15 14:13:31 -03:00
|
|
|
p_data.version->variant_data.write[variant] = shader_data;
|
2020-02-26 11:28:13 +01:00
|
|
|
}
|
2019-07-29 12:59:18 -03:00
|
|
|
}
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2021-01-05 20:01:50 -03:00
|
|
|
// Returns the generated source text of every variant of the given shader version.
// Each entry of the result holds one stage ("compute") for compute shaders, or two
// stages ("vertex" then "fragment") otherwise. Returns an empty result when
// p_version does not resolve to a known version.
RS::ShaderNativeSourceCode ShaderRD::version_get_native_source_code(RID p_version) {
	Version *version = version_owner.get_or_null(p_version);
	RS::ShaderNativeSourceCode source_code;
	ERR_FAIL_NULL_V(version, source_code);

	MutexLock lock(*version->mutex);

	source_code.versions.resize(variant_defines.size());

	// Builds the code of one stage for one variant and appends it to that variant's stage list.
	auto append_stage = [&](int p_variant, const char *p_stage_name, const auto &p_stage_template) {
		StringBuilder code_builder;
		_build_variant_code(code_builder, p_variant, version, p_stage_template);

		RS::ShaderNativeSourceCode::Version::Stage stage_entry;
		stage_entry.name = p_stage_name;
		stage_entry.code = code_builder.as_string();

		source_code.versions.write[p_variant].stages.push_back(stage_entry);
	};

	for (int variant = 0; variant < source_code.versions.size(); variant++) {
		if (is_compute) {
			append_stage(variant, "compute", stage_templates[STAGE_TYPE_COMPUTE]);
		} else {
			append_stage(variant, "vertex", stage_templates[STAGE_TYPE_VERTEX]);
			append_stage(variant, "fragment", stage_templates[STAGE_TYPE_FRAGMENT]);
		}
	}

	return source_code;
}
|
|
|
|
|
2021-05-24 21:25:11 -03:00
|
|
|
// Computes the SHA-1 (as text) of everything that makes up a version's source:
// uniforms, per-stage globals, the code sections (in alphabetical key order so the
// result does not depend on map iteration order) and the custom defines.
String ShaderRD::_version_get_sha1(Version *p_version) const {
	StringBuilder hash_build;

	hash_build.append("[uniforms]");
	hash_build.append(p_version->uniforms.get_data());
	hash_build.append("[vertex_globals]");
	hash_build.append(p_version->vertex_globals.get_data());
	hash_build.append("[fragment_globals]");
	hash_build.append(p_version->fragment_globals.get_data());
	hash_build.append("[compute_globals]");
	hash_build.append(p_version->compute_globals.get_data());

	// Collect the section names first so they can be sorted before hashing.
	Vector<StringName> section_names;
	for (const KeyValue<StringName, CharString> &section : p_version->code_sections) {
		section_names.push_back(section.key);
	}
	section_names.sort_custom<StringName::AlphCompare>();

	for (int section_idx = 0; section_idx < section_names.size(); section_idx++) {
		const StringName &section_name = section_names[section_idx];
		const String section_tag = String("[code:") + String(section_name) + "]";
		hash_build.append(section_tag);
		hash_build.append(p_version->code_sections[section_name].get_data());
	}

	for (int define_idx = 0; define_idx < p_version->custom_defines.size(); define_idx++) {
		const String define_tag = "[custom_defines:" + itos(define_idx) + "]";
		hash_build.append(define_tag);
		hash_build.append(p_version->custom_defines[define_idx].get_data());
	}

	const String hashed_text = hash_build.as_string();
	return hashed_text.sha1_text();
}
|
|
|
|
|
|
|
|
// Four-byte magic tag written at the start of every shader cache file by _save_to_cache() and checked by _load_from_cache().
static const char *shader_file_header = "GDSC";
|
2025-03-07 18:29:52 -08:00
|
|
|
// Cache format version stored right after the magic tag; _load_from_cache() rejects files whose stored version differs.
static const uint32_t cache_file_version = 4;
|
2021-05-24 21:25:11 -03:00
|
|
|
|
2024-01-31 20:12:48 +01:00
|
|
|
// Builds the cache file path for one group of a version:
// <shader_cache_dir>/<name>/<group sha256>/<version sha1>.<device api name>.cache
// The API name is sanitized for use in a filename and lower-cased.
String ShaderRD::_get_cache_file_path(Version *p_version, int p_group) {
	const String version_hash = _version_get_sha1(p_version);
	const String api_suffix = String(RD::get_singleton()->get_device_api_name()).validate_filename().to_lower();
	const String group_dir = shader_cache_dir.path_join(name).path_join(group_sha256[p_group]);
	return group_dir.path_join(version_hash) + "." + api_suffix + ".cache";
}
|
2021-05-24 21:25:11 -03:00
|
|
|
|
2024-01-31 20:12:48 +01:00
|
|
|
bool ShaderRD::_load_from_cache(Version *p_version, int p_group) {
|
|
|
|
const String &path = _get_cache_file_path(p_version, p_group);
|
2022-03-23 11:08:58 +02:00
|
|
|
Ref<FileAccess> f = FileAccess::open(path, FileAccess::READ);
|
|
|
|
if (f.is_null()) {
|
2021-05-24 21:25:11 -03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
char header[5] = { 0, 0, 0, 0, 0 };
|
|
|
|
f->get_buffer((uint8_t *)header, 4);
|
|
|
|
ERR_FAIL_COND_V(header != String(shader_file_header), false);
|
|
|
|
|
|
|
|
uint32_t file_version = f->get_32();
|
|
|
|
if (file_version != cache_file_version) {
|
|
|
|
return false; // wrong version
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t variant_count = f->get_32();
|
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
ERR_FAIL_COND_V(variant_count != (uint32_t)group_to_variant_map[p_group].size(), false); //should not happen but check
|
2021-05-24 21:25:11 -03:00
|
|
|
|
|
|
|
for (uint32_t i = 0; i < variant_count; i++) {
|
2023-07-18 11:21:27 +02:00
|
|
|
int variant_id = group_to_variant_map[p_group][i];
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
uint32_t variant_size = f->get_32();
|
2023-07-18 11:21:27 +02:00
|
|
|
ERR_FAIL_COND_V(variant_size == 0 && variants_enabled[variant_id], false);
|
|
|
|
if (!variants_enabled[variant_id]) {
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
continue;
|
2021-05-24 21:25:11 -03:00
|
|
|
}
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
Vector<uint8_t> variant_bytes;
|
|
|
|
variant_bytes.resize(variant_size);
|
2021-05-24 21:25:11 -03:00
|
|
|
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
uint32_t br = f->get_buffer(variant_bytes.ptrw(), variant_size);
|
2021-05-24 21:25:11 -03:00
|
|
|
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
ERR_FAIL_COND_V(br != variant_size, false);
|
2021-05-24 21:25:11 -03:00
|
|
|
|
2024-03-15 14:13:31 -03:00
|
|
|
p_version->variant_data.write[variant_id] = variant_bytes;
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
}
|
2021-05-24 21:25:11 -03:00
|
|
|
|
|
|
|
for (uint32_t i = 0; i < variant_count; i++) {
|
2023-07-18 11:21:27 +02:00
|
|
|
int variant_id = group_to_variant_map[p_group][i];
|
|
|
|
if (!variants_enabled[variant_id]) {
|
2024-03-15 14:13:31 -03:00
|
|
|
p_version->variants.write[variant_id] = RID();
|
Implement Binary Shader Compilation
* Added an extra stage before compiling shader, which is generating a binary blob.
* On Vulkan, this allows caching the SPIRV reflection information, which is expensive to parse.
* On other (future) RenderingDevices, it allows caching converted binary data, such as DXIL or MSL.
This PR makes the shader cache include the reflection information, hence editor startup times are significantly improved.
I tested this well and it appears to work, and I added a lot of consistency checks, but because it includes writing and reading binary information, rare bugs may pop up, so be aware.
There was not much of a choice for storing the reflection information, given shaders can be a lot, take a lot of space and take time to parse.
2021-07-25 11:22:55 -03:00
|
|
|
continue;
|
|
|
|
}
|
2021-05-24 21:25:11 -03:00
|
|
|
{
|
Improvements from TheForge (see description)
The work was performed by collaboration of TheForge and Google. I am
merely splitting it up into smaller PRs and cleaning it up.
This is the most "risky" PR so far because the previous ones have been
miscellaneous stuff aimed at either [improve
debugging](https://github.com/godotengine/godot/pull/90993) (e.g. device
lost), [improve Android
experience](https://github.com/godotengine/godot/pull/96439) (add Swappy
for better Frame Pacing + Pre-Transformed Swapchains for slightly better
performance), or harmless [ASTC
improvements](https://github.com/godotengine/godot/pull/96045) (better
performance by simply toggling a feature when available).
However this PR contains larger modifications aimed at improving
performance or reducing memory fragmentation. With greater
modifications, come greater risks of bugs or breakage.
Changes introduced by this PR:
TBDR GPUs (e.g. most of Android + iOS + M1 Apple) support rendering to
Render Targets that are not backed by actual GPU memory (everything
stays in cache). This works as long as load action isn't `LOAD`, and
store action must be `DONT_CARE`. This saves VRAM (it also makes
painfully obvious when a mistake introduces a performance regression).
Of particular usefulness is when doing MSAA and keeping the raw MSAA
content is not necessary.
Some GPUs get faster when the sampler settings are hard-coded into the
GLSL shaders (instead of being dynamically bound at runtime). This
required changes to the GLSL shaders, PSO creation routines, Descriptor
creation routines, and Descriptor binding routines.
- `bool immutable_samplers_enabled = true`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
Immutable samplers requires that the samplers stay... immutable, hence
this boolean is useful if the promise gets broken. We might want to turn
this into a `GLOBAL_DEF` setting.
Instead of creating dozen/hundreds/thousands of `VkDescriptorSet` every
frame that need to be freed individually when they are no longer needed,
they all get freed at once by resetting the whole pool. Once the whole
pool is no longer in use by the GPU, it gets reset and its memory
recycled. Descriptor sets that are created to be kept around for longer
or forever (i.e. not created and freed within the same frame) **must
not** use linear pools. There may be more than one pool per frame. How
many pools per frame Godot ends up with depends on its capacity, and
that is controlled by
`rendering/rendering_device/vulkan/max_descriptors_per_pool`.
- **Possible improvement for later:** It should be possible for Godot
to adapt to how many descriptors per pool are needed on a per-key basis
(i.e. grow their capacity like `std::vector` does) after rendering a few
frames; which would be better than the current solution of having a
single global value for all pools (`max_descriptors_per_pool`) that the
user needs to tweak.
- `bool linear_descriptor_pools_enabled = true`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
Setting it to false is required when workarounding driver bugs (e.g.
Adreno 730).
A ridiculous optimization. Ridiculous because the original code
should've done this in the first place. Previously Godot was doing the
following:
1. Create a command buffer **pool**. One per frame.
2. Create multiple command buffers from the pool in point 1.
3. Call `vkBeginCommandBuffer` on the cmd buffer in point 2. This
resets the cmd buffer because Godot requests the
`VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT` flag.
4. Add commands to the cmd buffers from point 2.
5. Submit those commands.
6. On frame N + 2, recycle the buffer pool and cmd buffers from pt 1 &
2, and repeat from step 3.
The problem here is that step 3 resets each command buffer individually.
Initially Godot used to have 1 cmd buffer per pool, thus the impact is
very low.
But not anymore (specially with Adreno workarounds to force splitting
compute dispatches into a new cmd buffer, more on this later). However
Godot keeps around a very low amount of command buffers per frame.
The recommended method is to reset the whole pool, to reset all cmd
buffers at once. Hence the new steps would be:
1. Create a command buffer **pool**. One per frame.
2. Create multiple command buffers from the pool in point 1.
3. Call `vkBeginCommandBuffer` on the cmd buffer in point 2, which is
already reset/empty (see step 6).
4. Add commands to the cmd buffers from point 2.
5. Submit those commands.
6. On frame N + 2, recycle the buffer pool and cmd buffers from pt 1 &
2, call `vkResetCommandPool` and repeat from step 3.
**Possible issues:** @dariosamo added `transfer_worker` which creates a
command buffer pool:
```cpp
transfer_worker->command_pool =
driver->command_pool_create(transfer_queue_family,
RDD::COMMAND_BUFFER_TYPE_PRIMARY);
```
As expected, validation was complaining that command buffers were being
reused without being reset (that's good, we now know Validation Layers
will warn us of wrong use).
I fixed it by adding:
```cpp
void RenderingDevice::_wait_for_transfer_worker(TransferWorker
*p_transfer_worker) {
driver->fence_wait(p_transfer_worker->command_fence);
driver->command_pool_reset(p_transfer_worker->command_pool); //
! New line !
```
**Secondary cmd buffers are subject to the same issue but I didn't alter
them. I talked this with Dario and he is aware of this.**
Secondary cmd buffers are currently disabled due to other issues (it's
disabled on master).
- `bool RenderingDeviceCommons::command_pool_reset_enabled`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
There's no other reason for this boolean. Possibly once it becomes well
tested, the boolean could be removed entirely.
Adds `command_bind_render_uniform_sets` and
`add_draw_list_bind_uniform_sets` (+ compute variants).
It performs the same as `add_draw_list_bind_uniform_set` (notice
singular vs plural), but on multiple consecutive uniform sets, thus
reducing graph and draw call overhead.
- `bool descriptor_set_batching = true;`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
There's no other reason for this boolean. Possibly once it becomes well
tested, the boolean could be removed entirely.
Godot currently does the following:
1. Fill the entire cmd buffer with commands.
2. `submit()`
- Wait with a semaphore for the swapchain.
- Trigger a semaphore to indicate when we're done (so the swapchain
can submit).
3. `present()`
The optimization opportunity here is that 95% of Godot's rendering is
done offscreen.
Then a fullscreen pass copies everything to the swapchain. Godot doesn't
practically render directly to the swapchain.
The problem with this is that the GPU has to wait for the swapchain to
be released **to start anything**, when we could start *much earlier*.
Only the final blit pass must wait for the swapchain.
TheForge changed it to the following (more complicated, I'm simplifying
the idea):
1. Fill the entire cmd buffer with commands.
2. In `screen_prepare_for_drawing` do `submit()`
- There are no semaphore waits for the swapchain.
- Trigger a semaphore to indicate when we're done.
3. Fill a new cmd buffer that only does the final blit to the
swapchain.
4. `submit()`
- Wait with a semaphore for the submit() from step 2.
- Wait with a semaphore for the swapchain (so the swapchain can
submit).
- Trigger a semaphore to indicate when we're done (so the swapchain
can submit).
5. `present()`
Dario discovered this problem independently while working on a different
platform.
**However TheForge's solution had to be rewritten from scratch:** The
complexity to achieve the solution was high and quite difficult to
maintain with the way Godot works now (after Übershaders PR).
But on the other hand, re-implementing the solution became much simpler
because Dario already had to do something similar: To fix an Adreno 730
driver bug, he had to implement splitting command buffers. **This is
exactly what we need!**. Thus it was re-written using this existing
functionality for a new purpose.
To achieve this, I added a new argument, `bool p_split_cmd_buffer`, to
`RenderingDeviceGraph::add_draw_list_begin`, which is only set to true
by `RenderingDevice::draw_list_begin_for_screen`.
The graph will split the draw list into its own command buffer.
- `bool split_swapchain_into_its_own_cmd_buffer = true;`
Setting it to false enforces the old behavior. This might be necessary
for consoles which follow an alternate solution to the same problem.
If not, then we should consider removing it.
PR #90993 added `shader_destroy_modules()` but it was not actually in
use.
This PR adds several places where `shader_destroy_modules()` is called
after initialization to free up memory of SPIR-V structures that are no
longer needed.
2024-11-14 13:03:14 -03:00
|
|
|
RID shader = RD::get_singleton()->shader_create_from_bytecode_with_samplers(p_version->variant_data[variant_id], p_version->variants[variant_id], immutable_samplers);
|
2023-07-18 11:21:27 +02:00
|
|
|
if (shader.is_null()) {
|
|
|
|
for (uint32_t j = 0; j < i; j++) {
|
|
|
|
int variant_free_id = group_to_variant_map[p_group][j];
|
|
|
|
RD::get_singleton()->free(p_version->variants[variant_free_id]);
|
|
|
|
}
|
|
|
|
ERR_FAIL_COND_V(shader.is_null(), false);
|
|
|
|
}
|
|
|
|
|
2024-03-15 14:13:31 -03:00
|
|
|
p_version->variants.write[variant_id] = shader;
|
2021-05-24 21:25:11 -03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
p_version->valid = true;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
// Writes the variant data of p_group to its cache file.
// Layout: 4-byte magic tag, 32-bit format version, 32-bit variant count, then for each
// variant of the group its data size in bytes (32-bit) followed by the data itself.
void ShaderRD::_save_to_cache(Version *p_version, int p_group) {
	ERR_FAIL_COND(!shader_cache_dir_valid);

	const String cache_path = _get_cache_file_path(p_version, p_group);
	Ref<FileAccess> f = FileAccess::open(cache_path, FileAccess::WRITE);
	ERR_FAIL_COND(f.is_null());

	// Header.
	f->store_buffer((const uint8_t *)shader_file_header, 4);
	f->store_32(cache_file_version);

	// Payload: one size-prefixed blob per variant in the group.
	uint32_t variant_count = group_to_variant_map[p_group].size();
	f->store_32(variant_count);
	for (uint32_t slot = 0; slot < variant_count; slot++) {
		const Vector<uint8_t> &variant_blob = p_version->variant_data[group_to_variant_map[p_group][slot]];
		f->store_32(variant_blob.size()); // Size of the blob in bytes.
		f->store_buffer(variant_blob.ptr(), variant_blob.size());
	}
}
|
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
// Assigns a freshly created placeholder shader RID to every variant slot of p_group.
// Fails (with an error) if the version's variant array is empty.
void ShaderRD::_allocate_placeholders(Version *p_version, int p_group) {
	ERR_FAIL_COND(p_version->variants.is_empty());

	const auto &group_variants = group_to_variant_map[p_group];
	for (int variant_id : group_variants) {
		const RID placeholder = RD::get_singleton()->shader_create_placeholder();
		p_version->variants.write[variant_id] = placeholder;
	}
}
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
// Try to compile all variants for a given group.
|
|
|
|
// Will skip variants that are disabled.
|
2024-03-15 14:13:31 -03:00
|
|
|
void ShaderRD::_compile_version_start(Version *p_version, int p_group) {
|
2023-07-18 11:21:27 +02:00
|
|
|
if (!group_enabled[p_group]) {
|
|
|
|
return;
|
|
|
|
}
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
p_version->dirty = false;
|
|
|
|
|
2024-03-15 14:13:31 -03:00
|
|
|
#if ENABLE_SHADER_CACHE
|
2021-05-24 21:25:11 -03:00
|
|
|
if (shader_cache_dir_valid) {
|
2023-07-18 11:21:27 +02:00
|
|
|
if (_load_from_cache(p_version, p_group)) {
|
2021-05-24 21:25:11 -03:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2024-03-15 14:13:31 -03:00
|
|
|
#endif
|
2021-05-24 21:25:11 -03:00
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
CompileData compile_data;
|
|
|
|
compile_data.version = p_version;
|
|
|
|
compile_data.group = p_group;
|
2019-06-15 23:45:24 -03:00
|
|
|
|
2025-04-08 15:17:04 +10:00
|
|
|
WorkerThreadPool::GroupID group_task = WorkerThreadPool::get_singleton()->add_template_group_task(this, &ShaderRD::_compile_variant, compile_data, group_to_variant_map[p_group].size(), -1, true, SNAME("ShaderCompilation"));
|
2024-03-15 14:13:31 -03:00
|
|
|
p_version->group_compilation_tasks.write[p_group] = group_task;
|
|
|
|
}
|
2022-07-23 19:12:41 +02:00
|
|
|
|
2024-03-15 14:13:31 -03:00
|
|
|
// Waits for the pending compilation task of p_group (if any) and finalizes the result.
// If any enabled variant of the group ended up without a shader, every shader of the
// version is freed and its variant arrays are cleared, leaving the version not valid.
// Otherwise the group is written to the cache (when available) and the version is
// marked valid.
void ShaderRD::_compile_version_end(Version *p_version, int p_group) {
	const bool has_pending_task = p_version->group_compilation_tasks.size() > p_group && p_version->group_compilation_tasks[p_group] != 0;
	if (!has_pending_task) {
		return;
	}

	WorkerThreadPool::GroupID pending_task = p_version->group_compilation_tasks[p_group];
	WorkerThreadPool::get_singleton()->wait_for_group_task_completion(pending_task);
	p_version->group_compilation_tasks.write[p_group] = 0;

	// Look for an enabled variant of this group that has no shader.
	bool compilation_failed = false;
	const auto &group_variants = group_to_variant_map[p_group];
	for (int variant_id : group_variants) {
		if (variants_enabled[variant_id] && p_version->variants[variant_id].is_null()) {
			compilation_failed = true;
			break;
		}
	}

	if (compilation_failed) {
		// Release whatever was created, across all enabled variants and groups.
		for (int variant = 0; variant < variant_defines.size(); variant++) {
			const bool is_active = variants_enabled[variant] && group_enabled[variant_defines[variant].group];
			if (is_active && !p_version->variants[variant].is_null()) {
				RD::get_singleton()->free(p_version->variants[variant]);
			}
		}

		p_version->variants.clear();
		p_version->variant_data.clear();
		return;
	}

#if ENABLE_SHADER_CACHE
	if (shader_cache_dir_valid) {
		_save_to_cache(p_version, p_group);
	}
#endif

	p_version->valid = true;
}
|
|
|
|
|
2024-03-15 14:13:31 -03:00
|
|
|
// Finalizes compilation for every group of the version, blocking on any group
// that still has a compilation task in flight.
void ShaderRD::_compile_ensure_finished(Version *p_version) {
	int group = 0;
	while (group < group_enabled.size()) {
		_compile_version_end(p_version, group);
		group++;
	}
}
|
|
|
|
|
2022-05-13 15:04:37 +02:00
|
|
|
// Replaces the source pieces of a raster (non-compute) shader version.
//
// Errors out if this ShaderRD is a compute shader or if `p_version` is unknown.
// With the version's mutex held, pending compilation is finished first; then the
// uniforms, vertex/fragment globals, code sections (stored under the upper-cased
// section name) and custom defines are overwritten and the version is marked dirty.
// If the version still needs its first initialization it is performed here:
// enabled groups start compiling, disabled groups go through _allocate_placeholders().
void ShaderRD::version_set_code(RID p_version, const HashMap<String, String> &p_code, const String &p_uniforms, const String &p_vertex_globals, const String &p_fragment_globals, const Vector<String> &p_custom_defines) {
	ERR_FAIL_COND(is_compute);

	Version *version = version_owner.get_or_null(p_version);
	ERR_FAIL_NULL(version);

	MutexLock lock(*version->mutex);

	// Nothing may still be compiling from the old source while it is replaced.
	_compile_ensure_finished(version);

	version->uniforms = p_uniforms.utf8();
	version->vertex_globals = p_vertex_globals.utf8();
	version->fragment_globals = p_fragment_globals.utf8();

	version->code_sections.clear();
	for (const KeyValue<String, String> &section : p_code) {
		const StringName section_name = StringName(section.key.to_upper());
		version->code_sections[section_name] = section.value.utf8();
	}

	version->custom_defines.clear();
	for (const String &custom_define : p_custom_defines) {
		version->custom_defines.push_back(custom_define.utf8());
	}

	version->dirty = true;

	if (!version->initialize_needed) {
		return;
	}

	_initialize_version(version);
	for (int group = 0; group < group_enabled.size(); group++) {
		if (group_enabled[group]) {
			_compile_version_start(version, group);
		} else {
			_allocate_placeholders(version, group);
		}
	}
	version->initialize_needed = false;
}
|
|
|
|
|
2022-05-13 15:04:37 +02:00
|
|
|
// Replaces the source pieces of a compute shader version.
//
// Errors out if this ShaderRD is not a compute shader or if `p_version` is unknown.
// With the version's mutex held, pending compilation is finished first; then the
// uniforms, compute globals, code sections (stored under the upper-cased section
// name) and custom defines are overwritten and the version is marked dirty.
// If the version still needs its first initialization it is performed here:
// enabled groups start compiling, disabled groups go through _allocate_placeholders().
void ShaderRD::version_set_compute_code(RID p_version, const HashMap<String, String> &p_code, const String &p_uniforms, const String &p_compute_globals, const Vector<String> &p_custom_defines) {
	ERR_FAIL_COND(!is_compute);

	Version *version = version_owner.get_or_null(p_version);
	ERR_FAIL_NULL(version);

	MutexLock lock(*version->mutex);

	// Nothing may still be compiling from the old source while it is replaced.
	_compile_ensure_finished(version);

	version->uniforms = p_uniforms.utf8();
	version->compute_globals = p_compute_globals.utf8();

	version->code_sections.clear();
	for (const KeyValue<String, String> &section : p_code) {
		const StringName section_name = StringName(section.key.to_upper());
		version->code_sections[section_name] = section.value.utf8();
	}

	version->custom_defines.clear();
	for (const String &custom_define : p_custom_defines) {
		version->custom_defines.push_back(custom_define.utf8());
	}

	version->dirty = true;

	if (!version->initialize_needed) {
		return;
	}

	_initialize_version(version);
	for (int group = 0; group < group_enabled.size(); group++) {
		if (group_enabled[group]) {
			_compile_version_start(version, group);
		} else {
			_allocate_placeholders(version, group);
		}
	}
	version->initialize_needed = false;
}
|
|
|
|
|
2019-07-27 10:23:24 -03:00
|
|
|
// Returns the `valid` flag of `p_version` once all of its groups have finished compiling.
//
// Returns false (with an error) when `p_version` is unknown. If the version is
// dirty it is (re)initialized first: enabled groups start compiling and disabled
// groups go through _allocate_placeholders(). The version's mutex is held for the
// whole call.
bool ShaderRD::version_is_valid(RID p_version) {
	Version *version = version_owner.get_or_null(p_version);
	ERR_FAIL_NULL_V(version, false);

	MutexLock lock(*version->mutex);

	if (version->dirty) {
		_initialize_version(version);
		for (int group = 0; group < group_enabled.size(); group++) {
			if (group_enabled[group]) {
				_compile_version_start(version, group);
			} else {
				_allocate_placeholders(version, group);
			}
		}
	}

	// Block until every group is done so that `valid` reflects the final result.
	_compile_ensure_finished(version);

	const bool is_valid = version->valid;
	return is_valid;
}
|
|
|
|
|
2019-06-15 23:45:24 -03:00
|
|
|
// Frees the shader version identified by `p_version`.
//
// Returns false when `p_version` is not owned by this ShaderRD, true once the
// version has been cleared and released. The version's entry in `version_mutexes`
// is removed first (under `versions_mutex`), then the version is cleared and freed
// while its own mutex is held, and finally that mutex is deleted.
bool ShaderRD::version_free(RID p_version) {
	if (!version_owner.owns(p_version)) {
		return false;
	}

	{
		MutexLock lock(versions_mutex);
		version_mutexes.erase(p_version);
	}

	Version *version = version_owner.get_or_null(p_version);

	// Keep our own copy of the mutex pointer: once version_owner.free() has run,
	// `version` refers to a released object and must not be dereferenced anymore.
	Mutex *version_mutex = version->mutex;

	version_mutex->lock();
	_clear_version(version);
	version_owner.free(p_version);
	version_mutex->unlock();
	memdelete(version_mutex);

	return true;
}
|
|
|
|
|
2020-12-07 18:27:38 -03:00
|
|
|
// Enables or disables the variant at index `p_variant`.
//
// Rejected with an error once any version exists, or when `p_variant` is out of range.
void ShaderRD::set_variant_enabled(int p_variant, bool p_enabled) {
	// Variants may only be toggled before the first version is created.
	ERR_FAIL_COND(version_owner.get_rid_count() > 0); //versions exist
	ERR_FAIL_INDEX(p_variant, variants_enabled.size());

	bool &enabled_slot = variants_enabled.write[p_variant];
	enabled_slot = p_enabled;
}
|
|
|
|
|
|
|
|
// Returns whether the variant at index `p_variant` is enabled.
// An out-of-range index reports an error and yields false.
bool ShaderRD::is_variant_enabled(int p_variant) const {
	ERR_FAIL_INDEX_V(p_variant, variants_enabled.size(), false);

	const bool enabled = variants_enabled[p_variant];
	return enabled;
}
|
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
void ShaderRD::enable_group(int p_group) {
|
|
|
|
ERR_FAIL_INDEX(p_group, group_enabled.size());
|
|
|
|
|
|
|
|
if (group_enabled[p_group]) {
|
|
|
|
// Group already enabled, do nothing.
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
group_enabled.write[p_group] = true;
|
|
|
|
|
|
|
|
// Compile all versions again to include the new group.
|
2025-01-21 21:23:46 +08:00
|
|
|
for (const RID &version_rid : version_owner.get_owned_list()) {
|
|
|
|
Version *version = version_owner.get_or_null(version_rid);
|
2025-04-08 15:17:04 +10:00
|
|
|
version->mutex->lock();
|
2024-03-15 14:13:31 -03:00
|
|
|
_compile_version_start(version, p_group);
|
2025-04-08 15:17:04 +10:00
|
|
|
version->mutex->unlock();
|
2023-07-18 11:21:27 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ShaderRD::is_group_enabled(int p_group) const {
|
|
|
|
return group_enabled[p_group];
|
|
|
|
}
|
|
|
|
|
2021-05-24 21:25:11 -03:00
|
|
|
// Static flag checked by _initialize_cache(); off by default.
bool ShaderRD::shader_cache_cleanup_on_start = false;
|
|
|
|
|
2021-02-02 16:51:36 -03:00
|
|
|
// Builds the base defines that describe the native compute local group size.
// When the device vendor name is "NVIDIA" a group size of 32 (8x4 in 2D) is
// used; every other vendor gets 64 (8x8 in 2D).
ShaderRD::ShaderRD() {
	// Do not feel forced to use this, in most cases it makes little to no difference.
	const bool use_32_threads = RD::get_singleton()->get_device_vendor_name() == "NVIDIA";

	const String base_compute_define_text = use_32_threads
			? String("\n#define NATIVE_LOCAL_GROUP_SIZE 32\n#define NATIVE_LOCAL_SIZE_2D_X 8\n#define NATIVE_LOCAL_SIZE_2D_Y 4\n")
			: String("\n#define NATIVE_LOCAL_GROUP_SIZE 64\n#define NATIVE_LOCAL_SIZE_2D_X 8\n#define NATIVE_LOCAL_SIZE_2D_Y 8\n");

	base_compute_defines = base_compute_define_text.ascii();
}
|
|
|
|
|
Improvements from TheForge (see description)
The work was performed by collaboration of TheForge and Google. I am
merely splitting it up into smaller PRs and cleaning it up.
This is the most "risky" PR so far because the previous ones have been
miscellaneous changes aimed at [improving
debugging](https://github.com/godotengine/godot/pull/90993) (e.g. device
lost), [improving the Android
experience](https://github.com/godotengine/godot/pull/96439) (adding Swappy
for better Frame Pacing + Pre-Transformed Swapchains for slightly better
performance), or harmless [ASTC
improvements](https://github.com/godotengine/godot/pull/96045) (better
performance by simply toggling a feature when available).
However this PR contains larger modifications aimed at improving
performance or reducing memory fragmentation. With greater
modifications, come greater risks of bugs or breakage.
Changes introduced by this PR:
TBDR GPUs (e.g. most of Android + iOS + M1 Apple) support rendering to
Render Targets that are not backed by actual GPU memory (everything
stays in cache). This works as long as load action isn't `LOAD`, and
store action must be `DONT_CARE`. This saves VRAM (it also makes
painfully obvious when a mistake introduces a performance regression).
Of particular usefulness is when doing MSAA and keeping the raw MSAA
content is not necessary.
Some GPUs get faster when the sampler settings are hard-coded into the
GLSL shaders (instead of being dynamically bound at runtime). This
required changes to the GLSL shaders, PSO creation routines, Descriptor
creation routines, and Descriptor binding routines.
- `bool immutable_samplers_enabled = true`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
Immutable samplers require that the samplers stay... immutable, hence
this boolean is useful if that promise gets broken. We might want to turn
this into a `GLOBAL_DEF` setting.
Instead of creating dozen/hundreds/thousands of `VkDescriptorSet` every
frame that need to be freed individually when they are no longer needed,
they all get freed at once by resetting the whole pool. Once the whole
pool is no longer in use by the GPU, it gets reset and its memory
recycled. Descriptor sets that are created to be kept around for longer
or forever (i.e. not created and freed within the same frame) **must
not** use linear pools. There may be more than one pool per frame. How
many pools per frame Godot ends up with depends on its capacity, and
that is controlled by
`rendering/rendering_device/vulkan/max_descriptors_per_pool`.
- **Possible improvement for later:** It should be possible for Godot
to adapt to how many descriptors per pool are needed on a per-key basis
(i.e. grow their capacity like `std::vector` does) after rendering a few
frames; which would be better than the current solution of having a
single global value for all pools (`max_descriptors_per_pool`) that the
user needs to tweak.
- `bool linear_descriptor_pools_enabled = true`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
Setting it to false is required when working around driver bugs (e.g.
Adreno 730).
A ridiculous optimization. Ridiculous because the original code
should've done this in the first place. Previously Godot was doing the
following:
1. Create a command buffer **pool**. One per frame.
2. Create multiple command buffers from the pool in point 1.
3. Call `vkBeginCommandBuffer` on the cmd buffer in point 2. This
resets the cmd buffer because Godot requests the
`VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT` flag.
4. Add commands to the cmd buffers from point 2.
5. Submit those commands.
6. On frame N + 2, recycle the buffer pool and cmd buffers from pt 1 &
2, and repeat from step 3.
The problem here is that step 3 resets each command buffer individually.
Initially Godot had 1 cmd buffer per pool, so the impact was
very low.
That is no longer the case (especially with the Adreno workarounds that force splitting
compute dispatches into a new cmd buffer, more on this later). However,
Godot still keeps only a small number of command buffers per frame.
The recommended method is to reset the whole pool, to reset all cmd
buffers at once. Hence the new steps would be:
1. Create a command buffer **pool**. One per frame.
2. Create multiple command buffers from the pool in point 1.
3. Call `vkBeginCommandBuffer` on the cmd buffer in point 2, which is
already reset/empty (see step 6).
4. Add commands to the cmd buffers from point 2.
5. Submit those commands.
6. On frame N + 2, recycle the buffer pool and cmd buffers from pt 1 &
2, call `vkResetCommandPool` and repeat from step 3.
**Possible issues:** @dariosamo added `transfer_worker` which creates a
command buffer pool:
```cpp
transfer_worker->command_pool =
driver->command_pool_create(transfer_queue_family,
RDD::COMMAND_BUFFER_TYPE_PRIMARY);
```
As expected, validation was complaining that command buffers were being
reused without being reset (that's good, we now know Validation Layers
will warn us of wrong use).
I fixed it by adding:
```cpp
void RenderingDevice::_wait_for_transfer_worker(TransferWorker
*p_transfer_worker) {
driver->fence_wait(p_transfer_worker->command_fence);
driver->command_pool_reset(p_transfer_worker->command_pool); //
! New line !
```
**Secondary cmd buffers are subject to the same issue but I didn't alter
them. I discussed this with Dario and he is aware of it.**
Secondary cmd buffers are currently disabled due to other issues (it's
disabled on master).
- `bool RenderingDeviceCommons::command_pool_reset_enabled`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
There's no other reason for this boolean. Possibly once it becomes well
tested, the boolean could be removed entirely.
Adds `command_bind_render_uniform_sets` and
`add_draw_list_bind_uniform_sets` (+ compute variants).
It performs the same as `add_draw_list_bind_uniform_set` (notice
singular vs plural), but on multiple consecutive uniform sets, thus
reducing graph and draw call overhead.
- `bool descriptor_set_batching = true;`
Setting it to false enforces the old behavior. Useful for debugging bugs
and regressions.
There's no other reason for this boolean. Possibly once it becomes well
tested, the boolean could be removed entirely.
Godot currently does the following:
1. Fill the entire cmd buffer with commands.
2. `submit()`
- Wait with a semaphore for the swapchain.
- Trigger a semaphore to indicate when we're done (so the swapchain
can submit).
3. `present()`
The optimization opportunity here is that 95% of Godot's rendering is
done offscreen.
Then a fullscreen pass copies everything to the swapchain. Godot doesn't
practically render directly to the swapchain.
The problem with this is that the GPU has to wait for the swapchain to
be released **to start anything**, when we could start *much earlier*.
Only the final blit pass must wait for the swapchain.
TheForge changed it to the following (more complicated, I'm simplifying
the idea):
1. Fill the entire cmd buffer with commands.
2. In `screen_prepare_for_drawing` do `submit()`
- There are no semaphore waits for the swapchain.
- Trigger a semaphore to indicate when we're done.
3. Fill a new cmd buffer that only does the final blit to the
swapchain.
4. `submit()`
- Wait with a semaphore for the submit() from step 2.
- Wait with a semaphore for the swapchain (so the swapchain can
submit).
- Trigger a semaphore to indicate when we're done (so the swapchain
can submit).
5. `present()`
Dario discovered this problem independently while working on a different
platform.
**However TheForge's solution had to be rewritten from scratch:** The
complexity to achieve the solution was high and quite difficult to
maintain with the way Godot works now (after Übershaders PR).
But on the other hand, re-implementing the solution became much simpler
because Dario already had to do something similar: To fix an Adreno 730
driver bug, he had to implement splitting command buffers. **This is
exactly what we need!**. Thus it was re-written using this existing
functionality for a new purpose.
To achieve this, I added a new argument, `bool p_split_cmd_buffer`, to
`RenderingDeviceGraph::add_draw_list_begin`, which is only set to true
by `RenderingDevice::draw_list_begin_for_screen`.
The graph will split the draw list into its own command buffer.
- `bool split_swapchain_into_its_own_cmd_buffer = true;`
Setting it to false enforces the old behavior. This might be necessary
for consoles which follow an alternate solution to the same problem.
If not, then we should consider removing it.
PR #90993 added `shader_destroy_modules()` but it was not actually in
use.
This PR adds several places where `shader_destroy_modules()` is called
after initialization to free up memory of SPIR-V structures that are no
longer needed.
2024-11-14 13:03:14 -03:00
|
|
|
void ShaderRD::initialize(const Vector<String> &p_variant_defines, const String &p_general_defines, const Vector<RD::PipelineImmutableSampler> &r_immutable_samplers) {
|
|
|
|
immutable_samplers = r_immutable_samplers;
|
2019-06-15 23:45:24 -03:00
|
|
|
ERR_FAIL_COND(variant_defines.size());
|
2024-01-19 13:21:39 +01:00
|
|
|
ERR_FAIL_COND(p_variant_defines.is_empty());
|
2020-12-07 18:27:38 -03:00
|
|
|
|
2019-07-10 17:44:55 -03:00
|
|
|
general_defines = p_general_defines.utf8();
|
2020-12-07 18:27:38 -03:00
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
// When initialized this way, there is just one group and its always enabled.
|
|
|
|
group_to_variant_map.insert(0, LocalVector<int>{});
|
|
|
|
group_enabled.push_back(true);
|
|
|
|
|
2019-06-15 23:45:24 -03:00
|
|
|
for (int i = 0; i < p_variant_defines.size(); i++) {
|
2023-07-18 11:21:27 +02:00
|
|
|
variant_defines.push_back(VariantDefine(0, p_variant_defines[i], true));
|
2020-12-07 18:27:38 -03:00
|
|
|
variants_enabled.push_back(true);
|
2024-03-15 14:13:31 -03:00
|
|
|
variant_to_group.push_back(0);
|
2023-07-18 11:21:27 +02:00
|
|
|
group_to_variant_map[0].push_back(i);
|
2019-06-15 23:45:24 -03:00
|
|
|
}
|
2021-05-24 21:25:11 -03:00
|
|
|
|
2021-12-09 03:42:46 -06:00
|
|
|
if (!shader_cache_dir.is_empty()) {
|
2023-07-18 11:21:27 +02:00
|
|
|
group_sha256.resize(1);
|
|
|
|
_initialize_cache();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void ShaderRD::_initialize_cache() {
|
|
|
|
for (const KeyValue<int, LocalVector<int>> &E : group_to_variant_map) {
|
2021-05-24 21:25:11 -03:00
|
|
|
StringBuilder hash_build;
|
|
|
|
|
|
|
|
hash_build.append("[base_hash]");
|
|
|
|
hash_build.append(base_sha256);
|
|
|
|
hash_build.append("[general_defines]");
|
|
|
|
hash_build.append(general_defines.get_data());
|
2023-07-18 11:21:27 +02:00
|
|
|
hash_build.append("[group_id]");
|
|
|
|
hash_build.append(itos(E.key));
|
|
|
|
for (uint32_t i = 0; i < E.value.size(); i++) {
|
|
|
|
hash_build.append("[variant_defines:" + itos(E.value[i]) + "]");
|
|
|
|
hash_build.append(variant_defines[E.value[i]].text.get_data());
|
2021-05-24 21:25:11 -03:00
|
|
|
}
|
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
group_sha256[E.key] = hash_build.as_string().sha256_text();
|
2021-05-24 21:25:11 -03:00
|
|
|
|
2022-03-23 11:08:58 +02:00
|
|
|
Ref<DirAccess> d = DirAccess::open(shader_cache_dir);
|
|
|
|
ERR_FAIL_COND(d.is_null());
|
2021-05-24 21:25:11 -03:00
|
|
|
if (d->change_dir(name) != OK) {
|
|
|
|
Error err = d->make_dir(name);
|
|
|
|
ERR_FAIL_COND(err != OK);
|
|
|
|
d->change_dir(name);
|
|
|
|
}
|
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
// Erase other versions?
|
2021-05-24 21:25:11 -03:00
|
|
|
if (shader_cache_cleanup_on_start) {
|
|
|
|
}
|
|
|
|
//
|
2023-07-18 11:21:27 +02:00
|
|
|
if (d->change_dir(group_sha256[E.key]) != OK) {
|
|
|
|
Error err = d->make_dir(group_sha256[E.key]);
|
2021-05-24 21:25:11 -03:00
|
|
|
ERR_FAIL_COND(err != OK);
|
|
|
|
}
|
|
|
|
shader_cache_dir_valid = true;
|
|
|
|
|
2023-07-18 11:21:27 +02:00
|
|
|
print_verbose("Shader '" + name + "' (group " + itos(E.key) + ") SHA256: " + group_sha256[E.key]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Same as above, but allows specifying shader compilation groups.
|
|
|
|
void ShaderRD::initialize(const Vector<VariantDefine> &p_variant_defines, const String &p_general_defines) {
|
|
|
|
ERR_FAIL_COND(variant_defines.size());
|
2024-01-19 13:21:39 +01:00
|
|
|
ERR_FAIL_COND(p_variant_defines.is_empty());
|
2023-07-18 11:21:27 +02:00
|
|
|
|
|
|
|
general_defines = p_general_defines.utf8();
|
|
|
|
|
|
|
|
int max_group_id = 0;
|
|
|
|
|
|
|
|
for (int i = 0; i < p_variant_defines.size(); i++) {
|
|
|
|
// Fill variant array.
|
|
|
|
variant_defines.push_back(p_variant_defines[i]);
|
|
|
|
variants_enabled.push_back(true);
|
2024-03-15 14:13:31 -03:00
|
|
|
variant_to_group.push_back(p_variant_defines[i].group);
|
2023-07-18 11:21:27 +02:00
|
|
|
|
|
|
|
// Map variant array index to group id, so we can iterate over groups later.
|
|
|
|
if (!group_to_variant_map.has(p_variant_defines[i].group)) {
|
|
|
|
group_to_variant_map.insert(p_variant_defines[i].group, LocalVector<int>{});
|
|
|
|
}
|
|
|
|
group_to_variant_map[p_variant_defines[i].group].push_back(i);
|
|
|
|
|
|
|
|
// Track max size.
|
|
|
|
if (p_variant_defines[i].group > max_group_id) {
|
|
|
|
max_group_id = p_variant_defines[i].group;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set all to groups to false, then enable those that should be default.
|
|
|
|
group_enabled.resize_zeroed(max_group_id + 1);
|
|
|
|
bool *enabled_ptr = group_enabled.ptrw();
|
|
|
|
for (int i = 0; i < p_variant_defines.size(); i++) {
|
|
|
|
if (p_variant_defines[i].default_enabled) {
|
|
|
|
enabled_ptr[p_variant_defines[i].group] = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!shader_cache_dir.is_empty()) {
|
|
|
|
group_sha256.resize(max_group_id + 1);
|
|
|
|
_initialize_cache();
|
2021-05-24 21:25:11 -03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Sets the static shader cache directory path.
// initialize() only sets up the on-disk cache when this path is non-empty.
void ShaderRD::set_shader_cache_dir(const String &p_dir) {
	ShaderRD::shader_cache_dir = p_dir;
}
|
|
|
|
|
|
|
|
// Stores `p_enable` in the static `shader_cache_save_compressed` flag.
void ShaderRD::set_shader_cache_save_compressed(bool p_enable) {
	ShaderRD::shader_cache_save_compressed = p_enable;
}
|
|
|
|
|
2021-05-24 21:25:11 -03:00
|
|
|
// Stores `p_enable` in the static `shader_cache_save_compressed_zstd` flag.
void ShaderRD::set_shader_cache_save_compressed_zstd(bool p_enable) {
	ShaderRD::shader_cache_save_compressed_zstd = p_enable;
}
|
|
|
|
|
|
|
|
// Stores `p_enable` in the static `shader_cache_save_debug` flag.
void ShaderRD::set_shader_cache_save_debug(bool p_enable) {
	ShaderRD::shader_cache_save_debug = p_enable;
}
|
|
|
|
|
|
|
|
// Static shader cache directory path; empty until set_shader_cache_dir() is called.
String ShaderRD::shader_cache_dir;
|
|
|
|
// Static flag written by set_shader_cache_save_compressed(); on by default.
bool ShaderRD::shader_cache_save_compressed = true;
|
|
|
|
// Static flag written by set_shader_cache_save_compressed_zstd(); on by default.
bool ShaderRD::shader_cache_save_compressed_zstd = true;
|
|
|
|
// Static flag written by set_shader_cache_save_debug(); on by default.
bool ShaderRD::shader_cache_save_debug = true;
|
|
|
|
|
2019-06-15 23:45:24 -03:00
|
|
|
// Frees every version that is still owned at destruction time.
// Leftover versions are reported with an error before they are released.
ShaderRD::~ShaderRD() {
	LocalVector<RID> remaining = version_owner.get_owned_list();
	if (remaining.is_empty()) {
		return;
	}

	ERR_PRINT(itos(remaining.size()) + " shaders of type " + name + " were never freed");
	for (const RID &leftover_rid : remaining) {
		version_free(leftover_rid);
	}
}
|