mirror of https://github.com/godotengine/godot.git
synced 2025-10-19 07:53:26 +00:00

Apple: Use image atomic operations on supported Apple hardware

Co-authored-by: A Thousand Ships <96648715+AThousandShips@users.noreply.github.com>
parent 9b22b41531
commit 5230f6c60c

32 changed files with 5354 additions and 719 deletions
@@ -2490,6 +2490,9 @@
    <constant name="SUPPORTS_BUFFER_DEVICE_ADDRESS" value="6" enum="Features">
        Features support for buffer device address extension.
    </constant>
    <constant name="SUPPORTS_IMAGE_ATOMIC_32_BIT" value="7" enum="Features">
        Support for 32-bit image atomic operations.
    </constant>
    <constant name="LIMIT_MAX_BOUND_UNIFORM_SETS" value="0" enum="Limit">
        Maximum number of uniform sets that can be bound at a given time.
    </constant>
drivers/apple/foundation_helpers.h (new file, 56 lines)
@@ -0,0 +1,56 @@
/**************************************************************************/
/*  foundation_helpers.h                                                  */
/**************************************************************************/
/*                         This file is part of:                          */
/*                             GODOT ENGINE                               */
/*                        https://godotengine.org                         */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
/*                                                                        */
/* Permission is hereby granted, free of charge, to any person obtaining  */
/* a copy of this software and associated documentation files (the        */
/* "Software"), to deal in the Software without restriction, including    */
/* without limitation the rights to use, copy, modify, merge, publish,    */
/* distribute, sublicense, and/or sell copies of the Software, and to     */
/* permit persons to whom the Software is furnished to do so, subject to  */
/* the following conditions:                                              */
/*                                                                        */
/* The above copyright notice and this permission notice shall be         */
/* included in all copies or substantial portions of the Software.        */
/*                                                                        */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
/**************************************************************************/

#pragma once

#import <Foundation/NSString.h>

class String;
template <typename T>
class CharStringT;

using CharString = CharStringT<char>;

namespace conv {

/**
 * Converts a Godot String to an NSString without allocating an intermediate UTF-8 buffer.
 */
NSString *to_nsstring(const String &p_str);
/**
 * Converts a Godot CharString to an NSString without allocating an intermediate UTF-8 buffer.
 */
NSString *to_nsstring(const CharString &p_str);
/**
 * Converts an NSString to a Godot String without allocating intermediate buffers.
 */
String to_string(NSString *p_str);

} //namespace conv
drivers/apple/foundation_helpers.mm (new file, 85 lines)
@@ -0,0 +1,85 @@
/**************************************************************************/
/*  foundation_helpers.mm                                                 */
/**************************************************************************/
/*                         This file is part of:                          */
/*                             GODOT ENGINE                               */
/*                        https://godotengine.org                         */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
/*                                                                        */
/* Permission is hereby granted, free of charge, to any person obtaining  */
/* a copy of this software and associated documentation files (the        */
/* "Software"), to deal in the Software without restriction, including    */
/* without limitation the rights to use, copy, modify, merge, publish,    */
/* distribute, sublicense, and/or sell copies of the Software, and to     */
/* permit persons to whom the Software is furnished to do so, subject to  */
/* the following conditions:                                              */
/*                                                                        */
/* The above copyright notice and this permission notice shall be         */
/* included in all copies or substantial portions of the Software.        */
/*                                                                        */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
/**************************************************************************/

#import "foundation_helpers.h"

#import "core/string/ustring.h"

#import <CoreFoundation/CFString.h>

namespace conv {

NSString *to_nsstring(const String &p_str) {
    return [[NSString alloc] initWithBytes:(const void *)p_str.ptr()
                                    length:p_str.length() * sizeof(char32_t)
                                  encoding:NSUTF32LittleEndianStringEncoding];
}

NSString *to_nsstring(const CharString &p_str) {
    return [[NSString alloc] initWithBytes:(const void *)p_str.ptr()
                                    length:p_str.length()
                                  encoding:NSUTF8StringEncoding];
}

String to_string(NSString *p_str) {
    CFStringRef str = (__bridge CFStringRef)p_str;
    CFStringEncoding fastest = CFStringGetFastestEncoding(str);
    // Sometimes, CFString will return a pointer to its encoded data,
    // so we can create the string without allocating intermediate buffers.
    const char *p = CFStringGetCStringPtr(str, fastest);
    if (p) {
        switch (fastest) {
            case kCFStringEncodingASCII:
                return String::ascii(Span(p, CFStringGetLength(str)));
            case kCFStringEncodingUTF8:
                return String::utf8(p);
            case kCFStringEncodingUTF32LE:
                return String::utf32(Span((char32_t *)p, CFStringGetLength(str)));
            default:
                break;
        }
    }

    CFRange range = CFRangeMake(0, CFStringGetLength(str));
    CFIndex byte_len = 0;
    // Try to losslessly convert the string directly into a String's buffer to avoid intermediate allocations.
    CFIndex n = CFStringGetBytes(str, range, kCFStringEncodingUTF32LE, 0, NO, nil, 0, &byte_len);
    if (n == range.length) {
        String res;
        res.resize_uninitialized((byte_len / sizeof(char32_t)) + 1);
        res[n] = 0;
        n = CFStringGetBytes(str, range, kCFStringEncodingUTF32LE, 0, NO, (UInt8 *)res.ptrw(), res.length() * sizeof(char32_t), nil);
        return res;
    }

    return String::utf8(p_str.UTF8String);
}

} //namespace conv
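For orientation, a hypothetical round trip through these helpers (an illustrative snippet, not part of the commit):

// Hypothetical usage (Objective-C++): to_nsstring() hands the UTF-32 buffer to
// Foundation directly, and to_string() takes the zero-copy path whenever
// CFString exposes its internal storage.
String label = U"Fog density map";
NSString *ns = conv::to_nsstring(label);
String back = conv::to_string(ns);
DEV_ASSERT(back == label);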
@@ -5586,6 +5586,8 @@ bool RenderingDeviceDriverD3D12::has_feature(Features p_feature) {
            return true;
        case SUPPORTS_BUFFER_DEVICE_ADDRESS:
            return true;
        case SUPPORTS_IMAGE_ATOMIC_32_BIT:
            return true;
        default:
            return false;
    }
@@ -12,7 +12,6 @@ thirdparty_obj = []
thirdparty_dir = "#thirdparty/spirv-cross/"
thirdparty_sources = [
    "spirv_cfg.cpp",
    "spirv_cross_util.cpp",
    "spirv_cross.cpp",
    "spirv_parser.cpp",
    "spirv_msl.cpp",
@@ -94,6 +94,8 @@ struct API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MetalFeatures {
    bool metal_fx_spatial = false; /**< If true, Metal FX spatial functions are supported. */
    bool metal_fx_temporal = false; /**< If true, Metal FX temporal functions are supported. */
    bool supports_gpu_address = false; /**< If true, referencing a GPU address in a shader is supported. */
    bool supports_image_atomic_32_bit = false; /**< If true, 32-bit atomic operations on images are supported. */
    bool supports_image_atomic_64_bit = false; /**< If true, 64-bit atomic operations on images are supported. */
};

struct MetalLimits {
@@ -121,6 +121,12 @@ void MetalDeviceProperties::init_features(id<MTLDevice> p_device) {
    features.simdPermute = [p_device supportsFamily:MTLGPUFamilyApple6];
    features.simdReduction = [p_device supportsFamily:MTLGPUFamilyApple7];
    features.argument_buffers_tier = p_device.argumentBuffersSupport;
    features.supports_image_atomic_32_bit = [p_device supportsFamily:MTLGPUFamilyApple6];
    features.supports_image_atomic_64_bit = [p_device supportsFamily:MTLGPUFamilyApple8];
    if (OS::get_singleton()->get_environment("GODOT_MTL_DISABLE_IMAGE_ATOMICS") == "1") {
        features.supports_image_atomic_32_bit = false;
        features.supports_image_atomic_64_bit = false;
    }

    if (@available(macOS 13.0, iOS 16.0, tvOS 16.0, *)) {
        features.needs_arg_encoders = !([p_device supportsFamily:MTLGPUFamilyMetal3] && features.argument_buffers_tier == MTLArgumentBuffersTier2);
@@ -309,9 +309,23 @@ public:

class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDCommandBuffer {
private:
#pragma mark - Common State

    // From RenderingDevice
    static constexpr uint32_t MAX_PUSH_CONSTANT_SIZE = 128;

    RenderingDeviceDriverMetal *device_driver = nullptr;
    id<MTLCommandQueue> queue = nil;
    id<MTLCommandBuffer> commandBuffer = nil;
    bool state_begin = false;

    _FORCE_INLINE_ id<MTLCommandBuffer> command_buffer() {
        DEV_ASSERT(state_begin);
        if (commandBuffer == nil) {
            commandBuffer = queue.commandBuffer;
        }
        return commandBuffer;
    }

    void _end_compute_dispatch();
    void _end_blit();

@@ -326,6 +340,11 @@ private:
    void _end_render_pass();
    void _render_clear_render_area();

#pragma mark - Compute

    void _compute_set_dirty_state();
    void _compute_bind_uniform_sets();

public:
    MDCommandBufferStateType type = MDCommandBufferStateType::None;

@@ -349,18 +368,18 @@ public:
    LocalVector<NSUInteger> vertex_offsets;
    ResourceUsageMap resource_usage;
    // clang-format off
    enum DirtyFlag: uint8_t {
        DIRTY_NONE     = 0b0000'0000,
        DIRTY_PIPELINE = 0b0000'0001, //! pipeline state
        DIRTY_UNIFORMS = 0b0000'0010, //! uniform sets
        DIRTY_DEPTH    = 0b0000'0100, //! depth / stencil state
        DIRTY_VERTEX   = 0b0000'1000, //! vertex buffers
        DIRTY_VIEWPORT = 0b0001'0000, //! viewport rectangles
        DIRTY_SCISSOR  = 0b0010'0000, //! scissor rectangles
        DIRTY_BLEND    = 0b0100'0000, //! blend state
        DIRTY_RASTER   = 0b1000'0000, //! encoder state like cull mode

        DIRTY_ALL      = 0xff,
    enum DirtyFlag: uint16_t {
        DIRTY_NONE     = 0,
        DIRTY_PIPELINE = 1 << 0, //! pipeline state
        DIRTY_UNIFORMS = 1 << 1, //! uniform sets
        DIRTY_PUSH     = 1 << 2, //! push constants
        DIRTY_DEPTH    = 1 << 3, //! depth / stencil state
        DIRTY_VERTEX   = 1 << 4, //! vertex buffers
        DIRTY_VIEWPORT = 1 << 5, //! viewport rectangles
        DIRTY_SCISSOR  = 1 << 6, //! scissor rectangles
        DIRTY_BLEND    = 1 << 7, //! blend state
        DIRTY_RASTER   = 1 << 8, //! encoder state like cull mode
        DIRTY_ALL      = (1 << 9) - 1,
    };
    // clang-format on
    BitField<DirtyFlag> dirty = DIRTY_NONE;
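The old 8-bit mask was full: adding DIRTY_PUSH brings the flag count to nine, which is why the enum widens to uint16_t and DIRTY_ALL becomes (1 << 9) - 1. A standalone sketch of the bookkeeping pattern (plain C++, using a raw integer in place of the engine's BitField):

#include <cassert>
#include <cstdint>

enum DirtyFlag : uint16_t {
    DIRTY_NONE = 0,
    DIRTY_PIPELINE = 1 << 0,
    DIRTY_UNIFORMS = 1 << 1,
    DIRTY_PUSH = 1 << 2,
    // ...remaining flags elided...
    DIRTY_ALL = (1 << 9) - 1,
};

int main() {
    uint16_t dirty = DIRTY_NONE;
    dirty |= DIRTY_PUSH; // encode_push_constant_data() defers the upload
    assert((dirty & DIRTY_PUSH) != 0);
    dirty &= uint16_t(~DIRTY_PUSH); // cleared once the encoder has the bytes
    assert(dirty == DIRTY_NONE);
    return 0;
}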
|
@ -368,6 +387,9 @@ public:
|
|||
LocalVector<MDUniformSet *> uniform_sets;
|
||||
// Bit mask of the uniform sets that are dirty, to prevent redundant binding.
|
||||
uint64_t uniform_set_mask = 0;
|
||||
uint8_t push_constant_data[MAX_PUSH_CONSTANT_SIZE];
|
||||
uint32_t push_constant_data_len = 0;
|
||||
uint32_t push_constant_bindings[2] = { 0 };
|
||||
|
||||
_FORCE_INLINE_ void reset();
|
||||
void end_encoding();
|
||||
|
@@ -422,6 +444,13 @@ public:
        dirty.set_flag(DirtyFlag::DIRTY_UNIFORMS);
    }

    _FORCE_INLINE_ void mark_push_constants_dirty() {
        if (push_constant_data_len == 0) {
            return;
        }
        dirty.set_flag(DirtyFlag::DIRTY_PUSH);
    }

    _FORCE_INLINE_ void mark_blend_dirty() {
        if (!blend_constants.has_value()) {
            return;
@@ -464,16 +493,46 @@ public:
        MDComputePipeline *pipeline = nullptr;
        id<MTLComputeCommandEncoder> encoder = nil;
        ResourceUsageMap resource_usage;
        _FORCE_INLINE_ void reset() {
            pipeline = nil;
            encoder = nil;
            // Keep the keys, as they are likely to be used again.
            for (KeyValue<StageResourceUsage, LocalVector<__unsafe_unretained id<MTLResource>>> &kv : resource_usage) {
                kv.value.clear();
        // clang-format off
        enum DirtyFlag: uint16_t {
            DIRTY_NONE     = 0,
            DIRTY_PIPELINE = 1 << 0, //! pipeline state
            DIRTY_UNIFORMS = 1 << 1, //! uniform sets
            DIRTY_PUSH     = 1 << 2, //! push constants
            DIRTY_ALL      = (1 << 3) - 1,
        };
        // clang-format on
        BitField<DirtyFlag> dirty = DIRTY_NONE;

        LocalVector<MDUniformSet *> uniform_sets;
        // Bit mask of the uniform sets that are dirty, to prevent redundant binding.
        uint64_t uniform_set_mask = 0;
        uint8_t push_constant_data[MAX_PUSH_CONSTANT_SIZE];
        uint32_t push_constant_data_len = 0;
        uint32_t push_constant_bindings[1] = { 0 };

        _FORCE_INLINE_ void reset();
        void end_encoding();

        _FORCE_INLINE_ void mark_uniforms_dirty(void) {
            if (uniform_sets.is_empty()) {
                return;
            }
            for (uint32_t i = 0; i < uniform_sets.size(); i++) {
                if (uniform_sets[i] != nullptr) {
                    uniform_set_mask |= 1 << i;
                }
            }
            dirty.set_flag(DirtyFlag::DIRTY_UNIFORMS);
        }

        _FORCE_INLINE_ void mark_push_constants_dirty() {
            if (push_constant_data_len == 0) {
                return;
            }
            dirty.set_flag(DirtyFlag::DIRTY_PUSH);
        }

        void end_encoding();
    } compute;

    // State specific to a blit pass.
@@ -496,6 +555,7 @@ public:
    void encodeRenderCommandEncoderWithDescriptor(MTLRenderPassDescriptor *p_desc, NSString *p_label);

    void bind_pipeline(RDD::PipelineID p_pipeline);
    void encode_push_constant_data(RDD::ShaderID p_shader, VectorView<uint32_t> p_data);

#pragma mark - Render Commands

@@ -661,8 +721,6 @@ public:
    Vector<UniformSet> sets;
    bool uses_argument_buffers = true;

    virtual void encode_push_constant_data(VectorView<uint32_t> p_data, MDCommandBuffer *p_cb) = 0;

    MDShader(CharString p_name, Vector<UniformSet> p_sets, bool p_uses_argument_buffers) :
            name(p_name), sets(p_sets), uses_argument_buffers(p_uses_argument_buffers) {}
    virtual ~MDShader() = default;
@@ -671,15 +729,13 @@ public:
class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDComputeShader final : public MDShader {
public:
    struct {
        uint32_t binding = -1;
        int32_t binding = -1;
        uint32_t size = 0;
    } push_constants;
    MTLSize local = {};

    MDLibrary *kernel;

    void encode_push_constant_data(VectorView<uint32_t> p_data, MDCommandBuffer *p_cb) final;

    MDComputeShader(CharString p_name, Vector<UniformSet> p_sets, bool p_uses_argument_buffers, MDLibrary *p_kernel);
};

@@ -700,8 +756,6 @@ public:
    MDLibrary *vert;
    MDLibrary *frag;

    void encode_push_constant_data(VectorView<uint32_t> p_data, MDCommandBuffer *p_cb) final;

    MDRenderShader(CharString p_name,
            Vector<UniformSet> p_sets,
            bool p_needs_view_mask_buffer,
@@ -62,8 +62,8 @@
#undef MAX

void MDCommandBuffer::begin() {
    DEV_ASSERT(commandBuffer == nil);
    commandBuffer = queue.commandBuffer;
    DEV_ASSERT(commandBuffer == nil && !state_begin);
    state_begin = true;
}

void MDCommandBuffer::end() {
@@ -83,6 +83,7 @@ void MDCommandBuffer::commit() {
    end();
    [commandBuffer commit];
    commandBuffer = nil;
    state_begin = false;
}

void MDCommandBuffer::bind_pipeline(RDD::PipelineID p_pipeline) {
@@ -136,7 +137,7 @@ void MDCommandBuffer::bind_pipeline(RDD::PipelineID p_pipeline) {
            render.desc.colorAttachments[0].resolveTexture = res_tex;
        }
#endif
        render.encoder = [commandBuffer renderCommandEncoderWithDescriptor:render.desc];
        render.encoder = [command_buffer() renderCommandEncoderWithDescriptor:render.desc];
    }

    if (render.pipeline != rp) {
@@ -160,9 +161,44 @@ void MDCommandBuffer::bind_pipeline(RDD::PipelineID p_pipeline) {
        DEV_ASSERT(type == MDCommandBufferStateType::None);
        type = MDCommandBufferStateType::Compute;

        compute.pipeline = (MDComputePipeline *)p;
        compute.encoder = commandBuffer.computeCommandEncoder;
        [compute.encoder setComputePipelineState:compute.pipeline->state];
        if (compute.pipeline != p) {
            compute.dirty.set_flag(ComputeState::DIRTY_PIPELINE);
            compute.mark_uniforms_dirty();
            compute.pipeline = (MDComputePipeline *)p;
        }
    }
}

void MDCommandBuffer::encode_push_constant_data(RDD::ShaderID p_shader, VectorView<uint32_t> p_data) {
    switch (type) {
        case MDCommandBufferStateType::Render: {
            MDRenderShader *shader = (MDRenderShader *)(p_shader.id);
            if (shader->push_constants.vert.binding == -1 && shader->push_constants.frag.binding == -1) {
                return;
            }
            render.push_constant_bindings[0] = shader->push_constants.vert.binding;
            render.push_constant_bindings[1] = shader->push_constants.frag.binding;
            void const *ptr = p_data.ptr();
            render.push_constant_data_len = p_data.size() * sizeof(uint32_t);
            DEV_ASSERT(render.push_constant_data_len <= sizeof(RenderState::push_constant_data));
            memcpy(render.push_constant_data, ptr, render.push_constant_data_len);
            render.mark_push_constants_dirty();
        } break;
        case MDCommandBufferStateType::Compute: {
            MDComputeShader *shader = (MDComputeShader *)(p_shader.id);
            if (shader->push_constants.binding == -1) {
                return;
            }
            compute.push_constant_bindings[0] = shader->push_constants.binding;
            void const *ptr = p_data.ptr();
            compute.push_constant_data_len = p_data.size() * sizeof(uint32_t);
            DEV_ASSERT(compute.push_constant_data_len <= sizeof(ComputeState::push_constant_data));
            memcpy(compute.push_constant_data, ptr, compute.push_constant_data_len);
            compute.mark_push_constants_dirty();
        } break;
        case MDCommandBufferStateType::Blit:
        case MDCommandBufferStateType::None:
            return;
    }
}

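The effect of this change: push constants are no longer written to an encoder at bind time. The bytes and their stage bindings are cached on the render/compute state and flushed by _render_set_dirty_state() / _compute_set_dirty_state() just before a draw or dispatch. A minimal model of that deferred flow (plain C++; the types and upload() call are placeholders, not Godot API):

#include <cstdint>
#include <cstring>

struct PushState {
    uint8_t data[128]; // mirrors MAX_PUSH_CONSTANT_SIZE
    uint32_t len = 0;
    bool dirty = false;

    void set(const void *p, uint32_t n) {
        memcpy(data, p, n);
        len = n;
        dirty = true; // nothing touches the encoder yet
    }

    template <typename F>
    void flush(F upload) {
        if (dirty && len > 0) {
            upload(data, len); // e.g. the setBytes:length:atIndex: calls above
            dirty = false;
        }
    }
};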
@@ -181,7 +217,7 @@ id<MTLBlitCommandEncoder> MDCommandBuffer::blit_command_encoder() {
    }

    type = MDCommandBufferStateType::Blit;
    blit.encoder = commandBuffer.blitCommandEncoder;
    blit.encoder = command_buffer().blitCommandEncoder;
    return blit.encoder;
}

@@ -200,7 +236,7 @@ void MDCommandBuffer::encodeRenderCommandEncoderWithDescriptor(MTLRenderPassDesc
            break;
    }

    id<MTLRenderCommandEncoder> enc = [commandBuffer renderCommandEncoderWithDescriptor:p_desc];
    id<MTLRenderCommandEncoder> enc = [command_buffer() renderCommandEncoderWithDescriptor:p_desc];
    if (p_label != nil) {
        [enc pushDebugGroup:p_label];
        [enc popDebugGroup];
@@ -344,6 +380,19 @@ void MDCommandBuffer::render_clear_attachments(VectorView<RDD::AttachmentClear>
void MDCommandBuffer::_render_set_dirty_state() {
    _render_bind_uniform_sets();

    if (render.dirty.has_flag(RenderState::DIRTY_PUSH)) {
        if (render.push_constant_bindings[0] != (uint32_t)-1) {
            [render.encoder setVertexBytes:render.push_constant_data
                                    length:render.push_constant_data_len
                                   atIndex:render.push_constant_bindings[0]];
        }
        if (render.push_constant_bindings[1] != (uint32_t)-1) {
            [render.encoder setFragmentBytes:render.push_constant_data
                                      length:render.push_constant_data_len
                                     atIndex:render.push_constant_bindings[1]];
        }
    }

    MDSubpass const &subpass = render.get_subpass();
    if (subpass.view_count > 1) {
        uint32_t view_range[2] = { 0, subpass.view_count };
@@ -552,7 +601,7 @@ uint32_t MDCommandBuffer::_populate_vertices(simd::float4 *p_vertices, uint32_t
}

void MDCommandBuffer::render_begin_pass(RDD::RenderPassID p_render_pass, RDD::FramebufferID p_frameBuffer, RDD::CommandBufferType p_cmd_buffer_type, const Rect2i &p_rect, VectorView<RDD::RenderPassClearValue> p_clear_values) {
    DEV_ASSERT(commandBuffer != nil);
    DEV_ASSERT(command_buffer() != nil);
    end();

    MDRenderPass *pass = (MDRenderPass *)(p_render_pass.id);
@@ -639,7 +688,7 @@ void MDCommandBuffer::_render_clear_render_area() {
}

void MDCommandBuffer::render_next_subpass() {
    DEV_ASSERT(commandBuffer != nil);
    DEV_ASSERT(command_buffer() != nil);

    if (render.current_subpass == UINT32_MAX) {
        render.current_subpass = 0;
@@ -726,7 +775,7 @@ void MDCommandBuffer::render_next_subpass() {
        // the defaultRasterSampleCount from the pipeline's sample count.
        render.desc = desc;
    } else {
        render.encoder = [commandBuffer renderCommandEncoderWithDescriptor:desc];
        render.encoder = [command_buffer() renderCommandEncoderWithDescriptor:desc];

        if (!render.is_rendering_entire_area) {
            _render_clear_render_area();
@@ -895,6 +944,7 @@ void MDCommandBuffer::RenderState::reset() {
    dirty = DIRTY_NONE;
    uniform_sets.clear();
    uniform_set_mask = 0;
    push_constant_data_len = 0;
    clear_values.clear();
    viewports.clear();
    scissors.clear();
@@ -960,29 +1010,108 @@ void MDCommandBuffer::ComputeState::end_encoding() {

#pragma mark - Compute

void MDCommandBuffer::_compute_set_dirty_state() {
    if (compute.dirty.has_flag(ComputeState::DIRTY_PIPELINE)) {
        compute.encoder = [command_buffer() computeCommandEncoderWithDispatchType:MTLDispatchTypeConcurrent];
        [compute.encoder setComputePipelineState:compute.pipeline->state];
    }

    _compute_bind_uniform_sets();

    if (compute.dirty.has_flag(ComputeState::DIRTY_PUSH)) {
        if (compute.push_constant_bindings[0] != (uint32_t)-1) {
            [compute.encoder setBytes:compute.push_constant_data
                               length:compute.push_constant_data_len
                              atIndex:compute.push_constant_bindings[0]];
        }
    }

    compute.dirty.clear();
}

void MDCommandBuffer::_compute_bind_uniform_sets() {
    DEV_ASSERT(type == MDCommandBufferStateType::Compute);
    if (!compute.dirty.has_flag(ComputeState::DIRTY_UNIFORMS)) {
        return;
    }

    compute.dirty.clear_flag(ComputeState::DIRTY_UNIFORMS);
    uint64_t set_uniforms = compute.uniform_set_mask;
    compute.uniform_set_mask = 0;

    MDComputeShader *shader = compute.pipeline->shader;

    while (set_uniforms != 0) {
        // Find the index of the next set bit.
        uint32_t index = (uint32_t)__builtin_ctzll(set_uniforms);
        // Clear the set bit.
        set_uniforms &= (set_uniforms - 1);
        MDUniformSet *set = compute.uniform_sets[index];
        if (set == nullptr || index >= (uint32_t)shader->sets.size()) {
            continue;
        }
        set->bind_uniforms(shader, compute, index);
    }
}
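The while loop above walks the 64-bit dirty mask one set bit at a time: __builtin_ctzll finds the lowest set bit and mask &= (mask - 1) clears it, so only the sets that actually changed get rebound. The idiom in isolation (plain C++, GCC/Clang builtin):

#include <cstdint>
#include <cstdio>

int main() {
    uint64_t mask = 0b101001; // uniform sets 0, 3 and 5 are dirty
    while (mask != 0) {
        uint32_t index = (uint32_t)__builtin_ctzll(mask); // index of lowest set bit
        mask &= (mask - 1); // clear that bit
        printf("rebinding uniform set %u\n", index); // prints 0, then 3, then 5
    }
    return 0;
}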

void MDCommandBuffer::ComputeState::reset() {
    pipeline = nil;
    encoder = nil;
    dirty = DIRTY_NONE;
    uniform_sets.clear();
    uniform_set_mask = 0;
    push_constant_data_len = 0;
    // Keep the keys, as they are likely to be used again.
    for (KeyValue<StageResourceUsage, LocalVector<__unsafe_unretained id<MTLResource>>> &kv : resource_usage) {
        kv.value.clear();
    }
}

void MDCommandBuffer::compute_bind_uniform_set(RDD::UniformSetID p_uniform_set, RDD::ShaderID p_shader, uint32_t p_set_index) {
    DEV_ASSERT(type == MDCommandBufferStateType::Compute);

    MDShader *shader = (MDShader *)(p_shader.id);
    MDUniformSet *set = (MDUniformSet *)(p_uniform_set.id);
    set->bind_uniforms(shader, compute, p_set_index);
    if (compute.uniform_sets.size() <= p_set_index) {
        uint32_t s = render.uniform_sets.size();
        compute.uniform_sets.resize(p_set_index + 1);
        // Set intermediate values to null.
        std::fill(&compute.uniform_sets[s], &compute.uniform_sets[p_set_index] + 1, nullptr);
    }

    if (compute.uniform_sets[p_set_index] != set) {
        compute.dirty.set_flag(ComputeState::DIRTY_UNIFORMS);
        compute.uniform_set_mask |= 1ULL << p_set_index;
        compute.uniform_sets[p_set_index] = set;
    }
}

void MDCommandBuffer::compute_bind_uniform_sets(VectorView<RDD::UniformSetID> p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) {
    DEV_ASSERT(type == MDCommandBufferStateType::Compute);

    MDShader *shader = (MDShader *)(p_shader.id);

    // TODO(sgc): Bind multiple buffers using [encoder setBuffers:offsets:withRange:]
    for (size_t i = 0u; i < p_set_count; ++i) {
    for (size_t i = 0; i < p_set_count; ++i) {
        MDUniformSet *set = (MDUniformSet *)(p_uniform_sets[i].id);
        set->bind_uniforms(shader, compute, p_first_set_index + i);

        uint32_t index = p_first_set_index + i;
        if (compute.uniform_sets.size() <= index) {
            uint32_t s = compute.uniform_sets.size();
            compute.uniform_sets.resize(index + 1);
            // Set intermediate values to null.
            std::fill(&compute.uniform_sets[s], &compute.uniform_sets[index] + 1, nullptr);
        }

        if (compute.uniform_sets[index] != set) {
            compute.dirty.set_flag(ComputeState::DIRTY_UNIFORMS);
            compute.uniform_set_mask |= 1ULL << index;
            compute.uniform_sets[index] = set;
        }
    }
}

void MDCommandBuffer::compute_dispatch(uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) {
    DEV_ASSERT(type == MDCommandBufferStateType::Compute);

    _compute_set_dirty_state();

    MTLRegion region = MTLRegionMake3D(0, 0, 0, p_x_groups, p_y_groups, p_z_groups);

    id<MTLComputeCommandEncoder> enc = compute.encoder;
@@ -992,6 +1121,8 @@ void MDCommandBuffer::compute_dispatch(uint32_t p_x_groups, uint32_t p_y_groups,
void MDCommandBuffer::compute_dispatch_indirect(RDD::BufferID p_indirect_buffer, uint64_t p_offset) {
    DEV_ASSERT(type == MDCommandBufferStateType::Compute);

    _compute_set_dirty_state();

    id<MTLBuffer> indirectBuffer = rid::get(p_indirect_buffer);

    id<MTLComputeCommandEncoder> enc = compute.encoder;
@@ -1021,20 +1152,6 @@ MDComputeShader::MDComputeShader(CharString p_name,
        MDShader(p_name, p_sets, p_uses_argument_buffers), kernel(p_kernel) {
}

void MDComputeShader::encode_push_constant_data(VectorView<uint32_t> p_data, MDCommandBuffer *p_cb) {
    DEV_ASSERT(p_cb->type == MDCommandBufferStateType::Compute);
    if (push_constants.binding == (uint32_t)-1) {
        return;
    }

    id<MTLComputeCommandEncoder> enc = p_cb->compute.encoder;

    void const *ptr = p_data.ptr();
    size_t length = p_data.size() * sizeof(uint32_t);

    [enc setBytes:ptr length:length atIndex:push_constants.binding];
}

MDRenderShader::MDRenderShader(CharString p_name,
        Vector<UniformSet> p_sets,
        bool p_needs_view_mask_buffer,
@@ -1046,22 +1163,6 @@ MDRenderShader::MDRenderShader(CharString p_name,
        frag(p_frag) {
}

void MDRenderShader::encode_push_constant_data(VectorView<uint32_t> p_data, MDCommandBuffer *p_cb) {
    DEV_ASSERT(p_cb->type == MDCommandBufferStateType::Render);
    id<MTLRenderCommandEncoder> __unsafe_unretained enc = p_cb->render.encoder;

    void const *ptr = p_data.ptr();
    size_t length = p_data.size() * sizeof(uint32_t);

    if (push_constants.vert.binding > -1) {
        [enc setVertexBytes:ptr length:length atIndex:push_constants.vert.binding];
    }

    if (push_constants.frag.binding > -1) {
        [enc setFragmentBytes:ptr length:length atIndex:push_constants.frag.binding];
    }
}

void MDUniformSet::bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index) {
    DEV_ASSERT(p_shader->uses_argument_buffers);
    DEV_ASSERT(p_state.encoder != nil);
@@ -58,6 +58,7 @@
#include "core/io/marshalls.h"
#include "core/string/ustring.h"
#include "core/templates/hash_map.h"
#include "drivers/apple/foundation_helpers.h"

#import <Metal/MTLTexture.h>
#import <Metal/Metal.h>
@@ -317,12 +318,6 @@ RDD::TextureID RenderingDeviceDriverMetal::texture_create(const TextureFormat &p
        desc.usage |= MTLTextureUsageShaderWrite;
    }

    if (@available(macOS 14.0, iOS 17.0, tvOS 17.0, *)) {
        if (format_caps & kMTLFmtCapsAtomic) {
            desc.usage |= MTLTextureUsageShaderAtomic;
        }
    }

    bool can_be_attachment = flags::any(format_caps, (kMTLFmtCapsColorAtt | kMTLFmtCapsDSAtt));

    if (flags::any(p_format.usage_bits, TEXTURE_USAGE_COLOR_ATTACHMENT_BIT | TEXTURE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) &&
@@ -334,6 +329,18 @@ RDD::TextureID RenderingDeviceDriverMetal::texture_create(const TextureFormat &p
        desc.usage |= MTLTextureUsageShaderRead;
    }

    if (p_format.usage_bits & TEXTURE_USAGE_STORAGE_ATOMIC_BIT) {
        if (@available(macOS 14.0, iOS 17.0, tvOS 17.0, *)) {
            if (format_caps & kMTLFmtCapsAtomic) {
                desc.usage |= MTLTextureUsageShaderAtomic;
            } else {
                ERR_FAIL_V_MSG(RDD::TextureID(), "Atomic operations on this texture format are not supported.");
            }
        } else {
            ERR_FAIL_V_MSG(RDD::TextureID(), "Atomic texture operations not supported on this OS version.");
        }
    }

    if (p_format.usage_bits & TEXTURE_USAGE_VRS_ATTACHMENT_BIT) {
        ERR_FAIL_V_MSG(RDD::TextureID(), "unsupported: TEXTURE_USAGE_VRS_ATTACHMENT_BIT");
    }
@@ -363,7 +370,29 @@ RDD::TextureID RenderingDeviceDriverMetal::texture_create(const TextureFormat &p

    // Check if it is a linear format for atomic operations and therefore needs a buffer,
    // as generally Metal does not support atomic operations on textures.
    bool needs_buffer = is_linear || (p_format.array_layers == 1 && p_format.mipmaps == 1 && p_format.texture_type == TEXTURE_TYPE_2D && flags::any(p_format.usage_bits, TEXTURE_USAGE_STORAGE_BIT) && (p_format.format == DATA_FORMAT_R32_UINT || p_format.format == DATA_FORMAT_R32_SINT || p_format.format == DATA_FORMAT_R32G32_UINT || p_format.format == DATA_FORMAT_R32G32_SINT));
    bool needs_buffer = is_linear;

    // Check for atomic requirements.
    if (flags::any(p_format.usage_bits, TEXTURE_USAGE_STORAGE_BIT) && p_format.array_layers == 1 && p_format.mipmaps == 1 && p_format.texture_type == TEXTURE_TYPE_2D) {
        switch (p_format.format) {
            case RenderingDeviceCommons::DATA_FORMAT_R32_SINT:
            case RenderingDeviceCommons::DATA_FORMAT_R32_UINT: {
                if (!device_properties->features.supports_image_atomic_32_bit) {
                    // We can emulate 32-bit atomic operations on textures.
                    needs_buffer = true;
                }
            } break;
            case RenderingDeviceCommons::DATA_FORMAT_R32G32_SINT:
            case RenderingDeviceCommons::DATA_FORMAT_R32G32_UINT: {
                if (!device_properties->features.supports_image_atomic_64_bit) {
                    // No emulation for 64-bit atomics.
                    ERR_FAIL_V_MSG(TextureID(), "64-bit atomic operations are not supported.");
                }
            } break;
            default:
                break;
        }
    }

    id<MTLTexture> obj = nil;
    if (needs_buffer) {
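Seen from the caller's side, the new checks guard requests like the following sketch (field values illustrative; texture_create and RD::TextureFormat are the API used elsewhere in this diff). On hardware without 32-bit image atomics the R32_UINT texture is transparently backed by a buffer; an R32G32 (64-bit) request fails outright:

RD::TextureFormat tf;
tf.format = RD::DATA_FORMAT_R32_UINT;
tf.texture_type = RD::TEXTURE_TYPE_2D;
tf.width = 256;
tf.height = 256;
tf.usage_bits = RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_STORAGE_ATOMIC_BIT;
RID atomic_tex = RD::get_singleton()->texture_create(tf, RD::TextureView());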
@@ -900,9 +929,15 @@ Error RenderingDeviceDriverMetal::command_queue_execute_and_present(CommandQueue
    MDCommandBuffer *cmd_buffer = (MDCommandBuffer *)(p_cmd_buffers[size - 1].id);
    Fence *fence = (Fence *)(p_cmd_fence.id);
    if (fence != nullptr) {
        [cmd_buffer->get_command_buffer() addCompletedHandler:^(id<MTLCommandBuffer> buffer) {
        id<MTLCommandBuffer> cb = cmd_buffer->get_command_buffer();
        if (cb == nil) {
            // If there is nothing to do, signal the fence immediately.
            dispatch_semaphore_signal(fence->semaphore);
        }];
        } else {
            [cb addCompletedHandler:^(id<MTLCommandBuffer> buffer) {
                dispatch_semaphore_signal(fence->semaphore);
            }];
        }
    }

    for (uint32_t i = 0; i < p_swap_chains.size(); i++) {
@@ -1730,8 +1765,7 @@ void RenderingDeviceDriverMetal::pipeline_free(PipelineID p_pipeline_id) {

void RenderingDeviceDriverMetal::command_bind_push_constants(CommandBufferID p_cmd_buffer, ShaderID p_shader, uint32_t p_dst_first_index, VectorView<uint32_t> p_data) {
    MDCommandBuffer *cb = (MDCommandBuffer *)(p_cmd_buffer.id);
    MDShader *shader = (MDShader *)(p_shader.id);
    shader->encode_push_constant_data(p_data, cb);
    cb->encode_push_constant_data(p_shader, p_data);
}

// ----- CACHE -----
@@ -2417,6 +2451,7 @@ RDD::PipelineID RenderingDeviceDriverMetal::compute_pipeline_create(ShaderID p_s

    MTLComputePipelineDescriptor *desc = [MTLComputePipelineDescriptor new];
    desc.computeFunction = function;
    desc.label = conv::to_nsstring(shader->name);
    if (archive) {
        desc.binaryArchives = @[ archive ];
    }
@@ -2735,6 +2770,8 @@ bool RenderingDeviceDriverMetal::has_feature(Features p_feature) {
            return device_properties->features.metal_fx_spatial;
        case SUPPORTS_METALFX_TEMPORAL:
            return device_properties->features.metal_fx_temporal;
        case SUPPORTS_IMAGE_ATOMIC_32_BIT:
            return device_properties->features.supports_image_atomic_32_bit;
        default:
            return false;
    }
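Engine code can then branch on the capability instead of on the platform, which is exactly what the VolumetricFog changes below do. A condensed sketch:

// Capability query replacing the old #if defined(MACOS_ENABLED) checks.
if (RD::get_singleton()->has_feature(RD::SUPPORTS_IMAGE_ATOMIC_32_BIT)) {
    // Bind an R32_UINT storage image and use imageAtomic*() in the shader.
} else {
    // Fall back to a uint storage buffer, as VolumetricFog does below.
}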
@@ -199,6 +199,8 @@ bool RenderingShaderContainerMetal::_set_code_from_spirv(const Vector<RenderingD
        set_indexes[set_indexes_size - 1] = offset;
    }
    CompilerMSL::Options msl_options{};
    // MAJOR * 10000 + MINOR * 100
    uint32_t msl_version = CompilerMSL::Options::make_msl_version(device_profile->features.mslVersionMajor, device_profile->features.mslVersionMinor);
    msl_options.set_msl_version(device_profile->features.mslVersionMajor, device_profile->features.mslVersionMinor);
    mtl_reflection_data.msl_version = msl_options.msl_version;
    msl_options.platform = device_profile->platform == MetalDeviceProfile::Platform::macOS ? CompilerMSL::Options::macOS : CompilerMSL::Options::iOS;
@@ -209,7 +211,7 @@ bool RenderingShaderContainerMetal::_set_code_from_spirv(const Vector<RenderingD
    }

    bool disable_argument_buffers = false;
    if (String v = OS::get_singleton()->get_environment(U"GODOT_DISABLE_ARGUMENT_BUFFERS"); v == U"1") {
    if (String v = OS::get_singleton()->get_environment("GODOT_MTL_DISABLE_ARGUMENT_BUFFERS"); v == "1") {
        disable_argument_buffers = true;
    }

@@ -236,6 +238,10 @@ bool RenderingShaderContainerMetal::_set_code_from_spirv(const Vector<RenderingD
        msl_options.multiview_layered_rendering = true;
        msl_options.view_mask_buffer_index = VIEW_MASK_BUFFER_INDEX;
    }
    if (msl_version >= CompilerMSL::Options::make_msl_version(3, 2)) {
        // All 3.2+ versions support device coherence, so we can disable texture fences.
        msl_options.readwrite_texture_fences = false;
    }

    CompilerGLSL::Options options{};
    options.vertex.flip_vert_y = true;
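The version comparison works because make_msl_version packs the version into a single integer (MAJOR * 10000 + MINOR * 100, as the comment above notes, plus a patch component). A sketch of the encoding:

#include <cstdint>

constexpr uint32_t make_msl_version(uint32_t major, uint32_t minor, uint32_t patch = 0) {
    return major * 10000 + minor * 100 + patch;
}
static_assert(make_msl_version(3, 2) == 30200, "MSL 3.2 encodes as 30200");
static_assert(make_msl_version(2, 1) < make_msl_version(3, 2), "ordering is preserved");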
@@ -417,6 +423,10 @@ bool RenderingShaderContainerMetal::_set_code_from_spirv(const Vector<RenderingD
            // VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER
            primary.texture_type = MTLTextureTypeTextureBuffer;
        } break;
        case spv::DimTileImageDataEXT: {
            // Godot does not use this extension.
            // See: https://registry.khronos.org/vulkan/specs/latest/man/html/VK_EXT_shader_tile_image.html
        } break;
        case spv::DimMax: {
            // Add all enumerations to silence the compiler warning
            // and generate future warnings, should a new one be added.
@@ -5894,6 +5894,13 @@ bool RenderingDeviceDriverVulkan::has_feature(Features p_feature) {
            return true;
        case SUPPORTS_BUFFER_DEVICE_ADDRESS:
            return buffer_device_address_support;
        case SUPPORTS_IMAGE_ATOMIC_32_BIT:
#if (defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED))
            // MoltenVK has previously had issues with 32-bit atomics on images.
            return false;
#else
            return true;
#endif
        default:
            return false;
    }
@@ -417,6 +417,7 @@ void Fog::VolumetricFog::init(const Vector3i &fog_size, RID p_sky_shader) {
    width = fog_size.x;
    height = fog_size.y;
    depth = fog_size.z;
    atomic_type = RD::get_singleton()->has_feature(RD::SUPPORTS_IMAGE_ATOMIC_32_BIT) ? RD::UNIFORM_TYPE_IMAGE : RD::UNIFORM_TYPE_STORAGE_BUFFER;

    RD::TextureFormat tf;
    tf.format = RD::DATA_FORMAT_R16G16B16A16_SFLOAT;
@@ -440,29 +441,29 @@ void Fog::VolumetricFog::init(const Vector3i &fog_size, RID p_sky_shader) {
    fog_map = RD::get_singleton()->texture_create(tf, RD::TextureView());
    RD::get_singleton()->set_resource_name(fog_map, "Fog map");

#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED)
    Vector<uint8_t> dm;
    dm.resize_initialized(fog_size.x * fog_size.y * fog_size.z * 4);
    if (atomic_type == RD::UNIFORM_TYPE_STORAGE_BUFFER) {
        Vector<uint8_t> dm;
        dm.resize_initialized(fog_size.x * fog_size.y * fog_size.z * 4);

    density_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm);
    RD::get_singleton()->set_resource_name(density_map, "Fog density map");
    light_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm);
    RD::get_singleton()->set_resource_name(light_map, "Fog light map");
    emissive_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm);
    RD::get_singleton()->set_resource_name(emissive_map, "Fog emissive map");
#else
    tf.format = RD::DATA_FORMAT_R32_UINT;
    tf.usage_bits = RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT;
    density_map = RD::get_singleton()->texture_create(tf, RD::TextureView());
    RD::get_singleton()->set_resource_name(density_map, "Fog density map");
    RD::get_singleton()->texture_clear(density_map, Color(0, 0, 0, 0), 0, 1, 0, 1);
    light_map = RD::get_singleton()->texture_create(tf, RD::TextureView());
    RD::get_singleton()->set_resource_name(light_map, "Fog light map");
    RD::get_singleton()->texture_clear(light_map, Color(0, 0, 0, 0), 0, 1, 0, 1);
    emissive_map = RD::get_singleton()->texture_create(tf, RD::TextureView());
    RD::get_singleton()->set_resource_name(emissive_map, "Fog emissive map");
    RD::get_singleton()->texture_clear(emissive_map, Color(0, 0, 0, 0), 0, 1, 0, 1);
#endif
        density_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm);
        RD::get_singleton()->set_resource_name(density_map, "Fog density map");
        light_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm);
        RD::get_singleton()->set_resource_name(light_map, "Fog light map");
        emissive_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm);
        RD::get_singleton()->set_resource_name(emissive_map, "Fog emissive map");
    } else {
        tf.format = RD::DATA_FORMAT_R32_UINT;
        tf.usage_bits = RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT | RD::TEXTURE_USAGE_STORAGE_ATOMIC_BIT;
        density_map = RD::get_singleton()->texture_create(tf, RD::TextureView());
        RD::get_singleton()->set_resource_name(density_map, "Fog density map");
        RD::get_singleton()->texture_clear(density_map, Color(0, 0, 0, 0), 0, 1, 0, 1);
        light_map = RD::get_singleton()->texture_create(tf, RD::TextureView());
        RD::get_singleton()->set_resource_name(light_map, "Fog light map");
        RD::get_singleton()->texture_clear(light_map, Color(0, 0, 0, 0), 0, 1, 0, 1);
        emissive_map = RD::get_singleton()->texture_create(tf, RD::TextureView());
        RD::get_singleton()->set_resource_name(emissive_map, "Fog emissive map");
        RD::get_singleton()->texture_clear(emissive_map, Color(0, 0, 0, 0), 0, 1, 0, 1);
    }

    Vector<RD::Uniform> uniforms;
    {
@@ -579,11 +580,7 @@ void Fog::volumetric_fog_update(const VolumetricFogSettings &p_settings, const P

    {
        RD::Uniform u;
#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED)
        u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
#else
        u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
#endif
        u.uniform_type = fog->atomic_type;
        u.binding = 1;
        u.append_id(fog->emissive_map);
        uniforms.push_back(u);
@@ -599,11 +596,7 @@ void Fog::volumetric_fog_update(const VolumetricFogSettings &p_settings, const P

    {
        RD::Uniform u;
#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED)
        u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
#else
        u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
#endif
        u.uniform_type = fog->atomic_type;
        u.binding = 3;
        u.append_id(fog->density_map);
        uniforms.push_back(u);
@@ -611,11 +604,7 @@ void Fog::volumetric_fog_update(const VolumetricFogSettings &p_settings, const P

    {
        RD::Uniform u;
#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED)
        u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
#else
        u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
#endif
        u.uniform_type = fog->atomic_type;
        u.binding = 4;
        u.append_id(fog->light_map);
        uniforms.push_back(u);
@@ -918,22 +907,14 @@ void Fog::volumetric_fog_update(const VolumetricFogSettings &p_settings, const P
        }
        {
            RD::Uniform u;
#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED)
            u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
#else
            u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
#endif
            u.uniform_type = fog->atomic_type;
            u.binding = 16;
            u.append_id(fog->density_map);
            uniforms.push_back(u);
        }
        {
            RD::Uniform u;
#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED)
            u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
#else
            u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
#endif
            u.uniform_type = fog->atomic_type;
            u.binding = 17;
            u.append_id(fog->light_map);
            uniforms.push_back(u);
@@ -941,11 +922,7 @@ void Fog::volumetric_fog_update(const VolumetricFogSettings &p_settings, const P

        {
            RD::Uniform u;
#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED)
            u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER;
#else
            u.uniform_type = RD::UNIFORM_TYPE_IMAGE;
#endif
            u.uniform_type = fog->atomic_type;
            u.binding = 18;
            u.append_id(fog->emissive_map);
            uniforms.push_back(u);
@@ -316,6 +316,9 @@ public:

        int last_shadow_filter = -1;

        // If the device doesn't support image atomics, use storage buffers instead.
        RD::UniformType atomic_type = RD::UNIFORM_TYPE_IMAGE;

        virtual void configure(RenderSceneBuffersRD *p_render_buffers) override {}
        virtual void free_data() override {}

@@ -234,11 +234,13 @@ void ShaderRD::_build_variant_code(StringBuilder &builder, uint32_t p_variant, c
        builder.append(String("#define ") + String(E.key) + "_CODE_USED\n");
    }
#if (defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED))
    if (RD::get_singleton()->get_device_capabilities().device_family == RDD::DEVICE_VULKAN) {
    RenderingDevice *rd = RD::get_singleton();
    if (rd->get_device_capabilities().device_family == RDD::DEVICE_VULKAN) {
        builder.append("#define MOLTENVK_USED\n");
    }
    // Image atomics are supported on Metal 3.1 but no support in MoltenVK or SPIRV-Cross yet.
    builder.append("#define NO_IMAGE_ATOMICS\n");
    if (!rd->has_feature(RD::SUPPORTS_IMAGE_ATOMIC_32_BIT)) {
        builder.append("#define NO_IMAGE_ATOMICS\n");
    }
#endif

    builder.append(String("#define RENDER_DRIVER_") + OS::get_singleton()->get_current_rendering_driver_name().to_upper() + "\n");
@@ -2,6 +2,8 @@

#version 450

#pragma use_vulkan_memory_model

#VERSION_DEFINES

layout(local_size_x = 4, local_size_y = 4, local_size_z = 4) in;
@@ -2,6 +2,8 @@

#version 450

#pragma use_vulkan_memory_model

#VERSION_DEFINES

#ifdef MODE_DENSITY
@@ -7979,6 +7979,7 @@ void RenderingDevice::_bind_methods() {
    BIND_ENUM_CONSTANT(SUPPORTS_METALFX_SPATIAL);
    BIND_ENUM_CONSTANT(SUPPORTS_METALFX_TEMPORAL);
    BIND_ENUM_CONSTANT(SUPPORTS_BUFFER_DEVICE_ADDRESS);
    BIND_ENUM_CONSTANT(SUPPORTS_IMAGE_ATOMIC_32_BIT);

    BIND_ENUM_CONSTANT(LIMIT_MAX_BOUND_UNIFORM_SETS);
    BIND_ENUM_CONSTANT(LIMIT_MAX_FRAMEBUFFER_COLOR_ATTACHMENTS);
@@ -952,6 +952,7 @@ public:
        // If not supported, a fragment shader with only side effects (i.e., writes to buffers, but doesn't output to attachments), may be optimized down to no-op by the GPU driver.
        SUPPORTS_FRAGMENT_SHADER_WITH_ONLY_SIDE_EFFECTS,
        SUPPORTS_BUFFER_DEVICE_ADDRESS,
        SUPPORTS_IMAGE_ATOMIC_32_BIT,
    };

    enum SubgroupOperations {
thirdparty/README.md (vendored, 2 lines changed)
@ -978,7 +978,7 @@ Its version and license is described in this file under `hidapi`.
|
|||
## spirv-cross
|
||||
|
||||
- Upstream: https://github.com/KhronosGroup/SPIRV-Cross
|
||||
- Version: git (6173e24b31f09a0c3217103a130e74c4ddec14a6, 2024)
|
||||
- Version: git (d7440cbc6c50332600fdf21c45e6a5df0b07e54c, 2025)
|
||||
- License: Apache 2.0
|
||||
|
||||
Files extracted from upstream source:
|
||||
|
|
thirdparty/spirv-cross/spirv.hpp (vendored, 2739 lines changed): file diff suppressed because it is too large.
thirdparty/spirv-cross/spirv_common.hpp (vendored, 66 lines changed)
@@ -580,7 +580,10 @@ struct SPIRType : IVariant
        Interpolant,
        Char,
        // MSL specific type, that is used by 'object'(analog of 'task' from glsl) shader.
        MeshGridProperties
        MeshGridProperties,
        BFloat16,
        FloatE4M3,
        FloatE5M2
    };

    // Scalar/vector/matrix support.
@@ -605,6 +608,14 @@ struct SPIRType : IVariant
    bool pointer = false;
    bool forward_pointer = false;

    struct
    {
        uint32_t use_id = 0;
        uint32_t rows_id = 0;
        uint32_t columns_id = 0;
        uint32_t scope_id = 0;
    } cooperative;

    spv::StorageClass storage = spv::StorageClassGeneric;

    SmallVector<TypeID> member_types;
@@ -686,6 +697,7 @@ struct SPIREntryPoint
    FunctionID self = 0;
    std::string name;
    std::string orig_name;
    std::unordered_map<uint32_t, uint32_t> fp_fast_math_defaults;
    SmallVector<VariableID> interface_variables;

    Bitset flags;
@@ -1026,6 +1038,9 @@ struct SPIRFunction : IVariant
    // consider arrays value types.
    SmallVector<ID> constant_arrays_needed_on_stack;

    // Does this function (or any function called by it), emit geometry?
    bool emits_geometry = false;

    bool active = false;
    bool flush_undeclared = true;
    bool do_combined_parameters = true;
@@ -1226,6 +1241,26 @@ struct SPIRConstant : IVariant
        return u.f32;
    }

    static inline float fe4m3_to_f32(uint8_t v)
    {
        if ((v & 0x7f) == 0x7f)
        {
            union
            {
                float f32;
                uint32_t u32;
            } u;

            u.u32 = (v & 0x80) ? 0xffffffffu : 0x7fffffffu;
            return u.f32;
        }
        else
        {
            // Reuse the FP16 to FP32 code. Cute bit-hackery.
            return f16_to_f32((int16_t(int8_t(v)) << 7) & (0xffff ^ 0x4000)) * 256.0f;
        }
    }

    inline uint32_t specialization_constant_id(uint32_t col, uint32_t row) const
    {
        return m.c[col].id[row];
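fe4m3_to_f32 folds the 8-bit E4M3 payload onto the existing FP16 decoder. For comparison, a direct decode of the format (1 sign bit, 4 exponent bits with bias 7, 3 mantissa bits; the all-ones magnitude pattern is NaN and there are no infinities) would look like this hypothetical reference, not upstream code:

#include <cmath>
#include <cstdint>

// Reference E4M3 decode, for checking the bit-hack above.
inline float fe4m3_reference(uint8_t v)
{
    if ((v & 0x7f) == 0x7f)
        return (v & 0x80) ? -NAN : NAN; // E4M3 reserves S.1111.111 for NaN
    uint32_t exp = (v >> 3) & 0xf;
    float man = float(v & 0x7) / 8.0f;
    float mag = (exp == 0) ? std::ldexp(man, -6) // subnormal: no implicit leading 1
                           : std::ldexp(1.0f + man, int(exp) - 7);
    return (v & 0x80) ? -mag : mag;
}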
@@ -1266,6 +1301,24 @@ struct SPIRConstant : IVariant
        return f16_to_f32(scalar_u16(col, row));
    }

    inline float scalar_bf16(uint32_t col = 0, uint32_t row = 0) const
    {
        uint32_t v = scalar_u16(col, row) << 16;
        float fp32;
        memcpy(&fp32, &v, sizeof(float));
        return fp32;
    }

    inline float scalar_floate4m3(uint32_t col = 0, uint32_t row = 0) const
    {
        return fe4m3_to_f32(scalar_u8(col, row));
    }

    inline float scalar_bf8(uint32_t col = 0, uint32_t row = 0) const
    {
        return f16_to_f32(scalar_u8(col, row) << 8);
    }

    inline float scalar_f32(uint32_t col = 0, uint32_t row = 0) const
    {
        return m.c[col].r[row].f32;
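scalar_bf16 relies on bfloat16 being exactly the upper half of an IEEE-754 binary32, so decoding is a 16-bit shift plus a bit copy. A self-contained restatement of that conversion:

#include <cstdint>
#include <cstring>

inline float bf16_to_f32(uint16_t v)
{
    uint32_t bits = uint32_t(v) << 16; // bf16 occupies the high 16 bits of binary32
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f; // example: 0x3f80 << 16 == 0x3f800000, i.e. 1.0f
}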
@@ -1336,9 +1389,10 @@ struct SPIRConstant : IVariant

    SPIRConstant() = default;

    SPIRConstant(TypeID constant_type_, const uint32_t *elements, uint32_t num_elements, bool specialized)
    SPIRConstant(TypeID constant_type_, const uint32_t *elements, uint32_t num_elements, bool specialized, bool replicated_ = false)
        : constant_type(constant_type_)
        , specialization(specialized)
        , replicated(replicated_)
    {
        subconstants.reserve(num_elements);
        for (uint32_t i = 0; i < num_elements; i++)
@@ -1410,9 +1464,16 @@ struct SPIRConstant : IVariant
    // If true, this is a LUT, and should always be declared in the outer scope.
    bool is_used_as_lut = false;

    // If this is a null constant of array type with specialized length.
    // May require special handling in initializer
    bool is_null_array_specialized_length = false;

    // For composites which are constant arrays, etc.
    SmallVector<ConstantID> subconstants;

    // Whether the subconstants are intended to be replicated (e.g. OpConstantCompositeReplicateEXT)
    bool replicated = false;

    // Non-Vulkan GLSL, HLSL and sometimes MSL emits defines for each specialization constant,
    // and uses them to initialize the constant. This allows the user
    // to still be able to specialize the value by supplying corresponding
@@ -1708,6 +1769,7 @@ struct Meta
        uint32_t spec_id = 0;
        uint32_t index = 0;
        spv::FPRoundingMode fp_rounding_mode = spv::FPRoundingModeMax;
        spv::FPFastMathModeMask fp_fast_math_mode = spv::FPFastMathModeMaskNone;
        bool builtin = false;
        bool qualified_alias_explicit_override = false;

thirdparty/spirv-cross/spirv_cross.cpp (vendored, 85 lines changed)
@@ -82,7 +82,7 @@ bool Compiler::variable_storage_is_aliased(const SPIRVariable &v)
                ir.meta[type.self].decoration.decoration_flags.get(DecorationBufferBlock);
    bool image = type.basetype == SPIRType::Image;
    bool counter = type.basetype == SPIRType::AtomicCounter;
    bool buffer_reference = type.storage == StorageClassPhysicalStorageBufferEXT;
    bool buffer_reference = type.storage == StorageClassPhysicalStorageBuffer;

    bool is_restrict;
    if (ssbo)
@@ -171,6 +171,7 @@ bool Compiler::block_is_control_dependent(const SPIRBlock &block)
    case OpGroupNonUniformLogicalXor:
    case OpGroupNonUniformQuadBroadcast:
    case OpGroupNonUniformQuadSwap:
    case OpGroupNonUniformRotateKHR:

    // Control barriers
    case OpControlBarrier:
@@ -210,6 +211,7 @@ bool Compiler::block_is_pure(const SPIRBlock &block)

    case OpCopyMemory:
    case OpStore:
    case OpCooperativeMatrixStoreKHR:
    {
        auto &type = expression_type(ops[0]);
        if (type.storage != StorageClassFunction)
@@ -370,6 +372,7 @@ void Compiler::register_global_read_dependencies(const SPIRBlock &block, uint32_
    }

    case OpLoad:
    case OpCooperativeMatrixLoadKHR:
    case OpImageRead:
    {
        // If we're in a storage class which does not get invalidated, adding dependencies here is no big deal.
@@ -481,7 +484,7 @@ void Compiler::register_write(uint32_t chain)
        }
    }

    if (type.storage == StorageClassPhysicalStorageBufferEXT || variable_storage_is_aliased(*var))
    if (type.storage == StorageClassPhysicalStorageBuffer || variable_storage_is_aliased(*var))
        flush_all_aliased_variables();
    else if (var)
        flush_dependees(*var);
@@ -587,6 +590,7 @@ const SPIRType &Compiler::expression_type(uint32_t id) const
bool Compiler::expression_is_lvalue(uint32_t id) const
{
    auto &type = expression_type(id);

    switch (type.basetype)
    {
    case SPIRType::SampledImage:
@@ -818,6 +822,7 @@ bool Compiler::InterfaceVariableAccessHandler::handle(Op opcode, const uint32_t

    case OpAtomicStore:
    case OpStore:
    case OpCooperativeMatrixStoreKHR:
        // Invalid SPIR-V.
        if (length < 1)
            return false;
@@ -910,6 +915,7 @@ bool Compiler::InterfaceVariableAccessHandler::handle(Op opcode, const uint32_t
    case OpInBoundsAccessChain:
    case OpPtrAccessChain:
    case OpLoad:
    case OpCooperativeMatrixLoadKHR:
    case OpCopyObject:
    case OpImageTexelPointer:
    case OpAtomicLoad:
@@ -2364,6 +2370,10 @@ void Compiler::set_execution_mode(ExecutionMode mode, uint32_t arg0, uint32_t ar
        execution.output_primitives = arg0;
        break;

    case ExecutionModeFPFastMathDefault:
        execution.fp_fast_math_defaults[arg0] = arg1;
        break;

    default:
        break;
    }
@@ -3461,6 +3471,7 @@ bool Compiler::AnalyzeVariableScopeAccessHandler::handle(spv::Op op, const uint3
    switch (op)
    {
    case OpStore:
    case OpCooperativeMatrixStoreKHR:
    {
        if (length < 2)
            return false;
@@ -3581,6 +3592,7 @@ bool Compiler::AnalyzeVariableScopeAccessHandler::handle(spv::Op op, const uint3
    }

    case OpLoad:
    case OpCooperativeMatrixLoadKHR:
    {
        if (length < 3)
            return false;
@@ -3800,6 +3812,7 @@ bool Compiler::StaticExpressionAccessHandler::handle(spv::Op op, const uint32_t
    switch (op)
    {
    case OpStore:
    case OpCooperativeMatrixStoreKHR:
        if (length < 2)
            return false;
        if (args[0] == variable_id)
@ -3810,6 +3823,7 @@ bool Compiler::StaticExpressionAccessHandler::handle(spv::Op op, const uint32_t
|
|||
break;
|
||||
|
||||
case OpLoad:
|
||||
case OpCooperativeMatrixLoadKHR:
|
||||
if (length < 3)
|
||||
return false;
|
||||
if (args[2] == variable_id && static_expression == 0) // Tried to read from variable before it was initialized.
|
||||
|
@ -4285,6 +4299,7 @@ bool Compiler::may_read_undefined_variable_in_block(const SPIRBlock &block, uint
|
|||
switch (op.op)
|
||||
{
|
||||
case OpStore:
|
||||
case OpCooperativeMatrixStoreKHR:
|
||||
case OpCopyMemory:
|
||||
if (ops[0] == var)
|
||||
return false;
|
||||
|
@ -4323,6 +4338,7 @@ bool Compiler::may_read_undefined_variable_in_block(const SPIRBlock &block, uint
|
|||
|
||||
case OpCopyObject:
|
||||
case OpLoad:
|
||||
case OpCooperativeMatrixLoadKHR:
|
||||
if (ops[2] == var)
|
||||
return true;
|
||||
break;
|
||||
|
@ -4350,6 +4366,39 @@ bool Compiler::may_read_undefined_variable_in_block(const SPIRBlock &block, uint
|
|||
return true;
|
||||
}
|
||||
|
||||
bool Compiler::GeometryEmitDisocveryHandler::handle(spv::Op opcode, const uint32_t *, uint32_t)
|
||||
{
|
||||
if (opcode == OpEmitVertex || opcode == OpEndPrimitive)
|
||||
{
|
||||
for (auto *func : function_stack)
|
||||
func->emits_geometry = true;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Compiler::GeometryEmitDisocveryHandler::begin_function_scope(const uint32_t *stream, uint32_t)
|
||||
{
|
||||
auto &callee = compiler.get<SPIRFunction>(stream[2]);
|
||||
function_stack.push_back(&callee);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Compiler::GeometryEmitDisocveryHandler::end_function_scope([[maybe_unused]] const uint32_t *stream, uint32_t)
|
||||
{
|
||||
assert(function_stack.back() == &compiler.get<SPIRFunction>(stream[2]));
|
||||
function_stack.pop_back();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void Compiler::discover_geometry_emitters()
|
||||
{
|
||||
GeometryEmitDisocveryHandler handler(*this);
|
||||
|
||||
traverse_all_reachable_opcodes(get<SPIRFunction>(ir.default_entry_point), handler);
|
||||
}
|
||||
|
||||
Bitset Compiler::get_buffer_block_flags(VariableID id) const
|
||||
{
|
||||
return ir.get_buffer_block_flags(get<SPIRVariable>(id));
|
||||
|
@ -4462,6 +4511,7 @@ bool Compiler::ActiveBuiltinHandler::handle(spv::Op opcode, const uint32_t *args
|
|||
switch (opcode)
|
||||
{
|
||||
case OpStore:
|
||||
case OpCooperativeMatrixStoreKHR:
|
||||
if (length < 1)
|
||||
return false;
|
||||
|
||||
|
@ -4478,6 +4528,7 @@ bool Compiler::ActiveBuiltinHandler::handle(spv::Op opcode, const uint32_t *args
|
|||
|
||||
case OpCopyObject:
|
||||
case OpLoad:
|
||||
case OpCooperativeMatrixLoadKHR:
|
||||
if (length < 3)
|
||||
return false;
|
||||
|
||||
|
@ -4910,13 +4961,16 @@ void Compiler::make_constant_null(uint32_t id, uint32_t type)
|
|||
uint32_t parent_id = ir.increase_bound_by(1);
|
||||
make_constant_null(parent_id, constant_type.parent_type);
|
||||
|
||||
if (!constant_type.array_size_literal.back())
|
||||
SPIRV_CROSS_THROW("Array size of OpConstantNull must be a literal.");
|
||||
|
||||
SmallVector<uint32_t> elements(constant_type.array.back());
|
||||
for (uint32_t i = 0; i < constant_type.array.back(); i++)
|
||||
// The array size of OpConstantNull can be either literal or specialization constant.
|
||||
// In the latter case, we cannot take the value as-is, as it can be changed to anything.
|
||||
// Rather, we assume it to be *one* for the sake of initializer.
|
||||
bool is_literal_array_size = constant_type.array_size_literal.back();
|
||||
uint32_t count = is_literal_array_size ? constant_type.array.back() : 1;
|
||||
SmallVector<uint32_t> elements(count);
|
||||
for (uint32_t i = 0; i < count; i++)
|
||||
elements[i] = parent_id;
|
||||
set<SPIRConstant>(id, type, elements.data(), uint32_t(elements.size()), false);
|
||||
auto &constant = set<SPIRConstant>(id, type, elements.data(), uint32_t(elements.size()), false);
|
||||
constant.is_null_array_specialized_length = !is_literal_array_size;
|
||||
}
|
||||
else if (!constant_type.member_types.empty())
|
||||
{
|
||||
|
@ -5177,7 +5231,7 @@ bool Compiler::PhysicalStorageBufferPointerHandler::type_is_bda_block_entry(uint
|
|||
|
||||
uint32_t Compiler::PhysicalStorageBufferPointerHandler::get_minimum_scalar_alignment(const SPIRType &type) const
|
||||
{
|
||||
if (type.storage == spv::StorageClassPhysicalStorageBufferEXT)
|
||||
if (type.storage == spv::StorageClassPhysicalStorageBuffer)
|
||||
return 8;
|
||||
else if (type.basetype == SPIRType::Struct)
|
||||
{
|
||||
|
@ -5252,6 +5306,13 @@ bool Compiler::PhysicalStorageBufferPointerHandler::handle(Op op, const uint32_t
|
|||
break;
|
||||
}
|
||||
|
||||
case OpCooperativeMatrixLoadKHR:
|
||||
case OpCooperativeMatrixStoreKHR:
|
||||
{
|
||||
// TODO: Can we meaningfully deal with this?
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -5274,6 +5335,10 @@ uint32_t Compiler::PhysicalStorageBufferPointerHandler::get_base_non_block_type_
|
|||
|
||||
void Compiler::PhysicalStorageBufferPointerHandler::analyze_non_block_types_from_block(const SPIRType &type)
|
||||
{
|
||||
if (analyzed_type_ids.count(type.self))
|
||||
return;
|
||||
analyzed_type_ids.insert(type.self);
|
||||
|
||||
for (auto &member : type.member_types)
|
||||
{
|
||||
auto &subtype = compiler.get<SPIRType>(member);
|
||||
|
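The guard added to analyze_non_block_types_from_block() above is a standard memoized traversal: remember every type already walked so shared subtrees are visited once and self-referential types (common with buffer-device-address pointer chains) cannot recurse forever. A minimal self-contained sketch of the idea follows; Type and analyze() are illustrative stand-ins, not the SPIRV-Cross API.

#include <cstdint>
#include <unordered_set>
#include <vector>

// Toy stand-in for a SPIR-V struct type with member types.
struct Type
{
	uint32_t self = 0;
	std::vector<const Type *> members;
};

// Visit each type at most once; the set doubles as a recursion breaker.
static void analyze(const Type &type, std::unordered_set<uint32_t> &analyzed)
{
	if (analyzed.count(type.self))
		return;
	analyzed.insert(type.self);

	for (const Type *member : type.members)
		analyze(*member, analyzed);
}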
@@ -5407,6 +5472,7 @@ bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_
 	switch (opcode)
 	{
 	case OpLoad:
+	case OpCooperativeMatrixLoadKHR:
 	{
 		if (length < 3)
 			return false;
@@ -5484,6 +5550,7 @@ bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_
 	case OpStore:
 	case OpImageWrite:
 	case OpAtomicStore:
+	case OpCooperativeMatrixStoreKHR:
 	{
 		if (length < 1)
 			return false;
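The GeometryEmitDisocveryHandler introduced above marks every function on the current call stack once it sees OpEmitVertex or OpEndPrimitive, so emits_geometry propagates to indirect callers and not just the leaf function. A self-contained sketch of that propagation under simplified assumptions (Function, walk and OP_EMIT_VERTEX are invented names, not SPIRV-Cross types):

#include <cstdio>
#include <vector>

struct Function
{
	const char *name;
	bool emits_geometry = false;
	std::vector<Function *> calls; // callees invoked by this function
	std::vector<int> ops;          // opcodes executed directly in this function
};

constexpr int OP_EMIT_VERTEX = 1;

static void walk(Function &fn, std::vector<Function *> &stack)
{
	stack.push_back(&fn);
	for (int op : fn.ops)
		if (op == OP_EMIT_VERTEX)
			for (Function *f : stack) // mark the whole stack, not just the leaf
				f->emits_geometry = true;
	for (Function *callee : fn.calls)
		walk(*callee, stack);
	stack.pop_back();
}

int main()
{
	Function leaf{ "emit_strip" }, mid{ "helper" }, entry{ "main" };
	leaf.ops = { OP_EMIT_VERTEX };
	mid.calls = { &leaf };
	entry.calls = { &mid };
	std::vector<Function *> stack;
	walk(entry, stack);
	std::printf("%s emits geometry: %d\n", entry.name, entry.emits_geometry); // prints 1
	return 0;
}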
thirdparty/spirv-cross/spirv_cross.hpp (vendored): 17 changed lines

@@ -1054,6 +1054,7 @@ protected:
 		std::unordered_set<uint32_t> non_block_types;
 		std::unordered_map<uint32_t, PhysicalBlockMeta> physical_block_type_meta;
 		std::unordered_map<uint32_t, PhysicalBlockMeta *> access_chain_to_physical_block;
+		std::unordered_set<uint32_t> analyzed_type_ids;
 
 		void mark_aligned_access(uint32_t id, const uint32_t *args, uint32_t length);
 		PhysicalBlockMeta *find_block_meta(uint32_t id) const;
@@ -1072,6 +1073,22 @@ protected:
 	                                  bool single_function);
 	bool may_read_undefined_variable_in_block(const SPIRBlock &block, uint32_t var);
 
+	struct GeometryEmitDisocveryHandler : OpcodeHandler
+	{
+		explicit GeometryEmitDisocveryHandler(Compiler &compiler_)
+		    : compiler(compiler_)
+		{
+		}
+		Compiler &compiler;
+
+		bool handle(spv::Op opcode, const uint32_t *args, uint32_t length) override;
+		bool begin_function_scope(const uint32_t *, uint32_t) override;
+		bool end_function_scope(const uint32_t *, uint32_t) override;
+		SmallVector<SPIRFunction *> function_stack;
+	};
+
+	void discover_geometry_emitters();
+
 	// Finds all resources that are written to from inside the critical section, if present.
 	// The critical section is delimited by OpBeginInvocationInterlockEXT and
 	// OpEndInvocationInterlockEXT instructions. In MSL and HLSL, any resources written
thirdparty/spirv-cross/spirv_cross_parsed_ir.cpp (vendored): 25 changed lines

@@ -452,6 +452,10 @@ void ParsedIR::set_decoration(ID id, Decoration decoration, uint32_t argument)
 		dec.fp_rounding_mode = static_cast<FPRoundingMode>(argument);
 		break;
 
+	case DecorationFPFastMathMode:
+		dec.fp_fast_math_mode = static_cast<FPFastMathModeMask>(argument);
+		break;
+
 	default:
 		break;
 	}
@@ -643,6 +647,8 @@ uint32_t ParsedIR::get_decoration(ID id, Decoration decoration) const
 		return dec.index;
 	case DecorationFPRoundingMode:
 		return dec.fp_rounding_mode;
+	case DecorationFPFastMathMode:
+		return dec.fp_fast_math_mode;
 	default:
 		return 1;
 	}
@@ -730,6 +736,10 @@ void ParsedIR::unset_decoration(ID id, Decoration decoration)
 		dec.fp_rounding_mode = FPRoundingModeMax;
 		break;
 
+	case DecorationFPFastMathMode:
+		dec.fp_fast_math_mode = FPFastMathModeMaskNone;
+		break;
+
 	case DecorationHlslCounterBufferGOOGLE:
 	{
 		auto &counter = meta[id].hlsl_magic_counter_buffer;
@@ -1050,16 +1060,21 @@ void ParsedIR::make_constant_null(uint32_t id, uint32_t type, bool add_to_typed_
 		uint32_t parent_id = increase_bound_by(1);
 		make_constant_null(parent_id, constant_type.parent_type, add_to_typed_id_set);
 
-		if (!constant_type.array_size_literal.back())
-			SPIRV_CROSS_THROW("Array size of OpConstantNull must be a literal.");
+		// The array size of OpConstantNull can be either literal or specialization constant.
+		// In the latter case, we cannot take the value as-is, as it can be changed to anything.
+		// Rather, we assume it to be *one* for the sake of initializer.
+		bool is_literal_array_size = constant_type.array_size_literal.back();
+		uint32_t count = is_literal_array_size ? constant_type.array.back() : 1;
 
-		SmallVector<uint32_t> elements(constant_type.array.back());
-		for (uint32_t i = 0; i < constant_type.array.back(); i++)
+		SmallVector<uint32_t> elements(count);
+		for (uint32_t i = 0; i < count; i++)
 			elements[i] = parent_id;
 
 		if (add_to_typed_id_set)
 			add_typed_id(TypeConstant, id);
-		variant_set<SPIRConstant>(ids[id], type, elements.data(), uint32_t(elements.size()), false).self = id;
+		auto &constant = variant_set<SPIRConstant>(ids[id], type, elements.data(), uint32_t(elements.size()), false);
+		constant.self = id;
+		constant.is_null_array_specialized_length = !is_literal_array_size;
 	}
 	else if (!constant_type.member_types.empty())
 	{
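Both make_constant_null() overloads now fall back to a single element when the array length is a specialization constant, since the real length can be overridden at pipeline-compile time and the null initializer must stay valid. A hedged toy version of just that fallback (NullArrayConstant and make_null_array are invented names for illustration):

#include <cstdint>
#include <vector>

// A literal length builds that many null elements; a specialization-constant
// length materializes one element and flags the constant, mirroring
// is_null_array_specialized_length in the real code.
struct NullArrayConstant
{
	std::vector<uint32_t> element_ids;
	bool specialized_length = false;
};

static NullArrayConstant make_null_array(uint32_t parent_id, uint32_t literal_len, bool is_literal)
{
	uint32_t count = is_literal ? literal_len : 1; // assume *one* for the initializer
	NullArrayConstant c;
	c.element_ids.assign(count, parent_id);
	c.specialized_length = !is_literal;
	return c;
}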
thirdparty/spirv-cross/spirv_cross_util.cpp (vendored): 77 lines removed

@@ -1,77 +0,0 @@
-/*
- * Copyright 2015-2021 Arm Limited
- * SPDX-License-Identifier: Apache-2.0 OR MIT
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * At your option, you may choose to accept this material under either:
- * 1. The Apache License, Version 2.0, found at <http://www.apache.org/licenses/LICENSE-2.0>, or
- * 2. The MIT License, found at <http://opensource.org/licenses/MIT>.
- */
-
-#include "spirv_cross_util.hpp"
-#include "spirv_common.hpp"
-
-using namespace spv;
-using namespace SPIRV_CROSS_NAMESPACE;
-
-namespace spirv_cross_util
-{
-void rename_interface_variable(Compiler &compiler, const SmallVector<Resource> &resources, uint32_t location,
-                               const std::string &name)
-{
-	for (auto &v : resources)
-	{
-		if (!compiler.has_decoration(v.id, spv::DecorationLocation))
-			continue;
-
-		auto loc = compiler.get_decoration(v.id, spv::DecorationLocation);
-		if (loc != location)
-			continue;
-
-		auto &type = compiler.get_type(v.base_type_id);
-
-		// This is more of a friendly variant. If we need to rename interface variables, we might have to rename
-		// structs as well and make sure all the names match up.
-		if (type.basetype == SPIRType::Struct)
-		{
-			compiler.set_name(v.base_type_id, join("SPIRV_Cross_Interface_Location", location));
-			for (uint32_t i = 0; i < uint32_t(type.member_types.size()); i++)
-				compiler.set_member_name(v.base_type_id, i, join("InterfaceMember", i));
-		}
-
-		compiler.set_name(v.id, name);
-	}
-}
-
-void inherit_combined_sampler_bindings(Compiler &compiler)
-{
-	auto &samplers = compiler.get_combined_image_samplers();
-	for (auto &s : samplers)
-	{
-		if (compiler.has_decoration(s.image_id, spv::DecorationDescriptorSet))
-		{
-			uint32_t set = compiler.get_decoration(s.image_id, spv::DecorationDescriptorSet);
-			compiler.set_decoration(s.combined_id, spv::DecorationDescriptorSet, set);
-		}
-
-		if (compiler.has_decoration(s.image_id, spv::DecorationBinding))
-		{
-			uint32_t binding = compiler.get_decoration(s.image_id, spv::DecorationBinding);
-			compiler.set_decoration(s.combined_id, spv::DecorationBinding, binding);
-		}
-	}
-}
-} // namespace spirv_cross_util
thirdparty/spirv-cross/spirv_cross_util.hpp (vendored): 37 lines removed

@@ -1,37 +0,0 @@
-/*
- * Copyright 2015-2021 Arm Limited
- * SPDX-License-Identifier: Apache-2.0 OR MIT
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * At your option, you may choose to accept this material under either:
- * 1. The Apache License, Version 2.0, found at <http://www.apache.org/licenses/LICENSE-2.0>, or
- * 2. The MIT License, found at <http://opensource.org/licenses/MIT>.
- */
-
-#ifndef SPIRV_CROSS_UTIL_HPP
-#define SPIRV_CROSS_UTIL_HPP
-
-#include "spirv_cross.hpp"
-
-namespace spirv_cross_util
-{
-void rename_interface_variable(SPIRV_CROSS_NAMESPACE::Compiler &compiler,
-                               const SPIRV_CROSS_NAMESPACE::SmallVector<SPIRV_CROSS_NAMESPACE::Resource> &resources,
-                               uint32_t location, const std::string &name);
-void inherit_combined_sampler_bindings(SPIRV_CROSS_NAMESPACE::Compiler &compiler);
-} // namespace spirv_cross_util
-
-#endif
thirdparty/spirv-cross/spirv_glsl.cpp (vendored): 736 changed lines
File diff suppressed because it is too large.
thirdparty/spirv-cross/spirv_glsl.hpp (vendored): 16 changed lines

@@ -297,6 +297,9 @@ public:
 		float_formatter = formatter;
 	}
 
+	// Returns the macro name corresponding to constant id
+	std::string constant_value_macro_name(uint32_t id) const;
+
 protected:
 	struct ShaderSubgroupSupportHelper
 	{
@@ -450,6 +453,7 @@ protected:
 	virtual std::string variable_decl(const SPIRType &type, const std::string &name, uint32_t id = 0);
 	virtual bool variable_decl_is_remapped_storage(const SPIRVariable &var, spv::StorageClass storage) const;
 	virtual std::string to_func_call_arg(const SPIRFunction::Parameter &arg, uint32_t id);
+	virtual void emit_workgroup_initialization(const SPIRVariable &var);
 
 	struct TextureFunctionBaseArguments
 	{
@@ -622,6 +626,8 @@ protected:
 	const char *uint16_t_literal_suffix = "us";
 	const char *nonuniform_qualifier = "nonuniformEXT";
 	const char *boolean_mix_function = "mix";
+	const char *printf_function = "debugPrintfEXT";
+	std::string constant_null_initializer = "";
 	SPIRType::BaseType boolean_in_struct_remapped_type = SPIRType::Boolean;
 	bool swizzle_is_function = false;
 	bool shared_is_implied = false;
@@ -629,6 +635,7 @@ protected:
 	bool explicit_struct_type = false;
 	bool use_initializer_list = false;
 	bool use_typed_initializer_list = false;
+	bool requires_matching_array_initializer = false;
 	bool can_declare_struct_inline = true;
 	bool can_declare_arrays_inline = true;
 	bool native_row_major_matrix = true;
@@ -679,7 +686,6 @@ protected:
 	                          const SmallVector<uint32_t> &indices);
 	void emit_block_chain(SPIRBlock &block);
 	void emit_hoisted_temporaries(SmallVector<std::pair<TypeID, ID>> &temporaries);
-	std::string constant_value_macro_name(uint32_t id);
 	int get_constant_mapping_to_workgroup_component(const SPIRConstant &constant) const;
 	void emit_constant(const SPIRConstant &constant);
 	void emit_specialization_constant_op(const SPIRConstantOp &constant);
@@ -695,6 +701,7 @@ protected:
 	void emit_variable_temporary_copies(const SPIRVariable &var);
 
 	bool should_dereference(uint32_t id);
+	bool should_dereference_caller_param(uint32_t id);
 	bool should_forward(uint32_t id) const;
 	bool should_suppress_usage_tracking(uint32_t id) const;
 	void emit_mix_op(uint32_t result_type, uint32_t id, uint32_t left, uint32_t right, uint32_t lerp);
@@ -762,7 +769,7 @@ protected:
 	spv::StorageClass get_expression_effective_storage_class(uint32_t ptr);
 	virtual bool access_chain_needs_stage_io_builtin_translation(uint32_t base);
 
-	virtual void check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type);
+	virtual bool check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type);
 	virtual bool prepare_access_chain_for_scalar_access(std::string &expr, const SPIRType &type,
 	                                                    spv::StorageClass storage, bool &is_packed);
 
@@ -792,8 +799,9 @@ protected:
 	std::string declare_temporary(uint32_t type, uint32_t id);
 	void emit_uninitialized_temporary(uint32_t type, uint32_t id);
 	SPIRExpression &emit_uninitialized_temporary_expression(uint32_t type, uint32_t id);
-	void append_global_func_args(const SPIRFunction &func, uint32_t index, SmallVector<std::string> &arglist);
+	virtual void append_global_func_args(const SPIRFunction &func, uint32_t index, SmallVector<std::string> &arglist);
 	std::string to_non_uniform_aware_expression(uint32_t id);
+	std::string to_atomic_ptr_expression(uint32_t id);
 	std::string to_expression(uint32_t id, bool register_expression_read = true);
 	std::string to_composite_constructor_expression(const SPIRType &parent_type, uint32_t id, bool block_like_type);
 	std::string to_rerolled_array_expression(const SPIRType &parent_type, const std::string &expr, const SPIRType &type);
@@ -1009,6 +1017,8 @@ protected:
 	const Instruction *get_next_instruction_in_block(const Instruction &instr);
 	static uint32_t mask_relevant_memory_semantics(uint32_t semantics);
 
+	std::string convert_floate4m3_to_string(const SPIRConstant &value, uint32_t col, uint32_t row);
+	std::string convert_floate5m2_to_string(const SPIRConstant &value, uint32_t col, uint32_t row);
 	std::string convert_half_to_string(const SPIRConstant &value, uint32_t col, uint32_t row);
 	std::string convert_float_to_string(const SPIRConstant &value, uint32_t col, uint32_t row);
 	std::string convert_double_to_string(const SPIRConstant &value, uint32_t col, uint32_t row);
thirdparty/spirv-cross/spirv_msl.cpp (vendored): 1480 changed lines
File diff suppressed because it is too large.
thirdparty/spirv-cross/spirv_msl.hpp (vendored): 83 changed lines

@@ -324,6 +324,8 @@ public:
 		// of the shader with the additional fixed sample mask.
 		uint32_t additional_fixed_sample_mask = 0xffffffff;
 		bool enable_point_size_builtin = true;
+		bool enable_point_size_default = false;
+		float default_point_size = 1.0f;
 		bool enable_frag_depth_builtin = true;
 		bool enable_frag_stencil_ref_builtin = true;
 		bool disable_rasterization = false;
@@ -536,6 +538,14 @@ public:
 		// if the fragment does not modify the depth value.
 		bool input_attachment_is_ds_attachment = false;
 
+		// If BuiltInPosition is not written, automatically disable rasterization.
+		// The result can be queried with get_is_rasterization_disabled.
+		bool auto_disable_rasterization = false;
+
+		// Use Fast Math pragmas in MSL code, based on SPIR-V float controls and FP ExecutionModes.
+		// Requires MSL 3.2 or above, and has no effect with earlier MSL versions.
+		bool use_fast_math_pragmas = false;
+
 		bool is_ios() const
 		{
 			return platform == iOS;
@@ -756,6 +766,19 @@ public:
 	void set_combined_sampler_suffix(const char *suffix);
 	const char *get_combined_sampler_suffix() const;
 
+	// Information about specialization constants that are translated into MSL macros
+	// instead of using function constant
+	// These must only be called after a successful call to CompilerMSL::compile().
+	bool specialization_constant_is_macro(uint32_t constant_id) const;
+
+	// Returns a mask of SPIR-V FP Fast Math Mode flags, that represents the set of flags that can be applied
+	// across all floating-point types. Each FPFastMathDefault execution mode operation identifies the flags
+	// for one floating-point type, and the value returned here is a bitwise-AND combination across all types.
+	// If incl_ops is enabled, the FPFastMathMode of any SPIR-V operations are also included in the bitwise-AND
+	// to determine the minimal fast-math that applies to all default execution modes and all operations.
+	// The returned value is also affected by execution modes SignedZeroInfNanPreserve and ContractionOff.
+	uint32_t get_fp_fast_math_flags(bool incl_ops);
+
 protected:
 	// An enum of SPIR-V functions that are implemented in additional
 	// source code that is added to the shader if necessary.
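The comment on get_fp_fast_math_flags() describes a bitwise-AND reduction: only a relaxation that every per-type FPFastMathDefault grants can be applied globally, for example via an MSL fast-math pragma. A small sketch under that reading follows; the enum values mirror SPIR-V's FPFastMathMode bits, but the helper itself is illustrative and not part of the SPIRV-Cross API.

#include <cstdint>
#include <initializer_list>

enum FPFastMathModeBits : uint32_t
{
	FP_FAST_MATH_NOT_NAN = 0x1,
	FP_FAST_MATH_NOT_INF = 0x2,
	FP_FAST_MATH_NSZ = 0x4,
	FP_FAST_MATH_ALLOW_RECIP = 0x8,
	FP_FAST_MATH_FAST = 0x10,
};

// AND the per-type flag sets together; a bit survives only if every type allows it.
static uint32_t combine_fp_fast_math_flags(std::initializer_list<uint32_t> per_type_flags)
{
	uint32_t combined = ~0u;
	for (uint32_t flags : per_type_flags)
		combined &= flags;
	return combined;
}

For instance, combining NotNaN|NotInf for fp32 with only NotNaN for fp16 yields just NotNaN, the minimal fast-math set that is safe for both types.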
@@ -763,6 +786,7 @@ protected:
 	{
 		SPVFuncImplNone,
 		SPVFuncImplMod,
+		SPVFuncImplSMod,
 		SPVFuncImplRadians,
 		SPVFuncImplDegrees,
 		SPVFuncImplFindILsb,
@@ -784,12 +808,11 @@ protected:
 		SPVFuncImplInverse4x4,
 		SPVFuncImplInverse3x3,
 		SPVFuncImplInverse2x2,
-		// It is very important that this come before *Swizzle and ChromaReconstruct*, to ensure it's
-		// emitted before them.
-		SPVFuncImplForwardArgs,
-		// Likewise, this must come before *Swizzle.
+		// It is very important that this come before *Swizzle, to ensure it's emitted before them.
 		SPVFuncImplGetSwizzle,
 		SPVFuncImplTextureSwizzle,
+		SPVFuncImplGatherReturn,
+		SPVFuncImplGatherCompareReturn,
 		SPVFuncImplGatherSwizzle,
 		SPVFuncImplGatherCompareSwizzle,
 		SPVFuncImplGatherConstOffsets,
@@ -806,6 +829,30 @@ protected:
 		SPVFuncImplSubgroupShuffleXor,
 		SPVFuncImplSubgroupShuffleUp,
 		SPVFuncImplSubgroupShuffleDown,
+		SPVFuncImplSubgroupRotate,
+		SPVFuncImplSubgroupClusteredAdd,
+		SPVFuncImplSubgroupClusteredFAdd = SPVFuncImplSubgroupClusteredAdd,
+		SPVFuncImplSubgroupClusteredIAdd = SPVFuncImplSubgroupClusteredAdd,
+		SPVFuncImplSubgroupClusteredMul,
+		SPVFuncImplSubgroupClusteredFMul = SPVFuncImplSubgroupClusteredMul,
+		SPVFuncImplSubgroupClusteredIMul = SPVFuncImplSubgroupClusteredMul,
+		SPVFuncImplSubgroupClusteredMin,
+		SPVFuncImplSubgroupClusteredFMin = SPVFuncImplSubgroupClusteredMin,
+		SPVFuncImplSubgroupClusteredSMin = SPVFuncImplSubgroupClusteredMin,
+		SPVFuncImplSubgroupClusteredUMin = SPVFuncImplSubgroupClusteredMin,
+		SPVFuncImplSubgroupClusteredMax,
+		SPVFuncImplSubgroupClusteredFMax = SPVFuncImplSubgroupClusteredMax,
+		SPVFuncImplSubgroupClusteredSMax = SPVFuncImplSubgroupClusteredMax,
+		SPVFuncImplSubgroupClusteredUMax = SPVFuncImplSubgroupClusteredMax,
+		SPVFuncImplSubgroupClusteredAnd,
+		SPVFuncImplSubgroupClusteredBitwiseAnd = SPVFuncImplSubgroupClusteredAnd,
+		SPVFuncImplSubgroupClusteredLogicalAnd = SPVFuncImplSubgroupClusteredAnd,
+		SPVFuncImplSubgroupClusteredOr,
+		SPVFuncImplSubgroupClusteredBitwiseOr = SPVFuncImplSubgroupClusteredOr,
+		SPVFuncImplSubgroupClusteredLogicalOr = SPVFuncImplSubgroupClusteredOr,
+		SPVFuncImplSubgroupClusteredXor,
+		SPVFuncImplSubgroupClusteredBitwiseXor = SPVFuncImplSubgroupClusteredXor,
+		SPVFuncImplSubgroupClusteredLogicalXor = SPVFuncImplSubgroupClusteredXor,
 		SPVFuncImplQuadBroadcast,
 		SPVFuncImplQuadSwap,
 		SPVFuncImplReflectScalar,
@@ -841,6 +888,7 @@ protected:
 		SPVFuncImplTextureCast,
 		SPVFuncImplMulExtended,
 		SPVFuncImplSetMeshOutputsEXT,
+		SPVFuncImplAssume,
 	};
 
 	// If the underlying resource has been used for comparison then duplicate loads of that resource must be too
@@ -858,6 +906,11 @@ protected:
 	void emit_function_prototype(SPIRFunction &func, const Bitset &return_flags) override;
 	void emit_sampled_image_op(uint32_t result_type, uint32_t result_id, uint32_t image_id, uint32_t samp_id) override;
 	void emit_subgroup_op(const Instruction &i) override;
+	void emit_subgroup_cluster_op(uint32_t result_type, uint32_t result_id, uint32_t cluster_size, uint32_t op0,
+	                              const char *op);
+	void emit_subgroup_cluster_op_cast(uint32_t result_type, uint32_t result_id, uint32_t cluster_size, uint32_t op0,
+	                                   const char *op, SPIRType::BaseType input_type,
+	                                   SPIRType::BaseType expected_result_type);
 	std::string to_texture_op(const Instruction &i, bool sparse, bool *forward,
 	                          SmallVector<uint32_t> &inherited_expressions) override;
 	void emit_fixup() override;
@@ -872,6 +925,7 @@ protected:
 	void emit_mesh_entry_point();
 	void emit_mesh_outputs();
 	void emit_mesh_tasks(SPIRBlock &block) override;
+	void emit_workgroup_initialization(const SPIRVariable &var) override;
 
 	// Allow Metal to use the array<T> template to make arrays a value type
 	std::string type_to_array_glsl(const SPIRType &type, uint32_t variable_id) override;
@@ -985,6 +1039,7 @@ protected:
 	void add_tess_level_input_to_interface_block(const std::string &ib_var_ref, SPIRType &ib_type, SPIRVariable &var);
 	void add_tess_level_input(const std::string &base_ref, const std::string &mbr_name, SPIRVariable &var);
 
+	void ensure_struct_members_valid_vecsizes(SPIRType &struct_type, uint32_t &location);
 	void fix_up_interface_member_indices(spv::StorageClass storage, uint32_t ib_type_id);
 
 	void mark_location_as_used_by_shader(uint32_t location, const SPIRType &type,
@@ -1069,7 +1124,8 @@ protected:
 	bool validate_member_packing_rules_msl(const SPIRType &type, uint32_t index) const;
 	std::string get_argument_address_space(const SPIRVariable &argument);
 	std::string get_type_address_space(const SPIRType &type, uint32_t id, bool argument = false);
-	static bool decoration_flags_signal_volatile(const Bitset &flags);
+	bool decoration_flags_signal_volatile(const Bitset &flags) const;
+	bool decoration_flags_signal_coherent(const Bitset &flags) const;
 	const char *to_restrict(uint32_t id, bool space);
 	SPIRType &get_stage_in_struct_type();
 	SPIRType &get_stage_out_struct_type();
@@ -1082,7 +1138,7 @@ protected:
 	                          uint32_t mem_order_1, uint32_t mem_order_2, bool has_mem_order_2, uint32_t op0, uint32_t op1 = 0,
 	                          bool op1_is_pointer = false, bool op1_is_literal = false, uint32_t op2 = 0);
 	const char *get_memory_order(uint32_t spv_mem_sem);
-	void add_pragma_line(const std::string &line);
+	void add_pragma_line(const std::string &line, bool recompile_on_unique);
 	void add_typedef_line(const std::string &line);
 	void emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem);
 	bool emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t rhs_id,
@@ -1133,12 +1189,13 @@ protected:
 	void emit_store_statement(uint32_t lhs_expression, uint32_t rhs_expression) override;
 
 	void analyze_sampled_image_usage();
+	void analyze_workgroup_variables();
 
 	bool access_chain_needs_stage_io_builtin_translation(uint32_t base) override;
 	bool prepare_access_chain_for_scalar_access(std::string &expr, const SPIRType &type, spv::StorageClass storage,
 	                                            bool &is_packed) override;
 	void fix_up_interpolant_access_chain(const uint32_t *ops, uint32_t length);
-	void check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type) override;
+	bool check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type) override;
 
 	bool emit_tessellation_access_chain(const uint32_t *ops, uint32_t length);
 	bool emit_tessellation_io_load(uint32_t result_type, uint32_t id, uint32_t ptr);
@@ -1164,9 +1221,10 @@ protected:
 	std::unordered_map<uint32_t, uint32_t> fragment_output_components;
 	std::unordered_map<uint32_t, uint32_t> builtin_to_automatic_input_location;
 	std::unordered_map<uint32_t, uint32_t> builtin_to_automatic_output_location;
-	std::set<std::string> pragma_lines;
-	std::set<std::string> typedef_lines;
+	std::vector<std::string> pragma_lines;
+	std::vector<std::string> typedef_lines;
 	SmallVector<uint32_t> vars_needing_early_declaration;
+	std::unordered_set<uint32_t> constant_macro_ids;
 
 	std::unordered_map<StageSetBinding, std::pair<MSLResourceBinding, bool>, InternalHasher> resource_bindings;
 	std::unordered_map<StageSetBinding, uint32_t, InternalHasher> resource_arg_buff_idx_to_binding_number;
@@ -1210,11 +1268,14 @@ protected:
 	bool needs_swizzle_buffer_def = false;
 	bool used_swizzle_buffer = false;
 	bool added_builtin_tess_level = false;
+	bool needs_local_invocation_index = false;
 	bool needs_subgroup_invocation_id = false;
 	bool needs_subgroup_size = false;
 	bool needs_sample_id = false;
 	bool needs_helper_invocation = false;
+	bool needs_workgroup_zero_init = false;
 	bool writes_to_depth = false;
+	bool writes_to_point_size = false;
 	std::string qual_pos_var_name;
 	std::string stage_in_var_name = "in";
 	std::string stage_out_var_name = "out";
@@ -1276,6 +1337,7 @@ protected:
 
 	bool suppress_missing_prototypes = false;
 	bool suppress_incompatible_pointer_types_discard_qualifiers = false;
+	bool suppress_sometimes_unitialized = false;
 
 	void add_spv_func_and_recompile(SPVFuncImpl spv_func);
 
@@ -1308,7 +1370,7 @@ protected:
 		}
 
 		bool handle(spv::Op opcode, const uint32_t *args, uint32_t length) override;
-		CompilerMSL::SPVFuncImpl get_spv_func_impl(spv::Op opcode, const uint32_t *args);
+		CompilerMSL::SPVFuncImpl get_spv_func_impl(spv::Op opcode, const uint32_t *args, uint32_t length);
 		void check_resource_write(uint32_t var_id);
 
 		CompilerMSL &compiler;
@@ -1319,6 +1381,7 @@ protected:
 		bool uses_image_write = false;
 		bool uses_buffer_write = false;
 		bool uses_discard = false;
+		bool needs_local_invocation_index = false;
 		bool needs_subgroup_invocation_id = false;
 		bool needs_subgroup_size = false;
 		bool needs_sample_id = false;
|
76
thirdparty/spirv-cross/spirv_parser.cpp
vendored
76
thirdparty/spirv-cross/spirv_parser.cpp
vendored
|
@ -305,6 +305,7 @@ void Parser::parse(const Instruction &instruction)
|
|||
}
|
||||
|
||||
case OpExtInst:
|
||||
case OpExtInstWithForwardRefsKHR:
|
||||
{
|
||||
// The SPIR-V debug information extended instructions might come at global scope.
|
||||
if (current_block)
|
||||
|
@ -380,13 +381,21 @@ void Parser::parse(const Instruction &instruction)
|
|||
auto mode = static_cast<ExecutionMode>(ops[1]);
|
||||
execution.flags.set(mode);
|
||||
|
||||
if (mode == ExecutionModeLocalSizeId)
|
||||
switch (mode)
|
||||
{
|
||||
case ExecutionModeLocalSizeId:
|
||||
execution.workgroup_size.id_x = ops[2];
|
||||
execution.workgroup_size.id_y = ops[3];
|
||||
execution.workgroup_size.id_z = ops[4];
|
||||
}
|
||||
break;
|
||||
|
||||
case ExecutionModeFPFastMathDefault:
|
||||
execution.fp_fast_math_defaults[ops[2]] = ops[3];
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -536,12 +545,37 @@ void Parser::parse(const Instruction &instruction)
|
|||
uint32_t id = ops[0];
|
||||
uint32_t width = ops[1];
|
||||
auto &type = set<SPIRType>(id, op);
|
||||
|
||||
if (width != 16 && width != 8 && length > 2)
|
||||
SPIRV_CROSS_THROW("Unrecognized FP encoding mode for OpTypeFloat.");
|
||||
|
||||
if (width == 64)
|
||||
type.basetype = SPIRType::Double;
|
||||
else if (width == 32)
|
||||
type.basetype = SPIRType::Float;
|
||||
else if (width == 16)
|
||||
type.basetype = SPIRType::Half;
|
||||
{
|
||||
if (length > 2)
|
||||
{
|
||||
if (ops[2] == spv::FPEncodingBFloat16KHR)
|
||||
type.basetype = SPIRType::BFloat16;
|
||||
else
|
||||
SPIRV_CROSS_THROW("Unrecognized encoding for OpTypeFloat 16.");
|
||||
}
|
||||
else
|
||||
type.basetype = SPIRType::Half;
|
||||
}
|
||||
else if (width == 8)
|
||||
{
|
||||
if (length < 2)
|
||||
SPIRV_CROSS_THROW("Missing encoding for OpTypeFloat 8.");
|
||||
else if (ops[2] == spv::FPEncodingFloat8E4M3EXT)
|
||||
type.basetype = SPIRType::FloatE4M3;
|
||||
else if (ops[2] == spv::FPEncodingFloat8E5M2EXT)
|
||||
type.basetype = SPIRType::FloatE5M2;
|
||||
else
|
||||
SPIRV_CROSS_THROW("Invalid encoding for OpTypeFloat 8.");
|
||||
}
|
||||
else
|
||||
SPIRV_CROSS_THROW("Unrecognized bit-width of floating point type.");
|
||||
type.width = width;
|
||||
|
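The OpTypeFloat case above dispatches on bit width first and on the optional encoding operand second: 16-bit defaults to half unless bfloat16 is requested, while 8-bit floats always need an explicit E4M3 or E5M2 encoding. A compact sketch of the same mapping with local stand-in enums (BaseType, Encoding), not the SPIRV-Cross types:

#include <cstdint>
#include <stdexcept>

enum class BaseType { Double, Float, Half, BFloat16, FloatE4M3, FloatE5M2 };
enum class Encoding { None, BFloat16KHR, Float8E4M3EXT, Float8E5M2EXT };

static BaseType float_base_type(uint32_t width, Encoding enc)
{
	switch (width)
	{
	case 64: return BaseType::Double;
	case 32: return BaseType::Float;
	case 16: // optional encoding operand selects bfloat16
		return enc == Encoding::BFloat16KHR ? BaseType::BFloat16 : BaseType::Half;
	case 8: // 8-bit floats always require an explicit encoding
		if (enc == Encoding::Float8E4M3EXT)
			return BaseType::FloatE4M3;
		if (enc == Encoding::Float8E5M2EXT)
			return BaseType::FloatE5M2;
		throw std::runtime_error("Missing or invalid encoding for OpTypeFloat 8.");
	default:
		throw std::runtime_error("Unrecognized bit-width of floating point type.");
	}
}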
@@ -592,6 +626,22 @@ void Parser::parse(const Instruction &instruction)
 		break;
 	}
 
+	case OpTypeCooperativeMatrixKHR:
+	{
+		uint32_t id = ops[0];
+		auto &base = get<SPIRType>(ops[1]);
+		auto &matrixbase = set<SPIRType>(id, base);
+
+		matrixbase.op = op;
+		matrixbase.cooperative.scope_id = ops[2];
+		matrixbase.cooperative.rows_id = ops[3];
+		matrixbase.cooperative.columns_id = ops[4];
+		matrixbase.cooperative.use_id = ops[5];
+		matrixbase.self = id;
+		matrixbase.parent_type = ops[1];
+		break;
+	}
+
 	case OpTypeArray:
 	{
 		uint32_t id = ops[0];
@@ -835,17 +885,27 @@ void Parser::parse(const Instruction &instruction)
 		break;
 	}
 
 	// Constants
 	case OpSpecConstant:
 	case OpConstant:
+	case OpConstantCompositeReplicateEXT:
+	case OpSpecConstantCompositeReplicateEXT:
 	{
 		uint32_t id = ops[1];
 		auto &type = get<SPIRType>(ops[0]);
 
-		if (type.width > 32)
-			set<SPIRConstant>(id, ops[0], ops[2] | (uint64_t(ops[3]) << 32), op == OpSpecConstant);
+		if (op == OpConstantCompositeReplicateEXT || op == OpSpecConstantCompositeReplicateEXT)
+		{
+			auto subconstant = uint32_t(ops[2]);
+			set<SPIRConstant>(id, ops[0], &subconstant, 1, op == OpSpecConstantCompositeReplicateEXT, true);
+		}
 		else
-			set<SPIRConstant>(id, ops[0], ops[2], op == OpSpecConstant);
+		{
+			if (type.width > 32)
+				set<SPIRConstant>(id, ops[0], ops[2] | (uint64_t(ops[3]) << 32), op == OpSpecConstant);
+			else
+				set<SPIRConstant>(id, ops[0], ops[2], op == OpSpecConstant);
+		}
 		break;
 	}
 
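Note how wide constants are assembled in the hunk above: a literal wider than 32 bits arrives as two SPIR-V words, low word first, and is recombined as ops[2] | (uint64_t(ops[3]) << 32). A minimal check of that word order (assemble_u64 is an illustrative helper, not SPIRV-Cross code):

#include <cassert>
#include <cstdint>

// Reassemble a 64-bit literal from two 32-bit SPIR-V words (low word first).
static uint64_t assemble_u64(uint32_t low_word, uint32_t high_word)
{
	return low_word | (uint64_t(high_word) << 32);
}

int main()
{
	// 0x0123456789abcdef is encoded as low = 0x89abcdef, high = 0x01234567.
	assert(assemble_u64(0x89abcdefu, 0x01234567u) == 0x0123456789abcdefull);
	return 0;
}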