mirror of
				https://github.com/godotengine/godot.git
				synced 2025-10-30 21:21:10 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			1816 lines
		
	
	
	
		
			54 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			1816 lines
		
	
	
	
		
			54 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
| Convection Texture Tools
 | |
| Copyright (c) 2018-2019 Eric Lasota
 | |
| 
 | |
| Permission is hereby granted, free of charge, to any person obtaining
 | |
| a copy of this software and associated documentation files (the
 | |
| "Software"), to deal in the Software without restriction, including
 | |
| without limitation the rights to use, copy, modify, merge, publish,
 | |
| distribute, sublicense, and/or sell copies of the Software, and to
 | |
| permit persons to whom the Software is furnished to do so, subject
 | |
| to the following conditions:
 | |
| 
 | |
| The above copyright notice and this permission notice shall be included
 | |
| in all copies or substantial portions of the Software.
 | |
| 
 | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 | |
| OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | |
| MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | |
| IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | |
| CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | |
| TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | |
| SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | |
| 
 | |
| */
 | |
| #pragma once
 | |
| #ifndef __CVTT_PARALLELMATH_H__
 | |
| #define __CVTT_PARALLELMATH_H__
 | |
| 
 | |
| #include "ConvectionKernels.h"
 | |
| #include "ConvectionKernels_Config.h"
 | |
| 
 | |
| #ifdef CVTT_USE_SSE2
 | |
| #include <emmintrin.h>
 | |
| #endif
 | |
| 
 | |
| #include <float.h>
 | |
| #include <assert.h>
 | |
| #include <string.h>
 | |
| #include <algorithm>
 | |
| #include <math.h>
 | |
| 
 | |
| #define UNREFERENCED_PARAMETER(n) ((void)n)
 | |
| 
 | |
| // Parallel math implementation
 | |
| //
 | |
| // After preprocessor defs are handled, what this should do is expose the following types:
 | |
| // SInt16 - Signed 16-bit integer
 | |
| // UInt16 - Signed 16-bit integer
 | |
| // UInt15 - Unsigned 15-bit integer
 | |
| // SInt32 - Signed 32-bit integer
 | |
| // UInt31 - Unsigned 31-bit integer
 | |
| // AInt16 - 16-bit integer of unknown signedness (only used for storage)
 | |
| // Int16CompFlag - Comparison flags from comparing 16-bit integers
 | |
| // Int32CompFlag - Comparison flags from comparing 32-bit integers
 | |
| // FloatCompFlag - Comparison flags from comparing 32-bit floats
 | |
| //
 | |
| // The reason for these distinctions are that depending on the instruction set, signed or unsigned versions of certain ops
 | |
| // (particularly max, min, compares, and right shift) may not be available.  In cases where ops are not available, it's
 | |
| // necessary to do high bit manipulations to accomplish the operation with 16-bit numbers.  The 15-bit and 31-bit uint types
 | |
| // can elide the bit flips if unsigned versions are not available.
 | |
| 
 | |
| namespace cvtt
 | |
| {
 | |
| #ifdef CVTT_USE_SSE2
 | |
|     // SSE2 version
 | |
|     struct ParallelMath
 | |
|     {
 | |
|         typedef uint16_t ScalarUInt16;
 | |
|         typedef int16_t ScalarSInt16;
 | |
| 
 | |
|         template<unsigned int TRoundingMode>
 | |
|         struct RoundForScope
 | |
|         {
 | |
|             unsigned int m_oldCSR;
 | |
| 
 | |
|             RoundForScope()
 | |
|             {
 | |
|                 m_oldCSR = _mm_getcsr();
 | |
|                 _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode));
 | |
|             }
 | |
| 
 | |
|             ~RoundForScope()
 | |
|             {
 | |
|                 _mm_setcsr(m_oldCSR);
 | |
|             }
 | |
|         };
 | |
| 
 | |
|         struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO>
 | |
|         {
 | |
|         };
 | |
| 
 | |
|         struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST>
 | |
|         {
 | |
|         };
 | |
| 
 | |
|         struct RoundUpForScope : RoundForScope<_MM_ROUND_UP>
 | |
|         {
 | |
|         };
 | |
| 
 | |
|         struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN>
 | |
|         {
 | |
|         };
 | |
| 
 | |
|         static const int ParallelSize = 8;
 | |
| 
 | |
|         enum Int16Subtype
 | |
|         {
 | |
|             IntSubtype_Signed,
 | |
|             IntSubtype_UnsignedFull,
 | |
|             IntSubtype_UnsignedTruncated,
 | |
|             IntSubtype_Abstract,
 | |
|         };
 | |
| 
 | |
|         template<int TSubtype>
 | |
|         struct VInt16
 | |
|         {
 | |
|             __m128i m_value;
 | |
| 
 | |
|             inline VInt16 operator+(int16_t other) const
 | |
|             {
 | |
|                 VInt16 result;
 | |
|                 result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline VInt16 operator+(const VInt16 &other) const
 | |
|             {
 | |
|                 VInt16 result;
 | |
|                 result.m_value = _mm_add_epi16(m_value, other.m_value);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline VInt16 operator|(const VInt16 &other) const
 | |
|             {
 | |
|                 VInt16 result;
 | |
|                 result.m_value = _mm_or_si128(m_value, other.m_value);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline VInt16 operator&(const VInt16 &other) const
 | |
|             {
 | |
|                 VInt16 result;
 | |
|                 result.m_value = _mm_and_si128(m_value, other.m_value);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline VInt16 operator-(const VInt16 &other) const
 | |
|             {
 | |
|                 VInt16 result;
 | |
|                 result.m_value = _mm_sub_epi16(m_value, other.m_value);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline VInt16 operator<<(int bits) const
 | |
|             {
 | |
|                 VInt16 result;
 | |
|                 result.m_value = _mm_slli_epi16(m_value, bits);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline VInt16 operator^(const VInt16 &other) const
 | |
|             {
 | |
|                 VInt16 result;
 | |
|                 result.m_value = _mm_xor_si128(m_value, other.m_value);
 | |
|                 return result;
 | |
|             }
 | |
|         };
 | |
| 
 | |
|         typedef VInt16<IntSubtype_Signed> SInt16;
 | |
|         typedef VInt16<IntSubtype_UnsignedFull> UInt16;
 | |
|         typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
 | |
|         typedef VInt16<IntSubtype_Abstract> AInt16;
 | |
| 
 | |
|         template<int TSubtype>
 | |
|         struct VInt32
 | |
|         {
 | |
|             __m128i m_values[2];
 | |
| 
 | |
|             inline VInt32 operator+(const VInt32& other) const
 | |
|             {
 | |
|                 VInt32 result;
 | |
|                 result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline VInt32 operator-(const VInt32& other) const
 | |
|             {
 | |
|                 VInt32 result;
 | |
|                 result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline VInt32 operator<<(const int other) const
 | |
|             {
 | |
|                 VInt32 result;
 | |
|                 result.m_values[0] = _mm_slli_epi32(m_values[0], other);
 | |
|                 result.m_values[1] = _mm_slli_epi32(m_values[1], other);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline VInt32 operator|(const VInt32& other) const
 | |
|             {
 | |
|                 VInt32 result;
 | |
|                 result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
|         };
 | |
| 
 | |
|         typedef VInt32<IntSubtype_Signed> SInt32;
 | |
|         typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
 | |
|         typedef VInt32<IntSubtype_UnsignedFull> UInt32;
 | |
|         typedef VInt32<IntSubtype_Abstract> AInt32;
 | |
| 
 | |
|         template<class TTargetType>
 | |
|         struct LosslessCast
 | |
|         {
 | |
| #ifdef CVTT_PERMIT_ALIASING
 | |
|             template<int TSrcSubtype>
 | |
|             static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
 | |
|             {
 | |
|                 return reinterpret_cast<VInt32<TSubtype>&>(src);
 | |
|             }
 | |
| 
 | |
|             template<int TSrcSubtype>
 | |
|             static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
 | |
|             {
 | |
|                 return reinterpret_cast<VInt16<TSubtype>&>(src);
 | |
|             }
 | |
| #else
 | |
|             template<int TSrcSubtype>
 | |
|             static TTargetType Cast(const VInt32<TSrcSubtype> &src)
 | |
|             {
 | |
|                 TTargetType result;
 | |
|                 result.m_values[0] = src.m_values[0];
 | |
|                 result.m_values[1] = src.m_values[1];
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             template<int TSrcSubtype>
 | |
|             static TTargetType Cast(const VInt16<TSrcSubtype> &src)
 | |
|             {
 | |
|                 TTargetType result;
 | |
|                 result.m_value = src.m_value;
 | |
|                 return result;
 | |
|             }
 | |
| #endif
 | |
|         };
 | |
| 
 | |
|         struct Int64
 | |
|         {
 | |
|             __m128i m_values[4];
 | |
|         };
 | |
| 
 | |
|         struct Float
 | |
|         {
 | |
|             __m128 m_values[2];
 | |
| 
 | |
|             inline Float operator+(const Float &other) const
 | |
|             {
 | |
|                 Float result;
 | |
|                 result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline Float operator+(float other) const
 | |
|             {
 | |
|                 Float result;
 | |
|                 result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));
 | |
|                 result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline Float operator-(const Float& other) const
 | |
|             {
 | |
|                 Float result;
 | |
|                 result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline Float operator-() const
 | |
|             {
 | |
|                 Float result;
 | |
|                 result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);
 | |
|                 result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline Float operator*(const Float& other) const
 | |
|             {
 | |
|                 Float result;
 | |
|                 result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline Float operator*(float other) const
 | |
|             {
 | |
|                 Float result;
 | |
|                 result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));
 | |
|                 result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline Float operator/(const Float &other) const
 | |
|             {
 | |
|                 Float result;
 | |
|                 result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline Float operator/(float other) const
 | |
|             {
 | |
|                 Float result;
 | |
|                 result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));
 | |
|                 result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));
 | |
|                 return result;
 | |
|             }
 | |
|         };
 | |
| 
 | |
|         struct Int16CompFlag
 | |
|         {
 | |
|             __m128i m_value;
 | |
| 
 | |
|             inline Int16CompFlag operator&(const Int16CompFlag &other) const
 | |
|             {
 | |
|                 Int16CompFlag result;
 | |
|                 result.m_value = _mm_and_si128(m_value, other.m_value);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline Int16CompFlag operator|(const Int16CompFlag &other) const
 | |
|             {
 | |
|                 Int16CompFlag result;
 | |
|                 result.m_value = _mm_or_si128(m_value, other.m_value);
 | |
|                 return result;
 | |
|             }
 | |
|         };
 | |
| 
 | |
|         struct Int32CompFlag
 | |
|         {
 | |
|             __m128i m_values[2];
 | |
| 
 | |
|             inline Int32CompFlag operator&(const Int32CompFlag &other) const
 | |
|             {
 | |
|                 Int32CompFlag result;
 | |
|                 result.m_values[0] = _mm_and_si128(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_and_si128(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline Int32CompFlag operator|(const Int32CompFlag &other) const
 | |
|             {
 | |
|                 Int32CompFlag result;
 | |
|                 result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
|         };
 | |
| 
 | |
|         struct FloatCompFlag
 | |
|         {
 | |
|             __m128 m_values[2];
 | |
| 
 | |
|             inline FloatCompFlag operator&(const FloatCompFlag &other) const
 | |
|             {
 | |
|                 FloatCompFlag result;
 | |
|                 result.m_values[0] = _mm_and_ps(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_and_ps(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
| 
 | |
|             inline FloatCompFlag operator|(const FloatCompFlag &other) const
 | |
|             {
 | |
|                 FloatCompFlag result;
 | |
|                 result.m_values[0] = _mm_or_ps(m_values[0], other.m_values[0]);
 | |
|                 result.m_values[1] = _mm_or_ps(m_values[1], other.m_values[1]);
 | |
|                 return result;
 | |
|             }
 | |
|         };
 | |
| 
 | |
|         template<int TSubtype>
 | |
|         static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
 | |
|         {
 | |
|             VInt16<TSubtype> result;
 | |
|             result.m_value = _mm_add_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         template<int TSubtype>
 | |
|         static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
 | |
|         {
 | |
|             VInt16<TSubtype> result;
 | |
|             result.m_value = _mm_sub_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
 | |
|         {
 | |
|             Float result;
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i]));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         template<int TSubtype>
 | |
|         static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
 | |
|         {
 | |
|             VInt16<TSubtype> result;
 | |
|             result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         template<int TSubtype>
 | |
|         static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
 | |
|         {
 | |
|             VInt16<TSubtype> result;
 | |
|             result.m_value = _mm_and_si128(flag.m_value, a.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         template<int TSubtype>
 | |
|         static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
 | |
|         {
 | |
|             dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
 | |
|         }
 | |
| 
 | |
|         template<int TSubtype>
 | |
|         static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src)
 | |
|         {
 | |
|             __m128i lowFlags = _mm_unpacklo_epi16(flag.m_value, flag.m_value);
 | |
|             __m128i highFlags = _mm_unpackhi_epi16(flag.m_value, flag.m_value);
 | |
|             dest.m_values[0] = _mm_or_si128(_mm_andnot_si128(lowFlags, dest.m_values[0]), _mm_and_si128(lowFlags, src.m_values[0]));
 | |
|             dest.m_values[1] = _mm_or_si128(_mm_andnot_si128(highFlags, dest.m_values[1]), _mm_and_si128(highFlags, src.m_values[1]));
 | |
|         }
 | |
| 
 | |
|         static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src)
 | |
|         {
 | |
|             dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
 | |
|         }
 | |
| 
 | |
|         static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
 | |
|         {
 | |
|             SInt16 result;
 | |
|             result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         template<int TSubtype>
 | |
|         static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
 | |
|         {
 | |
|             dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));
 | |
|         }
 | |
| 
 | |
|         static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
 | |
|         {
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i]));
 | |
|         }
 | |
| 
 | |
|         static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
 | |
|         {
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i]));
 | |
|         }
 | |
| 
 | |
|         static void MakeSafeDenominator(Float& v)
 | |
|         {
 | |
|             ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f));
 | |
|         }
 | |
| 
 | |
|         static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
 | |
|         {
 | |
|             int lostBits = 16 - precision;
 | |
|             if (lostBits == 0)
 | |
|                 return v;
 | |
| 
 | |
|             SInt16 result;
 | |
|             result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
 | |
|         {
 | |
|             int lostBits = 16 - precision;
 | |
|             if (lostBits == 0)
 | |
|                 return v;
 | |
| 
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt16 Min(const UInt16 &a, const UInt16 &b)
 | |
|         {
 | |
|             __m128i bitFlip = _mm_set1_epi16(-32768);
 | |
| 
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt16 Min(const SInt16 &a, const SInt16 &b)
 | |
|         {
 | |
|             SInt16 result;
 | |
|             result.m_value = _mm_min_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt15 Min(const UInt15 &a, const UInt15 &b)
 | |
|         {
 | |
|             UInt15 result;
 | |
|             result.m_value = _mm_min_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float Min(const Float &a, const Float &b)
 | |
|         {
 | |
|             Float result;
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt16 Max(const UInt16 &a, const UInt16 &b)
 | |
|         {
 | |
|             __m128i bitFlip = _mm_set1_epi16(-32768);
 | |
| 
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt16 Max(const SInt16 &a, const SInt16 &b)
 | |
|         {
 | |
|             SInt16 result;
 | |
|             result.m_value = _mm_max_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt15 Max(const UInt15 &a, const UInt15 &b)
 | |
|         {
 | |
|             UInt15 result;
 | |
|             result.m_value = _mm_max_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float Max(const Float &a, const Float &b)
 | |
|         {
 | |
|             Float result;
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float Clamp(const Float &v, float min, float max)
 | |
|         {
 | |
|             Float result;
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float Reciprocal(const Float &v)
 | |
|         {
 | |
|             Float result;
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 result.m_values[i] = _mm_rcp_ps(v.m_values[i]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
 | |
|         {
 | |
|             int16_t values[8];
 | |
|             for (int i = 0; i < 8; i++)
 | |
|                 values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
 | |
| 
 | |
|             chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
 | |
|         }
 | |
| 
 | |
|         static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
 | |
|         {
 | |
|             int16_t values[8];
 | |
|             for (int i = 0; i < 8; i++)
 | |
|                 values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
 | |
| 
 | |
|             chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
 | |
|         }
 | |
| 
 | |
|         static Float MakeFloat(float v)
 | |
|         {
 | |
|             Float f;
 | |
|             f.m_values[0] = f.m_values[1] = _mm_set1_ps(v);
 | |
|             return f;
 | |
|         }
 | |
| 
 | |
|         static Float MakeFloatZero()
 | |
|         {
 | |
|             Float f;
 | |
|             f.m_values[0] = f.m_values[1] = _mm_setzero_ps();
 | |
|             return f;
 | |
|         }
 | |
| 
 | |
|         static UInt16 MakeUInt16(uint16_t v)
 | |
|         {
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_set1_epi16(static_cast<short>(v));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt16 MakeSInt16(int16_t v)
 | |
|         {
 | |
|             SInt16 result;
 | |
|             result.m_value = _mm_set1_epi16(static_cast<short>(v));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static AInt16 MakeAInt16(int16_t v)
 | |
|         {
 | |
|             AInt16 result;
 | |
|             result.m_value = _mm_set1_epi16(static_cast<short>(v));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt15 MakeUInt15(uint16_t v)
 | |
|         {
 | |
|             UInt15 result;
 | |
|             result.m_value = _mm_set1_epi16(static_cast<short>(v));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt32 MakeSInt32(int32_t v)
 | |
|         {
 | |
|             SInt32 result;
 | |
|             result.m_values[0] = _mm_set1_epi32(v);
 | |
|             result.m_values[1] = _mm_set1_epi32(v);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt31 MakeUInt31(uint32_t v)
 | |
|         {
 | |
|             UInt31 result;
 | |
|             result.m_values[0] = _mm_set1_epi32(v);
 | |
|             result.m_values[1] = _mm_set1_epi32(v);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static uint16_t Extract(const UInt16 &v, int offset)
 | |
|         {
 | |
|             return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
 | |
|         }
 | |
| 
 | |
|         static int16_t Extract(const SInt16 &v, int offset)
 | |
|         {
 | |
|             return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
 | |
|         }
 | |
| 
 | |
|         static uint16_t Extract(const UInt15 &v, int offset)
 | |
|         {
 | |
|             return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
 | |
|         }
 | |
| 
 | |
|         static int16_t Extract(const AInt16 &v, int offset)
 | |
|         {
 | |
|             return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
 | |
|         }
 | |
| 
 | |
|         static int32_t Extract(const SInt32 &v, int offset)
 | |
|         {
 | |
|             return reinterpret_cast<const int32_t*>(&v.m_values[offset >> 2])[offset & 3];
 | |
|         }
 | |
| 
 | |
|         static float Extract(const Float &v, int offset)
 | |
|         {
 | |
|             return reinterpret_cast<const float*>(&v.m_values[offset >> 2])[offset & 3];
 | |
|         }
 | |
| 
 | |
|         static bool Extract(const ParallelMath::Int16CompFlag &v, int offset)
 | |
|         {
 | |
|             return reinterpret_cast<const int16_t*>(&v.m_value)[offset] != 0;
 | |
|         }
 | |
| 
 | |
|         static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
 | |
|         {
 | |
|             reinterpret_cast<uint16_t*>(&dest)[offset] = v;
 | |
|         }
 | |
| 
 | |
|         static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
 | |
|         {
 | |
|             reinterpret_cast<uint16_t*>(&dest)[offset] = v;
 | |
|         }
 | |
| 
 | |
|         static void PutSInt16(SInt16 &dest, int offset, int16_t v)
 | |
|         {
 | |
|             reinterpret_cast<int16_t*>(&dest)[offset] = v;
 | |
|         }
 | |
| 
 | |
|         static float ExtractFloat(const Float& v, int offset)
 | |
|         {
 | |
|             return reinterpret_cast<const float*>(&v)[offset];
 | |
|         }
 | |
| 
 | |
|         static void PutFloat(Float &dest, int offset, float v)
 | |
|         {
 | |
|             reinterpret_cast<float*>(&dest)[offset] = v;
 | |
|         }
 | |
| 
 | |
|         static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v)
 | |
|         {
 | |
|             reinterpret_cast<int16_t*>(&dest)[offset] = v ? -1 : 0;
 | |
|         }
 | |
| 
 | |
|         static Int32CompFlag Less(const UInt31 &a, const UInt31 &b)
 | |
|         {
 | |
|             Int32CompFlag result;
 | |
|             result.m_values[0] = _mm_cmplt_epi32(a.m_values[0], b.m_values[0]);
 | |
|             result.m_values[1] = _mm_cmplt_epi32(a.m_values[1], b.m_values[1]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
 | |
|         {
 | |
|             Int16CompFlag result;
 | |
|             result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
 | |
|         {
 | |
|             Int16CompFlag result;
 | |
|             result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
 | |
|         {
 | |
|             Int16CompFlag result;
 | |
|             result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static FloatCompFlag Less(const Float &a, const Float &b)
 | |
|         {
 | |
|             FloatCompFlag result;
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
 | |
|         {
 | |
|             FloatCompFlag result;
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         template<int TSubtype>
 | |
|         static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
 | |
|         {
 | |
|             Int16CompFlag result;
 | |
|             result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static FloatCompFlag Equal(const Float &a, const Float &b)
 | |
|         {
 | |
|             FloatCompFlag result;
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b)
 | |
|         {
 | |
|             Int16CompFlag notResult;
 | |
|             notResult.m_value = _mm_xor_si128(a.m_value, b.m_value);
 | |
|             return Not(notResult);
 | |
|         }
 | |
| 
 | |
|         static Float ToFloat(const UInt16 &v)
 | |
|         {
 | |
|             Float result;
 | |
|             result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
 | |
|             result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt31 ToUInt31(const UInt16 &v)
 | |
|         {
 | |
|             UInt31 result;
 | |
|             result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
 | |
|             result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt32 ToInt32(const UInt16 &v)
 | |
|         {
 | |
|             SInt32 result;
 | |
|             result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
 | |
|             result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt32 ToInt32(const UInt15 &v)
 | |
|         {
 | |
|             SInt32 result;
 | |
|             result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
 | |
|             result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt32 ToInt32(const SInt16 &v)
 | |
|         {
 | |
|             SInt32 result;
 | |
|             result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16);
 | |
|             result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float ToFloat(const SInt16 &v)
 | |
|         {
 | |
|             Float result;
 | |
|             result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16));
 | |
|             result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float ToFloat(const UInt15 &v)
 | |
|         {
 | |
|             Float result;
 | |
|             result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
 | |
|             result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float ToFloat(const UInt31 &v)
 | |
|         {
 | |
|             Float result;
 | |
|             result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]);
 | |
|             result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
 | |
|         {
 | |
|             __m128i lo = _mm_castps_si128(v.m_values[0]);
 | |
|             __m128i hi = _mm_castps_si128(v.m_values[1]);
 | |
| 
 | |
|             Int16CompFlag result;
 | |
|             result.m_value = _mm_packs_epi32(lo, hi);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
 | |
|         {
 | |
|             __m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value);
 | |
|             __m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value);
 | |
| 
 | |
|             FloatCompFlag result;
 | |
|             result.m_values[0] = _mm_castsi128_ps(lo);
 | |
|             result.m_values[1] = _mm_castsi128_ps(hi);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v)
 | |
|         {
 | |
|             __m128i lo = v.m_values[0];
 | |
|             __m128i hi = v.m_values[1];
 | |
| 
 | |
|             Int16CompFlag result;
 | |
|             result.m_value = _mm_packs_epi32(lo, hi);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Int16CompFlag MakeBoolInt16(bool b)
 | |
|         {
 | |
|             Int16CompFlag result;
 | |
|             if (b)
 | |
|                 result.m_value = _mm_set1_epi16(-1);
 | |
|             else
 | |
|                 result.m_value = _mm_setzero_si128();
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static FloatCompFlag MakeBoolFloat(bool b)
 | |
|         {
 | |
|             FloatCompFlag result;
 | |
|             if (b)
 | |
|                 result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1));
 | |
|             else
 | |
|                 result.m_values[0] = result.m_values[1] = _mm_setzero_ps();
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
 | |
|         {
 | |
|             Int16CompFlag result;
 | |
|             result.m_value = _mm_andnot_si128(b.m_value, a.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Int16CompFlag Not(const Int16CompFlag &b)
 | |
|         {
 | |
|             Int16CompFlag result;
 | |
|             result.m_value = _mm_xor_si128(b.m_value, _mm_set1_epi32(-1));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Int32CompFlag Not(const Int32CompFlag &b)
 | |
|         {
 | |
|             Int32CompFlag result;
 | |
|             result.m_values[0] = _mm_xor_si128(b.m_values[0], _mm_set1_epi32(-1));
 | |
|             result.m_values[1] = _mm_xor_si128(b.m_values[1], _mm_set1_epi32(-1));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)
 | |
|         {
 | |
|             __m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768)));
 | |
|             __m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768)));
 | |
| 
 | |
|             __m128i packed = _mm_packs_epi32(lo, hi);
 | |
| 
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768));
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)
 | |
|         {
 | |
|             __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
 | |
|             __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
 | |
| 
 | |
|             __m128i packed = _mm_packs_epi32(lo, hi);
 | |
| 
 | |
|             UInt15 result;
 | |
|             result.m_value = _mm_packs_epi32(lo, hi);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)
 | |
|         {
 | |
|             __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
 | |
|             __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
 | |
| 
 | |
|             __m128i packed = _mm_packs_epi32(lo, hi);
 | |
| 
 | |
|             SInt16 result;
 | |
|             result.m_value = _mm_packs_epi32(lo, hi);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float Sqrt(const Float &f)
 | |
|         {
 | |
|             Float result;
 | |
|             for (int i = 0; i < 2; i++)
 | |
|                 result.m_values[i] = _mm_sqrt_ps(f.m_values[i]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt16 Abs(const SInt16 &a)
 | |
|         {
 | |
|             __m128i signBitsXor = _mm_srai_epi16(a.m_value, 15);
 | |
|             __m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15);
 | |
| 
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float Abs(const Float& a)
 | |
|         {
 | |
|             __m128 invMask = _mm_set1_ps(-0.0f);
 | |
| 
 | |
|             Float result;
 | |
|             result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]);
 | |
|             result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
 | |
|         {
 | |
|             __m128i diff = _mm_sub_epi16(a.m_value, b.m_value);
 | |
| 
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_mullo_epi16(diff, diff);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
 | |
|         {
 | |
|             __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value));
 | |
| 
 | |
|             __m128i mulHi = _mm_mulhi_epu16(diffU, diffU);
 | |
|             __m128i mulLo = _mm_mullo_epi16(diffU, diffU);
 | |
|             __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi);
 | |
|             __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi);
 | |
| 
 | |
|             Float result;
 | |
|             result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo);
 | |
|             result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi);
 | |
| 
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float TwosCLHalfToFloat(const SInt16 &v)
 | |
|         {
 | |
|             __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15));
 | |
| 
 | |
|             __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));
 | |
|             __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff));
 | |
|             __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00));
 | |
| 
 | |
|             __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());
 | |
| 
 | |
|             // Convert exponent to high-bits 
 | |
|             exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));
 | |
| 
 | |
|             __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336)));
 | |
| 
 | |
|             __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));
 | |
|             __m128i lowBits = _mm_slli_epi16(mantissa, 13);
 | |
| 
 | |
|             __m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
 | |
|             __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);
 | |
| 
 | |
|             __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
 | |
|             __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
 | |
| 
 | |
|             Float result;
 | |
|             result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
 | |
|             result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
 | |
| 
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
 | |
|         {
 | |
|             Float fa = TwosCLHalfToFloat(a);
 | |
| 
 | |
|             Float diff = fa - b;
 | |
|             return diff * diff;
 | |
|         }
 | |
| 
 | |
|         static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
 | |
|         {
 | |
|             Float fa = TwosCLHalfToFloat(a);
 | |
|             Float fb = TwosCLHalfToFloat(b);
 | |
| 
 | |
|             Float diff = fa - fb;
 | |
|             return diff * diff;
 | |
|         }
 | |
| 
 | |
|         static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
 | |
|         {
 | |
|             Float fa = TwosCLHalfToFloat(a) * aWeight;
 | |
| 
 | |
|             Float diff = fa - b;
 | |
|             return diff * diff;
 | |
|         }
 | |
| 
 | |
|         static UInt16 RightShift(const UInt16 &v, int bits)
 | |
|         {
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_srli_epi16(v.m_value, bits);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt31 RightShift(const UInt31 &v, int bits)
 | |
|         {
 | |
|             UInt31 result;
 | |
|             result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits);
 | |
|             result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt16 RightShift(const SInt16 &v, int bits)
 | |
|         {
 | |
|             SInt16 result;
 | |
|             result.m_value = _mm_srai_epi16(v.m_value, bits);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt15 RightShift(const UInt15 &v, int bits)
 | |
|         {
 | |
|             UInt15 result;
 | |
|             result.m_value = _mm_srli_epi16(v.m_value, bits);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt32 RightShift(const SInt32 &v, int bits)
 | |
|         {
 | |
|             SInt32 result;
 | |
|             result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits);
 | |
|             result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt16 ToSInt16(const SInt32 &v)
 | |
|         {
 | |
|             SInt16 result;
 | |
|             result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt16 ToSInt16(const UInt16 &v)
 | |
|         {
 | |
|             SInt16 result;
 | |
|             result.m_value = v.m_value;
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt16 ToSInt16(const UInt15 &v)
 | |
|         {
 | |
|             SInt16 result;
 | |
|             result.m_value = v.m_value;
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt16 ToUInt16(const UInt32 &v)
 | |
|         {
 | |
|             __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
 | |
|             __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
 | |
| 
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_packs_epi32(low, high);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt16 ToUInt16(const UInt31 &v)
 | |
|         {
 | |
|             __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
 | |
|             __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
 | |
| 
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_packs_epi32(low, high);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt15 ToUInt15(const UInt31 &v)
 | |
|         {
 | |
|             UInt15 result;
 | |
|             result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt15 ToUInt15(const SInt16 &v)
 | |
|         {
 | |
|             UInt15 result;
 | |
|             result.m_value = v.m_value;
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt15 ToUInt15(const UInt16 &v)
 | |
|         {
 | |
|             UInt15 result;
 | |
|             result.m_value = v.m_value;
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
 | |
|         {
 | |
|             __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
 | |
|             __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
 | |
| 
 | |
|             SInt32 result;
 | |
|             result.m_values[0] = _mm_unpacklo_epi16(low, high);
 | |
|             result.m_values[1] = _mm_unpackhi_epi16(low, high);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
 | |
|         {
 | |
|             __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
 | |
|             __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
 | |
| 
 | |
|             SInt32 result;
 | |
|             result.m_values[0] = _mm_unpacklo_epi16(low, high);
 | |
|             result.m_values[1] = _mm_unpackhi_epi16(low, high);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
 | |
|         {
 | |
|             return XMultiply(b, a);
 | |
|         }
 | |
| 
 | |
|         static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
 | |
|         {
 | |
|             __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
 | |
|             __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
 | |
| 
 | |
|             UInt32 result;
 | |
|             result.m_values[0] = _mm_unpacklo_epi16(low, high);
 | |
|             result.m_values[1] = _mm_unpackhi_epi16(low, high);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
 | |
|         {
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
 | |
|         {
 | |
|             UInt16 result;
 | |
|             result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b)
 | |
|         {
 | |
|             SInt16 result;
 | |
|             result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b)
 | |
|         {
 | |
|             SInt16 result;
 | |
|             result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
 | |
|         {
 | |
|             __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
 | |
|             __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
 | |
| 
 | |
|             UInt31 result;
 | |
|             result.m_values[0] = _mm_unpacklo_epi16(low, high);
 | |
|             result.m_values[1] = _mm_unpackhi_epi16(low, high);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
 | |
|         {
 | |
|             __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
 | |
|             __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
 | |
| 
 | |
|             UInt31 result;
 | |
|             result.m_values[0] = _mm_unpacklo_epi16(low, high);
 | |
|             result.m_values[1] = _mm_unpackhi_epi16(low, high);
 | |
|             return result;
 | |
|         }
 | |
| 
 | |
|         static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
 | |
|         {
 | |
|             return XMultiply(b, a);
 | |
|         }
 | |
| 
 | |
|         static bool AnySet(const Int16CompFlag &v)
 | |
|         {
 | |
|             return _mm_movemask_epi8(v.m_value) != 0;
 | |
|         }
 | |
| 
 | |
|         static bool AllSet(const Int16CompFlag &v)
 | |
|         {
 | |
|             return _mm_movemask_epi8(v.m_value) == 0xffff;
 | |
|         }
 | |
| 
 | |
|         static bool AnySet(const FloatCompFlag &v)
 | |
|         {
 | |
|             return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0;
 | |
|         }
 | |
| 
 | |
|         static bool AllSet(const FloatCompFlag &v)
 | |
|         {
 | |
|             return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf;
 | |
|         }
 | |
|     };
 | |
| 
 | |
| #else
 | |
|     // Scalar version
 | |
|     struct ParallelMath
 | |
|     {
 | |
|         struct RoundTowardZeroForScope
 | |
|         {
 | |
|         };
 | |
| 
 | |
|         struct RoundTowardNearestForScope
 | |
|         {
 | |
|         };
 | |
| 
 | |
|         struct RoundUpForScope
 | |
|         {
 | |
|         };
 | |
| 
 | |
|         struct RoundDownForScope
 | |
|         {
 | |
|         };
 | |
| 
 | |
|         static const int ParallelSize = 1;
 | |
| 
 | |
|         enum Int16Subtype
 | |
|         {
 | |
|             IntSubtype_Signed,
 | |
|             IntSubtype_UnsignedFull,
 | |
|             IntSubtype_UnsignedTruncated,
 | |
|             IntSubtype_Abstract,
 | |
|         };
 | |
| 
 | |
|         typedef int32_t SInt16;
 | |
|         typedef int32_t UInt15;
 | |
|         typedef int32_t UInt16;
 | |
|         typedef int32_t AInt16;
 | |
| 
 | |
|         typedef int32_t SInt32;
 | |
|         typedef int32_t UInt31;
 | |
|         typedef int32_t UInt32;
 | |
|         typedef int32_t AInt32;
 | |
| 
 | |
|         typedef int32_t ScalarUInt16;
 | |
|         typedef int32_t ScalarSInt16;
 | |
| 
 | |
|         typedef float Float;
 | |
| 
 | |
|         template<class TTargetType>
 | |
|         struct LosslessCast
 | |
|         {
 | |
|             static const int32_t& Cast(const int32_t &src)
 | |
|             {
 | |
|                 return src;
 | |
|             }
 | |
|         };
 | |
| 
 | |
|         typedef bool Int16CompFlag;
 | |
|         typedef bool FloatCompFlag;
 | |
| 
 | |
|         static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
 | |
|         {
 | |
|             return a + b;
 | |
|         }
 | |
| 
 | |
|         static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
 | |
|         {
 | |
|             return a - b;
 | |
|         }
 | |
| 
 | |
|         static float Select(bool flag, float a, float b)
 | |
|         {
 | |
|             return flag ? a : b;
 | |
|         }
 | |
| 
 | |
|         static int32_t Select(bool flag, int32_t a, int32_t b)
 | |
|         {
 | |
|             return flag ? a : b;
 | |
|         }
 | |
| 
 | |
|         static int32_t SelectOrZero(bool flag, int32_t a)
 | |
|         {
 | |
|             return flag ? a : 0;
 | |
|         }
 | |
| 
 | |
|         static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
 | |
|         {
 | |
|             if (flag)
 | |
|                 dest = src;
 | |
|         }
 | |
| 
 | |
|         static void ConditionalSet(bool& dest, bool flag, bool src)
 | |
|         {
 | |
|             if (flag)
 | |
|                 dest = src;
 | |
|         }
 | |
| 
 | |
|         static int32_t ConditionalNegate(bool flag, int32_t v)
 | |
|         {
 | |
|             return (flag) ? -v : v;
 | |
|         }
 | |
| 
 | |
|         static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
 | |
|         {
 | |
|             if (!flag)
 | |
|                 dest = src;
 | |
|         }
 | |
| 
 | |
|         static void ConditionalSet(float& dest, bool flag, float src)
 | |
|         {
 | |
|             if (flag)
 | |
|                 dest = src;
 | |
|         }
 | |
| 
 | |
|         static void NotConditionalSet(float& dest, bool flag, float src)
 | |
|         {
 | |
|             if (!flag)
 | |
|                 dest = src;
 | |
|         }
 | |
| 
 | |
|         static void MakeSafeDenominator(float& v)
 | |
|         {
 | |
|             if (v == 0.0f)
 | |
|                 v = 1.0f;
 | |
|         }
 | |
| 
 | |
|         static int32_t SignedRightShift(int32_t v, int bits)
 | |
|         {
 | |
|             return v >> bits;
 | |
|         }
 | |
| 
 | |
|         static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
 | |
|         {
 | |
|             v = (v << (32 - precision)) & 0xffffffff;
 | |
|             return SignedRightShift(v, 32 - precision);
 | |
|         }
 | |
| 
 | |
|         static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
 | |
|         {
 | |
|             return v & ((1 << precision) - 1);
 | |
|         }
 | |
| 
 | |
|         static int32_t Min(int32_t a, int32_t b)
 | |
|         {
 | |
|             if (a < b)
 | |
|                 return a;
 | |
|             return b;
 | |
|         }
 | |
| 
 | |
|         static float Min(float a, float b)
 | |
|         {
 | |
|             if (a < b)
 | |
|                 return a;
 | |
|             return b;
 | |
|         }
 | |
| 
 | |
|         static int32_t Max(int32_t a, int32_t b)
 | |
|         {
 | |
|             if (a > b)
 | |
|                 return a;
 | |
|             return b;
 | |
|         }
 | |
| 
 | |
|         static float Max(float a, float b)
 | |
|         {
 | |
|             if (a > b)
 | |
|                 return a;
 | |
|             return b;
 | |
|         }
 | |
| 
 | |
|         static float Abs(float a)
 | |
|         {
 | |
|             return fabsf(a);
 | |
|         }
 | |
| 
 | |
|         static int32_t Abs(int32_t a)
 | |
|         {
 | |
|             if (a < 0)
 | |
|                 return -a;
 | |
|             return a;
 | |
|         }
 | |
| 
 | |
|         static float Clamp(float v, float min, float max)
 | |
|         {
 | |
|             if (v < min)
 | |
|                 return min;
 | |
|             if (v > max)
 | |
|                 return max;
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static float Reciprocal(float v)
 | |
|         {
 | |
|             return 1.0f / v;
 | |
|         }
 | |
| 
 | |
|         static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
 | |
|         {
 | |
|             chOut = inputBlocks[0].m_pixels[pxOffset][channel];
 | |
|         }
 | |
| 
 | |
|         static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
 | |
|         {
 | |
|             chOut = inputBlocks[0].m_pixels[pxOffset][channel];
 | |
|         }
 | |
| 
 | |
|         static float MakeFloat(float v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static float MakeFloatZero()
 | |
|         {
 | |
|             return 0.0f;
 | |
|         }
 | |
| 
 | |
|         static int32_t MakeUInt16(uint16_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static int32_t MakeSInt16(int16_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static int32_t MakeAInt16(int16_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static int32_t MakeUInt15(uint16_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static int32_t MakeSInt32(int32_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static int32_t MakeUInt31(int32_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static int32_t Extract(int32_t v, int offset)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(offset);
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static bool Extract(bool v, int offset)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(offset);
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static float Extract(float v, int offset)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(offset);
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(offset);
 | |
|             dest = v;
 | |
|         }
 | |
| 
 | |
|         static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(offset);
 | |
|             dest = v;
 | |
|         }
 | |
| 
 | |
|         static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(offset);
 | |
|             dest = v;
 | |
|         }
 | |
| 
 | |
|         static float ExtractFloat(float v, int offset)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(offset);
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static void PutFloat(float &dest, int offset, float v)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(offset);
 | |
|             dest = v;
 | |
|         }
 | |
| 
 | |
|         static void PutBoolInt16(bool &dest, int offset, bool v)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(offset);
 | |
|             dest = v;
 | |
|         }
 | |
| 
 | |
|         static bool Less(int32_t a, int32_t b)
 | |
|         {
 | |
|             return a < b;
 | |
|         }
 | |
| 
 | |
|         static bool Less(float a, float b)
 | |
|         {
 | |
|             return a < b;
 | |
|         }
 | |
| 
 | |
|         static bool LessOrEqual(int32_t a, int32_t b)
 | |
|         {
 | |
|             return a < b;
 | |
|         }
 | |
| 
 | |
|         static bool LessOrEqual(float a, float b)
 | |
|         {
 | |
|             return a < b;
 | |
|         }
 | |
| 
 | |
|         static bool Equal(int32_t a, int32_t b)
 | |
|         {
 | |
|             return a == b;
 | |
|         }
 | |
| 
 | |
|         static bool Equal(float a, float b)
 | |
|         {
 | |
|             return a == b;
 | |
|         }
 | |
| 
 | |
|         static float ToFloat(int32_t v)
 | |
|         {
 | |
|             return static_cast<float>(v);
 | |
|         }
 | |
| 
 | |
|         static int32_t ToUInt31(int32_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static int32_t ToInt32(int32_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static bool FloatFlagToInt16(bool v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static bool Int32FlagToInt16(bool v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static bool Int16FlagToFloat(bool v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static bool MakeBoolInt16(bool b)
 | |
|         {
 | |
|             return b;
 | |
|         }
 | |
| 
 | |
|         static bool MakeBoolFloat(bool b)
 | |
|         {
 | |
|             return b;
 | |
|         }
 | |
| 
 | |
|         static bool AndNot(bool a, bool b)
 | |
|         {
 | |
|             return a && !b;
 | |
|         }
 | |
| 
 | |
|         static bool Not(bool b)
 | |
|         {
 | |
|             return !b;
 | |
|         }
 | |
| 
 | |
|         static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(rtz);
 | |
|             return static_cast<int>(v);
 | |
|         }
 | |
| 
 | |
|         static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(ru);
 | |
|             return static_cast<int>(ceilf(v));
 | |
|         }
 | |
| 
 | |
|         static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(rd);
 | |
|             return static_cast<int>(floorf(v));
 | |
|         }
 | |
| 
 | |
|         static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
 | |
|         {
 | |
|             UNREFERENCED_PARAMETER(rtn);
 | |
|             return static_cast<int>(floorf(v + 0.5f));
 | |
|         }
 | |
| 
 | |
|         template<class TRoundMode>
 | |
|         static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
 | |
|         {
 | |
|             return RoundAndConvertToInt(v, roundingMode);
 | |
|         }
 | |
| 
 | |
|         template<class TRoundMode>
 | |
|         static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
 | |
|         {
 | |
|             return RoundAndConvertToInt(v, roundingMode);
 | |
|         }
 | |
| 
 | |
|         template<class TRoundMode>
 | |
|         static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
 | |
|         {
 | |
|             return RoundAndConvertToInt(v, roundingMode);
 | |
|         }
 | |
| 
 | |
|         static float Sqrt(float f)
 | |
|         {
 | |
|             return sqrtf(f);
 | |
|         }
 | |
| 
 | |
|         static int32_t SqDiffUInt8(int32_t a, int32_t b)
 | |
|         {
 | |
|             int32_t delta = a - b;
 | |
|             return delta * delta;
 | |
|         }
 | |
| 
 | |
|         static int32_t SqDiffInt16(int32_t a, int32_t b)
 | |
|         {
 | |
|             int32_t delta = a - b;
 | |
|             return delta * delta;
 | |
|         }
 | |
| 
 | |
|         static int32_t SqDiffSInt16(int32_t a, int32_t b)
 | |
|         {
 | |
|             int32_t delta = a - b;
 | |
|             return delta * delta;
 | |
|         }
 | |
| 
 | |
|         static float TwosCLHalfToFloat(int32_t v)
 | |
|         {
 | |
|             int32_t absV = (v < 0) ? -v : v;
 | |
| 
 | |
|             int32_t signBits = (absV & -32768);
 | |
|             int32_t mantissa = (absV & 0x03ff);
 | |
|             int32_t exponent = (absV & 0x7c00);
 | |
| 
 | |
|             bool isDenormal = (exponent == 0);
 | |
| 
 | |
|             // Convert exponent to high-bits
 | |
|             exponent = (exponent >> 3) + 14336;
 | |
| 
 | |
|             int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
 | |
| 
 | |
|             int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
 | |
| 
 | |
|             float f, correction;
 | |
|             memcpy(&f, &fBits, 4);
 | |
|             memcpy(&correction, &denormalCorrection, 4);
 | |
| 
 | |
|             return f - correction;
 | |
|         }
 | |
| 
 | |
|         static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
 | |
|         {
 | |
|             Float fa = TwosCLHalfToFloat(a);
 | |
| 
 | |
|             Float diff = fa - b;
 | |
|             return diff * diff;
 | |
|         }
 | |
| 
 | |
|         static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
 | |
|         {
 | |
|             Float fa = TwosCLHalfToFloat(a);
 | |
|             Float fb = TwosCLHalfToFloat(b);
 | |
| 
 | |
|             Float diff = fa - fb;
 | |
|             return diff * diff;
 | |
|         }
 | |
| 
 | |
|         static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
 | |
|         {
 | |
|             Float fa = TwosCLHalfToFloat(a) * aWeight;
 | |
| 
 | |
|             Float diff = fa - b;
 | |
|             return diff * diff;
 | |
|         }
 | |
| 
 | |
|         static int32_t RightShift(int32_t v, int bits)
 | |
|         {
 | |
|             return SignedRightShift(v, bits);
 | |
|         }
 | |
| 
 | |
|         static int32_t ToSInt16(int32_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static int32_t ToUInt16(int32_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static int32_t ToUInt15(int32_t v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static int32_t XMultiply(int32_t a, int32_t b)
 | |
|         {
 | |
|             return a * b;
 | |
|         }
 | |
| 
 | |
|         static int32_t CompactMultiply(int32_t a, int32_t b)
 | |
|         {
 | |
|             return a * b;
 | |
|         }
 | |
| 
 | |
|         static bool AnySet(bool v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
| 
 | |
|         static bool AllSet(bool v)
 | |
|         {
 | |
|             return v;
 | |
|         }
 | |
|     };
 | |
| 
 | |
| #endif
 | |
| }
 | |
| 
 | |
| #endif
 | 
