mirror of
				https://github.com/godotengine/godot.git
				synced 2025-11-04 07:31:16 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			7586 lines
		
	
	
	
		
			365 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			7586 lines
		
	
	
	
		
			365 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
/*
 | 
						|
Convection Texture Tools
 | 
						|
Copyright (c) 2018 Eric Lasota
 | 
						|
 | 
						|
Permission is hereby granted, free of charge, to any person obtaining
 | 
						|
a copy of this software and associated documentation files (the
 | 
						|
"Software"), to deal in the Software without restriction, including
 | 
						|
without limitation the rights to use, copy, modify, merge, publish,
 | 
						|
distribute, sublicense, and/or sell copies of the Software, and to
 | 
						|
permit persons to whom the Software is furnished to do so, subject
 | 
						|
to the following conditions:
 | 
						|
 | 
						|
The above copyright notice and this permission notice shall be included
 | 
						|
in all copies or substantial portions of the Software.
 | 
						|
 | 
						|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 | 
						|
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 | 
						|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 | 
						|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 | 
						|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 | 
						|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 | 
						|
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 | 
						|
 | 
						|
-------------------------------------------------------------------------------------
 | 
						|
 | 
						|
Portions based on DirectX Texture Library (DirectXTex)
 | 
						|
 | 
						|
Copyright (c) Microsoft Corporation. All rights reserved.
 | 
						|
Licensed under the MIT License.
 | 
						|
 | 
						|
http://go.microsoft.com/fwlink/?LinkId=248926
 | 
						|
*/
 | 
						|
#include "ConvectionKernels.h"
 | 
						|
#include "ConvectionKernels_BC7_SingleColor.h"
 | 
						|
 | 
						|
#if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(_M_X64) || defined(__SSE2__)
 | 
						|
#define CVTT_USE_SSE2
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef CVTT_USE_SSE2
 | 
						|
#include <emmintrin.h>
 | 
						|
#endif
 | 
						|
 | 
						|
#include <float.h>
 | 
						|
#include <assert.h>
 | 
						|
#include <string.h>
 | 
						|
#include <algorithm>
 | 
						|
#include <math.h>
 | 
						|
 | 
						|
#define UNREFERENCED_PARAMETER(n) ((void)n)
 | 
						|
 | 
						|
namespace cvtt
 | 
						|
{
 | 
						|
#ifdef CVTT_USE_SSE2
 | 
						|
    // SSE2 version
 | 
						|
    struct ParallelMath
 | 
						|
    {
 | 
						|
        typedef uint16_t ScalarUInt16;
 | 
						|
        typedef int16_t ScalarSInt16;
 | 
						|
 | 
						|
        template<unsigned int TRoundingMode>
 | 
						|
        struct RoundForScope
 | 
						|
        {
 | 
						|
            unsigned int m_oldCSR;
 | 
						|
 | 
						|
            RoundForScope()
 | 
						|
            {
 | 
						|
                m_oldCSR = _mm_getcsr();
 | 
						|
                _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode));
 | 
						|
            }
 | 
						|
 | 
						|
            ~RoundForScope()
 | 
						|
            {
 | 
						|
                _mm_setcsr(m_oldCSR);
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO>
 | 
						|
        {
 | 
						|
        };
 | 
						|
 | 
						|
        struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST>
 | 
						|
        {
 | 
						|
        };
 | 
						|
 | 
						|
        struct RoundUpForScope : RoundForScope<_MM_ROUND_UP>
 | 
						|
        {
 | 
						|
        };
 | 
						|
 | 
						|
        struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN>
 | 
						|
        {
 | 
						|
        };
 | 
						|
 | 
						|
        static const int ParallelSize = 8;
 | 
						|
 | 
						|
        enum Int16Subtype
 | 
						|
        {
 | 
						|
            IntSubtype_Signed,
 | 
						|
            IntSubtype_UnsignedFull,
 | 
						|
            IntSubtype_UnsignedTruncated,
 | 
						|
            IntSubtype_Abstract,
 | 
						|
        };
 | 
						|
 | 
						|
        template<int TSubtype>
 | 
						|
        struct VInt16
 | 
						|
        {
 | 
						|
            __m128i m_value;
 | 
						|
 | 
						|
            inline VInt16 operator+(int16_t other) const
 | 
						|
            {
 | 
						|
                VInt16 result;
 | 
						|
                result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline VInt16 operator+(const VInt16 &other) const
 | 
						|
            {
 | 
						|
                VInt16 result;
 | 
						|
                result.m_value = _mm_add_epi16(m_value, other.m_value);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline VInt16 operator|(const VInt16 &other) const
 | 
						|
            {
 | 
						|
                VInt16 result;
 | 
						|
                result.m_value = _mm_or_si128(m_value, other.m_value);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline VInt16 operator&(const VInt16 &other) const
 | 
						|
            {
 | 
						|
                VInt16 result;
 | 
						|
                result.m_value = _mm_and_si128(m_value, other.m_value);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline VInt16 operator-(const VInt16 &other) const
 | 
						|
            {
 | 
						|
                VInt16 result;
 | 
						|
                result.m_value = _mm_sub_epi16(m_value, other.m_value);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline VInt16 operator<<(int bits) const
 | 
						|
            {
 | 
						|
                VInt16 result;
 | 
						|
                result.m_value = _mm_slli_epi16(m_value, bits);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        typedef VInt16<IntSubtype_Signed> SInt16;
 | 
						|
        typedef VInt16<IntSubtype_UnsignedFull> UInt16;
 | 
						|
        typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
 | 
						|
        typedef VInt16<IntSubtype_Abstract> AInt16;
 | 
						|
 | 
						|
        template<int TSubtype>
 | 
						|
        struct VInt32
 | 
						|
        {
 | 
						|
            __m128i m_values[2];
 | 
						|
 | 
						|
            inline VInt32 operator+(const VInt32& other) const
 | 
						|
            {
 | 
						|
                VInt32 result;
 | 
						|
                result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
 | 
						|
                result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline VInt32 operator-(const VInt32& other) const
 | 
						|
            {
 | 
						|
                VInt32 result;
 | 
						|
                result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
 | 
						|
                result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline VInt32 operator<<(const int other) const
 | 
						|
            {
 | 
						|
                VInt32 result;
 | 
						|
                result.m_values[0] = _mm_slli_epi32(m_values[0], other);
 | 
						|
                result.m_values[1] = _mm_slli_epi32(m_values[1], other);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        typedef VInt32<IntSubtype_Signed> SInt32;
 | 
						|
        typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
 | 
						|
        typedef VInt32<IntSubtype_UnsignedFull> UInt32;
 | 
						|
        typedef VInt32<IntSubtype_Abstract> AInt32;
 | 
						|
 | 
						|
        template<class TTargetType>
 | 
						|
        struct LosslessCast
 | 
						|
        {
 | 
						|
#ifdef CVTT_PERMIT_ALIASING
 | 
						|
            template<int TSrcSubtype>
 | 
						|
            static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
 | 
						|
            {
 | 
						|
                return reinterpret_cast<VInt32<TSubtype>&>(src);
 | 
						|
            }
 | 
						|
 | 
						|
            template<int TSrcSubtype>
 | 
						|
            static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
 | 
						|
            {
 | 
						|
                return reinterpret_cast<VInt16<TSubtype>&>(src);
 | 
						|
            }
 | 
						|
#else
 | 
						|
            template<int TSrcSubtype>
 | 
						|
            static TTargetType Cast(const VInt32<TSrcSubtype> &src)
 | 
						|
            {
 | 
						|
                TTargetType result;
 | 
						|
                result.m_values[0] = src.m_values[0];
 | 
						|
                result.m_values[1] = src.m_values[1];
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            template<int TSrcSubtype>
 | 
						|
            static TTargetType Cast(const VInt16<TSrcSubtype> &src)
 | 
						|
            {
 | 
						|
                TTargetType result;
 | 
						|
                result.m_value = src.m_value;
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
#endif
 | 
						|
        };
 | 
						|
 | 
						|
        struct Int64
 | 
						|
        {
 | 
						|
            __m128i m_values[4];
 | 
						|
        };
 | 
						|
 | 
						|
        struct Float
 | 
						|
        {
 | 
						|
            __m128 m_values[2];
 | 
						|
 | 
						|
            inline Float operator+(const Float &other) const
 | 
						|
            {
 | 
						|
                Float result;
 | 
						|
                result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);
 | 
						|
                result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline Float operator+(float other) const
 | 
						|
            {
 | 
						|
                Float result;
 | 
						|
                result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));
 | 
						|
                result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline Float operator-(const Float& other) const
 | 
						|
            {
 | 
						|
                Float result;
 | 
						|
                result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);
 | 
						|
                result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline Float operator-() const
 | 
						|
            {
 | 
						|
                Float result;
 | 
						|
                result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);
 | 
						|
                result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline Float operator*(const Float& other) const
 | 
						|
            {
 | 
						|
                Float result;
 | 
						|
                result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);
 | 
						|
                result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline Float operator*(float other) const
 | 
						|
            {
 | 
						|
                Float result;
 | 
						|
                result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));
 | 
						|
                result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline Float operator/(const Float &other) const
 | 
						|
            {
 | 
						|
                Float result;
 | 
						|
                result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);
 | 
						|
                result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline Float operator/(float other) const
 | 
						|
            {
 | 
						|
                Float result;
 | 
						|
                result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));
 | 
						|
                result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        struct Int16CompFlag
 | 
						|
        {
 | 
						|
            __m128i m_value;
 | 
						|
 | 
						|
            inline Int16CompFlag operator&(const Int16CompFlag &other) const
 | 
						|
            {
 | 
						|
                Int16CompFlag result;
 | 
						|
                result.m_value = _mm_and_si128(m_value, other.m_value);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
 | 
						|
            inline Int16CompFlag operator|(const Int16CompFlag &other) const
 | 
						|
            {
 | 
						|
                Int16CompFlag result;
 | 
						|
                result.m_value = _mm_or_si128(m_value, other.m_value);
 | 
						|
                return result;
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        struct FloatCompFlag
 | 
						|
        {
 | 
						|
            __m128 m_values[2];
 | 
						|
        };
 | 
						|
 | 
						|
        template<int TSubtype>
 | 
						|
        static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
 | 
						|
        {
 | 
						|
            VInt16<TSubtype> result;
 | 
						|
            result.m_value = _mm_add_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        template<int TSubtype>
 | 
						|
        static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
 | 
						|
        {
 | 
						|
            VInt16<TSubtype> result;
 | 
						|
            result.m_value = _mm_sub_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
 | 
						|
        {
 | 
						|
            Float result;
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i]));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        template<int TSubtype>
 | 
						|
        static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
 | 
						|
        {
 | 
						|
            VInt16<TSubtype> result;
 | 
						|
            result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        template<int TSubtype>
 | 
						|
        static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
 | 
						|
        {
 | 
						|
            VInt16<TSubtype> result;
 | 
						|
            result.m_value = _mm_and_si128(flag.m_value, a.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        template<int TSubtype>
 | 
						|
        static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
 | 
						|
        {
 | 
						|
            dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
 | 
						|
        {
 | 
						|
            SInt16 result;
 | 
						|
            result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        template<int TSubtype>
 | 
						|
        static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
 | 
						|
        {
 | 
						|
            dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));
 | 
						|
        }
 | 
						|
 | 
						|
        static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
 | 
						|
        {
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i]));
 | 
						|
        }
 | 
						|
 | 
						|
        static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
 | 
						|
        {
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i]));
 | 
						|
        }
 | 
						|
 | 
						|
        static void MakeSafeDenominator(Float& v)
 | 
						|
        {
 | 
						|
            ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f));
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
 | 
						|
        {
 | 
						|
            int lostBits = 16 - precision;
 | 
						|
            if (lostBits == 0)
 | 
						|
                return v;
 | 
						|
 | 
						|
            SInt16 result;
 | 
						|
            result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
 | 
						|
        {
 | 
						|
            int lostBits = 16 - precision;
 | 
						|
            if (lostBits == 0)
 | 
						|
                return v;
 | 
						|
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 Min(const UInt16 &a, const UInt16 &b)
 | 
						|
        {
 | 
						|
            __m128i bitFlip = _mm_set1_epi16(-32768);
 | 
						|
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt16 Min(const SInt16 &a, const SInt16 &b)
 | 
						|
        {
 | 
						|
            SInt16 result;
 | 
						|
            result.m_value = _mm_min_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt15 Min(const UInt15 &a, const UInt15 &b)
 | 
						|
        {
 | 
						|
            UInt15 result;
 | 
						|
            result.m_value = _mm_min_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float Min(const Float &a, const Float &b)
 | 
						|
        {
 | 
						|
            Float result;
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 Max(const UInt16 &a, const UInt16 &b)
 | 
						|
        {
 | 
						|
            __m128i bitFlip = _mm_set1_epi16(-32768);
 | 
						|
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt16 Max(const SInt16 &a, const SInt16 &b)
 | 
						|
        {
 | 
						|
            SInt16 result;
 | 
						|
            result.m_value = _mm_max_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt15 Max(const UInt15 &a, const UInt15 &b)
 | 
						|
        {
 | 
						|
            UInt15 result;
 | 
						|
            result.m_value = _mm_max_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float Max(const Float &a, const Float &b)
 | 
						|
        {
 | 
						|
            Float result;
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float Clamp(const Float &v, float min, float max)
 | 
						|
        {
 | 
						|
            Float result;
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float Reciprocal(const Float &v)
 | 
						|
        {
 | 
						|
            Float result;
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                result.m_values[i] = _mm_rcp_ps(v.m_values[i]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
 | 
						|
        {
 | 
						|
            int16_t values[8];
 | 
						|
            for (int i = 0; i < 8; i++)
 | 
						|
                values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
 | 
						|
 | 
						|
            chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
 | 
						|
        }
 | 
						|
 | 
						|
        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
 | 
						|
        {
 | 
						|
            int16_t values[8];
 | 
						|
            for (int i = 0; i < 8; i++)
 | 
						|
                values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
 | 
						|
 | 
						|
            chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
 | 
						|
        }
 | 
						|
 | 
						|
        static Float MakeFloat(float v)
 | 
						|
        {
 | 
						|
            Float f;
 | 
						|
            f.m_values[0] = f.m_values[1] = _mm_set1_ps(v);
 | 
						|
            return f;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float MakeFloatZero()
 | 
						|
        {
 | 
						|
            Float f;
 | 
						|
            f.m_values[0] = f.m_values[1] = _mm_setzero_ps();
 | 
						|
            return f;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 MakeUInt16(uint16_t v)
 | 
						|
        {
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_set1_epi16(static_cast<short>(v));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt16 MakeSInt16(int16_t v)
 | 
						|
        {
 | 
						|
            SInt16 result;
 | 
						|
            result.m_value = _mm_set1_epi16(static_cast<short>(v));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static AInt16 MakeAInt16(int16_t v)
 | 
						|
        {
 | 
						|
            AInt16 result;
 | 
						|
            result.m_value = _mm_set1_epi16(static_cast<short>(v));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt15 MakeUInt15(uint16_t v)
 | 
						|
        {
 | 
						|
            UInt15 result;
 | 
						|
            result.m_value = _mm_set1_epi16(static_cast<short>(v));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt32 MakeSInt32(int32_t v)
 | 
						|
        {
 | 
						|
            SInt32 result;
 | 
						|
            result.m_values[0] = _mm_set1_epi32(v);
 | 
						|
            result.m_values[1] = _mm_set1_epi32(v);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt31 MakeUInt31(uint32_t v)
 | 
						|
        {
 | 
						|
            UInt31 result;
 | 
						|
            result.m_values[0] = _mm_set1_epi32(v);
 | 
						|
            result.m_values[1] = _mm_set1_epi32(v);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static uint16_t Extract(const UInt16 &v, int offset)
 | 
						|
        {
 | 
						|
            return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
 | 
						|
        }
 | 
						|
 | 
						|
        static int16_t Extract(const SInt16 &v, int offset)
 | 
						|
        {
 | 
						|
            return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
 | 
						|
        }
 | 
						|
 | 
						|
        static uint16_t Extract(const UInt15 &v, int offset)
 | 
						|
        {
 | 
						|
            return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
 | 
						|
        }
 | 
						|
 | 
						|
        static int16_t Extract(const AInt16 &v, int offset)
 | 
						|
        {
 | 
						|
            return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
 | 
						|
        }
 | 
						|
 | 
						|
        static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
 | 
						|
        {
 | 
						|
            reinterpret_cast<uint16_t*>(&dest)[offset] = v;
 | 
						|
        }
 | 
						|
 | 
						|
        static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
 | 
						|
        {
 | 
						|
            reinterpret_cast<uint16_t*>(&dest)[offset] = v;
 | 
						|
        }
 | 
						|
 | 
						|
        static void PutSInt16(SInt16 &dest, int offset, int16_t v)
 | 
						|
        {
 | 
						|
            reinterpret_cast<int16_t*>(&dest)[offset] = v;
 | 
						|
        }
 | 
						|
 | 
						|
        static float ExtractFloat(const Float& v, int offset)
 | 
						|
        {
 | 
						|
            return reinterpret_cast<const float*>(&v)[offset];
 | 
						|
        }
 | 
						|
 | 
						|
        static void PutFloat(Float &dest, int offset, float v)
 | 
						|
        {
 | 
						|
            reinterpret_cast<float*>(&dest)[offset] = v;
 | 
						|
        }
 | 
						|
 | 
						|
        static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
 | 
						|
        {
 | 
						|
            Int16CompFlag result;
 | 
						|
            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
 | 
						|
        {
 | 
						|
            Int16CompFlag result;
 | 
						|
            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
 | 
						|
        {
 | 
						|
            Int16CompFlag result;
 | 
						|
            result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static FloatCompFlag Less(const Float &a, const Float &b)
 | 
						|
        {
 | 
						|
            FloatCompFlag result;
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
 | 
						|
        {
 | 
						|
            FloatCompFlag result;
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        template<int TSubtype>
 | 
						|
        static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
 | 
						|
        {
 | 
						|
            Int16CompFlag result;
 | 
						|
            result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static FloatCompFlag Equal(const Float &a, const Float &b)
 | 
						|
        {
 | 
						|
            FloatCompFlag result;
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float ToFloat(const UInt16 &v)
 | 
						|
        {
 | 
						|
            Float result;
 | 
						|
            result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
 | 
						|
            result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt31 ToUInt31(const UInt16 &v)
 | 
						|
        {
 | 
						|
            UInt31 result;
 | 
						|
            result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
 | 
						|
            result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt32 ToInt32(const UInt16 &v)
 | 
						|
        {
 | 
						|
            SInt32 result;
 | 
						|
            result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
 | 
						|
            result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt32 ToInt32(const SInt16 &v)
 | 
						|
        {
 | 
						|
            SInt32 result;
 | 
						|
            result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16);
 | 
						|
            result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float ToFloat(const SInt16 &v)
 | 
						|
        {
 | 
						|
            Float result;
 | 
						|
            result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16));
 | 
						|
            result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float ToFloat(const UInt15 &v)
 | 
						|
        {
 | 
						|
            Float result;
 | 
						|
            result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
 | 
						|
            result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float ToFloat(const UInt31 &v)
 | 
						|
        {
 | 
						|
            Float result;
 | 
						|
            result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]);
 | 
						|
            result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
 | 
						|
        {
 | 
						|
            __m128i lo = _mm_castps_si128(v.m_values[0]);
 | 
						|
            __m128i hi = _mm_castps_si128(v.m_values[1]);
 | 
						|
 | 
						|
            Int16CompFlag result;
 | 
						|
            result.m_value = _mm_packs_epi32(lo, hi);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
 | 
						|
        {
 | 
						|
            __m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value);
 | 
						|
            __m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value);
 | 
						|
 | 
						|
            FloatCompFlag result;
 | 
						|
            result.m_values[0] = _mm_castsi128_ps(lo);
 | 
						|
            result.m_values[1] = _mm_castsi128_ps(hi);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Int16CompFlag MakeBoolInt16(bool b)
 | 
						|
        {
 | 
						|
            Int16CompFlag result;
 | 
						|
            if (b)
 | 
						|
                result.m_value = _mm_set1_epi16(-1);
 | 
						|
            else
 | 
						|
                result.m_value = _mm_setzero_si128();
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static FloatCompFlag MakeBoolFloat(bool b)
 | 
						|
        {
 | 
						|
            FloatCompFlag result;
 | 
						|
            if (b)
 | 
						|
                result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1));
 | 
						|
            else
 | 
						|
                result.m_values[0] = result.m_values[1] = _mm_setzero_ps();
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
 | 
						|
        {
 | 
						|
            Int16CompFlag result;
 | 
						|
            result.m_value = _mm_andnot_si128(b.m_value, a.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)
 | 
						|
        {
 | 
						|
            __m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768)));
 | 
						|
            __m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768)));
 | 
						|
 | 
						|
            __m128i packed = _mm_packs_epi32(lo, hi);
 | 
						|
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768));
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)
 | 
						|
        {
 | 
						|
            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
 | 
						|
            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
 | 
						|
 | 
						|
            __m128i packed = _mm_packs_epi32(lo, hi);
 | 
						|
 | 
						|
            UInt15 result;
 | 
						|
            result.m_value = _mm_packs_epi32(lo, hi);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)
 | 
						|
        {
 | 
						|
            __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
 | 
						|
            __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
 | 
						|
 | 
						|
            __m128i packed = _mm_packs_epi32(lo, hi);
 | 
						|
 | 
						|
            SInt16 result;
 | 
						|
            result.m_value = _mm_packs_epi32(lo, hi);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float Sqrt(const Float &f)
 | 
						|
        {
 | 
						|
            Float result;
 | 
						|
            for (int i = 0; i < 2; i++)
 | 
						|
                result.m_values[i] = _mm_sqrt_ps(f.m_values[i]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 Abs(const SInt16 &a)
 | 
						|
        {
 | 
						|
            __m128i signBitsXor = _mm_srai_epi16(a.m_value, 15);
 | 
						|
            __m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15);
 | 
						|
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float Abs(const Float& a)
 | 
						|
        {
 | 
						|
            __m128 invMask = _mm_set1_ps(-0.0f);
 | 
						|
 | 
						|
            Float result;
 | 
						|
            result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]);
 | 
						|
            result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
 | 
						|
        {
 | 
						|
            __m128i diff = _mm_sub_epi16(a.m_value, b.m_value);
 | 
						|
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_mullo_epi16(diff, diff);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
 | 
						|
        {
 | 
						|
            __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value));
 | 
						|
 | 
						|
            __m128i mulHi = _mm_mulhi_epu16(diffU, diffU);
 | 
						|
            __m128i mulLo = _mm_mullo_epi16(diffU, diffU);
 | 
						|
            __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi);
 | 
						|
            __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi);
 | 
						|
 | 
						|
            Float result;
 | 
						|
            result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo);
 | 
						|
            result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi);
 | 
						|
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float TwosCLHalfToFloat(const SInt16 &v)
 | 
						|
        {
 | 
						|
            __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15));
 | 
						|
 | 
						|
            __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));
 | 
						|
            __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff));
 | 
						|
            __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00));
 | 
						|
 | 
						|
            __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());
 | 
						|
 | 
						|
            // Convert exponent to high-bits 
 | 
						|
            exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));
 | 
						|
 | 
						|
            __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336)));
 | 
						|
 | 
						|
            __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));
 | 
						|
            __m128i lowBits = _mm_slli_epi16(mantissa, 13);
 | 
						|
 | 
						|
            __m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
 | 
						|
            __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);
 | 
						|
 | 
						|
            __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
 | 
						|
            __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
 | 
						|
 | 
						|
            Float result;
 | 
						|
            result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
 | 
						|
            result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
 | 
						|
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
 | 
						|
        {
 | 
						|
            Float fa = TwosCLHalfToFloat(a);
 | 
						|
 | 
						|
            Float diff = fa - b;
 | 
						|
            return diff * diff;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
 | 
						|
        {
 | 
						|
            Float fa = TwosCLHalfToFloat(a);
 | 
						|
            Float fb = TwosCLHalfToFloat(b);
 | 
						|
 | 
						|
            Float diff = fa - fb;
 | 
						|
            return diff * diff;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
 | 
						|
        {
 | 
						|
            Float fa = TwosCLHalfToFloat(a) * aWeight;
 | 
						|
 | 
						|
            Float diff = fa - b;
 | 
						|
            return diff * diff;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 RightShift(const UInt16 &v, int bits)
 | 
						|
        {
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_srli_epi16(v.m_value, bits);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt31 RightShift(const UInt31 &v, int bits)
 | 
						|
        {
 | 
						|
            UInt31 result;
 | 
						|
            result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits);
 | 
						|
            result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt16 RightShift(const SInt16 &v, int bits)
 | 
						|
        {
 | 
						|
            SInt16 result;
 | 
						|
            result.m_value = _mm_srai_epi16(v.m_value, bits);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt15 RightShift(const UInt15 &v, int bits)
 | 
						|
        {
 | 
						|
            UInt15 result;
 | 
						|
            result.m_value = _mm_srli_epi16(v.m_value, bits);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt32 RightShift(const SInt32 &v, int bits)
 | 
						|
        {
 | 
						|
            SInt32 result;
 | 
						|
            result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits);
 | 
						|
            result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt16 ToSInt16(const SInt32 &v)
 | 
						|
        {
 | 
						|
            SInt16 result;
 | 
						|
            result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 ToUInt16(const UInt32 &v)
 | 
						|
        {
 | 
						|
            __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
 | 
						|
            __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
 | 
						|
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_packs_epi32(low, high);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 ToUInt16(const UInt31 &v)
 | 
						|
        {
 | 
						|
            __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
 | 
						|
            __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
 | 
						|
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_packs_epi32(low, high);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt15 ToUInt15(const UInt31 &v)
 | 
						|
        {
 | 
						|
            UInt15 result;
 | 
						|
            result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
 | 
						|
        {
 | 
						|
            __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
 | 
						|
            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
 | 
						|
 | 
						|
            SInt32 result;
 | 
						|
            result.m_values[0] = _mm_unpacklo_epi16(low, high);
 | 
						|
            result.m_values[1] = _mm_unpackhi_epi16(low, high);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
 | 
						|
        {
 | 
						|
            __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
 | 
						|
            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
 | 
						|
 | 
						|
            SInt32 result;
 | 
						|
            result.m_values[0] = _mm_unpacklo_epi16(low, high);
 | 
						|
            result.m_values[1] = _mm_unpackhi_epi16(low, high);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
 | 
						|
        {
 | 
						|
            return XMultiply(b, a);
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
 | 
						|
        {
 | 
						|
            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
 | 
						|
            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
 | 
						|
 | 
						|
            UInt32 result;
 | 
						|
            result.m_values[0] = _mm_unpacklo_epi16(low, high);
 | 
						|
            result.m_values[1] = _mm_unpackhi_epi16(low, high);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
 | 
						|
        {
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
 | 
						|
        {
 | 
						|
            UInt16 result;
 | 
						|
            result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
 | 
						|
        {
 | 
						|
            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
 | 
						|
            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
 | 
						|
 | 
						|
            UInt31 result;
 | 
						|
            result.m_values[0] = _mm_unpacklo_epi16(low, high);
 | 
						|
            result.m_values[1] = _mm_unpackhi_epi16(low, high);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
 | 
						|
        {
 | 
						|
            __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
 | 
						|
            __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
 | 
						|
 | 
						|
            UInt31 result;
 | 
						|
            result.m_values[0] = _mm_unpacklo_epi16(low, high);
 | 
						|
            result.m_values[1] = _mm_unpackhi_epi16(low, high);
 | 
						|
            return result;
 | 
						|
        }
 | 
						|
 | 
						|
        static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
 | 
						|
        {
 | 
						|
            return XMultiply(b, a);
 | 
						|
        }
 | 
						|
 | 
						|
        static bool AnySet(const Int16CompFlag &v)
 | 
						|
        {
 | 
						|
            return _mm_movemask_epi8(v.m_value) != 0;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool AllSet(const Int16CompFlag &v)
 | 
						|
        {
 | 
						|
            return _mm_movemask_epi8(v.m_value) == 0xffff;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool AnySet(const FloatCompFlag &v)
 | 
						|
        {
 | 
						|
            return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool AllSet(const FloatCompFlag &v)
 | 
						|
        {
 | 
						|
            return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf;
 | 
						|
        }
 | 
						|
    };
 | 
						|
 | 
						|
#else
 | 
						|
    // Scalar version
 | 
						|
    struct ParallelMath
 | 
						|
    {
 | 
						|
        struct RoundTowardZeroForScope
 | 
						|
        {
 | 
						|
        };
 | 
						|
 | 
						|
        struct RoundTowardNearestForScope
 | 
						|
        {
 | 
						|
        };
 | 
						|
 | 
						|
        struct RoundUpForScope
 | 
						|
        {
 | 
						|
        };
 | 
						|
 | 
						|
        struct RoundDownForScope
 | 
						|
        {
 | 
						|
        };
 | 
						|
 | 
						|
        static const int ParallelSize = 1;
 | 
						|
 | 
						|
        enum Int16Subtype
 | 
						|
        {
 | 
						|
            IntSubtype_Signed,
 | 
						|
            IntSubtype_UnsignedFull,
 | 
						|
            IntSubtype_UnsignedTruncated,
 | 
						|
            IntSubtype_Abstract,
 | 
						|
        };
 | 
						|
 | 
						|
        typedef int32_t SInt16;
 | 
						|
        typedef int32_t UInt15;
 | 
						|
        typedef int32_t UInt16;
 | 
						|
        typedef int32_t AInt16;
 | 
						|
 | 
						|
        typedef int32_t SInt32;
 | 
						|
        typedef int32_t UInt31;
 | 
						|
        typedef int32_t UInt32;
 | 
						|
        typedef int32_t AInt32;
 | 
						|
 | 
						|
        typedef int32_t ScalarUInt16;
 | 
						|
        typedef int32_t ScalarSInt16;
 | 
						|
 | 
						|
        typedef float Float;
 | 
						|
 | 
						|
        template<class TTargetType>
 | 
						|
        struct LosslessCast
 | 
						|
        {
 | 
						|
            static const int32_t& Cast(const int32_t &src)
 | 
						|
            {
 | 
						|
                return src;
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        typedef bool Int16CompFlag;
 | 
						|
        typedef bool FloatCompFlag;
 | 
						|
 | 
						|
        static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
 | 
						|
        {
 | 
						|
            return a + b;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
 | 
						|
        {
 | 
						|
            return a - b;
 | 
						|
        }
 | 
						|
 | 
						|
        static float Select(bool flag, float a, float b)
 | 
						|
        {
 | 
						|
            return flag ? a : b;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t Select(bool flag, int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            return flag ? a : b;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t SelectOrZero(bool flag, int32_t a)
 | 
						|
        {
 | 
						|
            return flag ? a : 0;
 | 
						|
        }
 | 
						|
 | 
						|
        static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
 | 
						|
        {
 | 
						|
            if (flag)
 | 
						|
                dest = src;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t ConditionalNegate(bool flag, int32_t v)
 | 
						|
        {
 | 
						|
            return (flag) ? -v : v;
 | 
						|
        }
 | 
						|
 | 
						|
        static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
 | 
						|
        {
 | 
						|
            if (!flag)
 | 
						|
                dest = src;
 | 
						|
        }
 | 
						|
 | 
						|
        static void ConditionalSet(float& dest, bool flag, float src)
 | 
						|
        {
 | 
						|
            if (flag)
 | 
						|
                dest = src;
 | 
						|
        }
 | 
						|
 | 
						|
        static void NotConditionalSet(float& dest, bool flag, float src)
 | 
						|
        {
 | 
						|
            if (!flag)
 | 
						|
                dest = src;
 | 
						|
        }
 | 
						|
 | 
						|
        static void MakeSafeDenominator(float& v)
 | 
						|
        {
 | 
						|
            if (v == 0.0f)
 | 
						|
                v = 1.0f;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t SignedRightShift(int32_t v, int bits)
 | 
						|
        {
 | 
						|
            return v >> bits;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
 | 
						|
        {
 | 
						|
            v = (v << (32 - precision)) & 0xffffffff;
 | 
						|
            return SignedRightShift(v, 32 - precision);
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
 | 
						|
        {
 | 
						|
            return v & ((1 << precision) - 1);
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t Min(int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            if (a < b)
 | 
						|
                return a;
 | 
						|
            return b;
 | 
						|
        }
 | 
						|
 | 
						|
        static float Min(float a, float b)
 | 
						|
        {
 | 
						|
            if (a < b)
 | 
						|
                return a;
 | 
						|
            return b;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t Max(int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            if (a > b)
 | 
						|
                return a;
 | 
						|
            return b;
 | 
						|
        }
 | 
						|
 | 
						|
        static float Max(float a, float b)
 | 
						|
        {
 | 
						|
            if (a > b)
 | 
						|
                return a;
 | 
						|
            return b;
 | 
						|
        }
 | 
						|
 | 
						|
        static float Abs(float a)
 | 
						|
        {
 | 
						|
            return fabsf(a);
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t Abs(int32_t a)
 | 
						|
        {
 | 
						|
            if (a < 0)
 | 
						|
                return -a;
 | 
						|
            return a;
 | 
						|
        }
 | 
						|
 | 
						|
        static float Clamp(float v, float min, float max)
 | 
						|
        {
 | 
						|
            if (v < min)
 | 
						|
                return min;
 | 
						|
            if (v > max)
 | 
						|
                return max;
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static float Reciprocal(float v)
 | 
						|
        {
 | 
						|
            return 1.0f / v;
 | 
						|
        }
 | 
						|
 | 
						|
        static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
 | 
						|
        {
 | 
						|
            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
 | 
						|
        }
 | 
						|
 | 
						|
        static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
 | 
						|
        {
 | 
						|
            chOut = inputBlocks[0].m_pixels[pxOffset][channel];
 | 
						|
        }
 | 
						|
 | 
						|
        static float MakeFloat(float v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static float MakeFloatZero()
 | 
						|
        {
 | 
						|
            return 0.0f;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t MakeUInt16(uint16_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t MakeSInt16(int16_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t MakeAInt16(int16_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t MakeUInt15(uint16_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t MakeSInt32(int32_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t MakeUInt31(int32_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t Extract(int32_t v, int offset)
 | 
						|
        {
 | 
						|
            UNREFERENCED_PARAMETER(offset);
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
 | 
						|
        {
 | 
						|
            UNREFERENCED_PARAMETER(offset);
 | 
						|
            dest = v;
 | 
						|
        }
 | 
						|
 | 
						|
        static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
 | 
						|
        {
 | 
						|
            UNREFERENCED_PARAMETER(offset);
 | 
						|
            dest = v;
 | 
						|
        }
 | 
						|
 | 
						|
        static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
 | 
						|
        {
 | 
						|
            UNREFERENCED_PARAMETER(offset);
 | 
						|
            dest = v;
 | 
						|
        }
 | 
						|
 | 
						|
        static float ExtractFloat(float v, int offset)
 | 
						|
        {
 | 
						|
            UNREFERENCED_PARAMETER(offset);
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static void PutFloat(float &dest, int offset, float v)
 | 
						|
        {
 | 
						|
            UNREFERENCED_PARAMETER(offset);
 | 
						|
            dest = v;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool Less(int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            return a < b;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool Less(float a, float b)
 | 
						|
        {
 | 
						|
            return a < b;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool LessOrEqual(int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            return a < b;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool LessOrEqual(float a, float b)
 | 
						|
        {
 | 
						|
            return a < b;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool Equal(int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            return a == b;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool Equal(float a, float b)
 | 
						|
        {
 | 
						|
            return a == b;
 | 
						|
        }
 | 
						|
 | 
						|
        static float ToFloat(int32_t v)
 | 
						|
        {
 | 
						|
            return static_cast<float>(v);
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t ToUInt31(int32_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t ToInt32(int32_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool FloatFlagToInt16(bool v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool Int16FlagToFloat(bool v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool MakeBoolInt16(bool b)
 | 
						|
        {
 | 
						|
            return b;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool MakeBoolFloat(bool b)
 | 
						|
        {
 | 
						|
            return b;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool AndNot(bool a, bool b)
 | 
						|
        {
 | 
						|
            return a && !b;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
 | 
						|
        {
 | 
						|
            UNREFERENCED_PARAMETER(rtz);
 | 
						|
            return static_cast<int>(v);
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
 | 
						|
        {
 | 
						|
            UNREFERENCED_PARAMETER(ru);
 | 
						|
            return static_cast<int>(ceilf(v));
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
 | 
						|
        {
 | 
						|
            UNREFERENCED_PARAMETER(rd);
 | 
						|
            return static_cast<int>(floorf(v));
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
 | 
						|
        {
 | 
						|
            UNREFERENCED_PARAMETER(rtn);
 | 
						|
            return static_cast<int>(floorf(v + 0.5f));
 | 
						|
        }
 | 
						|
 | 
						|
        template<class TRoundMode>
 | 
						|
        static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
 | 
						|
        {
 | 
						|
            return RoundAndConvertToInt(v, roundingMode);
 | 
						|
        }
 | 
						|
 | 
						|
        template<class TRoundMode>
 | 
						|
        static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
 | 
						|
        {
 | 
						|
            return RoundAndConvertToInt(v, roundingMode);
 | 
						|
        }
 | 
						|
 | 
						|
        template<class TRoundMode>
 | 
						|
        static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
 | 
						|
        {
 | 
						|
            return RoundAndConvertToInt(v, roundingMode);
 | 
						|
        }
 | 
						|
 | 
						|
        static float Sqrt(float f)
 | 
						|
        {
 | 
						|
            return sqrtf(f);
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t SqDiffUInt8(int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            int32_t delta = a - b;
 | 
						|
            return delta * delta;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t SqDiffInt16(int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            int32_t delta = a - b;
 | 
						|
            return delta * delta;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t SqDiffSInt16(int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            int32_t delta = a - b;
 | 
						|
            return delta * delta;
 | 
						|
        }
 | 
						|
 | 
						|
        static float TwosCLHalfToFloat(int32_t v)
 | 
						|
        {
 | 
						|
            int32_t absV = (v < 0) ? -v : v;
 | 
						|
 | 
						|
            int32_t signBits = (absV & -32768);
 | 
						|
            int32_t mantissa = (absV & 0x03ff);
 | 
						|
            int32_t exponent = (absV & 0x7c00);
 | 
						|
 | 
						|
            bool isDenormal = (exponent == 0);
 | 
						|
 | 
						|
            // Convert exponent to high-bits
 | 
						|
            exponent = (exponent >> 3) + 14336;
 | 
						|
 | 
						|
            int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
 | 
						|
 | 
						|
            int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
 | 
						|
 | 
						|
            float f, correction;
 | 
						|
            memcpy(&f, &fBits, 4);
 | 
						|
            memcpy(&correction, &denormalCorrection, 4);
 | 
						|
 | 
						|
            return f - correction;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
 | 
						|
        {
 | 
						|
            Float fa = TwosCLHalfToFloat(a);
 | 
						|
 | 
						|
            Float diff = fa - b;
 | 
						|
            return diff * diff;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
 | 
						|
        {
 | 
						|
            Float fa = TwosCLHalfToFloat(a);
 | 
						|
            Float fb = TwosCLHalfToFloat(b);
 | 
						|
 | 
						|
            Float diff = fa - fb;
 | 
						|
            return diff * diff;
 | 
						|
        }
 | 
						|
 | 
						|
        static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
 | 
						|
        {
 | 
						|
            Float fa = TwosCLHalfToFloat(a) * aWeight;
 | 
						|
 | 
						|
            Float diff = fa - b;
 | 
						|
            return diff * diff;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t RightShift(int32_t v, int bits)
 | 
						|
        {
 | 
						|
            return SignedRightShift(v, bits);
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t ToSInt16(int32_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t ToUInt16(int32_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t ToUInt15(int32_t v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t XMultiply(int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            return a * b;
 | 
						|
        }
 | 
						|
 | 
						|
        static int32_t CompactMultiply(int32_t a, int32_t b)
 | 
						|
        {
 | 
						|
            return a * b;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool AnySet(bool v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
 | 
						|
        static bool AllSet(bool v)
 | 
						|
        {
 | 
						|
            return v;
 | 
						|
        }
 | 
						|
    };
 | 
						|
 | 
						|
#endif
 | 
						|
 | 
						|
    namespace Internal
 | 
						|
    {
 | 
						|
        namespace BC7Data
 | 
						|
        {
 | 
						|
            enum AlphaMode
 | 
						|
            {
 | 
						|
                AlphaMode_Combined,
 | 
						|
                AlphaMode_Separate,
 | 
						|
                AlphaMode_None,
 | 
						|
            };
 | 
						|
 | 
						|
            enum PBitMode
 | 
						|
            {
 | 
						|
                PBitMode_PerEndpoint,
 | 
						|
                PBitMode_PerSubset,
 | 
						|
                PBitMode_None
 | 
						|
            };
 | 
						|
 | 
						|
            struct BC7ModeInfo
 | 
						|
            {
 | 
						|
                PBitMode m_pBitMode;
 | 
						|
                AlphaMode m_alphaMode;
 | 
						|
                int m_rgbBits;
 | 
						|
                int m_alphaBits;
 | 
						|
                int m_partitionBits;
 | 
						|
                int m_numSubsets;
 | 
						|
                int m_indexBits;
 | 
						|
                int m_alphaIndexBits;
 | 
						|
                bool m_hasIndexSelector;
 | 
						|
            };
 | 
						|
 | 
						|
            BC7ModeInfo g_modes[] =
 | 
						|
            {
 | 
						|
                { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false },     // 0
 | 
						|
                { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false },       // 1
 | 
						|
                { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false },            // 2
 | 
						|
                { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false },     // 3 (Mode reference has an error, P-bit is really per-endpoint)
 | 
						|
 | 
						|
                { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true },         // 4
 | 
						|
                { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false },        // 5
 | 
						|
                { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6
 | 
						|
                { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false }  // 7
 | 
						|
            };
 | 
						|
 | 
						|
			const int g_weight2[] = { 0, 21, 43, 64 };
 | 
						|
			const int g_weight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
 | 
						|
			const int g_weight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
 | 
						|
 | 
						|
			const int *g_weightTables[] =
 | 
						|
			{
 | 
						|
				NULL,
 | 
						|
				NULL,
 | 
						|
				g_weight2,
 | 
						|
				g_weight3,
 | 
						|
				g_weight4
 | 
						|
			};
 | 
						|
 | 
						|
            struct BC6HModeInfo
 | 
						|
            {
 | 
						|
                uint16_t m_modeID;
 | 
						|
                bool m_partitioned;
 | 
						|
                bool m_transformed;
 | 
						|
                int m_aPrec;
 | 
						|
                int m_bPrec[3];
 | 
						|
            };
 | 
						|
 | 
						|
            // [partitioned][precision]
 | 
						|
            bool g_hdrModesExistForPrecision[2][17] =
 | 
						|
            {
 | 
						|
                //0      1      2      3      4      5      6      7      8      9      10     11     12     13     14     15     16
 | 
						|
                { false, false, false, false, false, false, false, false, false, false, true,  true,  true,  false, false, false, true },
 | 
						|
                { false, false, false, false, false, false, true,  true,  true,  true,  true,  true,  false, false, false, false, false },
 | 
						|
            };
 | 
						|
 | 
						|
            BC6HModeInfo g_hdrModes[] =
 | 
						|
            {
 | 
						|
                { 0x00, true,  true,  10,{ 5, 5, 5 } },
 | 
						|
                { 0x01, true,  true,  7,{ 6, 6, 6 } },
 | 
						|
                { 0x02, true,  true,  11,{ 5, 4, 4 } },
 | 
						|
                { 0x06, true,  true,  11,{ 4, 5, 4 } },
 | 
						|
                { 0x0a, true,  true,  11,{ 4, 4, 5 } },
 | 
						|
                { 0x0e, true,  true,  9,{ 5, 5, 5 } },
 | 
						|
                { 0x12, true,  true,  8,{ 6, 5, 5 } },
 | 
						|
                { 0x16, true,  true,  8,{ 5, 6, 5 } },
 | 
						|
                { 0x1a, true,  true,  8,{ 5, 5, 6 } },
 | 
						|
                { 0x1e, true,  false, 6,{ 6, 6, 6 } },
 | 
						|
                { 0x03, false, false, 10,{ 10, 10, 10 } },
 | 
						|
                { 0x07, false, true,  11,{ 9, 9, 9 } },
 | 
						|
                { 0x0b, false, true,  12,{ 8, 8, 8 } },
 | 
						|
                { 0x0f, false, true,  16,{ 4, 4, 4 } },
 | 
						|
            };
 | 
						|
 | 
						|
            const int g_maxHDRPrecision = 16;
 | 
						|
 | 
						|
            static const size_t g_numHDRModes = sizeof(g_hdrModes) / sizeof(g_hdrModes[0]);
 | 
						|
 | 
						|
            static uint16_t g_partitionMap[64] =
 | 
						|
            {
 | 
						|
                0xCCCC, 0x8888, 0xEEEE, 0xECC8,
 | 
						|
                0xC880, 0xFEEC, 0xFEC8, 0xEC80,
 | 
						|
                0xC800, 0xFFEC, 0xFE80, 0xE800,
 | 
						|
                0xFFE8, 0xFF00, 0xFFF0, 0xF000,
 | 
						|
                0xF710, 0x008E, 0x7100, 0x08CE,
 | 
						|
                0x008C, 0x7310, 0x3100, 0x8CCE,
 | 
						|
                0x088C, 0x3110, 0x6666, 0x366C,
 | 
						|
                0x17E8, 0x0FF0, 0x718E, 0x399C,
 | 
						|
                0xaaaa, 0xf0f0, 0x5a5a, 0x33cc,
 | 
						|
                0x3c3c, 0x55aa, 0x9696, 0xa55a,
 | 
						|
                0x73ce, 0x13c8, 0x324c, 0x3bdc,
 | 
						|
                0x6996, 0xc33c, 0x9966, 0x660,
 | 
						|
                0x272, 0x4e4, 0x4e40, 0x2720,
 | 
						|
                0xc936, 0x936c, 0x39c6, 0x639c,
 | 
						|
                0x9336, 0x9cc6, 0x817e, 0xe718,
 | 
						|
                0xccf0, 0xfcc, 0x7744, 0xee22,
 | 
						|
            };
 | 
						|
 | 
						|
            static uint32_t g_partitionMap2[64] =
 | 
						|
            {
 | 
						|
                0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8,
 | 
						|
                0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050,
 | 
						|
                0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090,
 | 
						|
                0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250,
 | 
						|
                0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0,
 | 
						|
                0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500,
 | 
						|
                0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400,
 | 
						|
                0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200,
 | 
						|
                0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424,
 | 
						|
                0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50,
 | 
						|
                0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
 | 
						|
                0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600,
 | 
						|
                0xaa444444, 0x54a854a8, 0x95809580, 0x96969600,
 | 
						|
                0xa85454a8, 0x80959580, 0xaa141414, 0x96960000,
 | 
						|
                0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000,
 | 
						|
                0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
 | 
						|
            };
 | 
						|
 | 
						|
            static int g_fixupIndexes2[64] =
 | 
						|
            {
 | 
						|
                15,15,15,15,
 | 
						|
                15,15,15,15,
 | 
						|
                15,15,15,15,
 | 
						|
                15,15,15,15,
 | 
						|
                15, 2, 8, 2,
 | 
						|
                2, 8, 8,15,
 | 
						|
                2, 8, 2, 2,
 | 
						|
                8, 8, 2, 2,
 | 
						|
 | 
						|
                15,15, 6, 8,
 | 
						|
                2, 8,15,15,
 | 
						|
                2, 8, 2, 2,
 | 
						|
                2,15,15, 6,
 | 
						|
                6, 2, 6, 8,
 | 
						|
                15,15, 2, 2,
 | 
						|
                15,15,15,15,
 | 
						|
                15, 2, 2,15,
 | 
						|
            };
 | 
						|
 | 
						|
            static int g_fixupIndexes3[64][2] =
 | 
						|
            {
 | 
						|
                { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 },
 | 
						|
                { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 },
 | 
						|
                { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 },
 | 
						|
                { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 },
 | 
						|
                { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 },
 | 
						|
                { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 },
 | 
						|
                { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 },
 | 
						|
                { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 },
 | 
						|
 | 
						|
                { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 },
 | 
						|
                { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 },
 | 
						|
                { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 },
 | 
						|
                { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 },
 | 
						|
                { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 },
 | 
						|
                { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 },
 | 
						|
                { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 },
 | 
						|
                { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 },
 | 
						|
            };
 | 
						|
 | 
						|
            static const unsigned char g_fragments[] =
 | 
						|
            {
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 0, 16
 | 
						|
                0, 1, 2, 3,  // 16, 4
 | 
						|
                0, 1, 4,  // 20, 3
 | 
						|
                0, 1, 2, 4,  // 23, 4
 | 
						|
                2, 3, 7,  // 27, 3
 | 
						|
                1, 2, 3, 7,  // 30, 4
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 7,  // 34, 8
 | 
						|
                0, 1, 4, 8,  // 42, 4
 | 
						|
                0, 1, 2, 4, 5, 8,  // 46, 6
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 8,  // 52, 8
 | 
						|
                1, 4, 5, 6, 9,  // 60, 5
 | 
						|
                2, 5, 6, 7, 10,  // 65, 5
 | 
						|
                5, 6, 9, 10,  // 70, 4
 | 
						|
                2, 3, 7, 11,  // 74, 4
 | 
						|
                1, 2, 3, 6, 7, 11,  // 78, 6
 | 
						|
                0, 1, 2, 3, 5, 6, 7, 11,  // 84, 8
 | 
						|
                0, 1, 2, 3, 8, 9, 10, 11,  // 92, 8
 | 
						|
                2, 3, 6, 7, 8, 9, 10, 11,  // 100, 8
 | 
						|
                4, 5, 6, 7, 8, 9, 10, 11,  // 108, 8
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  // 116, 12
 | 
						|
                0, 4, 8, 12,  // 128, 4
 | 
						|
                0, 2, 3, 4, 6, 7, 8, 12,  // 132, 8
 | 
						|
                0, 1, 2, 4, 5, 8, 9, 12,  // 140, 8
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 8, 9, 12,  // 148, 10
 | 
						|
                3, 6, 7, 8, 9, 12,  // 158, 6
 | 
						|
                3, 5, 6, 7, 8, 9, 10, 12,  // 164, 8
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12,  // 172, 12
 | 
						|
                0, 1, 2, 5, 6, 7, 11, 12,  // 184, 8
 | 
						|
                5, 8, 9, 10, 13,  // 192, 5
 | 
						|
                8, 12, 13,  // 197, 3
 | 
						|
                4, 8, 12, 13,  // 200, 4
 | 
						|
                2, 3, 6, 9, 12, 13,  // 204, 6
 | 
						|
                0, 1, 2, 3, 8, 9, 12, 13,  // 210, 8
 | 
						|
                0, 1, 4, 5, 8, 9, 12, 13,  // 218, 8
 | 
						|
                2, 3, 6, 7, 8, 9, 12, 13,  // 226, 8
 | 
						|
                2, 3, 5, 6, 9, 10, 12, 13,  // 234, 8
 | 
						|
                0, 3, 6, 7, 9, 10, 12, 13,  // 242, 8
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13,  // 250, 12
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13,  // 262, 13
 | 
						|
                2, 3, 4, 7, 8, 11, 12, 13,  // 275, 8
 | 
						|
                1, 2, 6, 7, 8, 11, 12, 13,  // 283, 8
 | 
						|
                2, 3, 4, 6, 7, 8, 9, 11, 12, 13,  // 291, 10
 | 
						|
                2, 3, 4, 5, 10, 11, 12, 13,  // 301, 8
 | 
						|
                0, 1, 6, 7, 10, 11, 12, 13,  // 309, 8
 | 
						|
                6, 9, 10, 11, 14,  // 317, 5
 | 
						|
                0, 2, 4, 6, 8, 10, 12, 14,  // 322, 8
 | 
						|
                1, 3, 5, 7, 8, 10, 12, 14,  // 330, 8
 | 
						|
                1, 3, 4, 6, 9, 11, 12, 14,  // 338, 8
 | 
						|
                0, 2, 5, 7, 9, 11, 12, 14,  // 346, 8
 | 
						|
                0, 3, 4, 5, 8, 9, 13, 14,  // 354, 8
 | 
						|
                2, 3, 4, 7, 8, 9, 13, 14,  // 362, 8
 | 
						|
                1, 2, 5, 6, 9, 10, 13, 14,  // 370, 8
 | 
						|
                0, 3, 4, 7, 9, 10, 13, 14,  // 378, 8
 | 
						|
                0, 3, 5, 6, 8, 11, 13, 14,  // 386, 8
 | 
						|
                1, 2, 4, 7, 8, 11, 13, 14,  // 394, 8
 | 
						|
                0, 1, 4, 7, 10, 11, 13, 14,  // 402, 8
 | 
						|
                0, 3, 6, 7, 10, 11, 13, 14,  // 410, 8
 | 
						|
                8, 12, 13, 14,  // 418, 4
 | 
						|
                1, 2, 3, 7, 8, 12, 13, 14,  // 422, 8
 | 
						|
                4, 8, 9, 12, 13, 14,  // 430, 6
 | 
						|
                0, 4, 5, 8, 9, 12, 13, 14,  // 436, 8
 | 
						|
                1, 2, 3, 6, 7, 8, 9, 12, 13, 14,  // 444, 10
 | 
						|
                2, 6, 8, 9, 10, 12, 13, 14,  // 454, 8
 | 
						|
                0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,  // 462, 12
 | 
						|
                0, 7, 9, 10, 11, 12, 13, 14,  // 474, 8
 | 
						|
                1, 2, 3, 4, 5, 6, 8, 15,  // 482, 8
 | 
						|
                3, 7, 11, 15,  // 490, 4
 | 
						|
                0, 1, 3, 4, 5, 7, 11, 15,  // 494, 8
 | 
						|
                0, 4, 5, 10, 11, 15,  // 502, 6
 | 
						|
                1, 2, 3, 6, 7, 10, 11, 15,  // 508, 8
 | 
						|
                0, 1, 2, 3, 5, 6, 7, 10, 11, 15,  // 516, 10
 | 
						|
                0, 4, 5, 6, 9, 10, 11, 15,  // 526, 8
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 15,  // 534, 12
 | 
						|
                1, 2, 4, 5, 8, 9, 12, 15,  // 546, 8
 | 
						|
                2, 3, 5, 6, 8, 9, 12, 15,  // 554, 8
 | 
						|
                0, 3, 5, 6, 9, 10, 12, 15,  // 562, 8
 | 
						|
                1, 2, 4, 7, 9, 10, 12, 15,  // 570, 8
 | 
						|
                1, 2, 5, 6, 8, 11, 12, 15,  // 578, 8
 | 
						|
                0, 3, 4, 7, 8, 11, 12, 15,  // 586, 8
 | 
						|
                0, 1, 5, 6, 10, 11, 12, 15,  // 594, 8
 | 
						|
                1, 2, 6, 7, 10, 11, 12, 15,  // 602, 8
 | 
						|
                1, 3, 4, 6, 8, 10, 13, 15,  // 610, 8
 | 
						|
                0, 2, 5, 7, 8, 10, 13, 15,  // 618, 8
 | 
						|
                0, 2, 4, 6, 9, 11, 13, 15,  // 626, 8
 | 
						|
                1, 3, 5, 7, 9, 11, 13, 15,  // 634, 8
 | 
						|
                0, 1, 2, 3, 4, 5, 7, 8, 12, 13, 15,  // 642, 11
 | 
						|
                2, 3, 4, 5, 8, 9, 14, 15,  // 653, 8
 | 
						|
                0, 1, 6, 7, 8, 9, 14, 15,  // 661, 8
 | 
						|
                0, 1, 5, 10, 14, 15,  // 669, 6
 | 
						|
                0, 3, 4, 5, 9, 10, 14, 15,  // 675, 8
 | 
						|
                0, 1, 5, 6, 9, 10, 14, 15,  // 683, 8
 | 
						|
                11, 14, 15,  // 691, 3
 | 
						|
                7, 11, 14, 15,  // 694, 4
 | 
						|
                1, 2, 4, 5, 8, 11, 14, 15,  // 698, 8
 | 
						|
                0, 1, 4, 7, 8, 11, 14, 15,  // 706, 8
 | 
						|
                0, 1, 4, 5, 10, 11, 14, 15,  // 714, 8
 | 
						|
                2, 3, 6, 7, 10, 11, 14, 15,  // 722, 8
 | 
						|
                4, 5, 6, 7, 10, 11, 14, 15,  // 730, 8
 | 
						|
                0, 1, 4, 5, 7, 8, 10, 11, 14, 15,  // 738, 10
 | 
						|
                0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15,  // 748, 12
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 14, 15,  // 760, 13
 | 
						|
                0, 1, 2, 3, 4, 6, 7, 11, 12, 14, 15,  // 773, 11
 | 
						|
                3, 4, 8, 9, 10, 13, 14, 15,  // 784, 8
 | 
						|
                11, 13, 14, 15,  // 792, 4
 | 
						|
                0, 1, 2, 4, 11, 13, 14, 15,  // 796, 8
 | 
						|
                0, 1, 2, 4, 5, 10, 11, 13, 14, 15,  // 804, 10
 | 
						|
                7, 10, 11, 13, 14, 15,  // 814, 6
 | 
						|
                3, 6, 7, 10, 11, 13, 14, 15,  // 820, 8
 | 
						|
                1, 5, 9, 10, 11, 13, 14, 15,  // 828, 8
 | 
						|
                1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15,  // 836, 12
 | 
						|
                12, 13, 14, 15,  // 848, 4
 | 
						|
                0, 1, 2, 3, 12, 13, 14, 15,  // 852, 8
 | 
						|
                0, 1, 4, 5, 12, 13, 14, 15,  // 860, 8
 | 
						|
                4, 5, 6, 7, 12, 13, 14, 15,  // 868, 8
 | 
						|
                4, 8, 9, 10, 12, 13, 14, 15,  // 876, 8
 | 
						|
                0, 4, 5, 8, 9, 10, 12, 13, 14, 15,  // 884, 10
 | 
						|
                0, 1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15,  // 894, 12
 | 
						|
                0, 1, 2, 3, 4, 7, 8, 11, 12, 13, 14, 15,  // 906, 12
 | 
						|
                0, 1, 3, 4, 8, 9, 11, 12, 13, 14, 15,  // 918, 11
 | 
						|
                0, 2, 3, 7, 8, 10, 11, 12, 13, 14, 15,  // 929, 11
 | 
						|
                7, 9, 10, 11, 12, 13, 14, 15,  // 940, 8
 | 
						|
                3, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 948, 10
 | 
						|
                2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15,  // 958, 12
 | 
						|
                8, 9, 10, 11, 12, 13, 14, 15,  // 970, 8
 | 
						|
                0, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 978, 12
 | 
						|
                0, 1, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15,  // 990, 13
 | 
						|
                3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1003, 12
 | 
						|
                2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1015, 13
 | 
						|
                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  // 1028, 12
 | 
						|
                0, 2,  // 1040, 2
 | 
						|
                1, 3,  // 1042, 2
 | 
						|
                0, 1, 4, 5,  // 1044, 4
 | 
						|
                0, 1, 2, 4, 5,  // 1048, 5
 | 
						|
                2, 3, 6,  // 1053, 3
 | 
						|
                0, 2, 4, 6,  // 1056, 4
 | 
						|
                1, 2, 5, 6,  // 1060, 4
 | 
						|
                0, 1, 2, 3, 5, 6,  // 1064, 6
 | 
						|
                0, 1, 2, 4, 5, 6,  // 1070, 6
 | 
						|
                0, 1, 2, 3, 4, 5, 6,  // 1076, 7
 | 
						|
                0, 3, 4, 7,  // 1083, 4
 | 
						|
                0, 1, 2, 3, 4, 7,  // 1087, 6
 | 
						|
                1, 3, 5, 7,  // 1093, 4
 | 
						|
                2, 3, 6, 7,  // 1097, 4
 | 
						|
                1, 2, 3, 6, 7,  // 1101, 5
 | 
						|
                1, 2, 3, 5, 6, 7,  // 1106, 6
 | 
						|
                0, 1, 2, 3, 5, 6, 7,  // 1112, 7
 | 
						|
                4, 5, 6, 7,  // 1119, 4
 | 
						|
                0, 8,  // 1123, 2
 | 
						|
                0, 1, 4, 5, 8,  // 1125, 5
 | 
						|
                0, 1, 8, 9,  // 1130, 4
 | 
						|
                4, 5, 8, 9,  // 1134, 4
 | 
						|
                0, 1, 4, 5, 8, 9,  // 1138, 6
 | 
						|
                2, 6, 8, 9,  // 1144, 4
 | 
						|
                6, 7, 8, 9,  // 1148, 4
 | 
						|
                0, 2, 4, 6, 8, 10,  // 1152, 6
 | 
						|
                1, 2, 5, 6, 9, 10,  // 1158, 6
 | 
						|
                0, 3, 4, 7, 9, 10,  // 1164, 6
 | 
						|
                0, 1, 2, 8, 9, 10,  // 1170, 6
 | 
						|
                4, 5, 6, 8, 9, 10,  // 1176, 6
 | 
						|
                3, 11,  // 1182, 2
 | 
						|
                2, 3, 6, 7, 11,  // 1184, 5
 | 
						|
                0, 3, 8, 11,  // 1189, 4
 | 
						|
                0, 3, 4, 7, 8, 11,  // 1193, 6
 | 
						|
                1, 3, 5, 7, 9, 11,  // 1199, 6
 | 
						|
                2, 3, 10, 11,  // 1205, 4
 | 
						|
                1, 5, 10, 11,  // 1209, 4
 | 
						|
                4, 5, 10, 11,  // 1213, 4
 | 
						|
                6, 7, 10, 11,  // 1217, 4
 | 
						|
                2, 3, 6, 7, 10, 11,  // 1221, 6
 | 
						|
                1, 2, 3, 9, 10, 11,  // 1227, 6
 | 
						|
                5, 6, 7, 9, 10, 11,  // 1233, 6
 | 
						|
                8, 9, 10, 11,  // 1239, 4
 | 
						|
                4, 12,  // 1243, 2
 | 
						|
                0, 1, 2, 3, 4, 5, 8, 12,  // 1245, 8
 | 
						|
                8, 9, 12,  // 1253, 3
 | 
						|
                0, 4, 5, 8, 9, 12,  // 1256, 6
 | 
						|
                0, 1, 4, 5, 8, 9, 12,  // 1262, 7
 | 
						|
                2, 3, 5, 6, 8, 9, 12,  // 1269, 7
 | 
						|
                1, 5, 9, 13,  // 1276, 4
 | 
						|
                6, 7, 9, 13,  // 1280, 4
 | 
						|
                1, 4, 7, 10, 13,  // 1284, 5
 | 
						|
                1, 6, 8, 11, 13,  // 1289, 5
 | 
						|
                0, 1, 12, 13,  // 1294, 4
 | 
						|
                4, 5, 12, 13,  // 1298, 4
 | 
						|
                0, 1, 6, 7, 12, 13,  // 1302, 6
 | 
						|
                0, 1, 4, 8, 12, 13,  // 1308, 6
 | 
						|
                8, 9, 12, 13,  // 1314, 4
 | 
						|
                4, 8, 9, 12, 13,  // 1318, 5
 | 
						|
                4, 5, 8, 9, 12, 13,  // 1323, 6
 | 
						|
                0, 4, 5, 8, 9, 12, 13,  // 1329, 7
 | 
						|
                0, 1, 6, 10, 12, 13,  // 1336, 6
 | 
						|
                3, 6, 7, 9, 10, 12, 13,  // 1342, 7
 | 
						|
                0, 1, 10, 11, 12, 13,  // 1349, 6
 | 
						|
                2, 4, 7, 9, 14,  // 1355, 5
 | 
						|
                4, 5, 10, 14,  // 1360, 4
 | 
						|
                2, 6, 10, 14,  // 1364, 4
 | 
						|
                2, 5, 8, 11, 14,  // 1368, 5
 | 
						|
                0, 2, 12, 14,  // 1373, 4
 | 
						|
                8, 10, 12, 14,  // 1377, 4
 | 
						|
                4, 6, 8, 10, 12, 14,  // 1381, 6
 | 
						|
                13, 14,  // 1387, 2
 | 
						|
                9, 10, 13, 14,  // 1389, 4
 | 
						|
                5, 6, 9, 10, 13, 14,  // 1393, 6
 | 
						|
                0, 1, 2, 12, 13, 14,  // 1399, 6
 | 
						|
                4, 5, 6, 12, 13, 14,  // 1405, 6
 | 
						|
                8, 9, 12, 13, 14,  // 1411, 5
 | 
						|
                8, 9, 10, 12, 13, 14,  // 1416, 6
 | 
						|
                7, 15,  // 1422, 2
 | 
						|
                0, 5, 10, 15,  // 1424, 4
 | 
						|
                0, 1, 2, 3, 6, 7, 11, 15,  // 1428, 8
 | 
						|
                10, 11, 15,  // 1436, 3
 | 
						|
                0, 1, 5, 6, 10, 11, 15,  // 1439, 7
 | 
						|
                3, 6, 7, 10, 11, 15,  // 1446, 6
 | 
						|
                12, 15,  // 1452, 2
 | 
						|
                0, 3, 12, 15,  // 1454, 4
 | 
						|
                4, 7, 12, 15,  // 1458, 4
 | 
						|
                0, 3, 6, 9, 12, 15,  // 1462, 6
 | 
						|
                0, 3, 5, 10, 12, 15,  // 1468, 6
 | 
						|
                8, 11, 12, 15,  // 1474, 4
 | 
						|
                5, 6, 8, 11, 12, 15,  // 1478, 6
 | 
						|
                4, 7, 8, 11, 12, 15,  // 1484, 6
 | 
						|
                1, 3, 13, 15,  // 1490, 4
 | 
						|
                9, 11, 13, 15,  // 1494, 4
 | 
						|
                5, 7, 9, 11, 13, 15,  // 1498, 6
 | 
						|
                2, 3, 14, 15,  // 1504, 4
 | 
						|
                2, 3, 4, 5, 14, 15,  // 1508, 6
 | 
						|
                6, 7, 14, 15,  // 1514, 4
 | 
						|
                2, 3, 5, 9, 14, 15,  // 1518, 6
 | 
						|
                2, 3, 8, 9, 14, 15,  // 1524, 6
 | 
						|
                10, 14, 15,  // 1530, 3
 | 
						|
                0, 4, 5, 9, 10, 14, 15,  // 1533, 7
 | 
						|
                2, 3, 7, 11, 14, 15,  // 1540, 6
 | 
						|
                10, 11, 14, 15,  // 1546, 4
 | 
						|
                7, 10, 11, 14, 15,  // 1550, 5
 | 
						|
                6, 7, 10, 11, 14, 15,  // 1555, 6
 | 
						|
                1, 2, 3, 13, 14, 15,  // 1561, 6
 | 
						|
                5, 6, 7, 13, 14, 15,  // 1567, 6
 | 
						|
                10, 11, 13, 14, 15,  // 1573, 5
 | 
						|
                9, 10, 11, 13, 14, 15,  // 1578, 6
 | 
						|
                0, 4, 8, 9, 12, 13, 14, 15,  // 1584, 8
 | 
						|
                9, 10, 12, 13, 14, 15,  // 1592, 6
 | 
						|
                8, 11, 12, 13, 14, 15,  // 1598, 6
 | 
						|
                3, 7, 10, 11, 12, 13, 14, 15,  // 1604, 8
 | 
						|
            };
 | 
						|
            static const int g_shapeRanges[][2] =
 | 
						|
            {
 | 
						|
                { 0, 16 },{ 16, 4 },{ 20, 3 },{ 23, 4 },{ 27, 3 },{ 30, 4 },{ 34, 8 },{ 42, 4 },{ 46, 6 },{ 52, 8 },{ 60, 5 },
 | 
						|
                { 65, 5 },{ 70, 4 },{ 74, 4 },{ 78, 6 },{ 84, 8 },{ 92, 8 },{ 100, 8 },{ 108, 8 },{ 116, 12 },{ 128, 4 },{ 132, 8 },
 | 
						|
                { 140, 8 },{ 148, 10 },{ 158, 6 },{ 164, 8 },{ 172, 12 },{ 184, 8 },{ 192, 5 },{ 197, 3 },{ 200, 4 },{ 204, 6 },{ 210, 8 },
 | 
						|
                { 218, 8 },{ 226, 8 },{ 234, 8 },{ 242, 8 },{ 250, 12 },{ 262, 13 },{ 275, 8 },{ 283, 8 },{ 291, 10 },{ 301, 8 },{ 309, 8 },
 | 
						|
                { 317, 5 },{ 322, 8 },{ 330, 8 },{ 338, 8 },{ 346, 8 },{ 354, 8 },{ 362, 8 },{ 370, 8 },{ 378, 8 },{ 386, 8 },{ 394, 8 },
 | 
						|
                { 402, 8 },{ 410, 8 },{ 418, 4 },{ 422, 8 },{ 430, 6 },{ 436, 8 },{ 444, 10 },{ 454, 8 },{ 462, 12 },{ 474, 8 },{ 482, 8 },
 | 
						|
                { 490, 4 },{ 494, 8 },{ 502, 6 },{ 508, 8 },{ 516, 10 },{ 526, 8 },{ 534, 12 },{ 546, 8 },{ 554, 8 },{ 562, 8 },{ 570, 8 },
 | 
						|
                { 578, 8 },{ 586, 8 },{ 594, 8 },{ 602, 8 },{ 610, 8 },{ 618, 8 },{ 626, 8 },{ 634, 8 },{ 642, 11 },{ 653, 8 },{ 661, 8 },
 | 
						|
                { 669, 6 },{ 675, 8 },{ 683, 8 },{ 691, 3 },{ 694, 4 },{ 698, 8 },{ 706, 8 },{ 714, 8 },{ 722, 8 },{ 730, 8 },{ 738, 10 },
 | 
						|
                { 748, 12 },{ 760, 13 },{ 773, 11 },{ 784, 8 },{ 792, 4 },{ 796, 8 },{ 804, 10 },{ 814, 6 },{ 820, 8 },{ 828, 8 },{ 836, 12 },
 | 
						|
                { 848, 4 },{ 852, 8 },{ 860, 8 },{ 868, 8 },{ 876, 8 },{ 884, 10 },{ 894, 12 },{ 906, 12 },{ 918, 11 },{ 929, 11 },{ 940, 8 },
 | 
						|
                { 948, 10 },{ 958, 12 },{ 970, 8 },{ 978, 12 },{ 990, 13 },{ 1003, 12 },{ 1015, 13 },{ 1028, 12 },{ 1040, 2 },{ 1042, 2 },{ 1044, 4 },
 | 
						|
                { 1048, 5 },{ 1053, 3 },{ 1056, 4 },{ 1060, 4 },{ 1064, 6 },{ 1070, 6 },{ 1076, 7 },{ 1083, 4 },{ 1087, 6 },{ 1093, 4 },{ 1097, 4 },
 | 
						|
                { 1101, 5 },{ 1106, 6 },{ 1112, 7 },{ 1119, 4 },{ 1123, 2 },{ 1125, 5 },{ 1130, 4 },{ 1134, 4 },{ 1138, 6 },{ 1144, 4 },{ 1148, 4 },
 | 
						|
                { 1152, 6 },{ 1158, 6 },{ 1164, 6 },{ 1170, 6 },{ 1176, 6 },{ 1182, 2 },{ 1184, 5 },{ 1189, 4 },{ 1193, 6 },{ 1199, 6 },{ 1205, 4 },
 | 
						|
                { 1209, 4 },{ 1213, 4 },{ 1217, 4 },{ 1221, 6 },{ 1227, 6 },{ 1233, 6 },{ 1239, 4 },{ 1243, 2 },{ 1245, 8 },{ 1253, 3 },{ 1256, 6 },
 | 
						|
                { 1262, 7 },{ 1269, 7 },{ 1276, 4 },{ 1280, 4 },{ 1284, 5 },{ 1289, 5 },{ 1294, 4 },{ 1298, 4 },{ 1302, 6 },{ 1308, 6 },{ 1314, 4 },
 | 
						|
                { 1318, 5 },{ 1323, 6 },{ 1329, 7 },{ 1336, 6 },{ 1342, 7 },{ 1349, 6 },{ 1355, 5 },{ 1360, 4 },{ 1364, 4 },{ 1368, 5 },{ 1373, 4 },
 | 
						|
                { 1377, 4 },{ 1381, 6 },{ 1387, 2 },{ 1389, 4 },{ 1393, 6 },{ 1399, 6 },{ 1405, 6 },{ 1411, 5 },{ 1416, 6 },{ 1422, 2 },{ 1424, 4 },
 | 
						|
                { 1428, 8 },{ 1436, 3 },{ 1439, 7 },{ 1446, 6 },{ 1452, 2 },{ 1454, 4 },{ 1458, 4 },{ 1462, 6 },{ 1468, 6 },{ 1474, 4 },{ 1478, 6 },
 | 
						|
                { 1484, 6 },{ 1490, 4 },{ 1494, 4 },{ 1498, 6 },{ 1504, 4 },{ 1508, 6 },{ 1514, 4 },{ 1518, 6 },{ 1524, 6 },{ 1530, 3 },{ 1533, 7 },
 | 
						|
                { 1540, 6 },{ 1546, 4 },{ 1550, 5 },{ 1555, 6 },{ 1561, 6 },{ 1567, 6 },{ 1573, 5 },{ 1578, 6 },{ 1584, 8 },{ 1592, 6 },{ 1598, 6 },
 | 
						|
                { 1604, 8 },
 | 
						|
            };
 | 
						|
            static const int g_shapes1[][2] =
 | 
						|
            {
 | 
						|
                { 0, 16 }
 | 
						|
            };
 | 
						|
            static const int g_shapes2[64][2] =
 | 
						|
            {
 | 
						|
                { 33, 96 },{ 63, 66 },{ 20, 109 },{ 22, 107 },{ 37, 92 },{ 7, 122 },{ 8, 121 },{ 23, 106 },
 | 
						|
                { 38, 91 },{ 2, 127 },{ 9, 120 },{ 26, 103 },{ 3, 126 },{ 6, 123 },{ 1, 128 },{ 19, 110 },
 | 
						|
                { 15, 114 },{ 124, 5 },{ 72, 57 },{ 115, 14 },{ 125, 4 },{ 70, 59 },{ 100, 29 },{ 60, 69 },
 | 
						|
                { 116, 13 },{ 99, 30 },{ 78, 51 },{ 94, 35 },{ 104, 25 },{ 111, 18 },{ 71, 58 },{ 90, 39 },
 | 
						|
                { 45, 84 },{ 16, 113 },{ 82, 47 },{ 95, 34 },{ 87, 42 },{ 83, 46 },{ 53, 76 },{ 48, 81 },
 | 
						|
                { 68, 61 },{ 105, 24 },{ 98, 31 },{ 88, 41 },{ 75, 54 },{ 43, 86 },{ 52, 77 },{ 117, 12 },
 | 
						|
                { 119, 10 },{ 118, 11 },{ 85, 44 },{ 101, 28 },{ 36, 93 },{ 55, 74 },{ 89, 40 },{ 79, 50 },
 | 
						|
                { 56, 73 },{ 49, 80 },{ 64, 65 },{ 27, 102 },{ 32, 97 },{ 112, 17 },{ 67, 62 },{ 21, 108 },
 | 
						|
            };
 | 
						|
            static const int g_shapes3[64][3] =
 | 
						|
            {
 | 
						|
                { 148, 160, 240 },{ 132, 212, 205 },{ 136, 233, 187 },{ 175, 237, 143 },{ 6, 186, 232 },{ 33, 142, 232 },{ 131, 123, 142 },{ 131, 96, 186 },
 | 
						|
                { 6, 171, 110 },{ 1, 18, 110 },{ 1, 146, 123 },{ 33, 195, 66 },{ 20, 51, 66 },{ 20, 178, 96 },{ 2, 177, 106 },{ 211, 4, 59 },
 | 
						|
                { 8, 191, 91 },{ 230, 14, 29 },{ 1, 188, 234 },{ 151, 110, 168 },{ 20, 144, 238 },{ 137, 66, 206 },{ 173, 179, 232 },{ 209, 194, 186 },
 | 
						|
                { 239, 165, 142 },{ 131, 152, 242 },{ 214, 54, 12 },{ 140, 219, 201 },{ 190, 150, 231 },{ 156, 135, 241 },{ 185, 227, 167 },{ 145, 210, 59 },
 | 
						|
                { 138, 174, 106 },{ 189, 229, 14 },{ 176, 133, 106 },{ 78, 178, 195 },{ 111, 146, 171 },{ 216, 180, 196 },{ 217, 181, 193 },{ 184, 228, 166 },
 | 
						|
                { 192, 225, 153 },{ 134, 141, 123 },{ 6, 222, 198 },{ 149, 183, 96 },{ 33, 226, 164 },{ 161, 215, 51 },{ 197, 221, 18 },{ 1, 223, 199 },
 | 
						|
                { 154, 163, 110 },{ 20, 236, 169 },{ 157, 204, 66 },{ 1, 202, 220 },{ 20, 170, 235 },{ 203, 158, 66 },{ 162, 155, 110 },{ 6, 201, 218 },
 | 
						|
                { 139, 135, 123 },{ 33, 167, 224 },{ 182, 150, 96 },{ 19, 200, 213 },{ 63, 207, 159 },{ 147, 172, 109 },{ 129, 130, 128 },{ 208, 14, 59 },
 | 
						|
            };
 | 
						|
 | 
						|
            static const int g_shapeList1[] =
 | 
						|
            {
 | 
						|
                0,
 | 
						|
            };
 | 
						|
 | 
						|
            static const int g_shapeList1Collapse[] =
 | 
						|
            {
 | 
						|
                0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1,
 | 
						|
            };
 | 
						|
            static const int g_shapeList2[] =
 | 
						|
            {
 | 
						|
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
 | 
						|
                12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
 | 
						|
                23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
 | 
						|
                34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
 | 
						|
                45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
 | 
						|
                56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
 | 
						|
                67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
 | 
						|
                78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
 | 
						|
                89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
 | 
						|
                100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
 | 
						|
                111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
 | 
						|
                122, 123, 124, 125, 126, 127, 128,
 | 
						|
            };
 | 
						|
            static const int g_shapeList2Collapse[] =
 | 
						|
            {
 | 
						|
                -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
 | 
						|
                10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
 | 
						|
                21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
 | 
						|
                32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
 | 
						|
                43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
 | 
						|
                54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
 | 
						|
                65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
 | 
						|
                76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
 | 
						|
                87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
 | 
						|
                98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
 | 
						|
                109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
 | 
						|
                120, 121, 122, 123, 124, 125, 126, 127, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1,
 | 
						|
            };
 | 
						|
 | 
						|
            static const int g_shapeList12[] =
 | 
						|
            {
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
 | 
						|
                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
 | 
						|
                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
 | 
						|
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
 | 
						|
                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
 | 
						|
                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
 | 
						|
                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
 | 
						|
                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
 | 
						|
                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
 | 
						|
                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
 | 
						|
                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
 | 
						|
                121, 122, 123, 124, 125, 126, 127, 128,
 | 
						|
            };
 | 
						|
 | 
						|
            static const int g_shapeList12Collapse[] =
 | 
						|
            {
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
 | 
						|
                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
 | 
						|
                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
 | 
						|
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
 | 
						|
                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
 | 
						|
                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
 | 
						|
                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
 | 
						|
                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
 | 
						|
                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
 | 
						|
                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
 | 
						|
                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
 | 
						|
                121, 122, 123, 124, 125, 126, 127, 128, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1,
 | 
						|
            };
 | 
						|
 | 
						|
            static const int g_shapeList3[] =
 | 
						|
            {
 | 
						|
                1, 2, 4, 6, 8, 12, 14, 18, 19, 20, 29,
 | 
						|
                33, 51, 54, 59, 63, 66, 78, 91, 96, 106, 109,
 | 
						|
                110, 111, 123, 128, 129, 130, 131, 132, 133, 134, 135,
 | 
						|
                136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
 | 
						|
                147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157,
 | 
						|
                158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
 | 
						|
                169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
 | 
						|
                180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
 | 
						|
                191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
 | 
						|
                202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
 | 
						|
                213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
 | 
						|
                224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
 | 
						|
                235, 236, 237, 238, 239, 240, 241, 242,
 | 
						|
            };
 | 
						|
 | 
						|
            static const int g_shapeList3Collapse[] =
 | 
						|
            {
 | 
						|
                -1, 0, 1, -1, 2, -1, 3, -1, 4, -1, -1,
 | 
						|
                -1, 5, -1, 6, -1, -1, -1, 7, 8, 9, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, 10, -1, -1, -1,
 | 
						|
                11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, 12, -1, -1, 13,
 | 
						|
                -1, -1, -1, -1, 14, -1, -1, -1, 15, -1, -1,
 | 
						|
                16, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, 17, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, 18, -1, -1, -1, -1, 19, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, 20, -1, -1, 21,
 | 
						|
                22, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, 24, -1, -1, -1, -1, 25, 26, 27, 28,
 | 
						|
                29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
 | 
						|
                40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
 | 
						|
                51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
 | 
						|
                62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
 | 
						|
                73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
 | 
						|
                84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
 | 
						|
                95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
 | 
						|
                106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
 | 
						|
                117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
 | 
						|
                128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
 | 
						|
                139,
 | 
						|
            };
 | 
						|
 | 
						|
            static const int g_shapeList3Short[] =
 | 
						|
            {
 | 
						|
                1, 2, 4, 6, 18, 20, 33, 51, 59, 66, 96,
 | 
						|
                106, 110, 123, 131, 132, 136, 142, 143, 146, 148, 160,
 | 
						|
                171, 175, 177, 178, 186, 187, 195, 205, 211, 212, 232,
 | 
						|
                233, 237, 240,
 | 
						|
            };
 | 
						|
 | 
						|
            static const int g_shapeList3ShortCollapse[] =
 | 
						|
            {
 | 
						|
                -1, 0, 1, -1, 2, -1, 3, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, 4, -1, 5, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, 7, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, 8, -1, -1, -1, -1, -1, -1,
 | 
						|
                9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, 10, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, 11, -1, -1, -1,
 | 
						|
                12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, 13, -1, -1, -1, -1, -1, -1, -1, 14,
 | 
						|
                15, -1, -1, -1, 16, -1, -1, -1, -1, -1, 17,
 | 
						|
                18, -1, -1, 19, -1, 20, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, 21, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, 23,
 | 
						|
                -1, 24, 25, -1, -1, -1, -1, -1, -1, -1, 26,
 | 
						|
                27, -1, -1, -1, -1, -1, -1, -1, 28, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, 29, -1, -1, -1,
 | 
						|
                -1, -1, 30, 31, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
 | 
						|
                -1, 32, 33, -1, -1, -1, 34, -1, -1, 35, -1,
 | 
						|
                -1,
 | 
						|
            };
 | 
						|
 | 
						|
            static const int g_shapeListAll[] =
 | 
						|
            {
 | 
						|
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
 | 
						|
                11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
 | 
						|
                22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
 | 
						|
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
 | 
						|
                44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
 | 
						|
                55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
 | 
						|
                66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
 | 
						|
                77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
 | 
						|
                88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
 | 
						|
                99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
 | 
						|
                110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
 | 
						|
                121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
 | 
						|
                132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
 | 
						|
                143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
 | 
						|
                154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
 | 
						|
                165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
 | 
						|
                176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186,
 | 
						|
                187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
 | 
						|
                198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
 | 
						|
                209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
 | 
						|
                220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
 | 
						|
                231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241,
 | 
						|
                242,
 | 
						|
            };
 | 
						|
 | 
						|
            static const int g_numShapes1 = sizeof(g_shapeList1) / sizeof(g_shapeList1[0]);
 | 
						|
            static const int g_numShapes2 = sizeof(g_shapeList2) / sizeof(g_shapeList2[0]);
 | 
						|
            static const int g_numShapes12 = sizeof(g_shapeList12) / sizeof(g_shapeList12[0]);
 | 
						|
            static const int g_numShapes3 = sizeof(g_shapeList3) / sizeof(g_shapeList3[0]);
 | 
						|
            static const int g_numShapes3Short = sizeof(g_shapeList3Short) / sizeof(g_shapeList3Short[0]);
 | 
						|
            static const int g_numShapesAll = sizeof(g_shapeListAll) / sizeof(g_shapeListAll[0]);
 | 
						|
            static const int g_numFragments = sizeof(g_fragments) / sizeof(g_fragments[0]);
 | 
						|
 | 
						|
            static const int g_maxFragmentsPerMode = (g_numShapes2 > g_numShapes3) ? g_numShapes2 : g_numShapes3;
 | 
						|
        }
 | 
						|
 | 
						|
        namespace BC6HData
 | 
						|
        {
 | 
						|
            enum EField
 | 
						|
            {
 | 
						|
                NA, // N/A
 | 
						|
                M,  // Mode
 | 
						|
                D,  // Shape
 | 
						|
                RW,
 | 
						|
                RX,
 | 
						|
                RY,
 | 
						|
                RZ,
 | 
						|
                GW,
 | 
						|
                GX,
 | 
						|
                GY,
 | 
						|
                GZ,
 | 
						|
                BW,
 | 
						|
                BX,
 | 
						|
                BY,
 | 
						|
                BZ,
 | 
						|
            };
 | 
						|
 | 
						|
            struct ModeDescriptor
 | 
						|
            {
 | 
						|
                EField m_eField;
 | 
						|
                uint8_t   m_uBit;
 | 
						|
            };
 | 
						|
 | 
						|
            const ModeDescriptor g_modeDescriptors[14][82] =
 | 
						|
            {
 | 
						|
                {   // Mode 1 (0x00) - 10 5 5 5
 | 
						|
                    { M, 0 },{ M, 1 },{ GY, 4 },{ BY, 4 },{ BZ, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
 | 
						|
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
 | 
						|
                    { D, 3 },{ D, 4 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 2 (0x01) - 7 6 6 6
 | 
						|
                    { M, 0 },{ M, 1 },{ GY, 5 },{ GZ, 4 },{ GZ, 5 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
 | 
						|
                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
 | 
						|
                    { D, 3 },{ D, 4 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 3 (0x02) - 11 5 4 4
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { RW,10 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
 | 
						|
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
 | 
						|
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
 | 
						|
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
 | 
						|
                    { D, 3 },{ D, 4 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 4 (0x06) - 11 4 5 4
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
 | 
						|
                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { GW,10 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,10 },
 | 
						|
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 0 },
 | 
						|
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ GY, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
 | 
						|
                    { D, 3 },{ D, 4 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 5 (0x0a) - 11 4 4 5
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,10 },
 | 
						|
                    { BY, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,10 },
 | 
						|
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BW,10 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ BZ, 1 },
 | 
						|
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ BZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
 | 
						|
                    { D, 3 },{ D, 4 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 6 (0x0e) - 9 5 5 5
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
 | 
						|
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
 | 
						|
                    { D, 3 },{ D, 4 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 7 (0x12) - 8 6 5 5
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ GZ, 4 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 3 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
 | 
						|
                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
 | 
						|
                    { D, 3 },{ D, 4 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 8 (0x16) - 8 5 6 5
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 0 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ GZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BZ, 1 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
 | 
						|
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
 | 
						|
                    { D, 3 },{ D, 4 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 9 (0x1a) - 8 5 5 6
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ BY, 5 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { GZ, 4 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { BZ, 0 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
 | 
						|
                    { BZ, 2 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ BZ, 3 },{ D, 0 },{ D, 1 },{ D, 2 },
 | 
						|
                    { D, 3 },{ D, 4 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 10 (0x1e) - 6 6 6 6
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ GZ, 4 },{ BZ, 0 },{ BZ, 1 },{ BY, 4 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GY, 5 },{ BY, 5 },{ BZ, 2 },{ GY, 4 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ GZ, 5 },{ BZ, 3 },{ BZ, 5 },{ BZ, 4 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { RX, 5 },{ GY, 0 },{ GY, 1 },{ GY, 2 },{ GY, 3 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { GX, 5 },{ GZ, 0 },{ GZ, 1 },{ GZ, 2 },{ GZ, 3 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BX, 5 },{ BY, 0 },{ BY, 1 },{ BY, 2 },{ BY, 3 },{ RY, 0 },{ RY, 1 },{ RY, 2 },{ RY, 3 },{ RY, 4 },
 | 
						|
                    { RY, 5 },{ RZ, 0 },{ RZ, 1 },{ RZ, 2 },{ RZ, 3 },{ RZ, 4 },{ RZ, 5 },{ D, 0 },{ D, 1 },{ D, 2 },
 | 
						|
                    { D, 3 },{ D, 4 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 11 (0x03) - 10 10
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RX, 9 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GX, 9 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BX, 9 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
 | 
						|
                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
 | 
						|
                    { NA, 0 },{ NA, 0 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 12 (0x07) - 11 9
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RX, 8 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GX, 8 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BX, 8 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
 | 
						|
                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
 | 
						|
                    { NA, 0 },{ NA, 0 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 13 (0x0b) - 12 8
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RX, 4 },
 | 
						|
                    { RX, 5 },{ RX, 6 },{ RX, 7 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GX, 4 },
 | 
						|
                    { GX, 5 },{ GX, 6 },{ GX, 7 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BX, 4 },
 | 
						|
                    { BX, 5 },{ BX, 6 },{ BX, 7 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
 | 
						|
                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
 | 
						|
                    { NA, 0 },{ NA, 0 },
 | 
						|
                },
 | 
						|
 | 
						|
                {   // Mode 14 (0x0f) - 16 4
 | 
						|
                    { M, 0 },{ M, 1 },{ M, 2 },{ M, 3 },{ M, 4 },{ RW, 0 },{ RW, 1 },{ RW, 2 },{ RW, 3 },{ RW, 4 },
 | 
						|
                    { RW, 5 },{ RW, 6 },{ RW, 7 },{ RW, 8 },{ RW, 9 },{ GW, 0 },{ GW, 1 },{ GW, 2 },{ GW, 3 },{ GW, 4 },
 | 
						|
                    { GW, 5 },{ GW, 6 },{ GW, 7 },{ GW, 8 },{ GW, 9 },{ BW, 0 },{ BW, 1 },{ BW, 2 },{ BW, 3 },{ BW, 4 },
 | 
						|
                    { BW, 5 },{ BW, 6 },{ BW, 7 },{ BW, 8 },{ BW, 9 },{ RX, 0 },{ RX, 1 },{ RX, 2 },{ RX, 3 },{ RW,15 },
 | 
						|
                    { RW,14 },{ RW,13 },{ RW,12 },{ RW,11 },{ RW,10 },{ GX, 0 },{ GX, 1 },{ GX, 2 },{ GX, 3 },{ GW,15 },
 | 
						|
                    { GW,14 },{ GW,13 },{ GW,12 },{ GW,11 },{ GW,10 },{ BX, 0 },{ BX, 1 },{ BX, 2 },{ BX, 3 },{ BW,15 },
 | 
						|
                    { BW,14 },{ BW,13 },{ BW,12 },{ BW,11 },{ BW,10 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
 | 
						|
                    { NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },{ NA, 0 },
 | 
						|
                    { NA, 0 },{ NA, 0 },
 | 
						|
                },
 | 
						|
            };
 | 
						|
        }
 | 
						|
 | 
						|
        struct PackingVector
 | 
						|
        {
 | 
						|
            uint32_t m_vector[4];
 | 
						|
            int m_offset;
 | 
						|
 | 
						|
            void Init()
 | 
						|
            {
 | 
						|
                for (int i = 0; i < 4; i++)
 | 
						|
                    m_vector[i] = 0;
 | 
						|
 | 
						|
                m_offset = 0;
 | 
						|
            }
 | 
						|
 | 
						|
            inline void Pack(ParallelMath::ScalarUInt16 value, int bits)
 | 
						|
            {
 | 
						|
                int vOffset = m_offset >> 5;
 | 
						|
                int bitOffset = m_offset & 0x1f;
 | 
						|
 | 
						|
                m_vector[vOffset] |= (static_cast<uint32_t>(value) << bitOffset) & static_cast<uint32_t>(0xffffffff);
 | 
						|
 | 
						|
                int overflowBits = bitOffset + bits - 32;
 | 
						|
                if (overflowBits > 0)
 | 
						|
                    m_vector[vOffset + 1] |= (static_cast<uint32_t>(value) >> (bits - overflowBits));
 | 
						|
 | 
						|
                m_offset += bits;
 | 
						|
            }
 | 
						|
 | 
						|
            inline void Flush(uint8_t* output)
 | 
						|
            {
 | 
						|
                assert(m_offset == 128);
 | 
						|
 | 
						|
                for (int v = 0; v < 4; v++)
 | 
						|
                {
 | 
						|
                    uint32_t chunk = m_vector[v];
 | 
						|
                    for (int b = 0; b < 4; b++)
 | 
						|
                        output[v * 4 + b] = static_cast<uint8_t>((chunk >> (b * 8)) & 0xff);
 | 
						|
                }
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
 | 
						|
		struct UnpackingVector
 | 
						|
		{
 | 
						|
			uint32_t m_vector[4];
 | 
						|
 | 
						|
			void Init(const uint8_t *bytes)
 | 
						|
			{
 | 
						|
				for (int i = 0; i < 4; i++)
 | 
						|
					m_vector[i] = 0;
 | 
						|
 | 
						|
				for (int b = 0; b < 16; b++)
 | 
						|
					m_vector[b / 4] |= (bytes[b] << ((b % 4) * 8));
 | 
						|
			}
 | 
						|
 | 
						|
			inline ParallelMath::ScalarUInt16 Unpack(int bits)
 | 
						|
			{
 | 
						|
				uint32_t bitMask = (1 << bits) - 1;
 | 
						|
 | 
						|
				ParallelMath::ScalarUInt16 result = static_cast<ParallelMath::ScalarUInt16>(m_vector[0] & bitMask);
 | 
						|
 | 
						|
				for (int i = 0; i < 4; i++)
 | 
						|
				{
 | 
						|
					m_vector[i] >>= bits;
 | 
						|
					if (i != 3)
 | 
						|
						m_vector[i] |= (m_vector[i + 1] & bitMask) << (32 - bits);
 | 
						|
				}
 | 
						|
 | 
						|
				return result;
 | 
						|
			}
 | 
						|
		};
 | 
						|
 | 
						|
        void ComputeTweakFactors(int tweak, int range, float *outFactors)
 | 
						|
        {
 | 
						|
            int totalUnits = range - 1;
 | 
						|
            int minOutsideUnits = ((tweak >> 1) & 1);
 | 
						|
            int maxOutsideUnits = (tweak & 1);
 | 
						|
            int insideUnits = totalUnits - minOutsideUnits - maxOutsideUnits;
 | 
						|
 | 
						|
            outFactors[0] = -static_cast<float>(minOutsideUnits) / static_cast<float>(insideUnits);
 | 
						|
            outFactors[1] = static_cast<float>(maxOutsideUnits) / static_cast<float>(insideUnits) + 1.0f;
 | 
						|
        }
 | 
						|
 | 
						|
        ParallelMath::Float ScaleHDRValue(const ParallelMath::Float &v, bool isSigned)
 | 
						|
        {
 | 
						|
            if (isSigned)
 | 
						|
            {
 | 
						|
                ParallelMath::Float offset = ParallelMath::Select(ParallelMath::Less(v, ParallelMath::MakeFloatZero()), ParallelMath::MakeFloat(-30.0f), ParallelMath::MakeFloat(30.0f));
 | 
						|
                return (v * 32.0f + offset) / 31.0f;
 | 
						|
            }
 | 
						|
            else
 | 
						|
                return (v * 64.0f + 30.0f) / 31.0f;
 | 
						|
        }
 | 
						|
 | 
						|
        ParallelMath::SInt16 UnscaleHDRValueSigned(const ParallelMath::SInt16 &v)
 | 
						|
        {
 | 
						|
#ifdef CVTT_ENABLE_ASSERTS
 | 
						|
            for (int i = 0; i < ParallelMath::ParallelSize; i++)
 | 
						|
                assert(ParallelMath::Extract(v, i) != -32768)
 | 
						|
#endif
 | 
						|
 | 
						|
            ParallelMath::Int16CompFlag negative = ParallelMath::Less(v, ParallelMath::MakeSInt16(0));
 | 
						|
            ParallelMath::UInt15 absComp = ParallelMath::LosslessCast<ParallelMath::UInt15>::Cast(ParallelMath::Select(negative, ParallelMath::SInt16(ParallelMath::MakeSInt16(0) - v), v));
 | 
						|
 | 
						|
            ParallelMath::UInt31 multiplied = ParallelMath::XMultiply(absComp, ParallelMath::MakeUInt15(31));
 | 
						|
            ParallelMath::UInt31 shifted = ParallelMath::RightShift(multiplied, 5);
 | 
						|
            ParallelMath::UInt15 absCompScaled = ParallelMath::ToUInt15(shifted);
 | 
						|
            ParallelMath::SInt16 signBits = ParallelMath::SelectOrZero(negative, ParallelMath::MakeSInt16(-32768));
 | 
						|
 | 
						|
            return ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(absCompScaled) | signBits;
 | 
						|
        }
 | 
						|
 | 
						|
        ParallelMath::UInt15 UnscaleHDRValueUnsigned(const ParallelMath::UInt16 &v)
 | 
						|
        {
 | 
						|
            return ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(v, ParallelMath::MakeUInt15(31)), 6));
 | 
						|
        }
 | 
						|
 | 
						|
        void UnscaleHDREndpoints(const ParallelMath::AInt16 inEP[2][3], ParallelMath::AInt16 outEP[2][3], bool isSigned)
 | 
						|
        {
 | 
						|
            for (int epi = 0; epi < 2; epi++)
 | 
						|
            {
 | 
						|
                for (int ch = 0; ch < 3; ch++)
 | 
						|
                {
 | 
						|
                    if (isSigned)
 | 
						|
                        outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueSigned(ParallelMath::LosslessCast<ParallelMath::SInt16>::Cast(inEP[epi][ch])));
 | 
						|
                    else
 | 
						|
                        outEP[epi][ch] = ParallelMath::LosslessCast<ParallelMath::AInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::LosslessCast<ParallelMath::UInt16>::Cast(inEP[epi][ch])));
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        template<int TVectorSize>
 | 
						|
        class UnfinishedEndpoints
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
            typedef ParallelMath::UInt16 MUInt16;
 | 
						|
            typedef ParallelMath::UInt15 MUInt15;
 | 
						|
            typedef ParallelMath::SInt16 MSInt16;
 | 
						|
            typedef ParallelMath::SInt32 MSInt32;
 | 
						|
 | 
						|
            UnfinishedEndpoints()
 | 
						|
            {
 | 
						|
            }
 | 
						|
 | 
						|
            UnfinishedEndpoints(const MFloat *base, const MFloat *offset)
 | 
						|
            {
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    m_base[ch] = base[ch];
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    m_offset[ch] = offset[ch];
 | 
						|
            }
 | 
						|
 | 
						|
            UnfinishedEndpoints(const UnfinishedEndpoints& other)
 | 
						|
            {
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    m_base[ch] = other.m_base[ch];
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    m_offset[ch] = other.m_offset[ch];
 | 
						|
            }
 | 
						|
 | 
						|
            void FinishHDRUnsigned(int tweak, int range, MSInt16 *outEP0, MSInt16 *outEP1, ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                float tweakFactors[2];
 | 
						|
                ComputeTweakFactors(tweak, range, tweakFactors);
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    MUInt15 channelEPs[2];
 | 
						|
                    for (int epi = 0; epi < 2; epi++)
 | 
						|
                    {
 | 
						|
                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], 0.0f, 31743.0f);
 | 
						|
                        channelEPs[epi] = ParallelMath::RoundAndConvertToU15(f, roundingMode);
 | 
						|
                    }
 | 
						|
 | 
						|
                    outEP0[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[0]);
 | 
						|
                    outEP1[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(channelEPs[1]);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            void FinishHDRSigned(int tweak, int range, MSInt16* outEP0, MSInt16* outEP1, ParallelMath::RoundTowardNearestForScope* roundingMode)
 | 
						|
            {
 | 
						|
                float tweakFactors[2];
 | 
						|
                ComputeTweakFactors(tweak, range, tweakFactors);
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    MSInt16 channelEPs[2];
 | 
						|
                    for (int epi = 0; epi < 2; epi++)
 | 
						|
                    {
 | 
						|
                        MFloat f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[epi], -31743.0f, 31743.0f);
 | 
						|
                        channelEPs[epi] = ParallelMath::RoundAndConvertToS16(f, roundingMode);
 | 
						|
                    }
 | 
						|
 | 
						|
                    outEP0[ch] = channelEPs[0];
 | 
						|
                    outEP1[ch] = channelEPs[1];
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            void FinishLDR(int tweak, int range, MUInt15* outEP0, MUInt15* outEP1)
 | 
						|
            {
 | 
						|
                ParallelMath::RoundTowardNearestForScope roundingMode;
 | 
						|
 | 
						|
                float tweakFactors[2];
 | 
						|
                ComputeTweakFactors(tweak, range, tweakFactors);
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    MFloat ep0f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[0], 0.0f, 255.0f);
 | 
						|
                    MFloat ep1f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[1], 0.0f, 255.0f);
 | 
						|
                    outEP0[ch] = ParallelMath::RoundAndConvertToU15(ep0f, &roundingMode);
 | 
						|
                    outEP1[ch] = ParallelMath::RoundAndConvertToU15(ep1f, &roundingMode);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            template<int TNewVectorSize>
 | 
						|
            UnfinishedEndpoints<TNewVectorSize> ExpandTo(float filler)
 | 
						|
            {
 | 
						|
                MFloat newBase[TNewVectorSize];
 | 
						|
                MFloat newOffset[TNewVectorSize];
 | 
						|
 | 
						|
                for (int ch = 0; ch < TNewVectorSize && ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    newBase[ch] = m_base[ch];
 | 
						|
                    newOffset[ch] = m_offset[ch];
 | 
						|
                }
 | 
						|
 | 
						|
                MFloat fillerV = ParallelMath::MakeFloat(filler);
 | 
						|
 | 
						|
                for (int ch = TVectorSize; ch < TNewVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    newBase[ch] = fillerV;
 | 
						|
                    newOffset[ch] = ParallelMath::MakeFloatZero();
 | 
						|
                }
 | 
						|
 | 
						|
                return UnfinishedEndpoints<TNewVectorSize>(newBase, newOffset);
 | 
						|
            }
 | 
						|
 | 
						|
        private:
 | 
						|
            MFloat m_base[TVectorSize];
 | 
						|
            MFloat m_offset[TVectorSize];
 | 
						|
        };
 | 
						|
 | 
						|
        template<int TMatrixSize>
 | 
						|
        class PackedCovarianceMatrix
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            // 0: xx,
 | 
						|
            // 1: xy, yy
 | 
						|
            // 3: xz, yz, zz 
 | 
						|
            // 6: xw, yw, zw, ww
 | 
						|
            // ... etc.
 | 
						|
            static const int PyramidSize = (TMatrixSize * (TMatrixSize + 1)) / 2;
 | 
						|
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
 | 
						|
            PackedCovarianceMatrix()
 | 
						|
            {
 | 
						|
                for (int i = 0; i < PyramidSize; i++)
 | 
						|
                    m_values[i] = ParallelMath::MakeFloatZero();
 | 
						|
            }
 | 
						|
 | 
						|
            void Add(const ParallelMath::Float *vec, const ParallelMath::Float &weight)
 | 
						|
            {
 | 
						|
                int index = 0;
 | 
						|
                for (int row = 0; row < TMatrixSize; row++)
 | 
						|
                {
 | 
						|
                    for (int col = 0; col <= row; col++)
 | 
						|
                    {
 | 
						|
                        m_values[index] = m_values[index] + vec[row] * vec[col] * weight;
 | 
						|
                        index++;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            void Product(MFloat *outVec, const MFloat *inVec)
 | 
						|
            {
 | 
						|
                for (int row = 0; row < TMatrixSize; row++)
 | 
						|
                {
 | 
						|
                    MFloat sum = ParallelMath::MakeFloatZero();
 | 
						|
 | 
						|
                    int index = (row * (row + 1)) >> 1;
 | 
						|
                    for (int col = 0; col < TMatrixSize; col++)
 | 
						|
                    {
 | 
						|
                        sum = sum + inVec[col] * m_values[index];
 | 
						|
                        if (col >= row)
 | 
						|
                            index += col + 1;
 | 
						|
                        else
 | 
						|
                            index++;
 | 
						|
                    }
 | 
						|
 | 
						|
                    outVec[row] = sum;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
        private:
 | 
						|
            ParallelMath::Float m_values[PyramidSize];
 | 
						|
        };
 | 
						|
 | 
						|
        static const int NumEndpointSelectorPasses = 3;
 | 
						|
 | 
						|
        template<int TVectorSize, int TIterationCount>
 | 
						|
        class EndpointSelector
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
 | 
						|
            EndpointSelector()
 | 
						|
            {
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    m_centroid[ch] = ParallelMath::MakeFloatZero();
 | 
						|
                    m_direction[ch] = ParallelMath::MakeFloatZero();
 | 
						|
                }
 | 
						|
                m_weightTotal = ParallelMath::MakeFloatZero();
 | 
						|
                m_minDist = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
                m_maxDist = ParallelMath::MakeFloat(-FLT_MAX);
 | 
						|
            }
 | 
						|
 | 
						|
            void ContributePass(const MFloat *value, int pass, const MFloat &weight)
 | 
						|
            {
 | 
						|
                if (pass == 0)
 | 
						|
                    ContributeCentroid(value, weight);
 | 
						|
                else if (pass == 1)
 | 
						|
                    ContributeDirection(value, weight);
 | 
						|
                else if (pass == 2)
 | 
						|
                    ContributeMinMax(value);
 | 
						|
            }
 | 
						|
 | 
						|
            void FinishPass(int pass)
 | 
						|
            {
 | 
						|
                if (pass == 0)
 | 
						|
                    FinishCentroid();
 | 
						|
                else if (pass == 1)
 | 
						|
                    FinishDirection();
 | 
						|
            }
 | 
						|
 | 
						|
            UnfinishedEndpoints<TVectorSize> GetEndpoints(const float channelWeights[TVectorSize]) const
 | 
						|
            {
 | 
						|
                MFloat unweightedBase[TVectorSize];
 | 
						|
                MFloat unweightedOffset[TVectorSize];
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    MFloat min = m_centroid[ch] + m_direction[ch] * m_minDist;
 | 
						|
                    MFloat max = m_centroid[ch] + m_direction[ch] * m_maxDist;
 | 
						|
 | 
						|
                    float safeWeight = channelWeights[ch];
 | 
						|
                    if (safeWeight == 0.f)
 | 
						|
                        safeWeight = 1.0f;
 | 
						|
 | 
						|
                    unweightedBase[ch] = min / channelWeights[ch];
 | 
						|
                    unweightedOffset[ch] = (max - min) / channelWeights[ch];
 | 
						|
                }
 | 
						|
 | 
						|
                return UnfinishedEndpoints<TVectorSize>(unweightedBase, unweightedOffset);
 | 
						|
            }
 | 
						|
 | 
						|
        private:
 | 
						|
            void ContributeCentroid(const MFloat *value, const MFloat &weight)
 | 
						|
            {
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    m_centroid[ch] = m_centroid[ch] + value[ch] * weight;
 | 
						|
                m_weightTotal = m_weightTotal + weight;
 | 
						|
            }
 | 
						|
 | 
						|
            void FinishCentroid()
 | 
						|
            {
 | 
						|
                MFloat denom = m_weightTotal;
 | 
						|
                ParallelMath::MakeSafeDenominator(denom);
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    m_centroid[ch] = m_centroid[ch] / denom;
 | 
						|
            }
 | 
						|
 | 
						|
            void ContributeDirection(const MFloat *value, const MFloat &weight)
 | 
						|
            {
 | 
						|
                MFloat diff[TVectorSize];
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    diff[ch] = value[ch] - m_centroid[ch];
 | 
						|
 | 
						|
                m_covarianceMatrix.Add(diff, weight);
 | 
						|
            }
 | 
						|
 | 
						|
            void FinishDirection()
 | 
						|
            {
 | 
						|
                MFloat approx[TVectorSize];
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    approx[ch] = ParallelMath::MakeFloat(1.0f);
 | 
						|
 | 
						|
                for (int i = 0; i < TIterationCount; i++)
 | 
						|
                {
 | 
						|
                    MFloat product[TVectorSize];
 | 
						|
                    m_covarianceMatrix.Product(product, approx);
 | 
						|
 | 
						|
                    MFloat largestComponent = product[0];
 | 
						|
                    for (int ch = 1; ch < TVectorSize; ch++)
 | 
						|
                        largestComponent = ParallelMath::Max(largestComponent, product[ch]);
 | 
						|
 | 
						|
                    // product = largestComponent*newApprox
 | 
						|
                    ParallelMath::MakeSafeDenominator(largestComponent);
 | 
						|
                    for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                        approx[ch] = product[ch] / largestComponent;
 | 
						|
                }
 | 
						|
 | 
						|
                // Normalize
 | 
						|
                MFloat approxLen = ParallelMath::MakeFloatZero();
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    approxLen = approxLen + approx[ch] * approx[ch];
 | 
						|
 | 
						|
                approxLen = ParallelMath::Sqrt(approxLen);
 | 
						|
 | 
						|
                ParallelMath::MakeSafeDenominator(approxLen);
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    m_direction[ch] = approx[ch] / approxLen;
 | 
						|
            }
 | 
						|
 | 
						|
            void ContributeMinMax(const MFloat *value)
 | 
						|
            {
 | 
						|
                MFloat dist = ParallelMath::MakeFloatZero();
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    dist = dist + m_direction[ch] * (value[ch] - m_centroid[ch]);
 | 
						|
 | 
						|
                m_minDist = ParallelMath::Min(m_minDist, dist);
 | 
						|
                m_maxDist = ParallelMath::Max(m_maxDist, dist);
 | 
						|
            }
 | 
						|
 | 
						|
            ParallelMath::Float m_centroid[TVectorSize];
 | 
						|
            ParallelMath::Float m_direction[TVectorSize];
 | 
						|
            PackedCovarianceMatrix<TVectorSize> m_covarianceMatrix;
 | 
						|
            ParallelMath::Float m_weightTotal;
 | 
						|
 | 
						|
            ParallelMath::Float m_minDist;
 | 
						|
            ParallelMath::Float m_maxDist;
 | 
						|
        };
 | 
						|
 | 
						|
        static const ParallelMath::UInt16 g_weightReciprocals[] =
 | 
						|
        {
 | 
						|
            ParallelMath::MakeUInt16(0),        // -1 
 | 
						|
            ParallelMath::MakeUInt16(0),        // 0
 | 
						|
            ParallelMath::MakeUInt16(32768),    // 1
 | 
						|
            ParallelMath::MakeUInt16(16384),    // 2
 | 
						|
            ParallelMath::MakeUInt16(10923),    // 3
 | 
						|
            ParallelMath::MakeUInt16(8192),     // 4
 | 
						|
            ParallelMath::MakeUInt16(6554),     // 5
 | 
						|
            ParallelMath::MakeUInt16(5461),     // 6
 | 
						|
            ParallelMath::MakeUInt16(4681),     // 7
 | 
						|
            ParallelMath::MakeUInt16(4096),     // 8
 | 
						|
            ParallelMath::MakeUInt16(3641),     // 9
 | 
						|
            ParallelMath::MakeUInt16(3277),     // 10
 | 
						|
            ParallelMath::MakeUInt16(2979),     // 11
 | 
						|
            ParallelMath::MakeUInt16(2731),     // 12
 | 
						|
            ParallelMath::MakeUInt16(2521),     // 13
 | 
						|
            ParallelMath::MakeUInt16(2341),     // 14
 | 
						|
            ParallelMath::MakeUInt16(2185),     // 15
 | 
						|
        };
 | 
						|
 | 
						|
        template<int TVectorSize>
 | 
						|
        class IndexSelector
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
            typedef ParallelMath::UInt16 MUInt16;
 | 
						|
            typedef ParallelMath::UInt15 MUInt15;
 | 
						|
            typedef ParallelMath::SInt16 MSInt16;
 | 
						|
            typedef ParallelMath::AInt16 MAInt16;
 | 
						|
            typedef ParallelMath::SInt32 MSInt32;
 | 
						|
            typedef ParallelMath::UInt31 MUInt31;
 | 
						|
 | 
						|
            template<class TInterpolationEPType, class TColorEPType>
 | 
						|
            void Init(const float *channelWeights, const TInterpolationEPType interpolationEndPoints[2][TVectorSize], const TColorEPType colorSpaceEndpoints[2][TVectorSize], int range)
 | 
						|
            {
 | 
						|
                // In BC6H, the interpolation endpoints are higher-precision than the endpoints in color space.
 | 
						|
                // We need to select indexes using the color-space endpoints.
 | 
						|
 | 
						|
                m_isUniform = true;
 | 
						|
                for (int ch = 1; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    if (channelWeights[ch] != channelWeights[0])
 | 
						|
                        m_isUniform = false;
 | 
						|
                }
 | 
						|
 | 
						|
                // To work with channel weights, we need something where:
 | 
						|
                // pxDiff = px - ep[0]
 | 
						|
                // epDiff = ep[1] - ep[0]
 | 
						|
                //
 | 
						|
                // weightedEPDiff = epDiff * channelWeights
 | 
						|
                // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff)
 | 
						|
                // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff)
 | 
						|
                // index = normalizedIndex * maxValue
 | 
						|
                //
 | 
						|
                // Equivalent to:
 | 
						|
                // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights)
 | 
						|
                // index = dot(axis, pxDiff)
 | 
						|
 | 
						|
                for (int ep = 0; ep < 2; ep++)
 | 
						|
                    for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                        m_endPoint[ep][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(interpolationEndPoints[ep][ch]);
 | 
						|
 | 
						|
                m_range = range;
 | 
						|
                m_maxValue = static_cast<float>(range - 1);
 | 
						|
 | 
						|
                MFloat epDiffWeighted[TVectorSize];
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    m_origin[ch] = ParallelMath::ToFloat(colorSpaceEndpoints[0][ch]);
 | 
						|
                    MFloat opposingOriginCh = ParallelMath::ToFloat(colorSpaceEndpoints[1][ch]);
 | 
						|
                    epDiffWeighted[ch] = (opposingOriginCh - m_origin[ch]) * channelWeights[ch];
 | 
						|
                }
 | 
						|
 | 
						|
                MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0];
 | 
						|
                for (int ch = 1; ch < TVectorSize; ch++)
 | 
						|
                    lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch];
 | 
						|
 | 
						|
                ParallelMath::MakeSafeDenominator(lenSquared);
 | 
						|
 | 
						|
                MFloat maxValueDividedByLengthSquared = ParallelMath::MakeFloat(m_maxValue) / lenSquared;
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    m_axis[ch] = epDiffWeighted[ch] * channelWeights[ch] * maxValueDividedByLengthSquared;
 | 
						|
            }
 | 
						|
 | 
						|
            template<bool TSigned>
 | 
						|
            void Init(const float channelWeights[TVectorSize], const MUInt15 endPoints[2][TVectorSize], int range)
 | 
						|
            {
 | 
						|
                MAInt16 converted[2][TVectorSize];
 | 
						|
                for (int epi = 0; epi < 2; epi++)
 | 
						|
                    for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                        converted[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(endPoints[epi][ch]);
 | 
						|
 | 
						|
                Init<MUInt15, MUInt15>(channelWeights, endPoints, endPoints, range);
 | 
						|
            }
 | 
						|
 | 
						|
            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
 | 
						|
            {
 | 
						|
                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
 | 
						|
 | 
						|
                for (int ch = 0; ch < numRealChannels; ch++)
 | 
						|
                {
 | 
						|
                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(64) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
 | 
						|
                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
 | 
						|
                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(32), 6));
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel, int numRealChannels)
 | 
						|
            {
 | 
						|
                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 64, 7));
 | 
						|
 | 
						|
                for (int ch = 0; ch < numRealChannels; ch++)
 | 
						|
                {
 | 
						|
                    MUInt15 ep0f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply((ParallelMath::MakeUInt15(256) - weight), ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[0][ch])));
 | 
						|
                    MUInt15 ep1f = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::CompactMultiply(weight, ParallelMath::LosslessCast<MUInt15>::Cast(m_endPoint[1][ch])));
 | 
						|
                    pixel[ch] = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ep0f + ep1f + ParallelMath::MakeUInt15(128), 8));
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            void ReconstructLDR_BC7(const MUInt15 &index, MUInt15* pixel)
 | 
						|
            {
 | 
						|
                ReconstructLDR_BC7(index, pixel, TVectorSize);
 | 
						|
            }
 | 
						|
 | 
						|
            void ReconstructLDRPrecise(const MUInt15 &index, MUInt15* pixel)
 | 
						|
            {
 | 
						|
                ReconstructLDRPrecise(index, pixel, TVectorSize);
 | 
						|
            }
 | 
						|
 | 
						|
            MUInt15 SelectIndexLDR(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
 | 
						|
            {
 | 
						|
                MFloat dist = (pixel[0] - m_origin[0]) * m_axis[0];
 | 
						|
                for (int ch = 1; ch < TVectorSize; ch++)
 | 
						|
                    dist = dist + (pixel[ch] - m_origin[ch]) * m_axis[ch];
 | 
						|
 | 
						|
                return ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(dist, 0.0f, m_maxValue), rtn);
 | 
						|
            }
 | 
						|
 | 
						|
        protected:
 | 
						|
            MAInt16 m_endPoint[2][TVectorSize];
 | 
						|
 | 
						|
        private:
 | 
						|
            MFloat m_origin[TVectorSize];
 | 
						|
            MFloat m_axis[TVectorSize];
 | 
						|
            int m_range;
 | 
						|
            float m_maxValue;
 | 
						|
            bool m_isUniform;
 | 
						|
        };
 | 
						|
 | 
						|
 | 
						|
        template<int TVectorSize>
 | 
						|
        class IndexSelectorHDR : public IndexSelector<TVectorSize>
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            typedef ParallelMath::UInt15 MUInt15;
 | 
						|
            typedef ParallelMath::UInt16 MUInt16;
 | 
						|
            typedef ParallelMath::UInt31 MUInt31;
 | 
						|
            typedef ParallelMath::SInt16 MSInt16;
 | 
						|
            typedef ParallelMath::SInt32 MSInt32;
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
 | 
						|
        private:
 | 
						|
 | 
						|
            MUInt15 InvertSingle(const MUInt15& anIndex) const
 | 
						|
            {
 | 
						|
                MUInt15 inverted = m_maxValueMinusOne - anIndex;
 | 
						|
                return ParallelMath::Select(m_isInverted, inverted, anIndex);
 | 
						|
            }
 | 
						|
 | 
						|
            void ReconstructHDRSignedUninverted(const MUInt15 &index, MSInt16* pixel) const
 | 
						|
            {
 | 
						|
                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    MSInt16 ep0 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[0][ch]);
 | 
						|
                    MSInt16 ep1 = ParallelMath::LosslessCast<MSInt16>::Cast(this->m_endPoint[1][ch]);
 | 
						|
 | 
						|
                    MSInt32 pixel32 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
 | 
						|
 | 
						|
                    pixel32 = ParallelMath::RightShift(pixel32 + ParallelMath::MakeSInt32(32), 6);
 | 
						|
 | 
						|
                    pixel[ch] = UnscaleHDRValueSigned(ParallelMath::ToSInt16(pixel32));
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            void ReconstructHDRUnsignedUninverted(const MUInt15 &index, MSInt16* pixel) const
 | 
						|
            {
 | 
						|
                MUInt15 weight = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(g_weightReciprocals[m_range], index) + 256, 9));
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    MUInt16 ep0 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[0][ch]);
 | 
						|
                    MUInt16 ep1 = ParallelMath::LosslessCast<MUInt16>::Cast(this->m_endPoint[1][ch]);
 | 
						|
 | 
						|
                    MUInt31 pixel31 = ParallelMath::XMultiply((ParallelMath::MakeUInt15(64) - weight), ep0) + ParallelMath::XMultiply(weight, ep1);
 | 
						|
 | 
						|
                    pixel31 = ParallelMath::RightShift(pixel31 + ParallelMath::MakeUInt31(32), 6);
 | 
						|
 | 
						|
                    pixel[ch] = ParallelMath::LosslessCast<MSInt16>::Cast(UnscaleHDRValueUnsigned(ParallelMath::ToUInt16(pixel31)));
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            MFloat ErrorForInterpolatorComponent(int index, int ch, const MFloat *pixel) const
 | 
						|
            {
 | 
						|
                MFloat diff = pixel[ch] - m_reconstructedInterpolators[index][ch];
 | 
						|
                return diff * diff;
 | 
						|
            }
 | 
						|
 | 
						|
            MFloat ErrorForInterpolator(int index, const MFloat *pixel) const
 | 
						|
            {
 | 
						|
                MFloat error = ErrorForInterpolatorComponent(index, 0, pixel);
 | 
						|
                for (int ch = 1; ch < TVectorSize; ch++)
 | 
						|
                    error = error + ErrorForInterpolatorComponent(index, ch, pixel);
 | 
						|
                return error;
 | 
						|
            }
 | 
						|
 | 
						|
        public:
 | 
						|
 | 
						|
            void InitHDR(int range, bool isSigned, bool fastIndexing, const float *channelWeights)
 | 
						|
            {
 | 
						|
                assert(range <= 16);
 | 
						|
 | 
						|
                m_range = range;
 | 
						|
 | 
						|
                m_isInverted = ParallelMath::MakeBoolInt16(false);
 | 
						|
                m_maxValueMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(range - 1));
 | 
						|
 | 
						|
                if (!fastIndexing)
 | 
						|
                {
 | 
						|
                    for (int i = 0; i < range; i++)
 | 
						|
                    {
 | 
						|
                        MSInt16 recon2CL[TVectorSize];
 | 
						|
 | 
						|
                        if (isSigned)
 | 
						|
                            ReconstructHDRSignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
 | 
						|
                        else
 | 
						|
                            ReconstructHDRUnsignedUninverted(ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), recon2CL);
 | 
						|
 | 
						|
                        for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                            m_reconstructedInterpolators[i][ch] = ParallelMath::TwosCLHalfToFloat(recon2CL[ch]) * channelWeights[ch];
 | 
						|
                    }
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            void ReconstructHDRSigned(const MUInt15 &index, MSInt16* pixel) const
 | 
						|
            {
 | 
						|
                ReconstructHDRSignedUninverted(InvertSingle(index), pixel);
 | 
						|
            }
 | 
						|
 | 
						|
            void ReconstructHDRUnsigned(const MUInt15 &index, MSInt16* pixel) const
 | 
						|
            {
 | 
						|
                ReconstructHDRUnsignedUninverted(InvertSingle(index), pixel);
 | 
						|
            }
 | 
						|
 | 
						|
            void ConditionalInvert(const ParallelMath::Int16CompFlag &invert)
 | 
						|
            {
 | 
						|
                m_isInverted = invert;
 | 
						|
            }
 | 
						|
 | 
						|
            MUInt15 SelectIndexHDRSlow(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope*) const
 | 
						|
            {
 | 
						|
                MUInt15 index = ParallelMath::MakeUInt15(0);
 | 
						|
 | 
						|
                MFloat bestError = ErrorForInterpolator(0, pixel);
 | 
						|
                for (int i = 1; i < m_range; i++)
 | 
						|
                {
 | 
						|
                    MFloat error = ErrorForInterpolator(i, pixel);
 | 
						|
                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
 | 
						|
                    ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(errorBetter), ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
 | 
						|
                    bestError = ParallelMath::Min(bestError, error);
 | 
						|
                }
 | 
						|
 | 
						|
                return InvertSingle(index);
 | 
						|
            }
 | 
						|
 | 
						|
            MUInt15 SelectIndexHDRFast(const MFloat* pixel, const ParallelMath::RoundTowardNearestForScope* rtn) const
 | 
						|
            {
 | 
						|
                return InvertSingle(this->SelectIndexLDR(pixel, rtn));
 | 
						|
            }
 | 
						|
 | 
						|
        private:
 | 
						|
            MFloat m_reconstructedInterpolators[16][TVectorSize];
 | 
						|
            ParallelMath::Int16CompFlag m_isInverted;
 | 
						|
            MUInt15 m_maxValueMinusOne;
 | 
						|
            int m_range;
 | 
						|
        };
 | 
						|
 | 
						|
        // Solve for a, b where v = a*t + b
 | 
						|
        // This allows endpoints to be mapped to where T=0 and T=1
 | 
						|
        // Least squares from totals:
 | 
						|
        // a = (tv - t*v/w)/(tt - t*t/w)
 | 
						|
        // b = (v - a*t)/w
 | 
						|
        template<int TVectorSize>
 | 
						|
        class EndpointRefiner
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
            typedef ParallelMath::UInt16 MUInt16;
 | 
						|
            typedef ParallelMath::UInt15 MUInt15;
 | 
						|
            typedef ParallelMath::AInt16 MAInt16;
 | 
						|
            typedef ParallelMath::SInt16 MSInt16;
 | 
						|
            typedef ParallelMath::SInt32 MSInt32;
 | 
						|
 | 
						|
            MFloat m_tv[TVectorSize];
 | 
						|
            MFloat m_v[TVectorSize];
 | 
						|
            MFloat m_tt;
 | 
						|
            MFloat m_t;
 | 
						|
            MFloat m_w;
 | 
						|
            int m_wu;
 | 
						|
 | 
						|
            float m_rcpMaxIndex;
 | 
						|
            float m_channelWeights[TVectorSize];
 | 
						|
            float m_rcpChannelWeights[TVectorSize];
 | 
						|
 | 
						|
            void Init(int indexRange, const float channelWeights[TVectorSize])
 | 
						|
            {
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    m_tv[ch] = ParallelMath::MakeFloatZero();
 | 
						|
                    m_v[ch] = ParallelMath::MakeFloatZero();
 | 
						|
                }
 | 
						|
                m_tt = ParallelMath::MakeFloatZero();
 | 
						|
                m_t = ParallelMath::MakeFloatZero();
 | 
						|
                m_w = ParallelMath::MakeFloatZero();
 | 
						|
 | 
						|
                m_rcpMaxIndex = 1.0f / static_cast<float>(indexRange - 1);
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    m_channelWeights[ch] = channelWeights[ch];
 | 
						|
                    m_rcpChannelWeights[ch] = 1.0f;
 | 
						|
                    if (m_channelWeights[ch] != 0.0f)
 | 
						|
                        m_rcpChannelWeights[ch] = 1.0f / channelWeights[ch];
 | 
						|
                }
 | 
						|
 | 
						|
                m_wu = 0;
 | 
						|
            }
 | 
						|
 | 
						|
            void ContributePW(const MFloat *pwFloatPixel, const MUInt15 &index, const MFloat &weight)
 | 
						|
            {
 | 
						|
                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    MFloat v = pwFloatPixel[ch] * weight;
 | 
						|
 | 
						|
                    m_tv[ch] = m_tv[ch] + t * v;
 | 
						|
                    m_v[ch] = m_v[ch] + v;
 | 
						|
                }
 | 
						|
                m_tt = m_tt + weight * t * t;
 | 
						|
                m_t = m_t + weight * t;
 | 
						|
                m_w = m_w + weight;
 | 
						|
            }
 | 
						|
 | 
						|
            void ContributeUnweightedPW(const MFloat *pwFloatPixel, const MUInt15 &index, int numRealChannels)
 | 
						|
            {
 | 
						|
                MFloat t = ParallelMath::ToFloat(index) * m_rcpMaxIndex;
 | 
						|
 | 
						|
                for (int ch = 0; ch < numRealChannels; ch++)
 | 
						|
                {
 | 
						|
                    MFloat v = pwFloatPixel[ch];
 | 
						|
 | 
						|
                    m_tv[ch] = m_tv[ch] + t * v;
 | 
						|
                    m_v[ch] = m_v[ch] + v;
 | 
						|
                }
 | 
						|
                m_tt = m_tt + t * t;
 | 
						|
                m_t = m_t + t;
 | 
						|
                m_wu++;
 | 
						|
            }
 | 
						|
 | 
						|
            void ContributeUnweightedPW(const MFloat *floatPixel, const MUInt15 &index)
 | 
						|
            {
 | 
						|
                ContributeUnweightedPW(floatPixel, index, TVectorSize);
 | 
						|
            }
 | 
						|
 | 
						|
            void GetRefinedEndpoints(MFloat endPoint[2][TVectorSize])
 | 
						|
            {
 | 
						|
                // a = (tv - t*v/w)/(tt - t*t/w)
 | 
						|
                // b = (v - a*t)/w
 | 
						|
                MFloat w = m_w + ParallelMath::MakeFloat(static_cast<float>(m_wu));
 | 
						|
 | 
						|
                ParallelMath::MakeSafeDenominator(w);
 | 
						|
                MFloat wRcp = ParallelMath::Reciprocal(w);
 | 
						|
 | 
						|
                MFloat adenom = (m_tt * w - m_t * m_t) * wRcp;
 | 
						|
 | 
						|
                ParallelMath::FloatCompFlag adenomZero = ParallelMath::Equal(adenom, ParallelMath::MakeFloatZero());
 | 
						|
                ParallelMath::ConditionalSet(adenom, adenomZero, ParallelMath::MakeFloat(1.0f));
 | 
						|
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                {
 | 
						|
                    /*
 | 
						|
                    if (adenom == 0.0)
 | 
						|
                        p1 = p2 = er.v / er.w;
 | 
						|
                    else
 | 
						|
                    {
 | 
						|
                        float4 a = (er.tv - er.t*er.v / er.w) / adenom;
 | 
						|
                        float4 b = (er.v - a * er.t) / er.w;
 | 
						|
                        p1 = b;
 | 
						|
                        p2 = a + b;
 | 
						|
                    }
 | 
						|
                    */
 | 
						|
 | 
						|
                    MFloat a = (m_tv[ch] - m_t * m_v[ch] * wRcp) / adenom;
 | 
						|
                    MFloat b = (m_v[ch] - a * m_t) * wRcp;
 | 
						|
 | 
						|
                    MFloat p1 = b;
 | 
						|
                    MFloat p2 = a + b;
 | 
						|
 | 
						|
                    ParallelMath::ConditionalSet(p1, adenomZero, (m_v[ch] * wRcp));
 | 
						|
                    ParallelMath::ConditionalSet(p2, adenomZero, p1);
 | 
						|
 | 
						|
                    // Unweight
 | 
						|
                    float inverseWeight = m_rcpChannelWeights[ch];
 | 
						|
 | 
						|
                    endPoint[0][ch] = p1 * inverseWeight;
 | 
						|
                    endPoint[1][ch] = p2 * inverseWeight;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], int numRealChannels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                MFloat floatEndPoint[2][TVectorSize];
 | 
						|
                GetRefinedEndpoints(floatEndPoint);
 | 
						|
 | 
						|
                for (int epi = 0; epi < 2; epi++)
 | 
						|
                    for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                        endPoint[epi][ch] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(floatEndPoint[epi][ch], 0.0f, 255.0f), roundingMode);
 | 
						|
            }
 | 
						|
 | 
						|
            void GetRefinedEndpointsLDR(MUInt15 endPoint[2][TVectorSize], const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                GetRefinedEndpointsLDR(endPoint, TVectorSize, roundingMode);
 | 
						|
            }
 | 
						|
 | 
						|
            void GetRefinedEndpointsHDR(MSInt16 endPoint[2][TVectorSize], bool isSigned, const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                MFloat floatEndPoint[2][TVectorSize];
 | 
						|
                GetRefinedEndpoints(floatEndPoint);
 | 
						|
 | 
						|
                for (int epi = 0; epi < 2; epi++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    {
 | 
						|
                        MFloat f = floatEndPoint[epi][ch];
 | 
						|
                        if (isSigned)
 | 
						|
                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToS16(ParallelMath::Clamp(f, -31743.0f, 31743.0f), roundingMode));
 | 
						|
                        else
 | 
						|
                            endPoint[epi][ch] = ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(f, 0.0f, 31743.0f), roundingMode));
 | 
						|
                    }
 | 
						|
                }
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        template<int TVectorSize>
 | 
						|
        class AggregatedError
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            typedef ParallelMath::UInt16 MUInt16;
 | 
						|
            typedef ParallelMath::UInt31 MUInt31;
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
 | 
						|
            AggregatedError()
 | 
						|
            {
 | 
						|
                for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                    m_errorUnweighted[ch] = ParallelMath::MakeUInt31(0);
 | 
						|
            }
 | 
						|
 | 
						|
            void Add(const MUInt16 &channelErrorUnweighted, int ch)
 | 
						|
            {
 | 
						|
                m_errorUnweighted[ch] = m_errorUnweighted[ch] + ParallelMath::ToUInt31(channelErrorUnweighted);
 | 
						|
            }
 | 
						|
 | 
						|
            MFloat Finalize(uint32_t flags, const float channelWeightsSq[TVectorSize]) const
 | 
						|
            {
 | 
						|
                if (flags & cvtt::Flags::Uniform)
 | 
						|
                {
 | 
						|
                    MUInt31 total = m_errorUnweighted[0];
 | 
						|
                    for (int ch = 1; ch < TVectorSize; ch++)
 | 
						|
                        total = total + m_errorUnweighted[ch];
 | 
						|
                    return ParallelMath::ToFloat(total);
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    MFloat total = ParallelMath::ToFloat(m_errorUnweighted[0]) * channelWeightsSq[0];
 | 
						|
                    for (int ch = 1; ch < TVectorSize; ch++)
 | 
						|
                        total = total + ParallelMath::ToFloat(m_errorUnweighted[ch]) * channelWeightsSq[ch];
 | 
						|
                    return total;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
        private:
 | 
						|
            MUInt31 m_errorUnweighted[TVectorSize];
 | 
						|
        };
 | 
						|
 | 
						|
        class BCCommon
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
            typedef ParallelMath::UInt16 MUInt16;
 | 
						|
            typedef ParallelMath::UInt15 MUInt15;
 | 
						|
            typedef ParallelMath::AInt16 MAInt16;
 | 
						|
            typedef ParallelMath::SInt16 MSInt16;
 | 
						|
            typedef ParallelMath::SInt32 MSInt32;
 | 
						|
 | 
						|
            static int TweakRoundsForRange(int range)
 | 
						|
            {
 | 
						|
                if (range == 3)
 | 
						|
                    return 3;
 | 
						|
                return 4;
 | 
						|
            }
 | 
						|
 | 
						|
            template<int TVectorSize>
 | 
						|
            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, AggregatedError<TVectorSize> &aggError)
 | 
						|
            {
 | 
						|
                for (int ch = 0; ch < numRealChannels; ch++)
 | 
						|
                    aggError.Add(ParallelMath::SqDiffUInt8(reconstructed[ch], original[ch]), ch);
 | 
						|
            }
 | 
						|
 | 
						|
            template<int TVectorSize>
 | 
						|
            static void ComputeErrorLDR(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], AggregatedError<TVectorSize> &aggError)
 | 
						|
            {
 | 
						|
                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, TVectorSize, aggError);
 | 
						|
            }
 | 
						|
 | 
						|
            template<int TVectorSize>
 | 
						|
            static MFloat ComputeErrorLDRSimple(uint32_t flags, const MUInt15 reconstructed[TVectorSize], const MUInt15 original[TVectorSize], int numRealChannels, const float *channelWeightsSq)
 | 
						|
            {
 | 
						|
                AggregatedError<TVectorSize> aggError;
 | 
						|
                ComputeErrorLDR<TVectorSize>(flags, reconstructed, original, numRealChannels, aggError);
 | 
						|
                return aggError.Finalize(flags, channelWeightsSq);
 | 
						|
            }
 | 
						|
 | 
						|
            template<int TVectorSize>
 | 
						|
            static MFloat ComputeErrorHDRFast(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
 | 
						|
            {
 | 
						|
                MFloat error = ParallelMath::MakeFloatZero();
 | 
						|
                if (flags & Flags::Uniform)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                        error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]);
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                        error = error + ParallelMath::SqDiffSInt16(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
 | 
						|
                }
 | 
						|
 | 
						|
                return error;
 | 
						|
            }
 | 
						|
 | 
						|
            template<int TVectorSize>
 | 
						|
            static MFloat ComputeErrorHDRSlow(uint32_t flags, const MSInt16 reconstructed[TVectorSize], const MSInt16 original[TVectorSize], const float channelWeightsSq[TVectorSize])
 | 
						|
            {
 | 
						|
                MFloat error = ParallelMath::MakeFloatZero();
 | 
						|
                if (flags & Flags::Uniform)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                        error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]);
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < TVectorSize; ch++)
 | 
						|
                        error = error + ParallelMath::SqDiff2CL(reconstructed[ch], original[ch]) * ParallelMath::MakeFloat(channelWeightsSq[ch]);
 | 
						|
                }
 | 
						|
 | 
						|
                return error;
 | 
						|
            }
 | 
						|
 | 
						|
            template<int TChannelCount>
 | 
						|
            static void PreWeightPixelsLDR(MFloat preWeightedPixels[16][TChannelCount], const MUInt15 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
 | 
						|
            {
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < TChannelCount; ch++)
 | 
						|
                        preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            template<int TChannelCount>
 | 
						|
            static void PreWeightPixelsHDR(MFloat preWeightedPixels[16][TChannelCount], const MSInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount])
 | 
						|
            {
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < TChannelCount; ch++)
 | 
						|
                        preWeightedPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]) * channelWeights[ch];
 | 
						|
                }
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        class BC7Computer
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            static const int MaxTweakRounds = 4;
 | 
						|
 | 
						|
            typedef ParallelMath::SInt16 MSInt16;
 | 
						|
            typedef ParallelMath::UInt15 MUInt15;
 | 
						|
            typedef ParallelMath::UInt16 MUInt16;
 | 
						|
            typedef ParallelMath::SInt32 MSInt32;
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
 | 
						|
            struct WorkInfo
 | 
						|
            {
 | 
						|
                MUInt15 m_mode;
 | 
						|
                MFloat m_error;
 | 
						|
                MUInt15 m_ep[3][2][4];
 | 
						|
                MUInt15 m_indexes[16];
 | 
						|
                MUInt15 m_indexes2[16];
 | 
						|
 | 
						|
                union
 | 
						|
                {
 | 
						|
                    MUInt15 m_partition;
 | 
						|
                    struct IndexSelectorAndRotation
 | 
						|
                    {
 | 
						|
                        MUInt15 m_indexSelector;
 | 
						|
                        MUInt15 m_rotation;
 | 
						|
                    } m_isr;
 | 
						|
                } m_u;
 | 
						|
            };
 | 
						|
 | 
						|
            static void TweakAlpha(const MUInt15 original[2], int tweak, int range, MUInt15 result[2])
 | 
						|
            {
 | 
						|
                ParallelMath::RoundTowardNearestForScope roundingMode;
 | 
						|
 | 
						|
                float tf[2];
 | 
						|
                ComputeTweakFactors(tweak, range, tf);
 | 
						|
 | 
						|
                MFloat base = ParallelMath::ToFloat(original[0]);
 | 
						|
                MFloat offs = ParallelMath::ToFloat(original[1]) - base;
 | 
						|
 | 
						|
                result[0] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[0], 0.0f, 255.0f), &roundingMode);
 | 
						|
                result[1] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(base + offs * tf[1], 0.0f, 255.0f), &roundingMode);
 | 
						|
            }
 | 
						|
 | 
						|
            static void Quantize(MUInt15* color, int bits, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                float maxColor = static_cast<float>((1 << bits) - 1);
 | 
						|
 | 
						|
                for (int i = 0; i < channels; i++)
 | 
						|
                    color[i] = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(ParallelMath::ToFloat(color[i]) * ParallelMath::MakeFloat(1.0f / 255.0f) * maxColor, 0.f, 255.f), roundingMode);
 | 
						|
            }
 | 
						|
 | 
						|
            static void QuantizeP(MUInt15* color, int bits, uint16_t p, int channels, const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                uint16_t pShift = static_cast<uint16_t>(1 << (7 - bits));
 | 
						|
                MUInt15 pShiftV = ParallelMath::MakeUInt15(pShift);
 | 
						|
 | 
						|
                float maxColorF = static_cast<float>(255 - (1 << (7 - bits)));
 | 
						|
 | 
						|
                float maxQuantized = static_cast<float>((1 << bits) - 1);
 | 
						|
 | 
						|
                for (int ch = 0; ch < channels; ch++)
 | 
						|
                {
 | 
						|
                    MUInt15 clr = color[ch];
 | 
						|
                    if (p)
 | 
						|
                        clr = ParallelMath::Max(clr, pShiftV) - pShiftV;
 | 
						|
 | 
						|
                    MFloat rerangedColor = ParallelMath::ToFloat(clr) * maxQuantized / maxColorF;
 | 
						|
 | 
						|
                    clr = ParallelMath::RoundAndConvertToU15(ParallelMath::Clamp(rerangedColor, 0.0f, maxQuantized), roundingMode) << 1;
 | 
						|
                    if (p)
 | 
						|
                        clr = clr | ParallelMath::MakeUInt15(1);
 | 
						|
 | 
						|
                    color[ch] = clr;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void Unquantize(MUInt15* color, int bits, int channels)
 | 
						|
            {
 | 
						|
                for (int ch = 0; ch < channels; ch++)
 | 
						|
                {
 | 
						|
                    MUInt15 clr = color[ch];
 | 
						|
                    clr = clr << (8 - bits);
 | 
						|
                    color[ch] = clr | ParallelMath::RightShift(clr, bits);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void CompressEndpoints0(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                for (int j = 0; j < 2; j++)
 | 
						|
                {
 | 
						|
                    QuantizeP(ep[j], 4, p[j], 3, roundingMode);
 | 
						|
                    Unquantize(ep[j], 5, 3);
 | 
						|
                    ep[j][3] = ParallelMath::MakeUInt15(255);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void CompressEndpoints1(MUInt15 ep[2][4], uint16_t p, const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                for (int j = 0; j < 2; j++)
 | 
						|
                {
 | 
						|
                    QuantizeP(ep[j], 6, p, 3, roundingMode);
 | 
						|
                    Unquantize(ep[j], 7, 3);
 | 
						|
                    ep[j][3] = ParallelMath::MakeUInt15(255);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void CompressEndpoints2(MUInt15 ep[2][4], const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                for (int j = 0; j < 2; j++)
 | 
						|
                {
 | 
						|
                    Quantize(ep[j], 5, 3, roundingMode);
 | 
						|
                    Unquantize(ep[j], 5, 3);
 | 
						|
                    ep[j][3] = ParallelMath::MakeUInt15(255);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void CompressEndpoints3(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                for (int j = 0; j < 2; j++)
 | 
						|
                {
 | 
						|
                    QuantizeP(ep[j], 7, p[j], 3, roundingMode);
 | 
						|
                    ep[j][3] = ParallelMath::MakeUInt15(255);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void CompressEndpoints4(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                for (int j = 0; j < 2; j++)
 | 
						|
                {
 | 
						|
                    Quantize(epRGB[j], 5, 3, roundingMode);
 | 
						|
                    Unquantize(epRGB[j], 5, 3);
 | 
						|
 | 
						|
                    Quantize(epA + j, 6, 1, roundingMode);
 | 
						|
                    Unquantize(epA + j, 6, 1);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void CompressEndpoints5(MUInt15 epRGB[2][3], MUInt15 epA[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                for (int j = 0; j < 2; j++)
 | 
						|
                {
 | 
						|
                    Quantize(epRGB[j], 7, 3, roundingMode);
 | 
						|
                    Unquantize(epRGB[j], 7, 3);
 | 
						|
                }
 | 
						|
 | 
						|
                // Alpha is full precision
 | 
						|
                (void)epA;
 | 
						|
            }
 | 
						|
 | 
						|
            static void CompressEndpoints6(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                for (int j = 0; j < 2; j++)
 | 
						|
                    QuantizeP(ep[j], 7, p[j], 4, roundingMode);
 | 
						|
            }
 | 
						|
 | 
						|
            static void CompressEndpoints7(MUInt15 ep[2][4], uint16_t p[2], const ParallelMath::RoundTowardNearestForScope *roundingMode)
 | 
						|
            {
 | 
						|
                for (int j = 0; j < 2; j++)
 | 
						|
                {
 | 
						|
                    QuantizeP(ep[j], 5, p[j], 4, roundingMode);
 | 
						|
                    Unquantize(ep[j], 6, 4);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            struct SinglePlaneTemporaries
 | 
						|
            {
 | 
						|
                UnfinishedEndpoints<3> unfinishedRGB[BC7Data::g_numShapesAll];
 | 
						|
                UnfinishedEndpoints<4> unfinishedRGBA[BC7Data::g_numShapes12];
 | 
						|
 | 
						|
                MUInt15 fragmentBestIndexes[BC7Data::g_numFragments];
 | 
						|
                MUInt15 shapeBestEP[BC7Data::g_maxFragmentsPerMode][2][4];
 | 
						|
                MFloat shapeBestError[BC7Data::g_maxFragmentsPerMode];
 | 
						|
            };
 | 
						|
 | 
						|
            static void TrySingleColorRGBAMultiTable(uint32_t flags, const MUInt15 pixels[16][4], const MFloat average[4], int numRealChannels, const uint8_t *fragmentStart, int shapeLength, const MFloat &staticAlphaError, const ParallelMath::Int16CompFlag punchThroughInvalid[4], MFloat& shapeBestError, MUInt15 shapeBestEP[2][4], MUInt15 *fragmentBestIndexes, const float *channelWeightsSq, const cvtt::Tables::BC7SC::Table*const* tables, int numTables, const ParallelMath::RoundTowardNearestForScope *rtn)
 | 
						|
            {
 | 
						|
                MFloat bestAverageError = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
 | 
						|
                MUInt15 intAverage[4];
 | 
						|
                for (int ch = 0; ch < 4; ch++)
 | 
						|
                    intAverage[ch] = ParallelMath::RoundAndConvertToU15(average[ch], rtn);
 | 
						|
 | 
						|
                MUInt15 eps[2][4];
 | 
						|
                MUInt15 reconstructed[4];
 | 
						|
                MUInt15 index = ParallelMath::MakeUInt15(0);
 | 
						|
 | 
						|
                for (int epi = 0; epi < 2; epi++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                        eps[epi][ch] = ParallelMath::MakeUInt15(0);
 | 
						|
                    eps[epi][3] = ParallelMath::MakeUInt15(255);
 | 
						|
                }
 | 
						|
 | 
						|
                for (int ch = 0; ch < 3; ch++)
 | 
						|
                    reconstructed[ch] = ParallelMath::MakeUInt15(0);
 | 
						|
                reconstructed[3] = ParallelMath::MakeUInt15(255);
 | 
						|
 | 
						|
                // Depending on the target index and parity bits, there are multiple valid solid colors.
 | 
						|
                // We want to find the one closest to the actual average.
 | 
						|
                MFloat epsAverageDiff = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
                for (int t = 0; t < numTables; t++)
 | 
						|
                {
 | 
						|
                    const cvtt::Tables::BC7SC::Table& table = *(tables[t]);
 | 
						|
 | 
						|
                    ParallelMath::Int16CompFlag pti = punchThroughInvalid[table.m_pBits];
 | 
						|
 | 
						|
                    MUInt15 candidateReconstructed[4];
 | 
						|
                    MUInt15 candidateEPs[2][4];
 | 
						|
 | 
						|
                    for (int i = 0; i < ParallelMath::ParallelSize; i++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < numRealChannels; ch++)
 | 
						|
                        {
 | 
						|
                            ParallelMath::ScalarUInt16 avgValue = ParallelMath::Extract(intAverage[ch], i);
 | 
						|
                            assert(avgValue >= 0 && avgValue <= 255);
 | 
						|
 | 
						|
                            const cvtt::Tables::BC7SC::TableEntry &entry = table.m_entries[avgValue];
 | 
						|
 | 
						|
                            ParallelMath::PutUInt15(candidateEPs[0][ch], i, entry.m_min);
 | 
						|
                            ParallelMath::PutUInt15(candidateEPs[1][ch], i, entry.m_max);
 | 
						|
                            ParallelMath::PutUInt15(candidateReconstructed[ch], i, entry.m_actualColor);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    MFloat avgError = ParallelMath::MakeFloatZero();
 | 
						|
                    for (int ch = 0; ch < numRealChannels; ch++)
 | 
						|
                    {
 | 
						|
                        MFloat delta = ParallelMath::ToFloat(candidateReconstructed[ch]) - average[ch];
 | 
						|
                        avgError = avgError + delta * delta * channelWeightsSq[ch];
 | 
						|
                    }
 | 
						|
 | 
						|
                    ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(avgError, bestAverageError));
 | 
						|
                    better = ParallelMath::AndNot(pti, better); // Mask out punch-through invalidations
 | 
						|
 | 
						|
                    if (ParallelMath::AnySet(better))
 | 
						|
                    {
 | 
						|
                        ParallelMath::ConditionalSet(bestAverageError, ParallelMath::Int16FlagToFloat(better), avgError);
 | 
						|
 | 
						|
                        MUInt15 candidateIndex = ParallelMath::MakeUInt15(table.m_index);
 | 
						|
 | 
						|
                        ParallelMath::ConditionalSet(index, better, candidateIndex);
 | 
						|
 | 
						|
                        for (int ch = 0; ch < numRealChannels; ch++)
 | 
						|
                            ParallelMath::ConditionalSet(reconstructed[ch], better, candidateReconstructed[ch]);
 | 
						|
 | 
						|
                        for (int epi = 0; epi < 2; epi++)
 | 
						|
                            for (int ch = 0; ch < numRealChannels; ch++)
 | 
						|
                                ParallelMath::ConditionalSet(eps[epi][ch], better, candidateEPs[epi][ch]);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                AggregatedError<4> aggError;
 | 
						|
                for (int pxi = 0; pxi < shapeLength; pxi++)
 | 
						|
                {
 | 
						|
                    int px = fragmentStart[pxi];
 | 
						|
 | 
						|
                    BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
 | 
						|
                }
 | 
						|
 | 
						|
                MFloat error = aggError.Finalize(flags, channelWeightsSq) + staticAlphaError;
 | 
						|
 | 
						|
                ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(error, shapeBestError));
 | 
						|
                if (ParallelMath::AnySet(better))
 | 
						|
                {
 | 
						|
                    shapeBestError = ParallelMath::Min(shapeBestError, error);
 | 
						|
                    for (int epi = 0; epi < 2; epi++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < numRealChannels; ch++)
 | 
						|
                            ParallelMath::ConditionalSet(shapeBestEP[epi][ch], better, eps[epi][ch]);
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int pxi = 0; pxi < shapeLength; pxi++)
 | 
						|
                        ParallelMath::ConditionalSet(fragmentBestIndexes[pxi], better, index);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
 | 
						|
            static void TrySinglePlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
 | 
						|
            {
 | 
						|
                if (numRefineRounds < 1)
 | 
						|
                    numRefineRounds = 1;
 | 
						|
 | 
						|
                if (numTweakRounds < 1)
 | 
						|
                    numTweakRounds = 1;
 | 
						|
                else if (numTweakRounds > MaxTweakRounds)
 | 
						|
                    numTweakRounds = MaxTweakRounds;
 | 
						|
 | 
						|
                float channelWeightsSq[4];
 | 
						|
 | 
						|
                for (int ch = 0; ch < 4; ch++)
 | 
						|
                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
 | 
						|
 | 
						|
                SinglePlaneTemporaries temps;
 | 
						|
 | 
						|
                MUInt15 maxAlpha = ParallelMath::MakeUInt15(0);
 | 
						|
                MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
 | 
						|
                ParallelMath::Int16CompFlag isPunchThrough = ParallelMath::MakeBoolInt16(true);
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    MUInt15 a = pixels[px][3];
 | 
						|
                    maxAlpha = ParallelMath::Max(maxAlpha, a);
 | 
						|
                    minAlpha = ParallelMath::Min(minAlpha, a);
 | 
						|
 | 
						|
                    isPunchThrough = (isPunchThrough & (ParallelMath::Equal(a, ParallelMath::MakeUInt15(0)) | ParallelMath::Equal(a, ParallelMath::MakeUInt15(255))));
 | 
						|
                }
 | 
						|
 | 
						|
                ParallelMath::Int16CompFlag blockHasNonMaxAlpha = ParallelMath::Less(minAlpha, ParallelMath::MakeUInt15(255));
 | 
						|
                ParallelMath::Int16CompFlag blockHasNonZeroAlpha = ParallelMath::Less(ParallelMath::MakeUInt15(0), maxAlpha);
 | 
						|
 | 
						|
                bool anyBlockHasAlpha = ParallelMath::AnySet(blockHasNonMaxAlpha);
 | 
						|
 | 
						|
                // Try RGB modes if any block has a min alpha 251 or higher
 | 
						|
                bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt15(250), minAlpha));
 | 
						|
 | 
						|
                // Try mode 7 if any block has alpha.
 | 
						|
                // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints
 | 
						|
                // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific
 | 
						|
                // situations, and only by at most 1 unit of error per pixel.
 | 
						|
                bool allowMode7 = anyBlockHasAlpha;
 | 
						|
 | 
						|
                MFloat preWeightedPixels[16][4];
 | 
						|
 | 
						|
                BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
 | 
						|
 | 
						|
                const int *rgbInitialEPCollapseList = NULL;
 | 
						|
 | 
						|
                // Get initial RGB endpoints
 | 
						|
                if (allowRGBModes)
 | 
						|
                {
 | 
						|
                    const int *shapeList;
 | 
						|
                    int numShapesToEvaluate;
 | 
						|
 | 
						|
                    if (flags & Flags::BC7_EnablePartitioning)
 | 
						|
                    {
 | 
						|
                        if (flags & Flags::BC7_Enable3Subsets)
 | 
						|
                        {
 | 
						|
                            shapeList = BC7Data::g_shapeListAll;
 | 
						|
                            rgbInitialEPCollapseList = BC7Data::g_shapeListAll;
 | 
						|
                            numShapesToEvaluate = BC7Data::g_numShapesAll;
 | 
						|
                        }
 | 
						|
                        else
 | 
						|
                        {
 | 
						|
                            shapeList = BC7Data::g_shapeList12;
 | 
						|
                            rgbInitialEPCollapseList = BC7Data::g_shapeList12Collapse;
 | 
						|
                            numShapesToEvaluate = BC7Data::g_numShapes12;
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                    else
 | 
						|
                    {
 | 
						|
                        shapeList = BC7Data::g_shapeList1;
 | 
						|
                        rgbInitialEPCollapseList = BC7Data::g_shapeList1Collapse;
 | 
						|
                        numShapesToEvaluate = BC7Data::g_numShapes1;
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
 | 
						|
                    {
 | 
						|
                        int shape = shapeList[shapeIter];
 | 
						|
 | 
						|
                        int shapeStart = BC7Data::g_shapeRanges[shape][0];
 | 
						|
                        int shapeSize = BC7Data::g_shapeRanges[shape][1];
 | 
						|
 | 
						|
                        EndpointSelector<3, 8> epSelector;
 | 
						|
 | 
						|
                        for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
 | 
						|
                        {
 | 
						|
                            for (int spx = 0; spx < shapeSize; spx++)
 | 
						|
                            {
 | 
						|
                                int px = BC7Data::g_fragments[shapeStart + spx];
 | 
						|
                                epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
 | 
						|
                            }
 | 
						|
                            epSelector.FinishPass(epPass);
 | 
						|
                        }
 | 
						|
                        temps.unfinishedRGB[shapeIter] = epSelector.GetEndpoints(channelWeights);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                const int *rgbaInitialEPCollapseList = BC7Data::g_shapeList12Collapse;
 | 
						|
 | 
						|
                // Get initial RGBA endpoints
 | 
						|
                {
 | 
						|
                    const int *shapeList = BC7Data::g_shapeList12;
 | 
						|
                    int numShapesToEvaluate = BC7Data::g_numShapes12;
 | 
						|
 | 
						|
                    for (int shapeIter = 0; shapeIter < numShapesToEvaluate; shapeIter++)
 | 
						|
                    {
 | 
						|
                        int shape = shapeList[shapeIter];
 | 
						|
 | 
						|
                        if (anyBlockHasAlpha || !allowRGBModes)
 | 
						|
                        {
 | 
						|
                            int shapeStart = BC7Data::g_shapeRanges[shape][0];
 | 
						|
                            int shapeSize = BC7Data::g_shapeRanges[shape][1];
 | 
						|
 | 
						|
                            EndpointSelector<4, 8> epSelector;
 | 
						|
 | 
						|
                            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
 | 
						|
                            {
 | 
						|
                                for (int spx = 0; spx < shapeSize; spx++)
 | 
						|
                                {
 | 
						|
                                    int px = BC7Data::g_fragments[shapeStart + spx];
 | 
						|
                                    epSelector.ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f));
 | 
						|
                                }
 | 
						|
                                epSelector.FinishPass(epPass);
 | 
						|
                            }
 | 
						|
                            temps.unfinishedRGBA[shapeIter] = epSelector.GetEndpoints(channelWeights);
 | 
						|
                        }
 | 
						|
                        else
 | 
						|
                        {
 | 
						|
                            temps.unfinishedRGBA[shapeIter] = temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].ExpandTo<4>(255);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                for (uint16_t mode = 0; mode <= 7; mode++)
 | 
						|
                {
 | 
						|
                    if (!(flags & Flags::BC7_EnablePartitioning) && BC7Data::g_modes[mode].m_numSubsets != 1)
 | 
						|
                        continue;
 | 
						|
 | 
						|
                    if (!(flags & Flags::BC7_Enable3Subsets) && BC7Data::g_modes[mode].m_numSubsets == 3)
 | 
						|
                        continue;
 | 
						|
 | 
						|
                    if (mode == 4 || mode == 5)
 | 
						|
                        continue;
 | 
						|
 | 
						|
                    if (mode < 4 && !allowRGBModes)
 | 
						|
                        continue;
 | 
						|
 | 
						|
                    if (mode == 7 && !allowMode7)
 | 
						|
                        continue;
 | 
						|
 | 
						|
                    bool isRGB = (mode < 4);
 | 
						|
 | 
						|
                    unsigned int numPartitions = 1 << BC7Data::g_modes[mode].m_partitionBits;
 | 
						|
                    int numSubsets = BC7Data::g_modes[mode].m_numSubsets;
 | 
						|
                    int indexPrec = BC7Data::g_modes[mode].m_indexBits;
 | 
						|
 | 
						|
                    int parityBitMax = 1;
 | 
						|
                    if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerEndpoint)
 | 
						|
                        parityBitMax = 4;
 | 
						|
                    else if (BC7Data::g_modes[mode].m_pBitMode == BC7Data::PBitMode_PerSubset)
 | 
						|
                        parityBitMax = 2;
 | 
						|
 | 
						|
                    int numRealChannels = isRGB ? 3 : 4;
 | 
						|
 | 
						|
                    int numShapes;
 | 
						|
                    const int *shapeList;
 | 
						|
                    const int *shapeCollapseList;
 | 
						|
 | 
						|
                    if (numSubsets == 1)
 | 
						|
                    {
 | 
						|
                        numShapes = BC7Data::g_numShapes1;
 | 
						|
                        shapeList = BC7Data::g_shapeList1;
 | 
						|
                        shapeCollapseList = BC7Data::g_shapeList1Collapse;
 | 
						|
                    }
 | 
						|
                    else if (numSubsets == 2)
 | 
						|
                    {
 | 
						|
                        numShapes = BC7Data::g_numShapes2;
 | 
						|
                        shapeList = BC7Data::g_shapeList2;
 | 
						|
                        shapeCollapseList = BC7Data::g_shapeList2Collapse;
 | 
						|
                    }
 | 
						|
                    else
 | 
						|
                    {
 | 
						|
                        assert(numSubsets == 3);
 | 
						|
                        if (numPartitions == 16)
 | 
						|
                        {
 | 
						|
                            numShapes = BC7Data::g_numShapes3Short;
 | 
						|
                            shapeList = BC7Data::g_shapeList3Short;
 | 
						|
                            shapeCollapseList = BC7Data::g_shapeList3ShortCollapse;
 | 
						|
                        }
 | 
						|
                        else
 | 
						|
                        {
 | 
						|
                            assert(numPartitions == 64);
 | 
						|
                            numShapes = BC7Data::g_numShapes3;
 | 
						|
                            shapeList = BC7Data::g_shapeList3;
 | 
						|
                            shapeCollapseList = BC7Data::g_shapeList3Collapse;
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int slot = 0; slot < BC7Data::g_maxFragmentsPerMode; slot++)
 | 
						|
                        temps.shapeBestError[slot] = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
 | 
						|
                    for (int shapeIter = 0; shapeIter < numShapes; shapeIter++)
 | 
						|
                    {
 | 
						|
                        int shape = shapeList[shapeIter];
 | 
						|
                        int shapeStart = BC7Data::g_shapeRanges[shape][0];
 | 
						|
                        int shapeLength = BC7Data::g_shapeRanges[shape][1];
 | 
						|
                        int shapeCollapsedEvalIndex = shapeCollapseList[shape];
 | 
						|
 | 
						|
                        AggregatedError<1> alphaAggError;
 | 
						|
                        if (isRGB && anyBlockHasAlpha)
 | 
						|
                        {
 | 
						|
                            MUInt15 filledAlpha[1] = { ParallelMath::MakeUInt15(255) };
 | 
						|
 | 
						|
                            for (int pxi = 0; pxi < shapeLength; pxi++)
 | 
						|
                            {
 | 
						|
                                int px = BC7Data::g_fragments[shapeStart + pxi];
 | 
						|
                                MUInt15 original[1] = { pixels[px][3] };
 | 
						|
                                BCCommon::ComputeErrorLDR<1>(flags, filledAlpha, original, alphaAggError);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
 | 
						|
                        float alphaWeightsSq[1] = { channelWeightsSq[3] };
 | 
						|
                        MFloat staticAlphaError = alphaAggError.Finalize(flags, alphaWeightsSq);
 | 
						|
 | 
						|
                        assert(shapeCollapsedEvalIndex >= 0);
 | 
						|
 | 
						|
                        MUInt15 tweakBaseEP[MaxTweakRounds][2][4];
 | 
						|
 | 
						|
                        for (int tweak = 0; tweak < numTweakRounds; tweak++)
 | 
						|
                        {
 | 
						|
                            if (isRGB)
 | 
						|
                            {
 | 
						|
                                temps.unfinishedRGB[rgbInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
 | 
						|
                                tweakBaseEP[tweak][0][3] = tweakBaseEP[tweak][1][3] = ParallelMath::MakeUInt15(255);
 | 
						|
                            }
 | 
						|
                            else
 | 
						|
                            {
 | 
						|
                                temps.unfinishedRGBA[rgbaInitialEPCollapseList[shape]].FinishLDR(tweak, 1 << indexPrec, tweakBaseEP[tweak][0], tweakBaseEP[tweak][1]);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
 | 
						|
                        ParallelMath::Int16CompFlag punchThroughInvalid[4];
 | 
						|
                        for (int pIter = 0; pIter < parityBitMax; pIter++)
 | 
						|
                        {
 | 
						|
                            punchThroughInvalid[pIter] = ParallelMath::MakeBoolInt16(false);
 | 
						|
 | 
						|
                            if ((flags & Flags::BC7_RespectPunchThrough) && (mode == 6 || mode == 7))
 | 
						|
                            {
 | 
						|
                                // Modes 6 and 7 have parity bits that affect alpha
 | 
						|
                                if (pIter == 0)
 | 
						|
                                    punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonZeroAlpha);
 | 
						|
                                else if (pIter == parityBitMax - 1)
 | 
						|
                                    punchThroughInvalid[pIter] = (isPunchThrough & blockHasNonMaxAlpha);
 | 
						|
                                else
 | 
						|
                                    punchThroughInvalid[pIter] = isPunchThrough;
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
 | 
						|
                        for (int pIter = 0; pIter < parityBitMax; pIter++)
 | 
						|
                        {
 | 
						|
                            if (ParallelMath::AllSet(punchThroughInvalid[pIter]))
 | 
						|
                                continue;
 | 
						|
 | 
						|
                            bool needPunchThroughCheck = ParallelMath::AnySet(punchThroughInvalid[pIter]);
 | 
						|
 | 
						|
                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
 | 
						|
                            {
 | 
						|
                                uint16_t p[2];
 | 
						|
                                p[0] = (pIter & 1);
 | 
						|
                                p[1] = ((pIter >> 1) & 1);
 | 
						|
 | 
						|
                                MUInt15 ep[2][4];
 | 
						|
 | 
						|
                                for (int epi = 0; epi < 2; epi++)
 | 
						|
                                    for (int ch = 0; ch < 4; ch++)
 | 
						|
                                        ep[epi][ch] = tweakBaseEP[tweak][epi][ch];
 | 
						|
 | 
						|
                                for (int refine = 0; refine < numRefineRounds; refine++)
 | 
						|
                                {
 | 
						|
                                    switch (mode)
 | 
						|
                                    {
 | 
						|
                                    case 0:
 | 
						|
                                        CompressEndpoints0(ep, p, rtn);
 | 
						|
                                        break;
 | 
						|
                                    case 1:
 | 
						|
                                        CompressEndpoints1(ep, p[0], rtn);
 | 
						|
                                        break;
 | 
						|
                                    case 2:
 | 
						|
                                        CompressEndpoints2(ep, rtn);
 | 
						|
                                        break;
 | 
						|
                                    case 3:
 | 
						|
                                        CompressEndpoints3(ep, p, rtn);
 | 
						|
                                        break;
 | 
						|
                                    case 6:
 | 
						|
                                        CompressEndpoints6(ep, p, rtn);
 | 
						|
                                        break;
 | 
						|
                                    case 7:
 | 
						|
                                        CompressEndpoints7(ep, p, rtn);
 | 
						|
                                        break;
 | 
						|
                                    default:
 | 
						|
                                        assert(false);
 | 
						|
                                        break;
 | 
						|
                                    };
 | 
						|
 | 
						|
                                    MFloat shapeError = ParallelMath::MakeFloatZero();
 | 
						|
 | 
						|
                                    IndexSelector<4> indexSelector;
 | 
						|
                                    indexSelector.Init<false>(channelWeights, ep, 1 << indexPrec);
 | 
						|
 | 
						|
                                    EndpointRefiner<4> epRefiner;
 | 
						|
                                    epRefiner.Init(1 << indexPrec, channelWeights);
 | 
						|
 | 
						|
                                    MUInt15 indexes[16];
 | 
						|
 | 
						|
                                    AggregatedError<4> aggError;
 | 
						|
                                    for (int pxi = 0; pxi < shapeLength; pxi++)
 | 
						|
                                    {
 | 
						|
                                        int px = BC7Data::g_fragments[shapeStart + pxi];
 | 
						|
 | 
						|
                                        MUInt15 index;
 | 
						|
                                        MUInt15 reconstructed[4];
 | 
						|
 | 
						|
                                        index = indexSelector.SelectIndexLDR(floatPixels[px], rtn);
 | 
						|
                                        indexSelector.ReconstructLDR_BC7(index, reconstructed, numRealChannels);
 | 
						|
 | 
						|
                                        if (flags & cvtt::Flags::BC7_FastIndexing)
 | 
						|
                                            BCCommon::ComputeErrorLDR<4>(flags, reconstructed, pixels[px], numRealChannels, aggError);
 | 
						|
                                        else
 | 
						|
                                        {
 | 
						|
                                            MFloat error = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
 | 
						|
 | 
						|
                                            MUInt15 altIndexes[2];
 | 
						|
                                            altIndexes[0] = ParallelMath::Max(index, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
 | 
						|
                                            altIndexes[1] = ParallelMath::Min(index + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << indexPrec) - 1)));
 | 
						|
 | 
						|
                                            for (int ii = 0; ii < 2; ii++)
 | 
						|
                                            {
 | 
						|
                                                indexSelector.ReconstructLDR_BC7(altIndexes[ii], reconstructed, numRealChannels);
 | 
						|
 | 
						|
                                                MFloat altError = BCCommon::ComputeErrorLDRSimple<4>(flags, reconstructed, pixels[px], numRealChannels, channelWeightsSq);
 | 
						|
                                                ParallelMath::Int16CompFlag better = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altError, error));
 | 
						|
                                                error = ParallelMath::Min(error, altError);
 | 
						|
                                                ParallelMath::ConditionalSet(index, better, altIndexes[ii]);
 | 
						|
                                            }
 | 
						|
 | 
						|
                                            shapeError = shapeError + error;
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        if (refine != numRefineRounds - 1)
 | 
						|
                                            epRefiner.ContributeUnweightedPW(preWeightedPixels[px], index, numRealChannels);
 | 
						|
 | 
						|
                                        indexes[pxi] = index;
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    if (flags & cvtt::Flags::BC7_FastIndexing)
 | 
						|
                                        shapeError = aggError.Finalize(flags, channelWeightsSq);
 | 
						|
 | 
						|
                                    if (isRGB)
 | 
						|
                                        shapeError = shapeError + staticAlphaError;
 | 
						|
 | 
						|
                                    ParallelMath::FloatCompFlag shapeErrorBetter;
 | 
						|
                                    ParallelMath::Int16CompFlag shapeErrorBetter16;
 | 
						|
 | 
						|
                                    shapeErrorBetter = ParallelMath::Less(shapeError, temps.shapeBestError[shapeCollapsedEvalIndex]);
 | 
						|
                                    shapeErrorBetter16 = ParallelMath::FloatFlagToInt16(shapeErrorBetter);
 | 
						|
 | 
						|
                                    if (ParallelMath::AnySet(shapeErrorBetter16))
 | 
						|
                                    {
 | 
						|
                                        bool punchThroughOK = true;
 | 
						|
                                        if (needPunchThroughCheck)
 | 
						|
                                        {
 | 
						|
                                            shapeErrorBetter16 = ParallelMath::AndNot(punchThroughInvalid[pIter], shapeErrorBetter16);
 | 
						|
                                            shapeErrorBetter = ParallelMath::Int16FlagToFloat(shapeErrorBetter16);
 | 
						|
 | 
						|
                                            if (!ParallelMath::AnySet(shapeErrorBetter16))
 | 
						|
                                                punchThroughOK = false;
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        if (punchThroughOK)
 | 
						|
                                        {
 | 
						|
                                            ParallelMath::ConditionalSet(temps.shapeBestError[shapeCollapsedEvalIndex], shapeErrorBetter, shapeError);
 | 
						|
                                            for (int epi = 0; epi < 2; epi++)
 | 
						|
                                                for (int ch = 0; ch < numRealChannels; ch++)
 | 
						|
                                                    ParallelMath::ConditionalSet(temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch], shapeErrorBetter16, ep[epi][ch]);
 | 
						|
 | 
						|
                                            for (int pxi = 0; pxi < shapeLength; pxi++)
 | 
						|
                                                ParallelMath::ConditionalSet(temps.fragmentBestIndexes[shapeStart + pxi], shapeErrorBetter16, indexes[pxi]);
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    if (refine != numRefineRounds - 1)
 | 
						|
                                        epRefiner.GetRefinedEndpointsLDR(ep, numRealChannels, rtn);
 | 
						|
                                } // refine
 | 
						|
                            } // tweak
 | 
						|
                        } // p
 | 
						|
 | 
						|
                        if (flags & cvtt::Flags::BC7_TrySingleColor)
 | 
						|
                        {
 | 
						|
                            MUInt15 total[4];
 | 
						|
                            for (int ch = 0; ch < 4; ch++)
 | 
						|
                                total[ch] = ParallelMath::MakeUInt15(0);
 | 
						|
 | 
						|
                            for (int pxi = 0; pxi < shapeLength; pxi++)
 | 
						|
                            {
 | 
						|
                                int px = BC7Data::g_fragments[shapeStart + pxi];
 | 
						|
                                for (int ch = 0; ch < 4; ch++)
 | 
						|
                                    total[ch] = total[ch] + pixels[pxi][ch];
 | 
						|
                            }
 | 
						|
 | 
						|
                            MFloat rcpShapeLength = ParallelMath::MakeFloat(1.0f / static_cast<float>(shapeLength));
 | 
						|
                            MFloat average[4];
 | 
						|
                            for (int ch = 0; ch < 4; ch++)
 | 
						|
                                average[ch] = ParallelMath::ToFloat(total[ch]) * rcpShapeLength;
 | 
						|
 | 
						|
                            const uint8_t *fragment = BC7Data::g_fragments + shapeStart;
 | 
						|
                            MFloat &shapeBestError = temps.shapeBestError[shapeCollapsedEvalIndex];
 | 
						|
                            MUInt15(&shapeBestEP)[2][4] = temps.shapeBestEP[shapeCollapsedEvalIndex];
 | 
						|
                            MUInt15 *fragmentBestIndexes = temps.fragmentBestIndexes + shapeStart;
 | 
						|
 | 
						|
                            const cvtt::Tables::BC7SC::Table **scTables = NULL;
 | 
						|
                            int numSCTables = 0;
 | 
						|
 | 
						|
                            switch (mode)
 | 
						|
                            {
 | 
						|
                            case 0:
 | 
						|
                                {
 | 
						|
                                    const cvtt::Tables::BC7SC::Table *tables[] =
 | 
						|
                                    {
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p00_i1,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p00_i2,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p00_i3,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p01_i1,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p01_i2,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p01_i3,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p10_i1,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p10_i2,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p10_i3,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p11_i1,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p11_i2,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode0_p11_i3,
 | 
						|
                                    };
 | 
						|
                                    scTables = tables;
 | 
						|
                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
 | 
						|
                                }
 | 
						|
                                break;
 | 
						|
                            case 1:
 | 
						|
                                {
 | 
						|
                                    const cvtt::Tables::BC7SC::Table *tables[] =
 | 
						|
                                    {
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode1_p0_i1,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode1_p0_i2,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode1_p0_i3,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode1_p1_i1,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode1_p1_i2,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode1_p1_i3,
 | 
						|
                                    };
 | 
						|
                                    scTables = tables;
 | 
						|
                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
 | 
						|
                                }
 | 
						|
                                break;
 | 
						|
                            case 2:
 | 
						|
                                {
 | 
						|
                                    const cvtt::Tables::BC7SC::Table *tables[] =
 | 
						|
                                    {
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode2,
 | 
						|
                                    };
 | 
						|
                                    scTables = tables;
 | 
						|
                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
 | 
						|
                                }
 | 
						|
                                break;
 | 
						|
                            case 3:
 | 
						|
                                {
 | 
						|
                                    const cvtt::Tables::BC7SC::Table *tables[] =
 | 
						|
                                    {
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode3_p0,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode3_p1,
 | 
						|
                                    };
 | 
						|
                                    scTables = tables;
 | 
						|
                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
 | 
						|
                                }
 | 
						|
                                break;
 | 
						|
                            case 6:
 | 
						|
                                {
 | 
						|
                                    const cvtt::Tables::BC7SC::Table *tables[] =
 | 
						|
                                    {
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p0_i1,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p0_i2,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p0_i3,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p0_i4,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p0_i5,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p0_i6,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p0_i7,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p1_i1,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p1_i2,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p1_i3,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p1_i4,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p1_i5,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p1_i6,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode6_p1_i7,
 | 
						|
                                    };
 | 
						|
                                    scTables = tables;
 | 
						|
                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
 | 
						|
                                }
 | 
						|
                                break;
 | 
						|
                            case 7:
 | 
						|
                                {
 | 
						|
                                    const cvtt::Tables::BC7SC::Table *tables[] =
 | 
						|
                                    {
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode7_p00,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode7_p01,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode7_p10,
 | 
						|
                                        &cvtt::Tables::BC7SC::g_mode7_p11,
 | 
						|
                                    };
 | 
						|
                                    scTables = tables;
 | 
						|
                                    numSCTables = sizeof(tables) / sizeof(tables[0]);
 | 
						|
                                }
 | 
						|
                                break;
 | 
						|
                            default:
 | 
						|
                                assert(false);
 | 
						|
                                break;
 | 
						|
                            }
 | 
						|
 | 
						|
                            TrySingleColorRGBAMultiTable(flags, pixels, average, numRealChannels, fragment, shapeLength, staticAlphaError, punchThroughInvalid, shapeBestError, shapeBestEP, fragmentBestIndexes, channelWeightsSq, scTables, numSCTables, rtn);
 | 
						|
                        }
 | 
						|
                    } // shapeIter
 | 
						|
 | 
						|
                    for (uint16_t partition = 0; partition < numPartitions; partition++)
 | 
						|
                    {
 | 
						|
                        const int *partitionShapes;
 | 
						|
                        if (numSubsets == 1)
 | 
						|
                            partitionShapes = BC7Data::g_shapes1[partition];
 | 
						|
                        else if (numSubsets == 2)
 | 
						|
                            partitionShapes = BC7Data::g_shapes2[partition];
 | 
						|
                        else
 | 
						|
                        {
 | 
						|
                            assert(numSubsets == 3);
 | 
						|
                            partitionShapes = BC7Data::g_shapes3[partition];
 | 
						|
                        }
 | 
						|
 | 
						|
                        MFloat totalError = ParallelMath::MakeFloatZero();
 | 
						|
                        for (int subset = 0; subset < numSubsets; subset++)
 | 
						|
                            totalError = totalError + temps.shapeBestError[shapeCollapseList[partitionShapes[subset]]];
 | 
						|
 | 
						|
                        ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error);
 | 
						|
                        ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
 | 
						|
 | 
						|
                        if (ParallelMath::AnySet(errorBetter16))
 | 
						|
                        {
 | 
						|
                            for (int subset = 0; subset < numSubsets; subset++)
 | 
						|
                            {
 | 
						|
                                int shape = partitionShapes[subset];
 | 
						|
                                int shapeStart = BC7Data::g_shapeRanges[shape][0];
 | 
						|
                                int shapeLength = BC7Data::g_shapeRanges[shape][1];
 | 
						|
                                int shapeCollapsedEvalIndex = shapeCollapseList[shape];
 | 
						|
 | 
						|
                                for (int epi = 0; epi < 2; epi++)
 | 
						|
                                    for (int ch = 0; ch < 4; ch++)
 | 
						|
                                        ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, temps.shapeBestEP[shapeCollapsedEvalIndex][epi][ch]);
 | 
						|
 | 
						|
                                for (int pxi = 0; pxi < shapeLength; pxi++)
 | 
						|
                                {
 | 
						|
                                    int px = BC7Data::g_fragments[shapeStart + pxi];
 | 
						|
                                    ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, temps.fragmentBestIndexes[shapeStart + pxi]);
 | 
						|
                                }
 | 
						|
                            }
 | 
						|
 | 
						|
                            work.m_error = ParallelMath::Min(totalError, work.m_error);
 | 
						|
                            ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
 | 
						|
                            ParallelMath::ConditionalSet(work.m_u.m_partition, errorBetter16, ParallelMath::MakeUInt15(partition));
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void TryDualPlane(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const float channelWeights[4], int numTweakRounds, int numRefineRounds, WorkInfo& work, const ParallelMath::RoundTowardNearestForScope *rtn)
 | 
						|
            {
 | 
						|
                // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that.
 | 
						|
                // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to
 | 
						|
                // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases:
 | 
						|
                // - Separate alpha channel, then weighted RGB
 | 
						|
                // - Alpha+2 other channels, then the independent channel
 | 
						|
 | 
						|
                if (!(flags & Flags::BC7_EnableDualPlane))
 | 
						|
                    return;
 | 
						|
 | 
						|
                if (numRefineRounds < 1)
 | 
						|
                    numRefineRounds = 1;
 | 
						|
 | 
						|
                if (numTweakRounds < 1)
 | 
						|
                    numTweakRounds = 1;
 | 
						|
                else if (numTweakRounds > MaxTweakRounds)
 | 
						|
                    numTweakRounds = MaxTweakRounds;
 | 
						|
 | 
						|
                float channelWeightsSq[4];
 | 
						|
                for (int ch = 0; ch < 4; ch++)
 | 
						|
                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
 | 
						|
 | 
						|
                for (uint16_t mode = 4; mode <= 5; mode++)
 | 
						|
                {
 | 
						|
                    for (uint16_t rotation = 0; rotation < 4; rotation++)
 | 
						|
                    {
 | 
						|
                        int alphaChannel = (rotation + 3) & 3;
 | 
						|
                        int redChannel = (rotation == 1) ? 3 : 0;
 | 
						|
                        int greenChannel = (rotation == 2) ? 3 : 1;
 | 
						|
                        int blueChannel = (rotation == 3) ? 3 : 2;
 | 
						|
 | 
						|
                        MUInt15 rotatedRGB[16][3];
 | 
						|
                        MFloat floatRotatedRGB[16][3];
 | 
						|
 | 
						|
                        for (int px = 0; px < 16; px++)
 | 
						|
                        {
 | 
						|
                            rotatedRGB[px][0] = pixels[px][redChannel];
 | 
						|
                            rotatedRGB[px][1] = pixels[px][greenChannel];
 | 
						|
                            rotatedRGB[px][2] = pixels[px][blueChannel];
 | 
						|
 | 
						|
                            for (int ch = 0; ch < 3; ch++)
 | 
						|
                                floatRotatedRGB[px][ch] = ParallelMath::ToFloat(rotatedRGB[px][ch]);
 | 
						|
                        }
 | 
						|
 | 
						|
                        uint16_t maxIndexSelector = (mode == 4) ? 2 : 1;
 | 
						|
 | 
						|
                        float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] };
 | 
						|
                        float rotatedRGBWeightsSq[3] = { channelWeightsSq[redChannel], channelWeightsSq[greenChannel], channelWeightsSq[blueChannel] };
 | 
						|
                        float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] };
 | 
						|
                        float rotatedAlphaWeightSq[1] = { channelWeightsSq[alphaChannel] };
 | 
						|
 | 
						|
                        float uniformWeight[1] = { 1.0f };   // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error
 | 
						|
 | 
						|
                        MFloat preWeightedRotatedRGB[16][3];
 | 
						|
                        BCCommon::PreWeightPixelsLDR<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights);
 | 
						|
 | 
						|
                        for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++)
 | 
						|
                        {
 | 
						|
                            EndpointSelector<3, 8> rgbSelector;
 | 
						|
 | 
						|
                            for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++)
 | 
						|
                            {
 | 
						|
                                for (int px = 0; px < 16; px++)
 | 
						|
                                    rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f));
 | 
						|
 | 
						|
                                rgbSelector.FinishPass(epPass);
 | 
						|
                            }
 | 
						|
 | 
						|
                            MUInt15 alphaRange[2];
 | 
						|
 | 
						|
                            alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel];
 | 
						|
                            for (int px = 1; px < 16; px++)
 | 
						|
                            {
 | 
						|
                                alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]);
 | 
						|
                                alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]);
 | 
						|
                            }
 | 
						|
 | 
						|
                            int rgbPrec = 0;
 | 
						|
                            int alphaPrec = 0;
 | 
						|
 | 
						|
                            if (mode == 4)
 | 
						|
                            {
 | 
						|
                                rgbPrec = indexSelector ? 3 : 2;
 | 
						|
                                alphaPrec = indexSelector ? 2 : 3;
 | 
						|
                            }
 | 
						|
                            else
 | 
						|
                                rgbPrec = alphaPrec = 2;
 | 
						|
 | 
						|
                            UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights);
 | 
						|
 | 
						|
                            MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
                            MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
 | 
						|
                            MUInt15 bestRGBIndexes[16];
 | 
						|
                            MUInt15 bestAlphaIndexes[16];
 | 
						|
                            MUInt15 bestEP[2][4];
 | 
						|
 | 
						|
                            for (int px = 0; px < 16; px++)
 | 
						|
                                bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt15(0);
 | 
						|
 | 
						|
                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
 | 
						|
                            {
 | 
						|
                                MUInt15 rgbEP[2][3];
 | 
						|
                                MUInt15 alphaEP[2];
 | 
						|
 | 
						|
                                unfinishedRGB.FinishLDR(tweak, 1 << rgbPrec, rgbEP[0], rgbEP[1]);
 | 
						|
 | 
						|
                                TweakAlpha(alphaRange, tweak, 1 << alphaPrec, alphaEP);
 | 
						|
 | 
						|
                                for (int refine = 0; refine < numRefineRounds; refine++)
 | 
						|
                                {
 | 
						|
                                    if (mode == 4)
 | 
						|
                                        CompressEndpoints4(rgbEP, alphaEP, rtn);
 | 
						|
                                    else
 | 
						|
                                        CompressEndpoints5(rgbEP, alphaEP, rtn);
 | 
						|
 | 
						|
 | 
						|
                                    IndexSelector<1> alphaIndexSelector;
 | 
						|
                                    IndexSelector<3> rgbIndexSelector;
 | 
						|
 | 
						|
                                    {
 | 
						|
                                        MUInt15 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } };
 | 
						|
                                        alphaIndexSelector.Init<false>(uniformWeight, alphaEPTemp, 1 << alphaPrec);
 | 
						|
                                    }
 | 
						|
                                    rgbIndexSelector.Init<false>(rotatedRGBWeights, rgbEP, 1 << rgbPrec);
 | 
						|
 | 
						|
                                    EndpointRefiner<3> rgbRefiner;
 | 
						|
                                    EndpointRefiner<1> alphaRefiner;
 | 
						|
 | 
						|
                                    rgbRefiner.Init(1 << rgbPrec, rotatedRGBWeights);
 | 
						|
                                    alphaRefiner.Init(1 << alphaPrec, uniformWeight);
 | 
						|
 | 
						|
                                    MFloat errorRGB = ParallelMath::MakeFloatZero();
 | 
						|
                                    MFloat errorA = ParallelMath::MakeFloatZero();
 | 
						|
 | 
						|
                                    MUInt15 rgbIndexes[16];
 | 
						|
                                    MUInt15 alphaIndexes[16];
 | 
						|
 | 
						|
                                    AggregatedError<3> rgbAggError;
 | 
						|
                                    AggregatedError<1> alphaAggError;
 | 
						|
 | 
						|
                                    for (int px = 0; px < 16; px++)
 | 
						|
                                    {
 | 
						|
                                        MUInt15 rgbIndex = rgbIndexSelector.SelectIndexLDR(floatRotatedRGB[px], rtn);
 | 
						|
                                        MUInt15 alphaIndex = alphaIndexSelector.SelectIndexLDR(floatPixels[px] + alphaChannel, rtn);
 | 
						|
 | 
						|
                                        MUInt15 reconstructedRGB[3];
 | 
						|
                                        MUInt15 reconstructedAlpha[1];
 | 
						|
 | 
						|
                                        rgbIndexSelector.ReconstructLDR_BC7(rgbIndex, reconstructedRGB);
 | 
						|
                                        alphaIndexSelector.ReconstructLDR_BC7(alphaIndex, reconstructedAlpha);
 | 
						|
 | 
						|
                                        if (flags & cvtt::Flags::BC7_FastIndexing)
 | 
						|
                                        {
 | 
						|
                                            BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], rgbAggError);
 | 
						|
                                            BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, alphaAggError);
 | 
						|
                                        }
 | 
						|
                                        else
 | 
						|
                                        {
 | 
						|
                                            AggregatedError<3> baseRGBAggError;
 | 
						|
                                            AggregatedError<1> baseAlphaAggError;
 | 
						|
 | 
						|
                                            BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], baseRGBAggError);
 | 
						|
                                            BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, baseAlphaAggError);
 | 
						|
 | 
						|
                                            MFloat rgbError = baseRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
 | 
						|
                                            MFloat alphaError = baseAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
 | 
						|
 | 
						|
                                            MUInt15 altRGBIndexes[2];
 | 
						|
                                            MUInt15 altAlphaIndexes[2];
 | 
						|
 | 
						|
                                            altRGBIndexes[0] = ParallelMath::Max(rgbIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
 | 
						|
                                            altRGBIndexes[1] = ParallelMath::Min(rgbIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << rgbPrec) - 1)));
 | 
						|
 | 
						|
                                            altAlphaIndexes[0] = ParallelMath::Max(alphaIndex, ParallelMath::MakeUInt15(1)) - ParallelMath::MakeUInt15(1);
 | 
						|
                                            altAlphaIndexes[1] = ParallelMath::Min(alphaIndex + ParallelMath::MakeUInt15(1), ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << alphaPrec) - 1)));
 | 
						|
 | 
						|
                                            for (int ii = 0; ii < 2; ii++)
 | 
						|
                                            {
 | 
						|
                                                rgbIndexSelector.ReconstructLDR_BC7(altRGBIndexes[ii], reconstructedRGB);
 | 
						|
                                                alphaIndexSelector.ReconstructLDR_BC7(altAlphaIndexes[ii], reconstructedAlpha);
 | 
						|
 | 
						|
                                                AggregatedError<3> altRGBAggError;
 | 
						|
                                                AggregatedError<1> altAlphaAggError;
 | 
						|
 | 
						|
                                                BCCommon::ComputeErrorLDR<3>(flags, reconstructedRGB, rotatedRGB[px], altRGBAggError);
 | 
						|
                                                BCCommon::ComputeErrorLDR<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, altAlphaAggError);
 | 
						|
 | 
						|
                                                MFloat altRGBError = altRGBAggError.Finalize(flags, rotatedRGBWeightsSq);
 | 
						|
                                                MFloat altAlphaError = altAlphaAggError.Finalize(flags, rotatedAlphaWeightSq);
 | 
						|
 | 
						|
                                                ParallelMath::Int16CompFlag rgbBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altRGBError, rgbError));
 | 
						|
                                                ParallelMath::Int16CompFlag alphaBetter = ParallelMath::FloatFlagToInt16(ParallelMath::Less(altAlphaError, alphaError));
 | 
						|
 | 
						|
                                                rgbError = ParallelMath::Min(altRGBError, rgbError);
 | 
						|
                                                alphaError = ParallelMath::Min(altAlphaError, alphaError);
 | 
						|
 | 
						|
                                                ParallelMath::ConditionalSet(rgbIndex, rgbBetter, altRGBIndexes[ii]);
 | 
						|
                                                ParallelMath::ConditionalSet(alphaIndex, alphaBetter, altAlphaIndexes[ii]);
 | 
						|
                                            }
 | 
						|
 | 
						|
                                            errorRGB = errorRGB + rgbError;
 | 
						|
                                            errorA = errorA + alphaError;
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        if (refine != numRefineRounds - 1)
 | 
						|
                                        {
 | 
						|
                                            rgbRefiner.ContributeUnweightedPW(preWeightedRotatedRGB[px], rgbIndex);
 | 
						|
                                            alphaRefiner.ContributeUnweightedPW(floatPixels[px] + alphaChannel, alphaIndex);
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        if (flags & Flags::BC7_FastIndexing)
 | 
						|
                                        {
 | 
						|
                                            errorRGB = rgbAggError.Finalize(flags, rotatedRGBWeightsSq);
 | 
						|
                                            errorA = rgbAggError.Finalize(flags, rotatedAlphaWeightSq);
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        rgbIndexes[px] = rgbIndex;
 | 
						|
                                        alphaIndexes[px] = alphaIndex;
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError);
 | 
						|
                                    ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError);
 | 
						|
 | 
						|
                                    ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter);
 | 
						|
                                    ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter);
 | 
						|
 | 
						|
                                    if (ParallelMath::AnySet(rgbBetterInt16))
 | 
						|
                                    {
 | 
						|
                                        bestRGBError = ParallelMath::Min(errorRGB, bestRGBError);
 | 
						|
 | 
						|
                                        for (int px = 0; px < 16; px++)
 | 
						|
                                            ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]);
 | 
						|
 | 
						|
                                        for (int ep = 0; ep < 2; ep++)
 | 
						|
                                        {
 | 
						|
                                            for (int ch = 0; ch < 3; ch++)
 | 
						|
                                                ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]);
 | 
						|
                                        }
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    if (ParallelMath::AnySet(alphaBetterInt16))
 | 
						|
                                    {
 | 
						|
                                        bestAlphaError = ParallelMath::Min(errorA, bestAlphaError);
 | 
						|
 | 
						|
                                        for (int px = 0; px < 16; px++)
 | 
						|
                                            ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]);
 | 
						|
 | 
						|
                                        for (int ep = 0; ep < 2; ep++)
 | 
						|
                                            ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]);
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    if (refine != numRefineRounds - 1)
 | 
						|
                                    {
 | 
						|
                                        rgbRefiner.GetRefinedEndpointsLDR(rgbEP, rtn);
 | 
						|
 | 
						|
                                        MUInt15 alphaEPTemp[2][1];
 | 
						|
                                        alphaRefiner.GetRefinedEndpointsLDR(alphaEPTemp, rtn);
 | 
						|
 | 
						|
                                        for (int i = 0; i < 2; i++)
 | 
						|
                                            alphaEP[i] = alphaEPTemp[i][0];
 | 
						|
                                    }
 | 
						|
                                }	// refine
 | 
						|
                            } // tweak
 | 
						|
 | 
						|
                            MFloat combinedError = bestRGBError + bestAlphaError;
 | 
						|
 | 
						|
                            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error);
 | 
						|
                            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
 | 
						|
 | 
						|
                            work.m_error = ParallelMath::Min(combinedError, work.m_error);
 | 
						|
 | 
						|
                            ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt15(mode));
 | 
						|
                            ParallelMath::ConditionalSet(work.m_u.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt15(rotation));
 | 
						|
                            ParallelMath::ConditionalSet(work.m_u.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt15(indexSelector));
 | 
						|
 | 
						|
                            for (int px = 0; px < 16; px++)
 | 
						|
                            {
 | 
						|
                                ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]);
 | 
						|
                                ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? bestRGBIndexes[px] : bestAlphaIndexes[px]);
 | 
						|
                            }
 | 
						|
 | 
						|
                            for (int ep = 0; ep < 2; ep++)
 | 
						|
                                for (int ch = 0; ch < 4; ch++)
 | 
						|
                                    ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            template<class T>
 | 
						|
            static void Swap(T& a, T& b)
 | 
						|
            {
 | 
						|
                T temp = a;
 | 
						|
                a = b;
 | 
						|
                b = temp;
 | 
						|
            }
 | 
						|
 | 
						|
            static void Pack(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, const float channelWeights[4], int numTweakRounds, int numRefineRounds)
 | 
						|
            {
 | 
						|
                MUInt15 pixels[16][4];
 | 
						|
                MFloat floatPixels[16][4];
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 4; ch++)
 | 
						|
                        ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
 | 
						|
                }
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 4; ch++)
 | 
						|
                        floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
 | 
						|
                }
 | 
						|
 | 
						|
                WorkInfo work;
 | 
						|
                memset(&work, 0, sizeof(work));
 | 
						|
 | 
						|
                work.m_error = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
 | 
						|
                {
 | 
						|
                    ParallelMath::RoundTowardNearestForScope rtn;
 | 
						|
                    TrySinglePlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn);
 | 
						|
                    TryDualPlane(flags, pixels, floatPixels, channelWeights, numTweakRounds, numRefineRounds, work, &rtn);
 | 
						|
                }
 | 
						|
 | 
						|
                for (int block = 0; block < ParallelMath::ParallelSize; block++)
 | 
						|
                {
 | 
						|
                    PackingVector pv;
 | 
						|
                    pv.Init();
 | 
						|
 | 
						|
                    ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(work.m_mode, block);
 | 
						|
                    ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(work.m_u.m_partition, block);
 | 
						|
                    ParallelMath::ScalarUInt16 indexSelector = ParallelMath::Extract(work.m_u.m_isr.m_indexSelector, block);
 | 
						|
 | 
						|
                    const BC7Data::BC7ModeInfo& modeInfo = BC7Data::g_modes[mode];
 | 
						|
 | 
						|
                    ParallelMath::ScalarUInt16 indexes[16];
 | 
						|
                    ParallelMath::ScalarUInt16 indexes2[16];
 | 
						|
                    ParallelMath::ScalarUInt16 endPoints[3][2][4];
 | 
						|
 | 
						|
                    for (int i = 0; i < 16; i++)
 | 
						|
                    {
 | 
						|
                        indexes[i] = ParallelMath::Extract(work.m_indexes[i], block);
 | 
						|
                        if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
 | 
						|
                            indexes2[i] = ParallelMath::Extract(work.m_indexes2[i], block);
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int subset = 0; subset < 3; subset++)
 | 
						|
                    {
 | 
						|
                        for (int ep = 0; ep < 2; ep++)
 | 
						|
                        {
 | 
						|
                            for (int ch = 0; ch < 4; ch++)
 | 
						|
                                endPoints[subset][ep][ch] = ParallelMath::Extract(work.m_ep[subset][ep][ch], block);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    int fixups[3] = { 0, 0, 0 };
 | 
						|
 | 
						|
                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
 | 
						|
                    {
 | 
						|
                        bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0);
 | 
						|
                        bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0);
 | 
						|
 | 
						|
                        if (flipRGB)
 | 
						|
                        {
 | 
						|
                            uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
 | 
						|
                            for (int px = 0; px < 16; px++)
 | 
						|
                                indexes[px] = highIndex - indexes[px];
 | 
						|
                        }
 | 
						|
 | 
						|
                        if (flipAlpha)
 | 
						|
                        {
 | 
						|
                            uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1;
 | 
						|
                            for (int px = 0; px < 16; px++)
 | 
						|
                                indexes2[px] = highIndex - indexes2[px];
 | 
						|
                        }
 | 
						|
 | 
						|
                        if (indexSelector)
 | 
						|
                            Swap(flipRGB, flipAlpha);
 | 
						|
 | 
						|
                        if (flipRGB)
 | 
						|
                        {
 | 
						|
                            for (int ch = 0; ch < 3; ch++)
 | 
						|
                                Swap(endPoints[0][0][ch], endPoints[0][1][ch]);
 | 
						|
                        }
 | 
						|
                        if (flipAlpha)
 | 
						|
                            Swap(endPoints[0][0][3], endPoints[0][1][3]);
 | 
						|
 | 
						|
                    }
 | 
						|
                    else
 | 
						|
                    {
 | 
						|
                        if (modeInfo.m_numSubsets == 2)
 | 
						|
                            fixups[1] = BC7Data::g_fixupIndexes2[partition];
 | 
						|
                        else if (modeInfo.m_numSubsets == 3)
 | 
						|
                        {
 | 
						|
                            fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
 | 
						|
                            fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
 | 
						|
                        }
 | 
						|
 | 
						|
                        bool flip[3] = { false, false, false };
 | 
						|
                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                            flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0);
 | 
						|
 | 
						|
                        if (flip[0] || flip[1] || flip[2])
 | 
						|
                        {
 | 
						|
                            uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1;
 | 
						|
                            for (int px = 0; px < 16; px++)
 | 
						|
                            {
 | 
						|
                                int subset = 0;
 | 
						|
                                if (modeInfo.m_numSubsets == 2)
 | 
						|
                                    subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
 | 
						|
                                else if (modeInfo.m_numSubsets == 3)
 | 
						|
                                    subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
 | 
						|
 | 
						|
                                if (flip[subset])
 | 
						|
                                    indexes[px] = highIndex - indexes[px];
 | 
						|
                            }
 | 
						|
 | 
						|
                            int maxCH = (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined) ? 4 : 3;
 | 
						|
                            for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                            {
 | 
						|
                                if (flip[subset])
 | 
						|
                                    for (int ch = 0; ch < maxCH; ch++)
 | 
						|
                                        Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    pv.Pack(static_cast<uint8_t>(1 << mode), mode + 1);
 | 
						|
 | 
						|
                    if (modeInfo.m_partitionBits)
 | 
						|
                        pv.Pack(partition, modeInfo.m_partitionBits);
 | 
						|
 | 
						|
                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
 | 
						|
                    {
 | 
						|
                        ParallelMath::ScalarUInt16 rotation = ParallelMath::Extract(work.m_u.m_isr.m_rotation, block);
 | 
						|
                        pv.Pack(rotation, 2);
 | 
						|
                    }
 | 
						|
 | 
						|
                    if (modeInfo.m_hasIndexSelector)
 | 
						|
                        pv.Pack(indexSelector, 1);
 | 
						|
 | 
						|
                    // Encode RGB
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                    {
 | 
						|
                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                        {
 | 
						|
                            for (int ep = 0; ep < 2; ep++)
 | 
						|
                            {
 | 
						|
                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][ch];
 | 
						|
                                epPart >>= (8 - modeInfo.m_rgbBits);
 | 
						|
 | 
						|
                                pv.Pack(epPart, modeInfo.m_rgbBits);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    // Encode alpha
 | 
						|
                    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
 | 
						|
                    {
 | 
						|
                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                        {
 | 
						|
                            for (int ep = 0; ep < 2; ep++)
 | 
						|
                            {
 | 
						|
                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][3];
 | 
						|
                                epPart >>= (8 - modeInfo.m_alphaBits);
 | 
						|
 | 
						|
                                pv.Pack(epPart, modeInfo.m_alphaBits);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    // Encode parity bits
 | 
						|
                    if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
 | 
						|
                    {
 | 
						|
                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                        {
 | 
						|
                            ParallelMath::ScalarUInt16 epPart = endPoints[subset][0][0];
 | 
						|
                            epPart >>= (7 - modeInfo.m_rgbBits);
 | 
						|
                            epPart &= 1;
 | 
						|
 | 
						|
                            pv.Pack(epPart, 1);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                    else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
 | 
						|
                    {
 | 
						|
                        for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                        {
 | 
						|
                            for (int ep = 0; ep < 2; ep++)
 | 
						|
                            {
 | 
						|
                                ParallelMath::ScalarUInt16 epPart = endPoints[subset][ep][0];
 | 
						|
                                epPart >>= (7 - modeInfo.m_rgbBits);
 | 
						|
                                epPart &= 1;
 | 
						|
 | 
						|
                                pv.Pack(epPart, 1);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    // Encode indexes
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                    {
 | 
						|
                        int bits = modeInfo.m_indexBits;
 | 
						|
                        if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
 | 
						|
                            bits--;
 | 
						|
 | 
						|
                        pv.Pack(indexes[px], bits);
 | 
						|
                    }
 | 
						|
 | 
						|
                    // Encode secondary indexes
 | 
						|
                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
 | 
						|
                    {
 | 
						|
                        for (int px = 0; px < 16; px++)
 | 
						|
                        {
 | 
						|
                            int bits = modeInfo.m_alphaIndexBits;
 | 
						|
                            if (px == 0)
 | 
						|
                                bits--;
 | 
						|
 | 
						|
                            pv.Pack(indexes2[px], bits);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    pv.Flush(packedBlocks);
 | 
						|
 | 
						|
                    packedBlocks += 16;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void UnpackOne(PixelBlockU8 &output, const uint8_t* packedBlock)
 | 
						|
            {
 | 
						|
                UnpackingVector pv;
 | 
						|
                pv.Init(packedBlock);
 | 
						|
 | 
						|
                int mode = 8;
 | 
						|
                for (int i = 0; i < 8; i++)
 | 
						|
                {
 | 
						|
                    if (pv.Unpack(1) == 1)
 | 
						|
                    {
 | 
						|
                        mode = i;
 | 
						|
                        break;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                if (mode > 7)
 | 
						|
                {
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                        for (int ch = 0; ch < 4; ch++)
 | 
						|
                            output.m_pixels[px][ch] = 0;
 | 
						|
 | 
						|
                    return;
 | 
						|
                }
 | 
						|
 | 
						|
                const BC7Data::BC7ModeInfo &modeInfo = BC7Data::g_modes[mode];
 | 
						|
 | 
						|
                int partition = 0;
 | 
						|
                if (modeInfo.m_partitionBits)
 | 
						|
                    partition = pv.Unpack(modeInfo.m_partitionBits);
 | 
						|
 | 
						|
                int rotation = 0;
 | 
						|
                if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
 | 
						|
                    rotation = pv.Unpack(2);
 | 
						|
 | 
						|
                int indexSelector = 0;
 | 
						|
                if (modeInfo.m_hasIndexSelector)
 | 
						|
                    indexSelector = pv.Unpack(1);
 | 
						|
 | 
						|
                // Resolve fixups
 | 
						|
                int fixups[3] = { 0, 0, 0 };
 | 
						|
 | 
						|
                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_Separate)
 | 
						|
                {
 | 
						|
                    if (modeInfo.m_numSubsets == 2)
 | 
						|
                        fixups[1] = BC7Data::g_fixupIndexes2[partition];
 | 
						|
                    else if (modeInfo.m_numSubsets == 3)
 | 
						|
                    {
 | 
						|
                        fixups[1] = BC7Data::g_fixupIndexes3[partition][0];
 | 
						|
                        fixups[2] = BC7Data::g_fixupIndexes3[partition][1];
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                int endPoints[3][2][4];
 | 
						|
 | 
						|
                // Decode RGB
 | 
						|
                for (int ch = 0; ch < 3; ch++)
 | 
						|
                {
 | 
						|
                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                    {
 | 
						|
                        for (int ep = 0; ep < 2; ep++)
 | 
						|
                            endPoints[subset][ep][ch] = (pv.Unpack(modeInfo.m_rgbBits) << (8 - modeInfo.m_rgbBits));
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                // Decode alpha
 | 
						|
                if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
 | 
						|
                {
 | 
						|
                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                    {
 | 
						|
                        for (int ep = 0; ep < 2; ep++)
 | 
						|
                            endPoints[subset][ep][3] = (pv.Unpack(modeInfo.m_alphaBits) << (8 - modeInfo.m_alphaBits));
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                    {
 | 
						|
                        for (int ep = 0; ep < 2; ep++)
 | 
						|
                            endPoints[subset][ep][3] = 255;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                int parityBits = 0;
 | 
						|
 | 
						|
                // Decode parity bits
 | 
						|
                if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerSubset)
 | 
						|
                {
 | 
						|
                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                    {
 | 
						|
                        int p = pv.Unpack(1);
 | 
						|
 | 
						|
                        for (int ep = 0; ep < 2; ep++)
 | 
						|
                        {
 | 
						|
                            for (int ch = 0; ch < 3; ch++)
 | 
						|
                                endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
 | 
						|
 | 
						|
                            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
 | 
						|
                                endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    parityBits = 1;
 | 
						|
                }
 | 
						|
                else if (modeInfo.m_pBitMode == BC7Data::PBitMode_PerEndpoint)
 | 
						|
                {
 | 
						|
                    for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                    {
 | 
						|
                        for (int ep = 0; ep < 2; ep++)
 | 
						|
                        {
 | 
						|
                            int p = pv.Unpack(1);
 | 
						|
 | 
						|
                            for (int ch = 0; ch < 3; ch++)
 | 
						|
                                endPoints[subset][ep][ch] |= p << (7 - modeInfo.m_rgbBits);
 | 
						|
 | 
						|
                            if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
 | 
						|
                                endPoints[subset][ep][3] |= p << (7 - modeInfo.m_alphaBits);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    parityBits = 1;
 | 
						|
                }
 | 
						|
 | 
						|
                // Fill endpoint bits
 | 
						|
                for (int subset = 0; subset < modeInfo.m_numSubsets; subset++)
 | 
						|
                {
 | 
						|
                    for (int ep = 0; ep < 2; ep++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                            endPoints[subset][ep][ch] |= (endPoints[subset][ep][ch] >> (modeInfo.m_rgbBits + parityBits));
 | 
						|
 | 
						|
                        if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
 | 
						|
                            endPoints[subset][ep][3] |= (endPoints[subset][ep][3] >> (modeInfo.m_alphaBits + parityBits));
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                int indexes[16];
 | 
						|
                int indexes2[16];
 | 
						|
 | 
						|
                // Decode indexes
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    int bits = modeInfo.m_indexBits;
 | 
						|
                    if ((px == 0) || (px == fixups[1]) || (px == fixups[2]))
 | 
						|
                        bits--;
 | 
						|
 | 
						|
                    indexes[px] = pv.Unpack(bits);
 | 
						|
                }
 | 
						|
 | 
						|
                // Decode secondary indexes
 | 
						|
                if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
 | 
						|
                {
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                    {
 | 
						|
                        int bits = modeInfo.m_alphaIndexBits;
 | 
						|
                        if (px == 0)
 | 
						|
                            bits--;
 | 
						|
 | 
						|
                        indexes2[px] = pv.Unpack(bits);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                        indexes2[px] = 0;
 | 
						|
                }
 | 
						|
 | 
						|
                const int *alphaWeights = BC7Data::g_weightTables[modeInfo.m_alphaIndexBits];
 | 
						|
                const int *rgbWeights = BC7Data::g_weightTables[modeInfo.m_indexBits];
 | 
						|
 | 
						|
                // Decode each pixel
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    int rgbWeight = 0;
 | 
						|
                    int alphaWeight = 0;
 | 
						|
 | 
						|
                    int rgbIndex = indexes[px];
 | 
						|
 | 
						|
                    rgbWeight = rgbWeights[indexes[px]];
 | 
						|
 | 
						|
                    if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Combined)
 | 
						|
                        alphaWeight = rgbWeight;
 | 
						|
                    else if (modeInfo.m_alphaMode == BC7Data::AlphaMode_Separate)
 | 
						|
                        alphaWeight = alphaWeights[indexes2[px]];
 | 
						|
 | 
						|
                    if (indexSelector == 1)
 | 
						|
                    {
 | 
						|
                        int temp = rgbWeight;
 | 
						|
                        rgbWeight = alphaWeight;
 | 
						|
                        alphaWeight = temp;
 | 
						|
                    }
 | 
						|
 | 
						|
                    int pixel[4] = { 0, 0, 0, 255 };
 | 
						|
 | 
						|
                    int subset = 0;
 | 
						|
 | 
						|
                    if (modeInfo.m_numSubsets == 2)
 | 
						|
                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
 | 
						|
                    else if (modeInfo.m_numSubsets == 3)
 | 
						|
                        subset = (BC7Data::g_partitionMap2[partition] >> (px * 2)) & 3;
 | 
						|
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                        pixel[ch] = ((64 - rgbWeight) * endPoints[subset][0][ch] + rgbWeight * endPoints[subset][1][ch] + 32) >> 6;
 | 
						|
 | 
						|
                    if (modeInfo.m_alphaMode != BC7Data::AlphaMode_None)
 | 
						|
                        pixel[3] = ((64 - alphaWeight) * endPoints[subset][0][3] + alphaWeight * endPoints[subset][1][3] + 32) >> 6;
 | 
						|
 | 
						|
                    if (rotation != 0)
 | 
						|
                    {
 | 
						|
                        int ch = rotation - 1;
 | 
						|
                        int temp = pixel[ch];
 | 
						|
                        pixel[ch] = pixel[3];
 | 
						|
                        pixel[3] = temp;
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int ch = 0; ch < 4; ch++)
 | 
						|
                        output.m_pixels[px][ch] = static_cast<uint8_t>(pixel[ch]);
 | 
						|
                }
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        class BC6HComputer
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
            typedef ParallelMath::SInt16 MSInt16;
 | 
						|
            typedef ParallelMath::UInt16 MUInt16;
 | 
						|
            typedef ParallelMath::UInt15 MUInt15;
 | 
						|
            typedef ParallelMath::AInt16 MAInt16;
 | 
						|
            typedef ParallelMath::SInt32 MSInt32;
 | 
						|
            typedef ParallelMath::UInt31 MUInt31;
 | 
						|
 | 
						|
            static const int MaxTweakRounds = 4;
 | 
						|
            static const int MaxRefineRounds = 3;
 | 
						|
 | 
						|
            static MSInt16 QuantizeSingleEndpointElementSigned(const MSInt16 &elem2CL, int precision, const ParallelMath::RoundUpForScope* ru)
 | 
						|
            {
 | 
						|
                assert(ParallelMath::AllSet(ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(31744))));
 | 
						|
                assert(ParallelMath::AllSet(ParallelMath::Less(ParallelMath::MakeSInt16(-31744), elem2CL)));
 | 
						|
 | 
						|
                // Expand to full range
 | 
						|
                ParallelMath::Int16CompFlag isNegative = ParallelMath::Less(elem2CL, ParallelMath::MakeSInt16(0));
 | 
						|
                MUInt15 absElem = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - elem2CL, elem2CL));
 | 
						|
 | 
						|
                absElem = ParallelMath::RightShift(ParallelMath::RoundAndConvertToU15(ParallelMath::ToFloat(absElem) * 32.0f / 31.0f, ru), 16 - precision);
 | 
						|
 | 
						|
                MSInt16 absElemS16 = ParallelMath::LosslessCast<MSInt16>::Cast(absElem);
 | 
						|
 | 
						|
                return ParallelMath::Select(isNegative, ParallelMath::MakeSInt16(0) - absElemS16, absElemS16);
 | 
						|
            }
 | 
						|
 | 
						|
            static MUInt15 QuantizeSingleEndpointElementUnsigned(const MUInt15 &elem, int precision, const ParallelMath::RoundUpForScope* ru)
 | 
						|
            {
 | 
						|
                MUInt16 expandedElem = ParallelMath::RoundAndConvertToU16(ParallelMath::Min(ParallelMath::ToFloat(elem) * 64.0f / 31.0f, ParallelMath::MakeFloat(65535.0f)), ru);
 | 
						|
                return ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(expandedElem, 16 - precision));
 | 
						|
            }
 | 
						|
 | 
						|
            static void UnquantizeSingleEndpointElementSigned(const MSInt16 &comp, int precision, MSInt16 &outUnquantized, MSInt16 &outUnquantizedFinished2CL)
 | 
						|
            {
 | 
						|
                MSInt16 zero = ParallelMath::MakeSInt16(0);
 | 
						|
 | 
						|
                ParallelMath::Int16CompFlag negative = ParallelMath::Less(comp, zero);
 | 
						|
                MUInt15 absComp = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::Select(negative, MSInt16(zero - comp), comp));
 | 
						|
 | 
						|
                MSInt16 unq;
 | 
						|
                MUInt15 absUnq;
 | 
						|
 | 
						|
                if (precision >= 16)
 | 
						|
                {
 | 
						|
                    unq = comp;
 | 
						|
                    absUnq = absComp;
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    MSInt16 maxCompMinusOne = ParallelMath::MakeSInt16(static_cast<int16_t>((1 << (precision - 1)) - 2));
 | 
						|
                    ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
 | 
						|
                    ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
 | 
						|
 | 
						|
                    absUnq = (absComp << (16 - precision)) + ParallelMath::MakeUInt15(static_cast<uint16_t>(0x4000 >> (precision - 1)));
 | 
						|
                    ParallelMath::ConditionalSet(absUnq, isZero, ParallelMath::MakeUInt15(0));
 | 
						|
                    ParallelMath::ConditionalSet(absUnq, isMax, ParallelMath::MakeUInt15(0x7fff));
 | 
						|
 | 
						|
                    unq = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(absUnq));
 | 
						|
                }
 | 
						|
 | 
						|
                outUnquantized = unq;
 | 
						|
 | 
						|
                MUInt15 funq = ParallelMath::ToUInt15(ParallelMath::RightShift(ParallelMath::XMultiply(absUnq, ParallelMath::MakeUInt15(31)), 5));
 | 
						|
 | 
						|
                outUnquantizedFinished2CL = ParallelMath::ConditionalNegate(negative, ParallelMath::LosslessCast<MSInt16>::Cast(funq));
 | 
						|
            }
 | 
						|
 | 
						|
            static void UnquantizeSingleEndpointElementUnsigned(const MUInt15 &comp, int precision, MUInt16 &outUnquantized, MUInt16 &outUnquantizedFinished)
 | 
						|
            {
 | 
						|
                MUInt16 unq = ParallelMath::LosslessCast<MUInt16>::Cast(comp);
 | 
						|
                if (precision < 15)
 | 
						|
                {
 | 
						|
                    MUInt15 zero = ParallelMath::MakeUInt15(0);
 | 
						|
                    MUInt15 maxCompMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>((1 << precision) - 2));
 | 
						|
 | 
						|
                    ParallelMath::Int16CompFlag isZero = ParallelMath::Equal(comp, zero);
 | 
						|
                    ParallelMath::Int16CompFlag isMax = ParallelMath::Less(maxCompMinusOne, comp);
 | 
						|
 | 
						|
                    unq = (ParallelMath::LosslessCast<MUInt16>::Cast(comp) << (16 - precision)) + ParallelMath::MakeUInt16(static_cast<uint16_t>(0x8000 >> precision));
 | 
						|
 | 
						|
                    ParallelMath::ConditionalSet(unq, isZero, ParallelMath::MakeUInt16(0));
 | 
						|
                    ParallelMath::ConditionalSet(unq, isMax, ParallelMath::MakeUInt16(0xffff));
 | 
						|
                }
 | 
						|
 | 
						|
                outUnquantized = unq;
 | 
						|
                outUnquantizedFinished = ParallelMath::ToUInt16(ParallelMath::RightShift(ParallelMath::XMultiply(unq, ParallelMath::MakeUInt15(31)), 6));
 | 
						|
            }
 | 
						|
 | 
						|
            static void QuantizeEndpointsSigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
 | 
						|
            {
 | 
						|
                MSInt16 unquantizedEP[2][3];
 | 
						|
                MSInt16 finishedUnquantizedEP[2][3];
 | 
						|
 | 
						|
                {
 | 
						|
                    ParallelMath::RoundUpForScope ru;
 | 
						|
 | 
						|
                    for (int epi = 0; epi < 2; epi++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                        {
 | 
						|
                            MSInt16 qee = QuantizeSingleEndpointElementSigned(endPoints[epi][ch], precision, &ru);
 | 
						|
                            UnquantizeSingleEndpointElementSigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
 | 
						|
                            quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
 | 
						|
                indexSelector.InitHDR(indexRange, true, fastIndexing, channelWeights);
 | 
						|
 | 
						|
                MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
 | 
						|
 | 
						|
                MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
 | 
						|
 | 
						|
                ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
 | 
						|
 | 
						|
                if (ParallelMath::AnySet(invert))
 | 
						|
                {
 | 
						|
                    ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
 | 
						|
 | 
						|
                    indexSelector.ConditionalInvert(invert);
 | 
						|
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                    {
 | 
						|
                        MAInt16 firstEP = quantizedEndPoints[0][ch];
 | 
						|
                        MAInt16 secondEP = quantizedEndPoints[1][ch];
 | 
						|
 | 
						|
                        quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
 | 
						|
                        quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                indexes[fixupIndex] = index;
 | 
						|
            }
 | 
						|
 | 
						|
            static void QuantizeEndpointsUnsigned(const MSInt16 endPoints[2][3], const MFloat floatPixelsColorSpace[16][3], const MFloat floatPixelsLinearWeighted[16][3], MAInt16 quantizedEndPoints[2][3], MUInt15 indexes[16], IndexSelectorHDR<3> &indexSelector, int fixupIndex, int precision, int indexRange, const float *channelWeights, bool fastIndexing, const ParallelMath::RoundTowardNearestForScope *rtn)
 | 
						|
            {
 | 
						|
                MUInt16 unquantizedEP[2][3];
 | 
						|
                MUInt16 finishedUnquantizedEP[2][3];
 | 
						|
 | 
						|
                {
 | 
						|
                    ParallelMath::RoundUpForScope ru;
 | 
						|
 | 
						|
                    for (int epi = 0; epi < 2; epi++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                        {
 | 
						|
                            MUInt15 qee = QuantizeSingleEndpointElementUnsigned(ParallelMath::LosslessCast<MUInt15>::Cast(endPoints[epi][ch]), precision, &ru);
 | 
						|
                            UnquantizeSingleEndpointElementUnsigned(qee, precision, unquantizedEP[epi][ch], finishedUnquantizedEP[epi][ch]);
 | 
						|
                            quantizedEndPoints[epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(qee);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                indexSelector.Init(channelWeights, unquantizedEP, finishedUnquantizedEP, indexRange);
 | 
						|
                indexSelector.InitHDR(indexRange, false, fastIndexing, channelWeights);
 | 
						|
 | 
						|
                MUInt15 halfRangeMinusOne = ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange / 2) - 1);
 | 
						|
 | 
						|
                MUInt15 index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixelsColorSpace[fixupIndex], rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[fixupIndex], rtn);
 | 
						|
 | 
						|
                ParallelMath::Int16CompFlag invert = ParallelMath::Less(halfRangeMinusOne, index);
 | 
						|
 | 
						|
                if (ParallelMath::AnySet(invert))
 | 
						|
                {
 | 
						|
                    ParallelMath::ConditionalSet(index, invert, MUInt15(ParallelMath::MakeUInt15(static_cast<uint16_t>(indexRange - 1)) - index));
 | 
						|
 | 
						|
                    indexSelector.ConditionalInvert(invert);
 | 
						|
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                    {
 | 
						|
                        MAInt16 firstEP = quantizedEndPoints[0][ch];
 | 
						|
                        MAInt16 secondEP = quantizedEndPoints[1][ch];
 | 
						|
 | 
						|
                        quantizedEndPoints[0][ch] = ParallelMath::Select(invert, secondEP, firstEP);
 | 
						|
                        quantizedEndPoints[1][ch] = ParallelMath::Select(invert, firstEP, secondEP);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                indexes[fixupIndex] = index;
 | 
						|
            }
 | 
						|
 | 
						|
            static void EvaluatePartitionedLegality(const MAInt16 ep0[2][3], const MAInt16 ep1[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][2][3], ParallelMath::Int16CompFlag& outIsLegal)
 | 
						|
            {
 | 
						|
                ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
 | 
						|
 | 
						|
                MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
 | 
						|
 | 
						|
                for (int ch = 0; ch < 3; ch++)
 | 
						|
                {
 | 
						|
                    outEncodedEPs[0][0][ch] = ep0[0][ch];
 | 
						|
                    outEncodedEPs[0][1][ch] = ep0[1][ch];
 | 
						|
                    outEncodedEPs[1][0][ch] = ep1[0][ch];
 | 
						|
                    outEncodedEPs[1][1][ch] = ep1[1][ch];
 | 
						|
 | 
						|
                    if (isTransformed)
 | 
						|
                    {
 | 
						|
                        for (int subset = 0; subset < 2; subset++)
 | 
						|
                        {
 | 
						|
                            for (int epi = 0; epi < 2; epi++)
 | 
						|
                            {
 | 
						|
                                if (epi == 0 && subset == 0)
 | 
						|
                                    continue;
 | 
						|
 | 
						|
                                MAInt16 bReduced = (outEncodedEPs[subset][epi][ch] & aSignificantMask);
 | 
						|
 | 
						|
                                MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch])), bPrec[ch]);
 | 
						|
 | 
						|
                                outEncodedEPs[subset][epi][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
 | 
						|
 | 
						|
                                MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[subset][epi][ch], outEncodedEPs[0][0][ch]) & aSignificantMask);
 | 
						|
                                allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    if (!ParallelMath::AnySet(allLegal))
 | 
						|
                        break;
 | 
						|
                }
 | 
						|
 | 
						|
                outIsLegal = allLegal;
 | 
						|
            }
 | 
						|
 | 
						|
            static void EvaluateSingleLegality(const MAInt16 ep[2][3], int aPrec, const int bPrec[3], bool isTransformed, MAInt16 outEncodedEPs[2][3], ParallelMath::Int16CompFlag& outIsLegal)
 | 
						|
            {
 | 
						|
                ParallelMath::Int16CompFlag allLegal = ParallelMath::MakeBoolInt16(true);
 | 
						|
 | 
						|
                MAInt16 aSignificantMask = ParallelMath::MakeAInt16(static_cast<int16_t>((1 << aPrec) - 1));
 | 
						|
 | 
						|
                for (int ch = 0; ch < 3; ch++)
 | 
						|
                {
 | 
						|
                    outEncodedEPs[0][ch] = ep[0][ch];
 | 
						|
                    outEncodedEPs[1][ch] = ep[1][ch];
 | 
						|
 | 
						|
                    if (isTransformed)
 | 
						|
                    {
 | 
						|
                        MAInt16 bReduced = (outEncodedEPs[1][ch] & aSignificantMask);
 | 
						|
 | 
						|
                        MSInt16 delta = ParallelMath::TruncateToPrecisionSigned(ParallelMath::LosslessCast<MSInt16>::Cast(ParallelMath::AbstractSubtract(outEncodedEPs[1][ch], outEncodedEPs[0][ch])), bPrec[ch]);
 | 
						|
 | 
						|
                        outEncodedEPs[1][ch] = ParallelMath::LosslessCast<MAInt16>::Cast(delta);
 | 
						|
 | 
						|
                        MAInt16 reconstructed = (ParallelMath::AbstractAdd(outEncodedEPs[1][ch], outEncodedEPs[0][ch]) & aSignificantMask);
 | 
						|
                        allLegal = allLegal & ParallelMath::Equal(reconstructed, bReduced);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                outIsLegal = allLegal;
 | 
						|
            }
 | 
						|
 | 
						|
            static void Pack(uint32_t flags, const PixelBlockF16* inputs, uint8_t* packedBlocks, const float channelWeights[4], bool isSigned, int numTweakRounds, int numRefineRounds)
 | 
						|
            {
 | 
						|
                if (numTweakRounds < 1)
 | 
						|
                    numTweakRounds = 1;
 | 
						|
                else if (numTweakRounds > MaxTweakRounds)
 | 
						|
                    numTweakRounds = MaxTweakRounds;
 | 
						|
 | 
						|
                if (numRefineRounds < 1)
 | 
						|
                    numRefineRounds = 1;
 | 
						|
                else if (numRefineRounds > MaxRefineRounds)
 | 
						|
                    numRefineRounds = MaxRefineRounds;
 | 
						|
 | 
						|
                bool fastIndexing = ((flags & cvtt::Flags::BC6H_FastIndexing) != 0);
 | 
						|
                float channelWeightsSq[3];
 | 
						|
 | 
						|
                ParallelMath::RoundTowardNearestForScope rtn;
 | 
						|
 | 
						|
                MSInt16 pixels[16][3];
 | 
						|
                MFloat floatPixels2CL[16][3];
 | 
						|
                MFloat floatPixelsLinearWeighted[16][3];
 | 
						|
 | 
						|
                MSInt16 low15Bits = ParallelMath::MakeSInt16(32767);
 | 
						|
 | 
						|
                for (int ch = 0; ch < 3; ch++)
 | 
						|
                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                    {
 | 
						|
                        MSInt16 pixelValue;
 | 
						|
                        ParallelMath::ConvertHDRInputs(inputs, px, ch, pixelValue);
 | 
						|
 | 
						|
                        // Convert from sign+magnitude to 2CL
 | 
						|
                        if (isSigned)
 | 
						|
                        {
 | 
						|
                            ParallelMath::Int16CompFlag negative = ParallelMath::Less(pixelValue, ParallelMath::MakeSInt16(0));
 | 
						|
                            MSInt16 magnitude = (pixelValue & low15Bits);
 | 
						|
                            ParallelMath::ConditionalSet(pixelValue, negative, ParallelMath::MakeSInt16(0) - magnitude);
 | 
						|
                            pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(-31743));
 | 
						|
                        }
 | 
						|
                        else
 | 
						|
                            pixelValue = ParallelMath::Max(pixelValue, ParallelMath::MakeSInt16(0));
 | 
						|
 | 
						|
                        pixelValue = ParallelMath::Min(pixelValue, ParallelMath::MakeSInt16(31743));
 | 
						|
 | 
						|
                        pixels[px][ch] = pixelValue;
 | 
						|
                        floatPixels2CL[px][ch] = ParallelMath::ToFloat(pixelValue);
 | 
						|
                        floatPixelsLinearWeighted[px][ch] = ParallelMath::TwosCLHalfToFloat(pixelValue) * channelWeights[ch];
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                MFloat preWeightedPixels[16][3];
 | 
						|
 | 
						|
                BCCommon::PreWeightPixelsHDR<3>(preWeightedPixels, pixels, channelWeights);
 | 
						|
 | 
						|
                MAInt16 bestEndPoints[2][2][3];
 | 
						|
                MUInt15 bestIndexes[16];
 | 
						|
                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
                MUInt15 bestMode = ParallelMath::MakeUInt15(0);
 | 
						|
                MUInt15 bestPartition = ParallelMath::MakeUInt15(0);
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                    bestIndexes[px] = ParallelMath::MakeUInt15(0);
 | 
						|
 | 
						|
                for (int subset = 0; subset < 2; subset++)
 | 
						|
                    for (int epi = 0; epi < 2; epi++)
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                            bestEndPoints[subset][epi][ch] = ParallelMath::MakeAInt16(0);
 | 
						|
 | 
						|
                UnfinishedEndpoints<3> partitionedUFEP[32][2];
 | 
						|
                UnfinishedEndpoints<3> singleUFEP;
 | 
						|
 | 
						|
                // Generate UFEP for partitions
 | 
						|
                for (int p = 0; p < 32; p++)
 | 
						|
                {
 | 
						|
                    int partitionMask = BC7Data::g_partitionMap[p];
 | 
						|
 | 
						|
                    EndpointSelector<3, 8> epSelectors[2];
 | 
						|
 | 
						|
                    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
 | 
						|
                    {
 | 
						|
                        for (int px = 0; px < 16; px++)
 | 
						|
                        {
 | 
						|
                            int subset = (partitionMask >> px) & 1;
 | 
						|
                            epSelectors[subset].ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
 | 
						|
                        }
 | 
						|
 | 
						|
                        for (int subset = 0; subset < 2; subset++)
 | 
						|
                            epSelectors[subset].FinishPass(pass);
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int subset = 0; subset < 2; subset++)
 | 
						|
                        partitionedUFEP[p][subset] = epSelectors[subset].GetEndpoints(channelWeights);
 | 
						|
                }
 | 
						|
 | 
						|
                // Generate UFEP for single
 | 
						|
                {
 | 
						|
                    EndpointSelector<3, 8> epSelector;
 | 
						|
 | 
						|
                    for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
 | 
						|
                    {
 | 
						|
                        for (int px = 0; px < 16; px++)
 | 
						|
                            epSelector.ContributePass(preWeightedPixels[px], pass, ParallelMath::MakeFloat(1.0f));
 | 
						|
 | 
						|
                        epSelector.FinishPass(pass);
 | 
						|
                    }
 | 
						|
 | 
						|
                    singleUFEP = epSelector.GetEndpoints(channelWeights);
 | 
						|
                }
 | 
						|
 | 
						|
                for (int partitionedInt = 0; partitionedInt < 2; partitionedInt++)
 | 
						|
                {
 | 
						|
                    bool partitioned = (partitionedInt == 1);
 | 
						|
 | 
						|
                    for (int aPrec = BC7Data::g_maxHDRPrecision; aPrec >= 0; aPrec--)
 | 
						|
                    {
 | 
						|
                        if (!BC7Data::g_hdrModesExistForPrecision[partitionedInt][aPrec])
 | 
						|
                            continue;
 | 
						|
 | 
						|
                        int numPartitions = partitioned ? 32 : 1;
 | 
						|
                        int numSubsets = partitioned ? 2 : 1;
 | 
						|
                        int indexBits = partitioned ? 3 : 4;
 | 
						|
                        int indexRange = (1 << indexBits);
 | 
						|
 | 
						|
                        for (int p = 0; p < numPartitions; p++)
 | 
						|
                        {
 | 
						|
                            int partitionMask = partitioned ? BC7Data::g_partitionMap[p] : 0;
 | 
						|
 | 
						|
                            const int MaxMetaRounds = MaxTweakRounds * MaxRefineRounds;
 | 
						|
 | 
						|
                            MAInt16 metaEndPointsQuantized[MaxMetaRounds][2][2][3];
 | 
						|
                            MUInt15 metaIndexes[MaxMetaRounds][16];
 | 
						|
                            MFloat metaError[MaxMetaRounds][2];
 | 
						|
 | 
						|
                            bool roundValid[MaxMetaRounds][2];
 | 
						|
 | 
						|
                            for (int r = 0; r < MaxMetaRounds; r++)
 | 
						|
                                for (int subset = 0; subset < 2; subset++)
 | 
						|
                                    roundValid[r][subset] = true;
 | 
						|
 | 
						|
                            for (int subset = 0; subset < numSubsets; subset++)
 | 
						|
                            {
 | 
						|
                                for (int tweak = 0; tweak < MaxTweakRounds; tweak++)
 | 
						|
                                {
 | 
						|
                                    EndpointRefiner<3> refiners[2];
 | 
						|
 | 
						|
                                    bool abortRemainingRefines = false;
 | 
						|
                                    for (int refinePass = 0; refinePass < MaxRefineRounds; refinePass++)
 | 
						|
                                    {
 | 
						|
                                        int metaRound = tweak * MaxRefineRounds + refinePass;
 | 
						|
 | 
						|
                                        if (tweak >= numTweakRounds || refinePass >= numRefineRounds)
 | 
						|
                                            abortRemainingRefines = true;
 | 
						|
 | 
						|
                                        if (abortRemainingRefines)
 | 
						|
                                        {
 | 
						|
                                            roundValid[metaRound][subset] = false;
 | 
						|
                                            continue;
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        MAInt16(&mrQuantizedEndPoints)[2][2][3] = metaEndPointsQuantized[metaRound];
 | 
						|
                                        MUInt15(&mrIndexes)[16] = metaIndexes[metaRound];
 | 
						|
 | 
						|
                                        MSInt16 endPointsColorSpace[2][3];
 | 
						|
 | 
						|
                                        if (refinePass == 0)
 | 
						|
                                        {
 | 
						|
                                            UnfinishedEndpoints<3> ufep = partitioned ? partitionedUFEP[p][subset] : singleUFEP;
 | 
						|
 | 
						|
                                            if (isSigned)
 | 
						|
                                                ufep.FinishHDRSigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
 | 
						|
                                            else
 | 
						|
                                                ufep.FinishHDRUnsigned(tweak, indexRange, endPointsColorSpace[0], endPointsColorSpace[1], &rtn);
 | 
						|
                                        }
 | 
						|
                                        else
 | 
						|
                                            refiners[subset].GetRefinedEndpointsHDR(endPointsColorSpace, isSigned, &rtn);
 | 
						|
 | 
						|
                                        refiners[subset].Init(indexRange, channelWeights);
 | 
						|
 | 
						|
                                        int fixupIndex = (subset == 0) ? 0 : BC7Data::g_fixupIndexes2[p];
 | 
						|
 | 
						|
                                        IndexSelectorHDR<3> indexSelector;
 | 
						|
                                        if (isSigned)
 | 
						|
                                            QuantizeEndpointsSigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
 | 
						|
                                        else
 | 
						|
                                            QuantizeEndpointsUnsigned(endPointsColorSpace, floatPixels2CL, floatPixelsLinearWeighted, mrQuantizedEndPoints[subset], mrIndexes, indexSelector, fixupIndex, aPrec, indexRange, channelWeights, fastIndexing, &rtn);
 | 
						|
 | 
						|
                                        if (metaRound > 0)
 | 
						|
                                        {
 | 
						|
                                            ParallelMath::Int16CompFlag anySame = ParallelMath::MakeBoolInt16(false);
 | 
						|
 | 
						|
                                            for (int prevRound = 0; prevRound < metaRound; prevRound++)
 | 
						|
                                            {
 | 
						|
                                                MAInt16(&prevRoundEPs)[2][3] = metaEndPointsQuantized[prevRound][subset];
 | 
						|
 | 
						|
                                                ParallelMath::Int16CompFlag same = ParallelMath::MakeBoolInt16(true);
 | 
						|
 | 
						|
                                                for (int epi = 0; epi < 2; epi++)
 | 
						|
                                                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                                                        same = (same & ParallelMath::Equal(prevRoundEPs[epi][ch], mrQuantizedEndPoints[subset][epi][ch]));
 | 
						|
 | 
						|
                                                anySame = (anySame | same);
 | 
						|
                                                if (ParallelMath::AllSet(anySame))
 | 
						|
                                                    break;
 | 
						|
                                            }
 | 
						|
 | 
						|
                                            if (ParallelMath::AllSet(anySame))
 | 
						|
                                            {
 | 
						|
                                                roundValid[metaRound][subset] = false;
 | 
						|
                                                continue;
 | 
						|
                                            }
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        MFloat subsetError = ParallelMath::MakeFloatZero();
 | 
						|
 | 
						|
                                        {
 | 
						|
                                            for (int px = 0; px < 16; px++)
 | 
						|
                                            {
 | 
						|
                                                if (subset != ((partitionMask >> px) & 1))
 | 
						|
                                                    continue;
 | 
						|
 | 
						|
                                                MUInt15 index;
 | 
						|
                                                if (px == fixupIndex)
 | 
						|
                                                    index = mrIndexes[px];
 | 
						|
                                                else
 | 
						|
                                                {
 | 
						|
                                                    index = fastIndexing ? indexSelector.SelectIndexHDRFast(floatPixels2CL[px], &rtn) : indexSelector.SelectIndexHDRSlow(floatPixelsLinearWeighted[px], &rtn);
 | 
						|
                                                    mrIndexes[px] = index;
 | 
						|
                                                }
 | 
						|
 | 
						|
                                                MSInt16 reconstructed[3];
 | 
						|
                                                if (isSigned)
 | 
						|
                                                    indexSelector.ReconstructHDRSigned(mrIndexes[px], reconstructed);
 | 
						|
                                                else
 | 
						|
                                                    indexSelector.ReconstructHDRUnsigned(mrIndexes[px], reconstructed);
 | 
						|
 | 
						|
                                                subsetError = subsetError + (fastIndexing ? BCCommon::ComputeErrorHDRFast<3>(flags, reconstructed, pixels[px], channelWeightsSq) : BCCommon::ComputeErrorHDRSlow<3>(flags, reconstructed, pixels[px], channelWeightsSq));
 | 
						|
 | 
						|
                                                if (refinePass != numRefineRounds - 1)
 | 
						|
                                                    refiners[subset].ContributeUnweightedPW(preWeightedPixels[px], index);
 | 
						|
                                            }
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        metaError[metaRound][subset] = subsetError;
 | 
						|
                                    }
 | 
						|
                                }
 | 
						|
                            }
 | 
						|
 | 
						|
                            // Now we have a bunch of attempts, but not all of them will fit in the delta coding scheme
 | 
						|
                            int numMeta1 = partitioned ? MaxMetaRounds : 1;
 | 
						|
                            for (int meta0 = 0; meta0 < MaxMetaRounds; meta0++)
 | 
						|
                            {
 | 
						|
                                if (!roundValid[meta0][0])
 | 
						|
                                    continue;
 | 
						|
 | 
						|
                                for (int meta1 = 0; meta1 < numMeta1; meta1++)
 | 
						|
                                {
 | 
						|
                                    MFloat combinedError = metaError[meta0][0];
 | 
						|
                                    if (partitioned)
 | 
						|
                                    {
 | 
						|
                                        if (!roundValid[meta1][1])
 | 
						|
                                            continue;
 | 
						|
 | 
						|
                                        combinedError = combinedError + metaError[meta1][1];
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, bestError);
 | 
						|
                                    if (!ParallelMath::AnySet(errorBetter))
 | 
						|
                                        continue;
 | 
						|
 | 
						|
                                    ParallelMath::Int16CompFlag needsCommit = ParallelMath::FloatFlagToInt16(errorBetter);
 | 
						|
 | 
						|
                                    // Figure out if this is encodable
 | 
						|
                                    for (int mode = 0; mode < BC7Data::g_numHDRModes; mode++)
 | 
						|
                                    {
 | 
						|
                                        const BC7Data::BC6HModeInfo &modeInfo = BC7Data::g_hdrModes[mode];
 | 
						|
 | 
						|
                                        if (modeInfo.m_partitioned != partitioned || modeInfo.m_aPrec != aPrec)
 | 
						|
                                            continue;
 | 
						|
 | 
						|
                                        MAInt16 encodedEPs[2][2][3];
 | 
						|
                                        ParallelMath::Int16CompFlag isLegal;
 | 
						|
                                        if (partitioned)
 | 
						|
                                            EvaluatePartitionedLegality(metaEndPointsQuantized[meta0][0], metaEndPointsQuantized[meta1][1], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs, isLegal);
 | 
						|
                                        else
 | 
						|
                                            EvaluateSingleLegality(metaEndPointsQuantized[meta0][0], modeInfo.m_aPrec, modeInfo.m_bPrec, modeInfo.m_transformed, encodedEPs[0], isLegal);
 | 
						|
 | 
						|
                                        ParallelMath::Int16CompFlag isLegalAndBetter = (ParallelMath::FloatFlagToInt16(errorBetter) & isLegal);
 | 
						|
                                        if (!ParallelMath::AnySet(isLegalAndBetter))
 | 
						|
                                            continue;
 | 
						|
 | 
						|
                                        ParallelMath::FloatCompFlag isLegalAndBetterFloat = ParallelMath::Int16FlagToFloat(isLegalAndBetter);
 | 
						|
 | 
						|
                                        ParallelMath::ConditionalSet(bestError, isLegalAndBetterFloat, combinedError);
 | 
						|
                                        ParallelMath::ConditionalSet(bestMode, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(mode)));
 | 
						|
                                        ParallelMath::ConditionalSet(bestPartition, isLegalAndBetter, ParallelMath::MakeUInt15(static_cast<uint16_t>(p)));
 | 
						|
 | 
						|
                                        for (int subset = 0; subset < numSubsets; subset++)
 | 
						|
                                        {
 | 
						|
                                            for (int epi = 0; epi < 2; epi++)
 | 
						|
                                            {
 | 
						|
                                                for (int ch = 0; ch < 3; ch++)
 | 
						|
                                                    ParallelMath::ConditionalSet(bestEndPoints[subset][epi][ch], isLegalAndBetter, encodedEPs[subset][epi][ch]);
 | 
						|
                                            }
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        for (int px = 0; px < 16; px++)
 | 
						|
                                        {
 | 
						|
                                            int subset = ((partitionMask >> px) & 1);
 | 
						|
                                            if (subset == 0)
 | 
						|
                                                ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta0][px]);
 | 
						|
                                            else
 | 
						|
                                                ParallelMath::ConditionalSet(bestIndexes[px], isLegalAndBetter, metaIndexes[meta1][px]);
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        needsCommit = ParallelMath::AndNot(needsCommit, isLegalAndBetter);
 | 
						|
                                        if (!ParallelMath::AnySet(needsCommit))
 | 
						|
                                            break;
 | 
						|
                                    }
 | 
						|
                                }
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                // At this point, everything should be set
 | 
						|
                for (int block = 0; block < ParallelMath::ParallelSize; block++)
 | 
						|
                {
 | 
						|
                    ParallelMath::ScalarUInt16 mode = ParallelMath::Extract(bestMode, block);
 | 
						|
                    ParallelMath::ScalarUInt16 partition = ParallelMath::Extract(bestPartition, block);
 | 
						|
                    int32_t eps[2][2][3];
 | 
						|
                    ParallelMath::ScalarUInt16 indexes[16];
 | 
						|
 | 
						|
                    const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
 | 
						|
 | 
						|
                    const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode];
 | 
						|
 | 
						|
                    const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
 | 
						|
 | 
						|
                    for (int subset = 0; subset < 2; subset++)
 | 
						|
                    {
 | 
						|
                        for (int epi = 0; epi < 2; epi++)
 | 
						|
                        {
 | 
						|
                            for (int ch = 0; ch < 3; ch++)
 | 
						|
                                eps[subset][epi][ch] = ParallelMath::Extract(bestEndPoints[subset][epi][ch], block);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                        indexes[px] = ParallelMath::Extract(bestIndexes[px], block);
 | 
						|
 | 
						|
                    uint16_t modeID = modeInfo.m_modeID;
 | 
						|
 | 
						|
                    PackingVector pv;
 | 
						|
                    pv.Init();
 | 
						|
 | 
						|
                    for (size_t i = 0; i < headerBits; i++)
 | 
						|
                    {
 | 
						|
                        int32_t codedValue = 0;
 | 
						|
                        switch (desc[i].m_eField)
 | 
						|
                        {
 | 
						|
                        case BC6HData::M:  codedValue = modeID; break;
 | 
						|
                        case BC6HData::D:  codedValue = partition; break;
 | 
						|
                        case BC6HData::RW: codedValue = eps[0][0][0]; break;
 | 
						|
                        case BC6HData::RX: codedValue = eps[0][1][0]; break;
 | 
						|
                        case BC6HData::RY: codedValue = eps[1][0][0]; break;
 | 
						|
                        case BC6HData::RZ: codedValue = eps[1][1][0]; break;
 | 
						|
                        case BC6HData::GW: codedValue = eps[0][0][1]; break;
 | 
						|
                        case BC6HData::GX: codedValue = eps[0][1][1]; break;
 | 
						|
                        case BC6HData::GY: codedValue = eps[1][0][1]; break;
 | 
						|
                        case BC6HData::GZ: codedValue = eps[1][1][1]; break;
 | 
						|
                        case BC6HData::BW: codedValue = eps[0][0][2]; break;
 | 
						|
                        case BC6HData::BX: codedValue = eps[0][1][2]; break;
 | 
						|
                        case BC6HData::BY: codedValue = eps[1][0][2]; break;
 | 
						|
                        case BC6HData::BZ: codedValue = eps[1][1][2]; break;
 | 
						|
                        default: assert(false); break;
 | 
						|
                        }
 | 
						|
 | 
						|
                        pv.Pack(static_cast<uint16_t>((codedValue >> desc[i].m_uBit) & 1), 1);
 | 
						|
                    }
 | 
						|
 | 
						|
                    int fixupIndex1 = 0;
 | 
						|
                    int indexBits = 4;
 | 
						|
                    if (modeInfo.m_partitioned)
 | 
						|
                    {
 | 
						|
                        fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
 | 
						|
                        indexBits = 3;
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                    {
 | 
						|
                        ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[px], block);
 | 
						|
                        if (px == 0 || px == fixupIndex1)
 | 
						|
                            pv.Pack(index, indexBits - 1);
 | 
						|
                        else
 | 
						|
                            pv.Pack(index, indexBits);
 | 
						|
                    }
 | 
						|
 | 
						|
                    pv.Flush(packedBlocks + 16 * block);
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void SignExtendSingle(int &v, int bits)
 | 
						|
            {
 | 
						|
                if (v & (1 << (bits - 1)))
 | 
						|
                    v |= -(1 << bits);
 | 
						|
            }
 | 
						|
 | 
						|
            static void UnpackOne(PixelBlockF16 &output, const uint8_t *pBC, bool isSigned)
 | 
						|
            {
 | 
						|
                UnpackingVector pv;
 | 
						|
                pv.Init(pBC);
 | 
						|
 | 
						|
                int numModeBits = 2;
 | 
						|
                int modeBits = pv.Unpack(2);
 | 
						|
                if (modeBits != 0 && modeBits != 1)
 | 
						|
                {
 | 
						|
                    modeBits |= pv.Unpack(3) << 2;
 | 
						|
                    numModeBits += 3;
 | 
						|
                }
 | 
						|
 | 
						|
                int mode = -1;
 | 
						|
                for (int possibleMode = 0; possibleMode < BC7Data::g_numHDRModes; possibleMode++)
 | 
						|
                {
 | 
						|
                    if (BC7Data::g_hdrModes[possibleMode].m_modeID == modeBits)
 | 
						|
                    {
 | 
						|
                        mode = possibleMode;
 | 
						|
                        break;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                if (mode < 0)
 | 
						|
                {
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                            output.m_pixels[px][ch] = 0;
 | 
						|
                        output.m_pixels[px][3] = 0x3c00;	// 1.0
 | 
						|
                    }
 | 
						|
                    return;
 | 
						|
                }
 | 
						|
 | 
						|
                const BC7Data::BC6HModeInfo& modeInfo = BC7Data::g_hdrModes[mode];
 | 
						|
                const size_t headerBits = modeInfo.m_partitioned ? 82 : 65;
 | 
						|
                const BC6HData::ModeDescriptor* desc = BC6HData::g_modeDescriptors[mode];
 | 
						|
 | 
						|
                int32_t partition = 0;
 | 
						|
                int32_t eps[2][2][3];
 | 
						|
 | 
						|
                for (int subset = 0; subset < 2; subset++)
 | 
						|
                    for (int epi = 0; epi < 2; epi++)
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                            eps[subset][epi][ch] = 0;
 | 
						|
 | 
						|
                for (size_t i = numModeBits; i < headerBits; i++)
 | 
						|
                {
 | 
						|
                    int32_t *pCodedValue = NULL;
 | 
						|
 | 
						|
                    switch (desc[i].m_eField)
 | 
						|
                    {
 | 
						|
                    case BC6HData::D:  pCodedValue = &partition; break;
 | 
						|
                    case BC6HData::RW: pCodedValue = &eps[0][0][0]; break;
 | 
						|
                    case BC6HData::RX: pCodedValue = &eps[0][1][0]; break;
 | 
						|
                    case BC6HData::RY: pCodedValue = &eps[1][0][0]; break;
 | 
						|
                    case BC6HData::RZ: pCodedValue = &eps[1][1][0]; break;
 | 
						|
                    case BC6HData::GW: pCodedValue = &eps[0][0][1]; break;
 | 
						|
                    case BC6HData::GX: pCodedValue = &eps[0][1][1]; break;
 | 
						|
                    case BC6HData::GY: pCodedValue = &eps[1][0][1]; break;
 | 
						|
                    case BC6HData::GZ: pCodedValue = &eps[1][1][1]; break;
 | 
						|
                    case BC6HData::BW: pCodedValue = &eps[0][0][2]; break;
 | 
						|
                    case BC6HData::BX: pCodedValue = &eps[0][1][2]; break;
 | 
						|
                    case BC6HData::BY: pCodedValue = &eps[1][0][2]; break;
 | 
						|
                    case BC6HData::BZ: pCodedValue = &eps[1][1][2]; break;
 | 
						|
                    default: assert(false); break;
 | 
						|
                    }
 | 
						|
 | 
						|
                    (*pCodedValue) |= pv.Unpack(1) << desc[i].m_uBit;
 | 
						|
                }
 | 
						|
 | 
						|
 | 
						|
                uint16_t modeID = modeInfo.m_modeID;
 | 
						|
 | 
						|
                int fixupIndex1 = 0;
 | 
						|
                int indexBits = 4;
 | 
						|
                int numSubsets = 1;
 | 
						|
                if (modeInfo.m_partitioned)
 | 
						|
                {
 | 
						|
                    fixupIndex1 = BC7Data::g_fixupIndexes2[partition];
 | 
						|
                    indexBits = 3;
 | 
						|
                    numSubsets = 2;
 | 
						|
                }
 | 
						|
 | 
						|
                int indexes[16];
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    if (px == 0 || px == fixupIndex1)
 | 
						|
                        indexes[px] = pv.Unpack(indexBits - 1);
 | 
						|
                    else
 | 
						|
                        indexes[px] = pv.Unpack(indexBits);
 | 
						|
                }
 | 
						|
 | 
						|
                if (modeInfo.m_partitioned)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                    {
 | 
						|
                        if (isSigned)
 | 
						|
                            SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
 | 
						|
                        if (modeInfo.m_transformed || isSigned)
 | 
						|
                        {
 | 
						|
                            SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
 | 
						|
                            SignExtendSingle(eps[1][0][ch], modeInfo.m_bPrec[ch]);
 | 
						|
                            SignExtendSingle(eps[1][1][ch], modeInfo.m_bPrec[ch]);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                    {
 | 
						|
                        if (isSigned)
 | 
						|
                            SignExtendSingle(eps[0][0][ch], modeInfo.m_aPrec);
 | 
						|
                        if (modeInfo.m_transformed || isSigned)
 | 
						|
                            SignExtendSingle(eps[0][1][ch], modeInfo.m_bPrec[ch]);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                int aPrec = modeInfo.m_aPrec;
 | 
						|
 | 
						|
                if (modeInfo.m_transformed)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                    {
 | 
						|
                        int wrapMask = (1 << aPrec) - 1;
 | 
						|
 | 
						|
                        eps[0][1][ch] = ((eps[0][0][ch] + eps[0][1][ch]) & wrapMask);
 | 
						|
                        if (isSigned)
 | 
						|
                            SignExtendSingle(eps[0][1][ch], aPrec);
 | 
						|
 | 
						|
                        if (modeInfo.m_partitioned)
 | 
						|
                        {
 | 
						|
                            eps[1][0][ch] = ((eps[0][0][ch] + eps[1][0][ch]) & wrapMask);
 | 
						|
                            eps[1][1][ch] = ((eps[0][0][ch] + eps[1][1][ch]) & wrapMask);
 | 
						|
 | 
						|
                            if (isSigned)
 | 
						|
                            {
 | 
						|
                                SignExtendSingle(eps[1][0][ch], aPrec);
 | 
						|
                                SignExtendSingle(eps[1][1][ch], aPrec);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                // Unquantize endpoints
 | 
						|
                for (int subset = 0; subset < numSubsets; subset++)
 | 
						|
                {
 | 
						|
                    for (int epi = 0; epi < 2; epi++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                        {
 | 
						|
                            int &v = eps[subset][epi][ch];
 | 
						|
 | 
						|
                            if (isSigned)
 | 
						|
                            {
 | 
						|
                                if (aPrec >= 16)
 | 
						|
                                {
 | 
						|
                                    // Nothing
 | 
						|
                                }
 | 
						|
                                else
 | 
						|
                                {
 | 
						|
                                    bool s = false;
 | 
						|
                                    int comp = v;
 | 
						|
                                    if (v < 0)
 | 
						|
                                    {
 | 
						|
                                        s = true;
 | 
						|
                                        comp = -comp;
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    int unq = 0;
 | 
						|
                                    if (comp == 0)
 | 
						|
                                        unq = 0;
 | 
						|
                                    else if (comp >= ((1 << (aPrec - 1)) - 1))
 | 
						|
                                        unq = 0x7fff;
 | 
						|
                                    else
 | 
						|
                                        unq = ((comp << 15) + 0x4000) >> (aPrec - 1);
 | 
						|
 | 
						|
                                    if (s)
 | 
						|
                                        unq = -unq;
 | 
						|
 | 
						|
                                    v = unq;
 | 
						|
                                }
 | 
						|
                            }
 | 
						|
                            else
 | 
						|
                            {
 | 
						|
                                if (aPrec >= 15)
 | 
						|
                                {
 | 
						|
                                    // Nothing
 | 
						|
                                }
 | 
						|
                                else if (v == 0)
 | 
						|
                                {
 | 
						|
                                    // Nothing
 | 
						|
                                }
 | 
						|
                                else if (v == ((1 << aPrec) - 1))
 | 
						|
                                    v = 0xffff;
 | 
						|
                                else
 | 
						|
                                    v = ((v << 16) + 0x8000) >> aPrec;
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                const int *weights = BC7Data::g_weightTables[indexBits];
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    int subset = 0;
 | 
						|
                    if (modeInfo.m_partitioned)
 | 
						|
                        subset = (BC7Data::g_partitionMap[partition] >> px) & 1;
 | 
						|
 | 
						|
                    int w = weights[indexes[px]];
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                    {
 | 
						|
                        int comp = ((64 - w) * eps[subset][0][ch] + w * eps[subset][1][ch] + 32) >> 6;
 | 
						|
 | 
						|
                        if (isSigned)
 | 
						|
                        {
 | 
						|
                            if (comp < 0)
 | 
						|
                                comp = -(((-comp) * 31) >> 5);
 | 
						|
                            else
 | 
						|
                                comp = (comp * 31) >> 5;
 | 
						|
 | 
						|
                            int s = 0;
 | 
						|
                            if (comp < 0)
 | 
						|
                            {
 | 
						|
                                s = 0x8000;
 | 
						|
                                comp = -comp;
 | 
						|
                            }
 | 
						|
 | 
						|
                            output.m_pixels[px][ch] = static_cast<uint16_t>(s | comp);
 | 
						|
                        }
 | 
						|
                        else
 | 
						|
                        {
 | 
						|
                            comp = (comp * 31) >> 6;
 | 
						|
                            output.m_pixels[px][ch] = static_cast<uint16_t>(comp);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                    output.m_pixels[px][3] = 0x3c00;	// 1.0
 | 
						|
                }
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        namespace S3TCSingleColorTables
 | 
						|
        {
 | 
						|
            struct SingleColorTableEntry
 | 
						|
            {
 | 
						|
                uint8_t m_min;
 | 
						|
                uint8_t m_max;
 | 
						|
                uint8_t m_actualColor;
 | 
						|
                uint8_t m_span;
 | 
						|
            };
 | 
						|
 | 
						|
            SingleColorTableEntry g_singleColor5_3[256] =
 | 
						|
            {
 | 
						|
                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 },
 | 
						|
                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 },
 | 
						|
                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 },
 | 
						|
                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 },
 | 
						|
                { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 },
 | 
						|
                { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 },
 | 
						|
                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 },
 | 
						|
                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 },
 | 
						|
                { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 },
 | 
						|
                { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 },
 | 
						|
                { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 },
 | 
						|
                { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 },
 | 
						|
                { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 },
 | 
						|
                { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 },
 | 
						|
                { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 },
 | 
						|
                { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 },
 | 
						|
                { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 },
 | 
						|
                { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 },
 | 
						|
                { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 },
 | 
						|
                { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 },
 | 
						|
                { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 },
 | 
						|
                { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 },
 | 
						|
                { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 },
 | 
						|
                { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 },
 | 
						|
                { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
 | 
						|
                { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
 | 
						|
                { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
 | 
						|
                { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
 | 
						|
                { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 },
 | 
						|
                { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 },
 | 
						|
                { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
 | 
						|
                { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
 | 
						|
            };
 | 
						|
 | 
						|
            SingleColorTableEntry g_singleColor6_3[256] =
 | 
						|
            {
 | 
						|
                { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
 | 
						|
                { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
 | 
						|
                { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 69, 0, 23, 69 },
 | 
						|
                { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 65, 8, 27, 57 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 69, 12, 31, 57 },
 | 
						|
                { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 65, 20, 35, 45 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 69, 24, 39, 45 },
 | 
						|
                { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 },
 | 
						|
                { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 },
 | 
						|
                { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 },
 | 
						|
                { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 },
 | 
						|
                { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 },
 | 
						|
                { 56, 93, 80, 37 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 60, 97, 84, 37 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 },
 | 
						|
                { 56, 105, 88, 49 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 60, 109, 92, 49 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 },
 | 
						|
                { 134, 77, 96, 57 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 130, 85, 100, 45 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 },
 | 
						|
                { 134, 89, 104, 45 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 },
 | 
						|
                { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 },
 | 
						|
                { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 },
 | 
						|
                { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 },
 | 
						|
                { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 },
 | 
						|
                { 142, 146, 144, 4 }, { 121, 158, 145, 37 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 125, 162, 149, 37 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 },
 | 
						|
                { 150, 154, 152, 4 }, { 121, 170, 153, 49 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 125, 174, 157, 49 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 },
 | 
						|
                { 158, 162, 160, 4 }, { 199, 142, 161, 57 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 195, 150, 165, 45 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 },
 | 
						|
                { 166, 170, 168, 4 }, { 199, 154, 169, 45 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 },
 | 
						|
                { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 },
 | 
						|
                { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 },
 | 
						|
                { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 },
 | 
						|
                { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 },
 | 
						|
                { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 186, 223, 210, 37 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 190, 227, 214, 37 }, { 215, 215, 215, 0 },
 | 
						|
                { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 186, 235, 218, 49 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 190, 239, 222, 49 }, { 223, 223, 223, 0 },
 | 
						|
                { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 186, 247, 226, 61 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 190, 251, 230, 61 }, { 231, 231, 231, 0 },
 | 
						|
                { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
 | 
						|
                { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
 | 
						|
                { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
 | 
						|
            };
 | 
						|
 | 
						|
            SingleColorTableEntry g_singleColor5_2[256] =
 | 
						|
            {
 | 
						|
                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
 | 
						|
                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
 | 
						|
                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
 | 
						|
                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
 | 
						|
                { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
 | 
						|
                { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
 | 
						|
                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
 | 
						|
                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
 | 
						|
                { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
 | 
						|
                { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
 | 
						|
                { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
 | 
						|
                { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
 | 
						|
                { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
 | 
						|
                { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
 | 
						|
                { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
 | 
						|
                { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
 | 
						|
                { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
 | 
						|
                { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
 | 
						|
                { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
 | 
						|
                { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
 | 
						|
                { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
 | 
						|
                { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
 | 
						|
                { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
 | 
						|
                { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
 | 
						|
                { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
 | 
						|
                { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
 | 
						|
                { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
 | 
						|
                { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
 | 
						|
                { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
 | 
						|
                { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
 | 
						|
                { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
 | 
						|
                { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
 | 
						|
            };
 | 
						|
 | 
						|
            SingleColorTableEntry g_singleColor6_2[256] =
 | 
						|
            {
 | 
						|
                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
 | 
						|
                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
 | 
						|
                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
 | 
						|
                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
 | 
						|
                { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
 | 
						|
                { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
 | 
						|
                { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
 | 
						|
                { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
 | 
						|
                { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
 | 
						|
                { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 60, 97, 78, 37 }, { 77, 81, 79, 4 },
 | 
						|
                { 60, 101, 80, 41 }, { 81, 81, 81, 0 }, { 60, 105, 82, 45 }, { 81, 85, 83, 4 }, { 60, 109, 84, 49 }, { 85, 85, 85, 0 }, { 60, 113, 86, 53 }, { 85, 89, 87, 4 },
 | 
						|
                { 60, 117, 88, 57 }, { 89, 89, 89, 0 }, { 60, 121, 90, 61 }, { 89, 93, 91, 4 }, { 60, 125, 92, 65 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
 | 
						|
                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
 | 
						|
                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
 | 
						|
                { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
 | 
						|
                { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
 | 
						|
                { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
 | 
						|
                { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 125, 162, 143, 37 },
 | 
						|
                { 142, 146, 144, 4 }, { 125, 166, 145, 41 }, { 146, 146, 146, 0 }, { 125, 170, 147, 45 }, { 146, 150, 148, 4 }, { 125, 174, 149, 49 }, { 150, 150, 150, 0 }, { 125, 178, 151, 53 },
 | 
						|
                { 150, 154, 152, 4 }, { 125, 182, 153, 57 }, { 154, 154, 154, 0 }, { 125, 186, 155, 61 }, { 154, 158, 156, 4 }, { 125, 190, 157, 65 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
 | 
						|
                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
 | 
						|
                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
 | 
						|
                { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
 | 
						|
                { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
 | 
						|
                { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
 | 
						|
                { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
 | 
						|
                { 190, 227, 208, 37 }, { 207, 211, 209, 4 }, { 190, 231, 210, 41 }, { 211, 211, 211, 0 }, { 190, 235, 212, 45 }, { 211, 215, 213, 4 }, { 190, 239, 214, 49 }, { 215, 215, 215, 0 },
 | 
						|
                { 190, 243, 216, 53 }, { 215, 219, 217, 4 }, { 190, 247, 218, 57 }, { 219, 219, 219, 0 }, { 190, 251, 220, 61 }, { 219, 223, 221, 4 }, { 190, 255, 222, 65 }, { 223, 223, 223, 0 },
 | 
						|
                { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
 | 
						|
                { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
 | 
						|
                { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
 | 
						|
                { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
 | 
						|
            };
 | 
						|
 | 
						|
            SingleColorTableEntry g_singleColor5_3_p[256] =
 | 
						|
            {
 | 
						|
                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 8, 0, 2, 8 }, { 8, 0, 2, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 0, 8, 5, 8 }, { 8, 8, 8, 0 },
 | 
						|
                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 16, 8, 10, 8 }, { 33, 0, 11, 33 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 8, 16, 13, 8 }, { 16, 16, 16, 0 },
 | 
						|
                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 24, 16, 18, 8 }, { 41, 8, 19, 33 }, { 16, 24, 21, 8 }, { 16, 24, 21, 8 }, { 0, 33, 22, 33 }, { 24, 24, 24, 0 },
 | 
						|
                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 33, 24, 27, 9 }, { 41, 24, 29, 17 }, { 24, 33, 30, 9 }, { 24, 33, 30, 9 },
 | 
						|
                { 16, 41, 32, 25 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 41, 33, 35, 8 }, { 41, 33, 35, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 }, { 33, 41, 38, 8 },
 | 
						|
                { 24, 49, 40, 25 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 49, 41, 43, 8 }, { 66, 33, 44, 33 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 }, { 41, 49, 46, 8 },
 | 
						|
                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 57, 49, 51, 8 }, { 74, 41, 52, 33 }, { 49, 57, 54, 8 }, { 49, 57, 54, 8 }, { 33, 66, 55, 33 },
 | 
						|
                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 66, 57, 60, 9 }, { 74, 57, 62, 17 }, { 57, 66, 63, 9 },
 | 
						|
                { 57, 66, 63, 9 }, { 49, 74, 65, 25 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 74, 66, 68, 8 }, { 74, 66, 68, 8 }, { 66, 74, 71, 8 }, { 66, 74, 71, 8 },
 | 
						|
                { 66, 74, 71, 8 }, { 57, 82, 73, 25 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 82, 74, 76, 8 }, { 99, 66, 77, 33 }, { 74, 82, 79, 8 }, { 74, 82, 79, 8 },
 | 
						|
                { 74, 82, 79, 8 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 90, 82, 84, 8 }, { 107, 74, 85, 33 }, { 82, 90, 87, 8 }, { 82, 90, 87, 8 },
 | 
						|
                { 66, 99, 88, 33 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 99, 90, 93, 9 }, { 107, 90, 95, 17 },
 | 
						|
                { 90, 99, 96, 9 }, { 90, 99, 96, 9 }, { 82, 107, 98, 25 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 107, 99, 101, 8 }, { 107, 99, 101, 8 }, { 99, 107, 104, 8 },
 | 
						|
                { 99, 107, 104, 8 }, { 99, 107, 104, 8 }, { 90, 115, 106, 25 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 115, 107, 109, 8 }, { 132, 99, 110, 33 }, { 107, 115, 112, 8 },
 | 
						|
                { 107, 115, 112, 8 }, { 107, 115, 112, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 123, 115, 117, 8 }, { 140, 107, 118, 33 }, { 115, 123, 120, 8 },
 | 
						|
                { 115, 123, 120, 8 }, { 99, 132, 121, 33 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 }, { 132, 123, 126, 9 },
 | 
						|
                { 140, 123, 128, 17 }, { 123, 132, 129, 9 }, { 123, 132, 129, 9 }, { 115, 140, 131, 25 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 140, 132, 134, 8 }, { 140, 132, 134, 8 },
 | 
						|
                { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 132, 140, 137, 8 }, { 123, 148, 139, 25 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 148, 140, 142, 8 }, { 165, 132, 143, 33 },
 | 
						|
                { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 140, 148, 145, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 156, 148, 150, 8 }, { 173, 140, 151, 33 },
 | 
						|
                { 148, 156, 153, 8 }, { 148, 156, 153, 8 }, { 132, 165, 154, 33 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 165, 156, 159, 9 }, { 165, 156, 159, 9 },
 | 
						|
                { 165, 156, 159, 9 }, { 173, 156, 161, 17 }, { 156, 165, 162, 9 }, { 156, 165, 162, 9 }, { 148, 173, 164, 25 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 173, 165, 167, 8 },
 | 
						|
                { 173, 165, 167, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 165, 173, 170, 8 }, { 156, 181, 172, 25 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 181, 173, 175, 8 },
 | 
						|
                { 198, 165, 176, 33 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 173, 181, 178, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 189, 181, 183, 8 },
 | 
						|
                { 206, 173, 184, 33 }, { 181, 189, 186, 8 }, { 181, 189, 186, 8 }, { 165, 198, 187, 33 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 198, 189, 192, 9 },
 | 
						|
                { 198, 189, 192, 9 }, { 198, 189, 192, 9 }, { 206, 189, 194, 17 }, { 189, 198, 195, 9 }, { 189, 198, 195, 9 }, { 181, 206, 197, 25 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
 | 
						|
                { 206, 198, 200, 8 }, { 206, 198, 200, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 198, 206, 203, 8 }, { 189, 214, 205, 25 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
 | 
						|
                { 214, 206, 208, 8 }, { 231, 198, 209, 33 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 206, 214, 211, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
 | 
						|
                { 222, 214, 216, 8 }, { 239, 206, 217, 33 }, { 214, 222, 219, 8 }, { 214, 222, 219, 8 }, { 198, 231, 220, 33 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
 | 
						|
                { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 231, 222, 225, 9 }, { 239, 222, 227, 17 }, { 222, 231, 228, 9 }, { 222, 231, 228, 9 }, { 214, 239, 230, 25 }, { 231, 231, 231, 0 },
 | 
						|
                { 231, 231, 231, 0 }, { 239, 231, 233, 8 }, { 239, 231, 233, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 231, 239, 236, 8 }, { 222, 247, 238, 25 }, { 239, 239, 239, 0 },
 | 
						|
                { 239, 239, 239, 0 }, { 247, 239, 241, 8 }, { 247, 239, 241, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 239, 247, 244, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
 | 
						|
                { 247, 247, 247, 0 }, { 255, 247, 249, 8 }, { 255, 247, 249, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 247, 255, 252, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
 | 
						|
            };
 | 
						|
 | 
						|
            SingleColorTableEntry g_singleColor6_3_p[256] =
 | 
						|
            {
 | 
						|
                { 0, 0, 0, 0 }, { 4, 0, 1, 4 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 8, 4, 5, 4 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
 | 
						|
                { 8, 8, 8, 0 }, { 12, 8, 9, 4 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 16, 12, 13, 4 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
 | 
						|
                { 16, 16, 16, 0 }, { 20, 16, 17, 4 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 24, 20, 21, 4 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
 | 
						|
                { 24, 24, 24, 0 }, { 28, 24, 25, 4 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 32, 28, 29, 4 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
 | 
						|
                { 32, 32, 32, 0 }, { 36, 32, 33, 4 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 40, 36, 37, 4 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
 | 
						|
                { 40, 40, 40, 0 }, { 44, 40, 41, 4 }, { 40, 44, 42, 4 }, { 65, 32, 43, 33 }, { 44, 44, 44, 0 }, { 48, 44, 45, 4 }, { 44, 48, 46, 4 }, { 69, 36, 47, 33 },
 | 
						|
                { 48, 48, 48, 0 }, { 52, 48, 49, 4 }, { 48, 52, 50, 4 }, { 65, 44, 51, 21 }, { 52, 52, 52, 0 }, { 56, 52, 53, 4 }, { 52, 56, 54, 4 }, { 69, 48, 55, 21 },
 | 
						|
                { 56, 56, 56, 0 }, { 60, 56, 57, 4 }, { 56, 60, 58, 4 }, { 65, 56, 59, 9 }, { 60, 60, 60, 0 }, { 65, 60, 61, 5 }, { 56, 65, 62, 9 }, { 60, 65, 63, 5 },
 | 
						|
                { 56, 69, 64, 13 }, { 65, 65, 65, 0 }, { 69, 65, 66, 4 }, { 65, 69, 67, 4 }, { 60, 73, 68, 13 }, { 69, 69, 69, 0 }, { 73, 69, 70, 4 }, { 69, 73, 71, 4 },
 | 
						|
                { 56, 81, 72, 25 }, { 73, 73, 73, 0 }, { 77, 73, 74, 4 }, { 73, 77, 75, 4 }, { 60, 85, 76, 25 }, { 77, 77, 77, 0 }, { 81, 77, 78, 4 }, { 77, 81, 79, 4 },
 | 
						|
                { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 85, 81, 82, 4 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 89, 85, 86, 4 }, { 85, 89, 87, 4 },
 | 
						|
                { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 93, 89, 90, 4 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 97, 93, 94, 4 }, { 93, 97, 95, 4 },
 | 
						|
                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 101, 97, 98, 4 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 105, 101, 102, 4 }, { 101, 105, 103, 4 },
 | 
						|
                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 109, 105, 106, 4 }, { 105, 109, 107, 4 }, { 130, 97, 108, 33 }, { 109, 109, 109, 0 }, { 113, 109, 110, 4 }, { 109, 113, 111, 4 },
 | 
						|
                { 134, 101, 112, 33 }, { 113, 113, 113, 0 }, { 117, 113, 114, 4 }, { 113, 117, 115, 4 }, { 130, 109, 116, 21 }, { 117, 117, 117, 0 }, { 121, 117, 118, 4 }, { 117, 121, 119, 4 },
 | 
						|
                { 134, 113, 120, 21 }, { 121, 121, 121, 0 }, { 125, 121, 122, 4 }, { 121, 125, 123, 4 }, { 130, 121, 124, 9 }, { 125, 125, 125, 0 }, { 130, 125, 126, 5 }, { 121, 130, 127, 9 },
 | 
						|
                { 125, 130, 128, 5 }, { 121, 134, 129, 13 }, { 130, 130, 130, 0 }, { 134, 130, 131, 4 }, { 130, 134, 132, 4 }, { 125, 138, 133, 13 }, { 134, 134, 134, 0 }, { 138, 134, 135, 4 },
 | 
						|
                { 134, 138, 136, 4 }, { 121, 146, 137, 25 }, { 138, 138, 138, 0 }, { 142, 138, 139, 4 }, { 138, 142, 140, 4 }, { 125, 150, 141, 25 }, { 142, 142, 142, 0 }, { 146, 142, 143, 4 },
 | 
						|
                { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 150, 146, 147, 4 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 154, 150, 151, 4 },
 | 
						|
                { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 158, 154, 155, 4 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 162, 158, 159, 4 },
 | 
						|
                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 166, 162, 163, 4 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 170, 166, 167, 4 },
 | 
						|
                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 174, 170, 171, 4 }, { 170, 174, 172, 4 }, { 195, 162, 173, 33 }, { 174, 174, 174, 0 }, { 178, 174, 175, 4 },
 | 
						|
                { 174, 178, 176, 4 }, { 199, 166, 177, 33 }, { 178, 178, 178, 0 }, { 182, 178, 179, 4 }, { 178, 182, 180, 4 }, { 195, 174, 181, 21 }, { 182, 182, 182, 0 }, { 186, 182, 183, 4 },
 | 
						|
                { 182, 186, 184, 4 }, { 199, 178, 185, 21 }, { 186, 186, 186, 0 }, { 190, 186, 187, 4 }, { 186, 190, 188, 4 }, { 195, 186, 189, 9 }, { 190, 190, 190, 0 }, { 195, 190, 191, 5 },
 | 
						|
                { 186, 195, 192, 9 }, { 190, 195, 193, 5 }, { 186, 199, 194, 13 }, { 195, 195, 195, 0 }, { 199, 195, 196, 4 }, { 195, 199, 197, 4 }, { 190, 203, 198, 13 }, { 199, 199, 199, 0 },
 | 
						|
                { 203, 199, 200, 4 }, { 199, 203, 201, 4 }, { 186, 211, 202, 25 }, { 203, 203, 203, 0 }, { 207, 203, 204, 4 }, { 203, 207, 205, 4 }, { 190, 215, 206, 25 }, { 207, 207, 207, 0 },
 | 
						|
                { 211, 207, 208, 4 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 215, 211, 212, 4 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
 | 
						|
                { 219, 215, 216, 4 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 223, 219, 220, 4 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
 | 
						|
                { 227, 223, 224, 4 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 231, 227, 228, 4 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
 | 
						|
                { 235, 231, 232, 4 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 239, 235, 236, 4 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
 | 
						|
                { 243, 239, 240, 4 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 247, 243, 244, 4 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
 | 
						|
                { 251, 247, 248, 4 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 255, 251, 252, 4 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
 | 
						|
            };
 | 
						|
 | 
						|
            SingleColorTableEntry g_singleColor5_2_p[256] =
 | 
						|
            {
 | 
						|
                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 0, 8, 4, 8 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 },
 | 
						|
                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 8, 16, 12, 8 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 },
 | 
						|
                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 16, 24, 20, 8 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 },
 | 
						|
                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 33, 28, 9 }, { 24, 41, 32, 17 },
 | 
						|
                { 24, 41, 32, 17 }, { 33, 33, 33, 0 }, { 33, 33, 33, 0 }, { 24, 49, 36, 25 }, { 24, 49, 36, 25 }, { 33, 41, 37, 8 }, { 33, 41, 37, 8 }, { 24, 57, 40, 33 },
 | 
						|
                { 24, 57, 40, 33 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 41, 41, 0 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 41, 49, 45, 8 }, { 49, 49, 49, 0 },
 | 
						|
                { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 49, 49, 0 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 49, 57, 53, 8 }, { 57, 57, 57, 0 },
 | 
						|
                { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 57, 57, 0 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 }, { 57, 66, 61, 9 },
 | 
						|
                { 57, 74, 65, 17 }, { 57, 74, 65, 17 }, { 66, 66, 66, 0 }, { 66, 66, 66, 0 }, { 57, 82, 69, 25 }, { 57, 82, 69, 25 }, { 66, 74, 70, 8 }, { 66, 74, 70, 8 },
 | 
						|
                { 57, 90, 73, 33 }, { 57, 90, 73, 33 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 74, 74, 0 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 }, { 74, 82, 78, 8 },
 | 
						|
                { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 82, 82, 0 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 }, { 82, 90, 86, 8 },
 | 
						|
                { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 90, 90, 0 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 }, { 90, 99, 94, 9 },
 | 
						|
                { 90, 99, 94, 9 }, { 90, 107, 98, 17 }, { 90, 107, 98, 17 }, { 99, 99, 99, 0 }, { 99, 99, 99, 0 }, { 90, 115, 102, 25 }, { 90, 115, 102, 25 }, { 99, 107, 103, 8 },
 | 
						|
                { 99, 107, 103, 8 }, { 90, 123, 106, 33 }, { 90, 123, 106, 33 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 107, 107, 0 }, { 107, 115, 111, 8 }, { 107, 115, 111, 8 },
 | 
						|
                { 107, 115, 111, 8 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 115, 115, 0 }, { 115, 123, 119, 8 }, { 115, 123, 119, 8 },
 | 
						|
                { 115, 123, 119, 8 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 123, 123, 0 }, { 123, 132, 127, 9 }, { 123, 132, 127, 9 },
 | 
						|
                { 123, 132, 127, 9 }, { 123, 132, 127, 9 }, { 123, 140, 131, 17 }, { 123, 140, 131, 17 }, { 132, 132, 132, 0 }, { 132, 132, 132, 0 }, { 123, 148, 135, 25 }, { 123, 148, 135, 25 },
 | 
						|
                { 132, 140, 136, 8 }, { 132, 140, 136, 8 }, { 123, 156, 139, 33 }, { 123, 156, 139, 33 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 140, 140, 0 }, { 140, 148, 144, 8 },
 | 
						|
                { 140, 148, 144, 8 }, { 140, 148, 144, 8 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 148, 148, 0 }, { 148, 156, 152, 8 },
 | 
						|
                { 148, 156, 152, 8 }, { 148, 156, 152, 8 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 156, 156, 0 }, { 156, 165, 160, 9 },
 | 
						|
                { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 165, 160, 9 }, { 156, 173, 164, 17 }, { 156, 173, 164, 17 }, { 165, 165, 165, 0 }, { 165, 165, 165, 0 }, { 156, 181, 168, 25 },
 | 
						|
                { 156, 181, 168, 25 }, { 165, 173, 169, 8 }, { 165, 173, 169, 8 }, { 156, 189, 172, 33 }, { 156, 189, 172, 33 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 }, { 173, 173, 173, 0 },
 | 
						|
                { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 173, 181, 177, 8 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 }, { 181, 181, 181, 0 },
 | 
						|
                { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 181, 189, 185, 8 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 }, { 189, 189, 189, 0 },
 | 
						|
                { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 198, 193, 9 }, { 189, 206, 197, 17 }, { 189, 206, 197, 17 }, { 198, 198, 198, 0 }, { 198, 198, 198, 0 },
 | 
						|
                { 189, 214, 201, 25 }, { 189, 214, 201, 25 }, { 198, 206, 202, 8 }, { 198, 206, 202, 8 }, { 189, 222, 205, 33 }, { 189, 222, 205, 33 }, { 206, 206, 206, 0 }, { 206, 206, 206, 0 },
 | 
						|
                { 206, 206, 206, 0 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 206, 214, 210, 8 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 }, { 214, 214, 214, 0 },
 | 
						|
                { 214, 214, 214, 0 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 214, 222, 218, 8 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 }, { 222, 222, 222, 0 },
 | 
						|
                { 222, 222, 222, 0 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 231, 226, 9 }, { 222, 239, 230, 17 }, { 222, 239, 230, 17 }, { 231, 231, 231, 0 },
 | 
						|
                { 231, 231, 231, 0 }, { 222, 247, 234, 25 }, { 222, 247, 234, 25 }, { 231, 239, 235, 8 }, { 231, 239, 235, 8 }, { 222, 255, 238, 33 }, { 222, 255, 238, 33 }, { 239, 239, 239, 0 },
 | 
						|
                { 239, 239, 239, 0 }, { 239, 239, 239, 0 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 239, 247, 243, 8 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
 | 
						|
                { 247, 247, 247, 0 }, { 247, 247, 247, 0 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 247, 255, 251, 8 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
 | 
						|
            };
 | 
						|
 | 
						|
            SingleColorTableEntry g_singleColor6_2_p[256] =
 | 
						|
            {
 | 
						|
                { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 4, 2, 4 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 4, 4, 0 }, { 4, 8, 6, 4 }, { 8, 8, 8, 0 },
 | 
						|
                { 8, 8, 8, 0 }, { 8, 8, 8, 0 }, { 8, 12, 10, 4 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 12, 12, 0 }, { 12, 16, 14, 4 }, { 16, 16, 16, 0 },
 | 
						|
                { 16, 16, 16, 0 }, { 16, 16, 16, 0 }, { 16, 20, 18, 4 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 20, 20, 0 }, { 20, 24, 22, 4 }, { 24, 24, 24, 0 },
 | 
						|
                { 24, 24, 24, 0 }, { 24, 24, 24, 0 }, { 24, 28, 26, 4 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 28, 28, 0 }, { 28, 32, 30, 4 }, { 32, 32, 32, 0 },
 | 
						|
                { 32, 32, 32, 0 }, { 32, 32, 32, 0 }, { 32, 36, 34, 4 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 36, 36, 0 }, { 36, 40, 38, 4 }, { 40, 40, 40, 0 },
 | 
						|
                { 40, 40, 40, 0 }, { 40, 40, 40, 0 }, { 40, 44, 42, 4 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 44, 44, 0 }, { 44, 48, 46, 4 }, { 48, 48, 48, 0 },
 | 
						|
                { 48, 48, 48, 0 }, { 48, 48, 48, 0 }, { 48, 52, 50, 4 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 52, 52, 0 }, { 52, 56, 54, 4 }, { 56, 56, 56, 0 },
 | 
						|
                { 56, 56, 56, 0 }, { 56, 56, 56, 0 }, { 56, 60, 58, 4 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 60, 60, 0 }, { 60, 65, 62, 5 }, { 60, 65, 62, 5 },
 | 
						|
                { 60, 69, 64, 9 }, { 65, 65, 65, 0 }, { 60, 73, 66, 13 }, { 65, 69, 67, 4 }, { 60, 77, 68, 17 }, { 69, 69, 69, 0 }, { 60, 81, 70, 21 }, { 69, 73, 71, 4 },
 | 
						|
                { 60, 85, 72, 25 }, { 73, 73, 73, 0 }, { 60, 89, 74, 29 }, { 73, 77, 75, 4 }, { 60, 93, 76, 33 }, { 77, 77, 77, 0 }, { 77, 77, 77, 0 }, { 77, 81, 79, 4 },
 | 
						|
                { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 81, 81, 0 }, { 81, 85, 83, 4 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 85, 85, 0 }, { 85, 89, 87, 4 },
 | 
						|
                { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 89, 89, 0 }, { 89, 93, 91, 4 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 93, 93, 0 }, { 93, 97, 95, 4 },
 | 
						|
                { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 97, 97, 0 }, { 97, 101, 99, 4 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 101, 101, 0 }, { 101, 105, 103, 4 },
 | 
						|
                { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 105, 105, 0 }, { 105, 109, 107, 4 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 109, 109, 0 }, { 109, 113, 111, 4 },
 | 
						|
                { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 113, 113, 0 }, { 113, 117, 115, 4 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 117, 117, 0 }, { 117, 121, 119, 4 },
 | 
						|
                { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 121, 121, 0 }, { 121, 125, 123, 4 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 125, 125, 0 }, { 125, 130, 127, 5 },
 | 
						|
                { 125, 130, 127, 5 }, { 125, 134, 129, 9 }, { 130, 130, 130, 0 }, { 125, 138, 131, 13 }, { 130, 134, 132, 4 }, { 125, 142, 133, 17 }, { 134, 134, 134, 0 }, { 125, 146, 135, 21 },
 | 
						|
                { 134, 138, 136, 4 }, { 125, 150, 137, 25 }, { 138, 138, 138, 0 }, { 125, 154, 139, 29 }, { 138, 142, 140, 4 }, { 125, 158, 141, 33 }, { 142, 142, 142, 0 }, { 142, 142, 142, 0 },
 | 
						|
                { 142, 146, 144, 4 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 146, 146, 0 }, { 146, 150, 148, 4 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 }, { 150, 150, 150, 0 },
 | 
						|
                { 150, 154, 152, 4 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 154, 154, 0 }, { 154, 158, 156, 4 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 }, { 158, 158, 158, 0 },
 | 
						|
                { 158, 162, 160, 4 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 162, 162, 0 }, { 162, 166, 164, 4 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 }, { 166, 166, 166, 0 },
 | 
						|
                { 166, 170, 168, 4 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 170, 170, 0 }, { 170, 174, 172, 4 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 }, { 174, 174, 174, 0 },
 | 
						|
                { 174, 178, 176, 4 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 178, 178, 0 }, { 178, 182, 180, 4 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 }, { 182, 182, 182, 0 },
 | 
						|
                { 182, 186, 184, 4 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 186, 186, 0 }, { 186, 190, 188, 4 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 }, { 190, 190, 190, 0 },
 | 
						|
                { 190, 195, 192, 5 }, { 190, 195, 192, 5 }, { 190, 199, 194, 9 }, { 195, 195, 195, 0 }, { 190, 203, 196, 13 }, { 195, 199, 197, 4 }, { 190, 207, 198, 17 }, { 199, 199, 199, 0 },
 | 
						|
                { 190, 211, 200, 21 }, { 199, 203, 201, 4 }, { 190, 215, 202, 25 }, { 203, 203, 203, 0 }, { 190, 219, 204, 29 }, { 203, 207, 205, 4 }, { 190, 223, 206, 33 }, { 207, 207, 207, 0 },
 | 
						|
                { 207, 207, 207, 0 }, { 207, 211, 209, 4 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 211, 211, 0 }, { 211, 215, 213, 4 }, { 215, 215, 215, 0 }, { 215, 215, 215, 0 },
 | 
						|
                { 215, 215, 215, 0 }, { 215, 219, 217, 4 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 219, 219, 0 }, { 219, 223, 221, 4 }, { 223, 223, 223, 0 }, { 223, 223, 223, 0 },
 | 
						|
                { 223, 223, 223, 0 }, { 223, 227, 225, 4 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 227, 227, 0 }, { 227, 231, 229, 4 }, { 231, 231, 231, 0 }, { 231, 231, 231, 0 },
 | 
						|
                { 231, 231, 231, 0 }, { 231, 235, 233, 4 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 235, 235, 0 }, { 235, 239, 237, 4 }, { 239, 239, 239, 0 }, { 239, 239, 239, 0 },
 | 
						|
                { 239, 239, 239, 0 }, { 239, 243, 241, 4 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 243, 243, 0 }, { 243, 247, 245, 4 }, { 247, 247, 247, 0 }, { 247, 247, 247, 0 },
 | 
						|
                { 247, 247, 247, 0 }, { 247, 251, 249, 4 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 251, 251, 0 }, { 251, 255, 253, 4 }, { 255, 255, 255, 0 }, { 255, 255, 255, 0 },
 | 
						|
            };
 | 
						|
        }
 | 
						|
 | 
						|
        class S3TCComputer
 | 
						|
        {
 | 
						|
        public:
 | 
						|
            typedef ParallelMath::Float MFloat;
 | 
						|
            typedef ParallelMath::SInt16 MSInt16;
 | 
						|
            typedef ParallelMath::UInt15 MUInt15;
 | 
						|
            typedef ParallelMath::UInt16 MUInt16;
 | 
						|
            typedef ParallelMath::SInt32 MSInt32;
 | 
						|
 | 
						|
            static void Init(MFloat& error)
 | 
						|
            {
 | 
						|
                error = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
            }
 | 
						|
 | 
						|
            static void QuantizeTo6Bits(MUInt15& v)
 | 
						|
            {
 | 
						|
                MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10));
 | 
						|
                v = (reduced << 2) | ParallelMath::RightShift(reduced, 4);
 | 
						|
            }
 | 
						|
 | 
						|
            static void QuantizeTo5Bits(MUInt15& v)
 | 
						|
            {
 | 
						|
                MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11));
 | 
						|
                v = (reduced << 3) | ParallelMath::RightShift(reduced, 2);
 | 
						|
            }
 | 
						|
 | 
						|
            static void QuantizeTo565(MUInt15 endPoint[3])
 | 
						|
            {
 | 
						|
                QuantizeTo5Bits(endPoint[0]);
 | 
						|
                QuantizeTo6Bits(endPoint[1]);
 | 
						|
                QuantizeTo5Bits(endPoint[2]);
 | 
						|
            }
 | 
						|
 | 
						|
            static MFloat ParanoidFactorForSpan(const MSInt16& span)
 | 
						|
            {
 | 
						|
                return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f;
 | 
						|
            }
 | 
						|
 | 
						|
            static MFloat ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d)
 | 
						|
            {
 | 
						|
                MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b)));
 | 
						|
                absDiff = absDiff + d;
 | 
						|
                return absDiff * absDiff;
 | 
						|
            }
 | 
						|
 | 
						|
            static void TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights,
 | 
						|
                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn)
 | 
						|
            {
 | 
						|
                float channelWeightsSq[3];
 | 
						|
 | 
						|
                for (int ch = 0; ch < 3; ch++)
 | 
						|
                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
 | 
						|
 | 
						|
                MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) };
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                        totals[ch] = totals[ch] + pixels[px][ch];
 | 
						|
                }
 | 
						|
 | 
						|
                MUInt15 average[3];
 | 
						|
                for (int ch = 0; ch < 3; ch++)
 | 
						|
                    average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4);
 | 
						|
 | 
						|
                const S3TCSingleColorTables::SingleColorTableEntry* rbTable = NULL;
 | 
						|
                const S3TCSingleColorTables::SingleColorTableEntry* gTable = NULL;
 | 
						|
                if (flags & cvtt::Flags::S3TC_Paranoid)
 | 
						|
                {
 | 
						|
                    if (range == 4)
 | 
						|
                    {
 | 
						|
                        rbTable = S3TCSingleColorTables::g_singleColor5_3_p;
 | 
						|
                        gTable = S3TCSingleColorTables::g_singleColor6_3_p;
 | 
						|
                    }
 | 
						|
                    else
 | 
						|
                    {
 | 
						|
                        assert(range == 3);
 | 
						|
                        rbTable = S3TCSingleColorTables::g_singleColor5_2_p;
 | 
						|
                        gTable = S3TCSingleColorTables::g_singleColor6_2_p;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    if (range == 4)
 | 
						|
                    {
 | 
						|
                        rbTable = S3TCSingleColorTables::g_singleColor5_3;
 | 
						|
                        gTable = S3TCSingleColorTables::g_singleColor6_3;
 | 
						|
                    }
 | 
						|
                    else
 | 
						|
                    {
 | 
						|
                        assert(range == 3);
 | 
						|
                        rbTable = S3TCSingleColorTables::g_singleColor5_2;
 | 
						|
                        gTable = S3TCSingleColorTables::g_singleColor6_2;
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                MUInt15 interpolated[3];
 | 
						|
                MUInt15 eps[2][3];
 | 
						|
                MSInt16 spans[3];
 | 
						|
                for (int i = 0; i < ParallelMath::ParallelSize; i++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                    {
 | 
						|
                        uint16_t avg = ParallelMath::Extract(average[ch], i);
 | 
						|
                        const S3TCSingleColorTables::SingleColorTableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]);
 | 
						|
                        ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min);
 | 
						|
                        ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max);
 | 
						|
                        ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor);
 | 
						|
                        ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                MFloat error = ParallelMath::MakeFloatZero();
 | 
						|
                if (flags & cvtt::Flags::S3TC_Paranoid)
 | 
						|
                {
 | 
						|
                    MFloat spanParanoidFactors[3];
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                        spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]);
 | 
						|
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                            error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch];
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                            error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch];
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
 | 
						|
                ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better);
 | 
						|
 | 
						|
                if (ParallelMath::AnySet(better16))
 | 
						|
                {
 | 
						|
                    bestError = ParallelMath::Min(bestError, error);
 | 
						|
                    for (int epi = 0; epi < 2; epi++)
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                            ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]);
 | 
						|
 | 
						|
                    MUInt15 vindexes = ParallelMath::MakeUInt15(1);
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                        ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes);
 | 
						|
 | 
						|
                    ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range));
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights,
 | 
						|
                MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn)
 | 
						|
            {
 | 
						|
                float channelWeightsSq[3];
 | 
						|
 | 
						|
                for (int ch = 0; ch < 3; ch++)
 | 
						|
                    channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch];
 | 
						|
 | 
						|
                MUInt15 endPoints[2][3];
 | 
						|
 | 
						|
                for (int ep = 0; ep < 2; ep++)
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                        endPoints[ep][ch] = unquantizedEndPoints[ep][ch];
 | 
						|
 | 
						|
                QuantizeTo565(endPoints[0]);
 | 
						|
                QuantizeTo565(endPoints[1]);
 | 
						|
 | 
						|
                IndexSelector<3> selector;
 | 
						|
                selector.Init<false>(channelWeights, endPoints, range);
 | 
						|
 | 
						|
                MUInt15 indexes[16];
 | 
						|
 | 
						|
                MFloat paranoidFactors[3];
 | 
						|
                for (int ch = 0; ch < 3; ch++)
 | 
						|
                    paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch]));
 | 
						|
 | 
						|
                MFloat error = ParallelMath::MakeFloatZero();
 | 
						|
                AggregatedError<3> aggError;
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn);
 | 
						|
                    indexes[px] = index;
 | 
						|
 | 
						|
                    if (refiner)
 | 
						|
                        refiner->ContributeUnweightedPW(preWeightedPixels[px], index);
 | 
						|
 | 
						|
                    MUInt15 reconstructed[3];
 | 
						|
                    selector.ReconstructLDRPrecise(index, reconstructed);
 | 
						|
 | 
						|
                    if (flags & Flags::S3TC_Paranoid)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                            error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch];
 | 
						|
                    }
 | 
						|
                    else
 | 
						|
                        BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError);
 | 
						|
                }
 | 
						|
 | 
						|
                if (!(flags & Flags::S3TC_Paranoid))
 | 
						|
                    error = aggError.Finalize(flags, channelWeightsSq);
 | 
						|
 | 
						|
                ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError);
 | 
						|
 | 
						|
                if (ParallelMath::AnySet(better))
 | 
						|
                {
 | 
						|
                    ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better);
 | 
						|
 | 
						|
                    ParallelMath::ConditionalSet(bestError, better, error);
 | 
						|
 | 
						|
                    for (int ep = 0; ep < 2; ep++)
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                            ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]);
 | 
						|
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                        ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]);
 | 
						|
 | 
						|
                    ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range)));
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest,
 | 
						|
                const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange,
 | 
						|
                const ParallelMath::RoundTowardNearestForScope* rtn)
 | 
						|
            {
 | 
						|
                UNREFERENCED_PARAMETER(alphaTest);
 | 
						|
                UNREFERENCED_PARAMETER(flags);
 | 
						|
 | 
						|
                EndpointRefiner<3> refiner;
 | 
						|
 | 
						|
                refiner.Init(nCounts, channelWeights);
 | 
						|
 | 
						|
                bool escape = false;
 | 
						|
                int e = 0;
 | 
						|
                for (int i = 0; i < nCounts; i++)
 | 
						|
                {
 | 
						|
                    for (int n = 0; n < counts[i]; n++)
 | 
						|
                    {
 | 
						|
                        ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements);
 | 
						|
                        if (!ParallelMath::AnySet(valid))
 | 
						|
                        {
 | 
						|
                            escape = true;
 | 
						|
                            break;
 | 
						|
                        }
 | 
						|
 | 
						|
                        if (ParallelMath::AllSet(valid))
 | 
						|
                            refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)));
 | 
						|
                        else
 | 
						|
                        {
 | 
						|
                            MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f));
 | 
						|
                            refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    if (escape)
 | 
						|
                        break;
 | 
						|
                }
 | 
						|
 | 
						|
                MUInt15 endPoints[2][3];
 | 
						|
                refiner.GetRefinedEndpointsLDR(endPoints, rtn);
 | 
						|
 | 
						|
                TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn);
 | 
						|
            }
 | 
						|
 | 
						|
            static void PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride)
 | 
						|
            {
 | 
						|
                UNREFERENCED_PARAMETER(flags);
 | 
						|
                ParallelMath::RoundTowardNearestForScope rtn;
 | 
						|
 | 
						|
                float weights[1] = { 1.0f };
 | 
						|
 | 
						|
                MUInt15 pixels[16];
 | 
						|
                MFloat floatPixels[16];
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
 | 
						|
                    floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
 | 
						|
                }
 | 
						|
 | 
						|
                MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } };
 | 
						|
 | 
						|
                IndexSelector<1> selector;
 | 
						|
                selector.Init<false>(weights, ep, 16);
 | 
						|
 | 
						|
                MUInt15 indexes[16];
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                    indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn);
 | 
						|
 | 
						|
                for (int block = 0; block < ParallelMath::ParallelSize; block++)
 | 
						|
                {
 | 
						|
                    for (int px = 0; px < 16; px += 8)
 | 
						|
                    {
 | 
						|
                        int index0 = ParallelMath::Extract(indexes[px], block);
 | 
						|
                        int index1 = ParallelMath::Extract(indexes[px], block);
 | 
						|
 | 
						|
                        packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4));
 | 
						|
                    }
 | 
						|
 | 
						|
                    packedBlocks += packedBlockStride;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds)
 | 
						|
            {
 | 
						|
                if (maxTweakRounds < 1)
 | 
						|
                    maxTweakRounds = 1;
 | 
						|
 | 
						|
                if (numRefineRounds < 1)
 | 
						|
                    numRefineRounds = 1;
 | 
						|
 | 
						|
                ParallelMath::RoundTowardNearestForScope rtn;
 | 
						|
 | 
						|
                float oneWeight[1] = { 1.0f };
 | 
						|
 | 
						|
                MUInt15 pixels[16];
 | 
						|
                MFloat floatPixels[16];
 | 
						|
 | 
						|
                MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255);
 | 
						|
                MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1);
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]);
 | 
						|
 | 
						|
                    if (isSigned)
 | 
						|
                        pixels[px] = ParallelMath::Min(pixels[px], highTerminal);
 | 
						|
 | 
						|
                    floatPixels[px] = ParallelMath::ToFloat(pixels[px]);
 | 
						|
                }
 | 
						|
 | 
						|
                MUInt15 sortedPixels[16];
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                    sortedPixels[px] = pixels[px];
 | 
						|
 | 
						|
                for (int sortEnd = 15; sortEnd > 0; sortEnd--)
 | 
						|
                {
 | 
						|
                    for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++)
 | 
						|
                    {
 | 
						|
                        MUInt15 a = sortedPixels[sortOffset];
 | 
						|
                        MUInt15 b = sortedPixels[sortOffset + 1];
 | 
						|
 | 
						|
                        sortedPixels[sortOffset] = ParallelMath::Min(a, b);
 | 
						|
                        sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                MUInt15 zero = ParallelMath::MakeUInt15(0);
 | 
						|
                MUInt15 one = ParallelMath::MakeUInt15(1);
 | 
						|
 | 
						|
                MUInt15 bestIsFullRange = zero;
 | 
						|
                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
                MUInt15 bestEP[2] = { zero, zero };
 | 
						|
                MUInt15 bestIndexes[16] = {
 | 
						|
                    zero, zero, zero, zero,
 | 
						|
                    zero, zero, zero, zero,
 | 
						|
                    zero, zero, zero, zero,
 | 
						|
                    zero, zero, zero, zero
 | 
						|
                };
 | 
						|
 | 
						|
                // Full-precision
 | 
						|
                {
 | 
						|
                    MUInt15 minEP = sortedPixels[0];
 | 
						|
                    MUInt15 maxEP = sortedPixels[15];
 | 
						|
 | 
						|
                    MFloat base[1] = { ParallelMath::ToFloat(minEP) };
 | 
						|
                    MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) };
 | 
						|
 | 
						|
                    UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
 | 
						|
 | 
						|
                    int numTweakRounds = BCCommon::TweakRoundsForRange(8);
 | 
						|
                    if (numTweakRounds > maxTweakRounds)
 | 
						|
                        numTweakRounds = maxTweakRounds;
 | 
						|
 | 
						|
                    for (int tweak = 0; tweak < numTweakRounds; tweak++)
 | 
						|
                    {
 | 
						|
                        MUInt15 ep[2][1];
 | 
						|
 | 
						|
                        ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
 | 
						|
 | 
						|
                        for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
 | 
						|
                        {
 | 
						|
                            EndpointRefiner<1> refiner;
 | 
						|
                            refiner.Init(8, oneWeight);
 | 
						|
 | 
						|
                            if (isSigned)
 | 
						|
                                for (int epi = 0; epi < 2; epi++)
 | 
						|
                                    ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
 | 
						|
 | 
						|
                            IndexSelector<1> indexSelector;
 | 
						|
                            indexSelector.Init<false>(oneWeight, ep, 8);
 | 
						|
 | 
						|
                            MUInt15 indexes[16];
 | 
						|
 | 
						|
                            AggregatedError<1> aggError;
 | 
						|
                            for (int px = 0; px < 16; px++)
 | 
						|
                            {
 | 
						|
                                MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
 | 
						|
 | 
						|
                                MUInt15 reconstructedPixel;
 | 
						|
 | 
						|
                                indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel);
 | 
						|
                                BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError);
 | 
						|
 | 
						|
                                if (refinePass != numRefineRounds - 1)
 | 
						|
                                    refiner.ContributeUnweightedPW(&floatPixels[px], index);
 | 
						|
 | 
						|
                                indexes[px] = index;
 | 
						|
                            }
 | 
						|
                            MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight);
 | 
						|
 | 
						|
                            ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
 | 
						|
                            ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
 | 
						|
 | 
						|
                            if (ParallelMath::AnySet(errorBetter16))
 | 
						|
                            {
 | 
						|
                                bestError = ParallelMath::Min(error, bestError);
 | 
						|
                                ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one);
 | 
						|
                                for (int px = 0; px < 16; px++)
 | 
						|
                                    ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
 | 
						|
 | 
						|
                                for (int epi = 0; epi < 2; epi++)
 | 
						|
                                    ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
 | 
						|
                            }
 | 
						|
 | 
						|
                            if (refinePass != numRefineRounds - 1)
 | 
						|
                                refiner.GetRefinedEndpointsLDR(ep, &rtn);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                // Reduced precision with special endpoints
 | 
						|
                {
 | 
						|
                    MUInt15 bestHeuristicMin = sortedPixels[0];
 | 
						|
                    MUInt15 bestHeuristicMax = sortedPixels[15];
 | 
						|
 | 
						|
                    ParallelMath::Int16CompFlag canTryClipping;
 | 
						|
 | 
						|
                    // In reduced precision, we want try putting endpoints at the reserved indexes at the ends.
 | 
						|
                    // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
 | 
						|
                    // This will usually not find anything, but it's cheap to check.
 | 
						|
 | 
						|
                    {
 | 
						|
                        MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
 | 
						|
                        MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax));
 | 
						|
 | 
						|
                        MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4);
 | 
						|
                        canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange);
 | 
						|
                    }
 | 
						|
 | 
						|
                    if (ParallelMath::AnySet(canTryClipping))
 | 
						|
                    {
 | 
						|
                        MUInt15 lowClearances[16];
 | 
						|
                        MUInt15 highClearances[16];
 | 
						|
                        MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0);
 | 
						|
 | 
						|
                        lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0);
 | 
						|
 | 
						|
                        for (int px = 1; px < 16; px++)
 | 
						|
                        {
 | 
						|
                            lowClearances[px] = sortedPixels[px - 1];
 | 
						|
                            highClearances[px] = highTerminal - sortedPixels[16 - px];
 | 
						|
                        }
 | 
						|
 | 
						|
                        for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++)
 | 
						|
                        {
 | 
						|
                            uint16_t numSkippedLow = firstIndex;
 | 
						|
 | 
						|
                            MUInt15 lowClearance = lowClearances[firstIndex];
 | 
						|
 | 
						|
                            for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++)
 | 
						|
                            {
 | 
						|
                                uint16_t numSkippedHigh = 15 - lastIndex;
 | 
						|
                                uint16_t numSkipped = numSkippedLow + numSkippedHigh;
 | 
						|
 | 
						|
                                MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped);
 | 
						|
 | 
						|
                                ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV);
 | 
						|
 | 
						|
                                if (!ParallelMath::AnySet(areMoreSkipped))
 | 
						|
                                    continue;
 | 
						|
 | 
						|
                                MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance);
 | 
						|
                                MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4);
 | 
						|
 | 
						|
                                MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex];
 | 
						|
 | 
						|
                                ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range));
 | 
						|
                                ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]);
 | 
						|
                                ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    MUInt15 bestSimpleMin = one;
 | 
						|
                    MUInt15 bestSimpleMax = highTerminalMinusOne;
 | 
						|
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                    {
 | 
						|
                        ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]);
 | 
						|
                        ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]);
 | 
						|
                    }
 | 
						|
 | 
						|
                    MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin };
 | 
						|
                    MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax };
 | 
						|
 | 
						|
                    int minEPRange = 2;
 | 
						|
                    if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1])))
 | 
						|
                        minEPRange = 1;
 | 
						|
 | 
						|
                    int maxEPRange = 2;
 | 
						|
                    if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1])))
 | 
						|
                        maxEPRange = 1;
 | 
						|
 | 
						|
                    for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++)
 | 
						|
                    {
 | 
						|
                        for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++)
 | 
						|
                        {
 | 
						|
                            MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) };
 | 
						|
                            MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) };
 | 
						|
 | 
						|
                            UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset);
 | 
						|
 | 
						|
                            int numTweakRounds = BCCommon::TweakRoundsForRange(6);
 | 
						|
                            if (numTweakRounds > maxTweakRounds)
 | 
						|
                                numTweakRounds = maxTweakRounds;
 | 
						|
 | 
						|
                            for (int tweak = 0; tweak < numTweakRounds; tweak++)
 | 
						|
                            {
 | 
						|
                                MUInt15 ep[2][1];
 | 
						|
 | 
						|
                                ufep.FinishLDR(tweak, 8, ep[0], ep[1]);
 | 
						|
 | 
						|
                                for (int refinePass = 0; refinePass < numRefineRounds; refinePass++)
 | 
						|
                                {
 | 
						|
                                    EndpointRefiner<1> refiner;
 | 
						|
                                    refiner.Init(6, oneWeight);
 | 
						|
 | 
						|
                                    if (isSigned)
 | 
						|
                                        for (int epi = 0; epi < 2; epi++)
 | 
						|
                                            ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal);
 | 
						|
 | 
						|
                                    IndexSelector<1> indexSelector;
 | 
						|
                                    indexSelector.Init<false>(oneWeight, ep, 6);
 | 
						|
 | 
						|
                                    MUInt15 indexes[16];
 | 
						|
                                    MFloat error = ParallelMath::MakeFloatZero();
 | 
						|
 | 
						|
                                    for (int px = 0; px < 16; px++)
 | 
						|
                                    {
 | 
						|
                                        MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn);
 | 
						|
 | 
						|
                                        MUInt15 reconstructedPixel;
 | 
						|
 | 
						|
                                        indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel);
 | 
						|
 | 
						|
                                        MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight);
 | 
						|
                                        MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight);
 | 
						|
                                        MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight);
 | 
						|
 | 
						|
                                        MFloat bestPixelError = zeroError;
 | 
						|
                                        MUInt15 index = ParallelMath::MakeUInt15(6);
 | 
						|
 | 
						|
                                        ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7));
 | 
						|
                                        bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError);
 | 
						|
 | 
						|
                                        ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError);
 | 
						|
 | 
						|
                                        if (ParallelMath::AllSet(selectedIndexBetter))
 | 
						|
                                        {
 | 
						|
                                            if (refinePass != numRefineRounds - 1)
 | 
						|
                                                refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex);
 | 
						|
                                        }
 | 
						|
                                        else
 | 
						|
                                        {
 | 
						|
                                            MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero());
 | 
						|
 | 
						|
                                            if (refinePass != numRefineRounds - 1)
 | 
						|
                                                refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight);
 | 
						|
                                        }
 | 
						|
 | 
						|
                                        ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex);
 | 
						|
                                        bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError);
 | 
						|
 | 
						|
                                        error = error + bestPixelError;
 | 
						|
 | 
						|
                                        indexes[px] = index;
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError);
 | 
						|
                                    ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter);
 | 
						|
 | 
						|
                                    if (ParallelMath::AnySet(errorBetter16))
 | 
						|
                                    {
 | 
						|
                                        bestError = ParallelMath::Min(error, bestError);
 | 
						|
                                        ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero);
 | 
						|
                                        for (int px = 0; px < 16; px++)
 | 
						|
                                            ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]);
 | 
						|
 | 
						|
                                        for (int epi = 0; epi < 2; epi++)
 | 
						|
                                            ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]);
 | 
						|
                                    }
 | 
						|
 | 
						|
                                    if (refinePass != numRefineRounds - 1)
 | 
						|
                                        refiner.GetRefinedEndpointsLDR(ep, &rtn);
 | 
						|
                                }
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                for (int block = 0; block < ParallelMath::ParallelSize; block++)
 | 
						|
                {
 | 
						|
                    int ep0 = ParallelMath::Extract(bestEP[0], block);
 | 
						|
                    int ep1 = ParallelMath::Extract(bestEP[1], block);
 | 
						|
                    int isFullRange = ParallelMath::Extract(bestIsFullRange, block);
 | 
						|
 | 
						|
                    if (isSigned)
 | 
						|
                    {
 | 
						|
                        ep0 -= 127;
 | 
						|
                        ep1 -= 127;
 | 
						|
 | 
						|
                        assert(ep0 >= -127 && ep0 <= 127);
 | 
						|
                        assert(ep1 >= -127 && ep1 <= 127);
 | 
						|
                    }
 | 
						|
 | 
						|
 | 
						|
                    bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1);
 | 
						|
 | 
						|
                    if (swapEndpoints)
 | 
						|
                        std::swap(ep0, ep1);
 | 
						|
 | 
						|
                    uint16_t dumpBits = 0;
 | 
						|
                    int dumpBitsOffset = 0;
 | 
						|
                    int dumpByteOffset = 2;
 | 
						|
                    packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff);
 | 
						|
                    packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff);
 | 
						|
 | 
						|
                    int maxValue = (isFullRange != 0) ? 7 : 5;
 | 
						|
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                    {
 | 
						|
                        int index = ParallelMath::Extract(bestIndexes[px], block);
 | 
						|
 | 
						|
                        if (swapEndpoints && index <= maxValue)
 | 
						|
                            index = maxValue - index;
 | 
						|
 | 
						|
                        if (index != 0)
 | 
						|
                        {
 | 
						|
                            if (index == maxValue)
 | 
						|
                                index = 1;
 | 
						|
                            else if (index < maxValue)
 | 
						|
                                index++;
 | 
						|
                        }
 | 
						|
 | 
						|
                        assert(index >= 0 && index < 8);
 | 
						|
 | 
						|
                        dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset);
 | 
						|
                        dumpBitsOffset += 3;
 | 
						|
 | 
						|
                        if (dumpBitsOffset >= 8)
 | 
						|
                        {
 | 
						|
                            assert(dumpByteOffset < 8);
 | 
						|
                            packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff);
 | 
						|
                            dumpBits >>= 8;
 | 
						|
                            dumpBitsOffset -= 8;
 | 
						|
                            dumpByteOffset++;
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    assert(dumpBitsOffset == 0);
 | 
						|
                    assert(dumpByteOffset == 8);
 | 
						|
 | 
						|
                    packedBlocks += packedBlockStride;
 | 
						|
                }
 | 
						|
            }
 | 
						|
 | 
						|
            static void PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds)
 | 
						|
            {
 | 
						|
                ParallelMath::RoundTowardNearestForScope rtn;
 | 
						|
 | 
						|
                if (numRefineRounds < 1)
 | 
						|
                    numRefineRounds = 1;
 | 
						|
 | 
						|
                if (maxTweakRounds < 1)
 | 
						|
                    maxTweakRounds = 1;
 | 
						|
 | 
						|
                EndpointSelector<3, 8> endpointSelector;
 | 
						|
 | 
						|
                MUInt15 pixels[16][4];
 | 
						|
                MFloat floatPixels[16][4];
 | 
						|
 | 
						|
                MFloat preWeightedPixels[16][4];
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 4; ch++)
 | 
						|
                        ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]);
 | 
						|
                }
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    for (int ch = 0; ch < 4; ch++)
 | 
						|
                        floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]);
 | 
						|
                }
 | 
						|
 | 
						|
                if (alphaTest)
 | 
						|
                {
 | 
						|
                    MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f)));
 | 
						|
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                    {
 | 
						|
                        ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold);
 | 
						|
                        pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255));
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights);
 | 
						|
 | 
						|
                MUInt15 minAlpha = ParallelMath::MakeUInt15(255);
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                    minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]);
 | 
						|
 | 
						|
                MFloat pixelWeights[16];
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    pixelWeights[px] = ParallelMath::MakeFloat(1.0f);
 | 
						|
                    if (alphaTest)
 | 
						|
                    {
 | 
						|
                        ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
 | 
						|
 | 
						|
                        ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero());
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                for (int pass = 0; pass < NumEndpointSelectorPasses; pass++)
 | 
						|
                {
 | 
						|
                    for (int px = 0; px < 16; px++)
 | 
						|
                        endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]);
 | 
						|
 | 
						|
                    endpointSelector.FinishPass(pass);
 | 
						|
                }
 | 
						|
 | 
						|
                UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights);
 | 
						|
 | 
						|
                MUInt15 bestEndpoints[2][3];
 | 
						|
                MUInt15 bestIndexes[16];
 | 
						|
                MUInt15 bestRange = ParallelMath::MakeUInt15(0);
 | 
						|
                MFloat bestError = ParallelMath::MakeFloat(FLT_MAX);
 | 
						|
 | 
						|
                for (int px = 0; px < 16; px++)
 | 
						|
                    bestIndexes[px] = ParallelMath::MakeUInt15(0);
 | 
						|
 | 
						|
                for (int ep = 0; ep < 2; ep++)
 | 
						|
                    for (int ch = 0; ch < 3; ch++)
 | 
						|
                        bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0);
 | 
						|
 | 
						|
                if (exhaustive)
 | 
						|
                {
 | 
						|
                    MSInt16 sortBins[16];
 | 
						|
 | 
						|
                    {
 | 
						|
                        // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
 | 
						|
                        // and pack the original indexes into the low bits.
 | 
						|
 | 
						|
                        MUInt15 sortEP[2][3];
 | 
						|
                        ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]);
 | 
						|
 | 
						|
                        IndexSelector<3> sortSelector;
 | 
						|
                        sortSelector.Init<false>(channelWeights, sortEP, 1 << 11);
 | 
						|
 | 
						|
                        for (int16_t px = 0; px < 16; px++)
 | 
						|
                        {
 | 
						|
                            MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4);
 | 
						|
 | 
						|
                            if (alphaTest)
 | 
						|
                            {
 | 
						|
                                ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255));
 | 
						|
 | 
						|
                                ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
 | 
						|
                            }
 | 
						|
 | 
						|
                            sortBin = sortBin + ParallelMath::MakeSInt16(px);
 | 
						|
 | 
						|
                            sortBins[px] = sortBin;
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    // Sort bins
 | 
						|
                    for (int sortEnd = 1; sortEnd < 16; sortEnd++)
 | 
						|
                    {
 | 
						|
                        for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--)
 | 
						|
                        {
 | 
						|
                            MSInt16 a = sortBins[sortLoc];
 | 
						|
                            MSInt16 b = sortBins[sortLoc - 1];
 | 
						|
 | 
						|
                            sortBins[sortLoc] = ParallelMath::Max(a, b);
 | 
						|
                            sortBins[sortLoc - 1] = ParallelMath::Min(a, b);
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    MUInt15 firstElement = ParallelMath::MakeUInt15(0);
 | 
						|
                    for (uint16_t e = 0; e < 16; e++)
 | 
						|
                    {
 | 
						|
                        ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0));
 | 
						|
                        ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1));
 | 
						|
                        if (!ParallelMath::AnySet(isInvalid))
 | 
						|
                            break;
 | 
						|
                    }
 | 
						|
 | 
						|
                    MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement;
 | 
						|
 | 
						|
                    MUInt15 sortedInputs[16][4];
 | 
						|
                    MFloat floatSortedInputs[16][4];
 | 
						|
                    MFloat pwFloatSortedInputs[16][4];
 | 
						|
 | 
						|
                    for (int e = 0; e < 16; e++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < 4; ch++)
 | 
						|
                            sortedInputs[e][ch] = ParallelMath::MakeUInt15(0);
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int block = 0; block < ParallelMath::ParallelSize; block++)
 | 
						|
                    {
 | 
						|
                        for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++)
 | 
						|
                        {
 | 
						|
                            ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block);
 | 
						|
                            int originalIndex = (sortBin & 15);
 | 
						|
 | 
						|
                            for (int ch = 0; ch < 4; ch++)
 | 
						|
                                ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block));
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int e = 0; e < 16; e++)
 | 
						|
                    {
 | 
						|
                        for (int ch = 0; ch < 4; ch++)
 | 
						|
                        {
 | 
						|
                            MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]);
 | 
						|
                            floatSortedInputs[e][ch] = f;
 | 
						|
                            pwFloatSortedInputs[e][ch] = f * channelWeights[ch];
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    for (int n0 = 0; n0 <= 15; n0++)
 | 
						|
                    {
 | 
						|
                        int remainingFor1 = 16 - n0;
 | 
						|
                        if (remainingFor1 == 16)
 | 
						|
                            remainingFor1 = 15;
 | 
						|
 | 
						|
                        for (int n1 = 0; n1 <= remainingFor1; n1++)
 | 
						|
                        {
 | 
						|
                            int remainingFor2 = 16 - n1 - n0;
 | 
						|
                            if (remainingFor2 == 16)
 | 
						|
                                remainingFor2 = 15;
 | 
						|
 | 
						|
                            for (int n2 = 0; n2 <= remainingFor2; n2++)
 | 
						|
                            {
 | 
						|
                                int n3 = 16 - n2 - n1 - n0;
 | 
						|
 | 
						|
                                if (n3 == 16)
 | 
						|
                                    continue;
 | 
						|
 | 
						|
                                int counts[4] = { n0, n1, n2, n3 };
 | 
						|
 | 
						|
                                TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
 | 
						|
                    TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
 | 
						|
 | 
						|
                    if (alphaTest)
 | 
						|
                    {
 | 
						|
                        for (int n0 = 0; n0 <= 15; n0++)
 | 
						|
                        {
 | 
						|
                            int remainingFor1 = 16 - n0;
 | 
						|
                            if (remainingFor1 == 16)
 | 
						|
                                remainingFor1 = 15;
 | 
						|
 | 
						|
                            for (int n1 = 0; n1 <= remainingFor1; n1++)
 | 
						|
                            {
 | 
						|
                                int n2 = 16 - n1 - n0;
 | 
						|
 | 
						|
                                if (n2 == 16)
 | 
						|
                                    continue;
 | 
						|
 | 
						|
                                int counts[3] = { n0, n1, n2 };
 | 
						|
 | 
						|
                                TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
 | 
						|
                        TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn);
 | 
						|
                    }
 | 
						|
                }
 | 
						|
                else
 | 
						|
                {
 | 
						|
                    int minRange = alphaTest ? 3 : 4;
 | 
						|
 | 
						|
                    for (int range = minRange; range <= 4; range++)
 | 
						|
                    {
 | 
						|
                        int tweakRounds = BCCommon::TweakRoundsForRange(range);
 | 
						|
                        if (tweakRounds > maxTweakRounds)
 | 
						|
                            tweakRounds = maxTweakRounds;
 | 
						|
 | 
						|
                        for (int tweak = 0; tweak < tweakRounds; tweak++)
 | 
						|
                        {
 | 
						|
                            MUInt15 endPoints[2][3];
 | 
						|
 | 
						|
                            ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]);
 | 
						|
 | 
						|
                            for (int refine = 0; refine < numRefineRounds; refine++)
 | 
						|
                            {
 | 
						|
                                EndpointRefiner<3> refiner;
 | 
						|
                                refiner.Init(range, channelWeights);
 | 
						|
 | 
						|
                                TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn);
 | 
						|
 | 
						|
                                if (refine != numRefineRounds - 1)
 | 
						|
                                    refiner.GetRefinedEndpointsLDR(endPoints, &rtn);
 | 
						|
                            }
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                }
 | 
						|
 | 
						|
                for (int block = 0; block < ParallelMath::ParallelSize; block++)
 | 
						|
                {
 | 
						|
                    ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block);
 | 
						|
                    assert(range == 3 || range == 4);
 | 
						|
 | 
						|
                    ParallelMath::ScalarUInt16 compressedEP[2];
 | 
						|
                    for (int ep = 0; ep < 2; ep++)
 | 
						|
                    {
 | 
						|
                        ParallelMath::ScalarUInt16 endPoint[3];
 | 
						|
                        for (int ch = 0; ch < 3; ch++)
 | 
						|
                            endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block);
 | 
						|
 | 
						|
                        int compressed = (endPoint[0] & 0xf8) << 8;
 | 
						|
                        compressed |= (endPoint[1] & 0xfc) << 3;
 | 
						|
                        compressed |= (endPoint[2] & 0xf8) >> 3;
 | 
						|
 | 
						|
                        compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed);
 | 
						|
                    }
 | 
						|
 | 
						|
                    int indexOrder[4];
 | 
						|
 | 
						|
                    if (range == 4)
 | 
						|
                    {
 | 
						|
                        if (compressedEP[0] == compressedEP[1])
 | 
						|
                        {
 | 
						|
                            indexOrder[0] = 0;
 | 
						|
                            indexOrder[1] = 0;
 | 
						|
                            indexOrder[2] = 0;
 | 
						|
                            indexOrder[3] = 0;
 | 
						|
                        }
 | 
						|
                        else if (compressedEP[0] < compressedEP[1])
 | 
						|
                        {
 | 
						|
                            std::swap(compressedEP[0], compressedEP[1]);
 | 
						|
                            indexOrder[0] = 1;
 | 
						|
                            indexOrder[1] = 3;
 | 
						|
                            indexOrder[2] = 2;
 | 
						|
                            indexOrder[3] = 0;
 | 
						|
                        }
 | 
						|
                        else
 | 
						|
                        {
 | 
						|
                            indexOrder[0] = 0;
 | 
						|
                            indexOrder[1] = 2;
 | 
						|
                            indexOrder[2] = 3;
 | 
						|
                            indexOrder[3] = 1;
 | 
						|
                        }
 | 
						|
                    }
 | 
						|
                    else
 | 
						|
                    {
 | 
						|
                        assert(range == 3);
 | 
						|
 | 
						|
                        if (compressedEP[0] > compressedEP[1])
 | 
						|
                        {
 | 
						|
                            std::swap(compressedEP[0], compressedEP[1]);
 | 
						|
                            indexOrder[0] = 1;
 | 
						|
                            indexOrder[1] = 2;
 | 
						|
                            indexOrder[2] = 0;
 | 
						|
                        }
 | 
						|
                        else
 | 
						|
                        {
 | 
						|
                            indexOrder[0] = 0;
 | 
						|
                            indexOrder[1] = 2;
 | 
						|
                            indexOrder[2] = 1;
 | 
						|
                        }
 | 
						|
                        indexOrder[3] = 3;
 | 
						|
                    }
 | 
						|
 | 
						|
                    packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff);
 | 
						|
                    packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff);
 | 
						|
                    packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff);
 | 
						|
                    packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff);
 | 
						|
 | 
						|
                    for (int i = 0; i < 16; i += 4)
 | 
						|
                    {
 | 
						|
                        int packedIndexes = 0;
 | 
						|
                        for (int subi = 0; subi < 4; subi++)
 | 
						|
                        {
 | 
						|
                            ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block);
 | 
						|
                            packedIndexes |= (indexOrder[index] << (subi * 2));
 | 
						|
                        }
 | 
						|
 | 
						|
                        packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes);
 | 
						|
                    }
 | 
						|
 | 
						|
                    packedBlocks += packedBlockStride;
 | 
						|
                }
 | 
						|
            }
 | 
						|
        };
 | 
						|
 | 
						|
        // Signed input blocks are converted into unsigned space, with the maximum value being 254
 | 
						|
        void BiasSignedInput(PixelBlockU8 inputNormalized[ParallelMath::ParallelSize], const PixelBlockS8 inputSigned[ParallelMath::ParallelSize])
 | 
						|
        {
 | 
						|
            for (size_t block = 0; block < ParallelMath::ParallelSize; block++)
 | 
						|
            {
 | 
						|
                const PixelBlockS8& inputSignedBlock = inputSigned[block];
 | 
						|
                PixelBlockU8& inputNormalizedBlock = inputNormalized[block];
 | 
						|
 | 
						|
                for (size_t px = 0; px < 16; px++)
 | 
						|
                {
 | 
						|
                    for (size_t ch = 0; ch < 4; ch++)
 | 
						|
                        inputNormalizedBlock.m_pixels[px][ch] = static_cast<uint8_t>(std::max<int>(inputSignedBlock.m_pixels[px][ch], -127) + 127);
 | 
						|
                }
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void FillWeights(const Options &options, float channelWeights[4])
 | 
						|
        {
 | 
						|
            if (options.flags & Flags::Uniform)
 | 
						|
                channelWeights[0] = channelWeights[1] = channelWeights[2] = channelWeights[3] = 1.0f;
 | 
						|
            else
 | 
						|
            {
 | 
						|
                channelWeights[0] = options.redWeight;
 | 
						|
                channelWeights[1] = options.greenWeight;
 | 
						|
                channelWeights[2] = options.blueWeight;
 | 
						|
                channelWeights[3] = options.alphaWeight;
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
 | 
						|
    namespace Kernels
 | 
						|
    {
 | 
						|
        void EncodeBC7(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            float channelWeights[4];
 | 
						|
            Internal::FillWeights(options, channelWeights);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
 | 
						|
            {
 | 
						|
                Internal::BC7Computer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, options.seedPoints, options.refineRoundsBC7);
 | 
						|
                pBC += ParallelMath::ParallelSize * 16;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void EncodeBC6HU(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            float channelWeights[4];
 | 
						|
            Internal::FillWeights(options, channelWeights);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
 | 
						|
            {
 | 
						|
                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, false, options.seedPoints, options.refineRoundsBC6H);
 | 
						|
                pBC += ParallelMath::ParallelSize * 16;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void EncodeBC6HS(uint8_t *pBC, const PixelBlockF16 *pBlocks, const cvtt::Options &options)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            float channelWeights[4];
 | 
						|
            Internal::FillWeights(options, channelWeights);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
 | 
						|
            {
 | 
						|
                Internal::BC6HComputer::Pack(options.flags, pBlocks + blockBase, pBC, channelWeights, true, options.seedPoints, options.refineRoundsBC6H);
 | 
						|
                pBC += ParallelMath::ParallelSize * 16;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void EncodeBC1(uint8_t *pBC, const PixelBlockU8 *pBlocks, const cvtt::Options &options)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            float channelWeights[4];
 | 
						|
            Internal::FillWeights(options, channelWeights);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
 | 
						|
            {
 | 
						|
                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC, 8, channelWeights, true, options.threshold, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
 | 
						|
                pBC += ParallelMath::ParallelSize * 8;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void EncodeBC2(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            float channelWeights[4];
 | 
						|
            Internal::FillWeights(options, channelWeights);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
 | 
						|
            {
 | 
						|
                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
 | 
						|
                Internal::S3TCComputer::PackExplicitAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16);
 | 
						|
                pBC += ParallelMath::ParallelSize * 16;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void EncodeBC3(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            float channelWeights[4];
 | 
						|
            Internal::FillWeights(options, channelWeights);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
 | 
						|
            {
 | 
						|
                Internal::S3TCComputer::PackRGB(options.flags, pBlocks + blockBase, pBC + 8, 16, channelWeights, false, 1.0f, (options.flags & Flags::S3TC_Exhaustive) != 0, options.seedPoints, options.refineRoundsS3TC);
 | 
						|
                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 3, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
 | 
						|
                pBC += ParallelMath::ParallelSize * 16;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void EncodeBC4U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            float channelWeights[4];
 | 
						|
            Internal::FillWeights(options, channelWeights);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
 | 
						|
            {
 | 
						|
                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 8, false, options.seedPoints, options.refineRoundsIIC);
 | 
						|
                pBC += ParallelMath::ParallelSize * 8;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void EncodeBC4S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            float channelWeights[4];
 | 
						|
            Internal::FillWeights(options, channelWeights);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
 | 
						|
            {
 | 
						|
                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
 | 
						|
                Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase);
 | 
						|
 | 
						|
                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 8, true, options.seedPoints, options.refineRoundsIIC);
 | 
						|
                pBC += ParallelMath::ParallelSize * 8;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void EncodeBC5U(uint8_t *pBC, const PixelBlockU8 *pBlocks, const Options &options)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            float channelWeights[4];
 | 
						|
            Internal::FillWeights(options, channelWeights);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
 | 
						|
            {
 | 
						|
                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 0, pBC, 16, false, options.seedPoints, options.refineRoundsIIC);
 | 
						|
                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, pBlocks + blockBase, 1, pBC + 8, 16, false, options.seedPoints, options.refineRoundsIIC);
 | 
						|
                pBC += ParallelMath::ParallelSize * 16;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void EncodeBC5S(uint8_t *pBC, const PixelBlockS8 *pBlocks, const Options &options)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            float channelWeights[4];
 | 
						|
            Internal::FillWeights(options, channelWeights);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < NumParallelBlocks; blockBase += ParallelMath::ParallelSize)
 | 
						|
            {
 | 
						|
                PixelBlockU8 inputBlocks[ParallelMath::ParallelSize];
 | 
						|
                Internal::BiasSignedInput(inputBlocks, pBlocks + blockBase);
 | 
						|
 | 
						|
                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 0, pBC, 16, true, options.seedPoints, options.refineRoundsIIC);
 | 
						|
                Internal::S3TCComputer::PackInterpolatedAlpha(options.flags, inputBlocks, 1, pBC + 8, 16, true, options.seedPoints, options.refineRoundsIIC);
 | 
						|
                pBC += ParallelMath::ParallelSize * 16;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void DecodeBC7(PixelBlockU8 *pBlocks, const uint8_t *pBC)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
 | 
						|
            {
 | 
						|
                Internal::BC7Computer::UnpackOne(pBlocks[blockBase], pBC);
 | 
						|
                pBC += 16;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void DecodeBC6HU(PixelBlockF16 *pBlocks, const uint8_t *pBC)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
 | 
						|
            {
 | 
						|
                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, false);
 | 
						|
                pBC += 16;
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        void DecodeBC6HS(PixelBlockF16 *pBlocks, const uint8_t *pBC)
 | 
						|
        {
 | 
						|
            assert(pBlocks);
 | 
						|
            assert(pBC);
 | 
						|
 | 
						|
            for (size_t blockBase = 0; blockBase < cvtt::NumParallelBlocks; blockBase++)
 | 
						|
            {
 | 
						|
                Internal::BC6HComputer::UnpackOne(pBlocks[blockBase], pBC, true);
 | 
						|
                pBC += 16;
 | 
						|
            }
 | 
						|
        }
 | 
						|
    }
 | 
						|
}
 |