mirror of
				https://github.com/godotengine/godot.git
				synced 2025-10-31 13:41:03 +00:00 
			
		
		
		
	
		
			
	
	
		
			1055 lines
		
	
	
	
		
			39 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
		
		
			
		
	
	
			1055 lines
		
	
	
	
		
			39 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
|   | /*
 | ||
|  | Convection Texture Tools | ||
|  | Copyright (c) 2018-2019 Eric Lasota | ||
|  | 
 | ||
|  | Permission is hereby granted, free of charge, to any person obtaining | ||
|  | a copy of this software and associated documentation files (the | ||
|  | "Software"), to deal in the Software without restriction, including | ||
|  | without limitation the rights to use, copy, modify, merge, publish, | ||
|  | distribute, sublicense, and/or sell copies of the Software, and to | ||
|  | permit persons to whom the Software is furnished to do so, subject | ||
|  | to the following conditions: | ||
|  | 
 | ||
|  | The above copyright notice and this permission notice shall be included | ||
|  | in all copies or substantial portions of the Software. | ||
|  | 
 | ||
|  | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | ||
|  | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
|  | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
|  | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | ||
|  | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | ||
|  | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | ||
|  | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
|  | 
 | ||
|  | ------------------------------------------------------------------------------------- | ||
|  | 
 | ||
|  | Portions based on DirectX Texture Library (DirectXTex) | ||
|  | 
 | ||
|  | Copyright (c) Microsoft Corporation. All rights reserved. | ||
|  | Licensed under the MIT License. | ||
|  | 
 | ||
|  | http://go.microsoft.com/fwlink/?LinkId=248926
 | ||
|  | */ | ||
|  | #include "ConvectionKernels_Config.h"
 | ||
|  | 
 | ||
|  | #if !defined(CVTT_SINGLE_FILE) || defined(CVTT_SINGLE_FILE_IMPL)
 | ||
|  | 
 | ||
|  | #include "ConvectionKernels_S3TC.h"
 | ||
|  | 
 | ||
|  | #include "ConvectionKernels_AggregatedError.h"
 | ||
|  | #include "ConvectionKernels_BCCommon.h"
 | ||
|  | #include "ConvectionKernels_EndpointRefiner.h"
 | ||
|  | #include "ConvectionKernels_EndpointSelector.h"
 | ||
|  | #include "ConvectionKernels_IndexSelector.h"
 | ||
|  | #include "ConvectionKernels_UnfinishedEndpoints.h"
 | ||
|  | #include "ConvectionKernels_S3TC_SingleColor.h"
 | ||
|  | 
 | ||
|  | void cvtt::Internal::S3TCComputer::Init(MFloat& error) | ||
|  | { | ||
|  |     error = ParallelMath::MakeFloat(FLT_MAX); | ||
|  | } | ||
|  | 
 | ||
|  | void cvtt::Internal::S3TCComputer::QuantizeTo6Bits(MUInt15& v) | ||
|  | { | ||
|  |     MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(253)) + ParallelMath::MakeUInt16(512), 10)); | ||
|  |     v = (reduced << 2) | ParallelMath::RightShift(reduced, 4); | ||
|  | } | ||
|  | 
 | ||
|  | void cvtt::Internal::S3TCComputer::QuantizeTo5Bits(MUInt15& v) | ||
|  | { | ||
|  |     MUInt15 reduced = ParallelMath::LosslessCast<MUInt15>::Cast(ParallelMath::RightShift(ParallelMath::CompactMultiply(v, ParallelMath::MakeUInt15(249)) + ParallelMath::MakeUInt16(1024), 11)); | ||
|  |     v = (reduced << 3) | ParallelMath::RightShift(reduced, 2); | ||
|  | } | ||
|  | 
 | ||
|  | void cvtt::Internal::S3TCComputer::QuantizeTo565(MUInt15 endPoint[3]) | ||
|  | { | ||
|  |     QuantizeTo5Bits(endPoint[0]); | ||
|  |     QuantizeTo6Bits(endPoint[1]); | ||
|  |     QuantizeTo5Bits(endPoint[2]); | ||
|  | } | ||
|  | 
 | ||
|  | cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidFactorForSpan(const MSInt16& span) | ||
|  | { | ||
|  |     return ParallelMath::Abs(ParallelMath::ToFloat(span)) * 0.03f; | ||
|  | } | ||
|  | 
 | ||
|  | cvtt::ParallelMath::Float cvtt::Internal::S3TCComputer::ParanoidDiff(const MUInt15& a, const MUInt15& b, const MFloat& d) | ||
|  | { | ||
|  |     MFloat absDiff = ParallelMath::Abs(ParallelMath::ToFloat(ParallelMath::LosslessCast<MSInt16>::Cast(a) - ParallelMath::LosslessCast<MSInt16>::Cast(b))); | ||
|  |     absDiff = absDiff + d; | ||
|  |     return absDiff * absDiff; | ||
|  | } | ||
|  | 
 | ||
|  | void cvtt::Internal::S3TCComputer::TestSingleColor(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], int range, const float* channelWeights, | ||
|  |     MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, const ParallelMath::RoundTowardNearestForScope *rtn) | ||
|  | { | ||
|  |     float channelWeightsSq[3]; | ||
|  | 
 | ||
|  |     for (int ch = 0; ch < 3; ch++) | ||
|  |         channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch]; | ||
|  | 
 | ||
|  |     MUInt15 totals[3] = { ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(0) }; | ||
|  | 
 | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |     { | ||
|  |         for (int ch = 0; ch < 3; ch++) | ||
|  |             totals[ch] = totals[ch] + pixels[px][ch]; | ||
|  |     } | ||
|  | 
 | ||
|  |     MUInt15 average[3]; | ||
|  |     for (int ch = 0; ch < 3; ch++) | ||
|  |         average[ch] = ParallelMath::RightShift(totals[ch] + ParallelMath::MakeUInt15(8), 4); | ||
|  | 
 | ||
|  |     const Tables::S3TCSC::TableEntry* rbTable = NULL; | ||
|  |     const Tables::S3TCSC::TableEntry* gTable = NULL; | ||
|  |     if (flags & cvtt::Flags::S3TC_Paranoid) | ||
|  |     { | ||
|  |         if (range == 4) | ||
|  |         { | ||
|  |             rbTable = Tables::S3TCSC::g_singleColor5_3_p; | ||
|  |             gTable = Tables::S3TCSC::g_singleColor6_3_p; | ||
|  |         } | ||
|  |         else | ||
|  |         { | ||
|  |             assert(range == 3); | ||
|  |             rbTable = Tables::S3TCSC::g_singleColor5_2_p; | ||
|  |             gTable = Tables::S3TCSC::g_singleColor6_2_p; | ||
|  |         } | ||
|  |     } | ||
|  |     else | ||
|  |     { | ||
|  |         if (range == 4) | ||
|  |         { | ||
|  |             rbTable = Tables::S3TCSC::g_singleColor5_3; | ||
|  |             gTable = Tables::S3TCSC::g_singleColor6_3; | ||
|  |         } | ||
|  |         else | ||
|  |         { | ||
|  |             assert(range == 3); | ||
|  |             rbTable = Tables::S3TCSC::g_singleColor5_2; | ||
|  |             gTable = Tables::S3TCSC::g_singleColor6_2; | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     MUInt15 interpolated[3]; | ||
|  |     MUInt15 eps[2][3]; | ||
|  |     MSInt16 spans[3]; | ||
|  |     for (int i = 0; i < ParallelMath::ParallelSize; i++) | ||
|  |     { | ||
|  |         for (int ch = 0; ch < 3; ch++) | ||
|  |         { | ||
|  |             uint16_t avg = ParallelMath::Extract(average[ch], i); | ||
|  |             const Tables::S3TCSC::TableEntry& tableEntry = ((ch == 1) ? gTable[avg] : rbTable[avg]); | ||
|  |             ParallelMath::PutUInt15(eps[0][ch], i, tableEntry.m_min); | ||
|  |             ParallelMath::PutUInt15(eps[1][ch], i, tableEntry.m_max); | ||
|  |             ParallelMath::PutUInt15(interpolated[ch], i, tableEntry.m_actualColor); | ||
|  |             ParallelMath::PutSInt16(spans[ch], i, tableEntry.m_span); | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     MFloat error = ParallelMath::MakeFloatZero(); | ||
|  |     if (flags & cvtt::Flags::S3TC_Paranoid) | ||
|  |     { | ||
|  |         MFloat spanParanoidFactors[3]; | ||
|  |         for (int ch = 0; ch < 3; ch++) | ||
|  |             spanParanoidFactors[ch] = ParanoidFactorForSpan(spans[ch]); | ||
|  | 
 | ||
|  |         for (int px = 0; px < 16; px++) | ||
|  |         { | ||
|  |             for (int ch = 0; ch < 3; ch++) | ||
|  |                 error = error + ParanoidDiff(interpolated[ch], pixels[px][ch], spanParanoidFactors[ch]) * channelWeightsSq[ch]; | ||
|  |         } | ||
|  |     } | ||
|  |     else | ||
|  |     { | ||
|  |         for (int px = 0; px < 16; px++) | ||
|  |         { | ||
|  |             for (int ch = 0; ch < 3; ch++) | ||
|  |                 error = error + ParallelMath::ToFloat(ParallelMath::SqDiffUInt8(interpolated[ch], pixels[px][ch])) * channelWeightsSq[ch]; | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError); | ||
|  |     ParallelMath::Int16CompFlag better16 = ParallelMath::FloatFlagToInt16(better); | ||
|  | 
 | ||
|  |     if (ParallelMath::AnySet(better16)) | ||
|  |     { | ||
|  |         bestError = ParallelMath::Min(bestError, error); | ||
|  |         for (int epi = 0; epi < 2; epi++) | ||
|  |             for (int ch = 0; ch < 3; ch++) | ||
|  |                 ParallelMath::ConditionalSet(bestEndpoints[epi][ch], better16, eps[epi][ch]); | ||
|  | 
 | ||
|  |         MUInt15 vindexes = ParallelMath::MakeUInt15(1); | ||
|  |         for (int px = 0; px < 16; px++) | ||
|  |             ParallelMath::ConditionalSet(bestIndexes[px], better16, vindexes); | ||
|  | 
 | ||
|  |         ParallelMath::ConditionalSet(bestRange, better16, ParallelMath::MakeUInt15(range)); | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | void cvtt::Internal::S3TCComputer::TestEndpoints(uint32_t flags, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], const MUInt15 unquantizedEndPoints[2][3], int range, const float* channelWeights, | ||
|  |     MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, EndpointRefiner<3> *refiner, const ParallelMath::RoundTowardNearestForScope *rtn) | ||
|  | { | ||
|  |     float channelWeightsSq[3]; | ||
|  | 
 | ||
|  |     for (int ch = 0; ch < 3; ch++) | ||
|  |         channelWeightsSq[ch] = channelWeights[ch] * channelWeights[ch]; | ||
|  | 
 | ||
|  |     MUInt15 endPoints[2][3]; | ||
|  | 
 | ||
|  |     for (int ep = 0; ep < 2; ep++) | ||
|  |         for (int ch = 0; ch < 3; ch++) | ||
|  |             endPoints[ep][ch] = unquantizedEndPoints[ep][ch]; | ||
|  | 
 | ||
|  |     QuantizeTo565(endPoints[0]); | ||
|  |     QuantizeTo565(endPoints[1]); | ||
|  | 
 | ||
|  |     IndexSelector<3> selector; | ||
|  |     selector.Init<false>(channelWeights, endPoints, range); | ||
|  | 
 | ||
|  |     MUInt15 indexes[16]; | ||
|  | 
 | ||
|  |     MFloat paranoidFactors[3]; | ||
|  |     for (int ch = 0; ch < 3; ch++) | ||
|  |         paranoidFactors[ch] = ParanoidFactorForSpan(ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[0][ch]) - ParallelMath::LosslessCast<MSInt16>::Cast(endPoints[1][ch])); | ||
|  | 
 | ||
|  |     MFloat error = ParallelMath::MakeFloatZero(); | ||
|  |     AggregatedError<3> aggError; | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |     { | ||
|  |         MUInt15 index = selector.SelectIndexLDR(floatPixels[px], rtn); | ||
|  |         indexes[px] = index; | ||
|  | 
 | ||
|  |         if (refiner) | ||
|  |             refiner->ContributeUnweightedPW(preWeightedPixels[px], index); | ||
|  | 
 | ||
|  |         MUInt15 reconstructed[3]; | ||
|  |         selector.ReconstructLDRPrecise(index, reconstructed); | ||
|  | 
 | ||
|  |         if (flags & Flags::S3TC_Paranoid) | ||
|  |         { | ||
|  |             for (int ch = 0; ch < 3; ch++) | ||
|  |                 error = error + ParanoidDiff(reconstructed[ch], pixels[px][ch], paranoidFactors[ch]) * channelWeightsSq[ch]; | ||
|  |         } | ||
|  |         else | ||
|  |             BCCommon::ComputeErrorLDR<3>(flags, reconstructed, pixels[px], aggError); | ||
|  |     } | ||
|  | 
 | ||
|  |     if (!(flags & Flags::S3TC_Paranoid)) | ||
|  |         error = aggError.Finalize(flags, channelWeightsSq); | ||
|  | 
 | ||
|  |     ParallelMath::FloatCompFlag better = ParallelMath::Less(error, bestError); | ||
|  | 
 | ||
|  |     if (ParallelMath::AnySet(better)) | ||
|  |     { | ||
|  |         ParallelMath::Int16CompFlag betterInt16 = ParallelMath::FloatFlagToInt16(better); | ||
|  | 
 | ||
|  |         ParallelMath::ConditionalSet(bestError, better, error); | ||
|  | 
 | ||
|  |         for (int ep = 0; ep < 2; ep++) | ||
|  |             for (int ch = 0; ch < 3; ch++) | ||
|  |                 ParallelMath::ConditionalSet(bestEndpoints[ep][ch], betterInt16, endPoints[ep][ch]); | ||
|  | 
 | ||
|  |         for (int px = 0; px < 16; px++) | ||
|  |             ParallelMath::ConditionalSet(bestIndexes[px], betterInt16, indexes[px]); | ||
|  | 
 | ||
|  |         ParallelMath::ConditionalSet(bestRange, betterInt16, ParallelMath::MakeUInt15(static_cast<uint16_t>(range))); | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | void cvtt::Internal::S3TCComputer::TestCounts(uint32_t flags, const int *counts, int nCounts, const MUInt15 &numElements, const MUInt15 pixels[16][4], const MFloat floatPixels[16][4], const MFloat preWeightedPixels[16][4], bool alphaTest, | ||
|  |     const MFloat floatSortedInputs[16][4], const MFloat preWeightedFloatSortedInputs[16][4], const float *channelWeights, MFloat &bestError, MUInt15 bestEndpoints[2][3], MUInt15 bestIndexes[16], MUInt15 &bestRange, | ||
|  |     const ParallelMath::RoundTowardNearestForScope* rtn) | ||
|  | { | ||
|  |     UNREFERENCED_PARAMETER(alphaTest); | ||
|  |     UNREFERENCED_PARAMETER(flags); | ||
|  | 
 | ||
|  |     EndpointRefiner<3> refiner; | ||
|  | 
 | ||
|  |     refiner.Init(nCounts, channelWeights); | ||
|  | 
 | ||
|  |     bool escape = false; | ||
|  |     int e = 0; | ||
|  |     for (int i = 0; i < nCounts; i++) | ||
|  |     { | ||
|  |         for (int n = 0; n < counts[i]; n++) | ||
|  |         { | ||
|  |             ParallelMath::Int16CompFlag valid = ParallelMath::Less(ParallelMath::MakeUInt15(static_cast<uint16_t>(n)), numElements); | ||
|  |             if (!ParallelMath::AnySet(valid)) | ||
|  |             { | ||
|  |                 escape = true; | ||
|  |                 break; | ||
|  |             } | ||
|  | 
 | ||
|  |             if (ParallelMath::AllSet(valid)) | ||
|  |                 refiner.ContributeUnweightedPW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i))); | ||
|  |             else | ||
|  |             { | ||
|  |                 MFloat weight = ParallelMath::Select(ParallelMath::Int16FlagToFloat(valid), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(0.0f)); | ||
|  |                 refiner.ContributePW(preWeightedFloatSortedInputs[e++], ParallelMath::MakeUInt15(static_cast<uint16_t>(i)), weight); | ||
|  |             } | ||
|  |         } | ||
|  | 
 | ||
|  |         if (escape) | ||
|  |             break; | ||
|  |     } | ||
|  | 
 | ||
|  |     MUInt15 endPoints[2][3]; | ||
|  |     refiner.GetRefinedEndpointsLDR(endPoints, rtn); | ||
|  | 
 | ||
|  |     TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, nCounts, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, NULL, rtn); | ||
|  | } | ||
|  | 
 | ||
|  | void cvtt::Internal::S3TCComputer::PackExplicitAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride) | ||
|  | { | ||
|  |     UNREFERENCED_PARAMETER(flags); | ||
|  |     ParallelMath::RoundTowardNearestForScope rtn; | ||
|  | 
 | ||
|  |     float weights[1] = { 1.0f }; | ||
|  | 
 | ||
|  |     MUInt15 pixels[16]; | ||
|  |     MFloat floatPixels[16]; | ||
|  | 
 | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |     { | ||
|  |         ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]); | ||
|  |         floatPixels[px] = ParallelMath::ToFloat(pixels[px]); | ||
|  |     } | ||
|  | 
 | ||
|  |     MUInt15 ep[2][1] = { { ParallelMath::MakeUInt15(0) },{ ParallelMath::MakeUInt15(255) } }; | ||
|  | 
 | ||
|  |     IndexSelector<1> selector; | ||
|  |     selector.Init<false>(weights, ep, 16); | ||
|  | 
 | ||
|  |     MUInt15 indexes[16]; | ||
|  | 
 | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |         indexes[px] = selector.SelectIndexLDR(&floatPixels[px], &rtn); | ||
|  | 
 | ||
|  |     for (int block = 0; block < ParallelMath::ParallelSize; block++) | ||
|  |     { | ||
|  |         for (int px = 0; px < 16; px += 2) | ||
|  |         { | ||
|  |             int index0 = ParallelMath::Extract(indexes[px], block); | ||
|  |             int index1 = ParallelMath::Extract(indexes[px + 1], block); | ||
|  | 
 | ||
|  |             packedBlocks[px / 2] = static_cast<uint8_t>(index0 | (index1 << 4)); | ||
|  |         } | ||
|  | 
 | ||
|  |         packedBlocks += packedBlockStride; | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | void cvtt::Internal::S3TCComputer::PackInterpolatedAlpha(uint32_t flags, const PixelBlockU8* inputs, int inputChannel, uint8_t* packedBlocks, size_t packedBlockStride, bool isSigned, int maxTweakRounds, int numRefineRounds) | ||
|  | { | ||
|  |     if (maxTweakRounds < 1) | ||
|  |         maxTweakRounds = 1; | ||
|  | 
 | ||
|  |     if (numRefineRounds < 1) | ||
|  |         numRefineRounds = 1; | ||
|  | 
 | ||
|  |     ParallelMath::RoundTowardNearestForScope rtn; | ||
|  | 
 | ||
|  |     float oneWeight[1] = { 1.0f }; | ||
|  | 
 | ||
|  |     MUInt15 pixels[16]; | ||
|  |     MFloat floatPixels[16]; | ||
|  | 
 | ||
|  |     MUInt15 highTerminal = isSigned ? ParallelMath::MakeUInt15(254) : ParallelMath::MakeUInt15(255); | ||
|  |     MUInt15 highTerminalMinusOne = highTerminal - ParallelMath::MakeUInt15(1); | ||
|  | 
 | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |     { | ||
|  |         ParallelMath::ConvertLDRInputs(inputs, px, inputChannel, pixels[px]); | ||
|  | 
 | ||
|  |         if (isSigned) | ||
|  |             pixels[px] = ParallelMath::Min(pixels[px], highTerminal); | ||
|  | 
 | ||
|  |         floatPixels[px] = ParallelMath::ToFloat(pixels[px]); | ||
|  |     } | ||
|  | 
 | ||
|  |     MUInt15 sortedPixels[16]; | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |         sortedPixels[px] = pixels[px]; | ||
|  | 
 | ||
|  |     for (int sortEnd = 15; sortEnd > 0; sortEnd--) | ||
|  |     { | ||
|  |         for (int sortOffset = 0; sortOffset < sortEnd; sortOffset++) | ||
|  |         { | ||
|  |             MUInt15 a = sortedPixels[sortOffset]; | ||
|  |             MUInt15 b = sortedPixels[sortOffset + 1]; | ||
|  | 
 | ||
|  |             sortedPixels[sortOffset] = ParallelMath::Min(a, b); | ||
|  |             sortedPixels[sortOffset + 1] = ParallelMath::Max(a, b); | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     MUInt15 zero = ParallelMath::MakeUInt15(0); | ||
|  |     MUInt15 one = ParallelMath::MakeUInt15(1); | ||
|  | 
 | ||
|  |     MUInt15 bestIsFullRange = zero; | ||
|  |     MFloat bestError = ParallelMath::MakeFloat(FLT_MAX); | ||
|  |     MUInt15 bestEP[2] = { zero, zero }; | ||
|  |     MUInt15 bestIndexes[16] = { | ||
|  |         zero, zero, zero, zero, | ||
|  |         zero, zero, zero, zero, | ||
|  |         zero, zero, zero, zero, | ||
|  |         zero, zero, zero, zero | ||
|  |     }; | ||
|  | 
 | ||
|  |     // Full-precision
 | ||
|  |     { | ||
|  |         MUInt15 minEP = sortedPixels[0]; | ||
|  |         MUInt15 maxEP = sortedPixels[15]; | ||
|  | 
 | ||
|  |         MFloat base[1] = { ParallelMath::ToFloat(minEP) }; | ||
|  |         MFloat offset[1] = { ParallelMath::ToFloat(maxEP - minEP) }; | ||
|  | 
 | ||
|  |         UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset); | ||
|  | 
 | ||
|  |         int numTweakRounds = BCCommon::TweakRoundsForRange(8); | ||
|  |         if (numTweakRounds > maxTweakRounds) | ||
|  |             numTweakRounds = maxTweakRounds; | ||
|  | 
 | ||
|  |         for (int tweak = 0; tweak < numTweakRounds; tweak++) | ||
|  |         { | ||
|  |             MUInt15 ep[2][1]; | ||
|  | 
 | ||
|  |             ufep.FinishLDR(tweak, 8, ep[0], ep[1]); | ||
|  | 
 | ||
|  |             for (int refinePass = 0; refinePass < numRefineRounds; refinePass++) | ||
|  |             { | ||
|  |                 EndpointRefiner<1> refiner; | ||
|  |                 refiner.Init(8, oneWeight); | ||
|  | 
 | ||
|  |                 if (isSigned) | ||
|  |                     for (int epi = 0; epi < 2; epi++) | ||
|  |                         ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal); | ||
|  | 
 | ||
|  |                 IndexSelector<1> indexSelector; | ||
|  |                 indexSelector.Init<false>(oneWeight, ep, 8); | ||
|  | 
 | ||
|  |                 MUInt15 indexes[16]; | ||
|  | 
 | ||
|  |                 AggregatedError<1> aggError; | ||
|  |                 for (int px = 0; px < 16; px++) | ||
|  |                 { | ||
|  |                     MUInt15 index = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn); | ||
|  | 
 | ||
|  |                     MUInt15 reconstructedPixel; | ||
|  | 
 | ||
|  |                     indexSelector.ReconstructLDRPrecise(index, &reconstructedPixel); | ||
|  |                     BCCommon::ComputeErrorLDR<1>(flags, &reconstructedPixel, &pixels[px], aggError); | ||
|  | 
 | ||
|  |                     if (refinePass != numRefineRounds - 1) | ||
|  |                         refiner.ContributeUnweightedPW(&floatPixels[px], index); | ||
|  | 
 | ||
|  |                     indexes[px] = index; | ||
|  |                 } | ||
|  |                 MFloat error = aggError.Finalize(flags | Flags::Uniform, oneWeight); | ||
|  | 
 | ||
|  |                 ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError); | ||
|  |                 ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); | ||
|  | 
 | ||
|  |                 if (ParallelMath::AnySet(errorBetter16)) | ||
|  |                 { | ||
|  |                     bestError = ParallelMath::Min(error, bestError); | ||
|  |                     ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, one); | ||
|  |                     for (int px = 0; px < 16; px++) | ||
|  |                         ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]); | ||
|  | 
 | ||
|  |                     for (int epi = 0; epi < 2; epi++) | ||
|  |                         ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]); | ||
|  |                 } | ||
|  | 
 | ||
|  |                 if (refinePass != numRefineRounds - 1) | ||
|  |                     refiner.GetRefinedEndpointsLDR(ep, &rtn); | ||
|  |             } | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     // Reduced precision with special endpoints
 | ||
|  |     { | ||
|  |         MUInt15 bestHeuristicMin = sortedPixels[0]; | ||
|  |         MUInt15 bestHeuristicMax = sortedPixels[15]; | ||
|  | 
 | ||
|  |         ParallelMath::Int16CompFlag canTryClipping; | ||
|  | 
 | ||
|  |         // In reduced precision, we want to try putting endpoints at the reserved indexes at the ends.
 | ||
|  |         // The heuristic we use is to assign indexes to the end as long as they aren't off by more than half of the index range.
 | ||
|  |         // This will usually not find anything, but it's cheap to check.
 | ||
|  | 
 | ||
|  |         { | ||
|  |             MUInt15 largestPossibleRange = bestHeuristicMax - bestHeuristicMin; // Max: 255
 | ||
|  |             MUInt15 lowestPossibleClearance = ParallelMath::Min(bestHeuristicMin, static_cast<MUInt15>(highTerminal - bestHeuristicMax)); | ||
|  | 
 | ||
|  |             MUInt15 lowestPossibleClearanceTimes10 = (lowestPossibleClearance << 2) + (lowestPossibleClearance << 4); | ||
|  |             canTryClipping = ParallelMath::LessOrEqual(lowestPossibleClearanceTimes10, largestPossibleRange); | ||
|  |         } | ||
|  | 
 | ||
|  |         if (ParallelMath::AnySet(canTryClipping)) | ||
|  |         { | ||
|  |             MUInt15 lowClearances[16]; | ||
|  |             MUInt15 highClearances[16]; | ||
|  |             MUInt15 bestSkipCount = ParallelMath::MakeUInt15(0); | ||
|  | 
 | ||
|  |             lowClearances[0] = highClearances[0] = ParallelMath::MakeUInt15(0); | ||
|  | 
 | ||
|  |             for (int px = 1; px < 16; px++) | ||
|  |             { | ||
|  |                 lowClearances[px] = sortedPixels[px - 1]; | ||
|  |                 highClearances[px] = highTerminal - sortedPixels[16 - px]; | ||
|  |             } | ||
|  | 
 | ||
|  |             for (uint16_t firstIndex = 0; firstIndex < 16; firstIndex++) | ||
|  |             { | ||
|  |                 uint16_t numSkippedLow = firstIndex; | ||
|  | 
 | ||
|  |                 MUInt15 lowClearance = lowClearances[firstIndex]; | ||
|  | 
 | ||
|  |                 for (uint16_t lastIndex = firstIndex; lastIndex < 16; lastIndex++) | ||
|  |                 { | ||
|  |                     uint16_t numSkippedHigh = 15 - lastIndex; | ||
|  |                     uint16_t numSkipped = numSkippedLow + numSkippedHigh; | ||
|  | 
 | ||
|  |                     MUInt15 numSkippedV = ParallelMath::MakeUInt15(numSkipped); | ||
|  | 
 | ||
|  |                     ParallelMath::Int16CompFlag areMoreSkipped = ParallelMath::Less(bestSkipCount, numSkippedV); | ||
|  | 
 | ||
|  |                     if (!ParallelMath::AnySet(areMoreSkipped)) | ||
|  |                         continue; | ||
|  | 
 | ||
|  |                     MUInt15 clearance = ParallelMath::Max(highClearances[numSkippedHigh], lowClearance); | ||
|  |                     MUInt15 clearanceTimes10 = (clearance << 2) + (clearance << 4); | ||
|  | 
 | ||
|  |                     MUInt15 range = sortedPixels[lastIndex] - sortedPixels[firstIndex]; | ||
|  | 
 | ||
|  |                     ParallelMath::Int16CompFlag isBetter = (areMoreSkipped & ParallelMath::LessOrEqual(clearanceTimes10, range)); | ||
|  |                     ParallelMath::ConditionalSet(bestHeuristicMin, isBetter, sortedPixels[firstIndex]); | ||
|  |                     ParallelMath::ConditionalSet(bestHeuristicMax, isBetter, sortedPixels[lastIndex]); | ||
|  |                 } | ||
|  |             } | ||
|  |         } | ||
|  | 
 | ||
|  |         MUInt15 bestSimpleMin = one; | ||
|  |         MUInt15 bestSimpleMax = highTerminalMinusOne; | ||
|  | 
 | ||
|  |         for (int px = 0; px < 16; px++) | ||
|  |         { | ||
|  |             ParallelMath::ConditionalSet(bestSimpleMin, ParallelMath::Less(zero, sortedPixels[15 - px]), sortedPixels[15 - px]); | ||
|  |             ParallelMath::ConditionalSet(bestSimpleMax, ParallelMath::Less(sortedPixels[px], highTerminal), sortedPixels[px]); | ||
|  |         } | ||
|  | 
 | ||
|  |         MUInt15 minEPs[2] = { bestSimpleMin, bestHeuristicMin }; | ||
|  |         MUInt15 maxEPs[2] = { bestSimpleMax, bestHeuristicMax }; | ||
|  | 
 | ||
|  |         int minEPRange = 2; | ||
|  |         if (ParallelMath::AllSet(ParallelMath::Equal(minEPs[0], minEPs[1]))) | ||
|  |             minEPRange = 1; | ||
|  | 
 | ||
|  |         int maxEPRange = 2; | ||
|  |         if (ParallelMath::AllSet(ParallelMath::Equal(maxEPs[0], maxEPs[1]))) | ||
|  |             maxEPRange = 1; | ||
|  | 
 | ||
|  |         for (int minEPIndex = 0; minEPIndex < minEPRange; minEPIndex++) | ||
|  |         { | ||
|  |             for (int maxEPIndex = 0; maxEPIndex < maxEPRange; maxEPIndex++) | ||
|  |             { | ||
|  |                 MFloat base[1] = { ParallelMath::ToFloat(minEPs[minEPIndex]) }; | ||
|  |                 MFloat offset[1] = { ParallelMath::ToFloat(maxEPs[maxEPIndex] - minEPs[minEPIndex]) }; | ||
|  | 
 | ||
|  |                 UnfinishedEndpoints<1> ufep = UnfinishedEndpoints<1>(base, offset); | ||
|  | 
 | ||
|  |                 int numTweakRounds = BCCommon::TweakRoundsForRange(6); | ||
|  |                 if (numTweakRounds > maxTweakRounds) | ||
|  |                     numTweakRounds = maxTweakRounds; | ||
|  | 
 | ||
|  |                 for (int tweak = 0; tweak < numTweakRounds; tweak++) | ||
|  |                 { | ||
|  |                     MUInt15 ep[2][1]; | ||
|  | 
 | ||
|  |                     ufep.FinishLDR(tweak, 8, ep[0], ep[1]); | ||
|  | 
 | ||
|  |                     for (int refinePass = 0; refinePass < numRefineRounds; refinePass++) | ||
|  |                     { | ||
|  |                         EndpointRefiner<1> refiner; | ||
|  |                         refiner.Init(6, oneWeight); | ||
|  | 
 | ||
|  |                         if (isSigned) | ||
|  |                             for (int epi = 0; epi < 2; epi++) | ||
|  |                                 ep[epi][0] = ParallelMath::Min(ep[epi][0], highTerminal); | ||
|  | 
 | ||
|  |                         IndexSelector<1> indexSelector; | ||
|  |                         indexSelector.Init<false>(oneWeight, ep, 6); | ||
|  | 
 | ||
|  |                         MUInt15 indexes[16]; | ||
|  |                         MFloat error = ParallelMath::MakeFloatZero(); | ||
|  | 
 | ||
|  |                         for (int px = 0; px < 16; px++) | ||
|  |                         { | ||
|  |                             MUInt15 selectedIndex = indexSelector.SelectIndexLDR(&floatPixels[px], &rtn); | ||
|  | 
 | ||
|  |                             MUInt15 reconstructedPixel; | ||
|  | 
 | ||
|  |                             indexSelector.ReconstructLDRPrecise(selectedIndex, &reconstructedPixel); | ||
|  | 
 | ||
|  |                             MFloat zeroError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &zero, &pixels[px], 1, oneWeight); | ||
|  |                             MFloat highTerminalError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &highTerminal, &pixels[px], 1, oneWeight); | ||
|  |                             MFloat selectedIndexError = BCCommon::ComputeErrorLDRSimple<1>(flags | Flags::Uniform, &reconstructedPixel, &pixels[px], 1, oneWeight); | ||
|  | 
 | ||
|  |                             MFloat bestPixelError = zeroError; | ||
|  |                             MUInt15 index = ParallelMath::MakeUInt15(6); | ||
|  | 
 | ||
|  |                             ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(ParallelMath::Less(highTerminalError, bestPixelError)), ParallelMath::MakeUInt15(7)); | ||
|  |                             bestPixelError = ParallelMath::Min(bestPixelError, highTerminalError); | ||
|  | 
 | ||
|  |                             ParallelMath::FloatCompFlag selectedIndexBetter = ParallelMath::Less(selectedIndexError, bestPixelError); | ||
|  | 
 | ||
|  |                             if (ParallelMath::AllSet(selectedIndexBetter)) | ||
|  |                             { | ||
|  |                                 if (refinePass != numRefineRounds - 1) | ||
|  |                                     refiner.ContributeUnweightedPW(&floatPixels[px], selectedIndex); | ||
|  |                             } | ||
|  |                             else | ||
|  |                             { | ||
|  |                                 MFloat refineWeight = ParallelMath::Select(selectedIndexBetter, ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloatZero()); | ||
|  | 
 | ||
|  |                                 if (refinePass != numRefineRounds - 1) | ||
|  |                                     refiner.ContributePW(&floatPixels[px], selectedIndex, refineWeight); | ||
|  |                             } | ||
|  | 
 | ||
|  |                             ParallelMath::ConditionalSet(index, ParallelMath::FloatFlagToInt16(selectedIndexBetter), selectedIndex); | ||
|  |                             bestPixelError = ParallelMath::Min(bestPixelError, selectedIndexError); | ||
|  | 
 | ||
|  |                             error = error + bestPixelError; | ||
|  | 
 | ||
|  |                             indexes[px] = index; | ||
|  |                         } | ||
|  | 
 | ||
|  |                         ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(error, bestError); | ||
|  |                         ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); | ||
|  | 
 | ||
|  |                         if (ParallelMath::AnySet(errorBetter16)) | ||
|  |                         { | ||
|  |                             bestError = ParallelMath::Min(error, bestError); | ||
|  |                             ParallelMath::ConditionalSet(bestIsFullRange, errorBetter16, zero); | ||
|  |                             for (int px = 0; px < 16; px++) | ||
|  |                                 ParallelMath::ConditionalSet(bestIndexes[px], errorBetter16, indexes[px]); | ||
|  | 
 | ||
|  |                             for (int epi = 0; epi < 2; epi++) | ||
|  |                                 ParallelMath::ConditionalSet(bestEP[epi], errorBetter16, ep[epi][0]); | ||
|  |                         } | ||
|  | 
 | ||
|  |                         if (refinePass != numRefineRounds - 1) | ||
|  |                             refiner.GetRefinedEndpointsLDR(ep, &rtn); | ||
|  |                     } | ||
|  |                 } | ||
|  |             } | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     for (int block = 0; block < ParallelMath::ParallelSize; block++) | ||
|  |     { | ||
|  |         int ep0 = ParallelMath::Extract(bestEP[0], block); | ||
|  |         int ep1 = ParallelMath::Extract(bestEP[1], block); | ||
|  |         int isFullRange = ParallelMath::Extract(bestIsFullRange, block); | ||
|  | 
 | ||
|  |         if (isSigned) | ||
|  |         { | ||
|  |             ep0 -= 127; | ||
|  |             ep1 -= 127; | ||
|  | 
 | ||
|  |             assert(ep0 >= -127 && ep0 <= 127); | ||
|  |             assert(ep1 >= -127 && ep1 <= 127); | ||
|  |         } | ||
|  | 
 | ||
|  | 
 | ||
|  |         bool swapEndpoints = (isFullRange != 0) != (ep0 > ep1); | ||
|  | 
 | ||
|  |         if (swapEndpoints) | ||
|  |             std::swap(ep0, ep1); | ||
|  | 
 | ||
|  |         uint16_t dumpBits = 0; | ||
|  |         int dumpBitsOffset = 0; | ||
|  |         int dumpByteOffset = 2; | ||
|  |         packedBlocks[0] = static_cast<uint8_t>(ep0 & 0xff); | ||
|  |         packedBlocks[1] = static_cast<uint8_t>(ep1 & 0xff); | ||
|  | 
 | ||
|  |         int maxValue = (isFullRange != 0) ? 7 : 5; | ||
|  | 
 | ||
|  |         for (int px = 0; px < 16; px++) | ||
|  |         { | ||
|  |             int index = ParallelMath::Extract(bestIndexes[px], block); | ||
|  | 
 | ||
|  |             if (swapEndpoints && index <= maxValue) | ||
|  |                 index = maxValue - index; | ||
|  | 
 | ||
|  |             if (index != 0) | ||
|  |             { | ||
|  |                 if (index == maxValue) | ||
|  |                     index = 1; | ||
|  |                 else if (index < maxValue) | ||
|  |                     index++; | ||
|  |             } | ||
|  | 
 | ||
|  |             assert(index >= 0 && index < 8); | ||
|  | 
 | ||
|  |             dumpBits |= static_cast<uint16_t>(index << dumpBitsOffset); | ||
|  |             dumpBitsOffset += 3; | ||
|  | 
 | ||
|  |             if (dumpBitsOffset >= 8) | ||
|  |             { | ||
|  |                 assert(dumpByteOffset < 8); | ||
|  |                 packedBlocks[dumpByteOffset] = static_cast<uint8_t>(dumpBits & 0xff); | ||
|  |                 dumpBits >>= 8; | ||
|  |                 dumpBitsOffset -= 8; | ||
|  |                 dumpByteOffset++; | ||
|  |             } | ||
|  |         } | ||
|  | 
 | ||
|  |         assert(dumpBitsOffset == 0); | ||
|  |         assert(dumpByteOffset == 8); | ||
|  | 
 | ||
|  |         packedBlocks += packedBlockStride; | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | void cvtt::Internal::S3TCComputer::PackRGB(uint32_t flags, const PixelBlockU8* inputs, uint8_t* packedBlocks, size_t packedBlockStride, const float channelWeights[4], bool alphaTest, float alphaThreshold, bool exhaustive, int maxTweakRounds, int numRefineRounds) | ||
|  | { | ||
|  |     ParallelMath::RoundTowardNearestForScope rtn; | ||
|  | 
 | ||
|  |     if (numRefineRounds < 1) | ||
|  |         numRefineRounds = 1; | ||
|  | 
 | ||
|  |     if (maxTweakRounds < 1) | ||
|  |         maxTweakRounds = 1; | ||
|  | 
 | ||
|  |     EndpointSelector<3, 8> endpointSelector; | ||
|  | 
 | ||
|  |     MUInt15 pixels[16][4]; | ||
|  |     MFloat floatPixels[16][4]; | ||
|  | 
 | ||
|  |     MFloat preWeightedPixels[16][4]; | ||
|  | 
 | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |     { | ||
|  |         for (int ch = 0; ch < 4; ch++) | ||
|  |             ParallelMath::ConvertLDRInputs(inputs, px, ch, pixels[px][ch]); | ||
|  |     } | ||
|  | 
 | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |     { | ||
|  |         for (int ch = 0; ch < 4; ch++) | ||
|  |             floatPixels[px][ch] = ParallelMath::ToFloat(pixels[px][ch]); | ||
|  |     } | ||
|  | 
 | ||
|  |     if (alphaTest) | ||
|  |     { | ||
|  |         MUInt15 threshold = ParallelMath::MakeUInt15(static_cast<uint16_t>(floor(alphaThreshold * 255.0f + 0.5f))); | ||
|  | 
 | ||
|  |         for (int px = 0; px < 16; px++) | ||
|  |         { | ||
|  |             ParallelMath::Int16CompFlag belowThreshold = ParallelMath::Less(pixels[px][3], threshold); | ||
|  |             pixels[px][3] = ParallelMath::Select(belowThreshold, ParallelMath::MakeUInt15(0), ParallelMath::MakeUInt15(255)); | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     BCCommon::PreWeightPixelsLDR<4>(preWeightedPixels, pixels, channelWeights); | ||
|  | 
 | ||
|  |     MUInt15 minAlpha = ParallelMath::MakeUInt15(255); | ||
|  | 
 | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |         minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]); | ||
|  | 
 | ||
|  |     MFloat pixelWeights[16]; | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |     { | ||
|  |         pixelWeights[px] = ParallelMath::MakeFloat(1.0f); | ||
|  |         if (alphaTest) | ||
|  |         { | ||
|  |             ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255)); | ||
|  | 
 | ||
|  |             ParallelMath::ConditionalSet(pixelWeights[px], ParallelMath::Int16FlagToFloat(isTransparent), ParallelMath::MakeFloatZero()); | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     for (int pass = 0; pass < NumEndpointSelectorPasses; pass++) | ||
|  |     { | ||
|  |         for (int px = 0; px < 16; px++) | ||
|  |             endpointSelector.ContributePass(preWeightedPixels[px], pass, pixelWeights[px]); | ||
|  | 
 | ||
|  |         endpointSelector.FinishPass(pass); | ||
|  |     } | ||
|  | 
 | ||
|  |     UnfinishedEndpoints<3> ufep = endpointSelector.GetEndpoints(channelWeights); | ||
|  | 
 | ||
|  |     MUInt15 bestEndpoints[2][3]; | ||
|  |     MUInt15 bestIndexes[16]; | ||
|  |     MUInt15 bestRange = ParallelMath::MakeUInt15(0); | ||
|  |     MFloat bestError = ParallelMath::MakeFloat(FLT_MAX); | ||
|  | 
 | ||
|  |     for (int px = 0; px < 16; px++) | ||
|  |         bestIndexes[px] = ParallelMath::MakeUInt15(0); | ||
|  | 
 | ||
|  |     for (int ep = 0; ep < 2; ep++) | ||
|  |         for (int ch = 0; ch < 3; ch++) | ||
|  |             bestEndpoints[ep][ch] = ParallelMath::MakeUInt15(0); | ||
|  | 
 | ||
|  |     if (exhaustive) | ||
|  |     { | ||
|  |         MSInt16 sortBins[16]; | ||
|  | 
 | ||
|  |         { | ||
|  |             // Compute an 11-bit index, change it to signed, stuff it in the high bits of the sort bins,
 | ||
|  |             // and pack the original indexes into the low bits.
 | ||
|  | 
 | ||
|  |             MUInt15 sortEP[2][3]; | ||
|  |             ufep.FinishLDR(0, 11, sortEP[0], sortEP[1]); | ||
|  | 
 | ||
|  |             IndexSelector<3> sortSelector; | ||
|  |             sortSelector.Init<false>(channelWeights, sortEP, 1 << 11); | ||
|  | 
 | ||
|  |             for (int16_t px = 0; px < 16; px++) | ||
|  |             { | ||
|  |                 MSInt16 sortBin = ParallelMath::LosslessCast<MSInt16>::Cast(sortSelector.SelectIndexLDR(floatPixels[px], &rtn) << 4); | ||
|  | 
 | ||
|  |                 if (alphaTest) | ||
|  |                 { | ||
|  |                     ParallelMath::Int16CompFlag isTransparent = ParallelMath::Less(pixels[px][3], ParallelMath::MakeUInt15(255)); | ||
|  | 
 | ||
|  |                     ParallelMath::ConditionalSet(sortBin, isTransparent, ParallelMath::MakeSInt16(-16)); // 0xfff0
 | ||
|  |                 } | ||
|  | 
 | ||
|  |                 sortBin = sortBin + ParallelMath::MakeSInt16(px); | ||
|  | 
 | ||
|  |                 sortBins[px] = sortBin; | ||
|  |             } | ||
|  |         } | ||
|  | 
 | ||
|  |         // Sort bins
 | ||
|  |         for (int sortEnd = 1; sortEnd < 16; sortEnd++) | ||
|  |         { | ||
|  |             for (int sortLoc = sortEnd; sortLoc > 0; sortLoc--) | ||
|  |             { | ||
|  |                 MSInt16 a = sortBins[sortLoc]; | ||
|  |                 MSInt16 b = sortBins[sortLoc - 1]; | ||
|  | 
 | ||
|  |                 sortBins[sortLoc] = ParallelMath::Max(a, b); | ||
|  |                 sortBins[sortLoc - 1] = ParallelMath::Min(a, b); | ||
|  |             } | ||
|  |         } | ||
|  | 
 | ||
|  |         MUInt15 firstElement = ParallelMath::MakeUInt15(0); | ||
|  |         for (uint16_t e = 0; e < 16; e++) | ||
|  |         { | ||
|  |             ParallelMath::Int16CompFlag isInvalid = ParallelMath::Less(sortBins[e], ParallelMath::MakeSInt16(0)); | ||
|  |             ParallelMath::ConditionalSet(firstElement, isInvalid, ParallelMath::MakeUInt15(e + 1)); | ||
|  |             if (!ParallelMath::AnySet(isInvalid)) | ||
|  |                 break; | ||
|  |         } | ||
|  | 
 | ||
|  |         MUInt15 numElements = ParallelMath::MakeUInt15(16) - firstElement; | ||
|  | 
 | ||
|  |         MUInt15 sortedInputs[16][4]; | ||
|  |         MFloat floatSortedInputs[16][4]; | ||
|  |         MFloat pwFloatSortedInputs[16][4]; | ||
|  | 
 | ||
|  |         for (int e = 0; e < 16; e++) | ||
|  |         { | ||
|  |             for (int ch = 0; ch < 4; ch++) | ||
|  |                 sortedInputs[e][ch] = ParallelMath::MakeUInt15(0); | ||
|  |         } | ||
|  | 
 | ||
|  |         for (int block = 0; block < ParallelMath::ParallelSize; block++) | ||
|  |         { | ||
|  |             for (int e = ParallelMath::Extract(firstElement, block); e < 16; e++) | ||
|  |             { | ||
|  |                 ParallelMath::ScalarUInt16 sortBin = ParallelMath::Extract(sortBins[e], block); | ||
|  |                 int originalIndex = (sortBin & 15); | ||
|  | 
 | ||
|  |                 for (int ch = 0; ch < 4; ch++) | ||
|  |                     ParallelMath::PutUInt15(sortedInputs[15 - e][ch], block, ParallelMath::Extract(pixels[originalIndex][ch], block)); | ||
|  |             } | ||
|  |         } | ||
|  | 
 | ||
|  |         for (int e = 0; e < 16; e++) | ||
|  |         { | ||
|  |             for (int ch = 0; ch < 4; ch++) | ||
|  |             { | ||
|  |                 MFloat f = ParallelMath::ToFloat(sortedInputs[e][ch]); | ||
|  |                 floatSortedInputs[e][ch] = f; | ||
|  |                 pwFloatSortedInputs[e][ch] = f * channelWeights[ch]; | ||
|  |             } | ||
|  |         } | ||
|  | 
 | ||
|  |         for (int n0 = 0; n0 <= 15; n0++) | ||
|  |         { | ||
|  |             int remainingFor1 = 16 - n0; | ||
|  |             if (remainingFor1 == 16) | ||
|  |                 remainingFor1 = 15; | ||
|  | 
 | ||
|  |             for (int n1 = 0; n1 <= remainingFor1; n1++) | ||
|  |             { | ||
|  |                 int remainingFor2 = 16 - n1 - n0; | ||
|  |                 if (remainingFor2 == 16) | ||
|  |                     remainingFor2 = 15; | ||
|  | 
 | ||
|  |                 for (int n2 = 0; n2 <= remainingFor2; n2++) | ||
|  |                 { | ||
|  |                     int n3 = 16 - n2 - n1 - n0; | ||
|  | 
 | ||
|  |                     if (n3 == 16) | ||
|  |                         continue; | ||
|  | 
 | ||
|  |                     int counts[4] = { n0, n1, n2, n3 }; | ||
|  | 
 | ||
|  |                     TestCounts(flags, counts, 4, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); | ||
|  |                 } | ||
|  |             } | ||
|  |         } | ||
|  | 
 | ||
|  |         TestSingleColor(flags, pixels, floatPixels, 4, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); | ||
|  | 
 | ||
|  |         if (alphaTest) | ||
|  |         { | ||
|  |             for (int n0 = 0; n0 <= 15; n0++) | ||
|  |             { | ||
|  |                 int remainingFor1 = 16 - n0; | ||
|  |                 if (remainingFor1 == 16) | ||
|  |                     remainingFor1 = 15; | ||
|  | 
 | ||
|  |                 for (int n1 = 0; n1 <= remainingFor1; n1++) | ||
|  |                 { | ||
|  |                     int n2 = 16 - n1 - n0; | ||
|  | 
 | ||
|  |                     if (n2 == 16) | ||
|  |                         continue; | ||
|  | 
 | ||
|  |                     int counts[3] = { n0, n1, n2 }; | ||
|  | 
 | ||
|  |                     TestCounts(flags, counts, 3, numElements, pixels, floatPixels, preWeightedPixels, alphaTest, floatSortedInputs, pwFloatSortedInputs, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); | ||
|  |                 } | ||
|  |             } | ||
|  | 
 | ||
|  |             TestSingleColor(flags, pixels, floatPixels, 3, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &rtn); | ||
|  |         } | ||
|  |     } | ||
|  |     else | ||
|  |     { | ||
|  |         int minRange = alphaTest ? 3 : 4; | ||
|  | 
 | ||
|  |         for (int range = minRange; range <= 4; range++) | ||
|  |         { | ||
|  |             int tweakRounds = BCCommon::TweakRoundsForRange(range); | ||
|  |             if (tweakRounds > maxTweakRounds) | ||
|  |                 tweakRounds = maxTweakRounds; | ||
|  | 
 | ||
|  |             for (int tweak = 0; tweak < tweakRounds; tweak++) | ||
|  |             { | ||
|  |                 MUInt15 endPoints[2][3]; | ||
|  | 
 | ||
|  |                 ufep.FinishLDR(tweak, range, endPoints[0], endPoints[1]); | ||
|  | 
 | ||
|  |                 for (int refine = 0; refine < numRefineRounds; refine++) | ||
|  |                 { | ||
|  |                     EndpointRefiner<3> refiner; | ||
|  |                     refiner.Init(range, channelWeights); | ||
|  | 
 | ||
|  |                     TestEndpoints(flags, pixels, floatPixels, preWeightedPixels, endPoints, range, channelWeights, bestError, bestEndpoints, bestIndexes, bestRange, &refiner, &rtn); | ||
|  | 
 | ||
|  |                     if (refine != numRefineRounds - 1) | ||
|  |                         refiner.GetRefinedEndpointsLDR(endPoints, &rtn); | ||
|  |                 } | ||
|  |             } | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     for (int block = 0; block < ParallelMath::ParallelSize; block++) | ||
|  |     { | ||
|  |         ParallelMath::ScalarUInt16 range = ParallelMath::Extract(bestRange, block); | ||
|  |         assert(range == 3 || range == 4); | ||
|  | 
 | ||
|  |         ParallelMath::ScalarUInt16 compressedEP[2]; | ||
|  |         for (int ep = 0; ep < 2; ep++) | ||
|  |         { | ||
|  |             ParallelMath::ScalarUInt16 endPoint[3]; | ||
|  |             for (int ch = 0; ch < 3; ch++) | ||
|  |                 endPoint[ch] = ParallelMath::Extract(bestEndpoints[ep][ch], block); | ||
|  | 
 | ||
|  |             int compressed = (endPoint[0] & 0xf8) << 8; | ||
|  |             compressed |= (endPoint[1] & 0xfc) << 3; | ||
|  |             compressed |= (endPoint[2] & 0xf8) >> 3; | ||
|  | 
 | ||
|  |             compressedEP[ep] = static_cast<ParallelMath::ScalarUInt16>(compressed); | ||
|  |         } | ||
|  | 
 | ||
|  |         int indexOrder[4]; | ||
|  | 
 | ||
|  |         if (range == 4) | ||
|  |         { | ||
|  |             if (compressedEP[0] == compressedEP[1]) | ||
|  |             { | ||
|  |                 indexOrder[0] = 0; | ||
|  |                 indexOrder[1] = 0; | ||
|  |                 indexOrder[2] = 0; | ||
|  |                 indexOrder[3] = 0; | ||
|  |             } | ||
|  |             else if (compressedEP[0] < compressedEP[1]) | ||
|  |             { | ||
|  |                 std::swap(compressedEP[0], compressedEP[1]); | ||
|  |                 indexOrder[0] = 1; | ||
|  |                 indexOrder[1] = 3; | ||
|  |                 indexOrder[2] = 2; | ||
|  |                 indexOrder[3] = 0; | ||
|  |             } | ||
|  |             else | ||
|  |             { | ||
|  |                 indexOrder[0] = 0; | ||
|  |                 indexOrder[1] = 2; | ||
|  |                 indexOrder[2] = 3; | ||
|  |                 indexOrder[3] = 1; | ||
|  |             } | ||
|  |         } | ||
|  |         else | ||
|  |         { | ||
|  |             assert(range == 3); | ||
|  | 
 | ||
|  |             if (compressedEP[0] > compressedEP[1]) | ||
|  |             { | ||
|  |                 std::swap(compressedEP[0], compressedEP[1]); | ||
|  |                 indexOrder[0] = 1; | ||
|  |                 indexOrder[1] = 2; | ||
|  |                 indexOrder[2] = 0; | ||
|  |             } | ||
|  |             else | ||
|  |             { | ||
|  |                 indexOrder[0] = 0; | ||
|  |                 indexOrder[1] = 2; | ||
|  |                 indexOrder[2] = 1; | ||
|  |             } | ||
|  |             indexOrder[3] = 3; | ||
|  |         } | ||
|  | 
 | ||
|  |         packedBlocks[0] = static_cast<uint8_t>(compressedEP[0] & 0xff); | ||
|  |         packedBlocks[1] = static_cast<uint8_t>((compressedEP[0] >> 8) & 0xff); | ||
|  |         packedBlocks[2] = static_cast<uint8_t>(compressedEP[1] & 0xff); | ||
|  |         packedBlocks[3] = static_cast<uint8_t>((compressedEP[1] >> 8) & 0xff); | ||
|  | 
 | ||
|  |         for (int i = 0; i < 16; i += 4) | ||
|  |         { | ||
|  |             int packedIndexes = 0; | ||
|  |             for (int subi = 0; subi < 4; subi++) | ||
|  |             { | ||
|  |                 ParallelMath::ScalarUInt16 index = ParallelMath::Extract(bestIndexes[i + subi], block); | ||
|  |                 packedIndexes |= (indexOrder[index] << (subi * 2)); | ||
|  |             } | ||
|  | 
 | ||
|  |             packedBlocks[4 + i / 4] = static_cast<uint8_t>(packedIndexes); | ||
|  |         } | ||
|  | 
 | ||
|  |         packedBlocks += packedBlockStride; | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | #endif
 |