// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #pragma once
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #include "parallel_for.h"
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | namespace embree | 
					
						
							|  |  |  | { | 
					
						
							|  |  |  |   template<typename Value> | 
					
						
							|  |  |  |     struct ParallelPrefixSumState  | 
					
						
							|  |  |  |   { | 
					
						
							|  |  |  |     enum { MAX_TASKS = 64 }; | 
					
						
							|  |  |  |     Value counts[MAX_TASKS]; | 
					
						
							|  |  |  |     Value sums  [MAX_TASKS]; | 
					
						
							|  |  |  |   }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   template<typename Index, typename Value, typename Func, typename Reduction> | 
					
						
							|  |  |  |     __forceinline Value parallel_prefix_sum( ParallelPrefixSumState<Value>& state, Index first, Index last, Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction) | 
					
						
							|  |  |  |   { | 
					
						
							|  |  |  |     /* calculate number of tasks to use */ | 
					
						
							|  |  |  |     const size_t numThreads = TaskScheduler::threadCount(); | 
					
						
							|  |  |  |     const size_t numBlocks  = (last-first+minStepSize-1)/minStepSize; | 
					
						
							|  |  |  |     const size_t taskCount  = min(numThreads,numBlocks,size_t(ParallelPrefixSumState<Value>::MAX_TASKS)); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* perform parallel prefix sum */ | 
					
						
							|  |  |  |     parallel_for(taskCount, [&](const size_t taskIndex) | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |       const size_t i0 = first+(taskIndex+0)*(last-first)/taskCount; | 
					
						
							|  |  |  |       const size_t i1 = first+(taskIndex+1)*(last-first)/taskCount; | 
					
						
							|  |  |  |       state.counts[taskIndex] = func(range<size_t>(i0,i1),state.sums[taskIndex]); | 
					
						
							|  |  |  |     }); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     /* calculate prefix sum */ | 
					
						
							|  |  |  |     Value sum=identity; | 
					
						
							|  |  |  |     for (size_t i=0; i<taskCount; i++)  | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |       const Value c = state.counts[i]; | 
					
						
							|  |  |  |       state.sums[i] = sum; | 
					
						
							|  |  |  |       sum=reduction(sum,c); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return sum; | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |   /*! parallel calculation of prefix sums */ | 
					
						
							|  |  |  |   template<typename SrcArray, typename DstArray, typename Value, typename Add> | 
					
						
							|  |  |  |     __forceinline Value parallel_prefix_sum(const SrcArray& src, DstArray& dst, size_t N, const Value& identity, const Add& add, const size_t SINGLE_THREAD_THRESHOLD = 4096)  | 
					
						
							|  |  |  |   { | 
					
						
							|  |  |  |     /* perform single threaded prefix operation for small N */ | 
					
						
							|  |  |  |     if (N < SINGLE_THREAD_THRESHOLD)  | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |       Value sum=identity; | 
					
						
							|  |  |  |       for (size_t i=0; i<N; sum=add(sum,src[i++])) dst[i] = sum; | 
					
						
							|  |  |  |       return sum; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |      | 
					
						
							|  |  |  |     /* perform parallel prefix operation for large N */ | 
					
						
							|  |  |  |     else  | 
					
						
							|  |  |  |     { | 
					
						
							|  |  |  |       ParallelPrefixSumState<Value> state; | 
					
						
							|  |  |  |        | 
					
						
							|  |  |  |       /* initial run just sets up start values for subtasks */ | 
					
						
							|  |  |  |       parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { | 
					
						
							|  |  |  |            | 
					
						
							|  |  |  |           Value s = identity; | 
					
						
							|  |  |  |           for (size_t i=r.begin(); i<r.end(); i++) s = add(s,src[i]); | 
					
						
							|  |  |  |           return s; | 
					
						
							|  |  |  |            | 
					
						
							|  |  |  |         }, add); | 
					
						
							|  |  |  |        | 
					
						
							|  |  |  |       /* final run calculates prefix sum */ | 
					
						
							|  |  |  |       return parallel_prefix_sum( state, size_t(0), size_t(N), size_t(1024), identity, [&](const range<size_t>& r, const Value& sum) -> Value { | 
					
						
							|  |  |  |            | 
					
						
							|  |  |  |           Value s = identity; | 
					
						
							|  |  |  |           for (size_t i=r.begin(); i<r.end(); i++) { | 
					
						
							|  |  |  |             dst[i] = add(sum,s); | 
					
						
							|  |  |  |             s = add(s,src[i]); | 
					
						
							|  |  |  |           } | 
					
						
							|  |  |  |           return s; | 
					
						
							|  |  |  |            | 
					
						
							|  |  |  |         }, add); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  |   } | 
					
						
							|  |  |  | } |