// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "../common/default.h"

/* force a complete cache invalidation when running out of allocation space */
#define FORCE_SIMPLE_FLUSH 0

/* a per-thread lock counter at or above this value signals that a cache flush/sync phase is in progress (see lockThreadLoop below) */
#define THREAD_BLOCK_ATOMIC_ADD 4
/* cache statistics are compiled out in both configurations; change a branch
   to `#define CACHE_STATS(x) x` to gather them */
#if defined(DEBUG)
#define CACHE_STATS(x)
#else
#define CACHE_STATS(x)
#endif

namespace embree
{
  class SharedTessellationCacheStats
  {
  public:
    /* stats */
    static std::atomic<size_t> cache_accesses;
    static std::atomic<size_t> cache_hits;
    static std::atomic<size_t> cache_misses;
    static std::atomic<size_t> cache_flushes;
    static size_t        cache_num_patches;
    __aligned(64) static SpinLock mtx;

    /* print stats for debugging */
    static void printStats();
    static void clearStats();
  };

  void resizeTessellationCache(size_t new_size);
  void resetTessellationCache();

 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////

 struct __aligned(64) ThreadWorkState
 {
   ALIGNED_STRUCT_(64);

   std::atomic<size_t> counter;
   ThreadWorkState* next;
   bool allocated;

   __forceinline ThreadWorkState(bool allocated = false)
     : counter(0), next(nullptr), allocated(allocated)
   {
     assert( ((size_t)this % 64) == 0 );
   }
 };

 class __aligned(64) SharedLazyTessellationCache
 {
 public:

   static const size_t NUM_CACHE_SEGMENTS              = 8;
   static const size_t NUM_PREALLOC_THREAD_WORK_STATES = 512;
   static const size_t COMMIT_INDEX_SHIFT              = 32+8;
#if defined(__64BIT__)
   static const size_t REF_TAG_MASK                    = 0xffffffffff;
#else
   static const size_t REF_TAG_MASK                    = 0x7FFFFFFF;
#endif
   static const size_t MAX_TESSELLATION_CACHE_SIZE     = REF_TAG_MASK+1;
   static const size_t BLOCK_SIZE                      = 64;

   /*! Per thread tessellation ref cache */
   static __thread ThreadWorkState* init_t_state;
   static ThreadWorkState* current_t_state;

   static __forceinline ThreadWorkState *threadState()
   {
     if (unlikely(!init_t_state))
       /* sets init_t_state, can't return pointer due to macosx icc bug */
       SharedLazyTessellationCache::sharedLazyTessellationCache.getNextRenderThreadWorkState();
     return init_t_state;
   }

   struct Tag
   {
     __forceinline Tag() : data(0) {}

     __forceinline Tag(void* ptr, size_t combinedTime) {
       init(ptr,combinedTime);
     }

     __forceinline Tag(size_t ptr, size_t combinedTime) {
       init((void*)ptr,combinedTime);
     }

     __forceinline void init(void* ptr, size_t combinedTime)
     {
       if (ptr == nullptr) {
         data = 0;
         return;
       }
       int64_t new_root_ref = (int64_t) ptr;
       new_root_ref -= (int64_t)SharedLazyTessellationCache::sharedLazyTessellationCache.getDataPtr();
       assert( new_root_ref <= (int64_t)REF_TAG_MASK );
       new_root_ref |= (int64_t)combinedTime << COMMIT_INDEX_SHIFT;
       data = new_root_ref;
     }

     __forceinline int64_t get() const { return data.load(); }
     __forceinline void set( int64_t v ) { data.store(v); }
     __forceinline void reset() { data.store(0); }

   private:
     atomic<int64_t> data;
   };
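
   /* Worked example of the encoding produced by Tag::init() above
      (illustrative values, not from the source): on 64-bit targets the low
      40 bits (REF_TAG_MASK) hold the offset of the cached data relative to
      getDataPtr(), and the bits from COMMIT_INDEX_SHIFT (32+8 == 40) upward
      hold the combined time stamp:

        offset = 0x1000, combinedTime = 3
        data   = (3 << 40) | 0x1000 = 0x30000001000                          */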

   static __forceinline size_t extractCommitIndex(const int64_t v) { return v >> SharedLazyTessellationCache::COMMIT_INDEX_SHIFT; }

   struct CacheEntry
   {
     Tag tag;
     SpinLock mutex;
   };

 private:

   float *data;
   bool hugepages;
   size_t size;
   size_t maxBlocks;
   ThreadWorkState *threadWorkState;

   __aligned(64) std::atomic<size_t> localTime;
   __aligned(64) std::atomic<size_t> next_block;
   __aligned(64) SpinLock   reset_state;
   __aligned(64) SpinLock   linkedlist_mtx;
   __aligned(64) std::atomic<size_t> switch_block_threshold;
   __aligned(64) std::atomic<size_t> numRenderThreads;

 public:

   SharedLazyTessellationCache();
   ~SharedLazyTessellationCache();

   void getNextRenderThreadWorkState();

   __forceinline size_t maxAllocSize() const {
     return switch_block_threshold;
   }

   __forceinline size_t getCurrentIndex() { return localTime.load(); }
   __forceinline void   addCurrentIndex(const size_t i=1) { localTime.fetch_add(i); }

   __forceinline size_t getTime(const size_t globalTime) {
     return localTime.load()+NUM_CACHE_SEGMENTS*globalTime;
   }

   __forceinline size_t lockThread  (ThreadWorkState *const t_state, const ssize_t plus=1) { return t_state->counter.fetch_add(plus); }
   __forceinline size_t unlockThread(ThreadWorkState *const t_state, const ssize_t plus=-1) { assert(isLocked(t_state)); return t_state->counter.fetch_add(plus); }

   __forceinline bool isLocked(ThreadWorkState *const t_state) { return t_state->counter.load() != 0; }

   static __forceinline void lock  () { sharedLazyTessellationCache.lockThread(threadState()); }
   static __forceinline void unlock() { sharedLazyTessellationCache.unlockThread(threadState()); }
   static __forceinline bool isLocked() { return sharedLazyTessellationCache.isLocked(threadState()); }
   static __forceinline size_t getState() { return threadState()->counter.load(); }
   static __forceinline void lockThreadLoop() { sharedLazyTessellationCache.lockThreadLoop(threadState()); }

   static __forceinline size_t getTCacheTime(const size_t globalTime) {
     return sharedLazyTessellationCache.getTime(globalTime);
   }

   /* per-thread lock */
   __forceinline void lockThreadLoop (ThreadWorkState *const t_state)
   {
     while(1)
     {
       size_t lock = SharedLazyTessellationCache::sharedLazyTessellationCache.lockThread(t_state,1);
       if (unlikely(lock >= THREAD_BLOCK_ATOMIC_ADD))
       {
         /* lock failed: wait until the sync phase is over */
         sharedLazyTessellationCache.unlockThread(t_state,-1);
         sharedLazyTessellationCache.waitForUsersLessEqual(t_state,0);
       }
       else
         break;
     }
   }
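
   /* Note on the counter protocol above: lockThread() returns the counter
      value observed before the increment, so a reader that sees a value of
      THREAD_BLOCK_ATOMIC_ADD or more concludes that a flush/sync phase is in
      progress, undoes its increment, and parks in waitForUsersLessEqual()
      until the counter drains. (The flush side, which presumably raises the
      counters by THREAD_BLOCK_ATOMIC_ADD, lives in the implementation file;
      that detail is an assumption, not shown in this header.) */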

   static __forceinline void* lookup(CacheEntry& entry, size_t globalTime)
   {
     const int64_t subdiv_patch_root_ref = entry.tag.get();
     CACHE_STATS(SharedTessellationCacheStats::cache_accesses++);

     if (likely(subdiv_patch_root_ref != 0))
     {
       const size_t subdiv_patch_root = (subdiv_patch_root_ref & REF_TAG_MASK) + (size_t)sharedLazyTessellationCache.getDataPtr();
       const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);

       if (likely( sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime) ))
       {
         CACHE_STATS(SharedTessellationCacheStats::cache_hits++);
         return (void*) subdiv_patch_root;
       }
     }
     CACHE_STATS(SharedTessellationCacheStats::cache_misses++);
     return nullptr;
   }

   template<typename Constructor>
     static __forceinline auto lookup (CacheEntry& entry, size_t globalTime, const Constructor constructor, const bool before=false) -> decltype(constructor())
   {
     ThreadWorkState *t_state = SharedLazyTessellationCache::threadState();

     while (true)
     {
       sharedLazyTessellationCache.lockThreadLoop(t_state);
       void* patch = SharedLazyTessellationCache::lookup(entry,globalTime);
       if (patch) return (decltype(constructor())) patch;

       if (entry.mutex.try_lock())
       {
         if (!validTag(entry.tag,globalTime))
         {
           auto timeBefore = sharedLazyTessellationCache.getTime(globalTime);
           auto ret = constructor(); // thread is locked here!
           assert(ret); /* the constructor should never return nullptr */
           auto timeAfter = sharedLazyTessellationCache.getTime(globalTime);
           auto time = before ? timeBefore : timeAfter;
           __memory_barrier();
           entry.tag = SharedLazyTessellationCache::Tag(ret,time);
           __memory_barrier();
           entry.mutex.unlock();
           return ret;
         }
         entry.mutex.unlock();
       }
       SharedLazyTessellationCache::sharedLazyTessellationCache.unlockThread(t_state);
     }
   }
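
   /* A minimal usage sketch (hypothetical caller, not part of this header;
      `entry`, `globalTime`, and `bytes` are assumed to be supplied by the
      caller): look up a patch and lazily build it on a miss. The constructor
      lambda runs while the calling thread holds its cache lock, and the
      returned pointer stays valid only as long as that lock is held:

        float* grid = SharedLazyTessellationCache::lookup(entry, globalTime,
          [&] () -> float* {
            float* g = (float*) SharedLazyTessellationCache::malloc(bytes);
            // ... fill g with tessellated grid data ...
            return g;
          });
        // ... use grid ...
        SharedLazyTessellationCache::unlock();                               */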

   __forceinline bool validCacheIndex(const size_t i, const size_t globalTime)
   {
#if FORCE_SIMPLE_FLUSH == 1
     return i == getTime(globalTime);
#else
     return i+(NUM_CACHE_SEGMENTS-1) >= getTime(globalTime);
#endif
   }
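
   /* Example of the validity window (default path, NUM_CACHE_SEGMENTS == 8):
      an entry stamped with index i stays valid while
      getTime(globalTime) <= i+7, i.e. for up to seven segment switches after
      it was written. With FORCE_SIMPLE_FLUSH enabled, an entry is valid only
      at exactly its own time, forcing a complete rebuild on every switch. */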

   static __forceinline bool validTime(const size_t oldtime, const size_t newTime)
   {
     return oldtime+(NUM_CACHE_SEGMENTS-1) >= newTime;
   }

   static __forceinline bool validTag(const Tag& tag, size_t globalTime)
   {
     const int64_t subdiv_patch_root_ref = tag.get();
     if (subdiv_patch_root_ref == 0) return false;
     const size_t subdiv_patch_cache_index = extractCommitIndex(subdiv_patch_root_ref);
     return sharedLazyTessellationCache.validCacheIndex(subdiv_patch_cache_index,globalTime);
   }

   void waitForUsersLessEqual(ThreadWorkState *const t_state, const unsigned int users);

   __forceinline size_t alloc(const size_t blocks)
   {
     if (unlikely(blocks >= switch_block_threshold))
       throw_RTCError(RTC_ERROR_INVALID_OPERATION,"allocation exceeds size of tessellation cache segment");

     assert(blocks < switch_block_threshold);
     size_t index = next_block.fetch_add(blocks);
     if (unlikely(index + blocks >= switch_block_threshold)) return (size_t)-1;
     return index;
   }

   static __forceinline void* malloc(const size_t bytes)
   {
     size_t block_index = -1;
     ThreadWorkState *const t_state = threadState();
     while (true)
     {
       block_index = sharedLazyTessellationCache.alloc((bytes+BLOCK_SIZE-1)/BLOCK_SIZE);
       if (block_index == (size_t)-1)
       {
         sharedLazyTessellationCache.unlockThread(t_state);
         sharedLazyTessellationCache.allocNextSegment();
         sharedLazyTessellationCache.lockThread(t_state);
         continue;
       }
       break;
     }
     return sharedLazyTessellationCache.getBlockPtr(block_index);
   }
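
   /* Example of the allocation path above (illustrative numbers): with
      BLOCK_SIZE == 64, a request for 100 bytes rounds up to
      (100+63)/64 == 2 blocks. When the current segment cannot satisfy the
      request, the thread drops its per-thread lock, switches to the next
      segment via allocNextSegment(), re-locks, and retries. */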

   __forceinline void *getBlockPtr(const size_t block_index)
   {
     assert(block_index < maxBlocks);
     assert(data);
     assert(block_index*16 <= size);
     return (void*)&data[block_index*16]; /* data is a float array, so 16 floats per block == BLOCK_SIZE (64) bytes */
   }

   __forceinline void*  getDataPtr()      { return data; }
   __forceinline size_t getNumUsedBytes() { return next_block * BLOCK_SIZE; }
   __forceinline size_t getMaxBlocks()    { return maxBlocks; }
   __forceinline size_t getSize()         { return size; }

   void allocNextSegment();
   void realloc(const size_t newSize);

   void reset();

   static SharedLazyTessellationCache sharedLazyTessellationCache;
 };
}