| // Copyright (c) 2008, Google Inc. |
| // All rights reserved. |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are |
| // met: |
| // |
| // * Redistributions of source code must retain the above copyright |
| // notice, this list of conditions and the following disclaimer. |
| // * Redistributions in binary form must reproduce the above |
| // copyright notice, this list of conditions and the following disclaimer |
| // in the documentation and/or other materials provided with the |
| // distribution. |
| // * Neither the name of Google Inc. nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| // --- |
| // Author: Sanjay Ghemawat <opensource@google.com> |
| |
| #ifndef TCMALLOC_THREAD_CACHE_H_ |
| #define TCMALLOC_THREAD_CACHE_H_ |
| |
| #include <config.h> |
| #ifdef HAVE_PTHREAD |
| #include <pthread.h> // for pthread_t, pthread_key_t |
| #endif |
| #include <stddef.h> // for size_t, NULL |
| #ifdef HAVE_STDINT_H |
| #include <stdint.h> // for uint32_t, uint64_t |
| #endif |
| #include <sys/types.h> // for ssize_t |
| #include "common.h" // for SizeMap, kMaxSize, etc |
| #include "free_list.h" // for FL_Pop, FL_PopRange, etc |
| #include "internal_logging.h" // for ASSERT, etc |
| #include "maybe_threads.h" |
| #include "page_heap_allocator.h" // for PageHeapAllocator |
| #include "sampler.h" // for Sampler |
| #include "static_vars.h" // for Static |
| |
| namespace tcmalloc { |
| |
// Even if we have support for thread-local storage in the compiler
// and linker, the OS may not support it.  We need to check that at
// runtime.  Right now, we have to keep a manual set of "bad" OSes.
#if defined(HAVE_TLS)
// Set once at startup by CheckIfKernelSupportsTLS(); read-only afterwards.
extern bool kernel_supports_tls;   // defined in thread_cache.cc
void CheckIfKernelSupportsTLS();
// Returns the cached result of the runtime TLS-support probe.
inline bool KernelSupportsTLS() {
  return kernel_supports_tls;
}
#endif    // HAVE_TLS
| |
| //------------------------------------------------------------------- |
| // Data kept per thread |
| //------------------------------------------------------------------- |
| |
class ThreadCache {
 public:
  // All ThreadCache objects are kept in a linked list (for stats collection)
  ThreadCache* next_;
  ThreadCache* prev_;

  void Init(pthread_t tid);
  void Cleanup();

  // Accessors (mostly just for printing stats)
  int freelist_length(size_t cl) const { return list_[cl].length(); }

  // Total byte size in cache
  size_t Size() const { return size_; }

  // Allocate an object of the given size and class. The size given
  // must be the same as the size of the class in the size map.
  void* Allocate(size_t size, size_t cl);
  void Deallocate(void* ptr, size_t size_class);

  // Releases objects back to the central cache when this cache grows
  // past max_size_ (see Deallocate()).
  void Scavenge();

  int GetSamplePeriod();

  // Record allocation of "k" bytes.  Return true iff allocation
  // should be sampled
  bool SampleAllocation(size_t k);

  // Record additional bytes allocated.
  void AddToByteAllocatedTotal(size_t k) { total_bytes_allocated_ += k; }

  // Return the total number of bytes allocated from this heap.  The value will
  // wrap when there is an overflow, and so only the differences between two
  // values should be relied on (and even then, modulo 2^32).
  uint32 GetTotalBytesAllocated() const;

  // On the current thread, return GetTotalBytesAllocated().
  static uint32 GetBytesAllocatedOnCurrentThread();

  static void InitModule();
  static void InitTSD();
  static ThreadCache* GetThreadHeap();
  static ThreadCache* GetCache();
  static ThreadCache* GetCacheIfPresent();
  static ThreadCache* CreateCacheIfNecessary();
  static void BecomeIdle();

  // Return the number of thread heaps in use.
  static inline int HeapsInUse();

  // Writes to total_bytes the total number of bytes used by all thread heaps.
  // class_count must be an array of size kNumClasses.  Writes the number of
  // items on the corresponding freelist.  class_count may be NULL.
  // The storage of both parameters must be zero initialized.
  // REQUIRES: Static::pageheap_lock is held.
  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);

  // Sets the total thread cache size to new_size, recomputing the
  // individual thread cache sizes as necessary.
  // REQUIRES: Static::pageheap_lock is held.
  static void set_overall_thread_cache_size(size_t new_size);
  static size_t overall_thread_cache_size() {
    return overall_thread_cache_size_;
  }

 private:
  // A singly-linked free list of objects of a single size class, together
  // with bookkeeping used to adaptively size the list.
  class FreeList {
   private:
    void*    list_;       // Linked list of nodes

#ifdef _LP64
    // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
    uint32_t length_;      // Current length.
    uint32_t lowater_;     // Low water mark for list length.
    uint32_t max_length_;  // Dynamic max list length based on usage.
    // Tracks the number of times a deallocation has caused
    // length_ > max_length_.  After the kMaxOverages'th time, max_length_
    // shrinks and length_overages_ is reset to zero.
    uint32_t length_overages_;
#else
    // If we aren't using 64-bit pointers then pack these into less space.
    uint16_t length_;
    uint16_t lowater_;
    uint16_t max_length_;
    uint16_t length_overages_;
#endif

   public:
    void Init() {
      list_ = NULL;
      length_ = 0;
      lowater_ = 0;
      max_length_ = 1;
      length_overages_ = 0;
    }

    // Return current length of list
    size_t length() const {
      return length_;
    }

    // Return the maximum length of the list.
    size_t max_length() const {
      return max_length_;
    }

    // Set the maximum length of the list.  If 'new_max' < length(), the
    // client is responsible for removing objects from the list.
    void set_max_length(size_t new_max) {
      max_length_ = new_max;
    }

    // Return the number of times that length() has gone over max_length().
    size_t length_overages() const {
      return length_overages_;
    }

    void set_length_overages(size_t new_count) {
      length_overages_ = new_count;
    }

    // Is list empty?
    bool empty() const {
      return list_ == NULL;
    }

    // Low-water mark management
    int lowwatermark() const { return lowater_; }
    void clear_lowwatermark() { lowater_ = length_; }

    void Push(void* ptr) {
      FL_Push(&list_, ptr);
      length_++;
    }

    void* Pop() {
      ASSERT(list_ != NULL);
      length_--;
      // Track the smallest length seen since the last clear_lowwatermark().
      if (length_ < lowater_) lowater_ = length_;
      return FL_Pop(&list_);
    }

    // Peek at the head of the list without removing it (NULL if empty).
    void* Next() {
      if (list_ == NULL) return NULL;
      return FL_Next(list_);
    }

    // Prepend the N-element chain [start, end] to the list.
    void PushRange(int N, void *start, void *end) {
      FL_PushRange(&list_, start, end);
      length_ += N;
    }

    // Remove the first N elements, returning the chain in *start/*end.
    // REQUIRES: length() >= N.
    void PopRange(int N, void **start, void **end) {
      FL_PopRange(&list_, N, start, end);
      ASSERT(length_ >= N);
      length_ -= N;
      if (length_ < lowater_) lowater_ = length_;
    }
  };

  // Gets and returns an object from the central cache, and, if possible,
  // also adds some objects of that size class to this thread cache.
  void* FetchFromCentralCache(size_t cl, size_t byte_size);

  // Releases some number of items from src.  Adjusts the list's max_length
  // to eventually converge on num_objects_to_move(cl).
  void ListTooLong(FreeList* src, size_t cl);

  // Releases N items from this thread cache.
  void ReleaseToCentralCache(FreeList* src, size_t cl, int N);

  // Increase max_size_ by reducing unclaimed_cache_space_ or by
  // reducing the max_size_ of some other thread.  In both cases,
  // the delta is kStealAmount.
  void IncreaseCacheLimit();
  // Same as above but requires Static::pageheap_lock() is held.
  void IncreaseCacheLimitLocked();

  // If TLS is available, we also store a copy of the per-thread object
  // in a __thread variable since __thread variables are faster to read
  // than pthread_getspecific().  We still need pthread_setspecific()
  // because __thread variables provide no way to run cleanup code when
  // a thread is destroyed.
  // We also give a hint to the compiler to use the "initial exec" TLS
  // model.  This is faster than the default TLS model, at the cost that
  // you cannot dlopen this library.  (To see the difference, look at
  // the CPU use of __tls_get_addr with and without this attribute.)
  // Since we don't really use dlopen in google code -- and using dlopen
  // on a malloc replacement is asking for trouble in any case -- that's
  // a good tradeoff for us.
#ifdef HAVE_TLS
  static __thread ThreadCache* threadlocal_heap_
// This code links against pyautolib.so, which causes dlopen() on that shared
// object to fail when -fprofile-generate is used with it.  Ideally
// pyautolib.so should not link against this code.  There is a bug filed for
// that:
// http://code.google.com/p/chromium/issues/detail?id=124489
// For now the workaround is to pass in -DPGO_GENERATE when building Chrome
// for instrumentation (-fprofile-generate).
// For all non-instrumentation builds, this define will not be set and the
// performance benefit of "initial-exec" will be achieved.
//
// gcc has a problem with this tls model on arm.
// See https://bugs.chromium.org/p/chromium/issues/detail?id=650137
#if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE) && \
    !(!defined(__clang__) && defined(OS_CHROMEOS) && defined(__arm__))
  __attribute__ ((tls_model ("initial-exec")))
# endif
  ;
#endif

  // Thread-specific key.  Initialization here is somewhat tricky
  // because some Linux startup code invokes malloc() before it
  // is in a good enough state to handle pthread_key_create().
  // Therefore, we use TSD keys only after tsd_inited is set to true.
  // Until then, we use a slow path to get the heap object.
  static bool tsd_inited_;
  static pthread_key_t heap_key_;

  // Linked list of heap objects.  Protected by Static::pageheap_lock.
  static ThreadCache* thread_heaps_;
  static int thread_heap_count_;

  // A pointer to one of the objects in thread_heaps_.  Represents
  // the next ThreadCache from which a thread over its max_size_ should
  // steal memory limit.  Round-robin through all of the objects in
  // thread_heaps_.  Protected by Static::pageheap_lock.
  static ThreadCache* next_memory_steal_;

  // Overall thread cache size.  Protected by Static::pageheap_lock.
  static size_t overall_thread_cache_size_;

  // Global per-thread cache size.  Writes are protected by
  // Static::pageheap_lock.  Reads are done without any locking, which should be
  // fine as long as size_t can be written atomically and we don't place
  // invariants between this variable and other pieces of state.
  static volatile size_t per_thread_cache_size_;

  // Represents overall_thread_cache_size_ minus the sum of max_size_
  // across all ThreadCaches.  Protected by Static::pageheap_lock.
  static ssize_t unclaimed_cache_space_;

  // This class is laid out with the most frequently used fields
  // first so that hot elements are placed on the same cache line.

  size_t        size_;                  // Combined size of data
  size_t        max_size_;              // size_ > max_size_ --> Scavenge()

  // The following is the tally of bytes allocated on a thread as a response to
  // any flavor of malloc() call.  The aggregated amount includes all padding to
  // the smallest class that can hold the request, or to the nearest whole page
  // when a large allocation is made without using a class.  This sum is
  // currently used for Chromium profiling, where tallies are kept of the amount
  // of memory allocated during the running of each task on each thread.
  uint32        total_bytes_allocated_;  // Total, modulo 2^32.

  // We sample allocations, biased by the size of the allocation
  Sampler       sampler_;               // A sampler

  FreeList      list_[kNumClasses];     // Array indexed by size-class

  pthread_t     tid_;                   // Which thread owns it
  bool          in_setspecific_;        // In call to pthread_setspecific?

  // Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
  static ThreadCache* NewHeap(pthread_t tid);

  // Use only as pthread thread-specific destructor function.
  static void DestroyThreadCache(void* ptr);

  static void DeleteCache(ThreadCache* heap);
  static void RecomputePerThreadCacheSize();

  // Ensure that this class is cacheline-aligned. This is critical for
  // performance, as false sharing would negate many of the benefits
  // of a per-thread cache.
} CACHELINE_ALIGNED;
| |
// Allocator for thread heaps.  Defined in thread_cache.cc.
// This is logically part of the ThreadCache class, but MSVC, at
// least, does not like using ThreadCache as a template argument
// before the class is fully defined.  So we put it outside the class.
extern PageHeapAllocator<ThreadCache> threadcache_allocator;
| |
| inline int ThreadCache::HeapsInUse() { |
| return threadcache_allocator.inuse(); |
| } |
| |
| inline bool ThreadCache::SampleAllocation(size_t k) { |
| return sampler_.SampleAllocation(k); |
| } |
| |
| inline uint32 ThreadCache::GetTotalBytesAllocated() const { |
| return total_bytes_allocated_; |
| } |
| |
| inline void* ThreadCache::Allocate(size_t size, size_t cl) { |
| ASSERT(size <= kMaxSize); |
| ASSERT(size == Static::sizemap()->ByteSizeForClass(cl)); |
| |
| FreeList* list = &list_[cl]; |
| if (list->empty()) { |
| return FetchFromCentralCache(cl, size); |
| } |
| size_ -= size; |
| return list->Pop(); |
| } |
| |
| inline void ThreadCache::Deallocate(void* ptr, size_t cl) { |
| FreeList* list = &list_[cl]; |
| size_ += Static::sizemap()->ByteSizeForClass(cl); |
| ssize_t size_headroom = max_size_ - size_ - 1; |
| |
| // This catches back-to-back frees of allocs in the same size |
| // class. A more comprehensive (and expensive) test would be to walk |
| // the entire freelist. But this might be enough to find some bugs. |
| ASSERT(ptr != list->Next()); |
| |
| list->Push(ptr); |
| ssize_t list_headroom = |
| static_cast<ssize_t>(list->max_length()) - list->length(); |
| |
| // There are two relatively uncommon things that require further work. |
| // In the common case we're done, and in that case we need a single branch |
| // because of the bitwise-or trick that follows. |
| if ((list_headroom | size_headroom) < 0) { |
| if (list_headroom < 0) { |
| ListTooLong(list, cl); |
| } |
| if (size_ >= max_size_) Scavenge(); |
| } |
| } |
| |
| inline ThreadCache* ThreadCache::GetThreadHeap() { |
| #ifdef HAVE_TLS |
| // __thread is faster, but only when the kernel supports it |
| if (KernelSupportsTLS()) |
| return threadlocal_heap_; |
| #endif |
| return reinterpret_cast<ThreadCache *>( |
| perftools_pthread_getspecific(heap_key_)); |
| } |
| |
| inline ThreadCache* ThreadCache::GetCache() { |
| ThreadCache* ptr = NULL; |
| if (!tsd_inited_) { |
| InitModule(); |
| } else { |
| ptr = GetThreadHeap(); |
| } |
| if (ptr == NULL) ptr = CreateCacheIfNecessary(); |
| return ptr; |
| } |
| |
| // In deletion paths, we do not try to create a thread-cache. This is |
| // because we may be in the thread destruction code and may have |
| // already cleaned up the cache for this thread. |
| inline ThreadCache* ThreadCache::GetCacheIfPresent() { |
| if (!tsd_inited_) return NULL; |
| return GetThreadHeap(); |
| } |
| |
| } // namespace tcmalloc |
| |
| #endif // TCMALLOC_THREAD_CACHE_H_ |