third_party/tcmalloc/chromium/src/thread_cache.h - src.git - Git at Google

 // Copyright (c) 2008, Google Inc.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
 //
 //     * Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
 //     * Redistributions in binary form must reproduce the above
 // copyright notice, this list of conditions and the following disclaimer
 // in the documentation and/or other materials provided with the
 // distribution.
 //     * Neither the name of Google Inc. nor the names of its
 // contributors may be used to endorse or promote products derived from
 // this software without specific prior written permission.
 //
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 // ---
 // Author: Sanjay Ghemawat <opensource@google.com>

 #ifndef TCMALLOC_THREAD_CACHE_H_
 #define TCMALLOC_THREAD_CACHE_H_

 #include <config.h>
 #ifdef HAVE_PTHREAD
 #include <pthread.h>                    // for pthread_t, pthread_key_t
 #endif
 #include <stddef.h>                     // for size_t, NULL
 #ifdef HAVE_STDINT_H
 #include <stdint.h>                     // for uint32_t, uint64_t
 #endif
 #include <sys/types.h>                  // for ssize_t
 #include "common.h"            // for SizeMap, kMaxSize, etc
 #include "free_list.h"  // for FL_Pop, FL_PopRange, etc
 #include "internal_logging.h"  // for ASSERT, etc
 #include "maybe_threads.h"
 #include "page_heap_allocator.h"  // for PageHeapAllocator
 #include "sampler.h"           // for Sampler
 #include "static_vars.h"       // for Static

 namespace tcmalloc {

 // Even if we have support for thread-local storage in the compiler
 // and linker, the OS may not support it.  We need to check that at
 // runtime.  Right now, we have to keep a manual set of "bad" OSes.
 #if defined(HAVE_TLS)
 extern bool kernel_supports_tls;   // defined in thread_cache.cc
 void CheckIfKernelSupportsTLS();
 inline bool KernelSupportsTLS() {
   return kernel_supports_tls;
 }
 #endif    // HAVE_TLS

 //-------------------------------------------------------------------
 // Data kept per thread
 //-------------------------------------------------------------------

 class ThreadCache {
  public:
   // All ThreadCache objects are kept in a linked list (for stats collection)
   ThreadCache* next_;
   ThreadCache* prev_;

   void Init(pthread_t tid);
   void Cleanup();

   // Accessors (mostly just for printing stats)
   int freelist_length(size_t cl) const { return list_[cl].length(); }

   // Total byte size in cache
   size_t Size() const { return size_; }

   // Allocate an object of the given size and class. The size given
   // must be the same as the size of the class in the size map.
   void* Allocate(size_t size, size_t cl);
   void Deallocate(void* ptr, size_t size_class);

   void Scavenge();

   int GetSamplePeriod();

   // Record allocation of "k" bytes.  Return true iff allocation
   // should be sampled
   bool SampleAllocation(size_t k);

   // Record additional bytes allocated.
   void AddToByteAllocatedTotal(size_t k) { total_bytes_allocated_ += k; }

   // Return the total number of bytes allocated from this heap.  The value will
   // wrap when there is an overflow, and so only the differences between two
   // values should be relied on (and even then, modulo 2^32).
   uint32 GetTotalBytesAllocated() const;

   // On the current thread, return GetTotalBytesAllocated().
   static uint32 GetBytesAllocatedOnCurrentThread();

   static void         InitModule();
   static void         InitTSD();
   static ThreadCache* GetThreadHeap();
   static ThreadCache* GetCache();
   static ThreadCache* GetCacheIfPresent();
   static ThreadCache* CreateCacheIfNecessary();
   static void         BecomeIdle();

   // Return the number of thread heaps in use.
   static inline int HeapsInUse();

   // Writes to total_bytes the total number of bytes used by all thread heaps.
   // class_count must be an array of size kNumClasses.  Writes the number of
   // items on the corresponding freelist.  class_count may be NULL.
   // The storage of both parameters must be zero intialized.
   // REQUIRES: Static::pageheap_lock is held.
   static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);

   // Sets the total thread cache size to new_size, recomputing the
   // individual thread cache sizes as necessary.
   // REQUIRES: Static::pageheap lock is held.
   static void set_overall_thread_cache_size(size_t new_size);
   static size_t overall_thread_cache_size() {
     return overall_thread_cache_size_;
   }

  private:
   class FreeList {
    private:
     void*    list_;       // Linked list of nodes

 #ifdef _LP64
     // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
     uint32_t length_;      // Current length.
     uint32_t lowater_;     // Low water mark for list length.
     uint32_t max_length_;  // Dynamic max list length based on usage.
     // Tracks the number of times a deallocation has caused
     // length_ > max_length_.  After the kMaxOverages'th time, max_length_
     // shrinks and length_overages_ is reset to zero.
     uint32_t length_overages_;
 #else
     // If we aren't using 64-bit pointers then pack these into less space.
     uint16_t length_;
     uint16_t lowater_;
     uint16_t max_length_;
     uint16_t length_overages_;
 #endif

    public:
     void Init() {
       list_ = NULL;
       length_ = 0;
       lowater_ = 0;
       max_length_ = 1;
       length_overages_ = 0;
     }

     // Return current length of list
     size_t length() const {
       return length_;
     }

     // Return the maximum length of the list.
     size_t max_length() const {
       return max_length_;
     }

     // Set the maximum length of the list.  If 'new_max' > length(), the
     // client is responsible for removing objects from the list.
     void set_max_length(size_t new_max) {
       max_length_ = new_max;
     }

     // Return the number of times that length() has gone over max_length().
     size_t length_overages() const {
       return length_overages_;
     }

     void set_length_overages(size_t new_count) {
       length_overages_ = new_count;
     }

     // Is list empty?
     bool empty() const {
       return list_ == NULL;
     }

     // Low-water mark management
     int lowwatermark() const { return lowater_; }
     void clear_lowwatermark() { lowater_ = length_; }

     void Push(void* ptr) {
       FL_Push(&list_, ptr);
       length_++;
     }

     void* Pop() {
       ASSERT(list_ != NULL);
       length_--;
       if (length_ < lowater_) lowater_ = length_;
       return FL_Pop(&list_);
     }

     void* Next() {
       if (list_ == NULL) return NULL;
       return FL_Next(list_);
     }

     void PushRange(int N, void *start, void *end) {
       FL_PushRange(&list_, start, end);
       length_ += N;
     }

     void PopRange(int N, void **start, void **end) {
       FL_PopRange(&list_, N, start, end);
       ASSERT(length_ >= N);
       length_ -= N;
       if (length_ < lowater_) lowater_ = length_;
     }
   };

   // Gets and returns an object from the central cache, and, if possible,
   // also adds some objects of that size class to this thread cache.
   void* FetchFromCentralCache(size_t cl, size_t byte_size);

   // Releases some number of items from src.  Adjusts the list's max_length
   // to eventually converge on num_objects_to_move(cl).
   void ListTooLong(FreeList* src, size_t cl);

   // Releases N items from this thread cache.
   void ReleaseToCentralCache(FreeList* src, size_t cl, int N);

   // Increase max_size_ by reducing unclaimed_cache_space_ or by
   // reducing the max_size_ of some other thread.  In both cases,
   // the delta is kStealAmount.
   void IncreaseCacheLimit();
   // Same as above but requires Static::pageheap_lock() is held.
   void IncreaseCacheLimitLocked();

   // If TLS is available, we also store a copy of the per-thread object
   // in a __thread variable since __thread variables are faster to read
   // than pthread_getspecific().  We still need pthread_setspecific()
   // because __thread variables provide no way to run cleanup code when
   // a thread is destroyed.
   // We also give a hint to the compiler to use the "initial exec" TLS
   // model.  This is faster than the default TLS model, at the cost that
   // you cannot dlopen this library.  (To see the difference, look at
   // the CPU use of __tls_get_addr with and without this attribute.)
   // Since we don't really use dlopen in google code -- and using dlopen
   // on a malloc replacement is asking for trouble in any case -- that's
   // a good tradeoff for us.
 #ifdef HAVE_TLS
   static __thread ThreadCache* threadlocal_heap_
   // This code links against pyautolib.so, which causes dlopen() on that shared
   // object to fail when -fprofile-generate is used with it. Ideally
   // pyautolib.so should not link against this code. There is a bug filed for
   // that:
   // http://code.google.com/p/chromium/issues/detail?id=124489
   // For now the workaround is to pass in -DPGO_GENERATE when building Chrome
   // for instrumentation (-fprofile-generate).
   // For all non-instrumentation builds, this define will not be set and the
   // performance benefit of "intial-exec" will be achieved.
   //
   // gcc has a problem with this tls model on arm.
   // See https://bugs.chromium.org/p/chromium/issues/detail?id=650137
 #if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE) && \
     !(!defined(__clang__) && defined(OS_CHROMEOS) && defined(__arm__))
    __attribute__ ((tls_model ("initial-exec")))
 # endif
    ;
 #endif

   // Thread-specific key.  Initialization here is somewhat tricky
   // because some Linux startup code invokes malloc() before it
   // is in a good enough state to handle pthread_keycreate().
   // Therefore, we use TSD keys only after tsd_inited is set to true.
   // Until then, we use a slow path to get the heap object.
   static bool tsd_inited_;
   static pthread_key_t heap_key_;

   // Linked list of heap objects.  Protected by Static::pageheap_lock.
   static ThreadCache* thread_heaps_;
   static int thread_heap_count_;

   // A pointer to one of the objects in thread_heaps_.  Represents
   // the next ThreadCache from which a thread over its max_size_ should
   // steal memory limit.  Round-robin through all of the objects in
   // thread_heaps_.  Protected by Static::pageheap_lock.
   static ThreadCache* next_memory_steal_;

   // Overall thread cache size.  Protected by Static::pageheap_lock.
   static size_t overall_thread_cache_size_;

   // Global per-thread cache size.  Writes are protected by
   // Static::pageheap_lock.  Reads are done without any locking, which should be
   // fine as long as size_t can be written atomically and we don't place
   // invariants between this variable and other pieces of state.
   static volatile size_t per_thread_cache_size_;

   // Represents overall_thread_cache_size_ minus the sum of max_size_
   // across all ThreadCaches.  Protected by Static::pageheap_lock.
   static ssize_t unclaimed_cache_space_;

   // This class is laid out with the most frequently used fields
   // first so that hot elements are placed on the same cache line.

   size_t        size_;                  // Combined size of data
   size_t        max_size_;              // size_ > max_size_ --> Scavenge()

   // The following is the tally of bytes allocated on a thread as a response to
   // any flavor of malloc() call.  The aggegated amount includes all padding to
   // the smallest class that can hold the request, or to the nearest whole page
   // when a large allocation is made without using a class.  This sum is
   // currently used for Chromium profiling, where tallies are kept of the amount
   // of memory allocated during the running of each task on each thread.
   uint32        total_bytes_allocated_;  // Total, modulo 2^32.

   // We sample allocations, biased by the size of the allocation
   Sampler       sampler_;               // A sampler

   FreeList      list_[kNumClasses];     // Array indexed by size-class

   pthread_t     tid_;                   // Which thread owns it
   bool          in_setspecific_;        // In call to pthread_setspecific?

   // Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
   static ThreadCache* NewHeap(pthread_t tid);

   // Use only as pthread thread-specific destructor function.
   static void DestroyThreadCache(void* ptr);

   static void DeleteCache(ThreadCache* heap);
   static void RecomputePerThreadCacheSize();

   // Ensure that this class is cacheline-aligned. This is critical for
   // performance, as false sharing would negate many of the benefits
   // of a per-thread cache.
 } CACHELINE_ALIGNED;

 // Allocator for thread heaps
 // This is logically part of the ThreadCache class, but MSVC, at
 // least, does not like using ThreadCache as a template argument
 // before the class is fully defined.  So we put it outside the class.
 extern PageHeapAllocator<ThreadCache> threadcache_allocator;

 inline int ThreadCache::HeapsInUse() {
   return threadcache_allocator.inuse();
 }

 inline bool ThreadCache::SampleAllocation(size_t k) {
   return sampler_.SampleAllocation(k);
 }

 inline uint32 ThreadCache::GetTotalBytesAllocated() const {
   return total_bytes_allocated_;
 }

 inline void* ThreadCache::Allocate(size_t size, size_t cl) {
   ASSERT(size <= kMaxSize);
   ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));

   FreeList* list = &list_[cl];
   if (list->empty()) {
     return FetchFromCentralCache(cl, size);
   }
   size_ -= size;
   return list->Pop();
 }

 inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
   FreeList* list = &list_[cl];
   size_ += Static::sizemap()->ByteSizeForClass(cl);
   ssize_t size_headroom = max_size_ - size_ - 1;

   // This catches back-to-back frees of allocs in the same size
   // class. A more comprehensive (and expensive) test would be to walk
   // the entire freelist. But this might be enough to find some bugs.
   ASSERT(ptr != list->Next());

   list->Push(ptr);
   ssize_t list_headroom =
       static_cast<ssize_t>(list->max_length()) - list->length();

   // There are two relatively uncommon things that require further work.
   // In the common case we're done, and in that case we need a single branch
   // because of the bitwise-or trick that follows.
   if ((list_headroom | size_headroom) < 0) {
     if (list_headroom < 0) {
       ListTooLong(list, cl);
     }
     if (size_ >= max_size_) Scavenge();
   }
 }

 inline ThreadCache* ThreadCache::GetThreadHeap() {
 #ifdef HAVE_TLS
   // __thread is faster, but only when the kernel supports it
   if (KernelSupportsTLS())
     return threadlocal_heap_;
 #endif
   return reinterpret_cast<ThreadCache *>(
       perftools_pthread_getspecific(heap_key_));
 }

 inline ThreadCache* ThreadCache::GetCache() {
   ThreadCache* ptr = NULL;
   if (!tsd_inited_) {
     InitModule();
   } else {
     ptr = GetThreadHeap();
   }
   if (ptr == NULL) ptr = CreateCacheIfNecessary();
   return ptr;
 }

 // In deletion paths, we do not try to create a thread-cache.  This is
 // because we may be in the thread destruction code and may have
 // already cleaned up the cache for this thread.
 inline ThreadCache* ThreadCache::GetCacheIfPresent() {
   if (!tsd_inited_) return NULL;
   return GetThreadHeap();
 }

 }  // namespace tcmalloc

 #endif  // TCMALLOC_THREAD_CACHE_H_
	// Copyright (c) 2008, Google Inc.
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are
	// met:
	//
	// * Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	// * Redistributions in binary form must reproduce the above
	// copyright notice, this list of conditions and the following disclaimer
	// in the documentation and/or other materials provided with the
	// distribution.
	// * Neither the name of Google Inc. nor the names of its
	// contributors may be used to endorse or promote products derived from
	// this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	// ---
	// Author: Sanjay Ghemawat <opensource@google.com>

	#ifndef TCMALLOC_THREAD_CACHE_H_
	#define TCMALLOC_THREAD_CACHE_H_

	#include <config.h>
	#ifdef HAVE_PTHREAD
	#include <pthread.h> // for pthread_t, pthread_key_t
	#endif
	#include <stddef.h> // for size_t, NULL
	#ifdef HAVE_STDINT_H
	#include <stdint.h> // for uint32_t, uint64_t
	#endif
	#include <sys/types.h> // for ssize_t
	#include "common.h" // for SizeMap, kMaxSize, etc
	#include "free_list.h" // for FL_Pop, FL_PopRange, etc
	#include "internal_logging.h" // for ASSERT, etc
	#include "maybe_threads.h"
	#include "page_heap_allocator.h" // for PageHeapAllocator
	#include "sampler.h" // for Sampler
	#include "static_vars.h" // for Static

	namespace tcmalloc {

	// Even if we have support for thread-local storage in the compiler
	// and linker, the OS may not support it. We need to check that at
	// runtime. Right now, we have to keep a manual set of "bad" OSes.
	#if defined(HAVE_TLS)
	extern bool kernel_supports_tls; // defined in thread_cache.cc
	void CheckIfKernelSupportsTLS();
	inline bool KernelSupportsTLS() {
	return kernel_supports_tls;
	}
	#endif // HAVE_TLS

	//-------------------------------------------------------------------
	// Data kept per thread
	//-------------------------------------------------------------------

	class ThreadCache {
	public:
	// All ThreadCache objects are kept in a linked list (for stats collection)
	ThreadCache* next_;
	ThreadCache* prev_;

	void Init(pthread_t tid);
	void Cleanup();

	// Accessors (mostly just for printing stats)
	int freelist_length(size_t cl) const { return list_[cl].length(); }

	// Total byte size in cache
	size_t Size() const { return size_; }

	// Allocate an object of the given size and class. The size given
	// must be the same as the size of the class in the size map.
	void* Allocate(size_t size, size_t cl);
	void Deallocate(void* ptr, size_t size_class);

	void Scavenge();

	int GetSamplePeriod();

	// Record allocation of "k" bytes. Return true iff allocation
	// should be sampled
	bool SampleAllocation(size_t k);

	// Record additional bytes allocated.
	void AddToByteAllocatedTotal(size_t k) { total_bytes_allocated_ += k; }

	// Return the total number of bytes allocated from this heap. The value will
	// wrap when there is an overflow, and so only the differences between two
	// values should be relied on (and even then, modulo 2^32).
	uint32 GetTotalBytesAllocated() const;

	// On the current thread, return GetTotalBytesAllocated().
	static uint32 GetBytesAllocatedOnCurrentThread();

	static void InitModule();
	static void InitTSD();
	static ThreadCache* GetThreadHeap();
	static ThreadCache* GetCache();
	static ThreadCache* GetCacheIfPresent();
	static ThreadCache* CreateCacheIfNecessary();
	static void BecomeIdle();

	// Return the number of thread heaps in use.
	static inline int HeapsInUse();

	// Writes to total_bytes the total number of bytes used by all thread heaps.
	// class_count must be an array of size kNumClasses. Writes the number of
	// items on the corresponding freelist. class_count may be NULL.
	// The storage of both parameters must be zero intialized.
	// REQUIRES: Static::pageheap_lock is held.
	static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);

	// Sets the total thread cache size to new_size, recomputing the
	// individual thread cache sizes as necessary.
	// REQUIRES: Static::pageheap lock is held.
	static void set_overall_thread_cache_size(size_t new_size);
	static size_t overall_thread_cache_size() {
	return overall_thread_cache_size_;
	}

	private:
	class FreeList {
	private:
	void* list_; // Linked list of nodes

	#ifdef _LP64
	// On 64-bit hardware, manipulating 16-bit values may be slightly slow.
	uint32_t length_; // Current length.
	uint32_t lowater_; // Low water mark for list length.
	uint32_t max_length_; // Dynamic max list length based on usage.
	// Tracks the number of times a deallocation has caused
	// length_ > max_length_. After the kMaxOverages'th time, max_length_
	// shrinks and length_overages_ is reset to zero.
	uint32_t length_overages_;
	#else
	// If we aren't using 64-bit pointers then pack these into less space.
	uint16_t length_;
	uint16_t lowater_;
	uint16_t max_length_;
	uint16_t length_overages_;
	#endif

	public:
	void Init() {
	list_ = NULL;
	length_ = 0;
	lowater_ = 0;
	max_length_ = 1;
	length_overages_ = 0;
	}

	// Return current length of list
	size_t length() const {
	return length_;
	}

	// Return the maximum length of the list.
	size_t max_length() const {
	return max_length_;
	}

	// Set the maximum length of the list. If 'new_max' > length(), the
	// client is responsible for removing objects from the list.
	void set_max_length(size_t new_max) {
	max_length_ = new_max;
	}

	// Return the number of times that length() has gone over max_length().
	size_t length_overages() const {
	return length_overages_;
	}

	void set_length_overages(size_t new_count) {
	length_overages_ = new_count;
	}

	// Is list empty?
	bool empty() const {
	return list_ == NULL;
	}

	// Low-water mark management
	int lowwatermark() const { return lowater_; }
	void clear_lowwatermark() { lowater_ = length_; }

	void Push(void* ptr) {
	FL_Push(&list_, ptr);
	length_++;
	}

	void* Pop() {
	ASSERT(list_ != NULL);
	length_--;
	if (length_ < lowater_) lowater_ = length_;
	return FL_Pop(&list_);
	}

	void* Next() {
	if (list_ == NULL) return NULL;
	return FL_Next(list_);
	}

	void PushRange(int N, void start, void end) {
	FL_PushRange(&list_, start, end);
	length_ += N;
	}

	void PopRange(int N, void start, void end) {
	FL_PopRange(&list_, N, start, end);
	ASSERT(length_ >= N);
	length_ -= N;
	if (length_ < lowater_) lowater_ = length_;
	}
	};

	// Gets and returns an object from the central cache, and, if possible,
	// also adds some objects of that size class to this thread cache.
	void* FetchFromCentralCache(size_t cl, size_t byte_size);

	// Releases some number of items from src. Adjusts the list's max_length
	// to eventually converge on num_objects_to_move(cl).
	void ListTooLong(FreeList* src, size_t cl);

	// Releases N items from this thread cache.
	void ReleaseToCentralCache(FreeList* src, size_t cl, int N);

	// Increase max_size_ by reducing unclaimed_cache_space_ or by
	// reducing the max_size_ of some other thread. In both cases,
	// the delta is kStealAmount.
	void IncreaseCacheLimit();
	// Same as above but requires Static::pageheap_lock() is held.
	void IncreaseCacheLimitLocked();

	// If TLS is available, we also store a copy of the per-thread object
	// in a __thread variable since __thread variables are faster to read
	// than pthread_getspecific(). We still need pthread_setspecific()
	// because __thread variables provide no way to run cleanup code when
	// a thread is destroyed.
	// We also give a hint to the compiler to use the "initial exec" TLS
	// model. This is faster than the default TLS model, at the cost that
	// you cannot dlopen this library. (To see the difference, look at
	// the CPU use of __tls_get_addr with and without this attribute.)
	// Since we don't really use dlopen in google code -- and using dlopen
	// on a malloc replacement is asking for trouble in any case -- that's
	// a good tradeoff for us.
	#ifdef HAVE_TLS
	static __thread ThreadCache* threadlocal_heap_
	// This code links against pyautolib.so, which causes dlopen() on that shared
	// object to fail when -fprofile-generate is used with it. Ideally
	// pyautolib.so should not link against this code. There is a bug filed for
	// that:
	// http://code.google.com/p/chromium/issues/detail?id=124489
	// For now the workaround is to pass in -DPGO_GENERATE when building Chrome
	// for instrumentation (-fprofile-generate).
	// For all non-instrumentation builds, this define will not be set and the
	// performance benefit of "intial-exec" will be achieved.
	//
	// gcc has a problem with this tls model on arm.
	// See https://bugs.chromium.org/p/chromium/issues/detail?id=650137
	#if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE) && \
	!(!defined(__clang__) && defined(OS_CHROMEOS) && defined(__arm__))
	__attribute__ ((tls_model ("initial-exec")))
	# endif
	;
	#endif

	// Thread-specific key. Initialization here is somewhat tricky
	// because some Linux startup code invokes malloc() before it
	// is in a good enough state to handle pthread_keycreate().
	// Therefore, we use TSD keys only after tsd_inited is set to true.
	// Until then, we use a slow path to get the heap object.
	static bool tsd_inited_;
	static pthread_key_t heap_key_;

	// Linked list of heap objects. Protected by Static::pageheap_lock.
	static ThreadCache* thread_heaps_;
	static int thread_heap_count_;

	// A pointer to one of the objects in thread_heaps_. Represents
	// the next ThreadCache from which a thread over its max_size_ should
	// steal memory limit. Round-robin through all of the objects in
	// thread_heaps_. Protected by Static::pageheap_lock.
	static ThreadCache* next_memory_steal_;

	// Overall thread cache size. Protected by Static::pageheap_lock.
	static size_t overall_thread_cache_size_;

	// Global per-thread cache size. Writes are protected by
	// Static::pageheap_lock. Reads are done without any locking, which should be
	// fine as long as size_t can be written atomically and we don't place
	// invariants between this variable and other pieces of state.
	static volatile size_t per_thread_cache_size_;

	// Represents overall_thread_cache_size_ minus the sum of max_size_
	// across all ThreadCaches. Protected by Static::pageheap_lock.
	static ssize_t unclaimed_cache_space_;

	// This class is laid out with the most frequently used fields
	// first so that hot elements are placed on the same cache line.

	size_t size_; // Combined size of data
	size_t max_size_; // size_ > max_size_ --> Scavenge()

	// The following is the tally of bytes allocated on a thread as a response to
	// any flavor of malloc() call. The aggegated amount includes all padding to
	// the smallest class that can hold the request, or to the nearest whole page
	// when a large allocation is made without using a class. This sum is
	// currently used for Chromium profiling, where tallies are kept of the amount
	// of memory allocated during the running of each task on each thread.
	uint32 total_bytes_allocated_; // Total, modulo 2^32.

	// We sample allocations, biased by the size of the allocation
	Sampler sampler_; // A sampler

	FreeList list_[kNumClasses]; // Array indexed by size-class

	pthread_t tid_; // Which thread owns it
	bool in_setspecific_; // In call to pthread_setspecific?

	// Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
	static ThreadCache* NewHeap(pthread_t tid);

	// Use only as pthread thread-specific destructor function.
	static void DestroyThreadCache(void* ptr);

	static void DeleteCache(ThreadCache* heap);
	static void RecomputePerThreadCacheSize();

	// Ensure that this class is cacheline-aligned. This is critical for
	// performance, as false sharing would negate many of the benefits
	// of a per-thread cache.
	} CACHELINE_ALIGNED;

	// Allocator for thread heaps
	// This is logically part of the ThreadCache class, but MSVC, at
	// least, does not like using ThreadCache as a template argument
	// before the class is fully defined. So we put it outside the class.
	extern PageHeapAllocator<ThreadCache> threadcache_allocator;

	inline int ThreadCache::HeapsInUse() {
	return threadcache_allocator.inuse();
	}

	inline bool ThreadCache::SampleAllocation(size_t k) {
	return sampler_.SampleAllocation(k);
	}

	inline uint32 ThreadCache::GetTotalBytesAllocated() const {
	return total_bytes_allocated_;
	}

	inline void* ThreadCache::Allocate(size_t size, size_t cl) {
	ASSERT(size <= kMaxSize);
	ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));

	FreeList* list = &list_[cl];
	if (list->empty()) {
	return FetchFromCentralCache(cl, size);
	}
	size_ -= size;
	return list->Pop();
	}

	inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
	FreeList* list = &list_[cl];
	size_ += Static::sizemap()->ByteSizeForClass(cl);
	ssize_t size_headroom = max_size_ - size_ - 1;

	// This catches back-to-back frees of allocs in the same size
	// class. A more comprehensive (and expensive) test would be to walk
	// the entire freelist. But this might be enough to find some bugs.
	ASSERT(ptr != list->Next());

	list->Push(ptr);
	ssize_t list_headroom =
	static_cast<ssize_t>(list->max_length()) - list->length();

	// There are two relatively uncommon things that require further work.
	// In the common case we're done, and in that case we need a single branch
	// because of the bitwise-or trick that follows.
	if ((list_headroom \| size_headroom) < 0) {
	if (list_headroom < 0) {
	ListTooLong(list, cl);
	}
	if (size_ >= max_size_) Scavenge();
	}
	}

	inline ThreadCache* ThreadCache::GetThreadHeap() {
	#ifdef HAVE_TLS
	// __thread is faster, but only when the kernel supports it
	if (KernelSupportsTLS())
	return threadlocal_heap_;
	#endif
	return reinterpret_cast<ThreadCache *>(
	perftools_pthread_getspecific(heap_key_));
	}

	inline ThreadCache* ThreadCache::GetCache() {
	ThreadCache* ptr = NULL;
	if (!tsd_inited_) {
	InitModule();
	} else {
	ptr = GetThreadHeap();
	}
	if (ptr == NULL) ptr = CreateCacheIfNecessary();
	return ptr;
	}

	// In deletion paths, we do not try to create a thread-cache. This is
	// because we may be in the thread destruction code and may have
	// already cleaned up the cache for this thread.
	inline ThreadCache* ThreadCache::GetCacheIfPresent() {
	if (!tsd_inited_) return NULL;
	return GetThreadHeap();
	}

	} // namespace tcmalloc

	#endif // TCMALLOC_THREAD_CACHE_H_