3#ifndef CUDA_BATTERY_ALLOCATOR_HPP
4#define CUDA_BATTERY_ALLOCATOR_HPP
31 return std::malloc(bytes);
43 return p.allocate(bytes);
47 return p.deallocate(ptr);
57class global_allocator {
59 CUDA NI void* allocate(
size_t bytes) {
64 void* data = std::malloc(bytes);
65 if (data ==
nullptr) {
66 printf(
"Allocation of device memory failed\n");
71 cudaError_t rc = cudaMalloc(&data, bytes);
72 if (rc != cudaSuccess) {
73 std::cerr <<
"Allocation of global memory failed: " << cudaGetErrorString(rc) << std::endl;
80 CUDA NI void deallocate(
void* data) {
83 #ifndef CUDA_THREADS_PER_BLOCK
87 cudaError_t rc = cudaFree(data);
88 if (rc != cudaSuccess) {
89 std::cerr <<
"Free of global memory failed: " << cudaGetErrorString(rc) << std::endl;
94 CUDA bool operator==(
const global_allocator&)
const {
return true; }
98class managed_allocator {
100 CUDA NI void* allocate(
size_t bytes) {
102 printf(
"Managed memory cannot be allocated in device functions.\n");
109 void* data =
nullptr;
110 cudaError_t rc = cudaMallocManaged(&data, bytes);
111 if (rc != cudaSuccess) {
112 std::cerr <<
"Allocation of managed memory failed: " << cudaGetErrorString(rc) << std::endl;
119 CUDA NI void deallocate(
void* data) {
121 printf(
"Managed memory cannot be freed in device functions.\n");
124 cudaError_t rc = cudaFree(data);
125 if (rc != cudaSuccess) {
126 std::cerr <<
"Free of managed memory failed: " << cudaGetErrorString(rc) << std::endl;
131 CUDA bool operator==(
const managed_allocator&)
const {
return true; }
143class pinned_allocator {
145 CUDA NI void* allocate(
size_t bytes) {
147 return global_allocator{}.allocate(bytes);
152 void* data =
nullptr;
153 cudaError_t rc = cudaMallocHost(&data, bytes);
154 if (rc != cudaSuccess) {
155 std::cerr <<
"Allocation of pinned memory failed: " << cudaGetErrorString(rc) << std::endl;
162 CUDA NI void deallocate(
void* data) {
164 return global_allocator{}.deallocate(data);
166 cudaError_t rc = cudaFreeHost(data);
167 if (rc != cudaSuccess) {
168 std::cerr <<
"Free of pinned memory failed: " << cudaGetErrorString(rc) << std::endl;
173 CUDA bool operator==(
const pinned_allocator&)
const {
return true; }
178CUDA inline void*
operator new(
size_t bytes, battery::managed_allocator& p) {
179 return p.allocate(bytes);
182CUDA inline void operator delete(
void* ptr, battery::managed_allocator& p) {
183 return p.deallocate(ptr);
186CUDA inline void*
operator new(
size_t bytes, battery::global_allocator& p) {
187 return p.allocate(bytes);
190CUDA inline void operator delete(
void* ptr, battery::global_allocator& p) {
194CUDA inline void*
operator new(
size_t bytes, battery::pinned_allocator& p) {
195 return p.allocate(bytes);
198CUDA inline void operator delete(
void* ptr, battery::pinned_allocator& p) {
210static const int power2[17] = {0, 1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16};
220 struct control_block {
225 size_t num_deallocations;
226 size_t num_allocations;
227 size_t unaligned_wasted_bytes;
230 CUDA control_block(
unsigned char* mem,
size_t capacity,
size_t alignment)
231 : mem(mem), capacity(capacity), offset(0), alignment(alignment), num_deallocations(0), num_allocations(0), unaligned_wasted_bytes(0), counter(1)
234 CUDA void* allocate(
size_t bytes) {
239 size_t smallest_alignment = (bytes > alignment || alignment > 16) ? alignment : impl::power2[bytes];
240 if(
size_t(&mem[offset]) % smallest_alignment != 0) {
241 size_t old_offset = offset;
242 offset += smallest_alignment - (size_t(&mem[offset]) % smallest_alignment);
243 unaligned_wasted_bytes += (offset - old_offset);
245 assert(offset + bytes <= capacity);
246 assert((
size_t)&mem[offset] % smallest_alignment == 0);
247 void* m = (
void*)&mem[offset];
253 CUDA void deallocate(
void* ptr) {
260 control_block* block;
268 if(block !=
nullptr) {
276 other.block =
nullptr;
280 : block(new control_block(mem,
capacity, alignment))
284 CUDA void destroy() {
286 if(block->counter == 0) {
289 #ifdef CUDA_THREADS_PER_BLOCK
298 if(block !=
nullptr) {
304 if(block !=
nullptr) {
308 other.block =
nullptr;
313 size_t old = block->alignment;
314 block->alignment = alignment;
319 return block->allocate(bytes);
323 block->deallocate(ptr);
328 printf(
"%% %" PRIu64
" / %" PRIu64
" used [%" PRIu64
"/%" PRIu64
"]KB [%" PRIu64
"/%" PRIu64
"]MB\n",
329 block->offset, block->capacity,
330 block->offset/1000, block->capacity/1000,
331 block->offset/1000/1000, block->capacity/1000/1000);
332 printf(
"%% %" PRIu64
" / %" PRIu64
" wasted for alignment [%" PRIu64
"/%" PRIu64
"]KB [%" PRIu64
"/%" PRIu64
"]MB\n",
333 block->unaligned_wasted_bytes, block->offset,
334 block->unaligned_wasted_bytes/1000, block->offset/1000,
335 block->unaligned_wasted_bytes/1000/1000, block->offset/1000/1000);
336 printf(
"%% %" PRIu64
" allocations and %" PRIu64
" deallocations\n", block->num_allocations, block->num_deallocations);
340 return block->offset;
344 return block->capacity;
348 return block->num_deallocations;
352 return block->num_allocations;
356 return block->unaligned_wasted_bytes;
360 return block == rhs.block;
366 return p.allocate(bytes);
370 return p.deallocate(ptr);
374 template <
class Allocator,
class InternalAllocator = Allocator>
376 struct control_block {
379 size_t num_deallocations;
380 size_t num_allocations;
381 size_t total_bytes_allocated;
383 CUDA control_block(
const Allocator& allocator)
384 : allocator(allocator), counter(1), num_deallocations(0), num_allocations(0), total_bytes_allocated(0)
387 CUDA NI void* allocate(
size_t bytes) {
389 total_bytes_allocated += bytes;
390 return allocator.allocate(bytes);
393 CUDA NI void deallocate(
void* ptr) {
396 allocator.deallocate(ptr);
401 InternalAllocator internal_allocator;
402 control_block* block;
408 : internal_allocator(other.internal_allocator), block(other.block)
414 : internal_allocator(internal_allocator)
416 block =
static_cast<control_block*
>(this->internal_allocator.allocate(
sizeof(control_block)));
417 new(block) control_block(allocator);
422 if(block->counter == 0) {
423 internal_allocator.deallocate(block);
428 return block->allocate(bytes);
432 block->deallocate(ptr);
436 return block->num_allocations;
440 return block->num_deallocations;
444 return block->total_bytes_allocated;
448 return block == rhs.block;
Definition allocator.hpp:218
CUDA NI ~pool_allocator()
Definition allocator.hpp:297
CUDA size_t used() const
Definition allocator.hpp:339
CUDA NI void deallocate(void *ptr)
Definition allocator.hpp:322
CUDA NI pool_allocator(unsigned char *mem, size_t capacity, size_t alignment=alignof(std::max_align_t))
Definition allocator.hpp:279
NI pool_allocator()=default
CUDA NI void * allocate(size_t bytes)
Definition allocator.hpp:318
CUDA size_t unaligned_wasted_bytes() const
Definition allocator.hpp:355
CUDA size_t num_allocations() const
Definition allocator.hpp:351
CUDA bool operator==(const pool_allocator &rhs) const
Definition allocator.hpp:359
CUDA NI pool_allocator(const pool_allocator &other)
Definition allocator.hpp:265
CUDA NI pool_allocator & operator=(pool_allocator &&other)
Definition allocator.hpp:303
CUDA size_t align_at(size_t alignment)
Definition allocator.hpp:312
CUDA size_t capacity() const
Definition allocator.hpp:343
CUDA NI pool_allocator(pool_allocator &&other)
Definition allocator.hpp:273
CUDA NI void print() const
Definition allocator.hpp:326
CUDA size_t num_deallocations() const
Definition allocator.hpp:347
Definition allocator.hpp:25
CUDA NI void deallocate(void *data)
Definition allocator.hpp:34
CUDA bool operator==(const standard_allocator &) const
Definition allocator.hpp:38
CUDA NI void * allocate(size_t bytes)
Definition allocator.hpp:27
Definition allocator.hpp:375
CUDA NI statistics_allocator(const statistics_allocator &other)
Definition allocator.hpp:407
CUDA size_t num_deallocations() const
Definition allocator.hpp:439
CUDA bool operator==(const this_type &rhs) const
Definition allocator.hpp:447
CUDA NI void * allocate(size_t bytes)
Definition allocator.hpp:427
CUDA size_t total_bytes_allocated() const
Definition allocator.hpp:443
CUDA NI statistics_allocator(const Allocator &allocator=Allocator(), const InternalAllocator &internal_allocator=InternalAllocator())
Definition allocator.hpp:413
CUDA NI void deallocate(void *ptr)
Definition allocator.hpp:431
CUDA NI ~statistics_allocator()
Definition allocator.hpp:420
CUDA size_t num_allocations() const
Definition allocator.hpp:435
Definition algorithm.hpp:10
CUDA bool operator==(const string< Alloc1 > &lhs, const string< Alloc2 > &rhs)
Definition string.hpp:110
#define CUDA
Definition utility.hpp:59
#define NI
Definition utility.hpp:62