Cuda battery library
allocator.hpp
// Copyright 2021 Pierre Talbot

#ifndef CUDA_BATTERY_ALLOCATOR_HPP
#define CUDA_BATTERY_ALLOCATOR_HPP

/** \file allocator.hpp
We provide several allocators compatible with the data structures provided by this library.
The allocators are meant to distinguish in which memory (shared, global, managed or the "standard" C++ memory) we should allocate data.
This allows us to provide uniform interfaces for both host (C++) and device (CUDA) code.

As a general comment, be careful to always deallocate memory from the side on which you allocated it, e.g., do not allocate on the host and then try to deallocate it on the device.
To avoid this kind of mistake, you should use `battery::shared_ptr` when passing data to a CUDA kernel, see the manual for examples.
*/

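// For illustration only (not part of the library): a minimal sketch pairing an
// allocation with a deallocation through the same allocator, on the same side
// (here, the host). It uses the placement-new overload defined later in this file.
//
//   battery::standard_allocator alloc;
//   int* x = new(alloc) int(42);
//   alloc.deallocate(x);
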
#include <cassert>
#include <cstddef>
#include <iostream>
#include <type_traits>
#include <cinttypes>
#include <new>
#include "utility.hpp"

namespace battery {

/** This allocator calls the standard `malloc` and `free` functions. */
class standard_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
    if(bytes == 0) {
      return nullptr;
    }
    return std::malloc(bytes);
  }

  CUDA NI void deallocate(void* data) {
    std::free(data);
  }

  CUDA bool operator==(const standard_allocator&) const { return true; }
};
} // namespace battery

CUDA inline void* operator new(size_t bytes, battery::standard_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::standard_allocator& p) {
  p.deallocate(ptr);
}

#ifdef __CUDACC__

namespace battery {

/** An allocator using the global memory of CUDA.
This can be used from both the host and the device side, but the memory can only be accessed from device functions.
Beware that allocation and deallocation must occur on the same side. */
class global_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
    if(bytes == 0) {
      return nullptr;
    }
  #ifdef __CUDA_ARCH__
    void* data = std::malloc(bytes);
    if (data == nullptr) {
      printf("Allocation of device memory failed\n");
    }
    return data;
  #else
    void* data = nullptr;
    cudaError_t rc = cudaMalloc(&data, bytes);
    if (rc != cudaSuccess) {
      std::cerr << "Allocation of global memory failed: " << cudaGetErrorString(rc) << std::endl;
      return nullptr;
    }
    return data;
  #endif
  }

  CUDA NI void deallocate(void* data) {
  #ifdef __CUDA_ARCH__
    std::free(data);
  #else
    cudaError_t rc = cudaFree(data);
    if (rc != cudaSuccess) {
      std::cerr << "Free of global memory failed: " << cudaGetErrorString(rc) << std::endl;
    }
  #endif
  }

  CUDA bool operator==(const global_allocator&) const { return true; }
};

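// For illustration only (a sketch, not part of the library): `global_allocator`
// can allocate from the host, but the returned pointer is only dereferenceable
// in device code. `init_kernel` is a hypothetical kernel name.
//
//   __global__ void init_kernel(int* data) { *data = 42; }
//
//   battery::global_allocator galloc;
//   int* d = static_cast<int*>(galloc.allocate(sizeof(int)));
//   init_kernel<<<1, 1>>>(d);
//   cudaDeviceSynchronize();
//   galloc.deallocate(d); // Deallocated from the host, where it was allocated.
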
/** An allocator using the managed memory of CUDA. The memory must be allocated and freed from the host side. */
class managed_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
  #ifdef __CUDA_ARCH__
    printf("Managed memory cannot be allocated in device functions.\n");
    assert(false);
    return nullptr;
  #else
    if(bytes == 0) {
      return nullptr;
    }
    void* data = nullptr;
    cudaError_t rc = cudaMallocManaged(&data, bytes);
    if (rc != cudaSuccess) {
      std::cerr << "Allocation of managed memory failed: " << cudaGetErrorString(rc) << std::endl;
      return nullptr;
    }
    return data;
  #endif
  }

  CUDA NI void deallocate(void* data) {
  #ifdef __CUDA_ARCH__
    printf("Managed memory cannot be freed in device functions.\n");
    assert(false);
  #else
    cudaError_t rc = cudaFree(data);
    if (rc != cudaSuccess) {
      std::cerr << "Free of managed memory failed: " << cudaGetErrorString(rc) << std::endl;
    }
  #endif
  }

  CUDA bool operator==(const managed_allocator&) const { return true; }
};

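// For illustration only (a sketch, not part of the library): managed memory is
// allocated and freed on the host, but accessible from both sides. `add_one` is
// a hypothetical kernel name.
//
//   __global__ void add_one(int* data) { *data += 1; }
//
//   battery::managed_allocator malloc_;
//   int* m = new(malloc_) int(41); // Placement-new overload defined below.
//   add_one<<<1, 1>>>(m);
//   cudaDeviceSynchronize();
//   assert(*m == 42);              // Readable from the host as well.
//   malloc_.deallocate(m);
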
/** An allocator using pinned memory for shared access between the host and the device.
 * This type of memory is required on Microsoft Windows, on the Windows Subsystem for Linux (WSL), and on NVIDIA GRID (virtual GPU), because these systems do not support concurrent access to managed memory while a CUDA kernel is running.
 *
 * This allocator requires that you first set the flag `cudaDeviceMapHost` using `cudaSetDeviceFlags`.
 *
 * We suppose unified virtual addressing (UVA) is enabled (the property `unifiedAddressing` is true).
 *
 * We delegate the allocation to `global_allocator` when the allocation is done on the device, since host memory cannot be allocated in device functions.
 * */
class pinned_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
  #ifdef __CUDA_ARCH__
    return global_allocator{}.allocate(bytes);
  #else
    if(bytes == 0) {
      return nullptr;
    }
    void* data = nullptr;
    cudaError_t rc = cudaMallocHost(&data, bytes); // Pinned host memory.
    if (rc != cudaSuccess) {
      std::cerr << "Allocation of pinned memory failed: " << cudaGetErrorString(rc) << std::endl;
      return nullptr;
    }
    return data;
  #endif
  }

  CUDA NI void deallocate(void* data) {
  #ifdef __CUDA_ARCH__
    global_allocator{}.deallocate(data);
  #else
    cudaError_t rc = cudaFreeHost(data);
    if (rc != cudaSuccess) {
      std::cerr << "Free of pinned memory failed: " << cudaGetErrorString(rc) << std::endl;
    }
  #endif
  }

  CUDA bool operator==(const pinned_allocator&) const { return true; }
};

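// For illustration only (a sketch, not part of the library): pinned memory
// requires the `cudaDeviceMapHost` flag to be set before any allocation.
//
//   cudaSetDeviceFlags(cudaDeviceMapHost);
//   battery::pinned_allocator palloc;
//   int* p = static_cast<int*>(palloc.allocate(sizeof(int)));
//   // ... both the host and a running kernel can access `p` ...
//   palloc.deallocate(p);
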
} // namespace battery

CUDA inline void* operator new(size_t bytes, battery::managed_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::managed_allocator& p) {
  p.deallocate(ptr);
}

CUDA inline void* operator new(size_t bytes, battery::global_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::global_allocator& p) {
  p.deallocate(ptr);
}

CUDA inline void* operator new(size_t bytes, battery::pinned_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::pinned_allocator& p) {
  p.deallocate(ptr);
}

#endif // __CUDACC__

namespace battery {

namespace impl {
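// power2[n] is the largest power of two smaller than or equal to n, for n in
// [1, 16] (index 0 is unused), e.g., power2[3] == 2 and power2[8] == 8.
// It gives the natural alignment of a small allocation in `pool_allocator`.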
#ifdef __CUDA_ARCH__
  __constant__
#endif
static const int power2[17] = {0, 1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16};
} // namespace impl

/** An allocator allocating memory from a pool of memory.
The memory can for instance be the CUDA shared memory.
This allocator is incomplete as it never frees the allocated memory.
It allocates a control block using the "normal" `operator new`, in which the address of the pool, its capacity and the current offset are stored.
*/
class pool_allocator {

  struct control_block {
    unsigned char* mem;
    size_t capacity;
    size_t offset;
    size_t alignment;
    size_t num_deallocations;
    size_t num_allocations;
    size_t unaligned_wasted_bytes;
    size_t counter;

    CUDA control_block(unsigned char* mem, size_t capacity, size_t alignment)
      : mem(mem), capacity(capacity), offset(0), alignment(alignment), num_deallocations(0), num_allocations(0), unaligned_wasted_bytes(0), counter(1)
    {}

    CUDA void* allocate(size_t bytes) {
      // printf("%p: allocate %lu bytes / %lu offset / %lu capacity / %lu alignment / %p current mem\n", mem, bytes, offset, capacity, alignment, &mem[offset]);
      if(bytes == 0) {
        return nullptr;
      }
      size_t smallest_alignment = (bytes > alignment || alignment > 16) ? alignment : impl::power2[bytes];
      if(size_t(&mem[offset]) % smallest_alignment != 0) { // If we are currently unaligned.
        size_t old_offset = offset;
        offset += smallest_alignment - (size_t(&mem[offset]) % smallest_alignment);
        unaligned_wasted_bytes += (offset - old_offset);
      }
      assert(offset + bytes <= capacity);
      assert((size_t)&mem[offset] % smallest_alignment == 0);
      void* m = (void*)&mem[offset];
      offset += bytes;
      num_allocations++;
      return m;
    }

    CUDA void deallocate(void* ptr) {
      if(ptr != nullptr) {
        num_deallocations++;
      }
    }
  };

  control_block* block;

public:
  NI pool_allocator() = default;

  CUDA NI pool_allocator(const pool_allocator& other)
    : block(other.block)
  {
    if(block != nullptr) {
      block->counter++;
    }
  }

  CUDA NI pool_allocator(pool_allocator&& other)
    : block(other.block)
  {
    other.block = nullptr;
  }

  CUDA NI pool_allocator(unsigned char* mem, size_t capacity, size_t alignment = alignof(std::max_align_t))
    : block(new control_block(mem, capacity, alignment))
  {}

private:
  CUDA void destroy() {
    block->counter--;
    if(block->counter == 0) {
      // This is a temporary hack to disable deleting the block when using Turbo...
      // Unfortunately, there is a bug with `-arch gpu`: it seems the block is deleted from the host while it is being allocated on the device (or vice versa).
    #ifndef CUDA_THREADS_PER_BLOCK
      delete block;
    #endif
    }
  }

public:
  CUDA NI ~pool_allocator() {
    if(block != nullptr) {
      destroy();
    }
  }

  CUDA NI pool_allocator& operator=(pool_allocator&& other) {
    if(block != nullptr) {
      destroy();
    }
    block = other.block;
    other.block = nullptr;
    return *this;
  }

  CUDA size_t align_at(size_t alignment) {
    size_t old = block->alignment;
    block->alignment = alignment;
    return old;
  }

  CUDA NI void* allocate(size_t bytes) {
    return block->allocate(bytes);
  }

  CUDA NI void deallocate(void* ptr) {
    block->deallocate(ptr);
  }

  CUDA NI void print() const {
    // CUDA printf does not support "%zu" -- use the PRIu64 macro (Windows / Linux).
    printf("%% %" PRIu64 " / %" PRIu64 " used [%" PRIu64 "/%" PRIu64 "]KB [%" PRIu64 "/%" PRIu64 "]MB\n",
      block->offset, block->capacity,
      block->offset/1000, block->capacity/1000,
      block->offset/1000/1000, block->capacity/1000/1000);
    printf("%% %" PRIu64 " / %" PRIu64 " wasted for alignment [%" PRIu64 "/%" PRIu64 "]KB [%" PRIu64 "/%" PRIu64 "]MB\n",
      block->unaligned_wasted_bytes, block->offset,
      block->unaligned_wasted_bytes/1000, block->offset/1000,
      block->unaligned_wasted_bytes/1000/1000, block->offset/1000/1000);
    printf("%% %" PRIu64 " allocations and %" PRIu64 " deallocations\n", block->num_allocations, block->num_deallocations);
  }

  CUDA size_t used() const {
    return block->offset;
  }

  CUDA size_t capacity() const {
    return block->capacity;
  }

  CUDA size_t num_deallocations() const {
    return block->num_deallocations;
  }

  CUDA size_t num_allocations() const {
    return block->num_allocations;
  }

  CUDA size_t unaligned_wasted_bytes() const {
    return block->unaligned_wasted_bytes;
  }

  CUDA bool operator==(const pool_allocator& rhs) const {
    return block == rhs.block;
  }
};
} // namespace battery

CUDA inline void* operator new(size_t bytes, battery::pool_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::pool_allocator& p) {
  p.deallocate(ptr);
}

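// For illustration only (a sketch, not part of the library): carving a pool out
// of CUDA shared memory inside a kernel. `kernel` is a hypothetical name and
// 1024 a hypothetical pool capacity.
//
//   __global__ void kernel() {
//     extern __shared__ unsigned char shmem[]; // Size given at launch time.
//     if(threadIdx.x == 0) {
//       battery::pool_allocator pool(shmem, 1024);
//       int* xs = static_cast<int*>(pool.allocate(10 * sizeof(int)));
//       // `xs` points into shared memory; the pool never frees it.
//     }
//   }
//
//   // Launched with 1024 bytes of dynamic shared memory:
//   //   kernel<<<grid, block, 1024>>>();
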
namespace battery {
  template <class Allocator, class InternalAllocator = Allocator>
  class statistics_allocator {
    struct control_block {
      Allocator allocator;
      size_t counter;
      size_t num_deallocations;
      size_t num_allocations;
      size_t total_bytes_allocated;

      CUDA control_block(const Allocator& allocator)
        : allocator(allocator), counter(1), num_deallocations(0), num_allocations(0), total_bytes_allocated(0)
      {}

      CUDA NI void* allocate(size_t bytes) {
        num_allocations++;
        total_bytes_allocated += bytes;
        return allocator.allocate(bytes);
      }

      CUDA NI void deallocate(void* ptr) {
        if(ptr != nullptr) {
          num_deallocations++;
          allocator.deallocate(ptr);
        }
      }
    };

    InternalAllocator internal_allocator;
    control_block* block;

  public:
    using this_type = statistics_allocator<Allocator, InternalAllocator>;

    CUDA NI statistics_allocator(const statistics_allocator& other)
      : internal_allocator(other.internal_allocator), block(other.block)
    {
      block->counter++;
    }

    CUDA NI statistics_allocator(const Allocator& allocator = Allocator(), const InternalAllocator& internal_allocator = InternalAllocator())
      : internal_allocator(internal_allocator)
    {
      block = static_cast<control_block*>(this->internal_allocator.allocate(sizeof(control_block)));
      new(block) control_block(allocator);
    }

    CUDA NI ~statistics_allocator() {
      block->counter--;
      if(block->counter == 0) {
        internal_allocator.deallocate(block);
      }
    }

    CUDA NI void* allocate(size_t bytes) {
      return block->allocate(bytes);
    }

    CUDA NI void deallocate(void* ptr) {
      block->deallocate(ptr);
    }

    CUDA size_t num_allocations() const {
      return block->num_allocations;
    }

    CUDA size_t num_deallocations() const {
      return block->num_deallocations;
    }

    CUDA size_t total_bytes_allocated() const {
      return block->total_bytes_allocated;
    }

    CUDA inline bool operator==(const this_type& rhs) const {
      return block == rhs.block;
    }
  };
} // namespace battery

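// For illustration only (a sketch, not part of the library): wrapping
// `standard_allocator` to count allocations.
//
//   battery::statistics_allocator<battery::standard_allocator> stats;
//   void* a = stats.allocate(16);
//   void* b = stats.allocate(32);
//   stats.deallocate(a);
//   stats.deallocate(b);
//   // stats.num_allocations() == 2, stats.num_deallocations() == 2,
//   // stats.total_bytes_allocated() == 48.
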
#endif // CUDA_BATTERY_ALLOCATOR_HPP