// Copyright 2021 Pierre Talbot

#ifndef CUDA_BATTERY_ALLOCATOR_HPP
#define CUDA_BATTERY_ALLOCATOR_HPP

/** \file allocator.hpp
We provide several allocators compatible with the data structures of this library.
The allocators distinguish in which memory (shared, global, managed or the "standard" C++ memory) data should be allocated.
This allows us to provide uniform interfaces for both host (C++) and device (CUDA) code.

As a general rule, be careful to always deallocate memory from the side on which it was allocated, e.g., do not allocate on the host and then try to deallocate on the device.
To avoid this kind of mistake, use `battery::shared_ptr` when passing data to a CUDA kernel; see the manual for examples.
*/
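
/* Example (illustrative sketch, not part of the library): since every allocator
exposes the same `allocate`/`deallocate` interface, generic code can be written
once and instantiated with different memories. The helper below is hypothetical.

  template <class Allocator>
  CUDA int* make_zeroed_array(size_t n, Allocator& alloc) {
    int* data = static_cast<int*>(alloc.allocate(n * sizeof(int)));
    for(size_t i = 0; i < n; ++i) { data[i] = 0; }
    return data;  // Must be deallocated with the same allocator, on the same side.
  }
*/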

#include <cassert>
#include <cstddef>
#include <iostream>
#include <type_traits>
#include <inttypes.h>
#include "utility.hpp"

namespace battery {

/** This allocator calls the standard `malloc` and `free` functions. */
class standard_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
    if(bytes == 0) {
      return nullptr;
    }
    return std::malloc(bytes);
  }

  CUDA NI void deallocate(void* data) {
    std::free(data);
  }

  CUDA bool operator==(const standard_allocator&) const { return true; }
};
} // namespace battery

CUDA inline void* operator new(size_t bytes, battery::standard_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::standard_allocator& p) {
  return p.deallocate(ptr);
}
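
/* Example (sketch): the placement-new overload above enables allocator-based
construction. `battery::standard_allocator` behaves like plain `malloc`/`free`:

  battery::standard_allocator alloc;
  int* x = new(alloc) int(42);  // calls operator new(sizeof(int), alloc).
  alloc.deallocate(x);          // fine here: `int` is trivially destructible.
*/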

#ifdef __CUDACC__

namespace battery {

/** An allocator using the global memory of CUDA.
This can be used from both the host and device sides, but the memory can only be accessed from device functions.
Beware that allocation and deallocation must occur on the same side. */
class global_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
    if(bytes == 0) {
      return nullptr;
    }
  #ifdef __CUDA_ARCH__
    void* data = std::malloc(bytes);
    if(data == nullptr) {
      printf("Allocation of device memory failed\n");
    }
    return data;
  #else
    void* data = nullptr;
    cudaError_t rc = cudaMalloc(&data, bytes);
    if(rc != cudaSuccess) {
      std::cerr << "Allocation of global memory failed: " << cudaGetErrorString(rc) << std::endl;
      return nullptr;
    }
    return data;
  #endif
  }

  CUDA NI void deallocate(void* data) {
  #ifdef __CUDA_ARCH__
    // Bug in Turbo when deleting with `free`; the cause has not been found yet.
    #ifndef CUDA_THREADS_PER_BLOCK
      std::free(data);
    #endif
  #else
    cudaError_t rc = cudaFree(data);
    if(rc != cudaSuccess) {
      std::cerr << "Free of global memory failed: " << cudaGetErrorString(rc) << std::endl;
    }
  #endif
  }

  CUDA bool operator==(const global_allocator&) const { return true; }
};
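
/* Example (sketch): allocate global memory from the host, hand it to a kernel,
and free it from the host, i.e., on the same side as the allocation. The kernel
`fill` is hypothetical.

  __global__ void fill(int* data, size_t n) {
    for(size_t i = threadIdx.x; i < n; i += blockDim.x) { data[i] = 1; }
  }

  void host_side(size_t n) {
    battery::global_allocator alloc;
    int* data = static_cast<int*>(alloc.allocate(n * sizeof(int)));
    fill<<<1, 256>>>(data, n);   // The host never dereferences `data` itself.
    cudaDeviceSynchronize();
    alloc.deallocate(data);
  }
*/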

/** An allocator using the managed memory of CUDA. The memory must be allocated (and freed) on the host side. */
class managed_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
  #ifdef __CUDA_ARCH__
    printf("Managed memory cannot be allocated in device functions.\n");
    assert(false);
    return nullptr;
  #else
    if(bytes == 0) {
      return nullptr;
    }
    void* data = nullptr;
    cudaError_t rc = cudaMallocManaged(&data, bytes);
    if(rc != cudaSuccess) {
      std::cerr << "Allocation of managed memory failed: " << cudaGetErrorString(rc) << std::endl;
      return nullptr;
    }
    return data;
  #endif
  }

  CUDA NI void deallocate(void* data) {
  #ifdef __CUDA_ARCH__
    printf("Managed memory cannot be freed in device functions.\n");
    assert(false);
  #else
    cudaError_t rc = cudaFree(data);
    if(rc != cudaSuccess) {
      std::cerr << "Free of managed memory failed: " << cudaGetErrorString(rc) << std::endl;
    }
  #endif
  }

  CUDA bool operator==(const managed_allocator&) const { return true; }
};
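
/* Example (sketch): managed memory is allocated on the host but accessible from
both sides, so the host can read the result once the kernel has completed. The
kernel `incr` is hypothetical.

  __global__ void incr(int* x) { ++(*x); }

  void host_side() {
    battery::managed_allocator alloc;
    int* x = new(alloc) int(41);  // Uses the placement-new overload below.
    incr<<<1, 1>>>(x);
    cudaDeviceSynchronize();
    assert(*x == 42);
    alloc.deallocate(x);
  }
*/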

/** An allocator using pinned memory for shared access between the host and the device.
 * This type of memory is required on Microsoft Windows, on the Windows Subsystem for Linux (WSL), and on NVIDIA GRID (virtual GPU), because these systems do not support concurrent access to managed memory while a CUDA kernel is running.
 *
 * This allocator requires the flag `cudaDeviceMapHost` to be set first, using `cudaSetDeviceFlags`.
 *
 * We suppose unified virtual addressing (UVA) is enabled (i.e., the device property `unifiedAddressing` is true).
 *
 * We delegate the allocation to `global_allocator` when the allocation is performed on the device, since host memory cannot be allocated in device functions.
 */
class pinned_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
  #ifdef __CUDA_ARCH__
    return global_allocator{}.allocate(bytes);
  #else
    if(bytes == 0) {
      return nullptr;
    }
    void* data = nullptr;
    cudaError_t rc = cudaMallocHost(&data, bytes); // pinned memory.
    if(rc != cudaSuccess) {
      std::cerr << "Allocation of pinned memory failed: " << cudaGetErrorString(rc) << std::endl;
      return nullptr;
    }
    return data;
  #endif
  }

  CUDA NI void deallocate(void* data) {
  #ifdef __CUDA_ARCH__
    return global_allocator{}.deallocate(data);
  #else
    cudaError_t rc = cudaFreeHost(data);
    if(rc != cudaSuccess) {
      std::cerr << "Free of pinned memory failed: " << cudaGetErrorString(rc) << std::endl;
    }
  #endif
  }

  CUDA bool operator==(const pinned_allocator&) const { return true; }
};
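
/* Example (sketch, assuming UVA as stated above): `cudaSetDeviceFlags` must be
called before the CUDA context is created, hence before any other runtime call.

  int main() {
    cudaSetDeviceFlags(cudaDeviceMapHost);
    battery::pinned_allocator alloc;
    int* x = static_cast<int*>(alloc.allocate(sizeof(int)));
    *x = 0;
    // ... launch a kernel that reads/writes *x while the host polls it ...
    alloc.deallocate(x);
    return 0;
  }
*/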

} // namespace battery

CUDA inline void* operator new(size_t bytes, battery::managed_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::managed_allocator& p) {
  return p.deallocate(ptr);
}

CUDA inline void* operator new(size_t bytes, battery::global_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::global_allocator& p) {
  p.deallocate(ptr);
}

CUDA inline void* operator new(size_t bytes, battery::pinned_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::pinned_allocator& p) {
  p.deallocate(ptr);
}

#endif // __CUDACC__

namespace battery {

namespace impl {
// `power2[n]` is the largest power of two less than or equal to `n` (for 1 <= n <= 16).
// It is used by `pool_allocator` to pick the smallest sufficient alignment for small allocations.
#ifdef __CUDA_ARCH__
  __constant__
#endif
static const int power2[17] = {0, 1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16};
}

/** An allocator allocating memory from a pool of memory.
The memory can for instance be the CUDA shared memory.
This allocator is incomplete as it never frees the allocated memory.
It allocates a control block using the "normal" `operator new`, in which the address of the pool, its capacity and the current offset are stored.
*/
class pool_allocator {

  struct control_block {
    unsigned char* mem;
    size_t capacity;
    size_t offset;
    size_t alignment;
    size_t num_deallocations;
    size_t num_allocations;
    size_t unaligned_wasted_bytes;
    size_t counter;

    CUDA control_block(unsigned char* mem, size_t capacity, size_t alignment)
      : mem(mem), capacity(capacity), offset(0), alignment(alignment),
        num_deallocations(0), num_allocations(0), unaligned_wasted_bytes(0), counter(1)
    {}

    CUDA void* allocate(size_t bytes) {
      // printf("%p: allocate %lu bytes / %lu offset / %lu capacity / %lu alignment / %p current mem\n", mem, bytes, offset, capacity, alignment, &mem[offset]);
      if(bytes == 0) {
        return nullptr;
      }
      // Small allocations (<= 16 bytes) are aligned on the largest power of two not exceeding their size.
      size_t smallest_alignment = (bytes > alignment || alignment > 16) ? alignment : impl::power2[bytes];
      if(size_t(&mem[offset]) % smallest_alignment != 0) { // If we are currently unaligned.
        size_t old_offset = offset;
        offset += smallest_alignment - (size_t(&mem[offset]) % smallest_alignment);
        unaligned_wasted_bytes += (offset - old_offset);
      }
      assert(offset + bytes <= capacity);
      assert((size_t)&mem[offset] % smallest_alignment == 0);
      void* m = (void*)&mem[offset];
      offset += bytes;
      num_allocations++;
      return m;
    }

    CUDA void deallocate(void* ptr) {
      if(ptr != nullptr) {
        num_deallocations++;
      }
    }
  };

  control_block* block;

public:
  NI pool_allocator() = default;

  CUDA NI pool_allocator(const pool_allocator& other)
    : block(other.block)
  {
    if(block != nullptr) {
      block->counter++;
    }
  }

  CUDA NI pool_allocator(pool_allocator&& other)
    : block(other.block)
  {
    other.block = nullptr;
  }

  CUDA NI pool_allocator(unsigned char* mem, size_t capacity, size_t alignment = alignof(std::max_align_t))
    : block(new control_block(mem, capacity, alignment))
  {}

private:
  CUDA void destroy() {
    block->counter--;
    if(block->counter == 0) {
      // This is a temporary hack to disable deleting the block when using Turbo...
      // Unfortunately, there is a bug with `-arch gpu`: the block seems to be deleted from the host while it was allocated on the device (or vice versa).
      #ifndef CUDA_THREADS_PER_BLOCK
        delete block;
      #endif
    }
  }

public:
  CUDA NI ~pool_allocator() {
    if(block != nullptr) {
      destroy();
    }
  }

  CUDA NI pool_allocator& operator=(pool_allocator&& other) {
    if(block != nullptr) {
      destroy();
    }
    block = other.block;
    other.block = nullptr;
    return *this;
  }

  /** Change the default alignment of subsequent allocations; returns the previous alignment. */
  CUDA size_t align_at(size_t alignment) {
    size_t old = block->alignment;
    block->alignment = alignment;
    return old;
  }

  CUDA NI void* allocate(size_t bytes) {
    return block->allocate(bytes);
  }

  CUDA NI void deallocate(void* ptr) {
    block->deallocate(ptr);
  }

  CUDA NI void print() const {
    // CUDA printf does not support "%zu" -- use the PRIu64 macro instead (works on both Windows and Linux).
    printf("%% %" PRIu64 " / %" PRIu64 " used [%" PRIu64 "/%" PRIu64 "]KB [%" PRIu64 "/%" PRIu64 "]MB\n",
      block->offset, block->capacity,
      block->offset/1000, block->capacity/1000,
      block->offset/1000/1000, block->capacity/1000/1000);
    printf("%% %" PRIu64 " / %" PRIu64 " wasted for alignment [%" PRIu64 "/%" PRIu64 "]KB [%" PRIu64 "/%" PRIu64 "]MB\n",
      block->unaligned_wasted_bytes, block->offset,
      block->unaligned_wasted_bytes/1000, block->offset/1000,
      block->unaligned_wasted_bytes/1000/1000, block->offset/1000/1000);
    printf("%% %" PRIu64 " allocations and %" PRIu64 " deallocations\n", block->num_allocations, block->num_deallocations);
  }

  CUDA size_t used() const {
    return block->offset;
  }

  CUDA size_t capacity() const {
    return block->capacity;
  }

  CUDA size_t num_deallocations() const {
    return block->num_deallocations;
  }

  CUDA size_t num_allocations() const {
    return block->num_allocations;
  }

  CUDA size_t unaligned_wasted_bytes() const {
    return block->unaligned_wasted_bytes;
  }

  CUDA bool operator==(const pool_allocator& rhs) const {
    return block == rhs.block;
  }
};
} // namespace battery

CUDA inline void* operator new(size_t bytes, battery::pool_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::pool_allocator& p) {
  return p.deallocate(ptr);
}
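
/* Example (sketch): carving a pool out of CUDA dynamic shared memory inside a
kernel. The pool never frees individual allocations; the memory vanishes along
with the block's shared memory. The launch configuration is illustrative.

  __global__ void kernel() {
    extern __shared__ unsigned char shmem[];
    battery::pool_allocator pool(shmem, 1024);  // 1024 = dynamic shared bytes.
    int* tmp = static_cast<int*>(pool.allocate(10 * sizeof(int)));
    // ... use tmp; no need to deallocate it individually ...
  }

  // Host side: kernel<<<1, 1, 1024>>>();
*/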

namespace battery {
  /** An allocator wrapping another allocator in order to count the number of allocations, deallocations and the total number of bytes allocated. */
  template <class Allocator, class InternalAllocator = Allocator>
  class statistics_allocator {
    struct control_block {
      Allocator allocator;
      size_t counter;
      size_t num_deallocations;
      size_t num_allocations;
      size_t total_bytes_allocated;

      CUDA control_block(const Allocator& allocator)
        : allocator(allocator), counter(1), num_deallocations(0), num_allocations(0), total_bytes_allocated(0)
      {}

      CUDA NI void* allocate(size_t bytes) {
        num_allocations++;
        total_bytes_allocated += bytes;
        return allocator.allocate(bytes);
      }

      CUDA NI void deallocate(void* ptr) {
        if(ptr != nullptr) {
          num_deallocations++;
          allocator.deallocate(ptr);
        }
      }
    };

    InternalAllocator internal_allocator;
    control_block* block;

  public:
    using this_type = statistics_allocator<Allocator, InternalAllocator>;

    CUDA NI statistics_allocator(const statistics_allocator& other)
      : internal_allocator(other.internal_allocator), block(other.block)
    {
      block->counter++;
    }

    CUDA NI statistics_allocator(const Allocator& allocator = Allocator(), const InternalAllocator& internal_allocator = InternalAllocator())
      : internal_allocator(internal_allocator)
    {
      block = static_cast<control_block*>(this->internal_allocator.allocate(sizeof(control_block)));
      new(block) control_block(allocator);
    }

    CUDA NI ~statistics_allocator() {
      block->counter--;
      if(block->counter == 0) {
        internal_allocator.deallocate(block);
      }
    }

    CUDA NI void* allocate(size_t bytes) {
      return block->allocate(bytes);
    }

    CUDA NI void deallocate(void* ptr) {
      block->deallocate(ptr);
    }

    CUDA size_t num_allocations() const {
      return block->num_allocations;
    }

    CUDA size_t num_deallocations() const {
      return block->num_deallocations;
    }

    CUDA size_t total_bytes_allocated() const {
      return block->total_bytes_allocated;
    }

    CUDA inline bool operator==(const this_type& rhs) const {
      return block == rhs.block;
    }
  };
} // namespace battery
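
/* Example (sketch): wrapping an allocator to record usage statistics. Copies
share the same control block, so the counters are aggregated across copies.

  void stats_demo() {
    battery::statistics_allocator<battery::standard_allocator> alloc;
    void* a = alloc.allocate(100);
    void* b = alloc.allocate(50);
    alloc.deallocate(a);
    alloc.deallocate(b);
    assert(alloc.num_allocations() == 2);
    assert(alloc.num_deallocations() == 2);
    assert(alloc.total_bytes_allocated() == 150);
  }
*/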

#endif // CUDA_BATTERY_ALLOCATOR_HPP