Cuda battery library
allocator.hpp
// Copyright 2021 Pierre Talbot

#ifndef CUDA_BATTERY_ALLOCATOR_HPP
#define CUDA_BATTERY_ALLOCATOR_HPP

/** \file allocator.hpp
We provide several allocators compatible with the data structures provided by this library.
The allocators are meant to distinguish in which memory (shared, global, managed or the "standard" C++ memory) data should be allocated.
This allows us to provide uniform interfaces for both host (C++) and device (CUDA) code.

As a general comment, be careful to always deallocate memory from the side on which you allocated it, e.g., do not allocate on the host and then try to deallocate it on the device.
To avoid this kind of mistake, you should use `battery::shared_ptr` when passing data to a CUDA kernel; see the manual for examples.
*/
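
/* A minimal usage sketch of the allocation/deallocation discipline described
   above (the kernel name `kernel` is illustrative, not part of the library):
   the buffer is allocated host-side in managed memory, written by the device,
   and deallocated on the host side again.

\code
__global__ void kernel(int* data) {
  data[threadIdx.x] = threadIdx.x;
}

int main() {
  battery::managed_allocator alloc;
  int* data = static_cast<int*>(alloc.allocate(32 * sizeof(int)));
  kernel<<<1, 32>>>(data);
  cudaDeviceSynchronize();
  alloc.deallocate(data); // Same side (host) as the allocation.
}
\endcode
*/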

#include <cassert>
#include <cstddef>
#include <iostream>
#include <type_traits>
#include <inttypes.h>
#include "utility.hpp"

#ifdef __CUDACC__

namespace battery {

/** An allocator using the global memory of CUDA.
This can be used from both the host and device side, but the memory can only be accessed when in a device function.
Beware that allocation and deallocation must occur on the same side. */
class global_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
    if(bytes == 0) {
      return nullptr;
    }
    #ifdef __CUDA_ARCH__
      void* data = std::malloc(bytes);
      if (data == nullptr) {
        printf("Allocation of device memory failed\n");
      }
      return data;
    #else
      void* data = nullptr;
      cudaError_t rc = cudaMalloc(&data, bytes);
      if (rc != cudaSuccess) {
        std::cerr << "Allocation of global memory failed: " << cudaGetErrorString(rc) << std::endl;
        return nullptr;
      }
      return data;
    #endif
  }

  CUDA NI void deallocate(void* data) {
    #ifdef __CUDA_ARCH__
      std::free(data);
    #else
      cudaError_t rc = cudaFree(data);
      if (rc != cudaSuccess) {
        std::cerr << "Free of global memory failed: " << cudaGetErrorString(rc) << std::endl;
      }
    #endif
  }

  CUDA bool operator==(const global_allocator&) const { return true; }
};
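
/* A minimal sketch of `global_allocator` (the kernel name `fill` is
   illustrative). The host allocates and frees the buffer, but only the device
   can dereference it; the host must copy the data back explicitly.

\code
__global__ void fill(int* data) {
  data[threadIdx.x] = threadIdx.x;
}

void example() {
  battery::global_allocator alloc;
  int* data = static_cast<int*>(alloc.allocate(32 * sizeof(int)));
  fill<<<1, 32>>>(data);
  int host_copy[32];
  cudaMemcpy(host_copy, data, sizeof(host_copy), cudaMemcpyDeviceToHost);
  alloc.deallocate(data); // Host-side free, matching the host-side allocation.
}
\endcode
*/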

/** An allocator using the managed memory of CUDA when the memory is allocated on the host.
 * We delegate the allocation to `global_allocator` when the allocation is done on the device (since managed memory cannot be allocated in device functions). */
class managed_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
    #ifdef __CUDA_ARCH__
      return global_allocator{}.allocate(bytes);
    #else
      if(bytes == 0) {
        return nullptr;
      }
      void* data = nullptr;
      cudaError_t rc = cudaMallocManaged(&data, bytes);
      if (rc != cudaSuccess) {
        std::cerr << "Allocation of managed memory failed: " << cudaGetErrorString(rc) << std::endl;
        return nullptr;
      }
      return data;
    #endif
  }

  CUDA NI void deallocate(void* data) {
    #ifdef __CUDA_ARCH__
      return global_allocator{}.deallocate(data);
    #else
      cudaError_t rc = cudaFree(data);
      if (rc != cudaSuccess) {
        std::cerr << "Free of managed memory failed: " << cudaGetErrorString(rc) << std::endl;
      }
    #endif
  }

  CUDA bool operator==(const managed_allocator&) const { return true; }
};
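
/* A minimal sketch of `managed_allocator` (the kernel name `increment` is
   illustrative): managed memory is directly accessible from both sides, so the
   host can initialize the buffer and read the result back without copies.

\code
__global__ void increment(int* data) {
  data[threadIdx.x] += 1;
}

void example() {
  battery::managed_allocator alloc;
  int* data = static_cast<int*>(alloc.allocate(32 * sizeof(int)));
  for(int i = 0; i < 32; ++i) { data[i] = i; } // Host-side initialization.
  increment<<<1, 32>>>(data);
  cudaDeviceSynchronize();
  assert(data[0] == 1); // Host-side read of the result.
  alloc.deallocate(data);
}
\endcode
*/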

/** An allocator using pinned memory for shared access between the host and the device.
 * This type of memory is required on Microsoft Windows, on the Windows Subsystem for Linux (WSL), and on NVIDIA GRID (virtual GPU), because these systems do not support concurrent access to managed memory while a CUDA kernel is running.
 *
 * This allocator requires that you first set the flag `cudaDeviceMapHost` using `cudaSetDeviceFlags`.
 *
 * We suppose unified virtual addressing (UVA) is enabled (the property `unifiedAddressing` is true).
 *
 * We delegate the allocation to `global_allocator` when the allocation is done on the device, since host memory cannot be allocated in device functions.
 */
class pinned_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
    #ifdef __CUDA_ARCH__
      return global_allocator{}.allocate(bytes);
    #else
      if(bytes == 0) {
        return nullptr;
      }
      void* data = nullptr;
      cudaError_t rc = cudaMallocHost(&data, bytes); // pinned
      if (rc != cudaSuccess) {
        std::cerr << "Allocation of pinned memory failed: " << cudaGetErrorString(rc) << std::endl;
        return nullptr;
      }
      return data;
    #endif
  }

  CUDA NI void deallocate(void* data) {
    #ifdef __CUDA_ARCH__
      return global_allocator{}.deallocate(data);
    #else
      cudaError_t rc = cudaFreeHost(data);
      if (rc != cudaSuccess) {
        std::cerr << "Free of pinned memory failed: " << cudaGetErrorString(rc) << std::endl;
      }
    #endif
  }

  CUDA bool operator==(const pinned_allocator&) const { return true; }
};
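
/* A minimal sketch of `pinned_allocator` (the kernel name `touch` is
   illustrative). This sketch assumes `cudaSetDeviceFlags` is called before the
   device is first used, as the flag cannot be changed afterwards.

\code
__global__ void touch(int* data) {
  data[threadIdx.x] += 1;
}

void example() {
  cudaSetDeviceFlags(cudaDeviceMapHost); // Required by this allocator.
  battery::pinned_allocator alloc;
  int* data = static_cast<int*>(alloc.allocate(32 * sizeof(int)));
  for(int i = 0; i < 32; ++i) { data[i] = i; } // Host writes.
  touch<<<1, 32>>>(data); // With UVA, the same pointer is valid on the device.
  cudaDeviceSynchronize();
  alloc.deallocate(data);
}
\endcode
*/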

} // namespace battery

CUDA inline void* operator new(size_t bytes, battery::managed_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::managed_allocator& p) {
  p.deallocate(ptr);
}

CUDA inline void* operator new(size_t bytes, battery::global_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::global_allocator& p) {
  p.deallocate(ptr);
}

CUDA inline void* operator new(size_t bytes, battery::pinned_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::pinned_allocator& p) {
  p.deallocate(ptr);
}
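
/* These overloads enable the placement-new syntax with an allocator argument.
   A minimal sketch (the struct `point` is illustrative): note that the matching
   `operator delete` is only called automatically if the constructor throws, so
   destruction and deallocation are manual.

\code
struct point { int x; int y; };

void example() {
  battery::managed_allocator alloc;
  point* p = new(alloc) point{1, 2}; // Allocates via `alloc`, then constructs.
  p->~point();                       // Destructor must be called manually.
  alloc.deallocate(p);
}
\endcode
*/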

#endif // __CUDACC__

namespace battery {

namespace impl {
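// power2[n] is the largest power of two less than or equal to n, for 1 <= n <= 16.
// It is used below to compute the natural alignment of small allocations.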
#ifdef __CUDA_ARCH__
  __constant__
#endif
static const int power2[17] = {0, 1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16};
} // namespace impl

/** An allocator allocating memory from a pool of memory.
The memory can for instance be the CUDA shared memory.
This allocator is incomplete as it never frees the memory allocated.
It allocates a control block using the "normal" `operator new`, where the address to the pool, its capacity and current offset are stored.
*/
class pool_allocator {

  struct control_block {
    unsigned char* mem;
    size_t capacity;
    size_t offset;
    size_t alignment;
    size_t num_deallocations;
    size_t num_allocations;
    size_t unaligned_wasted_bytes;
    size_t counter;

    CUDA control_block(unsigned char* mem, size_t capacity, size_t alignment)
      : mem(mem), capacity(capacity), offset(0), alignment(alignment), num_deallocations(0), num_allocations(0), unaligned_wasted_bytes(0), counter(1)
    {}

    CUDA void* allocate(size_t bytes) {
      // printf("%p: allocate %lu bytes / %lu offset / %lu capacity / %lu alignment / %p current mem\n", mem, bytes, offset, capacity, alignment, &mem[offset]);
      if(bytes == 0) {
        return nullptr;
      }
      size_t smallest_alignment = (bytes > alignment || alignment > 16) ? alignment : impl::power2[bytes];
      if(size_t(&mem[offset]) % smallest_alignment != 0) { // If we are currently unaligned.
        size_t old_offset = offset;
        offset += smallest_alignment - (size_t(&mem[offset]) % smallest_alignment);
        unaligned_wasted_bytes += (offset - old_offset);
      }
      assert(offset + bytes <= capacity);
      assert((size_t)&mem[offset] % smallest_alignment == 0);
      void* m = (void*)&mem[offset];
      offset += bytes;
      num_allocations++;
      return m;
    }

    CUDA void deallocate(void* ptr) {
      if(ptr != nullptr) {
        num_deallocations++;
      }
    }
  };

  control_block* block;

public:
  CUDA NI pool_allocator() = default;

  CUDA NI pool_allocator(const pool_allocator& other):
    block(other.block)
  {
    if(block != nullptr) {
      block->counter++;
    }
  }

  CUDA NI pool_allocator(pool_allocator&& other):
    block(other.block)
  {
    other.block = nullptr;
  }

  CUDA NI pool_allocator(unsigned char* mem, size_t capacity, size_t alignment = alignof(std::max_align_t))
    : block(::new control_block(mem, capacity, alignment))
  {}

private:
  CUDA void destroy() {
    block->counter--;
    if(block->counter == 0) {
      ::delete block;
    }
  }

public:
  CUDA NI ~pool_allocator() {
    if(block != nullptr) {
      destroy();
    }
  }

  CUDA NI pool_allocator& operator=(pool_allocator&& other) {
    if(block != nullptr) {
      destroy();
    }
    block = other.block;
    other.block = nullptr;
    return *this;
  }

  CUDA size_t align_at(size_t alignment) {
    size_t old = block->alignment;
    block->alignment = alignment;
    return old;
  }

  CUDA NI void* allocate(size_t bytes) {
    return block->allocate(bytes);
  }

  CUDA NI void deallocate(void* ptr) {
    block->deallocate(ptr);
  }

  CUDA NI void print() const {
    // CUDA printf does not support "%zu" -- use the PRIu64 macro instead (Windows / Linux).
    printf("%% %" PRIu64 " / %" PRIu64 " used [%" PRIu64 "/%" PRIu64 "]KB [%" PRIu64 "/%" PRIu64 "]MB\n",
      block->offset, block->capacity,
      block->offset/1000, block->capacity/1000,
      block->offset/1000/1000, block->capacity/1000/1000);
    printf("%% %" PRIu64 " / %" PRIu64 " wasted for alignment [%" PRIu64 "/%" PRIu64 "]KB [%" PRIu64 "/%" PRIu64 "]MB\n",
      block->unaligned_wasted_bytes, block->offset,
      block->unaligned_wasted_bytes/1000, block->offset/1000,
      block->unaligned_wasted_bytes/1000/1000, block->offset/1000/1000);
    printf("%% %" PRIu64 " allocations and %" PRIu64 " deallocations\n", block->num_allocations, block->num_deallocations);
  }

  CUDA size_t used() const {
    return block->offset;
  }

  CUDA size_t capacity() const {
    return block->capacity;
  }

  CUDA size_t num_deallocations() const {
    return block->num_deallocations;
  }

  CUDA size_t num_allocations() const {
    return block->num_allocations;
  }

  CUDA size_t unaligned_wasted_bytes() const {
    return block->unaligned_wasted_bytes;
  }

  CUDA bool operator==(const pool_allocator& rhs) const {
    return block == rhs.block;
  }
};
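
/* A minimal sketch of `pool_allocator` carving allocations out of CUDA shared
   memory inside a kernel (the kernel name `kernel` is illustrative; we assume a
   single-threaded launch so that only one pool is created):

\code
__global__ void kernel() {
  extern __shared__ unsigned char shmem[];
  battery::pool_allocator pool(shmem, 1024);
  int* data = static_cast<int*>(pool.allocate(32 * sizeof(int)));
  data[0] = 42;
  // The pool never frees individual allocations; `deallocate` only counts calls.
}

// Launched with 1024 bytes of dynamic shared memory:
//   kernel<<<1, 1, 1024>>>();
\endcode
*/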
} // namespace battery

CUDA inline void* operator new(size_t bytes, battery::pool_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::pool_allocator& p) {
  p.deallocate(ptr);
}

namespace battery {

/** This allocator calls the standard `malloc` and `free` functions. */
class standard_allocator {
public:
  CUDA NI void* allocate(size_t bytes) {
    if(bytes == 0) {
      return nullptr;
    }
    return std::malloc(bytes);
  }

  CUDA NI void deallocate(void* data) {
    std::free(data);
  }

  CUDA bool operator==(const standard_allocator&) const { return true; }
};
} // namespace battery

CUDA inline void* operator new(size_t bytes, battery::standard_allocator& p) {
  return p.allocate(bytes);
}

CUDA inline void operator delete(void* ptr, battery::standard_allocator& p) {
  p.deallocate(ptr);
}

namespace battery {
  /** An allocator wrapping another allocator in order to count the number of allocations, deallocations and the total number of bytes allocated. The shared control block is allocated with `InternalAllocator`. */
  template <class Allocator, class InternalAllocator = Allocator>
  class statistics_allocator {
    struct control_block {
      Allocator allocator;
      size_t counter;
      size_t num_deallocations;
      size_t num_allocations;
      size_t total_bytes_allocated;

      CUDA control_block(const Allocator& allocator)
        : allocator(allocator), counter(1), num_deallocations(0), num_allocations(0), total_bytes_allocated(0)
      {}

      CUDA NI void* allocate(size_t bytes) {
        num_allocations++;
        total_bytes_allocated += bytes;
        return allocator.allocate(bytes);
      }

      CUDA NI void deallocate(void* ptr) {
        if(ptr != nullptr) {
          num_deallocations++;
          allocator.deallocate(ptr);
        }
      }
    };

    InternalAllocator internal_allocator;
    control_block* block;

  public:
    using this_type = statistics_allocator<Allocator, InternalAllocator>;

    CUDA NI statistics_allocator(const statistics_allocator& other)
      : internal_allocator(other.internal_allocator), block(other.block)
    {
      block->counter++;
    }

    CUDA NI statistics_allocator(const Allocator& allocator = Allocator(), const InternalAllocator& internal_allocator = InternalAllocator())
      : internal_allocator(internal_allocator)
    {
      block = static_cast<control_block*>(this->internal_allocator.allocate(sizeof(control_block)));
      new(block) control_block(allocator);
    }

    CUDA NI ~statistics_allocator() {
      block->counter--;
      if(block->counter == 0) {
        internal_allocator.deallocate(block);
      }
    }

    CUDA NI void* allocate(size_t bytes) {
      return block->allocate(bytes);
    }

    CUDA NI void deallocate(void* ptr) {
      block->deallocate(ptr);
    }

    CUDA size_t num_allocations() const {
      return block->num_allocations;
    }

    CUDA size_t num_deallocations() const {
      return block->num_deallocations;
    }

    CUDA size_t total_bytes_allocated() const {
      return block->total_bytes_allocated;
    }

    CUDA inline bool operator==(const this_type& rhs) const {
      return block == rhs.block;
    }
  };
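
  /* A minimal sketch of `statistics_allocator` wrapping `standard_allocator`:
     copies of the allocator share the same counters through the control block.

  \code
  void example() {
    battery::statistics_allocator<battery::standard_allocator> alloc;
    void* a = alloc.allocate(64);
    void* b = alloc.allocate(128);
    alloc.deallocate(a);
    alloc.deallocate(b);
    assert(alloc.num_allocations() == 2);
    assert(alloc.num_deallocations() == 2);
    assert(alloc.total_bytes_allocated() == 192);
  }
  \endcode
  */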
} // namespace battery

#endif // CUDA_BATTERY_ALLOCATOR_HPP