Cuda battery library
Loading...
Searching...
No Matches
allocator.hpp
Go to the documentation of this file.
1// Copyright 2021 Pierre Talbot
2
3#ifndef CUDA_BATTERY_ALLOCATOR_HPP
4#define CUDA_BATTERY_ALLOCATOR_HPP
5
6/** \file allocator.hpp
7We provide several allocators compatible with the data structures provided by this library.
8The allocators are aimed to be used to distinguish in which memory (shared, global, managed or the "standard" C++ memory) we should allocate data.
9This allows us to provide uniform interfaces for both host (C++) and device (CUDA) code.
10
11As a general comment, be careful to always deallocate the memory from the side you allocated it, e.g., do not allocate on the host then try to deallocate it on the device.
To avoid these kinds of mistakes, you should use `battery::shared_ptr` when passing data to a CUDA kernel; see the manual for examples.
13*/
14
15#include <cassert>
16#include <cstddef>
17#include <iostream>
18#include <type_traits>
19#include <inttypes.h>
20#include "utility.hpp"
21
22#ifdef __CUDACC__
23
24namespace battery {
25
26/** An allocator using the global memory of CUDA.
27This can be used from both the host and device side, but the memory can only be accessed when in a device function.
28Beware that allocation and deallocation must occur on the same side. */
29class global_allocator {
30public:
31 CUDA NI void* allocate(size_t bytes) {
32 if(bytes == 0) {
33 return nullptr;
34 }
35 #ifdef __CUDA_ARCH__
36 void* data = std::malloc(bytes);
37 if (data == nullptr) {
38 printf("Allocation of device memory failed\n");
39 }
40 return data;
41 #else
42 void* data = nullptr;
43 cudaError_t rc = cudaMalloc(&data, bytes);
44 if (rc != cudaSuccess) {
45 std::cerr << "Allocation of global memory failed: " << cudaGetErrorString(rc) << std::endl;
46 return nullptr;
47 }
48 return data;
49 #endif
50 }
51
52 CUDA NI void deallocate(void* data) {
53 #ifdef __CUDA_ARCH__
54 std::free(data);
55 #else
56 cudaError_t rc = cudaFree(data);
57 if (rc != cudaSuccess) {
58 std::cerr << "Free of global memory failed: " << cudaGetErrorString(rc) << std::endl;
59 }
60 #endif
61 }
62
63 CUDA bool operator==(const global_allocator&) const { return true; }
64};
65
66/** An allocator using the managed memory of CUDA when the memory is allocated on the host.
67 * We delegate the allocation to `global_allocator` when the allocation is done on the device (since managed memory cannot be allocated in device functions). */
68class managed_allocator {
69public:
70 CUDA NI void* allocate(size_t bytes) {
71 #ifdef __CUDA_ARCH__
72 return global_allocator{}.allocate(bytes);
73 #else
74 if(bytes == 0) {
75 return nullptr;
76 }
77 void* data = nullptr;
78 cudaError_t rc = cudaMallocManaged(&data, bytes);
79 if (rc != cudaSuccess) {
80 std::cerr << "Allocation of managed memory failed: " << cudaGetErrorString(rc) << std::endl;
81 return nullptr;
82 }
83 return data;
84 #endif
85 }
86
87 CUDA NI void deallocate(void* data) {
88 #ifdef __CUDA_ARCH__
89 return global_allocator{}.deallocate(data);
90 #else
91 cudaError_t rc = cudaFree(data);
92 if (rc != cudaSuccess) {
93 std::cerr << "Free of managed memory failed: " << cudaGetErrorString(rc) << std::endl;
94 }
95 #endif
96 }
97
98 CUDA bool operator==(const managed_allocator&) const { return true; }
99};
100
101/** An allocator using pinned memory for shared access between the host and the device.
102 * This type of memory is required on Microsoft Windows, on the Windows Subsystem for Linux (WSL), and on NVIDIA GRID (virtual GPU), because these systems do not support concurrent access to managed memory while a CUDA kernel is running.
103 *
104 * This allocator requires that you first set cudaDeviceMapHost using cudaSetDeviceFlags.
105 *
106 * We suppose unified virtual addressing (UVA) is enabled (the property `unifiedAddressing` is true).
107 *
108 * We delegate the allocation to `global_allocator` when the allocation is done on the device, since host memory cannot be allocated in device functions.
109 * */
110class pinned_allocator {
111public:
112 CUDA NI void* allocate(size_t bytes) {
113 #ifdef __CUDA_ARCH__
114 return global_allocator{}.allocate(bytes);
115 #else
116 if(bytes == 0) {
117 return nullptr;
118 }
119 void* data = nullptr;
120 cudaError_t rc = cudaMallocHost(&data, bytes); // pinned
121 if (rc != cudaSuccess) {
122 std::cerr << "Allocation of pinned memory failed: " << cudaGetErrorString(rc) << std::endl;
123 return nullptr;
124 }
125 return data;
126 #endif
127 }
128
129 CUDA NI void deallocate(void* data) {
130 #ifdef __CUDA_ARCH__
131 return global_allocator{}.deallocate(data);
132 #else
133 cudaError_t rc = cudaFreeHost(data);
134 if (rc != cudaSuccess) {
135 std::cerr << "Free of pinned memory failed: " << cudaGetErrorString(rc) << std::endl;
136 }
137 #endif
138 }
139
140 CUDA bool operator==(const pinned_allocator&) const { return true; }
141};
142
143} // namespace battery
144
145CUDA inline void* operator new(size_t bytes, battery::managed_allocator& p) {
146 return p.allocate(bytes);
147}
148
149CUDA inline void operator delete(void* ptr, battery::managed_allocator& p) {
150 return p.deallocate(ptr);
151}
152
153CUDA inline void* operator new(size_t bytes, battery::global_allocator& p) {
154 return p.allocate(bytes);
155}
156
157CUDA inline void operator delete(void* ptr, battery::global_allocator& p) {
158 p.deallocate(ptr);
159}
160
161CUDA inline void* operator new(size_t bytes, battery::pinned_allocator& p) {
162 return p.allocate(bytes);
163}
164
165CUDA inline void operator delete(void* ptr, battery::pinned_allocator& p) {
166 p.deallocate(ptr);
167}
168
169#endif // __CUDACC__
170
171namespace battery {
172
namespace impl {
// Lookup table: power2[i] is the largest power of two less than or equal to i,
// for 1 <= i <= 16 (power2[0] is never read: zero-byte allocations return early).
// Used by pool_allocator's control_block::allocate to pick the smallest
// sufficient alignment for small allocations.
#ifdef __CUDA_ARCH__
  // In device code, place the table in constant memory.
  __constant__
#endif
static const int power2[17] = {0, 1, 2, 2, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16};
}
179
180/** An allocator allocating memory from a pool of memory.
181The memory can for instance be the CUDA shared memory.
182This allocator is incomplete as it never frees the memory allocated.
183It allocates a control block using the "normal" `operator new`, where the address to the pool, its capacity and current offset are stored.
184*/
186
  /** Reference-counted bookkeeping shared by all copies of a `pool_allocator`:
   * a bump-pointer over a caller-provided memory area, plus usage statistics. */
  struct control_block {
    unsigned char* mem;            // Start of the pool (never freed by the control block).
    size_t capacity;               // Total size of the pool in bytes.
    size_t offset;                 // Bump pointer: index of the next free byte in `mem`.
    size_t alignment;              // Default alignment applied to allocations.
    size_t num_deallocations;      // Statistics only: pool memory is never reclaimed.
    size_t num_allocations;
    size_t unaligned_wasted_bytes; // Bytes skipped to satisfy alignment.
    size_t counter;                // Reference count of pool_allocator copies sharing this block.

    CUDA control_block(unsigned char* mem, size_t capacity, size_t alignment)
      : mem(mem), capacity(capacity), offset(0), alignment(alignment), num_deallocations(0), num_allocations(0), unaligned_wasted_bytes(0), counter(1)
    {}

    /** Bump-pointer allocation of `bytes` bytes.
     * Small requests (bytes <= alignment and alignment <= 16) are aligned on the
     * largest power of two not exceeding `bytes` (see impl::power2); larger
     * requests use the pool's default `alignment`.
     * Exhausting `capacity` trips the assert (unchecked when NDEBUG is defined). */
    CUDA void* allocate(size_t bytes) {
      // printf("%p: allocate %lu bytes / %lu offset / %lu capacity / %lu alignment / %p current mem\n", mem, bytes, offset, capacity, alignment, &mem[offset]);
      if(bytes == 0) {
        return nullptr;
      }
      size_t smallest_alignment = (bytes > alignment || alignment > 16) ? alignment : impl::power2[bytes];
      if(size_t(&mem[offset]) % smallest_alignment != 0) { // If we are currently unaligned.
        // Advance to the next aligned address and account for the skipped bytes.
        size_t old_offset = offset;
        offset += smallest_alignment - (size_t(&mem[offset]) % smallest_alignment);
        unaligned_wasted_bytes += (offset - old_offset);
      }
      assert(offset + bytes <= capacity);
      assert((size_t)&mem[offset] % smallest_alignment == 0);
      void* m = (void*)&mem[offset];
      offset += bytes;
      num_allocations++;
      return m;
    }

    /** No-op apart from statistics: pool memory is only released as a whole by
     * whoever owns the underlying buffer. */
    CUDA void deallocate(void* ptr) {
      if(ptr != nullptr) {
        num_deallocations++;
      }
    }
  };
226
227 control_block* block;
228
229public:
231 block(other.block)
232 {
233 if(block != nullptr) {
234 block->counter++;
235 }
236 }
237
  /** Create a pool over the existing memory area `mem` of `capacity` bytes.
   * The pool does not take ownership of `mem`; the shared control block is
   * created with the global `operator new` on the side where this runs. */
  CUDA NI pool_allocator(unsigned char* mem, size_t capacity, size_t alignment = alignof(std::max_align_t))
    : block(::new control_block(mem, capacity, alignment))
  {}
241
243 if(block != nullptr) {
244 block->counter--;
245 if(block->counter == 0) {
246 ::delete block;
247 }
248 }
249 }
250
  /** Set the default alignment of subsequent allocations to `alignment` and
   * return the previously configured alignment. */
  CUDA size_t align_at(size_t alignment) {
    size_t old = block->alignment;
    block->alignment = alignment;
    return old;
  }
256
  /** Allocate `bytes` bytes from the pool (bump-pointer, see `control_block::allocate`). */
  CUDA NI void* allocate(size_t bytes) {
    return block->allocate(bytes);
  }
260
  /** Record the deallocation for statistics; the pool never reclaims memory. */
  CUDA NI void deallocate(void* ptr) {
    block->deallocate(ptr);
  }
264
  /** Print pool usage statistics: occupancy, alignment waste and allocation counts. */
  CUDA NI void print() const {
    // CUDA printf does not support "%zu" -- use PRIu64 macro (Windows / Linux)
    // NOTE(review): passing size_t to a PRIu64 conversion assumes 64-bit size_t — confirm target platforms.
    printf("%% %" PRIu64 " / %" PRIu64 " used [%" PRIu64 "/%" PRIu64 "]KB [%" PRIu64 "/%" PRIu64 "]MB\n",
      block->offset, block->capacity,
      block->offset/1000, block->capacity/1000,
      block->offset/1000/1000, block->capacity/1000/1000);
    printf("%% %" PRIu64 " / %" PRIu64 " wasted for alignment [%" PRIu64 "/%" PRIu64 "]KB [%" PRIu64 "/%" PRIu64 "]MB\n",
      block->unaligned_wasted_bytes, block->offset,
      block->unaligned_wasted_bytes/1000, block->offset/1000,
      block->unaligned_wasted_bytes/1000/1000, block->offset/1000/1000);
    printf("%% %" PRIu64 " allocations and %" PRIu64 " deallocations\n", block->num_allocations, block->num_deallocations);
  }
277
  /** Number of bytes consumed from the pool so far (including alignment padding). */
  CUDA size_t used() const {
    return block->offset;
  }

  /** Total size of the underlying pool in bytes. */
  CUDA size_t capacity() const {
    return block->capacity;
  }

  /** Number of calls to `deallocate` with a non-null pointer. */
  CUDA size_t num_deallocations() const {
    return block->num_deallocations;
  }

  /** Number of non-zero-size allocations performed. */
  CUDA size_t num_allocations() const {
    return block->num_allocations;
  }
293
295 return block->unaligned_wasted_bytes;
296 }
297
  /** Two pool allocators are equal when they share the same control block,
   * i.e., one is a copy of the other. */
  CUDA bool operator==(const pool_allocator& rhs) const {
    return block == rhs.block;
  }
301};
302}
303
304CUDA inline void* operator new(size_t bytes, battery::pool_allocator& p) {
305 return p.allocate(bytes);
306}
307
308CUDA inline void operator delete(void* ptr, battery::pool_allocator& p) {
309 return p.deallocate(ptr);
310}
311
312namespace battery {
313
/** This allocator calls the standard `malloc` and `free` functions. */
316public:
317 CUDA NI void* allocate(size_t bytes) {
318 if(bytes == 0) {
319 return nullptr;
320 }
321 return std::malloc(bytes);
322 }
323
  /** Free memory previously returned by `allocate` (delegates to `std::free`). */
  CUDA NI void deallocate(void* data) {
    std::free(data);
  }
327
  /** Stateless allocator: all instances are interchangeable. */
  CUDA bool operator==(const standard_allocator&) const { return true; }
329};
330} // namespace battery
331
332CUDA inline void* operator new(size_t bytes, battery::standard_allocator& p) {
333 return p.allocate(bytes);
334}
335
336CUDA inline void operator delete(void* ptr, battery::standard_allocator& p) {
337 return p.deallocate(ptr);
338}
339
340
341namespace battery {
342 template <class Allocator, class InternalAllocator = Allocator>
  /** Reference-counted state shared by all copies of a `statistics_allocator`:
   * the wrapped allocator plus allocation statistics. */
  struct control_block {
    Allocator allocator;          // The wrapped allocator doing the real work.
    size_t counter;               // Reference count of statistics_allocator copies sharing this block.
    size_t num_deallocations;     // Count of non-null deallocations.
    size_t num_allocations;       // Count of allocate() calls (incremented even if the allocation fails).
    size_t total_bytes_allocated; // Sum of all requested sizes.

    CUDA control_block(const Allocator& allocator)
      : allocator(allocator), counter(1), num_deallocations(0), num_allocations(0), total_bytes_allocated(0)
    {}

    /** Record the request, then forward it to the wrapped allocator.
     * Note: statistics are updated before the call, so a failing underlying
     * allocation is still counted. */
    CUDA NI void* allocate(size_t bytes) {
      num_allocations++;
      total_bytes_allocated += bytes;
      return allocator.allocate(bytes);
    }

    /** Forward to the wrapped allocator; null pointers are ignored entirely
     * (neither counted nor forwarded). */
    CUDA NI void deallocate(void* ptr) {
      if(ptr != nullptr) {
        num_deallocations++;
        allocator.deallocate(ptr);
      }
    }
  };
368
369 InternalAllocator internal_allocator;
370 control_block* block;
371
372 public:
374
376 : internal_allocator(other.internal_allocator), block(other.block)
377 {
378 block->counter++;
379 }
380
  /** Create a fresh statistics wrapper around `allocator`.
   * The shared control block is carved out of `internal_allocator` and
   * initialized with placement new.
   * NOTE(review): the control block's destructor is never invoked in the
   * visible code (the destructor only deallocates); harmless while `Allocator`
   * is trivially destructible — confirm for stateful allocators. */
  CUDA NI statistics_allocator(const Allocator& allocator = Allocator(), const InternalAllocator& internal_allocator = InternalAllocator())
    : internal_allocator(internal_allocator)
  {
    block = static_cast<control_block*>(this->internal_allocator.allocate(sizeof(control_block)));
    new(block) control_block(allocator);
  }
387
389 block->counter--;
390 if(block->counter == 0) {
391 internal_allocator.deallocate(block);
392 }
393 }
394
  /** Allocate `bytes` bytes through the wrapped allocator, recording statistics. */
  CUDA NI void* allocate(size_t bytes) {
    return block->allocate(bytes);
  }
398
  /** Deallocate through the wrapped allocator, recording statistics; null is ignored. */
  CUDA NI void deallocate(void* ptr) {
    block->deallocate(ptr);
  }
402
  /** Number of allocations performed through this allocator and its copies. */
  CUDA size_t num_allocations() const {
    return block->num_allocations;
  }

  /** Number of non-null deallocations performed through this allocator and its copies. */
  CUDA size_t num_deallocations() const {
    return block->num_deallocations;
  }
410
412 return block->total_bytes_allocated;
413 }
414
  /** Two statistics allocators are equal when they share the same control block,
   * i.e., one is a copy of the other. */
  CUDA inline bool operator==(const this_type& rhs) const {
    return block == rhs.block;
  }
418 };
419}
420
421#endif
Definition allocator.hpp:185
CUDA NI ~pool_allocator()
Definition allocator.hpp:242
CUDA size_t used() const
Definition allocator.hpp:278
CUDA NI void deallocate(void *ptr)
Definition allocator.hpp:261
CUDA NI pool_allocator(unsigned char *mem, size_t capacity, size_t alignment=alignof(std::max_align_t))
Definition allocator.hpp:238
CUDA NI void * allocate(size_t bytes)
Definition allocator.hpp:257
CUDA size_t unaligned_wasted_bytes() const
Definition allocator.hpp:294
CUDA size_t num_allocations() const
Definition allocator.hpp:290
CUDA bool operator==(const pool_allocator &rhs) const
Definition allocator.hpp:298
CUDA NI pool_allocator(const pool_allocator &other)
Definition allocator.hpp:230
CUDA size_t align_at(size_t alignment)
Definition allocator.hpp:251
CUDA size_t capacity() const
Definition allocator.hpp:282
CUDA NI void print() const
Definition allocator.hpp:265
CUDA size_t num_deallocations() const
Definition allocator.hpp:286
Definition allocator.hpp:315
CUDA NI void deallocate(void *data)
Definition allocator.hpp:324
CUDA bool operator==(const standard_allocator &) const
Definition allocator.hpp:328
CUDA NI void * allocate(size_t bytes)
Definition allocator.hpp:317
Definition allocator.hpp:343
CUDA NI statistics_allocator(const statistics_allocator &other)
Definition allocator.hpp:375
CUDA size_t num_deallocations() const
Definition allocator.hpp:407
CUDA bool operator==(const this_type &rhs) const
Definition allocator.hpp:415
CUDA NI void * allocate(size_t bytes)
Definition allocator.hpp:395
CUDA size_t total_bytes_allocated() const
Definition allocator.hpp:411
CUDA NI statistics_allocator(const Allocator &allocator=Allocator(), const InternalAllocator &internal_allocator=InternalAllocator())
Definition allocator.hpp:381
CUDA NI void deallocate(void *ptr)
Definition allocator.hpp:399
CUDA NI ~statistics_allocator()
Definition allocator.hpp:388
CUDA size_t num_allocations() const
Definition allocator.hpp:403
Definition algorithm.hpp:10
CUDA bool operator==(const string< Alloc1 > &lhs, const string< Alloc2 > &rhs)
Definition string.hpp:110
#define CUDA
Definition utility.hpp:59
#define NI
Definition utility.hpp:62