Turbo Constraint Solver
memory_gpu.hpp
// Copyright 2025 Pierre Talbot

#ifndef TURBO_MEMORY_GPU_HPP
#define TURBO_MEMORY_GPU_HPP

#include "battery/allocator.hpp"
#include "config.hpp"

namespace bt = ::battery;

#ifdef __CUDACC__

/** Depending on the problem, we can store the abstract elements in different memories.
 * The "worst" case is to store everything in global memory (GLOBAL), when the problem is too large for shared memory.
 * The "best" case is when the ternary constraint network (both the store of variables and the propagators) fits in shared memory (TCN_SHARED).
 * A third possibility is to store only the variables' domains in shared memory (STORE_SHARED).
*/
enum class MemoryKind {
  GLOBAL,
  STORE_SHARED,
  TCN_SHARED
};

/** The shared memory must be configured by hand before the kernel is launched.
 * This class encapsulates information about the sizes of the store and of the propagators, and helps create the allocators accordingly.
*/
struct MemoryConfig {
  MemoryKind mem_kind;
  size_t shared_bytes;
  size_t store_bytes;
  size_t prop_bytes;

  MemoryConfig() = default;
  MemoryConfig(const MemoryConfig&) = default;

  MemoryConfig(size_t store_bytes, size_t prop_bytes):
    mem_kind(MemoryKind::GLOBAL),
    shared_bytes(0),
    store_bytes(store_bytes),
    prop_bytes(prop_bytes)
  {}

  MemoryConfig(const void* kernel, int verbose, int blocks_per_sm, size_t store_bytes, size_t prop_bytes):
    store_bytes(store_bytes),
    prop_bytes(prop_bytes)
  {
    int maxSharedMemPerSM;
    cudaDeviceGetAttribute(&maxSharedMemPerSM, cudaDevAttrMaxSharedMemoryPerMultiprocessor, 0);
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, kernel);
    if(verbose >= 1) {
      printf("%% max_shared_memory=%d\n", maxSharedMemPerSM);
      printf("%% static_shared_memory=%zu\n", attr.sharedSizeBytes);
    }

    int alignment = 128; // extra padding to absorb the allocators' alignment overhead.
    if(blocks_per_sm * (store_bytes + prop_bytes + alignment + attr.sharedSizeBytes) < maxSharedMemPerSM) {
      shared_bytes = store_bytes + prop_bytes + alignment;
      mem_kind = MemoryKind::TCN_SHARED;
    }
    else if(blocks_per_sm * (store_bytes + alignment + attr.sharedSizeBytes) < maxSharedMemPerSM) {
      shared_bytes = store_bytes + alignment;
      mem_kind = MemoryKind::STORE_SHARED;
    }
    else {
      shared_bytes = 0;
      mem_kind = MemoryKind::GLOBAL;
    }
    if(shared_bytes != 0) {
      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_bytes);
    }
  }
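
  /* Worked example of the selection above (illustrative numbers only, not
   * measured on any particular GPU): suppose maxSharedMemPerSM = 102400,
   * attr.sharedSizeBytes = 1024, blocks_per_sm = 2, store_bytes = 20000 and
   * prop_bytes = 24000. Then 2 * (20000 + 24000 + 128 + 1024) = 90304 < 102400,
   * so the whole ternary constraint network fits: mem_kind = TCN_SHARED and
   * shared_bytes = 20000 + 24000 + 128 = 44128. If prop_bytes were 40000
   * instead, 2 * (20000 + 40000 + 128 + 1024) = 122304 >= 102400 but
   * 2 * (20000 + 128 + 1024) = 42304 < 102400, so only the store goes to
   * shared memory: mem_kind = STORE_SHARED and shared_bytes = 20128. */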

  /** Allocate a pool of `bytes` bytes in global memory. */
  CUDA bt::pool_allocator make_global_pool(size_t bytes) const {
    void* mem_pool = bt::global_allocator{}.allocate(bytes);
    return bt::pool_allocator(static_cast<unsigned char*>(mem_pool), bytes);
  }

  /** Wrap the dynamic shared memory of the current block into a pool of `shared_bytes` bytes. */
  CUDA bt::pool_allocator make_shared_pool(unsigned char* shared_mem) const {
    return bt::pool_allocator(shared_mem, shared_bytes);
  }

  /** Return the pool in which the propagators must be allocated: the shared pool if the whole TCN fits in shared memory, a fresh global pool otherwise. */
  CUDA bt::pool_allocator make_prop_pool(bt::pool_allocator shared_mem) const {
    if(mem_kind == MemoryKind::TCN_SHARED) {
      return shared_mem;
    }
    else {
      return make_global_pool(prop_bytes);
    }
  }

  /** Return the pool in which the store of variables must be allocated: the shared pool if the store fits in shared memory, a fresh global pool otherwise. */
  CUDA bt::pool_allocator make_store_pool(bt::pool_allocator shared_mem) const {
    if(mem_kind == MemoryKind::TCN_SHARED || mem_kind == MemoryKind::STORE_SHARED) {
      return shared_mem;
    }
    else {
      return make_global_pool(store_bytes);
    }
  }

  /** Print the selected memory configuration and the memory sizes as MiniZinc-style statistics. */
  template <class Config, class Stat>
  CUDA void print_mzn_statistics(const Config& config, const Stat& stats) const {
    stats.print_stat("memory_configuration",
      mem_kind == MemoryKind::GLOBAL ? "global" : (
      mem_kind == MemoryKind::STORE_SHARED ? "store_shared" : "tcn_shared"));
    stats.print_memory_statistics(config.verbose_solving, "shared_mem", shared_bytes);
    stats.print_memory_statistics(config.verbose_solving, "store_mem", store_bytes);
    stats.print_memory_statistics(config.verbose_solving, "propagator_mem", prop_bytes);
    stats.print_mzn_end_stats();
  }
};
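
/* Example usage of MemoryConfig (a sketch; `solve_kernel`, the launch
 * configuration and the byte sizes are hypothetical and only illustrate the
 * intended workflow, they are not part of this header):
 *
 *   __global__ void solve_kernel(MemoryConfig mem_config);
 *
 *   void launch_example(int verbose) {
 *     size_t store_bytes = 20000;  // assumed size of the variables' store.
 *     size_t prop_bytes = 24000;   // assumed size of the propagators.
 *     MemoryConfig mem_config((const void*) solve_kernel, verbose, 1, store_bytes, prop_bytes);
 *     solve_kernel<<<48, 256, mem_config.shared_bytes>>>(mem_config);
 *     CUDAEX(cudaDeviceSynchronize());
 *   }
 *
 *   __global__ void solve_kernel(MemoryConfig mem_config) {
 *     extern __shared__ unsigned char shared_mem[];
 *     bt::pool_allocator shared_pool = mem_config.make_shared_pool(shared_mem);
 *     bt::pool_allocator store_pool = mem_config.make_store_pool(shared_pool);
 *     bt::pool_allocator prop_pool = mem_config.make_prop_pool(shared_pool);
 *     // ...allocate the store of variables from store_pool and the propagators from prop_pool...
 *   }
 */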

template <class T>
__global__ void gpu_sizeof_kernel(size_t* size) {
  *size = sizeof(T);
}

/** Compute `sizeof(T)` on the device, which may differ from the host's value for types whose layout depends on the compilation target. */
template <class T>
size_t gpu_sizeof() {
  auto s = bt::make_unique<size_t, bt::managed_allocator>();
  gpu_sizeof_kernel<T><<<1, 1>>>(s.get());
  CUDAEX(cudaDeviceSynchronize());
  return *s;
}
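
/* Example (a sketch; `Propagator` and `num_props` are hypothetical names used
 * only for illustration): the byte sizes passed to MemoryConfig are typically
 * derived from device-side sizes, e.g.
 *
 *   size_t prop_bytes = num_props * gpu_sizeof<Propagator>();
 */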

/** Exit the program if the GPU does not support managed memory. */
void check_support_managed_memory() {
  int attr = 0;
  int dev = 0;
  CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrManagedMemory, dev));
  if (!attr) {
    std::cerr << "The GPU does not support managed memory." << std::endl;
    exit(EXIT_FAILURE);
  }
}

/** Check whether the GPU supports concurrent access to managed memory.
 * If it does not, either fall back on pinned memory (when compiled with NO_CONCURRENT_MANAGED_MEMORY) or exit. */
void check_support_concurrent_managed_memory() {
  int attr = 0;
  int dev = 0;
  CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrConcurrentManagedAccess, dev));
  if (!attr) {
#ifdef NO_CONCURRENT_MANAGED_MEMORY
    printf("%% WARNING: The GPU does not support concurrent access to managed memory, hence we fall back on pinned memory.\n");
    /** Set cudaDeviceMapHost to allow cudaMallocHost() to allocate pinned memory
     * for concurrent access between the device and the host. cudaSetDeviceFlags must be
     * called early, before any CUDA management functions, so that we can fall back to
     * using the pinned_allocator instead of the managed_allocator.
     * This is required on Windows, WSL, macOS, and NVIDIA GRID.
     * See also PR #18.
     */
    unsigned int flags = 0;
    CUDAEX(cudaGetDeviceFlags(&flags));
    flags |= cudaDeviceMapHost;
    CUDAEX(cudaSetDeviceFlags(flags));
#else
    printf("%% To run Turbo on this GPU you need to build Turbo with the option NO_CONCURRENT_MANAGED_MEMORY.\n");
    exit(EXIT_FAILURE);
#endif
  }
}
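
/* Sketch of how the fallback is typically consumed on the host side (the
 * `UniqueAlloc` alias and the `Element` type are hypothetical; `managed_allocator`
 * and `pinned_allocator` come from the battery library):
 *
 *   #ifdef NO_CONCURRENT_MANAGED_MEMORY
 *     using UniqueAlloc = bt::pinned_allocator;
 *   #else
 *     using UniqueAlloc = bt::managed_allocator;
 *   #endif
 *
 *   check_support_concurrent_managed_memory();
 *   auto element = bt::make_unique<Element, UniqueAlloc>();
 */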

/** Wait until solving ends, either because of a timeout, a CTRL-C interruption, or because the kernel finished.
 * Return `true` if we stopped the kernel (timeout or interruption), `false` if the kernel terminated on its own. */
template<class CP, class Timepoint>
bool wait_solving_ends(cuda::std::atomic_flag& stop, CP& root, const Timepoint& start) {
  cudaEvent_t event;
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  cudaEventRecord(event);
  while(!must_quit(root) && check_timeout(root, start) && cudaEventQuery(event) == cudaErrorNotReady) {
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  if(cudaEventQuery(event) == cudaErrorNotReady) {
    stop.test_and_set();
    root.prune();
    return true;
  }
  else {
    cudaError error = cudaDeviceSynchronize();
    if(error == cudaErrorIllegalAddress) {
      printf("%% ERROR: The CUDA kernel failed due to an illegal memory access. This might be caused by a stack overflow if the stack is too small. Try increasing the stack size with the option -stack. If it does not work, please report it as a bug.\n");
      exit(EXIT_FAILURE);
    }
    CUDAEX(error);
    return false;
  }
}
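
/* Example usage (a sketch; `solve_kernel`, `cp` and the launch configuration
 * are hypothetical and stand for the launched kernel, the root solving state
 * and its configuration):
 *
 *   auto stop = bt::make_unique<cuda::std::atomic_flag, bt::managed_allocator>();
 *   auto start = std::chrono::steady_clock::now();
 *   solve_kernel<<<num_blocks, num_threads, mem_config.shared_bytes>>>(stop.get());
 *   bool interrupted = wait_solving_ends(*stop, cp, start);
 *   // interrupted == true: the stop flag was set and the search pruned (timeout or CTRL-C);
 *   // interrupted == false: the kernel terminated on its own.
 */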

#endif
#endif