Turbo Constraint Solver
gpu_solving.hpp
Go to the documentation of this file.
1// Copyright 2023 Pierre Talbot
2
3#ifndef TURBO_GPU_SOLVING_HPP
4#define TURBO_GPU_SOLVING_HPP
5
6#include "common_solving.hpp"
7#include <thread>
8#include <algorithm>
9
10namespace bt = ::battery;
11
12#ifdef __CUDACC__
13
14#include <cuda/std/chrono>
15#include <cuda/semaphore>
16
17template <
18 class Universe0, // Universe used locally to one thread.
19 class Universe1, // Universe used in the scope of a block.
20 class Universe2, // Universe used in the scope of a grid.
21 class ConcurrentAllocator> // This allocator allocates memory that can be accessed by both the CPU and the GPU. See PR #18 for the reason (basically, non-Linux systems do not support concurrent managed memory (accessed from both CPU and GPU) and must rely on pinned memory instead).
22struct StateTypes {
23
24 using U0 = Universe0;
25 using U1 = Universe1;
26 using U2 = Universe2;
27 using concurrent_allocator = ConcurrentAllocator;
28
29 /** We first interpret the formula in an abstract domain with sequential state allocated in concurrent memory (accessible from both the CPU and the GPU), which we call `GridCP`. */
30 using GridCP = AbstractDomains<U0,
31 bt::statistics_allocator<ConcurrentAllocator>,
32 bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 0>>,
33 bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 1>>>;
34
35 /** Then, once everything is initialized, we rely on a parallel abstract domain called `BlockCP`, usually using atomic shared and global memory. */
36 using BlockCP = AbstractDomains<U1,
37 bt::global_allocator,
38 bt::pool_allocator,
39 bt::pool_allocator>;
40};
41
42using Itv0 = Interval<ZLB<int, bt::local_memory>>;
43using Itv1 = Interval<ZLB<int, bt::atomic_memory_block>>;
44using Itv2 = Interval<ZLB<int, bt::atomic_memory_grid>>;
45using AtomicBool = B<bt::atomic_memory_block>;
46using FPEngine = BlockAsynchronousIterationGPU<bt::pool_allocator>;
47
48// Version for non-Linux systems such as Windows where pinned memory must be used (see PR #19).
49#ifdef NO_CONCURRENT_MANAGED_MEMORY
50 using ItvSolverPinned = StateTypes<Itv0, Itv1, Itv2, bt::pinned_allocator>;
51 using ItvSolverPinnedNoAtomics = StateTypes<Itv0, Itv0, Itv0, bt::pinned_allocator>;
52#else
53 using ItvSolver = StateTypes<Itv0, Itv1, Itv2, bt::managed_allocator>;
54 // Deactivate atomics for the domain of variables (for benchmarking only; it is not safe according to the CUDA memory consistency model).
55 using ItvSolverNoAtomics = StateTypes<Itv0, Itv0, Itv0, bt::managed_allocator>;
56#endif
57
58/** Depending on the problem, we can store the abstract elements in different memories.
59 * The "worst" is everything in global memory (GLOBAL) when the problem is too large for the shared memory.
60 * The "best" is when both the store of variables and the propagators (STORE_PC_SHARED) can be stored in shared memory.
61 * A third possibility is to store only the variables' domains in the shared memory (STORE_SHARED).
62*/
63enum class MemoryKind {
64 GLOBAL,
65 STORE_SHARED,
66 STORE_PC_SHARED
67};
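
// Illustration (hypothetical helper, not part of this file): the decision rule
// implemented by `configure_memory` further below, extracted for clarity.
// `fixpoint_bytes` stands for the small amount of shared memory reserved for
// the fixpoint engine (100 bytes in `configure_memory`).
inline MemoryKind choose_memory_kind(size_t shared_mem_capacity, size_t fixpoint_bytes,
  size_t store_bytes, size_t pc_bytes, bool only_global_memory)
{
  if(only_global_memory || shared_mem_capacity < fixpoint_bytes + store_bytes) {
    return MemoryKind::GLOBAL;           // e.g. a 60KB store does not fit in 48KB of shared memory.
  }
  else if(shared_mem_capacity > fixpoint_bytes + store_bytes + pc_bytes) {
    return MemoryKind::STORE_PC_SHARED;  // e.g. a 20KB store and 15KB of propagators fit in 48KB.
  }
  else {
    return MemoryKind::STORE_SHARED;     // e.g. the 20KB store fits, but 200KB of propagators do not.
  }
}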
68
69/** The shared memory must be configured by hand before the kernel is launched.
70 * This class encapsulates information about the size of each relevant abstract element, and helps create the allocators accordingly.
71*/
72struct MemoryConfig {
73 MemoryKind mem_kind;
74 size_t shared_bytes;
75 size_t store_bytes;
76 size_t pc_bytes;
77
78 CUDA bt::pool_allocator make_global_pool(size_t bytes) {
79 void* mem_pool = bt::global_allocator{}.allocate(bytes);
80 return bt::pool_allocator(static_cast<unsigned char*>(mem_pool), bytes);
81 }
82
83 CUDA bt::pool_allocator make_shared_pool(unsigned char* shared_mem) {
84 return bt::pool_allocator(shared_mem, shared_bytes);
85 }
86
87 CUDA bt::pool_allocator make_pc_pool(bt::pool_allocator shared_mem) {
88 if(mem_kind == MemoryKind::STORE_PC_SHARED) {
89 return shared_mem;
90 }
91 else {
92 return make_global_pool(pc_bytes);
93 }
94 }
95
96 CUDA bt::pool_allocator make_store_pool(bt::pool_allocator shared_mem) {
97 if(mem_kind == MemoryKind::STORE_PC_SHARED || mem_kind == MemoryKind::STORE_SHARED) {
98 return shared_mem;
99 }
100 else {
101 return make_global_pool(store_bytes);
102 }
103 }
104
105 CUDA void print_mzn_statistics() const {
106 printf("%%%%%%mzn-stat: memory_configuration=\"%s\"\n",
107 mem_kind == MemoryKind::GLOBAL ? "global" : (
108 mem_kind == MemoryKind::STORE_SHARED ? "store_shared" : "store_pc_shared"));
109 printf("%%%%%%mzn-stat: shared_mem=%" PRIu64 "\n", shared_bytes);
110 printf("%%%%%%mzn-stat: store_mem=%" PRIu64 "\n", store_bytes);
111 printf("%%%%%%mzn-stat: propagator_mem=%" PRIu64 "\n", pc_bytes);
112 }
113};
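
// Illustration (hypothetical kernel, not part of this file): how `MemoryConfig`
// is meant to be used. The host passes `shared_bytes` as the dynamic shared
// memory size of the launch, and each block carves its pools out of the
// `extern __shared__` buffer; the real usage is in `BlockData::allocate` and
// `gpu_solve_kernel` below.
__global__ void memory_config_example_kernel(MemoryConfig mem_config) {
  extern __shared__ unsigned char shared_mem[];
  bt::pool_allocator shared_pool = mem_config.make_shared_pool(shared_mem);
  // The propagators live in shared memory only with STORE_PC_SHARED.
  bt::pool_allocator pc_pool = mem_config.make_pc_pool(shared_pool);
  // The store of variables lives in shared memory unless the kind is GLOBAL.
  bt::pool_allocator store_pool = mem_config.make_store_pool(shared_pool);
  // ... allocate the store and the propagators from `store_pool` and `pc_pool` ...
}
// Host side (sketch): memory_config_example_kernel<<<blocks, threads, mem_config.shared_bytes>>>(mem_config);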
114
115template <class S>
116struct BlockData;
117
118/** `GridData` represents the problem to be solved (`root`), the data of each block when running EPS (`blocks`), the index of the subproblem that needs to be solved next (`next_subproblem`), an estimate of the best bound found across subproblems (in case of an optimisation problem) and other global information.
119 */
120template <class S>
121struct GridData {
122 using GridCP = typename S::GridCP;
123 using BlockCP = typename S::BlockCP;
124 using U2 = typename S::U2;
125
126 GridCP root;
127 // `blocks_root` is a copy of `root` but with the same allocators as the ones used in `blocks`.
128 // This is helpful to share immutable data among blocks (for instance the propagators).
129 bt::shared_ptr<BlockCP, bt::global_allocator> blocks_root;
130 // Stop from the CPU, for instance because of a timeout.
131 volatile bool cpu_stop;
132 // Boolean indicating that the blocks have been reduced, and the CPU can now print the statistics.
133 volatile bool blocks_reduced;
134 MemoryConfig mem_config;
135 bt::vector<BlockData<S>, bt::global_allocator> blocks;
136 // Stop from a block on the GPU, for instance because we found a solution.
137 bt::shared_ptr<B<bt::atomic_memory_grid>, bt::global_allocator> gpu_stop;
138 bt::shared_ptr<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator> next_subproblem;
139 bt::shared_ptr<U2, bt::global_allocator> best_bound;
140
141 // All of what follows is only to support printing while the kernel is running.
142 // In particular, we transfer the solution to the CPU where it is printed, because printing on the GPU can be very slow when the problem is large.
143 bt::shared_ptr<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator> print_lock;
144 cuda::std::atomic_flag ready_to_produce;
145 cuda::std::atomic_flag ready_to_consume;
146
147 GridData(const GridCP& root, const MemoryConfig& mem_config)
148 : root(root)
149 , mem_config(mem_config)
150 , cpu_stop(false)
151 , blocks_reduced(false)
152 {
153 ready_to_consume.clear();
154 ready_to_produce.test_and_set();
155 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
156 }
157
158 template <class BlockBAB>
159 __device__ void produce_solution(const BlockBAB& bab) {
160 print_lock->acquire();
161 if(!cpu_stop) {
162 ready_to_produce.wait(false, cuda::std::memory_order_seq_cst);
163 ready_to_produce.clear();
164 bab.extract(*(root.bab));
165 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
166 ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
167 ready_to_consume.notify_one();
168 }
169 print_lock->release();
170 }
171
172 __host__ bool consume_solution() {
173 ready_to_consume.wait(false, cuda::std::memory_order_seq_cst);
174 ready_to_consume.clear();
175 if(blocks_reduced) {
176 root.print_final_solution();
177 if(root.config.print_statistics) {
178 root.print_mzn_statistics();
179 }
180 return true;
181 }
182 else {
183 root.print_solution();
184 }
185 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
186 ready_to_produce.test_and_set(cuda::std::memory_order_seq_cst);
187 ready_to_produce.notify_one();
188 return false;
189 }
190
191 __device__ void allocate() {
192 assert(threadIdx.x == 0 && blockIdx.x == 0);
193 auto root_mem_config(mem_config);
194 root_mem_config.mem_kind = MemoryKind::GLOBAL;
195 blocks_root = bt::make_shared<BlockCP, bt::global_allocator>(
196 typename BlockCP::tag_gpu_block_copy{},
197 false, // Due to different allocators between BlockCP and GridCP, it won't be able to share data anyway.
198 root,
199 bt::global_allocator{},
200 root_mem_config.make_pc_pool(bt::pool_allocator(nullptr,0)),
201 root_mem_config.make_store_pool(bt::pool_allocator(nullptr,0)));
202 blocks = bt::vector<BlockData<S>, bt::global_allocator>(root.config.or_nodes);
203 gpu_stop = bt::make_shared<B<bt::atomic_memory_grid>, bt::global_allocator>(false);
204 print_lock = bt::make_shared<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator>(1);
205 next_subproblem = bt::make_shared<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator>(0);
206 best_bound = bt::make_shared<U2, bt::global_allocator>();
207 }
208
209 __device__ void deallocate() {
210 assert(threadIdx.x == 0 && blockIdx.x == 0);
211 blocks = bt::vector<BlockData<S>, bt::global_allocator>();
212 blocks_root->deallocate();
213 blocks_root.reset();
214 gpu_stop.reset();
215 print_lock.reset();
216 next_subproblem.reset();
217 best_bound.reset();
218 }
219};
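
// Illustration (hypothetical structure, not part of this file): the host/device
// handshake used by `produce_solution` and `consume_solution` above, reduced to
// its skeleton. The two flags alternate ownership of `payload`; `Handshake` must
// live in memory accessible from both sides (managed or pinned), and the real
// code additionally serializes producers with `print_lock`.
struct Handshake {
  cuda::std::atomic_flag ready_to_produce;  // Must be set once by the host before the first produce (as in the `GridData` constructor).
  cuda::std::atomic_flag ready_to_consume;  // Set when a solution is ready to be printed.
  int payload;                              // Stands for the solution stored in `root.bab`.
};

__device__ void produce(Handshake& h, int solution) {
  h.ready_to_produce.wait(false);      // Wait until the host has consumed the previous one.
  h.ready_to_produce.clear();
  h.payload = solution;                // Publish the data.
  cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  h.ready_to_consume.test_and_set();
  h.ready_to_consume.notify_one();     // Wake up the host.
}

__host__ int consume(Handshake& h) {
  h.ready_to_consume.wait(false);      // Wait until the device has produced a solution.
  h.ready_to_consume.clear();
  int solution = h.payload;
  cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  h.ready_to_produce.test_and_set();
  h.ready_to_produce.notify_one();     // Allow the device to produce the next one.
  return solution;
}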
220
221/** `BlockData` contains all the structures required to solve a subproblem, including the problem itself (`root`) and the fixpoint engine (`fp_engine`). */
222template <class S>
223struct BlockData {
224 using GridCP = typename S::GridCP;
225 using BlockCP = typename S::BlockCP;
226
227 using snapshot_type = typename BlockCP::IST::snapshot_type<bt::global_allocator>;
228 size_t subproblem_idx;
229 bt::shared_ptr<FPEngine, bt::global_allocator> fp_engine;
230 bt::shared_ptr<AtomicBool, bt::pool_allocator> has_changed;
231 bt::shared_ptr<AtomicBool, bt::pool_allocator> stop;
232 bt::shared_ptr<BlockCP, bt::global_allocator> root;
233 bt::shared_ptr<snapshot_type, bt::global_allocator> snapshot_root;
234
235 __device__ BlockData():
236 has_changed(nullptr, bt::pool_allocator(nullptr, 0)),
237 stop(nullptr, bt::pool_allocator(nullptr, 0))
238 {}
239
240public:
241 /** Initialize the block data.
242 * Allocate the abstract domains in the best memory available, depending on how large the abstract domains are. */
243 __device__ void allocate(GridData<S>& grid_data, unsigned char* shared_mem) {
244 auto block = cooperative_groups::this_thread_block();
245 if(threadIdx.x == 0) {
246 subproblem_idx = blockIdx.x;
247 MemoryConfig& mem_config = grid_data.mem_config;
248 bt::pool_allocator shared_mem_pool(mem_config.make_shared_pool(shared_mem));
249 fp_engine = bt::make_shared<FPEngine, bt::global_allocator>(block, shared_mem_pool);
250 has_changed = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, true);
251 stop = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, false);
252 root = bt::make_shared<BlockCP, bt::global_allocator>(typename BlockCP::tag_gpu_block_copy{},
253 (mem_config.mem_kind != MemoryKind::STORE_PC_SHARED),
254 *(grid_data.blocks_root),
255 bt::global_allocator{},
256 mem_config.make_pc_pool(shared_mem_pool),
257 mem_config.make_store_pool(shared_mem_pool));
258 snapshot_root = bt::make_shared<snapshot_type, bt::global_allocator>(root->search_tree->template snapshot<bt::global_allocator>());
259 }
260 block.sync();
261 }
262
263 __device__ void deallocate_shared() {
264 if(threadIdx.x == 0) {
265 fp_engine.reset();
266 has_changed.reset();
267 stop.reset();
268 root->deallocate();
269 snapshot_root.reset();
270 }
271 cooperative_groups::this_thread_block().sync();
272 }
273
274 __device__ void restore() {
275 if(threadIdx.x == 0) {
276 root->search_tree->restore(*snapshot_root);
277 root->eps_split->reset();
278 }
279 cooperative_groups::this_thread_block().sync();
280 }
281};
282
283template <class S>
284__global__ void initialize_grid_data(GridData<S>* grid_data) {
285 grid_data->allocate();
286 size_t num_subproblems = 1;
287 num_subproblems <<= grid_data->root.config.subproblems_power;
288 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(grid_data->root.config.or_nodes));
289 grid_data->root.stats.eps_num_subproblems = num_subproblems;
290}
291
292template <class S>
293__global__ void deallocate_grid_data(GridData<S>* grid_data) {
294 grid_data->deallocate();
295}
296
297/** We update the bound found by the current block so it is visible to all other blocks.
298 * Note that this operation might not always succeed, which is okay: the best bound is still preserved in `block_data` and is reduced at the end (in `reduce_blocks`).
299 * The worst that can happen is that the same best bound is found twice, which does not compromise the correctness of the algorithm.
300 */
301template <class S>
302__device__ bool update_grid_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
303 using U0 = typename S::U0;
304 assert(threadIdx.x == 0);
305 if(block_data.root->bab->is_optimization()) {
306 const auto& bab = block_data.root->bab;
307 auto local_best = bab->optimum().project(bab->objective_var());
308 // printf("[new bound] %d: [%d..%d] (current best: [%d..%d])\n", blockIdx.x, local_best.lb().value(), local_best.ub().value(), grid_data.best_bound->lb().value(), grid_data.best_bound->ub().value());
309 if(bab->is_maximization()) {
310 return grid_data.best_bound->meet_lb(dual<typename U0::LB>(local_best.ub()));
311 }
312 else {
313 return grid_data.best_bound->meet_ub(dual<typename U0::UB>(local_best.lb()));
314 }
315 }
316 return false;
317}
318
319/** This function updates the best bound of the current block according to the best bound found so far across blocks.
320 * We directly update the store with the best bound.
321 * This function should be called at each node, since the best bound is erased on backtracking (it is not included in the snapshot).
322 */
323template <class S>
324__device__ void update_block_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
325 using U0 = typename S::U0;
326 if(threadIdx.x == 0 && block_data.root->bab->is_optimization()) {
327 const auto& bab = block_data.root->bab;
328 VarEnv<bt::global_allocator> empty_env{};
329 auto best_formula = bab->template deinterpret_best_bound<bt::global_allocator>(
330 bab->is_maximization()
331 ? U0(dual<typename U0::UB>(grid_data.best_bound->lb()))
332 : U0(dual<typename U0::LB>(grid_data.best_bound->ub())));
333 // printf("global best: "); grid_data.best_bound->ub().print(); printf("\n");
334 // best_formula.print(); printf("\n");
335 IDiagnostics diagnostics;
336 interpret_and_tell(best_formula, empty_env, *block_data.root->store, diagnostics);
337 }
338}
339
340/** Propagate a node of the search tree and process the leaf nodes (failed or solution).
341 * Branching on unknown nodes is a task left to the caller.
342 */
343template <class S>
344__device__ bool propagate(BlockData<S>& block_data, GridData<S>& grid_data) {
345 using BlockCP = typename S::BlockCP;
346 bool is_leaf_node = false;
347 BlockCP& cp = *block_data.root;
348 auto& fp_engine = *block_data.fp_engine;
349#ifdef TURBO_PROFILE_MODE
350 cuda::std::chrono::system_clock::time_point start;
351 if(threadIdx.x == 0) {
352 start = cuda::std::chrono::system_clock::now();
353 }
354 fp_engine.barrier();
355#endif
356 local::B thread_has_changed{false};
357 size_t iterations = fp_engine.fixpoint(*cp.ipc, thread_has_changed, &grid_data.cpu_stop);
358 if(threadIdx.x == 0) {
359#ifdef TURBO_PROFILE_MODE
360 auto end = cuda::std::chrono::system_clock::now();
361 cuda::std::chrono::duration<double> diff = end - start;
362 cp.stats.propagation_time += diff.count();
363#endif
364 cp.stats.fixpoint_iterations += iterations;
365 bool is_pruned = cp.on_node();
366 if(cp.ipc->is_bot()) {
367 is_leaf_node = true;
368 cp.on_failed_node();
369 }
370 else if(cp.search_tree->template is_extractable<AtomicExtraction>()) {
371 is_leaf_node = true;
372 if(cp.bab->is_satisfaction() || cp.bab->compare_bound(*cp.store, cp.bab->optimum())) {
373 cp.bab->deduce();
374 bool best_has_changed = update_grid_best_bound(block_data, grid_data);
375 if(cp.bab->is_satisfaction() || (best_has_changed && cp.is_printing_intermediate_sol())) {
376 grid_data.produce_solution(*cp.bab);
377 }
378 is_pruned |= cp.update_solution_stats();
379 }
380 }
381 if(is_pruned) {
382 grid_data.gpu_stop->join(true);
383 }
384#ifdef TURBO_PROFILE_MODE
385 auto end2 = cuda::std::chrono::system_clock::now();
386 diff = end2 - end;
387 cp.stats.search_time += diff.count();
388#endif
389 }
390 return is_leaf_node;
391}
392
393/** The initial problem tackled during the dive must always be the same.
394 * Hence, do not add the objective constraint during diving: it would change the splitting decisions and could lead to some subproblems being ignored.
395 */
396template <class S>
397__device__ size_t dive(BlockData<S>& block_data, GridData<S>& grid_data) {
398 using BlockCP = typename S::BlockCP;
399 BlockCP& cp = *block_data.root;
400 auto& fp_engine = *block_data.fp_engine;
401 auto& stop = *block_data.stop;
402 // Note that we reuse `has_changed` to stop the dive, not really to indicate that something has changed (we do not need that information for this algorithm).
403 auto& stop_diving = *block_data.has_changed;
404 stop.meet(false);
405 stop_diving.meet(false);
406 fp_engine.barrier();
407 size_t remaining_depth = grid_data.root.config.subproblems_power;
408 while(remaining_depth > 0 && !stop_diving && !stop) {
409 remaining_depth--;
410 bool is_leaf_node = propagate(block_data, grid_data);
411 if(threadIdx.x == 0) {
412 if(is_leaf_node) {
413 stop_diving.join(true);
414 }
415 else {
416 size_t branch_idx = (block_data.subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
417 auto branches = cp.eps_split->split();
418 assert(branches.size() == 2);
419 cp.ipc->deduce(branches[branch_idx]);
420 }
421 stop.join(grid_data.cpu_stop || *(grid_data.gpu_stop));
422 }
423 fp_engine.barrier();
424 }
425 return remaining_depth;
426}
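
// Illustration (hypothetical helper, not part of this file): how a subproblem
// index encodes the branching decisions taken by `dive`. The bits of
// `subproblem_idx`, read from the most significant one, select the left (0) or
// right (1) branch at each level of the dive.
inline void print_dive_path(size_t subproblem_idx, size_t subproblems_power) {
  for(size_t remaining_depth = subproblems_power; remaining_depth > 0;) {
    remaining_depth--;
    size_t branch_idx = (subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
    printf("%s", branch_idx == 0 ? "left " : "right ");
  }
  printf("\n");
}
// For instance, with `subproblems_power == 3`, subproblem 5 (0b101) follows the
// path "right left right".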
427
428template <class S>
429__device__ void solve_problem(BlockData<S>& block_data, GridData<S>& grid_data) {
430 using BlockCP = typename S::BlockCP;
431 BlockCP& cp = *block_data.root;
432 auto& fp_engine = *block_data.fp_engine;
433 auto& block_has_changed = *block_data.has_changed;
434 auto& stop = *block_data.stop;
435 block_has_changed.join(true);
436 stop.meet(false);
437 fp_engine.barrier();
438 // In the condition, we must only read variables that are local to this block.
439 // Otherwise, two threads might read different values if they are changed in between by another block.
440 while(block_has_changed && !stop) {
441 update_block_best_bound(block_data, grid_data);
442 propagate(block_data, grid_data);
443 if(threadIdx.x == 0) {
444 stop.join(grid_data.cpu_stop || *(grid_data.gpu_stop));
445 // propagate induces a memory fence, therefore all threads are already past the "while" condition.
446 block_has_changed.meet(cp.search_tree->deduce());
447 }
448 fp_engine.barrier();
449 }
450}
451
452template <class S>
453CUDA void reduce_blocks(GridData<S>* grid_data) {
454 for(int i = 0; i < grid_data->blocks.size(); ++i) {
455 if(grid_data->blocks[i].root) { // `nullptr` could happen if we try to terminate the program before all blocks are even created.
456 grid_data->root.meet(*(grid_data->blocks[i].root));
457 }
458 }
459}
460
461template <class S>
462__global__ void gpu_solve_kernel(GridData<S>* grid_data)
463{
464 if(threadIdx.x == 0 && blockIdx.x == 0 && grid_data->root.config.verbose_solving) {
465 printf("%% GPU kernel started, starting solving...\n");
466 }
467 extern __shared__ unsigned char shared_mem[];
468 size_t num_subproblems = grid_data->root.stats.eps_num_subproblems;
469 BlockData<S>& block_data = grid_data->blocks[blockIdx.x];
470 block_data.allocate(*grid_data, shared_mem);
471 while(block_data.subproblem_idx < num_subproblems && !*(block_data.stop)) {
472 if(threadIdx.x == 0 && grid_data->root.config.verbose_solving) {
473 grid_data->print_lock->acquire();
474 printf("%% Block %d solves subproblem num %" PRIu64 "\n", blockIdx.x, block_data.subproblem_idx);
475 grid_data->print_lock->release();
476 }
477 block_data.restore();
478 cooperative_groups::this_thread_block().sync();
479 size_t remaining_depth = dive(block_data, *grid_data);
480 if(remaining_depth == 0) {
481 solve_problem(block_data, *grid_data);
482 if(threadIdx.x == 0 && !*(block_data.stop)) {
483 block_data.root->stats.eps_solved_subproblems += 1;
484 }
485 }
486 else {
487 if(threadIdx.x == 0 && !*(block_data.stop)) {
488 size_t next_subproblem_idx = ((block_data.subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
489 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));
490 // It is possible that several blocks skip similar subproblems. Hence, we only count the subproblems skipped by the block solving the leftmost subproblem.
491 if((block_data.subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
492 block_data.root->stats.eps_skipped_subproblems += next_subproblem_idx - block_data.subproblem_idx;
493 }
494 }
495 }
496 // Load next problem.
497 if(threadIdx.x == 0 && !*(block_data.stop)) {
498 block_data.subproblem_idx = grid_data->next_subproblem->value();
499 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(block_data.subproblem_idx + size_t{1}));
500 }
501 cooperative_groups::this_thread_block().sync();
502 }
503 cooperative_groups::this_thread_block().sync();
504 if(threadIdx.x == 0 && !*(block_data.stop)) {
505 block_data.root->stats.num_blocks_done = 1;
506 }
507 if(threadIdx.x == 0) {
508 grid_data->print_lock->acquire();
509 if(!grid_data->blocks_reduced) {
510 int n = 0;
511 for(int i = 0; i < grid_data->blocks.size(); ++i) {
512 if(grid_data->blocks[i].root) { // `nullptr` could happen if we try to terminate the program before all blocks are even created.
513 n += grid_data->blocks[i].root->stats.num_blocks_done;
514 }
515 }
516 if(block_data.stop->value() || n == grid_data->blocks.size()) {
517 reduce_blocks(grid_data);
518 grid_data->blocks_reduced = true;
519 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
520 grid_data->ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
521 grid_data->ready_to_consume.notify_one();
522 }
523 }
524 grid_data->print_lock->release();
525 }
526 // We must destroy all objects allocated in the shared memory; trying to destroy them anywhere else will lead to a segfault.
527 block_data.deallocate_shared();
528}
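
// Illustration (hypothetical helper, not part of this file): the
// subproblem-skipping arithmetic used above when a dive reaches a leaf before
// exhausting its depth. All subproblems sharing the already-failed prefix are
// skipped by jumping to the next prefix.
inline size_t next_subproblem_after_failed_prefix(size_t subproblem_idx, size_t remaining_depth) {
  // Clear the `remaining_depth` low bits and move to the next prefix.
  return ((subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
}
// For instance, next_subproblem_after_failed_prefix(5, 1) == 6: subproblems 4 and 5
// share the failed prefix 0b10, so both are skipped and the search resumes at 6.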
529
530template <class T> __global__ void gpu_sizeof_kernel(size_t* size) { *size = sizeof(T); }
531template <class T>
532size_t gpu_sizeof() {
533 auto s = bt::make_unique<size_t, bt::managed_allocator>();
534 gpu_sizeof_kernel<T><<<1, 1>>>(s.get());
535 CUDAEX(cudaDeviceSynchronize());
536 return *s;
537}
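
// Illustration (hypothetical usage, not part of this file): `gpu_sizeof` returns
// the size of a type as compiled for the device, which is the size that matters
// when estimating how much device memory the store and the propagators need
// (see `sizeof_store` below).
inline void print_device_interval_size() {
  printf("%% sizeof(Itv1) on the device: %zu bytes\n", gpu_sizeof<Itv1>());
}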
538
539template <class S, class U>
540size_t sizeof_store(const CP<U>& root) {
541 return gpu_sizeof<typename S::BlockCP::IStore>()
542 + gpu_sizeof<typename S::BlockCP::IStore::universe_type>() * root.store->vars();
543}
544
545void print_memory_statistics(const char* key, size_t bytes) {
546 printf("%% %s=%zu [", key, bytes);
547 if(bytes < 1000 * 1000) {
548 printf("%.2fKB", static_cast<double>(bytes) / 1000);
549 }
550 else if(bytes < 1000 * 1000 * 1000) {
551 printf("%.2fMB", static_cast<double>(bytes) / (1000 * 1000));
552 }
553 else {
554 printf("%.2fGB", static_cast<double>(bytes) / (1000 * 1000 * 1000));
555 }
556 printf("]\n");
557}
558
559/** \returns the size of the shared memory and the kind of memory used. */
560template <class S, class U>
561MemoryConfig configure_memory(CP<U>& root) {
562 cudaDeviceProp deviceProp;
563 cudaGetDeviceProperties(&deviceProp, 0);
564 const auto& config = root.config;
565 size_t shared_mem_capacity = deviceProp.sharedMemPerBlock;
566
567 // Copy the root to know how large the abstract domains are.
568 CP<U> root2(root);
569
570 size_t store_alignment = 200; // The store does not create much alignment overhead since it is almost only a single array.
571
572 MemoryConfig mem_config;
573 // Need a bit of shared memory for the fixpoint engine.
574 mem_config.shared_bytes = 100;
575 mem_config.store_bytes = sizeof_store<S>(root2) + store_alignment;
576 // We add 20% extra memory to account for the alignment of the shared memory, which is not taken into account in the statistics.
577 // From limited experiments, the alignment overhead is usually around 10%.
578 mem_config.pc_bytes = root2.prop_allocator.total_bytes_allocated();
579 mem_config.pc_bytes += mem_config.pc_bytes / 5;
580 if(config.only_global_memory || shared_mem_capacity < mem_config.shared_bytes + mem_config.store_bytes) {
581 if(!config.only_global_memory && config.verbose_solving) {
582 printf("%% The store of variables (%zuKB) cannot be stored in the shared memory of the GPU (%zuKB), therefore we use the global memory.\n",
583 mem_config.store_bytes / 1000,
584 shared_mem_capacity / 1000);
585 }
586 mem_config.mem_kind = MemoryKind::GLOBAL;
587 }
588 else if(shared_mem_capacity > mem_config.shared_bytes + mem_config.store_bytes + mem_config.pc_bytes) {
589 if(config.verbose_solving) {
590 printf("%% The store of variables and the propagators (%zuKB) are stored in the shared memory of the GPU (%zuKB).\n",
591 (mem_config.shared_bytes + mem_config.store_bytes + mem_config.pc_bytes) / 1000,
592 shared_mem_capacity / 1000);
593 }
594 mem_config.shared_bytes += mem_config.store_bytes + mem_config.pc_bytes;
595 mem_config.mem_kind = MemoryKind::STORE_PC_SHARED;
596 }
597 else {
598 if(config.verbose_solving) {
599 printf("%% The store of variables (%zuKB) is stored in the shared memory of the GPU (%zuKB).\n",
600 mem_config.store_bytes / 1000,
601 shared_mem_capacity / 1000);
602 }
603 mem_config.shared_bytes += mem_config.store_bytes;
604 mem_config.mem_kind = MemoryKind::STORE_SHARED;
605 }
606 if(config.verbose_solving) {
607 print_memory_statistics("store_memory_real", root2.store_allocator.total_bytes_allocated());
608 print_memory_statistics("pc_memory_real", root2.prop_allocator.total_bytes_allocated());
609 print_memory_statistics("other_memory_real", root2.basic_allocator.total_bytes_allocated());
610 }
611 return mem_config;
612}
613
614/** Wait until solving ends, either because of a timeout, CTRL-C, or because the kernel finished. */
615template<class S, class Timepoint>
616bool wait_solving_ends(GridData<S>& grid_data, const Timepoint& start) {
617 cudaEvent_t event;
618 cudaEventCreateWithFlags(&event,cudaEventDisableTiming);
619 cudaEventRecord(event);
620 while(!must_quit() && check_timeout(grid_data.root, start) && cudaEventQuery(event) == cudaErrorNotReady) {
621 std::this_thread::sleep_for(std::chrono::milliseconds(100));
622 }
623 if(cudaEventQuery(event) == cudaErrorNotReady) {
624 grid_data.cpu_stop = true;
625 grid_data.root.prune();
626 return true;
627 }
628 else {
629 cudaError error = cudaDeviceSynchronize();
630 if(error == cudaErrorIllegalAddress) {
631 printf("%% ERROR: CUDA kernel failed due to an illegal memory access. This might be due to a stack overflow if the stack is too small. Try increasing the stack size with the option -stack. If it does not work, please report it as a bug.\n");
632 exit(EXIT_FAILURE);
633 }
634 CUDAEX(error);
635 return false;
636 }
637}
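
// Illustration (hypothetical helper, not part of this file): the polling pattern
// used by `wait_solving_ends`. An event recorded right after the kernel launch
// stays in `cudaErrorNotReady` until the kernel completes, so the host can poll
// it cheaply while remaining able to cancel the kernel cooperatively by raising
// a flag the kernel reads (like `cpu_stop` above). `launch` and `budget` are
// placeholders.
template <class Launch>
bool run_with_deadline(Launch launch, volatile bool* cpu_stop, std::chrono::seconds budget) {
  auto deadline = std::chrono::steady_clock::now() + budget;
  cudaEvent_t done;
  cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
  launch();                 // Launches the kernel asynchronously on the default stream.
  cudaEventRecord(done);    // Completes only once the kernel has finished.
  while(cudaEventQuery(done) == cudaErrorNotReady) {
    if(std::chrono::steady_clock::now() > deadline) {
      *cpu_stop = true;     // Ask the kernel to stop cooperatively.
      break;
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  bool ok = cudaDeviceSynchronize() == cudaSuccess;
  cudaEventDestroy(done);
  return ok;
}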
638
639template <class S>
640void consume_kernel_solutions(GridData<S>& grid_data) {
641 while(!grid_data.consume_solution()) {}
642}
643
644template <class S, class U, class Timepoint>
645void transfer_memory_and_run(CP<U>& root, MemoryConfig mem_config, const Timepoint& start) {
646 using concurrent_allocator = typename S::concurrent_allocator;
647 auto grid_data = bt::make_shared<GridData<S>, concurrent_allocator>(std::move(root), mem_config);
648 initialize_grid_data<<<1,1>>>(grid_data.get());
649 CUDAEX(cudaDeviceSynchronize());
650 if(grid_data->root.config.print_statistics) {
651 mem_config.print_mzn_statistics();
652 }
653 std::thread consumer_thread(consume_kernel_solutions<S>, std::ref(*grid_data));
654 gpu_solve_kernel
655 <<<static_cast<unsigned int>(grid_data->root.config.or_nodes),
656 static_cast<unsigned int>(grid_data->root.config.and_nodes),
657 grid_data->mem_config.shared_bytes>>>
658 (grid_data.get());
659 bool interrupted = wait_solving_ends(*grid_data, start);
660 consumer_thread.join();
661 CUDAEX(cudaDeviceSynchronize());
662 deallocate_grid_data<<<1,1>>>(grid_data.get());
663 CUDAEX(cudaDeviceSynchronize());
664}
665
666// From https://stackoverflow.com/a/32531982/2231159
667int threads_per_sm(cudaDeviceProp devProp) {
668 switch (devProp.major){
669 case 2: return (devProp.minor == 1) ? 48 : 32; // Fermi
670 case 3: return 192; // Kepler
671 case 5: return 128; // Maxwell
672 case 6: return (devProp.minor == 0) ? 64 : 128; // Pascal
673 case 7: return 64; // Volta and Turing
674 case 8: return (devProp.minor == 0) ? 64 : 128; // Ampere
675 case 9: return 128; // Hopper
676 default: return 64;
677 }
678}
679
680template <class S, class U>
681void configure_blocks_threads(CP<U>& root, const MemoryConfig& mem_config) {
682 int hint_num_blocks;
683 int hint_num_threads;
684 CUDAE(cudaOccupancyMaxPotentialBlockSize(&hint_num_blocks, &hint_num_threads, (void*) gpu_solve_kernel<S>, (int)mem_config.shared_bytes));
685
686 cudaDeviceProp deviceProp;
687 cudaGetDeviceProperties(&deviceProp, 0);
688
689 size_t total_global_mem = deviceProp.totalGlobalMem;
690 size_t num_sm = deviceProp.multiProcessorCount;
691 size_t num_threads_per_sm = threads_per_sm(deviceProp);
692
693 auto& config = root.config;
694 config.or_nodes = (config.or_nodes == 0) ? hint_num_blocks : config.or_nodes;
695 config.and_nodes = (config.and_nodes == 0) ? hint_num_threads : config.and_nodes;
696
697 if(config.and_nodes > deviceProp.maxThreadsPerBlock) {
698 if(config.verbose_solving) {
699 printf("%% WARNING: -and %zu is too high for this GPU, we use the maximum %d instead.\n", config.and_nodes, deviceProp.maxThreadsPerBlock);
700 }
701 config.and_nodes = deviceProp.maxThreadsPerBlock;
702 }
703
704 // The stack space allocated depends on the maximum number of threads per SM, not on the actual number of threads per block.
705 size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * config.stack_kb * 1000;
706 size_t remaining_global_mem = total_global_mem - total_stack_size;
707 remaining_global_mem -= remaining_global_mem / 10; // We leave 10% of global memory free for CUDA allocations, not sure if it is useful though.
708
709 // Basically the size of the store and the propagators, plus 100 bytes per variable.
710 // +1 for the root node in GridCP.
711 size_t heap_usage_estimation = (config.or_nodes + 1) * (mem_config.pc_bytes + mem_config.store_bytes + 100 * root.store->vars());
712 while(config.or_nodes > 1 && heap_usage_estimation > remaining_global_mem) {
713 config.or_nodes--; heap_usage_estimation -= mem_config.pc_bytes + mem_config.store_bytes + 100 * root.store->vars();
714 }
715
716 CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
717 CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem));
718
719 if(config.verbose_solving) {
720 print_memory_statistics("stack_memory", total_stack_size);
721 print_memory_statistics("heap_memory", remaining_global_mem);
722 print_memory_statistics("heap_usage_estimation", heap_usage_estimation);
723 printf("%% and_nodes=%zu\n", config.and_nodes);
724 printf("%% or_nodes=%zu\n", config.or_nodes);
725 }
726}
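
// Illustration (hypothetical numbers, not part of this file): a numerical
// instance of the heap estimation above. With a 20KB store, 200KB of
// propagators, 1000 variables and 100 blocks, the estimation is
// (100 + 1) * (200000 + 20000 + 100 * 1000) = 101 * 320000 = 32320000 bytes,
// i.e. about 32.3MB, which easily fits in the global memory remaining after
// the stack has been reserved.
inline size_t heap_usage_estimation_example() {
  size_t store_bytes = 20000, pc_bytes = 200000, num_vars = 1000, or_nodes = 100;
  return (or_nodes + 1) * (pc_bytes + store_bytes + 100 * num_vars);
}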
727
728template <class S, class U, class Timepoint>
729void configure_and_run(CP<U>& root, const Timepoint& start) {
730 MemoryConfig mem_config = configure_memory<S>(root);
731 configure_blocks_threads<S>(root, mem_config);
732 transfer_memory_and_run<S>(root, mem_config, start);
733}
734
735void check_support_unified_memory() {
736 int attr = 0;
737 int dev = 0;
738 CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrManagedMemory, dev));
739 if (!attr) {
740 std::cerr << "The GPU does not support unified memory." << std::endl;
741 exit(EXIT_FAILURE);
742 }
743}
744
745void check_support_concurrent_managed_memory() {
746 int attr = 0;
747 int dev = 0;
748 CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrConcurrentManagedAccess, dev));
749 if (!attr) {
750#ifdef NO_CONCURRENT_MANAGED_MEMORY
751 printf("%% WARNING: The GPU does not support concurrent access to managed memory, hence we fall back on pinned memory.\n");
752 /** Set cudaDeviceMapHost to allow cudaMallocHost() to allocate pinned memory
753 * for concurrent access between the device and the host. It must be called
754 * early, before any CUDA management functions, so that we can fall back to
755 * using the pinned_allocator instead of the managed_allocator.
756 * This is required on Windows, WSL, macOS, and NVIDIA GRID.
757 * See also PR #18.
758 */
759 unsigned int flags = 0;
760 CUDAEX(cudaGetDeviceFlags(&flags));
761 flags |= cudaDeviceMapHost;
762 CUDAEX(cudaSetDeviceFlags(flags));
763#else
764 printf("%% To run Turbo on this GPU you need to build Turbo with the option NO_CONCURRENT_MANAGED_MEMORY.\n");
765 exit(EXIT_FAILURE);
766#endif
767 }
768}
769
770#endif // __CUDACC__
771
772void gpu_solve(Configuration<bt::standard_allocator>& config) {
773#ifndef __CUDACC__
774 std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
775#else
776 check_support_unified_memory();
777 check_support_concurrent_managed_memory();
778 auto start = std::chrono::high_resolution_clock::now();
779 CP<Itv> root(config);
780 root.preprocess();
781 block_signal_ctrlc();
782#ifdef NO_CONCURRENT_MANAGED_MEMORY
783 if(root.config.noatomics) {
784 configure_and_run<ItvSolverPinnedNoAtomics>(root, start);
785 }
786 else {
787 configure_and_run<ItvSolverPinned>(root, start);
788 }
789#else
790 if(root.config.noatomics) {
791 configure_and_run<ItvSolverNoAtomics>(root, start);
792 }
793 else {
794 configure_and_run<ItvSolver>(root, start);
795 }
796#endif
797#endif
798}
799
800#endif // TURBO_GPU_SOLVING_HPP
bool must_quit()
Definition common_solving.hpp:89
void block_signal_ctrlc()
Definition common_solving.hpp:77
bool check_timeout(A &a, const Timepoint &start)
Definition common_solving.hpp:101
void gpu_solve(Configuration< bt::standard_allocator > &config)
Definition gpu_solving.hpp:772
Definition common_solving.hpp:154
Configuration< BasicAllocator > config
Definition common_solving.hpp:277
abstract_ptr< IStore > store
Definition common_solving.hpp:262
void preprocess()
Definition common_solving.hpp:451
Definition config.hpp:28
size_t or_nodes
Definition config.hpp:40
bool noatomics
Definition config.hpp:38
Definition common_solving.hpp:119
CUDA void * allocate(size_t bytes)
Definition common_solving.hpp:125