Turbo Constraint Solver
gpu_dive_and_solve.hpp
Go to the documentation of this file.
1// Copyright 2023 Pierre Talbot
2
3#ifndef TURBO_GPU_DIVE_AND_SOLVE_HPP
4#define TURBO_GPU_DIVE_AND_SOLVE_HPP
5
6#include "common_solving.hpp"
7#include <thread>
8#include <algorithm>
9
10namespace bt = ::battery;
11
12#ifdef __CUDACC__
13
14#include <cuda/std/chrono>
15#include <cuda/semaphore>
16
17template <
18 class Universe0, // Universe used locally to one thread.
19 class Universe1, // Universe used in the scope of a block.
20 class Universe2, // Universe used in the scope of a grid.
21 class ConcurrentAllocator> // This allocator allocates memory that can be accessed by both the CPU and the GPU. See PR #18 for the reason (basically, non-Linux systems do not support concurrent managed memory accessed from both CPU and GPU, and must rely on pinned memory instead).
22struct StateTypes {
23
24 using U0 = Universe0;
25 using U1 = Universe1;
26 using U2 = Universe2;
27 using concurrent_allocator = ConcurrentAllocator;
28
29 /** We first interpret the formula in an abstract domain using sequential (non-atomic) memory that is accessible from both the CPU and the GPU, which we call `GridCP`. */
30 using GridCP = AbstractDomains<U0,
31 bt::statistics_allocator<ConcurrentAllocator>,
32 bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 0>>,
33 bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 1>>>;
34
35 /** Then, once everything is initialized, we rely on a parallel abstract domain called `BlockCP`, usually using atomic shared and global memory. */
36 using BlockCP = AbstractDomains<U1,
37 bt::global_allocator,
38 bt::pool_allocator,
39 bt::pool_allocator>;
40};
41
42using Itv0 = Interval<ZLB<int, bt::local_memory>>;
43using Itv1 = Interval<ZLB<int, bt::atomic_memory_block>>;
44using Itv2 = Interval<ZLB<int, bt::atomic_memory_grid>>;
45using AtomicBool = B<bt::atomic_memory_block>;
46using FPEngine = BlockAsynchronousIterationGPU;
47
48// Version for non-Linux systems such as Windows where pinned memory must be used (see PR #19).
49#ifdef NO_CONCURRENT_MANAGED_MEMORY
50 using ItvSolverPinned = StateTypes<Itv0, Itv1, Itv2, bt::pinned_allocator>;
51 using ItvSolverPinnedNoAtomics = StateTypes<Itv0, Itv0, Itv0, bt::pinned_allocator>;
52#else
53 using ItvSolver = StateTypes<Itv0, Itv1, Itv2, bt::managed_allocator>;
54 // Deactivate atomics for the domain of variables (for benchmarking only; it is not safe according to the CUDA memory consistency model).
55 using ItvSolverNoAtomics = StateTypes<Itv0, Itv0, Itv0, bt::managed_allocator>;
56#endif
57
58/** Depending on the problem, we can store the abstract elements in different memories.
59 * The "worst" is everything in global memory (GLOBAL) when the problem is too large for the shared memory.
60 * The "best" is when both the store of variables and the propagators can be stored in shared memory (STORE_PC_SHARED).
61 * A third possibility is to store only the variables' domains in the shared memory (STORE_SHARED).
62*/
63enum class MemoryKind {
64 GLOBAL,
65 STORE_SHARED,
66 STORE_PC_SHARED
67};
68
69/** The shared memory must be configured by hand before the kernel is launched.
70 * This class encapsulates information about the size of each relevant abstract element, and helps create the allocators accordingly.
71*/
72struct MemoryConfig {
73 MemoryKind mem_kind;
74 size_t shared_bytes;
75 size_t store_bytes;
76 size_t pc_bytes;
77
78 CUDA bt::pool_allocator make_global_pool(size_t bytes) {
79 void* mem_pool = bt::global_allocator{}.allocate(bytes);
80 return bt::pool_allocator(static_cast<unsigned char*>(mem_pool), bytes);
81 }
82
83 CUDA bt::pool_allocator make_shared_pool(unsigned char* shared_mem) {
84 return bt::pool_allocator(shared_mem, shared_bytes);
85 }
86
87 CUDA bt::pool_allocator make_pc_pool(bt::pool_allocator shared_mem) {
88 if(mem_kind == MemoryKind::STORE_PC_SHARED) {
89 return shared_mem;
90 }
91 else {
92 return make_global_pool(pc_bytes);
93 }
94 }
95
96 CUDA bt::pool_allocator make_store_pool(bt::pool_allocator shared_mem) {
97 if(mem_kind == MemoryKind::STORE_PC_SHARED || mem_kind == MemoryKind::STORE_SHARED) {
98 return shared_mem;
99 }
100 else {
101 return make_global_pool(store_bytes);
102 }
103 }
104
105 CUDA void print_mzn_statistics() const {
106 printf("%%%%%%mzn-stat: memory_configuration=\"%s\"\n",
107 mem_kind == MemoryKind::GLOBAL ? "global" : (
108 mem_kind == MemoryKind::STORE_SHARED ? "store_shared" : "store_pc_shared"));
109 printf("%%%%%%mzn-stat: shared_mem=%" PRIu64 "\n", shared_bytes);
110 printf("%%%%%%mzn-stat: store_mem=%" PRIu64 "\n", store_bytes);
111 printf("%%%%%%mzn-stat: propagator_mem=%" PRIu64 "\n", pc_bytes);
112 }
113};
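// Illustrative sketch (not part of this header): how a MemoryConfig typically flows from
// configuration to the kernel launch, using the functions defined further below in this file.
//
//   MemoryConfig mem_config = configure_memory<S>(root);  // decides GLOBAL / STORE_SHARED / STORE_PC_SHARED
//   gpu_solve_kernel<S>
//     <<<num_blocks, num_threads, mem_config.shared_bytes>>>(grid_data.get());
//   // Inside the kernel, each block then derives its allocators from the same configuration:
//   //   bt::pool_allocator shared_pool(mem_config.make_shared_pool(shared_mem));
//   //   bt::pool_allocator store_pool(mem_config.make_store_pool(shared_pool)); // shared or global
//   //   bt::pool_allocator pc_pool(mem_config.make_pc_pool(shared_pool));       // shared or global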
114
115template <class S>
116struct BlockData;
117
118/** `GridData` represents the problem to be solved (`root`), the data of each block when running EPS (`blocks`), the index of the next subproblem to be solved (`next_subproblem`), an estimate of the best bound found across subproblems (in case of an optimization problem), and other global information.
119 */
120template <class S>
121struct GridData {
122 using GridCP = typename S::GridCP;
123 using BlockCP = typename S::BlockCP;
124 using U2 = typename S::U2;
125
126 GridCP root;
127 // `blocks_root` is a copy of `root` but with the same allocators as the ones used in `blocks`.
128 // This is helpful to share immutable data among blocks (for instance the propagators).
129 bt::shared_ptr<BlockCP, bt::global_allocator> blocks_root;
130 // Stop from the CPU, for instance because of a timeout.
131 volatile bool cpu_stop;
132 // Boolean indicating that the blocks have been reduced, and the CPU can now print the statistics.
133 volatile bool blocks_reduced;
134 MemoryConfig mem_config;
135 bt::vector<BlockData<S>, bt::global_allocator> blocks;
136 // Stop from a block on the GPU, for instance because we found a solution.
137 bt::shared_ptr<B<bt::atomic_memory_grid>, bt::global_allocator> gpu_stop;
138 bt::shared_ptr<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator> next_subproblem;
139 bt::shared_ptr<U2, bt::global_allocator> best_bound;
140
141 // All of what follows is only to support printing while the kernel is running.
142 // In particular, we transfer the solution to the CPU where it is printed, because printing on the GPU can be very slow when the problem is large.
143 bt::shared_ptr<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator> print_lock;
144 cuda::std::atomic_flag ready_to_produce;
145 cuda::std::atomic_flag ready_to_consume;
146
147 GridData(const GridCP& root, const MemoryConfig& mem_config)
148 : root(root)
149 , mem_config(mem_config)
150 , cpu_stop(false)
151 , blocks_reduced(false)
152 {
153 ready_to_consume.clear();
154 ready_to_produce.test_and_set();
155 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
156 }
157
158 template <class BlockBAB>
159 __device__ void produce_solution(const BlockBAB& bab) {
160 print_lock->acquire();
161 if(!cpu_stop) {
162 ready_to_produce.wait(false, cuda::std::memory_order_seq_cst);
163 ready_to_produce.clear();
164 bab.extract(*(root.bab));
165 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
166 ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
167 ready_to_consume.notify_one();
168 }
169 print_lock->release();
170 }
171
172 __host__ bool consume_solution() {
173 ready_to_consume.wait(false, cuda::std::memory_order_seq_cst);
174 ready_to_consume.clear();
175 if(blocks_reduced) {
176 root.print_final_solution();
177 if(root.config.print_statistics) {
178 root.print_mzn_statistics();
179 }
180 return true;
181 }
182 else {
183 root.print_solution();
184 }
185 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
186 ready_to_produce.test_and_set(cuda::std::memory_order_seq_cst);
187 ready_to_produce.notify_one();
188 return false;
189 }
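// Illustrative sketch (not part of this header): the handshake between produce_solution()
// (GPU side) and consume_solution() (CPU side) above, using the two atomic flags and the
// device-scoped print_lock.
//
//   GPU block (produce_solution)              CPU thread (consume_solution)
//   ----------------------------              -----------------------------
//   print_lock->acquire();
//   wait until ready_to_produce is set;       blocked in ready_to_consume.wait(false);
//   clear ready_to_produce;
//   copy the solution into root.bab;
//   set and notify ready_to_consume;  ---->   wakes up, clears ready_to_consume,
//   print_lock->release();                    prints the solution (or the final solution
//                                             and statistics if blocks_reduced is set),
//   next producer may acquire the     <----   then sets and notifies ready_to_produce.
//   lock and repeat.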
190
191 __device__ void allocate() {
192 assert(threadIdx.x == 0 && blockIdx.x == 0);
193 auto root_mem_config(mem_config);
194 root_mem_config.mem_kind = MemoryKind::GLOBAL;
195 blocks_root = bt::make_shared<BlockCP, bt::global_allocator>(
196 typename BlockCP::tag_gpu_block_copy{},
197 false, // Due to different allocators between BlockCP and GridCP, it won't be able to share data anyway.
198 root,
199 bt::global_allocator{},
200 root_mem_config.make_pc_pool(bt::pool_allocator(nullptr,0)),
201 root_mem_config.make_store_pool(bt::pool_allocator(nullptr,0)));
202 blocks = bt::vector<BlockData<S>, bt::global_allocator>(root.config.or_nodes);
203 gpu_stop = bt::make_shared<B<bt::atomic_memory_grid>, bt::global_allocator>(false);
204 print_lock = bt::make_shared<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator>(1);
205 next_subproblem = bt::make_shared<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator>(0);
206 best_bound = bt::make_shared<U2, bt::global_allocator>();
207 }
208
209 __device__ void deallocate() {
210 assert(threadIdx.x == 0 && blockIdx.x == 0);
211 blocks = bt::vector<BlockData<S>, bt::global_allocator>();
212 blocks_root->deallocate();
213 blocks_root.reset();
214 gpu_stop.reset();
215 print_lock.reset();
216 next_subproblem.reset();
217 best_bound.reset();
218 }
219};
220
221/** `BlockData` contains all the structures required to solve a subproblem including the problem itself (`root`) and the fixpoint engine (`fp_engine`). */
222template <class S>
223struct BlockData {
224 using GridCP = typename S::GridCP;
225 using BlockCP = typename S::BlockCP;
226
227 using snapshot_type = typename BlockCP::IST::snapshot_type<bt::global_allocator>;
228 size_t subproblem_idx;
229 bt::shared_ptr<FPEngine, bt::pool_allocator> fp_engine;
230 bt::shared_ptr<AtomicBool, bt::pool_allocator> has_changed;
231 bt::shared_ptr<AtomicBool, bt::pool_allocator> stop;
232 bt::shared_ptr<BlockCP, bt::global_allocator> root;
233 bt::shared_ptr<snapshot_type, bt::global_allocator> snapshot_root;
234
235 __device__ BlockData():
236 has_changed(nullptr, bt::pool_allocator(nullptr, 0)),
237 stop(nullptr, bt::pool_allocator(nullptr, 0))
238 {}
239
240public:
241 /** Initialize the block data.
242 * Allocate the abstract domains in the best memory available, depending on how large the abstract domains are. */
243 __device__ void allocate(GridData<S>& grid_data, unsigned char* shared_mem) {
244 auto block = cooperative_groups::this_thread_block();
245 if(threadIdx.x == 0) {
246 subproblem_idx = blockIdx.x;
247 MemoryConfig& mem_config = grid_data.mem_config;
248 bt::pool_allocator shared_mem_pool(mem_config.make_shared_pool(shared_mem));
249 fp_engine = bt::allocate_shared<FPEngine, bt::pool_allocator>(shared_mem_pool);
250 has_changed = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, true);
251 stop = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, false);
252 root = bt::make_shared<BlockCP, bt::global_allocator>(typename BlockCP::tag_gpu_block_copy{},
253 (mem_config.mem_kind != MemoryKind::STORE_PC_SHARED),
254 *(grid_data.blocks_root),
255 bt::global_allocator{},
256 mem_config.make_pc_pool(shared_mem_pool),
257 mem_config.make_store_pool(shared_mem_pool));
258 snapshot_root = bt::make_shared<snapshot_type, bt::global_allocator>(root->search_tree->template snapshot<bt::global_allocator>());
259 }
260 block.sync();
261 }
262
263 __device__ void deallocate_shared() {
264 if(threadIdx.x == 0) {
265 fp_engine.reset();
266 has_changed.reset();
267 stop.reset();
268 root->deallocate();
269 snapshot_root.reset();
270 }
271 cooperative_groups::this_thread_block().sync();
272 }
273
274 __device__ void restore() {
275 if(threadIdx.x == 0) {
276 root->search_tree->restore(*snapshot_root);
277 root->eps_split->reset();
278 }
279 cooperative_groups::this_thread_block().sync();
280 }
281};
282
283template <class S>
284__global__ void initialize_grid_data(GridData<S>* grid_data) {
285 grid_data->allocate();
286 size_t num_subproblems = 1;
287 num_subproblems <<= grid_data->root.config.subproblems_power;
288 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(grid_data->root.config.or_nodes));
289 grid_data->root.stats.eps_num_subproblems = num_subproblems;
290}
291
292template <class S>
293__global__ void deallocate_grid_data(GridData<S>* grid_data) {
294 grid_data->deallocate();
295}
296
297/** We update the bound found by the current block so it is visible to all other blocks.
298 * Note that this operation might not always succeed, which is okay: the best bound is still preserved in `block_data` and reduced at the end (in `reduce_blocks`).
299 * The worst that can happen is that the same best bound is found twice, which does not compromise the correctness of the algorithm.
300 */
301template <class S>
302__device__ bool update_grid_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
303 using U0 = typename S::U0;
304 assert(threadIdx.x == 0);
305 if(block_data.root->bab->is_optimization()) {
306 const auto& bab = block_data.root->bab;
307 auto local_best = bab->optimum().project(bab->objective_var());
308 // printf("[new bound] %d: [%d..%d] (current best: [%d..%d])\n", blockIdx.x, local_best.lb().value(), local_best.ub().value(), grid_data.best_bound->lb().value(), grid_data.best_bound->ub().value());
309 if(bab->is_maximization()) {
310 return grid_data.best_bound->meet_lb(dual<typename U0::LB>(local_best.ub()));
311 }
312 else {
313 return grid_data.best_bound->meet_ub(dual<typename U0::UB>(local_best.lb()));
314 }
315 }
316 return false;
317}
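// Worked example (illustrative, maximization): if this block proves a solution whose
// objective projects to the interval [42..42], then dual<U0::LB>(local_best.ub())
// reinterprets the upper bound 42 as a lower bound, and best_bound->meet_lb(...) records
// that the best objective value found across all blocks is at least 42. The call returns
// false when another block has already registered a bound at least as good, which is
// harmless, as explained above.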
318
319/** This function updates the best bound of the current block according to the best bound found so far across blocks.
320 * We directly update the store with the best bound.
321 * This function should be called at each node, since the best bound is erased on backtracking (it is not included in the snapshot).
322 */
323template <class S>
324__device__ void update_block_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
325 using U0 = typename S::U0;
326 if(threadIdx.x == 0 && block_data.root->bab->is_optimization()) {
327 const auto& bab = block_data.root->bab;
328 VarEnv<bt::global_allocator> empty_env{};
329 auto best_formula = bab->template deinterpret_best_bound<bt::global_allocator>(
330 bab->is_maximization()
331 ? U0(dual<typename U0::UB>(grid_data.best_bound->lb()))
332 : U0(dual<typename U0::LB>(grid_data.best_bound->ub())));
333 // printf("global best: "); grid_data.best_bound->ub().print(); printf("\n");
334 // best_formula.print(); printf("\n");
335 IDiagnostics diagnostics;
336 interpret_and_tell(best_formula, empty_env, *block_data.root->store, diagnostics);
337 }
338}
339
340/** Propagate a node of the search tree and process the leaf nodes (failed or solution).
341 * Branching on unknown nodes is a task left to the caller.
342 */
343template <class S>
344__device__ bool propagate(BlockData<S>& block_data, GridData<S>& grid_data) {
345 using BlockCP = typename S::BlockCP;
346 bool is_leaf_node = false;
347 BlockCP& cp = *block_data.root;
348 auto& fp_engine = *block_data.fp_engine;
349#ifdef TURBO_PROFILE_MODE
350 cuda::std::chrono::system_clock::time_point start;
351 if(threadIdx.x == 0) {
352 start = cuda::std::chrono::system_clock::now();
353 }
354 fp_engine.barrier();
355#endif
356 size_t iterations = fp_engine.fixpoint(*cp.ipc);
357 if(threadIdx.x == 0) {
358#ifdef TURBO_PROFILE_MODE
359 auto end = cuda::std::chrono::system_clock::now();
360 cuda::std::chrono::duration<double> diff = end - start;
361 cp.stats.propagation_time += diff.count();
362#endif
363 cp.stats.fixpoint_iterations += iterations;
364 bool is_pruned = cp.on_node();
365 if(cp.ipc->is_bot()) {
366 is_leaf_node = true;
367 cp.on_failed_node();
368 }
369 else if(cp.search_tree->template is_extractable<AtomicExtraction>()) {
370 is_leaf_node = true;
371 if(cp.bab->is_satisfaction() || cp.bab->compare_bound(*cp.store, cp.bab->optimum())) {
372 cp.bab->deduce();
373 bool best_has_changed = update_grid_best_bound(block_data, grid_data);
374 if(cp.bab->is_satisfaction() || (best_has_changed && cp.is_printing_intermediate_sol())) {
375 grid_data.produce_solution(*cp.bab);
376 }
377 is_pruned |= cp.update_solution_stats();
378 }
379 }
380 if(is_pruned) {
381 grid_data.gpu_stop->join(true);
382 }
383#ifdef TURBO_PROFILE_MODE
384 auto end2 = cuda::std::chrono::system_clock::now();
385 diff = end2 - end;
386 cp.stats.search_time += diff.count();
387#endif
388 }
389 return is_leaf_node;
390}
391
392/** The initial problem tackled during the dive must always be the same.
393 * Hence, do not add the objective during diving: the splitting decisions would then differ across blocks and some subproblems could be ignored.
394 */
395template <class S>
396__device__ size_t dive(BlockData<S>& block_data, GridData<S>& grid_data) {
397 using BlockCP = typename S::BlockCP;
398 BlockCP& cp = *block_data.root;
399 auto& fp_engine = *block_data.fp_engine;
400 auto& stop = *block_data.stop;
401 // Note that we reuse `has_changed` to stop the diving, not to indicate whether something has changed (this algorithm does not need that information).
402 auto& stop_diving = *block_data.has_changed;
403 stop.meet(false);
404 stop_diving.meet(false);
405 fp_engine.barrier();
406 size_t remaining_depth = grid_data.root.config.subproblems_power;
407 while(remaining_depth > 0 && !stop_diving && !stop) {
408 remaining_depth--;
409 bool is_leaf_node = propagate(block_data, grid_data);
410 if(threadIdx.x == 0) {
411 if(is_leaf_node) {
412 stop_diving.join(true);
413 }
414 else {
415 size_t branch_idx = (block_data.subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
416 auto branches = cp.eps_split->split();
417 assert(branches.size() == 2);
418 cp.ipc->deduce(branches[branch_idx]);
419 }
420 stop.join(grid_data.cpu_stop || *(grid_data.gpu_stop));
421 }
422 fp_engine.barrier();
423 }
424 return remaining_depth;
425}
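// Worked example (illustrative): with config.subproblems_power == 3 there are 2^3 = 8
// subproblems. For subproblem_idx == 5 (binary 101), the dive reads the path bits from
// the most significant one: remaining_depth 2 -> branch 1, remaining_depth 1 -> branch 0,
// remaining_depth 0 -> branch 1. Each block thus follows a distinct root-to-depth-3 path,
// provided eps_split returns the same two branches in every block (hence the warning
// above about keeping the dived problem identical).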
426
427template <class S>
428__device__ void solve_problem(BlockData<S>& block_data, GridData<S>& grid_data) {
429 using BlockCP = typename S::BlockCP;
430 BlockCP& cp = *block_data.root;
431 auto& fp_engine = *block_data.fp_engine;
432 auto& block_has_changed = *block_data.has_changed;
433 auto& stop = *block_data.stop;
434 block_has_changed.join(true);
435 stop.meet(false);
436 fp_engine.barrier();
437 // In the condition, we must only read variables that are local to this block.
438 // Otherwise, two threads might read different values if they are changed in between by another block.
439 while(block_has_changed && !stop) {
440 update_block_best_bound(block_data, grid_data);
441 propagate(block_data, grid_data);
442 if(threadIdx.x == 0) {
443 stop.join(grid_data.cpu_stop || *(grid_data.gpu_stop));
444 // propagate induces a memory fence, therefore all threads are already past the "while" condition.
445 block_has_changed.meet(cp.search_tree->deduce());
446 }
447 fp_engine.barrier();
448 }
449}
450
451template <class S>
452CUDA void reduce_blocks(GridData<S>* grid_data) {
453 for(int i = 0; i < grid_data->blocks.size(); ++i) {
454 if(grid_data->blocks[i].root) { // `nullptr` could happen if we try to terminate the program before all blocks are even created.
455 grid_data->root.meet(*(grid_data->blocks[i].root));
456 }
457 }
458}
459
460template <class S>
461__global__ void gpu_solve_kernel(GridData<S>* grid_data)
462{
463 if(threadIdx.x == 0 && blockIdx.x == 0 && grid_data->root.config.verbose_solving) {
464 printf("%% GPU kernel started, starting solving...\n");
465 }
466 extern __shared__ unsigned char shared_mem[];
467 size_t num_subproblems = grid_data->root.stats.eps_num_subproblems;
468 BlockData<S>& block_data = grid_data->blocks[blockIdx.x];
469 block_data.allocate(*grid_data, shared_mem);
470 while(block_data.subproblem_idx < num_subproblems && !*(block_data.stop)) {
471 if(threadIdx.x == 0 && grid_data->root.config.verbose_solving) {
472 grid_data->print_lock->acquire();
473 printf("%% Block %d solves subproblem num %" PRIu64 "\n", blockIdx.x, block_data.subproblem_idx);
474 grid_data->print_lock->release();
475 }
476 block_data.restore();
477 cooperative_groups::this_thread_block().sync();
478 size_t remaining_depth = dive(block_data, *grid_data);
479 if(remaining_depth == 0) {
480 solve_problem(block_data, *grid_data);
481 if(threadIdx.x == 0 && !*(block_data.stop)) {
482 block_data.root->stats.eps_solved_subproblems += 1;
483 }
484 }
485 else {
486 if(threadIdx.x == 0 && !*(block_data.stop)) {
487 size_t next_subproblem_idx = ((block_data.subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
488 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));
489 // It is possible that several blocks skip similar subproblems. Hence, we only count the subproblems skipped by the block solving the leftmost subproblem.
490 if((block_data.subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
491 block_data.root->stats.eps_skipped_subproblems += next_subproblem_idx - block_data.subproblem_idx;
492 }
493 }
494 }
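// Worked example (illustrative): if subproblems_power == 3 and the dive of
// subproblem_idx == 4 (binary 100) hits a leaf at remaining_depth == 2, then
// next_subproblem_idx = ((4 >> 2) + 1) << 2 == 8, so the whole subtree 100..111
// (subproblems 4 to 7) is skipped. Since 4 & ((1 << 2) - 1) == 0, this block is the
// leftmost one of that subtree and accounts for the 8 - 4 = 4 skipped subproblems.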
495 // Load next problem.
496 if(threadIdx.x == 0 && !*(block_data.stop)) {
497 block_data.subproblem_idx = grid_data->next_subproblem->value();
498 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(block_data.subproblem_idx + size_t{1}));
499 }
500 cooperative_groups::this_thread_block().sync();
501 }
502 cooperative_groups::this_thread_block().sync();
503 if(threadIdx.x == 0 && !*(block_data.stop)) {
504 block_data.root->stats.num_blocks_done = 1;
505 }
506 if(threadIdx.x == 0) {
507 grid_data->print_lock->acquire();
508 if(!grid_data->blocks_reduced) {
509 int n = 0;
510 for(int i = 0; i < grid_data->blocks.size(); ++i) {
511 if(grid_data->blocks[i].root) { // `nullptr` could happen if we try to terminate the program before all blocks are even created.
512 n += grid_data->blocks[i].root->stats.num_blocks_done;
513 }
514 }
515 if(block_data.stop->value() || n == grid_data->blocks.size()) {
516 reduce_blocks(grid_data);
517 grid_data->blocks_reduced = true;
518 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
519 grid_data->ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
520 grid_data->ready_to_consume.notify_one();
521 }
522 }
523 grid_data->print_lock->release();
524 }
525 // We must destroy all objects allocated in the shared memory; trying to destroy them anywhere else will lead to a segfault.
526 block_data.deallocate_shared();
527}
528
529template <class T> __global__ void gpu_sizeof_kernel(size_t* size) { *size = sizeof(T); }
530template <class T>
531size_t gpu_sizeof() {
532 auto s = bt::make_unique<size_t, bt::managed_allocator>();
533 gpu_sizeof_kernel<T><<<1, 1>>>(s.get());
534 CUDAEX(cudaDeviceSynchronize());
535 return *s;
536}
537
538template <class S, class U>
539size_t sizeof_store(const CP<U>& root) {
540 return gpu_sizeof<typename S::BlockCP::IStore>()
541 + gpu_sizeof<typename S::BlockCP::IStore::universe_type>() * root.store->vars();
542}
543
544/** \returns the size of the shared memory and the kind of memory used. */
545template <class S, class U>
546MemoryConfig configure_memory(CP<U>& root) {
547 cudaDeviceProp deviceProp;
548 cudaGetDeviceProperties(&deviceProp, 0);
549 const auto& config = root.config;
550 size_t shared_mem_capacity = deviceProp.sharedMemPerBlock;
551
552 // Copy the root to know how large the abstract domains are.
553 CP<U> root2(root);
554
555 size_t store_alignment = 200; // The store does not create much alignment overhead since it is almost only a single array.
556
557 MemoryConfig mem_config;
558 // Need a bit of shared memory for the fixpoint engine.
559 mem_config.shared_bytes = 100;
560 mem_config.store_bytes = sizeof_store<S>(root2) + store_alignment;
561 // We add 20% extra memory due to the alignment of the shared memory which is not taken into account in the statistics.
562 // From limited experiments, alignment overhead is usually around 10%.
563 mem_config.pc_bytes = root2.prop_allocator.total_bytes_allocated();
564 mem_config.pc_bytes += mem_config.pc_bytes / 5;
565 if(config.only_global_memory || shared_mem_capacity < mem_config.shared_bytes + mem_config.store_bytes) {
566 if(!config.only_global_memory && config.verbose_solving) {
567 printf("%% The store of variables (%zuKB) cannot be stored in the shared memory of the GPU (%zuKB), therefore we use the global memory.\n",
568 mem_config.store_bytes / 1000,
569 shared_mem_capacity / 1000);
570 }
571 mem_config.mem_kind = MemoryKind::GLOBAL;
572 }
573 else if(shared_mem_capacity > mem_config.shared_bytes + mem_config.store_bytes + mem_config.pc_bytes) {
574 if(config.verbose_solving) {
575 printf("%% The store of variables and the propagators (%zuKB) are stored in the shared memory of the GPU (%zuKB).\n",
576 (mem_config.shared_bytes + mem_config.store_bytes + mem_config.pc_bytes) / 1000,
577 shared_mem_capacity / 1000);
578 }
579 mem_config.shared_bytes += mem_config.store_bytes + mem_config.pc_bytes;
580 mem_config.mem_kind = MemoryKind::STORE_PC_SHARED;
581 }
582 else {
583 if(config.verbose_solving) {
584 printf("%% The store of variables (%zuKB) is stored in the shared memory of the GPU (%zuKB).\n",
585 mem_config.store_bytes / 1000,
586 shared_mem_capacity / 1000);
587 }
588 mem_config.shared_bytes += mem_config.store_bytes;
589 mem_config.mem_kind = MemoryKind::STORE_SHARED;
590 }
591 if(config.verbose_solving) {
592 print_memory_statistics("store_memory_real", root2.store_allocator.total_bytes_allocated());
593 print_memory_statistics("pc_memory_real", root2.prop_allocator.total_bytes_allocated());
594 print_memory_statistics("other_memory_real", root2.basic_allocator.total_bytes_allocated());
595 }
596 return mem_config;
597}
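// Worked example (illustrative numbers): suppose shared_mem_capacity == 48KB,
// store_bytes == 8KB and pc_bytes == 60KB. The first test (fall back to GLOBAL) fails
// because 100B + 8KB fits in 48KB; the second test fails too because 100B + 8KB + 60KB
// does not fit; so only the store goes to shared memory (STORE_SHARED) and the
// propagators stay in global memory. With pc_bytes == 30KB instead, everything fits and
// STORE_PC_SHARED is selected.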
598
599/** Wait until the solving ends, either because of a timeout, CTRL-C, or because the kernel finished. */
600template<class S, class Timepoint>
601bool wait_solving_ends(GridData<S>& grid_data, const Timepoint& start) {
602 cudaEvent_t event;
603 cudaEventCreateWithFlags(&event,cudaEventDisableTiming);
604 cudaEventRecord(event);
605 while(!must_quit() && check_timeout(grid_data.root, start) && cudaEventQuery(event) == cudaErrorNotReady) {
606 std::this_thread::sleep_for(std::chrono::milliseconds(100));
607 }
608 if(cudaEventQuery(event) == cudaErrorNotReady) {
609 grid_data.cpu_stop = true;
610 grid_data.root.prune();
611 return true;
612 }
613 else {
614 cudaError error = cudaDeviceSynchronize();
615 if(error == cudaErrorIllegalAddress) {
616 printf("%% ERROR: CUDA kernel failed due to an illegal memory access. This might be due to a stack overflow because the stack is too small. Try increasing the stack size with the option -stack. If it does not work, please report it as a bug.\n");
617 exit(EXIT_FAILURE);
618 }
619 CUDAEX(error);
620 return false;
621 }
622}
623
624template <class S>
625void consume_kernel_solutions(GridData<S>& grid_data) {
626 while(!grid_data.consume_solution()) {}
627}
628
629template <class S, class U, class Timepoint>
630void transfer_memory_and_run(CP<U>& root, MemoryConfig mem_config, const Timepoint& start) {
631 using concurrent_allocator = typename S::concurrent_allocator;
632 auto grid_data = bt::make_shared<GridData<S>, concurrent_allocator>(std::move(root), mem_config);
633 initialize_grid_data<<<1,1>>>(grid_data.get());
634 CUDAEX(cudaDeviceSynchronize());
635 if(grid_data->root.config.print_statistics) {
636 mem_config.print_mzn_statistics();
637 }
638 std::thread consumer_thread(consume_kernel_solutions<S>, std::ref(*grid_data));
639 gpu_solve_kernel
640 <<<static_cast<unsigned int>(grid_data->root.config.or_nodes),
641 static_cast<unsigned int>(grid_data->root.config.and_nodes),
642 grid_data->mem_config.shared_bytes>>>
643 (grid_data.get());
644 bool interrupted = wait_solving_ends(*grid_data, start);
645 consumer_thread.join();
646 CUDAEX(cudaDeviceSynchronize());
647 deallocate_grid_data<<<1,1>>>(grid_data.get());
648 CUDAEX(cudaDeviceSynchronize());
649}
650
651// From https://stackoverflow.com/a/32531982/2231159
652int threads_per_sm(cudaDeviceProp devProp) {
653 switch (devProp.major){
654 case 2: return (devProp.minor == 1) ? 48 : 32; // Fermi
655 case 3: return 192; // Kepler
656 case 5: return 128; // Maxwell
657 case 6: return (devProp.minor == 0) ? 64 : 128; // Pascal
658 case 7: return 64; // Volta and Turing
659 case 8: return (devProp.minor == 0) ? 64 : 128; // Ampere
660 case 9: return 128; // Hopper
661 default: return 64;
662 }
663}
664
665template <class S, class U>
666void configure_blocks_threads(CP<U>& root, const MemoryConfig& mem_config) {
667 int hint_num_blocks;
668 int hint_num_threads;
669 CUDAE(cudaOccupancyMaxPotentialBlockSize(&hint_num_blocks, &hint_num_threads, (void*) gpu_solve_kernel<S>, (int)mem_config.shared_bytes));
670
671 cudaDeviceProp deviceProp;
672 cudaGetDeviceProperties(&deviceProp, 0);
673
674 size_t total_global_mem = deviceProp.totalGlobalMem;
675 size_t num_sm = deviceProp.multiProcessorCount;
676 size_t num_threads_per_sm = threads_per_sm(deviceProp);
677
678 auto& config = root.config;
679 config.or_nodes = (config.or_nodes == 0) ? hint_num_blocks : config.or_nodes;
680 config.and_nodes = (config.and_nodes == 0) ? hint_num_threads : config.and_nodes;
681
682 if(config.and_nodes > deviceProp.maxThreadsPerBlock) {
683 if(config.verbose_solving) {
684 printf("%% WARNING: -and %zu is too high for this GPU, we use the maximum %d instead.\n", config.and_nodes, deviceProp.maxThreadsPerBlock);
685 }
686 config.and_nodes = deviceProp.maxThreadsPerBlock;
687 }
688
689 // The stack allocated depends on the maximum number of threads per SM, not on the actual number of threads per block.
690 size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * config.stack_kb * 1000;
691 size_t remaining_global_mem = total_global_mem - total_stack_size;
692 remaining_global_mem -= remaining_global_mem / 10; // We leave 10% of global memory free for CUDA allocations, not sure if it is useful though.
693
694 // Basically the size of the store and the propagators, plus 100 bytes per variable.
695 // +1 for the root node in GridCP. We decrease `or_nodes` until the estimation fits in the remaining global memory.
696 size_t heap_usage_estimation = (config.or_nodes + 1) * (mem_config.pc_bytes + mem_config.store_bytes + 100 * root.store->vars());
697 while(heap_usage_estimation > remaining_global_mem && config.or_nodes > 1) {
698 config.or_nodes--; heap_usage_estimation -= mem_config.pc_bytes + mem_config.store_bytes + 100 * root.store->vars();
699 }
700
701 CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
702 CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem));
703
704 if(config.verbose_solving) {
705 print_memory_statistics("stack_memory", total_stack_size);
706 print_memory_statistics("heap_memory", remaining_global_mem);
707 print_memory_statistics("heap_usage_estimation", heap_usage_estimation);
708 printf("%% and_nodes=%zu\n", config.and_nodes);
709 printf("%% or_nodes=%zu\n", config.or_nodes);
710 }
711}
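// Worked example (illustrative numbers): with or_nodes == 100, pc_bytes == 2MB,
// store_bytes == 50KB and 1000 variables, heap_usage_estimation is roughly
// 101 * (2MB + 50KB + 100KB), about 217MB. Only if remaining_global_mem were smaller
// than this would or_nodes be decreased until the estimation fits.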
712
713template <class S, class U, class Timepoint>
714void configure_and_run(CP<U>& root, const Timepoint& start) {
715 MemoryConfig mem_config = configure_memory<S>(root);
716 configure_blocks_threads<S>(root, mem_config);
717 transfer_memory_and_run<S>(root, mem_config, start);
718}
719
720void check_support_unified_memory() {
721 int attr = 0;
722 int dev = 0;
723 CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrManagedMemory, dev));
724 if (!attr) {
725 std::cerr << "The GPU does not support unified memory." << std::endl;
726 exit(EXIT_FAILURE);
727 }
728}
729
730void check_support_concurrent_managed_memory() {
731 int attr = 0;
732 int dev = 0;
733 CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrConcurrentManagedAccess, dev));
734 if (!attr) {
735#ifdef NO_CONCURRENT_MANAGED_MEMORY
736 printf("%% WARNING: The GPU does not support concurrent access to managed memory, hence we fall back on pinned memory.\n");
737 /** Set cudaDeviceMapHost to allow cudaMallocHost() to allocate pinned memory
738 * for concurrent access between the device and the host. It must be called
739 * early, before any CUDA management functions, so that we can fall back to
740 * using the pinned_allocator instead of the managed_allocator.
741 * This is required on Windows, WSL, macOS, and NVIDIA GRID.
742 * See also PR #18.
743 */
744 unsigned int flags = 0;
745 CUDAEX(cudaGetDeviceFlags(&flags));
746 flags |= cudaDeviceMapHost;
747 CUDAEX(cudaSetDeviceFlags(flags));
748#else
749 printf("%% To run Turbo on this GPU you need to build Turbo with the option NO_CONCURRENT_MANAGED_MEMORY.\n");
750 exit(EXIT_FAILURE);
751#endif
752 }
753}
754
755#endif // __CUDACC__
756
757void gpu_dive_and_solve(Configuration<bt::standard_allocator>& config) {
758#ifndef __CUDACC__
759 std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
760#else
761 check_support_unified_memory();
762 check_support_concurrent_managed_memory();
763 auto start = std::chrono::high_resolution_clock::now();
764 CP<Itv> root(config);
765 root.preprocess();
766 block_signal_ctrlc();
767#ifdef NO_CONCURRENT_MANAGED_MEMORY
768 if(root.config.noatomics) {
769 configure_and_run<ItvSolverPinnedNoAtomics>(root, start);
770 }
771 else {
772 configure_and_run<ItvSolverPinned>(root, start);
773 }
774#else
775 if(root.config.noatomics) {
776 configure_and_run<ItvSolverNoAtomics>(root, start);
777 }
778 else {
779 configure_and_run<ItvSolver>(root, start);
780 }
781#endif
782#endif
783}
784
785#endif // TURBO_GPU_DIVE_AND_SOLVE_HPP
bool must_quit()
Definition common_solving.hpp:89
void block_signal_ctrlc()
Definition common_solving.hpp:77
bool check_timeout(A &a, const Timepoint &start)
Definition common_solving.hpp:101
void gpu_dive_and_solve(Configuration< bt::standard_allocator > &config)
Definition gpu_dive_and_solve.hpp:757
void print_memory_statistics(const char *key, size_t bytes)
Definition statistics.hpp:12
Definition common_solving.hpp:156
Configuration< BasicAllocator > config
Definition common_solving.hpp:283
abstract_ptr< IStore > store
Definition common_solving.hpp:268
void preprocess()
Definition common_solving.hpp:462
Definition config.hpp:29
size_t or_nodes
Definition config.hpp:41
bool noatomics
Definition config.hpp:39
Definition common_solving.hpp:119
CUDA void * allocate(size_t bytes)
Definition common_solving.hpp:127