Turbo Constraint Solver
gpu_dive_and_solve.hpp
Go to the documentation of this file.
1// Copyright 2023 Pierre Talbot
2
3#ifndef TURBO_GPU_DIVE_AND_SOLVE_HPP
4#define TURBO_GPU_DIVE_AND_SOLVE_HPP
5
6#include "common_solving.hpp"
7#include "memory_gpu.hpp"
8#include <thread>
9#include <algorithm>
10
11namespace bt = ::battery;
12
13#ifdef __CUDACC__
14
15#include <cuda/std/chrono>
16#include <cuda/semaphore>
17
18template <
19 class Universe0, // Universe used locally to one thread.
20 class Universe1, // Universe used in the scope of a block.
21 class Universe2, // Universe used in the scope of a grid.
22 class ConcurrentAllocator> // This allocator allocates memory that can be accessed by both the CPU and the GPU. See PR #18 for the reason (basically, non-Linux systems do not support concurrent managed memory (accessed from CPU and GPU) and must rely on pinned memory instead).
23struct StateTypes
24{
25 using U0 = Universe0;
26 using U1 = Universe1;
27 using U2 = Universe2;
28 using concurrent_allocator = ConcurrentAllocator;
29
30 /** We first interpret the formula in an abstract domain with sequential memory, allocated by the concurrent allocator (accessible from both the CPU and the GPU); we call it `GridCP`. */
31 using GridCP = AbstractDomains<U0,
32 bt::statistics_allocator<ConcurrentAllocator>,
33 bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 0>>,
34 bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 1>>>;
35
36 /** Then, once everything is initialized, we rely on a parallel abstract domain called `BlockCP`, usually using atomic shared and global memory. */
37 using BlockCP = AbstractDomains<U1,
38 bt::global_allocator,
39 bt::pool_allocator,
40 bt::pool_allocator>;
41};
42
43using Itv0 = Interval<ZLB<bound_value_type, bt::local_memory>>;
44using Itv1 = Interval<ZLB<bound_value_type, bt::atomic_memory_block>>;
45using Itv2 = Interval<ZLB<bound_value_type, bt::atomic_memory_grid>>;
46using AtomicBool = B<bt::atomic_memory_block>;
47using FPEngine = FixpointSubsetGPU<BlockAsynchronousFixpointGPU<true>, bt::global_allocator, CUDA_THREADS_PER_BLOCK>;
48
49// Version for non-Linux systems such as Windows where pinned memory must be used (see PR #19).
50#ifdef NO_CONCURRENT_MANAGED_MEMORY
51 using ItvSolverPinned = StateTypes<Itv0, Itv1, Itv2, bt::pinned_allocator>;
52#else
53 // using ItvSolver = StateTypes<Itv0, Itv1, Itv2, bt::managed_allocator>;
54 using ItvSolver = StateTypes<Itv0, Itv0, Itv0, bt::managed_allocator>;
55#endif
56
57template <class S>
58struct BlockData;
59
60/** `GridData` represents the problem to be solved (`root`), the data of each block when running EPS (`blocks`), the index of the next subproblem to be solved (`next_subproblem`), an estimate of the best bound found across subproblems (for optimization problems), and other global information.
61 */
62template <class S>
63struct GridData {
64 using GridCP = typename S::GridCP;
65 using BlockCP = typename S::BlockCP;
66 using U2 = typename S::U2;
67
68 GridCP root;
69 // `blocks_root` is a copy of `root` but with the same allocators as the ones used in `blocks`.
70 // This is helpful to share immutable data among blocks (for instance the propagators).
71 bt::shared_ptr<BlockCP, bt::global_allocator> blocks_root;
72 // Stop from the CPU, for instance because of a timeout.
73 cuda::std::atomic_flag cpu_stop;
74 // Boolean indicating that the blocks have been reduced, and the CPU can now print the statistics.
75 volatile bool blocks_reduced;
76 MemoryConfig mem_config;
77 bt::vector<BlockData<S>, bt::global_allocator> blocks;
78 // Stop from a block on the GPU, for instance because we found a solution.
79 bt::shared_ptr<B<bt::atomic_memory_grid>, bt::global_allocator> gpu_stop;
80 bt::shared_ptr<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator> next_subproblem;
81 bt::shared_ptr<U2, bt::global_allocator> best_bound;
82
83 // All of what follows is only to support printing while the kernel is running.
84 // In particular, we transfer the solution to the CPU where it is printed, because printing on the GPU can be very slow when the problem is large.
85 bt::shared_ptr<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator> print_lock;
86 cuda::std::atomic_flag ready_to_produce;
87 cuda::std::atomic_flag ready_to_consume;
88
89 GridData(const GridCP& root, const MemoryConfig& mem_config)
90 : root(root)
91 , mem_config(mem_config)
92 , cpu_stop(false)
93 , blocks_reduced(false)
94 {
95 ready_to_consume.clear();
96 ready_to_produce.clear();
97 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
98 }
99
100 template <class BlockBAB>
101 __device__ void produce_solution(const BlockBAB& bab) {
102 print_lock->acquire();
103 if(!cpu_stop.test()) {
104 bab.extract(*(root.bab));
105 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
106 ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
107 ready_to_consume.notify_one();
108 // Wait until the CPU has consumed the solution before continuing.
109 // This avoids a race where `blocks_reduced` is observed as `true` too early.
110 ready_to_produce.wait(false, cuda::std::memory_order_seq_cst);
111 ready_to_produce.clear();
112 }
113 print_lock->release();
114 }
115
116 __host__ bool consume_solution() {
117 ready_to_consume.wait(false, cuda::std::memory_order_seq_cst);
118 ready_to_consume.clear();
119 if(blocks_reduced) {
120 root.print_final_solution();
121 if(root.config.print_statistics) {
122 root.print_mzn_statistics();
123 }
124 return true;
125 }
126 else {
127 root.print_solution();
128 }
129 ready_to_produce.test_and_set(cuda::std::memory_order_seq_cst);
130 ready_to_produce.notify_one();
131 return false;
132 }
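  // Informal sketch of the handshake implemented by `produce_solution`/`consume_solution` above
  // (only the members declared in this struct are involved):
  //   GPU block : acquire print_lock -> copy its solution into root.bab -> fence
  //               -> set ready_to_consume and notify -> wait for ready_to_produce -> release print_lock
  //               (skipped entirely if a CPU stop was requested).
  //   CPU thread: wait for ready_to_consume -> print the intermediate solution, or the final
  //               solution and statistics once blocks_reduced is set -> set ready_to_produce and notify.
  // Printing is therefore serialized on the CPU while the kernel keeps running on the GPU.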
133
134 __device__ void allocate() {
135 assert(threadIdx.x == 0 && blockIdx.x == 0);
136 auto root_mem_config(mem_config);
137 root_mem_config.mem_kind = MemoryKind::GLOBAL;
138 blocks_root = bt::make_shared<BlockCP, bt::global_allocator>(
139 typename BlockCP::tag_gpu_block_copy{},
140 false, // Due to the different allocators of BlockCP and GridCP, data cannot be shared anyway.
141 root,
142 bt::global_allocator{},
143 root_mem_config.make_prop_pool(bt::pool_allocator(nullptr,0)),
144 root_mem_config.make_store_pool(bt::pool_allocator(nullptr,0)));
145 blocks = bt::vector<BlockData<S>, bt::global_allocator>(root.stats.num_blocks);
146 gpu_stop = bt::make_shared<B<bt::atomic_memory_grid>, bt::global_allocator>(false);
147 print_lock = bt::make_shared<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator>(1);
148 next_subproblem = bt::make_shared<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator>(0);
149 best_bound = bt::make_shared<U2, bt::global_allocator>();
150 }
151
152 __device__ void deallocate() {
153 assert(threadIdx.x == 0 && blockIdx.x == 0);
154 blocks = bt::vector<BlockData<S>, bt::global_allocator>();
155 blocks_root->deallocate();
156 blocks_root.reset();
157 gpu_stop.reset();
158 print_lock.reset();
159 next_subproblem.reset();
160 best_bound.reset();
161 }
162};
163
164/** `BlockData` contains all the structures required to solve a subproblem including the problem itself (`root`) and the fixpoint engine (`fp_engine`). */
165template <class S>
166struct BlockData {
167 using GridCP = typename S::GridCP;
168 using BlockCP = typename S::BlockCP;
169
170 using snapshot_type = typename BlockCP::IST::snapshot_type<bt::global_allocator>;
171 size_t subproblem_idx;
172 bt::shared_ptr<FPEngine, bt::pool_allocator> fp_engine;
173 bt::shared_ptr<AtomicBool, bt::pool_allocator> has_changed;
174 bt::shared_ptr<AtomicBool, bt::pool_allocator> stop;
175 bt::shared_ptr<BlockCP, bt::global_allocator> root;
176 bt::shared_ptr<snapshot_type, bt::global_allocator> snapshot_root;
177
178 __device__ BlockData():
179 has_changed(nullptr, bt::pool_allocator(nullptr, 0)),
180 stop(nullptr, bt::pool_allocator(nullptr, 0))
181 {}
182
183public:
184 /** Initialize the block data.
185 * Allocate the abstract domains in the best memory available, depending on how large the abstract domains are. */
186 __device__ void allocate(GridData<S>& grid_data, unsigned char* shared_mem) {
187 auto block = cooperative_groups::this_thread_block();
188 if(threadIdx.x == 0) {
189 subproblem_idx = blockIdx.x;
190 MemoryConfig& mem_config = grid_data.mem_config;
191 bt::pool_allocator shared_mem_pool(mem_config.make_shared_pool(shared_mem));
192 fp_engine = bt::allocate_shared<FPEngine, bt::pool_allocator>(shared_mem_pool);
193 has_changed = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, true);
194 stop = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, false);
195 root = bt::make_shared<BlockCP, bt::global_allocator>(typename BlockCP::tag_gpu_block_copy{},
196 (mem_config.mem_kind != MemoryKind::TCN_SHARED),
197 *(grid_data.blocks_root),
198 bt::global_allocator{},
199 mem_config.make_prop_pool(shared_mem_pool),
200 mem_config.make_store_pool(shared_mem_pool));
201 snapshot_root = bt::make_shared<snapshot_type, bt::global_allocator>(root->search_tree->template snapshot<bt::global_allocator>());
202 }
203 block.sync();
204 fp_engine->init(root->iprop->num_deductions());
205 block.sync();
206 }
207
208 __device__ void deallocate_shared() {
209 if(threadIdx.x == 0) {
210 fp_engine.reset();
211 has_changed.reset();
212 stop.reset();
213 root->deallocate();
214 snapshot_root.reset();
215 }
216 __syncthreads();
217 }
218
219 __device__ void restore() {
220 if(threadIdx.x == 0) {
221 root->search_tree->restore(*snapshot_root);
222 }
223 __syncthreads();
224 }
225};
226
227template <class S>
228__global__ void initialize_grid_data(GridData<S>* grid_data) {
229 grid_data->allocate();
230 size_t num_subproblems = 1;
231 num_subproblems <<= grid_data->root.config.subproblems_power;
232 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(grid_data->root.stats.num_blocks));
233 grid_data->root.stats.eps_num_subproblems = num_subproblems;
234}
235
236template <class S>
237__global__ void deallocate_grid_data(GridData<S>* grid_data) {
238 grid_data->deallocate();
239}
240
241/** We update the bound found by the current block so that it is visible to all other blocks.
242 * Note that this operation might not always succeed, which is okay: the best bound is still preserved in `block_data` and reduced at the end (in `reduce_blocks`).
243 * The worst that can happen is that the same best bound is found twice, which does not compromise the correctness of the algorithm.
244 */
245template <class S>
246__device__ bool update_grid_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
247 using U0 = typename S::U0;
248 assert(threadIdx.x == 0);
249 if(block_data.root->bab->is_optimization()) {
250 const auto& bab = block_data.root->bab;
251 auto local_best = bab->optimum().project(bab->objective_var());
252 // printf("[new bound] %d: [%d..%d] (current best: [%d..%d])\n", blockIdx.x, local_best.lb().value(), local_best.ub().value(), grid_data.best_bound->lb().value(), grid_data.best_bound->ub().value());
253 if(bab->is_maximization()) {
254 return grid_data.best_bound->meet_lb(dual_bound<typename U0::LB>(local_best.ub()));
255 }
256 else {
257 return grid_data.best_bound->meet_ub(dual_bound<typename U0::UB>(local_best.lb()));
258 }
259 }
260 return false;
261}
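// Informal illustration of the update above: for a maximization problem, the objective value
// proved by this block (`local_best.ub()`) is turned by `dual_bound` into a bound of the opposite
// kind and merged into the shared `best_bound` with `meet_lb`; the returned Boolean indicates
// whether the shared bound was actually tightened. Minimization is symmetric, using
// `local_best.lb()` and `meet_ub`.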
262
263/** This function updates the best bound of the current block according to the best bound found so far across blocks.
264 * We directly update the store with the best bound.
265 * This function should be called at each node, since the best bound is erased on backtracking (it is not included in the snapshot).
266 */
267template <class S>
268__device__ void update_block_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
269 using U0 = typename S::U0;
270 if(threadIdx.x == 0 && block_data.root->bab->is_optimization()) {
271 const auto& bab = block_data.root->bab;
272 VarEnv<bt::global_allocator> empty_env{};
273 auto best_formula = bab->template deinterpret_best_bound<bt::global_allocator>(
274 bab->is_maximization()
275 ? U0(dual_bound<typename U0::UB>(grid_data.best_bound->lb()))
276 : U0(dual_bound<typename U0::LB>(grid_data.best_bound->ub())));
277 // printf("global best: "); grid_data.best_bound->ub().print(); printf("\n");
278 // best_formula.print(); printf("\n");
279 IDiagnostics diagnostics;
280 interpret_and_tell(best_formula, empty_env, *block_data.root->store, diagnostics);
281 }
282}
283
284/** Propagate a node of the search tree and process the leaf nodes (failed or solution).
285 * Branching on unknown nodes is a task left to the caller.
286 */
287template <class S>
288__device__ bool propagate(BlockData<S>& block_data, GridData<S>& grid_data) {
289 using BlockCP = typename S::BlockCP;
290 __shared__ int warp_iterations[CUDA_THREADS_PER_BLOCK/32];
291 warp_iterations[threadIdx.x / 32] = 0;
292 bool is_leaf_node = false;
293 BlockCP& cp = *block_data.root;
294 auto group = cooperative_groups::this_thread_block();
295 auto& fp_engine = *block_data.fp_engine;
296 auto& iprop = *cp.iprop;
297 auto start = cp.stats.start_timer_device();
298 int fp_iterations;
299 switch(cp.config.fixpoint) {
300 case FixpointKind::AC1: {
301 fp_iterations = fp_engine.fixpoint(
302 [&](int i){ return iprop.deduce(i); },
303 [&](){ return iprop.is_bot(); });
304 if(threadIdx.x == 0) {
305 cp.stats.num_deductions += fp_iterations * fp_engine.num_active();
306 }
307 break;
308 }
309 case FixpointKind::WAC1: {
310 if(fp_engine.num_active() <= cp.config.wac1_threshold) {
311 fp_iterations = fp_engine.fixpoint(
312 [&](int i){ return iprop.deduce(i); },
313 [&](){ return iprop.is_bot(); });
314 if(threadIdx.x == 0) {
315 cp.stats.num_deductions += fp_iterations * fp_engine.num_active();
316 }
317 }
318 else {
319 fp_iterations = fp_engine.fixpoint(
320 [&](int i){ return warp_fixpoint<CUDA_THREADS_PER_BLOCK>(iprop, i, warp_iterations); },
321 [&](){ return iprop.is_bot(); });
322 if(threadIdx.x == 0) {
323 for(int i = 0; i < CUDA_THREADS_PER_BLOCK/32; ++i) {
324 cp.stats.num_deductions += warp_iterations[i] * 32;
325 }
326 }
327 }
328 break;
329 }
330 }
331
332 start = cp.stats.stop_timer(Timer::FIXPOINT, start);
333 if(!iprop.is_bot()) {
334 fp_engine.select([&](int i) { return !iprop.ask(i); });
335 start = cp.stats.stop_timer(Timer::SELECT_FP_FUNCTIONS, start);
336 if(fp_engine.num_active() == 0) {
337 is_leaf_node = cp.store->template is_extractable<AtomicExtraction>(group);
338 }
339 }
340 else {
341 is_leaf_node = true;
342 }
343 if(threadIdx.x == 0) {
344 cp.stats.fixpoint_iterations += fp_iterations;
345 bool is_pruned = cp.on_node();
346 if(iprop.is_bot()) {
347 cp.on_failed_node();
348 }
349 else if(is_leaf_node) { // is_leaf_node is set to true above.
350 if(cp.bab->is_satisfaction() || cp.bab->compare_bound(*cp.store, cp.bab->optimum())) {
351 cp.bab->deduce();
352 bool best_has_changed = update_grid_best_bound(block_data, grid_data);
353 if(cp.bab->is_satisfaction() || (best_has_changed && cp.is_printing_intermediate_sol())) {
354 grid_data.produce_solution(*cp.bab);
355 }
356 is_pruned |= cp.update_solution_stats();
357 }
358 }
359 if(is_pruned) {
360 grid_data.gpu_stop->join(true);
361 }
362 cp.stats.stop_timer(Timer::SEARCH, start);
363 }
364 if(is_leaf_node) {
365 fp_engine.reset(cp.iprop->num_deductions());
366 }
367 return is_leaf_node;
368}
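// Informal summary of the fixpoint selection in `propagate` (derived from the code above, not an
// additional specification): with `FixpointKind::AC1`, all threads of the block cooperate on a
// single fixpoint loop and `fp_iterations * fp_engine.num_active()` deductions are counted.
// With `FixpointKind::WAC1`, the same loop is used while the number of active propagators stays
// at or below `wac1_threshold`; above it, the work goes through `warp_fixpoint` and the per-warp
// counters in `warp_iterations` are accumulated instead, each warp accounting for 32 deductions
// per iteration.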
369
370/** The initial problem tackled during the dive must always be the same across blocks.
371 * Hence, do not add the objective bound during diving: it would change the splitting decisions and could lead to some subproblems being ignored.
372 */
373template <class S>
374__device__ size_t dive(BlockData<S>& block_data, GridData<S>& grid_data) {
375 using BlockCP = typename S::BlockCP;
376 BlockCP& cp = *block_data.root;
377 auto& fp_engine = *block_data.fp_engine;
378 auto& stop = *block_data.stop;
379 // Note that we reuse `has_changed` (here named `stop_diving`) to stop the dive, not to indicate that something has changed (this information is not needed by this algorithm).
380 auto& stop_diving = *block_data.has_changed;
381 stop.meet(false);
382 stop_diving.meet(false);
383 fp_engine.barrier();
384 size_t remaining_depth = grid_data.root.config.subproblems_power;
385 while(remaining_depth > 0 && !stop_diving && !stop) {
386 bool is_leaf_node = propagate(block_data, grid_data);
387 stop.join(grid_data.cpu_stop.test() || *(grid_data.gpu_stop));
388 if(is_leaf_node) {
389 if(threadIdx.x == 0) {
390 stop_diving.join(true);
391 }
392 }
393 else {
394 remaining_depth--;
395 if(threadIdx.x == 0) {
396 auto start = cp.stats.start_timer_device();
397 size_t branch_idx = (block_data.subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
398 auto branches = cp.split->split();
399 assert(branches.size() == 2);
400 cp.iprop->deduce(branches[branch_idx]);
401 cp.stats.stop_timer(Timer::SEARCH, start);
402 }
403 }
404 fp_engine.barrier();
405 }
406 return remaining_depth;
407}
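// Informal illustration of the subproblem encoding used in `dive`: the bits of `subproblem_idx`
// spell out the branching decisions from the root, most significant bit first, since
// `branch_idx = (subproblem_idx & (1 << remaining_depth)) >> remaining_depth` extracts the bit at
// position `remaining_depth`. For example, with `subproblems_power == 3` and `subproblem_idx == 5`
// (binary 101):
//   1st split: remaining_depth == 2 -> branch (5 & 0b100) >> 2 == 1
//   2nd split: remaining_depth == 1 -> branch (5 & 0b010) >> 1 == 0
//   3rd split: remaining_depth == 0 -> branch (5 & 0b001) >> 0 == 1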
408
409template <class S>
410__device__ void solve_problem(BlockData<S>& block_data, GridData<S>& grid_data) {
411 using BlockCP = typename S::BlockCP;
412 BlockCP& cp = *block_data.root;
413 auto& fp_engine = *block_data.fp_engine;
414 auto& block_has_changed = *block_data.has_changed;
415 auto& stop = *block_data.stop;
416 block_has_changed.join(true);
417 stop.meet(false);
418 fp_engine.barrier();
419 auto start = cp.stats.start_timer_device();
420 // In the loop condition, we must only read variables that are local to this block.
421 // Otherwise, two threads might read different values if the variable is changed in between by another block.
422 while(block_has_changed && !stop) {
423 update_block_best_bound(block_data, grid_data);
424 cp.stats.stop_timer(Timer::SEARCH, start);
425 propagate(block_data, grid_data);
426 if(threadIdx.x == 0) {
427 start = cp.stats.start_timer_device();
428 stop.join(grid_data.cpu_stop.test() || *(grid_data.gpu_stop));
429 // propagate induces a memory fence, therefore all threads are already past the "while" condition.
430 // auto t = cp.stats.start_timer_device();
431 block_has_changed.meet(cp.search_tree->deduce());
432 // cp.stats.stop_timer(Timer::ST_DEDUCE, t);
433 // auto t = cp.stats.start_timer_device();
434 // auto s = cp.search_tree->split->split();
435 // t = cp.stats.stop_timer(Timer::SPLIT, t);
436 // auto P = cp.search_tree->push(std::move(s));
437 // t = cp.stats.stop_timer(Timer::PUSH, t);
438 // block_has_changed.meet(cp.search_tree->pop(std::move(P)));
439 // cp.stats.stop_timer(Timer::POP, t);
440 }
441 fp_engine.barrier();
442 }
443 cp.stats.stop_timer(Timer::SEARCH, start);
444}
445
446template <class S>
447CUDA void reduce_blocks(GridData<S>* grid_data) {
448 for(int i = 0; i < grid_data->blocks.size(); ++i) {
449 if(grid_data->blocks[i].root) { // `nullptr` could happen if we try to terminate the program before all blocks are even created.
450 grid_data->root.meet(*(grid_data->blocks[i].root));
451 }
452 }
453}
454
455template <class S>
456__global__ void gpu_solve_kernel(GridData<S>* grid_data)
457{
458 if(threadIdx.x == 0 && blockIdx.x == 0 && grid_data->root.config.verbose_solving) {
459 printf("%% GPU kernel started, starting solving...\n");
460 }
461 extern __shared__ unsigned char shared_mem[];
462 size_t num_subproblems = grid_data->root.stats.eps_num_subproblems;
463 BlockData<S>& block_data = grid_data->blocks[blockIdx.x];
464 block_data.allocate(*grid_data, shared_mem);
465 auto solve_start = block_data.root->stats.start_timer_device();
466 while(block_data.subproblem_idx < num_subproblems && !*(block_data.stop)) {
467 if(threadIdx.x == 0 && grid_data->root.config.verbose_solving >= 2) {
468 grid_data->print_lock->acquire();
469 printf("%% Block %d solves subproblem num %" PRIu64 "\n", blockIdx.x, block_data.subproblem_idx);
470 grid_data->print_lock->release();
471 }
472 block_data.restore();
473 __syncthreads();
474 auto dive_start = block_data.root->stats.start_timer_device();
475 size_t remaining_depth = dive(block_data, *grid_data);
476 block_data.root->stats.stop_timer(Timer::DIVE, dive_start);
477 if(remaining_depth == 0) {
478 solve_problem(block_data, *grid_data);
479 if(threadIdx.x == 0 && !*(block_data.stop)) {
480 block_data.root->stats.eps_solved_subproblems += 1;
481 }
482 }
483 else {
484 if(threadIdx.x == 0 && !*(block_data.stop)) {
485 size_t next_subproblem_idx = ((block_data.subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
486 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));
487 // It is possible that several blocks skip similar subproblems. Hence, we only count the subproblems skipped by the block solving the leftmost subproblem.
488 if((block_data.subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
489 block_data.root->stats.eps_skipped_subproblems += next_subproblem_idx - block_data.subproblem_idx;
490 }
491 }
492 }
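    // Informal illustration of the skipping above:
    // `((subproblem_idx >> remaining_depth) + 1) << remaining_depth` rounds `subproblem_idx` up to
    // the next multiple of 2^remaining_depth. If a block dives towards subproblem 5 (binary 101)
    // and stops with `remaining_depth == 2`, the node it reached is shared by subproblems 4..7, so
    // the next fresh subproblem is 8. Only the block whose index has all of its `remaining_depth`
    // low bits at 0 (here, the block on subproblem 4) counts the skipped subproblems, so they are
    // not counted several times.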
493 // Load next problem.
494 if(threadIdx.x == 0 && !*(block_data.stop)) {
495 block_data.subproblem_idx = grid_data->next_subproblem->value();
496 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(block_data.subproblem_idx + size_t{1}));
497 }
498 __syncthreads();
499 }
500 __syncthreads();
501 block_data.root->stats.stop_timer(Timer::SOLVE, solve_start);
502 if(threadIdx.x == 0 && !*(block_data.stop)) {
503 block_data.root->stats.num_blocks_done = 1;
504 }
505 if(threadIdx.x == 0) {
506 grid_data->print_lock->acquire();
507 if(!grid_data->blocks_reduced) {
508 int n = 0;
509 for(int i = 0; i < grid_data->blocks.size(); ++i) {
510 if(grid_data->blocks[i].root) { // `nullptr` could happen if we try to terminate the program before all blocks are even created.
511 n += grid_data->blocks[i].root->stats.num_blocks_done;
512 }
513 }
514 if(block_data.stop->value() || n == grid_data->blocks.size()) {
515 reduce_blocks(grid_data);
516 grid_data->blocks_reduced = true;
517 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
518 grid_data->ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
519 grid_data->ready_to_consume.notify_one();
520 }
521 }
522 grid_data->print_lock->release();
523 }
524 // We must destroy all objects allocated in shared memory here; trying to destroy them anywhere else will lead to a segfault.
525 block_data.deallocate_shared();
526}
527
528template <class S, class U>
529size_t sizeof_store(const CP<U>& root) {
530 return gpu_sizeof<typename S::BlockCP::IStore>()
531 + gpu_sizeof<typename S::BlockCP::IStore::universe_type>() * root.store->vars();
532}
533
534/** \returns the size of the shared memory and the kind of memory used. */
535template <class S, class U>
536MemoryConfig configure_memory(CP<U>& root) {
537 cudaDeviceProp deviceProp;
538 cudaGetDeviceProperties(&deviceProp, 0);
539 const auto& config = root.config;
540 size_t shared_mem_capacity = deviceProp.sharedMemPerBlock;
541
542 // Copy the root to know how large the abstract domains are.
543 CP<U> root2(root);
544
545 size_t store_alignment = 200; // The store does not create much alignment overhead since it is almost only a single array.
546
547 MemoryConfig mem_config;
548 // Need a bit of shared memory for the fixpoint engine.
549 mem_config.shared_bytes = sizeof(FPEngine)+100;
550 mem_config.store_bytes = sizeof_store<S>(root2) + store_alignment;
551 // We add 20% extra memory to account for the alignment of the shared memory, which is not reflected in the statistics.
552 // From limited experiments, the alignment overhead is usually around 10%.
553 mem_config.prop_bytes = root2.prop_allocator.total_bytes_allocated();
554 mem_config.prop_bytes += mem_config.prop_bytes / 5;
555 if(config.only_global_memory || shared_mem_capacity < mem_config.shared_bytes + mem_config.store_bytes) {
556 if(!config.only_global_memory && config.verbose_solving) {
557 printf("%% The store of variables (%zuKB) cannot be stored in the shared memory of the GPU (%zuKB), therefore we use the global memory.\n",
558 mem_config.store_bytes / 1000,
559 shared_mem_capacity / 1000);
560 }
561 mem_config.mem_kind = MemoryKind::GLOBAL;
562 }
563 else if(shared_mem_capacity > mem_config.shared_bytes + mem_config.store_bytes + mem_config.prop_bytes) {
564 if(config.verbose_solving) {
565 printf("%% The store of variables and the propagators (%zuKB) are stored in the shared memory of the GPU (%zuKB).\n",
566 (mem_config.shared_bytes + mem_config.store_bytes + mem_config.prop_bytes) / 1000,
567 shared_mem_capacity / 1000);
568 }
569 mem_config.shared_bytes += mem_config.store_bytes + mem_config.prop_bytes;
570 mem_config.mem_kind = MemoryKind::TCN_SHARED;
571 }
572 else {
573 if(config.verbose_solving) {
574 printf("%% The store of variables (%zuKB) is stored in the shared memory of the GPU (%zuKB).\n",
575 mem_config.store_bytes / 1000,
576 shared_mem_capacity / 1000);
577 }
578 mem_config.shared_bytes += mem_config.store_bytes;
579 mem_config.mem_kind = MemoryKind::STORE_SHARED;
580 }
581 root.stats.print_memory_statistics(config.verbose_solving, "store_memory_real", root2.store_allocator.total_bytes_allocated());
582 root.stats.print_memory_statistics(config.verbose_solving, "prop_memory_real", root2.prop_allocator.total_bytes_allocated());
583 root.stats.print_memory_statistics(config.verbose_solving, "other_memory_real", root2.basic_allocator.total_bytes_allocated());
584 return mem_config;
585}
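// Informal numeric example of the decision above (the figures are illustrative, not measured):
// with `sharedMemPerBlock` around 48KB, a store of 10KB and propagators of 100KB, only the store
// fits in shared memory, so `MemoryKind::STORE_SHARED` is selected; if the propagators only took
// 20KB, everything would fit and `MemoryKind::TCN_SHARED` would be selected; and if the store
// alone exceeded the shared memory capacity (or `only_global_memory` was requested), the solver
// would fall back to `MemoryKind::GLOBAL`.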
586
587template <class S>
588void consume_kernel_solutions(GridData<S>& grid_data) {
589 while(!grid_data.consume_solution()) {}
590}
591
592template <class S, class U, class Timepoint>
593void transfer_memory_and_run(CP<U>& root, MemoryConfig mem_config, const Timepoint& start) {
594 using concurrent_allocator = typename S::concurrent_allocator;
595 auto grid_data = bt::make_shared<GridData<S>, concurrent_allocator>(root, mem_config);
596 initialize_grid_data<<<1,1>>>(grid_data.get());
597 CUDAEX(cudaDeviceSynchronize());
598 if(grid_data->root.config.print_statistics) {
599 mem_config.print_mzn_statistics(root.config, root.stats);
600 }
601 std::thread consumer_thread(consume_kernel_solutions<S>, std::ref(*grid_data));
602 gpu_solve_kernel
603 <<<static_cast<unsigned int>(grid_data->root.stats.num_blocks),
604 CUDA_THREADS_PER_BLOCK,
605 grid_data->mem_config.shared_bytes>>>
606 (grid_data.get());
607 bool interrupted = wait_solving_ends(grid_data->cpu_stop, grid_data->root, start);
608 consumer_thread.join();
609 CUDAEX(cudaDeviceSynchronize());
610 deallocate_grid_data<<<1,1>>>(grid_data.get());
611 CUDAEX(cudaDeviceSynchronize());
612}
613
614// From https://stackoverflow.com/a/32531982/2231159
615int threads_per_sm(cudaDeviceProp devProp) {
616 switch (devProp.major){
617 case 2: return (devProp.minor == 1) ? 48 : 32; // Fermi
618 case 3: return 192; // Kepler
619 case 5: return 128; // Maxwell
620 case 6: return (devProp.minor == 0) ? 64 : 128; // Pascal
621 case 7: return 64; // Volta and Turing
622 case 8: return (devProp.minor == 0) ? 64 : 128; // Ampere
623 case 9: return 128; // Hopper
624 default: return 64;
625 }
626}
627
628template <class S, class U>
629void configure_blocks_threads(CP<U>& root, const MemoryConfig& mem_config) {
630 int max_block_per_sm;
631 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_block_per_sm, (void*) gpu_solve_kernel<S>, CUDA_THREADS_PER_BLOCK, (int)mem_config.shared_bytes);
632 if(root.config.verbose_solving) {
633 printf("%% max_blocks_per_sm=%d\n", max_block_per_sm);
634 }
635
636 cudaDeviceProp deviceProp;
637 cudaGetDeviceProperties(&deviceProp, 0);
638
639 size_t total_global_mem = deviceProp.totalGlobalMem;
640 size_t num_sm = deviceProp.multiProcessorCount;
641 size_t num_threads_per_sm = threads_per_sm(deviceProp);
642
643 auto& config = root.config;
644 root.stats.num_blocks = (config.or_nodes == 0) ? max_block_per_sm : config.or_nodes;
645
646 config.stack_kb = config.stack_kb == 0 ? 32 : config.stack_kb;
647
648 // The stack allocation depends on the maximum number of threads per SM, not on the actual number of threads per block.
649 size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * config.stack_kb * 1000;
650 size_t remaining_global_mem = total_global_mem - total_stack_size;
651 remaining_global_mem -= remaining_global_mem / 10; // We leave 10% of global memory free for CUDA allocations, not sure if it is useful though.
652
653 // Basically the size of the store and the propagators, plus 100 bytes per variable.
654 // +1 for the root node in GridCP.
655 size_t heap_usage_estimation = (root.stats.num_blocks + 1) * (mem_config.prop_bytes + mem_config.store_bytes + 100 * root.store->vars());
656 while(heap_usage_estimation > remaining_global_mem && root.stats.num_blocks > 1) {
657 root.stats.num_blocks--; heap_usage_estimation -= mem_config.prop_bytes + mem_config.store_bytes + 100 * root.store->vars();
658 }
659
660 // The stack always needs to be adjusted for this algorithm due to recursive function calls.
661 CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
662 CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem/15));
663
664 root.stats.print_memory_statistics(config.verbose_solving, "stack_memory", total_stack_size);
665 root.stats.print_memory_statistics(config.verbose_solving, "heap_memory", remaining_global_mem);
666 root.stats.print_memory_statistics(config.verbose_solving, "heap_usage_estimation", heap_usage_estimation);
667 if(config.verbose_solving) {
668 printf("%% num_blocks=%d\n", root.stats.num_blocks);
669 }
670}
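// Informal numeric example of the sizing above (illustrative figures, not taken from a real
// device): with 100 SMs, 2048 threads per SM and the default 32KB stack per thread,
// `total_stack_size` is about 100 * 2048 * 32000 bytes, i.e. roughly 6.5GB, which is subtracted
// from the global memory before estimating how many blocks fit; `heap_usage_estimation` then
// charges each block (plus one for the root `GridCP`) the size of its store and propagators plus
// 100 bytes per variable.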
671
672template <class S, class U, class Timepoint>
673void configure_and_run(CP<U>& root, const Timepoint& start) {
674 MemoryConfig mem_config = configure_memory<S>(root);
675 configure_blocks_threads<S>(root, mem_config);
676 transfer_memory_and_run<S>(root, mem_config, start);
677}
678
679#endif // __CUDACC__
680
681void gpu_dive_and_solve(Configuration<bt::standard_allocator>& config) {
682#ifndef __CUDACC__
683 std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
684#else
685 check_support_managed_memory();
686 check_support_concurrent_managed_memory();
687 auto start = std::chrono::steady_clock::now();
688 CP<Itv> root(config);
689 root.preprocess();
690 if(root.iprop->is_bot()) {
692 return;
693 }
695#ifdef NO_CONCURRENT_MANAGED_MEMORY
696 configure_and_run<ItvSolverPinned>(root, start);
697#else
698 configure_and_run<ItvSolver>(root, start);
699#endif
700#endif
701}
702
703#endif // TURBO_GPU_DIVE_AND_SOLVE_HPP