Turbo Constraint Solver
hybrid_dive_and_solve.hpp
Go to the documentation of this file.
1// Copyright 2024 Pierre Talbot
2
3#ifndef TURBO_HYBRID_DIVE_AND_SOLVE_HPP
4#define TURBO_HYBRID_DIVE_AND_SOLVE_HPP
5
6#include "common_solving.hpp"
7#include <mutex>
8#include <thread>
9#include <chrono>
10
11namespace bt = ::battery;
12
13/**
14 * "Dive and solve" is a new algorithm to parallelize the standard "propagate and search" algorithm of constraint programming.
15 * Given a depth `d`, we create `2^d` subproblems that we solve in parallel.
16 * We create as many CPU threads as blocks on the GPU (option `cp.stats.num_blocks`).
17 * A CPU thread takes the next available subproblem and runs the "propagate and search" algorithm on it.
18 * The CPU thread offloads the propagation to the GPU, but takes care of splitting and backtracking in the search tree, as well as maintaining the best bound found and the statistics.
19 * Therefore, a kernel with only 1 block is executed each time we propagate a node.
20 * Since many CPU threads execute in parallel, there are many kernels running in parallel.
21 *
22 * We call a task solving a subproblem a "cube" (this loosely follows the terminology of SAT solving with "cube and conquer").
23 * By CPU cube, we refer to the local state of a CPU thread for solving a subproblem.
24 * By GPU cube, we refer to the local state of a GPU block for solving a subproblem.
25 * Note that at each moment, there are at most `cp.stats.num_blocks` cubes active in parallel.
26 */
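/* Illustrative sketch (not part of the original file): how a subproblem index encodes a
 * root-to-subproblem path. With depth `d`, the `2^d` subproblems are simply the integers
 * `0 .. 2^d - 1`, and reading the bits of an index from the most significant to the least
 * significant gives the left (0) / right (1) turns of the dive. The helper below is
 * hypothetical and only makes the encoding concrete. */
#include <cstddef>
#include <cstdio>

inline void print_dive_path(size_t subproblem_idx, size_t depth) {
  // For depth 3 and subproblem 5 (binary 101), this prints "right left right".
  for(size_t level = depth; level > 0; --level) {
    size_t bit = (subproblem_idx >> (level - 1)) & size_t{1};
    printf("%s ", bit == 0 ? "left" : "right");
  }
  printf("\n");
}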
27
28#ifdef __CUDACC__
29
30/** By default, we don't need dynamic shared memory. */
31#define DEFAULT_SHARED_MEM_BYTES 0
32
33/** A CPU cube is the data used by a CPU thread to solve a subproblem. */
34struct CPUCube {
35 using cube_type = AbstractDomains<
36 Itv, bt::standard_allocator, UniqueLightAlloc<bt::standard_allocator,0>, bt::pinned_allocator>;
37 /** A CPU cube is fully allocated on the CPU, except for the store of variables `cube.store`, which is allocated in managed memory.
38 * Indeed, when exchanging information between a GPU cube and a CPU cube, only the store of variables needs to be transferred.
39 */
40 cube_type cube;
41
42 /** We keep a snapshot of the root to reinitialize the CPU cube after each subproblem has been solved. */
43 typename cube_type::IST::snapshot_type<bt::standard_allocator> root_snapshot;
44
45 /** This flag becomes `true` when the thread has finished its execution (it exits `dive_and_solve` function). */
46 std::atomic_flag finished;
47
48 /** This is the path to the subproblem that must be solved by this cube.
49 * This member is initialized in `CPUData` constructor.
50 */
51 size_t subproblem_idx;
52
53 CPUCube(const CP<Itv>& root)
54 : cube(root)
55 , root_snapshot(cube.search_tree->template snapshot<bt::standard_allocator>())
56 {}
57
58 /** We share the common data among the CPU cubes (e.g., propagators, simplifier, environment, ...). */
59 CPUCube(const CPUCube& other)
60 : cube(typename cube_type::tag_gpu_block_copy{}, true, other.cube)
61 , root_snapshot(cube.search_tree->template snapshot<bt::standard_allocator>())
62 {}
63};
64
65/** A GPU cube is the data used by a GPU block to solve a subproblem.
66 * We only allocate the necessary structures to perform propagation: the store of variables and the propagators.
67*/
68struct GPUCube {
69 /** We use atomic to store the interval's lower and upper bounds. */
70 // using Itv1 = Interval<ZLB<bound_value_type, bt::atomic_memory_block>>;
71 using Itv1 = Interval<ZLB<bound_value_type, bt::local_memory>>;
72
73 /** We use a `pool_allocator`; this makes it easy to switch between global memory and shared memory, if the store of variables can fit inside. */
74 using IStore = VStore<Itv1, bt::pool_allocator>;
75
76 /** The store of propagators also uses a `pool_allocator` of global memory. This was necessary due to the slow copy of propagators between CPU and GPU.
77 * Indeed, a propagator is a tree-shaped object (like an AST) that contains many pointers, and thus the copy calls the allocation function a lot. */
78 #ifdef TURBO_IPC_ABSTRACT_DOMAIN
79 using IProp = PC<IStore, bt::pool_allocator>;
80 #else
81 using IProp = PIR<IStore, bt::pool_allocator>;
82 #endif
83
84 /** The store of variables is only accessible on GPU. */
85 abstract_ptr<IStore> store_gpu;
86
87 /** The propagators are only accessible on the GPU, but the array of propagators is shared among all blocks.
88 * Since the propagators are stateless, we avoid duplicating them in each block.
89 */
90 abstract_ptr<IProp> iprop_gpu;
91
92 /** `events` is used in the fixpoint loop to know which variables have been modified between two iterations. */
93 // battery::vector<local::B, bt::global_allocator> events;
94 // battery::vector<local::B, bt::global_allocator> events2;
95 // battery::vector<local::B, bt::pinned_allocator> events_cpu;
96
97 /** We also have a store of variables in managed memory to communicate the results of the propagation to the CPU.
98 * Note that this store is the same as the one in the corresponding CPU cube (`cube.store`).
99 * This member is initialized in `CPUData` constructor.
100 */
101 abstract_ptr<VStore<Itv, bt::pinned_allocator>> store_cpu;
102
103 /** This Boolean is used to communicate to the CPU that the current node is a solution. */
104 cuda::std::atomic_flag solution_found;
105
106 /** The cumulative number of iterations required to reach a fixpoint.
107 * By dividing this statistic by the number of nodes, we get the average number of iterations per node.
108 */
109 size_t fp_iterations;
110 size_t num_deductions;
111
112 /** From config. */
113 FixpointKind fp_kind;
114 size_t wac1_threshold;
115
117
118 /** The CPU thread and the GPU block use those two flags to signal to each other when to work and when to wait.
119 * This is necessary due to the persistent kernel design of this algorithm.
120 */
121 cuda::std::atomic_flag ready_to_propagate;
122 cuda::std::atomic_flag ready_to_search;
123
124 /** A flag to notify the kernel it must stop. */
125 cuda::std::atomic_flag stop;
126
127 GPUCube() :
128 fp_iterations(0), num_deductions(0), wac1_threshold(0)
129 {
130 /** Initially, we are not ready to propagate or to search. */
131 ready_to_search.clear();
132 ready_to_propagate.clear();
133 stop.clear();
134 solution_found.clear();
135 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
136 }
137
138 /** Initialize the store of variables and propagators from existing store and propagators.
139 * If `pc_shared` is `true`, the propagators of this cube will be shared with `pc`.
140 * Otherwise, a full copy of the propagators is made.
141 * We generally want to share the propagators, hence cube 0 copies the propagators and all other cubes share them.
142 */
143 template <class StoreType, class PCType>
144 __device__ void allocate(StoreType& store, PCType& pc, size_t bytes, bool pc_shared) {
145 int n = pc.num_deductions();
146 bytes += 100; // for alignment padding.
147 void* mem_pool = bt::global_allocator{}.allocate(bytes);
148 bt::pool_allocator pool(static_cast<unsigned char*>(mem_pool), bytes);
149 AbstractDeps<bt::global_allocator, bt::pool_allocator> deps(pc_shared, bt::global_allocator{}, pool);
150 iprop_gpu = bt::allocate_shared<IProp, bt::pool_allocator>(pool, pc, deps);
151 store_gpu = deps.template extract<IStore>(store.aty());
152 // events.resize(store.vars());
153 // events2.resize(store.vars());
154 }
155
156 __device__ void deallocate() {
157 // NOTE: .reset() does not work because it does not reset the allocator, which is itself allocated in global memory.
158 store_gpu = abstract_ptr<IStore>();
159 iprop_gpu = abstract_ptr<IProp>();
160 // events = battery::vector<local::B, bt::global_allocator>();
161 // events2 = battery::vector<local::B, bt::global_allocator>();
162 }
163};
164
165/** This kernel initializes the GPU cubes. See the constructor of `CPUData` for more information. */
166template <class Store, class IProp>
167__global__ void allocate_gpu_cubes(GPUCube* gpu_cubes,
168 size_t n, Store* store, IProp* iprop)
169{
170 assert(threadIdx.x == 0 && blockIdx.x == 0);
171 size_t bytes = store->get_allocator().total_bytes_allocated()
172 + sizeof(GPUCube::IStore) + sizeof(GPUCube::IProp) + 1000;
173 gpu_cubes[0].allocate(*store, *iprop, bytes + iprop->get_allocator().total_bytes_allocated(), false);
174 for(int i = 1; i < n; ++i) {
175 gpu_cubes[i].allocate(*gpu_cubes[0].store_gpu, *gpu_cubes[0].iprop_gpu, bytes, true);
176 }
177}
178
179__global__ void deallocate_gpu_cubes(GPUCube* gpu_cubes, size_t n) {
180 assert(threadIdx.x == 0 && blockIdx.x == 0);
181 for(int i = 0; i < n; ++i) {
182 gpu_cubes[i].deallocate();
183 }
184}
185
186/** This is the data shared among all CPU threads. */
187struct CPUData {
188 /** We generate the subproblems lazily.
189 * Suppose we generate `2^3` subproblems; we represent the first as `000`, the second as `001`, the third as `010`, and so on.
190 * A `0` means to turn left in the search tree, and a `1` means to turn right.
191 * Incrementing this integer will generate the path of the next subproblem.
192 */
193 ZLB<size_t, bt::atomic_memory<>> next_subproblem;
194
195 /** This is the best bound found so far, globally, across all threads. */
196 Itv best_bound;
197
198 /** A flag to stop the threads, for example because of a timeout or CTRL-C. */
199 std::atomic_flag cpu_stop;
200
201 /** Due to multithreading, we must protect `stdout` when printing.
202 * The model of computation in this work is lock-free, but locking seems unavoidable for printing.
203 */
204 std::mutex print_lock;
205
206 /** The initial problem is only accessible from the CPU.
207 * It is used at the beginning to count the bytes used by the store and propagators,
208 * and at the end to merge the statistics and solutions from all threads and print them.
209 */
210 CP<Itv> root;
211
212 /** For each block, how much shared memory are we using? */
213 size_t shared_mem_bytes;
214
215 /** Each CPU thread has its own local state to solve a subproblem, called a cube. */
216 bt::vector<CPUCube> cpu_cubes;
217
218 /** Each GPU block has its own local state to solve a subproblem, called a cube. */
219 bt::vector<GPUCube, bt::managed_allocator> gpu_cubes;
220
221 /** We create as many cubes as CPU threads and GPU blocks (option `cp.stats.num_blocks`).
222 * All CPU cubes are initialized to different subproblems, from 0 to `cp.stats.num_blocks - 1`.
223 * Hence the next subproblem to solve is `cp.stats.num_blocks`.
224 * The GPU cubes are initialized to the same state as the CPU cubes.
225 * Further, we connect the CPU and GPU cubes by sharing their store of variables (`gpu_cubes[i].store_cpu` and `cpu_cubes[i].cube.store`).
226 * Also, we share the propagators of `gpu_cubes[0].iprop_gpu` with all other cubes `gpu_cubes[i].iprop_gpu` (with i >= 1).
227 */
228 CPUData(const CP<Itv>& root, size_t shared_mem_bytes)
229 : next_subproblem(root.stats.num_blocks)
230 , best_bound(Itv::top())
231 , root(root)
232 , shared_mem_bytes(shared_mem_bytes)
233 , gpu_cubes(root.stats.num_blocks)
234 {
235 cpu_stop.clear();
236 cpu_cubes.reserve(root.stats.num_blocks);
237 cpu_cubes.emplace_back(root);
238 for(int i = 0; i < root.stats.num_blocks; ++i) {
239 if(i > 0) {
240 cpu_cubes.emplace_back(cpu_cubes[0]);
241 }
242 cpu_cubes[i].subproblem_idx = i;
243 gpu_cubes[i].store_cpu = cpu_cubes[i].cube.store;
244 gpu_cubes[i].fp_kind = root.config.fixpoint;
245 gpu_cubes[i].wac1_threshold = root.config.wac1_threshold;
246 }
247 /** This is a temporary object to initialize the first cube with the store and propagators. */
248 AbstractDomains<Itv, bt::standard_allocator,
249 bt::statistics_allocator<UniqueLightAlloc<bt::managed_allocator, 0>>,
250 bt::statistics_allocator<UniqueLightAlloc<bt::managed_allocator, 1>>>
251 managed_cp(root);
252 root.stats.print_stat("store_mem", managed_cp.store.get_allocator().total_bytes_allocated());
253 root.stats.print_stat("propagator_mem", managed_cp.iprop.get_allocator().total_bytes_allocated());
255 allocate_gpu_cubes<<<1, 1>>>(gpu_cubes.data(), gpu_cubes.size(), managed_cp.store.get(), managed_cp.iprop.get());
256 CUDAEX(cudaDeviceSynchronize());
257 }
258
259 ~CPUData() {
260 deallocate_gpu_cubes<<<1, 1>>>(gpu_cubes.data(), gpu_cubes.size());
261 CUDAEX(cudaDeviceSynchronize());
262 }
263
264 CPUData() = delete;
265 CPUData(const CPUData&) = delete;
266 CPUData(CPUData&&) = delete;
267};
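/* Illustrative sketch (not part of the original file): the work distribution behind
 * `next_subproblem`, reduced to a plain `std::atomic`. Each worker starts on its own index
 * and then claims the next unexplored index with an atomic post-increment, so no two
 * workers solve the same subproblem. The real `next_subproblem` is a lattice element that
 * is additionally fast-forwarded when a whole subtree is skipped. Names are invented. */
#include <atomic>
#include <cstddef>

inline void worker_loop_sketch(std::atomic<size_t>& next_subproblem,
                               size_t num_subproblems, size_t first_idx) {
  size_t subproblem_idx = first_idx;                // Indices 0..num_blocks-1 are pre-assigned.
  while(subproblem_idx < num_subproblems) {
    // ... dive to `subproblem_idx` and solve the corresponding subproblem ...
    subproblem_idx = next_subproblem.fetch_add(1);  // Atomically claim the next subproblem.
  }
}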
268
269void dive_and_solve(CPUData& global, size_t cube_idx);
270size_t dive(CPUData& global, size_t cube_idx);
271void solve(CPUData& global, size_t cube_idx);
272bool propagate(CPUData& global, size_t cube_idx);
273bool update_global_best_bound(CPUData& global, size_t cube_idx);
274void update_local_best_bound(CPUData& global, size_t cube_idx);
275void reduce_cubes(CPUData& global);
276size_t configure_gpu(CP<Itv>& cp);
277__global__ void gpu_propagate(GPUCube* cube, size_t shared_bytes);
278
279#endif // __CUDACC__
280
281/** This is the entry point: we preprocess the problem, create the threads solving it, wait for their completion or an interruption, and finally merge and print the statistics. */
282void hybrid_dive_and_solve(const Configuration<battery::standard_allocator>& config)
283{
284#ifndef __CUDACC__
285 std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
286#else
287 auto start = std::chrono::steady_clock::now();
288 /** We start with some preprocessing to reduce the number of variables and constraints. */
289 CP<Itv> cp(config);
290 cp.preprocess();
291 if(cp.iprop->is_bot()) {
294 return;
295 }
296 size_t shared_mem_bytes = configure_gpu(cp);
297
298 /** This is the main data structure, we create all the data for each thread and GPU block. */
299 CPUData global(cp, shared_mem_bytes);
300
301 /** We wait until either the solving is interrupted or all threads have finished. */
302 /** Block the signal CTRL-C to notify the threads if we must exit. */
303 block_signal_ctrlc();
304
305 /** Start the algorithm in parallel with as many CPU threads as GPU blocks. */
306 std::vector<std::thread> threads;
307 for(int i = 0; i < global.root.stats.num_blocks; ++i) {
308 threads.push_back(std::thread(dive_and_solve, std::ref(global), i));
309 }
310
311 /** We start the persistent kernel, which will perform the propagation. */
312 gpu_propagate<<<
313 static_cast<unsigned int>(global.root.stats.num_blocks),
314 CUDA_THREADS_PER_BLOCK,
315 global.shared_mem_bytes>>>
316 (global.gpu_cubes.data(), global.shared_mem_bytes);
317
318 size_t terminated = 0;
319 while(terminated < threads.size()) {
320 if(must_quit(global.root) || !check_timeout(global.root, start)) {
321 global.cpu_stop.test_and_set();
322 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
323 break;
324 }
325 std::this_thread::sleep_for(std::chrono::milliseconds(100));
326 terminated = 0;
327 for(int i = 0; i < global.cpu_cubes.size(); ++i) {
328 if(global.cpu_cubes[i].finished.test()) {
329 ++terminated;
330 }
331 }
332 }
333 for(auto& t : threads) {
334 t.join();
335 }
336 CUDAEX(cudaDeviceSynchronize());
337 /** We reduce all the statistics of all threads. */
338 reduce_cubes(global);
339 global.root.print_final_solution();
340 global.root.print_mzn_statistics();
341#endif
342}
343
344#ifdef __CUDACC__
345
346/** This is the main algorithm.
347 * Each CPU thread will run one instance of this algorithm with a different `cube_idx`.
348 * It interleaves two steps until all subproblems have been solved or we reach another stopping condition (timeout, CTRL-C, pruning conditions):
349 * 1) Dive (function `dive`): Given a root node, follow a path in the search tree to reach a subproblem.
350 * 2) Solve (function `solve`): Solve the subproblem using the propagate and search algorithm.
351*/
352void dive_and_solve(CPUData& global, size_t cube_idx)
353{
354 auto& cube = global.cpu_cubes[cube_idx].cube;
355 auto solving_start = cube.stats.start_timer_host();
356 size_t num_subproblems = global.root.stats.eps_num_subproblems;
357 size_t& subproblem_idx = global.cpu_cubes[cube_idx].subproblem_idx;
358 /** In each iteration, we will solve one subproblem obtained after a diving phase. */
359 while(subproblem_idx < num_subproblems && !global.cpu_stop.test()) {
360 if(global.root.config.verbose_solving >= 2) {
361 std::lock_guard<std::mutex> print_guard(global.print_lock);
362 printf("%% Cube %zu solves subproblem num %zu\n", cube_idx, subproblem_idx);
363 }
364 /** The first step is to "dive" by committing to a search path. */
365 auto dive_start = cube.stats.start_timer_host();
366 size_t remaining_depth = dive(global, cube_idx);
367 cube.stats.stop_timer(Timer::DIVE, dive_start);
368 /** If we reached the subproblem without reaching a leaf node, we start the solving phase. */
369 if(remaining_depth == 0) {
370 solve(global, cube_idx);
371 /** If we didn't stop solving because of an external interruption, we increase the number of subproblems solved. */
372 if(!global.cpu_stop.test()) {
373 cube.stats.eps_solved_subproblems += 1;
374 }
375 }
376 /** We reached a leaf node before the end of the dive; it means a whole subtree should be skipped. */
377 else if(!global.cpu_stop.test()) {
378 /** To skip all the paths of the subtree obtained, we perform bitwise operations.
379 * Suppose the current path is "00" turn left two times, and the following search tree:
380 * * depth = 0
381 * / \
382 * 0 1 depth = 1
383 * / \ / \
384 * 00 01 10 11 depth = 2
385 *
386 * If we detect a leaf node at depth 1, after only one left turn, we must skip the remainder of the subtree, in particular to avoid exploring the path "01".
387 * To achieve that, we take the current path "00", shift it to the right by 1 (essentially erasing the path that has not been explored), increment it to go to the next subtree (at the same depth), and shift it back to the left to reach the first subproblem of the subtree.
388 * Cool huh?
389 */
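      /* Worked example (illustration): with subproblem_idx = 0b00 and remaining_depth = 1,
       * (0b00 >> 1) = 0b0, then + 1 = 0b1, then << 1 = 0b10, so the next subproblem is "10"
       * and the unexplored sibling "01" is skipped. */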
390 size_t next_subproblem_idx = ((subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
391 global.next_subproblem.meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));
392 /** It is possible that other threads skip similar subtrees.
393 * Hence, we only count the subproblems skipped by the thread solving the left-most subproblem. */
394 if((subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
395 cube.stats.eps_skipped_subproblems += next_subproblem_idx - subproblem_idx;
396 }
397 }
398 /** We prepare the cube to solve the next problem.
399 * We restore the search tree to the root, and reset the EPS search strategy.
400 * We also update the subproblem index to the next subproblem to solve. */
401 if(!global.cpu_stop.test()) {
402 /** To avoid that several cubes solve the same subproblem, we use an atomic post-increment. */
403 subproblem_idx = global.next_subproblem.atomic()++;
404 /** The following commented code is completely valid and does not use atomic post-increment.
405 * But honestly, we kinda need more performance so... let's avoid reexploring subproblems. */
406 // subproblem_idx = global.next_subproblem.value();
407 // global.next_subproblem.meet(ZLB<size_t, bt::local_memory>(subproblem_idx + size_t{1}));
408 if(subproblem_idx < num_subproblems) {
409 auto start = cube.stats.start_timer_host();
410 cube.search_tree->restore(global.cpu_cubes[cube_idx].root_snapshot);
411 cube.stats.stop_timer(Timer::SEARCH, start);
412 }
413 }
414 }
415 /** If we did not get interrupted, this thread has explored all available subproblems.
416 * We record this information in `num_blocks_done` simply because it helps to detect unbalanced workloads (many threads finished while others did not).
417 */
418 if(!global.cpu_stop.test()) {
419 cube.stats.num_blocks_done = 1;
420 }
421 /** We signal to the GPU kernel that this block must terminate.
422 * The GPU block is necessarily waiting on `ready_to_propagate`, hence by setting `stop` first, followed by a memory fence, we ensure the GPU block will see that it must stop.
423 */
424 global.gpu_cubes[cube_idx].stop.test_and_set();
425 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
426 global.gpu_cubes[cube_idx].ready_to_propagate.test_and_set(cuda::std::memory_order_seq_cst);
427 global.gpu_cubes[cube_idx].ready_to_propagate.notify_one();
428
429 cube.stats.stop_timer(Timer::SOLVE, solving_start);
430 /** We signal to the main thread that we have finished our work. */
431 global.cpu_cubes[cube_idx].finished.test_and_set();
432}
433
434/** Given a root problem, we follow a predefined path in the search tree to reach a subproblem.
435 * This is called the "dive" operation.
436 * The path is given by `subproblem_idx` in the CPU cube (see `CPUData::next_subproblem` for more info).
437 * For all dives, the initial problem must be the same and the EPS search strategy must be static (always splitting the tree in the same way).
438 * Therefore, don't be tempted to add the objective to the initial problem because it might lead to ignoring some subproblems since the splitting decisions will differ.
439 *
440 * \return The remaining depth of the path: `0` if the subproblem was reached, a positive value if we reached a leaf node before the end of the path.
441 */
442size_t dive(CPUData& global, size_t cube_idx) {
443 auto& cube = global.cpu_cubes[cube_idx].cube;
444 bool stop_diving = false;
445 size_t remaining_depth = cube.config.subproblems_power;
446 /** The number of iterations depends on the length of the diving path. */
447 while(remaining_depth > 0 && !stop_diving && !global.cpu_stop.test()) {
448 bool is_leaf_node = propagate(global, cube_idx);
449 auto start = cube.stats.start_timer_host();
450 /** If we reach a leaf node before the end of the path, we stop and the remaining depth is reported to the caller. */
451 if(is_leaf_node) {
452 stop_diving = true;
453 }
454 else {
455 remaining_depth--;
456 /** We create two branches according to the EPS search strategy. */
457 auto branches = cube.split->split();
458 assert(branches.size() == 2);
459 /** We commit to one of the branches depending on the current value on the path.
460 * Suppose the depth is 3, the path is "010", and we are currently at `remaining_depth = 1`.
461 * We must extract the bit "1", and we do so by standard bitwise manipulation.
462 * A `branch_idx` of 0 means taking the left branch, and 1 means taking the right branch.
463 */
464 size_t branch_idx = (global.cpu_cubes[cube_idx].subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
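      /* Worked example (illustration): depth 3, path "010" (subproblem_idx = 0b010), at
       * remaining_depth = 1: (0b010 & (1 << 1)) >> 1 = 1, so we take the right branch. */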
465 /** We immediately commit to the branch.
466 * It has the effect of reducing the domain of the variables in `cube.store` (and `gpu_cube.store_cpu` since they are aliased).
467 */
468 cube.iprop->deduce(branches[branch_idx]);
469 }
470 cube.stats.stop_timer(Timer::SEARCH, start);
471 }
472 return remaining_depth;
473}
474
475/** We solve a cube using the propagate and search algorithm.
476 * We explore the whole search tree of the cube, only stopping earlier due to pruning conditions or an external stop signal (e.g. `global.cpu_stop`).
477 */
478void solve(CPUData& global, size_t cube_idx) {
479 auto& cpu_cube = global.cpu_cubes[cube_idx].cube;
480 bool has_changed = true;
481 auto start = cpu_cube.stats.start_timer_host();
482 while(has_changed && !global.cpu_stop.test()) {
483 /** Before propagating, we update the local bound with the best known global bound. */
484 update_local_best_bound(global, cube_idx);
485 cpu_cube.stats.stop_timer(Timer::SEARCH, start);
486 /** We propagate on GPU, and manage the leaf nodes (solution and failed nodes). */
487 propagate(global, cube_idx);
488 /** We pop a new node from the search tree, ready to explore.
489 * If the search tree becomes empty, `has_changed` will be `false`.
490 */
491 start = cpu_cube.stats.start_timer_host();
492 has_changed = cpu_cube.search_tree->deduce();
493 }
494}
495
496/** Propagate the cube `cube_idx` on the GPU.
497 * Check if we reached a leaf node (solution or failed) on the CPU.
498 * Branching on unknown nodes is a task left to the caller.
499 *
500 * \return `true` if we reached a leaf node.
501 */
502bool propagate(CPUData& global, size_t cube_idx) {
503 auto& cpu_cube = global.cpu_cubes[cube_idx].cube;
504 auto& gpu_cube = global.gpu_cubes[cube_idx];
505 bool is_leaf_node = false;
506
507 gpu_cube.store_cpu->prefetch(0);
508
509 /** We signal to the GPU that it can propagate the current node.
510 * Thereafter, we immediately wait for the GPU to finish the propagation before performing the search step.
511 */
512 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
513 gpu_cube.ready_to_propagate.test_and_set(cuda::std::memory_order_seq_cst);
514 gpu_cube.ready_to_propagate.notify_one();
515 gpu_cube.ready_to_search.wait(false, cuda::std::memory_order_seq_cst);
516 gpu_cube.ready_to_search.clear();
517
518 auto start = cpu_cube.stats.start_timer_host();
519 gpu_cube.store_cpu->prefetch(cudaCpuDeviceId);
520
521 /** `on_node` updates the statistics and verifies whether we should stop (e.g. option `--cutnodes`). */
522 bool is_pruned = cpu_cube.on_node();
523 /** If the abstract domain is `bottom`, we reached a leaf node where the problem is unsatisfiable. */
524 if(cpu_cube.iprop->is_bot()) {
525 is_leaf_node = true;
526 cpu_cube.on_failed_node();
527 }
528 /** When the problem is "extractable", all variables are assigned to a single value.
529 * It means that we have reached a solution.
530 */
531 else if(gpu_cube.solution_found.test()) {
532 is_leaf_node = true;
533 gpu_cube.solution_found.clear();
534 /** We save the new best solution found.
535 * The "branch-and-bound" (bab) abstract domain has a local store of variable to store the best solution.
536 * It adds a new bound constraint to the root of the search tree, such that, on backtracking the best bound is enforced.
537 */
538 if(cpu_cube.bab->is_satisfaction() || cpu_cube.bab->compare_bound(*cpu_cube.store, cpu_cube.bab->optimum())) {
539 cpu_cube.bab->deduce();
540 bool print_solution = cpu_cube.is_printing_intermediate_sol();
541 if(cpu_cube.bab->is_optimization()) {
542 /** We share the new best bound with the other cubes. */
543 print_solution &= update_global_best_bound(global, cube_idx);
544 }
545 /** If we print all intermediate solutions, and really found a better bound (no other thread found a better one meanwhile), we print the current solution. */
546 if(print_solution) {
547 std::lock_guard<std::mutex> print_guard(global.print_lock);
548 global.root.print_solution(*cpu_cube.best);
549 }
550 /** We update the statistics, and check if we must terminate (e.g. we stop after N solutions). */
551 is_pruned |= cpu_cube.update_solution_stats();
552 }
553 }
554 if(is_pruned) {
555 /** We notify all threads that we must stop. */
556 global.cpu_stop.test_and_set();
557 }
558 cpu_cube.stats.stop_timer(Timer::SEARCH, start);
559 return is_leaf_node;
560}
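/* Illustrative sketch (not part of the original file): the two-flag handshake between a CPU
 * thread (in `propagate` above) and its GPU block (in `gpu_propagate` below), reduced to a
 * host-only analogue using C++20 `std::atomic_flag`. `work_ready` plays the role of
 * `ready_to_propagate` and `result_ready` the role of `ready_to_search`; the names and the
 * demo function are invented for the example. */
#include <atomic>
#include <thread>

inline void handshake_demo() {
  std::atomic_flag work_ready;    // Clear by default (C++20).
  std::atomic_flag result_ready;
  std::thread consumer([&]() {    // Plays the role of the persistent GPU block.
    work_ready.wait(false);       // Sleep until the producer publishes a node.
    work_ready.clear();           // Consume the signal.
    /* ... propagate the node ... */
    result_ready.test_and_set();  // Publish the result.
    result_ready.notify_one();
  });
  /* Producer side: plays the role of the CPU thread in `propagate`. */
  work_ready.test_and_set();      // Publish a node to propagate.
  work_ready.notify_one();
  result_ready.wait(false);       // Wait for the propagation result.
  result_ready.clear();
  consumer.join();
}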
561
562/** Each block of this kernel executes the propagation loop on the GPU until a fixpoint is reached.
563 * 1) Transfer the store of variables from the CPU to the GPU.
564 * 2) Execute the fixpoint engine.
565 * 3) Transfer the store of variables from the GPU to the CPU.
566 *
567 * These three steps are repeated until the cube's `stop` flag becomes `true`.
568 * Each block is continuously processing a stream of nodes coming from the CPU.
569 *
570 * The size of `gpu_cubes` must be equal to the number of blocks.
571 */
572__global__ void gpu_propagate(GPUCube* gpu_cubes, size_t shared_bytes) {
573 extern __shared__ unsigned char shared_mem[];
574 GPUCube& cube = gpu_cubes[blockIdx.x];
575 GPUCube::IProp& iprop = *cube.iprop_gpu;
576 __shared__ int warp_iterations[CUDA_THREADS_PER_BLOCK/32];
577 /** We start by initializing the structures in shared memory (fixpoint loop engine, store of variables). */
578 __shared__ FixpointSubsetGPU<BlockAsynchronousFixpointGPU<true>, bt::global_allocator, CUDA_THREADS_PER_BLOCK> fp_engine;
579 fp_engine.init(iprop.num_deductions());
580 /** This shared variable is necessary to prevent multiple threads from reading `cube.stop.test()`,
581 * potentially reading different values and leading to deadlock. */
582 __shared__ bool stop;
583
584 // If we booked more than the default shared memory, it means we allocate the store in shared memory.
585 if(threadIdx.x == 0 && shared_bytes > DEFAULT_SHARED_MEM_BYTES) {
586 bt::pool_allocator shared_mem_pool(shared_mem, shared_bytes);
587 cube.store_gpu->reset_data(shared_mem_pool);
588 }
589
590 auto group = cooperative_groups::this_thread_block();
591 __syncthreads();
592
593 auto start = cube.timers.start_timer_device();
594 while(true) {
595 /** We wait until the CPU notifies us that the store is ready to be copied and propagated. */
596 if(threadIdx.x == 0) {
597 start = cube.timers.start_timer_device();
598 cube.ready_to_propagate.wait(false, cuda::std::memory_order_seq_cst);
599 cube.timers.stop_timer(Timer::WAIT_CPU, start);
600 cube.ready_to_propagate.clear();
601 // NOTE: Only one thread should read the atomic `cube.stop`, to avoid deadlock if one thread reads `true` and exits, while another thread reads `false`.
602 stop = cube.stop.test();
603 }
604 __syncthreads();
605 if(stop) {
606 break;
607 }
608 /** We copy the CPU store into the GPU memory. */
609 start = cube.timers.start_timer_device();
610 cube.store_cpu->copy_to(group, *cube.store_gpu);
611 // for(int i = threadIdx.x; i < cube.events.size(); i += blockDim.x) {
612 // cube.events[i].join_top();
613 // // [cube.events_cpu[i].vid()].join_top();
614 // }
615 __syncthreads();
616 start = cube.timers.stop_timer(Timer::TRANSFER_CPU2GPU, start);
617 /** This is the main propagation algorithm: the current node is propagated in parallel. */
618 int fp_iterations;
619 warp_iterations[threadIdx.x / 32] = 0;
620 switch(cube.fp_kind) {
621 case FixpointKind::AC1: {
622 fp_iterations = fp_engine.fixpoint(
623 [&](int i){ return iprop.deduce(i); },
624 [&](){ return iprop.is_bot(); });
625 if(threadIdx.x == 0) {
626 cube.num_deductions += fp_iterations * fp_engine.num_active();
627 }
628 break;
629 }
630 case FixpointKind::WAC1: {
631 if(fp_engine.num_active() <= cube.wac1_threshold) {
632 fp_iterations = fp_engine.fixpoint(
633 [&](int i){ return iprop.deduce(i); },
634 [&](){ return iprop.is_bot(); });
635 if(threadIdx.x == 0) {
636 cube.num_deductions += fp_iterations * fp_engine.num_active();
637 }
638 }
639 else {
640 fp_iterations = fp_engine.fixpoint(
641 [&](int i){ return warp_fixpoint<CUDA_THREADS_PER_BLOCK>(iprop, i, warp_iterations); },
642 [&](){ return iprop.is_bot(); });
643 if(threadIdx.x == 0) {
644 for(int i = 0; i < CUDA_THREADS_PER_BLOCK/32; ++i) {
645 cube.num_deductions += warp_iterations[i] * 32;
646 }
647 }
648 }
649 break;
650 }
651 }
652 start = cube.timers.stop_timer(Timer::FIXPOINT, start);
653 cube.store_gpu->copy_to(group, *cube.store_cpu);
654 __syncthreads();
655 start = cube.timers.stop_timer(Timer::TRANSFER_GPU2CPU, start);
656 if(threadIdx.x == 0) {
657 cube.fp_iterations += fp_iterations;
658 }
659 bool is_leaf_node = cube.store_gpu->is_bot();
660 if(!is_leaf_node) {
661 fp_engine.select([&](int i) { return !iprop.ask(i); });
662 cube.timers.stop_timer(Timer::SELECT_FP_FUNCTIONS, start);
663 if(fp_engine.num_active() == 0) {
664 is_leaf_node = cube.store_gpu->template is_extractable<AtomicExtraction>(group);
665 if(threadIdx.x == 0 && is_leaf_node) {
666 cube.solution_found.test_and_set();
667 }
668 }
669 }
670 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
671 __syncthreads();
672 /** We notify the CPU that we have propagated the current node. */
673 if(threadIdx.x == 0) {
674 cube.ready_to_search.test_and_set(cuda::std::memory_order_seq_cst);
675 cube.ready_to_search.notify_one();
676 }
677 // Backtrack detected.
678 if(is_leaf_node) {
679 fp_engine.reset(iprop.num_deductions());
680 }
681 }
682 fp_engine.destroy();
683}
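/* Illustrative sketch (not part of the original file): the fixpoint computed by the kernel
 * above, written as a plain sequential AC1 loop. The GPU fixpoint engine runs the same
 * computation, but the deductions of one iteration are executed in parallel by the threads
 * of the block. `Prop` is a hypothetical propagator interface exposing the three operations
 * used above: `num_deductions()`, `deduce(i)` (returns `true` if deduction `i` changed the
 * store) and `is_bot()` (reports failure). */
template <class Prop>
int sequential_ac1_fixpoint(Prop& iprop) {
  int iterations = 0;
  bool changed = true;
  while(changed && !iprop.is_bot()) {        // Stop at a fixpoint or when the node fails.
    changed = false;
    for(int i = 0; i < iprop.num_deductions(); ++i) {
      changed |= iprop.deduce(i);            // Apply every deduction once per iteration.
    }
    ++iterations;
  }
  return iterations;                         // Comparable to `fp_iterations` above.
}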
684
685/** We update the bound found by the current cube so it is visible to all other cubes.
686 * Note that this operation might not always succeed, which is okay: the best bound is still saved locally in each cube and merged at the end (in `reduce_cubes`).
687 * The worst that can happen is that a best bound is found twice, which does not compromise the correctness of the algorithm.
688 *
689 * \return `true` if the best bound has changed. Can return `false` if the best bound was updated by another thread meanwhile.
690 */
691bool update_global_best_bound(CPUData& global, size_t cube_idx) {
692 const auto& cube = global.cpu_cubes[cube_idx].cube;
693 assert(cube.bab->is_optimization());
694 // We retrieve the best bound found by the current cube.
695 auto local_best = cube.bab->optimum().project(cube.bab->objective_var());
696 // We update the global `best_bound`.
697 if(cube.bab->is_maximization()) {
698 return global.best_bound.meet_lb(dual_bound<Itv::LB>(local_best.ub()));
699 }
700 else {
701 return global.best_bound.meet_ub(dual_bound<Itv::UB>(local_best.lb()));
702 }
703}
704
705/** This function essentially does the converse operation of `update_global_best_bound`.
706 * We directly update the store with the global best bound.
707 * This function should be called at each node, since the best bound is erased on backtracking (it is not included in the snapshot).
708 */
709void update_local_best_bound(CPUData& global, size_t cube_idx) {
710 if(global.cpu_cubes[cube_idx].cube.bab->is_optimization()) {
711 auto& cube = global.cpu_cubes[cube_idx].cube;
712 VarEnv<bt::standard_allocator> empty_env{};
713 auto best_formula = cube.bab->template deinterpret_best_bound<bt::standard_allocator>(
714 cube.bab->is_maximization()
715 ? Itv(dual_bound<Itv::UB>(global.best_bound.lb()))
716 : Itv(dual_bound<Itv::LB>(global.best_bound.ub())));
717 IDiagnostics diagnostics;
718 bool r = interpret_and_tell(best_formula, empty_env, *cube.store, diagnostics);
719 assert(r);
720 }
721}
722
723/** After solving, we merge all the statistics and best solutions from all cubes together, before printing them. */
724void reduce_cubes(CPUData& global) {
725 for(int i = 0; i < global.cpu_cubes.size(); ++i) {
726 /** `meet` is the merge operation. */
727 global.cpu_cubes[i].cube.stats.meet(global.gpu_cubes[i].timers);
728 global.root.meet(global.cpu_cubes[i].cube);
729 global.root.stats.fixpoint_iterations += global.gpu_cubes[i].fp_iterations;
730 global.root.stats.num_deductions += global.gpu_cubes[i].num_deductions;
731 }
732}
733
734/** We configure the GPU according to the user configuration:
735 * 1) Decide the size of the shared memory and return it.
736 * 2) Increase the stack size if needed.
737 * 3) Increase the global memory allocation (we set the limit to around 90% of the global memory).
738 * 4) Guess the "best" number of threads per block and the number of blocks per SM, if not provided.
739 */
740size_t configure_gpu(CP<Itv>& cp) {
741 auto& config = cp.config;
742 /** Configure the shared memory size. */
743 size_t alignment_overhead = 200;
744 size_t shared_mem_bytes = DEFAULT_SHARED_MEM_BYTES + alignment_overhead + (cp.store->vars() * sizeof(GPUCube::Itv1));
745 cudaDeviceProp deviceProp;
746 cudaGetDeviceProperties(&deviceProp, 0);
747 if(shared_mem_bytes >= deviceProp.sharedMemPerBlock || config.only_global_memory) {
748 shared_mem_bytes = DEFAULT_SHARED_MEM_BYTES;
749 cp.stats.print_stat("memory_configuration", "global");
750 }
751 else {
752 cp.stats.print_stat("memory_configuration", "store_shared");
753 }
754 cp.stats.print_memory_statistics(config.verbose_solving, "shared_mem", shared_mem_bytes);
755
756 int hint_num_blocks;
757 int hint_num_threads;
758 CUDAE(cudaOccupancyMaxPotentialBlockSize(&hint_num_blocks, &hint_num_threads, (void*) gpu_propagate, shared_mem_bytes));
759 size_t total_global_mem = deviceProp.totalGlobalMem;
760 size_t num_sm = deviceProp.multiProcessorCount;
761 cp.stats.num_blocks = (config.or_nodes == 0) ? hint_num_blocks : config.or_nodes;
762 // The allocated stack size depends on the maximum number of threads per SM, not on the actual number of threads per block.
763 size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * (config.stack_kb == 0 ? 1 : config.stack_kb) * 1000;
764 size_t remaining_global_mem = total_global_mem - total_stack_size;
765 remaining_global_mem -= remaining_global_mem / 10; // We leave 10% of global memory free for CUDA allocations, not sure if it is useful though.
766 if(config.stack_kb != 0) {
767 CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
768 }
769 CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem));
770 cp.stats.print_memory_statistics(cp.config.verbose_solving, "stack_memory", total_stack_size);
771 cp.stats.print_memory_statistics(cp.config.verbose_solving, "heap_memory", remaining_global_mem);
772 if(cp.config.verbose_solving) {
773 printf("%% num_blocks=%d\n", cp.stats.num_blocks);
774 }
775 int num_blocks;
776 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, (void*) gpu_propagate, CUDA_THREADS_PER_BLOCK, shared_mem_bytes);
777 if(cp.config.verbose_solving) {
778 printf("%% max_blocks_per_sm=%d\n", num_blocks);
779 }
780 return shared_mem_bytes;
781}
782
783#endif // __CUDACC__
784#endif // TURBO_HYBRID_DIVE_AND_SOLVE_HPP
bool must_quit(A &a)
Definition common_solving.hpp:78
Interval< ZLB< bound_value_type, battery::local_memory > > Itv
Definition common_solving.hpp:53
void block_signal_ctrlc()
Definition common_solving.hpp:72
bool check_timeout(A &a, const Timepoint &start)
Definition common_solving.hpp:90
FixpointKind
Definition config.hpp:23
void hybrid_dive_and_solve(const Configuration< battery::standard_allocator > &config)
Definition hybrid_dive_and_solve.hpp:282
@ TRANSFER_CPU2GPU
@ WAIT_CPU
@ TRANSFER_GPU2CPU
@ FIXPOINT
@ SELECT_FP_FUNCTIONS
Definition common_solving.hpp:144
Configuration< BasicAllocator > config
Definition common_solving.hpp:278
CUDA void print_final_solution()
Definition common_solving.hpp:830
Statistics< BasicAllocator > stats
Definition common_solving.hpp:279
abstract_ptr< IStore > store
Definition common_solving.hpp:260
void preprocess()
Definition common_solving.hpp:591
abstract_ptr< IProp > iprop
Definition common_solving.hpp:261
CUDA void print_mzn_statistics()
Definition common_solving.hpp:837
Definition config.hpp:34
FixpointKind fixpoint
Definition config.hpp:52
int verbose_solving
Definition config.hpp:41
size_t wac1_threshold
Definition config.hpp:53
CUDA void print_mzn_end_stats() const
Definition statistics.hpp:318
CUDA void print_stat(const char *name, const char *value) const
Definition statistics.hpp:206
int num_blocks
Definition statistics.hpp:117
CUDA void print_memory_statistics(int verbose, const char *key, size_t bytes) const
Definition statistics.hpp:256
Definition statistics.hpp:30
Definition common_solving.hpp:124