Turbo Constraint Solver
hybrid_dive_and_solve.hpp
1// Copyright 2024 Pierre Talbot
2
3#ifndef TURBO_HYBRID_DIVE_AND_SOLVE_HPP
4#define TURBO_HYBRID_DIVE_AND_SOLVE_HPP
5
6#include "common_solving.hpp"
7#include <mutex>
8#include <thread>
9#include <chrono>
10
11namespace bt = ::battery;
12
13/**
14 * "Dive and solve" is a new algorithm to parallelize the standard "propagate and search" algorithm of constraint programming.
15 * Given a depth `d`, we create `2^d` subproblems that we solve in parallel.
16 * We create as many CPU threads as blocks on the GPU (option `config.or_nodes`).
17 * A CPU thread takes the next available subproblem and runs the "propagate and search" algorithm on it.
18 * The CPU thread offloads the propagation to the GPU, but takes care of splitting and backtracking in the search tree, as well as maintaining the best bound found and the statistics.
19 * Therefore, a kernel with only 1 block is executed each time we propagate a node.
20 * Since many CPU threads execute in parallel, there are many kernels running in parallel.
21 *
22 * We call a task solving a subproblem a "cube" (this loosely follows the terminology of SAT solving with "cube and conquer").
23 * By CPU cube, we refer to the local state of a CPU thread for solving a subproblem.
24 * By GPU cube, we refer to the local state of a GPU block for solving a subproblem.
25 * Note that at each moment, there are at most `config.or_nodes` cubes active in parallel.
26 */
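To make the work-sharing scheme above concrete, here is a minimal CPU-only sketch (not part of Turbo; `num_workers` and the printing are illustrative placeholders) of how `2^d` subproblems can be handed out to a fixed pool of workers through a shared atomic counter, which is the role played by `CPUData::next_subproblem` below.

#include <atomic>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  const size_t depth = 3;                             // `config.subproblems_power` in Turbo.
  const size_t num_subproblems = size_t{1} << depth;  // 2^d subproblems.
  const size_t num_workers = 4;                       // `config.or_nodes` in Turbo.
  std::atomic<size_t> next_subproblem{num_workers};   // Workers 0..3 start on subproblems 0..3.

  auto worker = [&](size_t id) {
    size_t subproblem = id;
    while(subproblem < num_subproblems) {
      // A real worker would dive to the subproblem encoded by the bits of `subproblem`,
      // then run propagate-and-search on it (with propagation offloaded to a GPU block).
      std::printf("worker %zu solves subproblem %zu\n", id, subproblem);
      subproblem = next_subproblem.fetch_add(1);      // Atomically grab the next subproblem.
    }
  };

  std::vector<std::thread> threads;
  for(size_t i = 0; i < num_workers; ++i) { threads.emplace_back(worker, i); }
  for(auto& t : threads) { t.join(); }
  return 0;
}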
27
28#ifdef __CUDACC__
29
30#include <cub/block/block_scan.cuh>
31
32#define BLOCK_SIZE 256
33
34/** By default, we don't need dynamic shared memory. */
35#define DEFAULT_SHARED_MEM_BYTES 0
36
37/** A CPU cube is the data used by a CPU thread to solve a subproblem. */
38struct CPUCube {
39 using cube_type = AbstractDomains<
40 Itv, bt::standard_allocator, UniqueLightAlloc<bt::standard_allocator,0>, bt::pinned_allocator>;
41  /** A CPU cube is fully allocated on the CPU, except for the store of variables `cube.store`, which is allocated in pinned memory (`bt::pinned_allocator`).
42   * Indeed, when exchanging information between a GPU cube and a CPU cube, only the store of variables needs to be transferred.
43 */
44 cube_type cube;
45
46 /** We keep a snapshot of the root to reinitialize the CPU cube after each subproblem has been solved. */
47 typename cube_type::IST::snapshot_type<bt::standard_allocator> root_snapshot;
48
49  /** This flag becomes `true` when the thread has finished its execution (it exits the `dive_and_solve` function). */
50 std::atomic_flag finished;
51
52  /** This is the path to the subproblem that this cube must solve.
53   * This member is initialized in the `CPUData` constructor.
54 */
55 size_t subproblem_idx;
56
57 CPUCube(const CP<Itv>& root)
58 : cube(root)
59 , root_snapshot(cube.search_tree->template snapshot<bt::standard_allocator>())
60 {}
61};
62
63/** A GPU cube is the data used by a GPU block to solve a subproblem.
64 * We only allocate the necessary structures to perform propagation: the store of variables and the propagators.
65*/
66struct GPUCube {
67 /** We use atomic to store the interval's lower and upper bounds. */
68 // using Itv1 = Interval<ZLB<int, bt::atomic_memory_block>>;
69 using Itv1 = Interval<ZLB<int, bt::local_memory>>;
70
71  /** We use a `pool_allocator`; this makes it easy to switch between global memory and shared memory when the store of variables can fit inside. */
72 using IStore = VStore<Itv1, bt::pool_allocator>;
73
74  /** The store of propagators also uses a `pool_allocator` of global memory. This was necessary due to the slow copy of propagators between the CPU and the GPU.
75   * Indeed, a propagator is a tree-shaped object (like an AST) that contains many pointers, and thus the copy calls the allocation function many times. */
76 using IPC = PC<IStore, bt::pool_allocator>;
77
78 /** The store of variables is only accessible on GPU. */
79 abstract_ptr<IStore> store_gpu;
80
81  /** The indexes of the propagators that are still unknown (not entailed).
82   * The unknown propagators are stored at positions 0 to pidx.size() - 1. */
83 bt::vector<int, bt::pool_allocator> pidx;
84
85 /** A mask to know which propagators are still unknown.
86 * We have `pmask[i] <=> !ipc.ask(pidx[i])`.
87 */
88 bt::vector<bool, bt::pool_allocator> pmask;
89
90 /** A temporary array to compute the prefix sum of `pmask`, in order to copy pidx into pidx2. */
91 bt::vector<int, bt::pool_allocator> psum;
92
93 /** A temporary array when copying the new active propagators. */
94 bt::vector<int, bt::pool_allocator> pidx2;
95
96  /** The propagators are only accessible on the GPU, and the array of propagators is shared among all blocks.
97   * Since the propagators are stateless, we avoid duplicating them in each block.
98 */
99 abstract_ptr<IPC> ipc_gpu;
100
101  /** We also have a store of variables in pinned memory to communicate the results of the propagation to the CPU.
102   * Note that this store is the same as the one in the corresponding CPU cube (`cube.store`).
103 * This member is initialized in `CPUData` constructor.
104 */
105 abstract_ptr<VStore<Itv, bt::pinned_allocator>> store_cpu;
106
107  /** This Boolean is used to communicate to the CPU that the current node is a solution. */
108 cuda::std::atomic_flag solution_found;
109
110 /** The cumulative number of iterations required to reach a fixpoint.
111 * By dividing this statistic by the number of nodes, we get the average number of iterations per node.
112 */
113 size_t fp_iterations;
114
115 /** The CPU thread and the GPU block use those two flags to signal to each other when to work and when to wait.
116 * This is necessary due to the persistent kernel design of this algorithm.
117 */
118 cuda::std::atomic_flag ready_to_propagate;
119 cuda::std::atomic_flag ready_to_search;
120
121 /** A flag to notify the kernel it must stop. */
122 cuda::std::atomic_flag stop;
123
124 GPUCube() {
125 /** Initially, we are not ready to propagate or to search. */
126 ready_to_search.clear();
127 ready_to_propagate.clear();
128 stop.clear();
129 solution_found.clear();
130 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
131 }
132
133  /** Initialize the store of variables and the propagators from an existing store and existing propagators.
134   * If `pc_shared` is `true`, the propagators of this cube will be shared with `pc`.
135   * Otherwise, a full copy of the propagators is made.
136   * We generally want to share the propagators, hence cube 0 copies the propagators and all other cubes share them.
137 */
138 template <class StoreType, class PCType>
139 __device__ void allocate(StoreType& store, PCType& pc, size_t bytes, bool pc_shared) {
140 int n = pc.num_deductions();
141 // We round n to the next multiple of BLOCK_SIZE (the maximum dimension of the block, for now).
142 n = n + ((BLOCK_SIZE - n % BLOCK_SIZE) % BLOCK_SIZE);
143 bytes += 100 + 4*sizeof(int)*n; // for the structures to eliminate propagators (and alignment padding).
144 void* mem_pool = bt::global_allocator{}.allocate(bytes);
145 bt::pool_allocator pool(static_cast<unsigned char*>(mem_pool), bytes);
146 AbstractDeps<bt::global_allocator, bt::pool_allocator> deps(pc_shared, bt::global_allocator{}, pool);
147 ipc_gpu = bt::allocate_shared<IPC, bt::pool_allocator>(pool, pc, deps);
148 store_gpu = deps.template extract<IStore>(store.aty());
149 pidx = bt::vector<int, bt::pool_allocator>(pc.num_deductions(), pool);
150 for(int i = 0; i < pidx.size(); ++i) {
151 pidx[i] = i;
152 }
153 pmask = bt::vector<bool, bt::pool_allocator>(n, false, pool);
154 psum = bt::vector<int, bt::pool_allocator>(n, 0, pool);
155 pidx2 = bt::vector<int, bt::pool_allocator>(pidx, pool);
156 }
157
158 __device__ void deallocate() {
159 // NOTE: .reset() does not work because it does not reset the allocator, which is itself allocated in global memory.
160 store_gpu = abstract_ptr<IStore>();
161 ipc_gpu = abstract_ptr<IPC>();
162 pidx = bt::vector<int, bt::pool_allocator>();
163 pmask = bt::vector<bool, bt::pool_allocator>();
164 psum = bt::vector<int, bt::pool_allocator>();
165 pidx2 = bt::vector<int, bt::pool_allocator>();
166 }
167};
168
169/** This kernel initializes the GPU cubes. See the constructor of `CPUData` for more information. */
170template <class Store, class IPC>
171__global__ void allocate_gpu_cubes(GPUCube* gpu_cubes,
172 size_t n, Store* store, IPC* ipc)
173{
174 assert(threadIdx.x == 0 && blockIdx.x == 0);
175 size_t bytes = store->get_allocator().total_bytes_allocated()
176 + sizeof(GPUCube::IStore) + sizeof(GPUCube::IPC) + 1000;
177 gpu_cubes[0].allocate(*store, *ipc, bytes + ipc->get_allocator().total_bytes_allocated(), false);
178 for(int i = 1; i < n; ++i) {
179 gpu_cubes[i].allocate(*gpu_cubes[0].store_gpu, *gpu_cubes[0].ipc_gpu, bytes, true);
180 }
181}
182
183__global__ void deallocate_gpu_cubes(GPUCube* gpu_cubes, size_t n) {
184 assert(threadIdx.x == 0 && blockIdx.x == 0);
185 for(int i = 0; i < n; ++i) {
186 gpu_cubes[i].deallocate();
187 }
188}
189
190/** This is the data shared among all CPU threads. */
191struct CPUData {
192 /** We generate the subproblems lazily.
193  * Suppose we generate `2^3` subproblems: we represent the first subproblem as `000`, the second as `001`, the third as `010`, and so on.
194 * A `0` means to turn left in the search tree, and a `1` means to turn right.
195 * Incrementing this integer will generate the path of the next subproblem.
196 */
197 ZLB<size_t, bt::atomic_memory<>> next_subproblem;
198
199 /** This is the best bound found so far, globally, across all threads. */
200 Itv best_bound;
201
202 /** A flag to stop the threads, for example because of a timeout or CTRL-C. */
203 std::atomic_flag cpu_stop;
204
205 /** Due to multithreading, we must protect `stdout` when printing.
206  * The model of computation in this work is lock-free, but a lock seems unavoidable for printing.
207 */
208 std::mutex print_lock;
209
210 /** The initial problem is only accessible from the CPU.
211  * It is used at the beginning to count the bytes used by the store and the propagators,
212  * and at the end to merge all the statistics and solutions from all threads and print them.
213 */
214 CP<Itv> root;
215
216 /** For each block, how much shared memory are we using? */
217 size_t shared_mem_bytes;
218
219 /** Each CPU thread has its own local state to solve a subproblem, called a cube. */
220 bt::vector<CPUCube> cpu_cubes;
221
222 /** Each GPU block has its own local state to solve a subproblem, called a cube. */
223 bt::vector<GPUCube, bt::managed_allocator> gpu_cubes;
224
225 /** We create as many cubes as CPU threads and GPU blocks (option `config.or_nodes`).
226 * All CPU cubes are initialized to different subproblems, from 0 to `config.or_nodes - 1`.
227 * Hence the next subproblem to solve is `config.or_nodes`.
228  * The GPU cubes are initialized to the same state as the CPU cubes.
229 * Further, we connect the CPU and GPU cubes by sharing their store of variables (`gpu_cubes[i].store_cpu` and `cpu_cubes[i].cube.store`).
230 * Also, we share the propagators of `gpu_cubes[0].ipc_gpu` with all other cubes `gpu_cubes[i].ipc_gpu` (with i >= 1).
231 */
232 CPUData(const CP<Itv>& root, size_t shared_mem_bytes)
233 : next_subproblem(root.config.or_nodes)
234 , best_bound(Itv::top())
235 , root(root)
236 , shared_mem_bytes(shared_mem_bytes)
237 , cpu_cubes(root.config.or_nodes, this->root)
238 , gpu_cubes(root.config.or_nodes)
239 {
240 cpu_stop.clear();
241 for(int i = 0; i < root.config.or_nodes; ++i) {
242 cpu_cubes[i].subproblem_idx = i;
243 gpu_cubes[i].store_cpu = cpu_cubes[i].cube.store;
244 }
245 /** This is a temporary object to initialize the first cube with the store and propagators. */
246 AbstractDomains<Itv, bt::standard_allocator,
247 bt::statistics_allocator<UniqueLightAlloc<bt::managed_allocator, 0>>,
248 bt::statistics_allocator<UniqueLightAlloc<bt::managed_allocator, 1>>>
249 managed_cp(root);
250 printf("%%%%%%mzn-stat: store_mem=%" PRIu64 "\n", managed_cp.store.get_allocator().total_bytes_allocated());
251 printf("%%%%%%mzn-stat: propagator_mem=%" PRIu64 "\n", managed_cp.ipc.get_allocator().total_bytes_allocated());
252 allocate_gpu_cubes<<<1, 1>>>(gpu_cubes.data(), gpu_cubes.size(), managed_cp.store.get(), managed_cp.ipc.get());
253 CUDAEX(cudaDeviceSynchronize());
254 }
255
256 ~CPUData() {
257 deallocate_gpu_cubes<<<1, 1>>>(gpu_cubes.data(), gpu_cubes.size());
258 CUDAEX(cudaDeviceSynchronize());
259 }
260
261 CPUData() = delete;
262 CPUData(const CPUData&) = delete;
263 CPUData(CPUData&&) = delete;
264};
265
266void dive_and_solve(CPUData& global, size_t cube_idx);
267size_t dive(CPUData& global, size_t cube_idx);
268void solve(CPUData& global, size_t cube_idx);
269bool propagate(CPUData& global, size_t cube_idx);
270bool update_global_best_bound(CPUData& global, size_t cube_idx);
271void update_local_best_bound(CPUData& global, size_t cube_idx);
272void reduce_cubes(CPUData& global);
273size_t configure_gpu(CP<Itv>& cp);
274__global__ void gpu_propagate(GPUCube* cube, size_t shared_bytes);
275
276#endif // __CUDACC__
277
278/** This is the point of entry: we preprocess the problem, create the threads solving the problem, wait for their completion or an interruption, and merge and print the statistics. */
279void hybrid_dive_and_solve(const Configuration<battery::standard_allocator>& config)
280{
281#ifndef __CUDACC__
282 std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
283#else
284 auto start = std::chrono::high_resolution_clock::now();
285 /** We start with some preprocessing to reduce the number of variables and constraints. */
286 CP<Itv> cp(config);
287 cp.preprocess();
288 size_t shared_mem_bytes = configure_gpu(cp);
289
290  /** Block the CTRL-C signal so that we can notify the threads when we must exit. */
291  block_signal_ctrlc();
292
293 /** This is the main data structure, we create all the data for each thread and GPU block. */
294 CPUData global(cp, shared_mem_bytes);
295
296 /** Start the algorithm in parallel with as many CPU threads as GPU blocks. */
297 std::vector<std::thread> threads;
298 for(int i = 0; i < global.root.config.or_nodes; ++i) {
299 threads.push_back(std::thread(dive_and_solve, std::ref(global), i));
300 }
301
302 /** We start the persistent kernel, that will perform the propagation. */
303 gpu_propagate<<<
304 static_cast<unsigned int>(global.root.config.or_nodes),
305 static_cast<unsigned int>(global.root.config.and_nodes),
306 global.shared_mem_bytes>>>
307 (global.gpu_cubes.data(), global.shared_mem_bytes);
308
309 /** We wait that either the solving is interrupted, or that all threads have finished. */
310 size_t terminated = 0;
311 while(terminated < threads.size()) {
312 if(must_quit() || !check_timeout(global.root, start)) {
313 global.cpu_stop.test_and_set();
314 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
315 break;
316 }
317 std::this_thread::sleep_for(std::chrono::milliseconds(100));
318 terminated = 0;
319 for(int i = 0; i < global.cpu_cubes.size(); ++i) {
320 if(global.cpu_cubes[i].finished.test()) {
321 ++terminated;
322 }
323 }
324 }
325 for(auto& t : threads) {
326 t.join();
327 }
328 CUDAEX(cudaDeviceSynchronize());
329 /** We reduce all the statistics of all threads. */
330 reduce_cubes(global);
331 global.root.print_final_solution();
332 global.root.print_mzn_statistics();
333#endif
334}
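As a usage sketch of this entry point (hypothetical: it assumes `Configuration` can be default-constructed and its fields set directly, which is not shown in this file; in Turbo the configuration normally comes from command-line parsing), a host program compiled with a CUDA compiler would do roughly:

#include "hybrid_dive_and_solve.hpp"

int main() {
  // Hypothetical setup; how `Configuration` is actually built is defined in config.hpp.
  Configuration<battery::standard_allocator> config;
  config.or_nodes = 0;   // 0 lets `configure_gpu` choose the number of blocks...
  config.and_nodes = 0;  // ...and the number of threads per block.
  hybrid_dive_and_solve(config);  // Preprocess, solve in parallel, print solution and statistics.
  return 0;
}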
335
336#ifdef __CUDACC__
337
338/** This is the main algorithm.
339 * Each CPU thread will run one instance of this algorithm with a different `cube_idx`.
340 * It interleaves two steps until all subproblems have been solved or we reach another stopping condition (timeout, CTRL-C, pruning conditions):
341 * 1) Dive (function `dive`): Given a root node, follow a path in the search tree to reach a subproblem.
342 * 2) Solve (function `solve`): Solve the subproblem using the propagate and search algorithm.
343*/
344void dive_and_solve(CPUData& global, size_t cube_idx)
345{
346 size_t num_subproblems = global.root.stats.eps_num_subproblems;
347 size_t& subproblem_idx = global.cpu_cubes[cube_idx].subproblem_idx;
348 auto& cube = global.cpu_cubes[cube_idx].cube;
349 /** In each iteration, we will solve one subproblem obtained after a diving phase. */
350 while(subproblem_idx < num_subproblems && !global.cpu_stop.test()) {
351 if(global.root.config.verbose_solving) {
352 std::lock_guard<std::mutex> print_guard(global.print_lock);
353      printf("%% Cube %zu solves subproblem num %zu\n", cube_idx, subproblem_idx);
354 }
355 /** The first step is to "dive" by committing to a search path. */
356 size_t remaining_depth = dive(global, cube_idx);
357 /** If we reached the subproblem without reaching a leaf node, we start the solving phase. */
358 if(remaining_depth == 0) {
359 solve(global, cube_idx);
360 /** If we didn't stop solving because of an external interruption, we increase the number of subproblems solved. */
361 if(!global.cpu_stop.test()) {
362 cube.stats.eps_solved_subproblems += 1;
363 }
364 }
365    /** We reached a leaf node before the end of the dive; it means a whole subtree should be skipped. */
366 else if(!global.cpu_stop.test()) {
367      /** To skip all the paths of the subtree obtained, we perform bitwise operations.
368       * Suppose the current path is "00" (turn left two times), and the following search tree:
369       *            *             depth = 0
370       *          /   \
371       *         0     1          depth = 1
372       *        / \   / \
373       *      00   01 10   11     depth = 2
374       *
375       * If we detect a leaf node at depth 1, after only one left turn, we must skip the rest of the subtree, in particular to avoid exploring the path "01".
376       * To achieve that, we take the current path "00", shift it to the right by 1 (essentially erasing the part of the path that has not been explored), increment it to move to the next subtree at the same depth, and shift it back to the left to reach the first subproblem of that subtree (a small worked example follows this function).
377       * Cool huh?
378       */
379 size_t next_subproblem_idx = ((subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
380 global.next_subproblem.meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));
381 /** It is possible that other threads skip similar subtrees.
382      * Hence, we only count the subproblems skipped by the thread solving the leftmost subproblem. */
383 if((subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
384 cube.stats.eps_skipped_subproblems += next_subproblem_idx - subproblem_idx;
385 }
386 }
387 /** We prepare the cube to solve the next problem.
388 * We restore the search tree to the root, and reset the EPS search strategy.
389 * We also update the subproblem index to the next subproblem to solve. */
390 if(!global.cpu_stop.test()) {
391 /** To avoid that several cubes solve the same subproblem, we use an atomic post-increment. */
392 subproblem_idx = global.next_subproblem.atomic()++;
393 /** The following commented code is completely valid and does not use atomic post-increment.
394 * But honestly, we kinda need more performance so... let's avoid reexploring subproblems. */
395      // subproblem_idx = global.next_subproblem.value();
396      // global.next_subproblem.meet(ZLB<size_t, bt::local_memory>(subproblem_idx + size_t{1}));
397 if(subproblem_idx < num_subproblems) {
398 cube.search_tree->restore(global.cpu_cubes[cube_idx].root_snapshot);
399 cube.eps_split->reset();
400 }
401 }
402 }
403 /** If we did not get interrupted, this thread has explored all available subproblems.
404   * We record this information in `num_blocks_done` simply because it helps to detect unbalanced workloads (many threads have finished while others have not).
405 */
406 if(!global.cpu_stop.test()) {
407 cube.stats.num_blocks_done = 1;
408 }
409 /** We signal to the GPU kernel that this block must terminate.
410   * The GPU block is necessarily waiting on `ready_to_propagate`, hence by setting `stop` first and issuing a memory fence, we ensure the GPU block will see that it must stop.
411 */
412 global.gpu_cubes[cube_idx].stop.test_and_set();
413 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
414 global.gpu_cubes[cube_idx].ready_to_propagate.test_and_set(cuda::std::memory_order_seq_cst);
415 global.gpu_cubes[cube_idx].ready_to_propagate.notify_one();
416
417 /** We signal to the main thread that we have finished our work. */
418 global.cpu_cubes[cube_idx].finished.test_and_set();
419}
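As promised above, here is a standalone worked example (independent of the solver's types) of the subtree-skipping arithmetic for the path "00" with a leaf detected at `remaining_depth = 1`:

#include <cassert>
#include <cstddef>

int main() {
  size_t subproblem_idx = 0b00;   // current path "00"
  size_t remaining_depth = 1;     // a leaf node was found after a single left turn
  // Erase the unexplored suffix, move to the next subtree at that depth,
  // then shift back to its first (leftmost) subproblem.
  size_t next_subproblem_idx = ((subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
  assert(next_subproblem_idx == 0b10);  // "01" is skipped; the next dive starts at "10".
  // This thread solves the leftmost subproblem of the skipped subtree, so it accounts for it:
  assert((subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0});
  assert(next_subproblem_idx - subproblem_idx == 2);  // two subproblems ("00" and "01") are counted.
  return 0;
}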
420
421/** Given a root problem, we follow a predefined path in the search tree to reach a subproblem.
422 * This is called the "dive" operation.
423 * The path is given by `subproblem_idx` in the CPU cube (see `CPUData::next_subproblem` for more info).
424 * For all dives, the initial problem must be the same and the EPS search strategy must be static (always splitting the tree in the same way).
425 * Therefore, don't be tempted to add the objective to the initial problem because it might lead to ignoring some subproblems since the splitting decisions will differ.
426 *
427 * \return The remaining depth of the path if we reach a leaf node before the end of the path (0 if the whole path was followed).
428 */
429size_t dive(CPUData& global, size_t cube_idx) {
430 auto& cube = global.cpu_cubes[cube_idx].cube;
431 bool stop_diving = false;
432 size_t remaining_depth = cube.config.subproblems_power;
433 /** The number of iterations depends on the length of the diving path. */
434 while(remaining_depth > 0 && !stop_diving && !global.cpu_stop.test()) {
435 remaining_depth--;
436 bool is_leaf_node = propagate(global, cube_idx);
437 /** If we reach a leaf node before the end of the path, we stop and the remaining depth is reported to the caller. */
438 if(is_leaf_node) {
439 stop_diving = true;
440 }
441 else {
442 /** We create two branches according to the EPS search strategy. */
443 auto branches = cube.eps_split->split();
444 assert(branches.size() == 2);
445 /** We commit to one of the branches depending on the current value on the path.
446       * Suppose the depth is 3, the path is "010", and we are currently at `remaining_depth = 1`.
447       * We must extract the middle bit "1", which we do with standard bitwise manipulation (a small worked example follows this function).
448       * A `branch_idx` of 0 means taking the left branch, and 1 means taking the right branch.
449 */
450 size_t branch_idx = (global.cpu_cubes[cube_idx].subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
451 /** We immediately commit to the branch.
452 * It has the effect of reducing the domain of the variables in `cube.store` (and `gpu_cube.store_cpu` since they are aliased).
453 */
454 cube.ipc->deduce(branches[branch_idx]);
455 }
456 }
457 return remaining_depth;
458}
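And here is the small worked example announced in the comment above: a plain C++ sketch (outside the solver) that recovers the branching decisions of the path "010" for a dive of depth 3.

#include <cstddef>
#include <cstdio>

int main() {
  const size_t subproblem_idx = 0b010;  // the path "010", read from the most significant bit
  size_t remaining_depth = 3;           // `config.subproblems_power` in the solver
  while(remaining_depth > 0) {
    remaining_depth--;
    // Extract the bit at position `remaining_depth`: 0 = left branch, 1 = right branch.
    size_t branch_idx = (subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
    std::printf("remaining_depth=%zu -> take the %s branch\n",
                remaining_depth, branch_idx == 0 ? "left" : "right");
  }
  return 0;  // Prints: left (bit 2), right (bit 1), left (bit 0), i.e. the path "010".
}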
459
460/** We solve a cube using the propagate and search algorithm.
461 * We explore the whole search tree of the cube, only stopping earlier due to pruning conditions or external stop signal (e.g. `global.cpu_stop`).
462 */
463void solve(CPUData& global, size_t cube_idx) {
464 auto& cpu_cube = global.cpu_cubes[cube_idx].cube;
465 bool has_changed = true;
466 while(has_changed && !global.cpu_stop.test()) {
467 /** Before propagating, we update the local bound with the best known global bound. */
468 update_local_best_bound(global, cube_idx);
469 /** We propagate on GPU, and manage the leaf nodes (solution and failed nodes). */
470 propagate(global, cube_idx);
471 /** We pop a new node from the search tree, ready to explore.
472 * If the search tree becomes empty, `has_changed` will be `false`.
473 */
474 has_changed = cpu_cube.search_tree->deduce();
475 }
476}
477
478/** Propagate the cube `cube_idx` on the GPU.
479 * Check if we reached a leaf node (solution or failed) on the CPU.
480 * Branching on unknown nodes is a task left to the caller.
481 *
482 * \return `true` if we reached a leaf node.
483 */
484bool propagate(CPUData& global, size_t cube_idx) {
485 auto& cpu_cube = global.cpu_cubes[cube_idx].cube;
486 auto& gpu_cube = global.gpu_cubes[cube_idx];
487 bool is_leaf_node = false;
488
489 gpu_cube.store_cpu->prefetch(0);
490
491 /** We signal to the GPU that it can propagate the current node.
492 * Thereafter, we immediately wait for the GPU to finish the propagation before performing the search step.
493 */
494 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
495 gpu_cube.ready_to_propagate.test_and_set(cuda::std::memory_order_seq_cst);
496 gpu_cube.ready_to_propagate.notify_one();
497 gpu_cube.ready_to_search.wait(false, cuda::std::memory_order_seq_cst);
498 gpu_cube.ready_to_search.clear();
499
500 gpu_cube.store_cpu->prefetch(cudaCpuDeviceId);
501
502 /** `on_node` updates the statistics and verifies whether we should stop (e.g. option `--cutnodes`). */
503 bool is_pruned = cpu_cube.on_node();
504 /** If the abstract domain is `bottom`, we reached a leaf node where the problem is unsatisfiable. */
505 if(cpu_cube.ipc->is_bot()) {
506 is_leaf_node = true;
507 cpu_cube.on_failed_node();
508 }
509  /** When the problem is "extractable", all variables are assigned to a single value,
510   * which means that we have reached a solution.
511 */
512 else if(gpu_cube.solution_found.test()) {
513 is_leaf_node = true;
514 gpu_cube.solution_found.clear();
515 /** We save the new best solution found.
516     * The "branch-and-bound" (bab) abstract domain has a local store of variables to store the best solution.
517     * It adds a new bound constraint to the root of the search tree, such that, on backtracking, the best bound is enforced.
518 */
519 cpu_cube.bab->deduce();
520 bool print_solution = cpu_cube.is_printing_intermediate_sol();
521 if(cpu_cube.bab->is_optimization()) {
522 /** We share the new best bound with the other cubes. */
523 print_solution &= update_global_best_bound(global, cube_idx);
524 }
525 /** If we print all intermediate solutions, and really found a better bound (no other thread found a better one meanwhile), we print the current solution. */
526 if(print_solution) {
527 cpu_cube.print_solution();
528 }
529 /** We update the statistics, and check if we must terminate (e.g. we stop after N solutions). */
530 is_pruned |= cpu_cube.update_solution_stats();
531 }
532 if(is_pruned) {
533 /** We notify all threads that we must stop. */
534 global.cpu_stop.test_and_set();
535 }
536 return is_leaf_node;
537}
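The `ready_to_propagate` / `ready_to_search` handshake used in `propagate` and in `gpu_propagate` below can be illustrated by a minimal host-only sketch with C++20 `std::atomic_flag` (the real code uses `cuda::std::atomic_flag`, so the same protocol crosses the CPU/GPU boundary); the work performed on each side is a placeholder:

#include <atomic>
#include <cstdio>
#include <thread>

std::atomic_flag ready_to_propagate;  // default-initialized to clear (C++20)
std::atomic_flag ready_to_search;
std::atomic_flag stop;

// Plays the role of the GPU block: waits for a node, "propagates" it, hands it back.
void propagator() {
  while(true) {
    ready_to_propagate.wait(false);     // Sleep until the CPU publishes a node.
    ready_to_propagate.clear();
    if(stop.test()) { break; }          // Same shutdown check as in `gpu_propagate`.
    std::puts("propagating one node");  // Placeholder for the fixpoint computation.
    ready_to_search.test_and_set();
    ready_to_search.notify_one();       // Wake up the CPU side.
  }
}

int main() {
  std::thread gpu_side(propagator);
  for(int node = 0; node < 3; ++node) { // Plays the role of the CPU search loop.
    ready_to_propagate.test_and_set();
    ready_to_propagate.notify_one();    // Hand the node to the propagator.
    ready_to_search.wait(false);        // Wait until propagation is done.
    ready_to_search.clear();
    std::printf("node %d propagated, now branching\n", node);
  }
  stop.test_and_set();                  // Same shutdown sequence as `dive_and_solve`.
  ready_to_propagate.test_and_set();
  ready_to_propagate.notify_one();
  gpu_side.join();
  return 0;
}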
538
539/** Each block of this kernel executes the propagation loop on the GPU until a fixpoint is reached.
540 * 1) Transfer the store of variables from the CPU to the GPU.
541 * 2) Execute the fixpoint engine.
542 * 3) Transfer the store of variables from the GPU to the CPU.
543 *
544 * These three steps are repeated until `cpu_stop` becomes `true`.
545 * Each block is continuously processing a stream of nodes coming from the CPU.
546 *
547 * The size of `gpu_cubes` must be equal to the number of blocks.
548 */
549__global__ void gpu_propagate(GPUCube* gpu_cubes, size_t shared_bytes) {
550 extern __shared__ unsigned char shared_mem[];
551 GPUCube& cube = gpu_cubes[blockIdx.x];
552
553 /** We start by initializing the structures in shared memory (fixpoint loop engine, store of variables). */
554 __shared__ BlockAsynchronousIterationGPU fp_engine;
555  /** This shared variable is necessary to avoid multiple threads reading `cube.stop.test()` themselves,
556   * potentially observing different values and leading to a deadlock. */
557 __shared__ bool stop;
558  // If we booked more than the default shared memory, it means we allocate the store in shared memory.
559 if(threadIdx.x == 0 && shared_bytes > DEFAULT_SHARED_MEM_BYTES) {
560 bt::pool_allocator shared_mem_pool(shared_mem, shared_bytes);
561 cube.store_gpu->reset_data(shared_mem_pool);
562 }
563
564 auto group = cooperative_groups::this_thread_block();
565 group.sync();
566
567 using BlockScan = cub::BlockScan<int, BLOCK_SIZE>;
568 assert(BLOCK_SIZE == blockDim.x);
569 __shared__ typename BlockScan::TempStorage cub_prefixsum_tmp;
570
571 while(true) {
572    /** We wait until the CPU notifies us that the store is ready to be copied and propagated. */
573 if(threadIdx.x == 0) {
574 cube.ready_to_propagate.wait(false, cuda::std::memory_order_seq_cst);
575 cube.ready_to_propagate.clear();
576      // NOTE: Only one thread should read the atomic `cube.stop`, to avoid deadlock if one thread reads `true` and exits, while another thread reads `false`.
577 stop = cube.stop.test();
578 }
579 group.sync();
580 if(stop) {
581 break;
582 }
583 /** We copy the CPU store into the GPU memory. */
584 cube.store_cpu->copy_to(group, *cube.store_gpu);
585 group.sync();
586 group.sync();
587 /** This is the main propagation algorithm: the current node is propagated in parallel. */
588 size_t fp_iterations = fp_engine.fixpoint(cube.pidx, *(cube.ipc_gpu));
589 cube.store_gpu->copy_to(group, *cube.store_cpu);
590    // if(threadIdx.x == 0) {
591    //   printf("GPU VStore (after propagation): ");
592    //   cube.store_gpu->print();
593    // }
594 if(threadIdx.x == 0) {
595 cube.fp_iterations += fp_iterations;
596 }
597 /** We eliminate propagators that are entailed, shrinking the `pidx` array. */
598 bool is_leaf_node = cube.store_gpu->is_bot();
599 int n = cube.pidx.size() + ((blockDim.x - cube.pidx.size() % blockDim.x) % blockDim.x);
600 if(!is_leaf_node) {
601      // pidx:  0 1 2 3 (indexes of the propagators)
602      // pmask: 1 0 0 1 (1 if the propagator is still unknown, 0 if it is entailed and filtered out)
603      // psum:  1 1 1 2 (inclusive prefix sum of pmask)
604      // pidx2: 0 3     (new indexes of the propagators after compaction)
605 for(int i = threadIdx.x; i < cube.pidx.size(); i += blockDim.x) {
606 cube.pmask[i] = !cube.ipc_gpu->ask(cube.pidx[i]);
607 }
608 for(int i = threadIdx.x; i < n; i += blockDim.x) {
609 BlockScan(cub_prefixsum_tmp).InclusiveSum(cube.pmask[i], cube.psum[i]);
610 group.sync(); // required by BlockScan to reuse the temporary storage.
611 }
612 for(int i = blockDim.x + threadIdx.x; i < n; i += blockDim.x) {
613 cube.psum[i] += cube.psum[i - threadIdx.x - 1];
614 group.sync();
615 }
616 if(cube.psum[cube.pidx.size()-1] == 0) {
617 is_leaf_node = cube.store_gpu->template is_extractable<AtomicExtraction>(group);
618 if(threadIdx.x == 0 && is_leaf_node) {
619 cube.solution_found.test_and_set();
620 }
621 }
622 }
623 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
624 group.sync();
625 /** We notify to the CPU that we have propagated the current node. */
626 if(threadIdx.x == 0) {
627 cube.ready_to_search.test_and_set(cuda::std::memory_order_seq_cst);
628 cube.ready_to_search.notify_one();
629 }
630    // Backtrack detected so we reinitialize pidx.
631 if(is_leaf_node) {
632 if(threadIdx.x == 0) {
633 cube.pidx.resize(cube.ipc_gpu->num_deductions());
634 cube.pidx2.resize(cube.ipc_gpu->num_deductions());
635 }
636 group.sync();
637 for(int i = threadIdx.x; i < cube.pidx.size(); i += blockDim.x) {
638 cube.pidx[i] = i;
639 }
640 }
641    // We compute the shrunk pidx.
642 else {
643 if(threadIdx.x == 0) {
644 battery::swap(cube.pidx, cube.pidx2);
645 cube.pidx.resize(cube.psum[cube.pidx2.size()-1]);
646 }
647 group.sync();
648 for(int i = threadIdx.x; i < cube.pidx2.size(); i += blockDim.x) {
649 if(cube.pmask[i]) {
650 cube.pidx[cube.psum[i]-1] = cube.pidx2[i];
651 }
652 }
653 }
654 }
655}
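The propagator-elimination step of the kernel is a classic stream compaction by prefix sum. The following sequential sketch (standard C++, no CUB) mirrors the roles of `pidx`, `pmask`, `psum` and `pidx2` on the example from the comments above:

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> pidx = {0, 1, 2, 3};  // indexes of the currently active propagators
  std::vector<int> pmask = {1, 0, 0, 1}; // 1 = still unknown, 0 = entailed (to be removed)
  std::vector<int> psum(pidx.size());
  // Inclusive prefix sum: psum[i] is the number of kept propagators among pidx[0..i].
  std::inclusive_scan(pmask.begin(), pmask.end(), psum.begin());
  assert((psum == std::vector<int>{1, 1, 1, 2}));

  std::vector<int> pidx2 = pidx;         // old indexes (what the kernel keeps in `pidx2` after the swap)
  pidx.assign(psum.back(), 0);           // the compacted array has psum.back() == 2 entries
  for(size_t i = 0; i < pidx2.size(); ++i) {
    if(pmask[i]) {
      pidx[psum[i] - 1] = pidx2[i];      // scatter each kept propagator to its new position
    }
  }
  assert((pidx == std::vector<int>{0, 3}));
  return 0;
}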
656
657/** We update the bound found by the current cube so it is visible to all other cubes.
658 * Note that this operation might not always succeed, which is okay: the best bound is still saved locally in the cube and reduced at the end (in `reduce_cubes`).
659 * The worst that can happen is that a best bound is found twice, which does not prevent the correctness of the algorithm.
660 *
661 * \return `true` if the best bound has changed. Can return `false` if the best bound was updated by another thread meanwhile.
662 */
663bool update_global_best_bound(CPUData& global, size_t cube_idx) {
664 const auto& cube = global.cpu_cubes[cube_idx].cube;
665 assert(cube.bab->is_optimization());
666  // We retrieve the best bound found by the current cube.
667 auto local_best = cube.bab->optimum().project(cube.bab->objective_var());
668  // We update the global `best_bound`.
669 if(cube.bab->is_maximization()) {
670 return global.best_bound.meet_lb(dual<Itv::LB>(local_best.ub()));
671 }
672 else {
673 return global.best_bound.meet_ub(dual<Itv::UB>(local_best.lb()));
674 }
675}
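For readers unfamiliar with the lattice operations, the effect of `meet_ub` in a minimization problem is analogous to the following simplified sketch (plain C++ with `std::atomic`, not the actual lattice API used by Turbo): the update succeeds only if the candidate bound strictly improves the current global one.

#include <atomic>
#include <climits>

std::atomic<int> global_upper_bound{INT_MAX};  // best objective value found so far (minimization)

// Returns true if `candidate` improved the global bound, analogous to `meet_ub` returning `true`.
bool publish_best_bound(int candidate) {
  int current = global_upper_bound.load();
  while(candidate < current) {
    if(global_upper_bound.compare_exchange_weak(current, candidate)) {
      return true;   // We published a strictly better bound.
    }
    // `current` has been refreshed by compare_exchange_weak; retry while we are still better.
  }
  return false;      // Another thread already published an equal or better bound meanwhile.
}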
676
677/** This function essentially does the converse operation of `update_global_best_bound`.
678 * We directly update the store with the global best bound.
679 * This function should be called in each node, since the best bound is erased on backtracking (it is not included in the snapshot).
680 */
681void update_local_best_bound(CPUData& global, size_t cube_idx) {
682 if(global.cpu_cubes[cube_idx].cube.bab->is_optimization()) {
683 auto& cube = global.cpu_cubes[cube_idx].cube;
684 VarEnv<bt::standard_allocator> empty_env{};
685 auto best_formula = cube.bab->template deinterpret_best_bound<bt::standard_allocator>(
686 cube.bab->is_maximization()
687 ? Itv(dual<Itv::UB>(global.best_bound.lb()))
688 : Itv(dual<Itv::LB>(global.best_bound.ub())));
689 IDiagnostics diagnostics;
690 bool r = interpret_and_tell(best_formula, empty_env, *cube.store, diagnostics);
691 assert(r);
692 }
693}
694
695/** After solving, we merge all the statistics and best solutions from all cubes together, before printing them. */
696void reduce_cubes(CPUData& global) {
697 for(int i = 0; i < global.cpu_cubes.size(); ++i) {
698 /** `meet` is the merge operation. */
699 global.root.meet(global.cpu_cubes[i].cube);
700 global.root.stats.fixpoint_iterations += global.gpu_cubes[i].fp_iterations;
701 }
702}
703
704/** We configure the GPU according to the user configuration:
705 * 1) Decide the size of the shared memory and return it.
706 * 2) Increase the stack size if needed.
707 * 3) Increase the global memory allocation (we set the limit to around 90% of the global memory).
708 * 4) Guess the "best" number of threads per block and the number of blocks per SM, if not provided.
709 */
710size_t configure_gpu(CP<Itv>& cp) {
711 auto& config = cp.config;
712 /** Configure the shared memory size. */
713 size_t alignment_overhead = 200;
714 size_t shared_mem_bytes = DEFAULT_SHARED_MEM_BYTES + alignment_overhead + (cp.store->vars() * sizeof(GPUCube::Itv1));
715 cudaDeviceProp deviceProp;
716 cudaGetDeviceProperties(&deviceProp, 0);
717 if(shared_mem_bytes >= deviceProp.sharedMemPerBlock || config.only_global_memory) {
718 shared_mem_bytes = DEFAULT_SHARED_MEM_BYTES;
719 printf("%%%%%%mzn-stat: memory_configuration=\"global\"\n");
720 }
721 else {
722 printf("%%%%%%mzn-stat: memory_configuration=\"store_shared\"\n");
723 }
724 printf("%%%%%%mzn-stat: shared_mem=%" PRIu64 "\n", shared_mem_bytes);
725
726 int hint_num_blocks;
727 int hint_num_threads;
728 CUDAE(cudaOccupancyMaxPotentialBlockSize(&hint_num_blocks, &hint_num_threads, (void*) gpu_propagate, shared_mem_bytes));
729 size_t total_global_mem = deviceProp.totalGlobalMem;
730 size_t num_sm = deviceProp.multiProcessorCount;
731 config.and_nodes = (config.and_nodes == 0) ? hint_num_threads : config.and_nodes;
732 config.or_nodes = (config.or_nodes == 0) ? hint_num_blocks : config.or_nodes;
733 // The stack allocated depends on the maximum number of threads per SM, not on the actual number of threads per block.
734 size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * config.stack_kb * 1000;
735 size_t remaining_global_mem = total_global_mem - total_stack_size;
736 remaining_global_mem -= remaining_global_mem / 10; // We leave 10% of global memory free for CUDA allocations, not sure if it is useful though.
737 CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
738 CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem));
739 print_memory_statistics("stack_memory", total_stack_size);
740 print_memory_statistics("heap_memory", remaining_global_mem);
741 printf("%% and_nodes=%zu\n", config.and_nodes);
742 printf("%% or_nodes=%zu\n", config.or_nodes);
743 int num_blocks;
744 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, (void*) gpu_propagate, config.and_nodes, shared_mem_bytes);
745 printf("%% max_blocks_per_sm=%d\n", num_blocks);
746 return shared_mem_bytes;
747}
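As a worked example of the shared-memory decision above (hypothetical numbers; `sizeof(GPUCube::Itv1)` is assumed to be 8 bytes, i.e. two 32-bit bounds per variable): a problem with 1000 variables needs about 200 + 1000 * 8 = 8200 bytes, well under the usual 48 KB of static shared memory per block, so the store is placed in shared memory. A standalone check looks like this:

#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  const size_t num_vars = 1000;           // hypothetical number of variables after preprocessing
  const size_t bytes_per_var = 8;         // assumed sizeof(GPUCube::Itv1)
  const size_t alignment_overhead = 200;  // same constant as in `configure_gpu`
  size_t shared_mem_bytes = alignment_overhead + num_vars * bytes_per_var;

  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  if(shared_mem_bytes >= prop.sharedMemPerBlock) {
    std::printf("store kept in global memory (%zu bytes needed, %zu available per block)\n",
                shared_mem_bytes, prop.sharedMemPerBlock);
  } else {
    std::printf("store placed in shared memory (%zu of %zu bytes per block)\n",
                shared_mem_bytes, prop.sharedMemPerBlock);
  }
  return 0;
}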
748
749#endif // __CUDACC__
750#endif // TURBO_HYBRID_DIVE_AND_SOLVE_HPP
Interval< local::ZLB > Itv
Definition common_solving.hpp:589
bool must_quit()
Definition common_solving.hpp:89
void block_signal_ctrlc()
Definition common_solving.hpp:77
bool check_timeout(A &a, const Timepoint &start)
Definition common_solving.hpp:101
void hybrid_dive_and_solve(const Configuration< battery::standard_allocator > &config)
Definition hybrid_dive_and_solve.hpp:279
void print_memory_statistics(const char *key, size_t bytes)
Definition statistics.hpp:12
Definition common_solving.hpp:156
Configuration< BasicAllocator > config
Definition common_solving.hpp:283
abstract_ptr< IStore > store
Definition common_solving.hpp:268
void preprocess()
Definition common_solving.hpp:462
Definition config.hpp:29
size_t or_nodes
Definition config.hpp:41
Definition common_solving.hpp:136