Turbo Constraint Solver
barebones_dive_and_solve.hpp
1// Copyright 2025 Pierre Talbot
2
3#ifndef TURBO_BAREBONES_DIVE_AND_SOLVE_HPP
4#define TURBO_BAREBONES_DIVE_AND_SOLVE_HPP
5
6#include "common_solving.hpp"
7#include "memory_gpu.hpp"
8#include "lala/light_branch.hpp"
9#include <mutex>
10#include <thread>
11#include <chrono>
12
13/** This is required to estimate the global memory usage so that we can increase it accordingly. */
14#define MAX_SEARCH_DEPTH 10000
15
16namespace bt = ::battery;
17
18/**
19 * The full GPU version (`gpu_dive_and_solve`) does not compile on modern GPU hardware (SM >= 9) because the kernel is too large.
20 * We circumvented this issue with a hybrid version in which only propagation is executed on the GPU (`hybrid_dive_and_solve`).
21 * This has the disadvantage of CPU-GPU memory transfers and synchronization overhead.
22 * We propose a new "barebones" version which contains fewer abstractions than the GPU and hybrid versions, but has the same functionality.
23 * In particular, we directly implement the branch-and-bound algorithm here and avoid using `lala::BAB` and `lala::SearchTree` which are nice from a software engineering perspective but bring significant overhead.
24 * This version is intended to reach the best possible performance.
25 *
26 * Terminology:
27 * * unified data: data available to both the CPU and GPU.
28 * * block data: data used within a single block.
29 * * grid data: data shared among all blocks in the grid.
30 */
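// Illustrative sketch (not part of the original source) of the per-block control flow that
// `gpu_barebones_solve` implements below, in simplified pseudocode (names are illustrative):
//
//   while(subproblem_idx < 2^subproblems_power && !stop) {
//     restore_root();                       // copy UnifiedData::root into the block's store
//     dive(subproblem_idx);                 // follow the bit path encoding the subproblem
//     depth_first_branch_and_bound();       // propagate, split, backtrack using ropes
//     subproblem_idx = next_subproblem++;   // atomically grab the next subproblem
//   }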
31
32#ifdef __CUDACC__
33
34#include <cuda/std/chrono>
35#include <cuda/semaphore>
36
37#endif
38
39namespace barebones {
40
41#ifdef __CUDACC__
42#ifndef TURBO_IPC_ABSTRACT_DOMAIN
43
44/** `ConcurrentAllocator` allocates memory available both on the CPU and the GPU. For non-Linux systems such as Windows, pinned memory must be used (see PR #19). */
45#ifdef NO_CONCURRENT_MANAGED_MEMORY
46 using ConcurrentAllocator = bt::pinned_allocator;
47#else
48 using ConcurrentAllocator = bt::managed_allocator;
49#endif
50
51using GridCP = AbstractDomains<Itv,
52 bt::statistics_allocator<ConcurrentAllocator>,
53 bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 0>>,
54 bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 1>>>;
55
56/** Data shared between CPU and GPU. */
57struct UnifiedData {
58 /** The root node of the problem, useful to backtrack when solving a new subproblem.
59 * Also contains the shared information such as statistics and solver configuration.
60 */
61 GridCP root;
62
63 /** Stop signal from the CPU because of a timeout or CTRL-C. */
64 cuda::std::atomic_flag stop;
65
66 /** The memory configuration of each block. */
67 MemoryConfig mem_config;
68
69 UnifiedData(const CP<Itv>& cp, MemoryConfig mem_config)
70 : root(GridCP::tag_gpu_block_copy{}, false, cp)
71 , stop(false)
72 , mem_config(mem_config)
73 {
74 size_t num_subproblems = 1;
75 num_subproblems <<= root.config.subproblems_power;
76 root.stats.eps_num_subproblems = num_subproblems;
77 }
78};
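// For example (hypothetical configuration): with `subproblems_power == 12`, the constructor above
// sets `eps_num_subproblems = 1 << 12 = 4096`, i.e. 4096 subproblems to distribute among the blocks;
// the `eps_` prefix in the statistics presumably refers to this embarrassingly-parallel-search style
// of decomposition.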
79
80struct GridData;
81using IStore = VStore<Itv, bt::pool_allocator>;
82using IProp = PIR<IStore, bt::pool_allocator>;
83using UB = ZUB<typename Itv::LB::value_type, bt::atomic_memory_grid>;
84using strategies_type = bt::vector<StrategyType<bt::global_allocator>, bt::global_allocator>;
85
86/** Data private to a single block. */
87struct BlockData {
88 /** The store of variables at the root of the current subproblem. */
89 abstract_ptr<VStore<Itv, bt::global_allocator>> root_store;
90
91 /** The best solution found so far in this block. */
92 abstract_ptr<VStore<Itv, bt::global_allocator>> best_store;
93
94 /** The current store of variables.
95 * We use a `pool_allocator`, which makes it easy to switch between global memory and shared memory when the store of variables fits in shared memory.
96 * */
97 abstract_ptr<IStore> store;
98
99 /** The propagators implemented as an array of bytecodes.
100 * Similarly, the propagators can be stored in global or shared memory.
101 * If the propagators do not fit in shared memory, the array of propagators is shared among all blocks.
102 * This is possible because the propagators are stateless, so we avoid duplicating them in each block.
103 * */
104 abstract_ptr<IProp> iprop;
105
106 /** The statistics of the current block. */
108
109 /** The index of the current subproblem; its bits encode the path from `UnifiedData::root` to the root of the current subproblem (`root_store`). */
110 size_t subproblem_idx;
111
112 /** The best bound found so far by this block.
113 * We always seek to minimize.
114 * Invariant: `best_bound == best_store.project(obj_var).lb()`
115 */
116 UB best_bound;
117
118 /** The current strategy being used to split the store.
119 * It is an index into `GridData::strategies`.
120 */
121 int current_strategy;
122
123 /** The next unassigned variable in the current strategy.
124 * It is an index into `GridData::strategies.vars`.
125 */
126 int next_unassigned_var;
127
128 /** On backtracking, the value to restore `current_strategy`. */
129 int snapshot_root_strategy;
130
131 /** On backtracking, the value to restore `next_unassigned_var`. */
132 int snapshot_next_unassigned_var;
133
134 /** The decisions taken while exploring the search tree. */
135 bt::vector<LightBranch<Itv>, bt::global_allocator> decisions;
136
137 /** Current depth of the search tree. */
138 int depth;
139
140 /** A timer used for computing time statistics. */
141 cuda::std::chrono::system_clock::time_point timer;
142
143 __device__ BlockData()
144 : subproblem_idx(0)
145 , current_strategy(0)
146 , next_unassigned_var(0)
147 , decisions(5000)
148 , depth(0)
149 {}
150
151 __device__ void allocate(const UnifiedData& unified_data, const GridData& grid_data, unsigned char* shared_mem) {
152 if(threadIdx.x == 0) {
153 subproblem_idx = blockIdx.x;
154 const MemoryConfig& mem_config = unified_data.mem_config;
155 const auto& u_store = *(unified_data.root.store);
156 const auto& u_iprop = *(unified_data.root.iprop);
157 bt::pool_allocator shared_mem_pool(mem_config.make_shared_pool(shared_mem));
158 bt::pool_allocator store_allocator(mem_config.make_store_pool(shared_mem_pool));
159 bt::pool_allocator prop_allocator(mem_config.make_prop_pool(shared_mem_pool));
160 root_store = bt::make_shared<VStore<Itv, bt::global_allocator>, bt::global_allocator>(u_store);
161 best_store = bt::make_shared<VStore<Itv, bt::global_allocator>, bt::global_allocator>(u_store);
162 store = bt::allocate_shared<IStore, bt::pool_allocator>(store_allocator, u_store, store_allocator);
163 iprop = bt::allocate_shared<IProp, bt::pool_allocator>(prop_allocator, u_iprop, store, prop_allocator);
164 }
165 }
166
167 /** We must deallocate store and iprop inside the kernel because they might be initialized in shared memory. */
168 __device__ void deallocate_shared_data() {
169 if(threadIdx.x == 0) {
170 // NOTE: .reset() does not work because it does not reset the allocator, which is itself allocated in global memory.
171 store = abstract_ptr<IStore>();
172 iprop = abstract_ptr<IProp>();
173 }
174 }
175
176 /** Add a new decision on the `decisions` stack and increase depth.
177 * \param has_changed: A Boolean in shared memory.
178 * \param strategies: A sequence of strategies.
179 * \precondition: We must not be on a leaf node.
180 */
181 __device__ INLINE void split(bool& has_changed, const strategies_type& strategies) {
182 using LB2 = typename Itv::LB::local_type;
183 using UB2 = typename Itv::UB::local_type;
184 __shared__ local::ZUB idx;
185 decisions[depth].var = AVar{};
186 int currentDepth = depth;
187 for(int i = current_strategy; i < strategies.size(); ++i) {
188 switch(strategies[i].var_order) {
189 case VariableOrder::INPUT_ORDER: {
190 input_order_split(has_changed, idx, strategies[i]);
191 break;
192 }
193 case VariableOrder::FIRST_FAIL: {
194 lattice_smallest_split(has_changed, idx, strategies[i],
195 [](const Itv& u) { return UB2(u.ub().value() - u.lb().value()); });
196 break;
197 }
198 case VariableOrder::ANTI_FIRST_FAIL: {
199 lattice_smallest_split(has_changed, idx, strategies[i],
200 [](const Itv& u) { return LB2(u.ub().value() - u.lb().value()); });
201 break;
202 }
203 case VariableOrder::LARGEST: {
204 lattice_smallest_split(has_changed, idx, strategies[i],
205 [](const Itv& u) { return LB2(u.ub().value()); });
206 break;
207 }
208 case VariableOrder::SMALLEST: {
209 lattice_smallest_split(has_changed, idx, strategies[i],
210 [](const Itv& u) { return UB2(u.lb().value()); });
211 break;
212 }
213 default: assert(false);
214 }
215 __syncthreads();
216 // If we could find a variable with the current strategy, we return.
217 if(!decisions[currentDepth].var.is_untyped()) {
218 return;
219 }
220 if(threadIdx.x == 0) {
221 current_strategy++;
222 next_unassigned_var = 0;
223 }
224 // `input_order_split` and `lattice_smallest_split` have a `__syncthreads()` before reading next_unassigned_var.
225 }
226 }
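  // Note on the value wrappers used in `split` above (informal, inferred from the lattice types):
  // wrapping the measure in `UB2` makes `meet` keep the smallest value, while `LB2` keeps the largest.
  // Hence FIRST_FAIL (UB2 of the width) selects the narrowest domain, ANTI_FIRST_FAIL (LB2 of the
  // width) the widest, LARGEST (LB2 of the upper bound) the variable with the largest upper bound,
  // and SMALLEST (UB2 of the lower bound) the one with the smallest lower bound. For instance, with
  // x in [0,9] and y in [3,5], FIRST_FAIL selects y (width 2 < 9) and ANTI_FIRST_FAIL selects x.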
227
228 /** Select the next unassigned variable with a finite domain in the array `strategy.vars`, or in the whole `store` if `strategy.vars` is empty.
229 * We ignore variables with infinite domains, since splitting on them does not guarantee termination.
230 * \param has_changed is a Boolean in shared memory.
231 * \param idx is a decreasing integer in shared memory.
232 */
233 __device__ INLINE void input_order_split(bool& has_changed, local::ZUB& idx, const StrategyType<bt::global_allocator>& strategy)
234 {
235 bool split_in_store = strategy.vars.empty();
236 int n = split_in_store ? store->vars() : strategy.vars.size();
237 has_changed = true;
238 if(threadIdx.x == 0) {
239 idx = n;
240 }
241 while(has_changed) {
242 __syncthreads();
243 // int n = idx.value();
244 // __syncthreads();
245 has_changed = false;
246 for(int i = next_unassigned_var + threadIdx.x; i < n; i += blockDim.x) {
247 const auto& dom = (*store)[split_in_store ? i : strategy.vars[i].vid()];
248 if(dom.lb().value() != dom.ub().value() && !dom.lb().is_top() && !dom.ub().is_top()) {
249 if(idx.meet(local::ZUB(i))) {
250 has_changed = true;
251 }
252 }
253 }
254 __syncthreads();
255 }
256 if(threadIdx.x == 0) {
257 next_unassigned_var = idx.value();
258 }
259 if(idx.value() != n) {
260 push_decision(strategy.val_order, split_in_store ? AVar{store->aty(), idx.value()} : strategy.vars[idx.value()]);
261 }
262 }
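  // Informal note: the loop above is a block-wide argmin computed as a fixpoint. Each thread scans
  // a strided slice of the variables and proposes candidate indices via `idx.meet(local::ZUB(i))`,
  // which keeps the smallest index; the loop iterates until no thread can decrease `idx` anymore.
  // For example, if the unassigned variables are at indices {3, 7, 12}, `idx` typically reaches 3
  // after the first pass and the next pass confirms the fixpoint.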
263
264 /** Given an array of variables, select the variable `x` with the smallest value `f(store[x])`, where "smallest" is defined by the lattice order of the return type of `f`.
265 * \param has_changed is a Boolean in shared memory.
266 * \param idx is a decreasing integer in shared memory.
267 * */
268 template <class F>
269 __device__ INLINE void lattice_smallest_split(bool& has_changed, local::ZUB& idx,
270 const StrategyType<bt::global_allocator>& strategy, F f)
271 {
272 using T = decltype(f(Itv{}));
273 __shared__ T value;
274 bool split_in_store = strategy.vars.empty();
275 int n = split_in_store ? store->vars() : strategy.vars.size();
276 has_changed = true;
277 if(threadIdx.x == 0) {
278 value = T::top();
279 idx = n;
280 }
281 /** This fixpoint loop seeks the smallest `x` according to `f(x)`, as well as the next unassigned variable. */
282 while(has_changed) {
283 __syncthreads();
284 has_changed = false;
285 for(int i = next_unassigned_var + threadIdx.x; i < n; i += blockDim.x) {
286 const auto& dom = (*store)[split_in_store ? i : strategy.vars[i].vid()];
287 if(dom.lb().value() != dom.ub().value() && !dom.lb().is_top() && !dom.ub().is_top()) {
288 if(value.meet(f(dom))) {
289 has_changed = true;
290 }
291 if(idx.meet(local::ZUB(i))) {
292 has_changed = true;
293 }
294 }
295 }
296 __syncthreads();
297 }
298 /** If we found a value, we traverse the variable array again to find its index. */
299 if(!value.is_top()) {
300 if(threadIdx.x == 0) {
301 next_unassigned_var = idx.value();
302 idx = n;
303 }
304 __syncthreads();
305 has_changed = true;
306 // This fixpoint loop is not strictly necessary.
307 // We keep it for determinism: the variable with the smallest index is selected first.
308 while(has_changed) {
309 int n = idx.value();
310 __syncthreads();
311 has_changed = false;
312 for(int i = next_unassigned_var + threadIdx.x; i < n; i += blockDim.x) {
313 const auto& dom = (*store)[split_in_store ? i : strategy.vars[i].vid()];
314 if(dom.lb().value() != dom.ub().value() && !dom.lb().is_top() && !dom.ub().is_top() && f(dom) == value) {
315 if(idx.meet(local::ZUB(i))) {
316 has_changed = true;
317 }
318 }
319 }
320 __syncthreads();
321 }
322 assert(idx.value() < n);
323 push_decision(strategy.val_order, split_in_store ? AVar{store->aty(), idx.value()} : strategy.vars[idx.value()]);
324 }
325 }
326
327 /** Push a new decision onto the decisions stack.
328 * \precondition The domain of the variable `var` must not be empty, must not be a singleton, and must not contain infinite bounds.
329 */
330 __device__ INLINE void push_decision(ValueOrder val_order, AVar var) {
331 using value_type = typename Itv::LB::value_type;
332 if(threadIdx.x == 0) {
333 decisions[depth].var = var;
334 decisions[depth].current_idx = -1;
335 const auto& dom = store->project(decisions[depth].var);
336 assert(dom.lb().value() != dom.ub().value());
337 switch(val_order) {
338 case ValueOrder::MIN: {
339 decisions[depth].children[0] = Itv(dom.lb().value());
340 decisions[depth].children[1] = Itv(dom.lb().value() + value_type{1}, dom.ub());
341 break;
342 }
343 case ValueOrder::MAX: {
344 decisions[depth].children[0] = Itv(dom.ub().value());
345 decisions[depth].children[1] = Itv(dom.lb(), dom.ub().value() - value_type{1});
346 break;
347 }
348 case ValueOrder::SPLIT: {
349 auto mid = dom.lb().value() + (dom.ub().value() - dom.lb().value()) / value_type{2};
350 decisions[depth].children[0] = Itv(dom.lb(), mid);
351 decisions[depth].children[1] = Itv(mid + value_type{1}, dom.ub());
352 break;
353 }
354 case ValueOrder::REVERSE_SPLIT: {
355 auto mid = dom.lb().value() + (dom.ub().value() - dom.lb().value()) / value_type{2};
356 decisions[depth].children[0] = Itv(mid + value_type{1}, dom.ub());
357 decisions[depth].children[1] = Itv(dom.lb(), mid);
358 break;
359 }
360 // ValueOrder::MEDIAN is not possible with intervals.
361 default: assert(false);
362 }
363 /** Ropes are a mechanism for fast backtracking.
364 * The rope of a left node is always the depth of the right node (which is also its own depth), because after completing the exploration of the left subtree, we must visit the right subtree (rooted at the current depth).
365 * The rope of the right node is inherited from its parent; we set -1 if there is no next node to visit.
366 */
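      // Illustrative trace (not from the original source): suppose we push decisions d0, d1, d2 at
      // depths 0, 1, 2 and always take the left child first. Then:
      //   d0.ropes = { 1, -1 }   (after d0's right subtree, -1: the subproblem is exhausted)
      //   d1.ropes = { 2,  1 }   (ropes[1] inherited from d0.ropes[0] since d0.current_idx == 0)
      //   d2.ropes = { 3,  2 }
      // A failure right after taking d2's left child (depth == 3) sets `depth` to d2.ropes[0] == 3:
      // we restore the root store, re-apply d0 and d1, and take d2's right child, without popping
      // one level at a time.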
367 decisions[depth].ropes[0] = depth + 1;
368 decisions[depth].ropes[1] = depth > 0 ? decisions[depth-1].ropes[decisions[depth-1].current_idx] : -1;
369 ++depth;
370 // Reallocate decisions if needed.
371 if(decisions.size() == depth) {
372 printf("resize to %d\n", (int)decisions.size() * 2);
373 decisions.resize(decisions.size() * 2);
374 }
375 }
376 }
377};
378
379/** Data shared among all blocks. */
380struct GridData {
381 /** The private data of each block. */
382 bt::vector<BlockData, bt::global_allocator> blocks;
383
384 /** We generate the subproblems lazily.
385 * Suppose we generate `2^3` subproblems; we represent the first subproblem as `000`, the second as `001`, the third as `010`, and so on.
386 * A `0` means to turn left in the search tree, and a `1` means to turn right.
387 * Incrementing this integer will generate the path of the next subproblem.
388 */
389 ZLB<size_t, bt::atomic_memory_grid> next_subproblem;
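  // For example, with `subproblems_power == 3` the subproblems are numbered 0b000 to 0b111, and
  // subproblem 0b101 corresponds to the path right-left-right from the root (most significant bit
  // first, as extracted in the diving phase of `gpu_barebones_solve`).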
390
391 /** This is an approximation of the best bound found so far, globally, across all threads.
392 * It is not necessarily the true best bound at each time `t`.
393 * The true best bound is obtained by taking the `meet` over all blocks' best bounds.
394 * It is used to share information among blocks.
395 * We always seek to minimize.
396 */
397 UB appx_best_bound;
398
399 /** Due to multithreading, we must protect `stdout` when printing.
400 * The model of computation in this work is lock-free, but a lock seems unavoidable for printing.
401 */
402 cuda::binary_semaphore<cuda::thread_scope_device> print_lock;
403
404 /** The search strategy is immutable and shared among the blocks. */
405 strategies_type search_strategies;
406
407 /** The objective variable to minimize.
408 * Maximization problems are transformed into minimization problems by negating the objective variable.
409 * Equal to -1 if the problem is a satisfaction problem.
410 */
411 AVar obj_var;
412
413 __device__ GridData(const GridCP& root)
414 : blocks(root.stats.num_blocks)
415 , next_subproblem(root.stats.num_blocks)
416 , print_lock(1)
417 , search_strategies(root.split->strategies_())
418 , obj_var(root.minimize_obj_var)
419 {}
420};
421
422MemoryConfig configure_gpu_barebones(CP<Itv>&);
423__global__ void initialize_global_data(UnifiedData*, bt::unique_ptr<GridData, bt::global_allocator>*);
424__global__ void gpu_barebones_solve(UnifiedData*, GridData*);
425template <class FPEngine>
426__device__ INLINE void propagate(UnifiedData& unified_data, GridData& grid_data, BlockData& block_data,
427 FPEngine& fp_engine, bool& stop, bool& has_changed, bool& is_leaf_node);
428__global__ void reduce_blocks(UnifiedData*, GridData*);
429__global__ void deallocate_global_data(bt::unique_ptr<GridData, bt::global_allocator>*);
430
431void barebones_dive_and_solve(const Configuration<battery::standard_allocator>& config) {
433    printf("%% WARNING: -arch barebones is incompatible with -i and -a (it cannot print intermediate solutions).");
434 }
435 auto start = std::chrono::steady_clock::now();
436 check_support_managed_memory();
437 check_support_concurrent_managed_memory();
438 /** We start with some preprocessing to reduce the number of variables and constraints. */
439 CP<Itv> cp(config);
440 cp.preprocess();
441 if(cp.iprop->is_bot()) {
444 return;
445 }
446 MemoryConfig mem_config = configure_gpu_barebones(cp);
447 auto unified_data = bt::make_unique<UnifiedData, ConcurrentAllocator>(cp, mem_config);
448 auto grid_data = bt::make_unique<bt::unique_ptr<GridData, bt::global_allocator>, ConcurrentAllocator>();
449 initialize_global_data<<<1,1>>>(unified_data.get(), grid_data.get());
450 CUDAEX(cudaDeviceSynchronize());
451  /** We wait until either the solving is interrupted or all blocks have finished. */
452  /** Block the CTRL-C signal so that we can notify the threads if we must exit. */
453  block_signal_ctrlc();
454 gpu_barebones_solve
455 <<<static_cast<unsigned int>(cp.stats.num_blocks),
456 CUDA_THREADS_PER_BLOCK,
457 mem_config.shared_bytes>>>
458 (unified_data.get(), grid_data->get());
459 bool interrupted = wait_solving_ends(unified_data->stop, unified_data->root, start);
460 CUDAEX(cudaDeviceSynchronize());
461 reduce_blocks<<<1,1>>>(unified_data.get(), grid_data->get());
462 CUDAEX(cudaDeviceSynchronize());
463 auto& uroot = unified_data->root;
464 if(uroot.stats.solutions > 0) {
465 cp.print_solution(*uroot.best);
466 }
467 uroot.stats.print_mzn_final_separator();
468 if(uroot.config.print_statistics) {
469 uroot.config.print_mzn_statistics();
470 uroot.stats.print_mzn_statistics(uroot.config.verbose_solving);
471 if(uroot.bab->is_optimization() && uroot.stats.solutions > 0) {
472 uroot.stats.print_mzn_objective(uroot.best->project(uroot.bab->objective_var()), uroot.bab->is_minimization());
473 }
474 unified_data->root.stats.print_mzn_end_stats();
475 }
476 deallocate_global_data<<<1,1>>>(grid_data.get());
477 CUDAEX(cudaDeviceSynchronize());
478}
479
480/** We configure the GPU according to the user configuration:
481 * 1) Guess the "best" number of blocks per SM, if not provided.
482 * 2) Configure the size of the shared memory.
483 * 3) Increase the global heap memory.
484 * 4) Increase the stack size if requested by the user.
485 */
486MemoryConfig configure_gpu_barebones(CP<Itv>& cp) {
487 auto& config = cp.config;
488
489 /** I. Number of blocks per SM. */
490
491 cudaDeviceProp deviceProp;
492 cudaGetDeviceProperties(&deviceProp, 0);
493 int max_block_per_sm;
494 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_block_per_sm, (void*) gpu_barebones_solve, CUDA_THREADS_PER_BLOCK, 0);
495 if(cp.config.verbose_solving) {
496 printf("%% max_blocks_per_sm=%d\n", max_block_per_sm);
497 }
498 if(cp.config.or_nodes != 0) {
499 cp.stats.num_blocks = std::min(max_block_per_sm * deviceProp.multiProcessorCount, (int)cp.config.or_nodes);
500 if(cp.config.verbose_solving >= 1 && cp.stats.num_blocks < cp.config.or_nodes) {
501 printf("%% WARNING: -or %d is too high on your GPU architecture, it has been reduced to %d.\n", (int)cp.config.or_nodes, cp.stats.num_blocks);
502 }
503 }
504 else {
505 cp.stats.num_blocks = max_block_per_sm * deviceProp.multiProcessorCount;
506 }
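  // For example (hypothetical hardware): with `max_block_per_sm == 2` and 80 SMs, the default is
  // `cp.stats.num_blocks == 160`; passing `-or` with a smaller value (handled in the branch above)
  // reduces it.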
507
508 /** II. Configure the shared memory size. */
509 size_t store_bytes = gpu_sizeof<IStore>() + gpu_sizeof<abstract_ptr<IStore>>() + cp.store->vars() * gpu_sizeof<Itv>();
510 size_t iprop_bytes = gpu_sizeof<IProp>() + gpu_sizeof<abstract_ptr<IProp>>() + cp.iprop->num_deductions() * gpu_sizeof<bytecode_type>() + gpu_sizeof<typename IProp::bytecodes_type>();
511  // For a large problem, use the minimal number of blocks (one per SM).
512 if(cp.iprop->num_deductions() + cp.store->vars() > 1000000) {
513 cp.stats.num_blocks = deviceProp.multiProcessorCount;
514 printf("%% WARNING: Large problem detected, reducing to 1 block per SM.\n");
515 }
516 int blocks_per_sm = (cp.stats.num_blocks + deviceProp.multiProcessorCount - 1) / deviceProp.multiProcessorCount;
517 MemoryConfig mem_config;
518 if(config.only_global_memory) {
519 mem_config = MemoryConfig(store_bytes, iprop_bytes);
520 }
521 else {
522 mem_config = MemoryConfig((void*) gpu_barebones_solve, config.verbose_solving, blocks_per_sm, store_bytes, iprop_bytes);
523 }
524 mem_config.print_mzn_statistics(config, cp.stats);
525
526 /** III. Size of the heap global memory.
527 * The estimation is very conservative, normally we should not run out of memory.
528 * */
529 size_t nblocks = static_cast<size_t>(cp.stats.num_blocks);
530 size_t estimated_global_mem = gpu_sizeof<UnifiedData>() + store_bytes * size_t{5} + iprop_bytes +
531 gpu_sizeof<GridData>() +
532 nblocks * gpu_sizeof<BlockData>() +
533 nblocks * store_bytes * size_t{3} + // current, root, best.
534 nblocks * iprop_bytes * size_t{2} +
535 nblocks * cp.iprop->num_deductions() * size_t{4} * gpu_sizeof<int>() + // fixpoint engine
536 nblocks * (gpu_sizeof<int>() + gpu_sizeof<LightBranch<Itv>>()) * size_t{MAX_SEARCH_DEPTH};
537 size_t required_global_mem = std::max(deviceProp.totalGlobalMem / 2, estimated_global_mem);
538 if(estimated_global_mem > deviceProp.totalGlobalMem / 2) {
539 printf("%% WARNING: The estimated global memory is larger than half of the total global memory.\n\
540 We reduce the number of blocks to avoid running out of memory.\n");
541 cp.stats.num_blocks = deviceProp.multiProcessorCount;
542 }
543 CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, required_global_mem));
544 cp.stats.print_memory_statistics(cp.config.verbose_solving, "heap_memory", required_global_mem);
545 if(cp.config.verbose_solving) {
546 printf("%% estimated_global_mem=%zu\n", estimated_global_mem);
547 printf("%% num_blocks=%d\n", cp.stats.num_blocks);
548 }
549 if(deviceProp.totalGlobalMem < required_global_mem) {
550 printf("%% WARNING: The total global memory available is less than the required global memory.\n\
551 As our memory estimation is very conservative, it might still work, but it is not guaranteed.\n");
552 }
553
554 /** IV. Increase the stack if requested by the user. */
555 if(config.stack_kb != 0) {
556 CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
557 // The stack allocated depends on the maximum number of threads per SM, not on the actual number of threads per block.
558 size_t total_stack_size = deviceProp.multiProcessorCount * deviceProp.maxThreadsPerMultiProcessor * config.stack_kb * 1000;
559 cp.stats.print_memory_statistics(cp.config.verbose_solving, "stack_memory", total_stack_size);
560 }
561 return mem_config;
562}
563
564__global__ void initialize_global_data(
565 UnifiedData* unified_data,
566 bt::unique_ptr<GridData, bt::global_allocator>* grid_data_ptr)
567{
568 *grid_data_ptr = bt::make_unique<GridData, bt::global_allocator>(unified_data->root);
569}
570
571#define TIMEPOINT(KIND) \
572 if(threadIdx.x == 0) { \
573 block_data.timer = block_data.stats.stop_timer(Timer::KIND, block_data.timer); \
574 }
575
576__global__ void gpu_barebones_solve(UnifiedData* unified_data, GridData* grid_data) {
577 extern __shared__ unsigned char shared_mem[];
578 auto& config = unified_data->root.config;
579 BlockData& block_data = grid_data->blocks[blockIdx.x];
580 if(threadIdx.x == 0 && blockIdx.x == 0 && config.verbose_solving) {
581 printf("%% GPU kernel started, starting solving...\n");
582 }
583
584  /** A. Initialization of the block data and the fixpoint engine. */
585
586 block_data.allocate(*unified_data, *grid_data, shared_mem);
587 __syncthreads();
588 IProp& iprop = *block_data.iprop;
589#ifdef TURBO_NO_ENTAILED_PROP_REMOVAL
590 __shared__ BlockAsynchronousFixpointGPU<true> fp_engine;
591#else
592 __shared__ FixpointSubsetGPU<BlockAsynchronousFixpointGPU<true>, bt::global_allocator, CUDA_THREADS_PER_BLOCK> fp_engine;
593 fp_engine.init(iprop.num_deductions());
594#endif
595  /** This shared variable is necessary to avoid multiple threads reading `unified_data.stop.test()`
596   * and potentially obtaining different values, which could lead to a deadlock. */
597 __shared__ bool stop;
598 __shared__ bool has_changed;
599 __shared__ bool is_leaf_node;
600 __shared__ int remaining_depth;
601 stop = false;
602 auto group = cooperative_groups::this_thread_block();
603 if(threadIdx.x == 0) {
604 block_data.timer = block_data.stats.start_timer_device();
605 }
606 __syncthreads();
607
608 /** B. Start the main dive and solve loop. */
609
610 size_t num_subproblems = unified_data->root.stats.eps_num_subproblems;
611 while(block_data.subproblem_idx < num_subproblems && !stop) {
612 if(config.verbose_solving >= 2 && threadIdx.x == 0) {
613 grid_data->print_lock.acquire();
614 printf("%% Block %d solves subproblem num %" PRIu64 "\n", blockIdx.x, block_data.subproblem_idx);
615 grid_data->print_lock.release();
616 }
617
618 // C. Restoring the current state to the root node.
619
620 block_data.current_strategy = 0;
621 block_data.next_unassigned_var = 0;
622 block_data.depth = 0;
623 unified_data->root.store->copy_to(group, *block_data.store);
624#ifndef TURBO_NO_ENTAILED_PROP_REMOVAL
625 fp_engine.reset(iprop.num_deductions());
626#endif
627 __syncthreads();
628
629 // D. Dive into the search tree until we reach the target subproblem.
630
631 remaining_depth = config.subproblems_power;
632 is_leaf_node = false;
633 while(remaining_depth > 0 && !is_leaf_node && !stop) {
634 propagate(*unified_data, *grid_data, block_data, fp_engine, stop, has_changed, is_leaf_node);
635 __syncthreads();
636 if(!is_leaf_node) {
637 block_data.split(has_changed, grid_data->search_strategies);
638 __syncthreads();
639      // The split could not find a variable to branch on. This means the search strategy is incomplete due to unsplittable infinite domains.
640      // We skip the subtree, and set exhaustive to `false`.
641 if(block_data.decisions[0].var.is_untyped()) {
642 is_leaf_node = true;
643 block_data.stats.exhaustive = false;
644 }
645 else if(threadIdx.x == 0) {
646 --remaining_depth;
647 // We do not record the decisions when diving.
648 --block_data.depth;
649 /** We commit to one of the branches depending on the current value on the path.
650         * Suppose the depth is 3, the path is "010", and we are currently at `remaining_depth = 1`.
651         * We must extract the bit "1", and we do so by standard bitwise manipulation.
652         * A `branch_idx` of 0 means taking the left branch, and 1 means taking the right branch.
653 */
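        // Concrete instance of the comment above: with subproblem_idx == 0b010 and
        // remaining_depth == 1, branch_idx = (0b010 & (1 << 1)) >> 1 == 1, so we commit to the
        // right branch.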
654 size_t branch_idx = (block_data.subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
655 /** We immediately commit to the branch. */
656 block_data.store->embed(block_data.decisions[0].var, block_data.decisions[0].children[branch_idx]);
657 }
658 }
659 __syncthreads();
660 }
661
662 // E. Skip subproblems that are not reachable.
663
664    /** If we reach a leaf node before the target subproblem is reached, a whole subtree must be skipped. */
665 if(is_leaf_node && !stop) {
666      /** To skip all the paths of this subtree, we perform bitwise operations.
667       * Suppose the current path is "00" (turn left two times), and consider the following search tree:
668 * * depth = 0
669 * / \
670 * 0 1 depth = 1
671 * / \ / \
672 * 00 01 10 11 depth = 2
673 *
674       * If we detect a leaf node at depth 1, after only one left turn, we must skip the rest of the subtree, in particular to avoid exploring the path "01".
675       * To achieve that, we take the current path "00", shift it right by 1 (erasing the part of the path that has not been explored), increment it to move to the next subtree (at the same depth), and shift it back to the left to reach the first subproblem of that subtree.
676 * Cool huh?
677 */
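      // Worked instance of the comment above: with subproblem_idx == 0b00 and remaining_depth == 1,
      // next_subproblem_idx = ((0b00 >> 1) + 1) << 1 == 0b10, so the unexplored sibling "01" is
      // skipped and the block continues with the first subproblem of the next subtree.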
678 if(threadIdx.x == 0) {
679 size_t next_subproblem_idx = ((block_data.subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
680 // Make sure the subtree is skipped.
681 while(grid_data->next_subproblem.meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx))) {}
682 /** It is possible that other blocks skip similar subtrees.
683         * Hence, we only count the subproblems skipped by the block solving the leftmost subproblem. */
684 if((block_data.subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
685 block_data.stats.eps_skipped_subproblems += next_subproblem_idx - block_data.subproblem_idx;
686 }
687 }
688 }
689 else if(!stop) {
690
691 // F. Solve the current subproblem.
692
693 while(!stop) {
694
695 // I. Propagate the current node.
696
697 propagate(*unified_data, *grid_data, block_data, fp_engine, stop, has_changed, is_leaf_node);
698 __syncthreads();
699
700 // II. Branching
701
702 if(!is_leaf_node) {
703 // If we are at the root of the current subproblem, we create a snapshot for future backtracking.
704 if(block_data.depth == 0) {
705 block_data.store->copy_to(group, *block_data.root_store);
706 }
707 block_data.split(has_changed, grid_data->search_strategies);
708 __syncthreads();
709        // The split could not find a variable to branch on. This means the search strategy is incomplete due to unsplittable infinite domains.
710        // We trigger backtracking, and set exhaustive to `false`.
711 if(block_data.decisions[block_data.depth - 1].var.is_untyped()) {
712 is_leaf_node = true;
713 block_data.stats.exhaustive = false;
714 }
715 else if(threadIdx.x == 0) {
716          // depth == 1 at the root node because we just incremented it in `block_data.split`.
717 if(block_data.depth == 1) {
718 block_data.snapshot_root_strategy = block_data.current_strategy;
719 block_data.snapshot_next_unassigned_var = block_data.next_unassigned_var;
720 }
721 // Apply the decision.
722 block_data.store->embed(block_data.decisions[block_data.depth-1].var, block_data.decisions[block_data.depth-1].next());
723 // printf("left decision: %d [", block_data.decisions[block_data.depth - 1].var.vid()); block_data.decisions[block_data.depth - 1].current().print(); printf("]\n");
724 }
725 }
726
727 // III. Backtracking
728
729 if(is_leaf_node) {
730
731 // if(threadIdx.x == 0) {
732 // printf("backtracking\n");
733 // }
734 // Leaf node at root.
735 if(block_data.depth == 0) {
736 break;
737 }
738 if(threadIdx.x == 0) {
739 block_data.depth = block_data.decisions[block_data.depth-1].ropes[block_data.decisions[block_data.depth-1].current_idx];
740 }
741 __syncthreads();
742 // Check if there is no more node to visit.
743 if(block_data.depth == -1) {
744 break;
745 }
746 // Restore from root by copying the store and re-applying all decisions from root to block_data.depth-1.
747#ifndef TURBO_NO_ENTAILED_PROP_REMOVAL
748 fp_engine.reset(iprop.num_deductions());
749#endif
750 block_data.root_store->copy_to(group, *block_data.store);
751 has_changed = true;
752 while(has_changed) {
753 __syncthreads();
754 has_changed = false;
755 for(int i = threadIdx.x; i < block_data.depth - 1; i += blockDim.x) {
756 if(block_data.store->embed(block_data.decisions[i].var, block_data.decisions[i].current())) {
757 has_changed = true;
758 }
759 }
760 __syncthreads();
761 }
762 if(threadIdx.x == 0) {
763 block_data.store->embed(block_data.decisions[block_data.depth - 1].var, block_data.decisions[block_data.depth - 1].next());
764 // printf("right decision: %d [", block_data.decisions[block_data.depth - 1].var.vid()); block_data.decisions[block_data.depth - 1].current().print(); printf("]\n");
765 block_data.current_strategy = block_data.snapshot_root_strategy;
766 block_data.next_unassigned_var = block_data.snapshot_next_unassigned_var;
767 }
768 }
769 }
770 /** If we didn't stop solving because of an external interruption, we increase the number of subproblems solved. */
771 if(threadIdx.x == 0 && block_data.stats.nodes < config.stop_after_n_nodes
772 && !unified_data->stop.test())
773 {
774 block_data.stats.eps_solved_subproblems += 1;
775 }
776 }
777
778 // G. Move to the next subproblem.
779
780 /** We prepare the block to solve the next problem.
781 * We update the subproblem index to the next subproblem to solve. */
782 if(threadIdx.x == 0 && !stop) {
783      /** To avoid several blocks solving the same subproblem, we use an atomic post-increment. */
784 block_data.subproblem_idx = grid_data->next_subproblem.atomic()++;
785 /** The following commented code is completely valid and does not use atomic post-increment.
786 * But honestly, we kinda need more performance so... let's avoid reexploring subproblems. */
787 // subproblem_idx = grid_data->next_subproblem.value();
788 // grid_data->next_subproblem.meet(ZLB<size_t, bt::local_memory>(subproblem_idx + size_t{1}));
789 }
790 __syncthreads();
791 }
792 if(threadIdx.x == 0 && block_data.stats.nodes < config.stop_after_n_nodes
793 && !unified_data->stop.test())
794 {
795 block_data.stats.num_blocks_done = 1;
796 }
797 __syncthreads();
798#ifndef TURBO_NO_ENTAILED_PROP_REMOVAL
799 fp_engine.destroy();
800#endif
801 block_data.deallocate_shared_data();
802 __syncthreads();
803}
804
805template <class FPEngine>
806__device__ INLINE void propagate(UnifiedData& unified_data, GridData& grid_data, BlockData& block_data,
807 FPEngine& fp_engine, bool& stop, bool& has_changed, bool& is_leaf_node)
808{
809 __shared__ int warp_iterations[CUDA_THREADS_PER_BLOCK/32];
810 warp_iterations[threadIdx.x / 32] = 0;
811 auto& config = unified_data.root.config;
812 IProp& iprop = *block_data.iprop;
813 auto group = cooperative_groups::this_thread_block();
814
815 // I. Optimize the objective variable.
816
817 if(threadIdx.x == 0 && !grid_data.obj_var.is_untyped()) {
818 /** Before propagating, we update the local bound with the best known global bound.
819     * Strengthen the objective variable to get a better objective next time.
820 */
821 if(!grid_data.appx_best_bound.is_top()) {
822 block_data.store->embed(grid_data.obj_var,
823 Itv(Itv::LB::top(), Itv::UB(grid_data.appx_best_bound.value() - 1)));
824 block_data.store->embed(grid_data.obj_var,
825 Itv(Itv::LB::top(), Itv::UB(block_data.best_bound.value() - 1)));
826 }
827 }
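  // Illustrative example: if the best bound known so far is 42, the `embed` calls above constrain
  // the objective variable to (-oo, 41], so only strictly better solutions remain feasible.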
828 __syncthreads();
829  // Unconstrained objective: we can terminate since we will not find a better solution.
830 if(grid_data.appx_best_bound.is_bot()) {
831 stop = true;
832 return;
833 }
834 TIMEPOINT(SEARCH);
835 is_leaf_node = false;
836
837 // II. Compute the fixpoint of the current node.
838 int fp_iterations;
839#ifdef TURBO_NO_ENTAILED_PROP_REMOVAL
840 int num_active = iprop.num_deductions();
841#else
842 int num_active = fp_engine.num_active();
843#endif
844 switch(config.fixpoint) {
845 case FixpointKind::AC1: {
846 fp_iterations = fp_engine.fixpoint(
847#ifdef TURBO_NO_ENTAILED_PROP_REMOVAL
848 iprop.num_deductions(),
849#endif
850 [&](int i){ return iprop.deduce(i); },
851 [&](){ return iprop.is_bot(); });
852 if(threadIdx.x == 0) {
853 block_data.stats.num_deductions += fp_iterations * num_active;
854 }
855 break;
856 }
857 case FixpointKind::WAC1: {
858 if(num_active <= config.wac1_threshold) {
859 fp_iterations = fp_engine.fixpoint(
860#ifdef TURBO_NO_ENTAILED_PROP_REMOVAL
861 iprop.num_deductions(),
862#endif
863 [&](int i){ return iprop.deduce(i); },
864 [&](){ return iprop.is_bot(); });
865 if(threadIdx.x == 0) {
866 block_data.stats.num_deductions += fp_iterations * num_active;
867 }
868 }
869 else {
870 fp_iterations = fp_engine.fixpoint(
871#ifdef TURBO_NO_ENTAILED_PROP_REMOVAL
872 iprop.num_deductions(),
873#endif
874 [&](int i){ return warp_fixpoint<CUDA_THREADS_PER_BLOCK>(iprop, i, warp_iterations); },
875 [&](){ return iprop.is_bot(); });
876 if(threadIdx.x == 0) {
877 for(int i = 0; i < CUDA_THREADS_PER_BLOCK/32; ++i) {
878 block_data.stats.num_deductions += warp_iterations[i] * 32;
879 }
880 }
881 }
882 break;
883 }
884 }
885 TIMEPOINT(FIXPOINT);
886
887 // III. Analyze the result of propagation
888
889 if(!iprop.is_bot()) {
890#ifdef TURBO_NO_ENTAILED_PROP_REMOVAL
891 has_changed = false;
892 for(int i = threadIdx.x; !has_changed && i < iprop.num_deductions(); i += blockDim.x) {
893 if(!iprop.ask(i)) {
894 has_changed = true;
895 }
896 }
897 __syncthreads();
898 num_active = has_changed;
899#else
900 fp_engine.select([&](int i) { return !iprop.ask(i); });
901 num_active = fp_engine.num_active();
902#endif
903 TIMEPOINT(SELECT_FP_FUNCTIONS);
904 if(num_active == 0) {
905 is_leaf_node = true;
906 if(threadIdx.x == 0) {
907 block_data.best_bound.meet(Itv::UB(block_data.store->project(grid_data.obj_var).lb().value()));
908 grid_data.appx_best_bound.meet(block_data.best_bound);
909 }
910 block_data.store->copy_to(group, *block_data.best_store);
911 if(threadIdx.x == 0) {
912 block_data.stats.solutions++;
913 if(config.verbose_solving >= 2) {
914 grid_data.print_lock.acquire();
915 printf("%% objective="); block_data.best_bound.print(); printf("\n");
916 grid_data.print_lock.release();
917 }
918 }
919 }
920 }
921 else {
922 is_leaf_node = true;
923 }
924
925 if(threadIdx.x == 0) {
926 block_data.stats.fixpoint_iterations += fp_iterations;
927 block_data.stats.nodes++;
928 block_data.stats.fails += (iprop.is_bot() ? 1 : 0);
929 block_data.stats.depth_max = battery::max(block_data.stats.depth_max, block_data.depth);
930
931 // IV. Checking stopping conditions.
932
933 if(block_data.stats.nodes >= config.stop_after_n_nodes
934 || unified_data.stop.test())
935 {
936 block_data.stats.exhaustive = false;
937 stop = true;
938 }
939 }
940}
941
942__global__ void reduce_blocks(UnifiedData* unified_data, GridData* grid_data) {
943 auto& root = unified_data->root;
944 for(int i = 0; i < grid_data->blocks.size(); ++i) {
945 root.stats.meet(grid_data->blocks[i].stats);
946 }
947 for(int i = 0; i < grid_data->blocks.size(); ++i) {
948 auto& block = grid_data->blocks[i];
949 if(block.stats.solutions > 0) {
950 if(root.bab->is_satisfaction()) {
951 block.best_store->extract(*root.best);
952 break;
953 }
954 else {
955 grid_data->appx_best_bound.meet(block.best_bound);
956 }
957 }
958 }
959 // If we found a bound, we copy the best store into the unified data.
960 if(!grid_data->appx_best_bound.is_top()) {
961 for(int i = 0; i < grid_data->blocks.size(); ++i) {
962 auto& block = grid_data->blocks[i];
963 if(block.best_bound == grid_data->appx_best_bound
964 && block.best_store->project(grid_data->obj_var).lb() == block.best_bound)
965 {
966 block.best_store->copy_to(*root.best);
967 break;
968 }
969 }
970 }
971}
972
973__global__ void deallocate_global_data(bt::unique_ptr<GridData, bt::global_allocator>* grid_data) {
974 grid_data->reset();
975}
976
977#endif // TURBO_IPC_ABSTRACT_DOMAIN
978#endif // __CUDACC__
979
980#if defined(TURBO_IPC_ABSTRACT_DOMAIN) || !defined(__CUDACC__)
981
982void barebones_dive_and_solve(const Configuration<battery::standard_allocator>& config) {
983#ifdef TURBO_IPC_ABSTRACT_DOMAIN
984 std::cerr << "-arch barebones does not support IPC abstract domain." << std::endl;
985#else
986 std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
987#endif
988}
989
990#endif
991
992} // namespace barebones
993
994#endif // TURBO_BAREBONES_DIVE_AND_SOLVE_HPP