#ifndef TURBO_GPU_DIVE_AND_SOLVE_HPP
#define TURBO_GPU_DIVE_AND_SOLVE_HPP

namespace bt = ::battery;

#include <cuda/std/chrono>
#include <cuda/semaphore>
  class ConcurrentAllocator>
  using concurrent_allocator = ConcurrentAllocator;
    bt::statistics_allocator<ConcurrentAllocator>,
    bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 0>>,
    bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 1>>>;
using Itv0 = Interval<ZLB<bound_value_type, bt::local_memory>>;
using Itv1 = Interval<ZLB<bound_value_type, bt::atomic_memory_block>>;
using Itv2 = Interval<ZLB<bound_value_type, bt::atomic_memory_grid>>;
using AtomicBool = B<bt::atomic_memory_block>;
using FPEngine = FixpointSubsetGPU<BlockAsynchronousFixpointGPU<true>, bt::global_allocator, CUDA_THREADS_PER_BLOCK>;
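// Itv0, Itv1 and Itv2 are integer interval universes that differ only in the memory consistency of
// their bounds (plain local memory, block-scoped atomics, grid-scoped atomics). FPEngine is the
// per-block fixpoint engine parameterized by CUDA_THREADS_PER_BLOCK.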
#ifdef NO_CONCURRENT_MANAGED_MEMORY
  using ItvSolverPinned = StateTypes<Itv0, Itv1, Itv2, bt::pinned_allocator>;
#else
  using ItvSolver = StateTypes<Itv0, Itv0, Itv0, bt::managed_allocator>;
#endif
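// Two flavours of solver state: without concurrent managed memory, the state exchanged with the
// CPU is allocated in pinned memory and the block/grid atomic interval universes are used;
// otherwise managed memory with the non-atomic universe suffices.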
  using GridCP = typename S::GridCP;
  using BlockCP = typename S::BlockCP;
  using U2 = typename S::U2;
  GridCP root;
  bt::shared_ptr<BlockCP, bt::global_allocator> blocks_root;
  cuda::std::atomic_flag cpu_stop;
  volatile bool blocks_reduced;
  MemoryConfig mem_config;
  bt::vector<BlockData<S>, bt::global_allocator> blocks;
  bt::shared_ptr<B<bt::atomic_memory_grid>, bt::global_allocator> gpu_stop;
  bt::shared_ptr<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator> next_subproblem;
  bt::shared_ptr<U2, bt::global_allocator> best_bound;
  bt::shared_ptr<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator> print_lock;
  cuda::std::atomic_flag ready_to_produce;
  cuda::std::atomic_flag ready_to_consume;
  GridData(const GridCP& root, const MemoryConfig& mem_config)
    : root(root)
    , mem_config(mem_config)
    , blocks_reduced(false)
  {
    ready_to_consume.clear();
    ready_to_produce.clear();
    cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  }
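  // GPU side of the solution handshake: under print_lock, copy the block's best solution into the
  // shared root, signal the CPU consumer via ready_to_consume, then wait until the CPU has printed
  // it and raised ready_to_produce.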
  template <class BlockBAB>
  __device__ void produce_solution(const BlockBAB& bab) {
    print_lock->acquire();
    if(!cpu_stop.test()) {
      bab.extract(*(root.bab));
      cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
      ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
      ready_to_consume.notify_one();
      ready_to_produce.wait(false, cuda::std::memory_order_seq_cst);
      ready_to_produce.clear();
    }
    print_lock->release();
  }
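  // CPU side of the handshake: waits until a solution (or the final reduction) is available, prints
  // it on the host, then releases the GPU producer. Returns true once the final result has been
  // consumed, which ends the loop in consume_kernel_solutions.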
  __host__ bool consume_solution() {
    ready_to_consume.wait(false, cuda::std::memory_order_seq_cst);
    ready_to_consume.clear();
    if(blocks_reduced) {
      root.print_final_solution();
      if(root.config.print_statistics) {
        root.print_mzn_statistics();
      }
      return true;
    }
    root.print_solution();
    ready_to_produce.test_and_set(cuda::std::memory_order_seq_cst);
    ready_to_produce.notify_one();
    return false;
  }
  __device__ void allocate() {
    assert(threadIdx.x == 0 && blockIdx.x == 0);
    auto root_mem_config(mem_config);
    root_mem_config.mem_kind = MemoryKind::GLOBAL;
    blocks_root = bt::make_shared<BlockCP, bt::global_allocator>(
      typename BlockCP::tag_gpu_block_copy{},
      /* ... */
      bt::global_allocator{},
      root_mem_config.make_prop_pool(bt::pool_allocator(nullptr, 0)),
      root_mem_config.make_store_pool(bt::pool_allocator(nullptr, 0)));
    blocks = bt::vector<BlockData<S>, bt::global_allocator>(root.stats.num_blocks);
    gpu_stop = bt::make_shared<B<bt::atomic_memory_grid>, bt::global_allocator>(false);
    print_lock = bt::make_shared<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator>(1);
    next_subproblem = bt::make_shared<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator>(0);
    best_bound = bt::make_shared<U2, bt::global_allocator>();
  }
  __device__ void deallocate() {
    assert(threadIdx.x == 0 && blockIdx.x == 0);
    blocks = bt::vector<BlockData<S>, bt::global_allocator>();
    blocks_root->deallocate();
    next_subproblem.reset();
  }
  using GridCP = typename S::GridCP;
  using BlockCP = typename S::BlockCP;
  using snapshot_type = typename BlockCP::IST::template snapshot_type<bt::global_allocator>;
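  // Per-block state: fp_engine, has_changed and stop live in the block's shared-memory pool, while
  // the block-local copy of the problem (root) and the snapshot of its search tree are kept in
  // global memory.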
  size_t subproblem_idx;
  bt::shared_ptr<FPEngine, bt::pool_allocator> fp_engine;
  bt::shared_ptr<AtomicBool, bt::pool_allocator> has_changed;
  bt::shared_ptr<AtomicBool, bt::pool_allocator> stop;
  bt::shared_ptr<BlockCP, bt::global_allocator> root;
  bt::shared_ptr<snapshot_type, bt::global_allocator> snapshot_root;
  __device__ BlockData():
    has_changed(nullptr, bt::pool_allocator(nullptr, 0)),
    stop(nullptr, bt::pool_allocator(nullptr, 0))
  {}
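  // Thread 0 builds the block-local copy of the root problem from grid_data.blocks_root, placing
  // the store and propagators in shared or global memory according to mem_config, and snapshots
  // the search tree so every subproblem can be restarted from the root.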
  __device__ void allocate(GridData<S>& grid_data, unsigned char* shared_mem) {
    auto block = cooperative_groups::this_thread_block();
    if(threadIdx.x == 0) {
      subproblem_idx = blockIdx.x;
      MemoryConfig& mem_config = grid_data.mem_config;
      bt::pool_allocator shared_mem_pool(mem_config.make_shared_pool(shared_mem));
      fp_engine = bt::allocate_shared<FPEngine, bt::pool_allocator>(shared_mem_pool);
      has_changed = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, true);
      stop = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, false);
      root = bt::make_shared<BlockCP, bt::global_allocator>(
        typename BlockCP::tag_gpu_block_copy{},
        (mem_config.mem_kind != MemoryKind::TCN_SHARED),
        *(grid_data.blocks_root),
        bt::global_allocator{},
        mem_config.make_prop_pool(shared_mem_pool),
        mem_config.make_store_pool(shared_mem_pool));
      snapshot_root = bt::make_shared<snapshot_type, bt::global_allocator>(root->search_tree->template snapshot<bt::global_allocator>());
    }
    block.sync();
    fp_engine->init(root->iprop->num_deductions());
  }
  __device__ void deallocate_shared() {
    if(threadIdx.x == 0) {
      snapshot_root.reset();
  __device__ void restore() {
    if(threadIdx.x == 0) {
      root->search_tree->restore(*snapshot_root);
template <class S>
__global__ void initialize_grid_data(GridData<S>* grid_data) {
  grid_data->allocate();
  size_t num_subproblems = 1;
  num_subproblems <<= grid_data->root.config.subproblems_power;
  grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(grid_data->root.stats.num_blocks));
  grid_data->root.stats.eps_num_subproblems = num_subproblems;
}
template <class S>
__global__ void deallocate_grid_data(GridData<S>* grid_data) {
  grid_data->deallocate();
}
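// Publishes the block's local objective bound into the grid-wide best_bound (as a dual bound so
// other blocks can prune against it). Returns whether the grid bound improved.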
template <class S>
__device__ bool update_grid_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
  using U0 = typename S::U0;
  assert(threadIdx.x == 0);
  if(block_data.root->bab->is_optimization()) {
    const auto& bab = block_data.root->bab;
    auto local_best = bab->optimum().project(bab->objective_var());
    if(bab->is_maximization()) {
      return grid_data.best_bound->meet_lb(dual_bound<typename U0::LB>(local_best.ub()));
    }
    else {
      return grid_data.best_bound->meet_ub(dual_bound<typename U0::UB>(local_best.lb()));
    }
  }
  return false;
}
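// Pulls the grid-wide best bound back into this block: the bound is deinterpreted into a formula
// and told to the block's store so that local propagation can prune against it.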
template <class S>
__device__ void update_block_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
  using U0 = typename S::U0;
  if(threadIdx.x == 0 && block_data.root->bab->is_optimization()) {
    const auto& bab = block_data.root->bab;
    VarEnv<bt::global_allocator> empty_env{};
    auto best_formula = bab->template deinterpret_best_bound<bt::global_allocator>(
      bab->is_maximization()
        ? U0(dual_bound<typename U0::UB>(grid_data.best_bound->lb()))
        : U0(dual_bound<typename U0::LB>(grid_data.best_bound->ub())));
    IDiagnostics diagnostics;
    interpret_and_tell(best_formula, empty_env, *block_data.root->store, diagnostics);
  }
}
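// Runs the fixpoint engine selected by cp.config.fixpoint (the variants visible here are a plain
// per-propagator fixpoint and a warp-based one, with wac1_threshold switching between them),
// detects leaf nodes, updates statistics and, on an improving leaf, publishes the best bound and
// produces a solution. Returns whether the current node is a leaf.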
template <class S>
__device__ bool propagate(BlockData<S>& block_data, GridData<S>& grid_data) {
  using BlockCP = typename S::BlockCP;
  __shared__ int warp_iterations[CUDA_THREADS_PER_BLOCK/32];
  warp_iterations[threadIdx.x / 32] = 0;
  bool is_leaf_node = false;
  BlockCP& cp = *block_data.root;
  auto group = cooperative_groups::this_thread_block();
  auto& fp_engine = *block_data.fp_engine;
  auto& iprop = *cp.iprop;
  auto start = cp.stats.start_timer_device();
  int fp_iterations;
  switch(cp.config.fixpoint) {
    /* ... */
      fp_iterations = fp_engine.fixpoint(
        [&](int i){ return iprop.deduce(i); },
        [&](){ return iprop.is_bot(); });
      if(threadIdx.x == 0) {
        cp.stats.num_deductions += fp_iterations * fp_engine.num_active();
      }
    /* ... */
      if(fp_engine.num_active() <= cp.config.wac1_threshold) {
        fp_iterations = fp_engine.fixpoint(
          [&](int i){ return iprop.deduce(i); },
          [&](){ return iprop.is_bot(); });
        if(threadIdx.x == 0) {
          cp.stats.num_deductions += fp_iterations * fp_engine.num_active();
        }
      }
      else {
        fp_iterations = fp_engine.fixpoint(
          [&](int i){ return warp_fixpoint<CUDA_THREADS_PER_BLOCK>(iprop, i, warp_iterations); },
          [&](){ return iprop.is_bot(); });
        if(threadIdx.x == 0) {
          for(int i = 0; i < CUDA_THREADS_PER_BLOCK/32; ++i) {
            cp.stats.num_deductions += warp_iterations[i] * 32;
          }
        }
      }
    /* ... */
  }
  if(!iprop.is_bot()) {
    fp_engine.select([&](int i) { return !iprop.ask(i); });
    if(fp_engine.num_active() == 0) {
      is_leaf_node = cp.store->template is_extractable<AtomicExtraction>(group);
    }
  }
  if(threadIdx.x == 0) {
    cp.stats.fixpoint_iterations += fp_iterations;
    bool is_pruned = cp.on_node();
    /* ... */
    else if(is_leaf_node) {
      if(cp.bab->is_satisfaction() || cp.bab->compare_bound(*cp.store, cp.bab->optimum())) {
        bool best_has_changed = update_grid_best_bound(block_data, grid_data);
        if(cp.bab->is_satisfaction() || (best_has_changed && cp.is_printing_intermediate_sol())) {
          grid_data.produce_solution(*cp.bab);
        }
        is_pruned |= cp.update_solution_stats();
      }
    }
    if(is_pruned) {
      grid_data.gpu_stop->join(true);
    }
  }
  fp_engine.reset(cp.iprop->num_deductions());
  return is_leaf_node;
}
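// EPS-style dive: at each level the block commits the branch selected by the corresponding bit of
// its subproblem index, down to config.subproblems_power levels, so the blocks collectively
// partition the search tree into 2^subproblems_power subproblems. Returns the remaining depth
// (0 when the target subproblem was reached).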
template <class S>
__device__ size_t dive(BlockData<S>& block_data, GridData<S>& grid_data) {
  using BlockCP = typename S::BlockCP;
  BlockCP& cp = *block_data.root;
  auto& fp_engine = *block_data.fp_engine;
  auto& stop = *block_data.stop;
  auto& stop_diving = *block_data.has_changed;
  stop_diving.meet(false);
  size_t remaining_depth = grid_data.root.config.subproblems_power;
  while(remaining_depth > 0 && !stop_diving && !stop) {
    bool is_leaf_node = propagate(block_data, grid_data);
    stop.join(grid_data.cpu_stop.test() || *(grid_data.gpu_stop));
    if(is_leaf_node) {
      if(threadIdx.x == 0) {
        stop_diving.join(true);
      }
    }
    else {
      remaining_depth--;
      if(threadIdx.x == 0) {
        auto start = cp.stats.start_timer_device();
        size_t branch_idx = (block_data.subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
        auto branches = cp.split->split();
        assert(branches.size() == 2);
        cp.iprop->deduce(branches[branch_idx]);
      }
    }
  }
  return remaining_depth;
}
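// Propagate-and-search loop on the block's subproblem: tighten the store with the shared best
// bound, propagate, then let the search tree deduce the next node, until the subtree is exhausted
// or a stop flag is raised.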
template <class S>
__device__ void solve_problem(BlockData<S>& block_data, GridData<S>& grid_data) {
  using BlockCP = typename S::BlockCP;
  BlockCP& cp = *block_data.root;
  auto& fp_engine = *block_data.fp_engine;
  auto& block_has_changed = *block_data.has_changed;
  auto& stop = *block_data.stop;
  block_has_changed.join(true);
  auto start = cp.stats.start_timer_device();
  while(block_has_changed && !stop) {
    update_block_best_bound(block_data, grid_data);
    propagate(block_data, grid_data);
    if(threadIdx.x == 0) {
      start = cp.stats.start_timer_device();
      stop.join(grid_data.cpu_stop.test() || *(grid_data.gpu_stop));
      block_has_changed.meet(cp.search_tree->deduce());
    }
  }
}
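// Folds every block's local root (its solution and statistics) into the grid root before the final
// result is handed back to the CPU.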
template <class S>
CUDA void reduce_blocks(GridData<S>* grid_data) {
  for(int i = 0; i < grid_data->blocks.size(); ++i) {
    if(grid_data->blocks[i].root) {
      grid_data->root.meet(*(grid_data->blocks[i].root));
    }
  }
}
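// Main solving kernel: each block repeatedly restores the root snapshot, dives to its assigned
// subproblem, solves it, then claims the next unexplored subproblem index from next_subproblem,
// until all subproblems are exhausted or a stop flag is raised. The last block to finish reduces
// all blocks into the grid root and wakes the CPU consumer.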
template <class S>
__global__ void gpu_solve_kernel(GridData<S>* grid_data)
{
  if(threadIdx.x == 0 && blockIdx.x == 0 && grid_data->root.config.verbose_solving) {
    printf("%% GPU kernel started, starting solving...\n");
  }
  extern __shared__ unsigned char shared_mem[];
  size_t num_subproblems = grid_data->root.stats.eps_num_subproblems;
  BlockData<S>& block_data = grid_data->blocks[blockIdx.x];
  block_data.allocate(*grid_data, shared_mem);
  auto solve_start = block_data.root->stats.start_timer_device();
  while(block_data.subproblem_idx < num_subproblems && !*(block_data.stop)) {
    if(threadIdx.x == 0 && grid_data->root.config.verbose_solving >= 2) {
      grid_data->print_lock->acquire();
      printf("%% Block %d solves subproblem num %" PRIu64 "\n", blockIdx.x, block_data.subproblem_idx);
      grid_data->print_lock->release();
    }
    block_data.restore();
    auto dive_start = block_data.root->stats.start_timer_device();
    size_t remaining_depth = dive(block_data, *grid_data);
    block_data.root->stats.stop_timer(Timer::DIVE, dive_start);
    if(remaining_depth == 0) {
      solve_problem(block_data, *grid_data);
      if(threadIdx.x == 0 && !*(block_data.stop)) {
        block_data.root->stats.eps_solved_subproblems += 1;
      }
    }
    else {
      if(threadIdx.x == 0 && !*(block_data.stop)) {
        size_t next_subproblem_idx = ((block_data.subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
        grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));
        if((block_data.subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
          block_data.root->stats.eps_skipped_subproblems += next_subproblem_idx - block_data.subproblem_idx;
        }
      }
    }
    if(threadIdx.x == 0 && !*(block_data.stop)) {
      block_data.subproblem_idx = grid_data->next_subproblem->value();
      grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(block_data.subproblem_idx + size_t{1}));
    }
  }
  block_data.root->stats.stop_timer(Timer::SOLVE, solve_start);
  if(threadIdx.x == 0 && !*(block_data.stop)) {
    block_data.root->stats.num_blocks_done = 1;
  }
  if(threadIdx.x == 0) {
    grid_data->print_lock->acquire();
    if(!grid_data->blocks_reduced) {
      int n = 0;
      for(int i = 0; i < grid_data->blocks.size(); ++i) {
        if(grid_data->blocks[i].root) {
          n += grid_data->blocks[i].root->stats.num_blocks_done;
        }
      }
      if(block_data.stop->value() || n == grid_data->blocks.size()) {
        reduce_blocks(grid_data);
        grid_data->blocks_reduced = true;
        cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
        grid_data->ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
        grid_data->ready_to_consume.notify_one();
      }
    }
    grid_data->print_lock->release();
  }
  block_data.deallocate_shared();
}
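// Estimate of the bytes needed by the variable store on the GPU, used below to decide whether it
// fits in shared memory.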
template <class S, class U>
size_t sizeof_store(const CP<U>& root) {
  return gpu_sizeof<typename S::BlockCP::IStore>()
    + gpu_sizeof<typename S::BlockCP::IStore::universe_type>() * root.store->vars();
}
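// Decides where the block-local state lives: global memory only, the variable store in shared
// memory (STORE_SHARED), or store plus propagators in shared memory (TCN_SHARED), depending on the
// device's shared memory capacity and the sizes estimated above.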
template <class S, class U>
MemoryConfig configure_memory(CP<U>& root) {
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0);
  const auto& config = root.config;
  size_t shared_mem_capacity = deviceProp.sharedMemPerBlock;

  /* ... (elided: definition of root2, a copy of the problem using statistics allocators) ... */
  size_t store_alignment = 200;

  MemoryConfig mem_config;
  mem_config.shared_bytes = sizeof(FPEngine) + 100;
  mem_config.store_bytes = sizeof_store<S>(root2) + store_alignment;
  mem_config.prop_bytes = root2.prop_allocator.total_bytes_allocated();
  mem_config.prop_bytes += mem_config.prop_bytes / 5;
  if(config.only_global_memory || shared_mem_capacity < mem_config.shared_bytes + mem_config.store_bytes) {
    if(!config.only_global_memory && config.verbose_solving) {
      printf("%% The store of variables (%zuKB) cannot be stored in the shared memory of the GPU (%zuKB), therefore we use the global memory.\n",
        mem_config.store_bytes / 1000,
        shared_mem_capacity / 1000);
    }
    mem_config.mem_kind = MemoryKind::GLOBAL;
  }
  else if(shared_mem_capacity > mem_config.shared_bytes + mem_config.store_bytes + mem_config.prop_bytes) {
    if(config.verbose_solving) {
      printf("%% The store of variables and the propagators (%zuKB) are stored in the shared memory of the GPU (%zuKB).\n",
        (mem_config.shared_bytes + mem_config.store_bytes + mem_config.prop_bytes) / 1000,
        shared_mem_capacity / 1000);
    }
    mem_config.shared_bytes += mem_config.store_bytes + mem_config.prop_bytes;
    mem_config.mem_kind = MemoryKind::TCN_SHARED;
  }
  else {
    if(config.verbose_solving) {
      printf("%% The store of variables (%zuKB) is stored in the shared memory of the GPU (%zuKB).\n",
        mem_config.store_bytes / 1000,
        shared_mem_capacity / 1000);
    }
    mem_config.shared_bytes += mem_config.store_bytes;
    mem_config.mem_kind = MemoryKind::STORE_SHARED;
  }
  return mem_config;
}
template <class S>
void consume_kernel_solutions(GridData<S>& grid_data) {
  while(!grid_data.consume_solution()) {}
}
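// Host-side orchestration: allocates the grid data, initializes it on the device, spawns the CPU
// consumer thread, launches gpu_solve_kernel with the configured grid/block/shared-memory sizes,
// then waits for solving to end (or be interrupted) before tearing everything down.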
template <class S, class U, class Timepoint>
void transfer_memory_and_run(CP<U>& root, MemoryConfig mem_config, const Timepoint& start) {
  using concurrent_allocator = typename S::concurrent_allocator;
  auto grid_data = bt::make_shared<GridData<S>, concurrent_allocator>(root, mem_config);
  initialize_grid_data<<<1,1>>>(grid_data.get());
  CUDAEX(cudaDeviceSynchronize());
  if(grid_data->root.config.print_statistics) {
    mem_config.print_mzn_statistics(root.config, root.stats);
  }
  std::thread consumer_thread(consume_kernel_solutions<S>, std::ref(*grid_data));
  gpu_solve_kernel<S>
    <<<static_cast<unsigned int>(grid_data->root.stats.num_blocks),
       CUDA_THREADS_PER_BLOCK,
       grid_data->mem_config.shared_bytes>>>
    (grid_data.get());
  bool interrupted = wait_solving_ends(grid_data->cpu_stop, grid_data->root, start);
  consumer_thread.join();
  CUDAEX(cudaDeviceSynchronize());
  deallocate_grid_data<<<1,1>>>(grid_data.get());
  CUDAEX(cudaDeviceSynchronize());
}
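// Rough mapping from compute capability to the number of threads per SM, consulted by
// configure_blocks_threads below.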
int threads_per_sm(cudaDeviceProp devProp) {
  switch (devProp.major) {
    case 2: return (devProp.minor == 1) ? 48 : 32;
    case 6: return (devProp.minor == 0) ? 64 : 128;
    case 8: return (devProp.minor == 0) ? 64 : 128;
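// Chooses the number of blocks from the occupancy calculator (unless config.or_nodes forces it),
// then sizes the per-thread stack and the device malloc heap so the estimated usage fits in the
// remaining global memory.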
template <class S, class U>
void configure_blocks_threads(CP<U>& root, const MemoryConfig& mem_config) {
  int max_block_per_sm;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_block_per_sm, (void*) gpu_solve_kernel<S>, CUDA_THREADS_PER_BLOCK, (int)mem_config.shared_bytes);
  printf("%% max_blocks_per_sm=%d\n", max_block_per_sm);

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0);
  size_t total_global_mem = deviceProp.totalGlobalMem;
  size_t num_sm = deviceProp.multiProcessorCount;
  size_t num_threads_per_sm = threads_per_sm(deviceProp);

  auto& config = root.config;
  root.stats.num_blocks = (config.or_nodes == 0) ? max_block_per_sm : config.or_nodes;
  config.stack_kb = config.stack_kb == 0 ? 32 : config.stack_kb;

  size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * config.stack_kb * 1000;
  size_t remaining_global_mem = total_global_mem - total_stack_size;
  remaining_global_mem -= remaining_global_mem / 10;

  size_t heap_usage_estimation = (root.stats.num_blocks + 1) * (mem_config.prop_bytes + mem_config.store_bytes + 100 * root.store->vars());
  while(heap_usage_estimation > remaining_global_mem) {
    /* ... */
  }
  CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
  CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem/15));
  /* ... */
  if(config.verbose_solving) {
template <class S, class U, class Timepoint>
void configure_and_run(CP<U>& root, const Timepoint& start) {
  MemoryConfig mem_config = configure_memory<S>(root);
  configure_blocks_threads<S>(root, mem_config);
  transfer_memory_and_run<S>(root, mem_config, start);
}
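// Entry point of the GPU solver: checks managed-memory support, then configures and runs the
// solver with the state types matching the available memory model (ItvSolverPinned or ItvSolver).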
void gpu_dive_and_solve(Configuration<bt::standard_allocator>& config) {
#ifndef __CUDACC__
  std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
#else
  check_support_managed_memory();
  check_support_concurrent_managed_memory();
  auto start = std::chrono::steady_clock::now();
  /* ... (elided: construction of the root problem `root`) ... */
  if(root.iprop->is_bot()) {
    /* ... */
  }
#ifdef NO_CONCURRENT_MANAGED_MEMORY
  configure_and_run<ItvSolverPinned>(root, start);
#else
  configure_and_run<ItvSolver>(root, start);
#endif
#endif
}