#ifndef TURBO_HYBRID_DIVE_AND_SOLVE_HPP
#define TURBO_HYBRID_DIVE_AND_SOLVE_HPP

#include "common_solving.hpp"
// ...

namespace bt = ::battery;

#define DEFAULT_SHARED_MEM_BYTES 0
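/** This header implements the hybrid CPU-GPU "dive and solve" algorithm (EPS-style
 * embarrassingly parallel search). Each CPU thread owns a `CPUCube` in which it performs
 * the search (diving to a subproblem, then solving it by propagate-and-search), while a
 * dedicated GPU block owns a matching `GPUCube` and performs constraint propagation.
 * Both sides exchange the variable store through pinned memory and synchronize with the
 * atomic flags `ready_to_propagate` and `ready_to_search`. */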
struct CPUCube {
  // ... (the member `cube` of type `cube_type` holds the CPU-side solver state:
  //      store, propagators, search tree, branch-and-bound and statistics)

  /** Snapshot of the root node, used to reinitialize the cube between two subproblems. */
  typename cube_type::IST::snapshot_type<bt::standard_allocator> root_snapshot;

  /** Set by the CPU thread when it exits `dive_and_solve`. */
  std::atomic_flag finished;

  /** Index of the subproblem currently solved by this cube. */
  size_t subproblem_idx;

  CPUCube(const CP<Itv>& root)
   : cube(root)
   , root_snapshot(cube.search_tree->template snapshot<bt::standard_allocator>())
  {}

  CPUCube(const CPUCube& other)
   : cube(typename cube_type::tag_gpu_block_copy{}, true, other.cube)
   , root_snapshot(cube.search_tree->template snapshot<bt::standard_allocator>())
  {}
};
struct GPUCube {
  /** Interval type used by the GPU store. */
  using Itv1 = Interval<ZLB<bound_value_type, bt::local_memory>>;
  using IStore = VStore<Itv1, bt::pool_allocator>;
#ifdef TURBO_IPC_ABSTRACT_DOMAIN
  using IProp = PC<IStore, bt::pool_allocator>;
#else
  using IProp = PIR<IStore, bt::pool_allocator>;
#endif

  /** Store and propagators of this cube, allocated on the GPU. */
  abstract_ptr<IStore> store_gpu;
  abstract_ptr<IProp> iprop_gpu;

  /** Pointer to the store of the matching CPU cube (in pinned memory); it is copied
   * to and from `store_gpu` at each propagation. */
  abstract_ptr<VStore<Itv, bt::pinned_allocator>> store_cpu;

  /** Raised by the GPU when the current node is a candidate solution. */
  cuda::std::atomic_flag solution_found;

  /** Statistics accumulated by this block. */
  size_t fp_iterations;
  size_t num_deductions;

  /** Below this number of active propagators, the WAC1 fixpoint falls back to AC1. */
  size_t wac1_threshold;

  /** Handshake flags with the CPU cube: the CPU raises `ready_to_propagate` when a new
   * node is available, the GPU raises `ready_to_search` when propagation is finished,
   * and the CPU raises `stop` to terminate the kernel. */
  cuda::std::atomic_flag ready_to_propagate;
  cuda::std::atomic_flag ready_to_search;
  cuda::std::atomic_flag stop;

  // ... (other members elided, among them the fixpoint kind `fp_kind` and the block
  //      timers `timers` used in `gpu_propagate`)

  GPUCube()
   : fp_iterations(0), num_deductions(0), wac1_threshold(0)
  {
    ready_to_search.clear();
    ready_to_propagate.clear();
    // ...
    solution_found.clear();
    cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  }
  template <class StoreType, class PCType>
  __device__ void allocate(StoreType& store, PCType& pc, size_t bytes, bool pc_shared) {
    int n = pc.num_deductions();
    // ...
    void* mem_pool = bt::global_allocator{}.allocate(bytes);
    bt::pool_allocator pool(static_cast<unsigned char*>(mem_pool), bytes);
    AbstractDeps<bt::global_allocator, bt::pool_allocator> deps(pc_shared, bt::global_allocator{}, pool);
    iprop_gpu = bt::allocate_shared<IProp, bt::pool_allocator>(pool, pc, deps);
    store_gpu = deps.template extract<IStore>(store.aty());
  }

  __device__ void deallocate() {
    store_gpu = abstract_ptr<IStore>();
    iprop_gpu = abstract_ptr<IProp>();
  }
};
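/** Allocate the GPU cubes from a single GPU thread. Cube 0 copies the store and the
 * propagators passed from managed memory; every other cube is then initialized from
 * cube 0 with `pc_shared` set to `true`. */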
template <class Store, class IProp>
__global__ void allocate_gpu_cubes(GPUCube* gpu_cubes, size_t n, Store* store, IProp* iprop)
{
  assert(threadIdx.x == 0 && blockIdx.x == 0);
  size_t bytes = store->get_allocator().total_bytes_allocated()
    + sizeof(GPUCube::IStore) + sizeof(GPUCube::IProp) + 1000;
  gpu_cubes[0].allocate(*store, *iprop, bytes + iprop->get_allocator().total_bytes_allocated(), false);
  for(int i = 1; i < n; ++i) {
    gpu_cubes[i].allocate(*gpu_cubes[0].store_gpu, *gpu_cubes[0].iprop_gpu, bytes, true);
  }
}
__global__ void deallocate_gpu_cubes(GPUCube* gpu_cubes, size_t n) {
  assert(threadIdx.x == 0 && blockIdx.x == 0);
  for(int i = 0; i < n; ++i) {
    gpu_cubes[i].deallocate();
  }
}
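/** Data shared by all CPU threads: the root problem and its statistics, the index of the
 * next subproblem to solve (updated atomically), the best bound found so far, the global
 * stop flag, a mutex serializing the printing of solutions, and the per-block CPU and
 * GPU cubes. The GPU cubes live in managed memory so both sides can access them. */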
struct CPUData {
  // ... (among the elided members: the root problem `root` and the best objective bound
  //      `best_bound`)
  ZLB<size_t, bt::atomic_memory<>> next_subproblem;
  std::atomic_flag cpu_stop;
  std::mutex print_lock;
  size_t shared_mem_bytes;
  bt::vector<CPUCube> cpu_cubes;
  bt::vector<GPUCube, bt::managed_allocator> gpu_cubes;

  CPUData(const CP<Itv>& root, size_t shared_mem_bytes)
   : next_subproblem(root.stats.num_blocks)
   , best_bound(Itv::top())
   // ...
   , shared_mem_bytes(shared_mem_bytes)
   , gpu_cubes(root.stats.num_blocks)
  {
    // Create one CPU cube per block: the first is built from the root problem, the
    // others are copies of the first.
    for(int i = 0; i < root.stats.num_blocks; ++i) {
      if(i == 0) {
        cpu_cubes.emplace_back(root);
      }
      else {
        cpu_cubes.emplace_back(cpu_cubes[0]);
      }
      cpu_cubes[i].subproblem_idx = i;
      gpu_cubes[i].store_cpu = cpu_cubes[i].cube.store;
    }
    // ... (`managed_cp`: a copy of the root problem allocated in CUDA managed memory,
    //      using bt::statistics_allocator<UniqueLightAlloc<bt::managed_allocator, 0>> and
    //      bt::statistics_allocator<UniqueLightAlloc<bt::managed_allocator, 1>>, from
    //      which the GPU cubes are initialized)
    root.stats.print_stat("store_mem", managed_cp.store.get_allocator().total_bytes_allocated());
    root.stats.print_stat("propagator_mem", managed_cp.iprop.get_allocator().total_bytes_allocated());
    // ...
    allocate_gpu_cubes<<<1, 1>>>(gpu_cubes.data(), gpu_cubes.size(), managed_cp.store.get(), managed_cp.iprop.get());
    CUDAEX(cudaDeviceSynchronize());
  }

  ~CPUData() {
    deallocate_gpu_cubes<<<1, 1>>>(gpu_cubes.data(), gpu_cubes.size());
    CUDAEX(cudaDeviceSynchronize());
  }

  CPUData(const CPUData&) = delete;
  CPUData(CPUData&&) = delete;
};
void dive_and_solve(CPUData& global, size_t cube_idx);
size_t dive(CPUData& global, size_t cube_idx);
void solve(CPUData& global, size_t cube_idx);
bool propagate(CPUData& global, size_t cube_idx);
bool update_global_best_bound(CPUData& global, size_t cube_idx);
void update_local_best_bound(CPUData& global, size_t cube_idx);
void reduce_cubes(CPUData& global);
size_t configure_gpu(CP<Itv>& cp);
__global__ void gpu_propagate(GPUCube* cube, size_t shared_bytes);
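/** Entry point of the hybrid solver. It prepares the root problem, creates one CPU thread
 * per cube, launches the `gpu_propagate` kernel with one block per cube, and then polls
 * the threads until they all finish or until a timeout/interruption raises `cpu_stop`.
 *
 * Minimal usage sketch (the `parse_config` helper is hypothetical, any way of building a
 * `Configuration` works):
 *
 *     Configuration<battery::standard_allocator> config = parse_config(argc, argv);
 *     hybrid_dive_and_solve(config);
 */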
void hybrid_dive_and_solve(const Configuration<battery::standard_allocator>& config)
{
#ifndef __CUDACC__
  std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
#else
  auto start = std::chrono::steady_clock::now();
  // ... (build and preprocess the root problem `cp` from `config`)
  if(cp.iprop->is_bot()) {
    // ... (the problem is detected unsatisfiable during preprocessing)
  }
  size_t shared_mem_bytes = configure_gpu(cp);
  // ...
  CPUData global(cp, shared_mem_bytes);
  // ...
  std::vector<std::thread> threads;
  for(int i = 0; i < global.root.stats.num_blocks; ++i) {
    threads.push_back(std::thread(dive_and_solve, std::ref(global), i));
  }
  // ...
  gpu_propagate<<<
    static_cast<unsigned int>(global.root.stats.num_blocks),
    CUDA_THREADS_PER_BLOCK,
    global.shared_mem_bytes>>>
    (global.gpu_cubes.data(), global.shared_mem_bytes);
  // ...
  size_t terminated = 0;
  while(terminated < threads.size()) {
    // Request a global stop when the user interrupts the solver or the timeout expires.
    if(must_quit(global.cpu_cubes[0].cube) || !check_timeout(global.cpu_cubes[0].cube, start)) {
      global.cpu_stop.test_and_set();
      cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    terminated = 0;
    for(int i = 0; i < global.cpu_cubes.size(); ++i) {
      if(global.cpu_cubes[i].finished.test()) {
        ++terminated;
      }
    }
  }
  for(auto& t : threads) {
    t.join();
  }
  CUDAEX(cudaDeviceSynchronize());
  // ...
  reduce_cubes(global);
  global.root.print_final_solution();
  global.root.print_mzn_statistics();
#endif
}
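/** Main loop of a CPU thread. It repeatedly dives to the subproblem `subproblem_idx`,
 * solves it, restores the root snapshot and takes the next index from
 * `global.next_subproblem`, until every subproblem is processed or `cpu_stop` is raised.
 * Before terminating, it raises the `stop` flag of its GPU cube and wakes the kernel up
 * so the GPU block can exit as well. */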
void dive_and_solve(CPUData& global, size_t cube_idx)
{
  auto& cube = global.cpu_cubes[cube_idx].cube;
  auto solving_start = cube.stats.start_timer_host();
  size_t num_subproblems = global.root.stats.eps_num_subproblems;
  size_t& subproblem_idx = global.cpu_cubes[cube_idx].subproblem_idx;
  // ...
  while(subproblem_idx < num_subproblems && !global.cpu_stop.test()) {
    if(global.root.config.verbose_solving >= 2) {
      std::lock_guard<std::mutex> print_guard(global.print_lock);
      printf("%% Cube %zu solves subproblem num %zu\n", cube_idx, subproblem_idx);
    }
    auto dive_start = cube.stats.start_timer_host();
    size_t remaining_depth = dive(global, cube_idx);
    // ...
    if(remaining_depth == 0) {
      solve(global, cube_idx);
      // ...
      if(!global.cpu_stop.test()) {
        cube.stats.eps_solved_subproblems += 1;
      }
    }
    // The dive stopped early on a leaf node: every subproblem sharing this prefix can be
    // skipped.
    else if(!global.cpu_stop.test()) {
      size_t next_subproblem_idx = ((subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
      global.next_subproblem.meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));
      // ...
      if((subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
        cube.stats.eps_skipped_subproblems += next_subproblem_idx - subproblem_idx;
      }
    }
    // ...
    if(!global.cpu_stop.test()) {
      // ...
      subproblem_idx = global.next_subproblem.atomic()++;
      // ...
      if(subproblem_idx < num_subproblems) {
        auto start = cube.stats.start_timer_host();
        cube.search_tree->restore(global.cpu_cubes[cube_idx].root_snapshot);
        // ...
      }
    }
  }
  // ...
  if(!global.cpu_stop.test()) {
    cube.stats.num_blocks_done = 1;
  }
  // Wake the GPU block up so the kernel can terminate, then signal this thread is done.
  global.gpu_cubes[cube_idx].stop.test_and_set();
  cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  global.gpu_cubes[cube_idx].ready_to_propagate.test_and_set(cuda::std::memory_order_seq_cst);
  global.gpu_cubes[cube_idx].ready_to_propagate.notify_one();
  // ...
  global.cpu_cubes[cube_idx].finished.test_and_set();
}
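/** Dive towards the subproblem encoded in binary by `subproblem_idx`: at each level we
 * propagate, split the current node in two branches and commit to the branch selected by
 * the corresponding bit of the index. Returns the remaining depth: 0 if the subproblem
 * node was reached, more if the dive stopped early on a leaf node. */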
size_t dive(CPUData& global, size_t cube_idx) {
  auto& cube = global.cpu_cubes[cube_idx].cube;
  bool stop_diving = false;
  size_t remaining_depth = cube.config.subproblems_power;
  // ...
  while(remaining_depth > 0 && !stop_diving && !global.cpu_stop.test()) {
    bool is_leaf_node = propagate(global, cube_idx);
    auto start = cube.stats.start_timer_host();
    if(is_leaf_node) {
      stop_diving = true;
    }
    else {
      remaining_depth--;
      auto branches = cube.split->split();
      assert(branches.size() == 2);
      // ...
      size_t branch_idx = (global.cpu_cubes[cube_idx].subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
      // ...
      cube.iprop->deduce(branches[branch_idx]);
      // ...
    }
  }
  return remaining_depth;
}
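/** Solve one subproblem with a propagate-and-search loop: tighten the store with the best
 * global bound, propagate on the GPU, then branch with the search tree, until the subtree
 * is exhausted or `cpu_stop` is raised. */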
void solve(CPUData& global, size_t cube_idx) {
  auto& cpu_cube = global.cpu_cubes[cube_idx].cube;
  bool has_changed = true;
  auto start = cpu_cube.stats.start_timer_host();
  while(has_changed && !global.cpu_stop.test()) {
    // ...
    update_local_best_bound(global, cube_idx);
    // ...
    propagate(global, cube_idx);
    // ...
    start = cpu_cube.stats.start_timer_host();
    has_changed = cpu_cube.search_tree->deduce();
  }
}
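/** Delegate the propagation of the current node to the GPU cube: prefetch the store to
 * the GPU, raise `ready_to_propagate`, then wait on `ready_to_search`. Afterwards the
 * node is analysed on the CPU: failed node, solution (possibly updating and printing the
 * best bound), or an unknown node that still needs to be split. Returns `true` on a leaf
 * node. */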
bool propagate(CPUData& global, size_t cube_idx) {
  auto& cpu_cube = global.cpu_cubes[cube_idx].cube;
  auto& gpu_cube = global.gpu_cubes[cube_idx];
  bool is_leaf_node = false;
  // ...
  gpu_cube.store_cpu->prefetch(0);
  // ...
  cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  gpu_cube.ready_to_propagate.test_and_set(cuda::std::memory_order_seq_cst);
  gpu_cube.ready_to_propagate.notify_one();
  gpu_cube.ready_to_search.wait(false, cuda::std::memory_order_seq_cst);
  gpu_cube.ready_to_search.clear();
  // ...
  auto start = cpu_cube.stats.start_timer_host();
  gpu_cube.store_cpu->prefetch(cudaCpuDeviceId);
  // ...
  bool is_pruned = cpu_cube.on_node();
  // The propagation led to an inconsistent store: this node is failed.
  if(cpu_cube.iprop->is_bot()) {
    is_leaf_node = true;
    cpu_cube.on_failed_node();
  }
  // The GPU flagged this node as a candidate solution.
  else if(gpu_cube.solution_found.test()) {
    is_leaf_node = true;
    gpu_cube.solution_found.clear();
    // ...
    if(cpu_cube.bab->is_satisfaction() || cpu_cube.bab->compare_bound(*cpu_cube.store, cpu_cube.bab->optimum())) {
      cpu_cube.bab->deduce();
      bool print_solution = cpu_cube.is_printing_intermediate_sol();
      if(cpu_cube.bab->is_optimization()) {
        // ...
        print_solution &= update_global_best_bound(global, cube_idx);
      }
      if(print_solution) {
        std::lock_guard<std::mutex> print_guard(global.print_lock);
        global.root.print_solution(*cpu_cube.best);
      }
      // ...
      is_pruned |= cpu_cube.update_solution_stats();
    }
  }
  // ...
  if(is_pruned) {
    global.cpu_stop.test_and_set();
  }
  return is_leaf_node;
}
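/** Kernel executed by one block per cube for the whole solving process. Each block waits
 * on `ready_to_propagate`, copies the CPU store into its GPU store, runs the fixpoint
 * engine selected by `fp_kind` (in shared memory when it fits), copies the store back,
 * detects failed and solution nodes, and raises `ready_to_search` to hand the node back
 * to the CPU. */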
__global__ void gpu_propagate(GPUCube* gpu_cubes, size_t shared_bytes) {
  extern __shared__ unsigned char shared_mem[];
  GPUCube& cube = gpu_cubes[blockIdx.x];
  GPUCube::IProp& iprop = *cube.iprop_gpu;
  __shared__ int warp_iterations[CUDA_THREADS_PER_BLOCK/32];
  // ...
  __shared__ FixpointSubsetGPU<BlockAsynchronousFixpointGPU<true>, bt::global_allocator, CUDA_THREADS_PER_BLOCK> fp_engine;
  fp_engine.init(iprop.num_deductions());
  // ...
  __shared__ bool stop;
  // If enough shared memory is available, relocate the store of this block into it.
  if(threadIdx.x == 0 && shared_bytes > DEFAULT_SHARED_MEM_BYTES) {
    bt::pool_allocator shared_mem_pool(shared_mem, shared_bytes);
    cube.store_gpu->reset_data(shared_mem_pool);
  }
  // ...
  auto group = cooperative_groups::this_thread_block();
  // ...
  auto start = cube.timers.start_timer_device();
  // Serve the propagation requests of the CPU cube until it raises `stop`.
  while(true) {
    if(threadIdx.x == 0) {
      start = cube.timers.start_timer_device();
      cube.ready_to_propagate.wait(false, cuda::std::memory_order_seq_cst);
      // ...
      cube.ready_to_propagate.clear();
      // ...
      stop = cube.stop.test();
    }
    __syncthreads();
    if(stop) {
      break;
    }
    // ...
    start = cube.timers.start_timer_device();
    cube.store_cpu->copy_to(group, *cube.store_gpu);
    // ...
    size_t fp_iterations = 0;
    warp_iterations[threadIdx.x / 32] = 0;
    switch(cube.fp_kind) {
      // AC1: all active propagators are executed at each iteration of the fixpoint loop.
      case FixpointKind::AC1: {
        fp_iterations = fp_engine.fixpoint(
          [&](int i){ return iprop.deduce(i); },
          [&](){ return iprop.is_bot(); });
        if(threadIdx.x == 0) {
          cube.num_deductions += fp_iterations * fp_engine.num_active();
        }
        break;
      }
      // WAC1: warp-based fixpoint, falling back to AC1 when few propagators remain active.
      case FixpointKind::WAC1: {
        if(fp_engine.num_active() <= cube.wac1_threshold) {
          fp_iterations = fp_engine.fixpoint(
            [&](int i){ return iprop.deduce(i); },
            [&](){ return iprop.is_bot(); });
          if(threadIdx.x == 0) {
            cube.num_deductions += fp_iterations * fp_engine.num_active();
          }
        }
        else {
          fp_iterations = fp_engine.fixpoint(
            [&](int i){ return warp_fixpoint<CUDA_THREADS_PER_BLOCK>(iprop, i, warp_iterations); },
            [&](){ return iprop.is_bot(); });
          if(threadIdx.x == 0) {
            for(int i = 0; i < CUDA_THREADS_PER_BLOCK/32; ++i) {
              cube.num_deductions += warp_iterations[i] * 32;
            }
          }
        }
        break;
      }
    }
    // ...
    cube.store_gpu->copy_to(group, *cube.store_cpu);
    // ...
    if(threadIdx.x == 0) {
      cube.fp_iterations += fp_iterations;
    }
    // Detect leaf nodes: the store is inconsistent, or all propagators are entailed and
    // the store can be extracted as a candidate solution.
    bool is_leaf_node = cube.store_gpu->is_bot();
    // ...
    fp_engine.select([&](int i) { return !iprop.ask(i); });
    // ...
    if(fp_engine.num_active() == 0) {
      is_leaf_node = cube.store_gpu->template is_extractable<AtomicExtraction>(group);
      if(threadIdx.x == 0 && is_leaf_node) {
        cube.solution_found.test_and_set();
      }
    }
    // ...
    cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
    // Hand the node back to the CPU cube.
    if(threadIdx.x == 0) {
      cube.ready_to_search.test_and_set(cuda::std::memory_order_seq_cst);
      cube.ready_to_search.notify_one();
    }
    // ...
    fp_engine.reset(iprop.num_deductions());
  }
  // ...
}
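/** Merge the best bound found by this cube into `global.best_bound`. Returns `true` when
 * the global bound is improved, which tells the caller whether the solution is worth
 * printing. */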
bool update_global_best_bound(CPUData& global, size_t cube_idx) {
  const auto& cube = global.cpu_cubes[cube_idx].cube;
  assert(cube.bab->is_optimization());
  // ...
  auto local_best = cube.bab->optimum().project(cube.bab->objective_var());
  // ...
  if(cube.bab->is_maximization()) {
    return global.best_bound.meet_lb(dual_bound<Itv::LB>(local_best.ub()));
  }
  else {
    return global.best_bound.meet_ub(dual_bound<Itv::UB>(local_best.lb()));
  }
}
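/** Inject the current global best bound into the store of this cube so that the next
 * propagation can prune branches that cannot improve the objective. */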
void update_local_best_bound(CPUData& global, size_t cube_idx) {
  if(global.cpu_cubes[cube_idx].cube.bab->is_optimization()) {
    auto& cube = global.cpu_cubes[cube_idx].cube;
    VarEnv<bt::standard_allocator> empty_env{};
    auto best_formula = cube.bab->template deinterpret_best_bound<bt::standard_allocator>(
      cube.bab->is_maximization()
        ? Itv(dual_bound<Itv::UB>(global.best_bound.lb()))
        : Itv(dual_bound<Itv::LB>(global.best_bound.ub())));
    IDiagnostics diagnostics;
    bool r = interpret_and_tell(best_formula, empty_env, *cube.store, diagnostics);
    // ...
  }
}
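/** Aggregate the statistics and best solutions of all cubes into the root problem, once
 * every thread has terminated. */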
void reduce_cubes(CPUData& global) {
  for(int i = 0; i < global.cpu_cubes.size(); ++i) {
    // ...
    global.cpu_cubes[i].cube.stats.meet(global.gpu_cubes[i].timers);
    global.root.meet(global.cpu_cubes[i].cube);
    global.root.stats.fixpoint_iterations += global.gpu_cubes[i].fp_iterations;
    global.root.stats.num_deductions += global.gpu_cubes[i].num_deductions;
  }
}
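/** Configure the GPU before launching the kernel: decide whether the store fits in shared
 * memory, choose the number of blocks (from `or_nodes` or the occupancy API) and set the
 * stack and heap limits of the device. Returns the number of bytes of shared memory to
 * request per block. */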
size_t configure_gpu(CP<Itv>& cp) {
  auto& config = cp.config;
  // Estimate the shared memory needed to hold the store of intervals of one block.
  size_t alignment_overhead = 200;
  size_t shared_mem_bytes = DEFAULT_SHARED_MEM_BYTES + alignment_overhead + (cp.store->vars() * sizeof(GPUCube::Itv1));
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0);
  if(shared_mem_bytes >= deviceProp.sharedMemPerBlock || config.only_global_memory) {
    shared_mem_bytes = DEFAULT_SHARED_MEM_BYTES;
  }
  // ...
  int hint_num_blocks;
  int hint_num_threads;
  CUDAE(cudaOccupancyMaxPotentialBlockSize(&hint_num_blocks, &hint_num_threads, (void*) gpu_propagate, shared_mem_bytes));
  size_t total_global_mem = deviceProp.totalGlobalMem;
  size_t num_sm = deviceProp.multiProcessorCount;
  cp.stats.num_blocks = (config.or_nodes == 0) ? hint_num_blocks : config.or_nodes;
  // ...
  size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * (config.stack_kb == 0 ? 1 : config.stack_kb) * 1000;
  size_t remaining_global_mem = total_global_mem - total_stack_size;
  remaining_global_mem -= remaining_global_mem / 10; // Keep 10% of global memory in reserve.
  if(config.stack_kb != 0) {
    CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
  }
  CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem));
  // ...
  int num_blocks;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, (void*) gpu_propagate, CUDA_THREADS_PER_BLOCK, shared_mem_bytes);
  // ...
  printf("%% max_blocks_per_sm=%d\n", num_blocks);
  // ...
  return shared_mem_bytes;
}

#endif // TURBO_HYBRID_DIVE_AND_SOLVE_HPP