#ifndef TURBO_HYBRID_DIVE_AND_SOLVE_HPP
#define TURBO_HYBRID_DIVE_AND_SOLVE_HPP

namespace bt = ::battery;

#include <cub/block/block_scan.cuh>

#define DEFAULT_SHARED_MEM_BYTES 0
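/** Overall design, as read from the code below: the solver runs `or_nodes` parallel "cubes".
 * Each cube pairs a CPU thread running the branch-and-bound search (dive_and_solve) with one
 * block of the persistent gpu_propagate kernel running the propagation fixpoint; the two sides
 * exchange the variable store through pinned memory and synchronize with the atomic flags
 * ready_to_propagate / ready_to_search.
 *
 * The members below belong to CPUCube, the CPU-side state of one cube: a snapshot of the root
 * search tree (restored whenever a new subproblem is picked), a `finished` flag and the index
 * of the subproblem currently being solved. */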
  typename cube_type::IST::snapshot_type<bt::standard_allocator> root_snapshot;
  std::atomic_flag finished;
  size_t subproblem_idx;

  // From the constructor's initializer list: the root snapshot is taken from the cube's search tree.
   , root_snapshot(cube.search_tree->template snapshot<bt::standard_allocator>())
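/** GPUCube: the GPU-side state of one cube. It holds the device copy of the variable store
 * (store_gpu) and of the propagators (ipc_gpu), the buffers pidx/pmask/psum/pidx2 used to
 * compact the list of propagators to run after each fixpoint, a pointer to the CPU store in
 * pinned memory (store_cpu), and the flags used to hand work back and forth between the CPU
 * thread and the GPU block. */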
  using Itv1 = Interval<ZLB<int, bt::local_memory>>;
  using IStore = VStore<Itv1, bt::pool_allocator>;
  using IPC = PC<IStore, bt::pool_allocator>;

  abstract_ptr<IStore> store_gpu;
  // Buffers used to compact the list of propagator indices to run (see gpu_propagate).
  bt::vector<int, bt::pool_allocator> pidx;
  bt::vector<bool, bt::pool_allocator> pmask;
  bt::vector<int, bt::pool_allocator> psum;
  bt::vector<int, bt::pool_allocator> pidx2;
  abstract_ptr<IPC> ipc_gpu;
  abstract_ptr<VStore<Itv, bt::pinned_allocator>> store_cpu;
  cuda::std::atomic_flag solution_found;
  size_t fp_iterations;
  // Handshake flags between the CPU thread and the GPU block (see propagate / gpu_propagate).
  cuda::std::atomic_flag ready_to_propagate;
  cuda::std::atomic_flag ready_to_search;
  cuda::std::atomic_flag stop;

  // Constructor body: the flags start cleared.
    ready_to_search.clear();
    ready_to_propagate.clear();
    solution_found.clear();
    cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
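  /** Allocate the GPU-side data of this cube in a single global-memory pool sized from the CPU
   * store and propagators plus the compaction buffers, then copy the store and propagators into
   * it. The pc_shared flag is forwarded to AbstractDeps, presumably to share rather than
   * duplicate the propagator objects (see allocate_gpu_cubes below, where only cube 0 passes
   * false). */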
  template <class StoreType, class PCType>
  __device__ void allocate(StoreType& store, PCType& pc, size_t bytes, bool pc_shared) {
    int n = pc.num_deductions();
    // Round n up to a multiple of BLOCK_SIZE, as required by the block-wide scan buffers.
    n = n + ((BLOCK_SIZE - n % BLOCK_SIZE) % BLOCK_SIZE);
    // Extra room for the four compaction buffers (pidx, pmask, psum, pidx2).
    bytes += 100 + 4 * sizeof(int) * n;
    void* mem_pool = bt::global_allocator{}.allocate(bytes);
    bt::pool_allocator pool(static_cast<unsigned char*>(mem_pool), bytes);
    AbstractDeps<bt::global_allocator, bt::pool_allocator> deps(pc_shared, bt::global_allocator{}, pool);
    ipc_gpu = bt::allocate_shared<IPC, bt::pool_allocator>(pool, pc, deps);
    store_gpu = deps.template extract<IStore>(store.aty());
    pidx = bt::vector<int, bt::pool_allocator>(pc.num_deductions(), pool);
    for(int i = 0; i < pidx.size(); ++i) {
      /* ... */
    }
    pmask = bt::vector<bool, bt::pool_allocator>(n, false, pool);
    psum = bt::vector<int, bt::pool_allocator>(n, 0, pool);
    pidx2 = bt::vector<int, bt::pool_allocator>(pidx, pool);
  }
  __device__ void deallocate() {
    store_gpu = abstract_ptr<IStore>();
    ipc_gpu = abstract_ptr<IPC>();
    pidx = bt::vector<int, bt::pool_allocator>();
    pmask = bt::vector<bool, bt::pool_allocator>();
    psum = bt::vector<int, bt::pool_allocator>();
    pidx2 = bt::vector<int, bt::pool_allocator>();
  }
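/** Single-thread kernel allocating all GPU cubes: cube 0 copies the store and propagators from
 * the (managed-memory) problem passed as arguments, and every other cube is initialized from
 * cube 0's device copies with pc_shared = true. */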
template <class Store, class IPC>
__global__ void allocate_gpu_cubes(GPUCube* gpu_cubes, size_t n, Store* store, IPC* ipc) {
  assert(threadIdx.x == 0 && blockIdx.x == 0);
  size_t bytes = store->get_allocator().total_bytes_allocated()
    + sizeof(GPUCube::IStore) + sizeof(GPUCube::IPC) + 1000;
  gpu_cubes[0].allocate(*store, *ipc, bytes + ipc->get_allocator().total_bytes_allocated(), false);
  for(int i = 1; i < n; ++i) {
    gpu_cubes[i].allocate(*gpu_cubes[0].store_gpu, *gpu_cubes[0].ipc_gpu, bytes, true);
  }
}
__global__ void deallocate_gpu_cubes(GPUCube* gpu_cubes, size_t n) {
  assert(threadIdx.x == 0 && blockIdx.x == 0);
  for(int i = 0; i < n; ++i) {
    gpu_cubes[i].deallocate();
  }
}
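/** CPUData: the state shared by all CPU threads: the atomically increasing index of the next
 * subproblem to solve, the global best bound, the global stop flag, a lock serializing printing,
 * and the arrays of CPU and GPU cubes (the latter in managed memory, presumably so that both
 * host and device can access the synchronization flags). */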
  ZLB<size_t, bt::atomic_memory<>> next_subproblem;
  std::atomic_flag cpu_stop;
  std::mutex print_lock;
  size_t shared_mem_bytes;
  bt::vector<CPUCube> cpu_cubes;
  bt::vector<GPUCube, bt::managed_allocator> gpu_cubes;
  CPUData(const CP<Itv>& root, size_t shared_mem_bytes)
   : next_subproblem(root.config.or_nodes)
   , best_bound(Itv::top())
   /* ... */
   , shared_mem_bytes(shared_mem_bytes)
   , cpu_cubes(root.config.or_nodes, this->root)
   , gpu_cubes(root.config.or_nodes)
  {
    /* ... for each cube i: */
      cpu_cubes[i].subproblem_idx = i;
      gpu_cubes[i].store_cpu = cpu_cubes[i].cube.store;
    /* ... */
    // managed_cp: a copy of the problem using managed-memory allocators (type fragment below),
    // read by allocate_gpu_cubes.
      bt::statistics_allocator<UniqueLightAlloc<bt::managed_allocator, 0>>,
      bt::statistics_allocator<UniqueLightAlloc<bt::managed_allocator, 1>>>
    /* ... */
    printf("%%%%%%mzn-stat: store_mem=%" PRIu64 "\n", managed_cp.store.get_allocator().total_bytes_allocated());
    printf("%%%%%%mzn-stat: propagator_mem=%" PRIu64 "\n", managed_cp.ipc.get_allocator().total_bytes_allocated());
    allocate_gpu_cubes<<<1, 1>>>(gpu_cubes.data(), gpu_cubes.size(), managed_cp.store.get(), managed_cp.ipc.get());
    CUDAEX(cudaDeviceSynchronize());
  }

  /* ... (presumably the destructor) releases the GPU-side cubes: */
    deallocate_gpu_cubes<<<1, 1>>>(gpu_cubes.data(), gpu_cubes.size());
    CUDAEX(cudaDeviceSynchronize());
  /* ... */
  CPUData(const CPUData&) = delete;
  CPUData(CPUData&&) = delete;
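/** The functions below implement the solving workflow: dive_and_solve is the main loop of each
 * CPU thread; dive descends to the subproblem assigned to the cube and solve explores it;
 * propagate delegates the fixpoint to the GPU block and interprets the result; the
 * *_best_bound functions exchange objective bounds between cubes; reduce_cubes merges all cubes
 * into the root at the end; configure_gpu selects grid/block sizes and memory limits;
 * gpu_propagate is the persistent GPU kernel. */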
void dive_and_solve(CPUData& global, size_t cube_idx);
size_t dive(CPUData& global, size_t cube_idx);
void solve(CPUData& global, size_t cube_idx);
bool propagate(CPUData& global, size_t cube_idx);
bool update_global_best_bound(CPUData& global, size_t cube_idx);
void update_local_best_bound(CPUData& global, size_t cube_idx);
void reduce_cubes(CPUData& global);
size_t configure_gpu(CP<Itv>& cp);
__global__ void gpu_propagate(GPUCube* cube, size_t shared_bytes);
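/** Entry point, hybrid_dive_and_solve(const Configuration<battery::standard_allocator>& config):
 * without a CUDA compiler it only reports an error; otherwise it configures the GPU, builds the
 * shared CPUData, spawns one CPU thread per cube (dive_and_solve), launches gpu_propagate with
 * or_nodes blocks of and_nodes threads, then polls until every thread has finished (or a
 * timeout/interruption sets cpu_stop), joins them, reduces the cubes and prints the final
 * solution and statistics. */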
  // Without a CUDA compiler, the hybrid solver cannot run:
  std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
  /* ... */
  auto start = std::chrono::high_resolution_clock::now();
  /* ... construct the problem `cp` from the configuration ... */
  size_t shared_mem_bytes = configure_gpu(cp);
  /* ... */
  CPUData global(cp, shared_mem_bytes);
  std::vector<std::thread> threads;
  for(int i = 0; i < global.root.config.or_nodes; ++i) {
    threads.push_back(std::thread(dive_and_solve, std::ref(global), i));
  }
  gpu_propagate<<<
    static_cast<unsigned int>(global.root.config.or_nodes),
    static_cast<unsigned int>(global.root.config.and_nodes),
    global.shared_mem_bytes>>>
    (global.gpu_cubes.data(), global.shared_mem_bytes);
  size_t terminated = 0;
  while(terminated < threads.size()) {
    /* ... on timeout or interruption, ask everybody to stop: */
      global.cpu_stop.test_and_set();
      cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
    /* ... */
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    /* ... count the finished cubes: */
    for(int i = 0; i < global.cpu_cubes.size(); ++i) {
      if(global.cpu_cubes[i].finished.test()) {
        /* ... */
      }
    }
  }
  for(auto& t : threads) {
    t.join();
  }
  CUDAEX(cudaDeviceSynchronize());
  reduce_cubes(global);
  global.root.print_final_solution();
  global.root.print_mzn_statistics();
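/** dive_and_solve: main loop of one CPU thread. Subproblems are numbered 0..2^p-1 where
 * p = subproblems_power; the bits of the subproblem index select the branch taken at each level
 * of the dive (see dive below). When the dive stops early with `remaining_depth` levels left,
 * the whole block of indices sharing the already-failed prefix is skipped by jumping to
 * ((idx >> remaining_depth) + 1) << remaining_depth; e.g. with idx = 5 (binary 101) and
 * remaining_depth = 2, the next candidate is 8 (binary 1000). The next index to try is then
 * taken from the shared atomic counter global.next_subproblem. */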
void dive_and_solve(CPUData& global, size_t cube_idx)
{
  size_t num_subproblems = global.root.stats.eps_num_subproblems;
  size_t& subproblem_idx = global.cpu_cubes[cube_idx].subproblem_idx;
  auto& cube = global.cpu_cubes[cube_idx].cube;
  while(subproblem_idx < num_subproblems && !global.cpu_stop.test()) {
    if(global.root.config.verbose_solving) {
      std::lock_guard<std::mutex> print_guard(global.print_lock);
      printf("%% Cube %zu solves subproblem num %zu\n", cube_idx, subproblem_idx);
    }
    size_t remaining_depth = dive(global, cube_idx);
    if(remaining_depth == 0) {
      solve(global, cube_idx);
      if(!global.cpu_stop.test()) {
        cube.stats.eps_solved_subproblems += 1;
      }
    }
    // The dive stopped early on a leaf: skip all subproblems sharing the current prefix.
    else if(!global.cpu_stop.test()) {
      size_t next_subproblem_idx = ((subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
      global.next_subproblem.meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));
      // When the low `remaining_depth` bits are zero, this cube records the skipped subproblems.
      if((subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
        cube.stats.eps_skipped_subproblems += next_subproblem_idx - subproblem_idx;
      }
    }
    // Pick the next subproblem and restore the root state before diving again.
    if(!global.cpu_stop.test()) {
      subproblem_idx = global.next_subproblem.atomic()++;
      if(subproblem_idx < num_subproblems) {
        cube.search_tree->restore(global.cpu_cubes[cube_idx].root_snapshot);
        cube.eps_split->reset();
      }
    }
  }
  if(!global.cpu_stop.test()) {
    cube.stats.num_blocks_done = 1;
  }
  // Wake up the GPU block one last time so it can observe `stop` and terminate.
  global.gpu_cubes[cube_idx].stop.test_and_set();
  cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  global.gpu_cubes[cube_idx].ready_to_propagate.test_and_set(cuda::std::memory_order_seq_cst);
  global.gpu_cubes[cube_idx].ready_to_propagate.notify_one();
  global.cpu_cubes[cube_idx].finished.test_and_set();
}
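/** dive: descend `subproblems_power` levels by alternately propagating (on the GPU) and
 * committing to the branch selected by the corresponding bit of the cube's subproblem index.
 * Returns the number of levels that could not be taken because a leaf was reached early;
 * 0 means the dive reached its target depth. */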
size_t dive(CPUData& global, size_t cube_idx) {
  auto& cube = global.cpu_cubes[cube_idx].cube;
  bool stop_diving = false;
  size_t remaining_depth = cube.config.subproblems_power;
  while(remaining_depth > 0 && !stop_diving && !global.cpu_stop.test()) {
    bool is_leaf_node = propagate(global, cube_idx);
    /* ... stop diving at a leaf; otherwise remaining_depth is decremented before branching: */
    auto branches = cube.eps_split->split();
    assert(branches.size() == 2);
    size_t branch_idx = (global.cpu_cubes[cube_idx].subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
    cube.ipc->deduce(branches[branch_idx]);
    /* ... */
  }
  return remaining_depth;
}
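/** solve: explore the subproblem reached by the dive with a classic propagate-and-search loop,
 * injecting the best bound found by the other cubes before each propagation. */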
void solve(CPUData& global, size_t cube_idx) {
  auto& cpu_cube = global.cpu_cubes[cube_idx].cube;
  bool has_changed = true;
  while(has_changed && !global.cpu_stop.test()) {
    update_local_best_bound(global, cube_idx);
    propagate(global, cube_idx);
    has_changed = cpu_cube.search_tree->deduce();
  }
}
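/** propagate: delegate one propagation step to the GPU block attached to this cube. The CPU
 * prefetches the pinned store to the device, raises ready_to_propagate, waits on
 * ready_to_search, prefetches the store back, then interprets the result: a bottom store is a
 * failed node; if the GPU raised solution_found, the solution is recorded, the global best
 * bound is updated and the solution possibly printed. Returns whether the node is a leaf. */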
bool propagate(CPUData& global, size_t cube_idx) {
  auto& cpu_cube = global.cpu_cubes[cube_idx].cube;
  auto& gpu_cube = global.gpu_cubes[cube_idx];
  bool is_leaf_node = false;
  // Prefetch the pinned store to device 0, then hand it over to the GPU block.
  gpu_cube.store_cpu->prefetch(0);
  cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  gpu_cube.ready_to_propagate.test_and_set(cuda::std::memory_order_seq_cst);
  gpu_cube.ready_to_propagate.notify_one();
  gpu_cube.ready_to_search.wait(false, cuda::std::memory_order_seq_cst);
  gpu_cube.ready_to_search.clear();
  gpu_cube.store_cpu->prefetch(cudaCpuDeviceId);
  bool is_pruned = cpu_cube.on_node();
  if(cpu_cube.ipc->is_bot()) {
    // Failed node.
    cpu_cube.on_failed_node();
    /* ... */
  }
  else if(gpu_cube.solution_found.test()) {
    // Solution node.
    gpu_cube.solution_found.clear();
    /* ... */
    cpu_cube.bab->deduce();
    bool print_solution = cpu_cube.is_printing_intermediate_sol();
    if(cpu_cube.bab->is_optimization()) {
      print_solution &= update_global_best_bound(global, cube_idx);
    }
    /* ... */
      cpu_cube.print_solution();
    /* ... */
    is_pruned |= cpu_cube.update_solution_stats();
  }
  /* ... when the search must stop (is_pruned), ask all cubes to stop: */
    global.cpu_stop.test_and_set();
  /* ... */
  return is_leaf_node;
}
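/** gpu_propagate: persistent kernel, one block per cube. Each block optionally re-allocates its
 * store in shared memory, then loops: wait for ready_to_propagate, copy the CPU store in, run
 * the propagation fixpoint over the propagators listed in pidx, copy the store back, signal
 * ready_to_search, and finally compact pidx, apparently so that the next round only runs the
 * propagators that are not yet entailed. The compaction is a block-wide stream compaction:
 * pmask marks the propagators to keep, psum is its inclusive prefix sum (computed tile by tile
 * with cub::BlockScan, then carried across tiles), and each kept index is scattered to position
 * psum[i]-1. */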
__global__ void gpu_propagate(GPUCube* gpu_cubes, size_t shared_bytes) {
  extern __shared__ unsigned char shared_mem[];
  GPUCube& cube = gpu_cubes[blockIdx.x];
  __shared__ BlockAsynchronousIterationGPU fp_engine;
  __shared__ bool stop;
  if(threadIdx.x == 0 && shared_bytes > DEFAULT_SHARED_MEM_BYTES) {
    bt::pool_allocator shared_mem_pool(shared_mem, shared_bytes);
    cube.store_gpu->reset_data(shared_mem_pool);
  }
  auto group = cooperative_groups::this_thread_block();
  using BlockScan = cub::BlockScan<int, BLOCK_SIZE>;
  assert(BLOCK_SIZE == blockDim.x);
  __shared__ typename BlockScan::TempStorage cub_prefixsum_tmp;
  /* ... main loop: wait for work from the CPU side ... */
    if(threadIdx.x == 0) {
      cube.ready_to_propagate.wait(false, cuda::std::memory_order_seq_cst);
      cube.ready_to_propagate.clear();
      stop = cube.stop.test();
    }
    /* ... */
    cube.store_cpu->copy_to(group, *cube.store_gpu);
    size_t fp_iterations = fp_engine.fixpoint(cube.pidx, *(cube.ipc_gpu));
    cube.store_gpu->copy_to(group, *cube.store_cpu);
    if(threadIdx.x == 0) {
      cube.fp_iterations += fp_iterations;
    }
    bool is_leaf_node = cube.store_gpu->is_bot();
    // Round the number of propagators up to a multiple of the block size for the scan.
    int n = cube.pidx.size() + ((blockDim.x - cube.pidx.size() % blockDim.x) % blockDim.x);
    // pmask[i] is set when propagator pidx[i] is not entailed (ask returns false) and must be kept.
    for(int i = threadIdx.x; i < cube.pidx.size(); i += blockDim.x) {
      cube.pmask[i] = !cube.ipc_gpu->ask(cube.pidx[i]);
    }
    // Tile-wise inclusive prefix sum of pmask into psum...
    for(int i = threadIdx.x; i < n; i += blockDim.x) {
      BlockScan(cub_prefixsum_tmp).InclusiveSum(cube.pmask[i], cube.psum[i]);
    }
    /* ... */
    // ...then add the last prefix of the previous tile to every element of the current tile.
    for(int i = blockDim.x + threadIdx.x; i < n; i += blockDim.x) {
      cube.psum[i] += cube.psum[i - threadIdx.x - 1];
    }
    /* ... */
    if(cube.psum[cube.pidx.size()-1] == 0) {
      // No propagator left to run: check whether the store can be extracted as a solution.
      is_leaf_node = cube.store_gpu->template is_extractable<AtomicExtraction>(group);
      if(threadIdx.x == 0 && is_leaf_node) {
        cube.solution_found.test_and_set();
      }
    }
    cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
    if(threadIdx.x == 0) {
      cube.ready_to_search.test_and_set(cuda::std::memory_order_seq_cst);
      cube.ready_to_search.notify_one();
    }
    if(threadIdx.x == 0) {
      cube.pidx.resize(cube.ipc_gpu->num_deductions());
      cube.pidx2.resize(cube.ipc_gpu->num_deductions());
    }
    /* ... */
    for(int i = threadIdx.x; i < cube.pidx.size(); i += blockDim.x) {
      /* ... */
    }
    if(threadIdx.x == 0) {
      battery::swap(cube.pidx, cube.pidx2);
      cube.pidx.resize(cube.psum[cube.pidx2.size()-1]);
    }
    /* ... */
    // Scatter the indices of the propagators to keep into their compacted positions.
    for(int i = threadIdx.x; i < cube.pidx2.size(); i += blockDim.x) {
      /* ... */
      cube.pidx[cube.psum[i]-1] = cube.pidx2[i];
    }
  /* ... */
}
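/** update_global_best_bound: push the cube's local optimum into the shared best bound. For a
 * maximization problem the lower bound is raised, for a minimization problem the upper bound is
 * lowered (via the dual bound); the return value is used in propagate to decide whether the
 * solution is worth printing. */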
bool update_global_best_bound(CPUData& global, size_t cube_idx) {
  const auto& cube = global.cpu_cubes[cube_idx].cube;
  assert(cube.bab->is_optimization());
  auto local_best = cube.bab->optimum().project(cube.bab->objective_var());
  if(cube.bab->is_maximization()) {
    return global.best_bound.meet_lb(dual<Itv::LB>(local_best.ub()));
  }
  return global.best_bound.meet_ub(dual<Itv::UB>(local_best.lb()));
}
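/** update_local_best_bound: the converse direction: read the shared best bound and add it to
 * the cube's store as a constraint on the objective, so the cube prunes branches that cannot
 * improve on solutions found by other cubes. */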
void update_local_best_bound(CPUData& global, size_t cube_idx) {
  if(global.cpu_cubes[cube_idx].cube.bab->is_optimization()) {
    auto& cube = global.cpu_cubes[cube_idx].cube;
    VarEnv<bt::standard_allocator> empty_env{};
    auto best_formula = cube.bab->template deinterpret_best_bound<bt::standard_allocator>(
      cube.bab->is_maximization()
        ? Itv(dual<Itv::UB>(global.best_bound.lb()))
        : Itv(dual<Itv::LB>(global.best_bound.ub())));
    IDiagnostics diagnostics;
    bool r = interpret_and_tell(best_formula, empty_env, *cube.store, diagnostics);
    /* ... */
  }
}
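/** reduce_cubes: after all threads have finished, merge every cube (best solution and
 * statistics) into the root problem and accumulate the number of GPU fixpoint iterations. */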
void reduce_cubes(CPUData& global) {
  for(int i = 0; i < global.cpu_cubes.size(); ++i) {
    global.root.meet(global.cpu_cubes[i].cube);
    global.root.stats.fixpoint_iterations += global.gpu_cubes[i].fp_iterations;
  }
}
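/** configure_gpu: decide whether the store fits in shared memory (otherwise fall back to global
 * memory), pick default block/grid sizes from the occupancy API when and_nodes/or_nodes are not
 * set by the user, and set the device stack and malloc-heap limits accordingly. Returns the
 * number of bytes of dynamic shared memory to pass to gpu_propagate. */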
size_t configure_gpu(CP<Itv>& cp) {
  auto& config = cp.config;
  /* ... */
  size_t alignment_overhead = 200;
  size_t shared_mem_bytes = DEFAULT_SHARED_MEM_BYTES + alignment_overhead + (cp.store->vars() * sizeof(GPUCube::Itv1));
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0);
  if(shared_mem_bytes >= deviceProp.sharedMemPerBlock || config.only_global_memory) {
    shared_mem_bytes = DEFAULT_SHARED_MEM_BYTES;
    printf("%%%%%%mzn-stat: memory_configuration=\"global\"\n");
  }
  else {
    printf("%%%%%%mzn-stat: memory_configuration=\"store_shared\"\n");
  }
  printf("%%%%%%mzn-stat: shared_mem=%" PRIu64 "\n", shared_mem_bytes);
  int hint_num_blocks;
  int hint_num_threads;
  CUDAE(cudaOccupancyMaxPotentialBlockSize(&hint_num_blocks, &hint_num_threads, (void*) gpu_propagate, shared_mem_bytes));
  size_t total_global_mem = deviceProp.totalGlobalMem;
  size_t num_sm = deviceProp.multiProcessorCount;
  config.and_nodes = (config.and_nodes == 0) ? hint_num_threads : config.and_nodes;
  config.or_nodes = (config.or_nodes == 0) ? hint_num_blocks : config.or_nodes;
  // Reserve the worst-case stack space, then give most of the remaining global memory to malloc.
  size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * config.stack_kb * 1000;
  size_t remaining_global_mem = total_global_mem - total_stack_size;
  remaining_global_mem -= remaining_global_mem / 10; // Keep a 10% margin.
  CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb * 1000));
  CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem));
  /* ... */
  printf("%% and_nodes=%zu\n", config.and_nodes);
  printf("%% or_nodes=%zu\n", config.or_nodes);
  int num_blocks;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, (void*) gpu_propagate, config.and_nodes, shared_mem_bytes);
  printf("%% max_blocks_per_sm=%d\n", num_blocks);
  return shared_mem_bytes;
}