#ifndef TURBO_GPU_DIVE_AND_SOLVE_HPP
#define TURBO_GPU_DIVE_AND_SOLVE_HPP

// ... (elided includes)

namespace bt = ::battery;

#include <cuda/std/chrono>
#include <cuda/semaphore>
template <class U0, class U1, class U2, class ConcurrentAllocator>
struct StateTypes {
  // ...
  using concurrent_allocator = ConcurrentAllocator;
  // ... (elided: the abstract domain type aliases; these statistics allocators record
  // the bytes allocated by the grid-level state so memory usage can be measured)
    bt::statistics_allocator<ConcurrentAllocator>,
    bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 0>>,
    bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 1>>>;
  // ...
};
using Itv0 = Interval<ZLB<int, bt::local_memory>>;
using Itv1 = Interval<ZLB<int, bt::atomic_memory_block>>;
using Itv2 = Interval<ZLB<int, bt::atomic_memory_grid>>;
using AtomicBool = B<bt::atomic_memory_block>;
using FPEngine = BlockAsynchronousIterationGPU;

#ifdef NO_CONCURRENT_MANAGED_MEMORY
  using ItvSolverPinned = StateTypes<Itv0, Itv1, Itv2, bt::pinned_allocator>;
  using ItvSolverPinnedNoAtomics = StateTypes<Itv0, Itv0, Itv0, bt::pinned_allocator>;
#else
  using ItvSolver = StateTypes<Itv0, Itv1, Itv2, bt::managed_allocator>;
  using ItvSolverNoAtomics = StateTypes<Itv0, Itv0, Itv0, bt::managed_allocator>;
#endif
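// The solver state can be laid out in GPU memory in three ways, selected at runtime
// through `MemoryConfig`:
//   * GLOBAL: the variable store and the propagators live in global memory.
//   * STORE_SHARED: only the variable store is placed in the block's shared memory.
//   * STORE_PC_SHARED: both the store and the propagators are placed in shared memory.
// `configure_memory` below selects the kind that fits the device's shared memory capacity.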
enum class MemoryKind {
  GLOBAL,
  STORE_SHARED,
  STORE_PC_SHARED
};

struct MemoryConfig {
  MemoryKind mem_kind;
  size_t shared_bytes;
  size_t store_bytes;
  size_t pc_bytes;

  CUDA bt::pool_allocator make_global_pool(size_t bytes) {
    void* mem_pool = bt::global_allocator{}.allocate(bytes);
    return bt::pool_allocator(static_cast<unsigned char*>(mem_pool), bytes);
  }

  CUDA bt::pool_allocator make_shared_pool(unsigned char* shared_mem) {
    return bt::pool_allocator(shared_mem, shared_bytes);
  }

  CUDA bt::pool_allocator make_pc_pool(bt::pool_allocator shared_mem) {
    if(mem_kind == MemoryKind::STORE_PC_SHARED) {
      return shared_mem;
    }
    return make_global_pool(pc_bytes);
  }

  CUDA bt::pool_allocator make_store_pool(bt::pool_allocator shared_mem) {
    if(mem_kind == MemoryKind::STORE_PC_SHARED || mem_kind == MemoryKind::STORE_SHARED) {
      return shared_mem;
    }
    return make_global_pool(store_bytes);
  }

  CUDA void print_mzn_statistics() const {
    printf("%%%%%%mzn-stat: memory_configuration=\"%s\"\n",
      mem_kind == MemoryKind::GLOBAL ? "global" :
        (mem_kind == MemoryKind::STORE_SHARED ? "store_shared" : "store_pc_shared"));
    printf("%%%%%%mzn-stat: shared_mem=%" PRIu64 "\n", shared_bytes);
    printf("%%%%%%mzn-stat: store_mem=%" PRIu64 "\n", store_bytes);
    printf("%%%%%%mzn-stat: propagator_mem=%" PRIu64 "\n", pc_bytes);
  }
};
template <class S>
struct BlockData;

// Data shared by all blocks of the grid: the root problem, the EPS subproblem counter,
// the global best bound, the stop flags and the handshake flags used to hand solutions
// over to the CPU.
template <class S>
struct GridData {
  using GridCP = typename S::GridCP;
  using BlockCP = typename S::BlockCP;
  using U2 = typename S::U2;

  GridCP root;
  // A copy of `root` in global memory, from which each block clones its own local copy.
  bt::shared_ptr<BlockCP, bt::global_allocator> blocks_root;
  // Set to `true` by the CPU when solving must stop (timeout or CTRL-C).
  volatile bool cpu_stop;
  volatile bool blocks_reduced;
  MemoryConfig mem_config;
  bt::vector<BlockData<S>, bt::global_allocator> blocks;
  // Set to `true` on the GPU when a block decides that solving must stop (see `propagate`).
  bt::shared_ptr<B<bt::atomic_memory_grid>, bt::global_allocator> gpu_stop;
  bt::shared_ptr<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator> next_subproblem;
  bt::shared_ptr<U2, bt::global_allocator> best_bound;
  bt::shared_ptr<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator> print_lock;
  cuda::std::atomic_flag ready_to_produce;
  cuda::std::atomic_flag ready_to_consume;

  GridData(const GridCP& root, const MemoryConfig& mem_config)
    : root(root)
    , mem_config(mem_config)
    , cpu_stop(false)
    , blocks_reduced(false)
  {
    ready_to_consume.clear();
    ready_to_produce.test_and_set();
    cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
  }
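  // Solutions are handed over to the CPU through a simple producer/consumer handshake:
  // the device sets `ready_to_consume` after copying a solution into `root.bab`, and the
  // host sets `ready_to_produce` back once it has printed it. `print_lock` serializes the
  // blocks so that only one solution is in flight at a time.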
  template <class BlockBAB>
  __device__ void produce_solution(const BlockBAB& bab) {
    print_lock->acquire();
    ready_to_produce.wait(false, cuda::std::memory_order_seq_cst);
    ready_to_produce.clear();
    bab.extract(*(root.bab));
    cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
    ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
    ready_to_consume.notify_one();
    print_lock->release();
  }
  __host__ bool consume_solution() {
    ready_to_consume.wait(false, cuda::std::memory_order_seq_cst);
    ready_to_consume.clear();
    if(blocks_reduced) {
      root.print_final_solution();
      if(root.config.print_statistics) {
        root.print_mzn_statistics();
      }
      return true;
    }
    else {
      root.print_solution();
    }
    cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
    ready_to_produce.test_and_set(cuda::std::memory_order_seq_cst);
    ready_to_produce.notify_one();
    return false;
  }
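  // Device-side allocation of the grid data, performed by a single thread of a single block.
  // Note that the pools of the root copy (`blocks_root`) are built with `mem_kind` forced to
  // GLOBAL: the grid-wide copy always lives in global memory, regardless of the per-block
  // memory configuration.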
  __device__ void allocate() {
    assert(threadIdx.x == 0 && blockIdx.x == 0);
    auto root_mem_config(mem_config);
    root_mem_config.mem_kind = MemoryKind::GLOBAL;
    blocks_root = bt::make_shared<BlockCP, bt::global_allocator>(
      typename BlockCP::tag_gpu_block_copy{},
      /* ...elided arguments... */
      bt::global_allocator{},
      root_mem_config.make_pc_pool(bt::pool_allocator(nullptr, 0)),
      root_mem_config.make_store_pool(bt::pool_allocator(nullptr, 0)));
    blocks = bt::vector<BlockData<S>, bt::global_allocator>(root.config.or_nodes);
    gpu_stop = bt::make_shared<B<bt::atomic_memory_grid>, bt::global_allocator>(false);
    print_lock = bt::make_shared<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator>(1);
    next_subproblem = bt::make_shared<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator>(0);
    best_bound = bt::make_shared<U2, bt::global_allocator>();
  }
  __device__ void deallocate() {
    assert(threadIdx.x == 0 && blockIdx.x == 0);
    blocks = bt::vector<BlockData<S>, bt::global_allocator>();
    blocks_root->deallocate();
    // ...
    next_subproblem.reset();
    // ...
  }
};
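// Per-block data: each CUDA block owns a local copy of the problem (`root`), a fixpoint
// engine and two flags allocated in shared memory, plus a snapshot of the root search tree
// used to restart from a fresh state on every subproblem.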
template <class S>
struct BlockData {
  using GridCP = typename S::GridCP;
  using BlockCP = typename S::BlockCP;
  using snapshot_type = typename BlockCP::IST::template snapshot_type<bt::global_allocator>;

  size_t subproblem_idx;
  bt::shared_ptr<FPEngine, bt::pool_allocator> fp_engine;
  bt::shared_ptr<AtomicBool, bt::pool_allocator> has_changed;
  bt::shared_ptr<AtomicBool, bt::pool_allocator> stop;
  bt::shared_ptr<BlockCP, bt::global_allocator> root;
  bt::shared_ptr<snapshot_type, bt::global_allocator> snapshot_root;

  __device__ BlockData():
    has_changed(nullptr, bt::pool_allocator(nullptr, 0)),
    stop(nullptr, bt::pool_allocator(nullptr, 0))
  {}
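  // Allocates the block-local state. The fixpoint engine and the two flags are carved out
  // of the block's shared memory pool; the block's copy of the problem is cloned from
  // `grid_data.blocks_root`, with its store and propagators placed in shared or global
  // memory according to `mem_config`.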
  __device__ void allocate(GridData<S>& grid_data, unsigned char* shared_mem) {
    auto block = cooperative_groups::this_thread_block();
    if(threadIdx.x == 0) {
      subproblem_idx = blockIdx.x;
      MemoryConfig& mem_config = grid_data.mem_config;
      bt::pool_allocator shared_mem_pool(mem_config.make_shared_pool(shared_mem));
      fp_engine = bt::allocate_shared<FPEngine, bt::pool_allocator>(shared_mem_pool);
      has_changed = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, true);
      stop = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, false);
      root = bt::make_shared<BlockCP, bt::global_allocator>(typename BlockCP::tag_gpu_block_copy{},
        (mem_config.mem_kind != MemoryKind::STORE_PC_SHARED),
        *(grid_data.blocks_root),
        bt::global_allocator{},
        mem_config.make_pc_pool(shared_mem_pool),
        mem_config.make_store_pool(shared_mem_pool));
      snapshot_root = bt::make_shared<snapshot_type, bt::global_allocator>(root->search_tree->template snapshot<bt::global_allocator>());
    }
    block.sync();
  }
  __device__ void deallocate_shared() {
    if(threadIdx.x == 0) {
      // ...
      snapshot_root.reset();
    }
    cooperative_groups::this_thread_block().sync();
  }

  __device__ void restore() {
    if(threadIdx.x == 0) {
      root->search_tree->restore(*snapshot_root);
      root->eps_split->reset();
    }
    cooperative_groups::this_thread_block().sync();
  }
};
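// Kernel run on a single thread: allocates the grid data on the device and initializes the
// EPS bookkeeping. The total number of subproblems is 2^subproblems_power; `next_subproblem`
// starts at `or_nodes` because each of the `or_nodes` blocks initially works on the
// subproblem equal to its block index.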
template <class S>
__global__ void initialize_grid_data(GridData<S>* grid_data) {
  grid_data->allocate();
  size_t num_subproblems = 1;
  num_subproblems <<= grid_data->root.config.subproblems_power;
  grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(grid_data->root.config.or_nodes));
  grid_data->root.stats.eps_num_subproblems = num_subproblems;
}
template <class S>
__global__ void deallocate_grid_data(GridData<S>* grid_data) {
  grid_data->deallocate();
}
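// Updates the grid-wide best bound (`grid_data.best_bound`) with the local optimum found by
// a block. For maximization the dual of the upper bound is merged into the lower-bound
// component, and symmetrically for minimization. Returns `true` when the grid-wide bound is
// actually improved, i.e. the block found a strictly better solution.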
template <class S>
__device__ bool update_grid_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
  using U0 = typename S::U0;
  assert(threadIdx.x == 0);
  if(block_data.root->bab->is_optimization()) {
    const auto& bab = block_data.root->bab;
    auto local_best = bab->optimum().project(bab->objective_var());
    if(bab->is_maximization()) {
      return grid_data.best_bound->meet_lb(dual<typename U0::LB>(local_best.ub()));
    }
    else {
      return grid_data.best_bound->meet_ub(dual<typename U0::UB>(local_best.lb()));
    }
  }
  return false;
}
template <class S>
__device__ void update_block_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
  using U0 = typename S::U0;
  if(threadIdx.x == 0 && block_data.root->bab->is_optimization()) {
    const auto& bab = block_data.root->bab;
    VarEnv<bt::global_allocator> empty_env{};
    auto best_formula = bab->template deinterpret_best_bound<bt::global_allocator>(
      bab->is_maximization()
        ? U0(dual<typename U0::UB>(grid_data.best_bound->lb()))
        : U0(dual<typename U0::LB>(grid_data.best_bound->ub())));
    IDiagnostics diagnostics;
    interpret_and_tell(best_formula, empty_env, *block_data.root->store, diagnostics);
  }
}
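// Runs the fixpoint engine on the block's propagators, then (on thread 0) analyses the
// result: a failed node (`is_bot`) or an extractable (solved) node is a leaf of the search
// tree. When a solution is found, the grid-wide best bound is updated and the solution is
// handed over to the CPU if it must be printed. Returns whether the current node is a leaf.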
template <class S>
__device__ bool propagate(BlockData<S>& block_data, GridData<S>& grid_data) {
  using BlockCP = typename S::BlockCP;
  bool is_leaf_node = false;
  BlockCP& cp = *block_data.root;
  auto& fp_engine = *block_data.fp_engine;
#ifdef TURBO_PROFILE_MODE
  cuda::std::chrono::system_clock::time_point start;
  if(threadIdx.x == 0) {
    start = cuda::std::chrono::system_clock::now();
  }
#endif
  size_t iterations = fp_engine.fixpoint(*cp.ipc);
  if(threadIdx.x == 0) {
#ifdef TURBO_PROFILE_MODE
    auto end = cuda::std::chrono::system_clock::now();
    cuda::std::chrono::duration<double> diff = end - start;
    cp.stats.propagation_time += diff.count();
#endif
    cp.stats.fixpoint_iterations += iterations;
    bool is_pruned = cp.on_node();
    if(cp.ipc->is_bot()) {
      is_leaf_node = true;
      // ...
    }
    else if(cp.search_tree->template is_extractable<AtomicExtraction>()) {
      is_leaf_node = true;
      if(cp.bab->is_satisfaction() || cp.bab->compare_bound(*cp.store, cp.bab->optimum())) {
        bool best_has_changed = update_grid_best_bound(block_data, grid_data);
        if(cp.bab->is_satisfaction() || (best_has_changed && cp.is_printing_intermediate_sol())) {
          grid_data.produce_solution(*cp.bab);
        }
        is_pruned |= cp.update_solution_stats();
      }
    }
    if(is_pruned) {
      grid_data.gpu_stop->join(true);
    }
#ifdef TURBO_PROFILE_MODE
    auto end2 = cuda::std::chrono::system_clock::now();
    diff = end2 - end;
    cp.stats.search_time += diff.count();
#endif
  }
  // ...
  return is_leaf_node;
}
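// Dives from the root to the subproblem assigned to this block. The path is encoded in the
// bits of `subproblem_idx`, most significant bit first: at each depth the selected bit picks
// one of the two branches returned by `eps_split`. For example (illustrative numbers), with
// subproblems_power = 3, subproblem_idx = 5 = 0b101 follows branch 1, then 0, then 1. The
// dive stops early when a leaf is reached, and the number of unexplored levels is returned
// (0 means the target subproblem was reached).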
template <class S>
__device__ size_t dive(BlockData<S>& block_data, GridData<S>& grid_data) {
  using BlockCP = typename S::BlockCP;
  BlockCP& cp = *block_data.root;
  auto& fp_engine = *block_data.fp_engine;
  auto& stop = *block_data.stop;
  // `has_changed` is reused during the dive to signal that a leaf was reached.
  auto& stop_diving = *block_data.has_changed;
  stop_diving.meet(false);
  size_t remaining_depth = grid_data.root.config.subproblems_power;
  while(remaining_depth > 0 && !stop_diving && !stop) {
    remaining_depth--;
    bool is_leaf_node = propagate(block_data, grid_data);
    if(threadIdx.x == 0) {
      if(is_leaf_node) {
        stop_diving.join(true);
      }
      else {
        size_t branch_idx = (block_data.subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
        auto branches = cp.eps_split->split();
        assert(branches.size() == 2);
        cp.ipc->deduce(branches[branch_idx]);
      }
      stop.join(grid_data.cpu_stop || *(grid_data.gpu_stop));
    }
    // ...
  }
  return remaining_depth;
}
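// Fully solves the subproblem reached by `dive`: a classic propagate-and-search loop where
// thread 0 expands the search tree (`search_tree->deduce()`) and all threads run the fixpoint.
// Before each propagation the block imports the grid-wide best bound so that branches worse
// than the best known solution are pruned.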
template <class S>
__device__ void solve_problem(BlockData<S>& block_data, GridData<S>& grid_data) {
  using BlockCP = typename S::BlockCP;
  BlockCP& cp = *block_data.root;
  auto& fp_engine = *block_data.fp_engine;
  auto& block_has_changed = *block_data.has_changed;
  auto& stop = *block_data.stop;
  block_has_changed.join(true);
  // ...
  while(block_has_changed && !stop) {
    update_block_best_bound(block_data, grid_data);
    propagate(block_data, grid_data);
    if(threadIdx.x == 0) {
      stop.join(grid_data.cpu_stop || *(grid_data.gpu_stop));
      // ...
      block_has_changed.meet(cp.search_tree->deduce());
    }
    // ...
  }
}
// Merges the result of every block (statistics, best solution) into the grid root.
template <class S>
CUDA void reduce_blocks(GridData<S>* grid_data) {
  for(int i = 0; i < grid_data->blocks.size(); ++i) {
    if(grid_data->blocks[i].root) { // Skip blocks that never allocated their local copy.
      grid_data->root.meet(*(grid_data->blocks[i].root));
    }
  }
}
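// Main solving kernel, executed by every block. Each block repeatedly: restores its local
// copy from the root snapshot, dives to its assigned subproblem, solves it if the dive
// reached it, and then grabs the next unexplored subproblem index from `next_subproblem`.
// If a dive hits a leaf early, the whole range of subproblems sharing that prefix is skipped.
// The last block to finish (or a stopped block) reduces all block results and wakes up the
// CPU consumer thread one last time.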
template <class S>
__global__ void gpu_solve_kernel(GridData<S>* grid_data)
{
  if(threadIdx.x == 0 && blockIdx.x == 0 && grid_data->root.config.verbose_solving) {
    printf("%% GPU kernel started, starting solving...\n");
  }
  extern __shared__ unsigned char shared_mem[];
  size_t num_subproblems = grid_data->root.stats.eps_num_subproblems;
  BlockData<S>& block_data = grid_data->blocks[blockIdx.x];
  block_data.allocate(*grid_data, shared_mem);
  while(block_data.subproblem_idx < num_subproblems && !*(block_data.stop)) {
    if(threadIdx.x == 0 && grid_data->root.config.verbose_solving) {
      grid_data->print_lock->acquire();
      printf("%% Block %d solves subproblem num %" PRIu64 "\n", blockIdx.x, block_data.subproblem_idx);
      grid_data->print_lock->release();
    }
    block_data.restore();
    cooperative_groups::this_thread_block().sync();
    size_t remaining_depth = dive(block_data, *grid_data);
    if(remaining_depth == 0) {
      solve_problem(block_data, *grid_data);
      if(threadIdx.x == 0 && !*(block_data.stop)) {
        block_data.root->stats.eps_solved_subproblems += 1;
      }
    }
    else {
      // The dive hit a leaf: all subproblems sharing the current prefix can be skipped.
      if(threadIdx.x == 0 && !*(block_data.stop)) {
        size_t next_subproblem_idx = ((block_data.subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
        grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));
        // Only count the skipped subproblems when this block started at the first index of the skipped range.
        if((block_data.subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
          block_data.root->stats.eps_skipped_subproblems += next_subproblem_idx - block_data.subproblem_idx;
        }
      }
    }
    if(threadIdx.x == 0 && !*(block_data.stop)) {
      block_data.subproblem_idx = grid_data->next_subproblem->value();
      grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(block_data.subproblem_idx + size_t{1}));
    }
    cooperative_groups::this_thread_block().sync();
  }
  cooperative_groups::this_thread_block().sync();
  if(threadIdx.x == 0 && !*(block_data.stop)) {
    block_data.root->stats.num_blocks_done = 1;
  }
  if(threadIdx.x == 0) {
    grid_data->print_lock->acquire();
    if(!grid_data->blocks_reduced) {
      int n = 0;
      for(int i = 0; i < grid_data->blocks.size(); ++i) {
        if(grid_data->blocks[i].root) {
          n += grid_data->blocks[i].root->stats.num_blocks_done;
        }
      }
      if(block_data.stop->value() || n == grid_data->blocks.size()) {
        reduce_blocks(grid_data);
        grid_data->blocks_reduced = true;
        cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
        grid_data->ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
        grid_data->ready_to_consume.notify_one();
      }
    }
    grid_data->print_lock->release();
  }
  block_data.deallocate_shared();
}
template <class T>
__global__ void gpu_sizeof_kernel(size_t* size) {
  *size = sizeof(T);
}

// Measures the size of a type as seen by the device.
template <class T>
size_t gpu_sizeof() {
  auto s = bt::make_unique<size_t, bt::managed_allocator>();
  gpu_sizeof_kernel<T><<<1, 1>>>(s.get());
  CUDAEX(cudaDeviceSynchronize());
  return *s;
}
template <class S, class U>
size_t sizeof_store(const CP<U>& root) {
  return gpu_sizeof<typename S::BlockCP::IStore>()
    + gpu_sizeof<typename S::BlockCP::IStore::universe_type>() * root.store->vars();
}
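// Decides where the store and the propagators are placed, based on the shared memory
// capacity of the device (`sharedMemPerBlock`). Illustrative example (numbers made up):
// with 48KB of shared memory per block, a 10KB store and 60KB of propagators lead to
// STORE_SHARED (the store fits, the propagators do not); if the propagators only took 20KB,
// STORE_PC_SHARED would be chosen; a 50KB store would force GLOBAL.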
template <class S, class U>
MemoryConfig configure_memory(CP<U>& root) {
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0);
  const auto& config = root.config;
  size_t shared_mem_capacity = deviceProp.sharedMemPerBlock;

  // ... (elided: construction of `root2`, a copy of the problem whose allocators record
  // the bytes allocated for the store and the propagators)
  size_t store_alignment = 200;

  MemoryConfig mem_config;
  mem_config.shared_bytes = 100;
  mem_config.store_bytes = sizeof_store<S>(root2) + store_alignment;
  // ...
  mem_config.pc_bytes = root2.prop_allocator.total_bytes_allocated();
  mem_config.pc_bytes += mem_config.pc_bytes / 5; // Add 20% headroom.
  if(config.only_global_memory || shared_mem_capacity < mem_config.shared_bytes + mem_config.store_bytes) {
    if(!config.only_global_memory && config.verbose_solving) {
      printf("%% The store of variables (%zuKB) cannot be stored in the shared memory of the GPU (%zuKB), therefore we use the global memory.\n",
        mem_config.store_bytes / 1000,
        shared_mem_capacity / 1000);
    }
    mem_config.mem_kind = MemoryKind::GLOBAL;
  }
  else if(shared_mem_capacity > mem_config.shared_bytes + mem_config.store_bytes + mem_config.pc_bytes) {
    if(config.verbose_solving) {
      printf("%% The store of variables and the propagators (%zuKB) are stored in the shared memory of the GPU (%zuKB).\n",
        (mem_config.shared_bytes + mem_config.store_bytes + mem_config.pc_bytes) / 1000,
        shared_mem_capacity / 1000);
    }
    mem_config.shared_bytes += mem_config.store_bytes + mem_config.pc_bytes;
    mem_config.mem_kind = MemoryKind::STORE_PC_SHARED;
  }
  else {
    if(config.verbose_solving) {
      printf("%% The store of variables (%zuKB) is stored in the shared memory of the GPU (%zuKB).\n",
        mem_config.store_bytes / 1000,
        shared_mem_capacity / 1000);
    }
    mem_config.shared_bytes += mem_config.store_bytes;
    mem_config.mem_kind = MemoryKind::STORE_SHARED;
  }
  if(config.verbose_solving) {
    // ... (elided: print the selected memory configuration)
  }
  return mem_config;
}
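// Host-side wait loop: polls a CUDA event every 100ms until the kernel finishes, the timeout
// expires or the user interrupts the solver (CTRL-C). On interruption the `cpu_stop` flag is
// raised so the blocks terminate cooperatively. Returns `true` when solving was interrupted
// before the kernel completed.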
template <class S, class Timepoint>
bool wait_solving_ends(GridData<S>& grid_data, const Timepoint& start) {
  cudaEvent_t event;
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  cudaEventRecord(event);
  while(!must_quit() && check_timeout(grid_data.root, start) && cudaEventQuery(event) == cudaErrorNotReady) {
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  if(cudaEventQuery(event) == cudaErrorNotReady) {
    grid_data.cpu_stop = true;
    grid_data.root.prune();
    return true;
  }
  else {
    cudaError error = cudaDeviceSynchronize();
    if(error == cudaErrorIllegalAddress) {
      printf("%% ERROR: CUDA kernel failed due to an illegal memory access. This might be due to a stack overflow because it is too small. Try increasing the stack size with the options -stack. If it does not work, please report it as a bug.\n");
      // ...
    }
    // ...
    return false;
  }
}
template <class S>
void consume_kernel_solutions(GridData<S>& grid_data) {
  while(!grid_data.consume_solution()) {}
}
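// Moves the root problem into a `GridData` allocated with the solver's concurrent allocator
// (managed or pinned memory), initializes it on the device, then launches the solving kernel
// with `or_nodes` blocks, `and_nodes` threads per block and the requested amount of dynamic
// shared memory. A separate CPU thread consumes and prints the solutions produced by the
// kernel while it runs.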
template <class S, class U, class Timepoint>
void transfer_memory_and_run(CP<U>& root, MemoryConfig mem_config, const Timepoint& start) {
  using concurrent_allocator = typename S::concurrent_allocator;
  auto grid_data = bt::make_shared<GridData<S>, concurrent_allocator>(std::move(root), mem_config);
  initialize_grid_data<<<1,1>>>(grid_data.get());
  CUDAEX(cudaDeviceSynchronize());
  if(grid_data->root.config.print_statistics) {
    mem_config.print_mzn_statistics();
  }
  std::thread consumer_thread(consume_kernel_solutions<S>, std::ref(*grid_data));
  gpu_solve_kernel<S>
    <<<static_cast<unsigned int>(grid_data->root.config.or_nodes),
       static_cast<unsigned int>(grid_data->root.config.and_nodes),
       grid_data->mem_config.shared_bytes>>>
    (grid_data.get());
  bool interrupted = wait_solving_ends(*grid_data, start);
  consumer_thread.join();
  CUDAEX(cudaDeviceSynchronize());
  deallocate_grid_data<<<1,1>>>(grid_data.get());
  CUDAEX(cudaDeviceSynchronize());
  // ...
}
int threads_per_sm(cudaDeviceProp devProp) {
  switch (devProp.major) {
    case 2: return (devProp.minor == 1) ? 48 : 32;
    // ... (other compute capabilities elided)
    case 6: return (devProp.minor == 0) ? 64 : 128;
    // ...
    case 8: return (devProp.minor == 0) ? 64 : 128;
    // ... (default case elided)
  }
}
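// Chooses the number of blocks (`or_nodes`) and threads per block (`and_nodes`) when the user
// did not fix them, using cudaOccupancyMaxPotentialBlockSize as a hint, then sizes the device
// stack and heap. The heap estimate reserves (or_nodes + 1) times the memory of one block copy
// (propagators + store), the stack reserves `stack_kb` per resident thread, and about 10% of
// the global memory is kept free as a safety margin.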
template <class S, class U>
void configure_blocks_threads(CP<U>& root, const MemoryConfig& mem_config) {
  int hint_num_blocks;
  int hint_num_threads;
  CUDAE(cudaOccupancyMaxPotentialBlockSize(&hint_num_blocks, &hint_num_threads, (void*) gpu_solve_kernel<S>, (int)mem_config.shared_bytes));

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0);

  size_t total_global_mem = deviceProp.totalGlobalMem;
  size_t num_sm = deviceProp.multiProcessorCount;
  size_t num_threads_per_sm = threads_per_sm(deviceProp);

  auto& config = root.config;
  config.or_nodes = (config.or_nodes == 0) ? hint_num_blocks : config.or_nodes;
  config.and_nodes = (config.and_nodes == 0) ? hint_num_threads : config.and_nodes;

  if(config.and_nodes > deviceProp.maxThreadsPerBlock) {
    if(config.verbose_solving) {
      printf("%% WARNING: -and %zu too high for this GPU, we use the maximum %d instead.", config.and_nodes, deviceProp.maxThreadsPerBlock);
    }
    config.and_nodes = deviceProp.maxThreadsPerBlock;
  }
  // ...
  size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * config.stack_kb * 1000;
  size_t remaining_global_mem = total_global_mem - total_stack_size;
  remaining_global_mem -= remaining_global_mem / 10; // Keep 10% of the global memory free.
  // ...
  size_t heap_usage_estimation = (config.or_nodes + 1) * (mem_config.pc_bytes + mem_config.store_bytes + 100 * root.store->vars());
  while(heap_usage_estimation > remaining_global_mem) {
    // ... (elided: shrink the configuration until the estimated heap usage fits)
  }
  CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
  CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem));

  if(config.verbose_solving) {
    // ... (elided: print the memory statistics)
    printf("%% and_nodes=%zu\n", config.and_nodes);
    printf("%% or_nodes=%zu\n", config.or_nodes);
  }
}
// Entry point of the GPU configuration pipeline: decide the memory layout, then the grid and
// block dimensions, then move the problem to the GPU and run the kernel.
template <class S, class U, class Timepoint>
void configure_and_run(CP<U>& root, const Timepoint& start) {
  MemoryConfig mem_config = configure_memory<S>(root);
  configure_blocks_threads<S>(root, mem_config);
  transfer_memory_and_run<S>(root, mem_config, start);
}
void check_support_unified_memory() {
  int attr = 0;
  int dev = 0;
  CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrManagedMemory, dev));
  if(!attr) {
    std::cerr << "The GPU does not support unified memory." << std::endl;
    exit(EXIT_FAILURE);
  }
}
void check_support_concurrent_managed_memory() {
  int attr = 0;
  int dev = 0;
  CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrConcurrentManagedAccess, dev));
  if(!attr) {
#ifdef NO_CONCURRENT_MANAGED_MEMORY
    printf("%% WARNING: The GPU does not support concurrent access to managed memory, hence we fall back on pinned memory.\n");
    // ...
    unsigned int flags = 0;
    CUDAEX(cudaGetDeviceFlags(&flags));
    flags |= cudaDeviceMapHost;
    CUDAEX(cudaSetDeviceFlags(flags));
#else
    printf("%% To run Turbo on this GPU you need to build Turbo with the option NO_CONCURRENT_MANAGED_MEMORY.\n");
    exit(EXIT_FAILURE);
#endif
  }
}
// Entry point of the GPU solver.
void gpu_dive_and_solve(Configuration<bt::standard_allocator>& config) {
#ifndef __CUDACC__
  std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
#else
  check_support_unified_memory();
  check_support_concurrent_managed_memory();
  auto start = std::chrono::high_resolution_clock::now();
  // ... (elided: build the root `CP` problem from `config` and preprocess it)
#ifdef NO_CONCURRENT_MANAGED_MEMORY
  if(config.noatomics) {
    configure_and_run<ItvSolverPinnedNoAtomics>(root, start);
  }
  else {
    configure_and_run<ItvSolverPinned>(root, start);
  }
#else
  if(config.noatomics) {
    configure_and_run<ItvSolverNoAtomics>(root, start);
  }
  else {
    configure_and_run<ItvSolver>(root, start);
  }
#endif
#endif
}

#endif // TURBO_GPU_DIVE_AND_SOLVE_HPP