3#ifndef TURBO_GPU_SOLVING_HPP
4#define TURBO_GPU_SOLVING_HPP
10namespace bt = ::battery;
14#include <cuda/std/chrono>
15#include <cuda/semaphore>
21 class ConcurrentAllocator>
27 using concurrent_allocator = ConcurrentAllocator;
31 bt::statistics_allocator<ConcurrentAllocator>,
32 bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 0>>,
33 bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 1>>>;
42using Itv0 = Interval<ZLB<int, bt::local_memory>>;
43using Itv1 = Interval<ZLB<int, bt::atomic_memory_block>>;
44using Itv2 = Interval<ZLB<int, bt::atomic_memory_grid>>;
45using AtomicBool = B<bt::atomic_memory_block>;
46using FPEngine = BlockAsynchronousIterationGPU<bt::pool_allocator>;
49#ifdef NO_CONCURRENT_MANAGED_MEMORY
50 using ItvSolverPinned = StateTypes<Itv0, Itv1, Itv2, bt::pinned_allocator>;
51 using ItvSolverPinnedNoAtomics = StateTypes<Itv0, Itv0, Itv0, bt::pinned_allocator>;
53 using ItvSolver = StateTypes<Itv0, Itv1, Itv2, bt::managed_allocator>;
55 using ItvSolverNoAtomics = StateTypes<Itv0, Itv0, Itv0, bt::managed_allocator>;
63enum class MemoryKind {
78 CUDA bt::pool_allocator make_global_pool(
size_t bytes) {
79 void* mem_pool = bt::global_allocator{}.
allocate(bytes);
80 return bt::pool_allocator(
static_cast<unsigned char*
>(mem_pool), bytes);
83 CUDA bt::pool_allocator make_shared_pool(
unsigned char* shared_mem) {
84 return bt::pool_allocator(shared_mem, shared_bytes);
87 CUDA bt::pool_allocator make_pc_pool(bt::pool_allocator shared_mem) {
88 if(mem_kind == MemoryKind::STORE_PC_SHARED) {
92 return make_global_pool(pc_bytes);
96 CUDA bt::pool_allocator make_store_pool(bt::pool_allocator shared_mem) {
97 if(mem_kind == MemoryKind::STORE_PC_SHARED || mem_kind == MemoryKind::STORE_SHARED) {
101 return make_global_pool(store_bytes);
105 CUDA
void print_mzn_statistics()
const {
106 printf(
"%%%%%%mzn-stat: memory_configuration=\"%s\"\n",
107 mem_kind == MemoryKind::GLOBAL ?
"global" : (
108 mem_kind == MemoryKind::STORE_SHARED ?
"store_shared" :
"store_pc_shared"));
109 printf(
"%%%%%%mzn-stat: shared_mem=%" PRIu64
"\n", shared_bytes);
110 printf(
"%%%%%%mzn-stat: store_mem=%" PRIu64
"\n", store_bytes);
111 printf(
"%%%%%%mzn-stat: propagator_mem=%" PRIu64
"\n", pc_bytes);
122 using GridCP =
typename S::GridCP;
123 using BlockCP =
typename S::BlockCP;
124 using U2 =
typename S::U2;
129 bt::shared_ptr<BlockCP, bt::global_allocator> blocks_root;
131 volatile bool cpu_stop;
133 volatile bool blocks_reduced;
134 MemoryConfig mem_config;
135 bt::vector<BlockData<S>, bt::global_allocator> blocks;
137 bt::shared_ptr<B<bt::atomic_memory_grid>, bt::global_allocator> gpu_stop;
138 bt::shared_ptr<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator> next_subproblem;
139 bt::shared_ptr<U2, bt::global_allocator> best_bound;
143 bt::shared_ptr<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator> print_lock;
144 cuda::std::atomic_flag ready_to_produce;
145 cuda::std::atomic_flag ready_to_consume;
147 GridData(
const GridCP& root,
const MemoryConfig& mem_config)
149 , mem_config(mem_config)
151 , blocks_reduced(false)
153 ready_to_consume.clear();
154 ready_to_produce.test_and_set();
155 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
158 template <
class BlockBAB>
159 __device__
void produce_solution(
const BlockBAB& bab) {
160 print_lock->acquire();
162 ready_to_produce.wait(
false, cuda::std::memory_order_seq_cst);
163 ready_to_produce.clear();
164 bab.extract(*(root.bab));
165 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
166 ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
167 ready_to_consume.notify_one();
169 print_lock->release();
172 __host__
bool consume_solution() {
173 ready_to_consume.wait(
false, cuda::std::memory_order_seq_cst);
174 ready_to_consume.clear();
176 root.print_final_solution();
177 if(root.config.print_statistics) {
178 root.print_mzn_statistics();
183 root.print_solution();
185 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
186 ready_to_produce.test_and_set(cuda::std::memory_order_seq_cst);
187 ready_to_produce.notify_one();
191 __device__
void allocate() {
192 assert(threadIdx.x == 0 && blockIdx.x == 0);
193 auto root_mem_config(mem_config);
194 root_mem_config.mem_kind = MemoryKind::GLOBAL;
195 blocks_root = bt::make_shared<BlockCP, bt::global_allocator>(
196 typename BlockCP::tag_gpu_block_copy{},
199 bt::global_allocator{},
200 root_mem_config.make_pc_pool(bt::pool_allocator(
nullptr,0)),
201 root_mem_config.make_store_pool(bt::pool_allocator(
nullptr,0)));
202 blocks = bt::vector<BlockData<S>, bt::global_allocator>(root.config.or_nodes);
203 gpu_stop = bt::make_shared<B<bt::atomic_memory_grid>, bt::global_allocator>(
false);
204 print_lock = bt::make_shared<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator>(1);
205 next_subproblem = bt::make_shared<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator>(0);
206 best_bound = bt::make_shared<U2, bt::global_allocator>();
209 __device__
void deallocate() {
210 assert(threadIdx.x == 0 && blockIdx.x == 0);
211 blocks = bt::vector<BlockData<S>, bt::global_allocator>();
212 blocks_root->deallocate();
216 next_subproblem.reset();
224 using GridCP =
typename S::GridCP;
225 using BlockCP =
typename S::BlockCP;
227 using snapshot_type =
typename BlockCP::IST::snapshot_type<bt::global_allocator>;
228 size_t subproblem_idx;
229 bt::shared_ptr<FPEngine, bt::global_allocator> fp_engine;
230 bt::shared_ptr<AtomicBool, bt::pool_allocator> has_changed;
231 bt::shared_ptr<AtomicBool, bt::pool_allocator> stop;
232 bt::shared_ptr<BlockCP, bt::global_allocator> root;
233 bt::shared_ptr<snapshot_type, bt::global_allocator> snapshot_root;
235 __device__ BlockData():
236 has_changed(nullptr, bt::pool_allocator(nullptr, 0)),
237 stop(nullptr, bt::pool_allocator(nullptr, 0))
243 __device__
void allocate(GridData<S>& grid_data,
unsigned char* shared_mem) {
244 auto block = cooperative_groups::this_thread_block();
245 if(threadIdx.x == 0) {
246 subproblem_idx = blockIdx.x;
247 MemoryConfig& mem_config = grid_data.mem_config;
248 bt::pool_allocator shared_mem_pool(mem_config.make_shared_pool(shared_mem));
249 fp_engine = bt::make_shared<FPEngine, bt::global_allocator>(block, shared_mem_pool);
250 has_changed = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool,
true);
251 stop = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool,
false);
252 root = bt::make_shared<BlockCP, bt::global_allocator>(
typename BlockCP::tag_gpu_block_copy{},
253 (mem_config.mem_kind != MemoryKind::STORE_PC_SHARED),
254 *(grid_data.blocks_root),
255 bt::global_allocator{},
256 mem_config.make_pc_pool(shared_mem_pool),
257 mem_config.make_store_pool(shared_mem_pool));
258 snapshot_root = bt::make_shared<snapshot_type, bt::global_allocator>(root->search_tree->template snapshot<bt::global_allocator>());
263 __device__
void deallocate_shared() {
264 if(threadIdx.x == 0) {
269 snapshot_root.reset();
271 cooperative_groups::this_thread_block().sync();
274 __device__
void restore() {
275 if(threadIdx.x == 0) {
276 root->search_tree->restore(*snapshot_root);
277 root->eps_split->reset();
279 cooperative_groups::this_thread_block().sync();
284__global__
void initialize_grid_data(GridData<S>* grid_data) {
285 grid_data->allocate();
286 size_t num_subproblems = 1;
287 num_subproblems <<= grid_data->root.config.subproblems_power;
288 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(grid_data->root.config.or_nodes));
289 grid_data->root.stats.eps_num_subproblems = num_subproblems;
293__global__
void deallocate_grid_data(GridData<S>* grid_data) {
294 grid_data->deallocate();
302__device__
bool update_grid_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
303 using U0 =
typename S::U0;
304 assert(threadIdx.x == 0);
305 if(block_data.root->bab->is_optimization()) {
306 const auto& bab = block_data.root->bab;
307 auto local_best = bab->optimum().project(bab->objective_var());
309 if(bab->is_maximization()) {
310 return grid_data.best_bound->meet_lb(dual<typename U0::LB>(local_best.ub()));
313 return grid_data.best_bound->meet_ub(dual<typename U0::UB>(local_best.lb()));
324__device__
void update_block_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
325 using U0 =
typename S::U0;
326 if(threadIdx.x == 0 && block_data.root->bab->is_optimization()) {
327 const auto& bab = block_data.root->bab;
328 VarEnv<bt::global_allocator> empty_env{};
329 auto best_formula = bab->template deinterpret_best_bound<bt::global_allocator>(
330 bab->is_maximization()
331 ? U0(dual<typename U0::UB>(grid_data.best_bound->lb()))
332 : U0(dual<typename U0::LB>(grid_data.best_bound->ub())));
335 IDiagnostics diagnostics;
336 interpret_and_tell(best_formula, empty_env, *block_data.root->store, diagnostics);
344__device__
bool propagate(BlockData<S>& block_data, GridData<S>& grid_data) {
345 using BlockCP =
typename S::BlockCP;
346 bool is_leaf_node =
false;
347 BlockCP& cp = *block_data.root;
348 auto& fp_engine = *block_data.fp_engine;
349#ifdef TURBO_PROFILE_MODE
350 cuda::std::chrono::system_clock::time_point start;
351 if(threadIdx.x == 0) {
352 start = cuda::std::chrono::system_clock::now();
356 local::B thread_has_changed{
false};
357 size_t iterations = fp_engine.fixpoint(*cp.ipc, thread_has_changed, &grid_data.cpu_stop);
358 if(threadIdx.x == 0) {
359#ifdef TURBO_PROFILE_MODE
360 auto end = cuda::std::chrono::system_clock::now();
361 cuda::std::chrono::duration<double> diff = end - start;
362 cp.stats.propagation_time += diff.count();
364 cp.stats.fixpoint_iterations += iterations;
365 bool is_pruned = cp.on_node();
366 if(cp.ipc->is_bot()) {
370 else if(cp.search_tree->template is_extractable<AtomicExtraction>()) {
372 if(cp.bab->is_satisfaction() || cp.bab->compare_bound(*cp.store, cp.bab->optimum())) {
374 bool best_has_changed = update_grid_best_bound(block_data, grid_data);
375 if(cp.bab->is_satisfaction() || (best_has_changed && cp.is_printing_intermediate_sol())) {
376 grid_data.produce_solution(*cp.bab);
378 is_pruned |= cp.update_solution_stats();
382 grid_data.gpu_stop->join(
true);
384#ifdef TURBO_PROFILE_MODE
385 auto end2 = cuda::std::chrono::system_clock::now();
387 cp.stats.search_time += diff.count();
397__device__
size_t dive(BlockData<S>& block_data, GridData<S>& grid_data) {
398 using BlockCP =
typename S::BlockCP;
399 BlockCP& cp = *block_data.root;
400 auto& fp_engine = *block_data.fp_engine;
401 auto& stop = *block_data.stop;
403 auto& stop_diving = *block_data.has_changed;
405 stop_diving.meet(
false);
407 size_t remaining_depth = grid_data.root.config.subproblems_power;
408 while(remaining_depth > 0 && !stop_diving && !stop) {
410 bool is_leaf_node = propagate(block_data, grid_data);
411 if(threadIdx.x == 0) {
413 stop_diving.join(
true);
416 size_t branch_idx = (block_data.subproblem_idx & (
size_t{1} << remaining_depth)) >> remaining_depth;
417 auto branches = cp.eps_split->split();
418 assert(branches.size() == 2);
419 cp.ipc->deduce(branches[branch_idx]);
421 stop.join(grid_data.cpu_stop || *(grid_data.gpu_stop));
425 return remaining_depth;
429__device__
void solve_problem(BlockData<S>& block_data, GridData<S>& grid_data) {
430 using BlockCP =
typename S::BlockCP;
431 BlockCP& cp = *block_data.root;
432 auto& fp_engine = *block_data.fp_engine;
433 auto& block_has_changed = *block_data.has_changed;
434 auto& stop = *block_data.stop;
435 block_has_changed.join(
true);
440 while(block_has_changed && !stop) {
441 update_block_best_bound(block_data, grid_data);
442 propagate(block_data, grid_data);
443 if(threadIdx.x == 0) {
444 stop.join(grid_data.cpu_stop || *(grid_data.gpu_stop));
446 block_has_changed.meet(cp.search_tree->deduce());
453CUDA
void reduce_blocks(GridData<S>* grid_data) {
454 for(
int i = 0; i < grid_data->blocks.size(); ++i) {
455 if(grid_data->blocks[i].root) {
456 grid_data->root.meet(*(grid_data->blocks[i].root));
462__global__
void gpu_solve_kernel(GridData<S>* grid_data)
464 if(threadIdx.x == 0 && blockIdx.x == 0 && grid_data->root.config.verbose_solving) {
465 printf(
"%% GPU kernel started, starting solving...\n");
467 extern __shared__
unsigned char shared_mem[];
468 size_t num_subproblems = grid_data->root.stats.eps_num_subproblems;
469 BlockData<S>& block_data = grid_data->blocks[blockIdx.x];
470 block_data.allocate(*grid_data, shared_mem);
471 while(block_data.subproblem_idx < num_subproblems && !*(block_data.stop)) {
472 if(threadIdx.x == 0 && grid_data->root.config.verbose_solving) {
473 grid_data->print_lock->acquire();
474 printf(
"%% Block %d solves subproblem num %" PRIu64
"\n", blockIdx.x, block_data.subproblem_idx);
475 grid_data->print_lock->release();
477 block_data.restore();
478 cooperative_groups::this_thread_block().sync();
479 size_t remaining_depth = dive(block_data, *grid_data);
480 if(remaining_depth == 0) {
481 solve_problem(block_data, *grid_data);
482 if(threadIdx.x == 0 && !*(block_data.stop)) {
483 block_data.root->stats.eps_solved_subproblems += 1;
487 if(threadIdx.x == 0 && !*(block_data.stop)) {
488 size_t next_subproblem_idx = ((block_data.subproblem_idx >> remaining_depth) +
size_t{1}) << remaining_depth;
489 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));
491 if((block_data.subproblem_idx & ((
size_t{1} << remaining_depth) -
size_t{1})) ==
size_t{0}) {
492 block_data.root->stats.eps_skipped_subproblems += next_subproblem_idx - block_data.subproblem_idx;
497 if(threadIdx.x == 0 && !*(block_data.stop)) {
498 block_data.subproblem_idx = grid_data->next_subproblem->value();
499 grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(block_data.subproblem_idx +
size_t{1}));
501 cooperative_groups::this_thread_block().sync();
503 cooperative_groups::this_thread_block().sync();
504 if(threadIdx.x == 0 && !*(block_data.stop)) {
505 block_data.root->stats.num_blocks_done = 1;
507 if(threadIdx.x == 0) {
508 grid_data->print_lock->acquire();
509 if(!grid_data->blocks_reduced) {
511 for(
int i = 0; i < grid_data->blocks.size(); ++i) {
512 if(grid_data->blocks[i].root) {
513 n += grid_data->blocks[i].root->stats.num_blocks_done;
516 if(block_data.stop->value() || n == grid_data->blocks.size()) {
517 reduce_blocks(grid_data);
518 grid_data->blocks_reduced =
true;
519 cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
520 grid_data->ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
521 grid_data->ready_to_consume.notify_one();
524 grid_data->print_lock->release();
527 block_data.deallocate_shared();
530template <
class T> __global__
void gpu_sizeof_kernel(
size_t* size) { *size =
sizeof(T); }
533 auto s = bt::make_unique<size_t, bt::managed_allocator>();
534 gpu_sizeof_kernel<T><<<1, 1>>>(s.get());
535 CUDAEX(cudaDeviceSynchronize());
539template <
class S,
class U>
540size_t sizeof_store(
const CP<U>& root) {
541 return gpu_sizeof<typename S::BlockCP::IStore>()
542 + gpu_sizeof<typename S::BlockCP::IStore::universe_type>() * root.
store->vars();
545void print_memory_statistics(
const char* key,
size_t bytes) {
546 printf(
"%% %s=%zu [", key, bytes);
547 if(bytes < 1000 * 1000) {
548 printf(
"%.2fKB",
static_cast<double>(bytes) / 1000);
550 else if(bytes < 1000 * 1000 * 1000) {
551 printf(
"%.2fMB",
static_cast<double>(bytes) / (1000 * 1000));
554 printf(
"%.2fGB",
static_cast<double>(bytes) / (1000 * 1000 * 1000));
560template <
class S,
class U>
561MemoryConfig configure_memory(
CP<U>& root) {
562 cudaDeviceProp deviceProp;
563 cudaGetDeviceProperties(&deviceProp, 0);
564 const auto& config = root.
config;
565 size_t shared_mem_capacity = deviceProp.sharedMemPerBlock;
570 size_t store_alignment = 200;
572 MemoryConfig mem_config;
574 mem_config.shared_bytes = 100;
575 mem_config.store_bytes = sizeof_store<S>(root2) + store_alignment;
578 mem_config.pc_bytes = root2.prop_allocator.total_bytes_allocated();
579 mem_config.pc_bytes += mem_config.pc_bytes / 5;
580 if(config.only_global_memory || shared_mem_capacity < mem_config.shared_bytes + mem_config.store_bytes) {
581 if(!config.only_global_memory && config.verbose_solving) {
582 printf(
"%% The store of variables (%zuKB) cannot be stored in the shared memory of the GPU (%zuKB), therefore we use the global memory.\n",
583 mem_config.store_bytes / 1000,
584 shared_mem_capacity / 1000);
586 mem_config.mem_kind = MemoryKind::GLOBAL;
588 else if(shared_mem_capacity > mem_config.shared_bytes + mem_config.store_bytes + mem_config.pc_bytes) {
589 if(config.verbose_solving) {
590 printf(
"%% The store of variables and the propagators (%zuKB) are stored in the shared memory of the GPU (%zuKB).\n",
591 (mem_config.shared_bytes + mem_config.store_bytes + mem_config.pc_bytes) / 1000,
592 shared_mem_capacity / 1000);
594 mem_config.shared_bytes += mem_config.store_bytes + mem_config.pc_bytes;
595 mem_config.mem_kind = MemoryKind::STORE_PC_SHARED;
598 if(config.verbose_solving) {
599 printf(
"%% The store of variables (%zuKB) is stored in the shared memory of the GPU (%zuKB).\n",
600 mem_config.store_bytes / 1000,
601 shared_mem_capacity / 1000);
603 mem_config.shared_bytes += mem_config.store_bytes;
604 mem_config.mem_kind = MemoryKind::STORE_SHARED;
606 if(config.verbose_solving) {
607 print_memory_statistics(
"store_memory_real", root2.store_allocator.total_bytes_allocated());
608 print_memory_statistics(
"pc_memory_real", root2.prop_allocator.total_bytes_allocated());
609 print_memory_statistics(
"other_memory_real", root2.basic_allocator.total_bytes_allocated());
615template<
class S,
class Timepo
int>
616bool wait_solving_ends(GridData<S>& grid_data,
const Timepoint& start) {
618 cudaEventCreateWithFlags(&event,cudaEventDisableTiming);
619 cudaEventRecord(event);
620 while(!
must_quit() &&
check_timeout(grid_data.root, start) && cudaEventQuery(event) == cudaErrorNotReady) {
621 std::this_thread::sleep_for(std::chrono::milliseconds(100));
623 if(cudaEventQuery(event) == cudaErrorNotReady) {
624 grid_data.cpu_stop =
true;
625 grid_data.root.prune();
629 cudaError error = cudaDeviceSynchronize();
630 if(error == cudaErrorIllegalAddress) {
631 printf(
"%% ERROR: CUDA kernel failed due to an illegal memory access. This might be due to a stack overflow because it is too small. Try increasing the stack size with the options -stack. If it does not work, please report it as a bug.\n");
640void consume_kernel_solutions(GridData<S>& grid_data) {
641 while(!grid_data.consume_solution()) {}
644template <
class S,
class U,
class Timepo
int>
645void transfer_memory_and_run(
CP<U>& root, MemoryConfig mem_config,
const Timepoint& start) {
646 using concurrent_allocator =
typename S::concurrent_allocator;
647 auto grid_data = bt::make_shared<GridData<S>, concurrent_allocator>(std::move(root), mem_config);
648 initialize_grid_data<<<1,1>>>(grid_data.get());
649 CUDAEX(cudaDeviceSynchronize());
650 if(grid_data->root.config.print_statistics) {
651 mem_config.print_mzn_statistics();
653 std::thread consumer_thread(consume_kernel_solutions<S>, std::ref(*grid_data));
655 <<<
static_cast<unsigned int>(grid_data->root.config.or_nodes),
656 static_cast<unsigned int>(grid_data->root.config.and_nodes),
657 grid_data->mem_config.shared_bytes>>>
659 bool interrupted = wait_solving_ends(*grid_data, start);
660 consumer_thread.join();
661 CUDAEX(cudaDeviceSynchronize());
662 deallocate_grid_data<<<1,1>>>(grid_data.get());
663 CUDAEX(cudaDeviceSynchronize());
667int threads_per_sm(cudaDeviceProp devProp) {
668 switch (devProp.major){
669 case 2:
return (devProp.minor == 1) ? 48 : 32;
672 case 6:
return (devProp.minor == 0) ? 64 : 128;
674 case 8:
return (devProp.minor == 0) ? 64 : 128;
680template <
class S,
class U>
681void configure_blocks_threads(
CP<U>& root,
const MemoryConfig& mem_config) {
683 int hint_num_threads;
684 CUDAE(cudaOccupancyMaxPotentialBlockSize(&hint_num_blocks, &hint_num_threads, (
void*) gpu_solve_kernel<S>, (
int)mem_config.shared_bytes));
686 cudaDeviceProp deviceProp;
687 cudaGetDeviceProperties(&deviceProp, 0);
689 size_t total_global_mem = deviceProp.totalGlobalMem;
690 size_t num_sm = deviceProp.multiProcessorCount;
691 size_t num_threads_per_sm = threads_per_sm(deviceProp);
693 auto& config = root.
config;
694 config.
or_nodes = (config.or_nodes == 0) ? hint_num_blocks : config.or_nodes;
695 config.and_nodes = (config.and_nodes == 0) ? hint_num_threads : config.and_nodes;
697 if(config.and_nodes > deviceProp.maxThreadsPerBlock) {
698 if(config.verbose_solving) {
699 printf(
"%% WARNING: -and %zu too high for this GPU, we use the maximum %d instead.", config.and_nodes, deviceProp.maxThreadsPerBlock);
701 config.and_nodes = deviceProp.maxThreadsPerBlock;
705 size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * config.stack_kb * 1000;
706 size_t remaining_global_mem = total_global_mem - total_stack_size;
707 remaining_global_mem -= remaining_global_mem / 10;
711 size_t heap_usage_estimation = (config.or_nodes + 1) * (mem_config.pc_bytes + mem_config.store_bytes + 100 * root.
store->vars());
712 while(heap_usage_estimation > remaining_global_mem) {
716 CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
717 CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem));
719 if(config.verbose_solving) {
720 print_memory_statistics(
"stack_memory", total_stack_size);
721 print_memory_statistics(
"heap_memory", remaining_global_mem);
722 print_memory_statistics(
"heap_usage_estimation", heap_usage_estimation);
723 printf(
"%% and_nodes=%zu\n", config.and_nodes);
724 printf(
"%% or_nodes=%zu\n", config.or_nodes);
728template <
class S,
class U,
class Timepo
int>
729void configure_and_run(
CP<U>& root,
const Timepoint& start) {
730 MemoryConfig mem_config = configure_memory<S>(root);
731 configure_blocks_threads<S>(root, mem_config);
732 transfer_memory_and_run<S>(root, mem_config, start);
735void check_support_unified_memory() {
738 CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrManagedMemory, dev));
740 std::cerr <<
"The GPU does not support unified memory." << std::endl;
745void check_support_concurrent_managed_memory() {
748 CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrConcurrentManagedAccess, dev));
750#ifdef NO_CONCURRENT_MANAGED_MEMORY
751 printf(
"%% WARNING: The GPU does not support concurrent access to managed memory, hence we fall back on pinned memory.\n");
759 unsigned int flags = 0;
760 CUDAEX(cudaGetDeviceFlags(&flags));
761 flags |= cudaDeviceMapHost;
762 CUDAEX(cudaSetDeviceFlags(flags));
764 printf(
"%% To run Turbo on this GPU you need to build Turbo with the option NO_CONCURRENT_MANAGED_MEMORY.\n");
774 std::cerr <<
"You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;
776 check_support_unified_memory();
777 check_support_concurrent_managed_memory();
778 auto start = std::chrono::high_resolution_clock::now();
782#ifdef NO_CONCURRENT_MANAGED_MEMORY
784 configure_and_run<ItvSolverPinnedNoAtomics>(root, start);
787 configure_and_run<ItvSolverPinned>(root, start);
791 configure_and_run<ItvSolverNoAtomics>(root, start);
794 configure_and_run<ItvSolver>(root, start);
bool must_quit()
Definition common_solving.hpp:89
void block_signal_ctrlc()
Definition common_solving.hpp:77
bool check_timeout(A &a, const Timepoint &start)
Definition common_solving.hpp:101
void gpu_solve(Configuration< bt::standard_allocator > &config)
Definition gpu_solving.hpp:772
Definition common_solving.hpp:154
Configuration< BasicAllocator > config
Definition common_solving.hpp:277
abstract_ptr< IStore > store
Definition common_solving.hpp:262
void preprocess()
Definition common_solving.hpp:451
size_t or_nodes
Definition config.hpp:40
bool noatomics
Definition config.hpp:38
Definition common_solving.hpp:119
CUDA void * allocate(size_t bytes)
Definition common_solving.hpp:125