#ifndef TURBO_GPU_DIVE_AND_SOLVE_HPP
#define TURBO_GPU_DIVE_AND_SOLVE_HPP

namespace bt = ::battery;

#include <cuda/std/chrono>
#include <cuda/semaphore>
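// The solver is parameterized by a StateTypes bundle: the abstract universes used at
// the different levels of the solver and a ConcurrentAllocator through which the CPU
// and the GPU exchange data (wrapped in statistics_allocator to track memory usage).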
  class ConcurrentAllocator>

  using concurrent_allocator = ConcurrentAllocator;

    bt::statistics_allocator<ConcurrentAllocator>,
    bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 0>>,
    bt::statistics_allocator<UniqueLightAlloc<ConcurrentAllocator, 1>>>;
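// Interval domains over integer bounds at three levels of memory consistency
// (thread-local, block-wide atomics, grid-wide atomics), the atomic Boolean used for
// block-local flags, and the fixpoint engine run by each block.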
 
using Itv0 = Interval<ZLB<bound_value_type, bt::local_memory>>;
using Itv1 = Interval<ZLB<bound_value_type, bt::atomic_memory_block>>;
using Itv2 = Interval<ZLB<bound_value_type, bt::atomic_memory_grid>>;
using AtomicBool = B<bt::atomic_memory_block>;
using FPEngine = FixpointSubsetGPU<BlockAsynchronousFixpointGPU<true>, bt::global_allocator, CUDA_THREADS_PER_BLOCK>;

#ifdef NO_CONCURRENT_MANAGED_MEMORY
  using ItvSolverPinned = StateTypes<Itv0, Itv1, Itv2, bt::pinned_allocator>;

  using ItvSolver = StateTypes<Itv0, Itv0, Itv0, bt::managed_allocator>;
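// State shared by all blocks of the grid: the root problem, a copy of it in global
// memory serving as a template for each block (blocks_root), CPU- and GPU-initiated
// stop flags, the index of the next subproblem to solve, the best objective bound
// found so far, and the flags synchronizing the GPU producer of solutions with the
// CPU thread that prints them.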
 
  using GridCP = typename S::GridCP;
  using BlockCP = typename S::BlockCP;
  using U2 = typename S::U2;

  bt::shared_ptr<BlockCP, bt::global_allocator> blocks_root;
  cuda::std::atomic_flag cpu_stop;
  volatile bool blocks_reduced;
  MemoryConfig mem_config;
  bt::vector<BlockData<S>, bt::global_allocator> blocks;
  bt::shared_ptr<B<bt::atomic_memory_grid>, bt::global_allocator> gpu_stop;
  bt::shared_ptr<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator> next_subproblem;
  bt::shared_ptr<U2, bt::global_allocator> best_bound;
  bt::shared_ptr<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator> print_lock;
  cuda::std::atomic_flag ready_to_produce;
  cuda::std::atomic_flag ready_to_consume;

  GridData(const GridCP& root, const MemoryConfig& mem_config)

    , mem_config(mem_config)
    , blocks_reduced(false)

    ready_to_consume.clear();
    ready_to_produce.clear();
    cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
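  // Device side of the solution channel: under print_lock, the block copies its best
  // solution into the root BAB element, signals the CPU through ready_to_consume, and
  // waits on ready_to_produce until the CPU has finished printing it.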
 
  template <class BlockBAB>
  __device__ void produce_solution(const BlockBAB& bab) {
    print_lock->acquire();
    if(!cpu_stop.test()) {
      bab.extract(*(root.bab));
      cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
      ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
      ready_to_consume.notify_one();

      ready_to_produce.wait(false, cuda::std::memory_order_seq_cst);
      ready_to_produce.clear();

    print_lock->release();
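  // CPU side of the solution channel: waits for the GPU to produce a solution, prints
  // it (the final solution and statistics once all blocks have been reduced), then
  // releases the producer; the returned Boolean presumably indicates whether solving
  // is over, since the consumer loops until it becomes true.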
 
  __host__ bool consume_solution() {
    ready_to_consume.wait(false, cuda::std::memory_order_seq_cst);
    ready_to_consume.clear();

      root.print_final_solution();
      if(root.config.print_statistics) {
        root.print_mzn_statistics();

      root.print_solution();

    ready_to_produce.test_and_set(cuda::std::memory_order_seq_cst);
    ready_to_produce.notify_one();
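  // Grid-wide allocation, executed by a single thread of block 0: copies the root
  // into global memory (blocks_root) and allocates the per-block array and the
  // grid-level synchronization objects with the global allocator.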
 
  __device__ void allocate() {
    assert(threadIdx.x == 0 && blockIdx.x == 0);
    auto root_mem_config(mem_config);
    root_mem_config.mem_kind = MemoryKind::GLOBAL;
    blocks_root = bt::make_shared<BlockCP, bt::global_allocator>(
      typename BlockCP::tag_gpu_block_copy{},

      bt::global_allocator{},
      root_mem_config.make_prop_pool(bt::pool_allocator(nullptr, 0)),
      root_mem_config.make_store_pool(bt::pool_allocator(nullptr, 0)));
    blocks = bt::vector<BlockData<S>, bt::global_allocator>(root.stats.num_blocks);
    gpu_stop = bt::make_shared<B<bt::atomic_memory_grid>, bt::global_allocator>(false);
    print_lock = bt::make_shared<cuda::binary_semaphore<cuda::thread_scope_device>, bt::global_allocator>(1);
    next_subproblem = bt::make_shared<ZLB<size_t, bt::atomic_memory_grid>, bt::global_allocator>(0);
    best_bound = bt::make_shared<U2, bt::global_allocator>();
 
  __device__ void deallocate() {
    assert(threadIdx.x == 0 && blockIdx.x == 0);
    blocks = bt::vector<BlockData<S>, bt::global_allocator>();
    blocks_root->deallocate();

    next_subproblem.reset();
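// Per-block state: the index of the subproblem currently being explored, the fixpoint
// engine and the has_changed/stop flags placed in shared memory, a block-local copy of
// the root problem, and a snapshot of its search tree used to restart from the root
// between subproblems.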
 
  using GridCP = typename S::GridCP;
  using BlockCP = typename S::BlockCP;

  using snapshot_type = typename BlockCP::IST::snapshot_type<bt::global_allocator>;
  size_t subproblem_idx;
  bt::shared_ptr<FPEngine, bt::pool_allocator> fp_engine;
  bt::shared_ptr<AtomicBool, bt::pool_allocator> has_changed;
  bt::shared_ptr<AtomicBool, bt::pool_allocator> stop;
  bt::shared_ptr<BlockCP, bt::global_allocator> root;
  bt::shared_ptr<snapshot_type, bt::global_allocator> snapshot_root;

  __device__ BlockData():
    has_changed(nullptr, bt::pool_allocator(nullptr, 0)),
    stop(nullptr, bt::pool_allocator(nullptr, 0))
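  // Thread 0 carves a pool out of the block's shared memory, places the fixpoint
  // engine and the two flags in it, copies the grid root into block-local memory
  // according to mem_config, and snapshots the search tree; the fixpoint engine is
  // then initialized over the block's propagators.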
 
  __device__ void allocate(GridData<S>& grid_data, unsigned char* shared_mem) {
    auto block = cooperative_groups::this_thread_block();
    if(threadIdx.x == 0) {
      subproblem_idx = blockIdx.x;
      MemoryConfig& mem_config = grid_data.mem_config;
      bt::pool_allocator shared_mem_pool(mem_config.make_shared_pool(shared_mem));
      fp_engine = bt::allocate_shared<FPEngine, bt::pool_allocator>(shared_mem_pool);
      has_changed = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, true);
      stop = bt::allocate_shared<AtomicBool, bt::pool_allocator>(shared_mem_pool, false);
      root = bt::make_shared<BlockCP, bt::global_allocator>(typename BlockCP::tag_gpu_block_copy{},
        (mem_config.mem_kind != MemoryKind::TCN_SHARED),
        *(grid_data.blocks_root),
        bt::global_allocator{},
        mem_config.make_prop_pool(shared_mem_pool),
        mem_config.make_store_pool(shared_mem_pool));
      snapshot_root = bt::make_shared<snapshot_type, bt::global_allocator>(root->search_tree->template snapshot<bt::global_allocator>());

    fp_engine->init(root->iprop->num_deductions());
 
  __device__ void deallocate_shared() {
    if(threadIdx.x == 0) {

      snapshot_root.reset();

  __device__ void restore() {
    if(threadIdx.x == 0) {
      root->search_tree->restore(*snapshot_root);
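// Kernel launched on a single thread: allocates the grid data on the device, sets the
// number of EPS subproblems to 2^subproblems_power, and points next_subproblem past
// the subproblems initially taken by the blocks (block i starts on subproblem i).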
 
__global__ void initialize_grid_data(GridData<S>* grid_data) {
  grid_data->allocate();
  size_t num_subproblems = 1;
  num_subproblems <<= grid_data->root.config.subproblems_power;
  grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(grid_data->root.stats.num_blocks));
  grid_data->root.stats.eps_num_subproblems = num_subproblems;

__global__ void deallocate_grid_data(GridData<S>* grid_data) {
  grid_data->deallocate();
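// Thread 0 merges the block's current objective bound into the grid-wide best bound
// (as a dual bound in the opposite direction) and returns whether the grid-wide bound
// improved.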
 
__device__ bool update_grid_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
  using U0 = typename S::U0;
  assert(threadIdx.x == 0);
  if(block_data.root->bab->is_optimization()) {
    const auto& bab = block_data.root->bab;
    auto local_best = bab->optimum().project(bab->objective_var());

    if(bab->is_maximization()) {
      return grid_data.best_bound->meet_lb(dual_bound<typename U0::LB>(local_best.ub()));

      return grid_data.best_bound->meet_ub(dual_bound<typename U0::UB>(local_best.lb()));
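// The reverse direction: thread 0 reads the grid-wide best bound, deinterprets it into
// a formula over the objective variable, and tells it to the block's store so the
// block can prune nodes that cannot improve on the global best.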
 
__device__ void update_block_best_bound(BlockData<S>& block_data, GridData<S>& grid_data) {
  using U0 = typename S::U0;
  if(threadIdx.x == 0 && block_data.root->bab->is_optimization()) {
    const auto& bab = block_data.root->bab;
    VarEnv<bt::global_allocator> empty_env{};
    auto best_formula = bab->template deinterpret_best_bound<bt::global_allocator>(
      bab->is_maximization()
      ? U0(dual_bound<typename U0::UB>(grid_data.best_bound->lb()))
      : U0(dual_bound<typename U0::LB>(grid_data.best_bound->ub())));

    IDiagnostics diagnostics;
    interpret_and_tell(best_formula, empty_env, *block_data.root->store, diagnostics);
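// Block-wide propagation: runs the fixpoint engine selected by cp.config.fixpoint
// (a plain fixpoint loop, or a warp-based variant while more than config.wac1_threshold
// propagators are active), accumulates iteration and deduction statistics, checks
// whether the node is a leaf (store extractable once no propagator remains active),
// and produces improving solutions for the CPU. Returns whether the node is a leaf.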
 
__device__ bool propagate(BlockData<S>& block_data, GridData<S>& grid_data) {
  using BlockCP = typename S::BlockCP;
  __shared__ int warp_iterations[CUDA_THREADS_PER_BLOCK/32];
  warp_iterations[threadIdx.x / 32] = 0;
  bool is_leaf_node = false;
  BlockCP& cp = *block_data.root;
  auto group = cooperative_groups::this_thread_block();
  auto& fp_engine = *block_data.fp_engine;
  auto& iprop = *cp.iprop;
  auto start = cp.stats.start_timer_device();

  switch(cp.config.fixpoint) {

      fp_iterations = fp_engine.fixpoint(
        [&](int i){ return iprop.deduce(i); },
        [&](){ return iprop.is_bot(); });
      if(threadIdx.x == 0) {
        cp.stats.num_deductions += fp_iterations * fp_engine.num_active();

      if(fp_engine.num_active() <= cp.config.wac1_threshold) {
        fp_iterations = fp_engine.fixpoint(
          [&](int i){ return iprop.deduce(i); },
          [&](){ return iprop.is_bot(); });
        if(threadIdx.x == 0) {
          cp.stats.num_deductions += fp_iterations * fp_engine.num_active();

        fp_iterations = fp_engine.fixpoint(
          [&](int i){ return warp_fixpoint<CUDA_THREADS_PER_BLOCK>(iprop, i, warp_iterations); },
          [&](){ return iprop.is_bot(); });
        if(threadIdx.x == 0) {
          for(int i = 0; i < CUDA_THREADS_PER_BLOCK/32; ++i) {
            cp.stats.num_deductions += warp_iterations[i] * 32;

  if(!iprop.is_bot()) {
    fp_engine.select([&](int i) { return !iprop.ask(i); });

    if(fp_engine.num_active() == 0) {
      is_leaf_node = cp.store->template is_extractable<AtomicExtraction>(group);

  if(threadIdx.x == 0) {
    cp.stats.fixpoint_iterations += fp_iterations;
    bool is_pruned = cp.on_node();

    else if(is_leaf_node) {
      if(cp.bab->is_satisfaction() || cp.bab->compare_bound(*cp.store, cp.bab->optimum())) {

        bool best_has_changed = update_grid_best_bound(block_data, grid_data);
        if(cp.bab->is_satisfaction() || (best_has_changed && cp.is_printing_intermediate_sol())) {
          grid_data.produce_solution(*cp.bab);

        is_pruned |= cp.update_solution_stats();

      grid_data.gpu_stop->join(true);

    fp_engine.reset(cp.iprop->num_deductions());
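// Dive phase of embarrassingly parallel search: starting from the root, the block
// repeatedly propagates and commits to the branch selected by the corresponding bit
// of its subproblem index, descending subproblems_power levels unless a leaf is
// reached first. Returns the number of levels left when the dive stops.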
 
__device__ size_t dive(BlockData<S>& block_data, GridData<S>& grid_data) {
  using BlockCP = typename S::BlockCP;
  BlockCP& cp = *block_data.root;
  auto& fp_engine = *block_data.fp_engine;
  auto& stop = *block_data.stop;

  auto& stop_diving = *block_data.has_changed;

  stop_diving.meet(false);

  size_t remaining_depth = grid_data.root.config.subproblems_power;
  while(remaining_depth > 0 && !stop_diving && !stop) {
    bool is_leaf_node = propagate(block_data, grid_data);
    stop.join(grid_data.cpu_stop.test() || *(grid_data.gpu_stop));

      if(threadIdx.x == 0) {
        stop_diving.join(true);

      if(threadIdx.x == 0) {
        auto start = cp.stats.start_timer_device();
        size_t branch_idx = (block_data.subproblem_idx & (size_t{1} << remaining_depth)) >> remaining_depth;
        auto branches = cp.split->split();
        assert(branches.size() == 2);
        cp.iprop->deduce(branches[branch_idx]);

  return remaining_depth;
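// Solve phase: once positioned on its subproblem, the block runs a propagate-and-branch
// loop, refreshing its objective bound from the grid-wide best before each propagation,
// until the local search tree is exhausted or a stop flag is raised.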
 
__device__ void solve_problem(BlockData<S>& block_data, GridData<S>& grid_data) {
  using BlockCP = typename S::BlockCP;
  BlockCP& cp = *block_data.root;
  auto& fp_engine = *block_data.fp_engine;
  auto& block_has_changed = *block_data.has_changed;
  auto& stop = *block_data.stop;
  block_has_changed.join(true);

  auto start = cp.stats.start_timer_device();

  while(block_has_changed && !stop) {
    update_block_best_bound(block_data, grid_data);

    propagate(block_data, grid_data);
    if(threadIdx.x == 0) {
      start = cp.stats.start_timer_device();
      stop.join(grid_data.cpu_stop.test() || *(grid_data.gpu_stop));

      block_has_changed.meet(cp.search_tree->deduce());
 
CUDA void reduce_blocks(GridData<S>* grid_data) {
  for(int i = 0; i < grid_data->blocks.size(); ++i) {
    if(grid_data->blocks[i].root) {
      grid_data->root.meet(*(grid_data->blocks[i].root));
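// Main solving kernel: each block restores the root snapshot, dives to its subproblem,
// solves it, then fetches the next unsolved subproblem index until none remain or a
// stop is requested. The last block to finish reduces the per-block states into the
// grid root and wakes the CPU consumer so the final solution and statistics can be
// printed.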
 
__global__ void gpu_solve_kernel(GridData<S>* grid_data)

  if(threadIdx.x == 0 && blockIdx.x == 0 && grid_data->root.config.verbose_solving) {
    printf("%% GPU kernel started, starting solving...\n");

  extern __shared__ unsigned char shared_mem[];
  size_t num_subproblems = grid_data->root.stats.eps_num_subproblems;
  BlockData<S>& block_data = grid_data->blocks[blockIdx.x];
  block_data.allocate(*grid_data, shared_mem);
  auto solve_start = block_data.root->stats.start_timer_device();
  while(block_data.subproblem_idx < num_subproblems && !*(block_data.stop)) {
    if(threadIdx.x == 0 && grid_data->root.config.verbose_solving >= 2) {
      grid_data->print_lock->acquire();
      printf("%% Block %d solves subproblem num %" PRIu64 "\n", blockIdx.x, block_data.subproblem_idx);
      grid_data->print_lock->release();

    block_data.restore();

    auto dive_start = block_data.root->stats.start_timer_device();
    size_t remaining_depth = dive(block_data, *grid_data);
    block_data.root->stats.stop_timer(Timer::DIVE, dive_start);
    if(remaining_depth == 0) {
      solve_problem(block_data, *grid_data);
      if(threadIdx.x == 0 && !*(block_data.stop)) {
        block_data.root->stats.eps_solved_subproblems += 1;

      if(threadIdx.x == 0 && !*(block_data.stop)) {
        size_t next_subproblem_idx = ((block_data.subproblem_idx >> remaining_depth) + size_t{1}) << remaining_depth;
        grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(next_subproblem_idx));

        if((block_data.subproblem_idx & ((size_t{1} << remaining_depth) - size_t{1})) == size_t{0}) {
          block_data.root->stats.eps_skipped_subproblems += next_subproblem_idx - block_data.subproblem_idx;

    if(threadIdx.x == 0 && !*(block_data.stop)) {
      block_data.subproblem_idx = grid_data->next_subproblem->value();
      grid_data->next_subproblem->meet(ZLB<size_t, bt::local_memory>(block_data.subproblem_idx + size_t{1}));

  block_data.root->stats.stop_timer(Timer::SOLVE, solve_start);
  if(threadIdx.x == 0 && !*(block_data.stop)) {
    block_data.root->stats.num_blocks_done = 1;

  if(threadIdx.x == 0) {
    grid_data->print_lock->acquire();
    if(!grid_data->blocks_reduced) {

      for(int i = 0; i < grid_data->blocks.size(); ++i) {
        if(grid_data->blocks[i].root) {
          n += grid_data->blocks[i].root->stats.num_blocks_done;

      if(block_data.stop->value() || n == grid_data->blocks.size()) {
        reduce_blocks(grid_data);
        grid_data->blocks_reduced = true;
        cuda::atomic_thread_fence(cuda::memory_order_seq_cst, cuda::thread_scope_system);
        grid_data->ready_to_consume.test_and_set(cuda::std::memory_order_seq_cst);
        grid_data->ready_to_consume.notify_one();

    grid_data->print_lock->release();

  block_data.deallocate_shared();
 
template <class S, class U>
size_t sizeof_store(const CP<U>& root) {
  return gpu_sizeof<typename S::BlockCP::IStore>()
       + gpu_sizeof<typename S::BlockCP::IStore::universe_type>() * root.store->vars();
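// Estimates how much shared memory a block needs (fixpoint engine, variable store,
// propagators) and compares it with the device's sharedMemPerBlock to choose the
// memory kind: everything in global memory, only the store in shared memory, or store
// and propagators in shared memory (TCN_SHARED).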
 
template <class S, class U>
MemoryConfig configure_memory(CP<U>& root) {
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0);
  const auto& config = root.config;
  size_t shared_mem_capacity = deviceProp.sharedMemPerBlock;

  size_t store_alignment = 200;

  MemoryConfig mem_config;

  mem_config.shared_bytes = sizeof(FPEngine) + 100;
  mem_config.store_bytes = sizeof_store<S>(root2) + store_alignment;

  mem_config.prop_bytes = root2.prop_allocator.total_bytes_allocated();
  mem_config.prop_bytes += mem_config.prop_bytes / 5;
  if(config.only_global_memory || shared_mem_capacity < mem_config.shared_bytes + mem_config.store_bytes) {
    if(!config.only_global_memory && config.verbose_solving) {
      printf("%% The store of variables (%zuKB) cannot be stored in the shared memory of the GPU (%zuKB), therefore we use the global memory.\n",
      mem_config.store_bytes / 1000,
      shared_mem_capacity / 1000);

    mem_config.mem_kind = MemoryKind::GLOBAL;

  else if(shared_mem_capacity > mem_config.shared_bytes + mem_config.store_bytes + mem_config.prop_bytes) {
    if(config.verbose_solving) {
      printf("%% The store of variables and the propagators (%zuKB) are stored in the shared memory of the GPU (%zuKB).\n",
      (mem_config.shared_bytes + mem_config.store_bytes + mem_config.prop_bytes) / 1000,
      shared_mem_capacity / 1000);

    mem_config.shared_bytes += mem_config.store_bytes + mem_config.prop_bytes;
    mem_config.mem_kind = MemoryKind::TCN_SHARED;

    if(config.verbose_solving) {
      printf("%% The store of variables (%zuKB) is stored in the shared memory of the GPU (%zuKB).\n",
        mem_config.store_bytes / 1000,
        shared_mem_capacity / 1000);

    mem_config.shared_bytes += mem_config.store_bytes;
    mem_config.mem_kind = MemoryKind::STORE_SHARED;
 
void consume_kernel_solutions(GridData<S>& grid_data) {
  while(!grid_data.consume_solution()) {}
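// Copies the root problem to the device through the concurrent allocator, launches the
// solving kernel with the configured grid size, threads per block and dynamic shared
// memory, and runs a CPU thread consuming and printing solutions while waiting for the
// kernel to terminate, be interrupted or time out.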
 
template <class S, class U, class Timepoint>
void transfer_memory_and_run(CP<U>& root, MemoryConfig mem_config, const Timepoint& start) {
  using concurrent_allocator = typename S::concurrent_allocator;
  auto grid_data = bt::make_shared<GridData<S>, concurrent_allocator>(root, mem_config);
  initialize_grid_data<<<1,1>>>(grid_data.get());
  CUDAEX(cudaDeviceSynchronize());
  if(grid_data->root.config.print_statistics) {
    mem_config.print_mzn_statistics(root.config, root.stats);

  std::thread consumer_thread(consume_kernel_solutions<S>, std::ref(*grid_data));

    <<<static_cast<unsigned int>(grid_data->root.stats.num_blocks),
      CUDA_THREADS_PER_BLOCK,
      grid_data->mem_config.shared_bytes>>>

  bool interrupted = wait_solving_ends(grid_data->cpu_stop, grid_data->root, start);
  consumer_thread.join();
  CUDAEX(cudaDeviceSynchronize());
  deallocate_grid_data<<<1,1>>>(grid_data.get());
  CUDAEX(cudaDeviceSynchronize());
 
int threads_per_sm(cudaDeviceProp devProp) {
  switch (devProp.major) {
    case 2: return (devProp.minor == 1) ? 48 : 32;

    case 6: return (devProp.minor == 0) ? 64 : 128;

    case 8: return (devProp.minor == 0) ? 64 : 128;
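// Chooses the number of blocks from the occupancy API (unless overridden by
// config.or_nodes), then sizes the GPU stack and the malloc heap limit from the
// device's global memory and an estimate of the per-block heap usage.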
 
template <class S, class U>
void configure_blocks_threads(CP<U>& root, const MemoryConfig& mem_config) {
  int max_block_per_sm;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_block_per_sm, (void*) gpu_solve_kernel<S>, CUDA_THREADS_PER_BLOCK, (int)mem_config.shared_bytes);

    printf("%% max_blocks_per_sm=%d\n", max_block_per_sm);

  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, 0);

  size_t total_global_mem = deviceProp.totalGlobalMem;
  size_t num_sm = deviceProp.multiProcessorCount;
  size_t num_threads_per_sm = threads_per_sm(deviceProp);

  auto& config = root.config;
  root.stats.num_blocks = (config.or_nodes == 0) ? max_block_per_sm : config.or_nodes;

  config.stack_kb = config.stack_kb == 0 ? 32 : config.stack_kb;

  size_t total_stack_size = num_sm * deviceProp.maxThreadsPerMultiProcessor * config.stack_kb * 1000;
  size_t remaining_global_mem = total_global_mem - total_stack_size;
  remaining_global_mem -= remaining_global_mem / 10;

  size_t heap_usage_estimation = (root.stats.num_blocks + 1) * (mem_config.prop_bytes + mem_config.store_bytes + 100 * root.store->vars());
  while(heap_usage_estimation > remaining_global_mem) {

  CUDAEX(cudaDeviceSetLimit(cudaLimitStackSize, config.stack_kb*1000));
  CUDAEX(cudaDeviceSetLimit(cudaLimitMallocHeapSize, remaining_global_mem/15));

  if(config.verbose_solving) {
 
template <class S, class U, class Timepoint>
void configure_and_run(CP<U>& root, const Timepoint& start) {
  MemoryConfig mem_config = configure_memory<S>(root);
  configure_blocks_threads<S>(root, mem_config);
  transfer_memory_and_run<S>(root, mem_config, start);
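// Entry point of the GPU dive-and-solve strategy, gpu_dive_and_solve(Configuration<
// bt::standard_allocator>& config): it verifies that (concurrent) managed memory is
// supported, checks whether the root problem is already unsatisfiable, and dispatches
// to the pinned- or managed-memory solver configuration depending on
// NO_CONCURRENT_MANAGED_MEMORY; without a CUDA compiler it only reports an error.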
 
  std::cerr << "You must use a CUDA compiler (nvcc or clang) to compile Turbo on GPU." << std::endl;

  check_support_managed_memory();
  check_support_concurrent_managed_memory();
  auto start = std::chrono::steady_clock::now();

  if(root.iprop->is_bot()) {

#ifdef NO_CONCURRENT_MANAGED_MEMORY
  configure_and_run<ItvSolverPinned>(root, start);

  configure_and_run<ItvSolver>(root, start);
 
 