#ifndef TURBO_MEMORY_GPU_HPP
#define TURBO_MEMORY_GPU_HPP

#include "battery/allocator.hpp"

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <thread>
#include <chrono>
#include <cuda/std/atomic>

namespace bt = ::battery;
 
/** Where the main data structures are allocated: everything in global memory, the store of
 * variables in shared memory, or both the store and the propagators in shared memory. */
enum class MemoryKind {
  GLOBAL,
  STORE_SHARED,
  TCN_SHARED
};

/** Memory configuration of the solving kernel: which kind of memory is used and how many
 * bytes are reserved for the store and the propagators. */
struct MemoryConfig {
  MemoryKind mem_kind;
  size_t shared_bytes;
  size_t store_bytes;
  size_t prop_bytes;

  MemoryConfig() = default;
  MemoryConfig(const MemoryConfig&) = default;

  /** Configuration allocating everything in global memory. */
  MemoryConfig(size_t store_bytes, size_t prop_bytes):
    mem_kind(MemoryKind::GLOBAL),
    shared_bytes(0),
    store_bytes(store_bytes),
    prop_bytes(prop_bytes)
  {}

  /** Configuration deciding, for the given `kernel`, whether the store and the propagators fit
   * in the shared memory of the device, falling back on global memory otherwise. */
  MemoryConfig(const void* kernel, int verbose, int blocks_per_sm, size_t store_bytes, size_t prop_bytes):
    store_bytes(store_bytes),
    prop_bytes(prop_bytes)
  {
    int maxSharedMemPerSM;
    cudaDeviceGetAttribute(&maxSharedMemPerSM, cudaDevAttrMaxSharedMemoryPerMultiprocessor, 0);
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, kernel);
    if(verbose >= 1) {
      printf("%% max_shared_memory=%d\n", maxSharedMemPerSM);
      printf("%% static_shared_memory=%zu\n", attr.sharedSizeBytes);
    }
    // `alignment` is an extra padding reserved for the alignment of the pools.
    if(blocks_per_sm * (store_bytes + prop_bytes + alignment + attr.sharedSizeBytes) < maxSharedMemPerSM) {
      shared_bytes = store_bytes + prop_bytes + alignment;
      mem_kind = MemoryKind::TCN_SHARED;
    }
    else if(blocks_per_sm * (store_bytes + alignment + attr.sharedSizeBytes) < maxSharedMemPerSM) {
      shared_bytes = store_bytes + alignment;
      mem_kind = MemoryKind::STORE_SHARED;
    }
    else {
      shared_bytes = 0;
      mem_kind = MemoryKind::GLOBAL;
    }
    if(shared_bytes != 0) {
      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_bytes);
    }
  }

  /** Allocate a pool of `bytes` bytes in global memory. */
  CUDA bt::pool_allocator make_global_pool(size_t bytes) const {
    void* mem_pool = bt::global_allocator{}.allocate(bytes);
    return bt::pool_allocator(static_cast<unsigned char*>(mem_pool), bytes);
  }

  /** Wrap the dynamic shared memory of the current block in a pool allocator. */
  CUDA bt::pool_allocator make_shared_pool(unsigned char* shared_mem) const {
    return bt::pool_allocator(shared_mem, shared_bytes);
  }

  /** The propagators are allocated in shared memory only when the whole TCN fits there. */
  CUDA bt::pool_allocator make_prop_pool(bt::pool_allocator shared_mem) const {
    if(mem_kind == MemoryKind::TCN_SHARED) {
      return shared_mem;
    }
    else {
      return make_global_pool(prop_bytes);
    }
  }

  /** The store is allocated in shared memory whenever it fits there. */
  CUDA bt::pool_allocator make_store_pool(bt::pool_allocator shared_mem) const {
    if(mem_kind == MemoryKind::TCN_SHARED || mem_kind == MemoryKind::STORE_SHARED) {
      return shared_mem;
    }
    else {
      return make_global_pool(store_bytes);
    }
  }

  /** Print the memory configuration in MiniZinc statistics format. */
  template <class Config, class Stat>
  CUDA void print_mzn_statistics(const Config& config, const Stat& stats) const {
    stats.print_stat("memory_configuration",
      mem_kind == MemoryKind::GLOBAL ? "global" : (
      mem_kind == MemoryKind::STORE_SHARED ? "store_shared" : "tcn_shared"));
    stats.print_memory_statistics(config.verbose_solving, "shared_mem", shared_bytes);
    stats.print_memory_statistics(config.verbose_solving, "store_mem", store_bytes);
    stats.print_memory_statistics(config.verbose_solving, "propagator_mem", prop_bytes);
    stats.print_mzn_end_stats();
  }
};
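
/* A minimal usage sketch (illustrative only, not part of this header): on the host, a
 * `MemoryConfig` is built against the solving kernel and the kernel is launched with
 * `shared_bytes` of dynamic shared memory; on the device, the pools are carved out of it.
 * The kernel name and launch parameters below are hypothetical.
 *
 *   // Host side:
 *   //   MemoryConfig mem_config((void*)solve_kernel, verbose, blocks_per_sm, store_bytes, prop_bytes);
 *   //   solve_kernel<<<grid, block, mem_config.shared_bytes>>>(...);
 *   // Device side (inside solve_kernel):
 *   //   extern __shared__ unsigned char shared_mem[];
 *   //   bt::pool_allocator shared_pool = mem_config.make_shared_pool(shared_mem);
 *   //   bt::pool_allocator store_pool = mem_config.make_store_pool(shared_pool);
 *   //   bt::pool_allocator prop_pool = mem_config.make_prop_pool(shared_pool);
 */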
 
template <class T>
__global__ void gpu_sizeof_kernel(size_t* size) {
  *size = sizeof(T);
}

/** Query the size of `T` as seen by the device, which can differ from the host's `sizeof(T)`. */
template <class T>
size_t gpu_sizeof() {
  auto s = bt::make_unique<size_t, bt::managed_allocator>();
  gpu_sizeof_kernel<T><<<1, 1>>>(s.get());
  CUDAEX(cudaDeviceSynchronize());
  return *s;
}
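
/* Example (hypothetical): `gpu_sizeof` can be used on the host to size the memory pools from
 * device-side type sizes, e.g. `size_t bytes = gpu_sizeof<int>();`. */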
 
void check_support_managed_memory() {
  int attr = 0;
  int dev = 0;
  CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrManagedMemory, dev));
  if(!attr) {
    std::cerr << "The GPU does not support managed memory." << std::endl;
    exit(EXIT_FAILURE);
  }
}
 
void check_support_concurrent_managed_memory() {
  int attr = 0;
  int dev = 0;
  CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrConcurrentManagedAccess, dev));
  if(!attr) {
#ifdef NO_CONCURRENT_MANAGED_MEMORY
    printf("%% WARNING: The GPU does not support concurrent access to managed memory, hence we fall back on pinned memory.\n");
    /** Set cudaDeviceMapHost so that pinned host allocations are mapped into the device address space. */
    unsigned int flags = 0;
    CUDAEX(cudaGetDeviceFlags(&flags));
    flags |= cudaDeviceMapHost;
    CUDAEX(cudaSetDeviceFlags(flags));
#else
    printf("%% To run Turbo on this GPU you need to build Turbo with the option NO_CONCURRENT_MANAGED_MEMORY.\n");
    exit(EXIT_FAILURE);
#endif
  }
}

/** Wait on the host until the solving kernel terminates, the timeout is reached or the user
 * interrupts the solver. Return `true` if the solver was stopped before the kernel completed. */
template <class CP, class Timepoint>
bool wait_solving_ends(cuda::std::atomic_flag& stop, CP& root, const Timepoint& start) {
  cudaEvent_t event;
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  cudaEventRecord(event);
  while(!must_quit(root) && check_timeout(root, start) && cudaEventQuery(event) == cudaErrorNotReady) {
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  if(cudaEventQuery(event) == cudaErrorNotReady) {
    stop.test_and_set();
    root.prune();
    return true;
  }
  else {
    cudaError error = cudaDeviceSynchronize();
    if(error == cudaErrorIllegalAddress) {
      printf("%% ERROR: CUDA kernel failed due to an illegal memory access. This might be due to a stack overflow because the stack size is too small. Try increasing the stack size with the option -stack. If it does not work, please report it as a bug.\n");
      exit(EXIT_FAILURE);
    }
    CUDAEX(error);
    return false;
  }
}