#ifndef TURBO_MEMORY_GPU_HPP
#define TURBO_MEMORY_GPU_HPP

#include "battery/allocator.hpp"
#include "common_solving.hpp"

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <thread>

namespace bt = ::battery;
/** Where the abstract domains are allocated: everything in global memory,
 * only the store of variables in shared memory, or both the store and the
 * propagators in shared memory. */
enum class MemoryKind {
  GLOBAL,
  STORE_SHARED,
  TCN_SHARED
};

/** Memory configuration of the solver: in which kind of memory the store and
 * the propagators are allocated, and how many bytes each of them requires. */
struct MemoryConfig {
  MemoryKind mem_kind;
  size_t shared_bytes;
  size_t store_bytes;
  size_t prop_bytes;

  /** Padding reserved so the shared memory pool can align its allocations
   * (128 is an assumed value). */
  static constexpr size_t alignment = 128;
  MemoryConfig() = default;
  MemoryConfig(const MemoryConfig&) = default;

  MemoryConfig(size_t store_bytes, size_t prop_bytes):
    mem_kind(MemoryKind::GLOBAL),
    shared_bytes(0),
    store_bytes(store_bytes),
    prop_bytes(prop_bytes)
  {}
  /** Query the device and kernel attributes to decide which parts of the
   * solver fit into shared memory, preferring TCN_SHARED over STORE_SHARED
   * over GLOBAL. */
  MemoryConfig(const void* kernel, int verbose, int blocks_per_sm,
    size_t store_bytes, size_t prop_bytes):
    shared_bytes(0),
    store_bytes(store_bytes),
    prop_bytes(prop_bytes)
  {
    int maxSharedMemPerSM;
    cudaDeviceGetAttribute(&maxSharedMemPerSM, cudaDevAttrMaxSharedMemoryPerMultiprocessor, 0);
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, kernel);
    if(verbose) {
      printf("%% max_shared_memory=%d\n", maxSharedMemPerSM);
      printf("%% static_shared_memory=%zu\n", attr.sharedSizeBytes);
    }
    // Can the store and the propagators of all resident blocks fit in shared memory?
    if(blocks_per_sm * (store_bytes + prop_bytes + alignment + attr.sharedSizeBytes) < maxSharedMemPerSM) {
      shared_bytes = store_bytes + prop_bytes + alignment;
      mem_kind = MemoryKind::TCN_SHARED;
    }
    // Otherwise, can at least the store fit in shared memory?
    else if(blocks_per_sm * (store_bytes + alignment + attr.sharedSizeBytes) < maxSharedMemPerSM) {
      shared_bytes = store_bytes + alignment;
      mem_kind = MemoryKind::STORE_SHARED;
    }
    else {
      mem_kind = MemoryKind::GLOBAL;
    }
    if(shared_bytes != 0) {
      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_bytes);
    }
  }
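  // Illustrative arithmetic (all numbers assumed, not taken from a real
  // device): with maxSharedMemPerSM = 98304 (96 KB), blocks_per_sm = 2,
  // attr.sharedSizeBytes = 1024, store_bytes = 20480 and prop_bytes = 30720:
  //   TCN_SHARED:   2 * (20480 + 30720 + 128 + 1024) = 104704 > 98304, rejected;
  //   STORE_SHARED: 2 * (20480 + 128 + 1024)         =  43264 < 98304, selected,
  // so shared_bytes = 20608 and the propagators remain in global memory.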
  /** Allocate a pool of `bytes` bytes in global memory. */
  CUDA bt::pool_allocator make_global_pool(size_t bytes) const {
    void* mem_pool = bt::global_allocator{}.allocate(bytes);
    return bt::pool_allocator(static_cast<unsigned char*>(mem_pool), bytes);
  }
  /** Create a pool from the dynamic shared memory buffer passed to the kernel. */
  CUDA bt::pool_allocator make_shared_pool(unsigned char* shared_mem) const {
    return bt::pool_allocator(shared_mem, shared_bytes);
  }
  /** The propagators are allocated in shared memory only in TCN_SHARED mode. */
  CUDA bt::pool_allocator make_prop_pool(bt::pool_allocator shared_mem) const {
    if(mem_kind == MemoryKind::TCN_SHARED) {
      return shared_mem;
    }
    else {
      return make_global_pool(prop_bytes);
    }
  }
  /** The store is allocated in shared memory in both shared modes. */
  CUDA bt::pool_allocator make_store_pool(bt::pool_allocator shared_mem) const {
    if(mem_kind == MemoryKind::TCN_SHARED || mem_kind == MemoryKind::STORE_SHARED) {
      return shared_mem;
    }
    else {
      return make_global_pool(store_bytes);
    }
  }
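  // A minimal usage sketch (assumed kernel and parameter names, not part of
  // this header): inside the solving kernel, the dynamic shared memory buffer
  // is wrapped into pools according to the chosen MemoryKind.
  //
  //   __global__ void solve_kernel(MemoryConfig config /*, ... */) {
  //     extern __shared__ unsigned char shared_mem[];
  //     bt::pool_allocator shared_pool = config.make_shared_pool(shared_mem);
  //     bt::pool_allocator store_pool = config.make_store_pool(shared_pool);
  //     bt::pool_allocator prop_pool = config.make_prop_pool(shared_pool);
  //     // The store is then allocated from store_pool and the propagators
  //     // from prop_pool; both fall back on global memory when needed.
  //   }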
  template <class Config, class Stat>
  CUDA void print_mzn_statistics(const Config& config, const Stat& stats) const {
    stats.print_stat("memory_configuration",
      mem_kind == MemoryKind::GLOBAL ? "global" :
      (mem_kind == MemoryKind::STORE_SHARED ? "store_shared" : "tcn_shared"));
    stats.print_memory_statistics(config.verbose_solving, "shared_mem", shared_bytes);
    stats.print_memory_statistics(config.verbose_solving, "store_mem", store_bytes);
    stats.print_memory_statistics(config.verbose_solving, "propagator_mem", prop_bytes);
    stats.print_mzn_end_stats();
  }
};
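// Illustrative: with a STORE_SHARED configuration, this reports the pair
// ("memory_configuration", "store_shared") followed by the three byte counts;
// the exact output syntax is delegated to Stat::print_stat and
// Stat::print_memory_statistics.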
/** Compute the size of a type on the device, which can differ from its size
 * on the host. */
template <class T>
__global__ void gpu_sizeof_kernel(size_t* size) {
  *size = sizeof(T);
}

template <class T>
size_t gpu_sizeof() {
  auto s = bt::make_unique<size_t, bt::managed_allocator>();
  gpu_sizeof_kernel<T><<<1, 1>>>(s.get());
  CUDAEX(cudaDeviceSynchronize());
  return *s;
}
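// Example (illustrative): query the device-side size of a type before sizing
// the memory pools, e.g. `size_t n = gpu_sizeof<int>();`.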
/** Terminate the program if the GPU does not support managed memory. */
void check_support_managed_memory() {
  int attr = 0;
  int dev = 0; // default device
  CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrManagedMemory, dev));
  if(!attr) {
    std::cerr << "The GPU does not support managed memory." << std::endl;
    exit(EXIT_FAILURE);
  }
}
/** Terminate the program if the GPU does not support concurrent access to
 * managed memory, unless Turbo was built with NO_CONCURRENT_MANAGED_MEMORY,
 * in which case we fall back on pinned memory. */
void check_support_concurrent_managed_memory() {
  int attr = 0;
  int dev = 0; // default device
  CUDAEX(cudaDeviceGetAttribute(&attr, cudaDevAttrConcurrentManagedAccess, dev));
  if(!attr) {
#ifdef NO_CONCURRENT_MANAGED_MEMORY
    printf("%% WARNING: The GPU does not support concurrent access to managed memory, hence we fall back on pinned memory.\n");
    // Map pinned host allocations into the device address space.
    unsigned int flags = 0;
    CUDAEX(cudaGetDeviceFlags(&flags));
    flags |= cudaDeviceMapHost;
    CUDAEX(cudaSetDeviceFlags(flags));
#else
    printf("%% To run Turbo on this GPU you need to build Turbo with the option NO_CONCURRENT_MANAGED_MEMORY.\n");
    exit(EXIT_FAILURE);
#endif
  }
}
/** Wait until the solving kernel terminates, either because it finished on
 * its own or because of a timeout or an external interruption (e.g. CTRL-C).
 * Returns `true` if the kernel was interrupted before completion. */
template <class CP, class Timepoint>
bool wait_solving_ends(cuda::std::atomic_flag& stop, CP& root, const Timepoint& start) {
  cudaEvent_t event;
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  cudaEventRecord(event);
  // Poll the event until the kernel terminates, we must quit or the timeout expires.
  while(!must_quit(root) && check_timeout(root, start) && cudaEventQuery(event) == cudaErrorNotReady) {
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
  if(cudaEventQuery(event) == cudaErrorNotReady) {
    // The kernel is still running: signal it to stop and prune the search.
    stop.test_and_set();
    root.prune();
    return true;
  }
  else {
    cudaError error = cudaDeviceSynchronize();
    if(error == cudaErrorIllegalAddress) {
      printf("%% ERROR: CUDA kernel failed due to an illegal memory access. This might be due to a stack overflow because the stack is too small. Try increasing the stack size with the option -stack. If it does not work, please report it as a bug.\n");
      exit(EXIT_FAILURE);
    }
    CUDAEX(error);
    return false;
  }
}
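// Host-side sketch (assumed names, not from this header): record the start
// time, launch the solving kernel with the configured dynamic shared memory,
// then block until it finishes or must be interrupted. `stop` is a
// cuda::std::atomic_flag visible to both host and device (e.g. in managed
// memory) that the kernel polls to terminate early.
//
//   auto start = std::chrono::steady_clock::now();
//   solve_kernel<<<blocks, threads, config.shared_bytes>>>(/* ... */);
//   bool interrupted = wait_solving_ends(stop, root, start);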
#endif // TURBO_MEMORY_GPU_HPP