#ifndef LALA_CORE_FIXPOINT_HPP
#define LALA_CORE_FIXPOINT_HPP

#include "b.hpp" // Provides `B<M>` and `local::B`.
#include "battery/memory.hpp"
#include "battery/vector.hpp"

#ifdef __CUDACC__
  #include <cooperative_groups.h>
  #include <cub/block/block_scan.cuh>
#endif

namespace lala {
/** Sequential fixpoint iteration: the functions `f(0), ..., f(n-1)` are applied
 * in order until no more change occurs, similarly to the Gauss-Seidel iteration
 * method. */
class GaussSeidelIteration {
public:
  CUDA void barrier() {}

  /** Apply `f(0); f(1); ...; f(n-1)` once.
   * \return `true` if at least one `f(i)` changed the underlying domain. */
  template <class F>
  CUDA local::B iterate(size_t n, const F& f) const {
    bool has_changed = false;
    for(size_t i = 0; i < n; ++i) {
      has_changed |= f(i);
    }
    return has_changed;
  }
  /** Iterate `f` until reaching a fixpoint or until `must_stop()` returns `true`.
   * `has_changed` is joined with `true` whenever some `f(i)` changed the domain.
   * \return the number of iterations. */
  template <class F, class StopFun, class M>
  CUDA size_t fixpoint(size_t n, const F& f, const StopFun& must_stop, B<M>& has_changed) {
    size_t iterations = 0;
    local::B changed(true);
    while(changed && !must_stop()) {
      changed = iterate(n, f);
      has_changed.join(changed);
      ++iterations;
    }
    return iterations;
  }
  /** Same as `fixpoint` above, without reporting whether the domain changed. */
  template <class F, class StopFun>
  CUDA size_t fixpoint(size_t n, const F& f, const StopFun& must_stop) {
    local::B has_changed(false);
    return fixpoint(n, f, must_stop, has_changed);
  }
  /** Same as `fixpoint` above, without a stopping condition. */
  template <class F, class M>
  CUDA size_t fixpoint(size_t n, const F& f, B<M>& has_changed) {
    return fixpoint(n, f, [](){ return false; }, has_changed);
  }

  /** Same as `fixpoint` above, without a stopping condition nor change reporting. */
  template <class F>
  CUDA size_t fixpoint(size_t n, const F& f) {
    local::B has_changed(false);
    return fixpoint(n, f, has_changed);
  }
};
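/** Example (illustrative sketch, not part of the library): propagating maxima
 * from left to right until stable. The array and the lambda are hypothetical;
 * any monotone update returning whether it changed something fits.
 *
 * \code
 * int values[4] = {3, 1, 4, 1};
 * GaussSeidelIteration seq;
 * size_t iters = seq.fixpoint(4, [&](size_t i) {
 *   if(i > 0 && values[i] < values[i-1]) { values[i] = values[i-1]; return true; }
 *   return false;
 * });
 * // values == {3, 3, 4, 4} after 2 iterations (the second one confirms stability).
 * \endcode
 */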
/** A fixpoint engine that iterates only over a subset of the functions.
 * Functions are deactivated using `select`. */
template <class FixpointEngine>
class FixpointSubsetCPU {
private:
  FixpointEngine fp_engine;

  /** The indexes of the functions; the active ones occupy `indexes[0..n-1]`. */
  battery::vector<int> indexes;

  /** The number of active functions. */
  size_t n;

public:
  FixpointSubsetCPU(size_t n): indexes(n), n(n) {
    for(int i = 0; i < n; ++i) {
      indexes[i] = i;
    }
  }
  template <class F>
  bool iterate(const F& f) {
    return fp_engine.iterate(n, [&](size_t i) { return f(indexes[i]); });
  }

  template <class F>
  size_t fixpoint(const F& f) {
    return fp_engine.fixpoint(n, [&](size_t i) { return f(indexes[i]); });
  }
  template <class F, class StopFun>
  size_t fixpoint(const F& f, const StopFun& g) {
    return fp_engine.fixpoint(n, [&](size_t i) { return f(indexes[i]); }, g);
  }

  template <class F, class StopFun, class M>
  size_t fixpoint(const F& f, const StopFun& g, B<M>& has_changed) {
    return fp_engine.fixpoint(n, [&](size_t i) { return f(indexes[i]); }, g, has_changed);
  }
  /** Deactivate every function `i` such that `g(i)` is `false` by swapping it
   * out of the active prefix. */
  template <class G>
  void select(const G& g) {
    for(int i = 0; i < n; ++i) {
      if(!g(indexes[i])) {
        battery::swap(indexes[i], indexes[--n]);
        --i; // Re-examine the index just swapped into position `i`.
      }
    }
  }
};
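/** Example (illustrative sketch): restricting iteration to a subset.
 * `deduce` and `entailed` are hypothetical user functions.
 *
 * \code
 * FixpointSubsetCPU<GaussSeidelIteration> fp(num_functions);
 * fp.fixpoint([&](int i) { return deduce(i); });
 * fp.select([&](int i) { return !entailed(i); }); // Drop the entailed functions.
 * fp.fixpoint([&](int i) { return deduce(i); });  // Iterates the kept subset only.
 * \endcode
 */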
/** An asynchronous fixpoint engine over a CRTP iterator `IteratorEngine`, which
 * must provide `is_thread0()`, `barrier()` and `iterate(n, f)`. The `changed`
 * and `stop` flags are triple-buffered so that one round can read the flags of
 * the previous round while the flags of the next round are being reset. */
template <class IteratorEngine>
class AsynchronousFixpoint {
  local::B changed[3];
  local::B stop[3];

public:
  CUDA void reset() {
    if(is_thread0()) {
      for(int i = 0; i < 3; ++i) {
        changed[i].meet(false); // Reset all flags to `false`...
        stop[i].meet(false);
      }
      changed[0].join(true); // ...except `changed[0]`, so the first round runs.
    }
  }
  CUDA INLINE bool is_thread0() {
    return static_cast<IteratorEngine*>(this)->is_thread0();
  }

  CUDA INLINE void barrier() {
    static_cast<IteratorEngine*>(this)->barrier();
  }

  template <class F>
  CUDA INLINE local::B iterate(size_t n, const F& f) const {
    return static_cast<const IteratorEngine*>(this)->iterate(n, f);
  }

  /** Iterate only over the functions whose indexes appear in `indexes`. */
  template <class Alloc, class F>
  CUDA INLINE local::B iterate(const battery::vector<int, Alloc>& indexes, const F& f) const {
    return iterate(indexes.size(), [&](size_t i) { return f(indexes[i]); });
  }
  /** Iterate `f` until reaching a fixpoint or until `must_stop()` returns `true`.
   * \return the number of iterations. */
  template <class F, class StopFun, class M>
  CUDA size_t fixpoint(size_t n, const F& f, const StopFun& must_stop, B<M>& has_changed) {
    reset();
    barrier();
    size_t i;
    for(i = 1; changed[(i-1)%3] && !stop[(i-1)%3]; ++i) {
      changed[i%3].join(iterate(n, f));
      barrier();
      changed[(i+1)%3].meet(false); // Reset the flag of the next round.
      stop[i%3].join(must_stop());
      barrier();
    }
    // `changed[1]` holds the result of the first round; if more than one round
    // ran (i > 2), the first one necessarily changed something.
    has_changed.join(changed[1] || i > 2);
    return i - 1;
  }
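  // Rotation of the triple buffer over the first rounds:
  //   i=1 reads changed[0] (seeded to true), accumulates into changed[1], resets changed[2];
  //   i=2 reads changed[1], accumulates into changed[2], resets changed[0];
  //   i=3 reads changed[2], accumulates into changed[0], resets changed[1].
  // The two barriers per round separate the condition read, the accumulation and the reset.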
  /** Same as `fixpoint` above, without reporting whether the domain changed. */
  template <class F, class StopFun>
  CUDA INLINE size_t fixpoint(size_t n, const F& f, const StopFun& must_stop) {
    local::B has_changed(false);
    return fixpoint(n, f, must_stop, has_changed);
  }

  /** Same as `fixpoint` above, without a stopping condition. */
  template <class F, class M>
  CUDA INLINE size_t fixpoint(size_t n, const F& f, B<M>& has_changed) {
    return fixpoint(n, f, [](){ return false; }, has_changed);
  }

  /** Same as `fixpoint` above, without a stopping condition nor change reporting. */
  template <class F>
  CUDA INLINE size_t fixpoint(size_t n, const F& f) {
    local::B has_changed(false);
    return fixpoint(n, f, has_changed);
  }
  /** Compute the fixpoint of the functions whose indexes appear in `indexes`. */
  template <class Alloc, class F, class StopFun, class M>
  CUDA INLINE size_t fixpoint(const battery::vector<int, Alloc>& indexes, const F& f, const StopFun& must_stop, B<M>& has_changed) {
    return fixpoint(indexes.size(), [&](size_t i) { return f(indexes[i]); }, must_stop, has_changed);
  }

  template <class Alloc, class F, class StopFun>
  CUDA INLINE size_t fixpoint(const battery::vector<int, Alloc>& indexes, const F& f, const StopFun& must_stop) {
    local::B has_changed(false);
    return fixpoint(indexes, f, must_stop, has_changed);
  }

  template <class Alloc, class F, class M>
  CUDA INLINE size_t fixpoint(const battery::vector<int, Alloc>& indexes, const F& f, B<M>& has_changed) {
    return fixpoint(indexes, f, [](){ return false; }, has_changed);
  }

  template <class Alloc, class F>
  CUDA INLINE size_t fixpoint(const battery::vector<int, Alloc>& indexes, const F& f) {
    local::B has_changed(false);
    return fixpoint(indexes, f, has_changed);
  }
};
#ifdef __CUDACC__

/** An asynchronous fixpoint engine where the iteration is parallelized over the
 * threads of a cooperative group (e.g. a block or the whole grid). */
template <class Group>
class AsynchronousIterationGPU :
  public AsynchronousFixpoint<AsynchronousIterationGPU<Group>>
{
public:
  using group_type = Group;

private:
  const Group& group;

  CUDA void assert_cuda_arch() const {
    printf("AsynchronousIterationGPU must be used on the GPU device only.\n");
    assert(false);
  }

public:
  CUDA AsynchronousIterationGPU(const Group& group):
    group(group)
  {}

  CUDA void reset() const {}

  CUDA INLINE bool is_thread0() const {
  #ifndef __CUDA_ARCH__
    assert_cuda_arch();
    return false;
  #else
    return group.thread_rank() == 0;
  #endif
  }
  CUDA INLINE void barrier() {
  #ifndef __CUDA_ARCH__
    assert_cuda_arch();
  #else
    group.sync();
  #endif
  }

  /** Each thread iterates `f(rank); f(rank + num_threads); ...` where `rank` is
   * its rank in the group. */
  template <class F>
  CUDA INLINE bool iterate(size_t n, const F& f) const {
  #ifndef __CUDA_ARCH__
    assert_cuda_arch();
    return false;
  #else
    bool has_changed = false;
    for (size_t i = group.thread_rank(); i < n; i += group.num_threads()) {
      has_changed |= f(i);
    }
    return has_changed;
  #endif
  }
};
using GridAsynchronousFixpointGPU = AsynchronousIterationGPU<cooperative_groups::grid_group>;
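/** Usage note (an assumption about typical usage, not stated in this header):
 * since the `changed` and `stop` flags live inside the engine, all threads of
 * the group must share a single engine instance. With `grid_group`, the kernel
 * must additionally be launched cooperatively for `group.sync()` to be legal:
 *
 * \code
 * // `solve_kernel`, `fp`, `n`, `grid_dim` and `block_dim` are hypothetical.
 * void* args[] = { &fp, &n };
 * cudaLaunchCooperativeKernel((void*)solve_kernel, grid_dim, block_dim, args, 0, 0);
 * \endcode
 */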
/** Same asynchronous fixpoint, but restricted to a single block, using
 * `threadIdx.x` and `blockDim.x` directly. */
class BlockAsynchronousFixpointGPU :
  public AsynchronousFixpoint<BlockAsynchronousFixpointGPU>
{
private:
  CUDA void assert_cuda_arch() const {
    printf("BlockAsynchronousFixpointGPU must be used on the GPU device only.\n");
    assert(false);
  }

public:
  BlockAsynchronousFixpointGPU() = default;

  CUDA INLINE bool is_thread0() const {
  #ifndef __CUDA_ARCH__
    assert_cuda_arch();
    return false;
  #else
    return threadIdx.x == 0;
  #endif
  }
  CUDA INLINE void barrier() {
  #ifndef __CUDA_ARCH__
    assert_cuda_arch();
  #else
    __syncthreads();
  #endif
  }

  /** Each thread iterates `f(threadIdx.x); f(threadIdx.x + blockDim.x); ...`. */
  template <class F>
  CUDA INLINE bool iterate(size_t n, const F& f) const {
  #ifndef __CUDA_ARCH__
    assert_cuda_arch();
    return false;
  #else
    bool has_changed = false;
    for (size_t i = threadIdx.x; i < n; i += blockDim.x) {
      has_changed |= f(i);
    }
    return has_changed;
  #endif
  }
};
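/** Example (illustrative sketch): one engine per block, placed in shared memory
 * so the flags are shared by all of its threads; placement new is used in case
 * the engine is not trivially constructible. `deduce` is a hypothetical user
 * function returning whether it changed the domain.
 *
 * \code
 * __global__ void solve_block(size_t n) {
 *   __shared__ unsigned char mem[sizeof(BlockAsynchronousFixpointGPU)];
 *   auto* fp = reinterpret_cast<BlockAsynchronousFixpointGPU*>(mem);
 *   if(threadIdx.x == 0) { new(fp) BlockAsynchronousFixpointGPU(); }
 *   __syncthreads();
 *   fp->fixpoint(n, [&](size_t i) { return deduce(i); });
 * }
 * \endcode
 */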
/** A GPU version of `FixpointSubsetCPU` for a single block: the subset of
 * active functions is recomputed in parallel using a mask and a prefix sum. */
template <class FixpointEngine, class Allocator, size_t TPB>
class FixpointSubsetGPU {
public:
  using allocator_type = Allocator;

private:
  FixpointEngine fp_engine;

  /** The indexes of the active functions. */
  battery::vector<int, allocator_type> indexes;

  /** `mask[i]` is `true` if `indexes[i]` is kept by the current `select`. */
  battery::vector<bool, allocator_type> mask;

  /** Inclusive prefix sum of `mask`, giving the new position of each kept index. */
  battery::vector<int, allocator_type> sum;

  /** Temporary buffer from which the kept indexes are compacted into `indexes`. */
  battery::vector<int, allocator_type> indexes2;

  using BlockScan = cub::BlockScan<int, TPB>;
  typename BlockScan::TempStorage cub_prefixsum_tmp;

  /** Round `n` up to the next multiple of `blockDim.x`. */
  __device__ INLINE size_t round_multiple_TPB(size_t n) {
    return n + ((blockDim.x - n % blockDim.x) % blockDim.x);
  }
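  // For instance, with blockDim.x == 256: round_multiple_TPB(1000) == 1024 and
  // round_multiple_TPB(1024) == 1024 (already a multiple).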
public:
  FixpointSubsetGPU() = default;

  /** Reactivate the first `n` functions; must be called by all threads of the block. */
  __device__ void reset(size_t n) {
    if(threadIdx.x == 0) {
      indexes.resize(n);
    }
    __syncthreads();
    for(int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
      indexes[i] = i;
    }
  }
  /** Allocate the buffers for `n` functions; must be called by all threads of the block. */
  __device__ void init(size_t n, const allocator_type& allocator = allocator_type()) {
    if(threadIdx.x == 0) {
      indexes = battery::vector<int, allocator_type>(n, allocator);
      indexes2 = battery::vector<int, allocator_type>(n, allocator);
      mask = battery::vector<bool, allocator_type>(round_multiple_TPB(n), false, allocator);
      sum = battery::vector<int, allocator_type>(round_multiple_TPB(n), 0, allocator);
    }
    __syncthreads();
    for(int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
      indexes[i] = i;
    }
  }
  /** Free the buffers. */
  __device__ void destroy() {
    if(threadIdx.x == 0) {
      indexes = battery::vector<int, allocator_type>();
      indexes2 = battery::vector<int, allocator_type>();
      mask = battery::vector<bool, allocator_type>();
      sum = battery::vector<int, allocator_type>();
    }
  }
  CUDA INLINE bool is_thread0() const {
    return fp_engine.is_thread0();
  }

  CUDA INLINE void barrier() {
    fp_engine.barrier();
  }
  template <class F>
  CUDA INLINE bool iterate(const F& f) {
    return fp_engine.iterate(indexes, f);
  }

  template <class F>
  CUDA INLINE size_t fixpoint(const F& f) {
    return fp_engine.fixpoint(indexes, f);
  }
  template <class F, class StopFun>
  CUDA INLINE size_t fixpoint(const F& f, const StopFun& g) {
    return fp_engine.fixpoint(indexes, f, g);
  }

  template <class F, class StopFun, class M>
  CUDA INLINE size_t fixpoint(const F& f, const StopFun& g, B<M>& has_changed) {
    return fp_engine.fixpoint(indexes, f, g, has_changed);
  }
  /** The number of currently active functions. */
  CUDA size_t num_active() const {
    return indexes.size();
  }
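  // Example (illustrative sketch): typical lifecycle on a block-shared instance
  // `fp` (construction elided); `deduce` and `is_active` are hypothetical.
  //
  //   fp.init(num_functions);
  //   fp.fixpoint([&](int i) { return deduce(i); });
  //   fp.select([&](int i) { return is_active(i); }); // Shrink the subset.
  //   fp.fixpoint([&](int i) { return deduce(i); });  // Iterate the kept functions only.
  //   fp.destroy();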
  /** Keep only the indexes `i` for which `g(i)` is `true`; must be called by
   * all threads of the block. */
  template <class G>
  __device__ void select(const G& g) {
    assert(TPB == blockDim.x);
    if(indexes.size() == 0) {
      return;
    }
    // 1. Compute the mask of the indexes to keep.
    for(int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
      mask[i] = g(indexes[i]);
    }
    __syncthreads();
    // 2. Inclusive prefix sum of the mask, one tile of TPB elements at a time.
    size_t n = round_multiple_TPB(indexes.size());
    for(int i = threadIdx.x; i < n; i += blockDim.x) {
      BlockScan(cub_prefixsum_tmp).InclusiveSum(mask[i], sum[i]);
      __syncthreads(); // The scan's temporary storage is reused by the next tile.
    }
    // 3. Propagate the last sum of each tile to the following tiles, in order.
    for(int i = blockDim.x + threadIdx.x; i < n; i += blockDim.x) {
      sum[i] += sum[i - threadIdx.x - 1];
      __syncthreads();
    }
    // 4. Compact the kept indexes into `indexes2`, then swap it with `indexes`.
    if(threadIdx.x == 0) {
      battery::swap(indexes, indexes2);
      indexes.resize(sum[indexes2.size()-1]);
    }
    __syncthreads();
    for(int i = threadIdx.x; i < indexes2.size(); i += blockDim.x) {
      if(mask[i]) {
        indexes[sum[i]-1] = indexes2[i];
      }
    }
  }
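  // For instance, with indexes == {4, 7, 9, 2} and `g` keeping even values:
  // mask == {1, 0, 0, 1} and sum == {1, 1, 1, 2}, so 4 is written to
  // indexes[0], 2 to indexes[1], and indexes is resized to 2.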
  template <class Alloc = allocator_type>
  using snapshot_type = battery::vector<int, Alloc>;

  /** Save the current subset of active functions. */
  template <class Alloc = allocator_type>
  CUDA snapshot_type<Alloc> snapshot(const Alloc& alloc = Alloc()) const {
    return snapshot_type<Alloc>(indexes, alloc);
  }
  /** Restore a previously saved subset; must be called by all threads of the block. */
  template <class Alloc>
  __device__ void restore_par(const snapshot_type<Alloc>& snap) {
    for(int i = threadIdx.x; i < snap.size(); i += blockDim.x) {
      indexes[i] = snap[i];
    }
    if(threadIdx.x == 0) {
      assert(snap.size() <= indexes.capacity());
      indexes.resize(snap.size());
    }
  }
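  // Example (illustrative sketch): pairing `snapshot` with `restore_par` around
  // a speculative step; `still_active` is hypothetical.
  //
  //   auto snap = fp.snapshot();                          // Save the active subset.
  //   fp.select([&](int i) { return still_active(i); });  // Shrink it during the step.
  //   fp.restore_par(snap);                               // All threads restore it afterwards.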
};

#endif // __CUDACC__

} // namespace lala

#endif