3 #ifndef LALA_CORE_FIXPOINT_HPP
4 #define LALA_CORE_FIXPOINT_HPP
8 #include "battery/memory.hpp"
9 #include "battery/vector.hpp"
12 #include <cooperative_groups.h>
13 #include <cub/block/block_scan.cuh>
// ---------------------------------------------------------------------------
// NOTE(review): this region is an extraction-mangled fragment of a sequential
// (Gauss-Seidel-style) fixpoint engine.  The stray leading numbers (32, 33,
// 46, ...) are the original file's line numbers fused into the text by the
// extraction, and several original lines are missing.  Code is kept
// byte-identical; only comments are added.
// ---------------------------------------------------------------------------
// Fragment of `iterate(n, f)`: applies `f(i)` for i in [0, n) and accumulates
// whether any application changed something.
32 bool has_changed =
false;
33 for(
int i = 0; i < n; ++i) {
// `fixpoint(n, f, must_stop, has_changed)`: repeatedly iterates until nothing
// changes or `must_stop()` holds; the overall change flag is joined into
// `has_changed`.  (The declaration and per-iteration update of `changed` are
// in lines lost by the extraction -- TODO confirm against upstream.)
46 template <
class F,
class StopFun,
class M>
47 CUDA
int fixpoint(
int n,
const F& f,
const StopFun& must_stop,
B<M>& has_changed) {
50 while(changed && !must_stop()) {
52 has_changed.join(changed);
// Convenience overload without an external change flag: delegates to the
// overload above.  (A local `has_changed` is presumably declared in the lost
// original line 61 -- TODO confirm.)
59 template <
class F,
class StopFun>
60 CUDA
int fixpoint(
int n,
const F& f,
const StopFun& must_stop) {
62 return fixpoint(n, f, must_stop, has_changed);
// Overload without a stop function: delegates with a predicate that never
// stops.  (Its signature line, original line 67, was lost by the extraction.)
66 template <
class F,
class M>
68 return fixpoint(n, f, [](){
return false; }, has_changed);
// ---------------------------------------------------------------------------
// Fragment of `FixpointSubsetCPU`: runs a fixpoint engine over a subset of
// function indexes.  NOTE(review): the extraction split the template
// parameter name `FixpointEngine` across two lines ("Fixpo" / "intEngine")
// below; kept byte-identical.
// ---------------------------------------------------------------------------
83 template <
class Fixpo
intEngine>
// The underlying engine and the currently active indexes.
86 FixpointEngine fp_engine;
89 battery::vector<int> indexes;
// Initialization loop fragment (body lost by the extraction): presumably
// fills `indexes` with 0..n-1 -- TODO confirm.
96 for(
int i = 0; i < n; ++i) {
// `iterate` / `fixpoint`: remap each logical index i through `indexes`
// before invoking f, then delegate to the engine.
103 return fp_engine.iterate(n, [&](
int i) {
return f(indexes[i]); });
108 return fp_engine.fixpoint(n, [&](
int i) {
return f(indexes[i]); });
111 template <
class F,
class StopFun>
113 return fp_engine.fixpoint(n, [&](
int i) {
return f(indexes[i]); }, g);
// NOTE(review): per the Doxygen residue at the end of this file, the lost
// signature (original line 117) takes a `B<M>& has_changed` parameter, yet
// the visible call below never forwards it -- the change flag would be
// silently ignored.  Verify against upstream and forward `has_changed` if
// confirmed.
116 template <
class F,
class StopFun,
class M>
118 return fp_engine.fixpoint(n, [&](
int i) {
return f(indexes[i]); }, g);
// `select` fragment: deactivates an index by swapping it with the last active
// one and shrinking the active range (order of `indexes` is not preserved).
134 for(
int i = 0; i < n; ++i) {
136 battery::swap(indexes[i], indexes[--n]);
// CRTP base implementing an asynchronous fixpoint loop on top of a concrete
// `IteratorEngine` (which must provide `iterate`, `barrier` and `is_thread0`,
// as dispatched below).
157 template <
class IteratorEngine>
158 class AsynchronousFixpoint {
// Initialization fragment (body lost by the extraction): loops over the 3
// slots of the triple-buffered `changed`/`stop` flags used by `fixpoint`
// below -- TODO confirm.
168 for(
int i = 0; i < 3; ++i) {
175 CUDA INLINE
bool is_thread0() {
176 return static_cast<IteratorEngine*
>(
this)->is_thread0();
179 CUDA INLINE
void barrier() {
180 static_cast<IteratorEngine*
>(
this)->barrier();
// One iteration pass over the n functions, dispatched to the concrete engine
// (CRTP).  NOTE(review): the `template <class F>` header (original line 183)
// was lost by the extraction.
184 CUDA INLINE
local::B iterate(
int n,
const F& f)
const {
185 return static_cast<const IteratorEngine*
>(
this)->iterate(n, f);
// Asynchronous fixpoint with triple-buffered `changed`/`stop` flags indexed
// modulo 3: slot i%3 accumulates the current iteration, slot (i+1)%3 is
// pre-reset for the next iteration, and slot (i-1)%3 drives the loop guard.
// Presumably this staggering lets some threads reset one slot while others
// still read another without an extra barrier -- TODO confirm (the barrier
// calls and flag initialization are in lines lost by the extraction).
195 template <
class F,
class StopFun,
class M>
196 CUDA
int fixpoint(
int n,
const F& f,
const StopFun& must_stop, B<M>& has_changed) {
200 for(i = 1; changed[(i-1)%3] && !stop[(i-1)%3]; ++i) {
201 changed[i%3].join(iterate(n, f));
203 changed[(i+1)%3].meet(
false);
204 stop[i%3].join(must_stop());
// `changed[1]` records whether the first iteration changed something; `i > 2`
// means the loop ran more than the minimal number of turns.
210 has_changed.join(changed[1] || i > 2);
// Variant taking an extra callable `h`; its invocation is in lines lost by
// the extraction (presumably run between iterations -- TODO confirm).
215 template <
class F,
class Iter,
class StopFun>
216 CUDA
int fixpoint(
int n,
const F& f,
const Iter& h,
const StopFun& must_stop) {
220 for(i = 1; changed[(i-1)%3] && !stop[(i-1)%3]; ++i) {
221 changed[i%3].join(iterate(n, f));
223 changed[(i+1)%3].meet(
false);
224 stop[i%3].join(must_stop());
233 template <
class Alloc,
class F,
class Iter,
class StopFun>
234 CUDA INLINE
int fixpoint(
const battery::vector<int, Alloc>& indexes,
const F& f,
const Iter& h,
const StopFun& must_stop) {
235 return fixpoint(indexes.size(), [&](
int i) { return f(indexes[i]); }, h, must_stop);
// Convenience overload without an external change flag.  NOTE(review): the
// original line 241 was lost by the extraction; it presumably declares the
// local `has_changed` used below (e.g. `local::B has_changed(false);`) --
// TODO confirm.
239 template <
class F,
class StopFun>
240 CUDA INLINE
int fixpoint(
int n,
const F& f,
const StopFun& must_stop) {
242 return fixpoint(n, f, must_stop, has_changed);
246 template <
class F,
class M>
247 CUDA INLINE
int fixpoint(
int n,
const F& f, B<M>& has_changed) {
248 return fixpoint(n, f, [](){
return false; }, has_changed);
// Simplest overload: fixpoint of `f` over n functions.  NOTE(review): the
// original line 254 (presumably declaring the local `has_changed` used below)
// was lost by the extraction -- TODO confirm.
253 CUDA INLINE
int fixpoint(
int n,
const F& f) {
255 return fixpoint(n, f, has_changed);
259 template <
class Alloc,
class F,
class StopFun,
class M>
260 CUDA INLINE
int fixpoint(
const battery::vector<int, Alloc>& indexes,
const F& f,
const StopFun& must_stop, B<M>& has_changed) {
261 return fixpoint(indexes.size(), [&](
int i) { return f(indexes[i]); }, must_stop, has_changed);
// Subset fixpoint without an external change flag.  NOTE(review): the
// original line 267 (presumably declaring the local `has_changed` used below)
// was lost by the extraction -- TODO confirm.
265 template <
class Alloc,
class F,
class StopFun>
266 CUDA INLINE
int fixpoint(
const battery::vector<int, Alloc>& indexes,
const F& f,
const StopFun& must_stop) {
268 return fixpoint(indexes, f, must_stop, has_changed);
272 template <
class Alloc,
class F,
class M>
273 CUDA INLINE
int fixpoint(
const battery::vector<int, Alloc>& indexes,
const F& f, B<M>& has_changed) {
274 return fixpoint(indexes, f, [](){
return false; }, has_changed);
// Simplest subset overload.  NOTE(review): the original line 280 (presumably
// declaring the local `has_changed` used below) was lost by the extraction --
// TODO confirm.
278 template <
class Alloc,
class F>
279 CUDA INLINE
int fixpoint(
const battery::vector<int, Alloc>& indexes,
const F& f) {
281 return fixpoint(indexes, f, has_changed);
// ---------------------------------------------------------------------------
// Fragment of `AsynchronousIterationGPU<Group>`: iterator engine distributing
// the n functions over the threads of a cooperative group (block or grid).
// ---------------------------------------------------------------------------
290 template <
class Group>
291 class AsynchronousIterationGPU :
public AsynchronousFixpoint<AsynchronousIterationGPU<Group>> {
293 using group_type = Group;
// Host-side fallback: this engine is device-only, so the host path only
// reports the misuse (further statements, if any, are in lost lines).
297 CUDA
void assert_cuda_arch()
const {
298 printf(
"AsynchronousIterationGPU must be used on the GPU device only.\n");
303 CUDA AsynchronousIterationGPU(
const Group& group):
307 CUDA
void reset()
const {}
// Thread of rank 0 in the cooperative group is the designated first thread.
309 CUDA INLINE
bool is_thread0()
const {
310 #ifndef __CUDA_ARCH__
314 return group.thread_rank() == 0;
319 CUDA INLINE
void barrier() {
320 #ifndef __CUDA_ARCH__
// One pass over the n functions: group-stride loop, thread t handles
// t, t + num_threads, t + 2*num_threads, ...
333 CUDA INLINE
bool iterate(
int n,
const F& f)
const {
334 #ifndef __CUDA_ARCH__
338 bool has_changed =
false;
339 for (
int i = group.thread_rank(); i < n; i += group.num_threads()) {
// Grid-wide asynchronous fixpoint.  NOTE(review): `grid_group` barriers are
// only valid under a cooperative kernel launch -- confirm launch mode at the
// call sites.
347 using GridAsynchronousFixpointGPU = AsynchronousIterationGPU<cooperative_groups::grid_group>;
// ---------------------------------------------------------------------------
// Fragment of `BlockAsynchronousFixpointGPU<syncwarp>`: block-local iterator
// engine.  When `syncwarp` is set, the iteration space is padded so every
// lane of each warp keeps calling `f` (excess lanes redo f(n-1)), presumably
// to keep warp-synchronous code inside `f` valid -- TODO confirm.
// ---------------------------------------------------------------------------
353 template <
bool syncwarp = false>
354 class BlockAsynchronousFixpointGPU :
public AsynchronousFixpoint<BlockAsynchronousFixpointGPU<syncwarp>> {
356 CUDA
void assert_cuda_arch()
const {
357 printf(
"BlockAsynchronousFixpointGPU must be used on the GPU device only.\n");
362 BlockAsynchronousFixpointGPU() =
default;
364 CUDA INLINE
bool is_thread0()
const {
365 #ifndef __CUDA_ARCH__
369 return threadIdx.x == 0;
373 CUDA INLINE
void barrier() {
374 #ifndef __CUDA_ARCH__
388 CUDA INLINE
bool iterate(
int n,
const F& f)
const {
389 #ifndef __CUDA_ARCH__
393 bool has_changed =
false;
// NOTE(review): when n % 32 == 0 the padding below yields n2 = n + 32 (an
// extra full warp of redundant f(n-1) calls) because 32-(n%32) is then 32,
// not 0 -- looks unintended; confirm against upstream before changing.
394 int n2 = syncwarp ? max(n,n+(32-(n%32))) : n;
395 for (
int i = threadIdx.x; i < n2; i += blockDim.x) {
396 has_changed |= f(syncwarp ? (i >= n ? n-1 : i) : i);
// Fragment of `warp_fixpoint<TPB>`: repeatedly applies deduction `i` of
// abstract domain `a` until the calling warp's convergence flag stays false.
// One flag per warp lives in shared memory (TPB/32 warps per block of TPB
// threads).  NOTE(review): the flag is written and read by warp lanes without
// an explicit synchronization in the visible lines; presumably the lines lost
// by the extraction contain __syncwarp() (or the code relies on
// warp-synchronous execution) -- confirm against upstream.
409 template <
int TPB,
class A>
410 __device__
local::B warp_fixpoint(A& a,
int i) {
411 auto ded = a.load_deduce(i);
413 __shared__
bool warp_changed[TPB/32];
414 int warp_id = threadIdx.x / 32;
415 warp_changed[warp_id] =
true;
416 while(warp_changed[warp_id]) {
418 warp_changed[warp_id] =
false;
424 warp_changed[warp_id] =
true;
// ---------------------------------------------------------------------------
// Fragment of `FixpointSubsetGPU`: fixpoint over a dynamically selected
// subset of indexes, compacted on-device with a CUB block-wide prefix sum.
// NOTE(review): the extraction split the template parameter name
// `FixpointEngine` across two lines below; kept byte-identical.
// ---------------------------------------------------------------------------
437 template <
class Fixpo
intEngine,
class Allocator,
int TPB>
438 class FixpointSubsetGPU {
440 using allocator_type = Allocator;
443 FixpointEngine fp_engine;
// Active indexes, selection mask, inclusive prefix sums, and a scratch copy
// used during compaction (`mask`/`sum` are padded to a multiple of the block
// size, see `round_multiple_TPB`).
446 battery::vector<int, allocator_type> indexes;
451 battery::vector<bool, allocator_type> mask;
454 battery::vector<int, allocator_type> sum;
457 battery::vector<int, allocator_type> indexes2;
// CUB block-wide scan over TPB threads and its required temporary storage.
460 using BlockScan = cub::BlockScan<int, TPB>;
461 typename BlockScan::TempStorage cub_prefixsum_tmp;
464 __device__ INLINE
int round_multiple_TPB(
int n) {
465 return n + ((blockDim.x - n % blockDim.x) % blockDim.x);
469 FixpointSubsetGPU() =
default;
// Re-activates the first n indexes.  Thread 0 does the size bookkeeping; the
// block-stride loop's body was lost by the extraction (presumably
// `indexes[i] = i;` -- TODO confirm).
471 __device__
void reset(
int n) {
472 if(threadIdx.x == 0) {
477 for(
int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
// Allocates the buffers (thread 0 only), padding `mask`/`sum` to a multiple
// of the block size; the block-stride loop then initializes `indexes` (its
// body is in lost lines).
482 __device__
void init(
int n,
const allocator_type& allocator = allocator_type()) {
483 if(threadIdx.x == 0) {
484 indexes = battery::vector<int, allocator_type>(n, allocator);
485 indexes2 = battery::vector<int, allocator_type>(n, allocator);
486 mask = battery::vector<bool, allocator_type>(round_multiple_TPB(n),
false, allocator);
487 sum = battery::vector<int, allocator_type>(round_multiple_TPB(n), 0, allocator);
490 for(
int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
// Frees the buffers by replacing them with empty vectors (thread 0 only;
// any trailing synchronization is in lost lines -- TODO confirm).
495 __device__
void destroy() {
496 if(threadIdx.x == 0) {
497 indexes = battery::vector<int, allocator_type>();
498 indexes2 = battery::vector<int, allocator_type>();
499 mask = battery::vector<bool, allocator_type>();
500 sum = battery::vector<int, allocator_type>();
505 CUDA INLINE
bool is_thread0()
const {
506 return fp_engine.is_thread0();
// Barrier delegated to the engine (its body is in a lost line, presumably
// `fp_engine.barrier();` -- TODO confirm).
509 CUDA INLINE
void barrier() {
// Iterate / fixpoint over the currently active subset of indexes.
// NOTE(review): the `template <class F>` headers of these two methods were
// lost by the extraction.
514 CUDA INLINE
bool iterate(
const F& f) {
515 return fp_engine.iterate(indexes, f);
519 CUDA INLINE
int fixpoint(
const F& f) {
520 return fp_engine.fixpoint(indexes, f);
523 template <
class F,
class StopFun>
524 CUDA INLINE
int fixpoint(
const F& f,
const StopFun& g) {
525 return fp_engine.fixpoint(indexes, f, g);
528 template <
class F,
class Iter,
class StopFun>
529 CUDA INLINE
int fixpoint(
const F& f,
const Iter& h,
const StopFun& g) {
530 return fp_engine.fixpoint(indexes, f, h, g);
533 template <
class F,
class StopFun,
class M>
534 CUDA INLINE
int fixpoint(
const F& f,
const StopFun& g, B<M>& has_changed) {
535 return fp_engine.fixpoint(indexes, f, g);
539 CUDA
int num_active()
const {
540 return indexes.size();
// Keeps only the indexes satisfying `g`, compacting `indexes` in place:
//  1) mark kept entries in `mask`;
//  2) CUB inclusive prefix sum of `mask` per TPB-sized tile into `sum`;
//  3) add the last value of the preceding tile to each later tile
//     (`sum[i - threadIdx.x - 1]` is the previous tile's last element);
//  4) thread 0 swaps indexes/indexes2 and resizes to the total kept count;
//  5) scatter kept entries: indexes[sum[i]-1] = indexes2[i].
// NOTE(review): the `if(mask[i])` guard for step 5 and the block barriers
// between steps are presumably in lines lost by the extraction -- confirm
// against upstream.
546 __device__
void select(
const G& g) {
547 assert(TPB == blockDim.x);
552 if(indexes.size() == 0) {
557 for(
int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
558 mask[i] = g(indexes[i]);
562 int n = round_multiple_TPB(indexes.size());
563 for(
int i = threadIdx.x; i < n; i += blockDim.x) {
564 BlockScan(cub_prefixsum_tmp).InclusiveSum(mask[i], sum[i]);
567 for(
int i = blockDim.x + threadIdx.x; i < n; i += blockDim.x) {
568 sum[i] += sum[i - threadIdx.x - 1];
573 if(threadIdx.x == 0) {
574 battery::swap(indexes, indexes2);
575 indexes.resize(sum[indexes2.size()-1]);
578 for(
int i = threadIdx.x; i < indexes2.size(); i += blockDim.x) {
580 indexes[sum[i]-1] = indexes2[i];
// Snapshot type: a plain vector holding a copy of the active indexes.
585 template <
class Alloc = allocator_type>
586 using snapshot_type = battery::vector<int, Alloc>;
588 template <
class Alloc = allocator_type>
589 CUDA snapshot_type<Alloc> snapshot(
const Alloc& alloc = Alloc())
const {
590 return snapshot_type<Alloc>(indexes, alloc);
// Restores a previously saved set of active indexes: block-stride parallel
// copy of the snapshot, then thread 0 shrinks `indexes` to the snapshot size.
// NOTE(review): the assert uses `<` but `resize(snap.size())` should only
// require `snap.size() <= indexes.capacity()` -- confirm whether the strict
// bound is intentional.
593 template <
class Alloc>
594 __device__
void restore_par(
const snapshot_type<Alloc>& snap) {
595 for(
int i = threadIdx.x; i < snap.size(); i += blockDim.x) {
596 indexes[i] = snap[i];
598 if(threadIdx.x == 0) {
599 assert(snap.size() < indexes.capacity());
600 indexes.resize(snap.size());
Definition: fixpoint.hpp:84
int fixpoint(const F &f)
Definition: fixpoint.hpp:107
bool iterate(const F &f)
Definition: fixpoint.hpp:102
void reset()
Definition: fixpoint.hpp:126
int fixpoint(const F &f, const StopFun &g)
Definition: fixpoint.hpp:112
FixpointSubsetCPU(int n)
Definition: fixpoint.hpp:95
int num_active() const
Definition: fixpoint.hpp:122
int snapshot_type
Definition: fixpoint.hpp:142
snapshot_type snapshot() const
Definition: fixpoint.hpp:144
int fixpoint(const F &f, const StopFun &g, B< M > &has_changed)
Definition: fixpoint.hpp:117
void select(const G &g)
Definition: fixpoint.hpp:133
void restore(const snapshot_type &snap)
Definition: fixpoint.hpp:148
Definition: fixpoint.hpp:21
CUDA int fixpoint(int n, const F &f, const StopFun &must_stop, B< M > &has_changed)
Definition: fixpoint.hpp:47
CUDA int fixpoint(int n, const F &f, B< M > &has_changed)
Definition: fixpoint.hpp:67
CUDA int fixpoint(int n, const F &f)
Definition: fixpoint.hpp:73
CUDA int fixpoint(int n, const F &f, const StopFun &must_stop)
Definition: fixpoint.hpp:60
CUDA void barrier()
Definition: fixpoint.hpp:23
CUDA local::B iterate(int n, const F &f) const
Definition: fixpoint.hpp:31
::lala::B<::battery::local_memory > B
Definition: b.hpp:12
Definition: abstract_deps.hpp:14