3#ifndef LALA_CORE_FIXPOINT_HPP
4#define LALA_CORE_FIXPOINT_HPP
8#include "battery/memory.hpp"
9#include "battery/vector.hpp"
12 #include <cooperative_groups.h>
13 #include <cub/block/block_scan.cuh>
32 bool has_changed =
false;
33 for(
int i = 0; i < n; ++i) {
46 template <
class F,
class StopFun,
class M>
47 CUDA
int fixpoint(
int n,
const F& f,
const StopFun& must_stop,
B<M>& has_changed) {
50 while(changed && !must_stop()) {
52 has_changed.join(changed);
59 template <
class F,
class StopFun>
60 CUDA
int fixpoint(
int n,
const F& f,
const StopFun& must_stop) {
62 return fixpoint(n, f, must_stop, has_changed);
66 template <
class F,
class M>
68 return fixpoint(n, f, [](){
return false; }, has_changed);
83template <
class Fixpo
intEngine>
86 FixpointEngine fp_engine;
89 battery::vector<int> indexes;
96 for(
int i = 0; i < n; ++i) {
103 return fp_engine.iterate(n, [&](
int i) {
return f(indexes[i]); });
108 return fp_engine.fixpoint(n, [&](
int i) {
return f(indexes[i]); });
111 template <
class F,
class StopFun>
113 return fp_engine.fixpoint(n, [&](
int i) {
return f(indexes[i]); }, g);
116 template <
class F,
class StopFun,
class M>
118 return fp_engine.fixpoint(n, [&](
int i) {
return f(indexes[i]); }, g);
134 for(
int i = 0; i < n; ++i) {
136 battery::swap(indexes[i], indexes[--n]);
157template <
class IteratorEngine>
158class AsynchronousFixpoint {
168 for(
int i = 0; i < 3; ++i) {
175 CUDA INLINE
bool is_thread0() {
176 return static_cast<IteratorEngine*
>(
this)->is_thread0();
179 CUDA INLINE
void barrier() {
180 static_cast<IteratorEngine*
>(
this)->barrier();
184 CUDA INLINE local::B iterate(
int n,
const F& f)
const {
185 return static_cast<const IteratorEngine*
>(
this)->iterate(n, f);
195 template <
class F,
class StopFun,
class M>
196 CUDA
int fixpoint(
int n,
const F& f,
const StopFun& must_stop, B<M>& has_changed) {
200 for(i = 1; changed[(i-1)%3] && !stop[(i-1)%3]; ++i) {
201 changed[i%3].join(iterate(n, f));
203 changed[(i+1)%3].meet(
false);
204 stop[i%3].join(must_stop());
210 has_changed.join(changed[1] || i > 2);
215 template <
class F,
class Iter,
class StopFun>
216 CUDA
int fixpoint(
int n,
const F& f,
const Iter& h,
const StopFun& must_stop) {
220 for(i = 1; changed[(i-1)%3] && !stop[(i-1)%3]; ++i) {
221 changed[i%3].join(iterate(n, f));
223 changed[(i+1)%3].meet(
false);
224 stop[i%3].join(must_stop());
233 template <
class Alloc,
class F,
class Iter,
class StopFun>
234 CUDA INLINE
int fixpoint(
const battery::vector<int, Alloc>& indexes,
const F& f,
const Iter& h,
const StopFun& must_stop) {
235 return fixpoint(indexes.size(), [&](
int i) { return f(indexes[i]); }, h, must_stop);
239 template <
class F,
class StopFun>
240 CUDA INLINE
int fixpoint(
int n,
const F& f,
const StopFun& must_stop) {
241 local::B has_changed(
false);
242 return fixpoint(n, f, must_stop, has_changed);
246 template <
class F,
class M>
247 CUDA INLINE
int fixpoint(
int n,
const F& f, B<M>& has_changed) {
248 return fixpoint(n, f, [](){
return false; }, has_changed);
253 CUDA INLINE
int fixpoint(
int n,
const F& f) {
254 local::B has_changed(
false);
255 return fixpoint(n, f, has_changed);
259 template <
class Alloc,
class F,
class StopFun,
class M>
260 CUDA INLINE
int fixpoint(
const battery::vector<int, Alloc>& indexes,
const F& f,
const StopFun& must_stop, B<M>& has_changed) {
261 return fixpoint(indexes.size(), [&](
int i) { return f(indexes[i]); }, must_stop, has_changed);
265 template <
class Alloc,
class F,
class StopFun>
266 CUDA INLINE
int fixpoint(
const battery::vector<int, Alloc>& indexes,
const F& f,
const StopFun& must_stop) {
267 local::B has_changed(
false);
268 return fixpoint(indexes, f, must_stop, has_changed);
272 template <
class Alloc,
class F,
class M>
273 CUDA INLINE
int fixpoint(
const battery::vector<int, Alloc>& indexes,
const F& f, B<M>& has_changed) {
274 return fixpoint(indexes, f, [](){
return false; }, has_changed);
278 template <
class Alloc,
class F>
279 CUDA INLINE
int fixpoint(
const battery::vector<int, Alloc>& indexes,
const F& f) {
280 local::B has_changed(
false);
281 return fixpoint(indexes, f, has_changed);
290template <
class Group>
291class AsynchronousIterationGPU :
public AsynchronousFixpoint<AsynchronousIterationGPU<Group>> {
293 using group_type = Group;
297 CUDA
void assert_cuda_arch()
const {
298 printf(
"AsynchronousIterationGPU must be used on the GPU device only.\n");
303 CUDA AsynchronousIterationGPU(
const Group& group):
307 CUDA
void reset()
const {}
309 CUDA INLINE
bool is_thread0()
const {
310 #ifndef __CUDA_ARCH__
314 return group.thread_rank() == 0;
319 CUDA INLINE
void barrier() {
320 #ifndef __CUDA_ARCH__
333 CUDA INLINE
bool iterate(
int n,
const F& f)
const {
334 #ifndef __CUDA_ARCH__
338 bool has_changed =
false;
339 for (
int i = group.thread_rank(); i < n; i += group.num_threads()) {
347using GridAsynchronousFixpointGPU = AsynchronousIterationGPU<cooperative_groups::grid_group>;
353template <
bool syncwarp = false>
354class BlockAsynchronousFixpointGPU :
public AsynchronousFixpoint<BlockAsynchronousFixpointGPU<syncwarp>> {
356 CUDA
void assert_cuda_arch()
const {
357 printf(
"BlockAsynchronousFixpointGPU must be used on the GPU device only.\n");
362 BlockAsynchronousFixpointGPU() =
default;
364 CUDA INLINE
bool is_thread0()
const {
365 #ifndef __CUDA_ARCH__
369 return threadIdx.x == 0;
373 CUDA INLINE
void barrier() {
374 #ifndef __CUDA_ARCH__
388 CUDA INLINE
bool iterate(
int n,
const F& f)
const {
389 #ifndef __CUDA_ARCH__
393 bool has_changed =
false;
394 int n2 = syncwarp && n != 0 ? max(n,n+(32-(n%32))) : n;
395 for (
int i = threadIdx.x; i < n2; i += blockDim.x) {
396 has_changed |= f(syncwarp ? (i >= n ? n-1 : i) : i);
410template <
int TPB,
class A>
411__device__
local::B warp_fixpoint(A& a,
int i,
int* warp_iterations) {
412 auto ded = a.load_deduce(i);
414 __shared__
bool warp_changed[TPB/32];
415 int warp_id = threadIdx.x / 32;
416 warp_changed[warp_id] =
true;
417 while(warp_changed[warp_id]) {
419 warp_changed[warp_id] =
false;
425 warp_changed[warp_id] =
true;
428 if(threadIdx.x % 32 == 0) {
429 warp_iterations[warp_id]++;
441template <
class Fixpo
intEngine,
class Allocator,
int TPB>
442class FixpointSubsetGPU {
444 using allocator_type = Allocator;
447 FixpointEngine fp_engine;
450 battery::vector<int, allocator_type> indexes;
455 battery::vector<bool, allocator_type> mask;
458 battery::vector<int, allocator_type> sum;
461 battery::vector<int, allocator_type> indexes2;
464 using BlockScan = cub::BlockScan<int, TPB>;
465 typename BlockScan::TempStorage cub_prefixsum_tmp;
468 __device__ INLINE
int round_multiple_TPB(
int n) {
469 return n + ((blockDim.x - n % blockDim.x) % blockDim.x);
473 FixpointSubsetGPU() =
default;
475 __device__
void reset(
int n) {
476 if(threadIdx.x == 0) {
481 for(
int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
486 __device__
void init(
int n,
const allocator_type& allocator = allocator_type()) {
487 if(threadIdx.x == 0) {
488 indexes = battery::vector<int, allocator_type>(n, allocator);
489 indexes2 = battery::vector<int, allocator_type>(n, allocator);
490 mask = battery::vector<bool, allocator_type>(round_multiple_TPB(n),
false, allocator);
491 sum = battery::vector<int, allocator_type>(round_multiple_TPB(n), 0, allocator);
494 for(
int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
499 __device__
void destroy() {
500 if(threadIdx.x == 0) {
501 indexes = battery::vector<int, allocator_type>();
502 indexes2 = battery::vector<int, allocator_type>();
503 mask = battery::vector<bool, allocator_type>();
504 sum = battery::vector<int, allocator_type>();
509 CUDA INLINE
bool is_thread0()
const {
510 return fp_engine.is_thread0();
513 CUDA INLINE
void barrier() {
518 CUDA INLINE
bool iterate(
const F& f) {
519 return fp_engine.iterate(indexes, f);
523 CUDA INLINE
int fixpoint(
const F& f) {
524 return fp_engine.fixpoint(indexes, f);
527 template <
class F,
class StopFun>
528 CUDA INLINE
int fixpoint(
const F& f,
const StopFun& g) {
529 return fp_engine.fixpoint(indexes, f, g);
532 template <
class F,
class Iter,
class StopFun>
533 CUDA INLINE
int fixpoint(
const F& f,
const Iter& h,
const StopFun& g) {
534 return fp_engine.fixpoint(indexes, f, h, g);
537 template <
class F,
class StopFun,
class M>
538 CUDA INLINE
int fixpoint(
const F& f,
const StopFun& g, B<M>& has_changed) {
539 return fp_engine.fixpoint(indexes, f, g);
543 CUDA
int num_active()
const {
544 return indexes.size();
550 __device__
void select(
const G& g) {
551 assert(TPB == blockDim.x);
556 if(indexes.size() == 0) {
561 for(
int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
562 mask[i] = g(indexes[i]);
566 int n = round_multiple_TPB(indexes.size());
567 for(
int i = threadIdx.x; i < n; i += blockDim.x) {
568 BlockScan(cub_prefixsum_tmp).InclusiveSum(mask[i], sum[i]);
571 for(
int i = blockDim.x + threadIdx.x; i < n; i += blockDim.x) {
572 sum[i] += sum[i - threadIdx.x - 1];
577 if(threadIdx.x == 0) {
578 battery::swap(indexes, indexes2);
579 indexes.resize(sum[indexes2.size()-1]);
582 for(
int i = threadIdx.x; i < indexes2.size(); i += blockDim.x) {
584 indexes[sum[i]-1] = indexes2[i];
589 template <
class Alloc = allocator_type>
590 using snapshot_type = battery::vector<int, Alloc>;
592 template <
class Alloc = allocator_type>
593 CUDA snapshot_type<Alloc> snapshot(
const Alloc& alloc = Alloc())
const {
594 return snapshot_type<Alloc>(indexes, alloc);
597 template <
class Alloc>
598 __device__
void restore_par(
const snapshot_type<Alloc>& snap) {
599 for(
int i = threadIdx.x; i < snap.size(); i += blockDim.x) {
600 indexes[i] = snap[i];
602 if(threadIdx.x == 0) {
603 assert(snap.size() < indexes.capacity());
604 indexes.resize(snap.size());
Definition fixpoint.hpp:84
int fixpoint(const F &f)
Definition fixpoint.hpp:107
bool iterate(const F &f)
Definition fixpoint.hpp:102
void reset()
Definition fixpoint.hpp:126
int fixpoint(const F &f, const StopFun &g)
Definition fixpoint.hpp:112
FixpointSubsetCPU(int n)
Definition fixpoint.hpp:95
int num_active() const
Definition fixpoint.hpp:122
int snapshot_type
Definition fixpoint.hpp:142
snapshot_type snapshot() const
Definition fixpoint.hpp:144
int fixpoint(const F &f, const StopFun &g, B< M > &has_changed)
Definition fixpoint.hpp:117
void select(const G &g)
Definition fixpoint.hpp:133
void restore(const snapshot_type &snap)
Definition fixpoint.hpp:148
Definition fixpoint.hpp:21
CUDA int fixpoint(int n, const F &f, const StopFun &must_stop, B< M > &has_changed)
Definition fixpoint.hpp:47
CUDA int fixpoint(int n, const F &f, B< M > &has_changed)
Definition fixpoint.hpp:67
CUDA int fixpoint(int n, const F &f)
Definition fixpoint.hpp:73
CUDA int fixpoint(int n, const F &f, const StopFun &must_stop)
Definition fixpoint.hpp:60
CUDA void barrier()
Definition fixpoint.hpp:23
CUDA local::B iterate(int n, const F &f) const
Definition fixpoint.hpp:31
::lala::B<::battery::local_memory > B
Definition b.hpp:12
Definition abstract_deps.hpp:14