Lattice Land Core Library
fixpoint.hpp
// Copyright 2022 Pierre Talbot

#ifndef LALA_CORE_FIXPOINT_HPP
#define LALA_CORE_FIXPOINT_HPP

#include "logic/logic.hpp"
#include "b.hpp"
#include "battery/memory.hpp"
#include "battery/vector.hpp"

#ifdef __CUDACC__
  #include <cooperative_groups.h>
  #include <cub/block/block_scan.cuh>
#endif

namespace lala {

/** A simple form of sequential fixpoint computation based on the Kleene fixpoint.
 * At each iteration, the deduction operations \f$ f_1, \ldots, f_n \f$ are composed by functional composition \f$ f = f_n \circ \ldots \circ f_1 \f$.
 * This strategy essentially corresponds to the Gauss-Seidel iteration method. */
class GaussSeidelIteration {
public:
  CUDA void barrier() {}

  /** We call the function `f` `n` times in sequence: \f$ f(0); f(1); \ldots; f(n-1) \f$.
   * \param `n` the number of calls to `f`.
   * \param `bool f(size_t i)` returns `true` if something has changed for `i`.
   * \return `true` if `f(i)` returned `true` for some `i`, `false` otherwise.
   */
  template <class F>
  CUDA local::B iterate(size_t n, const F& f) const {
    bool has_changed = false;
    for(size_t i = 0; i < n; ++i) {
      has_changed |= f(i);
    }
    return has_changed;
  }

  /** We execute `iterate(n, f)` until we reach a fixpoint or `must_stop()` returns `true`.
   * \param `n` the number of calls to `f`.
   * \param `bool f(size_t i)` returns `true` if something has changed for `i`.
   * \param `bool must_stop()` returns `true` if we must stop the fixpoint computation early.
   * \param `has_changed` is set to `true` if we were not yet at a fixpoint.
   * \return The number of iterations performed until a fixpoint is reached or `must_stop()` returns `true`.
   */
  template <class F, class StopFun, class M>
  CUDA size_t fixpoint(size_t n, const F& f, const StopFun& must_stop, B<M>& has_changed) {
    size_t iterations = 0;
    local::B changed(true);
    while(changed && !must_stop()) {
      changed = iterate(n, f);
      has_changed.join(changed);
      iterations++;
    }
    return iterations;
  }

  /** Same as `fixpoint` above without `has_changed`. */
  template <class F, class StopFun>
  CUDA size_t fixpoint(size_t n, const F& f, const StopFun& must_stop) {
    local::B has_changed(false);
    return fixpoint(n, f, must_stop, has_changed);
  }

  /** Same as `fixpoint` above with `must_stop` always returning `false`. */
  template <class F, class M>
  CUDA size_t fixpoint(size_t n, const F& f, B<M>& has_changed) {
    return fixpoint(n, f, [](){ return false; }, has_changed);
  }

  /** Same as `fixpoint` above without `has_changed` and with `must_stop` always returning `false`. */
  template <class F>
  CUDA size_t fixpoint(size_t n, const F& f) {
    local::B has_changed(false);
    return fixpoint(n, f, has_changed);
  }
};

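/* Illustrative sketch (not part of the library): a minimal use of `GaussSeidelIteration`.
 * `props` and `deduce()` are hypothetical names for a container of deduction operations
 * whose `deduce()` returns `true` when the abstract element changed.
 *
 *   GaussSeidelIteration seq_engine;
 *   // One sequential pass over all operations.
 *   local::B changed = seq_engine.iterate(props.size(), [&](size_t i) { return props[i].deduce(); });
 *   // Repeat passes until no operation changes anything anymore.
 *   size_t iterations = seq_engine.fixpoint(props.size(), [&](size_t i) { return props[i].deduce(); });
 */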

/** Add the ability to deactivate functions in a fixpoint computation.
 * Given a predicate `g`, we keep only the functions \f$ f_{i_1} ; \ldots ; f_{i_k} \f$ for which \f$ g(i_j) \f$ is `true`; subsequent fixpoint computations are performed on this subset only.
 */
template <class FixpointEngine>
class FixpointSubsetCPU {
private:
  FixpointEngine fp_engine;

  /** The indexes of all functions. */
  battery::vector<int> indexes;

  /** The active functions are `indexes[0..n-1]`. */
  size_t n;

public:
  FixpointSubsetCPU(size_t n) : indexes(n), n(n) {
    for(int i = 0; i < n; ++i) {
      indexes[i] = i;
    }
  }

  template <class F>
  bool iterate(const F& f) {
    return fp_engine.iterate(n, [&](size_t i) { return f(indexes[i]); });
  }

  template <class F>
  size_t fixpoint(const F& f) {
    return fp_engine.fixpoint(n, [&](size_t i) { return f(indexes[i]); });
  }

  template <class F, class StopFun>
  size_t fixpoint(const F& f, const StopFun& g) {
    return fp_engine.fixpoint(n, [&](size_t i) { return f(indexes[i]); }, g);
  }

  template <class F, class StopFun, class M>
  size_t fixpoint(const F& f, const StopFun& g, B<M>& has_changed) {
    return fp_engine.fixpoint(n, [&](size_t i) { return f(indexes[i]); }, g, has_changed);
  }

  /** \return the number of active functions. */
  size_t num_active() const {
    return n;
  }

  /** Reactivate all functions. */
  void reset() {
    n = indexes.size();
  }

  /** Compute the subset of the functions that are still active.
   * Subsequent calls to `fixpoint` will only consider the functions `f_i` for which `g(i)` is `true`. */
  template <class G>
  void select(const G& g) {
    for(int i = 0; i < n; ++i) {
      if(!g(indexes[i])) {
        battery::swap(indexes[i], indexes[--n]);
        i--;
      }
    }
  }

  using snapshot_type = size_t;

  snapshot_type snapshot() const {
    return snapshot_type(n);
  }

  void restore(const snapshot_type& snap) {
    n = snap;
  }
};

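/* Illustrative sketch (not part of the library): deactivating entailed operations on the CPU.
 * `props`, `deduce()` and `entailed()` are hypothetical names.
 *
 *   FixpointSubsetCPU<GaussSeidelIteration> fps(props.size());
 *   fps.fixpoint([&](size_t i) { return props[i].deduce(); });
 *   auto snap = fps.snapshot();                                  // Remember the current subset.
 *   fps.select([&](size_t i) { return !props[i].entailed(); });  // Keep only the non-entailed operations.
 *   fps.fixpoint([&](size_t i) { return props[i].deduce(); });   // Fixpoint over the active subset only.
 *   fps.restore(snap);                                           // Reactivate the operations of the snapshot.
 */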
#ifdef __CUDACC__

/** This fixpoint engine is parametrized by an iterator engine `I` (used through CRTP) providing a method `iterate(n, f)`, a synchronization method `barrier()` and a method `is_thread0()` returning `true` for exactly one of the participating threads.
 * `AsynchronousFixpoint` provides the `fixpoint` functions on top of `I::iterate`. */
template <class IteratorEngine>
class AsynchronousFixpoint {
  /** We do not use atomics because tearing is seemingly not possible in CUDA (according to information given by NVIDIA engineers during a hackathon). */
  local::B changed[3];
  local::B stop[3];

  CUDA void reset() {
    if(is_thread0()) {
      changed[0] = true;
      changed[1] = false;
      changed[2] = false;
      for(int i = 0; i < 3; ++i) {
        stop[i] = false;
      }
    }
  }

public:
  CUDA INLINE bool is_thread0() {
    return static_cast<IteratorEngine*>(this)->is_thread0();
  }

  CUDA INLINE void barrier() {
    static_cast<IteratorEngine*>(this)->barrier();
  }

  template <class F>
  CUDA INLINE local::B iterate(size_t n, const F& f) const {
    return static_cast<const IteratorEngine*>(this)->iterate(n, f);
  }

  /** We execute `I::iterate(n, f)` until we reach a fixpoint or `must_stop()` returns `true`.
   * \param `n` the number of calls to `f`.
   * \param `bool f(size_t i)` returns `true` if something has changed for `i`.
   * \param `bool must_stop()` returns `true` if we must stop the fixpoint computation early. This function is called by the first thread only.
   * \param `has_changed` is set to `true` if we were not yet at a fixpoint.
   * \return The number of iterations performed until a fixpoint is reached or `must_stop()` returns `true`.
   */
  template <class F, class StopFun, class M>
  CUDA size_t fixpoint(size_t n, const F& f, const StopFun& must_stop, B<M>& has_changed) {
    reset();
    barrier();
    size_t i;
    for(i = 1; changed[(i-1)%3] && !stop[(i-1)%3]; ++i) {
      changed[i%3].join(iterate(n, f));
      if(is_thread0()) {
        changed[(i+1)%3].meet(false); // Reinitialize `changed` for the next iteration.
        stop[i%3].join(must_stop());
      }
      barrier();
    }
    // Something changed if we performed more than one iteration, or if the first iteration changed the abstract element.
    if(is_thread0()) {
      has_changed.join(changed[1] || i > 2);
    }
    return i - 1;
  }

  /** Same as `fixpoint` above without `has_changed`. */
  template <class F, class StopFun>
  CUDA INLINE size_t fixpoint(size_t n, const F& f, const StopFun& must_stop) {
    local::B has_changed(false);
    return fixpoint(n, f, must_stop, has_changed);
  }

  /** Same as `fixpoint` above with `must_stop` always returning `false`. */
  template <class F, class M>
  CUDA INLINE size_t fixpoint(size_t n, const F& f, B<M>& has_changed) {
    return fixpoint(n, f, [](){ return false; }, has_changed);
  }

  /** Same as `fixpoint` above without `has_changed` and with `must_stop` always returning `false`. */
  template <class F>
  CUDA INLINE size_t fixpoint(size_t n, const F& f) {
    local::B has_changed(false);
    return fixpoint(n, f, has_changed);
  }

  /** Same as `fixpoint` with a new function defined by `g(i) = f(indexes[i])` and `n = indexes.size()`. */
  template <class Alloc, class F, class StopFun, class M>
  CUDA INLINE size_t fixpoint(const battery::vector<int, Alloc>& indexes, const F& f, const StopFun& must_stop, B<M>& has_changed) {
    return fixpoint(indexes.size(), [&](size_t i) { return f(indexes[i]); }, must_stop, has_changed);
  }

  /** Same as `fixpoint` with `g(i) = f(indexes[i])` and `n = indexes.size()`, without `has_changed`. */
  template <class Alloc, class F, class StopFun>
  CUDA INLINE size_t fixpoint(const battery::vector<int, Alloc>& indexes, const F& f, const StopFun& must_stop) {
    local::B has_changed(false);
    return fixpoint(indexes, f, must_stop, has_changed);
  }

  /** Same as `fixpoint` above with `must_stop` always returning `false`. */
  template <class Alloc, class F, class M>
  CUDA INLINE size_t fixpoint(const battery::vector<int, Alloc>& indexes, const F& f, B<M>& has_changed) {
    return fixpoint(indexes, f, [](){ return false; }, has_changed);
  }

  /** Same as `fixpoint` above without `has_changed` and with `must_stop` always returning `false`. */
  template <class Alloc, class F>
  CUDA INLINE size_t fixpoint(const battery::vector<int, Alloc>& indexes, const F& f) {
    local::B has_changed(false);
    return fixpoint(indexes, f, has_changed);
  }
};
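
/* Minimal sketch of the interface an `IteratorEngine` must provide to be usable with
 * `AsynchronousFixpoint` (CRTP), as required by the casts above; `AsynchronousIterationGPU`
 * and `BlockAsynchronousFixpointGPU` below are the concrete instances provided in this file.
 *
 *   class MyIteratorEngine : public AsynchronousFixpoint<MyIteratorEngine> {
 *   public:
 *     CUDA INLINE bool is_thread0() const;  // `true` for exactly one participating thread.
 *     CUDA INLINE void barrier();           // Synchronize all participating threads.
 *     template <class F>
 *     CUDA INLINE bool iterate(size_t n, const F& f) const;
 *   };
 */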

/** A simple form of fixpoint computation based on the Kleene fixpoint.
 * At each iteration, the functions \f$ f_1, \ldots, f_n \f$ are composed by parallel composition \f$ f = f_1 \| \ldots \| f_n \f$, meaning they are executed in parallel by different threads.
 * This is called an asynchronous iteration and is due to (Cousot, Asynchronous iterative methods for solving a fixed point system of monotone equations in a complete lattice, 1977).
 * \tparam Group is a CUDA cooperative group class (note that we provide a more efficient implementation for block groups below in `BlockAsynchronousFixpointGPU`).
*/
template <class Group>
class AsynchronousIterationGPU : public AsynchronousFixpoint<AsynchronousIterationGPU<Group>> {
public:
  using group_type = Group;
private:
  Group group;

  CUDA void assert_cuda_arch() const {
    printf("AsynchronousIterationGPU must be used on the GPU device only.\n");
    assert(0);
  }

public:
  CUDA AsynchronousIterationGPU(const Group& group):
    group(group)
  {}

  CUDA void reset() const {}

  CUDA INLINE bool is_thread0() const {
    #ifndef __CUDA_ARCH__
      assert_cuda_arch();
      return false;
    #else
      return group.thread_rank() == 0;
    #endif
  }

  /** A barrier used to synchronize the threads within the group between iterations. */
  CUDA INLINE void barrier() {
    #ifndef __CUDA_ARCH__
      assert_cuda_arch();
    #else
      group.sync();
    #endif
  }

  /** The function `f` is called `n` times in parallel: \f$ f(0) \| f(1) \| \ldots \| f(n-1) \f$.
   * \param `n` the number of calls to `f`.
   * \param `bool f(size_t i)` returns `true` if something has changed for `i`.
   * \return `true` if `f(i)` returned `true` for some `i`, `false` otherwise.
   */
  template <class F>
  CUDA INLINE bool iterate(size_t n, const F& f) const {
    #ifndef __CUDA_ARCH__
      assert_cuda_arch();
      return false;
    #else
      bool has_changed = false;
      for (size_t i = group.thread_rank(); i < n; i += group.num_threads()) {
        has_changed |= f(i);
      }
      return has_changed;
    #endif
  }
};

using GridAsynchronousFixpointGPU = AsynchronousIterationGPU<cooperative_groups::grid_group>;

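/* Illustrative sketch (not part of the library): a grid-wide fixpoint.
 * The engine object must be shared by all threads (its `changed` and `stop` flags are the
 * communication medium), so we construct it in pre-allocated global memory; `Props`, `deduce()`
 * and `fp` are hypothetical names. Because `grid_group::sync()` is used, the kernel must be
 * launched cooperatively (e.g. with `cudaLaunchCooperativeKernel`).
 *
 *   __global__ void grid_fixpoint_kernel(GridAsynchronousFixpointGPU* fp, Props* props, size_t n) {
 *     auto grid = cooperative_groups::this_grid();
 *     if(grid.thread_rank() == 0) { new(fp) GridAsynchronousFixpointGPU(grid); }
 *     grid.sync();
 *     fp->fixpoint(n, [&](size_t i) { return props[i].deduce(); });
 *   }
 */
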
/** An optimized version of `AsynchronousIterationGPU` when the fixpoint is computed on a single block.
 * We avoid the use of cooperative groups, which take extra memory space.
 */
class BlockAsynchronousFixpointGPU : public AsynchronousFixpoint<BlockAsynchronousFixpointGPU> {
private:
  CUDA void assert_cuda_arch() const {
    printf("BlockAsynchronousFixpointGPU must be used on the GPU device only.\n");
    assert(0);
  }

public:
  BlockAsynchronousFixpointGPU() = default;

  CUDA INLINE bool is_thread0() const {
    #ifndef __CUDA_ARCH__
      assert_cuda_arch();
      return false;
    #else
      return threadIdx.x == 0;
    #endif
  }

  CUDA INLINE void barrier() {
    #ifndef __CUDA_ARCH__
      assert_cuda_arch();
    #else
      __syncthreads();
    #endif
  }

  /** The function `f` is called `n` times in parallel: \f$ f(0) \| f(1) \| \ldots \| f(n-1) \f$.
   * If `n` is greater than the number of threads in the block, we perform a strided loop, without synchronization between two iterations.
   * \param `n` the number of calls to `f`.
   * \param `bool f(size_t i)` returns `true` if something has changed for `i`.
   * \return `true` if `f(i)` returned `true` for some `i`, `false` otherwise.
   */
  template <class F>
  CUDA INLINE bool iterate(size_t n, const F& f) const {
    #ifndef __CUDA_ARCH__
      assert_cuda_arch();
      return false;
    #else
      bool has_changed = false;
      for (size_t i = threadIdx.x; i < n; i += blockDim.x) {
        has_changed |= f(i);
      }
      return has_changed;
    #endif
  }
};

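/* Illustrative sketch (not part of the library): a single block computing a fixpoint.
 * As above, all threads of the block must share one engine object; here `fp` points to a
 * default-constructed engine in global or managed memory. `Props` and `deduce()` are hypothetical.
 *
 *   __global__ void block_fixpoint_kernel(BlockAsynchronousFixpointGPU* fp, Props* props, size_t n) {
 *     fp->fixpoint(n, [&](size_t i) { return props[i].deduce(); });
 *   }
 */
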
/** Add the ability to deactivate functions in a fixpoint computation.
 * Given a predicate `g`, we keep only the functions \f$ f_{i_1} \| \ldots \| f_{i_k} \f$ for which \f$ g(i_j) \f$ is `true`; subsequent fixpoint computations are performed on this subset only.
 */
template <class FixpointEngine, class Allocator, size_t TPB>
class FixpointSubsetGPU {
public:
  using allocator_type = Allocator;

private:
  FixpointEngine fp_engine;

  /** The indexes of the functions that are active. */
  battery::vector<int, allocator_type> indexes;

  /** A mask to know which functions are still active.
   * We have `mask[i] <=> g(indexes[i])`.
   */
  battery::vector<bool, allocator_type> mask;

  /** A temporary array to compute the prefix sum of `mask`, in order to copy indexes into `indexes2`. */
  battery::vector<int, allocator_type> sum;

  /** A temporary array when copying the new active functions. */
  battery::vector<int, allocator_type> indexes2;

  /** The CUB prefix sum temporary storage. */
  using BlockScan = cub::BlockScan<int, TPB>;
  typename BlockScan::TempStorage cub_prefixsum_tmp;

  // We round `n` up to the next multiple of `blockDim.x` (which is expected to be equal to TPB).
  __device__ INLINE size_t round_multiple_TPB(size_t n) {
    return n + ((blockDim.x - n % blockDim.x) % blockDim.x);
  }

public:
  FixpointSubsetGPU() = default;

  __device__ void reset(size_t n) {
    if(threadIdx.x == 0) {
      indexes.resize(n);
      indexes2.resize(n);
    }
    __syncthreads();
    for(int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
      indexes[i] = i;
    }
  }

  __device__ void init(size_t n, const allocator_type& allocator = allocator_type()) {
    if(threadIdx.x == 0) {
      indexes = battery::vector<int, allocator_type>(n, allocator);
      indexes2 = battery::vector<int, allocator_type>(n, allocator);
      mask = battery::vector<bool, allocator_type>(round_multiple_TPB(n), false, allocator);
      sum = battery::vector<int, allocator_type>(round_multiple_TPB(n), 0, allocator);
    }
    __syncthreads();
    for(int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
      indexes[i] = i;
    }
  }

  __device__ void destroy() {
    if(threadIdx.x == 0) {
      indexes = battery::vector<int, allocator_type>();
      indexes2 = battery::vector<int, allocator_type>();
      mask = battery::vector<bool, allocator_type>();
      sum = battery::vector<int, allocator_type>();
    }
    __syncthreads();
  }

  CUDA INLINE bool is_thread0() const {
    return fp_engine.is_thread0();
  }

  CUDA INLINE void barrier() {
    fp_engine.barrier();
  }

  template <class F>
  CUDA INLINE bool iterate(const F& f) {
    return fp_engine.iterate(indexes, f);
  }

  template <class F>
  CUDA INLINE size_t fixpoint(const F& f) {
    return fp_engine.fixpoint(indexes, f);
  }

  template <class F, class StopFun>
  CUDA INLINE size_t fixpoint(const F& f, const StopFun& g) {
    return fp_engine.fixpoint(indexes, f, g);
  }

  template <class F, class StopFun, class M>
  CUDA INLINE size_t fixpoint(const F& f, const StopFun& g, B<M>& has_changed) {
    return fp_engine.fixpoint(indexes, f, g, has_changed);
  }

  /** \return the number of active functions. */
  CUDA size_t num_active() const {
    return indexes.size();
  }

  /** Compute the subset of the functions that are still active.
   * Subsequent calls to `fixpoint` will only consider the functions `f_i` for which `g(i)` is `true`. */
  template <class G>
  __device__ void select(const G& g) {
    assert(TPB == blockDim.x);
    // indexes:  0 1 2 3 (indexes of the propagators)
    // mask:     1 0 0 1 (filtering entailed functions)
    // sum:      1 1 1 2 (inclusive prefix sum)
    // indexes2: 0 3     (new indexes of the propagators)
    if(indexes.size() == 0) {
      return;
    }

    /** I. We perform a parallel map to detect the active functions. */
    for(int i = threadIdx.x; i < indexes.size(); i += blockDim.x) {
      mask[i] = g(indexes[i]);
    }

    /** II. We then compute the prefix sum of the mask in order to compute the new indexes of the active functions. */
    size_t n = round_multiple_TPB(indexes.size());
    for(int i = threadIdx.x; i < n; i += blockDim.x) {
      BlockScan(cub_prefixsum_tmp).InclusiveSum(mask[i], sum[i]);
      __syncthreads(); // Required by BlockScan to reuse the temporary storage.
    }
    for(int i = blockDim.x + threadIdx.x; i < n; i += blockDim.x) {
      sum[i] += sum[i - threadIdx.x - 1];
      __syncthreads();
    }

    /** III. We compute the new indexes of the active functions. */
    if(threadIdx.x == 0) {
      battery::swap(indexes, indexes2);
      indexes.resize(sum[indexes2.size()-1]);
    }
    __syncthreads();
    for(int i = threadIdx.x; i < indexes2.size(); i += blockDim.x) {
      if(mask[i]) {
        indexes[sum[i]-1] = indexes2[i];
      }
    }
  }

  template <class Alloc = allocator_type>
  using snapshot_type = battery::vector<int, Alloc>;

  template <class Alloc = allocator_type>
  CUDA snapshot_type<Alloc> snapshot(const Alloc& alloc = Alloc()) const {
    return snapshot_type<Alloc>(indexes, alloc);
  }

  template <class Alloc>
  __device__ void restore_par(const snapshot_type<Alloc>& snap) {
    for(int i = threadIdx.x; i < snap.size(); i += blockDim.x) {
      indexes[i] = snap[i];
    }
    if(threadIdx.x == 0) {
      assert(snap.size() < indexes.capacity());
      indexes.resize(snap.size());
    }
  }
};

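/* Illustrative sketch (not part of the library): deactivating entailed operations on the GPU,
 * on a single block. `Alloc`, `TPB`, `Props`, `deduce()` and `entailed()` are hypothetical names;
 * `fps` points to a shared engine object and `TPB` must equal `blockDim.x`.
 *
 *   __global__ void subset_fixpoint_kernel(FixpointSubsetGPU<BlockAsynchronousFixpointGPU, Alloc, TPB>* fps, Props* props, size_t n) {
 *     fps->init(n);
 *     fps->fixpoint([&](size_t i) { return props[i].deduce(); });
 *     fps->select([&](size_t i) { return !props[i].entailed(); });  // Keep only the non-entailed operations.
 *     fps->fixpoint([&](size_t i) { return props[i].deduce(); });   // Fixpoint over the active subset only.
 *     fps->destroy();
 *   }
 */
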
#endif

} // namespace lala

#endif