hmbdc
simplify-high-performance-messaging-programming
MemRingBuffer.hpp
1 #include "hmbdc/Copyright.hpp"
2 #pragma once
3 #include "hmbdc/pattern/LockFreeBufferMisc.hpp"
4 #include "hmbdc/Exception.hpp"
5 #include "hmbdc/Compile.hpp"
6 
7 #include <boost/smart_ptr/detail/yield_k.hpp>
8 
9 #include <iterator>
10 #include <thread>
11 #include <vector>
12 #include <limits>
13 #include <algorithm>
14 #include <mutex>
15 
16 #ifndef HMBDC_YIELD
17 #define HMBDC_YIELD(x) boost::detail::yield(x)
18 #endif
19 
20 namespace hmbdc { namespace pattern {
21 
22 namespace memringbuffer_detail {
23 struct my_spin_lock : std::atomic_flag {
24  my_spin_lock()
25  : std::atomic_flag{0} {
26  //clang has ATOMIC_FLAG_INIT as {0} not 0, so ...
27  constexpr int f[] = ATOMIC_FLAG_INIT;
28  static_assert(0 == f[0], "");
29  }
30  void lock() {
31  while (test_and_set(std::memory_order_acquire)) {
32  std::this_thread::yield();
33  }
34  }
35 
36  void unlock() {
37  clear(std::memory_order_release);
38  }
39 };
40 
41 template<uint16_t parallel_consumer_count, typename SeqT = size_t>
42 class MemRingBuffer {
43 public:
44  using Sequence = SeqT;
45  enum {PARALLEL_CONSUMER_COUNT = parallel_consumer_count,};
46 
47  const size_t CAPACITY;
48  const size_t VALUE_TYPE_SIZE;
49 private:
50 
51  const Sequence READ_SEQ_MAX;
52  const Sequence MASK;
53 
55  Sequence toBeClaimedSeq_
56  __attribute__((__aligned__(SMP_CACHE_BYTES)));
57  Sequence readSeq_[PARALLEL_CONSUMER_COUNT]
58  __attribute__((__aligned__(SMP_CACHE_BYTES)));
59  Sequence readSeqLastPurge_[PARALLEL_CONSUMER_COUNT]
60  __attribute__((__aligned__(SMP_CACHE_BYTES)));
61 
62  my_spin_lock locks_[PARALLEL_CONSUMER_COUNT];
63 
64  inline __attribute__ ((always_inline))
65  Sequence readSeqLow() const HMBDC_RESTRICT {
66  Sequence res = readSeq_[0];
67  for (uint16_t i = 1;
68  i < PARALLEL_CONSUMER_COUNT; ++i)
69  if (res > readSeq_[i]) res = readSeq_[i];
70  return res;
71  }
72 
73  inline __attribute__ ((always_inline))
74  uint16_t findSlowestReader() const HMBDC_RESTRICT {
75  Sequence smallest = readSeq_[0];
76  uint16_t smallestLoc = 0;
77  for (uint16_t i = 1; i < PARALLEL_CONSUMER_COUNT; ++i)
78  if (smallest > readSeq_[i]) {
79  smallest = readSeq_[i];
80  smallestLoc = i;
81  }
82  return smallestLoc;
83  }
84 
85 public:
86  using value_type = void *;
88 
90 
91  static size_t footprint(uint32_t valueTypeSizePower2Num, uint32_t ringSizePower2Num) {
92  return sizeof(MemRingBuffer) + decltype(buffer_)::footprint(
93  valueTypeSizePower2Num, ringSizePower2Num) + SMP_CACHE_BYTES;
94  }
95 
96  template <typename Allocator = os::DefaultAllocator>
97  MemRingBuffer(uint32_t valueTypeSizePower2Num, uint32_t ringSizePower2Num
98  , Allocator& allocator = os::DefaultAllocator::instance())
99  : CAPACITY(1u << ringSizePower2Num)
100  , VALUE_TYPE_SIZE((1u << valueTypeSizePower2Num) - sizeof(Sequence))
101  , READ_SEQ_MAX(std::numeric_limits<Sequence>::max() - CAPACITY - 1000u)
102  //1000 is a small yet safe margin for how many threads may implement a single consumer
103  , MASK(CAPACITY - 1)
104  , buffer_(valueTypeSizePower2Num, ringSizePower2Num
105  , (allocator))
106  , toBeClaimedSeq_(0u) {
107  std::fill_n(readSeq_, (int)PARALLEL_CONSUMER_COUNT, 0);
108  std::fill_n(readSeqLastPurge_, (int)PARALLEL_CONSUMER_COUNT, READ_SEQ_MAX);
109  for (auto i = CAPACITY; i != 0 ; --i) {
110  *buffer_.getSeq(i - 1) = std::numeric_limits<Sequence>::max();
111  }
112  }
113 
114  void put(void const* HMBDC_RESTRICT item, size_t sizeHint = 0) HMBDC_RESTRICT {
115  // Sequence seq = __sync_fetch_and_add(&toBeClaimedSeq_, 1);
116  Sequence seq = __atomic_fetch_add(&toBeClaimedSeq_, 1, __ATOMIC_RELAXED);
117  for (uint32_t k = 0;
118  seq >= CAPACITY + readSeqLow();
119  ++k) {
120  HMBDC_YIELD(k);
121  }
122  size_t index = seq & MASK;
123  memcpy(buffer_ + index
124  , item, sizeHint ? sizeHint : VALUE_TYPE_SIZE);
125  // __sync_synchronize();
126  __atomic_thread_fence(__ATOMIC_ACQUIRE);
127  *buffer_.getSeq(index) = seq;
128  }
129 
130  bool tryPut(void const* HMBDC_RESTRICT item, size_t sizeHint = 0) HMBDC_RESTRICT {
131  // __sync_synchronize();
132  __atomic_thread_fence(__ATOMIC_ACQUIRE);
133  for(auto seq = toBeClaimedSeq_;
134  seq < CAPACITY + readSeqLow();
135  seq = toBeClaimedSeq_) {
136  // if (hmbdc_likely(__sync_bool_compare_and_swap(&toBeClaimedSeq_, seq, seq + 1))) {
137  if (hmbdc_likely(__atomic_compare_exchange_n (
138  &toBeClaimedSeq_, &seq, seq + 1, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED))) {
139  size_t index = seq & MASK;
140  memcpy(buffer_ + index
141  , item, sizeHint ? sizeHint : VALUE_TYPE_SIZE);
142  // __sync_synchronize();
143  __atomic_thread_fence(__ATOMIC_ACQUIRE);
144  *buffer_.getSeq(index) = seq;
145  return true;
146  }
147  }
148  return false;
149  }
150 
151  void killPut(void const* HMBDC_RESTRICT item, size_t sizeHint = 0) HMBDC_RESTRICT {
152  // Sequence seq = __sync_fetch_and_add(&toBeClaimedSeq_, 1);
153  Sequence seq = __atomic_fetch_add(&toBeClaimedSeq_, 1, __ATOMIC_RELAXED);
154  while (seq >= CAPACITY + readSeqLow()) {
155  uint16_t slowLoc = findSlowestReader();
156  markDead(slowLoc);
157  }
158 
159  size_t index = seq & MASK;
160  memcpy(buffer_ + index, item, sizeHint ? sizeHint : VALUE_TYPE_SIZE);
161  __atomic_thread_fence(__ATOMIC_ACQUIRE);
162  *buffer_.getSeq(index) = seq;
163  }
164 
165  bool isFull() const {
166  return toBeClaimedSeq_ >= CAPACITY + readSeqLow();
167  }
168 
169 
170  Sequence readSeq(uint16_t PARALLEL_CONSUMER_INDEX) const HMBDC_RESTRICT {
171  return readSeq_[PARALLEL_CONSUMER_INDEX];
172  }
173 
174  iterator claim() HMBDC_RESTRICT {
175  // Sequence seq = __sync_fetch_and_add(&toBeClaimedSeq_, 1);
176  Sequence seq = __atomic_fetch_add(&toBeClaimedSeq_, 1, __ATOMIC_RELAXED);
177  for (uint32_t k = 0;
178  seq >= CAPACITY + readSeqLow();
179  ++k) {
180  HMBDC_YIELD(k);
181  }
182  return iterator(buffer_, seq);
183  }
184 
185  iterator tryClaim() HMBDC_RESTRICT {
186  // __sync_synchronize();
187  __atomic_thread_fence(__ATOMIC_ACQUIRE);
188  for(auto seq = toBeClaimedSeq_;
189  seq < CAPACITY + readSeqLow();
190  seq = toBeClaimedSeq_) {
191  // if (hmbdc_likely(__sync_bool_compare_and_swap(&toBeClaimedSeq_, seq, seq + 1))) {
192  if (hmbdc_likely(__atomic_compare_exchange_n (
193  &toBeClaimedSeq_, &seq, seq + 1, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED))) {
194  return iterator(buffer_, seq);
195  }
196  }
197  return iterator();
198  }
199 
200  /**
201  * @brief claim slots in the ring buffer for writing; returns an empty iterator
202  * when that is not possible - this call does not block
203  *
204  * @details the user needs to call commit() when done with the slots
205  * @param n number of slots to claim
206  * @return iterator pointing to the start of the claimed slots, or empty when not possible
207  */
208  iterator tryClaim(size_t n) HMBDC_RESTRICT {
209  // __sync_synchronize();
210  __atomic_thread_fence(__ATOMIC_ACQUIRE);
211  for(auto seq = toBeClaimedSeq_;
212  seq + n - 1 < CAPACITY + readSeqLow();
213  seq = toBeClaimedSeq_) {
214  // if (hmbdc_likely(__sync_bool_compare_and_swap(&toBeClaimedSeq_, seq, seq + n))) {
215  if (hmbdc_likely(__atomic_compare_exchange_n (
216  &toBeClaimedSeq_, &seq, seq + n, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED))) {
217  return iterator(buffer_, seq);
218  }
219  }
220  return iterator();
221  }
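 /*
  * Illustrative producer sketch for the batched tryClaim()/commit() pair above
  * (not part of the header); 'rb' and 'Msg' are hypothetical, Msg is assumed to
  * fit in one slot, and testing the returned iterator in a boolean context is
  * an assumption about lf_misc::iterator:
  *
  *   Msg batch[4];                            // filled by the application
  *   auto it = rb.tryClaim(4);                // claim 4 slots, never blocks
  *   if (it) {                                // assumption: the empty iterator tests false
  *       auto cur = it;
  *       for (auto const& m : batch) {
  *           ::memcpy(*cur, &m, sizeof(m));   // *cur is the slot's void* payload
  *           ++cur;
  *       }
  *       rb.commit(it, 4);                    // publish all 4 slots to consumers
  *   }
  */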
222 
223  iterator claim(size_t n) HMBDC_RESTRICT {
224  // Sequence seq = __sync_fetch_and_add(&toBeClaimedSeq_, n);
225  Sequence seq = __atomic_fetch_add(&toBeClaimedSeq_, n, __ATOMIC_RELAXED);
226  for (uint32_t k = 0;
227  seq + n > CAPACITY + readSeqLow();
228  ++k) {
229  HMBDC_YIELD(k);
230  }
231  return iterator(buffer_, seq);
232  }
233 
234  iterator killClaim() HMBDC_RESTRICT {
235  // Sequence seq = __sync_fetch_and_add(&toBeClaimedSeq_, 1);
236  Sequence seq = __atomic_fetch_add(&toBeClaimedSeq_, 1, __ATOMIC_RELAXED);
237  while (seq >= CAPACITY + readSeqLow()) {
238  uint16_t slowLoc = findSlowestReader();
239  markDead(slowLoc);
240  }
241  return iterator(buffer_, seq);
242  }
243 
244  iterator killClaim(size_t n) HMBDC_RESTRICT {
245  // Sequence seq = __sync_fetch_and_add(&toBeClaimedSeq_, n);
246  Sequence seq = __atomic_fetch_add(&toBeClaimedSeq_, n, __ATOMIC_RELAXED);
247  while (seq + n > CAPACITY + readSeqLow()) {
248  uint16_t slowLoc = findSlowestReader();
249  markDead(slowLoc);
250  }
251  return iterator(buffer_, seq);
252  }
253 
254  void commit(iterator it) HMBDC_RESTRICT {
255  __sync_synchronize();
256  // __atomic_thread_fence(__ATOMIC_ACQUIRE);
257  *buffer_.getSeq(*it - buffer_) = it.seq_;
258  }
259 
260  void commit(iterator from, size_t n) HMBDC_RESTRICT {
261  __sync_synchronize();
262  // __atomic_thread_fence(__ATOMIC_ACQUIRE);
263  for (size_t i = 0; i < n; ++i) {
264  *buffer_.getSeq(*from - buffer_) = from.seq_;
265  ++from;
266  }
267  }
268 
269  void markDead(uint16_t parallel_consumer_index) HMBDC_RESTRICT {
270  if (parallel_consumer_index < PARALLEL_CONSUMER_COUNT) {
271  readSeq_[parallel_consumer_index] = READ_SEQ_MAX;
272  __sync_synchronize();
273  // __atomic_thread_fence(__ATOMIC_RELEASE);
274  }
275  }
276 
277  auto
278  unusedConsumerIndexes() const {
279  std::vector<uint16_t> res;
280  for (uint16_t i = 0; i < PARALLEL_CONSUMER_COUNT; ++i) {
281  if (readSeq_[i] == READ_SEQ_MAX) {
282  res.push_back(i);
283  }
284  }
285  return res;
286  }
287 
288  void take(uint16_t PARALLEL_CONSUMER_INDEX, void * HMBDC_RESTRICT item, size_t sizeHint = 0) HMBDC_RESTRICT {
289  auto seq = readSeq_[PARALLEL_CONSUMER_INDEX];
290  if (hmbdc_unlikely(seq >= READ_SEQ_MAX)) {
291  HMBDC_THROW(DeadConsumer, PARALLEL_CONSUMER_INDEX);
292  }
293  size_t index = seq & MASK;
294  for (uint32_t k = 0;
295  seq != *buffer_.getSeq(index);
296  ++k) {
297  HMBDC_YIELD(k);
298  }
299 
300  memcpy(item, buffer_ + index, sizeHint ? sizeHint : VALUE_TYPE_SIZE);
301  // __sync_fetch_and_add(readSeq_ + PARALLEL_CONSUMER_INDEX, 1);
302  __atomic_fetch_add(readSeq_ + PARALLEL_CONSUMER_INDEX, 1, __ATOMIC_RELEASE);
303  }
304 
305  void takeReentrant(uint16_t PARALLEL_CONSUMER_INDEX, void * HMBDC_RESTRICT item, size_t sizeHint = 0) HMBDC_RESTRICT {
306  std::lock_guard<my_spin_lock> guard(locks_[PARALLEL_CONSUMER_INDEX]);
307  take(PARALLEL_CONSUMER_INDEX, item, sizeHint);
308  }
309 
310  iterator peek(uint16_t PARALLEL_CONSUMER_INDEX) const HMBDC_RESTRICT {
311  auto readSeq = readSeq_[PARALLEL_CONSUMER_INDEX];
312  if (hmbdc_unlikely(readSeq >= READ_SEQ_MAX)) {
313  HMBDC_THROW(DeadConsumer, PARALLEL_CONSUMER_INDEX);
314  }
315  if (readSeq == *buffer_.getSeq(readSeq & MASK)) {
316  return iterator(buffer_, readSeq);
317  }
318  return iterator();
319  }
320 
321  size_t peek(uint16_t PARALLEL_CONSUMER_INDEX, iterator& begin, iterator& end
322  , size_t maxPeekSize = std::numeric_limits<size_t>::max()) const {
323  Sequence readSeq = readSeq_[PARALLEL_CONSUMER_INDEX];
324  if (hmbdc_unlikely(readSeq >= READ_SEQ_MAX)) {
325  HMBDC_THROW(DeadConsumer, PARALLEL_CONSUMER_INDEX);
326  }
327  begin = iterator(buffer_, readSeq);
328  while (readSeq == *buffer_.getSeq(readSeq & MASK)
329  && maxPeekSize--) {
330  ++readSeq;
331  }
332  end = iterator(buffer_, readSeq);
333  return readSeq - readSeq_[PARALLEL_CONSUMER_INDEX];
334  }
335 
336  size_t peekSome(uint16_t PARALLEL_CONSUMER_INDEX, iterator& begin, iterator& end
337  , size_t maxPeekSize = std::numeric_limits<size_t>::max()) const {
338  size_t res;
339  for (uint32_t k = 0;
340  !(res = peek(PARALLEL_CONSUMER_INDEX, begin, end, maxPeekSize))
341  && k < 64;
342  ++k) {
343  HMBDC_YIELD(k);
344  }
345  return res;
346  }
347 
348  void waste(uint16_t PARALLEL_CONSUMER_INDEX, size_t size) HMBDC_RESTRICT {
349  Sequence seq = readSeq_[PARALLEL_CONSUMER_INDEX];
350  if (hmbdc_unlikely(seq >= READ_SEQ_MAX)) {
351  HMBDC_THROW(DeadConsumer, PARALLEL_CONSUMER_INDEX);
352  }
353  for (uint32_t k = 0;
354  seq + size > toBeClaimedSeq_;
355  ++k) {
356  HMBDC_YIELD(k);
357  }
358  // __sync_fetch_and_add(readSeq_ + PARALLEL_CONSUMER_INDEX, size);
359  __atomic_fetch_add(readSeq_ + PARALLEL_CONSUMER_INDEX, size, __ATOMIC_RELEASE);
360  }
361 
362  /**
363  * @brief for batch processing, mark the next items as consumed
364  * @details faster than waste(); only use it if size is at most
365  * what peek() just returned. Otherwise, the behavior is undefined
366  *
367  * @param PARALLEL_CONSUMER_INDEX identifying which parallel consumer
368  * @param size consume count
369  */
370  void wasteAfterPeek(uint16_t PARALLEL_CONSUMER_INDEX, size_t size) HMBDC_RESTRICT
371  {
372  if (!size) return;
373  // __sync_fetch_and_add(readSeq_ + PARALLEL_CONSUMER_INDEX, size);
374  __atomic_fetch_add(readSeq_ + PARALLEL_CONSUMER_INDEX, size, __ATOMIC_RELEASE);
375  }
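 /*
  * Illustrative zero-copy consumer sketch for peek()/wasteAfterPeek() (not part
  * of the header); 'rb', 'Msg' and consumer index 0 are hypothetical:
  *
  *   decltype(rb)::iterator begin, end;
  *   size_t n = rb.peek(0, begin, end);       // how many items are readable now
  *   auto it = begin;
  *   for (size_t i = 0; i < n; ++i, ++it) {
  *       auto const* m = static_cast<Msg const*>(*it);
  *       // ... process *m in place, without copying it out of the ring ...
  *   }
  *   rb.wasteAfterPeek(0, n);                 // release exactly what peek() reported
  */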
376 
377  Sequence catchUpWith(uint16_t PARALLEL_CONSUMER_INDEX, uint16_t WITH_PARALLEL_CONSUMER_INDEX) {
378  readSeq_[PARALLEL_CONSUMER_INDEX] = readSeq_[WITH_PARALLEL_CONSUMER_INDEX];
379  // __sync_synchronize();
380  __atomic_thread_fence(__ATOMIC_ACQ_REL);
381  return readSeq_[PARALLEL_CONSUMER_INDEX];
382  }
383 
384  void catchUpTo(uint16_t PARALLEL_CONSUMER_INDEX, Sequence newSeq) {
385  if (readSeq_[PARALLEL_CONSUMER_INDEX] <= newSeq) {
386  readSeq_[PARALLEL_CONSUMER_INDEX] = newSeq;
387  }
388  // __sync_synchronize();
389  __atomic_thread_fence(__ATOMIC_RELEASE);
390  }
391 
392  size_t remainingSize(uint16_t index) const HMBDC_RESTRICT {
393  // __sync_synchronize();
394  __atomic_thread_fence(__ATOMIC_ACQUIRE);
395  Sequence r = readSeq_[index];
396  Sequence w = toBeClaimedSeq_;
397  return w > r ? w - r : 0;
398  }
399  size_t remainingSize() const HMBDC_RESTRICT {
400  // __sync_synchronize();
401  __atomic_thread_fence(__ATOMIC_ACQUIRE);
402  Sequence r = readSeqLow();
403  Sequence w = toBeClaimedSeq_;
404  return w > r ? w - r : 0;
405  }
406 
407  void reset(uint16_t PARALLEL_CONSUMER_INDEX) {
408  size_t index;
409  do {
410  readSeq_[PARALLEL_CONSUMER_INDEX] = toBeClaimedSeq_;
411  index = readSeq_[PARALLEL_CONSUMER_INDEX] & MASK;
412  // __sync_synchronize();
413  __atomic_thread_fence(__ATOMIC_ACQ_REL);
414  } while (*buffer_.getSeq(index) == readSeq_[PARALLEL_CONSUMER_INDEX]);
415  }
416 
417  size_t parallelConsumerAlive() const {
418  return std::count_if(readSeq_, readSeq_ + PARALLEL_CONSUMER_COUNT
419  , [m = READ_SEQ_MAX](Sequence s) {
420  return s < m;
421  }
422  );
423  }
424 
425  /**
426  * @brief call this periodically to mark stuck consumers dead
427  * @details the caller needs to make sure there are new messages between calls
428  * @return mask indicating which consumers were marked dead
429  */
430  uint64_t purge() {
431  __sync_synchronize();
432  uint64_t res = 0;
433  for (uint16_t i = 0; i < PARALLEL_CONSUMER_COUNT; ++i) {
434  auto seq = readSeq_[i];
435  if (seq < READ_SEQ_MAX) {
436  size_t index = seq & MASK;
437  if (hmbdc_unlikely(readSeqLastPurge_[i] == seq)) {
438  if (i == findSlowestReader()) {
439  res |= (1ul << i);
440  }
441  } else if (hmbdc_unlikely(seq == *buffer_.getSeq(index))) {
442  readSeqLastPurge_[i] = seq;
443  }
444  }
445  }
446  for (uint16_t i = 0; res && i < PARALLEL_CONSUMER_COUNT; ++i) {
447  if (res & (1ul << i)) markDead(i);
448  }
449  return res;
450  }
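 /*
  * Illustrative housekeeping sketch for purge() (not part of the header): a
  * monitoring thread keeps publishing so sequences advance between calls, then
  * reaps consumers that have stopped draining the ring; 'rb', 'running' and
  * 'heartbeat' are hypothetical, and includes such as <chrono> are omitted:
  *
  *   while (running) {
  *       rb.put(&heartbeat, sizeof(heartbeat));         // ensure new messages between calls
  *       std::this_thread::sleep_for(std::chrono::seconds(1));
  *       uint64_t dead = rb.purge();
  *       if (dead) {
  *           // bit i set => parallel consumer i was just marked dead
  *       }
  *   }
  */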
451 };
452 
453 } //memringbuffer_detail
454 
455 }} // end namespace hmbdc::pattern
456 
457 #include "hmbdc/pattern/MemRingBuffer2.hpp"
458 
459 namespace hmbdc { namespace pattern {
460 template<uint16_t PARALLEL_CONSUMER_COUNT, typename SeqT = size_t>
461 using MemRingBuffer = memringbuffer_detail::MemRingBuffer<PARALLEL_CONSUMER_COUNT, SeqT>;
462 }}
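
For orientation, a minimal single-producer, two-consumer sketch against the public alias above; this assumes the default in-process allocator, names such as Msg and example() are illustrative only, and error handling, threading and the shared-memory placement path (footprint()) are omitted:

#include "hmbdc/pattern/MemRingBuffer.hpp"

struct Msg { long id; char payload[16]; };

void example() {
    // 2 parallel consumers, 2^6-byte slots (minus the embedded Sequence), 2^10 = 1024 slots
    hmbdc::pattern::MemRingBuffer<2> rb(6u, 10u);

    Msg out{42, {}};
    rb.put(&out, sizeof(out));      // producer: yields until a slot is writable

    Msg in;
    rb.take(0, &in, sizeof(in));    // parallel consumer 0 copies the item out
    rb.take(1, &in, sizeof(in));    // parallel consumer 1 independently sees the same item
}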