Diffstat (limited to 'boost/fiber/detail')
-rw-r--r-- boost/fiber/detail/config.hpp | 2
-rw-r--r-- boost/fiber/detail/context_mpsc_queue.hpp | 98
-rw-r--r-- boost/fiber/detail/context_spinlock_queue.hpp | 118
-rw-r--r-- boost/fiber/detail/context_spmc_queue.hpp | 99
-rw-r--r-- boost/fiber/detail/cpu_relax.hpp | 36
-rw-r--r-- boost/fiber/detail/data.hpp | 2
-rw-r--r-- boost/fiber/detail/fss.hpp | 5
-rw-r--r-- boost/fiber/detail/futex.hpp | 2
-rw-r--r-- boost/fiber/detail/spinlock_ttas.hpp | 29
-rw-r--r-- boost/fiber/detail/spinlock_ttas_adaptive.hpp | 21
-rw-r--r-- boost/fiber/detail/spinlock_ttas_adaptive_futex.hpp | 14
-rw-r--r-- boost/fiber/detail/spinlock_ttas_futex.hpp | 12
-rw-r--r-- boost/fiber/detail/wrap.hpp | 51
13 files changed, 295 insertions, 194 deletions
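The dominant theme of this change is work stealing: the intrusive MPSC queue is removed, a spinlock-protected ring buffer (context_spinlock_queue) is added, and the Chase-Lev SPMC deque gains a separate steal() that refuses to hand out pinned contexts. Below is a minimal, illustrative sketch of how a scheduler might drive the new push/pop/steal interface; the scheduler class and its member names are hypothetical and not part of this diff.

// Illustrative only: driving the new context_spinlock_queue (push/pop/steal)
// from a work-stealing scheduler. "my_scheduler", "rqueue_" and "victim"
// are hypothetical names, not part of Boost.Fiber.
#include <boost/fiber/context.hpp>
#include <boost/fiber/detail/context_spinlock_queue.hpp>

class my_scheduler {
    boost::fibers::detail::context_spinlock_queue rqueue_{};   // local ready-queue

public:
    void awakened( boost::fibers::context * ctx) {
        rqueue_.push( ctx);                 // any thread may make a fiber ready
    }

    boost::fibers::context * pick_next( my_scheduler & victim) {
        boost::fibers::context * ctx = rqueue_.pop();   // local queue first
        if ( nullptr == ctx) {
            // local queue drained: try to steal; steal() returns nullptr
            // rather than handing out a pinned (main-/dispatcher-) context
            ctx = victim.rqueue_.steal();
        }
        return ctx;
    }
};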
diff --git a/boost/fiber/detail/config.hpp b/boost/fiber/detail/config.hpp index f65d48910d..7c7119e1fb 100644 --- a/boost/fiber/detail/config.hpp +++ b/boost/fiber/detail/config.hpp @@ -52,7 +52,7 @@ #endif #if !defined(BOOST_FIBERS_SPIN_MAX_TESTS) -# define BOOST_FIBERS_SPIN_MAX_TESTS 100 +# define BOOST_FIBERS_SPIN_MAX_TESTS 500 #endif // modern architectures have cachelines with 64byte length diff --git a/boost/fiber/detail/context_mpsc_queue.hpp b/boost/fiber/detail/context_mpsc_queue.hpp deleted file mode 100644 index f7e664659c..0000000000 --- a/boost/fiber/detail/context_mpsc_queue.hpp +++ /dev/null @@ -1,98 +0,0 @@ - -// Copyright Dmitry Vyukov 2010-2011. -// Copyright Oliver Kowalke 2016. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) -// -// based on Dmitry Vyukov's intrusive MPSC queue -// http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue -// https://groups.google.com/forum/#!topic/lock-free/aFHvZhu1G-0 - -#ifndef BOOST_FIBERS_DETAIL_CONTEXT_MPSC_QUEUE_H -#define BOOST_FIBERS_DETAIL_CONTEXT_MPSC_QUEUE_H - -#include <atomic> -#include <memory> -#include <type_traits> - -#include <boost/assert.hpp> -#include <boost/config.hpp> - -#include <boost/fiber/context.hpp> -#include <boost/fiber/detail/config.hpp> - -#ifdef BOOST_HAS_ABI_HEADERS -# include BOOST_ABI_PREFIX -#endif - -namespace boost { -namespace fibers { -namespace detail { - -// a MPSC queue -// multiple threads push ready fibers (belonging to local scheduler) -// (thread) local scheduler pops fibers -class context_mpsc_queue { -private: - // not default constructor for context - use aligned_storage instead - alignas(cache_alignment) std::aligned_storage< sizeof( context), alignof( context) >::type storage_{}; - context * dummy_; - alignas(cache_alignment) std::atomic< context * > head_; - alignas(cache_alignment) context * tail_; - char pad_[cacheline_length]; - -public: - context_mpsc_queue() : - dummy_{ reinterpret_cast< context * >( std::addressof( storage_) ) }, - head_{ dummy_ }, - tail_{ dummy_ } { - dummy_->remote_nxt_.store( nullptr, std::memory_order_release); - } - - context_mpsc_queue( context_mpsc_queue const&) = delete; - context_mpsc_queue & operator=( context_mpsc_queue const&) = delete; - - void push( context * ctx) noexcept { - BOOST_ASSERT( nullptr != ctx); - ctx->remote_nxt_.store( nullptr, std::memory_order_release); - context * prev = head_.exchange( ctx, std::memory_order_acq_rel); - prev->remote_nxt_.store( ctx, std::memory_order_release); - } - - context * pop() noexcept { - context * tail = tail_; - context * next = tail->remote_nxt_.load( std::memory_order_acquire); - if ( dummy_ == tail) { - if ( nullptr == next) { - return nullptr; - } - tail_ = next; - tail = next; - next = next->remote_nxt_.load( std::memory_order_acquire);; - } - if ( nullptr != next) { - tail_ = next; - return tail; - } - context * head = head_.load( std::memory_order_acquire); - if ( tail != head) { - return nullptr; - } - push( dummy_); - next = tail->remote_nxt_.load( std::memory_order_acquire); - if ( nullptr != next) { - tail_= next; - return tail; - } - return nullptr; - } -}; - -}}} - -#ifdef BOOST_HAS_ABI_HEADERS -# include BOOST_ABI_SUFFIX -#endif - -#endif // BOOST_FIBERS_DETAIL_CONTEXT_MPSC_QUEUE_H diff --git a/boost/fiber/detail/context_spinlock_queue.hpp b/boost/fiber/detail/context_spinlock_queue.hpp new file mode 100644 index 0000000000..e0ebdabda6 --- 
/dev/null +++ b/boost/fiber/detail/context_spinlock_queue.hpp @@ -0,0 +1,118 @@ + +// Copyright Oliver Kowalke 2015. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// + +#ifndef BOOST_FIBERS_DETAIL_SPINLOCK_QUEUE_H +#define BOOST_FIBERS_DETAIL_SPINLOCK_QUEUE_H + +#include <cstddef> +#include <cstring> +#include <mutex> + +#include <boost/config.hpp> + +#include <boost/fiber/context.hpp> +#include <boost/fiber/detail/config.hpp> +#include <boost/fiber/detail/spinlock.hpp> + +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_PREFIX +#endif + +namespace boost { +namespace fibers { +namespace detail { + +class context_spinlock_queue { +private: + typedef context * slot_type; + + alignas(cache_alignment) mutable spinlock splk_{}; + std::size_t pidx_{ 0 }; + std::size_t cidx_{ 0 }; + std::size_t capacity_; + slot_type * slots_; + + void resize_() { + slot_type * old_slots = slots_; + slots_ = new slot_type[2*capacity_]; + std::size_t offset = capacity_ - cidx_; + std::memcpy( slots_, old_slots + cidx_, offset * sizeof( slot_type) ); + if ( 0 < cidx_) { + std::memcpy( slots_ + offset, old_slots, pidx_ * sizeof( slot_type) ); + } + cidx_ = 0; + pidx_ = capacity_ - 1; + capacity_ *= 2; + delete [] old_slots; + } + + bool is_full_() const noexcept { + return cidx_ == ((pidx_ + 1) % capacity_); + } + + bool is_empty_() const noexcept { + return cidx_ == pidx_; + } + +public: + context_spinlock_queue( std::size_t capacity = 4096) : + capacity_{ capacity } { + slots_ = new slot_type[capacity_]; + } + + ~context_spinlock_queue() { + delete [] slots_; + } + + context_spinlock_queue( context_spinlock_queue const&) = delete; + context_spinlock_queue & operator=( context_spinlock_queue const&) = delete; + + bool empty() const noexcept { + spinlock_lock lk{ splk_ }; + return is_empty_(); + } + + void push( context * c) { + spinlock_lock lk{ splk_ }; + if ( is_full_() ) { + resize_(); + } + slots_[pidx_] = c; + pidx_ = (pidx_ + 1) % capacity_; + } + + context * pop() { + spinlock_lock lk{ splk_ }; + context * c = nullptr; + if ( ! is_empty_() ) { + c = slots_[cidx_]; + cidx_ = (cidx_ + 1) % capacity_; + } + return c; + } + + context * steal() { + spinlock_lock lk{ splk_ }; + context * c = nullptr; + if ( ! is_empty_() ) { + c = slots_[cidx_]; + if ( c->is_context( type::pinned_context) ) { + return nullptr; + } + cidx_ = (cidx_ + 1) % capacity_; + } + return c; + } +}; + +}}} + +#ifdef BOOST_HAS_ABI_HEADERS +# include BOOST_ABI_SUFFIX +#endif + +#endif // BOOST_FIBERS_DETAIL_SPINLOCK_QUEUE_H diff --git a/boost/fiber/detail/context_spmc_queue.hpp b/boost/fiber/detail/context_spmc_queue.hpp index 6449e3658f..27256233cf 100644 --- a/boost/fiber/detail/context_spmc_queue.hpp +++ b/boost/fiber/detail/context_spmc_queue.hpp @@ -30,6 +30,11 @@ // In Proceedings of the 18th ACM SIGPLAN symposium on Principles and practice // of parallel programming (PPoPP '13). ACM, New York, NY, USA, 69-80. 
+#if BOOST_COMP_CLANG +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-private-field" +#endif + namespace boost { namespace fibers { namespace detail { @@ -43,43 +48,43 @@ private: sizeof( atomic_type), cache_alignment >::type storage_type; - std::size_t size_; + std::size_t capacity_; storage_type * storage_; public: - array( std::size_t size) : - size_{ size }, - storage_{ new storage_type[size_] } { - for ( std::size_t i = 0; i < size_; ++i) { + array( std::size_t capacity) : + capacity_{ capacity }, + storage_{ new storage_type[capacity_] } { + for ( std::size_t i = 0; i < capacity_; ++i) { ::new ( static_cast< void * >( std::addressof( storage_[i]) ) ) atomic_type{ nullptr }; } } ~array() { - for ( std::size_t i = 0; i < size_; ++i) { + for ( std::size_t i = 0; i < capacity_; ++i) { reinterpret_cast< atomic_type * >( std::addressof( storage_[i]) )->~atomic_type(); } delete [] storage_; } - std::size_t size() const noexcept { - return size_; + std::size_t capacity() const noexcept { + return capacity_; } void push( std::size_t bottom, context * ctx) noexcept { reinterpret_cast< atomic_type * >( - std::addressof( storage_[bottom % size_]) ) + std::addressof( storage_[bottom % capacity_]) ) ->store( ctx, std::memory_order_relaxed); } context * pop( std::size_t top) noexcept { return reinterpret_cast< atomic_type * >( - std::addressof( storage_[top % size_]) ) + std::addressof( storage_[top % capacity_]) ) ->load( std::memory_order_relaxed); } array * resize( std::size_t bottom, std::size_t top) { - std::unique_ptr< array > tmp{ new array{ 2 * size_ } }; + std::unique_ptr< array > tmp{ new array{ 2 * capacity_ } }; for ( std::size_t i = top; i != bottom; ++i) { tmp->push( i, pop( i) ); } @@ -87,15 +92,15 @@ private: } }; - alignas(cache_alignment) std::atomic< std::size_t > top_{ 0 }; - alignas(cache_alignment) std::atomic< std::size_t > bottom_{ 0 }; + alignas(cache_alignment) std::atomic< std::size_t > top_{ 0 }; + alignas(cache_alignment) std::atomic< std::size_t > bottom_{ 0 }; alignas(cache_alignment) std::atomic< array * > array_; - std::vector< array * > old_arrays_{}; + std::vector< array * > old_arrays_{}; char padding_[cacheline_length]; public: - context_spmc_queue() : - array_{ new array{ 1024 } } { + context_spmc_queue( std::size_t capacity = 4096) : + array_{ new array{ capacity } } { old_arrays_.reserve( 32); } @@ -110,19 +115,19 @@ public: context_spmc_queue & operator=( context_spmc_queue const&) = delete; bool empty() const noexcept { - std::size_t bottom{ bottom_.load( std::memory_order_relaxed) }; - std::size_t top{ top_.load( std::memory_order_relaxed) }; + std::size_t bottom = bottom_.load( std::memory_order_relaxed); + std::size_t top = top_.load( std::memory_order_relaxed); return bottom <= top; } void push( context * ctx) { - std::size_t bottom{ bottom_.load( std::memory_order_relaxed) }; - std::size_t top{ top_.load( std::memory_order_acquire) }; - array * a{ array_.load( std::memory_order_relaxed) }; - if ( (a->size() - 1) < (bottom - top) ) { + std::size_t bottom = bottom_.load( std::memory_order_relaxed); + std::size_t top = top_.load( std::memory_order_acquire); + array * a = array_.load( std::memory_order_relaxed); + if ( (a->capacity() - 1) < (bottom - top) ) { // queue is full // resize - array * tmp{ a->resize( bottom, top) }; + array * tmp = a->resize( bottom, top); old_arrays_.push_back( a); std::swap( a, tmp); array_.store( a, std::memory_order_relaxed); @@ -133,16 +138,48 @@ public: } context * pop() { - std::size_t top{ 
top_.load( std::memory_order_acquire) }; + std::size_t bottom = bottom_.load( std::memory_order_relaxed) - 1; + array * a = array_.load( std::memory_order_relaxed); + bottom_.store( bottom, std::memory_order_relaxed); std::atomic_thread_fence( std::memory_order_seq_cst); - std::size_t bottom{ bottom_.load( std::memory_order_acquire) }; - context * ctx{ nullptr }; + std::size_t top = top_.load( std::memory_order_relaxed); + context * ctx = nullptr; + if ( top <= bottom) { + // queue is not empty + ctx = a->pop( bottom); + BOOST_ASSERT( nullptr != ctx); + if ( top == bottom) { + // last element dequeued + if ( ! top_.compare_exchange_strong( top, top + 1, + std::memory_order_seq_cst, + std::memory_order_relaxed) ) { + // lose the race + ctx = nullptr; + } + bottom_.store( bottom + 1, std::memory_order_relaxed); + } + } else { + // queue is empty + bottom_.store( bottom + 1, std::memory_order_relaxed); + } + return ctx; + } + + context * steal() { + std::size_t top = top_.load( std::memory_order_acquire); + std::atomic_thread_fence( std::memory_order_seq_cst); + std::size_t bottom = bottom_.load( std::memory_order_acquire); + context * ctx = nullptr; if ( top < bottom) { // queue is not empty - array * a{ array_.load( std::memory_order_consume) }; + array * a = array_.load( std::memory_order_consume); ctx = a->pop( top); - if ( ctx->is_context( type::pinned_context) || - ! top_.compare_exchange_strong( top, top + 1, + BOOST_ASSERT( nullptr != ctx); + // do not steal pinned context (e.g. main-/dispatcher-context) + if ( ctx->is_context( type::pinned_context) ) { + return nullptr; + } + if ( ! top_.compare_exchange_strong( top, top + 1, std::memory_order_seq_cst, std::memory_order_relaxed) ) { // lose the race @@ -155,4 +192,8 @@ public: }}} +#if BOOST_COMP_CLANG +#pragma clang diagnostic pop +#endif + #endif // BOOST_FIBERS_DETAIL_CONTEXT_SPMC_QUEUE_H diff --git a/boost/fiber/detail/cpu_relax.hpp b/boost/fiber/detail/cpu_relax.hpp index d00020a23b..541b46dfd0 100644 --- a/boost/fiber/detail/cpu_relax.hpp +++ b/boost/fiber/detail/cpu_relax.hpp @@ -7,6 +7,7 @@ #ifndef BOOST_FIBERS_DETAIL_CPU_RELAX_H #define BOOST_FIBERS_DETAIL_CPU_RELAX_H +#include <chrono> #include <thread> #include <boost/config.hpp> @@ -14,7 +15,7 @@ #include <boost/fiber/detail/config.hpp> -#if BOOST_COMP_MSVC +#if BOOST_COMP_MSVC || BOOST_COMP_MSVC_EMULATED # include <Windows.h> #endif @@ -29,22 +30,47 @@ namespace detail { #if BOOST_ARCH_ARM # if BOOST_COMP_MSVC # define cpu_relax() YieldProcessor(); -# else +# elif (defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6Z__) || \ + defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7M__) || \ + defined(__ARM_ARCH_7S__) || \ + defined(__ARM_ARCH_8A__) || \ + defined(__aarch64__)) +// http://groups.google.com/a/chromium.org/forum/#!msg/chromium-dev/YGVrZbxYOlU/Vpgy__zeBQAJ +// mnemonic 'yield' is supported from ARMv6k onwards # define cpu_relax() asm volatile ("yield" ::: "memory"); +# else +# define cpu_relax() asm volatile ("nop" ::: "memory"); # endif #elif BOOST_ARCH_MIPS # define cpu_relax() asm volatile ("pause" ::: "memory"); #elif BOOST_ARCH_PPC +// http://code.metager.de/source/xref/gnu/glibc/sysdeps/powerpc/sys/platform/ppc.h +// http://stackoverflow.com/questions/5425506/equivalent-of-x86-pause-instruction-for-ppc +// mnemonic 'or' shared resource hints +// or 27, 27, 27 This form of 'or' provides a hint that performance +// will probably 
be imrpoved if shared resources dedicated +// to the executing processor are released for use by other +// processors +// extended mnemonics (available with POWER7) +// yield == or 27, 27, 27 # define cpu_relax() asm volatile ("or 27,27,27" ::: "memory"); #elif BOOST_ARCH_X86 -# if BOOST_COMP_MSVC +# if BOOST_COMP_MSVC || BOOST_COMP_MSVC_EMULATED # define cpu_relax() YieldProcessor(); # else # define cpu_relax() asm volatile ("pause" ::: "memory"); # endif #else -# warning "architecture does not support yield/pause mnemonic" -# define cpu_relax() std::this_thread::yield(); +# define cpu_relax() { \ + static constexpr std::chrono::microseconds us0{ 0 }; \ + std::this_thread::sleep_for( us0); \ + } #endif }}} diff --git a/boost/fiber/detail/data.hpp b/boost/fiber/detail/data.hpp index 24e833a9e8..e2b119ec3e 100644 --- a/boost/fiber/detail/data.hpp +++ b/boost/fiber/detail/data.hpp @@ -28,7 +28,7 @@ struct data_t { spinlock_lock * lk{ nullptr }; context * ctx{ nullptr }; - data_t() noexcept = default; + data_t() = default; explicit data_t( spinlock_lock * lk_) noexcept : lk{ lk_ } { diff --git a/boost/fiber/detail/fss.hpp b/boost/fiber/detail/fss.hpp index 54dc5b79d3..27a7d67f26 100644 --- a/boost/fiber/detail/fss.hpp +++ b/boost/fiber/detail/fss.hpp @@ -38,12 +38,13 @@ public: friend inline void intrusive_ptr_add_ref( fss_cleanup_function * p) noexcept { - ++p->use_count_; + p->use_count_.fetch_add( 1, std::memory_order_relaxed); } friend inline void intrusive_ptr_release( fss_cleanup_function * p) noexcept { - if ( --p->use_count_ == 0) { + if ( 1 == p->use_count_.fetch_sub( 1, std::memory_order_release) ) { + std::atomic_thread_fence( std::memory_order_acquire); delete p; } } diff --git a/boost/fiber/detail/futex.hpp b/boost/fiber/detail/futex.hpp index 4c966867c5..d383dc4077 100644 --- a/boost/fiber/detail/futex.hpp +++ b/boost/fiber/detail/futex.hpp @@ -49,7 +49,7 @@ int futex_wake( std::atomic< std::int32_t > * addr) { inline int futex_wait( std::atomic< std::int32_t > * addr, std::int32_t x) { - ::WaitOnAddress( static_cast< volatile void * >( addr), & x, sizeof( x), -1); + ::WaitOnAddress( static_cast< volatile void * >( addr), & x, sizeof( x), INFINITE); return 0; } #else diff --git a/boost/fiber/detail/spinlock_ttas.hpp b/boost/fiber/detail/spinlock_ttas.hpp index d64630d84d..380773ad6d 100644 --- a/boost/fiber/detail/spinlock_ttas.hpp +++ b/boost/fiber/detail/spinlock_ttas.hpp @@ -19,6 +19,11 @@ // https://software.intel.com/en-us/articles/benefitting-power-and-performance-sleep-loops // https://software.intel.com/en-us/articles/long-duration-spin-wait-loops-on-hyper-threading-technology-enabled-intel-processors +#if BOOST_COMP_CLANG +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-private-field" +#endif + namespace boost { namespace fibers { namespace detail { @@ -30,10 +35,7 @@ private: unlocked }; - // align shared variable 'state_' at cache line to prevent false sharing - alignas(cache_alignment) std::atomic< spinlock_status > state_{ spinlock_status::unlocked }; - // padding to avoid other data one the cacheline of shared variable 'state_' - char pad[cacheline_length]; + std::atomic< spinlock_status > state_{ spinlock_status::unlocked }; public: spinlock_ttas() noexcept = default; @@ -63,20 +65,15 @@ public: // delays the next instruction's execution for a finite period of time (depends on processor family) // the CPU is not under demand, parts of the pipeline are no longer being used // -> reduces the power consumed by the CPU + // -> prevent 
pipeline stalls cpu_relax(); - } else if ( BOOST_FIBERS_SPIN_MAX_TESTS + 20 > tests) { - ++tests; + } else { // std::this_thread::sleep_for( 0us) has a fairly long instruction path length, // combined with an expensive ring3 to ring 0 transition costing about 1000 cycles // std::this_thread::sleep_for( 0us) lets give up this_thread the remaining part of its time slice // if and only if a thread of equal or greater priority is ready to run static constexpr std::chrono::microseconds us0{ 0 }; std::this_thread::sleep_for( us0); - } else { - // std::this_thread::yield() allows this_thread to give up the remaining part of its time slice, - // but only to another thread on the same processor - // instead of constant checking, a thread only checks if no other useful work is pending - std::this_thread::yield(); } #else std::this_thread::yield(); @@ -89,10 +86,12 @@ public: // utilize 'Binary Exponential Backoff' algorithm // linear_congruential_engine is a random number engine based on Linear congruential generator (LCG) static thread_local std::minstd_rand generator; - const std::size_t z = - std::uniform_int_distribution< std::size_t >{ 0, static_cast< std::size_t >( 1) << collisions }( generator); + static std::uniform_int_distribution< std::size_t > distribution{ 0, static_cast< std::size_t >( 1) << collisions }; + const std::size_t z = distribution( generator); ++collisions; for ( std::size_t i = 0; i < z; ++i) { + // -> reduces the power consumed by the CPU + // -> prevent pipeline stalls cpu_relax(); } } else { @@ -109,4 +108,8 @@ public: }}} +#if BOOST_COMP_CLANG +#pragma clang diagnostic pop +#endif + #endif // BOOST_FIBERS_SPINLOCK_TTAS_H diff --git a/boost/fiber/detail/spinlock_ttas_adaptive.hpp b/boost/fiber/detail/spinlock_ttas_adaptive.hpp index c6a9a57d79..da044b6298 100644 --- a/boost/fiber/detail/spinlock_ttas_adaptive.hpp +++ b/boost/fiber/detail/spinlock_ttas_adaptive.hpp @@ -31,11 +31,8 @@ private: unlocked }; - // align shared variable 'state_' at cache line to prevent false sharing - alignas(cache_alignment) std::atomic< spinlock_status > state_{ spinlock_status::unlocked }; - std::atomic< std::size_t > tests_{ 0 }; - // padding to avoid other data one the cacheline of shared variable 'state_' - char pad[cacheline_length]; + std::atomic< spinlock_status > state_{ spinlock_status::unlocked }; + std::atomic< std::size_t > tests_{ 0 }; public: spinlock_ttas_adaptive() noexcept = default; @@ -67,8 +64,9 @@ public: // delays the next instruction's execution for a finite period of time (depends on processor family) // the CPU is not under demand, parts of the pipeline are no longer being used // -> reduces the power consumed by the CPU + // -> prevent pipeline stalls cpu_relax(); - } else if ( BOOST_FIBERS_SPIN_MAX_TESTS + 20 > tests) { + } else { ++tests; // std::this_thread::sleep_for( 0us) has a fairly long instruction path length, // combined with an expensive ring3 to ring 0 transition costing about 1000 cycles @@ -76,11 +74,6 @@ public: // if and only if a thread of equal or greater priority is ready to run static constexpr std::chrono::microseconds us0{ 0 }; std::this_thread::sleep_for( us0); - } else { - // std::this_thread::yield() allows this_thread to give up the remaining part of its time slice, - // but only to another thread on the same processor - // instead of constant checking, a thread only checks if no other useful work is pending - std::this_thread::yield(); } #else std::this_thread::yield(); @@ -93,10 +86,12 @@ public: // utilize 'Binary Exponential Backoff' 
algorithm // linear_congruential_engine is a random number engine based on Linear congruential generator (LCG) static thread_local std::minstd_rand generator; - const std::size_t z = - std::uniform_int_distribution< std::size_t >{ 0, static_cast< std::size_t >( 1) << collisions }( generator); + static std::uniform_int_distribution< std::size_t > distribution{ 0, static_cast< std::size_t >( 1) << collisions }; + const std::size_t z = distribution( generator); ++collisions; for ( std::size_t i = 0; i < z; ++i) { + // -> reduces the power consumed by the CPU + // -> prevent pipeline stalls cpu_relax(); } } else { diff --git a/boost/fiber/detail/spinlock_ttas_adaptive_futex.hpp b/boost/fiber/detail/spinlock_ttas_adaptive_futex.hpp index fbd6a0e4d2..61ab47691e 100644 --- a/boost/fiber/detail/spinlock_ttas_adaptive_futex.hpp +++ b/boost/fiber/detail/spinlock_ttas_adaptive_futex.hpp @@ -26,11 +26,8 @@ namespace detail { class spinlock_ttas_adaptive_futex { private: - // align shared variable 'value_' at cache line to prevent false sharing - alignas(cache_alignment) std::atomic< std::int32_t > value_{ 0 }; - std::atomic< std::int32_t > tests_{ 0 }; - // padding to avoid other data one the cacheline of shared variable 'value_' - char pad_[cacheline_length]; + std::atomic< std::int32_t > value_{ 0 }; + std::atomic< std::int32_t > tests_{ 0 }; public: spinlock_ttas_adaptive_futex() noexcept = default; @@ -61,6 +58,7 @@ public: // delays the next instruction's execution for a finite period of time (depends on processor family) // the CPU is not under demand, parts of the pipeline are no longer being used // -> reduces the power consumed by the CPU + // -> prevent pipeline stalls cpu_relax(); #else // std::this_thread::yield() allows this_thread to give up the remaining part of its time slice, @@ -73,10 +71,12 @@ public: // utilize 'Binary Exponential Backoff' algorithm // linear_congruential_engine is a random number engine based on Linear congruential generator (LCG) static thread_local std::minstd_rand generator; - const std::int32_t z = std::uniform_int_distribution< std::int32_t >{ - 0, static_cast< std::int32_t >( 1) << collisions }( generator); + static std::uniform_int_distribution< std::int32_t > distribution{ 0, static_cast< std::int32_t >( 1) << collisions }; + const std::int32_t z = distribution( generator); ++collisions; for ( std::int32_t i = 0; i < z; ++i) { + // -> reduces the power consumed by the CPU + // -> prevent pipeline stalls cpu_relax(); } } else { diff --git a/boost/fiber/detail/spinlock_ttas_futex.hpp b/boost/fiber/detail/spinlock_ttas_futex.hpp index b11e63b587..a427b73ba5 100644 --- a/boost/fiber/detail/spinlock_ttas_futex.hpp +++ b/boost/fiber/detail/spinlock_ttas_futex.hpp @@ -25,10 +25,7 @@ namespace detail { class spinlock_ttas_futex { private: - // align shared variable 'value_' at cache line to prevent false sharing - alignas(cache_alignment) std::atomic< std::int32_t > value_{ 0 }; - // padding to avoid other data one the cacheline of shared variable 'value_' - char pad_[cacheline_length]; + std::atomic< std::int32_t > value_{ 0 }; public: spinlock_ttas_futex() noexcept = default; @@ -57,6 +54,7 @@ public: // delays the next instruction's execution for a finite period of time (depends on processor family) // the CPU is not under demand, parts of the pipeline are no longer being used // -> reduces the power consumed by the CPU + // -> prevent pipeline stalls cpu_relax(); #else // std::this_thread::yield() allows this_thread to give up the remaining part of its time 
slice, @@ -69,10 +67,12 @@ public: // utilize 'Binary Exponential Backoff' algorithm // linear_congruential_engine is a random number engine based on Linear congruential generator (LCG) static thread_local std::minstd_rand generator; - const std::int32_t z = std::uniform_int_distribution< std::int32_t >{ - 0, static_cast< std::int32_t >( 1) << collisions }( generator); + static std::uniform_int_distribution< std::int32_t > distribution{ 0, static_cast< std::int32_t >( 1) << collisions }; + const std::int32_t z = distribution( generator); ++collisions; for ( std::int32_t i = 0; i < z; ++i) { + // -> reduces the power consumed by the CPU + // -> prevent pipeline stalls cpu_relax(); } } else { diff --git a/boost/fiber/detail/wrap.hpp b/boost/fiber/detail/wrap.hpp index 0369e61ee6..558de6bd94 100644 --- a/boost/fiber/detail/wrap.hpp +++ b/boost/fiber/detail/wrap.hpp @@ -10,8 +10,14 @@ #include <type_traits> #include <boost/config.hpp> +#if defined(BOOST_NO_CXX17_STD_INVOKE) #include <boost/context/detail/invoke.hpp> -#include <boost/context/execution_context.hpp> +#endif +#if (BOOST_EXECUTION_CONTEXT==1) +# include <boost/context/execution_context.hpp> +#else +# include <boost/context/continuation.hpp> +#endif #include <boost/fiber/detail/config.hpp> #include <boost/fiber/detail/data.hpp> @@ -36,9 +42,9 @@ private: public: wrapper( Fn1 && fn1, Fn2 && fn2, Tpl && tpl, boost::context::execution_context const& ctx) : - fn1_( std::move( fn1) ), - fn2_( std::move( fn2) ), - tpl_( std::move( tpl) ), + fn1_{ std::move( fn1) }, + fn2_{ std::move( fn2) }, + tpl_{ std::move( tpl) }, ctx_{ ctx } { } @@ -49,9 +55,11 @@ public: wrapper & operator=( wrapper && other) = default; void operator()( void * vp) { - boost::context::detail::invoke( - std::move( fn1_), - fn2_, tpl_, ctx_, vp); +#if defined(BOOST_NO_CXX17_STD_INVOKE) + boost::context::detail::invoke( std::move( fn1_), fn2_, tpl_, ctx_, vp); +#else + std::invoke( std::move( fn1_), fn2_, tpl_, ctx_, vp); +#endif } }; @@ -59,11 +67,11 @@ template< typename Fn1, typename Fn2, typename Tpl > wrapper< Fn1, Fn2, Tpl > wrap( Fn1 && fn1, Fn2 && fn2, Tpl && tpl, boost::context::execution_context const& ctx) { - return wrapper< Fn1, Fn2, Tpl >( + return wrapper< Fn1, Fn2, Tpl >{ std::forward< Fn1 >( fn1), std::forward< Fn2 >( fn2), std::forward< Tpl >( tpl), - ctx); + ctx }; } #else template< typename Fn1, typename Fn2, typename Tpl > @@ -75,9 +83,9 @@ private: public: wrapper( Fn1 && fn1, Fn2 && fn2, Tpl && tpl) : - fn1_( std::move( fn1) ), - fn2_( std::move( fn2) ), - tpl_( std::move( tpl) ) { + fn1_{ std::move( fn1) }, + fn2_{ std::move( fn2) }, + tpl_{ std::move( tpl) } { } wrapper( wrapper const&) = delete; @@ -86,24 +94,31 @@ public: wrapper( wrapper && other) = default; wrapper & operator=( wrapper && other) = default; - boost::context::execution_context< data_t * > - operator()( boost::context::execution_context< data_t * > && ctx, data_t * dp) { + boost::context::continuation + operator()( boost::context::continuation && c) { +#if defined(BOOST_NO_CXX17_STD_INVOKE) return boost::context::detail::invoke( std::move( fn1_), fn2_, tpl_, - std::forward< boost::context::execution_context< data_t * > >( ctx), - dp); + std::forward< boost::context::continuation >( c) ); +#else + return std::invoke( + std::move( fn1_), + fn2_, + tpl_, + std::forward< boost::context::continuation >( c) ); +#endif } }; template< typename Fn1, typename Fn2, typename Tpl > wrapper< Fn1, Fn2, Tpl > wrap( Fn1 && fn1, Fn2 && fn2, Tpl && tpl) { - return wrapper< Fn1, Fn2, Tpl >( + 
return wrapper< Fn1, Fn2, Tpl >{ std::forward< Fn1 >( fn1), std::forward< Fn2 >( fn2), - std::forward< Tpl >( tpl) ); + std::forward< Tpl >( tpl) }; } #endif
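One pattern recurs across all four spinlock headers touched above: test-and-test-and-set spinning with cpu_relax() for up to BOOST_FIBERS_SPIN_MAX_TESTS iterations (raised from 100 to 500 in config.hpp), then sleep_for(0us) instead of yield(), and binary exponential backoff after a failed exchange. The following self-contained sketch condenses that pattern; cpu_relax() is approximated with std::this_thread::yield(), the backoff distribution is kept local with a capped window, and the adaptive/futex variants are omitted.

// Condensed, self-contained sketch of the TTAS + binary-exponential-backoff
// scheme used by the spinlock_ttas* classes in this diff; not a drop-in
// replacement for any of them.
#include <atomic>
#include <chrono>
#include <cstddef>
#include <random>
#include <thread>

class ttas_spinlock {
    enum class state { locked, unlocked };
    static constexpr std::size_t spin_max_tests = 500;   // stands in for BOOST_FIBERS_SPIN_MAX_TESTS

    std::atomic< state > state_{ state::unlocked };

public:
    void lock() noexcept {
        std::size_t collisions = 0;
        for (;;) {
            std::size_t tests = 0;
            // test before test-and-set: spin on the cheap relaxed load first
            while ( state::locked == state_.load( std::memory_order_relaxed) ) {
                if ( spin_max_tests > tests) {
                    ++tests;
                    std::this_thread::yield();    // stand-in for cpu_relax()
                } else {
                    // relinquish the rest of the time slice to a thread of
                    // equal or greater priority, if one is ready to run
                    std::this_thread::sleep_for( std::chrono::microseconds{ 0 });
                }
            }
            if ( state::unlocked ==
                     state_.exchange( state::locked, std::memory_order_acquire) ) {
                return;                           // lock acquired
            }
            // lost the race against another locker: back off for a random
            // number of relax iterations, doubling the window each time
            static thread_local std::minstd_rand generator;
            const std::size_t window = ( collisions < 16) ? collisions : 16;
            std::uniform_int_distribution< std::size_t > distribution{
                0, static_cast< std::size_t >( 1) << window };
            const std::size_t z = distribution( generator);
            ++collisions;
            for ( std::size_t i = 0; i < z; ++i) {
                std::this_thread::yield();
            }
        }
    }

    void unlock() noexcept {
        state_.store( state::unlocked, std::memory_order_release);
    }
};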