diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index 0d48896dc..5585a4fd6 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -30,6 +30,7 @@ endif()
 
 if(BOOST_CAPY_BUILD_CUDA_EXAMPLES)
     add_subdirectory(cuda/datamovement)
+    add_subdirectory(cuda/notification-strategies)
 endif()
 
 if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
diff --git a/example/cuda/notification-strategies/CMakeLists.txt b/example/cuda/notification-strategies/CMakeLists.txt
new file mode 100644
index 000000000..16cc44483
--- /dev/null
+++ b/example/cuda/notification-strategies/CMakeLists.txt
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2026 Steve Gerbino
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+# CUDA was enabled at the top level when the option was flipped on.
+if(NOT CMAKE_CUDA_COMPILER)
+    message(FATAL_ERROR
+        "example/cuda/notification-strategies requires CUDA; "
+        "did you set BOOST_CAPY_BUILD_CUDA_EXAMPLES?")
+endif()
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
+    *.cu *.cuh *.hpp
+    CMakeLists.txt
+    README.md)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_cuda_notification_strategies ${PFILES})
+
+set_target_properties(capy_example_cuda_notification_strategies PROPERTIES
+    FOLDER "examples"
+    CUDA_STANDARD 20
+    CUDA_STANDARD_REQUIRED ON
+    CUDA_SEPARABLE_COMPILATION OFF)
+
+target_compile_features(capy_example_cuda_notification_strategies PRIVATE cxx_std_20)
+
+target_link_libraries(capy_example_cuda_notification_strategies PRIVATE
+    Boost::capy
+    CUDA::cudart)
diff --git a/example/cuda/notification-strategies/README.md b/example/cuda/notification-strategies/README.md
new file mode 100644
index 000000000..0a69765ca
--- /dev/null
+++ b/example/cuda/notification-strategies/README.md
@@ -0,0 +1,81 @@
+# CUDA notification-strategies example
+
+One GPU completion, three notification mechanisms, one protocol.
+
+The same pipeline (fill a device buffer, copy it back to host, await the
+CUDA stream) is awaited three structurally different ways. All three are
+`boost::capy::IoAwaitable`s, and all three produce the identical result
+at runtime. This demonstrates that the IoAwaitable protocol is
+independent of how asynchronous completion is detected: a host-function
+callback is only one option, and not the best-scaling one.
+
+## The three mechanisms
+
+| Mechanism | How completion is detected | Resumes on |
+|-----------|----------------------------|------------|
+| `callback_awaitable` | `cudaLaunchHostFunc` enqueued on the stream | a CUDA driver thread re-posts through the executor |
+| `poll_awaitable` | a service thread loops `cudaEventQuery` on a recorded event | the poll thread posts when ready |
+| `deferred_sync_awaitable` | a service thread runs blocking `cudaStreamSynchronize` | the service thread posts when it returns |
+
+Each awaitable captures the executor and posts the continuation through
+it, so the coroutine always resumes on a worker thread, never on a CUDA
+or service thread.
+
+The callback mechanism is the only one that cannot report a stream error
+through its host function; `cudaLaunchHostFunc` does not pass completion
+status to the callback, so `callback_awaitable` always resumes with
+success — this is an inherent limitation of the API.
+
+### Service lifetime
+
+`poll_service` and `sync_service` own threads that post continuations to
+the worker executor. Construct them after the worker `thread_pool` and
+destroy them before it, so no continuation is ever posted to a destroyed
+executor. The driver joins every pipeline (via a `std::latch`) before
+shutdown, so no wait is outstanding at teardown.
+
+## Scaling tradeoff
+
+This example proves the three mechanisms are *equivalent in result*. It
+does not measure their *throughput under load*, which a single-GPU
+developer box cannot show. That comparison needs many worker threads
+driving a server-class GPU.
+
+For that measurement: E. Cano, M. Fila, A. Krasznahorkay,
+"Scheduling for Next Generation Triggers", CHEP 2026,
+<https://indico.cern.ch/event/1471803/contributions/6967272/>. They
+report that the CUDA host-function callback handler scales poorly as the
+number of worker threads grows, while event polling and deferred
+synchronization remain stable. In a multi-threaded framework, prefer the
+poll or deferred-sync mechanisms; reach for the callback mechanism for
+its simplicity in low-concurrency settings.
+
+## Prerequisites
+
+- NVIDIA GPU and driver visible to `nvidia-smi`.
+- CUDA toolkit 13.x.
+- clang as host and CUDA compiler (verified with clang 22).
+- `CMAKE_CXX_STANDARD=20`.
+
+## Building and running
+
+```
+CXX=clang++ cmake -S . -B build-cuda -G Ninja \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_CXX_STANDARD=20 \
+    -DBOOST_CAPY_BUILD_CUDA_EXAMPLES=ON \
+    -DCMAKE_CUDA_COMPILER=clang++ \
+    -DCMAKE_CUDA_HOST_COMPILER=clang++ \
+    -DCMAKE_CUDA_ARCHITECTURES=89 \
+    -DCUDAToolkit_ROOT=/opt/cuda
+cmake --build build-cuda --config Release \
+    --target capy_example_cuda_notification_strategies
+./build-cuda/example/cuda/notification-strategies/capy_example_cuda_notification_strategies
+```
+
+Replace `89` with your GPU's compute capability
+(`nvidia-smi --query-gpu=compute_cap --format=csv,noheader`).
+
+Unlike the sibling `cuda/datamovement` example, this one is meant to be
+run. The pass condition is all three mechanisms printing the same
+checksum and a zero exit code.
diff --git a/example/cuda/notification-strategies/notification_strategies.cu b/example/cuda/notification-strategies/notification_strategies.cu
new file mode 100644
index 000000000..733422bf0
--- /dev/null
+++ b/example/cuda/notification-strategies/notification_strategies.cu
@@ -0,0 +1,200 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// One GPU completion, three notification mechanisms.
+//
+// The IoAwaitable protocol does not care how completion is detected. The
+// same pipeline (fill a device buffer, copy it back, await the stream)
+// runs once per mechanism (host-function callback, event polling,
+// deferred blocking synchronize) and all three must produce the same
+// checksum. See README.md for the multi-threaded scaling tradeoff, which
+// a single-GPU box cannot measure.
+
+#include "notification_strategies.hpp"
+
+#include <boost/capy.hpp>
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/ex/thread_pool.hpp>
+
+#include <cstdlib>
+#include <iostream>
+#include <latch>
+#include <system_error>
+#include <type_traits>
+#include <vector>
+
+namespace capy = boost::capy;
+namespace ex   = boost::capy::example;
+
+static_assert(std::is_same_v<
+    decltype(ex::make_cuda_error(cudaSuccess)), std::error_code>);
+
+// The whole point: three different mechanisms, all IoAwaitables.
+static_assert(capy::IoAwaitable<ex::callback_awaitable>);
+static_assert(capy::IoAwaitable<ex::poll_awaitable>);
+static_assert(capy::IoAwaitable<ex::deferred_sync_awaitable>);
+
+namespace {
+
+constexpr int buffer_len = 256;
+constexpr int fill_value = 7;
+
+__global__ void
+fill_kernel(int* p, int n, int v)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i < n)
+        p[i] = v;
+}
+
+enum class notify
+{
+    callback,
+    poll,
+    deferred_sync
+};
+
+char const*
+name_of(notify how) noexcept
+{
+    switch(how)
+    {
+    case notify::callback:      return "callback";
+    case notify::poll:          return "poll";
+    case notify::deferred_sync: return "deferred-sync";
+    }
+    return "?";
+}
+
+// Await the stream's completion using the selected mechanism. poll waits
+// on the event; callback and deferred-sync wait on the stream.
+capy::task<std::error_code>
+wait(ex::cuda_stream& stream,
+     ex::cuda_event& event,
+     notify how,
+     ex::poll_service& poll_svc,
+     ex::sync_service& sync_svc)
+{
+    switch(how)
+    {
+    case notify::callback:
+        co_return co_await stream.sync_via_callback();
+    case notify::poll:
+        co_return co_await event.sync_via_poll(poll_svc);
+    case notify::deferred_sync:
+        co_return co_await stream.sync_via_deferred(sync_svc);
+    }
+    co_return std::error_code{};
+}
+
+// Fill a device buffer, copy it back, await completion via `how`, and
+// return the host-side checksum. Identical across mechanisms.
+capy::task<long>
+run_pipeline(ex::cuda_stream& stream,
+             ex::cuda_event& event,
+             notify how,
+             ex::poll_service& poll_svc,
+             ex::sync_service& sync_svc)
+{
+    auto s = stream.native_handle();
+
+    int* d_buf = nullptr;
+    auto err = cudaMallocAsync(
+        reinterpret_cast<void**>(&d_buf),
+        buffer_len * sizeof(int), s);
+    if(err != cudaSuccess)
+        co_return -1;
+
+    fill_kernel<<<(buffer_len + 63) / 64, 64, 0, s>>>(
+        d_buf, buffer_len, fill_value);
+
+    std::vector<int> host(buffer_len, 0);
+    cudaMemcpyAsync(
+        host.data(), d_buf, buffer_len * sizeof(int),
+        cudaMemcpyDeviceToHost, s);
+    cudaFreeAsync(d_buf, s);
+    event.record(s);
+
+    auto ec = co_await wait(stream, event, how, poll_svc, sync_svc);
+    if(ec)
+        co_return -1;
+
+    long sum = 0;
+    for(int v : host)
+        sum += v;
+    co_return sum;
+}
+
+// Drive one pipeline run to completion on `pool` and return its checksum.
+long
+run_one(capy::thread_pool& pool,
+        ex::cuda_stream& stream,
+        ex::cuda_event& event,
+        notify how,
+        ex::poll_service& poll_svc,
+        ex::sync_service& sync_svc)
+{
+    long result = 0;
+    std::latch done{1};
+    capy::run_async(pool.get_executor(),
+        [&](long r) { result = r; done.count_down(); })(
+            run_pipeline(stream, event, how, poll_svc, sync_svc));
+    done.wait();
+    return result;
+}
+
+} // namespace
+
+int
+main()
+{
+    int device_count = 0;
+    if(cudaGetDeviceCount(&device_count) != cudaSuccess || device_count == 0)
+    {
+        std::cout << "No CUDA device available.\n";
+        return EXIT_FAILURE;
+    }
+
+    // Declaration order fixes teardown: services stop before the pool
+    // they post to; the stream/event close first of all.
+    capy::thread_pool pool(4);
+    ex::poll_service poll_svc;
+    ex::sync_service sync_svc;
+    ex::cuda_stream stream;
+    ex::cuda_event event;
+
+    notify const modes[] =
+        { notify::callback, notify::poll, notify::deferred_sync };
+
+    long first = 0;
+    bool ok = true;
+    std::cout << "mechanism      checksum\n";
+    for(std::size_t i = 0; i < std::size(modes); ++i)
+    {
+        long r = run_one(pool, stream, event, modes[i], poll_svc, sync_svc);
+        std::cout << name_of(modes[i]) << "   " << r << "\n";
+        if(i == 0)
+            first = r;
+        else if(r != first)
+            ok = false;
+    }
+
+    long const expected =
+        static_cast<long>(buffer_len) * fill_value;
+    if(! ok || first != expected)
+    {
+        std::cout << "MISMATCH: mechanisms disagree or wrong result "
+                     "(expected " << expected << ")\n";
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "All three mechanisms produced " << first
+              << " (identical).\n";
+    return EXIT_SUCCESS;
+}
diff --git a/example/cuda/notification-strategies/notification_strategies.hpp b/example/cuda/notification-strategies/notification_strategies.hpp
new file mode 100644
index 000000000..5c0956e68
--- /dev/null
+++ b/example/cuda/notification-strategies/notification_strategies.hpp
@@ -0,0 +1,459 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_EXAMPLE_CUDA_NOTIFICATION_STRATEGIES_HPP
+#define BOOST_CAPY_EXAMPLE_CUDA_NOTIFICATION_STRATEGIES_HPP
+
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/ex/executor_ref.hpp>
+#include <boost/capy/ex/io_env.hpp>
+
+#include <cuda_runtime.h>
+
+#include <condition_variable>
+#include <coroutine>
+#include <cstddef>
+#include <functional>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <system_error>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace boost {
+namespace capy {
+namespace example {
+
+/// Error category mapping `cudaError_t` values to messages.
+class cuda_error_category
+    : public std::error_category
+{
+public:
+    char const* name() const noexcept override
+    {
+        return "cuda";
+    }
+
+    std::string message(int ev) const override
+    {
+        return ::cudaGetErrorString(static_cast<cudaError_t>(ev));
+    }
+};
+
+/// Return the singleton CUDA error category.
+inline std::error_category const& cuda_category() noexcept
+{
+    static cuda_error_category const cat;
+    return cat;
+}
+
+/// Convert a `cudaError_t` to a `std::error_code`.
+inline std::error_code make_cuda_error(cudaError_t e) noexcept
+{
+    return std::error_code(static_cast<int>(e), cuda_category());
+}
+
+/// One pending poll registration. `cont` and `ec` point into the
+/// awaitable's coroutine frame, valid for the suspension's lifetime.
+struct poll_entry
+{
+    cudaEvent_t event;
+    executor_ref ex;
+    continuation* cont;
+    std::error_code* ec;
+};
+
+/// A dedicated thread that polls CUDA events with `cudaEventQuery` and
+/// posts each waiter's continuation once its event reports ready.
+///
+/// This is the notification mechanism that avoids both blocking a worker
+/// thread and registering a driver callback. Must be destroyed before
+/// the executor it posts to; in this example no waits are outstanding at
+/// teardown because the driver joins every pipeline first.
+class poll_service
+{
+    std::mutex mtx_;
+    std::vector<poll_entry> entries_;
+    std::jthread thread_;
+
+    void run(std::stop_token st)
+    {
+        while(! st.stop_requested())
+        {
+            {
+                std::lock_guard<std::mutex> lock(mtx_);
+                for(std::size_t i = 0; i < entries_.size();)
+                {
+                    auto& e = entries_[i];
+                    auto s = cudaEventQuery(e.event);
+                    if(s == cudaErrorNotReady)
+                    {
+                        ++i;
+                        continue;
+                    }
+                    *e.ec = (s == cudaSuccess)
+                        ? std::error_code{}
+                        : make_cuda_error(s);
+                    e.ex.post(*e.cont);
+                    entries_[i] = entries_.back();
+                    entries_.pop_back();
+                }
+            }
+            std::this_thread::yield();
+        }
+    }
+
+public:
+    poll_service()
+        : thread_([this](std::stop_token st) { run(st); })
+    {
+    }
+
+    /// Register a waiter; its continuation posts when `e.event` is ready.
+    void register_wait(poll_entry e)
+    {
+        std::lock_guard<std::mutex> lock(mtx_);
+        entries_.push_back(e);
+    }
+};
+
+/// A single-thread worker that runs blocking jobs off the calling
+/// thread. The deferred-sync mechanism uses it to call the blocking
+/// `cudaStreamSynchronize` without stalling a worker thread, then posts
+/// the continuation. Stops and joins on destruction.
+class sync_service
+{
+    std::mutex mtx_;
+    std::condition_variable cv_;
+    std::queue<std::function<void()>> jobs_;
+    bool stop_ = false;
+    std::jthread thread_;
+
+    void run()
+    {
+        for(;;)
+        {
+            std::function<void()> job;
+            {
+                std::unique_lock<std::mutex> lock(mtx_);
+                cv_.wait(lock, [this] { return stop_ || ! jobs_.empty(); });
+                if(stop_ && jobs_.empty())
+                    return;
+                job = std::move(jobs_.front());
+                jobs_.pop();
+            }
+            job();
+        }
+    }
+
+public:
+    sync_service()
+        : thread_([this] { run(); })
+    {
+    }
+
+    ~sync_service()
+    {
+        {
+            std::lock_guard<std::mutex> lock(mtx_);
+            stop_ = true;
+        }
+        cv_.notify_all();
+    }
+
+    /// Enqueue a job to run on the service thread.
+    void post(std::function<void()> job)
+    {
+        {
+            std::lock_guard<std::mutex> lock(mtx_);
+            jobs_.push(std::move(job));
+        }
+        cv_.notify_one();
+    }
+};
+
+struct callback_awaitable;
+class poll_awaitable;
+class deferred_sync_awaitable;
+
+/// Owns a CUDA stream; created on construction, destroyed on teardown.
+class cuda_stream
+{
+    cudaStream_t stream_ = nullptr;
+
+public:
+    cuda_stream()
+    {
+        auto err = cudaStreamCreate(&stream_);
+        if(err != cudaSuccess)
+            throw std::system_error(make_cuda_error(err));
+    }
+
+    ~cuda_stream()
+    {
+        if(stream_)
+            cudaStreamDestroy(stream_);
+    }
+
+    cuda_stream(cuda_stream&& other) noexcept
+        : stream_(std::exchange(other.stream_, nullptr))
+    {
+    }
+
+    cuda_stream& operator=(cuda_stream&& other) noexcept
+    {
+        if(this != &other)
+        {
+            if(stream_)
+                cudaStreamDestroy(stream_);
+            stream_ = std::exchange(other.stream_, nullptr);
+        }
+        return *this;
+    }
+
+    cuda_stream(cuda_stream const&) = delete;
+    cuda_stream& operator=(cuda_stream const&) = delete;
+
+    /// Return the underlying CUDA stream handle.
+    cudaStream_t native_handle() const noexcept
+    {
+        return stream_;
+    }
+
+    /// Return an IoAwaitable that completes via a host-function callback.
+    callback_awaitable sync_via_callback() noexcept;
+
+    /// Return an IoAwaitable that completes after a blocking synchronize
+    /// runs on `svc`.
+    deferred_sync_awaitable sync_via_deferred(sync_service& svc) noexcept;
+};
+
+/// Owns a CUDA event used to observe stream progress.
+class cuda_event
+{
+    cudaEvent_t event_ = nullptr;
+
+public:
+    cuda_event()
+    {
+        auto err = cudaEventCreateWithFlags(&event_, cudaEventDisableTiming);
+        if(err != cudaSuccess)
+            throw std::system_error(make_cuda_error(err));
+    }
+
+    ~cuda_event()
+    {
+        if(event_)
+            cudaEventDestroy(event_);
+    }
+
+    cuda_event(cuda_event&& other) noexcept
+        : event_(std::exchange(other.event_, nullptr))
+    {
+    }
+
+    cuda_event& operator=(cuda_event&& other) noexcept
+    {
+        if(this != &other)
+        {
+            if(event_)
+                cudaEventDestroy(event_);
+            event_ = std::exchange(other.event_, nullptr);
+        }
+        return *this;
+    }
+
+    cuda_event(cuda_event const&) = delete;
+    cuda_event& operator=(cuda_event const&) = delete;
+
+    /// Return the underlying CUDA event handle.
+    cudaEvent_t native_handle() const noexcept
+    {
+        return event_;
+    }
+
+    /// Record this event into `stream` at the current queue position.
+    void record(cudaStream_t stream)
+    {
+        auto err = cudaEventRecord(event_, stream);
+        if(err != cudaSuccess)
+            throw std::system_error(make_cuda_error(err));
+    }
+
+    /// Return an IoAwaitable that completes when this event is observed
+    /// ready by `svc`.
+    poll_awaitable sync_via_poll(poll_service& svc) noexcept;
+};
+
+/** IoAwaitable that resumes via a CUDA host-function callback.
+
+    `cudaLaunchHostFunc` enqueues a host callback at the stream's current
+    position. CUDA runs it on a driver-owned thread that must not make
+    CUDA calls or resume the coroutine inline, so the callback posts the
+    continuation through the captured executor instead. One operation is
+    in flight at a time, so the resume context is a member rather than a
+    per-operation allocation.
+*/
+struct callback_awaitable
+{
+    cudaStream_t stream;
+
+    struct resume_ctx
+    {
+        executor_ref ex;
+        continuation* cont = nullptr;
+        std::error_code* ec = nullptr;
+    };
+
+    continuation cont_;
+    std::error_code ec_;
+    resume_ctx ctx_;
+
+    static void CUDART_CB
+    on_complete(void* arg)
+    {
+        auto* c = static_cast<resume_ctx*>(arg);
+        *c->ec = std::error_code{};
+        c->ex.post(*c->cont);
+    }
+
+    bool await_ready() const noexcept
+    {
+        return false;
+    }
+
+    std::coroutine_handle<>
+    await_suspend(std::coroutine_handle<> h, io_env const* env)
+    {
+        cont_.h = h;
+        ctx_ = resume_ctx{env->executor, &cont_, &ec_};
+        auto err = cudaLaunchHostFunc(stream, &on_complete, &ctx_);
+        if(err != cudaSuccess)
+        {
+            // Could not register: resume inline so we never deadlock.
+            ec_ = make_cuda_error(err);
+            return h;
+        }
+        return std::noop_coroutine();
+    }
+
+    std::error_code await_resume() const noexcept
+    {
+        return ec_;
+    }
+};
+
+inline callback_awaitable
+cuda_stream::sync_via_callback() noexcept
+{
+    return callback_awaitable{stream_};
+}
+
+/// IoAwaitable that resumes when a CUDA event reports ready, detected by
+/// a polling service thread rather than a callback or a blocking wait.
+///
+/// @note A production awaitable should also check `env->stop_token` for
+///     cancellation in `await_suspend`; these demo awaitables omit that
+///     for brevity.
+class poll_awaitable
+{
+    poll_service& svc_;
+    cudaEvent_t event_;
+    continuation cont_;
+    std::error_code ec_;
+
+public:
+    poll_awaitable(poll_service& svc, cudaEvent_t event) noexcept
+        : svc_(svc)
+        , event_(event)
+    {
+    }
+
+    bool await_ready() const noexcept
+    {
+        return false;
+    }
+
+    std::coroutine_handle<>
+    await_suspend(std::coroutine_handle<> h, io_env const* env)
+    {
+        cont_.h = h;
+        svc_.register_wait(poll_entry{event_, env->executor, &cont_, &ec_});
+        return std::noop_coroutine();
+    }
+
+    std::error_code await_resume() const noexcept
+    {
+        return ec_;
+    }
+};
+
+inline poll_awaitable
+cuda_event::sync_via_poll(poll_service& svc) noexcept
+{
+    return poll_awaitable{svc, event_};
+}
+
+/// IoAwaitable that resumes after a blocking `cudaStreamSynchronize` runs
+/// on a service thread, keeping the worker thread free meanwhile.
+class deferred_sync_awaitable
+{
+    sync_service& svc_;
+    cudaStream_t stream_;
+    continuation cont_;
+    std::error_code ec_;
+
+public:
+    deferred_sync_awaitable(sync_service& svc, cudaStream_t stream) noexcept
+        : svc_(svc)
+        , stream_(stream)
+    {
+    }
+
+    bool await_ready() const noexcept
+    {
+        return false;
+    }
+
+    std::coroutine_handle<>
+    await_suspend(std::coroutine_handle<> h, io_env const* env)
+    {
+        auto ex = env->executor;
+        auto stream = stream_;
+        cont_.h = h;
+        svc_.post([ex, stream, ec = &ec_, cont = &cont_]() mutable
+        {
+            auto err = cudaStreamSynchronize(stream);
+            *ec = (err == cudaSuccess)
+                ? std::error_code{}
+                : make_cuda_error(err);
+            ex.post(*cont);
+        });
+        return std::noop_coroutine();
+    }
+
+    std::error_code await_resume() const noexcept
+    {
+        return ec_;
+    }
+};
+
+inline deferred_sync_awaitable
+cuda_stream::sync_via_deferred(sync_service& svc) noexcept
+{
+    return deferred_sync_awaitable{svc, stream_};
+}
+
+} // namespace example
+} // namespace capy
+} // namespace boost
+
+#endif