test_alternator: add test_alternator_invalid_shard_for_lwt

This test reproduces scylladb/scylladb#27353 using two injection points. First, the test triggers an intra-node tablet migration and suspends it at the streaming stage using the intranode_migration_streaming_wait injection. Next, it enables the alternator_executor_batch_write_wait injection, which suspends a batch write after its cas_shard has already been created. The test then issues several batch writes and waits until one of them hits this injection on the destination shard. At this point, the cas_shard.erm for that write is still in the streaming state, meaning the executor would need to jump back to the source shard. The test then resumes the suspended tablet migration, allowing it to update the ERM on the source shard to write_both_read_new. After that, the test releases the suspended batch write and expects it to perform two shard jumps: first from the destination to the source shard, and then again back to the source shard. This commit adds the alternator_executor_batch_write_wait injection to alternator/executor.cc. Coroutines are intentionally avoided in the parallel_for_each lambda to prevent unnecessary coroutine-frame allocations. (cherry picked from commit e60bcd0011)
2025-12-02 18:19:58 +01:00
parent b88ed6156b
commit 37c575c104
2 changed files with 134 additions and 0 deletions
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -3123,6 +3123,21 @@ static future<> do_batch_write(service::storage_proxy& proxy,
        return parallel_for_each(*key_builders_ptr, [&proxy, &client_state, &stats, trace_state, ssg, permit = std::move(permit)] (const auto& e) {
            stats.write_using_lwt++;
            auto desired_shard = service::cas_shard(*e.first.schema, e.first.dk.token());
+            auto s = e.first.schema;
+
+            static const auto* injection_name = "alternator_executor_batch_write_wait";
+            return utils::get_local_injector().inject(injection_name, [s = std::move(s)] (auto& handler) -> future<> {
+                const auto ks = handler.get("keyspace");
+                const auto cf = handler.get("table");
+                const auto shard = std::atoll(handler.get("shard")->data());
+                if (ks == s->ks_name() && cf == s->cf_name() && shard == this_shard_id()) {
+                    elogger.info("{}: hit", injection_name);
+                    co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
+                    elogger.info("{}: continue", injection_name);
+                }
+            }).then([&e, desired_shard = std::move(desired_shard),
+                 &client_state, trace_state = std::move(trace_state), permit = std::move(permit), &proxy, ssg, &stats]() mutable
+            {
            if (desired_shard.this_shard()) {
                return cas_write(proxy, e.first.schema, std::move(desired_shard), e.first.dk, e.second, client_state, trace_state, permit);
            } else {
@@ -3154,6 +3169,7 @@ static future<> do_batch_write(service::storage_proxy& proxy,
                    });
                }).finally([desired_shard = std::move(desired_shard)]{});
            }
+            });
        }).finally([key_builders = std::move(key_builders)]{});
    }
 }
--- a/test/cluster/test_alternator.py
+++ b/test/cluster/test_alternator.py
@@ -25,11 +25,13 @@ import json
 from cassandra.auth import PlainTextAuthProvider
 import threading
 import random
+import re

 from test.pylib.manager_client import ManagerClient
 from test.pylib.util import wait_for
 from test.pylib.tablets import get_all_tablet_replicas
 from test.cluster.conftest import skip_mode
+from test.pylib.tablets import get_tablet_replica

 logger = logging.getLogger(__name__)

@@ -970,3 +972,119 @@ async def test_alternator_concurrent_rmw_same_partition_different_server(manager
            t.join()
    finally:
        table.delete()
+
+
+@pytest.mark.xfail(reason="#27353")
+@pytest.mark.asyncio
+@skip_mode('release', 'error injections are not supported in release mode')
+async def test_alternator_invalid_shard_for_lwt(manager: ManagerClient):
+    """
+    Reproducer for issue #27353.
+
+    LWT requires that storage_proxy::cas() is invoked on a valid shard — the one
+    returned by sharder.try_get_shard_for_reads() for a tablets-based table.
+
+    The bug: if the current shard is invalid and we jump to the valid shard, that
+    new shard may become invalid again by the time we attempt to capture the ERM.
+    This leads to a failure of the CAS path.
+
+    The fix: retry the validity check and jump again if the current shard is already
+    invalid. We should exit the loop once the shard is valid *and* we hold a strong pointer
+    to the ERM — which prevents further tablet movements until the ERM is released.
+
+    This problem is specific to BatchWriteItem; other commands are already handled
+    correctly.
+    """
+    config = alternator_config.copy()
+    config['alternator_write_isolation'] = 'always_use_lwt'
+    cmdline = [
+        '--logger-log-level', 'alternator-executor=trace',
+        '--logger-log-level', 'alternator_controller=trace',
+        '--logger-log-level', 'paxos=trace'
+    ]
+    server = await manager.server_add(config=config, cmdline=cmdline)
+    alternator = get_alternator(server.ip_addr)
+
+    logger.info("Creating alternator test table")
+    table = alternator.create_table(TableName=unique_table_name(),
+                                    Tags=[{'Key': 'system:initial_tablets', 'Value': '1'}],
+                                    BillingMode='PAY_PER_REQUEST',
+                                    KeySchema=[{'AttributeName': 'p', 'KeyType': 'HASH'}],
+                                    AttributeDefinitions=[{'AttributeName': 'p', 'AttributeType': 'N'}])
+    table_name = table.name
+    ks_name = 'alternator_' + table_name
+    last_token = 7 # Any token works since we have only one tablet
+    
+    (src_host_id, src_shard) = await get_tablet_replica(manager, server, ks_name, table_name, last_token)
+    dst_shard = 0 if src_shard == 1 else 1
+
+    logger.info("Inject 'intranode_migration_streaming_wait'")
+    await manager.api.enable_injection(server.ip_addr,
+                                       "intranode_migration_streaming_wait",
+                                       one_shot=False)
+
+    logger.info("Start tablet migration")
+    intranode_migration_task = asyncio.create_task(
+        manager.api.move_tablet(server.ip_addr, ks_name, table_name,
+                                src_host_id, src_shard,
+                                src_host_id, dst_shard, last_token))
+
+    logger.info("Open server logs")
+    log = await manager.server_open_log(server.server_id)
+
+    logger.info("Wait for intranode_migration_streaming_wait")
+    await log.wait_for("intranode_migration_streaming: waiting")
+
+    logger.info("Inject 'alternator_executor_batch_write_wait'")
+    await manager.api.enable_injection(server.ip_addr,
+                                       "alternator_executor_batch_write_wait",
+                                       one_shot=False,
+                                       parameters={
+                                           'table': table_name,
+                                           'keyspace': ks_name,
+                                           'shard': dst_shard
+                                       })
+    m = await log.mark()
+
+    # Start a background thread, which tries to hit the alternator_executor_batch_write_wait
+    # injection on the destination shard.
+    logger.info("Start a batch_write thread")
+    stop_event = threading.Event()
+    def run_batch():
+        alternator = get_alternator(server.ip_addr)
+        table = alternator.Table(table_name)
+        while not stop_event.is_set():
+            with table.batch_writer() as batch:
+                batch.put_item(Item={'p': 1, 'x': 'hellow world'})
+    t = ThreadWrapper(target=run_batch)
+    t.start()
+
+    logger.info("Waiting for 'alternator_executor_batch_write_wait: hit'")
+    await log.wait_for("alternator_executor_batch_write_wait: hit", from_mark=m)
+
+    # We have a batch request with "streaming" cas_shard on the destination shard.
+    # This means we have already made a decision to jump to the src_shard.
+    # Now we're releasing the tablet migration so that it reaches write_both_read_new and
+    # and invaldiates this decision.
+
+    m = await log.mark()
+    await manager.api.message_injection(server.ip_addr, "intranode_migration_streaming_wait")
+
+    # The next barrier must be for the write_both_read_new, we need a guarantee
+    # that the src_shard observed it
+    logger.info("Waiting for the next barrier")
+    await log.wait_for(re.escape(f"[shard {src_shard}: gms] raft_topology - raft_topology_cmd::barrier_and_drain done"),
+                       from_mark=m)
+
+    # Now we have a guarantee that a new barrier succeeded on the src_shard,
+    # this means the src_shard has already transitioned to write_both_read_new,
+    # and our batch write will have to jump back to the destination shard.
+
+    logger.info("Release the 'alternator_executor_batch_write_wait'")
+    await manager.api.message_injection(server.ip_addr, "alternator_executor_batch_write_wait")
+
+    logger.info("Waiting for migratino task to finish")
+    await intranode_migration_task
+
+    stop_event.set()
+    t.join()