Initial setup: Fix configure.py for Ubuntu/Debian platform

Temporarily add Ubuntu/Debian support in kmiplib() function to allow configuration to proceed on Ubuntu systems. Co-authored-by: nyh <584227+nyh@users.noreply.github.com>
Initial plan
2025-12-16 11:18:56 +00:00 · 2025-12-16 10:58:17 +00:00 · 2025-12-16 06:58:53 +02:00 · 2025-12-16 06:57:19 +02:00 · 2025-12-16 06:56:39 +02:00 · 2025-12-16 06:56:00 +02:00
82 changed files with 2398 additions and 359 deletions
--- a/.github/workflows/docs-validate-metrics.yml
+++ b/.github/workflows/docs-validate-metrics.yml
@@ -7,7 +7,7 @@ on:
      - enterprise
    paths:
      - '**/*.cc'
-      - 'scripts/metrics-config.yml' 
+      - 'scripts/metrics-config.yml'
      - 'scripts/get_description.py'
      - 'docs/_ext/scylladb_metrics.py'

@@ -15,20 +15,20 @@ jobs:
  validate-metrics:
    runs-on: ubuntu-latest
    name: Check metrics documentation coverage
-    
+
    steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        submodules: true
-      
+
    - name: Set up Python
      uses: actions/setup-python@v6
      with:
        python-version: '3.10'
-        
+
    - name: Install dependencies
      run: pip install PyYAML
-        
+
    - name: Validate metrics
      run: python3 scripts/get_description.py --validate -c scripts/metrics-config.yml
--- a/alternator/executor.cc
+++ b/alternator/executor.cc
@@ -1799,6 +1799,11 @@ static future<executor::request_return_type> create_table_on_shard0(service::cli
                }
            }
        }
+        // Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
+        // GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
+        if (!view_builders.empty() && ksm->uses_tablets() && !sp.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
+            co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
+        }
        try {
            schema_mutations = service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
        } catch (exceptions::already_exists_exception&) {
@@ -2019,6 +2024,10 @@ future<executor::request_return_type> executor::update_table(client_state& clien
                            co_return api_error::validation(fmt::format(
                                "LSI {} already exists in table {}, can't use same name for GSI", index_name, table_name));
                        }
+                        if (p.local().local_db().find_keyspace(keyspace_name).get_replication_strategy().uses_tablets() &&
+                                !p.local().data_dictionary().get_config().rf_rack_valid_keyspaces()) {
+                            co_return api_error::validation("GlobalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
+                        }

                        elogger.trace("Adding GSI {}", index_name);
                        // FIXME: read and handle "Projection" parameter. This will
--- a/auth/allow_all_authenticator.cc
+++ b/auth/allow_all_authenticator.cc
@@ -9,7 +9,6 @@
 #include "auth/allow_all_authenticator.hh"

 #include "service/migration_manager.hh"
-#include "utils/alien_worker.hh"
 #include "utils/class_registrator.hh"

 namespace auth {
@@ -23,7 +22,6 @@ static const class_registrator<
        cql3::query_processor&,
        ::service::raft_group0_client&,
        ::service::migration_manager&,
-        cache&,
-        utils::alien_worker&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");
+        cache&> registration("org.apache.cassandra.auth.AllowAllAuthenticator");

 }
--- a/auth/allow_all_authenticator.hh
+++ b/auth/allow_all_authenticator.hh
@@ -14,7 +14,6 @@
 #include "auth/authenticator.hh"
 #include "auth/cache.hh"
 #include "auth/common.hh"
-#include "utils/alien_worker.hh"

 namespace cql3 {
 class query_processor;
@@ -30,7 +29,7 @@ extern const std::string_view allow_all_authenticator_name;

 class allow_all_authenticator final : public authenticator {
 public:
-    allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&, utils::alien_worker&) {
+    allow_all_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&) {
    }

    virtual future<> start() override {
--- a/auth/certificate_authenticator.cc
+++ b/auth/certificate_authenticator.cc
@@ -35,14 +35,13 @@ static const class_registrator<auth::authenticator
    , cql3::query_processor&
    , ::service::raft_group0_client&
    , ::service::migration_manager&
-    , auth::cache&
-    , utils::alien_worker&> cert_auth_reg(CERT_AUTH_NAME);
+    , auth::cache&> cert_auth_reg(CERT_AUTH_NAME);

 enum class auth::certificate_authenticator::query_source {
    subject, altname
 };

-auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, auth::cache&, utils::alien_worker&)
+auth::certificate_authenticator::certificate_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, auth::cache&)
    : _queries([&] {
        auto& conf = qp.db().get_config();
        auto queries = conf.auth_certificate_role_queries();
--- a/auth/certificate_authenticator.hh
+++ b/auth/certificate_authenticator.hh
@@ -10,7 +10,6 @@
 #pragma once

 #include "auth/authenticator.hh"
-#include "utils/alien_worker.hh"
 #include <boost/regex_fwd.hpp>  // IWYU pragma: keep

 namespace cql3 {
@@ -34,7 +33,7 @@ class certificate_authenticator : public authenticator {
    enum class query_source;
    std::vector<std::pair<query_source, boost::regex>> _queries;
 public:
-    certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&, utils::alien_worker&);
+    certificate_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);
    ~certificate_authenticator();

    future<> start() override;
--- a/auth/password_authenticator.cc
+++ b/auth/password_authenticator.cc
@@ -49,8 +49,7 @@ static const class_registrator<
        cql3::query_processor&,
        ::service::raft_group0_client&,
        ::service::migration_manager&,
-        cache&,
-        utils::alien_worker&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");
+        cache&> password_auth_reg("org.apache.cassandra.auth.PasswordAuthenticator");

 static thread_local auto rng_for_salt = std::default_random_engine(std::random_device{}());

@@ -64,14 +63,13 @@ std::string password_authenticator::default_superuser(const db::config& cfg) {
 password_authenticator::~password_authenticator() {
 }

-password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache, utils::alien_worker& hashing_worker)
+password_authenticator::password_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
    : _qp(qp)
    , _group0_client(g0)
    , _migration_manager(mm)
    , _cache(cache)
    , _stopped(make_ready_future<>()) 
    , _superuser(default_superuser(qp.db().get_config()))
-    , _hashing_worker(hashing_worker)
 {}

 static bool has_salted_hash(const cql3::untyped_result_set_row& row) {
@@ -330,9 +328,7 @@ future<authenticated_user> password_authenticator::authenticate(
            }
            salted_hash = role->salted_hash;
        }
-        const bool password_match = co_await _hashing_worker.submit<bool>([password = std::move(password), salted_hash] {
-            return passwords::check(password, *salted_hash);
-        });
+        const bool password_match = co_await passwords::check(password, *salted_hash);
        if (!password_match) {
            throw exceptions::authentication_exception("Username and/or password are incorrect");
        }
--- a/auth/password_authenticator.hh
+++ b/auth/password_authenticator.hh
@@ -18,7 +18,6 @@
 #include "auth/passwords.hh"
 #include "auth/cache.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/alien_worker.hh"

 namespace db {
    class config;
@@ -49,13 +48,12 @@ class password_authenticator : public authenticator {
    shared_promise<> _superuser_created_promise;
    // We used to also support bcrypt, SHA-256, and MD5 (ref. scylladb#24524).
    constexpr static auth::passwords::scheme _scheme = passwords::scheme::sha_512;
-    utils::alien_worker& _hashing_worker;

 public:
    static db::consistency_level consistency_for_user(std::string_view role_name);
    static std::string default_superuser(const db::config&);

-    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&, utils::alien_worker&);
+    password_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);

    ~password_authenticator();

--- a/auth/passwords.cc
+++ b/auth/passwords.cc
@@ -7,6 +7,8 @@
 */

 #include "auth/passwords.hh"
+#include "utils/crypt_sha512.hh"
+#include <seastar/core/coroutine.hh>

 #include <cerrno>

@@ -21,27 +23,48 @@ static thread_local crypt_data tlcrypt = {};

 namespace detail {

+void verify_hashing_output(const char * res) {
+    if (!res || (res[0] == '*')) {
+        throw std::system_error(errno, std::system_category());
+    }
+}
+
 void verify_scheme(scheme scheme) {
    const sstring random_part_of_salt = "aaaabbbbccccdddd";

    const sstring salt = sstring(prefix_for_scheme(scheme)) + random_part_of_salt;
    const char* e = crypt_r("fisk", salt.c_str(), &tlcrypt);
-
-    if (e && (e[0] != '*')) {
-        return;
+    try {
+        verify_hashing_output(e);
+    } catch (const std::system_error& ex) {
+        throw no_supported_schemes();
    }
-
-    throw no_supported_schemes();
 }

 sstring hash_with_salt(const sstring& pass, const sstring& salt) {
    auto res = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
-    if (!res || (res[0] == '*')) {
-        throw std::system_error(errno, std::system_category());
-    }
+    verify_hashing_output(res);
    return res;
 }

+seastar::future<sstring> hash_with_salt_async(const sstring& pass, const sstring& salt) {
+    sstring res;
+    // Only SHA-512 hashes for passphrases shorter than 256 bytes can be computed using
+    // the __crypt_sha512 method. For other computations, we fall back to the
+    // crypt_r implementation from `<crypt.h>`, which can stall.
+    if (salt.starts_with(prefix_for_scheme(scheme::sha_512)) && pass.size() <= 255) {
+        char buf[128];
+        const char * output_ptr = co_await __crypt_sha512(pass.c_str(), salt.c_str(), buf);
+        verify_hashing_output(output_ptr);
+        res = output_ptr;
+    } else {
+        const char * output_ptr = crypt_r(pass.c_str(), salt.c_str(), &tlcrypt);
+        verify_hashing_output(output_ptr);
+        res = output_ptr;
+    }
+    co_return res;
+}
+
 std::string_view prefix_for_scheme(scheme c) noexcept {
    switch (c) {
    case scheme::bcrypt_y: return "$2y$";
@@ -58,8 +81,9 @@ no_supported_schemes::no_supported_schemes()
        : std::runtime_error("No allowed hashing schemes are supported on this system") {
 }

-bool check(const sstring& pass, const sstring& salted_hash) {
-    return detail::hash_with_salt(pass, salted_hash) == salted_hash;
+seastar::future<bool> check(const sstring& pass, const sstring& salted_hash) {
+    const auto pwd_hash = co_await detail::hash_with_salt_async(pass, salted_hash);
+    co_return pwd_hash == salted_hash;
 }

 } // namespace auth::passwords
--- a/auth/passwords.hh
+++ b/auth/passwords.hh
@@ -11,6 +11,7 @@
 #include <random>
 #include <stdexcept>

+#include <seastar/core/future.hh>
 #include <seastar/core/sstring.hh>

 #include "seastarx.hh"
@@ -75,10 +76,19 @@ sstring generate_salt(RandomNumberEngine& g, scheme scheme) {

 ///
 /// Hash a password combined with an implementation-specific salt string.
+/// Deprecated in favor of `hash_with_salt_async`.
 ///
 /// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
 ///
-sstring hash_with_salt(const sstring& pass, const sstring& salt);
+[[deprecated("Use hash_with_salt_async instead")]] sstring hash_with_salt(const sstring& pass, const sstring& salt);
+
+///
+/// Async version of `hash_with_salt` that returns a future.
+/// If possible, hashing uses `coroutine::maybe_yield` to prevent reactor stalls.
+///
+/// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
+///
+seastar::future<sstring> hash_with_salt_async(const sstring& pass, const sstring& salt);

 } // namespace detail

@@ -107,6 +117,6 @@ sstring hash(const sstring& pass, RandomNumberEngine& g, scheme scheme) {
 ///
 /// \throws \ref std::system_error when an unexpected implementation-specific error occurs.
 ///
-bool check(const sstring& pass, const sstring& salted_hash);
+seastar::future<bool> check(const sstring& pass, const sstring& salted_hash);

 } // namespace auth::passwords
--- a/auth/saslauthd_authenticator.cc
+++ b/auth/saslauthd_authenticator.cc
@@ -35,10 +35,9 @@ static const class_registrator<
        cql3::query_processor&,
        ::service::raft_group0_client&,
        ::service::migration_manager&,
-        cache&,
-        utils::alien_worker&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");
+        cache&> saslauthd_auth_reg("com.scylladb.auth.SaslauthdAuthenticator");

-saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, cache&, utils::alien_worker&)
+saslauthd_authenticator::saslauthd_authenticator(cql3::query_processor& qp, ::service::raft_group0_client&, ::service::migration_manager&, cache&)
    : _socket_path(qp.db().get_config().saslauthd_socket_path())
 {}

--- a/auth/saslauthd_authenticator.hh
+++ b/auth/saslauthd_authenticator.hh
@@ -12,7 +12,6 @@

 #include "auth/authenticator.hh"
 #include "auth/cache.hh"
-#include "utils/alien_worker.hh"

 namespace cql3 {
 class query_processor;
@@ -30,7 +29,7 @@ namespace auth {
 class saslauthd_authenticator : public authenticator {
    sstring _socket_path; ///< Path to the domain socket on which saslauthd is listening.
 public:
-    saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&,utils::alien_worker&);
+    saslauthd_authenticator(cql3::query_processor&, ::service::raft_group0_client&, ::service::migration_manager&, cache&);

    future<> start() override;

--- a/auth/service.cc
+++ b/auth/service.cc
@@ -191,8 +191,7 @@ service::service(
        ::service::migration_manager& mm,
        const service_config& sc,
        maintenance_socket_enabled used_by_maintenance_socket,
-        cache& cache,
-        utils::alien_worker& hashing_worker)
+        cache& cache)
            : service(
                      std::move(c),
                      cache,
@@ -200,7 +199,7 @@ service::service(
                      g0,
                      mn,
                      create_object<authorizer>(sc.authorizer_java_name, qp, g0, mm),
-                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, cache, hashing_worker),
+                      create_object<authenticator>(sc.authenticator_java_name, qp, g0, mm, cache),
                      create_object<role_manager>(sc.role_manager_java_name, qp, g0, mm, cache),
                      used_by_maintenance_socket) {
 }
--- a/auth/service.hh
+++ b/auth/service.hh
@@ -27,7 +27,6 @@
 #include "cql3/description.hh"
 #include "seastarx.hh"
 #include "service/raft/raft_group0_client.hh"
-#include "utils/alien_worker.hh"
 #include "utils/observable.hh"
 #include "utils/serialized_action.hh"
 #include "service/maintenance_mode.hh"
@@ -131,8 +130,7 @@ public:
            ::service::migration_manager&,
            const service_config&,
            maintenance_socket_enabled,
-            cache&,
-            utils::alien_worker&);
+            cache&);

    future<> start(::service::migration_manager&, db::system_keyspace&);

--- a/auth/transitional.cc
+++ b/auth/transitional.cc
@@ -38,8 +38,8 @@ class transitional_authenticator : public authenticator {
 public:
    static const sstring PASSWORD_AUTHENTICATOR_NAME;

-    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache, utils::alien_worker& hashing_worker)
-            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache, hashing_worker)) {
+    transitional_authenticator(cql3::query_processor& qp, ::service::raft_group0_client& g0, ::service::migration_manager& mm, cache& cache)
+            : transitional_authenticator(std::make_unique<password_authenticator>(qp, g0, mm, cache)) {
    }
    transitional_authenticator(std::unique_ptr<authenticator> a)
            : _authenticator(std::move(a)) {
@@ -241,8 +241,7 @@ static const class_registrator<
        cql3::query_processor&,
        ::service::raft_group0_client&,
        ::service::migration_manager&,
-        auth::cache&,
-        utils::alien_worker&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");
+        auth::cache&> transitional_authenticator_reg(auth::PACKAGE_NAME + "TransitionalAuthenticator");

 static const class_registrator<
        auth::authorizer,
--- a/configure.py
+++ b/configure.py
@@ -859,6 +859,7 @@ scylla_core = (['message/messaging_service.cc',
                'utils/alien_worker.cc',
                'utils/array-search.cc',
                'utils/base64.cc',
+                'utils/crypt_sha512.cc',
                'utils/logalloc.cc',
                'utils/large_bitset.cc',
                'utils/buffer_input_stream.cc',
@@ -1479,7 +1480,6 @@ deps = {

 pure_boost_tests = set([
    'test/boost/anchorless_list_test',
-    'test/boost/auth_passwords_test',
    'test/boost/auth_resource_test',
    'test/boost/big_decimal_test',
    'test/boost/caching_options_test',
@@ -2192,6 +2192,8 @@ def kmiplib():
    for id in os_ids:
        if id in { 'centos', 'fedora', 'rhel' }:
            return 'rhel84'
+        elif id in { 'ubuntu', 'debian' }:
+            return 'ubuntu'  # Temporarily use a placeholder for Ubuntu/Debian
    print('Could not resolve libkmip.a for platform {}'.format(os_ids))
    sys.exit(1)

--- a/cql3/restrictions/statement_restrictions.cc
+++ b/cql3/restrictions/statement_restrictions.cc
@@ -1322,6 +1322,10 @@ const std::vector<expr::expression>& statement_restrictions::index_restrictions(
    return _index_restrictions;
 }

+bool statement_restrictions::is_empty() const {
+    return !_where.has_value();
+}
+
 // Current score table:
 // local and restrictions include full partition key: 2
 // global: 1
--- a/cql3/restrictions/statement_restrictions.hh
+++ b/cql3/restrictions/statement_restrictions.hh
@@ -408,6 +408,8 @@ public:

    /// Checks that the primary key restrictions don't contain null values, throws invalid_request_exception otherwise.
    void validate_primary_key(const query_options& options) const;
+
+    bool is_empty() const;
 };

 statement_restrictions analyze_statement_restrictions(
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -1976,7 +1976,7 @@ mutation_fragments_select_statement::do_execute(query_processor& qp, service::qu
    if (it == indexes.end()) {
        throw exceptions::invalid_request_exception("ANN ordering by vector requires the column to be indexed using 'vector_index'");
    }
-    if (index_opt || parameters->allow_filtering() || restrictions->need_filtering() || check_needs_allow_filtering_anyway(*restrictions)) {
+    if (index_opt || parameters->allow_filtering() || !(restrictions->is_empty()) || check_needs_allow_filtering_anyway(*restrictions)) {
        throw exceptions::invalid_request_exception("ANN ordering by vector does not support filtering");
    }
    index_opt = *it;
--- a/data_dictionary/data_dictionary.cc
+++ b/data_dictionary/data_dictionary.cc
@@ -42,6 +42,11 @@ table::get_index_manager() const {
    return _ops->get_index_manager(*this);
 }

+db_clock::time_point
+table::get_truncation_time() const {
+    return _ops->get_truncation_time(*this);
+}
+
 lw_shared_ptr<keyspace_metadata>
 keyspace::metadata() const {
    return _ops->get_keyspace_metadata(*this);
--- a/data_dictionary/data_dictionary.hh
+++ b/data_dictionary/data_dictionary.hh
@@ -77,6 +77,7 @@ public:
    schema_ptr schema() const;
    const std::vector<view_ptr>& views() const;
    const secondary_index::secondary_index_manager& get_index_manager() const;
+    db_clock::time_point get_truncation_time() const;
 };

 class keyspace {
--- a/data_dictionary/impl.hh
+++ b/data_dictionary/impl.hh
@@ -27,6 +27,7 @@ public:
    virtual std::optional<table> try_find_table(database db, table_id id) const = 0;
    virtual const secondary_index::secondary_index_manager& get_index_manager(table t) const = 0;
    virtual schema_ptr get_table_schema(table t) const = 0;
+    virtual db_clock::time_point get_truncation_time(table t) const = 0;
    virtual lw_shared_ptr<keyspace_metadata> get_keyspace_metadata(keyspace ks) const = 0;
    virtual bool is_internal(keyspace ks) const = 0;
    virtual const locator::abstract_replication_strategy& get_replication_strategy(keyspace ks) const = 0;
--- a/db/batchlog.hh
+++ b/db/batchlog.hh
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+ */
+
+#pragma once
+
+#include "mutation/mutation.hh"
+#include "utils/UUID.hh"
+
+namespace db {
+
+mutation get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, int32_t version, db_clock::time_point now, const utils::UUID& id);
+
+mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clock::time_point now, const utils::UUID& id);
+
+}
--- a/db/batchlog_manager.cc
+++ b/db/batchlog_manager.cc
@@ -10,6 +10,7 @@

 #include <chrono>
 #include <exception>
+#include <ranges>
 #include <seastar/core/future-util.hh>
 #include <seastar/core/do_with.hh>
 #include <seastar/core/semaphore.hh>
@@ -18,12 +19,14 @@
 #include <seastar/core/sleep.hh>

 #include "batchlog_manager.hh"
+#include "batchlog.hh"
 #include "data_dictionary/data_dictionary.hh"
 #include "mutation/canonical_mutation.hh"
 #include "service/storage_proxy.hh"
 #include "system_keyspace.hh"
 #include "utils/rate_limiter.hh"
 #include "utils/log.hh"
+#include "utils/murmur_hash.hh"
 #include "db_clock.hh"
 #include "unimplemented.hh"
 #include "idl/frozen_schema.dist.hh"
@@ -33,17 +36,94 @@
 #include "cql3/untyped_result_set.hh"
 #include "service_permit.hh"
 #include "cql3/query_processor.hh"
-#include "replica/database.hh"

 static logging::logger blogger("batchlog_manager");

+namespace db {
+
+// Yields 256 batchlog shards. Even on the largest nodes we currently run on,
+// this should be enough to give every core a batchlog partition.
+static constexpr unsigned batchlog_shard_bits = 8;
+
+int32_t batchlog_shard_of(db_clock::time_point written_at) {
+    const int64_t count = written_at.time_since_epoch().count();
+    std::array<uint64_t, 2> result;
+    utils::murmur_hash::hash3_x64_128(bytes_view(reinterpret_cast<const signed char*>(&count), sizeof(count)), 0, result);
+    uint64_t hash = result[0] ^ result[1];
+    return hash & ((1ULL << batchlog_shard_bits) - 1);
+}
+
+std::pair<partition_key, clustering_key>
+get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, int32_t batchlog_shard, db_clock::time_point written_at, std::optional<utils::UUID> id) {
+    auto pkey = partition_key::from_exploded(schema, {serialized(version), serialized(int8_t(stage)), serialized(batchlog_shard)});
+
+    std::vector<bytes> ckey_components;
+    ckey_components.reserve(2);
+    ckey_components.push_back(serialized(written_at));
+    if (id) {
+        ckey_components.push_back(serialized(*id));
+    }
+    auto ckey = clustering_key::from_exploded(schema, ckey_components);
+
+    return {std::move(pkey), std::move(ckey)};
+}
+
+std::pair<partition_key, clustering_key>
+get_batchlog_key(const schema& schema, int32_t version, db::batchlog_stage stage, db_clock::time_point written_at, std::optional<utils::UUID> id) {
+    return get_batchlog_key(schema, version, stage, batchlog_shard_of(written_at), written_at, id);
+}
+
+mutation get_batchlog_mutation_for(schema_ptr schema, managed_bytes data, int32_t version, db::batchlog_stage stage, db_clock::time_point now, const utils::UUID& id) {
+    auto [key, ckey] = get_batchlog_key(*schema, version, stage, now, id);
+
+    auto timestamp = api::new_timestamp();
+
+    mutation m(schema, key);
+    // Avoid going through data_value and therefore `bytes`, as it can be large (#24809).
+    auto cdef_data = schema->get_column_definition(to_bytes("data"));
+    m.set_cell(ckey, *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));
+
+    return m;
+}
+
+mutation get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, int32_t version, db::batchlog_stage stage, db_clock::time_point now, const utils::UUID& id) {
+    auto data = [&mutations] {
+        utils::chunked_vector<canonical_mutation> fm(mutations.begin(), mutations.end());
+        bytes_ostream out;
+        for (auto& m : fm) {
+            ser::serialize(out, m);
+        }
+        return std::move(out).to_managed_bytes();
+    }();
+
+    return get_batchlog_mutation_for(std::move(schema), std::move(data), version, stage, now, id);
+}
+
+mutation get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, int32_t version, db_clock::time_point now, const utils::UUID& id) {
+    return get_batchlog_mutation_for(std::move(schema), mutations, version, batchlog_stage::initial, now, id);
+}
+
+mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db::batchlog_stage stage, db_clock::time_point now, const utils::UUID& id) {
+    auto [key, ckey] = get_batchlog_key(*schema, version, stage, now, id);
+    mutation m(schema, key);
+    auto timestamp = api::new_timestamp();
+    m.partition().apply_delete(*schema, ckey, tombstone(timestamp, gc_clock::now()));
+    return m;
+}
+
+mutation get_batchlog_delete_mutation(schema_ptr schema, int32_t version, db_clock::time_point now, const utils::UUID& id) {
+    return get_batchlog_delete_mutation(std::move(schema), version, batchlog_stage::initial, now, id);
+}
+
+} // namespace db
+
 const std::chrono::seconds db::batchlog_manager::replay_interval;
 const uint32_t db::batchlog_manager::page_size;

 db::batchlog_manager::batchlog_manager(cql3::query_processor& qp, db::system_keyspace& sys_ks, batchlog_manager_config config)
        : _qp(qp)
        , _sys_ks(sys_ks)
-        , _write_request_timeout(std::chrono::duration_cast<db_clock::duration>(config.write_request_timeout))
+        , _replay_timeout(config.replay_timeout)
        , _replay_rate(config.replay_rate)
        , _delay(config.delay)
        , _replay_cleanup_after_replays(config.replay_cleanup_after_replays)
@@ -152,18 +232,75 @@ future<> db::batchlog_manager::stop() {
 }

 future<size_t> db::batchlog_manager::count_all_batches() const {
-    sstring query = format("SELECT count(*) FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG);
+    sstring query = format("SELECT count(*) FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
    return _qp.execute_internal(query, cql3::query_processor::cache_internal::yes).then([](::shared_ptr<cql3::untyped_result_set> rs) {
       return size_t(rs->one().get_as<int64_t>("count"));
    });
 }

-db_clock::duration db::batchlog_manager::get_batch_log_timeout() const {
-    // enough time for the actual write + BM removal mutation
-    return _write_request_timeout * 2;
+future<> db::batchlog_manager::maybe_migrate_v1_to_v2() {
+    if (_migration_done) {
+        return make_ready_future<>();
+    }
+    return with_gate(_gate, [this] () mutable -> future<> {
+        blogger.info("Migrating batchlog entries from v1 -> v2");
+
+        auto schema_v1 = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
+        auto schema_v2 = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
+
+        auto batch = [this, schema_v1, schema_v2] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+            // check version of serialization format
+            if (!row.has("version")) {
+                blogger.warn("Not migrating logged batch because of unknown version");
+                co_return stop_iteration::no;
+            }
+
+            auto version = row.get_as<int32_t>("version");
+            if (version != netw::messaging_service::current_version) {
+                blogger.warn("Not migrating logged batch because of incorrect version");
+                co_return stop_iteration::no;
+            }
+
+            auto id = row.get_as<utils::UUID>("id");
+            auto written_at = row.get_as<db_clock::time_point>("written_at");
+            auto data = row.get_blob_fragmented("data");
+
+            auto& sp = _qp.proxy();
+
+            utils::get_local_injector().inject("batchlog_manager_fail_migration", [] { throw std::runtime_error("Error injection: failing batchlog migration"); });
+
+            auto migrate_mut = get_batchlog_mutation_for(schema_v2, std::move(data), version, batchlog_stage::failed_replay, written_at, id);
+            co_await sp.mutate_locally(migrate_mut, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+            mutation delete_mut(schema_v1, partition_key::from_single_value(*schema_v1, serialized(id)));
+            delete_mut.partition().apply_delete(*schema_v1, clustering_key_prefix::make_empty(), tombstone(api::new_timestamp(), gc_clock::now()));
+            co_await sp.mutate_locally(delete_mut, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+            co_return stop_iteration::no;
+        };
+        try {
+            co_await _qp.query_internal(
+                    format("SELECT * FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
+                    db::consistency_level::ONE,
+                    {},
+                    page_size,
+                    std::move(batch));
+        } catch (...) {
+            blogger.warn("Batchlog v1 to v2 migration failed: {}; will retry", std::current_exception());
+            co_return;
+        }
+
+        co_await container().invoke_on_all([] (auto& bm) {
+            bm._migration_done = true;
+        });
+
+        blogger.info("Done migrating batchlog entries from v1 -> v2");
+    });
 }

 future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches(post_replay_cleanup cleanup) {
+    co_await maybe_migrate_v1_to_v2();
+
    typedef db_clock::rep clock_type;

    db::all_batches_replayed all_replayed = all_batches_replayed::yes;
@@ -172,21 +309,26 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
    auto throttle = _replay_rate / _qp.proxy().get_token_metadata_ptr()->count_normal_token_owners();
    auto limiter = make_lw_shared<utils::rate_limiter>(throttle);

-    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG);
-    auto delete_batch = [this, schema = std::move(schema)] (utils::UUID id) {
-        auto key = partition_key::from_singular(*schema, id);
-        mutation m(schema, key);
-        auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
-        m.partition().apply_delete(*schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
-        return _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+    auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
+
+    struct replay_stats {
+        std::optional<db_clock::time_point> min_too_fresh;
+        bool need_cleanup = false;
    };

-    auto batch = [this, limiter, delete_batch = std::move(delete_batch), &all_replayed](const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+    std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
+
+    // Use a stable `now` accross all batches, so skip/replay decisions are the
+    // same accross a while prefix of written_at (accross all ids).
+    const auto now = db_clock::now();
+
+    auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {
+        const auto stage = static_cast<batchlog_stage>(row.get_as<int8_t>("stage"));
+        const auto batch_shard = row.get_as<int32_t>("shard");
        auto written_at = row.get_as<db_clock::time_point>("written_at");
        auto id = row.get_as<utils::UUID>("id");
        // enough time for the actual write + batchlog entry mutation delivery (two separate requests).
-        auto now = db_clock::now();
-        auto timeout = get_batch_log_timeout();
+        auto timeout = _replay_timeout;

        if (utils::get_local_injector().is_enabled("skip_batch_replay")) {
            blogger.debug("Skipping batch replay due to skip_batch_replay injection");
@@ -194,52 +336,48 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
            co_return stop_iteration::no;
        }

-        // check version of serialization format
-        if (!row.has("version")) {
-            blogger.warn("Skipping logged batch because of unknown version");
-            co_await delete_batch(id);
-            co_return stop_iteration::no;
-        }
-
-        auto version = row.get_as<int32_t>("version");
-        if (version != netw::messaging_service::current_version) {
-            blogger.warn("Skipping logged batch because of incorrect version {}; current version = {}", version, netw::messaging_service::current_version);
-            co_await delete_batch(id);
-            co_return stop_iteration::no;
-        }
-
        auto data = row.get_blob_unfragmented("data");

-        blogger.debug("Replaying batch {}", id);
+        blogger.debug("Replaying batch {} from stage {} and batch shard {}", id, int32_t(stage), batch_shard);
+
+        utils::chunked_vector<mutation> mutations;
+        bool send_failed = false;
+
+        auto& shard_written_at = replay_stats_per_shard.try_emplace(batch_shard, replay_stats{}).first->second;

        try {
-            auto fms = make_lw_shared<std::deque<canonical_mutation>>();
+            utils::chunked_vector<std::pair<canonical_mutation, schema_ptr>> fms;
            auto in = ser::as_input_stream(data);
            while (in.size()) {
-                fms->emplace_back(ser::deserialize(in, std::type_identity<canonical_mutation>()));
-                schema_ptr s = _qp.db().find_schema(fms->back().column_family_id());
-                timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
+                auto fm = ser::deserialize(in, std::type_identity<canonical_mutation>());
+                const auto tbl = _qp.db().try_find_table(fm.column_family_id());
+                if (!tbl) {
+                    continue;
+                }
+                if (written_at <= tbl->get_truncation_time()) {
+                    continue;
+                }
+                schema_ptr s = tbl->schema();
+                if (s->tombstone_gc_options().mode() == tombstone_gc_mode::repair) {
+                    timeout = std::min(timeout, std::chrono::duration_cast<db_clock::duration>(s->tombstone_gc_options().propagation_delay_in_seconds()));
+                }
+                fms.emplace_back(std::move(fm), std::move(s));
            }

            if (now < written_at + timeout) {
                blogger.debug("Skipping replay of {}, too fresh", id);
+
+                shard_written_at.min_too_fresh = std::min(shard_written_at.min_too_fresh.value_or(written_at), written_at);
+
                co_return stop_iteration::no;
            }

            auto size = data.size();

-            auto mutations = co_await map_reduce(*fms, [this, written_at] (canonical_mutation& fm) {
-                const auto& cf = _qp.proxy().local_db().find_column_family(fm.column_family_id());
-                return make_ready_future<canonical_mutation*>(written_at > cf.get_truncation_time() ? &fm : nullptr);
-            },
-            utils::chunked_vector<mutation>(),
-            [this] (utils::chunked_vector<mutation> mutations, canonical_mutation* fm) {
-                if (fm) {
-                    schema_ptr s = _qp.db().find_schema(fm->column_family_id());
-                    mutations.emplace_back(fm->to_mutation(s));
-                }
-                return mutations;
-            });
+            for (const auto& [fm, s] : fms) {
+                mutations.emplace_back(fm.to_mutation(s));
+                co_await maybe_yield();
+            }

            if (!mutations.empty()) {
                const auto ttl = [written_at]() -> clock_type {
@@ -265,7 +403,11 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
                    co_await limiter->reserve(size);
                    _stats.write_attempts += mutations.size();
                    auto timeout = db::timeout_clock::now() + write_timeout;
-                    co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
+                    if (cleanup) {
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(mutations, timeout);
+                    } else {
+                        co_await _qp.proxy().send_batchlog_replay_to_all_replicas(std::move(mutations), timeout);
+                    }
                }
            }
        } catch (data_dictionary::no_such_keyspace& ex) {
@@ -279,31 +421,80 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
            // Do _not_ remove the batch, assuning we got a node write error.
            // Since we don't have hints (which origin is satisfied with),
            // we have to resort to keeping this batch to next lap.
-            co_return stop_iteration::no;
+            if (!cleanup || stage == batchlog_stage::failed_replay) {
+                co_return stop_iteration::no;
+            }
+            send_failed = true;
        }
+
+        auto& sp = _qp.proxy();
+
+        if (send_failed) {
+            blogger.debug("Moving batch {} to stage failed_replay", id);
+            auto m = get_batchlog_mutation_for(schema, mutations, netw::messaging_service::current_version, batchlog_stage::failed_replay, written_at, id);
+            co_await sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+        }
+
        // delete batch
-        co_await delete_batch(id);
+        auto m = get_batchlog_delete_mutation(schema, netw::messaging_service::current_version, stage, written_at, id);
+        co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
+
+        shard_written_at.need_cleanup = true;
+
        co_return stop_iteration::no;
    };

-    co_await with_gate(_gate, [this, cleanup, batch = std::move(batch)] () mutable -> future<> {
-        blogger.debug("Started replayAllFailedBatches (cpu {})", this_shard_id());
+    co_await with_gate(_gate, [this, cleanup, &all_replayed, batch = std::move(batch), now, &replay_stats_per_shard] () mutable -> future<> {
+        blogger.debug("Started replayAllFailedBatches with cleanup: {}", cleanup);
        co_await utils::get_local_injector().inject("add_delay_to_batch_replay", std::chrono::milliseconds(1000));
-        co_await _qp.query_internal(
-                format("SELECT id, data, written_at, version FROM {}.{} BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG),
-                db::consistency_level::ONE,
-                {},
-                page_size,
-                std::move(batch)).then([this, cleanup] {
-            if (cleanup == post_replay_cleanup::no) {
-                return make_ready_future<>();
+
+        auto schema = _qp.db().find_schema(system_keyspace::NAME, system_keyspace::BATCHLOG_V2);
+
+        co_await coroutine::parallel_for_each(std::views::iota(0, 16), [&] (int32_t chunk) -> future<> {
+            const int32_t batchlog_chunk_base = chunk * 16;
+            for (int32_t i = 0; i < 16; ++i) {
+                int32_t batchlog_shard = batchlog_chunk_base + i;
+
+                co_await _qp.query_internal(
+                        format("SELECT * FROM {}.{} WHERE version = ? AND stage = ? AND shard = ? BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG_V2),
+                        db::consistency_level::ONE,
+                        {data_value(netw::messaging_service::current_version), data_value(int8_t(batchlog_stage::failed_replay)), data_value(batchlog_shard)},
+                        page_size,
+                        batch);
+
+                co_await _qp.query_internal(
+                        format("SELECT * FROM {}.{} WHERE version = ? AND stage = ? AND shard = ? BYPASS CACHE", system_keyspace::NAME, system_keyspace::BATCHLOG_V2),
+                        db::consistency_level::ONE,
+                        {data_value(netw::messaging_service::current_version), data_value(int8_t(batchlog_stage::initial)), data_value(batchlog_shard)},
+                        page_size,
+                        batch);
+
+                if (cleanup != post_replay_cleanup::yes) {
+                    continue;
+                }
+
+                auto it = replay_stats_per_shard.find(batchlog_shard);
+                if (it == replay_stats_per_shard.end() || !it->second.need_cleanup) {
+                    // Nothing was replayed on this batchlog shard, nothing to cleanup.
+                    continue;
+                }
+
+                const auto write_time = it->second.min_too_fresh.value_or(now - _replay_timeout);
+                const auto end_weight  = it->second.min_too_fresh ? bound_weight::before_all_prefixed : bound_weight::after_all_prefixed;
+                auto [key, ckey] = get_batchlog_key(*schema, netw::messaging_service::current_version, batchlog_stage::initial, batchlog_shard, write_time, {});
+                auto end_pos = position_in_partition(partition_region::clustered, end_weight, std::move(ckey));
+
+                range_tombstone rt(position_in_partition::before_all_clustered_rows(), std::move(end_pos), tombstone(api::new_timestamp(), gc_clock::now()));
+
+                blogger.trace("Clean up batchlog shard {} with range tombstone {}", batchlog_shard, rt);
+
+                mutation m(schema, key);
+                m.partition().apply_row_tombstone(*schema, std::move(rt));
+                co_await _qp.proxy().mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no);
            }
-            // Replaying batches could have generated tombstones, flush to disk,
-            // where they can be compacted away.
-            return replica::database::flush_table_on_all_shards(_qp.proxy().get_db(), system_keyspace::NAME, system_keyspace::BATCHLOG);
-        }).then([] {
-            blogger.debug("Finished replayAllFailedBatches");
        });
+
+        blogger.debug("Finished replayAllFailedBatches with all_replayed: {}", all_replayed);
    });

    co_return all_replayed;
--- a/db/batchlog_manager.hh
+++ b/db/batchlog_manager.hh
@@ -34,12 +34,17 @@ class system_keyspace;
 using all_batches_replayed = bool_class<struct all_batches_replayed_tag>;

 struct batchlog_manager_config {
-    std::chrono::duration<double> write_request_timeout;
+    db_clock::duration replay_timeout;
    uint64_t replay_rate = std::numeric_limits<uint64_t>::max();
    std::chrono::milliseconds delay = std::chrono::milliseconds(0);
    unsigned replay_cleanup_after_replays;
 };

+enum class batchlog_stage : int8_t {
+    initial,
+    failed_replay
+};
+
 class batchlog_manager : public peering_sharded_service<batchlog_manager> {
 public:
    using post_replay_cleanup = bool_class<class post_replay_cleanup_tag>;
@@ -59,7 +64,7 @@ private:

    cql3::query_processor& _qp;
    db::system_keyspace& _sys_ks;
-    db_clock::duration _write_request_timeout;
+    db_clock::duration _replay_timeout;
    uint64_t _replay_rate;
    std::chrono::milliseconds _delay;
    unsigned _replay_cleanup_after_replays = 100;
@@ -71,6 +76,14 @@ private:

    gc_clock::time_point _last_replay;

+    // Was the v1 -> v2 migration already done since last restart?
+    // The migration is attempted once after each restart. This is redundant but
+    // keeps thing simple. Once no upgrade path exists from a ScyllaDB version
+    // which can still produce v1 entries, this migration code can be removed.
+    bool _migration_done = false;
+
+    future<> maybe_migrate_v1_to_v2();
+
    future<all_batches_replayed> replay_all_failed_batches(post_replay_cleanup cleanup);
 public:
    // Takes a QP, not a distributes. Because this object is supposed
@@ -85,10 +98,13 @@ public:
    future<all_batches_replayed> do_batch_log_replay(post_replay_cleanup cleanup);

    future<size_t> count_all_batches() const;
-    db_clock::duration get_batch_log_timeout() const;
    gc_clock::time_point get_last_replay() const {
        return _last_replay;
    }
+
+    const stats& stats() const {
+        return _stats;
+    }
 private:
    future<> batchlog_replay_loop();
 };
--- a/db/commitlog/commitlog_replayer.cc
+++ b/db/commitlog/commitlog_replayer.cc
@@ -54,12 +54,14 @@ public:
        uint64_t applied_mutations = 0;
        uint64_t corrupt_bytes = 0;
        uint64_t truncated_at = 0;
+        uint64_t broken_files = 0;

        stats& operator+=(const stats& s) {
            invalid_mutations += s.invalid_mutations;
            skipped_mutations += s.skipped_mutations;
            applied_mutations += s.applied_mutations;
            corrupt_bytes += s.corrupt_bytes;
+            broken_files += s.broken_files;
            return *this;
        }
        stats operator+(const stats& s) const {
@@ -192,6 +194,8 @@ db::commitlog_replayer::impl::recover(const commitlog::descriptor& d, const comm
            s->corrupt_bytes += e.bytes();
        } catch (commitlog::segment_truncation& e) {
            s->truncated_at = e.position();
+        } catch (commitlog::header_checksum_error&) {
+            ++s->broken_files;
        } catch (...) {
            throw;
        }
@@ -370,6 +374,9 @@ future<> db::commitlog_replayer::recover(std::vector<sstring> files, sstring fna
                    if (stats.truncated_at != 0) {
                        rlogger.warn("Truncated file: {} at position {}.", f, stats.truncated_at);
                    }
+                    if (stats.broken_files != 0) {
+                        rlogger.warn("Corrupted file header: {}. Skipped.", f);
+                    }
                    rlogger.debug("Log replay of {} complete, {} replayed mutations ({} invalid, {} skipped)"
                                    , f
                                    , stats.applied_mutations
--- a/db/config.cc
+++ b/db/config.cc
@@ -1152,7 +1152,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
        "Number of threads with which to deliver hints. In multiple data-center deployments, consider increasing this number because cross data-center handoff is generally slower.")
    , batchlog_replay_throttle_in_kb(this, "batchlog_replay_throttle_in_kb", value_status::Unused, 1024,
        "Total maximum throttle. Throttling is reduced proportionally to the number of nodes in the cluster.")
-    , batchlog_replay_cleanup_after_replays(this, "batchlog_replay_cleanup_after_replays", liveness::LiveUpdate, value_status::Used, 60,
+    , batchlog_replay_cleanup_after_replays(this, "batchlog_replay_cleanup_after_replays", liveness::LiveUpdate, value_status::Used, 1,
        "Clean up batchlog memtable after every N replays. Replays are issued on a timer, every 60 seconds. So if batchlog_replay_cleanup_after_replays is set to 60, the batchlog memtable is flushed every 60 * 60 seconds.")
    /**
    * @Group Request scheduler properties
--- a/db/schema_applier.cc
+++ b/db/schema_applier.cc
@@ -1262,16 +1262,9 @@ static future<> do_merge_schema(sharded<service::storage_proxy>& proxy,  sharded
 {
    slogger.trace("do_merge_schema: {}", mutations);
    schema_applier ap(proxy, ss, sys_ks, reload);
-    std::exception_ptr ex;
-    try {
-        co_await execute_do_merge_schema(proxy, ap, std::move(mutations));
-    } catch (...) {
-        ex = std::current_exception();
-    }
-    co_await ap.destroy();
-    if (ex) {
-        throw ex;
-    }
+    co_await execute_do_merge_schema(proxy, ap, std::move(mutations)).finally([&ap]() {
+        return ap.destroy();
+    });
 }

 /**
--- a/db/system_keyspace.cc
+++ b/db/system_keyspace.cc
@@ -213,6 +213,30 @@ schema_ptr system_keyspace::batchlog() {
    return batchlog;
 }

+schema_ptr system_keyspace::batchlog_v2() {
+    static thread_local auto batchlog_v2 = [] {
+        schema_builder builder(generate_legacy_id(NAME, BATCHLOG_V2), NAME, BATCHLOG_V2,
+        // partition key
+        {{"version", int32_type}, {"stage", byte_type}, {"shard", int32_type}},
+        // clustering key
+        {{"written_at", timestamp_type}, {"id", uuid_type}},
+        // regular columns
+        {{"data", bytes_type}},
+        // static columns
+        {},
+        // regular column name type
+        utf8_type,
+        // comment
+        "batches awaiting replay"
+       );
+       builder.set_gc_grace_seconds(0);
+       builder.set_caching_options(caching_options::get_disabled_caching_options());
+       builder.with_hash_version();
+       return builder.build(schema_builder::compact_storage::no);
+    }();
+    return batchlog_v2;
+}
+
 /*static*/ schema_ptr system_keyspace::paxos() {
    static thread_local auto paxos = [] {
        // FIXME: switch to the new schema_builder interface (with_column(...), etc)
@@ -2304,7 +2328,7 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
    std::copy(schema_tables.begin(), schema_tables.end(), std::back_inserter(r));
    auto auth_tables = system_keyspace::auth_tables();
    std::copy(auth_tables.begin(), auth_tables.end(), std::back_inserter(r));
-    r.insert(r.end(), { built_indexes(), hints(), batchlog(), paxos(), local(),
+    r.insert(r.end(), { built_indexes(), hints(), batchlog(), batchlog_v2(), paxos(), local(),
                    peers(), peer_events(), range_xfers(),
                    compactions_in_progress(), compaction_history(),
                    sstable_activity(), size_estimates(), large_partitions(), large_rows(), large_cells(),
@@ -2335,7 +2359,9 @@ std::vector<schema_ptr> system_keyspace::all_tables(const db::config& cfg) {
 }

 static bool maybe_write_in_user_memory(schema_ptr s) {
-    return (s.get() == system_keyspace::batchlog().get()) || (s.get() == system_keyspace::paxos().get())
+    return (s.get() == system_keyspace::batchlog().get())
+            || (s.get() == system_keyspace::batchlog_v2().get())
+            || (s.get() == system_keyspace::paxos().get())
            || s == system_keyspace::v3::scylla_views_builds_in_progress();
 }

--- a/db/system_keyspace.hh
+++ b/db/system_keyspace.hh
@@ -163,6 +163,7 @@ public:
    static constexpr auto NAME = "system";
    static constexpr auto HINTS = "hints";
    static constexpr auto BATCHLOG = "batchlog";
+    static constexpr auto BATCHLOG_V2 = "batchlog_v2";
    static constexpr auto PAXOS = "paxos";
    static constexpr auto BUILT_INDEXES = "IndexInfo";
    static constexpr auto LOCAL = "local";
@@ -255,6 +256,7 @@ public:

    static schema_ptr hints();
    static schema_ptr batchlog();
+    static schema_ptr batchlog_v2();
    static schema_ptr paxos();
    static schema_ptr built_indexes(); // TODO (from Cassandra): make private
    static schema_ptr raft();
--- a/docs/README.md
+++ b/docs/README.md
@@ -71,7 +71,7 @@ Use "Bash on Ubuntu on Windows" for the same tools and capabilities as on Linux

 ### Building the Docs 

-1. Run `make preview` to build the documentation.
+1. Run `make preview` in the `docs/` directory to build the documentation.
 1. Preview the built documentation locally at http://127.0.0.1:5500/.

 ### Cleanup
--- a/docs/_ext/scylladb_metrics.py
+++ b/docs/_ext/scylladb_metrics.py
@@ -41,6 +41,8 @@ class MetricsProcessor:
                # Get metrics from the file
                try:
                    metrics_file = metrics.get_metrics_from_file(relative_path, "scylla_", metrics_info, strict=strict)
+                except SystemExit:
+                    pass
                finally:
                    os.chdir(old_cwd)
                if metrics_file:
--- a/docs/features/cdc/cdc-stream-changes.rst
+++ b/docs/features/cdc/cdc-stream-changes.rst
@@ -29,9 +29,6 @@ A CDC generation consists of:

 This is the mapping used to decide on which stream IDs to use when making writes, as explained in the :doc:`./cdc-streams` document. It is a global property of the cluster: it doesn't depend on the table you're making writes to.

-.. caution::
-   The tables mentioned in the following sections: ``system_distributed.cdc_generation_timestamps`` and ``system_distributed.cdc_streams_descriptions_v2`` have been introduced in ScyllaDB 4.4. It is highly recommended to upgrade to 4.4 for efficient CDC usage. The last section explains how to run the below examples in ScyllaDB 4.3.
-
 When CDC generations change
 ---------------------------

--- a/docs/operating-scylla/security/auditing.rst
+++ b/docs/operating-scylla/security/auditing.rst
@@ -64,13 +64,12 @@ ADMIN      Logs service level operations: create, alter, drop, attach, detach, l
           auditing.
 =========  =========================================================================================

-Note that audit for every DML or QUERY might impact performance and consume a lot of storage.
+Note that enabling audit may negatively impact performance and audit-to-table may consume extra storage. That's especially true when auditing DML and QUERY categories, which generate a high volume of audit messages.

 Configuring Audit Storage
 ---------------------------

-Auditing messages can be sent to :ref:`Syslog <auditing-syslog-storage>` or stored in a Scylla :ref:`table <auditing-table-storage>`.
-Currently, auditing messages can only be saved to one location at a time. You cannot log into both a table and the Syslog.
+Auditing messages can be sent to :ref:`Syslog <auditing-syslog-storage>` or stored in a Scylla :ref:`table <auditing-table-storage>` or both.

 .. _auditing-syslog-storage:

@@ -193,6 +192,23 @@ For example:
      2018-03-18 00:00:00+0000 | 10.143.2.108 | 3429b1a5-2a94-11e8-8f4e-000000000001 |      DDL |         ONE | False |           nba | DROP TABLE nba.team_roster ; | 127.0.0.1       | team_roster | Scylla   | 
      (1 row)

+.. _auditing-table-and-syslog-storage:
+
+Storing Audit Messages in a Table and Syslog Simultaneously
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**Procedure**
+
+#. Follow both procedures from above, and set the ``audit`` parameter in the ``scylla.yaml`` file to both ``syslog`` and ``table``. You need to restart scylla only once.
+
+   To have both syslog and table you need to specify both backends separated by a comma:
+
+   .. code-block:: shell
+
+      audit: "syslog,table"
+
+
+
 Handling Audit Failures
 ---------------------------

--- a/licenses/README.md
+++ b/licenses/README.md
@@ -15,3 +15,22 @@ with the Apache License (version 2) and ScyllaDB-Source-Available-1.0.
 They contain the following tag:

  SPDX-License-Identifier: (LicenseRef-ScyllaDB-Source-Available-1.0 and Apache-2.0)
+
+### `musl libc` files
+
+`licenses/musl-license.txt` is obtained from:
+  https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
+
+`utils/crypt_sha512.cc` is obtained from:
+  https://git.musl-libc.org/cgit/musl/tree/src/crypt/crypt_sha512.c
+
+Both files are obtained from git.musl-libc.org.
+Import commit:
+  commit 1b76ff0767d01df72f692806ee5adee13c67ef88
+  Author: Alex Rønne Petersen <alex@alexrp.com>
+  Date:   Sun Oct 12 05:35:19 2025 +0200
+
+  s390x: shuffle register usage in __tls_get_offset to avoid r0 as address
+
+musl as a whole is licensed under the standard MIT license included in
+`licenses/musl-license.txt`.
--- a/licenses/musl-license.txt
+++ b/licenses/musl-license.txt
@@ -0,0 +1,193 @@
+musl as a whole is licensed under the following standard MIT license:
+
+----------------------------------------------------------------------
+Copyright © 2005-2020 Rich Felker, et al.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+----------------------------------------------------------------------
+
+Authors/contributors include:
+
+A. Wilcox
+Ada Worcester
+Alex Dowad
+Alex Suykov
+Alexander Monakov
+Andre McCurdy
+Andrew Kelley
+Anthony G. Basile
+Aric Belsito
+Arvid Picciani
+Bartosz Brachaczek
+Benjamin Peterson
+Bobby Bingham
+Boris Brezillon
+Brent Cook
+Chris Spiegel
+Clément Vasseur
+Daniel Micay
+Daniel Sabogal
+Daurnimator
+David Carlier
+David Edelsohn
+Denys Vlasenko
+Dmitry Ivanov
+Dmitry V. Levin
+Drew DeVault
+Emil Renner Berthing
+Fangrui Song
+Felix Fietkau
+Felix Janda
+Gianluca Anzolin
+Hauke Mehrtens
+He X
+Hiltjo Posthuma
+Isaac Dunham
+Jaydeep Patil
+Jens Gustedt
+Jeremy Huntwork
+Jo-Philipp Wich
+Joakim Sindholt
+John Spencer
+Julien Ramseier
+Justin Cormack
+Kaarle Ritvanen
+Khem Raj
+Kylie McClain
+Leah Neukirchen
+Luca Barbato
+Luka Perkov
+Lynn Ochs
+M Farkas-Dyck (Strake)
+Mahesh Bodapati
+Markus Wichmann
+Masanori Ogino
+Michael Clark
+Michael Forney
+Mikhail Kremnyov
+Natanael Copa
+Nicholas J. Kain
+orc
+Pascal Cuoq
+Patrick Oppenlander
+Petr Hosek
+Petr Skocik
+Pierre Carrier
+Reini Urban
+Rich Felker
+Richard Pennington
+Ryan Fairfax
+Samuel Holland
+Segev Finer
+Shiz
+sin
+Solar Designer
+Stefan Kristiansson
+Stefan O'Rear
+Szabolcs Nagy
+Timo Teräs
+Trutz Behn
+Will Dietz
+William Haddon
+William Pitcock
+
+Portions of this software are derived from third-party works licensed
+under terms compatible with the above MIT license:
+
+The TRE regular expression implementation (src/regex/reg* and
+src/regex/tre*) is Copyright © 2001-2008 Ville Laurikari and licensed
+under a 2-clause BSD license (license text in the source files). The
+included version has been heavily modified by Rich Felker in 2012, in
+the interests of size, simplicity, and namespace cleanliness.
+
+Much of the math library code (src/math/* and src/complex/*) is
+Copyright © 1993,2004 Sun Microsystems or
+Copyright © 2003-2011 David Schultz or
+Copyright © 2003-2009 Steven G. Kargl or
+Copyright © 2003-2009 Bruce D. Evans or
+Copyright © 2008 Stephen L. Moshier or
+Copyright © 2017-2018 Arm Limited
+and labelled as such in comments in the individual source files. All
+have been licensed under extremely permissive terms.
+
+The ARM memcpy code (src/string/arm/memcpy.S) is Copyright © 2008
+The Android Open Source Project and is licensed under a two-clause BSD
+license. It was taken from Bionic libc, used on Android.
+
+The AArch64 memcpy and memset code (src/string/aarch64/*) are
+Copyright © 1999-2019, Arm Limited.
+
+The implementation of DES for crypt (src/crypt/crypt_des.c) is
+Copyright © 1994 David Burren. It is licensed under a BSD license.
+
+The implementation of blowfish crypt (src/crypt/crypt_blowfish.c) was
+originally written by Solar Designer and placed into the public
+domain. The code also comes with a fallback permissive license for use
+in jurisdictions that may not recognize the public domain.
+
+The smoothsort implementation (src/stdlib/qsort.c) is Copyright © 2011
+Lynn Ochs and is licensed under an MIT-style license.
+
+The x86_64 port was written by Nicholas J. Kain and is licensed under
+the standard MIT terms.
+
+The mips and microblaze ports were originally written by Richard
+Pennington for use in the ellcc project. The original code was adapted
+by Rich Felker for build system and code conventions during upstream
+integration. It is licensed under the standard MIT terms.
+
+The mips64 port was contributed by Imagination Technologies and is
+licensed under the standard MIT terms.
+
+The powerpc port was also originally written by Richard Pennington,
+and later supplemented and integrated by John Spencer. It is licensed
+under the standard MIT terms.
+
+All other files which have no copyright comments are original works
+produced specifically for use as part of this library, written either
+by Rich Felker, the main author of the library, or by one or more
+contibutors listed above. Details on authorship of individual files
+can be found in the git version control history of the project. The
+omission of copyright and license comments in each file is in the
+interest of source tree size.
+
+In addition, permission is hereby granted for all public header files
+(include/* and arch/*/bits/*) and crt files intended to be linked into
+applications (crt/*, ldso/dlstart.c, and arch/*/crt_arch.h) to omit
+the copyright notice and permission notice otherwise required by the
+license, and to use these files without any requirement of
+attribution. These files include substantial contributions from:
+
+Bobby Bingham
+John Spencer
+Nicholas J. Kain
+Rich Felker
+Richard Pennington
+Stefan Kristiansson
+Szabolcs Nagy
+
+all of whom have explicitly granted such permission.
+
+This file previously contained text expressing a belief that most of
+the files covered by the above exception were sufficiently trivial not
+to be subject to copyright, resulting in confusion over whether it
+negated the permissions granted in the license. In the spirit of
+permissive licensing, and of not having licensing issues being an
+obstacle to adoption, that text has been removed.
--- a/locator/token_metadata.hh
+++ b/locator/token_metadata.hh
@@ -235,9 +235,6 @@ public:
    const topology& get_topology() const;
    void debug_show() const;

-    /** Return the unique host ID for an end-point. */
-    host_id get_host_id(inet_address endpoint) const;
-
    /** @return a copy of the endpoint-to-id map for read-only operations */
    std::unordered_set<host_id> get_host_ids() const;

--- a/main.cc
+++ b/main.cc
@@ -748,8 +748,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
    // inherit Seastar's CPU affinity masks. We want this thread to be free
    // to migrate between CPUs; we think that's what makes the most sense.
    auto rpc_dict_training_worker = utils::alien_worker(startlog, 19, "rpc-dict");
-    // niceness=10 is ~10% of normal process time
-    auto hashing_worker = utils::alien_worker(startlog, 10, "pwd-hash");

    return app.run(ac, av, [&] () -> future<int> {

@@ -779,8 +777,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
        return seastar::async([&app, cfg, ext, &disk_space_monitor_shard0, &cm, &sstm, &db, &qp, &bm, &proxy, &mapreduce_service, &mm, &mm_notifier, &ctx, &opts, &dirs,
                &prometheus_server, &cf_cache_hitrate_calculator, &load_meter, &feature_service, &gossiper, &snitch,
                &token_metadata, &erm_factory, &snapshot_ctl, &messaging, &sst_dir_semaphore, &raft_gr, &service_memory_limiter,
-                &repair, &sst_loader, &auth_cache, &ss, &lifecycle_notifier, &stream_manager, &task_manager, &rpc_dict_training_worker,
-                &hashing_worker, &vector_store_client] {
+                &repair, &sst_loader, &auth_cache, &ss, &lifecycle_notifier, &stream_manager, &task_manager, &rpc_dict_training_worker, &vector_store_client] {
          try {
              if (opts.contains("relabel-config-file") && !opts["relabel-config-file"].as<sstring>().empty()) {
                  // calling update_relabel_config_from_file can cause an exception that would stop startup
@@ -2060,7 +2057,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                maintenance_auth_config.authenticator_java_name = sstring{auth::allow_all_authenticator_name};
                maintenance_auth_config.role_manager_java_name = sstring{auth::maintenance_socket_role_manager_name};

-                maintenance_auth_service.start(perm_cache_config, std::ref(qp), std::ref(group0_client),  std::ref(mm_notifier), std::ref(mm), maintenance_auth_config, maintenance_socket_enabled::yes, std::ref(auth_cache), std::ref(hashing_worker)).get();
+                maintenance_auth_service.start(perm_cache_config, std::ref(qp), std::ref(group0_client),  std::ref(mm_notifier), std::ref(mm), maintenance_auth_config, maintenance_socket_enabled::yes, std::ref(auth_cache)).get();

                cql_maintenance_server_ctl.emplace(maintenance_auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, maintenance_cql_sg_stats_key, maintenance_socket_enabled::yes, dbcfg.statement_scheduling_group);

@@ -2336,7 +2333,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            auth_config.authenticator_java_name = qualified_authenticator_name;
            auth_config.role_manager_java_name = qualified_role_manager_name;

-            auth_service.start(std::move(perm_cache_config), std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), auth_config, maintenance_socket_enabled::no, std::ref(auth_cache), std::ref(hashing_worker)).get();
+            auth_service.start(std::move(perm_cache_config), std::ref(qp), std::ref(group0_client), std::ref(mm_notifier), std::ref(mm), auth_config, maintenance_socket_enabled::no, std::ref(auth_cache)).get();

            std::any stop_auth_service;
            // Has to be called after node joined the cluster (join_cluster())
@@ -2380,7 +2377,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl

            checkpoint(stop_signal, "starting batchlog manager");
            db::batchlog_manager_config bm_cfg;
-            bm_cfg.write_request_timeout = cfg->write_request_timeout_in_ms() * 1ms;
+            bm_cfg.replay_timeout = cfg->write_request_timeout_in_ms() * 1ms * 2;
            bm_cfg.replay_rate = cfg->batchlog_replay_throttle_in_kb() * 1000;
            bm_cfg.delay = std::chrono::milliseconds(cfg->ring_delay_ms());
            bm_cfg.replay_cleanup_after_replays = cfg->batchlog_replay_cleanup_after_replays();
--- a/pgo/profiles/aarch64/profile.profdata.xz
+++ b/pgo/profiles/aarch64/profile.profdata.xz
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:80a47fe93866989aaf7e949168fcd308e95841e78c976a61f9eac20bfdd34d96
-size 6448960
+oid sha256:3cbe2dd05945f8fb76ebce2ea70864063d2b282c4d5080af1f290ead43321ab3
+size 6444732
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -551,9 +551,13 @@ void repair_writer_impl::create_writer(lw_shared_ptr<repair_writer> w) {
    }
    replica::table& t = _db.local().find_column_family(_schema->id());
    rlogger.debug("repair_writer: keyspace={}, table={}, estimated_partitions={}", w->schema()->ks_name(), w->schema()->cf_name(), w->get_estimated_partitions());
+    // #17384 don't use off-strategy for repair (etc) if using tablets. sstables generated will 
+    // be single token range and can just be added to normal sstable set as is, eventually
+    // handled by normal compaction.
+    auto off_str = t.uses_tablets() ? sstables::offstrategy(false) : is_offstrategy_supported(_reason);
    auto sharder = get_sharder_helper(t, *(w->schema()), _topo_guard);
    _writer_done = mutation_writer::distribute_reader_and_consume_on_shards(_schema, sharder.sharder, std::move(_queue_reader),
-            streaming::make_streaming_consumer(sstables::repair_origin, _db, _view_builder, _view_building_worker, w->get_estimated_partitions(), _reason, is_offstrategy_supported(_reason),
+            streaming::make_streaming_consumer(sstables::repair_origin, _db, _view_builder, _view_building_worker, w->get_estimated_partitions(), _reason, off_str,
                _topo_guard, _repaired_at, w->get_sstable_list_to_mark_as_repaired()),
    t.stream_in_progress()).then([w] (uint64_t partitions) {
        rlogger.debug("repair_writer: keyspace={}, table={}, managed to write partitions={} to sstable",
--- a/replica/data_dictionary_impl.hh
+++ b/replica/data_dictionary_impl.hh
@@ -96,6 +96,9 @@ public:
    virtual const secondary_index::secondary_index_manager& get_index_manager(data_dictionary::table t) const override {
        return const_cast<replica::table&>(unwrap(t)).get_index_manager();
    }
+    virtual db_clock::time_point get_truncation_time(data_dictionary::table t) const override {
+        return const_cast<replica::table&>(unwrap(t)).get_truncation_time();
+    }
    virtual lw_shared_ptr<keyspace_metadata> get_keyspace_metadata(data_dictionary::keyspace ks) const override {
        return unwrap(ks).metadata();
    }
--- a/replica/mutation_dump.cc
+++ b/replica/mutation_dump.cc
@@ -215,7 +215,7 @@ private:
                output_ck_raw_values.emplace_back(bytes{});
            }
        }
-        if (underlying_ck_raw_values.empty()) {
+        if (pos.region() != partition_region::clustered) {
            output_ck_raw_values.push_back(bytes{});
        } else {
            output_ck_raw_values.push_back(data_value(static_cast<int8_t>(pos.get_bound_weight())).serialize_nonnull());
--- a/replica/table.cc
+++ b/replica/table.cc
@@ -16,6 +16,7 @@
 #include <seastar/coroutine/as_future.hh>
 #include <seastar/util/closeable.hh>
 #include <seastar/util/defer.hh>
+#include <seastar/json/json_elements.hh>

 #include "dht/decorated_key.hh"
 #include "replica/database.hh"
@@ -3198,23 +3199,35 @@ db::replay_position table::highest_flushed_replay_position() const {
    return _highest_flushed_rp;
 }

+struct manifest_json : public json::json_base {
+    json::json_chunked_list<sstring> files;
+
+    manifest_json() {
+        register_params();
+    }
+    manifest_json(manifest_json&& e) {
+        register_params();
+        files = std::move(e.files);
+    }
+    manifest_json& operator=(manifest_json&& e) {
+        files = std::move(e.files);
+        return *this;
+    }
+private:
+    void register_params() {
+        add(&files, "files");
+    }
+};
+
 future<>
 table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets) {
-    std::ostringstream ss;
-    int n = 0;
-    ss << "{" << std::endl << "\t\"files\" : [ ";
+    manifest_json manifest;
    for (const auto& fsp : file_sets) {
-      for (const auto& rf : *fsp) {
-        if (n++ > 0) {
-            ss << ", ";
+        for (auto& rf : *fsp) {
+            manifest.files.push(std::move(rf));
        }
-        ss << "\"" << rf << "\"";
-        co_await coroutine::maybe_yield();
-      }
    }
-    ss << " ]" << std::endl << "}" << std::endl;
-
-    auto json = ss.str();
+    auto streamer = json::stream_object(std::move(manifest));
    auto jsonfile = jsondir + "/manifest.json";

    tlogger.debug("Storing manifest {}", jsonfile);
@@ -3224,12 +3237,10 @@ table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets)
    auto out = co_await make_file_output_stream(std::move(f));
    std::exception_ptr ex;
    try {
-        co_await out.write(json.c_str(), json.size());
-        co_await out.flush();
+        co_await streamer(std::move(out));
    } catch (...) {
        ex = std::current_exception();
    }
-    co_await out.close();

    if (ex) {
        co_await coroutine::return_exception_ptr(std::move(ex));
--- a/scripts/get_description.py
+++ b/scripts/get_description.py
@@ -22,7 +22,7 @@ format_match = re.compile(r'\s*(?:seastar::)?format\(\s*"([^"]+)"\s*,\s*(.*)\s*'
 def handle_error(message, strict=True, verbose_mode=False):
    if strict:
        print(f"[ERROR] {message}")
-        exit(-1)
+        exit(1)
    elif verbose_mode:
        print(f"[WARNING] {message}")

@@ -180,12 +180,11 @@ def get_metrics_from_file(file_name, prefix, metrics_information, verb=None, str
    groups = {}
    if clean_name in metrics_information:
        if (isinstance(metrics_information[clean_name], str) and metrics_information[clean_name] == "skip") or "skip" in metrics_information[clean_name]:
-            exit(0)
+            return {}
    param_mapping =  metrics_information[clean_name]["params"] if clean_name in metrics_information and "params" in metrics_information[clean_name] else {}
    groups = metrics_information[clean_name]["groups"] if clean_name in metrics_information and "groups" in metrics_information[clean_name] else {}

    metrics = {}
-    multi_line = False
    names = undefined
    typ = undefined
    line_number = 0;
--- a/scripts/metrics-config.yml
+++ b/scripts/metrics-config.yml
@@ -1,6 +1,6 @@
 "cdc/log.cc":
  params:
-    cdc_group_name: cdc
+    cdc_group_name: "cdc"
    part_name;suffix: [["static_row", "total"],["clustering_row", "total"], ["map", "total"], ["set", "total"], ["list", "total"], ["udt", "total"], ["range_tombstone", "total"],["partition_delete", "total"],["row_delete", "total"], ["static_row", "failed"],["clustering_row", "failed"], ["map", "failed"], ["set", "failed"], ["list", "failed"], ["udt", "failed"], ["range_tombstone", "failed"],["partition_delete", "failed"],["row_delete", "failed"]]
    kind: ["total", "failed"]
 "db/commitlog/commitlog.cc":
@@ -9,7 +9,7 @@
        "cfg.max_active_flushes": "cfg.max_active_flushes"
 "cql3/query_processor.cc":
    groups:
-        "80": query_processor
+        "80": "query_processor"
 "replica/dirty_memory_manager.cc":
  params:
    namestr: ["regular", "system"]
@@ -19,10 +19,11 @@
 "replica/database.cc":
  params:
    "_dirty_memory_manager.throttle_threshold()": "throttle threshold"
-"seastar/apps/metrics_tester/metrics_tester.cc": skip
-"seastar/tests/unit/metrics_test.cc": skip
-"seastar/tests/unit/metrics_tester.cc": skip
-"seastar/tests/unit/prometheus_http_test.cc": skip
+"seastar/apps/metrics_tester/metrics_tester.cc": "skip"
+"seastar/tests/unit/metrics_test.cc": "skip"
+"seastar/tests/unit/metrics_tester.cc": "skip"
+"seastar/tests/unit/prometheus_http_test.cc": "skip"
+"seastar/tests/unit/prometheus_text_test.cc": "skip"
 "service/storage_proxy.cc":
  params:
    COORDINATOR_STATS_CATEGORY: "storage_proxy_coordinator"
@@ -32,25 +33,25 @@
    _short_description_prefix: ["total_write_attempts", "write_errors", "background_replica_writes_failed", "read_repair_write_attempts"]
    _long_description_prefix: ["total number of write requests", "number of write requests that failed", "background_replica_writes_failed", "number of write operations in a read repair context"]
    _category: "storage_proxy_coordinator"
-"thrift/server.cc": skip
+"thrift/server.cc": "skip"
 "tracing/tracing.cc":
  params:
    "max_pending_trace_records + write_event_records_threshold": "max_pending_trace_records + write_event_records_threshold"
 "transport/server.cc":
  groups:
-    "200": transport
+    "200": "transport"
  params:
    "_config.max_request_size": "max_request_size"
-"seastar/src/net/dpdk.cc": skip
+"seastar/src/net/dpdk.cc": "skip"
 "db/hints/manager.cc":
    params:
        "group_name": ["hints_for_views_manager", "hints_manager"]
 "seastar/src/core/execution_stage.cc":
    groups:
-        "100": execution_stages
+        "100": "execution_stages"
 "seastar/src/core/fair_queue.cc":
    groups:
-        "300": io_queue
+        "300": "io_queue"
 "seastar/src/net/net.cc":
    params:
        _stats_plugin_name: ["stats_plugin_name"]
--- a/scylla-gdb.py
+++ b/scylla-gdb.py
@@ -818,6 +818,9 @@ class std_list:
                self._node = node_header['_M_next']
                self._end = node_header['_M_next']['_M_prev']

+            def __iter__(self):
+                return self
+
            def __next__(self):
                if self._node == self._end:
                    raise StopIteration()
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -36,6 +36,7 @@
 #include <seastar/core/future-util.hh>
 #include "db/read_repair_decision.hh"
 #include "db/config.hh"
+#include "db/batchlog.hh"
 #include "db/batchlog_manager.hh"
 #include "db/hints/manager.hh"
 #include "db/system_keyspace.hh"
@@ -4281,12 +4282,13 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
        coordinator_mutate_options _options;

        const utils::UUID _batch_uuid;
+        const db_clock::time_point _batch_write_time;
        const host_id_vector_replica_set _batchlog_endpoints;

    public:
        context(storage_proxy & p, utils::chunked_vector<mutation>&& mutations, lw_shared_ptr<cdc::operation_result_tracker>&& cdc_tracker, db::consistency_level cl, clock_type::time_point timeout, tracing::trace_state_ptr tr_state, service_permit permit, coordinator_mutate_options options)
                : _p(p)
-                , _schema(_p.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG))
+                , _schema(_p.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2))
                , _ermp(_p.local_db().find_column_family(_schema->id()).get_effective_replication_map())
                , _mutations(std::move(mutations))
                , _cdc_tracker(std::move(cdc_tracker))
@@ -4297,6 +4299,7 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
                , _permit(std::move(permit))
                , _options(std::move(options))
                , _batch_uuid(utils::UUID_gen::get_time_UUID())
+                , _batch_write_time(db_clock::now())
                , _batchlog_endpoints(
                        [this]() -> host_id_vector_replica_set {
                            auto local_addr = _p.my_host_id(*_ermp);
@@ -4334,17 +4337,14 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
            }));
        }
        future<result<>> sync_write_to_batchlog() {
-            auto m = _p.do_get_batchlog_mutation_for(_schema, _mutations, _batch_uuid, netw::messaging_service::current_version, db_clock::now());
+            auto m = db::get_batchlog_mutation_for(_schema, _mutations, netw::messaging_service::current_version, _batch_write_time, _batch_uuid);
            tracing::trace(_trace_state, "Sending a batchlog write mutation");
            return send_batchlog_mutation(std::move(m));
        };
        future<> async_remove_from_batchlog() {
            // delete batch
            utils::get_local_injector().inject("storage_proxy_fail_remove_from_batchlog", [] { throw std::runtime_error("Error injection: failing remove from batchlog"); });
-            auto key = partition_key::from_exploded(*_schema, {uuid_type->decompose(_batch_uuid)});
-            auto now = service::client_state(service::client_state::internal_tag()).get_timestamp();
-            mutation m(_schema, key);
-            m.partition().apply_delete(*_schema, clustering_key_prefix::make_empty(), tombstone(now, gc_clock::now()));
+            auto m = db::get_batchlog_delete_mutation(_schema, netw::messaging_service::current_version, _batch_write_time, _batch_uuid);

            tracing::trace(_trace_state, "Sending a batchlog remove mutation");
            return send_batchlog_mutation(std::move(m), db::consistency_level::ANY).then_wrapped([] (future<result<>> f) {
@@ -4363,6 +4363,7 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
            return _p.mutate_prepare(_mutations, _cl, db::write_type::BATCH, _trace_state, _permit, db::allow_per_partition_rate_limit::no, _options).then(utils::result_wrap([this] (unique_response_handler_vector ids) {
                return sync_write_to_batchlog().then(utils::result_wrap([this, ids = std::move(ids)] () mutable {
                    tracing::trace(_trace_state, "Sending batch mutations");
+                    utils::get_local_injector().inject("storage_proxy_fail_send_batch", [] { throw std::runtime_error("Error injection: failing to send batch"); });
                    _p.register_cdc_operation_result_tracker(ids, _cdc_tracker);
                    return _p.mutate_begin(std::move(ids), _cl, _trace_state, _timeout);
                })).then(utils::result_wrap([this] {
@@ -4398,33 +4399,6 @@ storage_proxy::mutate_atomically_result(utils::chunked_vector<mutation> mutation
    }).then_wrapped(std::move(cleanup));
 }

-mutation storage_proxy::get_batchlog_mutation_for(const utils::chunked_vector<mutation>& mutations, const utils::UUID& id, int32_t version, db_clock::time_point now) {
-    auto schema = local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG);
-    return do_get_batchlog_mutation_for(std::move(schema), mutations, id, version, now);
-}
-
-mutation storage_proxy::do_get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, const utils::UUID& id, int32_t version, db_clock::time_point now) {
-    auto key = partition_key::from_singular(*schema, id);
-    auto timestamp = api::new_timestamp();
-    auto data = [&mutations] {
-        utils::chunked_vector<canonical_mutation> fm(mutations.begin(), mutations.end());
-        bytes_ostream out;
-        for (auto& m : fm) {
-            ser::serialize(out, m);
-        }
-        return std::move(out).to_managed_bytes();
-    }();
-
-    mutation m(schema, key);
-    m.set_cell(clustering_key_prefix::make_empty(), to_bytes("version"), version, timestamp);
-    m.set_cell(clustering_key_prefix::make_empty(), to_bytes("written_at"), now, timestamp);
-    // Avoid going through data_value and therefore `bytes`, as it can be large (#24809).
-    auto cdef_data = schema->get_column_definition(to_bytes("data"));
-    m.set_cell(clustering_key_prefix::make_empty(), *cdef_data, atomic_cell::make_live(*cdef_data->type, timestamp, std::move(data)));
-
-    return m;
-}
-
 template<typename Range>
 bool storage_proxy::cannot_hint(const Range& targets, db::write_type type) const {
    // if hints are disabled we "can always hint" since there's going to be no hint generated in this case
@@ -4528,14 +4502,14 @@ future<> storage_proxy::send_hint_to_all_replicas(frozen_mutation_and_schema fm_
 }

 future<> storage_proxy::send_batchlog_replay_to_all_replicas(utils::chunked_vector<mutation> mutations, clock_type::time_point timeout) {
-    if (utils::get_local_injector().is_enabled("batch_replay_throw")) {
-        throw std::runtime_error("Skipping batch replay due to batch_replay_throw injection");
-    }
+    utils::get_local_injector().inject("storage_proxy_fail_replay_batch", [] { throw std::runtime_error("Error injection: failing to send batch"); });

    utils::chunked_vector<batchlog_replay_mutation> ms = mutations | std::views::transform([] (auto&& m) {
            return batchlog_replay_mutation(std::move(m));
        }) | std::ranges::to<utils::chunked_vector<batchlog_replay_mutation>>();

+    utils::get_local_injector().inject("storage_proxy_fail_replay_batch", [] { throw std::runtime_error("Error injection: failing to send batch"); });
+
    return mutate_internal(std::move(ms), db::consistency_level::EACH_QUORUM, nullptr, empty_service_permit(), timeout, db::write_type::BATCH)
            .then(utils::result_into_future<result<>>);
 }
--- a/service/storage_proxy.hh
+++ b/service/storage_proxy.hh
@@ -683,7 +683,6 @@ private:
        fencing_token caller_token, locator::host_id caller_id,
        Func&& write_func);

-    mutation do_get_batchlog_mutation_for(schema_ptr schema, const utils::chunked_vector<mutation>& mutations, const utils::UUID& id, int32_t version, db_clock::time_point now);
    future<> drain_on_shutdown();
 public:
    void update_fence_version(locator::token_metadata::version_t fence_version);
@@ -834,8 +833,6 @@ public:
            db::consistency_level cl_for_paxos, db::consistency_level cl_for_learn,
            clock_type::time_point write_timeout, clock_type::time_point cas_timeout, bool write = true, cdc::per_request_options cdc_opts = {});

-    mutation get_batchlog_mutation_for(const utils::chunked_vector<mutation>& mutations, const utils::UUID& id, int32_t version, db_clock::time_point now);
-
    future<> stop();
    future<> start_hints_manager();
    void allow_replaying_hints() noexcept;
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -7531,7 +7531,7 @@ future<join_node_response_result> storage_service::join_node_response_handler(jo
                &&  _join_node_response_done.failed()) {
            // The topology coordinator accepted the node that was rejected before or failed while handling
            // the response. Inform the coordinator about it so it moves the node to the left state.
-            throw _join_node_response_done.get_shared_future().get_exception();
+            co_await coroutine::return_exception_ptr(_join_node_response_done.get_shared_future().get_exception());
        }

        co_return join_node_response_result{};
--- a/test/alternator/test_authorization.py
+++ b/test/alternator/test_authorization.py
@@ -9,7 +9,7 @@ import pytest
 import requests
 from botocore.exceptions import ClientError

-from test.alternator.test_manual_requests import get_signed_request
+from .util import get_signed_request


 # Test that trying to perform an operation signed with a wrong key
--- a/test/alternator/test_manual_requests.py
+++ b/test/alternator/test_manual_requests.py
@@ -13,30 +13,12 @@ import urllib3
 from botocore.exceptions import BotoCoreError, ClientError
 from packaging.version import Version

-from test.alternator.util import random_bytes, random_string
+from test.alternator.util import random_bytes, random_string, get_signed_request, manual_request, ManualRequestError


 def gen_json(n):
    return '{"":'*n + '{}' + '}'*n

-def get_signed_request(dynamodb, target, payload):
-    # Usually "payload" will be a Python string and we'll write it as UTF-8.
-    # but in some tests we may want to write bytes directly - potentially
-    # bytes which include invalid UTF-8.
-    payload_bytes = payload if isinstance(payload, bytes) else payload.encode(encoding='UTF-8')
-    # NOTE: Signing routines use boto3 implementation details and may be prone
-    # to unexpected changes
-    class Request:
-        url=dynamodb.meta.client._endpoint.host
-        headers={'X-Amz-Target': 'DynamoDB_20120810.' + target, 'Content-Type': 'application/x-amz-json-1.0'}
-        body=payload_bytes
-        method='POST'
-        context={}
-        params={}
-    req = Request()
-    signer = dynamodb.meta.client._request_signer
-    signer.get_auth(signer.signing_name, signer.region_name).add_auth(request=req)
-    return req

 # Test that deeply nested objects (e.g. with depth of 200k) are parsed correctly,
 # i.e. do not cause stack overflows for the server. It's totally fine for the
@@ -483,6 +465,23 @@ def assert_validation_exception(response_text, request_info, accept_serializatio
        (accept_serialization_exception and "SerializationException" in r['__type']), \
        f"Unexpected error type {r['__type']} for {request_info}"

+# Test that JSON parse errors have a reasonable type (in DynamoDB, it is
+# SerializationException but in Alternator it is ValidationException), and
+# if it contains a message, it doesn't contain garbage positions (reproduces
+# issue #27372).
+def test_json_parse_error(dynamodb, test_table):
+    with pytest.raises(ManualRequestError) as err:
+        manual_request(dynamodb, 'PutItem', '{broken json')
+    e = err.value
+    # In DynamoDB, we get a SerializationException with no message.
+    # In Alternator, we get a ValidationException with a message.
+    # For now, we'll accept both, but want to check that if we do have
+    # a message it gives the correct position of the error ('at 1') and
+    # not some garbage number (issue #27372).
+    assert e.type == 'SerializationException' or e.type == 'ValidationException'
+    if e.message:
+        assert e.message.endswith('at 1')
+
 # Tests some invalid payloads (empty values, wrong types) to BatchWriteItem. Reproduces #23233
 def test_batch_write_item_invalid_payload(dynamodb, test_table):
    cases = [
@@ -587,3 +586,58 @@ def test_keep_alive(dynamodb, test_table, use_keep_alive):
    finally:
        urllib3.connection.HTTPConnection.connect = original_http_connect
        urllib3.connection.HTTPSConnection.connect = original_https_connect
+
+# Test that attempting to write a malformed value with PutItem, UpdateItem or
+# BatchWriteItem fails. A "malformed value" is valid JSON but which doesn't
+# conform to DynamoDB's value structure - maps with types. For example,
+# {"S": "dog"} is a proper value (a string), but {"dog": "cat"} is malformed.
+# We don't want to store the malformed value on disk and only discover the
+# problem when reading the value back. We want the write to fail immediately,
+# and this is what this test checks. Reproduces issue #8070.
+@pytest.mark.xfail(reason="issue #8070")
+@pytest.mark.parametrize("op", ['PutItem', 'UpdateItem', 'BatchWriteItem'])
+def test_write_malformed_value(dynamodb, test_table_s, op):
+    p = random_string()
+    payloads = {
+        'PutItem': '''{
+            "TableName": "''' + test_table_s.name + '''",
+            "Item": {"p": {"S": "''' + p + '''"}, "x": %s}}''',
+        'UpdateItem': '''{
+            "TableName": "''' + test_table_s.name + '''",
+            "Key": {"p": {"S": "''' + p + '''"}},
+            "UpdateExpression": "SET x = :x",
+            "ExpressionAttributeValues": {":x": %s }}''',
+        'BatchWriteItem': '''{
+            "RequestItems": {
+            "''' + test_table_s.name + '''": [
+                {"PutRequest":
+                    {"Item": {"p": {"S": "''' + p + '''"}, "x": %s}}}
+            ]}}'''
+    }
+    payload = payloads[op]
+    # As a sanity check, check the value {"S": "hello"} works:
+    manual_request(dynamodb, op, payload % '{"S": "hello"}')
+    assert test_table_s.get_item(Key={'p': p}, ConsistentRead=True)['Item']['x'] == 'hello'
+    # The value {"dog": "cat"} is malformed. Because Alternator wants to
+    # optimize how it stores certain types, it checks the supposed type
+    # of this value and sees "dog" is not a known type and fails the write.
+    with pytest.raises(ManualRequestError, match='ValidationException'):
+        manual_request(dynamodb, op, payload % '{"dog": "cat"}')
+    # The value {"N": 3} is also malformed - "N" is a good type ("N") but
+    # the right serialization for the number 3 is {"N": "3"} (with the
+    # string "3") not {"N": 3}. DynamoDB generates a SerializationException
+    # error here, Alternator a ValidationException.
+    # I consider the difference to be not important - the important thing
+    # is that the write fails and doesn't save a malformed value on disk.
+    with pytest.raises(ManualRequestError, match='ValidationException|SerializationException'):
+        manual_request(dynamodb, op, payload % '{"N": 3}')
+    # If the value is a map (type "M"), Alternator doesn't attempt to further
+    # optimize its storage, and as issue #8070 noted, just stored the value
+    # as-is, as JSON, so it missed the need to validate the content of that
+    # JSON - and failed to find the malformed value.
+    with pytest.raises(ManualRequestError, match='ValidationException|SerializationException'):
+        manual_request(dynamodb, op, payload % '{"M": {"dog": "cat"}}')
+        # If PutItem didn't fail, and wrote the malformed map, GetItem
+        # will return this broken map and boto3's attempt to parse the
+        # returned map will fail, causing the following call to fail.
+        test_table_s.get_item(Key={'p': p}, ConsistentRead=True)
--- a/test/alternator/test_metrics.py
+++ b/test/alternator/test_metrics.py
@@ -33,9 +33,8 @@ import pytest
 import requests
 from botocore.exceptions import ClientError

-from test.alternator.test_manual_requests import get_signed_request
 from test.alternator.test_cql_rbac import new_dynamodb, new_role
-from test.alternator.util import random_string, new_test_table, is_aws, scylla_config_read, scylla_config_temporary
+from test.alternator.util import random_string, new_test_table, is_aws, scylla_config_read, scylla_config_temporary, get_signed_request

 # Fixture for checking if we are able to test Scylla metrics. Scylla metrics
 # are not available on AWS (of course), but may also not be available for
--- a/test/alternator/test_streams.py
+++ b/test/alternator/test_streams.py
@@ -14,7 +14,7 @@ import pytest
 from boto3.dynamodb.types import TypeDeserializer
 from botocore.exceptions import ClientError

-from test.alternator.util import is_aws, scylla_config_temporary, unique_table_name, create_test_table, new_test_table, random_string, full_scan, freeze, list_tables, get_region
+from test.alternator.util import is_aws, scylla_config_temporary, unique_table_name, create_test_table, new_test_table, random_string, full_scan, freeze, list_tables, get_region, manual_request

 # All tests in this file are expected to fail with tablets due to #23838.
 # To ensure that Alternator Streams is still being tested, instead of
@@ -1023,6 +1023,47 @@ def test_streams_updateitem_identical(test_table_ss_keys_only, test_table_ss_new
        do_test(test_table_ss_old_image, dynamodb, dynamodbstreams, do_updates, 'OLD_IMAGE')
        do_test(test_table_ss_new_and_old_images, dynamodb, dynamodbstreams, do_updates, 'NEW_AND_OLD_IMAGES')

+# The above test_streams_updateitem_identical tested that if we UpdateItem an
+# item changing an attribute to a value identical to the one it already has,
+# no event is generated on the stream. In this test we change an attribute
+# value from one map to another which are really the same value, but have
+# different serialization (the map's elements are ordered differently).
+# Since the value is nevertheless the same, here too we expect not to see
+# a change event in the stream. Reproduces issue #27375.
+@pytest.mark.xfail(reason="issue #27375")
+def test_streams_updateitem_equal_but_not_identical(test_table_ss_keys_only, test_table_ss_new_image, test_table_ss_old_image, test_table_ss_new_and_old_images, dynamodb, dynamodbstreams):
+    def do_updates(table, p, c):
+        events = []
+        # We use manual_request() to let us be sure that we pass the JSON
+        # values to the server in exactly the order we want to, without the
+        # Python SDK possibly rearranging them.
+        # Set x to be a map: {'dog': 1, 'cat': 2, 'mouse': 3}.
+        payload = '''{
+            "TableName": "''' + table.name + '''",
+            "Key": {"p": {"S": "''' + p + '''"}, "c": {"S": "''' + c + '''"}},
+            "UpdateExpression": "SET x = :x",
+            "ExpressionAttributeValues": {":x": %s}
+            }'''
+        manual_request(dynamodb, 'UpdateItem',
+            payload % '{"M": {"dog": {"N": "1"}, "cat": {"N": "2"}, "mouse": {"N": "3"}}}')
+        events.append(['INSERT', {'p': p, 'c': c}, None, {'p': p, 'c': c, 'x': {'dog': 1, 'cat': 2, 'mouse': 3}}])
+        # Overwriting x an identical map item shouldn't produce any events,
+        # so we won't add anything to the "events" list:
+        manual_request(dynamodb, 'UpdateItem',
+            payload % '{"M": {"dog": {"N": "1"}, "cat": {"N": "2"}, "mouse": {"N": "3"}}}')
+        # Now try to overwrite x with a map that has the same elements in
+        # a different order. These two values, despite not being identical
+        # in JSON form, should be considered equal and again no event should
+        # be generated.
+        manual_request(dynamodb, 'UpdateItem',
+            payload % '{"M": {"cat": {"N": "2"}, "dog": {"N": "1"}, "mouse": {"N": "3"}}}')
+        return events
+    with scylla_config_temporary(dynamodb, 'alternator_streams_increased_compatibility', 'true', nop=is_aws(dynamodb)):
+        do_test(test_table_ss_keys_only, dynamodb, dynamodbstreams, do_updates, 'KEYS_ONLY')
+        do_test(test_table_ss_new_image, dynamodb, dynamodbstreams, do_updates, 'NEW_IMAGE')
+        do_test(test_table_ss_old_image, dynamodb, dynamodbstreams, do_updates, 'OLD_IMAGE')
+        do_test(test_table_ss_new_and_old_images, dynamodb, dynamodbstreams, do_updates, 'NEW_AND_OLD_IMAGES')
+
 # Tests that deleting a missing attribute with UpdateItem doesn't generate a
 # REMOVE event. Other cases are tested in test_streams_batch_delete_missing
 # and in tests based on do_updates_1. Reproduces #6918.
--- a/test/alternator/util.py
+++ b/test/alternator/util.py
@@ -10,6 +10,7 @@ import collections
 import time
 import re
 import requests
+import json
 import pytest
 from contextlib import contextmanager
 from botocore.hooks import HierarchicalEmitter
@@ -364,3 +365,54 @@ def scylla_config_temporary(dynamodb, name, value, nop = False):
        yield
    finally:
        scylla_config_write(dynamodb, name, original_value)
+
+# manual_request() can be used to send a DynamoDB API request without any
+# boto3 involvement in preparing the request - the operation name and
+# operation payload (a JSON string) are created by the caller. Use this
+# function sparingly - most tests should use boto3's resource API or when
+# needed, client_no_transform(). Although manual_request() does give the test
+# more control, not using it allows the test to check the natural requests
+# sent by a real-life SDK.
+def manual_request(dynamodb, op, payload):
+    req = get_signed_request(dynamodb, op, payload)
+    res = requests.post(req.url, headers=req.headers, data=req.body, verify=False)
+    if res.status_code == 200:
+        return json.loads(res.text)
+    else:
+        err = json.loads(res.text)
+        error_code = res.status_code
+        error_type = err['__type'].split('#')[1]
+        # Normally, DynamoDB uses lowercase 'message', but in some cases
+        # it uses 'Message', and for some types of error it may be missing
+        # entirely (we'll return an empty string for that).
+        message = err.get('message', err.get('Message', ''))
+        raise ManualRequestError(error_code, error_type, message)
+
+class ManualRequestError(Exception):
+    def __init__(self, error_code, error_type, message):
+        super().__init__(message) # message is the main exception text
+        self.code = error_code
+        self.type = error_type
+        self.message = message
+    def __str__(self):
+        return f'{self.code} {self.type} {self.message}'
+    __repr__ = __str__
+
+def get_signed_request(dynamodb, op, payload):
+    # Usually "payload" will be a Python string and we'll write it as UTF-8.
+    # but in some tests we may want to write bytes directly - potentially
+    # bytes which include invalid UTF-8.
+    payload_bytes = payload if isinstance(payload, bytes) else payload.encode(encoding='UTF-8')
+    # NOTE: Signing routines use boto3 implementation details and may be prone
+    # to unexpected changes
+    class Request:
+        url=dynamodb.meta.client._endpoint.host
+        headers={'X-Amz-Target': 'DynamoDB_20120810.' + op, 'Content-Type': 'application/x-amz-json-1.0'}
+        body=payload_bytes
+        method='POST'
+        context={}
+        params={}
+    req = Request()
+    signer = dynamodb.meta.client._request_signer
+    signer.get_auth(signer.signing_name, signer.region_name).add_auth(request=req)
+    return req
--- a/test/boost/CMakeLists.txt
+++ b/test/boost/CMakeLists.txt
@@ -12,7 +12,7 @@ add_scylla_test(alternator_unit_test
 add_scylla_test(anchorless_list_test
  KIND BOOST)
 add_scylla_test(auth_passwords_test
-  KIND BOOST
+  KIND SEASTAR
  LIBRARIES auth)
 add_scylla_test(auth_resource_test
  KIND BOOST)
--- a/test/boost/auth_passwords_test.cc
+++ b/test/boost/auth_passwords_test.cc
@@ -6,7 +6,7 @@
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

-#define BOOST_TEST_MODULE core
+#include <seastar/testing/test_case.hh>

 #include <array>
 #include <random>
@@ -16,15 +16,21 @@

 #include <boost/test/unit_test.hpp>
 #include <seastar/core/sstring.hh>
+#include <seastar/core/coroutine.hh>

 #include "seastarx.hh"

+extern "C" {
+#include <crypt.h>
+#include <unistd.h>
+}
+
 static auto rng_for_salt = std::default_random_engine(std::random_device{}());

 //
 // The same password hashed multiple times will result in different strings because the salt will be different.
 //
-BOOST_AUTO_TEST_CASE(passwords_are_salted) {
+SEASTAR_TEST_CASE(passwords_are_salted) {
    const char* const cleartext = "my_excellent_password";
    std::unordered_set<sstring> observed_passwords{};

@@ -33,12 +39,13 @@ BOOST_AUTO_TEST_CASE(passwords_are_salted) {
        BOOST_REQUIRE(!observed_passwords.contains(e));
        observed_passwords.insert(e);
    }
+    co_return;
 }

 //
 // A hashed password will authenticate against the same password in cleartext.
 //
-BOOST_AUTO_TEST_CASE(correct_passwords_authenticate) {
+SEASTAR_TEST_CASE(correct_passwords_authenticate) {
    // Common passwords.
    std::array<const char*, 3> passwords{
        "12345",
@@ -47,14 +54,85 @@ BOOST_AUTO_TEST_CASE(correct_passwords_authenticate) {
    };

    for (const char* p : passwords) {
-        BOOST_REQUIRE(auth::passwords::check(p, auth::passwords::hash(p, rng_for_salt, auth::passwords::scheme::sha_512)));
+        BOOST_REQUIRE(co_await auth::passwords::check(p, auth::passwords::hash(p, rng_for_salt, auth::passwords::scheme::sha_512)));
    }
 }

+std::string long_password(uint32_t len) {
+    std::string out;
+    auto pattern = "0123456789";
+    for (uint32_t i = 0; i < len; ++i) {
+        out.push_back(pattern[i % strlen(pattern)]);
+    }
+
+    return out;
+}
+
+SEASTAR_TEST_CASE(same_hashes_as_crypt_h) {
+
+    std::string long_pwd_254 = long_password(254);
+    std::string long_pwd_255 = long_password(255);
+    std::string long_pwd_511 = long_password(511);
+
+    std::array<const char*, 8> passwords{
+        "12345",
+        "1_am_the_greatest!",
+        "password1",
+        // Some special characters
+        "!@#$%^&*()_+-=[]{}|\n;:'\",.<>/?",
+        // UTF-8 characters
+        "こんにちは、世界！",
+        // Passwords close to __crypt_sha512 length limit
+        long_pwd_254.c_str(),
+        long_pwd_255.c_str(),
+        // Password of maximal accepted length
+        long_pwd_511.c_str(),
+    };
+
+    auto salt = "$6$aaaabbbbccccdddd";
+
+    for (const char* p : passwords) {
+        auto res = co_await auth::passwords::detail::hash_with_salt_async(p, salt);
+        BOOST_REQUIRE(res == auth::passwords::detail::hash_with_salt(p, salt));
+    }
+}
+
+SEASTAR_TEST_CASE(too_long_password) {
+    auto p1 = long_password(71);
+    auto p2 = long_password(72);
+    auto p3 = long_password(73);
+    auto too_long_password = long_password(512);
+
+    auto salt_bcrypt = "$2a$05$mAyzaIeJu41dWUkxEbn8hO";
+    auto h1_bcrypt = co_await auth::passwords::detail::hash_with_salt_async(p1, salt_bcrypt);
+    auto h2_bcrypt = co_await auth::passwords::detail::hash_with_salt_async(p2, salt_bcrypt);
+    auto h3_bcrypt = co_await auth::passwords::detail::hash_with_salt_async(p3, salt_bcrypt);
+    BOOST_REQUIRE(h1_bcrypt != h2_bcrypt);
+
+    // The check below documents the behavior of the current bcrypt
+    // implementation that compares only the first 72 bytes of the password.
+    // Although we don't typically use bcrypt for password hashing, it is
+    // possible to insert such a hash using`CREATE ROLE ... WITH HASHED PASSWORD ...`.
+    // Refs: scylladb/scylladb#26842
+    BOOST_REQUIRE(h2_bcrypt == h3_bcrypt);
+
+    // The current implementation of bcrypt password hasing fails with passwords of length 512 and above
+    BOOST_CHECK_THROW(co_await auth::passwords::detail::hash_with_salt_async(too_long_password, salt_bcrypt), std::system_error);
+
+    auto salt_sha512 = "$6$aaaabbbbccccdddd";
+    auto h1_sha512 = co_await auth::passwords::detail::hash_with_salt_async(p1, salt_sha512);
+    auto h2_sha512 = co_await auth::passwords::detail::hash_with_salt_async(p2, salt_sha512);
+    auto h3_sha512 = co_await auth::passwords::detail::hash_with_salt_async(p3, salt_sha512);
+    BOOST_REQUIRE(h1_sha512 != h2_sha512);
+    BOOST_REQUIRE(h2_sha512 != h3_sha512);
+    // The current implementation of SHA-512 password hasing fails with passwords of length 512 and above
+    BOOST_CHECK_THROW(co_await auth::passwords::detail::hash_with_salt_async(too_long_password, salt_sha512), std::system_error);
+}
+
 //
 // A hashed password that does not match the password in cleartext does not authenticate.
 //
-BOOST_AUTO_TEST_CASE(incorrect_passwords_do_not_authenticate) {
+SEASTAR_TEST_CASE(incorrect_passwords_do_not_authenticate) {
    const sstring hashed_password = auth::passwords::hash("actual_password", rng_for_salt,auth::passwords::scheme::sha_512);
-    BOOST_REQUIRE(!auth::passwords::check("password_guess", hashed_password));
+    BOOST_REQUIRE(!co_await auth::passwords::check("password_guess", hashed_password));
 }
--- a/test/boost/batchlog_manager_test.cc
+++ b/test/boost/batchlog_manager_test.cc
@@ -12,16 +12,25 @@

 #undef SEASTAR_TESTING_MAIN
 #include <seastar/testing/test_case.hh>
+#include "test/lib/cql_assertions.hh"
 #include "test/lib/cql_test_env.hh"
+#include "test/lib/error_injection.hh"
+#include "test/lib/log.hh"

 #include <seastar/core/future-util.hh>
 #include <seastar/core/shared_ptr.hh>
+#include "cql3/statements/batch_statement.hh"
 #include "cql3/query_processor.hh"
 #include "cql3/untyped_result_set.hh"
+#include "db/batchlog.hh"
 #include "db/batchlog_manager.hh"
 #include "db/commitlog/commitlog.hh"
+#include "db/config.hh"
+#include "idl/frozen_schema.dist.hh"
+#include "idl/frozen_schema.dist.impl.hh"
 #include "message/messaging_service.hh"
 #include "service/storage_proxy.hh"
+#include "utils/rjson.hh"

 BOOST_AUTO_TEST_SUITE(batchlog_manager_test)

@@ -37,6 +46,7 @@ SEASTAR_TEST_CASE(test_execute_batch) {
        return e.execute_cql("create table cf (p1 varchar, c1 int, r1 int, PRIMARY KEY (p1, c1));").discard_result().then([&qp, &e, &bp] () mutable {
            auto& db = e.local_db();
            auto s = db.find_schema("ks", "cf");
+            const auto batchlog_schema = db.find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);

            const column_definition& r1_col = *s->get_column_definition("r1");
            auto key = partition_key::from_exploded(*s, {to_bytes("key1")});
@@ -48,7 +58,7 @@ SEASTAR_TEST_CASE(test_execute_batch) {
            using namespace std::chrono_literals;

            auto version = netw::messaging_service::current_version;
-            auto bm = qp.proxy().get_batchlog_mutation_for({ m }, s->id().uuid(), version, db_clock::now() - db_clock::duration(3h));
+            auto bm = db::get_batchlog_mutation_for(batchlog_schema, { m }, version, db_clock::now() - db_clock::duration(3h), s->id().uuid());

            return qp.proxy().mutate_locally(bm, tracing::trace_state_ptr(), db::commitlog::force_sync::no).then([&bp] () mutable {
                return bp.count_all_batches().then([](auto n) {
@@ -67,4 +77,719 @@ SEASTAR_TEST_CASE(test_execute_batch) {
    });
 }

+namespace {
+
+struct fragment {
+    position_in_partition pos;
+    bool is_range_tombstone;
+    bool is_live;
+
+    fragment(position_in_partition p, bool is_rt, bool is_live)
+        : pos(std::move(p)), is_range_tombstone(is_rt), is_live(is_live)
+    {}
+
+    class less_comparator {
+        schema_ptr _s;
+    public:
+        explicit less_comparator(schema_ptr s) : _s(std::move(s)) {}
+        bool operator()(const fragment& a, const fragment& b) const {
+            position_in_partition::less_compare cmp{*_s};
+            return cmp(a.pos, b.pos);
+        }
+    };
+};
+
+struct batchlog_as_fragments {
+    using fragment_set = std::set<fragment, fragment::less_comparator>;
+    std::unordered_map<int32_t, fragment_set> initial_fragments_per_shard;
+    std::unordered_map<int32_t, fragment_set> failed_replay_fragments_per_shard;
+};
+
+position_in_partition parse_batchlog_position(const schema& s, const cql3::untyped_result_set::row& row) {
+    std::vector<managed_bytes> ck_components;
+    ck_components.reserve(2);
+    for (const auto ck_component : {"written_at", "id"}) {
+        if (!row.has(ck_component) || row.get_blob_fragmented(ck_component).empty()) {
+            // ck is prefix
+            break;
+        }
+        ck_components.push_back(row.get_blob_fragmented(ck_component));
+    }
+
+    return position_in_partition(
+            static_cast<partition_region>(row.get_as<int8_t>("partition_region")),
+            static_cast<bound_weight>(row.get_as<int8_t>("position_weight")),
+            clustering_key_prefix::from_exploded(s, ck_components));
+}
+
+batchlog_as_fragments extract_batchlog_fragments(const cql3::untyped_result_set& fragment_results, schema_ptr batchlog_v2_schema) {
+    auto is_live = [] (std::optional<sstring> value) {
+        // no value can be stored as null or empty json object ("{}")
+        return value && value->size() > 2;
+    };
+
+    batchlog_as_fragments result;
+
+    for (const auto& row : fragment_results) {
+        const auto stage = static_cast<db::batchlog_stage>(row.get_as<int8_t>("stage"));
+        const auto shard = row.get_as<int32_t>("shard");
+
+        auto& fragments_per_shard = stage == db::batchlog_stage::initial
+                ? result.initial_fragments_per_shard
+                : result.failed_replay_fragments_per_shard;
+
+        const auto is_rtc = row.get_as<sstring>("mutation_fragment_kind") == "range tombstone change";
+        auto pos = parse_batchlog_position(*batchlog_v2_schema, row);
+        BOOST_REQUIRE_EQUAL(is_rtc, (pos.get_bound_weight() != bound_weight::equal));
+
+        testlog.info("[stage {}, bathlog shard {}] fragment: pos={}, kind={}, live={}", int8_t(stage), shard, pos, row.get_as<sstring>("mutation_fragment_kind"), is_live(row.get_opt<sstring>("value")));
+
+        auto& fragments = fragments_per_shard.try_emplace(shard, batchlog_as_fragments::fragment_set(fragment::less_comparator(batchlog_v2_schema))).first->second;
+        fragments.emplace(pos, is_rtc, !is_rtc && is_live(row.get_opt<sstring>("value")));
+    }
+
+    return result;
+}
+
+void check_range_tombstone_start(const fragment& f) {
+    BOOST_REQUIRE(f.is_range_tombstone);
+    BOOST_REQUIRE(f.pos.region() == partition_region::clustered);
+    BOOST_REQUIRE(f.pos.has_key());
+    BOOST_REQUIRE(f.pos.key().explode().empty());
+    BOOST_REQUIRE(f.pos.get_bound_weight() == bound_weight::before_all_prefixed);
+}
+
+void check_range_tombstone_end(const fragment& f, std::optional<bound_weight> bound_weight_opt = {}) {
+    BOOST_REQUIRE(f.is_range_tombstone);
+    BOOST_REQUIRE(f.pos.region() == partition_region::clustered);
+    BOOST_REQUIRE(f.pos.has_key());
+    BOOST_REQUIRE_EQUAL(f.pos.key().explode().size(), 1);
+    if (bound_weight_opt) {
+        BOOST_REQUIRE(f.pos.get_bound_weight() == *bound_weight_opt);
+    }
+}
+
+} // anonymous namespace
+
+future<> run_batchlog_cleanup_with_failed_batches_test(bool replay_fails, db::batchlog_manager::post_replay_cleanup cleanup) {
+#ifndef SCYLLA_ENABLE_ERROR_INJECTION
+    return make_ready_future<>();
+#endif
+
+    cql_test_config cfg;
+    cfg.db_config->batchlog_replay_cleanup_after_replays.set_value("9999999", utils::config_file::config_source::Internal);
+    cfg.batchlog_replay_timeout = 0s;
+    cfg.batchlog_delay = 9999h;
+
+    return do_with_cql_env_thread([=] (cql_test_env& env) -> void {
+        auto& bm = env.batchlog_manager().local();
+
+        env.execute_cql("CREATE TABLE tbl (pk bigint PRIMARY KEY, v text)").get();
+
+        const uint64_t batch_count = 8;
+        uint64_t failed_batches = 0;
+
+        for (uint64_t i = 0; i != batch_count; ++i) {
+            std::vector<sstring> queries;
+            std::vector<std::string_view> query_views;
+            for (uint64_t j = 0; j != i+2; ++j) {
+                queries.emplace_back(format("INSERT INTO tbl (pk, v) VALUES ({}, 'value');", j));
+                query_views.emplace_back(queries.back());
+            }
+            const bool fail = i % 2;
+            bool injected_exception_thrown = false;
+
+            std::optional<scoped_error_injection> error_injection;
+            if (fail) {
+                ++failed_batches;
+                error_injection.emplace("storage_proxy_fail_send_batch");
+            }
+            try {
+                env.execute_batch(
+                        query_views,
+                        cql3::statements::batch_statement::type::LOGGED,
+                        std::make_unique<cql3::query_options>(db::consistency_level::ONE, std::vector<cql3::raw_value>())).get();
+            } catch (std::runtime_error& ex) {
+                if (fail) {
+                    BOOST_REQUIRE_EQUAL(std::string(ex.what()), "Error injection: failing to send batch");
+                    injected_exception_thrown = true;
+                } else {
+                    throw;
+                }
+            }
+            BOOST_REQUIRE_EQUAL(injected_exception_thrown, fail);
+        }
+
+        const auto fragments_query = format("SELECT * FROM MUTATION_FRAGMENTS({}.{}) WHERE partition_region = 2 ALLOW FILTERING", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);
+
+        assert_that(env.execute_cql(format("SELECT id FROM {}.{}", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2)).get())
+            .is_rows()
+            .with_size(failed_batches);
+
+        assert_that(env.execute_cql(fragments_query).get())
+            .is_rows()
+            .with_size(batch_count)
+            .assert_for_columns_of_each_row([&] (columns_assertions& columns) {
+                columns.with_typed_column<sstring>("mutation_source", "memtable:0");
+            });
+
+        std::optional<scoped_error_injection> error_injection;
+        if (replay_fails) {
+            error_injection.emplace("storage_proxy_fail_replay_batch");
+        }
+
+        bm.do_batch_log_replay(cleanup).get();
+
+        assert_that(env.execute_cql(format("SELECT id FROM {}.{}", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2)).get())
+            .is_rows()
+            .with_size(replay_fails ? failed_batches : 0);
+
+        const auto fragment_results = cql3::untyped_result_set(env.execute_cql(fragments_query).get());
+
+        const auto batchlog_v2_schema = env.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);
+
+        const auto [initial_fragments_per_shard, failed_replay_fragments_per_shard] = extract_batchlog_fragments(
+                cql3::untyped_result_set(env.execute_cql(fragments_query).get()), batchlog_v2_schema);
+
+        if (cleanup) {
+            size_t initial_rows{};
+            for (const auto& [batchlog_shard, batchlog_shard_fragments] : initial_fragments_per_shard) {
+                // some batchlog shards can be empty, just ignore
+                if (batchlog_shard_fragments.empty()) {
+                    continue;
+                }
+
+                testlog.info("Checking fragment in initial stage and batchlog shard {}", batchlog_shard);
+
+                size_t rts{}, rows{};
+                for (const auto& fragment : batchlog_shard_fragments) {
+                    rts += fragment.is_range_tombstone;
+                    rows += !fragment.is_range_tombstone;
+                    BOOST_REQUIRE(!fragment.is_live);
+                }
+
+                // cleanup affects only batchlog shards which contributed to replay
+                if (rts) {
+                    BOOST_REQUIRE_EQUAL(rts, 2);
+                    check_range_tombstone_start(*batchlog_shard_fragments.begin());
+                    check_range_tombstone_end(*batchlog_shard_fragments.rbegin(), bound_weight::after_all_prefixed);
+                }
+                initial_rows += rows;
+            }
+            // some of the initial fragments could have been garbage collected after cleanup (shadowed by range tombstone)
+            // this happens in the background so we can have up to total_batches rows
+            BOOST_REQUIRE_LE(initial_rows, batch_count);
+
+            if (replay_fails) {
+                size_t failed_replay_rows{};
+                for (const auto& [batchlog_shard, batchlog_shard_fragments] : failed_replay_fragments_per_shard) {
+                    for (const auto& fragment : batchlog_shard_fragments) {
+                        BOOST_REQUIRE(!fragment.is_range_tombstone);
+                        BOOST_REQUIRE(fragment.is_live);
+                        ++failed_replay_rows;
+                    }
+                }
+                BOOST_REQUIRE_EQUAL(failed_replay_rows, failed_batches);
+            } else {
+                BOOST_REQUIRE(failed_replay_fragments_per_shard.empty());
+            }
+        } else {
+            size_t live{}, dead{};
+            BOOST_REQUIRE(failed_replay_fragments_per_shard.empty());
+            for (const auto& [batchlog_shard, batchlog_shard_fragments] : initial_fragments_per_shard) {
+                for (const auto& fragment : batchlog_shard_fragments) {
+                    BOOST_REQUIRE(!fragment.is_range_tombstone);
+                    if (fragment.is_live) {
+                        ++live;
+                    } else {
+                        ++dead;
+                    }
+                }
+            }
+            const auto total = live + dead;
+            BOOST_REQUIRE_EQUAL(total, batch_count);
+            if (replay_fails) {
+                BOOST_REQUIRE_EQUAL(live, failed_batches);
+            } else {
+                BOOST_REQUIRE_EQUAL(dead, total);
+            }
+        }
+    }, cfg);
+}
+
+SEASTAR_TEST_CASE(test_batchlog_replay_fails_no_cleanup) {
+    return run_batchlog_cleanup_with_failed_batches_test(true, db::batchlog_manager::post_replay_cleanup::no);
+}
+
+SEASTAR_TEST_CASE(test_batchlog_replay_fails_with_cleanup) {
+    return run_batchlog_cleanup_with_failed_batches_test(true, db::batchlog_manager::post_replay_cleanup::yes);
+}
+
+SEASTAR_TEST_CASE(test_batchlog_replay_no_cleanup) {
+    return run_batchlog_cleanup_with_failed_batches_test(false, db::batchlog_manager::post_replay_cleanup::no);
+}
+
+SEASTAR_TEST_CASE(test_batchlog_replay_with_cleanup) {
+    return run_batchlog_cleanup_with_failed_batches_test(false, db::batchlog_manager::post_replay_cleanup::yes);
+}
+
+SEASTAR_TEST_CASE(test_batchlog_replay_stage) {
+#ifndef SCYLLA_ENABLE_ERROR_INJECTION
+    return make_ready_future<>();
+#endif
+
+    cql_test_config cfg;
+    cfg.db_config->batchlog_replay_cleanup_after_replays.set_value("9999999", utils::config_file::config_source::Internal);
+    cfg.batchlog_replay_timeout = 0s;
+    cfg.batchlog_delay = 9999h;
+
+    return do_with_cql_env_thread([] (cql_test_env& env) -> void {
+        auto& bm = env.batchlog_manager().local();
+
+        env.execute_cql("CREATE TABLE tbl (pk bigint PRIMARY KEY, v text)").get();
+
+        const uint64_t batch_count = 8;
+
+        const auto shard_count = 256;
+
+        {
+            scoped_error_injection error_injection("storage_proxy_fail_send_batch");
+
+            for (uint64_t i = 0; i != batch_count; ++i) {
+                std::vector<sstring> queries;
+                std::vector<std::string_view> query_views;
+                for (uint64_t j = 0; j != i+2; ++j) {
+                    queries.emplace_back(format("INSERT INTO tbl (pk, v) VALUES ({}, 'value');", j));
+                    query_views.emplace_back(queries.back());
+                }
+                try {
+                    env.execute_batch(
+                            query_views,
+                            cql3::statements::batch_statement::type::LOGGED,
+                            std::make_unique<cql3::query_options>(db::consistency_level::ONE, std::vector<cql3::raw_value>())).get();
+                } catch (std::runtime_error& ex) {
+                    BOOST_REQUIRE_EQUAL(std::string(ex.what()), "Error injection: failing to send batch");
+                }
+            }
+        }
+
+        // Ensure select ... where write_time < write_time_limit (=now) picks up all batches.
+        sleep(2ms).get();
+
+        const auto batchlog_query = format("SELECT * FROM {}.{}", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);
+        const auto fragments_query = format("SELECT * FROM MUTATION_FRAGMENTS({}.{}) WHERE partition_region = 2 ALLOW FILTERING", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);
+
+        std::set<utils::UUID> ids;
+        std::set<db_clock::time_point> written_ats;
+        assert_that(env.execute_cql(batchlog_query).get())
+            .is_rows()
+            .with_size(batch_count)
+            .assert_for_columns_of_each_row([&] (columns_assertions& columns) {
+                columns.with_typed_column<int32_t>("version", netw::messaging_service::current_version)
+                    .with_typed_column<int8_t>("stage", int8_t(db::batchlog_stage::initial))
+                    .with_typed_column<db_clock::time_point>("written_at", [&] (db_clock::time_point written_at) {
+                        written_ats.insert(written_at);
+                        return true;
+                    })
+                    .with_typed_column<utils::UUID>("id", [&] (utils::UUID id) {
+                        ids.insert(id);
+                        return true;
+                    });
+            });
+
+        BOOST_REQUIRE_EQUAL(ids.size(), batch_count);
+        BOOST_REQUIRE_LE(written_ats.size(), batch_count);
+
+        auto do_replays = [&] (db::batchlog_manager::post_replay_cleanup cleanup) {
+            for (unsigned i = 0; i < 3; ++i) {
+                testlog.info("Replay attempt [cleanup={}] #{} - batches should be in failed_replay stage", cleanup, i);
+
+                bm.do_batch_log_replay(cleanup).get();
+
+                assert_that(env.execute_cql(batchlog_query).get())
+                    .is_rows()
+                    .with_size(batch_count)
+                    .assert_for_columns_of_each_row([&] (columns_assertions& columns) {
+                        columns.with_typed_column<int32_t>("version", netw::messaging_service::current_version)
+                            .with_typed_column<int8_t>("stage", [&] (int8_t stage) {
+                                // (0) cleanup::no  == db::batchlog_stage::initial
+                                // (1) cleanup::yes == db::batchlog_stage::failed_replay
+                                return stage == int8_t(bool(cleanup));
+                            })
+                            .with_typed_column<db_clock::time_point>("written_at", [&] (db_clock::time_point written_at) {
+                                return written_ats.contains(written_at);
+                            })
+                            .with_typed_column<utils::UUID>("id", [&] (utils::UUID id) {
+                                return ids.contains(id);
+                            });
+                    });
+
+                auto fragment_results = cql3::untyped_result_set(env.execute_cql(fragments_query).get());
+                if (cleanup) {
+                    // Each shard has a range tombstone (shard_count * 2 range tombstone changes)
+                    // There should be batch_count clustering rows, with stage=1
+                    // There may be [0, batch_count] clustering rows with stage=0
+                    BOOST_REQUIRE_GT(fragment_results.size(), batch_count);
+                    BOOST_REQUIRE_LE(fragment_results.size(), shard_count * 2 + batch_count * 2);
+                } else {
+                    BOOST_REQUIRE_EQUAL(fragment_results.size(), batch_count); // only clustering rows
+                }
+                for (const auto& row : fragment_results) {
+                    if (row.get_as<sstring>("mutation_fragment_kind") == "range tombstone change") {
+                        continue;
+                    }
+
+                    const auto id = row.get_as<utils::UUID>("id");
+                    const auto stage = row.get_as<int8_t>("stage");
+
+                    testlog.trace("Processing row for batch id={}, stage={}: ", id, stage);
+
+                    BOOST_REQUIRE_EQUAL(row.get_as<int32_t>("version"), netw::messaging_service::current_version);
+                    BOOST_REQUIRE(ids.contains(id));
+
+                    const auto metadata = row.get_as<sstring>("metadata");
+                    auto metadata_json = rjson::parse(metadata);
+                    BOOST_REQUIRE(metadata_json.IsObject());
+
+                    if (!cleanup || stage == int8_t(db::batchlog_stage::failed_replay)) {
+                        BOOST_REQUIRE_NE(row.get_as<sstring>("value"), "{}");
+                        BOOST_REQUIRE(!metadata_json.HasMember("tombstone"));
+                        const auto value_json = rjson::parse(row.get_as<sstring>("value"));
+                        BOOST_REQUIRE(value_json.IsObject());
+                        BOOST_REQUIRE(value_json.HasMember("data"));
+                    } else if (stage == int8_t(db::batchlog_stage::initial)) {
+                        BOOST_REQUIRE_EQUAL(row.get_as<sstring>("value"), "{}"); // row should be dead -- data column shadowed by tombstone
+                        if (!cleanup) {
+                            BOOST_REQUIRE(metadata_json.HasMember("tombstone"));
+                        }
+                    } else {
+                        BOOST_FAIL(format("Unexpected stage: {}", stage));
+                    }
+                }
+            }
+        };
+
+        {
+            scoped_error_injection error_injection("storage_proxy_fail_replay_batch");
+            do_replays(db::batchlog_manager::post_replay_cleanup::no);
+            do_replays(db::batchlog_manager::post_replay_cleanup::yes);
+        }
+
+        testlog.info("Successful replay - should remove all batches");
+        bm.do_batch_log_replay(db::batchlog_manager::post_replay_cleanup::no).get();
+
+        assert_that(env.execute_cql(batchlog_query).get())
+            .is_rows()
+            .is_empty();
+
+        const auto fragment_results = cql3::untyped_result_set(env.execute_cql(fragments_query).get());
+        // Each shard can have a range tombstone (shard_count * 2 range tombstone changes)
+        // There should be batch_count clustering rows, with stage=1
+        // There may be [0, batch_count] clustering rows with stage=0
+        BOOST_REQUIRE_GT(fragment_results.size(), batch_count);
+        BOOST_REQUIRE_LE(fragment_results.size(), shard_count * 2 + batch_count * 2);
+        for (const auto& row : fragment_results) {
+            if (row.get_as<sstring>("mutation_fragment_kind") == "range tombstone change") {
+                BOOST_REQUIRE_EQUAL(row.get_as<int8_t>("stage"), int8_t(db::batchlog_stage::initial));
+                continue;
+            }
+
+            BOOST_REQUIRE_EQUAL(row.get_as<int32_t>("version"), netw::messaging_service::current_version);
+            BOOST_REQUIRE(written_ats.contains(row.get_as<db_clock::time_point>("written_at")));
+            BOOST_REQUIRE(ids.contains(row.get_as<utils::UUID>("id")));
+            BOOST_REQUIRE_EQUAL(row.get_as<sstring>("value"), "{}");
+        }
+    }, cfg);
+}
+
+SEASTAR_TEST_CASE(test_batchlog_migrate_v1_v2) {
+#ifndef SCYLLA_ENABLE_ERROR_INJECTION
+    return make_ready_future<>();
+#endif
+
+    const auto batch_replay_timeout = 1s;
+
+    cql_test_config cfg;
+    cfg.db_config->batchlog_replay_cleanup_after_replays.set_value("9999999", utils::config_file::config_source::Internal);
+    cfg.batchlog_replay_timeout = batch_replay_timeout;
+    cfg.batchlog_delay = 9999h;
+
+    return do_with_cql_env_thread([batch_replay_timeout] (cql_test_env& env) -> void {
+        auto& bm = env.batchlog_manager().local();
+
+        env.execute_cql("CREATE TABLE tbl (pk bigint PRIMARY KEY, v text)").get();
+
+        auto& sp = env.get_storage_proxy().local();
+        auto& db = env.local_db();
+
+        auto& tbl = db.find_column_family("ks", "tbl");
+        auto tbl_schema = tbl.schema();
+        auto cdef_tbl_v = tbl_schema->get_column_definition(to_bytes("v"));
+
+        auto batchlog_v1_schema = db.find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG);
+        auto cdef_batchlog_v1_data = batchlog_v1_schema->get_column_definition(to_bytes("data"));
+
+        const uint64_t batch_count = 8;
+
+        struct batchlog {
+            utils::UUID id;
+            int32_t version;
+            db_clock::time_point written_at;
+            managed_bytes data;
+        };
+        std::map<utils::UUID, const batchlog> batchlogs;
+
+        for (int64_t i = 0; i != batch_count; ++i) {
+            bytes_ostream batchlog_data;
+            for (int64_t j = 0; j != i+2; ++j) {
+                auto key = partition_key::from_single_value(*tbl_schema, serialized(j));
+                mutation m(tbl_schema, key);
+                m.set_clustered_cell(clustering_key::make_empty(), *cdef_tbl_v, make_atomic_cell(utf8_type, serialized("value")));
+                ser::serialize(batchlog_data, canonical_mutation(m));
+            }
+
+            const auto id = utils::UUID_gen::get_time_UUID();
+            auto [it, _] = batchlogs.emplace(id, batchlog{
+                    .id = id,
+                    .version = netw::messaging_service::current_version,
+                    .written_at = db_clock::now() - batch_replay_timeout * 10,
+                    .data = std::move(batchlog_data).to_managed_bytes()});
+
+            auto& batch = it->second;
+
+            const auto timestamp = api::new_timestamp();
+            mutation m(batchlog_v1_schema, partition_key::from_single_value(*batchlog_v1_schema, serialized(batch.id)));
+            m.set_cell(clustering_key_prefix::make_empty(), to_bytes("version"), batch.version, timestamp);
+            m.set_cell(clustering_key_prefix::make_empty(), to_bytes("written_at"), batch.written_at, timestamp);
+            m.set_cell(clustering_key_prefix::make_empty(), *cdef_batchlog_v1_data, atomic_cell::make_live(*cdef_batchlog_v1_data->type, timestamp, std::move(batch.data)));
+
+            sp.mutate_locally(m, tracing::trace_state_ptr(), db::commitlog::force_sync::no).get();
+        }
+
+        const auto batchlog_v1_query = format("SELECT * FROM {}.{}", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG);
+        const auto batchlog_v2_query = format("SELECT * FROM {}.{}", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);
+
+        // Initial state, all entries are in the v1 table.
+        assert_that(env.execute_cql(batchlog_v1_query).get())
+            .is_rows()
+            .with_size(batch_count);
+
+        assert_that(env.execute_cql(batchlog_v2_query).get())
+            .is_rows()
+            .is_empty();
+
+        {
+            scoped_error_injection error_injection("batchlog_manager_fail_migration");
+
+            testlog.info("First replay - migration should fail, all entries stay in v1 table");
+            bm.do_batch_log_replay(db::batchlog_manager::post_replay_cleanup::no).get();
+        }
+
+        assert_that(env.execute_cql(batchlog_v1_query).get())
+            .is_rows()
+            .with_size(batch_count);
+
+        assert_that(env.execute_cql(batchlog_v2_query).get())
+            .is_rows()
+            .is_empty();
+
+        {
+            scoped_error_injection error_injection("storage_proxy_fail_replay_batch");
+
+            testlog.info("Second replay - migration should run again and succeed, but replay of migrated entries should fail, so they should remain in the v2 table.");
+            bm.do_batch_log_replay(db::batchlog_manager::post_replay_cleanup::no).get();
+        }
+
+        assert_that(env.execute_cql(batchlog_v1_query).get())
+            .is_rows()
+            .is_empty();
+
+        auto results = cql3::untyped_result_set(env.execute_cql(batchlog_v2_query).get());
+        BOOST_REQUIRE_EQUAL(results.size(), batch_count);
+
+        std::set<utils::UUID> migrated_batchlog_ids;
+        for (const auto& row : results) {
+            BOOST_REQUIRE_EQUAL(row.get_as<int8_t>("stage"), int8_t(db::batchlog_stage::failed_replay));
+
+            const auto id = row.get_as<utils::UUID>("id");
+            auto it = batchlogs.find(id);
+            BOOST_REQUIRE(it != batchlogs.end());
+            const auto& batch = it->second;
+
+            BOOST_REQUIRE_EQUAL(row.get_as<int32_t>("version"), batch.version);
+            BOOST_REQUIRE(row.get_as<db_clock::time_point>("written_at") == batch.written_at);
+            BOOST_REQUIRE_EQUAL(row.get_blob_fragmented("data"), batch.data);
+
+            migrated_batchlog_ids.emplace(id);
+        }
+
+        BOOST_REQUIRE_EQUAL(batchlogs.size(), migrated_batchlog_ids.size());
+
+        testlog.info("Third replay - migration is already done, replay of migrated entries should succeed, v2 table should be empty afterwards.");
+        bm.do_batch_log_replay(db::batchlog_manager::post_replay_cleanup::no).get();
+
+        assert_that(env.execute_cql(batchlog_v2_query).get())
+            .is_rows()
+            .is_empty();
+    }, cfg);
+}
+
+SEASTAR_TEST_CASE(test_batchlog_replay_write_time) {
+#ifndef SCYLLA_ENABLE_ERROR_INJECTION
+    return make_ready_future<>();
+#endif
+
+    cql_test_config cfg;
+    cfg.db_config->batchlog_replay_cleanup_after_replays.set_value("9999999", utils::config_file::config_source::Internal);
+    cfg.batchlog_replay_timeout = 1h;
+    cfg.batchlog_delay = 9999h;
+
+    return do_with_cql_env_thread([ks = get_name()] (cql_test_env& env) -> void {
+        auto& bm = env.batchlog_manager().local();
+
+        env.execute_cql(format("CREATE KEYSPACE {} WITH replication = {{'class': 'SimpleStrategy', 'replication_factor': 2}} AND tablets = {{'enabled': 'false'}}", ks)).get();
+
+        env.execute_cql(format("CREATE TABLE {}.tbl1 (pk bigint PRIMARY KEY, v text) WITH tombstone_gc = {{'mode': 'repair', 'propagation_delay_in_seconds': 1}}", ks)).get();
+        env.execute_cql(format("CREATE TABLE {}.tbl2 (pk bigint PRIMARY KEY, v text) WITH tombstone_gc = {{'mode': 'timeout'}}", ks)).get();
+
+        const uint64_t batch_count = 8;
+        const uint64_t mutations_per_batch = 2;
+
+        {
+            scoped_error_injection error_injection("storage_proxy_fail_send_batch");
+
+            for (uint64_t i = 0; i != batch_count; ++i) {
+                std::vector<sstring> queries;
+                std::vector<std::string_view> query_views;
+
+                if (i % 2) {
+                    for (const auto& tbl_name : {"tbl1", "tbl2"}) {
+                        queries.emplace_back(format("INSERT INTO {}.{} (pk, v) VALUES (0, 'value');", ks, tbl_name, i));
+                        query_views.emplace_back(queries.back());
+                    }
+                } else {
+                    for (uint64_t j = 0; j != mutations_per_batch; ++j) {
+                        queries.emplace_back(format("INSERT INTO {}.tbl2 (pk, v) VALUES ({}, 'value');", ks, i * 2 + j));
+                        query_views.emplace_back(queries.back());
+                    }
+                }
+
+                try {
+                    env.execute_batch(
+                            query_views,
+                            cql3::statements::batch_statement::type::LOGGED,
+                            std::make_unique<cql3::query_options>(db::consistency_level::ONE, std::vector<cql3::raw_value>())).get();
+                } catch (std::runtime_error& ex) {
+                    BOOST_REQUIRE_EQUAL(std::string(ex.what()), "Error injection: failing to send batch");
+                }
+            }
+        }
+
+        const auto batchlog_query = format("SELECT * FROM {}.{}", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);
+
+        assert_that(env.execute_cql(batchlog_query).get())
+            .is_rows()
+            .with_size(batch_count);
+
+        auto get_write_attempts = [&] () -> uint64_t {
+            return env.batchlog_manager().map_reduce0([] (const db::batchlog_manager& bm) {
+                return bm.stats().write_attempts;
+            }, uint64_t(0), std::plus<uint64_t>{}).get();
+        };
+
+        BOOST_REQUIRE_EQUAL(get_write_attempts(), 0);
+
+        // We need this sleep here to make sure all batches are older than propagation delay of 1s.
+        sleep(2s).get();
+
+        {
+            scoped_error_injection error_injection("storage_proxy_fail_replay_batch");
+            bm.do_batch_log_replay(db::batchlog_manager::post_replay_cleanup::no).get();
+
+            // Half the batches are skipped due to being too fresh
+            BOOST_REQUIRE_EQUAL(get_write_attempts(), (batch_count / 2) * mutations_per_batch);
+
+            auto results = cql3::untyped_result_set(env.execute_cql(batchlog_query).get());
+            BOOST_REQUIRE_EQUAL(results.size(), batch_count);
+
+            for (const auto& row : results) {
+                BOOST_REQUIRE_EQUAL(row.get_as<int8_t>("stage"), int8_t(db::batchlog_stage::initial));
+            }
+        }
+
+        {
+            scoped_error_injection error_injection("storage_proxy_fail_replay_batch");
+            bm.do_batch_log_replay(db::batchlog_manager::post_replay_cleanup::yes).get();
+
+            // Half the batches are skipped (again) due to being too fresh
+            BOOST_REQUIRE_EQUAL(get_write_attempts(), batch_count * mutations_per_batch);
+
+            auto results = cql3::untyped_result_set(env.execute_cql(batchlog_query).get());
+            BOOST_REQUIRE_EQUAL(results.size(), batch_count);
+
+            size_t initial = 0;
+            size_t failed_replay = 0;
+            for (const auto& row : results) {
+                const auto stage = static_cast<db::batchlog_stage>(row.get_as<int8_t>("stage"));
+                switch (stage) {
+                    case db::batchlog_stage::initial:
+                        ++initial;
+                        break;
+                    case db::batchlog_stage::failed_replay:
+                        ++failed_replay;
+                        break;
+                    default:
+                        BOOST_FAIL(format("Unexpected stage: {}", int8_t(stage)));
+                }
+            }
+            BOOST_REQUIRE_EQUAL(initial, batch_count / 2);
+            BOOST_REQUIRE_EQUAL(failed_replay, batch_count / 2);
+        }
+
+        const auto fragments_query = format("SELECT * FROM MUTATION_FRAGMENTS({}.{}) WHERE partition_region = 2 ALLOW FILTERING", db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);
+        const auto batchlog_v2_schema = env.local_db().find_schema(db::system_keyspace::NAME, db::system_keyspace::BATCHLOG_V2);
+        const auto [initial_fragments_per_shard, failed_replay_fragments_per_shard] = extract_batchlog_fragments(
+                cql3::untyped_result_set(env.execute_cql(fragments_query).get()), batchlog_v2_schema);
+
+        for (const auto& [batchlog_shard, batchlog_shard_fragments] : initial_fragments_per_shard) {
+            if (batchlog_shard_fragments.empty()) {
+                continue;
+            }
+
+            testlog.info("Checking fragment in initial stage and batchlog shard {}", batchlog_shard);
+
+            position_in_partition::less_compare less_cmp{*batchlog_v2_schema};
+
+            size_t rts = 0;
+            auto min_live_pos = position_in_partition::after_all_clustered_rows();
+            for (const auto& f : batchlog_shard_fragments) {
+                if (f.is_range_tombstone) {
+                    ++rts;
+                    continue;
+                }
+                if (!f.is_live) {
+                    continue;
+                }
+                min_live_pos = std::min(min_live_pos, f.pos, less_cmp);
+            }
+
+            BOOST_REQUIRE(rts == 0 || rts == 2);
+            if (!rts) {
+                continue;
+            }
+
+            check_range_tombstone_start(*batchlog_shard_fragments.begin());
+
+            bool found_range_tombstone_end = false;
+            for (auto it = std::next(batchlog_shard_fragments.begin()); it != batchlog_shard_fragments.end(); ++it) {
+                if (it->is_range_tombstone) {
+                    BOOST_REQUIRE(!std::exchange(found_range_tombstone_end, true));
+                    check_range_tombstone_end(*it);
+                    BOOST_REQUIRE(less_cmp(it->pos, min_live_pos));
+                }
+            }
+        }
+    }, cfg);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
--- a/test/boost/query_processor_test.cc
+++ b/test/boost/query_processor_test.cc
@@ -220,7 +220,7 @@ SEASTAR_TEST_CASE(test_query_counters) {

        // Executes a batch of (modifying) statements and waits for it to complete.
        auto process_batch = [&e](const std::vector<std::string_view>& queries, clevel cl) mutable {
-            e.execute_batch(queries, make_options(cl)).get();
+            e.execute_batch(queries, cql3::statements::batch_statement::type::UNLOGGED, make_options(cl)).get();
        };

        auto expected = get_query_metrics();
--- a/test/boost/sstable_datafile_test.cc
+++ b/test/boost/sstable_datafile_test.cc
@@ -3388,8 +3388,11 @@ SEASTAR_TEST_CASE(test_non_full_and_empty_row_keys) {
                .with_typed_column<bytes>("partition_key", [&] (const bytes& v) {
                    return partition_key::from_bytes(v).equal(*schema, dk.key());
                })
-                .with_typed_column<bytes>("clustering_key", [&] (const bytes& v) {
-                    return clustering_key::from_bytes(v).equal(*schema, ck);
+                .with_typed_column<bytes>("clustering_key", [&] (const bytes* v) {
+                    if (!v) {
+                        return ck.is_empty();
+                    }
+                    return clustering_key::from_bytes(*v).equal(*schema, ck);
                })
                .with_typed_column<sstring>("mutation_fragment_kind", "clustering row")
                .with_typed_column<bytes>("frozen_mutation_fragment", [&] (const bytes& v) {
--- a/test/boost/test_config.yaml
+++ b/test/boost/test_config.yaml
@@ -49,5 +49,7 @@ custom_args:
        - '-c1'
    s3_test:
        - '-c2 -m2G --logger-log-level s3=trace --logger-log-level http=trace --logger-log-level default_http_retry_strategy=trace'
+    batchlog_manager_test:
+        - '-c2 -m2G --logger-log-level batchlog_manager=trace:debug_error_injection=trace:testlog=trace'
 run_in_debug:
    - logalloc_standard_allocator_segment_pool_backend_test
--- a/test/cluster/dtest/commitlog_test.py
+++ b/test/cluster/dtest/commitlog_test.py
@@ -26,7 +26,6 @@ from tools.assertions import (
    assert_row_count_in_select_less,
 )
 from tools.data import insert_c1c2, rows_to_list
-from tools.files import corrupt_file
 from tools.metrics import get_node_metrics


@@ -678,6 +677,12 @@ class TestCommitLog(Tester):
        node1.stop(gently=False)

        # corrupt the commitlogs
+        def corrupt_file(file_path: str):
+            print(f"Writing junk into file header:  {file_path}")
+            with open(file_path, "r+b") as f:
+                f.seek(18)
+                f.write(os.urandom(474))
+
        for commitlog in self._get_commitlog_files():
            corrupt_file(commitlog)

@@ -687,7 +692,7 @@ class TestCommitLog(Tester):
        # check the data and the logs
        in_table = rows_to_list(session.execute(f"SELECT * FROM {self.ks}.{self.cf};"))
        assert not node1.grep_log(f"large_data - Writing large row {self.ks}/{self.cf}")
-        assert node1.grep_log("commitlog_replayer - Corrupted file:")
+        assert node1.grep_log("commitlog_replayer - Corrupted file")
        assert in_table == []

    def test_one_big_mutation_rollback_on_startup(self):
--- a/test/cluster/dtest/tools/files.py
+++ b/test/cluster/dtest/tools/files.py
@@ -1,25 +0,0 @@
-#
-# Copyright (C) 2025-present ScyllaDB
-#
-# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
-#
-
-import os
-import random
-
-
-def corrupt_file(file_path: str, chunk_size: int = 1024, percentage: float = 0.2) -> None:
-    """
-    Corrupts a file by overwriting random chunks of its content with random data.
-    Args:
-        file_path (str): The path to the file to be corrupted.
-        chunk_size (int, optional): The size of each random data chunk to write, in bytes.
-        percentage (float, optional): The percentage of the file to corrupt, represented as a float between 0 and 1.
-    """
-    with open(file_path, "r+b") as f:
-        file_size = os.path.getsize(file_path)
-        chunks = int(file_size * percentage / chunk_size) or 1
-        for _ in range(chunks):
-            random_position = random.randint(0, max(file_size - chunk_size, 0))
-            f.seek(random_position)
-            f.write(os.urandom(chunk_size))
--- a/test/cluster/test_alternator.py
+++ b/test/cluster/test_alternator.py
@@ -570,6 +570,90 @@ async def test_alternator_enforce_authorization_true(manager: ManagerClient):
    # We could further test how GRANT works, but this would be unnecessary
    # repeating of the tests in test/alternator/test_cql_rbac.py.

+@pytest.mark.asyncio
+async def test_index_requires_rf_rack_valid(manager: ManagerClient):
+    """
+    Verify that creating a table with GSI or LSI fails with an appropriate error if
+    it's tablets based and rf_rack_valid_keyspaces is disabled.
+    Adding a GSI to an existing table with tablets should fail as well.
+    """
+    servers = await manager.servers_add(1, config=alternator_config | {"rf_rack_valid_keyspaces": False})
+    alternator = get_alternator(servers[0].ip_addr)
+
+    expected_err_create = "GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled."
+    expected_err_update_add_gsi = "GlobalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled."
+
+    def create_table_with_index(alternator, table_name, index_type, initial_tablets):
+        create_table_args = dict(
+            TableName=table_name,
+            BillingMode='PAY_PER_REQUEST',
+            Tags=[{'Key': 'system:initial_tablets', 'Value': initial_tablets}],
+            KeySchema=[
+                {'AttributeName': 'p', 'KeyType': 'HASH'},
+                {'AttributeName': 'c', 'KeyType': 'RANGE'}
+            ],
+            AttributeDefinitions=[
+                {'AttributeName': 'p', 'AttributeType': 'S'},
+                {'AttributeName': 'c', 'AttributeType': 'S'}
+            ],
+        )
+        if index_type == "GSI":
+            create_table_args["GlobalSecondaryIndexes"] = [
+                {
+                    'IndexName': 'gsi1',
+                    'KeySchema': [
+                        {'AttributeName': 'c', 'KeyType': 'HASH'},
+                        {'AttributeName': 'p', 'KeyType': 'RANGE'},
+                    ],
+                    'Projection': {'ProjectionType': 'ALL'}
+                }
+            ]
+        elif index_type == "LSI":
+            create_table_args["LocalSecondaryIndexes"] = [
+                {
+                    'IndexName': 'lsi1',
+                    'KeySchema': [
+                        {'AttributeName': 'p', 'KeyType': 'HASH'},
+                        {'AttributeName': 'c', 'KeyType': 'RANGE'},
+                    ],
+                    'Projection': {'ProjectionType': 'ALL'}
+                }
+            ]
+        alternator.create_table(**create_table_args)
+
+    # Create a table with tablets and GSI or LSI - should fail because rf_rack_valid_keyspaces is disabled
+    for index_type in ["GSI", "LSI"]:
+        with pytest.raises(ClientError, match=expected_err_create):
+            create_table_with_index(alternator, unique_table_name(), index_type, initial_tablets='1')
+
+    # Now create the table without tablets - should succeed
+    for index_type in ["GSI", "LSI"]:
+        create_table_with_index(alternator, unique_table_name(), index_type, initial_tablets='none')
+
+    # Create a table with tablets and no indexes, then add a GSI - the update should fail
+    table_name = unique_table_name()
+    create_table_with_index(alternator, table_name, index_type=None, initial_tablets='1')
+    with pytest.raises(ClientError, match=expected_err_update_add_gsi):
+        alternator.meta.client.update_table(
+            TableName=table_name,
+            AttributeDefinitions=[
+                {'AttributeName': 'c', 'AttributeType': 'S'},
+                {'AttributeName': 'p', 'AttributeType': 'S'},
+            ],
+            GlobalSecondaryIndexUpdates=[
+                {
+                    'Create': {
+                        'IndexName': 'gsi1',
+                        'KeySchema': [
+                            {'AttributeName': 'c', 'KeyType': 'HASH'},
+                            {'AttributeName': 'p', 'KeyType': 'RANGE'},
+                        ],
+                        'Projection': {'ProjectionType': 'ALL'}
+                    }
+                }
+            ]
+        )
+
 # Unfortunately by default a Python thread print the exception that kills
 # it (e.g., pytest assert failures) but it doesn't propagate the exception
 # to the join() - so the overall test doesn't fail. The following ThreadWrapper
--- a/test/cluster/test_batchlog_manager.py
+++ b/test/cluster/test_batchlog_manager.py
@@ -56,7 +56,7 @@ async def test_batchlog_replay_while_a_node_is_down(manager: ManagerClient) -> N

        await manager.server_stop(servers[1].server_id)

-        batchlog_row_count = (await cql.run_async("SELECT COUNT(*) FROM system.batchlog", host=hosts[0]))[0].count
+        batchlog_row_count = (await cql.run_async("SELECT COUNT(*) FROM system.batchlog_v2", host=hosts[0]))[0].count
        assert batchlog_row_count > 0

        await asyncio.gather(*[manager.api.disable_injection(s.ip_addr, "skip_batch_replay") for s in servers if s != servers[1]])
@@ -72,7 +72,7 @@ async def test_batchlog_replay_while_a_node_is_down(manager: ManagerClient) -> N
        await s0_log.wait_for('Finished replayAllFailedBatches', timeout=60, from_mark=s0_mark)

        async def batchlog_empty() -> bool:
-            batchlog_row_count = (await cql.run_async("SELECT COUNT(*) FROM system.batchlog", host=hosts[0]))[0].count
+            batchlog_row_count = (await cql.run_async("SELECT COUNT(*) FROM system.batchlog_v2", host=hosts[0]))[0].count
            if batchlog_row_count == 0:
                return True
        await wait_for(batchlog_empty, time.time() + 60)
@@ -120,7 +120,7 @@ async def test_batchlog_replay_aborted_on_shutdown(manager: ManagerClient) -> No

        await asyncio.gather(*[manager.api.disable_injection(s.ip_addr, "skip_batch_replay") for s in servers if s != servers[1]])

-        batchlog_row_count = (await cql.run_async("SELECT COUNT(*) FROM system.batchlog", host=hosts[0]))[0].count
+        batchlog_row_count = (await cql.run_async("SELECT COUNT(*) FROM system.batchlog_v2", host=hosts[0]))[0].count
        assert batchlog_row_count > 0

        # The batch is replayed while server 1 is down
@@ -136,7 +136,7 @@ async def test_batchlog_replay_aborted_on_shutdown(manager: ManagerClient) -> No
        hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)

        async def batchlog_empty() -> bool:
-            batchlog_row_count = (await cql.run_async("SELECT COUNT(*) FROM system.batchlog", host=hosts[0]))[0].count
+            batchlog_row_count = (await cql.run_async("SELECT COUNT(*) FROM system.batchlog_v2", host=hosts[0]))[0].count
            if batchlog_row_count == 0:
                return True
        await wait_for(batchlog_empty, time.time() + 60)
@@ -237,7 +237,7 @@ async def test_drop_mutations_for_dropped_table(manager: ManagerClient) -> None:
    host1, _ = hosts

    async def get_batchlog_row_count():
-        rows = await cql.run_async("SELECT COUNT(*) FROM system.batchlog", host=host1)
+        rows = await cql.run_async("SELECT COUNT(*) FROM system.batchlog_v2", host=host1)
        row = rows[0]
        assert hasattr(row, "count")

@@ -313,12 +313,14 @@ async def test_batchlog_replay_failure_during_repair(manager: ManagerClient, rep
    9. Verify that the row is deleted and there is no data resurrection.
    """

+    write_timeout_ms = 2000
+
    cmdline=['--enable-cache', '0',
             "--hinted-handoff-enabled", "0",
             "--logger-log-level", "batchlog_manager=trace:repair=debug",
             "--repair-hints-batchlog-flush-cache-time-in-ms", "1000000" if repair_cache else "0"]
    config = {"error_injections_at_startup": ["short_batchlog_manager_replay_interval"],
-              "write_request_timeout_in_ms": 2000,
+              "write_request_timeout_in_ms": write_timeout_ms,
              'group0_tombstone_gc_refresh_interval_in_ms': 1000,
              'tablets_mode_for_new_keyspaces': 'disabled'
    }
@@ -330,7 +332,7 @@ async def test_batchlog_replay_failure_during_repair(manager: ManagerClient, rep
    host1, _ = hosts

    async def get_batchlog_row_count():
-        rows = await cql.run_async("SELECT COUNT(*) FROM system.batchlog", host=host1)
+        rows = await cql.run_async("SELECT COUNT(*) FROM system.batchlog_v2", host=host1)
        row = rows[0]
        assert hasattr(row, "count")

@@ -376,7 +378,7 @@ async def test_batchlog_replay_failure_during_repair(manager: ManagerClient, rep

        await cql.run_async(f"DELETE FROM {ks}.my_table WHERE pk=0 AND ck=0")

-        time.sleep(2)
+        await asyncio.sleep((write_timeout_ms // 1000) * 2 + 1)

        batchlog_row_count = await get_batchlog_row_count()
        assert batchlog_row_count > 0
@@ -386,7 +388,7 @@ async def test_batchlog_replay_failure_during_repair(manager: ManagerClient, rep

        # Once the mutations are in the batchlog, waiting to be replayed, we can disable this.
        await disable_injection("storage_proxy_fail_remove_from_batchlog")
-        await enable_injection("batch_replay_throw")
+        await enable_injection("storage_proxy_fail_replay_batch")
        # Once the table is dropped, we can resume the replay. The bug can
        # be triggered from now on (if it's present).
        await disable_injection("skip_batch_replay")
@@ -402,7 +404,7 @@ async def test_batchlog_replay_failure_during_repair(manager: ManagerClient, rep
        await manager.api.keyspace_compaction(s1.ip_addr, ks)
        await manager.api.keyspace_compaction(s2.ip_addr, ks)

-        await disable_injection("batch_replay_throw")
+        await disable_injection("storage_proxy_fail_replay_batch")

        await s1_log.wait_for("Replaying batch", timeout=60, from_mark=s1_mark)
        await s1_log.wait_for("Finished replayAllFailedBatches", timeout=60, from_mark=s1_mark)
--- a/test/cqlpy/cassandra_tests/vector_invalid_query_test.py
+++ b/test/cqlpy/cassandra_tests/vector_invalid_query_test.py
@@ -150,6 +150,7 @@ def test_cannot_order_with_ann_on_non_vector_column(cql, test_keyspace):
 #         assertThatThrownBy(() -> execute("SELECT * FROM %s ORDER BY value ann of [0.0, 0.0] LIMIT 2")).isInstanceOf(InvalidRequestException.class);
 #     }

+@pytest.mark.xfail(reason="As we do not support filtering yet, wrong error is thrown, once VECTOR-374 is done, this should pass")
 def test_must_have_limit_specified_and_within_max_allowed(cql, test_keyspace):
    with create_table(cql, test_keyspace, "(k int PRIMARY KEY, v vector<float, 1>)") as table:
        custom_index = "vector_index" if is_scylla(cql) else "StorageAttachedIndex"
@@ -187,6 +188,7 @@ def test_must_have_limit_specified_and_within_max_allowed(cql, test_keyspace):
 #         assertEquals(1, result.size());
 #     }

+@pytest.mark.xfail(reason="As we do not support filtering yet, wrong error is thrown, once VECTOR-374 is done, this should pass")
 def test_cannot_have_aggregation_on_ann_query(cql, test_keyspace):
    with create_table(cql, test_keyspace, "(k int PRIMARY KEY, v vector<float, 1>, c int)") as table:
        custom_index = "vector_index" if is_scylla(cql) else "StorageAttachedIndex"
--- a/test/cqlpy/test_materialized_view.py
+++ b/test/cqlpy/test_materialized_view.py
@@ -1501,14 +1501,11 @@ def test_views_with_future_tombstones(cql, test_keyspace):
 def test_view_in_system_tables(cql, test_keyspace):
    with new_test_table(cql, test_keyspace, "p int PRIMARY KEY, v int") as base:
        with new_materialized_view(cql, base, '*', 'v,p', 'v is not null and p is not null') as view:
-            wait_for_view_built(cql, view)
-
-            # In view_building_coordinator path, `built_views` table is updated by view_building_worker,
-            # so there is a short window when a view is build (information is in view_build_status_v2)
-            # but it isn't marked in `built_views` locally.
-            # Doing read barrier is enough to ensure that the worker updated the table.
-            cql.execute("DROP TABLE IF EXISTS nosuchkeyspace.nosuchtable")
-
+            deadline = time.time() + 60
+            while time.time() < deadline:
+                if view in [ f'{r.keyspace_name}.{r.view_name}' for r in cql.execute('select * from system.built_views')]:
+                    break
+                time.sleep(0.1)
            res = [ f'{r.keyspace_name}.{r.view_name}' for r in cql.execute('select * from system.built_views')]
            assert view in res
            res = [ f'{r.table_name}.{r.index_name}' for r in cql.execute('select * from system."IndexInfo"')]
--- a/test/cqlpy/test_vector_index.py
+++ b/test/cqlpy/test_vector_index.py
@@ -352,3 +352,26 @@ def test_vector_search_when_tracing_is_enabled(cql, test_keyspace, scylla_only,
                f"SELECT * FROM {table} ORDER BY v ANN OF [0.2,0.3,0.4] LIMIT 1",
                trace=True,
            )
+
+
+
+@pytest.mark.xfail(reason="""We do not support primary key filtering yet, see VECTOR-374, 
+                            additionally this test will fail when that issue is fixed because pytest does not run the vector search backend.
+                            It does pass on Cassandra however, so we keep it xfailed for future reference.""")
+def test_ann_query_with_restriction_works_only_on_pk(cql, test_keyspace):
+    schema = 'p int primary key, q int, v vector<float, 3>'
+    custom_index = 'vector_index' if is_scylla(cql) else 'sai'
+    with new_test_table(cql, test_keyspace, schema) as table:
+        cql.execute(f"CREATE CUSTOM INDEX ON {table}(v) USING '{custom_index}'")
+        cql.execute(f"INSERT INTO {table} (p, q, v) VALUES (1, 1, [1.0, 1.0, 1.0])")
+        cql.execute(f"INSERT INTO {table} (p, q, v) VALUES (2, 2, [1.0, 1.0, 1.0])")
+        cql.execute(f"INSERT INTO {table} (p, q, v) VALUES (3, 3, [1.0, 1.0, 1.0])")
+
+        result = cql.execute(f"SELECT * FROM {table} WHERE p = 1 ORDER BY v ANN OF [1.0, 1.0, 1.0] LIMIT 3")
+
+        assert len(result.current_rows) == 1
+        assert result.current_rows[0].p == 1
+        with pytest.raises(InvalidRequest, match="ANN ordering by vector requires all restricted column(s) to be indexed"):
+            cql.execute(f"SELECT * FROM {table} WHERE q = 1 ORDER BY v ANN OF [1.0, 1.0, 1.0] LIMIT 3")
+
+
--- a/test/lib/cql_assertions.cc
+++ b/test/lib/cql_assertions.cc
@@ -17,12 +17,12 @@
 #include "utils/to_string.hh"
 #include "bytes.hh"

-static inline void fail(sstring msg) {
-    throw std::runtime_error(msg);
+static inline void fail(std::string_view msg, std::source_location loc) {
+    throw std::runtime_error(std::format("assertion at {}:{} failed: {}", loc.file_name(), loc.line(), msg));
 }

 void columns_assertions::fail(const sstring& msg) {
-    ::fail(msg);
+    ::fail(msg, _loc);
 }

 columns_assertions& columns_assertions::do_with_raw_column(const char* name, std::function<void(data_type, managed_bytes_view)> func) {
@@ -32,13 +32,13 @@ columns_assertions& columns_assertions::do_with_raw_column(const char* name, std
        return col->name->text() == name;
    });
    if (it == names.end()) {
-        ::fail(seastar::format("Column {} not found in metadata", name));
+        ::fail(seastar::format("Column {} not found in metadata", name), _loc);
    }

    const size_t index = std::distance(names.begin(), it);
    const auto& value  = _columns.at(index);
    if (!value) {
-        ::fail(seastar::format("Column {} is null", name));
+        ::fail(seastar::format("Column {} is null", name), _loc);
    }

    func((*it)->type, *value);
@@ -47,23 +47,24 @@ columns_assertions& columns_assertions::do_with_raw_column(const char* name, std
 }

 columns_assertions& columns_assertions::with_raw_column(const char* name, std::function<bool(managed_bytes_view)> predicate) {
-    return do_with_raw_column(name, [name, &predicate] (data_type, managed_bytes_view value) {
+    return do_with_raw_column(name, [this, name, &predicate] (data_type, managed_bytes_view value) {
        if (!predicate(value)) {
-            ::fail(seastar::format("Column {} failed predicate check: value = {}", name, value));
+            ::fail(seastar::format("Column {} failed predicate check: value = {}", name, value), _loc);
        }
    });
 }

 columns_assertions& columns_assertions::with_raw_column(const char* name, managed_bytes_view value) {
-    return do_with_raw_column(name, [name, &value] (data_type, managed_bytes_view cell_value) {
+    return do_with_raw_column(name, [this, name, &value] (data_type, managed_bytes_view cell_value) {
        if (cell_value != value) {
-            ::fail(seastar::format("Expected column {} to have value {}, but got {}", name, value, cell_value));
+            ::fail(seastar::format("Expected column {} to have value {}, but got {}", name, value, cell_value), _loc);
        }
    });
 }

-rows_assertions::rows_assertions(shared_ptr<cql_transport::messages::result_message::rows> rows)
+rows_assertions::rows_assertions(shared_ptr<cql_transport::messages::result_message::rows> rows, std::source_location loc)
    : _rows(rows)
+    , _loc(loc)
 { }

 rows_assertions
@@ -71,7 +72,17 @@ rows_assertions::with_size(size_t size) {
    const auto& rs = _rows->rs().result_set();
    auto row_count = rs.size();
    if (row_count != size) {
-        fail(format("Expected {:d} row(s) but got {:d}", size, row_count));
+        fail(format("Expected {:d} row(s) but got {:d}", size, row_count), _loc);
+    }
+    return {*this};
+}
+
+rows_assertions
+rows_assertions::with_size(std::function<bool(size_t)> predicate) {
+    const auto& rs = _rows->rs().result_set();
+    auto row_count = rs.size();
+    if (!predicate(row_count)) {
+        fail(format("Predicate failed for row count {}", row_count), _loc);
    }
    return {*this};
 }
@@ -82,7 +93,7 @@ rows_assertions::is_empty() {
    auto row_count = rs.size();
    if (row_count != 0) {
        auto&& first_row = *rs.rows().begin();
-        fail(seastar::format("Expected no rows, but got {:d}. First row: {}", row_count, first_row));
+        fail(seastar::format("Expected no rows, but got {:d}. First row: {}", row_count, first_row), _loc);
    }
    return {*this};
 }
@@ -92,7 +103,7 @@ rows_assertions::is_not_empty() {
    const auto& rs = _rows->rs().result_set();
    auto row_count = rs.size();
    if (row_count == 0) {
-        fail("Expected some rows, but was result was empty");
+        fail("Expected some rows, but was result was empty", _loc);
    }
    return {*this};
 }
@@ -103,7 +114,7 @@ rows_assertions::rows_assertions::is_null() {
    for (auto&& row : rs.rows()) {
        for (const managed_bytes_opt& v : row) {
            if (v) {
-                fail(seastar::format("Expected null values. Found: {}\n", v));
+                fail(seastar::format("Expected null values. Found: {}\n", v), _loc);
            }
        }
    }
@@ -116,7 +127,7 @@ rows_assertions::rows_assertions::is_not_null() {
    for (auto&& row : rs.rows()) {
        for (const managed_bytes_opt& v : row) {
            if (!v) {
-                fail(seastar::format("Expected non-null values. {}\n", row));
+                fail(seastar::format("Expected non-null values. {}\n", row), _loc);
            }
        }
    }
@@ -128,7 +139,7 @@ rows_assertions::with_column_types(std::initializer_list<data_type> column_types
    auto meta = _rows->rs().result_set().get_metadata();
    const auto& columns = meta.get_names();
    if (column_types.size() != columns.size()) {
-        fail(format("Expected {:d} columns, got {:d}", column_types.size(), meta.column_count()));
+        fail(format("Expected {:d} columns, got {:d}", column_types.size(), meta.column_count()), _loc);
    }
    auto expected_it = column_types.begin();
    auto actual_it = columns.begin();
@@ -136,7 +147,7 @@ rows_assertions::with_column_types(std::initializer_list<data_type> column_types
        const auto& expected_type = *expected_it++;
        const auto& actual_spec = *actual_it++;
        if (expected_type != actual_spec->type) {
-            fail(format("Column {:d}: expected type {}, got {}", i, expected_type->name(), actual_spec->type->name()));
+            fail(format("Column {:d}: expected type {}, got {}", i, expected_type->name(), actual_spec->type->name()), _loc);
        }
    }
    return {*this};
@@ -151,7 +162,7 @@ rows_assertions::with_row(std::initializer_list<bytes_opt> values) {
            return {*this};
        }
    }
-    fail(seastar::format("Expected row not found: {} not in {}\n", expected_row, _rows));
+    fail(seastar::format("Expected row not found: {} not in {}\n", expected_row, _rows), _loc);
    return {*this};
 }

@@ -164,21 +175,21 @@ rows_assertions::with_rows(std::vector<std::vector<bytes_opt>> rows) {
    int row_nr = 0;
    for (auto&& row : rows) {
        if (actual_i == actual_end) {
-            fail(format("Expected more rows ({:d}), got {:d}", rows.size(), rs.size()));
+            fail(format("Expected more rows ({:d}), got {:d}", rows.size(), rs.size()), _loc);
        }
        auto& actual = *actual_i;
        auto expected_row = row | std::views::transform(to_managed_bytes_opt);
        if (!std::ranges::equal(
            expected_row,
            actual)) {
-            fail(seastar::format("row {} differs, expected {} got {}", row_nr, row, actual));
+            fail(seastar::format("row {} differs, expected {} got {}", row_nr, row, actual), _loc);
        }
        ++actual_i;
        ++row_nr;
    }
    if (actual_i != actual_end) {
        fail(seastar::format("Expected less rows ({:d}), got {:d}. Next row is: {}", rows.size(), rs.size(),
-                    *actual_i));
+                    *actual_i), _loc);
    }
    return {*this};
 }
@@ -197,40 +208,53 @@ rows_assertions::with_rows_ignore_order(std::vector<std::vector<bytes_opt>> rows
        });
        if (found == std::end(actual)) {
            fail(seastar::format("row {} not found in result set ({})", expected,
-               fmt::join(actual | std::views::transform([] (auto& r) { return fmt::to_string(r); }), ", ")));
+               fmt::join(actual | std::views::transform([] (auto& r) { return fmt::to_string(r); }), ", ")), _loc);
        }
    }
    if (rs.size() != rows.size()) {
-        fail(format("Expected different number of rows ({:d}), got {:d}", rows.size(), rs.size()));
+        fail(format("Expected different number of rows ({:d}), got {:d}", rows.size(), rs.size()), _loc);
    }
    return {*this};
 }

 columns_assertions rows_assertions::with_columns_of_row(size_t row_index) {
    const auto& rs = _rows->rs().result_set();
-    return columns_assertions(rs.get_metadata(), rs.rows().at(row_index));
+    if (row_index >= rs.rows().size()) {
+        fail(format("Requested row index {} is out of range, result has {} rows", row_index, rs.rows().size()), _loc);
+    }
+    return columns_assertions(rs.get_metadata(), rs.rows().at(row_index), _loc);
 }

-result_msg_assertions::result_msg_assertions(shared_ptr<cql_transport::messages::result_message> msg)
+rows_assertions& rows_assertions::assert_for_columns_of_each_row(std::function<void(columns_assertions&)> func) {
+    const auto& rs = _rows->rs().result_set();
+    for (size_t i = 0; i < rs.size(); ++i) {
+        auto columns = with_columns_of_row(i);
+        func(columns);
+    }
+    return *this;
+}
+
+result_msg_assertions::result_msg_assertions(shared_ptr<cql_transport::messages::result_message> msg, std::source_location loc)
    : _msg(msg)
+    , _loc(loc)
 { }

 rows_assertions result_msg_assertions::is_rows() {
    auto rows = dynamic_pointer_cast<cql_transport::messages::result_message::rows>(_msg);
    if (!rows) {
-        fail("Expected rows in result set");
+        fail("Expected rows in result set", _loc);
    }
-    return rows_assertions(rows);
+    return rows_assertions(rows, _loc);
 }

-result_msg_assertions assert_that(shared_ptr<cql_transport::messages::result_message> msg) {
-    return result_msg_assertions(msg);
+result_msg_assertions assert_that(shared_ptr<cql_transport::messages::result_message> msg, std::source_location loc) {
+    return result_msg_assertions(msg, loc);
 }

 rows_assertions rows_assertions::with_serialized_columns_count(size_t columns_count) {
    size_t serialized_column_count = _rows->rs().get_metadata().column_count();
    if (serialized_column_count != columns_count) {
-        fail(format("Expected {:d} serialized columns(s) but got {:d}", columns_count, serialized_column_count));
+        fail(format("Expected {:d} serialized columns(s) but got {:d}", columns_count, serialized_column_count), _loc);
    }
    return {*this};
 }
@@ -255,7 +279,7 @@ void require_rows(cql_test_env& e,
                  const std::vector<std::vector<bytes_opt>>& expected,
                  const std::source_location& loc) {
    try {
-        assert_that(cquery_nofail(e, qstr, nullptr, loc)).is_rows().with_rows_ignore_order(expected);
+        assert_that(cquery_nofail(e, qstr, nullptr, loc), loc).is_rows().with_rows_ignore_order(expected);
    }
    catch (const std::exception& e) {
        BOOST_FAIL(seastar::format("query '{}' failed: {}\n{}:{}: originally from here",
--- a/test/lib/cql_assertions.hh
+++ b/test/lib/cql_assertions.hh
@@ -21,31 +21,46 @@
 class columns_assertions {
    const cql3::metadata& _metadata;
    const std::vector<managed_bytes_opt>& _columns;
+    std::source_location _loc;

    columns_assertions& do_with_raw_column(const char* name, std::function<void(data_type, managed_bytes_view)> func);

    void fail(const sstring& msg);

 public:
-    columns_assertions(const cql3::metadata& metadata, const std::vector<managed_bytes_opt>& columns)
-        : _metadata(metadata), _columns(columns)
+    columns_assertions(const cql3::metadata& metadata, const std::vector<managed_bytes_opt>& columns, std::source_location loc)
+        : _metadata(metadata), _columns(columns), _loc(loc)
    { }

    columns_assertions& with_raw_column(const char* name, std::function<bool(managed_bytes_view)> predicate);
    columns_assertions& with_raw_column(const char* name, managed_bytes_view value);

    template <typename T>
-    columns_assertions& with_typed_column(const char* name, std::function<bool(const T& value)> predicate) {
+    columns_assertions& with_typed_column(const char* name, std::function<bool(const T* value)> predicate) {
        return do_with_raw_column(name, [this, name, predicate] (data_type type, managed_bytes_view value) {
            if (type != data_type_for<T>()) {
                fail(seastar::format("Column {} is not of type {}, but of type {}", name, data_type_for<T>()->name(), type->name()));
            }
-            if (!predicate(value_cast<T>(type->deserialize(value)))) {
+            std::optional<T> t_opt;
+            if (!value.empty()) {
+                t_opt.emplace(value_cast<T>(type->deserialize(value)));
+            }
+            if (!predicate(t_opt.has_value() ? &t_opt.value() : nullptr)) {
                fail(seastar::format("Column {} failed predicate check: value = {}", name, value));
            }
        });
    }

+    template <typename T>
+    columns_assertions& with_typed_column(const char* name, std::function<bool(const T& value)> predicate) {
+        return with_typed_column<T>(name, [this, name, predicate] (const T* value) {
+            if (!value) {
+                fail(seastar::format("Column {} is null", name));
+            }
+            return predicate(*value);
+        });
+    }
+
    template <typename T>
    columns_assertions& with_typed_column(const char* name, const T& value) {
        return  with_typed_column<T>(name, [this, name, &value] (const T& cell_value) {
@@ -59,9 +74,11 @@ public:

 class rows_assertions {
    shared_ptr<cql_transport::messages::result_message::rows> _rows;
+    std::source_location _loc;
 public:
-    rows_assertions(shared_ptr<cql_transport::messages::result_message::rows> rows);
+    rows_assertions(shared_ptr<cql_transport::messages::result_message::rows> rows, std::source_location loc);
    rows_assertions with_size(size_t size);
+    rows_assertions with_size(std::function<bool(size_t)> predicate);
    rows_assertions is_empty();
    rows_assertions is_not_empty();
    rows_assertions with_column_types(std::initializer_list<data_type> column_types);
@@ -75,18 +92,21 @@ public:

    columns_assertions with_columns_of_row(size_t row_index);

+    rows_assertions& assert_for_columns_of_each_row(std::function<void(columns_assertions&)> func);
+
    rows_assertions is_null();
    rows_assertions is_not_null();
 };

 class result_msg_assertions {
    shared_ptr<cql_transport::messages::result_message> _msg;
+    std::source_location _loc;
 public:
-    result_msg_assertions(shared_ptr<cql_transport::messages::result_message> msg);
+    result_msg_assertions(shared_ptr<cql_transport::messages::result_message> msg, std::source_location loc);
    rows_assertions is_rows();
 };

-result_msg_assertions assert_that(shared_ptr<cql_transport::messages::result_message> msg);
+result_msg_assertions assert_that(shared_ptr<cql_transport::messages::result_message> msg, std::source_location loc = std::source_location::current());

 template<typename T>
 void assert_that_failed(future<T>& f)
--- a/test/lib/cql_test_env.cc
+++ b/test/lib/cql_test_env.cc
@@ -1128,11 +1128,8 @@ private:
            auth_config.authenticator_java_name = qualified_authenticator_name;
            auth_config.role_manager_java_name = qualified_role_manager_name;

+            _auth_service.start(perm_cache_config, std::ref(_qp), std::ref(group0_client), std::ref(_mnotifier), std::ref(_mm), auth_config, maintenance_socket_enabled::no, std::ref(_auth_cache)).get();

-
-            const uint64_t niceness = 19;
-            auto hashing_worker = utils::alien_worker(startlog, niceness, "pwd-hash");
-            _auth_service.start(perm_cache_config, std::ref(_qp), std::ref(group0_client), std::ref(_mnotifier), std::ref(_mm), auth_config, maintenance_socket_enabled::no, std::ref(_auth_cache), std::ref(hashing_worker)).get();
            _auth_service.invoke_on_all([this] (auth::service& auth) {
                return auth.start(_mm.local(), _sys_ks.local());
            }).get();
@@ -1159,8 +1156,9 @@ private:

            db::batchlog_manager_config bmcfg;
            bmcfg.replay_rate = 100000000;
-            bmcfg.write_request_timeout = 2s;
-            bmcfg.delay = 0ms;
+            bmcfg.replay_timeout = cfg_in.batchlog_replay_timeout.value_or(2s);
+            bmcfg.delay = cfg_in.batchlog_delay;
+            bmcfg.replay_cleanup_after_replays = cfg->batchlog_replay_cleanup_after_replays();
            _batchlog_manager.start(std::ref(_qp), std::ref(_sys_ks), bmcfg).get();
            auto stop_bm = defer_verbose_shutdown("batchlog manager", [this] {
                _batchlog_manager.stop().get();
@@ -1219,9 +1217,15 @@ private:

 public:
    future<::shared_ptr<cql_transport::messages::result_message>> execute_batch(
-        const std::vector<std::string_view>& queries, std::unique_ptr<cql3::query_options> qo) override {
+            const std::vector<std::string_view>& queries,
+            cql3::statements::batch_statement::type batch_type,
+            std::unique_ptr<cql3::query_options> qo) override {
        using cql3::statements::batch_statement;
        using cql3::statements::modification_statement;
+
+        testlog.trace("{}(type={}):\n  {}",
+                __FUNCTION__, batch_type == batch_statement::type::LOGGED ? "LOGGED" : "UNLOGGED", fmt::join(queries, "\n  "));
+
        std::vector<batch_statement::single_statement> modifications;
        std::ranges::transform(queries, back_inserter(modifications), [this](const auto& query) {
            auto stmt = local_qp().get_statement(query, _core_local.local().client_state, test_dialect());
@@ -1232,7 +1236,7 @@ public:
            return batch_statement::single_statement(static_pointer_cast<modification_statement>(stmt->statement));
        });
        auto batch = ::make_shared<batch_statement>(
-            batch_statement::type::UNLOGGED,
+            batch_type,
            std::move(modifications),
            cql3::attributes::none(),
            local_qp().get_cql_stats());
--- a/test/lib/cql_test_env.hh
+++ b/test/lib/cql_test_env.hh
@@ -25,6 +25,7 @@
 #include "cql3/values.hh"
 #include "cql3/prepared_statements_cache.hh"
 #include "cql3/query_processor.hh"
+#include "cql3/statements/batch_statement.hh"
 #include "bytes.hh"
 #include "schema/schema.hh"
 #include "service/tablet_allocator.hh"
@@ -106,6 +107,9 @@ public:
    bool run_with_raft_recovery = false;
    bool clean_data_dir_before_test = true;

+    std::optional<db_clock::duration> batchlog_replay_timeout;
+    std::chrono::milliseconds batchlog_delay = std::chrono::milliseconds(0);
+
    std::optional<timeout_config> query_timeout;

    cql_test_config();
@@ -129,7 +133,7 @@ public:

    /// Processes queries (which must be modifying queries) as a batch.
    virtual future<::shared_ptr<cql_transport::messages::result_message>> execute_batch(
-        const std::vector<std::string_view>& queries, std::unique_ptr<cql3::query_options> qo) = 0;
+        const std::vector<std::string_view>& queries, cql3::statements::batch_statement::type batch_type, std::unique_ptr<cql3::query_options> qo) = 0;

    virtual future<cql3::prepared_cache_key_type> prepare(sstring query) = 0;

--- a/test/lib/error_injection.hh
+++ b/test/lib/error_injection.hh
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include "utils/error_injection.hh"
+
+// Injects the error for the duration of the scope on all shards.
+// Runs in seastar thread.
+class scoped_error_injection {
+    std::string_view _name;
+public:
+    // Add other overloads as needed
+    explicit scoped_error_injection(std::string_view name) : _name(name) {
+        smp::invoke_on_all([this] {
+            utils::get_local_injector().enable(_name);
+        }).get();
+    }
+
+    ~scoped_error_injection() {
+        smp::invoke_on_all([this] {
+            utils::get_local_injector().disable(_name);
+        }).get();
+    }
+};
--- a/test/lib/expr_test_utils.cc
+++ b/test/lib/expr_test_utils.cc
@@ -599,6 +599,9 @@ public:
        throw std::bad_function_call();
    }
    virtual schema_ptr get_table_schema(data_dictionary::table t) const override { return _table_schema; }
+    virtual db_clock::time_point get_truncation_time(data_dictionary::table t) const override {
+        return {};
+    }
    virtual lw_shared_ptr<data_dictionary::keyspace_metadata> get_keyspace_metadata(
        data_dictionary::keyspace ks) const override {
        return _keyspace_metadata;
--- a/test/pylib/suite/base.py
+++ b/test/pylib/suite/base.py
@@ -261,7 +261,9 @@ class TestSuite(ABC):
            # Check opt-in list
            if options.name:
                for p in options.name:
-                    pn = p.split('::', 2)
+                    # only allow one split, otherwise something like <file>::<class>::method
+                    # does not work here. The pytest matcher wants the full pattern eventually
+                    pn = p.split('::', 1)
                    if len(pn) == 1 and p in testname:
                        break
                    if len(pn) == 2 and pn[0] == testname:
--- a/test/scylla_gdb/test_misc.py
+++ b/test/scylla_gdb/test_misc.py
@@ -1,5 +1,6 @@
 import pytest
 import re
+import uuid

 # Convenience function to execute a scylla command in gdb, returning its
 # output as a string - or a gdb.error exception.
@@ -189,10 +190,11 @@ def coro_task(gdb, scylla_gdb):
            gdb.write(f'{name}\n')
    # This test fails sometimes, but rarely and unreliably.
    # We want to get a coredump from it the next time it fails.
-    # Sending a SIGSEGV should induce that.
    # https://github.com/scylladb/scylladb/issues/22501
-    gdb.execute("signal SIGSEGV")
-    raise gdb.error("No coroutine frames found with expected name")
+    tmpdir = request.config.getoption('scylla_tmp_dir')
+    core_filename = f"{tmpdir}/../scylla_gdb_coro_task-{uuid.uuid4()}.core"
+    gdb.execute(f"gcore {core_filename}")
+    raise gdb.error(f"No coroutine frames found with expected name. Dumped Scylla core to {core_filename}")

 def test_coro_frame(gdb, coro_task):
    # Note the offset by two words.
--- a/tools/schema_loader.cc
+++ b/tools/schema_loader.cc
@@ -155,6 +155,9 @@ private:
    virtual schema_ptr get_table_schema(data_dictionary::table t) const override {
        return unwrap(t).schema;
    }
+    virtual db_clock::time_point get_truncation_time(data_dictionary::table t) const override {
+        return {};
+    }
    virtual lw_shared_ptr<keyspace_metadata> get_keyspace_metadata(data_dictionary::keyspace ks) const override {
        return unwrap(ks).metadata;
    }
--- a/utils/CMakeLists.txt
+++ b/utils/CMakeLists.txt
@@ -15,6 +15,7 @@ target_sources(utils
    buffer_input_stream.cc
    build_id.cc
    config_file.cc
+    crypt_sha512.cc
    directories.cc
    disk-error-handler.cc
    disk_space_monitor.cc
--- a/utils/crypt_sha512.cc
+++ b/utils/crypt_sha512.cc
@@ -0,0 +1,381 @@
+/*
+ * This file originates from musl libc (git.musl-libc.org).
+ * Modifications have been made and are licensed under the following terms:
+ * Copyright (C) 2025-present ScyllaDB
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ *
+ * public domain sha512 crypt implementation
+ *
+ * original sha crypt design: http://people.redhat.com/drepper/SHA-crypt.txt
+ * in this implementation at least 32bit int is assumed,
+ * key length is limited, the $6$ prefix is mandatory, '\n' and ':' is rejected
+ * in the salt and rounds= setting must contain a valid iteration count,
+ * on error "*" is returned.
+ */
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "crypt_sha512.hh"
+#include <seastar/core/coroutine.hh>
+#include <seastar/coroutine/maybe_yield.hh>
+
+/* public domain sha512 implementation based on fips180-3 */
+/* >=2^64 bits messages are not supported (about 2000 peta bytes) */
+
+struct sha512 {
+	uint64_t len;     /* processed message length */
+	uint64_t h[8];    /* hash state */
+	uint8_t buf[128]; /* message block buffer */
+};
+
+static uint64_t ror(uint64_t n, int k) { return (n >> k) | (n << (64-k)); }
+#define Ch(x,y,z)  (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) ((x & y) | (z & (x | y)))
+#define S0(x)      (ror(x,28) ^ ror(x,34) ^ ror(x,39))
+#define S1(x)      (ror(x,14) ^ ror(x,18) ^ ror(x,41))
+#define R0(x)      (ror(x,1) ^ ror(x,8) ^ (x>>7))
+#define R1(x)      (ror(x,19) ^ ror(x,61) ^ (x>>6))
+
+static const uint64_t K[80] = {
+0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+static void processblock(struct sha512 *s, const uint8_t *buf)
+{
+	uint64_t W[80], t1, t2, a, b, c, d, e, f, g, h;
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		W[i] = (uint64_t)buf[8*i]<<56;
+		W[i] |= (uint64_t)buf[8*i+1]<<48;
+		W[i] |= (uint64_t)buf[8*i+2]<<40;
+		W[i] |= (uint64_t)buf[8*i+3]<<32;
+		W[i] |= (uint64_t)buf[8*i+4]<<24;
+		W[i] |= (uint64_t)buf[8*i+5]<<16;
+		W[i] |= (uint64_t)buf[8*i+6]<<8;
+		W[i] |= buf[8*i+7];
+	}
+	for (; i < 80; i++)
+		W[i] = R1(W[i-2]) + W[i-7] + R0(W[i-15]) + W[i-16];
+	a = s->h[0];
+	b = s->h[1];
+	c = s->h[2];
+	d = s->h[3];
+	e = s->h[4];
+	f = s->h[5];
+	g = s->h[6];
+	h = s->h[7];
+	for (i = 0; i < 80; i++) {
+		t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i];
+		t2 = S0(a) + Maj(a,b,c);
+		h = g;
+		g = f;
+		f = e;
+		e = d + t1;
+		d = c;
+		c = b;
+		b = a;
+		a = t1 + t2;
+	}
+	s->h[0] += a;
+	s->h[1] += b;
+	s->h[2] += c;
+	s->h[3] += d;
+	s->h[4] += e;
+	s->h[5] += f;
+	s->h[6] += g;
+	s->h[7] += h;
+}
+
+static void pad(struct sha512 *s)
+{
+	unsigned r = s->len % 128;
+
+	s->buf[r++] = 0x80;
+	if (r > 112) {
+		memset(s->buf + r, 0, 128 - r);
+		r = 0;
+		processblock(s, s->buf);
+	}
+	memset(s->buf + r, 0, 120 - r);
+	s->len *= 8;
+	s->buf[120] = s->len >> 56;
+	s->buf[121] = s->len >> 48;
+	s->buf[122] = s->len >> 40;
+	s->buf[123] = s->len >> 32;
+	s->buf[124] = s->len >> 24;
+	s->buf[125] = s->len >> 16;
+	s->buf[126] = s->len >> 8;
+	s->buf[127] = s->len;
+	processblock(s, s->buf);
+}
+
+static void sha512_init(struct sha512 *s)
+{
+	s->len = 0;
+	s->h[0] = 0x6a09e667f3bcc908ULL;
+	s->h[1] = 0xbb67ae8584caa73bULL;
+	s->h[2] = 0x3c6ef372fe94f82bULL;
+	s->h[3] = 0xa54ff53a5f1d36f1ULL;
+	s->h[4] = 0x510e527fade682d1ULL;
+	s->h[5] = 0x9b05688c2b3e6c1fULL;
+	s->h[6] = 0x1f83d9abfb41bd6bULL;
+	s->h[7] = 0x5be0cd19137e2179ULL;
+}
+
+static void sha512_sum(struct sha512 *s, uint8_t *md)
+{
+	int i;
+
+	pad(s);
+	for (i = 0; i < 8; i++) {
+		md[8*i] = s->h[i] >> 56;
+		md[8*i+1] = s->h[i] >> 48;
+		md[8*i+2] = s->h[i] >> 40;
+		md[8*i+3] = s->h[i] >> 32;
+		md[8*i+4] = s->h[i] >> 24;
+		md[8*i+5] = s->h[i] >> 16;
+		md[8*i+6] = s->h[i] >> 8;
+		md[8*i+7] = s->h[i];
+	}
+}
+
+static void sha512_update(struct sha512 *s, const void *m, unsigned long len)
+{
+	const uint8_t *p = (const uint8_t *)m;
+	unsigned r = s->len % 128;
+
+	s->len += len;
+	if (r) {
+		if (len < 128 - r) {
+			memcpy(s->buf + r, p, len);
+			return;
+		}
+		memcpy(s->buf + r, p, 128 - r);
+		len -= 128 - r;
+		p += 128 - r;
+		processblock(s, s->buf);
+	}
+	for (; len >= 128; len -= 128, p += 128)
+		processblock(s, p);
+	memcpy(s->buf, p, len);
+}
+
+static const unsigned char b64[] =
+"./0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+static char *to64(char *s, unsigned int u, int n)
+{
+	while (--n >= 0) {
+		*s++ = b64[u % 64];
+		u /= 64;
+	}
+	return s;
+}
+
+/* key limit is not part of the original design, added for DoS protection.
+ * rounds limit has been lowered (versus the reference/spec), also for DoS
+ * protection. runtime is O(klen^2 + klen*rounds) */
+#define KEY_MAX 256
+#define SALT_MAX 16
+#define ROUNDS_DEFAULT 5000
+#define ROUNDS_MIN 1000
+#define ROUNDS_MAX 9999999
+
+/* hash n bytes of the repeated md message digest */
+static void hashmd(struct sha512 *s, unsigned int n, const void *md)
+{
+	unsigned int i;
+
+	for (i = n; i > 64; i -= 64)
+		sha512_update(s, md, 64);
+	sha512_update(s, md, i);
+}
+
+static seastar::future<char *> sha512crypt(const char *key, const char *setting, char *output)
+{
+	struct sha512 ctx;
+	unsigned char md[64], kmd[64], smd[64];
+	unsigned int i, r, klen, slen;
+	char rounds[20] = "";
+	const char *salt;
+	char *p;
+
+	/* reject large keys */
+	for (i = 0; i <= KEY_MAX && key[i]; i++);
+	if (i > KEY_MAX)
+		co_return nullptr;
+	klen = i;
+
+	/* setting: $6$rounds=n$salt$ (rounds=n$ and closing $ are optional) */
+	if (strncmp(setting, "$6$", 3) != 0)
+		co_return nullptr;
+	salt = setting + 3;
+
+	r = ROUNDS_DEFAULT;
+	if (strncmp(salt, "rounds=", sizeof "rounds=" - 1) == 0) {
+		unsigned long u;
+		char *end;
+
+		/*
+		 * this is a deviation from the reference:
+		 * bad rounds setting is rejected if it is
+		 * - empty
+		 * - unterminated (missing '$')
+		 * - begins with anything but a decimal digit
+		 * the reference implementation treats these bad
+		 * rounds as part of the salt or parse them with
+		 * strtoul semantics which may cause problems
+		 * including non-portable hashes that depend on
+		 * the host's value of ULONG_MAX.
+		 */
+		salt += sizeof "rounds=" - 1;
+		if (!isdigit(*salt))
+			co_return nullptr;
+		u = strtoul(salt, &end, 10);
+		if (*end != '$')
+			co_return nullptr;
+		salt = end+1;
+		if (u < ROUNDS_MIN)
+			r = ROUNDS_MIN;
+		else if (u > ROUNDS_MAX)
+			co_return nullptr;
+		else
+			r = u;
+		/* needed when rounds is zero prefixed or out of bounds */
+		sprintf(rounds, "rounds=%u$", r);
+	}
+
+	for (i = 0; i < SALT_MAX && salt[i] && salt[i] != '$'; i++)
+		/* reject characters that interfere with /etc/shadow parsing */
+		if (salt[i] == '\n' || salt[i] == ':')
+			co_return nullptr;
+	slen = i;
+
+	/* B = sha(key salt key) */
+	sha512_init(&ctx);
+	sha512_update(&ctx, key, klen);
+	sha512_update(&ctx, salt, slen);
+	sha512_update(&ctx, key, klen);
+	sha512_sum(&ctx, md);
+
+	/* A = sha(key salt repeat-B alternate-B-key) */
+	sha512_init(&ctx);
+	sha512_update(&ctx, key, klen);
+	sha512_update(&ctx, salt, slen);
+	hashmd(&ctx, klen, md);
+	for (i = klen; i > 0; i >>= 1)
+		if (i & 1)
+			sha512_update(&ctx, md, sizeof md);
+		else
+			sha512_update(&ctx, key, klen);
+	sha512_sum(&ctx, md);
+
+	/* DP = sha(repeat-key), this step takes O(klen^2) time */
+	sha512_init(&ctx);
+	for (i = 0; i < klen; i++)
+		sha512_update(&ctx, key, klen);
+	sha512_sum(&ctx, kmd);
+
+	/* DS = sha(repeat-salt) */
+	sha512_init(&ctx);
+	for (i = 0; i < 16 + md[0]; i++)
+		sha512_update(&ctx, salt, slen);
+	sha512_sum(&ctx, smd);
+
+	/* iterate A = f(A,DP,DS), this step takes O(rounds*klen) time */
+	for (i = 0; i < r; i++) {
+		sha512_init(&ctx);
+		if (i % 2)
+			hashmd(&ctx, klen, kmd);
+		else
+			sha512_update(&ctx, md, sizeof md);
+		if (i % 3)
+			sha512_update(&ctx, smd, slen);
+		if (i % 7)
+			hashmd(&ctx, klen, kmd);
+		if (i % 2)
+			sha512_update(&ctx, md, sizeof md);
+		else
+			hashmd(&ctx, klen, kmd);
+		sha512_sum(&ctx, md);
+		co_await seastar::coroutine::maybe_yield();
+	}
+
+	/* output is $6$rounds=n$salt$hash */
+	p = output;
+	p += sprintf(p, "$6$%s%.*s$", rounds, slen, salt);
+#if 1
+	static const unsigned char perm[][3] = {
+		{0,21,42},{22,43,1},{44,2,23},{3,24,45},{25,46,4},
+		{47,5,26},{6,27,48},{28,49,7},{50,8,29},{9,30,51},
+		{31,52,10},{53,11,32},{12,33,54},{34,55,13},{56,14,35},
+		{15,36,57},{37,58,16},{59,17,38},{18,39,60},{40,61,19},
+		{62,20,41} };
+	for (i=0; i<21; i++) p = to64(p,
+		(md[perm[i][0]]<<16)|(md[perm[i][1]]<<8)|md[perm[i][2]], 4);
+#else
+	p = to64(p, (md[0]<<16)|(md[21]<<8)|md[42], 4);
+	p = to64(p, (md[22]<<16)|(md[43]<<8)|md[1], 4);
+	p = to64(p, (md[44]<<16)|(md[2]<<8)|md[23], 4);
+	p = to64(p, (md[3]<<16)|(md[24]<<8)|md[45], 4);
+	p = to64(p, (md[25]<<16)|(md[46]<<8)|md[4], 4);
+	p = to64(p, (md[47]<<16)|(md[5]<<8)|md[26], 4);
+	p = to64(p, (md[6]<<16)|(md[27]<<8)|md[48], 4);
+	p = to64(p, (md[28]<<16)|(md[49]<<8)|md[7], 4);
+	p = to64(p, (md[50]<<16)|(md[8]<<8)|md[29], 4);
+	p = to64(p, (md[9]<<16)|(md[30]<<8)|md[51], 4);
+	p = to64(p, (md[31]<<16)|(md[52]<<8)|md[10], 4);
+	p = to64(p, (md[53]<<16)|(md[11]<<8)|md[32], 4);
+	p = to64(p, (md[12]<<16)|(md[33]<<8)|md[54], 4);
+	p = to64(p, (md[34]<<16)|(md[55]<<8)|md[13], 4);
+	p = to64(p, (md[56]<<16)|(md[14]<<8)|md[35], 4);
+	p = to64(p, (md[15]<<16)|(md[36]<<8)|md[57], 4);
+	p = to64(p, (md[37]<<16)|(md[58]<<8)|md[16], 4);
+	p = to64(p, (md[59]<<16)|(md[17]<<8)|md[38], 4);
+	p = to64(p, (md[18]<<16)|(md[39]<<8)|md[60], 4);
+	p = to64(p, (md[40]<<16)|(md[61]<<8)|md[19], 4);
+	p = to64(p, (md[62]<<16)|(md[20]<<8)|md[41], 4);
+#endif
+	p = to64(p, md[63], 2);
+	*p = 0;
+	co_return output;
+}
+
+seastar::future<const char *> __crypt_sha512(const char *key, const char *setting, char *output)
+{
+	static const char testkey[] = "Xy01@#\x01\x02\x80\x7f\xff\r\n\x81\t !";
+	static const char testsetting[] = "$6$rounds=1234$abc0123456789$";
+	static const char testhash[] = "$6$rounds=1234$abc0123456789$BCpt8zLrc/RcyuXmCDOE1ALqMXB2MH6n1g891HhFj8.w7LxGv.FTkqq6Vxc/km3Y0jE0j24jY5PIv/oOu6reg1";
+	char testbuf[128];
+	char *p, *q;
+
+	p = co_await sha512crypt(key, setting, output);
+	/* self test and stack cleanup */
+	q = co_await sha512crypt(testkey, testsetting, testbuf);
+	if (!p || q != testbuf || memcmp(testbuf, testhash, sizeof testhash))
+		co_return "*";
+	co_return p;
+}
--- a/utils/crypt_sha512.hh
+++ b/utils/crypt_sha512.hh
@@ -0,0 +1,13 @@
+/*
+ * Copyright (C) 2025-present ScyllaDB
+ */
+
+/*
+ * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
+ */
+
+#pragma once
+
+#include <seastar/core/future.hh>
+
+seastar::future<const char *> __crypt_sha512(const char *key, const char *setting, char *output);
--- a/utils/directories.cc
+++ b/utils/directories.cc
@@ -89,9 +89,8 @@ future<> directories::create_and_verify(directories::set dir_set, recursive recu
        try {
            co_await directories::verify_owner_and_mode(path, recursive);
        } catch (...) {
-            std::exception_ptr ep = std::current_exception();
-            startlog.error("Failed owner and mode verification: {}", ep);
-            throw ep;
+            startlog.error("Failed owner and mode verification: {}", std::current_exception());
+            throw;
        }
    });
    std::move(locks.begin(), locks.end(), std::back_inserter(_locks));
--- a/utils/error_injection.hh
+++ b/utils/error_injection.hh
@@ -378,10 +378,12 @@ public:
    }

    void disable(const std::string_view& injection_name) {
+        errinj_logger.debug("Disabling injection \"{}\"", injection_name);
        _enabled.erase(injection_name);
    }

    void disable_all() {
+        errinj_logger.debug("Disabling all injections");
        _enabled.clear();
    }

--- a/utils/rjson.cc
+++ b/utils/rjson.cc
@@ -33,7 +33,7 @@ private:
    chunked_content::iterator _current_chunk;
    // _count only needed for Tell(). 32 bits is enough, we don't allow
    // more than 16 MB requests anyway.
-    unsigned _count;
+    unsigned _count{0};
 public:
    typedef char Ch;
    chunked_content_stream(chunked_content&& content)