db/config: don't use RBNO for scaling

Remove bootstrap and decommission from allowed_repair_based_node_ops.
Using RBNO over streaming for these operations has no benefits, as they
are not exposed to the out-of-date replica problem that replace,
removenode and rebuild are.
On top of that, RBNO is known to have problems with empty user tables.
Using streaming for bootstrap and decommission is safe and faster
than RBNO in all conditions, especially when the table is small.

One test needs adjustment as it relies on RBNO being used for all node
ops.

Fixes: SCYLLADB-105

Closes scylladb/scylladb#28080

(cherry picked from commit b637e17b19)

Closes scylladb/scylladb#28725
This commit is contained in:
Botond Dénes
2025-09-30 10:13:04 +03:00
parent 883e3e014a
commit 0dfefc3f12
3 changed files with 4 additions and 3 deletions

View File

@@ -1291,7 +1291,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
, override_decommission(this, "override_decommission", value_status::Deprecated, false, "Set true to force a decommissioned node to join the cluster (cannot be set if consistent-cluster-management is enabled).")
, enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to use enable repair based node operations instead of streaming based.")
, allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
, allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild.")
, enable_compacting_data_for_streaming_and_repair(this, "enable_compacting_data_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, true, "Enable the compacting reader, which compacts the data for streaming and repair (load'n'stream included) before sending it to, or synchronizing it with peers. Can reduce the amount of data to be processed by removing dead data, but adds CPU overhead.")
, enable_tombstone_gc_for_streaming_and_repair(this, "enable_tombstone_gc_for_streaming_and_repair", liveness::LiveUpdate, value_status::Used, false,
"If the compacting reader is enabled for streaming and repair (see enable_compacting_data_for_streaming_and_repair), allow it to garbage-collect tombstones."

View File

@@ -396,7 +396,8 @@ async def test_mv_first_replica_in_dc(manager: ManagerClient, delayed_replica: s
@pytest.mark.parametrize("migration_type", ["tablets_internode", "tablets_intranode", "vnodes"])
@pytest.mark.skip_mode(mode='release', reason='error injections are not supported in release mode')
async def test_mv_write_during_migration(manager: ManagerClient, migration_type: str):
cmdline = ['--smp', '2', '--logger-log-level', 'raft_topology=debug']
# RF=1 and fast boot options with streaming don't play well together, so force RBNO for bootstrap
cmdline = ['--smp', '2', '--logger-log-level', 'raft_topology=debug', "--allowed-repair-based-node-ops", "replace,removenode,rebuild,bootstrap,decommission"]
servers = await manager.servers_add(3, cmdline=cmdline)
cql = manager.get_cql()

View File

@@ -30,7 +30,7 @@ async def test_different_group0_ids(manager: ManagerClient):
"""
# Consistent topology changes are disabled to use repair based node operations.
cfg = {'force_gossip_topology_changes': True, 'tablets_mode_for_new_keyspaces': 'disabled'}
cfg = {'force_gossip_topology_changes': True, 'tablets_mode_for_new_keyspaces': 'disabled', 'allowed_repair_based_node_ops': 'bootstrap,decommission,replace,removenode,rebuild'}
scylla_a = await manager.server_add(config = cfg)
scylla_b = await manager.server_add(start=False, config = cfg)
await manager.server_start(scylla_b.server_id, seeds=[scylla_b.ip_addr])