view_builder: fix loop in view builder when tokens are moved

The view builder builds a view by going over the entire token ring,
consuming the base table partitions, and generating view updates for
each partition.

A view is considered as built when we complete a full cycle of the
token ring. Suppose we start to build a view at a token F. We will
consume all partitions with tokens starting at F until the maximum
token, then go back to the minimum token and consume all partitions
until F, and then we detect that we pass F and complete building the
view. This happens in the view builder consumer in
`check_for_built_views`.

The problem is that we check if we pass the first token F with the
condition `_step.current_token() >= it->first_token` whenever we consume
a new partition or the current_token goes back to the minimum token.
But suppose that we don't have any partitions with a token greater than
or equal to the first token (this could happen if the partition with
token F was moved to another node for example), then this condition will never be
satisfied, and we don't detect correctly when we pass F. Instead, we
go back to the minimum token, building the same token ranges again,
in a possibly infinite loop.

To fix this we add another step when reaching the end of the reader's
stream. When this happens it means we don't have any more fragments to
consume until the end of the range, so we advance the current_token to
the end of the range, simulating a partition, and check for built views
in that range.

Fixes scylladb/scylladb#21829

Closes scylladb/scylladb#22493
This commit is contained in:
Michael Litvak
2025-01-26 12:38:37 +02:00
committed by Avi Kivity
parent 439862a8d4
commit 6d34125eb7
2 changed files with 67 additions and 1 deletions

View File

@@ -3151,6 +3151,12 @@ public:
_step.build_status.pop_back();
}
}
// before going back to the minimum token, advance current_key to the end
// and check for built views in that range.
_step.current_key = {_step.prange.end().value_or(dht::ring_position::max()).value().token(), partition_key::make_empty()};
check_for_built_views();
_step.current_key = {dht::minimum_token(), partition_key::make_empty()};
for (auto&& vs : _step.build_status) {
vs.next_token = dht::minimum_token();

View File

@@ -5,9 +5,13 @@
#
import asyncio
import pytest
import logging
from test.pylib.manager_client import ManagerClient
from test.pylib.tablets import get_tablet_replica
from test.pylib.util import unique_name, wait_for_view
logger = logging.getLogger(__name__)
from test.pylib.util import wait_for_view
# This test makes sure that view building is done mainly in the streaming scheduling group
# and not the gossip scheduling group. We do that by measuring the time each group was
@@ -54,3 +58,59 @@ async def test_start_scylla_with_view_building_disabled(manager: ManagerClient):
log = await manager.server_open_log(server.server_id)
res = await log.grep(r"ERROR.*\[shard [0-9]+:[a-z]+\]")
assert len(res) == 0
# Build multiple views of one base table, and while view building is running move
# some of the base tablets to another node. Verify the view build is completed.
# More specifically, we move all tablets except the first one to reproduce issue #21829.
# The issue happens when we start building a view at a token F and then all partitions
# with tokens >=F are moved, and it causes the view builder to enter an infinite loop
# building the same token ranges repeatedly because it doesn't reach F.
@pytest.mark.asyncio
async def test_view_building_with_tablet_move(manager: ManagerClient, build_mode: str):
servers = [await manager.server_add()]
await manager.api.disable_tablet_balancing(servers[0].ip_addr)
ks = unique_name()
table = 'test'
view_count = 4
views = [f"{table}_view_{i}" for i in range(view_count)]
cql = manager.get_cql()
await cql.run_async(f"CREATE KEYSPACE {ks} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': 4}}")
await cql.run_async(f"CREATE TABLE {ks}.{table} (pk int PRIMARY KEY, c int)")
# prefill the base table with enough rows so that view building takes some time
# and runs during the tablet move
keys = 200000 if build_mode != 'debug' else 10000
batch_size = 50
for k in range(0, keys, batch_size):
inserts = [f"INSERT INTO {ks}.{table}(pk, c) VALUES ({i}, {i})" for i in range(k, k+batch_size)]
batch = "BEGIN UNLOGGED BATCH\n" + "\n".join(inserts) + "\nAPPLY BATCH\n"
await manager.cql.run_async(batch)
logger.info("Adding new server")
servers.append(await manager.server_add())
# create some views so they are built together but starting at different tokens
for view in views:
await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.{view} AS SELECT * FROM {ks}.{table} WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk)")
await asyncio.sleep(1)
s0_host_id = await manager.get_host_id(servers[0].server_id)
s1_host_id = await manager.get_host_id(servers[1].server_id)
dst_shard = 0
# move all tablets except the first one (with lowest token range) to the other node.
table_id = await manager.get_table_id(ks, table)
rows = await manager.cql.run_async(f"SELECT last_token FROM system.tablets where table_id = {table_id}")
move_tablets_tasks = []
for r in rows[1:]:
tablet_token = r.last_token
replica = await get_tablet_replica(manager, servers[0], ks, table, tablet_token)
move_tablets_tasks.append(asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, table, replica[0], replica[1], s1_host_id, dst_shard, tablet_token)))
await asyncio.gather(*move_tablets_tasks)
for view in views:
await wait_for_view(cql, view, len(servers))