view_builder: fix loop in view builder when tokens are moved
The view builder builds a view by going over the entire token ring, consuming the base table partitions, and generating view updates for each partition. A view is considered as built when we complete a full cycle of the token ring. Suppose we start to build a view at a token F. We will consume all partitions with tokens starting at F until the maximum token, then go back to the minimum token and consume all partitions until F, and then we detect that we pass F and complete building the view. This happens in the view builder consumer in `check_for_built_views`. The problem is that we check if we pass the first token F with the condition `_step.current_token() >= it->first_token` whenever we consume a new partition or the current_token goes back to the minimum token. But suppose that we don't have any partitions with a token greater than or equal to the first token (this could happen if the partition with token F was moved to another node for example), then this condition will never be satisfied, and we don't detect correctly when we pass F. Instead, we go back to the minimum token, building the same token ranges again, in a possibly infinite loop. To fix this we add another step when reaching the end of the reader's stream. When this happens it means we don't have any more fragments to consume until the end of the range, so we advance the current_token to the end of the range, simulating a partition, and check for built views in that range. Fixes scylladb/scylladb#21829 Closes scylladb/scylladb#22493
This commit is contained in:
committed by
Avi Kivity
parent
439862a8d4
commit
6d34125eb7
@@ -3151,6 +3151,12 @@ public:
|
||||
_step.build_status.pop_back();
|
||||
}
|
||||
}
|
||||
|
||||
// before going back to the minimum token, advance current_key to the end
|
||||
// and check for built views in that range.
|
||||
_step.current_key = {_step.prange.end().value_or(dht::ring_position::max()).value().token(), partition_key::make_empty()};
|
||||
check_for_built_views();
|
||||
|
||||
_step.current_key = {dht::minimum_token(), partition_key::make_empty()};
|
||||
for (auto&& vs : _step.build_status) {
|
||||
vs.next_token = dht::minimum_token();
|
||||
|
||||
@@ -5,9 +5,13 @@
|
||||
#
|
||||
import asyncio
|
||||
import pytest
|
||||
import logging
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.pylib.tablets import get_tablet_replica
|
||||
from test.pylib.util import unique_name, wait_for_view
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from test.pylib.util import wait_for_view
|
||||
|
||||
# This test makes sure that view building is done mainly in the streaming scheduling group
|
||||
# and not the gossip scheduling group. We do that by measuring the time each group was
|
||||
@@ -54,3 +58,59 @@ async def test_start_scylla_with_view_building_disabled(manager: ManagerClient):
|
||||
log = await manager.server_open_log(server.server_id)
|
||||
res = await log.grep(r"ERROR.*\[shard [0-9]+:[a-z]+\]")
|
||||
assert len(res) == 0
|
||||
|
||||
# Build multiple views of one base table, and while view building is running move
|
||||
# some of the base tablets to another node. Verify the view build is completed.
|
||||
# More specifically, we move all tablets except the first one to reproduce issue #21829.
|
||||
# The issue happens when we start building a view at a token F and then all partitions
|
||||
# with tokens >=F are moved, and it causes the view builder to enter an infinite loop
|
||||
# building the same token ranges repeatedly because it doesn't reach F.
|
||||
@pytest.mark.asyncio
|
||||
async def test_view_building_with_tablet_move(manager: ManagerClient, build_mode: str):
|
||||
servers = [await manager.server_add()]
|
||||
|
||||
await manager.api.disable_tablet_balancing(servers[0].ip_addr)
|
||||
|
||||
ks = unique_name()
|
||||
table = 'test'
|
||||
|
||||
view_count = 4
|
||||
views = [f"{table}_view_{i}" for i in range(view_count)]
|
||||
|
||||
cql = manager.get_cql()
|
||||
await cql.run_async(f"CREATE KEYSPACE {ks} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': 4}}")
|
||||
await cql.run_async(f"CREATE TABLE {ks}.{table} (pk int PRIMARY KEY, c int)")
|
||||
|
||||
# prefill the base table with enough rows so that view building takes some time
|
||||
# and runs during the tablet move
|
||||
keys = 200000 if build_mode != 'debug' else 10000
|
||||
batch_size = 50
|
||||
for k in range(0, keys, batch_size):
|
||||
inserts = [f"INSERT INTO {ks}.{table}(pk, c) VALUES ({i}, {i})" for i in range(k, k+batch_size)]
|
||||
batch = "BEGIN UNLOGGED BATCH\n" + "\n".join(inserts) + "\nAPPLY BATCH\n"
|
||||
await manager.cql.run_async(batch)
|
||||
|
||||
logger.info("Adding new server")
|
||||
servers.append(await manager.server_add())
|
||||
|
||||
# create some views so they are built together but starting at different tokens
|
||||
for view in views:
|
||||
await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.{view} AS SELECT * FROM {ks}.{table} WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk)")
|
||||
await asyncio.sleep(1)
|
||||
|
||||
s0_host_id = await manager.get_host_id(servers[0].server_id)
|
||||
s1_host_id = await manager.get_host_id(servers[1].server_id)
|
||||
dst_shard = 0
|
||||
|
||||
# move all tablets except the first one (with lowest token range) to the other node.
|
||||
table_id = await manager.get_table_id(ks, table)
|
||||
rows = await manager.cql.run_async(f"SELECT last_token FROM system.tablets where table_id = {table_id}")
|
||||
move_tablets_tasks = []
|
||||
for r in rows[1:]:
|
||||
tablet_token = r.last_token
|
||||
replica = await get_tablet_replica(manager, servers[0], ks, table, tablet_token)
|
||||
move_tablets_tasks.append(asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, table, replica[0], replica[1], s1_host_id, dst_shard, tablet_token)))
|
||||
await asyncio.gather(*move_tablets_tasks)
|
||||
|
||||
for view in views:
|
||||
await wait_for_view(cql, view, len(servers))
|
||||
|
||||
Reference in New Issue
Block a user