mv: don't mark the view as built if the reader produced no partitions

When we build a materialized view we read the entire base table from start to
end to generate all required view udpates. If a view is created while another view
is being built on the same base table, this is optimized - we start generating
view udpates for the new view from the base table rows that we're currently
reading, and we read the missed initial range again after the previous view
finishes building.
The view building progress is only updated after generating view updates for
some read partitions. However, there are scenarios where we'll generate no
view updates for the entire read range. If this was not handled we could
end up in an infinite view building loop like we did in https://github.com/scylladb/scylladb/issues/17293
To handle this, we mark the view as built if the reader generated no partitions.
However, this is not always the correct conclusion. Another scenario where
the reader won't encounter any partitions is when view building is interrupted,
and then we perform a reshard. In this scenario, we set the reader for all
shards to the last unbuilt token for an existing partition before the reshard.
However, this partition may not exist on a shard after reshard, and if there
are also no partitions with higher tokens, the reader will generate no partitions
even though it hasn't finished view building.
Additionally, we already have a check that prevents infinite view building loops
without taking the partitions generated by the reader into account. At the end
of stream, before looping back to the start, we advance current_key to the end
of the built range and check for built views in that range. This handles the case
where the entire range is empty - the conditions for a built view are:
1. the "next_token" is no greater than "first_token" (the view building process
looped back, so we've built all tokens above "first_token")
2. the "current_token" is no less than "first_token" (after looping back, we've
built all tokens below "first_token")

If the range is empty, we'll pass these conditions on an empty range after advancing
"current_key" to the end because:
1. after looping back, "next_token" will be set to `dht::minimum_token`
2. "current_key" will be set to `dht::ring_position::max()`

In this patch we remove the check for partitions generated by the reader. This fixes
the issue with resharding and it does not resurrect the issue with infinite view building
that the check was introduced for.

Fixes https://github.com/scylladb/scylladb/issues/26523

Closes scylladb/scylladb#26635

(cherry picked from commit 0a22ac3c9e)

Closes scylladb/scylladb#26889
This commit is contained in:
Wojciech Mitros
2025-10-21 10:11:16 +02:00
committed by Botond Dénes
parent 1288adb3a6
commit 8e9ab7618e
2 changed files with 45 additions and 9 deletions

View File

@@ -3306,15 +3306,6 @@ public:
_step.base->schema()->cf_name(), _step.current_token(), view_names);
}
if (_step.reader.is_end_of_stream() && _step.reader.is_buffer_empty()) {
if (_step.current_key.key().is_empty()) {
// consumer got end-of-stream without consuming a single partition
vlogger.debug("Reader didn't produce anything, marking views as built");
while (!_step.build_status.empty()) {
_built_views.views.push_back(std::move(_step.build_status.back()));
_step.build_status.pop_back();
}
}
// before going back to the minimum token, advance current_key to the end
// and check for built views in that range.
_step.current_key = { _step.prange.end().value_or(dht::ring_position::max()).value().token(), partition_key::make_empty()};
@@ -3333,6 +3324,7 @@ public:
// Called in the context of a seastar::thread.
void view_builder::execute(build_step& step, exponential_backoff_retry r) {
inject_failure("dont_start_build_step");
gc_clock::time_point now = gc_clock::now();
auto compaction_state = make_lw_shared<compact_for_query_state>(
*step.reader.schema(),
@@ -3366,6 +3358,7 @@ void view_builder::execute(build_step& step, exponential_backoff_retry r) {
seastar::when_all_succeed(bookkeeping_ops.begin(), bookkeeping_ops.end()).handle_exception([] (std::exception_ptr ep) {
vlogger.warn("Failed to update materialized view bookkeeping ({}), continuing anyway.", ep);
}).get();
utils::get_local_injector().inject("delay_finishing_build_step", utils::wait_for_message(60s)).get();
}
future<> view_builder::mark_as_built(view_ptr view) {

View File

@@ -6,6 +6,7 @@
import asyncio
import pytest
import logging
import random
import time
from test.pylib.manager_client import ManagerClient, wait_for_cql_and_get_hosts
from test.pylib.tablets import get_tablet_replica
@@ -193,3 +194,45 @@ async def test_interrupt_view_build_shard_registration(manager: ManagerClient):
assert len(res) == n_partitions
res = await cql.run_async(f"SELECT * FROM {ks}.mv")
assert len(res) == n_partitions
# The test verifies that when a reshard happens when building multiple views,
# which have different progress, we won't mistakenly decide that a view is built
# even if a build step is empty due to resharding.
# Reproduces https://github.com/scylladb/scylladb/issues/26523
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_empty_build_step_after_reshard(manager: ManagerClient):
server = await manager.server_add(cmdline=['--smp', '1', '--logger-log-level', 'view=debug'])
partitions = random.sample(range(1000), 129) # need more than 128 to allow the first build step to finish and save the progress
logger.info(f"Using partitions: {partitions}")
cql = manager.get_cql()
await cql.run_async(f"CREATE KEYSPACE ks WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets={{'enabled':false}}")
await cql.run_async(f"CREATE TABLE ks.test (p int, c int, PRIMARY KEY(p,c));")
await asyncio.gather(*[cql.run_async(f"INSERT INTO ks.test (p, c) VALUES ({k}, {k+1});") for k in partitions])
# Create first materialized view and wait until building starts. The base table has enough partitions for 2 build steps.
# Allow the first build step to finish and save progress. In the second step there's only one partition left to build, which will land only on one
# of the shards after resharding.
await manager.api.enable_injection(server.ip_addr, "delay_finishing_build_step", one_shot=False)
await cql.run_async(f"CREATE MATERIALIZED VIEW ks.mv AS SELECT p, c FROM ks.test WHERE p IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, p)")
async def progress_saved():
rows = await cql.run_async(f"SELECT * FROM system.scylla_views_builds_in_progress WHERE keyspace_name = 'ks' AND view_name = 'mv'")
return len(rows) > 0 or None
await wait_for(progress_saved, time.time() + 60)
await manager.api.enable_injection(server.ip_addr, "dont_start_build_step", one_shot=False)
await manager.api.message_injection(server.ip_addr, "delay_finishing_build_step")
# Create second materialized view and immediately restart the server to cause resharding. The new view will effectively start building after the restart.
await cql.run_async(f"CREATE MATERIALIZED VIEW ks.mv2 AS SELECT p, c FROM ks.test WHERE p IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, p)")
await manager.server_stop_gracefully(server.server_id)
await manager.server_start(server.server_id, cmdline_options_override=['--smp', '2', '--logger-log-level', 'view=debug'])
cql = await reconnect_driver(manager)
await wait_for_cql_and_get_hosts(cql, [server], time.time() + 60)
await wait_for_view(cql, 'mv', 1)
await wait_for_view(cql, 'mv2', 1)
# Verify that no rows are missing
base_rows = await cql.run_async(f"SELECT * FROM ks.test")
mv_rows = await cql.run_async(f"SELECT * FROM ks.mv")
mv2_rows = await cql.run_async(f"SELECT * FROM ks.mv2")
assert len(base_rows) == len(mv_rows) == len(mv2_rows) == 129