Compare commits

..

2 Commits

Author SHA1 Message Date
copilot-swe-agent[bot]
c06760cf15 Fix multiple issues in test_out_of_space_prevention.py
- Fix variable name error: host[0] → hosts[0] on line 98
- Add missing await keywords for async operations on lines 209 and 385
- Rename class random_content_file to RandomContentFile (PascalCase)
- Fix function name typo: test_autotoogle_compaction → test_autotoggle_compaction

Co-authored-by: mykaul <4655593+mykaul@users.noreply.github.com>
2025-12-23 09:25:16 +00:00
copilot-swe-agent[bot]
c684456eba Initial plan 2025-12-23 09:21:06 +00:00
202 changed files with 1621 additions and 6304 deletions

View File

@@ -1,13 +0,0 @@
name: validate_pr_author_email
on:
pull_request_target:
types:
- opened
- synchronize
- reopened
jobs:
validate_pr_author_email:
uses: scylladb/github-automation/.github/workflows/validate_pr_author_email.yml@main

View File

@@ -13,5 +13,5 @@ jobs:
- uses: codespell-project/actions-codespell@master
with:
only_warn: 1
ignore_words_list: "ans,datas,fo,ser,ue,crate,nd,reenable,strat,stap,te,raison,iif,tread"
ignore_words_list: "ans,datas,fo,ser,ue,crate,nd,reenable,strat,stap,te,raison"
skip: "./.git,./build,./tools,*.js,*.lock,./test,./licenses,./redis/lolwut.cc,*.svg"

View File

@@ -28,7 +28,6 @@ static logging::logger logger("alternator_controller");
controller::controller(
sharded<gms::gossiper>& gossiper,
sharded<service::storage_proxy>& proxy,
sharded<service::storage_service>& ss,
sharded<service::migration_manager>& mm,
sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<cdc::generation_service>& cdc_gen_svc,
@@ -40,7 +39,6 @@ controller::controller(
: protocol_server(sg)
, _gossiper(gossiper)
, _proxy(proxy)
, _ss(ss)
, _mm(mm)
, _sys_dist_ks(sys_dist_ks)
, _cdc_gen_svc(cdc_gen_svc)
@@ -91,7 +89,7 @@ future<> controller::start_server() {
auto get_timeout_in_ms = [] (const db::config& cfg) -> utils::updateable_value<uint32_t> {
return cfg.alternator_timeout_in_ms;
};
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_ss), std::ref(_mm), std::ref(_sys_dist_ks),
_executor.start(std::ref(_gossiper), std::ref(_proxy), std::ref(_mm), std::ref(_sys_dist_ks),
sharded_parameter(get_cdc_metadata, std::ref(_cdc_gen_svc)), _ssg.value(),
sharded_parameter(get_timeout_in_ms, std::ref(_config))).get();
_server.start(std::ref(_executor), std::ref(_proxy), std::ref(_gossiper), std::ref(_auth_service), std::ref(_sl_controller)).get();

View File

@@ -15,7 +15,6 @@
namespace service {
class storage_proxy;
class storage_service;
class migration_manager;
class memory_limiter;
}
@@ -58,7 +57,6 @@ class server;
class controller : public protocol_server {
sharded<gms::gossiper>& _gossiper;
sharded<service::storage_proxy>& _proxy;
sharded<service::storage_service>& _ss;
sharded<service::migration_manager>& _mm;
sharded<db::system_distributed_keyspace>& _sys_dist_ks;
sharded<cdc::generation_service>& _cdc_gen_svc;
@@ -76,7 +74,6 @@ public:
controller(
sharded<gms::gossiper>& gossiper,
sharded<service::storage_proxy>& proxy,
sharded<service::storage_service>& ss,
sharded<service::migration_manager>& mm,
sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<cdc::generation_service>& cdc_gen_svc,

View File

@@ -67,14 +67,6 @@ using namespace std::chrono_literals;
logging::logger elogger("alternator-executor");
namespace std {
// Hash support for a pair of sstrings, so that such a pair can be used as
// the key of an unordered container. The two component hashes are scaled by
// different constants before being summed, so swapping the two strings
// generally yields a different hash value.
template <> struct hash<std::pair<sstring, sstring>> {
    size_t operator () (const std::pair<sstring, sstring>& p) const {
        const std::hash<sstring> hash_one{};
        const size_t first_term = hash_one(p.first) * 1009;
        const size_t second_term = hash_one(p.second) * 3;
        return first_term + second_term;
    }
};
}
namespace alternator {
// Alternator-specific table properties stored as hidden table tags:
@@ -256,66 +248,14 @@ static const rjson::value::Member& get_single_member(const rjson::value& v, cons
return *(v.MemberBegin());
}
// Per-executor cache of table sizes, keyed by (keyspace name, table name).
//
// The constructor registers this object as a listener with the database
// notifier, and stop() unregisters it again. stop() must have completed
// before the object is destroyed: the destructor reports a fatal internal
// error if the object is still marked active.
class executor::describe_table_info_manager : public service::migration_listener::empty_listener {
executor &_executor;
// Cached information about a single table.
struct table_info {
// Cached size value; expiry handling is delegated to simple_value_with_expiry.
utils::simple_value_with_expiry<std::uint64_t> size_in_bytes;
};
// Cache entries, keyed by (keyspace name, table name).
std::unordered_map<std::pair<sstring, sstring>, table_info> info_for_tables;
// True from the end of the constructor until stop() has unregistered the listener.
bool active = false;
public:
// Registers this object with the database notifier and marks it active.
describe_table_info_manager(executor& executor) : _executor(executor) {
_executor._proxy.data_dictionary().real_database_ptr()->get_notifier().register_listener(this);
active = true;
}
// Not copyable or movable: the notifier holds the `this` pointer passed to register_listener().
describe_table_info_manager(const describe_table_info_manager &) = delete;
describe_table_info_manager(describe_table_info_manager&&) = delete;
// Destroying the object without a prior stop() is treated as a fatal internal error.
~describe_table_info_manager() {
if (active) {
on_fatal_internal_error(elogger, "describe_table_info_manager was not stopped before destruction");
}
}
describe_table_info_manager &operator = (const describe_table_info_manager &) = delete;
describe_table_info_manager &operator = (describe_table_info_manager&&) = delete;
// Current time on the clock that cache expiry time points are expressed in.
static std::chrono::high_resolution_clock::time_point now() {
return std::chrono::high_resolution_clock::now();
}
// Returns what the cache entry for (ks_name, cf_name) yields, or std::nullopt
// when there is no entry for that table.
// NOTE(review): presumably size_in_bytes.get() yields an empty optional once
// the stored value has expired - confirm against simple_value_with_expiry.
std::optional<std::uint64_t> get_cached_size_in_bytes(const sstring &ks_name, const sstring &cf_name) const {
auto it = info_for_tables.find({ks_name, cf_name});
if (it != info_for_tables.end()) {
return it->second.size_in_bytes.get();
}
return std::nullopt;
}
// Offers a new size and expiry for (ks_name, cf_name); the map entry is
// created if it does not exist yet.
// NOTE(review): judging by its name, set_if_longer_expiry() keeps the existing
// value unless the new expiry is later - confirm against simple_value_with_expiry.
void cache_size_in_bytes(sstring ks_name, sstring cf_name, std::uint64_t size_in_bytes, std::chrono::high_resolution_clock::time_point expiry) {
info_for_tables[{std::move(ks_name), std::move(cf_name)}].size_in_bytes.set_if_longer_expiry(size_in_bytes, expiry);
}
// Unregisters this object from the database notifier and then marks it
// inactive, which makes destruction legal.
future<> stop() {
co_await _executor._proxy.data_dictionary().real_database_ptr()->get_notifier().unregister_listener(this);
active = false;
co_return;
}
// Listener callback: drops the cache entry of a dropped table. Keyspaces whose
// name does not start with executor::KEYSPACE_NAME_PREFIX are ignored.
void on_drop_column_family(const sstring& ks_name, const sstring& cf_name) override {
if (!ks_name.starts_with(executor::KEYSPACE_NAME_PREFIX)) return;
info_for_tables.erase({ks_name, cf_name});
}
};
executor::executor(gms::gossiper& gossiper,
service::storage_proxy& proxy,
service::storage_service& ss,
service::migration_manager& mm,
db::system_distributed_keyspace& sdks,
cdc::metadata& cdc_metadata,
smp_service_group ssg,
utils::updateable_value<uint32_t> default_timeout_in_ms)
: _gossiper(gossiper),
_ss(ss),
_proxy(proxy),
_mm(mm),
_sdks(sdks),
@@ -328,7 +268,6 @@ executor::executor(gms::gossiper& gossiper,
_stats))
{
s_default_timeout_in_ms = std::move(default_timeout_in_ms);
_describe_table_info_manager = std::make_unique<describe_table_info_manager>(*this);
register_metrics(_metrics, _stats);
}
@@ -813,44 +752,12 @@ static future<bool> is_view_built(
}
// Stores a freshly computed size for the given table in the
// describe_table_info_manager of every executor instance.
//
// The expiry time point is computed once, here, as now() + ttl, so every
// instance is offered the same (size, expiry) pair.
future<> executor::cache_newly_calculated_size_on_all_shards(schema_ptr schema, std::uint64_t size_in_bytes, std::chrono::nanoseconds ttl) {
    auto expires_at = describe_table_info_manager::now() + ttl;
    auto store_in_local_cache = [schema, size_in_bytes, expires_at] (executor& local_executor) {
        local_executor._describe_table_info_manager->cache_size_in_bytes(schema->ks_name(), schema->cf_name(), size_in_bytes, expires_at);
    };
    return container().invoke_on_all(std::move(store_in_local_cache));
}
// Adds the "TableSizeBytes" member to table_description.
//
// The value is taken from the local size cache when an entry is available.
// Otherwise, unless the table is being deleted, the size is estimated through
// the storage service and the estimate is pushed to the caches of all
// executor instances. For a table that is being deleted and has no cached
// size, 0 is reported.
future<> executor::fill_table_size(rjson::value &table_description, schema_ptr schema, bool deleting) {
    std::uint64_t size_to_report = 0;
    const auto known_size = _describe_table_info_manager->get_cached_size_in_bytes(schema->ks_name(), schema->cf_name());
    if (known_size) {
        size_to_report = *known_size;
    } else if (!deleting) {
        // Estimating the size of a table that is being deleted is pointless:
        // other nodes will more often than not proceed with the deletion
        // faster than the size can be calculated.
        size_to_report = co_await _ss.estimate_total_sstable_volume(schema->id(), service::storage_service::ignore_errors::yes);
        const auto cache_validity = std::chrono::seconds{ _proxy.data_dictionary().get_config().alternator_describe_table_info_cache_validity_in_seconds() };
        // A race is possible: a following DescribeTable may reach another
        // instance before it has received this size, and that instance will
        // then calculate the size again. This is harmless. Each calculation is
        // broadcast together with its expiry, and the caches keep the value
        // whose expiry lies further in the future. Should two broadcasts carry
        // exactly the same expiry, instances may end up with different sizes
        // and DescribeTable may answer differently depending on which one
        // serves it - also fine, since the specification gives no precision
        // guarantees of any kind.
        co_await cache_newly_calculated_size_on_all_shards(schema, size_to_report, cache_validity);
    }
    rjson::add(table_description, "TableSizeBytes", size_to_report);
}
future<rjson::value> executor::fill_table_description(schema_ptr schema, table_status tbl_status, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit)
static future<rjson::value> fill_table_description(schema_ptr schema, table_status tbl_status, service::storage_proxy& proxy, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit)
{
rjson::value table_description = rjson::empty_object();
auto tags_ptr = db::get_tags_of_table(schema);
rjson::add(table_description, "TableName", rjson::from_string(schema->cf_name()));
co_await fill_table_size(table_description, schema, tbl_status == table_status::deleting);
auto creation_timestamp = get_table_creation_time(*schema);
@@ -894,7 +801,9 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
rjson::add(table_description["ProvisionedThroughput"], "WriteCapacityUnits", wcu);
rjson::add(table_description["ProvisionedThroughput"], "NumberOfDecreasesToday", 0);
data_dictionary::table t = _proxy.data_dictionary().find_column_family(schema);
data_dictionary::table t = proxy.data_dictionary().find_column_family(schema);
if (tbl_status != table_status::deleting) {
rjson::add(table_description, "CreationDateTime", rjson::value(creation_timestamp));
@@ -931,7 +840,7 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
// (for a built view) or CREATING+Backfilling (if view building
// is in progress).
if (!is_lsi) {
if (co_await is_view_built(vptr, _proxy, client_state, trace_state, permit)) {
if (co_await is_view_built(vptr, proxy, client_state, trace_state, permit)) {
rjson::add(view_entry, "IndexStatus", "ACTIVE");
} else {
rjson::add(view_entry, "IndexStatus", "CREATING");
@@ -959,8 +868,9 @@ future<rjson::value> executor::fill_table_description(schema_ptr schema, table_s
}
rjson::add(table_description, "AttributeDefinitions", std::move(attribute_definitions));
}
executor::supplement_table_stream_info(table_description, *schema, _proxy);
executor::supplement_table_stream_info(table_description, *schema, proxy);
// FIXME: still missing some response fields (issue #5026)
co_return table_description;
}
@@ -980,7 +890,7 @@ future<executor::request_return_type> executor::describe_table(client_state& cli
get_stats_from_schema(_proxy, *schema)->api_operations.describe_table++;
tracing::add_alternator_table_name(trace_state, schema->cf_name());
rjson::value table_description = co_await fill_table_description(schema, table_status::active, client_state, trace_state, permit);
rjson::value table_description = co_await fill_table_description(schema, table_status::active, _proxy, client_state, trace_state, permit);
rjson::value response = rjson::empty_object();
rjson::add(response, "Table", std::move(table_description));
elogger.trace("returning {}", response);
@@ -1083,7 +993,7 @@ future<executor::request_return_type> executor::delete_table(client_state& clien
auto& p = _proxy.container();
schema_ptr schema = get_table(_proxy, request);
rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, client_state, trace_state, permit);
rjson::value table_description = co_await fill_table_description(schema, table_status::deleting, _proxy, client_state, trace_state, permit);
co_await verify_permission(_enforce_authorization, _warn_authorization, client_state, schema, auth::permission::DROP, _stats);
co_await _mm.container().invoke_on(0, [&, cs = client_state.move_to_other_shard()] (service::migration_manager& mm) -> future<> {
size_t retries = mm.get_concurrent_ddl_retries();
@@ -1647,7 +1557,8 @@ static future<> mark_view_schemas_as_built(utils::chunked_vector<mutation>& out,
}
}
future<executor::request_return_type> executor::create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode) {
static future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request,
service::storage_proxy& sp, service::migration_manager& mm, gms::gossiper& gossiper, bool enforce_authorization, bool warn_authorization, stats& stats, const db::tablets_mode_t::mode tablets_mode) {
SCYLLA_ASSERT(this_shard_id() == 0);
// We begin by parsing and validating the content of the CreateTable
@@ -1834,7 +1745,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
rjson::value* stream_specification = rjson::find(request, "StreamSpecification");
if (stream_specification && stream_specification->IsObject()) {
if (executor::add_stream_options(*stream_specification, builder, _proxy)) {
if (executor::add_stream_options(*stream_specification, builder, sp)) {
validate_cdc_log_name_length(builder.cf_name());
}
}
@@ -1853,7 +1764,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
set_table_creation_time(tags_map, db_clock::now());
builder.add_extension(db::tags_extension::NAME, ::make_shared<db::tags_extension>(tags_map));
co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, _stats);
co_await verify_create_permission(enforce_authorization, warn_authorization, client_state, stats);
schema_ptr schema = builder.build();
for (auto& view_builder : view_builders) {
@@ -1869,18 +1780,18 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
view_builder.with_view_info(schema, include_all_columns, ""/*where clause*/);
}
size_t retries = _mm.get_concurrent_ddl_retries();
size_t retries = mm.get_concurrent_ddl_retries();
for (;;) {
auto group0_guard = co_await _mm.start_group0_operation();
auto group0_guard = co_await mm.start_group0_operation();
auto ts = group0_guard.write_timestamp();
utils::chunked_vector<mutation> schema_mutations;
auto ksm = create_keyspace_metadata(keyspace_name, _proxy, _gossiper, ts, tags_map, _proxy.features(), tablets_mode);
auto ksm = create_keyspace_metadata(keyspace_name, sp, gossiper, ts, tags_map, sp.features(), tablets_mode);
// Alternator Streams doesn't yet work when the table uses tablets (#23838)
if (stream_specification && stream_specification->IsObject()) {
auto stream_enabled = rjson::find(*stream_specification, "StreamEnabled");
if (stream_enabled && stream_enabled->IsBool() && stream_enabled->GetBool()) {
locator::replication_strategy_params params(ksm->strategy_options(), ksm->initial_tablets(), ksm->consistency_option());
const auto& topo = _proxy.local_db().get_token_metadata().get_topology();
const auto& topo = sp.local_db().get_token_metadata().get_topology();
auto rs = locator::abstract_replication_strategy::create_replication_strategy(ksm->strategy_name(), params, topo);
if (rs->uses_tablets()) {
co_return api_error::validation("Streams not yet supported on a table using tablets (issue #23838). "
@@ -1890,17 +1801,17 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
}
// Creating an index in tablets mode requires the rf_rack_valid_keyspaces option to be enabled.
// GSI and LSI indexes are based on materialized views which require this option to avoid consistency issues.
if (!view_builders.empty() && ksm->uses_tablets() && !_proxy.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
if (!view_builders.empty() && ksm->uses_tablets() && !sp.data_dictionary().get_config().rf_rack_valid_keyspaces()) {
co_return api_error::validation("GlobalSecondaryIndexes and LocalSecondaryIndexes with tablets require the rf_rack_valid_keyspaces option to be enabled.");
}
try {
schema_mutations = service::prepare_new_keyspace_announcement(_proxy.local_db(), ksm, ts);
schema_mutations = service::prepare_new_keyspace_announcement(sp.local_db(), ksm, ts);
} catch (exceptions::already_exists_exception&) {
if (_proxy.data_dictionary().has_schema(keyspace_name, table_name)) {
if (sp.data_dictionary().has_schema(keyspace_name, table_name)) {
co_return api_error::resource_in_use(fmt::format("Table {} already exists", table_name));
}
}
if (_proxy.data_dictionary().try_find_table(schema->id())) {
if (sp.data_dictionary().try_find_table(schema->id())) {
// This should never happen, the ID is supposed to be unique
co_return api_error::internal(format("Table with ID {} already exists", schema->id()));
}
@@ -1909,9 +1820,9 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
for (schema_builder& view_builder : view_builders) {
schemas.push_back(view_builder.build());
}
co_await service::prepare_new_column_families_announcement(schema_mutations, _proxy, *ksm, schemas, ts);
co_await service::prepare_new_column_families_announcement(schema_mutations, sp, *ksm, schemas, ts);
if (ksm->uses_tablets()) {
co_await mark_view_schemas_as_built(schema_mutations, schemas, ts, _proxy);
co_await mark_view_schemas_as_built(schema_mutations, schemas, ts, sp);
}
// If a role is allowed to create a table, we must give it permissions to
@@ -1936,7 +1847,7 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
}
std::tie(schema_mutations, group0_guard) = co_await std::move(mc).extract();
try {
co_await _mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));
co_await mm.announce(std::move(schema_mutations), std::move(group0_guard), fmt::format("alternator-executor: create {} table", table_name));
break;
} catch (const service::group0_concurrent_modification& ex) {
elogger.info("Failed to execute CreateTable {} due to concurrent schema modifications. {}.",
@@ -1948,9 +1859,9 @@ future<executor::request_return_type> executor::create_table_on_shard0(service::
}
}
co_await _mm.wait_for_schema_agreement(_proxy.local_db(), db::timeout_clock::now() + 10s, nullptr);
co_await mm.wait_for_schema_agreement(sp.local_db(), db::timeout_clock::now() + 10s, nullptr);
rjson::value status = rjson::empty_object();
executor::supplement_table_info(request, *schema, _proxy);
executor::supplement_table_info(request, *schema, sp);
rjson::add(status, "TableDescription", std::move(request));
co_return rjson::print(std::move(status));
}
@@ -1959,11 +1870,10 @@ future<executor::request_return_type> executor::create_table(client_state& clien
_stats.api_operations.create_table++;
elogger.trace("Creating table {}", request);
co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
co_return co_await _mm.container().invoke_on(0, [&, tr = tracing::global_trace_state_ptr(trace_state), request = std::move(request), &sp = _proxy.container(), &g = _gossiper.container(), &e = this->container(), client_state_other_shard = client_state.move_to_other_shard(), enforce_authorization = bool(_enforce_authorization), warn_authorization = bool(_warn_authorization)]
(service::migration_manager& mm) mutable -> future<executor::request_return_type> {
const db::tablets_mode_t::mode tablets_mode = _proxy.data_dictionary().get_config().tablets_mode_for_new_keyspaces(); // type cast
// `invoke_on` hopped us to shard 0, but `this` still points to the `executor` from the 'old' shard, so we need to hop it too.
co_return co_await e.local().create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), enforce_authorization, warn_authorization, std::move(tablets_mode));
co_return co_await create_table_on_shard0(client_state_other_shard.get(), tr, std::move(request), sp.local(), mm, g.local(), enforce_authorization, warn_authorization, e.local()._stats, std::move(tablets_mode));
});
}
@@ -6177,10 +6087,9 @@ future<> executor::start() {
}
future<> executor::stop() {
co_await _describe_table_info_manager->stop();
// disconnect from the value source, but keep the value unchanged.
s_default_timeout_in_ms = utils::updateable_value<uint32_t>{s_default_timeout_in_ms()};
co_await _parsed_expression_cache->stop();
return _parsed_expression_cache->stop();
}
} // namespace alternator

View File

@@ -17,13 +17,11 @@
#include "service/client_state.hh"
#include "service_permit.hh"
#include "db/timeout_clock.hh"
#include "db/config.hh"
#include "alternator/error.hh"
#include "stats.hh"
#include "utils/rjson.hh"
#include "utils/updateable_value.hh"
#include "utils/simple_value_with_expiry.hh"
#include "tracing/trace_state.hh"
@@ -43,7 +41,6 @@ namespace cql3::selection {
namespace service {
class storage_proxy;
class cas_shard;
class storage_service;
}
namespace cdc {
@@ -60,7 +57,6 @@ class schema_builder;
namespace alternator {
enum class table_status;
class rmw_operation;
class put_or_delete_item;
@@ -140,7 +136,6 @@ class expression_cache;
class executor : public peering_sharded_service<executor> {
gms::gossiper& _gossiper;
service::storage_service& _ss;
service::storage_proxy& _proxy;
service::migration_manager& _mm;
db::system_distributed_keyspace& _sdks;
@@ -153,11 +148,6 @@ class executor : public peering_sharded_service<executor> {
std::unique_ptr<parsed::expression_cache> _parsed_expression_cache;
struct describe_table_info_manager;
std::unique_ptr<describe_table_info_manager> _describe_table_info_manager;
future<> cache_newly_calculated_size_on_all_shards(schema_ptr schema, std::uint64_t size_in_bytes, std::chrono::nanoseconds ttl);
future<> fill_table_size(rjson::value &table_description, schema_ptr schema, bool deleting);
public:
using client_state = service::client_state;
// request_return_type is the return type of the executor methods, which
@@ -183,7 +173,6 @@ public:
executor(gms::gossiper& gossiper,
service::storage_proxy& proxy,
service::storage_service& ss,
service::migration_manager& mm,
db::system_distributed_keyspace& sdks,
cdc::metadata& cdc_metadata,
@@ -231,8 +220,6 @@ private:
friend class rmw_operation;
static void describe_key_schema(rjson::value& parent, const schema&, std::unordered_map<std::string,std::string> * = nullptr, const std::map<sstring, sstring> *tags = nullptr);
future<rjson::value> fill_table_description(schema_ptr schema, table_status tbl_status, service::client_state& client_state, tracing::trace_state_ptr trace_state, service_permit permit);
future<executor::request_return_type> create_table_on_shard0(service::client_state&& client_state, tracing::trace_state_ptr trace_state, rjson::value request, bool enforce_authorization, bool warn_authorization, const db::tablets_mode_t::mode tablets_mode);
future<> do_batch_write(
std::vector<std::pair<schema_ptr, put_or_delete_item>> mutation_builders,

View File

@@ -547,13 +547,17 @@ void set_view_builder(http_context& ctx, routes& r, sharded<db::view::view_build
vp.insert(b.second);
}
}
std::vector<sstring> res;
replica::database& db = vb.local().get_db();
auto uuid = validate_table(db, ks, cf_name);
replica::column_family& cf = db.find_column_family(uuid);
co_return cf.get_index_manager().list_indexes()
| std::views::transform([] (const auto& i) { return i.metadata().name(); })
| std::views::filter([&vp] (const auto& n) { return vp.contains(secondary_index::index_table_name(n)); })
| std::ranges::to<std::vector>();
res.reserve(cf.get_index_manager().list_indexes().size());
for (auto&& i : cf.get_index_manager().list_indexes()) {
if (vp.contains(secondary_index::index_table_name(i.metadata().name()))) {
res.emplace_back(i.metadata().name());
}
}
co_return res;
});
}

View File

@@ -15,7 +15,6 @@
#include "db/system_keyspace.hh"
#include "schema/schema.hh"
#include <iterator>
#include <seastar/core/abort_source.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/core/format.hh>
@@ -23,11 +22,9 @@ namespace auth {
logging::logger logger("auth-cache");
cache::cache(cql3::query_processor& qp, abort_source& as) noexcept
cache::cache(cql3::query_processor& qp) noexcept
: _current_version(0)
, _qp(qp)
, _loading_sem(1)
, _as(as) {
, _qp(qp) {
}
lw_shared_ptr<const cache::role_record> cache::get(const role_name_t& role) const noexcept {
@@ -119,8 +116,6 @@ future<> cache::load_all() {
co_return;
}
SCYLLA_ASSERT(this_shard_id() == 0);
auto units = co_await get_units(_loading_sem, 1, _as);
++_current_version;
logger.info("Loading all roles");
@@ -151,9 +146,6 @@ future<> cache::load_roles(std::unordered_set<role_name_t> roles) {
if (legacy_mode(_qp)) {
co_return;
}
SCYLLA_ASSERT(this_shard_id() == 0);
auto units = co_await get_units(_loading_sem, 1, _as);
for (const auto& name : roles) {
logger.info("Loading role {}", name);
auto role = co_await fetch_role(name);

View File

@@ -8,7 +8,6 @@
#pragma once
#include <seastar/core/abort_source.hh>
#include <unordered_set>
#include <unordered_map>
@@ -16,7 +15,6 @@
#include <seastar/core/future.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/semaphore.hh>
#include <absl/container/flat_hash_map.h>
@@ -43,7 +41,7 @@ public:
version_tag_t version; // used for seamless cache reloads
};
explicit cache(cql3::query_processor& qp, abort_source& as) noexcept;
explicit cache(cql3::query_processor& qp) noexcept;
lw_shared_ptr<const role_record> get(const role_name_t& role) const noexcept;
future<> load_all();
future<> load_roles(std::unordered_set<role_name_t> roles);
@@ -54,8 +52,6 @@ private:
roles_map _roles;
version_tag_t _current_version;
cql3::query_processor& _qp;
semaphore _loading_sem;
abort_source& _as;
future<lw_shared_ptr<role_record>> fetch_role(const role_name_t& role) const;
future<> prune_all() noexcept;

View File

@@ -12,7 +12,6 @@
#include <seastar/core/condition-variable.hh>
#include "schema/schema_fwd.hh"
#include "sstables/open_info.hh"
#include "compaction_descriptor.hh"
class reader_permit;
@@ -45,7 +44,7 @@ public:
virtual compaction_strategy_state& get_compaction_strategy_state() noexcept = 0;
virtual reader_permit make_compaction_reader_permit() const = 0;
virtual sstables::sstables_manager& get_sstables_manager() noexcept = 0;
virtual sstables::shared_sstable make_sstable(sstables::sstable_state) const = 0;
virtual sstables::shared_sstable make_sstable() const = 0;
virtual sstables::sstable_writer_config configure_writer(sstring origin) const = 0;
virtual api::timestamp_type min_memtable_timestamp() const = 0;
virtual api::timestamp_type min_memtable_live_timestamp() const = 0;

View File

@@ -416,9 +416,7 @@ future<compaction_result> compaction_task_executor::compact_sstables(compaction_
descriptor.enable_garbage_collection(co_await sstable_set_for_tombstone_gc(t));
}
descriptor.creator = [&t] (shard_id) {
// All compaction types going through this path will work on normal input sstables only.
// Off-strategy, for example, waits until the sstables move out of staging state.
return t.make_sstable(sstables::sstable_state::normal);
return t.make_sstable();
};
descriptor.replacer = [this, &t, &on_replace, offstrategy] (compaction_completion_desc desc) {
t.get_compaction_strategy().notify_completion(t, desc.old_sstables, desc.new_sstables);
@@ -1849,10 +1847,6 @@ protected:
throw make_compaction_stopped_exception();
}
}, false);
if (utils::get_local_injector().is_enabled("split_sstable_force_stop_exception")) {
throw make_compaction_stopped_exception();
}
co_return co_await do_rewrite_sstable(std::move(sst));
}
};
@@ -2290,16 +2284,12 @@ future<compaction_manager::compaction_stats_opt> compaction_manager::perform_spl
}
future<std::vector<sstables::shared_sstable>>
compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
compaction_manager::maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt) {
if (!split_compaction_task_executor::sstable_needs_split(sst, opt)) {
co_return std::vector<sstables::shared_sstable>{sst};
}
// Throw an error if split cannot be performed due to e.g. out of space prevention.
// We don't want to prevent split because compaction is temporarily disabled on a view only for synchronization,
// which is unneeded against new sstables that aren't part of any set yet, so never use can_proceed(&t) here.
if (is_disabled()) {
co_return coroutine::exception(std::make_exception_ptr(std::runtime_error(format("Cannot split {} because manager has compaction disabled, " \
"reason might be out of space prevention", sst->get_filename()))));
if (!can_proceed(&t)) {
co_return std::vector<sstables::shared_sstable>{sst};
}
std::vector<sstables::shared_sstable> ret;
@@ -2307,11 +2297,8 @@ compaction_manager::maybe_split_new_sstable(sstables::shared_sstable sst, compac
compaction_progress_monitor monitor;
compaction_data info = create_compaction_data();
compaction_descriptor desc = split_compaction_task_executor::make_descriptor(sst, opt);
desc.creator = [&t, sst] (shard_id _) {
// NOTE: preserves the sstable state, since we want the output to be on the same state as the original.
// For example, if base table has views, it's important that sstable produced by repair will be
// in the staging state.
return t.make_sstable(sst->state());
desc.creator = [&t] (shard_id _) {
return t.make_sstable();
};
desc.replacer = [&] (compaction_completion_desc d) {
std::move(d.new_sstables.begin(), d.new_sstables.end(), std::back_inserter(ret));

View File

@@ -376,8 +376,7 @@ public:
// Splits a single SSTable by segregating all its data according to the classifier.
// If SSTable doesn't need split, the same input SSTable is returned as output.
// If SSTable needs split, then output SSTables are returned and the input SSTable is deleted.
// Exception is thrown if the input sstable cannot be split due to e.g. out of space prevention.
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(sstables::shared_sstable sst, compaction_group_view& t, compaction_type_options::split opt);
// Run a custom job for a given table, defined by a function
// it completes when future returned by job is ready or returns immediately

View File

@@ -571,10 +571,10 @@ commitlog_total_space_in_mb: -1
# - "none": auditing is disabled (default)
# - "table": save audited events in audit.audit_log column family
# - "syslog": send audited events via syslog (depends on OS, but usually to /dev/log)
audit: "table"
# audit: "none"
#
# List of statement categories that should be audited.
audit_categories: "DCL,DDL,AUTH,ADMIN"
# audit_categories: "DCL,DDL,AUTH"
#
# List of tables that should be audited.
# audit_tables: "<keyspace_name>.<table_name>,<keyspace_name>.<table_name>"

View File

@@ -368,87 +368,6 @@ def find_ninja():
sys.exit(1)
def find_compiler(name):
    """
    Find a compiler by name, skipping ccache wrapper directories.

    This is useful when using sccache to avoid double-caching through ccache.

    Args:
        name: The compiler name (e.g., 'clang++', 'clang', 'gcc')

    Returns:
        Path to the first executable regular file called `name` found in a
        PATH directory that is not a ccache wrapper directory, or None if
        there is no such file.
    """
    ccache_dirs = {'/usr/lib/ccache', '/usr/lib64/ccache'}
    for path_dir in os.environ.get('PATH', '').split(os.pathsep):
        # Skip ccache wrapper directories. The resolved path is checked as
        # well, so a PATH entry that is a symlink to (or a differently
        # spelled form of) a wrapper directory is skipped too.
        if os.path.realpath(path_dir) in ccache_dirs or path_dir in ccache_dirs:
            continue
        candidate = os.path.join(path_dir, name)
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            return candidate
    return None
def resolve_compilers_for_sccache(args, compiler_cache):
    """
    Rewrite relative compiler names to concrete paths when sccache is in use.

    Doing so keeps ccache wrapper symlinks found on PATH from being picked up,
    which would otherwise stack ccache underneath sccache.

    Args:
        args: The argument namespace; its ``cxx`` and ``cc`` attributes are
            updated in place when a replacement path is found.
        compiler_cache: Path to the compiler cache binary, or None.
    """
    using_sccache = bool(compiler_cache) and 'sccache' in compiler_cache
    if not using_sccache:
        return

    # C++ compiler first, then the C compiler. Absolute paths are left alone.
    for attr in ('cxx', 'cc'):
        configured = getattr(args, attr)
        if os.path.isabs(configured):
            continue
        located = find_compiler(configured)
        if located:
            setattr(args, attr, located)
def find_compiler_cache(preference):
    """
    Resolve the requested compiler cache to an executable path.

    Args:
        preference: One of 'auto', 'sccache', 'ccache', 'none', or a path to a binary.

    Returns:
        Path to the compiler cache binary, or None if not found/disabled.
    """
    if preference == 'none':
        return None

    if preference == 'auto':
        # sccache is tried first; ccache is the fallback.
        candidates = (which(tool) for tool in ('sccache', 'ccache'))
        return next((found for found in candidates if found), None)

    if preference in ('sccache', 'ccache'):
        located = which(preference)
        if not located:
            print(f"Warning: {preference} not found on PATH, disabling compiler cache")
            return None
        return located

    # Anything else is treated as a path to the cache binary itself.
    usable = os.path.isfile(preference) and os.access(preference, os.X_OK)
    if usable:
        return preference
    print(f"Warning: compiler cache '{preference}' not found or not executable, disabling compiler cache")
    return None
modes = {
'debug': {
'cxxflags': '-DDEBUG -DSANITIZE -DDEBUG_LSA_SANITIZER -DSCYLLA_ENABLE_ERROR_INJECTION',
@@ -813,8 +732,6 @@ arg_parser.add_argument('--compiler', action='store', dest='cxx', default='clang
help='C++ compiler path')
arg_parser.add_argument('--c-compiler', action='store', dest='cc', default='clang',
help='C compiler path')
arg_parser.add_argument('--compiler-cache', action='store', dest='compiler_cache', default='auto',
help='Compiler cache to use: auto (default, prefers sccache), sccache, ccache, none, or a path to a binary')
add_tristate(arg_parser, name='dpdk', dest='dpdk', default=False,
help='Use dpdk (from seastar dpdk sources)')
arg_parser.add_argument('--dpdk-target', action='store', dest='dpdk_target', default='',
@@ -1698,7 +1615,6 @@ deps['test/boost/combined_tests'] += [
'test/boost/schema_registry_test.cc',
'test/boost/secondary_index_test.cc',
'test/boost/sessions_test.cc',
'test/boost/simple_value_with_expiry_test.cc',
'test/boost/sstable_compaction_test.cc',
'test/boost/sstable_compressor_factory_test.cc',
'test/boost/sstable_compression_config_test.cc',
@@ -1782,18 +1698,6 @@ deps['test/vector_search/vector_store_client_test'] = ['test/vector_search/vect
deps['test/vector_search/load_balancer_test'] = ['test/vector_search/load_balancer_test.cc'] + scylla_tests_dependencies
deps['test/vector_search/client_test'] = ['test/vector_search/client_test.cc'] + scylla_tests_dependencies
boost_tests_prefixes = ["test/boost/", "test/vector_search/", "test/raft/", "test/manual/", "test/ldap/"]
# We need to link these files to all Boost tests to make sure that
# we can execute `--list_json_content` on them. That will produce
# a similar result as calling `--list_content={HRF,DOT}`.
# Unfortunately, to be able to do that, we're forced to link the
# relevant code by hand.
for key in deps.keys():
for prefix in boost_tests_prefixes:
if key.startswith(prefix):
deps[key] += ["test/lib/boost_tree_lister_injector.cc", "test/lib/boost_test_tree_lister.cc"]
wasm_deps = {}
wasm_deps['wasm/return_input.wat'] = 'test/resource/wasm/rust/return_input.rs'
@@ -2098,7 +2002,7 @@ def semicolon_separated(*flags):
def real_relpath(path, start):
return os.path.relpath(os.path.realpath(path), os.path.realpath(start))
def configure_seastar(build_dir, mode, mode_config, compiler_cache=None):
def configure_seastar(build_dir, mode, mode_config):
seastar_cxx_ld_flags = mode_config['cxx_ld_flags']
# We want to "undo" coverage for seastar if we have it enabled.
if args.coverage:
@@ -2145,10 +2049,6 @@ def configure_seastar(build_dir, mode, mode_config, compiler_cache=None):
'-DSeastar_IO_URING=ON',
]
if compiler_cache:
seastar_cmake_args += [f'-DCMAKE_CXX_COMPILER_LAUNCHER={compiler_cache}',
f'-DCMAKE_C_COMPILER_LAUNCHER={compiler_cache}']
if args.stack_guards is not None:
stack_guards = 'ON' if args.stack_guards else 'OFF'
seastar_cmake_args += ['-DSeastar_STACK_GUARDS={}'.format(stack_guards)]
@@ -2180,7 +2080,7 @@ def configure_seastar(build_dir, mode, mode_config, compiler_cache=None):
subprocess.check_call(seastar_cmd, shell=False, cwd=cmake_dir)
def configure_abseil(build_dir, mode, mode_config, compiler_cache=None):
def configure_abseil(build_dir, mode, mode_config):
abseil_cflags = mode_config['lib_cflags']
cxx_flags = mode_config['cxxflags']
if '-DSANITIZE' in cxx_flags:
@@ -2206,10 +2106,6 @@ def configure_abseil(build_dir, mode, mode_config, compiler_cache=None):
'-DABSL_PROPAGATE_CXX_STD=ON',
]
if compiler_cache:
abseil_cmake_args += [f'-DCMAKE_CXX_COMPILER_LAUNCHER={compiler_cache}',
f'-DCMAKE_C_COMPILER_LAUNCHER={compiler_cache}']
cmake_args = abseil_cmake_args[:]
abseil_build_dir = os.path.join(build_dir, mode, 'abseil')
abseil_cmd = ['cmake', '-G', 'Ninja', real_relpath('abseil', abseil_build_dir)] + cmake_args
@@ -2382,15 +2278,10 @@ def write_build_file(f,
scylla_product,
scylla_version,
scylla_release,
compiler_cache,
args):
use_precompiled_header = not args.disable_precompiled_header
warnings = get_warning_options(args.cxx)
rustc_target = pick_rustc_target('wasm32-wasi', 'wasm32-wasip1')
# If compiler cache is available, prefix the compiler with it
cxx_with_cache = f'{compiler_cache} {args.cxx}' if compiler_cache else args.cxx
# For Rust, sccache is used via RUSTC_WRAPPER environment variable
rustc_wrapper = f'RUSTC_WRAPPER={compiler_cache} ' if compiler_cache and 'sccache' in compiler_cache else ''
f.write(textwrap.dedent('''\
configure_args = {configure_args}
builddir = {outdir}
@@ -2453,7 +2344,7 @@ def write_build_file(f,
command = clang --target=wasm32 --no-standard-libraries -Wl,--export-all -Wl,--no-entry $in -o $out
description = C2WASM $out
rule rust2wasm
command = {rustc_wrapper}cargo build --target={rustc_target} --example=$example --locked --manifest-path=test/resource/wasm/rust/Cargo.toml --target-dir=$builddir/wasm/ $
command = cargo build --target={rustc_target} --example=$example --locked --manifest-path=test/resource/wasm/rust/Cargo.toml --target-dir=$builddir/wasm/ $
&& wasm-opt -Oz $builddir/wasm/{rustc_target}/debug/examples/$example.wasm -o $builddir/wasm/$example.wasm $
&& wasm-strip $builddir/wasm/$example.wasm
description = RUST2WASM $out
@@ -2469,7 +2360,7 @@ def write_build_file(f,
command = llvm-profdata merge $in -output=$out
''').format(configure_args=configure_args,
outdir=outdir,
cxx=cxx_with_cache,
cxx=args.cxx,
user_cflags=user_cflags,
warnings=warnings,
defines=defines,
@@ -2477,7 +2368,6 @@ def write_build_file(f,
user_ldflags=user_ldflags,
libs=libs,
rustc_target=rustc_target,
rustc_wrapper=rustc_wrapper,
link_pool_depth=link_pool_depth,
seastar_path=args.seastar_path,
ninja=ninja,
@@ -2562,10 +2452,10 @@ def write_build_file(f,
description = TEST {mode}
# This rule is unused for PGO stages. They use the rust lib from the parent mode.
rule rust_lib.{mode}
command = CARGO_BUILD_DEP_INFO_BASEDIR='.' {rustc_wrapper}cargo build --locked --manifest-path=rust/Cargo.toml --target-dir=$builddir/{mode} --profile=rust-{mode} $
command = CARGO_BUILD_DEP_INFO_BASEDIR='.' cargo build --locked --manifest-path=rust/Cargo.toml --target-dir=$builddir/{mode} --profile=rust-{mode} $
&& touch $out
description = RUST_LIB $out
''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, rustc_wrapper=rustc_wrapper, **modeval))
''').format(mode=mode, antlr3_exec=args.antlr3_exec, fmt_lib=fmt_lib, test_repeat=args.test_repeat, test_timeout=args.test_timeout, **modeval))
f.write(
'build {mode}-build: phony {artifacts} {wasms} {vector_search_validator_bins}\n'.format(
mode=mode,
@@ -2629,7 +2519,7 @@ def write_build_file(f,
# In debug/sanitize modes, we compile with fsanitizers,
# so must use the same options during the link:
if '-DSANITIZE' in modes[mode]['cxxflags']:
f.write(' libs = -fsanitize=address -fsanitize=undefined -lubsan\n')
f.write(' libs = -fsanitize=address -fsanitize=undefined\n')
else:
f.write(' libs =\n')
f.write(f'build $builddir/{mode}/{binary}.stripped: strip $builddir/{mode}/{binary}\n')
@@ -3025,9 +2915,6 @@ def create_build_system(args):
os.makedirs(outdir, exist_ok=True)
compiler_cache = find_compiler_cache(args.compiler_cache)
resolve_compilers_for_sccache(args, compiler_cache)
scylla_product, scylla_version, scylla_release = generate_version(args.date_stamp)
for mode, mode_config in build_modes.items():
@@ -3044,8 +2931,8 @@ def create_build_system(args):
# {outdir}/{mode}/seastar/build.ninja, and
# {outdir}/{mode}/seastar/seastar.pc is queried for building flags
for mode, mode_config in build_modes.items():
configure_seastar(outdir, mode, mode_config, compiler_cache)
configure_abseil(outdir, mode, mode_config, compiler_cache)
configure_seastar(outdir, mode, mode_config)
configure_abseil(outdir, mode, mode_config)
user_cflags += ' -isystem abseil'
for mode, mode_config in build_modes.items():
@@ -3068,7 +2955,6 @@ def create_build_system(args):
scylla_product,
scylla_version,
scylla_release,
compiler_cache,
args)
generate_compdb('compile_commands.json', ninja, args.buildfile, selected_modes)
@@ -3111,10 +2997,6 @@ def configure_using_cmake(args):
selected_modes = args.selected_modes or default_modes
selected_configs = ';'.join(build_modes[mode].cmake_build_type for mode
in selected_modes)
compiler_cache = find_compiler_cache(args.compiler_cache)
resolve_compilers_for_sccache(args, compiler_cache)
settings = {
'CMAKE_CONFIGURATION_TYPES': selected_configs,
'CMAKE_CROSS_CONFIGS': selected_configs,
@@ -3132,14 +3014,6 @@ def configure_using_cmake(args):
'Scylla_WITH_DEBUG_INFO' : 'ON' if args.debuginfo else 'OFF',
'Scylla_USE_PRECOMPILED_HEADER': 'OFF' if args.disable_precompiled_header else 'ON',
}
if compiler_cache:
settings['CMAKE_CXX_COMPILER_LAUNCHER'] = compiler_cache
settings['CMAKE_C_COMPILER_LAUNCHER'] = compiler_cache
# For Rust, sccache is used via RUSTC_WRAPPER
if 'sccache' in compiler_cache:
settings['Scylla_RUSTC_WRAPPER'] = compiler_cache
if args.date_stamp:
settings['Scylla_DATE_STAMP'] = args.date_stamp
if args.staticboost:
@@ -3171,7 +3045,7 @@ def configure_using_cmake(args):
if not args.dist_only:
for mode in selected_modes:
configure_seastar(build_dir, build_modes[mode].cmake_build_type, modes[mode], compiler_cache)
configure_seastar(build_dir, build_modes[mode].cmake_build_type, modes[mode])
cmake_command = ['cmake']
cmake_command += [f'-D{var}={value}' for var, value in settings.items()]

View File

@@ -190,7 +190,7 @@ future<utils::chunked_vector<mutation>> batch_statement::get_mutations(query_pro
co_return vresult;
}
void batch_statement::verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations) const {
void batch_statement::verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations) {
if (mutations.size() <= 1) {
return; // We only warn for batch spanning multiple mutations
}
@@ -209,9 +209,8 @@ void batch_statement::verify_batch_size(query_processor& qp, const utils::chunke
for (auto&& m : mutations) {
ks_cf_pairs.insert(m.schema()->ks_name() + "." + m.schema()->cf_name());
}
const auto batch_type = _type == type::LOGGED ? "Logged" : "Unlogged";
return seastar::format("{} batch modifying {:d} partitions in {} is of size {:d} bytes, exceeding specified {} threshold of {:d} by {:d}.",
batch_type, mutations.size(), fmt::join(ks_cf_pairs, ", "), size, type, threshold, size - threshold);
return seastar::format("Batch modifying {:d} partitions in {} is of size {:d} bytes, exceeding specified {} threshold of {:d} by {:d}.",
mutations.size(), fmt::join(ks_cf_pairs, ", "), size, type, threshold, size - threshold);
};
if (size > fail_threshold) {
_logger.error("{}", error("FAIL", fail_threshold).c_str());

View File

@@ -116,7 +116,7 @@ public:
* Checks batch size to ensure threshold is met. If not, a warning is logged.
* @param cfs ColumnFamilies that will store the batch's mutations.
*/
void verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations) const;
static void verify_batch_size(query_processor& qp, const utils::chunked_vector<mutation>& mutations);
virtual future<shared_ptr<cql_transport::messages::result_message>> execute(
query_processor& qp, service::query_state& state, const query_options& options, std::optional<service::group0_guard> guard) const override;

View File

@@ -17,7 +17,6 @@
#include <seastar/core/metrics.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/core/sleep.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include "batchlog_manager.hh"
#include "batchlog.hh"
@@ -319,8 +318,8 @@ future<db::all_batches_replayed> db::batchlog_manager::replay_all_failed_batches
std::unordered_map<int32_t, replay_stats> replay_stats_per_shard;
// Use a stable `now` across all batches, so skip/replay decisions are the
// same across a whole prefix of written_at (across all ids).
// Use a stable `now` across all batches, so skip/replay decisions are the
// same across a whole prefix of written_at (across all ids).
const auto now = db_clock::now();
auto batch = [this, cleanup, limiter, schema, &all_replayed, &replay_stats_per_shard, now] (const cql3::untyped_result_set::row& row) -> future<stop_iteration> {

View File

@@ -1105,14 +1105,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
"Like native_transport_port, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
, native_shard_aware_transport_port_ssl(this, "native_shard_aware_transport_port_ssl", value_status::Used, 19142,
"Like native_transport_port_ssl, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
, native_transport_port_proxy_protocol(this, "native_transport_port_proxy_protocol", value_status::Used, 0,
"Port on which the CQL native transport listens for clients using proxy protocol v2. Disabled (0) by default.")
, native_transport_port_ssl_proxy_protocol(this, "native_transport_port_ssl_proxy_protocol", value_status::Used, 0,
"Port on which the CQL TLS native transport listens for clients using proxy protocol v2. Disabled (0) by default.")
, native_shard_aware_transport_port_proxy_protocol(this, "native_shard_aware_transport_port_proxy_protocol", value_status::Used, 0,
"Like native_transport_port_proxy_protocol, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
, native_shard_aware_transport_port_ssl_proxy_protocol(this, "native_shard_aware_transport_port_ssl_proxy_protocol", value_status::Used, 0,
"Like native_transport_port_ssl_proxy_protocol, but clients-side port number (modulo smp) is used to route the connection to the specific shard.")
, native_transport_max_threads(this, "native_transport_max_threads", value_status::Invalid, 128,
"The maximum number of thread handling requests. The meaning is the same as rpc_max_threads.\n"
"Default is different (128 versus unlimited).\n"
@@ -1478,8 +1470,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
, alternator_max_expression_cache_entries_per_shard(this, "alternator_max_expression_cache_entries_per_shard", liveness::LiveUpdate, value_status::Used, 2000, "Maximum number of cached parsed request expressions, per shard.")
, alternator_max_users_query_size_in_trace_output(this, "alternator_max_users_query_size_in_trace_output", liveness::LiveUpdate, value_status::Used, uint64_t(4096),
"Maximum size of user's command in trace output (`alternator_op` entry). Larger traces will be truncated and have `<truncated>` message appended - which doesn't count to the maximum limit.")
, alternator_describe_table_info_cache_validity_in_seconds(this, "alternator_describe_table_info_cache_validity_in_seconds", liveness::LiveUpdate, value_status::Used, 60 * 60 * 6,
"The validity of DescribeTable information - table size in bytes. This is how long calculated value will be reused before recalculation.")
, abort_on_ebadf(this, "abort_on_ebadf", value_status::Used, true, "Abort the server on incorrect file descriptor access. Throws exception when disabled.")
, sanitizer_report_backtrace(this, "sanitizer_report_backtrace", value_status::Used, false,
"In debug mode, report log-structured allocator sanitizer violations with a backtrace. Slow.")
@@ -1576,12 +1566,6 @@ db::config::config(std::shared_ptr<db::extensions> exts)
// Bigger tables will take longer to be resized. similar-sized tables can be batched into same iteration.
, tablet_load_stats_refresh_interval_in_seconds(this, "tablet_load_stats_refresh_interval_in_seconds", liveness::LiveUpdate, value_status::Used, 60,
"Tablet load stats refresh rate in seconds.")
, force_capacity_based_balancing(this, "force_capacity_based_balancing", liveness::LiveUpdate, value_status::Used, false,
"Forces the load balancer to perform capacity based balancing, instead of size based balancing.")
, size_based_balance_threshold_percentage(this, "size_based_balance_threshold_percentage", liveness::LiveUpdate, value_status::Used, 1.0,
"Sets the maximum difference in percentages between the most loaded and least loaded nodes, below which the load balancer considers nodes balanced.")
, minimal_tablet_size_for_balancing(this, "minimal_tablet_size_for_balancing", liveness::LiveUpdate, value_status::Used, service::default_target_tablet_size / 100,
"Sets the minimal tablet size for the load balancer. For any tablet smaller than this, the balancer will use this size instead of the actual tablet size.")
, default_log_level(this, "default_log_level", value_status::Used, seastar::log_level::info, "Default log level for log messages")
, logger_log_level(this, "logger_log_level", value_status::Used, {}, "Map of logger name to log level. Valid log levels are 'error', 'warn', 'info', 'debug' and 'trace'")
, log_to_stdout(this, "log_to_stdout", value_status::Used, true, "Send log output to stdout")

View File

@@ -324,10 +324,6 @@ public:
named_value<uint16_t> native_transport_port_ssl;
named_value<uint16_t> native_shard_aware_transport_port;
named_value<uint16_t> native_shard_aware_transport_port_ssl;
named_value<uint16_t> native_transport_port_proxy_protocol;
named_value<uint16_t> native_transport_port_ssl_proxy_protocol;
named_value<uint16_t> native_shard_aware_transport_port_proxy_protocol;
named_value<uint16_t> native_shard_aware_transport_port_ssl_proxy_protocol;
named_value<uint32_t> native_transport_max_threads;
named_value<uint32_t> native_transport_max_frame_size_in_mb;
named_value<sstring> broadcast_rpc_address;
@@ -477,7 +473,6 @@ public:
named_value<bool> alternator_allow_system_table_write;
named_value<uint32_t> alternator_max_expression_cache_entries_per_shard;
named_value<uint64_t> alternator_max_users_query_size_in_trace_output;
named_value<uint32_t> alternator_describe_table_info_cache_validity_in_seconds;
named_value<bool> abort_on_ebadf;
@@ -595,9 +590,6 @@ public:
named_value<bool> rf_rack_valid_keyspaces;
named_value<uint32_t> tablet_load_stats_refresh_interval_in_seconds;
named_value<bool> force_capacity_based_balancing;
named_value<float> size_based_balance_threshold_percentage;
named_value<uint64_t> minimal_tablet_size_for_balancing;
static const sstring default_tls_priority;
private:

View File

@@ -26,7 +26,6 @@
#include <seastar/core/smp.hh>
#include <seastar/coroutine/exception.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/util/file.hh>
// Boost features.
@@ -900,7 +899,7 @@ future<> manager::migrate_ip_directories() {
co_await coroutine::parallel_for_each(dirs_to_remove, [] (auto& directory) -> future<> {
try {
manager_logger.warn("Removing hint directory {}", directory.native());
co_await seastar::recursive_remove_directory(directory);
co_await lister::rmdir(directory);
} catch (...) {
on_internal_error(manager_logger,
seastar::format("Removing a hint directory has failed. Reason: {}", std::current_exception()));

View File

@@ -12,6 +12,8 @@
#include <yaml-cpp/yaml.h>
#include <boost/lexical_cast.hpp>
#include "utils/s3/creds.hh"
#include "object_storage_endpoint_param.hh"
using namespace std::string_literals;
@@ -19,6 +21,9 @@ using namespace std::string_literals;
db::object_storage_endpoint_param::object_storage_endpoint_param(s3_storage s)
: _data(std::move(s))
{}
db::object_storage_endpoint_param::object_storage_endpoint_param(std::string endpoint, s3::endpoint_config config)
: object_storage_endpoint_param(s3_storage{std::move(endpoint), std::move(config)})
{}
db::object_storage_endpoint_param::object_storage_endpoint_param(gs_storage s)
: _data(std::move(s))
{}
@@ -27,8 +32,8 @@ db::object_storage_endpoint_param::object_storage_endpoint_param() = default;
db::object_storage_endpoint_param::object_storage_endpoint_param(const object_storage_endpoint_param&) = default;
std::string db::object_storage_endpoint_param::s3_storage::to_json_string() const {
return fmt::format("{{ \"type\": \"s3\", \"aws_region\": \"{}\", \"iam_role_arn\": \"{}\" }}",
region, iam_role_arn
return fmt::format("{{ \"port\": {}, \"use_https\": {}, \"aws_region\": \"{}\", \"iam_role_arn\": \"{}\" }}",
config.port, config.use_https, config.region, config.role_arn
);
}
@@ -94,6 +99,8 @@ const std::string& db::object_storage_endpoint_param::type() const {
db::object_storage_endpoint_param db::object_storage_endpoint_param::decode(const YAML::Node& node) {
auto name = node["name"];
auto aws_region = node["aws_region"];
auto iam_role_arn = node["iam_role_arn"];
auto type = node["type"];
auto get_opt = [](auto& node, const std::string& key, auto def) {
@@ -101,19 +108,13 @@ db::object_storage_endpoint_param db::object_storage_endpoint_param::decode(cons
return tmp ? tmp.template as<std::decay_t<decltype(def)>>() : def;
};
// aws s3 endpoint.
if (!type || type.as<std::string>() == s3_type ) {
if (!type || type.as<std::string>() == s3_type || aws_region || iam_role_arn) {
s3_storage ep;
ep.endpoint = name.as<std::string>();
auto aws_region = node["aws_region"];
ep.region = aws_region ? aws_region.as<std::string>() : std::getenv("AWS_DEFAULT_REGION");
ep.iam_role_arn = get_opt(node, "iam_role_arn", ""s);
if (maybe_legacy_endpoint_name(ep.endpoint)) {
// Support legacy config for a while
auto port = node["port"].as<unsigned>();
auto use_https = node["https"].as<bool>(false);
ep.endpoint = fmt::format("http{}://{}:{}", use_https ? "s" : "", ep.endpoint, port);
}
ep.config.port = node["port"].as<unsigned>();
ep.config.use_https = node["https"].as<bool>(false);
ep.config.region = aws_region ? aws_region.as<std::string>() : std::getenv("AWS_DEFAULT_REGION");
ep.config.role_arn = iam_role_arn ? iam_role_arn.as<std::string>() : "";
return object_storage_endpoint_param{std::move(ep)};
}

View File

@@ -13,6 +13,7 @@
#include <variant>
#include <compare>
#include <fmt/core.h>
#include "utils/s3/creds.hh"
namespace YAML {
class Node;
@@ -24,8 +25,7 @@ class object_storage_endpoint_param {
public:
struct s3_storage {
std::string endpoint;
std::string region;
std::string iam_role_arn;
s3::endpoint_config config;
std::strong_ordering operator<=>(const s3_storage&) const = default;
std::string to_json_string() const;
@@ -43,6 +43,7 @@ public:
object_storage_endpoint_param();
object_storage_endpoint_param(const object_storage_endpoint_param&);
object_storage_endpoint_param(s3_storage);
object_storage_endpoint_param(std::string endpoint, s3::endpoint_config config);
object_storage_endpoint_param(gs_storage);
std::strong_ordering operator<=>(const object_storage_endpoint_param&) const;
@@ -76,7 +77,3 @@ template <>
struct fmt::formatter<db::object_storage_endpoint_param> : fmt::formatter<std::string_view> {
auto format(const db::object_storage_endpoint_param&, fmt::format_context& ctx) const -> decltype(ctx.out());
};
inline bool maybe_legacy_endpoint_name(std::string_view ep) noexcept {
return !(ep.starts_with("http://") || ep.starts_with("https://"));
}

View File

@@ -55,7 +55,6 @@
#include "message/shared_dict.hh"
#include "replica/database.hh"
#include "db/compaction_history_entry.hh"
#include "mutation/async_utils.hh"
#include <unordered_map>
@@ -3000,9 +2999,7 @@ future<mutation> system_keyspace::get_group0_history(sharded<replica::database>&
SCYLLA_ASSERT(rs);
auto& ps = rs->partitions();
for (auto& p: ps) {
// Note: we could decorate the frozen_mutation's key to check if it's the expected one
// but since this is a single partition table, we can just check after unfreezing the whole mutation.
auto mut = co_await unfreeze_gently(p.mut(), s);
auto mut = p.mut().unfreeze(s);
auto partition_key = value_cast<sstring>(utf8_type->deserialize(mut.key().get_component(*s, 0)));
if (partition_key == GROUP0_HISTORY_KEY) {
co_return mut;

View File

@@ -200,7 +200,6 @@ public:
static constexpr auto DICTS = "dicts";
static constexpr auto VIEW_BUILDING_TASKS = "view_building_tasks";
static constexpr auto CLIENT_ROUTES = "client_routes";
static constexpr auto VERSIONS = "versions";
// auth
static constexpr auto ROLES = "roles";

View File

@@ -605,8 +605,8 @@ public:
}
static schema_ptr build_schema() {
auto id = generate_legacy_id(system_keyspace::NAME, system_keyspace::VERSIONS);
return schema_builder(system_keyspace::NAME, system_keyspace::VERSIONS, std::make_optional(id))
auto id = generate_legacy_id(system_keyspace::NAME, "versions");
return schema_builder(system_keyspace::NAME, "versions", std::make_optional(id))
.with_column("key", utf8_type, column_kind::partition_key)
.with_column("version", utf8_type)
.with_column("build_mode", utf8_type)
@@ -1120,10 +1120,9 @@ public:
}
auto tm = _db.local().get_token_metadata_ptr();
auto target_tablet_size = _db.local().get_config().target_tablet_size_in_bytes();
const uint64_t default_tablet_size = _db.local().get_config().target_tablet_size_in_bytes();
locator::load_sketch load(tm, stats, default_tablet_size);
locator::load_sketch load(tm);
co_await load.populate();
tm->get_topology().for_each_node([&] (const auto& node) {
@@ -1137,23 +1136,18 @@ public:
if (auto ip = _gossiper.local().get_address_map().find(host)) {
set_cell(r.cells(), "ip", data_value(inet_address(*ip)));
}
set_cell(r.cells(), "tablets_allocated", int64_t(load.get_tablet_count(host)));
set_cell(r.cells(), "tablets_allocated_per_shard", data_value(double(load.get_real_avg_tablet_count(host))));
set_cell(r.cells(), "storage_allocated_load", data_value(int64_t(load.get_tablet_count(host) * default_tablet_size)));
set_cell(r.cells(), "tablets_allocated", load.get_load(host));
set_cell(r.cells(), "tablets_allocated_per_shard", data_value(double(load.get_real_avg_shard_load(host))));
set_cell(r.cells(), "storage_allocated_load", data_value(int64_t(load.get_load(host) * target_tablet_size)));
if (stats && stats->capacity.contains(host)) {
auto capacity = stats->capacity.at(host);
set_cell(r.cells(), "storage_capacity", data_value(int64_t(capacity)));
if (auto utilization = load.get_allocated_utilization(host)) {
auto utilization = load.get_allocated_utilization(host, *stats, target_tablet_size);
if (utilization) {
set_cell(r.cells(), "storage_allocated_utilization", data_value(double(*utilization)));
}
if (load.has_complete_data(host)) {
if (auto utilization = load.get_storage_utilization(host)) {
set_cell(r.cells(), "storage_utilization", data_value(double(*utilization)));
}
set_cell(r.cells(), "storage_load", data_value(int64_t(load.get_disk_used(host))));
}
}
mutation_sink(m);
});
@@ -1173,8 +1167,6 @@ private:
.with_column("storage_capacity", long_type)
.with_column("storage_allocated_load", long_type)
.with_column("storage_allocated_utilization", double_type)
.with_column("storage_load", long_type)
.with_column("storage_utilization", double_type)
.with_sharder(1, 0) // shard0-only
.with_hash_version()
.build();

View File

@@ -271,12 +271,6 @@ is different, or can be configured in Alternator:
So for example, if you create a table whose name is 192 characters, you
can't create a GSI whose name is longer than 29 characters.
* DynamoDB's DescribeTable will return information about the table. According to
AWS documentation, fields TableSizeBytes, IndexSizeBytes and ItemCount can
lag behind by up to 6 hours.
The `alternator_describe_table_info_cache_validity_in_seconds` parameter allows
users to change this timeout - the default value in seconds is set to 21600 (6 hours).
## Experimental API features
Some DynamoDB API features are supported by Alternator, but considered
@@ -296,14 +290,6 @@ experimental:
considered experimental so needs to be enabled explicitly with the
`--experimental-features=alternator-streams` configuration option.
In this version, Alternator Streams is only supported if the base table
uses vnodes instead of tablets. However, by default new tables use tablets
so to create a table that can be used with Streams, you must set the tag
`system:initial_tablets` to `none` during CreateTable - so that the
new table will use vnodes. Streams cannot be enabled on an already-existing
table that uses tablets.
See <https://github.com/scylladb/scylla/issues/23838>.
Alternator streams also differ in some respects from DynamoDB Streams:
* The number of separate "shards" in Alternator's streams is significantly
larger than is typical on DynamoDB.
@@ -389,11 +375,11 @@ they should be easy to detect. Here is a list of these unimplemented features:
another cache in front of it. We wrote more about this here:
<https://www.scylladb.com/2017/07/31/database-caches-not-good/>
* The DescribeTable is missing some information about size estimates
(IndexSizeBytes and ItemCount - TableSizeBytes is available), and also
part of the information about indexes enabled on the table.
* The DescribeTable is missing information about size estimates, and
also part of the information about indexes enabled on the table.
<https://github.com/scylladb/scylla/issues/5320>
<https://github.com/scylladb/scylla/issues/7550>
<https://github.com/scylladb/scylla/issues/7551>
* The PartiQL syntax (SQL-like SELECT/UPDATE/INSERT/DELETE expressions)
and the operations ExecuteStatement, BatchExecuteStatement and

View File

@@ -1,6 +1,6 @@
# Introduction
Similar to the approach described in CASSANDRA-12151, we add the
Similar to the approach described in CASSANDRA-14471, we add the
concept of an audit specification. An audit has a target (syslog or a
table) and a set of events/actions that it wants recorded. We
introduce new CQL syntax for Scylla users to describe and manipulate

View File

@@ -2,11 +2,8 @@
## What is ScyllaDB?
ScyllaDB is a high-performance NoSQL database optimized for speed and scalability.
It is designed to efficiently handle large volumes of data with minimal latency,
making it ideal for data-intensive applications.
ScyllaDB is distributed under the [ScyllaDB Source Available License](https://github.com/scylladb/scylladb/blob/master/LICENSE-ScyllaDB-Source-Available.md).
ScyllaDB is a high-performance NoSQL database system, fully compatible with Apache Cassandra.
ScyllaDB is released under the GNU Affero General Public License version 3 and the Apache License. ScyllaDB is free and open-source software.
> [ScyllaDB](http://www.scylladb.com/)

View File

@@ -20,7 +20,9 @@ command line option when launching scylla.
You can define endpoint details in the `scylla.yaml` file. For example:
```yaml
object_storage_endpoints:
- name: https://s3.us-east-1.amazonaws.com:443
- name: s3.us-east-1.amazonaws.com
port: 443
https: true
aws_region: us-east-1
```
@@ -76,7 +78,9 @@ The examples above are intended for development or local environments. You shoul
For the EC2 Instance Metadata Service to function correctly, no additional configuration is required. However, STS requires the IAM Role ARN to be defined in the `scylla.yaml` file, as shown below:
```yaml
object_storage_endpoints:
- name: https://s3.us-east-1.amazonaws.com:443
- name: s3.us-east-1.amazonaws.com
port: 443
https: true
aws_region: us-east-1
iam_role_arn: arn:aws:iam::123456789012:instance-profile/my-instance-instance-profile
```
@@ -96,7 +100,9 @@ in `scylla.yaml`:
```yaml
object_storage_endpoints:
- name: https://s3.us-east-2.amazonaws.com:443
- name: s3.us-east-2.amazonaws.com
port: 443
https: true
aws_region: us-east-2
```

View File

@@ -372,8 +372,6 @@ Columns:
* `storage_allocated_load` - Disk space allocated for tablets, assuming each tablet has a fixed size (target_tablet_size).
* `storage_allocated_utilization` - Fraction of node's disk capacity taken for `storage_allocated_load`, where 1.0 means full utilization.
* `storage_capacity` - Total disk capacity in bytes. Used to compute `storage_allocated_utilization`. By default equal to file system's capacity.
* `storage_load` - Disk space allocated for tablets, computed with actual tablet sizes. Can be null if some of the tablet sizes are not known.
* `storage_utilization` - Fraction of node's disk capacity taken for `storage_load` (with actual tablet sizes), where 1.0 means full utilization. Can be null if some of the tablet sizes are not known.
* `tablets_allocated` - Number of tablet replicas on the node. Migrating tablets are accounted as if migration already finished.
* `tablets_allocated_per_shard` - `tablets_allocated` divided by shard count on the node.

View File

@@ -45,3 +45,10 @@ Run cqlsh:
cqlsh
Run cassandra-stress:
.. code-block:: console
cassandra-stress write -mode cql3 native

View File

@@ -10,7 +10,6 @@ Admin Tools
Admin REST API </operating-scylla/rest>
Tracing </using-scylla/tracing>
ScyllaDB SStable </operating-scylla/admin-tools/scylla-sstable/>
ScyllaDB SStable Script API </operating-scylla/admin-tools/scylla-sstable-script-api/>
ScyllaDB Types </operating-scylla/admin-tools/scylla-types/>
sstableloader
cassandra-stress </operating-scylla/admin-tools/cassandra-stress/>

View File

@@ -1,530 +0,0 @@
ScyllaDB SStable Script API
---------------------------
The script API consists of two parts:
* `ScyllaDB Consume API <scylla-consume-api_>`_ - Hook methods implemented by the script to consume a :ref:`mutation fragment stream <scylla-sstable-sstable-content>`;
* `ScyllaDB Lua API <scylla-script-lua-api_>`_ - types and methods exposed to the script to work with ScyllaDB types and values.
.. _scylla-consume-api:
ScyllaDB Consume API
~~~~~~~~~~~~~~~~~~~~~~
These methods represent the glue code between scylla-sstable's C++ code and the Lua script.
Conceptually a script is an implementation of a consumer interface. The script has to implement only the methods it is interested in. Each method has a default implementation in the interface, which simply drops the respective :ref:`mutation fragment <scylla-sstable-sstable-content>`.
For example, a script only interested in partitions can define only :ref:`consume_partition_start() <scylla-consume-partition-start-method>` and nothing else.
Therefore a completely empty script is also valid, although not very useful.
Below you will find the listing of the API methods.
These methods (if provided by the script) will be called by the scylla-sstable runtime for the appropriate events and fragment types.
.. _scylla-consume-stream-start-method:
consume_stream_start(args)
""""""""""""""""""""""""""
* Part of the Consume API. Called on the very start of the stream.
* Parameter is a Lua table containing command line arguments for the script, passed via ``--script-arg``.
* Can be used to initialize global state.
.. _scylla-consume-sstable-start-method:
consume_sstable_start(sst)
""""""""""""""""""""""""""
* Part of the Consume API.
* Called on the start of each SStable.
* The parameter is of type `ScyllaDB.sstable <scylla-sstable-type_>`_.
* When SStables are merged (``--merge``), the parameter is ``nil``.
Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, skipping the content of the sstable (or that of the entire stream if ``--merge`` is used). If ``false``, consumption follows with the content of the sstable.
.. _scylla-consume-partition-start-method:
consume_partition_start(ps)
"""""""""""""""""""""""""""
* Part of the Consume API. Called on the start of each partition.
* The parameter is of type `ScyllaDB.partition_start <scylla-partition-start-type_>`_.
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, skipping the content of the partition. If ``false``, consumption follows with the content of the partition.
consume_static_row(sr)
""""""""""""""""""""""
* Part of the Consume API.
* Called if the partition has a static row.
* The parameter is of type `ScyllaDB.static_row <scylla-static-row-type_>`_.
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, and the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
consume_clustering_row(cr)
""""""""""""""""""""""""""
* Part of the Consume API.
* Called for each clustering row.
* The parameter is of type `ScyllaDB.clustering_row <scylla-clustering-row-type_>`_.
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
consume_range_tombstone_change(crt)
"""""""""""""""""""""""""""""""""""
* Part of the Consume API.
* Called for each range tombstone change.
* The parameter is of type `ScyllaDB.range_tombstone_change <scylla-range-tombstone-change-type_>`_.
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
.. _scylla-consume-partition-end-method:
consume_partition_end()
"""""""""""""""""""""""
* Part of the Consume API.
* Called at the end of the partition.
* Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, the remaining content of the SStable is skipped. If ``false``, consumption follows with the remaining content of the SStable.
.. _scylla-consume-sstable-end-method:
consume_sstable_end()
"""""""""""""""""""""
* Part of the Consume API.
* Called at the end of the SStable.
* Returns whether to stop. If ``true``, `consume_stream_end() <scylla-consume-stream-end-method_>`_ is called, the remaining content of the stream is skipped. If ``false``, consumption follows with the remaining content of the stream.
.. _scylla-consume-stream-end-method:
consume_stream_end()
""""""""""""""""""""
* Part of the Consume API.
* Called at the very end of the stream.
.. _scylla-script-lua-api:
ScyllaDB Lua API
~~~~~~~~~~~~~~~~
In addition to the `ScyllaDB Consume API <scylla-consume-api_>`_, the Lua bindings expose various types and methods that allow you to work with ScyllaDB types and values.
The listing uses the following terminology:
* Attribute - a simple attribute accessible via ``obj.attribute_name``;
* Method - a method operating on an instance of said type, invocable as ``obj:method()``;
* Magic method - magic methods defined in the metatable which define behaviour of these objects w.r.t. `Lua operators and more <http://www.lua.org/manual/5.4/manual.html#2.4>`_;
The format of an attribute description is the following:
.. code-block:: none
:class: hide-copy-button
attribute_name (type) - description
and that of a method:
.. code-block:: none
:class: hide-copy-button
method_name(arg1_type, arg2_type...) (return_type) - description
Magic methods have their signature defined by Lua and so that is not described here (these methods are not used directly anyway).
.. _scylla-atomic-cell-type:
ScyllaDB.atomic_cell
""""""""""""""""""""
Attributes:
* timestamp (integer)
* is_live (boolean) - is the cell live?
* type (string) - one of: ``regular``, ``counter-update``, ``counter-shards``, ``frozen-collection`` or ``collection``.
* has_ttl (boolean) - is the cell expiring?
* ttl (integer) - time to live in seconds, ``nil`` if cell is not expiring.
* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell expires, ``nil`` if cell is not expiring.
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell was deleted, ``nil`` unless cell is dead or expiring.
* value:
- ``nil`` if cell is dead.
- appropriate Lua native type if type == ``regular``.
- integer if type == ``counter-update``.
- `ScyllaDB.counter_shards_value <scylla-counter-shards-value-type_>`_ if type == ``counter-shards``.
A counter-shard table has the following keys:
* id (string)
* value (integer)
* clock (integer)
.. _scylla-clustering-key-type:
ScyllaDB.clustering_key
"""""""""""""""""""""""
Attributes:
* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite clustering key.
Methods:
* to_hex - convert the key to its serialized format, encoded in hex.
Magic methods:
* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
.. _scylla-clustering-row-type:
ScyllaDB.clustering_row
"""""""""""""""""""""""
Attributes:
* key ($TYPE) - the clustering key's value as the appropriate Lua native type.
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - row tombstone, ``nil`` if no tombstone.
* shadowable_tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - shadowable tombstone of the row tombstone, ``nil`` if no tombstone.
* marker (`ScyllaDB.row_marker <scylla-row-marker-type_>`_) - the row marker, ``nil`` if row doesn't have one.
* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
See also:
* `ScyllaDB.unserialize_clustering_key() <scylla-unserialize-clustering-key-method_>`_.
.. _scylla-collection-type:
ScyllaDB.collection
"""""""""""""""""""
Attributes:
* type (string) - always ``collection`` for collection.
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - ``nil`` if no tombstone.
* cells (table) - the collection cells, each collection cell is a table, with a ``key`` and ``value`` attribute. The key entry is the key of the collection cell for actual collections (list, set and map) and is of type `ScyllaDB.data_value <scylla-data-value-type_>`_. For tuples and UDT this is just an empty string. The value entry is the value of the collection cell and is of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_.
.. _scylla-collection-cell-value-type:
ScyllaDB.collection_cell_value
""""""""""""""""""""""""""""""
Attributes:
* key (string) - collection cell key in human readable form.
* value (`ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_) - collection cell value.
.. _scylla-column-definition-type:
ScyllaDB.column_definition
""""""""""""""""""""""""""
Attributes:
* id (integer) - the id of the column.
* name (string) - the name of the column.
* kind (string) - the kind of the column, one of ``partition_key``, ``clustering_key``, ``static_column`` or ``regular_column``.
.. _scylla-counter-shards-value-type:
ScyllaDB.counter_shards_value
"""""""""""""""""""""""""""""
Attributes:
* value (integer) - the total value of the counter (the sum of all the shards).
* shards (table) - the shards making up this counter, a Lua list containing tables, representing shards, with the following key/values:
- id (string) - the shard's id (UUID).
- value (integer) - the shard's value.
- clock (integer) - the shard's logical clock.
Magic methods:
* __tostring - can be converted to string with tostring().
.. _scylla-data-value-type:
ScyllaDB.data_value
"""""""""""""""""""
Attributes:
* value - the value represented as the appropriate Lua type
Magic methods:
* __tostring - can be converted to string with tostring().
.. _scylla-gc-clock-time-point-type:
ScyllaDB.gc_clock_time_point
""""""""""""""""""""""""""""
A time point belonging to the gc_clock, in UTC.
Attributes:
* year (integer) - [1900, +inf).
* month (integer) - [1, 12].
* day (integer) - [1, 31].
* hour (integer) - [0, 23].
* min (integer) - [0, 59].
* sec (integer) - [0, 59].
Magic methods:
* __eq - can be equal compared.
* __lt - can be less compared.
* __le - can be less-or-equal compared.
* __tostring - can be converted to string with tostring().
See also:
* `ScyllaDB.now() <scylla-now-method_>`_.
* `ScyllaDB.time_point_from_string() <scylla-time-point-from-string-method_>`_.
.. _scylla-json-writer-type:
ScyllaDB.json_writer
""""""""""""""""""""
A JSON writer object, with both low-level and high-level APIs.
The low-level API allows you to write custom JSON and it loosely follows the API of `rapidjson::Writer <https://rapidjson.org/classrapidjson_1_1_writer.html>`_ (upon which it is implemented).
The high-level API is for writing :ref:`mutation fragments <scylla-sstable-sstable-content>` as JSON directly, using the built-in JSON conversion logic that is used by :ref:`dump-data <scylla-sstable-dump-data-operation>` operation.
Low level API Methods:
* null() - write a null json value.
* bool(boolean) - write a bool json value.
* int(integer) - write an integer json value.
* double(number) - write a double json value.
* string(string) - write a string json value.
* start_object() - start a json object.
* key(string) - write the key of a json object.
* end_object() - write the end of a json object.
* start_array() - write the start of a json array.
* end_array() - write the end of a json array.
High level API Methods:
* start_stream() - start the stream, call at the very beginning.
* start_sstable() - start an sstable.
* start_partition() - start a partition.
* static_row() - write a static row to the stream.
* clustering_row() - write a clustering row to the stream.
* range_tombstone_change() - write a range tombstone change to the stream.
* end_partition() - end the current partition.
* end_sstable() - end the current sstable.
* end_stream() - end the stream, call at the very end.
.. _scylla-new-json-writer-method:
ScyllaDB.new_json_writer()
""""""""""""""""""""""""""
Create a `ScyllaDB.json_writer <scylla-json-writer-type_>`_ instance.
.. _scylla-new-position-in-partition-method:
ScyllaDB.new_position_in_partition()
""""""""""""""""""""""""""""""""""""
Creates a `ScyllaDB.position_in_partition <scylla-position-in-partition-type_>`_ instance.
Arguments:
* weight (integer) - the weight of the key.
* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, optional.
.. _scylla-new-ring-position-method:
ScyllaDB.new_ring_position()
""""""""""""""""""""""""""""
Creates a `ScyllaDB.ring_position <scylla-ring-position-type_>`_ instance.
Has several overloads:
* ``ScyllaDB.new_ring_position(weight, key)``.
* ``ScyllaDB.new_ring_position(weight, token)``.
* ``ScyllaDB.new_ring_position(weight, key, token)``.
Where:
* weight (integer) - the weight of the key.
* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key.
* token (integer) - the token (of the key if a key is provided).
.. _scylla-now-method:
ScyllaDB.now()
""""""""""""""
Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance, representing the current time.
.. _scylla-partition-key-type:
ScyllaDB.partition_key
""""""""""""""""""""""
Attributes:
* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite partition key.
Methods:
* to_hex - convert the key to its serialized format, encoded in hex.
Magic methods:
* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
See also:
* :ref:`ScyllaDB.unserialize_partition_key() <scylla-unserialize-partition-key-method>`.
* :ref:`ScyllaDB.token_of() <scylla-token-of-method>`.
.. _scylla-partition-start-type:
ScyllaDB.partition_start
""""""""""""""""""""""""
Attributes:
* key - the partition key's value as the appropriate Lua native type.
* token (integer) - the partition key's token.
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - the partition tombstone, ``nil`` if no tombstone.
.. _scylla-position-in-partition-type:
ScyllaDB.position_in_partition
""""""""""""""""""""""""""""""
Currently used only for clustering positions.
Attributes:
* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, ``nil`` if the position in partition represents the min or max clustering positions.
* weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key). If key attribute is ``nil``, the weight is never 0.
Methods:
* tri_cmp - compare this position in partition to another position in partition, returns -1 (``<``), 0 (``==``) or 1 (``>``).
See also:
* `ScyllaDB.new_position_in_partition() <scylla-new-position-in-partition-method_>`_.
.. _scylla-range-tombstone-change-type:
ScyllaDB.range_tombstone_change
"""""""""""""""""""""""""""""""
Attributes:
* key ($TYPE) - the clustering key's value as the appropriate Lua native type.
* key_weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key).
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - tombstone, ``nil`` if no tombstone.
.. _scylla-ring-position-type:
ScyllaDB.ring_position
""""""""""""""""""""""
Attributes:
* token (integer) - the token, ``nil`` if the ring position represents the min or max ring positions.
* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key, ``nil`` if the ring position represents a position before/after a token.
* weight (integer) - weight of the position, either -1 (before key/token), 0 (at key) or 1 (after key/token). If key attribute is ``nil``, the weight is never 0.
Methods:
* tri_cmp - compare this ring position to another ring position, returns -1 (``<``), 0 (``==``) or 1 (``>``).
See also:
* `ScyllaDB.new_ring_position() <scylla-new-ring-position-method_>`_.
.. _scylla-row-marker-type:
ScyllaDB.row_marker
"""""""""""""""""""
Attributes:
* timestamp (integer).
* is_live (boolean) - is the marker live?
* has_ttl (boolean) - is the marker expiring?
* ttl (integer) - time to live in seconds, ``nil`` if marker is not expiring.
* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker expires, ``nil`` if marker is not expiring.
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker was deleted, ``nil`` unless marker is dead or expiring.
.. _scylla-schema-type:
ScyllaDB.schema
"""""""""""""""
Attributes:
* partition_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the partition key.
* clustering_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the clustering key.
* static_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the static columns.
* regular_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the regular columns.
* all_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of all columns.
.. _scylla-sstable-type:
ScyllaDB.sstable
""""""""""""""""
Attributes:
* filename (string) - the full path of the sstable Data component file;
.. _scylla-static-row-type:
ScyllaDB.static_row
"""""""""""""""""""
Attributes:
* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
.. _scylla-time-point-from-string-method:
ScyllaDB.time_point_from_string()
"""""""""""""""""""""""""""""""""
Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance from the passed in string.
The argument is a string in the same format as the CQL timestamp type; see https://en.wikipedia.org/wiki/ISO_8601.
.. _scylla-token-of-method:
ScyllaDB.token_of()
"""""""""""""""""""
Compute and return the token (integer) for a `ScyllaDB.partition_key <scylla-partition-key-type_>`_.
.. _scylla-tombstone-type:
ScyllaDB.tombstone
""""""""""""""""""
Attributes:
* timestamp (integer)
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - the point in time at which the tombstone was deleted.
.. _scylla-unserialize-clustering-key-method:
ScyllaDB.unserialize_clustering_key()
"""""""""""""""""""""""""""""""""""""
Create a `ScyllaDB.clustering_key <scylla-clustering-key-type_>`_ instance.
Argument is a string representing serialized clustering key in hex format.
.. _scylla-unserialize-partition-key-method:
ScyllaDB.unserialize_partition_key()
""""""""""""""""""""""""""""""""""""
Create a `ScyllaDB.partition_key <scylla-partition-key-type_>`_ instance.
Argument is a string representing serialized partition key in hex format.

View File

@@ -667,7 +667,7 @@ write
Writes an SStable based on a description of the content.
The description can be provided in two formats: ``CQL`` and ``JSON``.
The input format can be selected with the ``--input-format`` flag. Default is ``cql``.
In both cases the input is expected to be provided via the file whose path is passed to ``--input-file``.
In both cases the input is expected to be provided via the file whose path is passed to ``--input-file``.
CQL input format
~~~~~~~~~~~~~~~~
@@ -858,9 +858,527 @@ Alternatively, you can provide each key-value pair via a separate ``--script-arg
--script-arg $key1=$value1 --script-arg $key2=$value2
Command line arguments will be received by the :ref:`consume_stream_start() <scylla-consume-stream-start-method>` API method.
Command line arguments will be received by the `consume_stream_start() <scylla-consume-stream-start-method_>`_ API method.
See the `scripting API </operating-scylla/admin-tools/scylla-sstable-script-api/>`_ for more details.
.. _scylla-consume-api:
ScyllaDB Consume API
~~~~~~~~~~~~~~~~~~~~~~
These methods represent the glue code between scylla-sstable's C++ code and the Lua script.
Conceptually a script is an implementation of a consumer interface. The script has to implement only the methods it is interested in. Each method has a default implementation in the interface, which simply drops the respective `mutation fragment <scylla-sstable-sstable-content_>`_.
For example, a script only interested in partitions can define only `consume_partition_start() <scylla-consume-partition-start-method_>`_ and nothing else.
Therefore a completely empty script is also valid, although not very useful.
Below you will find the listing of the API methods.
These methods (if provided by the script) will be called by the scylla-sstable runtime for the appropriate events and fragment types.
.. _scylla-consume-stream-start-method:
consume_stream_start(args)
""""""""""""""""""""""""""
* Part of the Consume API. Called on the very start of the stream.
* Parameter is a Lua table containing command line arguments for the script, passed via ``--script-arg``.
* Can be used to initialize global state.
.. _scylla-consume-sstable-start-method:
consume_sstable_start(sst)
""""""""""""""""""""""""""
* Part of the Consume API.
* Called on the start of each SStable.
* The parameter is of type `ScyllaDB.sstable <scylla-sstable-type_>`_.
* When SStables are merged (``--merge``), the parameter is ``nil``.
Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, skipping the content of the sstable (or that of the entire stream if ``--merge`` is used). If ``false``, consumption follows with the content of the sstable.
.. _scylla-consume-partition-start-method:
consume_partition_start(ps)
"""""""""""""""""""""""""""
* Part of the Consume API. Called on the start of each partition.
* The parameter is of type `ScyllaDB.partition_start <scylla-partition-start-type_>`_.
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, skipping the content of the partition. If ``false``, consumption follows with the content of the partition.
consume_static_row(sr)
""""""""""""""""""""""
* Part of the Consume API.
* Called if the partition has a static row.
* The parameter is of type `ScyllaDB.static_row <scylla-static-row-type_>`_.
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, and the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
consume_clustering_row(cr)
""""""""""""""""""""""""""
* Part of the Consume API.
* Called for each clustering row.
* The parameter is of type `ScyllaDB.clustering_row <scylla-clustering-row-type_>`_.
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
consume_range_tombstone_change(crt)
"""""""""""""""""""""""""""""""""""
* Part of the Consume API.
* Called for each range tombstone change.
* The parameter is of type `ScyllaDB.range_tombstone_change <scylla-range-tombstone-change-type_>`_.
* Returns whether to stop. If ``true``, `consume_partition_end() <scylla-consume-partition-end-method_>`_ is called, the remaining content of the partition is skipped. If ``false``, consumption follows with the remaining content of the partition.
.. _scylla-consume-partition-end-method:
consume_partition_end()
"""""""""""""""""""""""
* Part of the Consume API.
* Called at the end of the partition.
* Returns whether to stop. If ``true``, `consume_sstable_end() <scylla-consume-sstable-end-method_>`_ is called, the remaining content of the SStable is skipped. If ``false``, consumption follows with the remaining content of the SStable.
.. _scylla-consume-sstable-end-method:
consume_sstable_end()
"""""""""""""""""""""
* Part of the Consume API.
* Called at the end of the SStable.
* Returns whether to stop. If ``true``, `consume_stream_end() <scylla-consume-stream-end-method_>`_ is called, the remaining content of the stream is skipped. If ``false``, consumption follows with the remaining content of the stream.
.. _scylla-consume-stream-end-method:
consume_stream_end()
""""""""""""""""""""
* Part of the Consume API.
* Called at the very end of the stream.
ScyllaDB Lua API
~~~~~~~~~~~~~~~~
In addition to the `ScyllaDB Consume API <scylla-consume-api_>`_, the Lua bindings expose various types and methods that allow you to work with ScyllaDB types and values.
The listing uses the following terminology:
* Attribute - a simple attribute accessible via ``obj.attribute_name``;
* Method - a method operating on an instance of said type, invocable as ``obj:method()``;
* Magic method - magic methods defined in the metatable which define behaviour of these objects w.r.t. `Lua operators and more <http://www.lua.org/manual/5.4/manual.html#2.4>`_;
The format of an attribute description is the following:
.. code-block:: none
:class: hide-copy-button
attribute_name (type) - description
and that of a method:
.. code-block:: none
:class: hide-copy-button
method_name(arg1_type, arg2_type...) (return_type) - description
Magic methods have their signature defined by Lua and so that is not described here (these methods are not used directly anyway).
.. _scylla-atomic-cell-type:
ScyllaDB.atomic_cell
""""""""""""""""""""
Attributes:
* timestamp (integer)
* is_live (boolean) - is the cell live?
* type (string) - one of: ``regular``, ``counter-update``, ``counter-shards``, ``frozen-collection`` or ``collection``.
* has_ttl (boolean) - is the cell expiring?
* ttl (integer) - time to live in seconds, ``nil`` if cell is not expiring.
* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell expires, ``nil`` if cell is not expiring.
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which cell was deleted, ``nil`` unless cell is dead or expiring.
* value:
- ``nil`` if cell is dead.
- appropriate Lua native type if type == ``regular``.
- integer if type == ``counter-update``.
- `ScyllaDB.counter_shards_value <scylla-counter-shards-value-type_>`_ if type == ``counter-shards``.
A counter-shard table has the following keys:
* id (string)
* value (integer)
* clock (integer)
.. _scylla-clustering-key-type:
ScyllaDB.clustering_key
"""""""""""""""""""""""
Attributes:
* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite clustering key.
Methods:
* to_hex - convert the key to its serialized format, encoded in hex.
Magic methods:
* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
.. _scylla-clustering-row-type:
ScyllaDB.clustering_row
"""""""""""""""""""""""
Attributes:
* key ($TYPE) - the clustering key's value as the appropriate Lua native type.
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - row tombstone, ``nil`` if no tombstone.
* shadowable_tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - shadowable tombstone of the row tombstone, ``nil`` if no tombstone.
* marker (`ScyllaDB.row_marker <scylla-row-marker-type_>`_) - the row marker, ``nil`` if row doesn't have one.
* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
See also:
* `ScyllaDB.unserialize_clustering_key() <scylla-unserialize-clustering-key-method_>`_.
.. _scylla-collection-type:
ScyllaDB.collection
"""""""""""""""""""
Attributes:
* type (string) - always ``collection`` for collection.
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - ``nil`` if no tombstone.
* cells (table) - the collection cells, each collection cell is a table, with a ``key`` and ``value`` attribute. The key entry is the key of the collection cell for actual collections (list, set and map) and is of type `ScyllaDB.data_value <scylla-data-value-type_>`_. For tuples and UDT this is just an empty string. The value entry is the value of the collection cell and is of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_.
.. _scylla-collection-cell-value-type:
ScyllaDB.collection_cell_value
""""""""""""""""""""""""""""""
Attributes:
* key (string) - collection cell key in human readable form.
* value (`ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_) - collection cell value.
.. _scylla-column-definition-type:
ScyllaDB.column_definition
""""""""""""""""""""""""""
Attributes:
* id (integer) - the id of the column.
* name (string) - the name of the column.
* kind (string) - the kind of the column, one of ``partition_key``, ``clustering_key``, ``static_column`` or ``regular_column``.
.. _scylla-counter-shards-value-type:
ScyllaDB.counter_shards_value
"""""""""""""""""""""""""""""
Attributes:
* value (integer) - the total value of the counter (the sum of all the shards).
* shards (table) - the shards making up this counter, a Lua list containing tables, representing shards, with the following key/values:
- id (string) - the shard's id (UUID).
- value (integer) - the shard's value.
- clock (integer) - the shard's logical clock.
Magic methods:
* __tostring - can be converted to string with tostring().
.. _scylla-data-value-type:
ScyllaDB.data_value
"""""""""""""""""""
Attributes:
* value - the value represented as the appropriate Lua type
Magic methods:
* __tostring - can be converted to string with tostring().
.. _scylla-gc-clock-time-point-type:
ScyllaDB.gc_clock_time_point
""""""""""""""""""""""""""""
A time point belonging to the gc_clock, in UTC.
Attributes:
* year (integer) - [1900, +inf).
* month (integer) - [1, 12].
* day (integer) - [1, 31].
* hour (integer) - [0, 23].
* min (integer) - [0, 59].
* sec (integer) - [0, 59].
Magic methods:
* __eq - can be equal compared.
* __lt - can be less compared.
* __le - can be less-or-equal compared.
* __tostring - can be converted to string with tostring().
See also:
* `ScyllaDB.now() <scylla-now-method_>`_.
* `ScyllaDB.time_point_from_string() <scylla-time-point-from-string-method_>`_.
.. _scylla-json-writer-type:
ScyllaDB.json_writer
""""""""""""""""""""
A JSON writer object, with both low-level and high-level APIs.
The low-level API allows you to write custom JSON and it loosely follows the API of `rapidjson::Writer <https://rapidjson.org/classrapidjson_1_1_writer.html>`_ (upon which it is implemented).
The high-level API is for writing `mutation fragments <scylla-sstable-sstable-content_>`_ as JSON directly, using the built-in JSON conversion logic that is used by the `dump-data <dump-data_>`_ operation.
Low level API Methods:
* null() - write a null json value.
* bool(boolean) - write a bool json value.
* int(integer) - write an integer json value.
* double(number) - write a double json value.
* string(string) - write a string json value.
* start_object() - start a json object.
* key(string) - write the key of a json object.
* end_object() - write the end of a json object.
* start_array() - write the start of a json array.
* end_array() - write the end of a json array.
High level API Methods:
* start_stream() - start the stream, call at the very beginning.
* start_sstable() - start an sstable.
* start_partition() - start a partition.
* static_row() - write a static row to the stream.
* clustering_row() - write a clustering row to the stream.
* range_tombstone_change() - write a range tombstone change to the stream.
* end_partition() - end the current partition.
* end_sstable() - end the current sstable.
* end_stream() - end the stream, call at the very end.
.. _scylla-new-json-writer-method:
ScyllaDB.new_json_writer()
""""""""""""""""""""""""""
Create a `ScyllaDB.json_writer <scylla-json-writer-type_>`_ instance.
.. _scylla-new-position-in-partition-method:
ScyllaDB.new_position_in_partition()
""""""""""""""""""""""""""""""""""""
Creates a `ScyllaDB.position_in_partition <scylla-position-in-partition-type_>`_ instance.
Arguments:
* weight (integer) - the weight of the key.
* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, optional.
.. _scylla-new-ring-position-method:
ScyllaDB.new_ring_position()
""""""""""""""""""""""""""""
Creates a `ScyllaDB.ring_position <scylla-ring-position-type_>`_ instance.
Has several overloads:
* ``ScyllaDB.new_ring_position(weight, key)``.
* ``ScyllaDB.new_ring_position(weight, token)``.
* ``ScyllaDB.new_ring_position(weight, key, token)``.
Where:
* weight (integer) - the weight of the key.
* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key.
* token (integer) - the token (of the key if a key is provided).
.. _scylla-now-method:
ScyllaDB.now()
""""""""""""""
Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance, representing the current time.
.. _scylla-partition-key-type:
ScyllaDB.partition_key
""""""""""""""""""""""
Attributes:
* components (table) - the column values (`ScyllaDB.data_value <scylla-data-value-type_>`_) making up the composite partition key.
Methods:
* to_hex - convert the key to its serialized format, encoded in hex.
Magic methods:
* __tostring - can be converted to string with tostring(), uses the built-in operator<< in ScyllaDB.
See also:
* :ref:`ScyllaDB.unserialize_partition_key() <scylla-unserialize-partition-key-method>`.
* :ref:`ScyllaDB.token_of() <scylla-token-of-method>`.
.. _scylla-partition-start-type:
ScyllaDB.partition_start
""""""""""""""""""""""""
Attributes:
* key - the partition key's value as the appropriate Lua native type.
* token (integer) - the partition key's token.
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - the partition tombstone, ``nil`` if no tombstone.
.. _scylla-position-in-partition-type:
ScyllaDB.position_in_partition
""""""""""""""""""""""""""""""
Currently used only for clustering positions.
Attributes:
* key (`ScyllaDB.clustering_key <scylla-clustering-key-type_>`_) - the clustering key, ``nil`` if the position in partition represents the min or max clustering positions.
* weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key). If key attribute is ``nil``, the weight is never 0.
Methods:
* tri_cmp - compare this position in partition to another position in partition, returns -1 (``<``), 0 (``==``) or 1 (``>``).
See also:
* `ScyllaDB.new_position_in_partition() <scylla-new-position-in-partition-method_>`_.
.. _scylla-range-tombstone-change-type:
ScyllaDB.range_tombstone_change
"""""""""""""""""""""""""""""""
Attributes:
* key ($TYPE) - the clustering key's value as the appropriate Lua native type.
* key_weight (integer) - weight of the position, either -1 (before key), 0 (at key) or 1 (after key).
* tombstone (`ScyllaDB.tombstone <scylla-tombstone-type_>`_) - tombstone, ``nil`` if no tombstone.
.. _scylla-ring-position-type:
ScyllaDB.ring_position
""""""""""""""""""""""
Attributes:
* token (integer) - the token, ``nil`` if the ring position represents the min or max ring positions.
* key (`ScyllaDB.partition_key <scylla-partition-key-type_>`_) - the partition key, ``nil`` if the ring position represents a position before/after a token.
* weight (integer) - weight of the position, either -1 (before key/token), 0 (at key) or 1 (after key/token). If key attribute is ``nil``, the weight is never 0.
Methods:
* tri_cmp - compare this ring position to another ring position, returns -1 (``<``), 0 (``==``) or 1 (``>``).
See also:
* `ScyllaDB.new_ring_position() <scylla-new-ring-position-method_>`_.
.. _scylla-row-marker-type:
ScyllaDB.row_marker
"""""""""""""""""""
Attributes:
* timestamp (integer).
* is_live (boolean) - is the marker live?
* has_ttl (boolean) - is the marker expiring?
* ttl (integer) - time to live in seconds, ``nil`` if marker is not expiring.
* expiry (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker expires, ``nil`` if marker is not expiring.
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - time at which marker was deleted, ``nil`` unless marker is dead or expiring.
.. _scylla-schema-type:
ScyllaDB.schema
"""""""""""""""
Attributes:
* partition_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the partition key.
* clustering_key_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the key columns making up the clustering key.
* static_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the static columns.
* regular_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of the regular columns.
* all_columns (table) - list of `ScyllaDB.column_definition <scylla-column-definition-type_>`_ of all columns.
.. _scylla-sstable-type:
ScyllaDB.sstable
""""""""""""""""
Attributes:
* filename (string) - the full path of the sstable Data component file.
.. _scylla-static-row-type:
ScyllaDB.static_row
"""""""""""""""""""
Attributes:
* cells (table) - table of cells, where keys are the column names and the values are either of type `ScyllaDB.atomic_cell <scylla-atomic-cell-type_>`_ or `ScyllaDB.collection <scylla-collection-type_>`_.
.. _scylla-time-point-from-string-method:
ScyllaDB.time_point_from_string()
"""""""""""""""""""""""""""""""""
Create a `ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_ instance from the passed in string.
The argument is a string, using the same format as the CQL timestamp type, see https://en.wikipedia.org/wiki/ISO_8601.
.. _scylla-token-of-method:
ScyllaDB.token_of()
"""""""""""""""""""
Compute and return the token (integer) for a `ScyllaDB.partition_key <scylla-partition-key-type_>`_.
.. _scylla-tombstone-type:
ScyllaDB.tombstone
""""""""""""""""""
Attributes:
* timestamp (integer)
* deletion_time (`ScyllaDB.gc_clock_time_point <scylla-gc-clock-time-point-type_>`_) - the point in time at which the tombstone was deleted.
.. _scylla-unserialize-clustering-key-method:
ScyllaDB.unserialize_clustering_key()
"""""""""""""""""""""""""""""""""""""
Create a `ScyllaDB.clustering_key <scylla-clustering-key-type_>`_ instance.
Argument is a string representing serialized clustering key in hex format.
.. _scylla-unserialize-partition-key-method:
ScyllaDB.unserialize_partition_key()
""""""""""""""""""""""""""""""""""""
Create a `ScyllaDB.partition_key <scylla-partition-key-type_>`_ instance.
Argument is a string representing serialized partition key in hex format.
Examples
~~~~~~~~
@@ -879,7 +1397,7 @@ SSTables which are already on the designated version are skipped. To force rewri
Output SSTables are written to the path provided by the ``--output-dir`` flag, or to the current directory if not specified.
This directory is expected to exist and be empty. If not empty the tool will refuse to run. This can be overridden with the ``--unsafe-accept-nonempty-output-dir`` flag.
It is strongly recommended to use the system schema tables as the schema source for this command, see the :ref:`schema options <scylla-sstable-schema>` for more details.
It is strongly recommended to use the system schema tables as the schema source for this command, see the `schema options <scylla-sstable-schema_>`_ for more details.
A schema which is good enough to read the SSTable and dump its content, may not be good enough to write its content back verbatim.
An incomplete or incorrect schema can lead to the tool crashing or even data loss.

View File

@@ -111,7 +111,9 @@ should follow this format:
.. code-block:: yaml
object_storage_endpoints:
- name: https://<endpoint_address_or_domain_name>[:<port_number>]
- name: <endpoint_address_or_domain_name>
port: <port_number>
https: <true_or_false> # optional
aws_region: <region_name> # optional, e.g. us-east-1
iam_role_arn: <iam_role> # optional
@@ -121,7 +123,9 @@ Example:
.. code:: yaml
object_storage_endpoints:
- name: https://s3.us-east-1.amazonaws.com
- name: s3.us-east-1.amazonaws.com
port: 443
https: true
aws_region: us-east-1
iam_role_arn: arn:aws:iam::123456789012:instance-profile/my-instance-instance-profile

View File

@@ -14,13 +14,12 @@ Enable ScyllaDB :doc:`Authentication </operating-scylla/security/authentication>
Enabling Audit
---------------
By default, auditing is **enabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
By default, auditing is **disabled**. Enabling auditing is controlled by the ``audit:`` parameter in the ``scylla.yaml`` file.
You can set the following options:
* ``none`` - Audit is disabled (default).
* ``table`` - Audit is enabled, and messages are stored in a Scylla table.
* ``syslog`` - Audit is enabled, and messages are sent to Syslog.
* ``syslog,table`` - Audit is enabled, and messages are stored in a Scylla table and sent to Syslog.
Configuring any other value results in an error at Scylla startup.

View File

@@ -178,7 +178,6 @@ public:
gms::feature strongly_consistent_tables { *this, "STRONGLY_CONSISTENT_TABLES"sv };
gms::feature client_routes { *this, "CLIENT_ROUTES"sv };
gms::feature removenode_with_left_token_ring { *this, "REMOVENODE_WITH_LEFT_TOKEN_RING"sv };
gms::feature size_based_load_balancing { *this, "SIZE_BASED_LOAD_BALANCING"sv };
public:
const std::unordered_map<sstring, std::reference_wrapper<feature>>& registered_features() const;

View File

@@ -14,6 +14,7 @@
#include <seastar/core/sharded.hh>
#include <seastar/core/abort_source.hh>
#include "utils/log.hh"
#include "utils/s3/creds.hh"
#include "seastarx.hh"
#include <boost/program_options.hpp>
#include <yaml-cpp/yaml.h>

View File

@@ -8,112 +8,76 @@
#pragma once
#include "service/tablet_allocator_fwd.hh"
#include "locator/topology.hh"
#include "locator/token_metadata.hh"
#include "locator/tablets.hh"
#include "utils/stall_free.hh"
#include "utils/extremum_tracking.hh"
#include "utils/div_ceil.hh"
#include "utils/pretty_printers.hh"
#include <absl/container/btree_set.h>
#include <seastar/util/defer.hh>
#include <optional>
#include <vector>
namespace locator {
struct disk_usage {
using load_type = double; // Disk usage factor (0.0 to 1.0)
uint64_t capacity = 0;
uint64_t used = 0;
load_type get_load() const {
if (capacity == 0) {
return 0;
}
return load_type(used) / capacity;
}
};
/// A data structure which keeps track of load associated with data ownership
/// on shards of the whole cluster.
class load_sketch {
using shard_id = seastar::shard_id;
using load_type = disk_usage::load_type;
using load_type = ssize_t; // In tablets.
struct shard_load {
shard_id id;
disk_usage du;
size_t tablet_count = 0;
// Returns storage utilization for the shard
load_type get_load() const {
return du.get_load();
}
load_type load;
};
// Less-comparator which orders by load first (ascending), and then by shard id (ascending).
struct shard_load_cmp {
bool operator()(const shard_load& shard_a, const shard_load& shard_b) const {
auto load_a = shard_a.get_load();
auto load_b = shard_b.get_load();
return load_a == load_b ? shard_a.id < shard_b.id : load_a < load_b;
bool operator()(const shard_load& a, const shard_load& b) const {
return a.load == b.load ? a.id < b.id : a.load < b.load;
}
};
struct node_load {
std::vector<shard_load> _shards;
absl::btree_set<shard_load, shard_load_cmp> _shards_by_load;
disk_usage _du;
size_t _tablet_count = 0;
std::vector<load_type> _shards;
load_type _load = 0;
// These can be false only when _load_stats != nullptr
bool _has_valid_disk_capacity = true;
bool _has_all_tablet_sizes = true;
node_load(size_t shard_count, uint64_t capacity)
: _shards(shard_count) {
_du.capacity = capacity;
uint64_t shard_capacity = capacity / shard_count;
node_load(size_t shard_count) : _shards(shard_count) {
for (shard_id i = 0; i < shard_count; ++i) {
_shards[i].id = i;
_shards[i].du.capacity = shard_capacity;
_shards[i] = 0;
}
}
void update_shard_load(shard_id shard, ssize_t tablet_count_delta, int64_t tablet_size_delta) {
_shards_by_load.erase(_shards[shard]);
_shards[shard].tablet_count += tablet_count_delta;
_shards[shard].du.used += tablet_size_delta;
_shards_by_load.insert(_shards[shard]);
_du.used += tablet_size_delta;
_tablet_count += tablet_count_delta;
void update_shard_load(shard_id shard, load_type load_delta) {
_load += load_delta;
auto old_load = _shards[shard];
auto new_load = old_load + load_delta;
_shards_by_load.erase(shard_load{shard, old_load});
_shards[shard] = new_load;
_shards_by_load.insert(shard_load{shard, new_load});
}
void populate_shards_by_load() {
_shards_by_load.clear();
_shards_by_load.insert(_shards.begin(), _shards.end());
for (shard_id i = 0; i < _shards.size(); ++i) {
_shards_by_load.insert(shard_load{i, _shards[i]});
}
}
// Returns storage utilization for the node
load_type get_load() const noexcept {
return _du.get_load();
load_type& load() noexcept {
return _load;
}
const load_type& load() const noexcept {
return _load;
}
};
std::unordered_map<host_id, node_load> _nodes;
token_metadata_ptr _tm;
load_stats_ptr _load_stats;
uint64_t _default_tablet_size = service::default_target_tablet_size;
uint64_t _minimal_tablet_size = 0;
// When set to true, it will use gross disk capacity instead of effective_capacity and
// treat all tablet as having the same size: _default_tablet_size
bool _force_capacity_based_load = false;
private:
tablet_replica_set get_replicas_for_tablet_load(const tablet_info& ti, const tablet_transition_info* trinfo) const {
// We reflect migrations in the load as if they already happened,
@@ -121,34 +85,10 @@ private:
return trinfo ? trinfo->next : ti.replicas;
}
std::optional<uint64_t> get_disk_capacity_for_node(host_id node) {
if (_load_stats) {
if (_load_stats->tablet_stats.contains(node) && !_force_capacity_based_load) {
return _load_stats->tablet_stats.at(node).effective_capacity;
} else if (_load_stats->capacity.contains(node)) {
return _load_stats->capacity.at(node);
}
}
return std::nullopt;
}
std::optional<uint64_t> get_tablet_size(host_id host, const range_based_tablet_id& rb_tid, const tablet_info& ti, const tablet_transition_info* trinfo) const {
if (_force_capacity_based_load) {
return _default_tablet_size;
}
std::optional<uint64_t> tablet_size_opt;
if (_load_stats) {
tablet_size_opt = _load_stats->get_tablet_size_in_transition(host, rb_tid, ti, trinfo);
}
return tablet_size_opt;
}
future<> populate_table(table_id table, const tablet_map& tmap, std::optional<host_id> host, std::optional<sstring> only_dc) {
future<> populate_table(const tablet_map& tmap, std::optional<host_id> host, std::optional<sstring> only_dc) {
const topology& topo = _tm->get_topology();
co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) -> future<> {
auto trinfo = tmap.get_tablet_transition_info(tid);
for (auto&& replica : get_replicas_for_tablet_load(ti, trinfo)) {
for (auto&& replica : get_replicas_for_tablet_load(ti, tmap.get_tablet_transition_info(tid))) {
if (host && *host != replica.host) {
continue;
}
@@ -157,41 +97,21 @@ private:
if (only_dc && node->dc_rack().dc != *only_dc) {
continue;
}
auto disk_capacity_opt = get_disk_capacity_for_node(replica.host);
auto [i, _] = _nodes.emplace(replica.host, node_load{node->get_shard_count(), disk_capacity_opt.value_or(_default_tablet_size)});
if (!disk_capacity_opt && _load_stats) {
i->second._has_valid_disk_capacity = false;
}
_nodes.emplace(replica.host, node_load{node->get_shard_count()});
}
node_load& n = _nodes.at(replica.host);
if (replica.shard < n._shards.size()) {
const range_based_tablet_id rb_tid {table, tmap.get_token_range(tid)};
auto tablet_size_opt = get_tablet_size(replica.host, rb_tid, ti, trinfo);
if (!tablet_size_opt && _load_stats) {
n._has_all_tablet_sizes = false;
}
const uint64_t tablet_size = std::max(tablet_size_opt.value_or(_default_tablet_size), _minimal_tablet_size);
n._du.used += tablet_size;
n._tablet_count++;
n._shards[replica.shard].du.used += tablet_size;
n._shards[replica.shard].tablet_count++;
n.load() += 1;
n._shards[replica.shard] += 1;
// Note: as an optimization, _shards_by_load is populated later in populate_shards_by_load()
}
}
return make_ready_future<>();
});
}
void throw_on_incomplete_data(host_id host, bool only_check_disk_capacity = false) const {
if (!has_complete_data(host, only_check_disk_capacity)) {
throw std::runtime_error(format("Can't provide accurate load computation with incomplete load_stats for host: {}", host));
}
}
public:
load_sketch(token_metadata_ptr tm, load_stats_ptr load_stats = {}, uint64_t default_tablet_size = service::default_target_tablet_size)
: _tm(std::move(tm))
, _load_stats(std::move(load_stats))
, _default_tablet_size(default_tablet_size) {
load_sketch(token_metadata_ptr tm)
: _tm(std::move(tm)) {
}
future<> populate(std::optional<host_id> host = std::nullopt,
@@ -212,11 +132,11 @@ public:
if (only_table) {
if (_tm->tablets().has_tablet_map(*only_table)) {
auto& tmap = _tm->tablets().get_tablet_map(*only_table);
co_await populate_table(*only_table, tmap, host, only_dc);
co_await populate_table(tmap, host, only_dc);
}
} else {
for (const auto& [table, tmap] : _tm->tablets().all_tables_ungrouped()) {
co_await populate_table(table, *tmap, host, only_dc);
co_await populate_table(*tmap, host, only_dc);
}
}
@@ -229,37 +149,12 @@ public:
return populate(std::nullopt, std::nullopt, dc);
}
shard_id next_shard(host_id node, size_t tablet_count, uint64_t tablet_size_sum) {
shard_id next_shard(host_id node) {
auto shard = get_least_loaded_shard(node);
pick(node, shard, tablet_count, tablet_size_sum);
pick(node, shard);
return shard;
}
bool has_complete_data(host_id node, bool only_check_disk_capacity = false) const {
if (!_nodes.contains(node)) {
return false;
}
auto& n = _nodes.at(node);
return n._has_valid_disk_capacity && (only_check_disk_capacity || n._has_all_tablet_sizes);
}
void ignore_incomplete_data(host_id node) {
if (!_nodes.contains(node)) {
return;
}
auto& n = _nodes.at(node);
n._has_valid_disk_capacity = true;
n._has_all_tablet_sizes = true;
}
void set_minimal_tablet_size(uint64_t min_ts) {
_minimal_tablet_size = min_ts;
}
void set_force_capacity_based_load(bool force_capacity_based_load) {
_force_capacity_based_load = force_capacity_based_load;
}
node_load& ensure_node(host_id node) {
if (!_nodes.contains(node)) {
const topology& topo = _tm->get_topology();
@@ -267,85 +162,63 @@ public:
if (shard_count == 0) {
throw std::runtime_error(format("Shard count not known for node {}", node));
}
auto disk_capacity_opt = get_disk_capacity_for_node(node);
auto [i, _] = _nodes.emplace(node, node_load{shard_count, disk_capacity_opt.value_or(_default_tablet_size)});
auto [i, _] = _nodes.emplace(node, node_load{shard_count});
i->second.populate_shards_by_load();
if (!disk_capacity_opt && _load_stats) {
i->second._has_valid_disk_capacity = false;
}
}
return _nodes.at(node);
}
shard_id get_least_loaded_shard(host_id node) {
auto& n = ensure_node(node);
throw_on_incomplete_data(node);
return n._shards_by_load.begin()->id;
const shard_load& s = *n._shards_by_load.begin();
return s.id;
}
shard_id get_most_loaded_shard(host_id node) {
auto& n = ensure_node(node);
throw_on_incomplete_data(node);
return std::prev(n._shards_by_load.end())->id;
const shard_load& s = *std::prev(n._shards_by_load.end());
return s.id;
}
void unload(host_id node, shard_id shard, size_t tablet_count_delta, uint64_t tablet_sizes_delta) {
throw_on_incomplete_data(node);
void unload(host_id node, shard_id shard) {
auto& n = _nodes.at(node);
n.update_shard_load(shard, -ssize_t(tablet_count_delta), -int64_t(tablet_sizes_delta));
n.update_shard_load(shard, -1);
}
void pick(host_id node, shard_id shard, size_t tablet_count_delta, uint64_t tablet_sizes_delta) {
throw_on_incomplete_data(node);
void pick(host_id node, shard_id shard) {
auto& n = _nodes.at(node);
n.update_shard_load(shard, tablet_count_delta, tablet_sizes_delta);
n.update_shard_load(shard, 1);
}
load_type get_load(host_id node) const {
if (!_nodes.contains(node)) {
return 0;
}
throw_on_incomplete_data(node);
return _nodes.at(node).get_load();
return _nodes.at(node).load();
}
uint64_t get_tablet_count(host_id node) const {
if (!_nodes.contains(node)) {
return 0;
load_type total_load() const {
load_type total = 0;
for (auto&& n : _nodes) {
total += n.second.load();
}
return _nodes.at(node)._tablet_count;
return total;
}
uint64_t get_avg_tablet_count(host_id node) const {
load_type get_avg_shard_load(host_id node) const {
if (!_nodes.contains(node)) {
return 0;
}
auto& n = _nodes.at(node);
return div_ceil(n._tablet_count, n._shards.size());
return div_ceil(n.load(), n._shards.size());
}
double get_real_avg_tablet_count(host_id node) const {
double get_real_avg_shard_load(host_id node) const {
if (!_nodes.contains(node)) {
return 0;
}
auto& n = _nodes.at(node);
return double(n._tablet_count) / n._shards.size();
}
uint64_t get_disk_used(host_id node) const {
if (!_nodes.contains(node)) {
return 0;
}
throw_on_incomplete_data(node);
return _nodes.at(node)._du.used;
}
uint64_t get_capacity(host_id node) const {
if (!_nodes.contains(node)) {
return 0;
}
throw_on_incomplete_data(node, true);
return _nodes.at(node)._du.capacity;
return double(n.load()) / n._shards.size();
}
shard_id get_shard_count(host_id node) const {
@@ -358,18 +231,17 @@ public:
// Returns the difference in tablet count between highest-loaded shard and lowest-loaded shard.
// Returns 0 when shards are perfectly balanced.
// Returns 1 when shards are imbalanced, but it's not possible to balance them.
size_t get_shard_tablet_count_imbalance(host_id node) const {
auto minmax = get_shard_minmax_tablet_count(node);
return minmax.max() - minmax.min();
// Returns the spread between the highest and the lowest per-shard load on
// `node`, i.e. max - min of the values collected by get_shard_minmax().
// A result of 0 means every shard carries the same load.
load_type get_shard_imbalance(host_id node) const {
auto minmax = get_shard_minmax(node);
// Subtract the minimum, not the maximum: max() - max() is always 0 and
// would hide any imbalance from callers.
return minmax.max() - minmax.min();
}
min_max_tracker<load_type> get_shard_minmax(host_id node) const {
min_max_tracker<load_type> minmax;
if (_nodes.contains(node)) {
throw_on_incomplete_data(node);
auto& n = _nodes.at(node);
for (auto&& shard: n._shards) {
minmax.update(shard.get_load());
for (auto&& load: n._shards) {
minmax.update(load);
}
} else {
minmax.update(0);
@@ -377,44 +249,18 @@ public:
return minmax;
}
min_max_tracker<size_t> get_shard_minmax_tablet_count(host_id node) const {
min_max_tracker<size_t> minmax;
if (_nodes.contains(node)) {
auto& n = _nodes.at(node);
for (auto&& shard: n._shards) {
minmax.update(shard.tablet_count);
}
} else {
minmax.update(0);
}
return minmax;
}
// Returns nullopt if node is not known, or we don't have valid disk capacity.
std::optional<load_type> get_allocated_utilization(host_id node) const {
if (!_nodes.contains(node) || !has_complete_data(node, true)) {
// Returns nullopt if capacity is not known.
std::optional<double> get_allocated_utilization(host_id node, const locator::load_stats& stats, uint64_t target_tablet_size) const {
if (!_nodes.contains(node)) {
return std::nullopt;
}
const node_load& n = _nodes.at(node);
return load_type(n._tablet_count * _default_tablet_size) / n._du.capacity;
}
// Returns nullopt if node is not known, or we don't have tablet sizes or valid disk capacity.
std::optional<load_type> get_storage_utilization(host_id node) const {
if (!_nodes.contains(node) || !has_complete_data(node)) {
auto& n = _nodes.at(node);
if (!stats.capacity.contains(node)) {
return std::nullopt;
}
return _nodes.at(node).get_load();
auto capacity = stats.capacity.at(node);
return capacity > 0 ? double(n.load() * target_tablet_size) / capacity : 0;
}
};
} // namespace locator
template<>
struct fmt::formatter<locator::disk_usage> : fmt::formatter<string_view> {
template <typename FormatContext>
auto format(const locator::disk_usage& du, FormatContext& ctx) const {
return fmt::format_to(ctx.out(), "cap: {:i} used: {:i} load: {}",
utils::pretty_printed_data_size(du.capacity), utils::pretty_printed_data_size(du.used), du.get_load());
}
};

View File

@@ -403,7 +403,7 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_racks(schema_ptr s
if (node.dc_rack().dc == dc && is_rack_to_drop(node.dc_rack().rack)) {
tablet_logger.debug("drop_tablets_in_rack {}.{} tablet_id={} dc={} rack={} removing replica: {}",
s->ks_name(), s->cf_name(), tb, node.dc_rack().dc, node.dc_rack().rack, tr);
load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
load.unload(tr.host, tr.shard);
} else {
filtered.emplace_back(tr);
}
@@ -433,7 +433,7 @@ tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
// Assume that if there was a diff to add a rack, we don't already have a replica
// in the target rack so all nodes in the rack are eligible.
// FIXME: pick based on storage utilization: https://github.com/scylladb/scylladb/issues/26366
auto node_load = load.get_real_avg_tablet_count(node.get().host_id());
auto node_load = load.get_real_avg_shard_load(node.get().host_id());
if (node_load < min_load) {
min_load = node_load;
min_node = node.get().host_id();
@@ -445,7 +445,7 @@ tablet_replica_set network_topology_strategy::add_tablets_in_racks(schema_ptr s,
fmt::format("No candidate node in rack {}.{} to allocate tablet replica", dc, rack));
}
auto new_replica = tablet_replica{min_node, load.next_shard(min_node, 1, service::default_target_tablet_size)};
auto new_replica = tablet_replica{min_node, load.next_shard(min_node)};
new_replicas.push_back(new_replica);
tablet_logger.trace("add_tablet_in_rack {}.{} tablet_id={} dc={} rack={} load={} new_replica={}",
@@ -502,7 +502,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
const auto& host_id = node.get().host_id();
if (!existing.contains(host_id)) {
// FIXME: https://github.com/scylladb/scylladb/issues/26366
candidate.nodes.emplace_back(host_id, load.get_avg_tablet_count(host_id));
candidate.nodes.emplace_back(host_id, load.get_avg_shard_load(host_id));
}
}
if (candidate.nodes.empty()) {
@@ -552,7 +552,7 @@ future<tablet_replica_set> network_topology_strategy::add_tablets_in_dc(schema_p
s->ks_name(), s->cf_name(), tb.id, rack, dc, dc_node_count, dc_rf));
}
auto host_id = nodes.back().host;
auto replica = tablet_replica{host_id, load.next_shard(host_id, 1, service::default_target_tablet_size)};
auto replica = tablet_replica{host_id, load.next_shard(host_id)};
const auto& node = tm->get_topology().get_node(host_id);
auto inserted = replicas_per_rack[node.dc_rack().rack].insert(host_id).second;
// Sanity check that a node is not used more than once
@@ -614,7 +614,7 @@ tablet_replica_set network_topology_strategy::drop_tablets_in_dc(schema_ptr s, c
if (topo.get_node(tr.host).dc_rack().dc != dc || ++nodes_in_dc <= dc_rf) {
filtered.emplace_back(tr);
} else {
load.unload(tr.host, tr.shard, 1, service::default_target_tablet_size);
load.unload(tr.host, tr.shard);
}
}
return filtered;

View File

@@ -927,56 +927,6 @@ std::optional<uint64_t> load_stats::get_tablet_size(host_id host, const range_ba
return std::nullopt;
}
std::optional<uint64_t> load_stats::get_tablet_size_in_transition(host_id host, const range_based_tablet_id& rb_tid, const tablet_info& ti, const tablet_transition_info* trinfo) const {
std::optional<uint64_t> tablet_size_opt;
tablet_size_opt = get_tablet_size(host, rb_tid);
if (tablet_size_opt) {
return tablet_size_opt;
}
// If the tablet is in transition,
// try to find it on the leaving replica, in case of tablet migration,
// or get the avg tablet size of all the replicas, in case we have a rebuild
if (trinfo) {
switch (trinfo->transition) {
case tablet_transition_kind::migration:
// Search for the tablet size on leaving replica
if (trinfo->pending_replica && trinfo->pending_replica->host == host) {
if (auto leaving_replica = get_leaving_replica(ti, *trinfo)) {
tablet_size_opt = get_tablet_size(leaving_replica->host, rb_tid);
} else {
on_internal_error_noexcept(tablet_logger, ::format("No leaving replica for tablet migration in table {}. ti.replicas: {} trinfo->next: {}",
rb_tid.table, ti.replicas, trinfo->next));
}
}
break;
case tablet_transition_kind::rebuild:
[[fallthrough]];
case tablet_transition_kind::rebuild_v2: {
// Get the avg tablet size from the available replicas
size_t replica_count = 0;
uint64_t tablet_size_sum = 0;
for (auto& replica : ti.replicas) {
auto new_tablet_size_opt = get_tablet_size(replica.host, rb_tid);
if (new_tablet_size_opt) {
tablet_size_sum += *new_tablet_size_opt;
replica_count++;
}
}
if (replica_count) {
tablet_size_opt = tablet_size_sum / replica_count;
}
break;
}
case tablet_transition_kind::intranode_migration:
[[fallthrough]];
case tablet_transition_kind::repair:
break;
}
}
return tablet_size_opt;
}
lw_shared_ptr<load_stats> load_stats::reconcile_tablets_resize(const std::unordered_set<table_id>& tables, const token_metadata& old_tm, const token_metadata& new_tm) const {
lw_shared_ptr<load_stats> reconciled_stats { make_lw_shared<load_stats>(*this) };
load_stats& new_stats = *reconciled_stats;

View File

@@ -489,12 +489,6 @@ struct load_stats {
std::optional<uint64_t> get_tablet_size(host_id host, const range_based_tablet_id& rb_tid) const;
// Returns the tablet size on the given host. If the tablet size is not found on the host, we will search for it on
// other hosts based on the tablet transition info:
// - if the tablet is in migration, and the given host is pending, the tablet size will be searched on the leaving replica
// - if the tablet is being rebuilt, we will return the average tablet size of all the replicas
std::optional<uint64_t> get_tablet_size_in_transition(host_id host, const range_based_tablet_id& rb_tid, const tablet_info& ti, const tablet_transition_info* trinfo) const;
// Modifies the tablet sizes in load_stats for the given table after a split or merge. The old_tm argument has
// to contain the token_metadata pre-resize. The function returns load_stats with tablet token ranges
// corresponding to the post-resize tablet_map.

View File

@@ -959,6 +959,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
auto ip = utils::resolve(cfg->prometheus_address || cfg->listen_address, family, preferred).get();
prometheus::config pctx;
pctx.metric_help = "Scylla server statistics";
pctx.prefix = cfg->prometheus_prefix();
pctx.allow_protobuf = cfg->prometheus_allow_protobuf();
prometheus::start(prometheus_server, pctx).get();
@@ -1790,7 +1791,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
});
checkpoint(stop_signal, "starting auth cache");
auth_cache.start(std::ref(qp), std::ref(stop_signal.as_sharded_abort_source())).get();
auth_cache.start(std::ref(qp)).get();
auto stop_auth_cache = defer_verbose_shutdown("auth cache", [&] {
auth_cache.stop().get();
});
@@ -2526,7 +2527,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
api::set_server_service_levels(ctx, cql_server_ctl, qp).get();
alternator::controller alternator_ctl(gossiper, proxy, ss, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group);
alternator::controller alternator_ctl(gossiper, proxy, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group);
// Register at_exit last, so that storage_service::drain_on_shutdown will be called first
auto do_drain = defer_verbose_shutdown("local storage", [&ss] {

View File

@@ -1292,7 +1292,7 @@ future<std::tuple<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation
messaging_service::make_sink_and_source_for_stream_mutation_fragments(table_schema_version schema_id, streaming::plan_id plan_id, table_id cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, service::session_id session, locator::host_id id) {
using value_type = std::tuple<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>;
if (is_shutting_down()) {
return make_exception_future<value_type>(rpc::closed_error("local node is shutting down"));
return make_exception_future<value_type>(rpc::closed_error());
}
auto rpc_client = get_rpc_client(messaging_verb::STREAM_MUTATION_FRAGMENTS, addr_for_host_id(id), id);
return rpc_client->make_stream_sink<netw::serializer, frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>().then([this, session, plan_id, schema_id, cf_id, estimated_partitions, reason, rpc_client] (rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd> sink) mutable {
@@ -1321,7 +1321,7 @@ rpc::sink<streaming::stream_blob_cmd_data> messaging_service::make_sink_for_stre
future<std::tuple<rpc::sink<streaming::stream_blob_cmd_data>, rpc::source<streaming::stream_blob_cmd_data>>>
messaging_service::make_sink_and_source_for_stream_blob(streaming::stream_blob_meta meta, locator::host_id id) {
if (is_shutting_down()) {
co_await coroutine::return_exception(rpc::closed_error("local node is shutting down"));
co_await coroutine::return_exception(rpc::closed_error());
}
auto rpc_client = get_rpc_client(messaging_verb::STREAM_BLOB, addr_for_host_id(id), id);
auto sink = co_await rpc_client->make_stream_sink<netw::serializer, streaming::stream_blob_cmd_data>();
@@ -1370,7 +1370,7 @@ future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wir
messaging_service::make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, locator::host_id id) {
auto verb = messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM;
if (is_shutting_down()) {
return make_exception_future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>>>(rpc::closed_error("local node is shutting down"));
return make_exception_future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>>>(rpc::closed_error());
}
auto rpc_client = get_rpc_client(verb, addr_for_host_id(id), id);
return do_make_sink_source<repair_hash_with_cmd, repair_row_on_wire_with_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
@@ -1392,7 +1392,7 @@ future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_str
messaging_service::make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, locator::host_id id) {
auto verb = messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM;
if (is_shutting_down()) {
return make_exception_future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>>(rpc::closed_error("local node is shutting down"));
return make_exception_future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>>(rpc::closed_error());
}
auto rpc_client = get_rpc_client(verb, addr_for_host_id(id), id);
return do_make_sink_source<repair_row_on_wire_with_cmd, repair_stream_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
@@ -1414,7 +1414,7 @@ future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd
messaging_service::make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, locator::host_id id) {
auto verb = messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM;
if (is_shutting_down()) {
return make_exception_future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>>(rpc::closed_error("local node is shutting down"));
return make_exception_future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>>(rpc::closed_error());
}
auto rpc_client = get_rpc_client(verb, addr_for_host_id(id), id);
return do_make_sink_source<repair_stream_cmd, repair_hash_with_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());

View File

@@ -127,21 +127,20 @@ auto send_message(messaging_service* ms, messaging_verb verb, std::optional<loca
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
if (ms->is_shutting_down()) {
return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
return futurator::make_exception_future(rpc::closed_error());
}
auto rpc_client_ptr = ms->get_rpc_client(verb, id, host_id);
auto& rpc_client = *rpc_client_ptr;
return rpc_handler(rpc_client, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), id, host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (std::exception_ptr&& eptr) {
ms->increment_dropped_messages(verb);
if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
if (try_catch<rpc::closed_error>(eptr)) {
// This is a transport error
if (host_id) {
ms->remove_error_rpc_client(verb, *host_id);
} else {
ms->remove_error_rpc_client(verb, id);
}
return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
host_id.value_or(locator::host_id{}), id.addr, exp->what())));
return futurator::make_exception_future(std::move(eptr));
} else {
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
return futurator::make_exception_future(std::move(eptr));
@@ -166,21 +165,20 @@ auto send_message_timeout(messaging_service* ms, messaging_verb verb, std::optio
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
if (ms->is_shutting_down()) {
return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
return futurator::make_exception_future(rpc::closed_error());
}
auto rpc_client_ptr = ms->get_rpc_client(verb, id, host_id);
auto& rpc_client = *rpc_client_ptr;
return rpc_handler(rpc_client, timeout, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), id, host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr)] (std::exception_ptr&& eptr) {
ms->increment_dropped_messages(verb);
if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
if (try_catch<rpc::closed_error>(eptr)) {
// This is a transport error
if (host_id) {
ms->remove_error_rpc_client(verb, *host_id);
} else {
ms->remove_error_rpc_client(verb, id);
}
return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
host_id.value_or(locator::host_id{}), id.addr, exp->what())));
return futurator::make_exception_future(std::move(eptr));
} else {
// This is expected to be a rpc server error, e.g., the rpc handler throws a std::runtime_error.
return futurator::make_exception_future(std::move(eptr));
@@ -208,7 +206,7 @@ auto send_message_cancellable(messaging_service* ms, messaging_verb verb, std::o
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
if (ms->is_shutting_down()) {
return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
return futurator::make_exception_future(rpc::closed_error());
}
auto rpc_client_ptr = ms->get_rpc_client(verb, id, host_id);
auto& rpc_client = *rpc_client_ptr;
@@ -224,15 +222,14 @@ auto send_message_cancellable(messaging_service* ms, messaging_verb verb, std::o
return rpc_handler(rpc_client, c_ref, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), id, host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr), sub = std::move(sub)] (std::exception_ptr&& eptr) {
ms->increment_dropped_messages(verb);
if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
if (try_catch<rpc::closed_error>(eptr)) {
// This is a transport error
if (host_id) {
ms->remove_error_rpc_client(verb, *host_id);
} else {
ms->remove_error_rpc_client(verb, id);
}
return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
host_id.value_or(locator::host_id{}), id.addr, exp->what())));
return futurator::make_exception_future(std::move(eptr));
} else if (try_catch<rpc::canceled_error>(eptr)) {
// Translate low-level canceled_error into high-level abort_requested_exception.
return futurator::make_exception_future(abort_requested_exception{});
@@ -258,10 +255,9 @@ auto send_message_timeout_cancellable(messaging_service* ms, messaging_verb verb
auto rpc_handler = ms->rpc()->make_client<MsgIn(MsgOut...)>(verb);
using futurator = futurize<std::invoke_result_t<decltype(rpc_handler), rpc_protocol::client&, MsgOut...>>;
if (ms->is_shutting_down()) {
return futurator::make_exception_future(rpc::closed_error("local node is shutting down"));
return futurator::make_exception_future(rpc::closed_error());
}
auto address = ms->addr_for_host_id(host_id);
auto rpc_client_ptr = ms->get_rpc_client(verb, address, host_id);
auto rpc_client_ptr = ms->get_rpc_client(verb, ms->addr_for_host_id(host_id), host_id);
auto& rpc_client = *rpc_client_ptr;
auto c = std::make_unique<seastar::rpc::cancellable>();
@@ -273,13 +269,12 @@ auto send_message_timeout_cancellable(messaging_service* ms, messaging_verb verb
return futurator::make_exception_future(abort_requested_exception{});
}
return rpc_handler(rpc_client, timeout, c_ref, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), host_id, address, verb, rpc_client_ptr = std::move(rpc_client_ptr), sub = std::move(sub)] (std::exception_ptr&& eptr) {
return rpc_handler(rpc_client, timeout, c_ref, std::forward<MsgOut>(msg)...).handle_exception([ms = ms->shared_from_this(), host_id, verb, rpc_client_ptr = std::move(rpc_client_ptr), sub = std::move(sub)] (std::exception_ptr&& eptr) {
ms->increment_dropped_messages(verb);
if (const auto* exp = try_catch<rpc::closed_error>(eptr)) {
if (try_catch<rpc::closed_error>(eptr)) {
// This is a transport error
ms->remove_error_rpc_client(verb, host_id);
return futurator::make_exception_future(rpc::closed_error(fmt::format("got error from node {}/{}: {}",
host_id, address.addr, exp->what())));
return futurator::make_exception_future(std::move(eptr));
} else if (try_catch<rpc::canceled_error>(eptr)) {
// Translate low-level canceled_error into high-level abort_requested_exception.
return futurator::make_exception_future(abort_requested_exception{});

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9d387b5ff44094e9b6c587d3e0cb2e7098ea68924f3f9947ff7574be3c378a4e
size 6475784
oid sha256:3cbe2dd05945f8fb76ebce2ea70864063d2b282c4d5080af1f290ead43321ab3
size 6444732

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b35c1ed982e025b4c3d079c2d14873a115ff8e8d364a19633bf83052e52a059
size 6473408
oid sha256:ad1705d5c37cc6b6cd24354b83fee8da64a14f918351d357f21cf771a650ad3d
size 6452816

View File

@@ -1179,7 +1179,6 @@ private:
bool full = is_incremental_repair_using_all_sstables();
auto& tinfo = tmap.get_tablet_info(id);
auto sstables_repaired_at = tinfo.sstables_repaired_at;
auto gid = locator::global_tablet_id{tid, id};
// Consider this:
// 1) n1 is the topology coordinator
// 2) n1 schedules and executes a tablet repair with session id s1 for a tablet on n3 and n4.
@@ -1191,16 +1190,14 @@ private:
// To avoid the deadlock, we can throw in step 7 so that n2 will
// proceed to the end_repair stage and release the lock. After that,
// the scheduler could schedule the tablet repair again.
if (_rs._repair_compaction_locks.contains(gid)) {
if (_rs._repair_compaction_locks.contains(_frozen_topology_guard)) {
auto msg = fmt::format("Tablet repair session={} table={} is in progress", _frozen_topology_guard, tid);
rlogger.info("{}", msg);
throw std::runtime_error(msg);
}
co_await utils::get_local_injector().inject("incremental_repair_prepare_wait", utils::wait_for_message(60s));
auto reenablers_and_holders = co_await table.get_compaction_reenablers_and_lock_holders_for_repair(_db.local(), _frozen_topology_guard, _range);
for (auto& lock_holder : reenablers_and_holders.lock_holders) {
_rs._repair_compaction_locks[gid].push_back(std::move(lock_holder));
_rs._repair_compaction_locks[_frozen_topology_guard].push_back(std::move(lock_holder));
}
auto sstables = co_await table.take_storage_snapshot(_range);
_incremental_repair_meta.sst_set = make_lw_shared<sstables::sstable_set>(sstables::make_partitioned_sstable_set(_schema, _range));
@@ -2839,20 +2836,9 @@ future<> repair_service::init_ms_handlers() {
auto& table = local_repair.get_db().local().find_column_family(gid.table);
auto erm = table.get_effective_replication_map();
auto& tmap = erm->get_token_metadata_ptr()->tablets().get_tablet_map(gid.table);
auto* trinfo = tmap.get_tablet_transition_info(gid.tablet);
if (!trinfo) {
auto msg = fmt::format("Skipped repair_update_compaction_ctrl gid={} session_id={} since tablet is not in transition", gid, topo_guard);
rlogger.warn("{}", msg);
throw std::runtime_error(msg);
}
if (trinfo->stage != locator::tablet_transition_stage::end_repair) {
auto msg = fmt::format("Skipped repair_update_compaction_ctrl gid={} session_id={} since tablet is not in tablet_transition_stage::end_repair", gid, topo_guard);
rlogger.warn("{}", msg);
throw std::runtime_error(msg);
}
auto range = tmap.get_token_range(gid.tablet);
co_await table.clear_being_repaired_for_range(range);
auto removed = local_repair._repair_compaction_locks.erase(gid);
auto removed = local_repair._repair_compaction_locks.erase(topo_guard);
rlogger.info("Got repair_update_compaction_ctrl gid={} session_id={} removed={}", gid, topo_guard, removed);
});
});

View File

@@ -154,7 +154,7 @@ class repair_service : public seastar::peering_sharded_service<repair_service> {
std::unordered_set<locator::host_id> ignore_nodes);
public:
std::unordered_map<locator::global_tablet_id, std::vector<seastar::rwlock::holder>> _repair_compaction_locks;
std::unordered_map<service::session_id, std::vector<seastar::rwlock::holder>> _repair_compaction_locks;
public:
repair_service(sharded<service::topology_state_machine>& tsm,

View File

@@ -84,10 +84,6 @@ class compaction_group {
seastar::named_gate _async_gate;
// Gates flushes.
seastar::named_gate _flush_gate;
// Gates sstable being added to the group.
// This prevents the group from being considered empty when sstables are being added.
// Crucial for tablet split which ACKs split for a table when all pre-split groups are empty.
seastar::named_gate _sstable_add_gate;
bool _tombstone_gc_enabled = true;
std::optional<compaction::compaction_backlog_tracker> _backlog_tracker;
repair_classifier_func _repair_sstable_classifier;
@@ -252,10 +248,6 @@ public:
return _flush_gate;
}
seastar::named_gate& sstable_add_gate() noexcept {
return _sstable_add_gate;
}
compaction::compaction_manager& get_compaction_manager() noexcept;
const compaction::compaction_manager& get_compaction_manager() const noexcept;
@@ -314,8 +306,8 @@ public:
uint64_t live_disk_space_used() const;
void for_each_compaction_group(std::function<void(const compaction_group_ptr&)> action) const;
utils::small_vector<compaction_group_ptr, 3> compaction_groups_immediate();
utils::small_vector<const_compaction_group_ptr, 3> compaction_groups_immediate() const;
utils::small_vector<compaction_group_ptr, 3> compaction_groups();
utils::small_vector<const_compaction_group_ptr, 3> compaction_groups() const;
utils::small_vector<compaction_group_ptr, 3> split_unready_groups() const;
bool split_unready_groups_are_empty() const;
@@ -442,7 +434,7 @@ public:
virtual bool all_storage_groups_split() = 0;
virtual future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) = 0;
virtual future<> maybe_split_compaction_group_of(size_t idx) = 0;
virtual future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) = 0;
virtual future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) = 0;
virtual dht::token_range get_token_range_after_split(const dht::token&) const noexcept = 0;
virtual lw_shared_ptr<sstables::sstable_set> make_sstable_set() const = 0;

View File

@@ -2816,7 +2816,7 @@ future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, t
co_await flush_table_on_all_shards(sharded_db, uuid);
}
auto table_shards = co_await get_table_on_all_shards(sharded_db, uuid);
co_await snapshot_table_on_all_shards(sharded_db, table_shards, tag);
co_await table::snapshot_on_all_shards(sharded_db, table_shards, tag);
}
future<> database::snapshot_tables_on_all_shards(sharded<database>& sharded_db, std::string_view ks_name, std::vector<sstring> table_names, sstring tag, bool skip_flush) {
@@ -2952,7 +2952,7 @@ future<> database::truncate_table_on_all_shards(sharded<database>& sharded_db, s
auto truncated_at = truncated_at_opt.value_or(db_clock::now());
auto name = snapshot_name_opt.value_or(
format("{:d}-{}", truncated_at.time_since_epoch().count(), cf.schema()->cf_name()));
co_await snapshot_table_on_all_shards(sharded_db, table_shards, name);
co_await table::snapshot_on_all_shards(sharded_db, table_shards, name);
}
co_await sharded_db.invoke_on_all([&] (database& db) {

View File

@@ -95,6 +95,7 @@ class reconcilable_result;
namespace bi = boost::intrusive;
namespace tracing { class trace_state_ptr; }
namespace s3 { struct endpoint_config; }
namespace lang { class manager; }
@@ -603,28 +604,9 @@ public:
data_dictionary::table as_data_dictionary() const;
// The usage of these functions is restricted to preexisting sstables that aren't being
// moved anywhere, so should never be used in the context of file streaming and intra
// node migration. The only user today is distributed loader, which populates the
// sstables for each column family on boot.
future<> add_sstable_and_update_cache(sstables::shared_sstable sst,
sstables::offstrategy offstrategy = sstables::offstrategy::no);
future<> add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>& ssts);
// Restricted to new sstables produced by external processes such as repair.
// The sstable might undergo split if table is in split mode.
// If no need for split, the input sstable will only be attached to the sstable set.
// If split happens, the output sstables will be attached and the input sstable unlinked.
// On failure, the input sstable is unlinked and exception propagated to the caller.
// The on_add callback will be called on all sstables to be added into the set.
[[nodiscard]] future<std::vector<sstables::shared_sstable>>
add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
std::function<future<>(sstables::shared_sstable)> on_add,
sstables::offstrategy offstrategy = sstables::offstrategy::no);
[[nodiscard]] future<std::vector<sstables::shared_sstable>>
add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> new_ssts,
std::function<future<>(sstables::shared_sstable)> on_add);
future<> move_sstables_from_staging(std::vector<sstables::shared_sstable>);
sstables::shared_sstable make_sstable();
void set_truncation_time(db_clock::time_point truncated_at) noexcept {
@@ -742,9 +724,7 @@ private:
return _config.enable_cache && _schema->caching_options().enabled();
}
void update_stats_for_new_sstable(const sstables::shared_sstable& sst) noexcept;
// This function can throw even if the sstable was added into the set. When the sstable was successfully
// added, the sstable ptr @sst will be set to nullptr. Allowing caller to optionally discard the sstable.
future<> do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable& sst, sstables::offstrategy, bool trigger_compaction);
future<> do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable sst, sstables::offstrategy, bool trigger_compaction);
future<> do_add_sstable_and_update_cache(sstables::shared_sstable sst, sstables::offstrategy offstrategy, bool trigger_compaction);
// Helpers which add sstable on behalf of a compaction group and refreshes compound set.
void add_sstable(compaction_group& cg, sstables::shared_sstable sstable);
@@ -1057,11 +1037,37 @@ public:
db::replay_position set_low_replay_position_mark();
db::replay_position highest_flushed_replay_position() const;
future<std::pair<std::vector<sstables::shared_sstable>, sstable_list_permit>> snapshot_sstables();
private:
using snapshot_file_set = foreign_ptr<std::unique_ptr<std::unordered_set<sstring>>>;
future<snapshot_file_set> take_snapshot(sstring jsondir);
// Writes the table schema and the manifest of all files in the snapshot directory.
future<> finalize_snapshot(const global_table_ptr& table_shards, sstring jsondir, std::vector<snapshot_file_set> file_sets);
static future<> seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets);
public:
static future<> snapshot_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name);
future<std::unordered_map<sstring, snapshot_details>> get_snapshot_details();
static future<snapshot_details> get_snapshot_details(std::filesystem::path snapshot_dir, std::filesystem::path datadir);
/*!
* \brief write the schema to a 'schema.cql' file at the given directory.
*
* When doing a snapshot, the snapshot directory contains a 'schema.cql' file
* with a CQL command that can be used to generate the schema.
* The content is similar to the result of the CQL DESCRIBE command of the table.
*
* When a schema has indexes, local indexes or views, those indexes and views
* are represented by their own schemas.
* In those cases, the method would write the relevant information for each of the schemas:
*
* The schema of the base table would output a file with the CREATE TABLE command
* and the schema of the view that is used for the index would output a file with the
* CREATE INDEX command.
* The same is true for local index and MATERIALIZED VIEW.
*/
future<> write_schema_as_cql(const global_table_ptr& table_shards, sstring dir) const;
bool incremental_backups_enabled() const {
return _config.enable_incremental_backups;
}
@@ -1352,8 +1358,7 @@ public:
// Clones storage of a given tablet. Memtable is flushed first to guarantee that the
// snapshot (list of sstables) will include all the data written up to the time it was taken.
// If leave_unsealed is set, all the destination sstables will be left unsealed.
future<utils::chunked_vector<sstables::entry_descriptor>> clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed);
future<utils::chunked_vector<sstables::entry_descriptor>> clone_tablet_storage(locator::tablet_id tid);
friend class compaction_group;
friend class compaction::compaction_task_impl;
@@ -2014,7 +2019,6 @@ private:
keyspace::config make_keyspace_config(const keyspace_metadata& ksm, system_keyspace is_system);
struct table_truncate_state;
static future<> snapshot_table_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name);
static future<> truncate_table_on_all_shards(sharded<database>& db, sharded<db::system_keyspace>& sys_ks, const global_table_ptr&, std::optional<db_clock::time_point> truncated_at_opt, bool with_snapshot, std::optional<sstring> snapshot_name_opt);
future<> truncate(db::system_keyspace& sys_ks, column_family& cf, std::vector<lw_shared_ptr<replica::table>>& views, const table_truncate_state&);
public:

View File

@@ -210,9 +210,9 @@ table::add_memtables_to_reader_list(std::vector<mutation_reader>& readers,
auto sgs = storage_groups_for_token_range(token_range);
reserve_fn(std::ranges::fold_left(sgs | std::views::transform(std::mem_fn(&storage_group::memtable_count)), uint64_t(0), std::plus{}));
for (auto& sg : sgs) {
sg->for_each_compaction_group([&] (const compaction_group_ptr &cg) {
for (auto& cg : sg->compaction_groups()) {
add_memtables_from_cg(*cg);
});
}
}
}
@@ -423,27 +423,15 @@ bool compaction_group::memtable_has_key(const dht::decorated_key& key) const {
}
api::timestamp_type storage_group::min_memtable_timestamp() const {
api::timestamp_type min_timestamp = api::max_timestamp;
for_each_compaction_group([&min_timestamp] (const compaction_group_ptr& cg) {
min_timestamp = std::min(min_timestamp, cg->min_memtable_timestamp());
});
return min_timestamp;
return std::ranges::min(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::min_memtable_timestamp)));
}
api::timestamp_type storage_group::min_memtable_live_timestamp() const {
api::timestamp_type min_timestamp = api::max_timestamp;
for_each_compaction_group([&min_timestamp] (const compaction_group_ptr& cg) {
min_timestamp = std::min(min_timestamp, cg->min_memtable_live_timestamp());
});
return min_timestamp;
return std::ranges::min(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::min_memtable_live_timestamp)));
}
api::timestamp_type storage_group::min_memtable_live_row_marker_timestamp() const {
api::timestamp_type min_timestamp = api::max_timestamp;
for_each_compaction_group([&min_timestamp] (const compaction_group_ptr& cg) {
min_timestamp = std::min(min_timestamp, cg->min_memtable_live_row_marker_timestamp());
});
return min_timestamp;
return std::ranges::min(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::min_memtable_live_row_marker_timestamp)));
}
api::timestamp_type table::min_memtable_timestamp() const {
@@ -733,7 +721,7 @@ public:
bool all_storage_groups_split() override { return true; }
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override { return make_ready_future(); }
future<> maybe_split_compaction_group_of(size_t idx) override { return make_ready_future(); }
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) override {
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) override {
return make_ready_future<std::vector<sstables::shared_sstable>>(std::vector<sstables::shared_sstable>{sst});
}
dht::token_range get_token_range_after_split(const dht::token&) const noexcept override { return dht::token_range(); }
@@ -891,7 +879,7 @@ public:
bool all_storage_groups_split() override;
future<> split_all_storage_groups(tasks::task_info tablet_split_task_info) override;
future<> maybe_split_compaction_group_of(size_t idx) override;
future<std::vector<sstables::shared_sstable>> maybe_split_new_sstable(const sstables::shared_sstable& sst) override;
future<std::vector<sstables::shared_sstable>> maybe_split_sstable(const sstables::shared_sstable& sst) override;
dht::token_range get_token_range_after_split(const dht::token& token) const noexcept override {
return tablet_map().get_token_range_after_split(token);
}
@@ -945,7 +933,7 @@ void storage_group::for_each_compaction_group(std::function<void(const compactio
}
}
utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups_immediate() {
utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups() {
utils::small_vector<compaction_group_ptr, 3> cgs;
for_each_compaction_group([&cgs] (const compaction_group_ptr& cg) {
cgs.push_back(cg);
@@ -953,7 +941,7 @@ utils::small_vector<compaction_group_ptr, 3> storage_group::compaction_groups_im
return cgs;
}
utils::small_vector<const_compaction_group_ptr, 3> storage_group::compaction_groups_immediate() const {
utils::small_vector<const_compaction_group_ptr, 3> storage_group::compaction_groups() const {
utils::small_vector<const_compaction_group_ptr, 3> cgs;
for_each_compaction_group([&cgs] (const compaction_group_ptr& cg) {
cgs.push_back(cg);
@@ -1142,8 +1130,7 @@ future<> tablet_storage_group_manager::maybe_split_compaction_group_of(size_t id
}
future<std::vector<sstables::shared_sstable>>
tablet_storage_group_manager::maybe_split_new_sstable(const sstables::shared_sstable& sst) {
co_await utils::get_local_injector().inject("maybe_split_new_sstable_wait", utils::wait_for_message(120s));
tablet_storage_group_manager::maybe_split_sstable(const sstables::shared_sstable& sst) {
if (!tablet_map().needs_split()) {
co_return std::vector<sstables::shared_sstable>{sst};
}
@@ -1151,7 +1138,8 @@ tablet_storage_group_manager::maybe_split_new_sstable(const sstables::shared_sst
auto& cg = compaction_group_for_sstable(sst);
auto holder = cg.async_gate().hold();
auto& view = cg.view_for_sstable(sst);
co_return co_await _t.get_compaction_manager().maybe_split_new_sstable(sst, view, co_await split_compaction_options());
auto lock_holder = co_await _t.get_compaction_manager().get_incremental_repair_read_lock(view, "maybe_split_sstable");
co_return co_await _t.get_compaction_manager().maybe_split_sstable(sst, view, co_await split_compaction_options());
}
future<> table::maybe_split_compaction_group_of(locator::tablet_id tablet_id) {
@@ -1161,7 +1149,7 @@ future<> table::maybe_split_compaction_group_of(locator::tablet_id tablet_id) {
future<std::vector<sstables::shared_sstable>> table::maybe_split_new_sstable(const sstables::shared_sstable& sst) {
auto holder = async_gate().hold();
co_return co_await _sg_manager->maybe_split_new_sstable(sst);
co_return co_await _sg_manager->maybe_split_sstable(sst);
}
dht::token_range table::get_token_range_after_split(const dht::token& token) const noexcept {
@@ -1269,7 +1257,7 @@ future<> table::parallel_foreach_compaction_group(std::function<future<>(compact
tlogger.info("foreach_compaction_group_wait: released");
});
co_await coroutine::parallel_for_each(sg.compaction_groups_immediate(), [&] (compaction_group_ptr cg) -> future<> {
co_await coroutine::parallel_for_each(sg.compaction_groups(), [&] (compaction_group_ptr cg) -> future<> {
if (auto holder = try_hold_gate(cg->async_gate())) {
co_await action(*cg);
}
@@ -1342,7 +1330,7 @@ future<utils::chunked_vector<sstables::shared_sstable>> table::take_sstable_set_
}
future<utils::chunked_vector<sstables::entry_descriptor>>
table::clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed) {
table::clone_tablet_storage(locator::tablet_id tid) {
utils::chunked_vector<sstables::entry_descriptor> ret;
auto holder = async_gate().hold();
@@ -1354,7 +1342,7 @@ table::clone_tablet_storage(locator::tablet_id tid, bool leave_unsealed) {
// by compaction while we are waiting for the lock.
auto deletion_guard = co_await get_sstable_list_permit();
co_await sg.make_sstable_set()->for_each_sstable_gently([&] (const sstables::shared_sstable& sst) -> future<> {
ret.push_back(co_await sst->clone(calculate_generation_for_new_table(), leave_unsealed));
ret.push_back(co_await sst->clone(calculate_generation_for_new_table()));
});
co_return ret;
}
@@ -1366,10 +1354,10 @@ void table::update_stats_for_new_sstable(const sstables::shared_sstable& sst) no
}
future<>
table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable& sst, sstables::offstrategy offstrategy,
table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_sstable sst, sstables::offstrategy offstrategy,
bool trigger_compaction) {
auto permit = co_await seastar::get_units(_sstable_set_mutation_sem, 1);
co_return co_await get_row_cache().invalidate(row_cache::external_updater([&] () mutable noexcept {
co_return co_await get_row_cache().invalidate(row_cache::external_updater([&] () noexcept {
// FIXME: this is not really noexcept, but we need to provide strong exception guarantees.
// atomically load all opened sstables into column family.
if (!offstrategy) {
@@ -1381,8 +1369,6 @@ table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_ss
if (trigger_compaction) {
try_trigger_compaction(cg);
}
// Resetting sstable ptr to inform the caller the sstable has been loaded successfully.
sst = nullptr;
}), dht::partition_range::make({sst->get_first_decorated_key(), true}, {sst->get_last_decorated_key(), true}), [sst, schema = _schema] (const dht::decorated_key& key) {
return sst->filter_has_key(sstables::key::from_partition_key(*schema, key.key()));
});
@@ -1390,10 +1376,12 @@ table::do_add_sstable_and_update_cache(compaction_group& cg, sstables::shared_ss
future<>
table::do_add_sstable_and_update_cache(sstables::shared_sstable new_sst, sstables::offstrategy offstrategy, bool trigger_compaction) {
auto& cg = compaction_group_for_sstable(new_sst);
// Hold gate to make sure compaction group is alive.
auto holder = cg.async_gate().hold();
co_await do_add_sstable_and_update_cache(cg, new_sst, offstrategy, trigger_compaction);
for (auto sst : co_await maybe_split_new_sstable(new_sst)) {
auto& cg = compaction_group_for_sstable(sst);
// Hold gate to make sure compaction group is alive.
auto holder = cg.async_gate().hold();
co_await do_add_sstable_and_update_cache(cg, std::move(sst), offstrategy, trigger_compaction);
}
}
future<>
@@ -1411,85 +1399,6 @@ table::add_sstables_and_update_cache(const std::vector<sstables::shared_sstable>
trigger_compaction();
}
future<std::vector<sstables::shared_sstable>>
table::add_new_sstable_and_update_cache(sstables::shared_sstable new_sst,
std::function<future<>(sstables::shared_sstable)> on_add,
sstables::offstrategy offstrategy) {
std::vector<sstables::shared_sstable> ret, ssts;
std::exception_ptr ex;
try {
bool trigger_compaction = offstrategy == sstables::offstrategy::no;
auto& cg = compaction_group_for_sstable(new_sst);
// This prevents compaction group from being considered empty until the holder is released.
// Helpful for tablet split, where split is acked for a table when all pre-split groups are empty.
auto sstable_add_holder = cg.sstable_add_gate().hold();
ret = ssts = co_await maybe_split_new_sstable(new_sst);
// on successful split, input sstable is unlinked.
new_sst = nullptr;
for (auto& sst : ssts) {
auto& cg = compaction_group_for_sstable(sst);
// Hold gate to make sure compaction group is alive.
auto holder = cg.async_gate().hold();
co_await on_add(sst);
// If do_add_sstable_and_update_cache() throws after sstable has been loaded, the pointer
// sst passed by reference will be set to nullptr, so it won't be unlinked in the exception
// handler below.
co_await do_add_sstable_and_update_cache(cg, sst, offstrategy, trigger_compaction);
sst = nullptr;
}
} catch (...) {
ex = std::current_exception();
}
if (ex) {
// on failed split, input sstable is unlinked here.
if (new_sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", new_sst->get_filename(), new_sst->get_origin(), ex);
co_await new_sst->unlink();
}
// on failure after successful split, sstables not attached yet will be unlinked
co_await coroutine::parallel_for_each(ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
if (sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
co_await sst->unlink();
}
});
co_await coroutine::return_exception_ptr(std::move(ex));
}
co_return std::move(ret);
}
future<std::vector<sstables::shared_sstable>>
table::add_new_sstables_and_update_cache(std::vector<sstables::shared_sstable> new_ssts,
std::function<future<>(sstables::shared_sstable)> on_add) {
std::exception_ptr ex;
std::vector<sstables::shared_sstable> ret;
// We rely on add_new_sstable_and_update_cache() to unlink the sstable fed into it,
// so the exception handling below will only have to unlink sstables not processed yet.
try {
for (auto& sst: new_ssts) {
auto ssts = co_await add_new_sstable_and_update_cache(std::exchange(sst, nullptr), on_add);
std::ranges::move(ssts, std::back_inserter(ret));
}
} catch (...) {
ex = std::current_exception();
}
if (ex) {
co_await coroutine::parallel_for_each(new_ssts, [&ex] (sstables::shared_sstable sst) -> future<> {
if (sst) {
tlogger.error("Failed to load SSTable {} of origin {} due to {}, it will be unlinked...", sst->get_filename(), sst->get_origin(), ex);
co_await sst->unlink();
}
});
co_await coroutine::return_exception_ptr(std::move(ex));
}
co_return std::move(ret);
}
future<>
table::update_cache(compaction_group& cg, lw_shared_ptr<memtable> m, std::vector<sstables::shared_sstable> ssts) {
auto permit = co_await seastar::get_units(_sstable_set_mutation_sem, 1);
@@ -1983,7 +1892,7 @@ sstables::file_size_stats compaction_group::live_disk_space_used_full_stats() co
}
uint64_t storage_group::live_disk_space_used() const {
auto cgs = const_cast<storage_group&>(*this).compaction_groups_immediate();
auto cgs = const_cast<storage_group&>(*this).compaction_groups();
return std::ranges::fold_left(cgs | std::views::transform(std::mem_fn(&compaction_group::live_disk_space_used)), uint64_t(0), std::plus{});
}
@@ -2110,9 +2019,10 @@ future<std::vector<compaction::compaction_group_view*>> table::get_compaction_gr
auto sgs = storage_groups_for_token_range(range);
for (auto& sg : sgs) {
co_await coroutine::maybe_yield();
sg->for_each_compaction_group([&ret] (const compaction_group_ptr& cg) {
auto cgs = sg->compaction_groups();
for (auto& cg : cgs) {
ret.push_back(&cg->view_for_unrepaired_data());
});
}
}
co_return ret;
}
@@ -2139,7 +2049,7 @@ future<compaction_reenablers_and_lock_holders> table::get_compaction_reenablers_
future<> table::clear_being_repaired_for_range(dht::token_range range) {
auto sgs = storage_groups_for_token_range(range);
for (auto& sg : sgs) {
auto cgs = sg->compaction_groups_immediate();
auto cgs = sg->compaction_groups();
for (auto& cg : cgs) {
auto sstables = cg->all_sstables();
co_await coroutine::maybe_yield();
@@ -2581,11 +2491,9 @@ future<> table::drop_quarantined_sstables() {
}
bool storage_group::no_compacted_sstable_undeleted() const {
auto ret = true;
for_each_compaction_group([&ret] (const compaction_group_ptr& cg) {
ret &= cg->compacted_undeleted_sstables().empty();
return std::ranges::all_of(compaction_groups(), [] (const_compaction_group_ptr& cg) {
return cg->compacted_undeleted_sstables().empty();
});
return ret;
}
// Gets the list of all sstables in the column family, including ones that are
@@ -2704,8 +2612,8 @@ public:
sstables::sstables_manager& get_sstables_manager() noexcept override {
return _t.get_sstables_manager();
}
sstables::shared_sstable make_sstable(sstables::sstable_state state) const override {
return _t.make_sstable(state);
sstables::shared_sstable make_sstable() const override {
return _t.make_sstable();
}
sstables::sstable_writer_config configure_writer(sstring origin) const override {
auto cfg = _t.get_sstables_manager().configure_writer(std::move(origin));
@@ -2823,7 +2731,6 @@ future<> compaction_group::stop(sstring reason) noexcept {
auto flush_future = co_await seastar::coroutine::as_future(flush());
co_await _flush_gate.close();
co_await _sstable_add_gate.close();
// FIXME: indentation
_compaction_disabler_for_views.clear();
co_await utils::get_local_injector().inject("compaction_group_stop_wait", utils::wait_for_message(60s));
@@ -2837,7 +2744,7 @@ future<> compaction_group::stop(sstring reason) noexcept {
}
bool compaction_group::empty() const noexcept {
return _memtables->empty() && live_sstable_count() == 0 && _sstable_add_gate.get_count() == 0;
return _memtables->empty() && live_sstable_count() == 0;
}
const schema_ptr& compaction_group::schema() const {
@@ -2850,9 +2757,9 @@ void compaction_group::clear_sstables() {
}
void storage_group::clear_sstables() {
for_each_compaction_group([] (const compaction_group_ptr& cg) {
for (auto cg : compaction_groups()) {
cg->clear_sstables();
});
}
}
table::table(schema_ptr schema, config config, lw_shared_ptr<const storage_options> sopts, compaction::compaction_manager& compaction_manager,
@@ -3179,7 +3086,7 @@ future<> table::update_repaired_at_for_merge() {
for (auto& x : sgs) {
auto sg = x.second;
if (sg) {
auto cgs = sg->compaction_groups_immediate();
auto cgs = sg->compaction_groups();
for (auto& cg : cgs) {
auto cre = co_await cg->get_compaction_manager().stop_and_disable_compaction("update_repaired_at_for_merge", cg->view_for_unrepaired_data());
co_await cg->update_repaired_at_for_merge();
@@ -3293,7 +3200,7 @@ db::replay_position table::highest_flushed_replay_position() const {
}
struct manifest_json : public json::json_base {
json::json_chunked_list<std::string_view> files;
json::json_chunked_list<sstring> files;
manifest_json() {
register_params();
@@ -3312,25 +3219,22 @@ private:
}
};
class snapshot_writer {
public:
virtual future<> init() = 0;
virtual future<> sync() = 0;
virtual future<output_stream<char>> stream_for(sstring component) = 0;
virtual ~snapshot_writer() = default;
};
using snapshot_file_set = foreign_ptr<std::unique_ptr<std::unordered_set<sstring>>>;
static future<> write_manifest(snapshot_writer& writer, std::vector<snapshot_file_set> file_sets) {
future<>
table::seal_snapshot(sstring jsondir, std::vector<snapshot_file_set> file_sets) {
manifest_json manifest;
for (const auto& fsp : file_sets) {
for (auto& rf : *fsp) {
manifest.files.push(std::string_view(rf));
manifest.files.push(std::move(rf));
}
}
auto streamer = json::stream_object(std::move(manifest));
auto out = co_await writer.stream_for("manifest.json");
auto jsonfile = jsondir + "/manifest.json";
tlogger.debug("Storing manifest {}", jsonfile);
co_await io_check([jsondir] { return recursive_touch_directory(jsondir); });
auto f = co_await open_checked_file_dma(general_disk_error_handler, jsonfile, open_flags::wo | open_flags::create | open_flags::truncate);
auto out = co_await make_file_output_stream(std::move(f));
std::exception_ptr ex;
try {
co_await streamer(std::move(out));
@@ -3341,27 +3245,19 @@ static future<> write_manifest(snapshot_writer& writer, std::vector<snapshot_fil
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
co_await io_check(sync_directory, std::move(jsondir));
}
/*!
* \brief write the schema to a 'schema.cql' file at the given directory.
*
* When doing a snapshot, the snapshot directory contains a 'schema.cql' file
* with a CQL command that can be used to generate the schema.
* The content is similar to the result of the CQL DESCRIBE command of the table.
*
* When a schema has indexes, local indexes or views, those indexes and views
* are represented by their own schemas.
* In those cases, the method would write the relevant information for each of the schemas:
*
* The schema of the base table would output a file with the CREATE TABLE command
* and the schema of the view that is used for the index would output a file with the
* CREATE INDEX command.
* The same is true for local index and MATERIALIZED VIEW.
*/
static future<> write_schema_as_cql(snapshot_writer& writer, cql3::description schema_desc) {
future<> table::write_schema_as_cql(const global_table_ptr& table_shards, sstring dir) const {
auto schema_desc = schema()->describe(
replica::make_schema_describe_helper(table_shards),
cql3::describe_option::STMTS);
auto schema_description = std::move(*schema_desc.create_statement);
auto out = co_await writer.stream_for("schema.cql");
auto schema_file_name = dir + "/schema.cql";
auto f = co_await open_checked_file_dma(general_disk_error_handler, schema_file_name, open_flags::wo | open_flags::create | open_flags::truncate);
auto out = co_await make_file_output_stream(std::move(f));
std::exception_ptr ex;
auto view = managed_bytes_view(schema_description.as_managed_bytes());
@@ -3382,87 +3278,73 @@ static future<> write_schema_as_cql(snapshot_writer& writer, cql3::description s
}
}
class local_snapshot_writer : public snapshot_writer {
std::filesystem::path _dir;
public:
local_snapshot_writer(std::filesystem::path dir, sstring name)
: _dir(dir / sstables::snapshots_dir / name)
{}
future<> init() override {
co_await io_check([this] { return recursive_touch_directory(_dir.native()); });
}
future<> sync() override {
co_await io_check([this] { return sync_directory(_dir.native()); });
}
future<output_stream<char>> stream_for(sstring component) override {
auto file_name = (_dir / component).native();
auto f = co_await open_checked_file_dma(general_disk_error_handler, file_name, open_flags::wo | open_flags::create | open_flags::truncate);
co_return co_await make_file_output_stream(std::move(f));
}
};
// Runs the orchestration code on an arbitrary shard to balance the load.
future<> database::snapshot_table_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name) {
auto writer = std::visit(overloaded_functor{
[&name] (const data_dictionary::storage_options::local& loc) -> std::unique_ptr<snapshot_writer> {
if (loc.dir.empty()) {
// virtual tables don't have initialized local storage
return nullptr;
}
return std::make_unique<local_snapshot_writer>(loc.dir, name);
},
[] (const data_dictionary::storage_options::s3&) -> std::unique_ptr<snapshot_writer> {
throw std::runtime_error("Snapshotting non-local tables is not implemented");
}
}, table_shards->get_storage_options().value);
if (!writer) {
future<> table::snapshot_on_all_shards(sharded<database>& sharded_db, const global_table_ptr& table_shards, sstring name) {
auto* so = std::get_if<storage_options::local>(&table_shards->get_storage_options().value);
if (so == nullptr) {
throw std::runtime_error("Snapshotting non-local tables is not implemented");
}
if (so->dir.empty()) { // virtual tables don't have initialized local storage
co_return;
}
auto orchestrator = std::hash<sstring>()(name) % smp::count;
auto jsondir = (so->dir / sstables::snapshots_dir / name).native();
auto orchestrator = std::hash<sstring>()(jsondir) % smp::count;
co_await smp::submit_to(orchestrator, [&] () -> future<> {
auto& t = *table_shards;
auto s = t.schema();
tlogger.debug("Taking snapshot of {}.{}: name={}", s->ks_name(), s->cf_name(), name);
tlogger.debug("Taking snapshot of {}.{}: directory={}", s->ks_name(), s->cf_name(), jsondir);
std::vector<snapshot_file_set> file_sets(smp::count);
std::vector<table::snapshot_file_set> file_sets;
file_sets.reserve(smp::count);
co_await writer->init();
co_await smp::invoke_on_all([&] -> future<> {
auto& t = *table_shards;
auto [tables, permit] = co_await t.snapshot_sstables();
auto table_names = co_await t.get_sstables_manager().take_snapshot(std::move(tables), name);
file_sets[this_shard_id()] = make_foreign(std::make_unique<std::unordered_set<sstring>>(std::move(table_names)));
co_await io_check([&jsondir] { return recursive_touch_directory(jsondir); });
co_await coroutine::parallel_for_each(smp::all_cpus(), [&] (unsigned shard) -> future<> {
file_sets.emplace_back(co_await smp::submit_to(shard, [&] {
return table_shards->take_snapshot(jsondir);
}));
});
co_await writer->sync();
co_await io_check(sync_directory, jsondir);
std::exception_ptr ex;
tlogger.debug("snapshot {}: writing schema.cql", name);
auto schema_desc = s->describe(replica::make_schema_describe_helper(table_shards), cql3::describe_option::STMTS);
co_await write_schema_as_cql(*writer, std::move(schema_desc)).handle_exception([&] (std::exception_ptr ptr) {
tlogger.error("Failed writing schema file in snapshot in {} with exception {}", name, ptr);
ex = std::move(ptr);
});
tlogger.debug("snapshot {}: seal_snapshot", name);
co_await write_manifest(*writer, std::move(file_sets)).handle_exception([&] (std::exception_ptr ptr) {
tlogger.error("Failed to seal snapshot in {}: {}.", name, ptr);
ex = std::move(ptr);
});
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
co_await writer->sync();
co_await t.finalize_snapshot(table_shards, std::move(jsondir), std::move(file_sets));
});
}
future<std::pair<std::vector<sstables::shared_sstable>, table::sstable_list_permit>> table::snapshot_sstables() {
auto permit = co_await get_sstable_list_permit();
future<table::snapshot_file_set> table::take_snapshot(sstring jsondir) {
tlogger.trace("take_snapshot {}", jsondir);
auto sstable_deletion_guard = co_await get_sstable_list_permit();
auto tables = *_sstables->all() | std::ranges::to<std::vector<sstables::shared_sstable>>();
co_return std::make_pair(std::move(tables), std::move(permit));
auto table_names = std::make_unique<std::unordered_set<sstring>>();
co_await _sstables_manager.dir_semaphore().parallel_for_each(tables, [&jsondir, &table_names] (sstables::shared_sstable sstable) {
table_names->insert(sstable->component_basename(sstables::component_type::Data));
return io_check([sstable, &dir = jsondir] {
return sstable->snapshot(dir);
});
});
co_return make_foreign(std::move(table_names));
}
future<> table::finalize_snapshot(const global_table_ptr& table_shards, sstring jsondir, std::vector<snapshot_file_set> file_sets) {
std::exception_ptr ex;
tlogger.debug("snapshot {}: writing schema.cql", jsondir);
co_await write_schema_as_cql(table_shards, jsondir).handle_exception([&] (std::exception_ptr ptr) {
tlogger.error("Failed writing schema file in snapshot in {} with exception {}", jsondir, ptr);
ex = std::move(ptr);
});
tlogger.debug("snapshot {}: seal_snapshot", jsondir);
co_await seal_snapshot(jsondir, std::move(file_sets)).handle_exception([&] (std::exception_ptr ptr) {
tlogger.error("Failed to seal snapshot in {}: {}.", jsondir, ptr);
ex = std::move(ptr);
});
if (ex) {
co_await coroutine::return_exception_ptr(std::move(ex));
}
}
future<bool> table::snapshot_exists(sstring tag) {
@@ -3474,7 +3356,6 @@ future<bool> table::snapshot_exists(sstring tag) {
sstring jsondir = (so->dir / sstables::snapshots_dir / tag).native();
bool exists = false;
try {
future<stat_data> (&file_stat)(std::string_view, follow_symlink) noexcept = seastar::file_stat;
auto sd = co_await io_check(file_stat, jsondir, follow_symlink::no);
if (sd.type != directory_entry_type::directory) {
throw std::error_code(ENOTDIR, std::system_category());
@@ -3530,7 +3411,6 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
const auto& name = de->name;
// FIXME: optimize stat calls by keeping the base directory open and use statat instead, here and below.
// See https://github.com/scylladb/seastar/pull/3163
future<stat_data> (&file_stat)(std::string_view, follow_symlink) noexcept = seastar::file_stat;
auto sd = co_await io_check(file_stat, (snapshot_dir / name).native(), follow_symlink::no);
auto size = sd.allocated_size;
@@ -3545,7 +3425,7 @@ future<table::snapshot_details> table::get_snapshot_details(fs::path snapshot_di
details.live += size;
continue;
}
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
// If the number of links is greater than 1, it is still possible that the file is linked to another snapshot
// So check the datadir for the file too.
} else {
continue;
@@ -3589,7 +3469,7 @@ future<> compaction_group::flush() noexcept {
}
future<> storage_group::flush() noexcept {
for (auto& cg : compaction_groups_immediate()) {
for (auto& cg : compaction_groups()) {
co_await cg->flush();
}
}
@@ -3607,11 +3487,7 @@ size_t compaction_group::memtable_count() const noexcept {
}
size_t storage_group::memtable_count() const {
size_t count = 0;
for_each_compaction_group([&count] (const compaction_group_ptr& cg) {
count += cg->memtable_count();
});
return count;
return std::ranges::fold_left(compaction_groups() | std::views::transform(std::mem_fn(&compaction_group::memtable_count)), size_t(0), std::plus{});
}
future<> table::flush(std::optional<db::replay_position> pos) {
@@ -3629,7 +3505,7 @@ future<> table::flush(std::optional<db::replay_position> pos) {
}
bool storage_group::can_flush() const {
return std::ranges::any_of(compaction_groups_immediate(), std::mem_fn(&compaction_group::can_flush));
return std::ranges::any_of(compaction_groups(), std::mem_fn(&compaction_group::can_flush));
}
bool table::can_flush() const {
@@ -3660,11 +3536,9 @@ bool storage_group::compaction_disabled() const {
// Compaction group that has been stopped will be excluded, since the group will not be available for a caller
// to disable compaction explicitly on it, e.g. on truncate, and the caller might want to perform a check
// that compaction was disabled on all groups. Stopping a group is equivalent to disabling compaction on it.
bool all_disabled = true;
for_each_compaction_group([&all_disabled] (const compaction_group_ptr& cg) {
all_disabled &= cg->stopped() || cg->compaction_disabled();
});
return all_disabled;
return std::ranges::all_of(compaction_groups()
| std::views::filter(std::not_fn(&compaction_group::stopped)), [] (const_compaction_group_ptr& cg) {
return cg->compaction_disabled(); });
}
// NOTE: does not need to be futurized, but might eventually, depending on
@@ -4449,11 +4323,11 @@ std::vector<mutation_source> table::select_memtables_as_mutation_sources(dht::to
auto& sg = storage_group_for_token(token);
std::vector<mutation_source> mss;
mss.reserve(sg.memtable_count());
sg.for_each_compaction_group([&mss] (const compaction_group_ptr &cg) {
for (auto& cg : sg.compaction_groups()) {
for (auto& mt : *cg->memtables()) {
mss.emplace_back(mt->as_data_source());
}
});
}
return mss;
}
@@ -4613,7 +4487,7 @@ future<> compaction_group::cleanup() {
}
future<> table::clear_inactive_reads_for_tablet(database& db, storage_group& sg) {
for (auto& cg_ptr : sg.compaction_groups_immediate()) {
for (auto& cg_ptr : sg.compaction_groups()) {
co_await db.clear_inactive_reads_for_tablet(_schema->id(), cg_ptr->token_range());
}
}
@@ -4654,13 +4528,13 @@ future<> table::stop_compaction_groups(storage_group& sg) {
}
future<> table::flush_compaction_groups(storage_group& sg) {
for (auto& cg_ptr : sg.compaction_groups_immediate()) {
for (auto& cg_ptr : sg.compaction_groups()) {
co_await cg_ptr->flush();
}
}
future<> table::cleanup_compaction_groups(database& db, db::system_keyspace& sys_ks, locator::tablet_id tid, storage_group& sg) {
for (auto& cg_ptr : sg.compaction_groups_immediate()) {
for (auto& cg_ptr : sg.compaction_groups()) {
co_await cg_ptr->cleanup();
// FIXME: at this point _highest_rp might be greater than the replay_position of the last cleaned mutation,
// and can cover some mutations which weren't cleaned, causing them to be lost during replay.

View File

@@ -1,13 +1,6 @@
find_program(CARGO cargo
REQUIRED)
# Set up RUSTC_WRAPPER for sccache support if configured
if(Scylla_RUSTC_WRAPPER)
set(RUSTC_WRAPPER_ENV "RUSTC_WRAPPER=${Scylla_RUSTC_WRAPPER}")
else()
set(RUSTC_WRAPPER_ENV "")
endif()
function(add_rust_library name)
# used for profiles defined in Cargo.toml
if(CMAKE_CONFIGURATION_TYPES)
@@ -23,7 +16,7 @@ function(add_rust_library name)
set(library ${target_dir}/lib${name}.a)
add_custom_command(
OUTPUT ${library}
COMMAND ${CMAKE_COMMAND} -E env CARGO_BUILD_DEP_INFO_BASEDIR=. ${RUSTC_WRAPPER_ENV} ${CARGO} build --locked --target-dir=${target_dir} --profile=${profile}
COMMAND ${CMAKE_COMMAND} -E env CARGO_BUILD_DEP_INFO_BASEDIR=. ${CARGO} build --locked --target-dir=${target_dir} --profile=${profile}
COMMAND ${CMAKE_COMMAND} -E copy ${target_dir}/${profile}/lib${name}.a ${library}
DEPENDS Cargo.lock
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}

View File

@@ -390,11 +390,9 @@ dark_green = (195, 215, 195)
light_red = (255, 200, 200)
light_green = (200, 255, 200)
light_gray = (240, 240, 240)
scylla_blue = (87, 209, 229)
tablet_colors = {
(Tablet.STATE_NORMAL, None): GRAY,
(Tablet.STATE_NORMAL, 'repair'): scylla_blue,
(Tablet.STATE_JOINING, 'allow_write_both_read_old'): dark_green,
(Tablet.STATE_LEAVING, 'allow_write_both_read_old'): dark_red,
(Tablet.STATE_JOINING, 'write_both_read_old'): dark_green,
@@ -534,8 +532,6 @@ def update_from_cql(initial=False):
state = (Tablet.STATE_JOINING, tablet.stage)
elif replica in leaving:
state = (Tablet.STATE_LEAVING, tablet.stage)
elif tablet.stage == 'repair':
state = (Tablet.STATE_NORMAL, tablet.stage)
else:
state = (Tablet.STATE_NORMAL, None)

View File

@@ -4109,16 +4109,6 @@ class scylla_fiber(gdb.Command):
return res
return None
# Coroutines need special handling as they allocate the future object on their frame.
if name.strip().endswith('[clone .resume]'):
self._maybe_log(f"Current task is a coroutine, trying to find the promise in the coroutine frame: 0x{ptr_meta.ptr:x}+{ptr_meta.size}\n", verbose)
# Skip the first two pointers, these are the coroutine resume and destroy function pointers.
for maybe_tptr in range(ptr_meta.ptr + 2 * _vptr_type().sizeof, ptr_meta.ptr + ptr_meta.size, _vptr_type().sizeof):
res = self._probe_pointer(maybe_tptr, scanned_region_size, using_seastar_allocator, verbose)
if res is not None:
return res
return None
if name.startswith('vtable for seastar::internal::when_all_state'):
when_all_state_base_ptr_type = gdb.lookup_type('seastar::internal::when_all_state_base').pointer()
when_all_state_base = gdb.Value(int(ptr_meta.ptr)).reinterpret_cast(when_all_state_base_ptr_type)
@@ -4205,9 +4195,6 @@ class scylla_fiber(gdb.Command):
parser.add_argument("--force-fallback-mode", action="store_true", default=False,
help="Force fallback mode to be used, that is, scan a fixed-size region of memory"
" (configurable via --scanned-region-size), instead of relying on `scylla ptr` for determining the size of the task objects.")
parser.add_argument("--direction", action="store", choices=['forward', 'backward', 'both'], default='both',
help="Direction in which to walk the continuation chain. 'forward' walks futures waiting on the given task,"
" 'backward' walks futures the given task is waiting on, 'both' does both.")
parser.add_argument("task", action="store", help="An expression that evaluates to a valid `seastar::task*` value. Cannot contain white-space.")
try:
@@ -4237,20 +4224,14 @@ class scylla_fiber(gdb.Command):
gdb.write("Provided pointer 0x{:016x} is not an object managed by seastar or not a task pointer\n".format(initial_task_ptr))
return
if (args.direction == 'backward' or args.direction == 'both'):
backwards_fiber = self._walk(self._walk_backward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
else:
backwards_fiber = []
backwards_fiber = self._walk(self._walk_backward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
for i, task_info in enumerate(reversed(backwards_fiber)):
format_task_line(i - len(backwards_fiber), task_info)
format_task_line(0, this_task)
if (args.direction == 'forward' or args.direction == 'both'):
forward_fiber = self._walk(self._walk_forward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
else:
forward_fiber = []
forward_fiber = self._walk(self._walk_forward, this_task[0], this_task[2], args.max_depth, args.scanned_region_size, using_seastar_allocator, args.verbose)
for i, task_info in enumerate(forward_fiber):
format_task_line(i + 1, task_info)
@@ -5123,15 +5104,10 @@ class scylla_small_objects(gdb.Command):
span_end = int(span_start + span.size() * self._page_size)
# span's free list
try:
span_next_free = span.page['freelist']
while span_next_free:
self._free_in_span.add(int(span_next_free))
span_next_free = span_next_free['next']
except gdb.error:
# This loop sometimes steps on "Cannot access memory at address", causing CI instability.
# Catch the exception and break the freelist traversal loop gracefully.
gdb.write(f"Warning: error traversing freelist of span [0x{span_start:x}, 0x{span_end:x}), some of the listed objects in this span may be free objects.\n")
span_next_free = span.page['freelist']
while span_next_free:
self._free_in_span.add(int(span_next_free))
span_next_free = span_next_free['next']
return span_start, span_end
@@ -5874,18 +5850,6 @@ class scylla_read_stats(gdb.Command):
def __init__(self):
gdb.Command.__init__(self, 'scylla read-stats', gdb.COMMAND_USER, gdb.COMPLETE_COMMAND)
@staticmethod
def foreach_permit(semaphore, fn):
"""Mirror of reader_concurrency_semaphore::foreach_permit()"""
for permit_list in (
semaphore['_permit_list'],
semaphore['_wait_list']['_admission_queue'],
semaphore['_wait_list']['_memory_queue'],
semaphore['_ready_list'],
semaphore['_inactive_reads']):
for permit in intrusive_list(permit_list):
fn(permit)
@staticmethod
def dump_reads_from_semaphore(semaphore):
try:
@@ -5900,7 +5864,7 @@ class scylla_read_stats(gdb.Command):
permit_summaries = defaultdict(permit_stats)
total = permit_stats()
def summarize_permit(permit):
for permit in intrusive_list(permit_list):
schema_name = "*.*"
schema = permit['_schema']
try:
@@ -5920,8 +5884,6 @@ class scylla_read_stats(gdb.Command):
permit_summaries[(schema_name, description, state)].add(summary)
total.add(summary)
scylla_read_stats.foreach_permit(semaphore, summarize_permit)
if not permit_summaries:
return
@@ -5931,9 +5893,7 @@ class scylla_read_stats(gdb.Command):
inactive_read_count = len(intrusive_list(semaphore['_inactive_reads']))
waiters = int(semaphore["_stats"]["waiters"])
gdb.write("Semaphore ({}*) 0x{:x} {} with: {}/{} count and {}/{} memory resources, queued: {}, inactive={}\n".format(
semaphore.type.name,
int(semaphore.address),
gdb.write("Semaphore {} with: {}/{} count and {}/{} memory resources, queued: {}, inactive={}\n".format(
semaphore_name,
initial_count - int(semaphore['_resources']['count']), initial_count,
initial_memory - int(semaphore['_resources']['memory']), initial_memory,

Submodule seastar updated: 4dcd4df5e7...7ec14e836a

View File

@@ -224,13 +224,7 @@ future<> service::client_state::has_access(const sstring& ks, auth::command_desc
ks + " can be granted only SELECT or DESCRIBE permissions to a non-superuser.");
}
static const std::unordered_set<auth::resource> vector_search_system_resources = {
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::GROUP0_HISTORY),
auth::make_data_resource(db::system_keyspace::NAME, db::system_keyspace::VERSIONS),
};
if ((cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) ||
(cmd.permission == auth::permission::SELECT && vector_search_system_resources.contains(cmd.resource))) {
if (cmd.resource.kind() == auth::resource_kind::data && cmd.permission == auth::permission::SELECT && is_vector_indexed.has_value() && is_vector_indexed.value()) {
co_return co_await ensure_has_permission<auth::command_desc_with_permission_set>({auth::permission_set::of<auth::permission::SELECT, auth::permission::VECTOR_SEARCH_INDEXING>(), cmd.resource});

View File

@@ -79,8 +79,7 @@ group0_state_machine::group0_state_machine(raft_group0_client& client, migration
// the node won't try to fetch a topology snapshot if the other
// node doesn't support it yet.
_topology_change_enabled = true;
}))
, _in_memory_state_machine_enabled(utils::get_local_injector().is_enabled("group0_enable_sm_immediately")) {
})) {
_state_id_handler.run();
}
@@ -155,27 +154,6 @@ static future<> notify_client_route_change_if_needed(storage_service& storage_se
}
}
// Meant to be used only in error injections.
//
// Simulates a group0 command that was only partially applied: if `mutations`
// contains at least one mutation of the cdc_generations_v3 table that carries
// row tombstones, ONLY those mutations are passed to `mutate`, and then this
// coroutine never completes (it logs and sleeps in an endless loop until the
// process is killed). If no such mutation is present, it returns without
// applying anything, and the caller is expected to apply `mutations` itself.
//
// `mutate`    - callback that applies a batch of frozen mutations.
// `mutations` - the full batch of the command being applied; it is only read
//               here, the matching entries are copied into a new vector.
static future<> maybe_partially_apply_cdc_generation_deletion_then_get_stuck(
std::function<future<>(utils::chunked_vector<frozen_mutation_and_schema>)> mutate,
const utils::chunked_vector<frozen_mutation_and_schema>& mutations) {
// Matches mutations whose schema is cdc_generations_v3 and whose partition
// has a non-empty set of row tombstones. Note that the mutation is unfrozen
// on every evaluation of the predicate.
auto is_cdc_generation_data_clearing_mutation = [] (const frozen_mutation_and_schema& fm_s) {
return fm_s.s->id() == db::system_keyspace::cdc_generations_v3()->id()
&& !fm_s.fm.unfreeze(fm_s.s).partition().row_tombstones().empty();
};
if (std::any_of(mutations.begin(), mutations.end(), is_cdc_generation_data_clearing_mutation)) {
// Apply just the matching subset, leaving the rest of the command unapplied.
utils::chunked_vector<frozen_mutation_and_schema> filtered_mutations;
std::copy_if(mutations.begin(), mutations.end(), std::back_inserter(filtered_mutations), is_cdc_generation_data_clearing_mutation);
co_await mutate(std::move(filtered_mutations));
// Intentionally never return: the remaining mutations must not be applied.
while (true) {
slogger.info("group0 has hung on error injection, waiting for the process to be killed");
co_await seastar::sleep(std::chrono::seconds(1));
}
}
}
future<> write_mutations_to_database(storage_service& storage_service, storage_proxy& proxy, gms::inet_address from, utils::chunked_vector<canonical_mutation> cms) {
utils::chunked_vector<frozen_mutation_and_schema> mutations;
client_routes_service::client_route_keys client_routes_update;
@@ -200,13 +178,7 @@ future<> write_mutations_to_database(storage_service& storage_service, storage_p
throw std::runtime_error(::format("Error while applying mutations: {}", e));
}
auto mutate = [&proxy] (utils::chunked_vector<frozen_mutation_and_schema> mutations) {
return proxy.mutate_locally(std::move(mutations), tracing::trace_state_ptr(), db::commitlog::force_sync::no);
};
if (utils::get_local_injector().is_enabled("group0_simulate_partial_application_of_cdc_generation_deletion")) {
co_await maybe_partially_apply_cdc_generation_deletion_then_get_stuck(mutate, mutations);
}
co_await mutate(std::move(mutations));
co_await proxy.mutate_locally(std::move(mutations), tracing::trace_state_ptr(), db::commitlog::force_sync::no);
if (need_system_topology_flush) {
slogger.trace("write_mutations_to_database: flushing {}.{}", db::system_keyspace::NAME, db::system_keyspace::TOPOLOGY);
@@ -299,40 +271,41 @@ future<> group0_state_machine::merge_and_apply(group0_state_machine_merger& merg
// If we crash before appending the state ID, when we reapply the command after restart, the change will be applied because
// the state ID was not yet appended so the above check will pass.
std::optional<storage_service::state_change_hint> topology_state_change_hint;
modules_to_reload modules_to_reload;
// TODO: reapplication of a command after a crash may require contacting a quorum (we need to learn that the command
// is committed from a leader). But we may want to ensure that group 0 state is consistent after restart even without
// access to quorum, which means we cannot allow partially applied commands. We need to ensure that either the entire
// change is applied and the state ID is updated or none of this happens.
// E.g. use a write-ahead-entry which contains all this information and make sure it's replayed during restarts.
co_await std::visit(make_visitor(
[&] (schema_change& chng) -> future<> {
modules_to_reload = get_modules_to_reload(chng.mutations);
auto modules_to_reload = get_modules_to_reload(chng.mutations);
co_await _mm.merge_schema_from(locator::host_id{cmd.creator_id.uuid()}, std::move(chng.mutations));
co_await reload_modules(std::move(modules_to_reload));
},
[&] (broadcast_table_query& query) -> future<> {
auto result = co_await service::broadcast_tables::execute_broadcast_table_query(_sp, query.query, cmd.new_state_id);
_client.set_query_result(cmd.new_state_id, std::move(result));
},
[&] (topology_change& chng) -> future<> {
modules_to_reload = get_modules_to_reload(chng.mutations);
topology_state_change_hint = {.tablets_hint = replica::get_tablet_metadata_change_hint(chng.mutations)};
auto modules_to_reload = get_modules_to_reload(chng.mutations);
auto tablet_keys = replica::get_tablet_metadata_change_hint(chng.mutations);
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(chng.mutations));
co_await _ss.topology_transition({.tablets_hint = std::move(tablet_keys)});
co_await reload_modules(std::move(modules_to_reload));
},
[&] (mixed_change& chng) -> future<> {
modules_to_reload = get_modules_to_reload(chng.mutations);
topology_state_change_hint.emplace();
auto modules_to_reload = get_modules_to_reload(chng.mutations);
co_await _mm.merge_schema_from(locator::host_id{cmd.creator_id.uuid()}, std::move(chng.mutations));
co_await _ss.topology_transition();
co_await reload_modules(std::move(modules_to_reload));
},
[&] (write_mutations& muts) -> future<> {
modules_to_reload = get_modules_to_reload(muts.mutations);
auto modules_to_reload = get_modules_to_reload(muts.mutations);
co_await write_mutations_to_database(_ss, _sp, cmd.creator_addr, std::move(muts.mutations));
}
), cmd.change);
if (_in_memory_state_machine_enabled) {
if (topology_state_change_hint) {
co_await _ss.topology_transition(std::move(*topology_state_change_hint));
}
co_await reload_modules(std::move(modules_to_reload));
}
), cmd.change);
co_await _sp.mutate_locally({std::move(history)}, nullptr);
}
@@ -440,23 +413,9 @@ void group0_state_machine::drop_snapshot(raft::snapshot_id id) {
}
future<> group0_state_machine::load_snapshot(raft::snapshot_id id) {
// topology_state_load applies persisted state machine state into
// memory and thus needs to be protected with apply mutex
auto read_apply_mutex_holder = co_await _client.hold_read_apply_mutex(_abort_source);
if (_in_memory_state_machine_enabled) {
co_await reload_state();
}
}
// Switches on updates of the in-memory state machine and performs the first
// reload of the state.
//
// The read-apply mutex is acquired first and held until the coroutine finishes,
// so the flag flip and the reload happen under that mutex. If the flag is
// already set, nothing else is done.
future<> group0_state_machine::enable_in_memory_state_machine() {
    auto apply_mutex_guard = co_await _client.hold_read_apply_mutex(_abort_source);
    if (_in_memory_state_machine_enabled) {
        // Already enabled: nothing to reload.
        co_return;
    }
    _in_memory_state_machine_enabled = true;
    co_await reload_state();
}
future<> group0_state_machine::reload_state() {
// we assume that the apply mutex is held, topology_state_load applies
// persisted state machine into memory so it needs to be protected with it
co_await _ss.topology_state_load();
co_await _ss.view_building_state_load();
if (_feature_service.compression_dicts) {

View File

@@ -113,33 +113,9 @@ class group0_state_machine : public raft_state_machine {
gms::feature_service& _feature_service;
gms::feature::listener_registration _topology_on_raft_support_listener;
// This boolean controls whether the in-memory data structures should be updated
// after snapshot transfer / command application.
//
// The reason for the flag is to protect from reading a partially applied state.
// A group0 command may consist of multiple mutations that are not applied
// in a single, atomic operation, but rather separately. A node can crash
// in the middle of applying such a command, leaving the group0 in an inconsistent
// state. Thanks to the idempotency of mutations, applying the group0 command
// again, fully, will make the state consistent again. Therefore, we use this
// flag to control when the in memory state machine should be updated from the
// on-disk state - we can only do that if we know that the group0 table state
// is consistent.
//
// The only exception to the above rule is the schema - the schema state is
// loaded into memory before group0 is initialized, and the in-memory state
// is reloaded even if _in_memory_state_machine_enabled is set to false.
// Resolving this exception should be possible, but would require considerable
// effort in refactoring the migration manager code. In the meantime, we are
// fine with this exception because the migration manager applies all schema
// mutations of a single command atomically, in a single commitlog entry -
// therefore, we should not observe broken invariants in the schema module.
bool _in_memory_state_machine_enabled;
modules_to_reload get_modules_to_reload(const utils::chunked_vector<canonical_mutation>& mutations);
future<> reload_modules(modules_to_reload modules);
future<> merge_and_apply(group0_state_machine_merger& merger);
future<> reload_state();
public:
group0_state_machine(raft_group0_client& client, migration_manager& mm, storage_proxy& sp, storage_service& ss,
gms::gossiper& gossiper, gms::feature_service& feat, bool topology_change_enabled);
@@ -149,7 +125,6 @@ public:
future<> load_snapshot(raft::snapshot_id id) override;
future<> transfer_snapshot(raft::server_id from_id, raft::snapshot_descriptor snp) override;
future<> abort() override;
future<> enable_in_memory_state_machine();
};
bool should_flush_system_topology_after_applying(const mutation& mut, const data_dictionary::database db);

View File

@@ -244,7 +244,6 @@ raft_server_for_group raft_group0::create_server_for_group0(raft::group_id gid,
service::migration_manager& mm, bool topology_change_enabled) {
auto state_machine = std::make_unique<group0_state_machine>(
_client, mm, qp.proxy(), ss, _gossiper, _feat, topology_change_enabled);
auto& state_machine_ref = *state_machine;
auto rpc = std::make_unique<group0_rpc>(_raft_gr.direct_fd(), *state_machine, _ms.local(), _raft_gr.failure_detector(), gid, my_id);
// Keep a reference to a specific RPC class.
auto& rpc_ref = *rpc;
@@ -278,7 +277,6 @@ raft_server_for_group raft_group0::create_server_for_group0(raft::group_id gid,
.ticker = std::move(ticker),
.rpc = rpc_ref,
.persistence = persistence_ref,
.state_machine = state_machine_ref,
.default_op_timeout_in_ms = qp.proxy().get_db().local().get_config().group0_raft_op_timeout_in_ms
};
}
@@ -473,10 +471,8 @@ future<> raft_group0::start_server_for_group0(raft::group_id group0_id, service:
auto srv_for_group0 = create_server_for_group0(group0_id, my_id, ss, qp, mm, topology_change_enabled);
auto& persistence = srv_for_group0.persistence;
auto& server = *srv_for_group0.server;
co_await with_scheduling_group(_sg, [this, &srv_for_group0] (this auto self) -> future<> {
auto& state_machine = dynamic_cast<group0_state_machine&>(srv_for_group0.state_machine);
co_await _raft_gr.start_server_for_group(std::move(srv_for_group0));
co_await state_machine.enable_in_memory_state_machine();
co_await with_scheduling_group(_sg, [this, srv_for_group0 = std::move(srv_for_group0)] () mutable {
return _raft_gr.start_server_for_group(std::move(srv_for_group0));
});
_group0.emplace<raft::group_id>(group0_id);

View File

@@ -92,6 +92,8 @@ static logging::logger logger("group0_client");
*
* Furthermore, obtaining the guard ensures that we don't read partial state, since it holds a lock that is also taken
* during command application (`_read_apply_mutex_holder`). The lock is released just before sending the command to Raft.
* TODO: we may still read partial state if we crash in the middle of command application.
* See the TODO in `group0_state_machine::merge_and_apply` for a proposed fix.
*
* Obtaining the guard also ensures that there is no concurrent group 0 operation running on this node using another lock
* (`_operation_mutex_holder`); if we allowed multiple concurrent operations to run, some of them could fail

View File

@@ -27,7 +27,6 @@ namespace service {
class raft_rpc;
class raft_sys_table_storage;
class raft_state_machine;
using raft_ticker_type = seastar::timer<lowres_clock>;
struct raft_group_not_found: public raft::error {
@@ -52,7 +51,6 @@ struct raft_server_for_group {
std::unique_ptr<raft_ticker_type> ticker;
raft_rpc& rpc;
raft_sys_table_storage& persistence;
raft_state_machine& state_machine;
std::optional<seastar::shared_future<>> aborted;
std::optional<utils::updateable_value<uint32_t>> default_op_timeout_in_ms;
};

View File

@@ -534,14 +534,12 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
std::vector<future<>> sys_ks_futures;
auto process_left_node = [&] (raft::server_id id, locator::host_id host_id, std::optional<gms::inet_address> ip, bool notify) -> future<> {
auto process_left_node = [&] (raft::server_id id, locator::host_id host_id, std::optional<gms::inet_address> ip) -> future<> {
if (ip) {
sys_ks_futures.push_back(_sys_ks.local().remove_endpoint(*ip));
co_await _gossiper.force_remove_endpoint(host_id, gms::null_permit_id);
if (notify) {
nodes_to_notify.left.push_back({*ip, host_id});
}
nodes_to_notify.left.push_back({*ip, host_id});
}
if (t.left_nodes_rs.find(id) != t.left_nodes_rs.end()) {
@@ -638,7 +636,7 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
for (const auto& id: t.left_nodes) {
locator::host_id host_id{id.uuid()};
auto ip = _address_map.find(host_id);
co_await process_left_node(id, host_id, ip, id_to_ip_map.find(host_id) != id_to_ip_map.end());
co_await process_left_node(id, host_id, ip);
if (ip) {
sys_ks_futures.push_back(raft_topology_update_ip(host_id, *ip, id_to_ip_map, nullptr));
}
@@ -856,7 +854,7 @@ future<> storage_service::topology_state_load(state_change_hint hint) {
}
}
// Ban all left and ignored nodes. We do not allow them to go back online.
// Ban all left and ignored nodes. We do not allow them to go back online.
co_await _messaging.local().ban_hosts(boost::join(topology.left_nodes, topology.ignored_nodes)
| std::views::transform([] (auto id) { return locator::host_id{id.uuid()}; })
| std::ranges::to<utils::chunked_vector<locator::host_id>>());
@@ -5113,21 +5111,11 @@ semaphore& storage_service::get_do_sample_sstables_concurrency_limiter() {
return _do_sample_sstables_concurrency_limiter;
}
future<uint64_t> storage_service::estimate_total_sstable_volume(table_id t, ignore_errors errors) {
future<uint64_t> storage_service::estimate_total_sstable_volume(table_id t) {
co_return co_await seastar::map_reduce(
_db.local().get_token_metadata().get_host_ids(),
[&] (auto h) -> future<uint64_t> {
try {
co_return co_await ser::storage_service_rpc_verbs::send_estimate_sstable_volume(&_messaging.local(), h, t);
}
catch(...) {
if (errors == ignore_errors::yes) {
// If the call failed we just return 0 for this one
slogger.info("call to estimate_total_sstable_volume failed for table {} and host {}, returning 0", t, h);
co_return 0;
}
throw;
}
return ser::storage_service_rpc_verbs::send_estimate_sstable_volume(&_messaging.local(), h, t);
},
uint64_t(0),
std::plus<uint64_t>()
@@ -5275,6 +5263,7 @@ future<> storage_service::raft_rebuild(utils::optional_param sdc_param) {
rtlogger.info("request rebuild for: {} source_dc={}", raft_server.id(), sdc_param);
topology_mutation_builder builder(guard.write_timestamp());
builder.set_session(session_id(guard.new_group0_state_id()));
sstring source_dc = sdc_param.value_or("");
if (sdc_param.force() && !source_dc.empty()) {
source_dc += ":force";
@@ -6537,19 +6526,14 @@ future<> storage_service::clone_locally_tablet_storage(locator::global_tablet_id
leaving.host, pending.host));
}
// All sstables cloned locally will be left unsealed, until they're loaded into the table.
// This is to guarantee no unsplit sstables will be left sealed on disk, which could
// cause problems if unsplit sstables are found after split was ACKed to coordinator.
bool leave_unsealed = true;
auto d = co_await smp::submit_to(leaving.shard, [this, tablet, leave_unsealed] () -> future<utils::chunked_vector<sstables::entry_descriptor>> {
auto d = co_await smp::submit_to(leaving.shard, [this, tablet] () -> future<utils::chunked_vector<sstables::entry_descriptor>> {
auto& table = _db.local().find_column_family(tablet.table);
auto op = table.stream_in_progress();
co_return co_await table.clone_tablet_storage(tablet.tablet, leave_unsealed);
co_return co_await table.clone_tablet_storage(tablet.tablet);
});
rtlogger.debug("Cloned storage of tablet {} from leaving replica {}, {} sstables were found", tablet, leaving, d.size());
auto load_sstable = [leave_unsealed] (const dht::sharder& sharder, replica::table& t, sstables::entry_descriptor d) -> future<sstables::shared_sstable> {
auto load_sstable = [] (const dht::sharder& sharder, replica::table& t, sstables::entry_descriptor d) -> future<sstables::shared_sstable> {
auto& mng = t.get_sstables_manager();
auto sst = mng.make_sstable(t.schema(), t.get_storage_options(), d.generation, d.state.value_or(sstables::sstable_state::normal),
d.version, d.format, db_clock::now(), default_io_error_handler_gen());
@@ -6557,8 +6541,7 @@ future<> storage_service::clone_locally_tablet_storage(locator::global_tablet_id
// will still point to leaving replica at this stage in migration. If node goes down,
// SSTables will be loaded at pending replica and migration is retried, so correctness
// wise, we're good.
auto cfg = sstables::sstable_open_config{ .current_shard_as_sstable_owner = true,
.unsealed_sstable = leave_unsealed };
auto cfg = sstables::sstable_open_config{ .current_shard_as_sstable_owner = true };
co_await sst->load(sharder, cfg);
co_return sst;
};
@@ -6566,23 +6549,16 @@ future<> storage_service::clone_locally_tablet_storage(locator::global_tablet_id
co_await smp::submit_to(pending.shard, [this, tablet, load_sstable, d = std::move(d)] () mutable -> future<> {
// Loads cloned sstables from leaving replica into pending one.
auto& table = _db.local().find_column_family(tablet.table);
auto& sstm = table.get_sstables_manager();
auto op = table.stream_in_progress();
dht::auto_refreshing_sharder sharder(table.shared_from_this());
std::unordered_set<sstables::shared_sstable> ssts;
std::vector<sstables::shared_sstable> ssts;
ssts.reserve(d.size());
for (auto&& sst_desc : d) {
ssts.insert(co_await load_sstable(sharder, table, std::move(sst_desc)));
ssts.push_back(co_await load_sstable(sharder, table, std::move(sst_desc)));
}
auto on_add = [&ssts, &sstm] (sstables::shared_sstable loading_sst) -> future<> {
if (ssts.contains(loading_sst)) {
auto cfg = sstm.configure_writer(loading_sst->get_origin());
co_await loading_sst->seal_sstable(cfg.backup);
}
co_return;
};
auto loaded_ssts = co_await table.add_new_sstables_and_update_cache(std::vector(ssts.begin(), ssts.end()), on_add);
_view_building_worker.local().load_sstables(tablet.table, loaded_ssts);
co_await table.add_sstables_and_update_cache(ssts);
_view_building_worker.local().load_sstables(tablet.table, ssts);
});
rtlogger.debug("Successfully loaded storage of tablet {} into pending replica {}", tablet, pending);
}
@@ -7288,10 +7264,6 @@ future<locator::load_stats> storage_service::load_stats_for_tablet_based_tables(
tls.effective_capacity = si.available + sum_tablet_sizes;
}
utils::get_local_injector().inject("clear_tablet_stats_in_load_stats", [&] {
load_stats.tablet_stats.erase(this_host);
});
co_return std::move(load_stats);
}

View File

@@ -1143,11 +1143,7 @@ private:
utils::disk_space_monitor* _disk_space_monitor; // != nullptr only on shard0.
public:
struct ignore_errors_tag;
using ignore_errors = seastar::bool_class<ignore_errors_tag>;
// if ignore_errors set to yes, the function will not throw an exception - any error will be ignored
// and node for which an error happened will be considered having size 0.
future<uint64_t> estimate_total_sstable_volume(table_id, ignore_errors = ignore_errors::no);
future<uint64_t> estimate_total_sstable_volume(table_id);
future<std::vector<std::byte>> train_dict(utils::chunked_vector<temporary_buffer<char>> sample);
future<> publish_new_sstable_dict(table_id, std::span<const std::byte>, service::raft_group0_client&);
void set_train_dict_callback(decltype(_train_dict));

View File

@@ -204,7 +204,6 @@ struct colocated_tablets {
// to same destination.
struct migration_tablet_set {
std::variant<global_tablet_id, colocated_tablets> tablet_s;
uint64_t tablet_set_disk_size = 0;
table_id table() const {
return std::visit(
@@ -233,9 +232,7 @@ struct migration_tablet_set {
return std::holds_alternative<colocated_tablets>(tablet_s);
}
bool operator==(const migration_tablet_set& rhs) const {
return tablet_s == rhs.tablet_s;
}
auto operator<=>(const migration_tablet_set&) const = default;
};
struct migration_candidate {
@@ -529,20 +526,16 @@ class load_balancer {
// Represents metric for load which we want to equalize between shards or nodes.
// Load balancer equalizes storage utilization.
// In case force_capacity_based_balancing is true, it is assumed that each tablet has equal size and that
// shards and nodes can have different capacity. If force_capacity_based_balancing is false,
// tablet sizes are fetched from load_stats.
// So we equalize: sum of tablet_sizes / capacity_in_bytes.
// It is assumed that each tablet has equal size and that shards and nodes can have different capacity.
// So we equalize: tablet_count * target_tablet_size / capacity_in_bytes.
using load_type = double;
using table_candidates_map = std::unordered_map<table_id, std::unordered_set<migration_tablet_set>>;
struct shard_load {
size_t tablet_count = 0;
std::optional<disk_usage> dusage;
absl::flat_hash_map<table_id, size_t> tablet_count_per_table;
absl::flat_hash_map<table_id, uint64_t> tablet_sizes_per_table;
// Number of tablets which are streamed from this shard.
size_t streaming_read_load = 0;
@@ -590,16 +583,15 @@ class load_balancer {
host_id id;
uint64_t shard_count = 0;
uint64_t tablet_count = 0;
std::optional<disk_usage> dusage; // Invariant: bool(dusage) || drained.
std::optional<uint64_t> capacity; // Invariant: bool(capacity) || drained.
bool drained = false;
const locator::node* node; // never nullptr
// The average shard load on this node.
// Valid only when "dusage" is set.
// Valid only when "capacity" is set.
load_type avg_load = 0;
absl::flat_hash_map<table_id, size_t> tablet_count_per_table;
absl::flat_hash_map<table_id, uint64_t> tablet_sizes_per_table;
// heap which tracks most-loaded shards using shards_by_load_cmp().
// Valid during intra-node plan-making for nodes which are in the source node set.
@@ -610,8 +602,8 @@ class load_balancer {
utils::chunked_vector<skipped_candidate> skipped_candidates;
std::optional<double> capacity_per_shard() const {
return dusage.transform([&] (auto du) {
return load_type(du.capacity) / shard_count;
return capacity.transform([&] (auto cap) {
return double(cap) / shard_count;
});
}
@@ -628,17 +620,16 @@ class load_balancer {
}
// Call when tablet_count or capacity changes.
void update() {
if (auto load = get_avg_load()) {
void update(uint64_t target_tablet_size) {
if (auto load = get_avg_load(tablet_count, target_tablet_size)) {
avg_load = *load;
}
}
// Result engaged when !drained.
std::optional<load_type> get_avg_load(uint64_t used_size_delta = 0) const {
return dusage.transform([&] (auto du) {
du.used += used_size_delta;
return du.get_load();
std::optional<load_type> get_avg_load(uint64_t tablets, uint64_t target_tablet_size) const {
return capacity.transform([&] (auto capacity) {
return load_type(tablets * target_tablet_size) / capacity;
});
}
@@ -651,20 +642,20 @@ class load_balancer {
}
// Result engaged for !drained nodes.
std::optional<load_type> shard_load(shard_id shard, int64_t used_size_delta = 0) const {
return shards[shard].dusage.transform([&] (auto du) {
du.used += used_size_delta;
return du.get_load();
std::optional<load_type> shard_load(shard_id shard, uint64_t target_tablet_size) const {
return shard_load(shard, shards[shard].tablet_count, target_tablet_size);
}
// Result engaged for !drained nodes.
std::optional<load_type> shard_load(shard_id shard, uint64_t shard_tablet_count, uint64_t target_tablet_size) const {
return capacity_per_shard().transform([&] (auto cap_per_shard) {
return load_type(shard_tablet_count * target_tablet_size) / cap_per_shard;
});
}
auto shards_by_load_cmp() {
return [this] (const auto& a, const auto& b) {
if (dusage) {
return shards[a].dusage->get_load() < shards[b].dusage->get_load();
} else {
return shards[a].tablet_count < shards[b].tablet_count;
}
return shards[a].tablet_count < shards[b].tablet_count;
};
}
@@ -820,8 +811,6 @@ class load_balancer {
std::unordered_set<global_tablet_id> _scheduled_tablets;
// Holds tablet replica count per table in the balanced node set (within a single DC).
absl::flat_hash_map<table_id, size_t> _tablet_count_per_table;
// Holds total used storage per table in the DC
absl::flat_hash_map<table_id, uint64_t> _disk_used_per_table;
dc_name _dc;
std::optional<sstring> _rack; // Set when plan making is limited to a single rack.
size_t _total_capacity_shards; // Total number of non-drained shards in the balanced node set.
@@ -832,19 +821,6 @@ class load_balancer {
std::unordered_set<host_id> _skiplist;
bool _use_table_aware_balancing = true;
double _initial_scale = 1;
// This is the maximum load delta between the most and least loaded nodes,
// below which the balancer considers the DC balanced
double _size_based_balance_threshold = 0.01;
// When this is set to true, the balancer assumes all tablets
// have the same size: _target_tablet_size
bool _force_capacity_based_balancing = false;
// The minimal tablet size the balancer will compute load with. For any tablet smaller than this,
// the balancer will use this size instead of the actual tablet size.
uint64_t _minimal_tablet_size = service::default_target_tablet_size / 100;
private:
tablet_replica_set get_replicas_for_tablet_load(const tablet_info& ti, const tablet_transition_info* trinfo) const {
// We reflect migrations in the load as if they already happened,
@@ -931,16 +907,7 @@ public:
, _table_load_stats(std::move(table_load_stats))
, _stats(stats)
, _skiplist(std::move(skiplist))
, _size_based_balance_threshold(db.get_config().size_based_balance_threshold_percentage() / 100.0)
, _force_capacity_based_balancing(db.get_config().force_capacity_based_balancing())
, _minimal_tablet_size(db.get_config().minimal_tablet_size_for_balancing()) {
// Force capacity based balancing until all the nodes have been upgraded
if (!_db.features().size_based_load_balancing && !_force_capacity_based_balancing) {
lblogger.info("Size based load balancing cluster feature disabled; forcing capacity based balancing");
_force_capacity_based_balancing = true;
}
}
{ }
bool ongoing_rack_list_colocation() const {
return _topology != nullptr && _sys_ks != nullptr && !_topology->paused_rf_change_requests.empty();
@@ -1005,13 +972,6 @@ public:
return (it != _table_load_stats->tables.end()) ? &it->second : nullptr;
}
// Looks up the size of a tablet replica on `host` by delegating to
// get_tablet_size_in_transition() of the table load stats.
//
// Returns std::nullopt when no table load stats are available; otherwise
// returns whatever the load stats lookup yields for the given tablet,
// tablet info and (possibly null) transition info.
std::optional<uint64_t> get_tablet_size(host_id host, const range_based_tablet_id& rb_tid, const tablet_info& ti, const tablet_transition_info* trinfo) const {
    if (!_table_load_stats) {
        return std::nullopt;
    }
    return _table_load_stats->get_tablet_size_in_transition(host, rb_tid, ti, trinfo);
}
future<bool> needs_auto_repair(const locator::global_tablet_id& gid, const locator::tablet_info& info,
const locator::repair_scheduler_config& config, const db_clock::time_point& now, db_clock::duration& diff) {
co_return false;
@@ -1030,25 +990,15 @@ public:
load.id = host;
load.node = node;
load.shard_count = node->get_shard_count();
load.shards.resize(load.shard_count);
if (!load.shard_count) {
throw std::runtime_error(format("Shard count of {} not found in topology", host));
}
if (!_db.features().tablet_load_stats_v2) {
// This way load calculation will hold tablet count.
load.dusage = disk_usage{_target_tablet_size * load.shard_count, 0};
} else if (_table_load_stats) {
if (_table_load_stats->tablet_stats.contains(host) && !_force_capacity_based_balancing) {
load.dusage = disk_usage{_table_load_stats->tablet_stats.at(host).effective_capacity, 0};
} else if (_table_load_stats->capacity.contains(host)) {
load.dusage = disk_usage{_table_load_stats->capacity.at(host), 0};
}
}
load.shards.resize(load.shard_count);
if (load.dusage) {
for (auto& sload : load.shards) {
sload.dusage = disk_usage{ load.dusage->capacity / load.shard_count, 0 };
}
load.capacity = _target_tablet_size * load.shard_count;
} else if (_table_load_stats && _table_load_stats->capacity.contains(host)) {
load.capacity = _table_load_stats->capacity.at(host);
}
}
@@ -1981,10 +1931,6 @@ public:
const auto& table_groups = _tm->tablets().all_table_groups();
auto finalize_decision = [&] {
if (utils::get_local_injector().enter("tablet_resize_finalization_postpone")) {
return;
}
_stats.for_cluster().resizes_finalized++;
resize_plan.finalize_resize.insert(table);
};
@@ -2068,19 +2014,6 @@ public:
return utils::get_local_injector().enter("tablet_allocator_shuffle");
}
// Decides whether the spread between the least and the most loaded entity is
// small enough to be treated as balanced.
//
// - With _force_capacity_based_balancing set, only exactly equal loads count
//   as balanced.
// - Otherwise a zero max_load is always balanced, and any other pair is
//   balanced when (max_load - min_load) / max_load is strictly below
//   _size_based_balance_threshold.
bool is_balanced(load_type min_load, load_type max_load) const {
    if (_force_capacity_based_balancing) {
        return min_load == max_load;
    }

    // Guard against dividing by zero below.
    if (max_load == 0) {
        return true;
    }

    const load_type spread = max_load - min_load;
    const load_type relative_spread = spread / max_load;
    return relative_spread < _size_based_balance_threshold;
}
// If cluster cannot agree on tablet merge feature, then merge will not be finalized since
// not all nodes in the cluster can handle the finalization step.
bool bypass_merge_completion() const {
@@ -2124,14 +2057,14 @@ public:
return *shard_info.candidates_all_tables.begin();
}
// Evaluates impact on load balance of migrating a tablet set of a given table to dst.
migration_badness evaluate_dst_badness(node_load_map& nodes, table_id table, tablet_replica dst, uint64_t tablet_set_disk_size) {
// Evaluates impact on load balance of migrating a single tablet of a given table to dst.
migration_badness evaluate_dst_badness(node_load_map& nodes, table_id table, tablet_replica dst) {
_stats.for_dc(_dc).candidates_evaluated++;
auto& node_info = nodes[dst.host];
// Size of all tablet replicas of the table in bytes.
uint64_t table_size = _disk_used_per_table[table];
uint64_t table_size = _tablet_count_per_table[table] * _target_tablet_size;
if (node_info.drained) {
// Moving a tablet to a drained node is always bad.
@@ -2141,36 +2074,33 @@ public:
double ideal_table_load = double(table_size) / _total_capacity_storage;
auto compute_load_and_dst_badness = [&] (uint64_t capacity, uint64_t new_used) {
double new_load = double(new_used) / capacity;
// Divide badness by table_size to take into account that moving a tablet of a small table has
// greater impact on balance of that table than moving a tablet of the same size of a larger table
return std::make_pair(new_load, (new_load - ideal_table_load) / table_size);
};
uint64_t capacity = node_info.shards[dst.shard].dusage->capacity;
uint64_t new_used = node_info.shards[dst.shard].tablet_sizes_per_table[table] + tablet_set_disk_size;
auto [new_shard_load, dst_shard_badness] = compute_load_and_dst_badness(capacity, new_used);
// max number of tablets per shard to keep perfect distribution.
double shard_balance_threshold = ideal_table_load;
auto new_tablet_count_per_shard = node_info.shards[dst.shard].tablet_count_per_table[table] + 1;
auto new_shard_load = *node_info.shard_load(dst.shard, new_tablet_count_per_shard, _target_tablet_size);
auto dst_shard_badness = (new_shard_load - shard_balance_threshold) / table_size;
lblogger.trace("Table {} @{} shard balance threshold: {}, dst: {} ({:.4f})", table, dst,
ideal_table_load, new_shard_load, dst_shard_badness);
shard_balance_threshold, new_shard_load, dst_shard_badness);
capacity = node_info.dusage->capacity;
new_used = node_info.tablet_sizes_per_table[table] + tablet_set_disk_size;
auto [new_node_load, dst_node_badness] = compute_load_and_dst_badness(capacity, new_used);
// max number of tablets per node to keep perfect distribution.
double node_balance_threshold = ideal_table_load;
size_t new_tablet_count_per_node = node_info.tablet_count_per_table[table] + 1;
load_type new_node_load = *node_info.get_avg_load(new_tablet_count_per_node, _target_tablet_size);
auto dst_node_badness = (new_node_load - node_balance_threshold) / table_size;
lblogger.trace("Table {} @{} node balance threshold: {}, dst: {} ({:.4f})", table, dst,
ideal_table_load, new_node_load, dst_node_badness);
node_balance_threshold, new_node_load, dst_node_badness);
return migration_badness{0, 0, dst_shard_badness, dst_node_badness};
}
// Evaluates impact on load balance of migrating a tablet set of a given table from src.
migration_badness evaluate_src_badness(node_load_map& nodes, table_id table, tablet_replica src, uint64_t tablet_set_disk_size) {
// Evaluates impact on load balance of migrating a single tablet of a given table from src.
migration_badness evaluate_src_badness(node_load_map& nodes, table_id table, tablet_replica src) {
_stats.for_dc(_dc).candidates_evaluated++;
auto& node_info = nodes[src.host];
// Size of all tablet replicas of the table in bytes.
uint64_t table_size = _disk_used_per_table[table];
uint64_t table_size = _tablet_count_per_table[table] * _target_tablet_size;
if (node_info.drained) {
// Moving a tablet away from a drained node is always good.
@@ -2179,32 +2109,28 @@ public:
double ideal_table_load = double(table_size) / _total_capacity_storage;
auto compute_load_and_src_badness = [&] (uint64_t capacity, uint64_t new_used) {
// Divide badness by table_size to take into account that moving a tablet of a small table has
// greater impact on balance of that table than moving a tablet of the same size of a larger table
double new_load = double(new_used) / capacity;
return std::make_pair(new_load, (ideal_table_load - new_load) / table_size);
};
uint64_t capacity = node_info.shards[src.shard].dusage->capacity;
uint64_t new_used = node_info.shards[src.shard].tablet_sizes_per_table[table] - tablet_set_disk_size;
auto [new_shard_load, src_shard_badness] = compute_load_and_src_badness(capacity, new_used);
double leaving_shard_balance_threshold = ideal_table_load;
auto new_tablet_count_per_shard = node_info.shards[src.shard].tablet_count_per_table[table] - 1;
auto new_shard_load = *node_info.shard_load(src.shard, new_tablet_count_per_shard, _target_tablet_size);
auto src_shard_badness = (leaving_shard_balance_threshold - new_shard_load) / table_size;
lblogger.trace("Table {} @{} shard balance threshold: {}, src: {} ({:.4f})", table, src,
ideal_table_load, new_shard_load, src_shard_badness);
leaving_shard_balance_threshold, new_shard_load, src_shard_badness);
capacity = node_info.dusage->capacity;
new_used = node_info.tablet_sizes_per_table[table] - tablet_set_disk_size;
auto [new_node_load, src_node_badness] = compute_load_and_src_badness(capacity, new_used);
// max number of tablets per node to keep perfect distribution.
double leaving_node_balance_threshold = ideal_table_load;
size_t new_tablet_count_per_node = node_info.tablet_count_per_table[table] - 1;
auto new_node_load = *node_info.get_avg_load(new_tablet_count_per_node, _target_tablet_size);
auto src_node_badness = (leaving_node_balance_threshold - new_node_load) / table_size;
lblogger.trace("Table {} @{} node balance threshold: {}, src: {} ({:.4f})", table, src,
ideal_table_load, new_node_load, src_node_badness);
leaving_node_balance_threshold, new_node_load, src_node_badness);
return migration_badness{src_shard_badness, src_node_badness, 0, 0};
}
// Evaluates impact on load balance of migrating a single tablet of a given table from src to dst.
migration_badness evaluate_candidate(node_load_map& nodes, table_id table, tablet_replica src, tablet_replica dst, uint64_t tablet_set_disk_size) {
auto src_badness = evaluate_src_badness(nodes, table, src, tablet_set_disk_size);
auto dst_badness = evaluate_dst_badness(nodes, table, dst, tablet_set_disk_size);
migration_badness evaluate_candidate(node_load_map& nodes, table_id table, tablet_replica src, tablet_replica dst) {
auto src_badness = evaluate_src_badness(nodes, table, src);
auto dst_badness = evaluate_dst_badness(nodes, table, dst);
if (src.host == dst.host) {
src_badness.src_node_badness = 0;
@@ -2232,7 +2158,7 @@ public:
for (auto&& [table, tablets] : shard_info.candidates) {
if (!tablets.empty()) {
auto badness = evaluate_candidate(nodes, table, src, dst, tablets.begin()->tablet_set_disk_size);
auto badness = evaluate_candidate(nodes, table, src, dst);
auto candidate = migration_candidate{*tablets.begin(), src, dst, badness};
lblogger.trace("Candidate: {}", candidate);
if (!best_candidate || candidate.badness < best_candidate->badness) {
@@ -2306,9 +2232,9 @@ public:
// where tablets are moved back and forth between nodes and convergence is never reached.
//
// The assumption is that the algorithm moves tablets from more loaded nodes to less loaded nodes,
// so convergence is reached when the node we picked as source has a load lower than or equal to
// the load that the node we picked as the destination will have post-movement.
bool check_convergence(node_load& src_info, node_load& dst_info, uint64_t tablet_sizes) {
// so convergence is reached where the node we picked as source has lower load, or will have lower
// load post-movement, than the node we picked as the destination.
bool check_convergence(node_load& src_info, node_load& dst_info, unsigned delta = 1) {
if (src_info.drained) {
return true;
}
@@ -2324,9 +2250,10 @@ public:
}
// Prevent load inversion post-movement which can lead to oscillations.
if (src_info.avg_load <= *dst_info.get_avg_load(tablet_sizes)) {
lblogger.trace("Load inversion post-movement: src={} (avg_load={}), dst={} (avg_load={}) tablet_sizes={}",
src_info.id, src_info.avg_load, dst_info.id, dst_info.avg_load, tablet_sizes);
if (*src_info.get_avg_load(src_info.tablet_count - delta, _target_tablet_size) <
*dst_info.get_avg_load(dst_info.tablet_count + delta, _target_tablet_size)) {
lblogger.trace("Load inversion post-movement: src={} (avg_load={}), dst={} (avg_load={})",
src_info.id, src_info.avg_load, dst_info.id, dst_info.avg_load);
return false;
}
@@ -2334,83 +2261,76 @@ public:
}
bool check_convergence(node_load& src_info, node_load& dst_info, const migration_tablet_set& tablet_set) {
return check_convergence(src_info, dst_info, tablet_set.tablet_set_disk_size);
return check_convergence(src_info, dst_info, tablet_set.tablets().size());
}
// Checks whether moving a tablet from shard A to B (intra-node) would go against convergence.
// Returns false if the tablet should not be moved, and true if it may be moved.
// Can be called when node_info.drained.
bool check_intranode_convergence(const node_load& node_info, shard_id src_shard, shard_id dst_shard,
uint64_t used_size_delta) {
return node_info.shard_load(src_shard) > node_info.shard_load(dst_shard, int64_t(used_size_delta));
unsigned delta = 1) {
return node_info.shards[src_shard].tablet_count > node_info.shards[dst_shard].tablet_count + delta;
}
// Can be called when node_info.drained.
bool check_intranode_convergence(const node_load& node_info, shard_id src_shard, shard_id dst_shard,
const migration_tablet_set& tablet_set) {
return check_intranode_convergence(node_info, src_shard, dst_shard, tablet_set.tablet_set_disk_size);
return check_intranode_convergence(node_info, src_shard, dst_shard, tablet_set.tablets().size());
}
// Adjusts the load of the source and destination shards in the host where intra-node migration happens.
void update_node_load_on_migration(node_load& node_load, host_id host, shard_id src, shard_id dst, const migration_tablet_set& tablet_set) {
auto tablet_count = tablet_set.tablets().size();
auto tablet_sizes = tablet_set.tablet_set_disk_size;
auto table = tablet_set.tablets().front().table;
auto& dst_shard = node_load.shards[dst];
dst_shard.tablet_count += tablet_count;
dst_shard.tablet_count_per_table[table] += tablet_count;
dst_shard.tablet_sizes_per_table[table] += tablet_sizes;
dst_shard.dusage->used += tablet_sizes;
auto& src_shard = node_load.shards[src];
src_shard.tablet_count -= tablet_count;
src_shard.tablet_count_per_table[table] -= tablet_count;
src_shard.tablet_sizes_per_table[table] -= tablet_sizes;
src_shard.dusage->used -= tablet_sizes;
void update_node_load_on_migration(node_load& node_load, host_id host, shard_id src, shard_id dst, global_tablet_id tablet) {
    // Intra-node move of a single tablet: only the per-shard counters of the two
    // shards involved are touched. `host` is not read by this overload.
    const auto table = tablet.table;

    // The destination shard gains one tablet, both in total and for the tablet's table.
    auto& gaining_shard = node_load.shards[dst];
    ++gaining_shard.tablet_count;
    ++gaining_shard.tablet_count_per_table[table];

    // The source shard gives the same tablet up.
    auto& losing_shard = node_load.shards[src];
    --losing_shard.tablet_count;
    --losing_shard.tablet_count_per_table[table];
}
// Adjusts the load of the source and destination (host:shard) that were picked for the migration.
void update_node_load_on_migration(node_load_map& nodes, tablet_replica src, tablet_replica dst, global_tablet_id source_tablet) {
    const auto table = source_tablet.table;

    // Destination first: account for the incoming tablet on the target shard and
    // on the target node, then let the node refresh its derived load figures.
    {
        auto& dst_node = nodes[dst.host];
        auto& dst_shard = dst_node.shards[dst.shard];
        ++dst_shard.tablet_count;
        ++dst_shard.tablet_count_per_table[table];
        ++dst_node.tablet_count_per_table[table];
        ++dst_node.tablet_count;
        dst_node.update(_target_tablet_size);
    }

    // Then the mirror image on the source: one tablet fewer on the shard and on
    // the node, followed by the same refresh.
    auto& src_node = nodes[src.host];
    auto& src_shard = src_node.shards[src.shard];
    --src_shard.tablet_count;
    --src_shard.tablet_count_per_table[table];
    --src_node.tablet_count_per_table[table];
    --src_node.tablet_count;
    src_node.update(_target_tablet_size);
}
void update_node_load_on_migration(node_load& node_load, host_id host, shard_id src, shard_id dst, const migration_tablet_set& tablet_set) {
    // A tablet set is accounted for by applying the single-tablet intra-node
    // bookkeeping once per member of the set.
    for (const auto& member : tablet_set.tablets()) {
        update_node_load_on_migration(node_load, host, src, dst, member);
    }
}
void update_node_load_on_migration(node_load_map& nodes, tablet_replica src, tablet_replica dst, const migration_tablet_set& tablet_set) {
auto tablet_count = tablet_set.tablets().size();
auto tablet_sizes = tablet_set.tablet_set_disk_size;
auto table = tablet_set.tablets().front().table;
auto& dst_node = nodes[dst.host];
auto& dst_shard = dst_node.shards[dst.shard];
dst_shard.tablet_count += tablet_count;
dst_shard.tablet_count_per_table[table] += tablet_count;
dst_shard.tablet_sizes_per_table[table] += tablet_sizes;
dst_shard.dusage->used += tablet_sizes;
dst_node.tablet_count_per_table[table] += tablet_count;
dst_node.tablet_sizes_per_table[table] += tablet_sizes;
dst_node.tablet_count += tablet_count;
dst_node.dusage->used += tablet_sizes;
dst_node.update();
auto& src_node = nodes[src.host];
auto& src_shard = src_node.shards[src.shard];
src_shard.tablet_count -= tablet_count;
src_shard.tablet_count_per_table[table] -= tablet_count;
src_shard.tablet_sizes_per_table[table] -= tablet_sizes;
if (src_shard.dusage) {
src_shard.dusage->used -= tablet_sizes;
for (auto tablet : tablet_set.tablets()) {
update_node_load_on_migration(nodes, src, dst, tablet);
}
src_node.tablet_count_per_table[table] -= tablet_count;
src_node.tablet_sizes_per_table[table] -= tablet_sizes;
src_node.tablet_count -= tablet_count;
if (src_node.dusage) {
src_node.dusage->used -= tablet_sizes;
}
src_node.update();
}
static void unload(locator::load_sketch& sketch, host_id host, shard_id shard, const migration_tablet_set& tablet_set) {
sketch.unload(host, shard, tablet_set.tablets().size(), tablet_set.tablet_set_disk_size);
for (auto _ : tablet_set.tablets()) {
sketch.unload(host, shard);
}
}
static void pick(locator::load_sketch& sketch, host_id host, shard_id shard, const migration_tablet_set& tablet_set) {
sketch.pick(host, shard, tablet_set.tablets().size(), tablet_set.tablet_set_disk_size);
for (auto _ : tablet_set.tablets()) {
sketch.pick(host, shard);
}
}
void mark_as_scheduled(const tablet_migration_info& mig) {
@@ -2443,7 +2363,7 @@ public:
}
std::make_heap(src_shards.begin(), src_shards.end(), node_load.shards_by_load_cmp());
load_type max_load = 0; // Tracks max load among shards which ran out of candidates.
size_t max_load = 0; // Tracks max load among shards which ran out of candidates.
while (true) {
co_await coroutine::maybe_yield();
@@ -2483,16 +2403,14 @@ public:
// Convergence check
// When in shuffle mode, exit condition is guaranteed by running out of candidates or by load limit.
if (!shuffle && src == dst) {
if (!shuffle && (src == dst || !check_intranode_convergence(node_load, src, dst))) {
lblogger.debug("Node {} is balanced", host);
break;
}
if (!src_info.has_candidates()) {
lblogger.debug("No more candidates on shard {} of {}", src, host);
if (src_info.dusage) {
max_load = std::max(max_load, src_info.dusage->get_load());
}
max_load = std::max(max_load, src_info.tablet_count);
src_shards.pop_back();
push_back.cancel();
continue;
@@ -2501,6 +2419,7 @@ public:
auto candidate = co_await peek_candidate(nodes, src_info, tablet_replica{host, src}, tablet_replica{host, dst});
auto tablets = candidate.tablets;
// Recheck convergence to avoid oscillations if co-located tablets are being migrated together.
if (!shuffle && (src == dst || !check_intranode_convergence(node_load, src, dst, tablets))) {
lblogger.debug("Node {} is balanced", host);
break;
@@ -2520,7 +2439,7 @@ public:
}
apply_load(nodes, mig_streaming_info);
lblogger.debug("Adding migration: {} size: {}", mig, tablets.tablet_set_disk_size);
lblogger.debug("Adding migration: {}", mig);
_stats.for_dc(node_load.dc()).migrations_produced++;
_stats.for_dc(node_load.dc()).intranode_migrations_produced++;
mark_as_scheduled(mig);
@@ -2529,8 +2448,8 @@ public:
erase_candidates(nodes, tmap, tablets);
update_node_load_on_migration(node_load, host, src, dst, tablets);
pick(sketch, host, dst, tablets);
unload(sketch, host, src, tablets);
sketch.pick(host, dst);
sketch.unload(host, src);
}
co_return plan;
@@ -2759,7 +2678,7 @@ public:
-> future<migration_candidate> {
if (drain_skipped) {
auto source_tablets = src_node_info.skipped_candidates.back().tablets;
auto badness = evaluate_candidate(nodes, source_tablets.table(), src, dst, source_tablets.tablet_set_disk_size);
auto badness = evaluate_candidate(nodes, source_tablets.table(), src, dst);
co_return migration_candidate{source_tablets, src, dst, badness};
} else {
auto&& src_shard_info = src_node_info.shards[src.shard];
@@ -2786,7 +2705,7 @@ public:
continue;
}
auto badness = evaluate_dst_badness(nodes, tablets.table(), tablet_replica{new_target, 0}, tablets.tablet_set_disk_size);
auto badness = evaluate_dst_badness(nodes, tablets.table(), tablet_replica{new_target, 0});
if (!min_dst_host || badness.dst_node_badness < min_dst_badness.dst_node_badness) {
min_dst_badness = badness;
min_dst_host = new_target;
@@ -2806,29 +2725,21 @@ public:
// Find the best shards on best targets.
std::vector<tablet_replica> best_dsts;
for (auto host : best_hosts) {
for (shard_id new_dst_shard = 0; new_dst_shard < nodes[host].shard_count; new_dst_shard++) {
co_await coroutine::maybe_yield();
auto new_dst = tablet_replica{host, new_dst_shard};
auto badness = evaluate_dst_badness(nodes, tablets.table(), new_dst, tablets.tablet_set_disk_size);
auto badness = evaluate_dst_badness(nodes, tablets.table(), new_dst);
if (!min_dst || badness < min_dst_badness) {
min_dst_badness = badness;
min_dst = new_dst;
best_dsts.clear();
}
if (badness.dst_shard_badness == min_dst_badness.dst_shard_badness) {
best_dsts.push_back(new_dst);
}
}
if (min_dst && !min_dst_badness.is_bad()) {
break;
}
}
if (best_dsts.size() > 1) {
min_dst = best_dsts[rand_int() % best_dsts.size()];
}
if (!min_dst) {
on_internal_error(lblogger, fmt::format("No destination shards on {}", best_hosts));
@@ -2855,42 +2766,31 @@ public:
// Consider better alternatives.
if (drain_skipped) {
auto tablets = src_node_info.skipped_candidates.back().tablets;
auto badness = evaluate_src_badness(nodes, tablets.table(), src, tablets.tablet_set_disk_size);
auto badness = evaluate_src_badness(nodes, tablets.table(), src);
co_await evaluate_targets(tablets, src, badness);
} else {
// Find a better candidate.
// Consider different tables. For each table, first find the best source shard.
// Then find the best target node. Then find the best shard on the target node.
for (auto [table, tablet_count] : src_node_info.tablet_count_per_table) {
if (tablet_count == 0) {
for (auto [table, load] : src_node_info.tablet_count_per_table) {
migration_badness min_src_badness;
std::optional<tablet_replica> min_src;
if (load == 0) {
lblogger.trace("No src candidates for table {} on node {}", table, src.host);
continue;
}
migration_badness min_src_badness;
std::optional<tablet_replica> min_src;
std::optional<migration_tablet_set> min_tablet_set;
auto check_candidate = [&] (const tablet_replica& new_src, const migration_tablet_set& tablet_set) {
auto badness = evaluate_src_badness(nodes, table, new_src, tablet_set.tablet_set_disk_size);
if (!min_src || badness < min_src_badness) {
min_src_badness = badness;
min_src = new_src;
min_tablet_set = tablet_set;
}
};
for (auto new_src_shard: src_node_info.shards_by_load) {
auto new_src = tablet_replica{src.host, new_src_shard};
if (src_node_info.shards[new_src_shard].candidates[table].empty()) {
lblogger.trace("No src candidates for table {} on shard {}", table, new_src);
continue;
}
co_await coroutine::maybe_yield();
if (_force_capacity_based_balancing) {
check_candidate(new_src, *src_node_info.shards[new_src_shard].candidates[table].begin());
} else {
for (const auto& tablet_set: src_node_info.shards[new_src_shard].candidates[table]) {
check_candidate(new_src, tablet_set);
}
auto badness = evaluate_src_badness(nodes, table, new_src);
if (!min_src || badness < min_src_badness) {
min_src_badness = badness;
min_src = new_src;
}
}
@@ -2899,7 +2799,8 @@ public:
continue;
}
co_await evaluate_targets(*min_tablet_set, *min_src, min_src_badness);
auto tablet = *src_node_info.shards[min_src->shard].candidates[table].begin();
co_await evaluate_targets(tablet, *min_src, min_src_badness);
if (!min_candidate.badness.is_bad()) {
break;
}
@@ -3001,7 +2902,7 @@ public:
shard_id shard = 0;
for (auto&& shard_load : node_load.shards) {
lblogger.debug("shard {}: load: {}, tablets: {}, candidates: {}, tables: {}", tablet_replica {host, shard},
node_load.shard_load(shard), shard_load.tablet_count,
node_load.shard_load(shard, _target_tablet_size), shard_load.tablet_count,
shard_load.candidate_count(), shard_load.tablet_count_per_table);
shard++;
}
@@ -3119,7 +3020,7 @@ public:
// When draining nodes, disable convergence checks so that all tablets are migrated away.
bool can_check_convergence = !shuffle && nodes_to_drain.empty();
if (can_check_convergence) {
if (!shuffle && nodes_to_drain.empty()) {
// Check if all nodes reached the same avg_load. There are three sets of nodes: target, candidates (nodes_by_load)
// and off-candidates (removed from nodes_by_load). At any time, the avg_load for target is not greater than
// that of any candidate, and avg_load of any candidate is not greater than that of any in the off-candidates set.
@@ -3129,12 +3030,17 @@ public:
// is tracked in max_off_candidate_load. If max_off_candidate_load is equal to target's avg_load,
// it means that all nodes have equal avg_load. We take the maximum with the current candidate in src_node_info
// to handle the case of off-candidates being empty. In that case, max_off_candidate_load is 0.
const load_type max_load = std::max(max_off_candidate_load, src_node_info.avg_load);
if (is_balanced(target_info.avg_load, max_load)) {
if (std::max(max_off_candidate_load, src_node_info.avg_load) == target_info.avg_load) {
lblogger.debug("Balance achieved.");
_stats.for_dc(dc).stop_balance++;
break;
}
if (!check_convergence(src_node_info, target_info)) {
lblogger.debug("No more candidates. Load would be inverted.");
_stats.for_dc(dc).stop_load_inversion++;
break;
}
}
// Pick best target shard.
@@ -3142,13 +3048,13 @@ public:
auto dst = global_shard_id {target, _load_sketch->get_least_loaded_shard(target)};
lblogger.trace("target shard: {}, tablets={}, load={}", dst.shard,
target_info.shards[dst.shard].tablet_count,
target_info.shard_load(dst.shard));
target_info.shard_load(dst.shard, _target_tablet_size));
if (lblogger.is_enabled(seastar::log_level::trace)) {
shard_id shard = 0;
for (auto&& shard_load : target_info.shards) {
lblogger.trace("shard {}: load: {}, tablets: {}, candidates: {}, tables: {}", tablet_replica{dst.host, shard},
target_info.shard_load(shard), shard_load.tablet_count,
target_info.shard_load(shard, _target_tablet_size), shard_load.tablet_count,
shard_load.candidate_count(), shard_load.tablet_count_per_table);
shard++;
}
@@ -3164,6 +3070,7 @@ public:
dst = candidate.dst;
auto& tmap = tmeta.get_tablet_map(source_tablets.table());
// If best candidate is co-located sibling tablets, then convergence is re-checked to avoid oscillations.
if (can_check_convergence && !check_convergence(src_node_info, target_info, source_tablets)) {
lblogger.debug("No more candidates. Load would be inverted.");
_stats.for_dc(dc).stop_load_inversion++;
@@ -3211,7 +3118,7 @@ public:
if (can_accept_load(nodes, mig_streaming_info)) {
apply_load(nodes, mig_streaming_info);
lblogger.debug("Adding migration: {} size: {}", mig, source_tablets.tablet_set_disk_size);
lblogger.debug("Adding migration: {}", mig);
_stats.for_dc(dc).migrations_produced++;
mark_as_scheduled(mig);
plan.add(std::move(mig));
@@ -3333,8 +3240,8 @@ public:
auto shuffle = in_shuffle_mode();
_stats.for_dc(dc).calls++;
lblogger.debug("Examining DC {} rack {} (shuffle={}, balancing={}, tablets_per_shard_goal={}, force_capacity_based_balancing={})",
dc, rack, shuffle, _tm->tablets().balancing_enabled(), _tablets_per_shard_goal, _force_capacity_based_balancing);
lblogger.debug("Examining DC {} rack {} (shuffle={}, balancing={}, tablets_per_shard_goal={})",
dc, rack, shuffle, _tm->tablets().balancing_enabled(), _tablets_per_shard_goal);
const locator::topology& topo = _tm->get_topology();
@@ -3433,54 +3340,60 @@ public:
}
}
_load_sketch = locator::load_sketch(_tm, _table_load_stats, _force_capacity_based_balancing ? _target_tablet_size : 0);
_load_sketch->set_minimal_tablet_size(_minimal_tablet_size);
_load_sketch->set_force_capacity_based_load(_force_capacity_based_balancing);
co_await _load_sketch->populate_dc(dc);
// If we don't have nodes to drain, remove nodes which don't have complete tablet sizes
if (nodes_to_drain.empty()) {
for (auto nodes_i = nodes.begin(); nodes_i != nodes.end();) {
host_id host = nodes_i->first;
if (!_load_sketch->has_complete_data(host)) {
lblogger.info("Node {} does not have complete tablet stats, ignoring", nodes_i->first);
nodes_i = nodes.erase(nodes_i);
} else {
++nodes_i;
}
}
}
plan.set_has_nodes_to_drain(!nodes_to_drain.empty());
// Invariant: node.dusage || node.drained
// Invariant: node.capacity || node.drained
for (auto& [host, node] : nodes) {
if (node.drained) {
continue;
}
if (!node.dusage) {
if (!node.capacity) {
lblogger.info("Cannot balance because capacity of node {} (or more) is unknown", host);
co_return plan;
}
}
// For size based balancing, only excluded nodes are allowed to have incomplete tablet stats
for (auto& [host, node] : nodes) {
if (!_load_sketch->has_complete_data(host)) {
if (!_force_capacity_based_balancing && node.drained && node.node->is_excluded()) {
_load_sketch->ignore_incomplete_data(host);
} else {
lblogger.info("Cannot balance because node {} (or more) has incomplete tablet stats", host);
co_return plan;
// Compute load imbalance.
_total_capacity_shards = 0;
_total_capacity_nodes = 0;
_total_capacity_storage = 0;
load_type max_load = 0;
load_type min_load = 0;
std::optional<host_id> min_load_node = std::nullopt;
for (auto&& [host, load] : nodes) {
load.update(_target_tablet_size);
_stats.for_node(dc, host).load = load.avg_load;
if (!load.drained) {
if (!min_load_node || load.avg_load < min_load) {
min_load = load.avg_load;
min_load_node = host;
}
if (load.avg_load > max_load) {
max_load = load.avg_load;
}
_total_capacity_shards += load.shard_count;
_total_capacity_nodes++;
_total_capacity_storage += *load.capacity;
}
}
// Check if we have destination nodes
const bool has_dest_nodes = std::ranges::any_of(std::views::values(nodes), [&] (const auto& load) {
return !load.drained;
});
if (!has_dest_nodes) {
for (auto&& [host, load] : nodes) {
size_t read = 0;
size_t write = 0;
for (auto& shard_load : load.shards) {
read += shard_load.streaming_read_load;
write += shard_load.streaming_write_load;
}
auto level = (read + write) > 0 ? seastar::log_level::info : seastar::log_level::debug;
lblogger.log(level, "Node {}: dc={} rack={} load={} tablets={} shards={} tablets/shard={} state={} cap={}"
" stream_read={} stream_write={}",
host, dc, load.rack(), load.avg_load, load.tablet_count, load.shard_count,
load.tablets_per_shard(), load.state(), load.capacity, read, write);
}
if (!min_load_node) {
if (!nodes_to_drain.empty()) {
throw std::runtime_error(format("There are nodes with tablets to drain but no candidate nodes in DC {}."
" Consider adding new nodes or reducing replication factor.", dc));
@@ -3502,13 +3415,13 @@ public:
// Compute per-shard load and candidate tablets.
_load_sketch = locator::load_sketch(_tm);
co_await _load_sketch->populate_dc(dc);
_tablet_count_per_table.clear();
_disk_used_per_table.clear();
for (auto&& [table, tables] : _tm->tablets().all_table_groups()) {
const auto& tmap = _tm->tablets().get_tablet_map(table);
uint64_t total_tablet_count = 0;
uint64_t total_tablet_sizes = 0;
uint64_t total_load = 0;
auto get_replicas = [this] (std::optional<tablet_desc> t) -> tablet_replica_set {
return t ? sorted_replicas_for_tablet_load(*t->info, t->transition) : tablet_replica_set{};
@@ -3546,60 +3459,26 @@ public:
if (!nodes.contains(replica.host)) {
continue;
}
utils::small_vector<uint64_t, 2> tablet_sizes;
uint64_t tablet_sizes_sum = 0;
for (auto tid : tids) {
if (_force_capacity_based_balancing) {
tablet_sizes_sum += _target_tablet_size;
tablet_sizes.push_back(_target_tablet_size);
} else {
uint64_t tablet_group_size = 0;
auto token_range = tmap.get_token_range(tid);
for (auto group_member : tables) {
const range_based_tablet_id rb_tid {group_member, token_range};
auto& member_tmap = _tm->tablets().get_tablet_map(group_member);
auto& ti = member_tmap.get_tablet_info(tid);
auto trinfo = member_tmap.get_tablet_transition_info(tid);
auto tablet_size_opt = get_tablet_size(replica.host, rb_tid, ti, trinfo);
const uint64_t tablet_size = std::max(tablet_size_opt.value_or(_target_tablet_size), _minimal_tablet_size);
tablet_group_size += tablet_size;
tablet_sizes_sum += tablet_size;
}
tablet_sizes.push_back(tablet_group_size);
}
}
auto& node_load_info = nodes[replica.host];
shard_load& shard_load_info = node_load_info.shards[replica.shard];
if (shard_load_info.tablet_count == 0) {
node_load_info.shards_by_load.push_back(replica.shard);
}
shard_load_info.tablet_count += tids.size();
if (shard_load_info.dusage) {
shard_load_info.dusage->used += tablet_sizes_sum;
}
shard_load_info.tablet_count_per_table[table] += tids.size();
shard_load_info.tablet_sizes_per_table[table] += tablet_sizes_sum;
node_load_info.tablet_count_per_table[table] += tids.size();
node_load_info.tablet_sizes_per_table[table] += tablet_sizes_sum;
if (node_load_info.dusage) {
node_load_info.dusage->used += tablet_sizes_sum;
}
total_tablet_count += tids.size();
total_tablet_sizes += tablet_sizes_sum;
total_load += tids.size();
if (tmap.needs_merge() && tids.size() == 2) {
// Exclude both sibling tablets if either haven't finished migration yet. That's to prevent balancer from
// un-doing the colocation.
if (!migrating(t1) && !migrating(t2)) {
auto candidate = colocated_tablets{global_tablet_id{table, t1.tid}, global_tablet_id{table, t2->tid}};
add_candidate(shard_load_info, migration_tablet_set{std::move(candidate), tablet_sizes_sum});
add_candidate(shard_load_info, migration_tablet_set{std::move(candidate)});
}
} else {
if (tids.size() != tablet_sizes.size()) {
on_internal_error(lblogger, "Number of co-located tablets and their sizes don't match.");
}
for (size_t i = 0; i < tids.size(); i++) {
if (!migrating(get_table_desc(tids[i]))) { // migrating tablets are not candidates
add_candidate(shard_load_info, migration_tablet_set{global_tablet_id{table, tids[i]}, tablet_sizes[i]});
for (auto tid : tids) {
if (!migrating(get_table_desc(tid))) { // migrating tablets are not candidates
add_candidate(shard_load_info, migration_tablet_set{global_tablet_id{table, tid}});
}
}
}
@@ -3607,51 +3486,10 @@ public:
return make_ready_future<>();
});
_disk_used_per_table[table] = total_tablet_sizes;
_tablet_count_per_table[table] = total_tablet_count;
_tablet_count_per_table[table] = total_load;
}
// Compute load imbalance.
_total_capacity_shards = 0;
_total_capacity_nodes = 0;
_total_capacity_storage = 0;
load_type max_load = 0;
load_type min_load = 0;
std::optional<host_id> min_load_node = std::nullopt;
for (auto&& [host, load] : nodes) {
load.update();
_stats.for_node(dc, host).load = load.avg_load;
if (!load.drained) {
if (!min_load_node || load.avg_load < min_load) {
min_load = load.avg_load;
min_load_node = host;
}
if (load.avg_load > max_load) {
max_load = load.avg_load;
}
_total_capacity_shards += load.shard_count;
_total_capacity_nodes++;
_total_capacity_storage += load.dusage->capacity;
}
}
for (auto&& [host, load] : nodes) {
size_t read = 0;
size_t write = 0;
for (auto& shard_load : load.shards) {
read += shard_load.streaming_read_load;
write += shard_load.streaming_write_load;
}
auto level = (read + write) > 0 ? seastar::log_level::info : seastar::log_level::debug;
lblogger.log(level, "Node {}: dc={} rack={} load={} tablets={} shards={} tablets/shard={} state={} cap={}"
" stream_read={} stream_write={}",
host, dc, load.rack(), load.avg_load, load.tablet_count, load.shard_count,
load.tablets_per_shard(), load.state(), load.dusage->capacity, read, write);
}
if (!nodes_to_drain.empty() || (_tm->tablets().balancing_enabled() && (shuffle || !is_balanced(min_load, max_load)))) {
if (!nodes_to_drain.empty() || (_tm->tablets().balancing_enabled() && (shuffle || max_load != min_load))) {
host_id target = *min_load_node;
lblogger.info("target node: {}, avg_load: {}, max: {}", target, min_load, max_load);
plan.merge(co_await make_internode_plan(dc, nodes, nodes_to_drain, target));

View File

@@ -1243,7 +1243,6 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
// Record the repair_time returned by the repair_tablet rpc call
db_clock::time_point repair_time;
service::session_id session_id;
uint32_t repair_update_compaction_ctrl_retried = 0;
};
std::unordered_map<locator::global_tablet_id, tablet_migration_state> _tablets;
@@ -1837,37 +1836,22 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
break;
case locator::tablet_transition_stage::end_repair: {
if (do_barrier()) {
if (tablet_state.session_id.uuid().is_null()) {
tablet_state.session_id = trinfo.session_id;
}
if (action_failed(tablet_state.repair_update_compaction_ctrl)) {
rtlogger.warn("Failed to perform repair_update_compaction_ctrl for tablet repair tablet_id={} session_id={} nr_retried={}",
gid, tablet_state.session_id, tablet_state.repair_update_compaction_ctrl_retried);
// Do not erase the tablet from _tablets or delete
// the transitions yet so we can retry the
// repair_update_compaction_ctrl verb
tablet_state.repair_update_compaction_ctrl_retried++;
rtlogger.warn("Failed to perform repair_update_compaction_ctrl for tablet repair tablet_id={}", gid);
_tablets.erase(gid);
updates.emplace_back(get_mutation_builder().del_transition(last_token).build());
break;
}
bool feature = _feature_service.tablet_incremental_repair;
if (advance_in_background(gid, tablet_state.repair_update_compaction_ctrl, "repair_update_compaction_ctrl", [this, ms = &_messaging,
gid = gid, sid = tablet_state.session_id, feature, &tmap] () -> future<> {
if (advance_in_background(gid, tablet_state.repair_update_compaction_ctrl, "repair_update_compaction_ctrl", [ms = &_messaging,
gid = gid, sid = tablet_state.session_id, _replicas = tmap.get_tablet_info(gid.tablet).replicas, feature] () -> future<> {
if (feature) {
if (utils::get_local_injector().enter("fail_rpc_repair_update_compaction_ctrl")) {
auto msg = fmt::format("Failed repair_update_compaction_ctrl for tablet repair tablet_id={} session_id={} due to error injection", gid, sid);
rtlogger.info("{}", msg);
throw std::runtime_error(msg);
}
auto& replicas = tmap.get_tablet_info(gid.tablet).replicas;
co_await coroutine::parallel_for_each(replicas, [this, ms, gid, sid] (locator::tablet_replica r) -> future<> {
if (!is_excluded(raft::server_id(r.host.uuid()))) {
auto replicas = std::move(_replicas);
co_await coroutine::parallel_for_each(replicas, [replicas, ms, gid, sid] (locator::tablet_replica& r) -> future<> {
co_await ser::repair_rpc_verbs::send_repair_update_compaction_ctrl(ms, r.host, gid, sid);
}
});
}
})) {
if (utils::get_local_injector().enter("log_tablet_transition_stage_end_repair")) {
rtlogger.info("The end_repair stage finished for tablet repair tablet_id={} session_id={}", gid, tablet_state.session_id);
}
_tablets.erase(gid);
updates.emplace_back(get_mutation_builder().del_transition(last_token).build());
}
@@ -1979,39 +1963,26 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
auto leaving = locator::get_leaving_replica(tinfo, trinfo);
auto pending = trinfo.pending_replica;
const dht::token_range trange {tmap.get_token_range(gid.tablet)};
switch (trinfo.transition) {
case locator::tablet_transition_kind::migration:
if (leaving && pending) {
// Handle tablet migration
new_load_stats = old_load_stats->migrate_tablet_size(leaving->host, pending->host, gid, trange);
break;
case locator::tablet_transition_kind::rebuild:
[[fallthrough]];
case locator::tablet_transition_kind::rebuild_v2:
// Handle rebuild
if (pending && old_load_stats->tablet_stats.contains(pending->host)) {
// Compute the average tablet size of existing replicas
uint64_t tablet_size_sum = 0;
size_t replica_count = 0;
const locator::range_based_tablet_id rb_tid {gid.table, trange};
auto tsi = get_migration_streaming_info(get_token_metadata().get_topology(), tinfo, trinfo);
for (auto& r : tsi.read_from) {
auto tablet_size_opt = old_load_stats->get_tablet_size(r.host, rb_tid);
if (tablet_size_opt) {
tablet_size_sum += *tablet_size_opt;
replica_count++;
}
}
if (replica_count) {
new_load_stats = make_lw_shared<locator::load_stats>(*old_load_stats);
new_load_stats->tablet_stats.at(pending->host).tablet_sizes[gid.table][trange] = tablet_size_sum / replica_count;
} else if (!leaving && pending) {
// Handle rebuild: compute the average tablet size of existing replicas
new_load_stats = make_lw_shared<locator::load_stats>(*old_load_stats);
uint64_t tablet_size_sum = 0;
size_t replica_count = 0;
const locator::range_based_tablet_id rb_tid {gid.table, trange};
for (auto r : tinfo.replicas) {
auto tablet_size_opt = new_load_stats->get_tablet_size(r.host, rb_tid);
if (tablet_size_opt) {
tablet_size_sum += *tablet_size_opt;
replica_count++;
}
}
break;
case locator::tablet_transition_kind::repair:
[[fallthrough]];
case locator::tablet_transition_kind::intranode_migration:
break;
if (replica_count && new_load_stats->tablet_stats.contains(pending->host)) {
new_load_stats->tablet_stats.at(pending->host).tablet_sizes[gid.table][trange] = tablet_size_sum / replica_count;
}
}
if (new_load_stats) {
_tablet_allocator.set_load_stats(std::move(new_load_stats));
@@ -2652,10 +2623,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
co_await _voter_handler.on_node_removed(replaced_node_id, _as);
}
}
utils::get_local_injector().inject("crash_coordinator_before_stream", [] {
rtlogger.info("crash_coordinator_before_stream: aborting");
abort();
});
utils::get_local_injector().inject("crash_coordinator_before_stream", [] { abort(); });
raft_topology_cmd cmd{raft_topology_cmd::command::stream_ranges};
auto state = node.rs->state;
try {
@@ -3190,8 +3158,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
}
case topology_request::rebuild: {
topology_mutation_builder builder(node.guard.write_timestamp());
builder.set_session(session_id(node.guard.new_group0_state_id()))
.with_node(node.id)
builder.with_node(node.id)
.set("node_state", node_state::rebuilding)
.del("topology_request");
co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()},

View File

@@ -1696,9 +1696,7 @@ void writer::consume_end_of_stream() {
.map = _collector.get_ext_timestamp_stats()
});
_sst.write_scylla_metadata(_shard, std::move(identifier), std::move(ld_stats), std::move(ts_stats));
if (!_cfg.leave_unsealed) {
_sst.seal_sstable(_cfg.backup).get();
}
_sst.seal_sstable(_cfg.backup).get();
}
uint64_t writer::data_file_position_for_tests() const {

View File

@@ -60,17 +60,12 @@ fmt::formatter<sstables::object_name>::format(const sstables::object_name& n, fm
return fmt::format_to(ctx.out(), "{}", n.str());
}
static shared_ptr<s3::client> make_s3_client(const db::object_storage_endpoint_param& ep, semaphore& memory, std::function<shared_ptr<s3::client>(std::string)> factory) {
auto& epc = ep.get_s3_storage();
return s3::client::make(epc.endpoint, epc.region, epc.iam_role_arn, memory, std::move(factory));
}
class s3_client_wrapper : public sstables::object_storage_client {
shared_ptr<s3::client> _client;
shard_client_factory _cf;
public:
s3_client_wrapper(const db::object_storage_endpoint_param& ep, semaphore& memory, shard_client_factory cf)
: _client(make_s3_client(ep, memory, std::bind_front(&s3_client_wrapper::shard_client, this)))
s3_client_wrapper(const std::string& host, s3::endpoint_config_ptr cfg, semaphore& memory, shard_client_factory cf)
: _client(s3::client::make(host, cfg, memory, std::bind_front(&s3_client_wrapper::shard_client, this)))
, _cf(std::move(cf))
{}
shared_ptr<s3::client> shard_client(std::string host) const {
@@ -103,8 +98,8 @@ public:
return _client->upload_file(std::move(path), name.str(), up, as);
}
future<> update_config(const db::object_storage_endpoint_param& ep) override {
auto& epc = ep.get_s3_storage();
return _client->update_config(epc.region, epc.iam_role_arn);
auto s3_cfg = make_lw_shared<s3::endpoint_config>(ep.get_s3_storage().config);
return _client->update_config(std::move(s3_cfg));
}
future<> close() override {
return _client->close();
@@ -295,7 +290,9 @@ public:
shared_ptr<object_storage_client> sstables::make_object_storage_client(const db::object_storage_endpoint_param& ep, semaphore& memory, shard_client_factory cf) {
if (ep.is_s3_storage()) {
return seastar::make_shared<s3_client_wrapper>(ep, memory, std::move(cf));
auto& epc = ep.get_s3_storage();
auto s3_cfg = make_lw_shared<s3::endpoint_config>(epc.config);
return seastar::make_shared<s3_client_wrapper>(epc.endpoint, std::move(s3_cfg), memory, std::move(cf));
}
if (ep.is_gs_storage()) {
return seastar::make_shared<gs_client_wrapper>(ep, memory, std::move(cf));

View File

@@ -83,8 +83,6 @@ struct sstable_open_config {
bool current_shard_as_sstable_owner = false;
// Do not move the sharding metadata to the sharder, keeping it in the scylla metadata..
bool keep_sharding_metadata = false;
// Allows unsealed sstable to be loaded, since it must read components from temporary TOC instead.
bool unsealed_sstable = false;
};
}

View File

@@ -719,7 +719,7 @@ future<> sstable_directory::filesystem_components_lister::cleanup_column_family_
fs::path dirpath = _directory / de->name;
if (dirpath.extension().string() == tempdir_extension) {
dirlog.info("Found temporary sstable directory: {}, removing", dirpath);
futures.push_back(io_check([dirpath = std::move(dirpath)] () { return seastar::recursive_remove_directory(dirpath); }));
futures.push_back(io_check([dirpath = std::move(dirpath)] () { return lister::rmdir(dirpath); }));
}
}
co_return futures;

View File

@@ -836,14 +836,13 @@ future<std::vector<sstring>> sstable::read_and_parse_toc(file f) {
// This is small enough, and well-defined. Easier to just read it all
// at once
future<> sstable::read_toc(sstable_open_config cfg) noexcept {
future<> sstable::read_toc() noexcept {
if (_recognized_components.size()) {
co_return;
}
try {
auto toc_type = cfg.unsealed_sstable ? component_type::TemporaryTOC : component_type::TOC;
co_await do_read_simple(toc_type, [&] (version_types v, file f) -> future<> {
co_await do_read_simple(component_type::TOC, [&] (version_types v, file f) -> future<> {
auto comps = co_await read_and_parse_toc(f);
for (auto& c: comps) {
// accept trailing newlines
@@ -901,8 +900,8 @@ future<std::unordered_map<component_type, file>> sstable::readable_file_for_all_
co_return std::move(files);
}
future<entry_descriptor> sstable::clone(generation_type new_generation, bool leave_unsealed) const {
co_await _storage->clone(*this, new_generation, leave_unsealed);
future<entry_descriptor> sstable::clone(generation_type new_generation) const {
co_await _storage->snapshot(*this, _storage->prefix(), storage::absolute_path::yes, new_generation);
co_return entry_descriptor(new_generation, _version, _format, component_type::TOC, _state);
}
@@ -1726,7 +1725,7 @@ void sstable::disable_component_memory_reload() {
}
future<> sstable::load_metadata(sstable_open_config cfg) noexcept {
co_await read_toc(cfg);
co_await read_toc();
// read scylla-meta after toc. Might need it to parse
// rest (hint extensions)
co_await read_scylla_metadata();
@@ -2123,7 +2122,7 @@ future<> sstable::seal_sstable(bool backup)
_marked_for_deletion = mark_for_deletion::none;
}
if (backup) {
co_await _storage->snapshot(*this, "backups");
co_await _storage->snapshot(*this, "backups", storage::absolute_path::no);
}
}
@@ -2489,9 +2488,9 @@ std::vector<std::pair<component_type, sstring>> sstable::all_components() const
return all;
}
future<> sstable::snapshot(const sstring& name) const {
future<> sstable::snapshot(const sstring& dir) const {
auto lock = co_await get_units(_mutate_sem, 1);
co_await _storage->snapshot(*this, format("{}/{}", sstables::snapshots_dir, name));
co_await _storage->snapshot(*this, dir, storage::absolute_path::yes);
}
future<> sstable::change_state(sstable_state to, delayed_commit_changes* delay_commit) {
@@ -3961,13 +3960,11 @@ class sstable_stream_sink_impl : public sstable_stream_sink {
shared_sstable _sst;
component_type _type;
bool _last_component;
bool _leave_unsealed;
public:
sstable_stream_sink_impl(shared_sstable sst, component_type type, sstable_stream_sink_cfg cfg)
sstable_stream_sink_impl(shared_sstable sst, component_type type, bool last_component)
: _sst(std::move(sst))
, _type(type)
, _last_component(cfg.last_component)
, _leave_unsealed(cfg.leave_unsealed)
, _last_component(last_component)
{}
private:
future<> load_metadata() const {
@@ -4014,12 +4011,10 @@ public:
co_return co_await make_file_output_stream(std::move(f), stream_options);
}
future<shared_sstable> close() override {
future<shared_sstable> close_and_seal() override {
if (_last_component) {
// If we are the last component in a sequence, we can seal the table.
if (!_leave_unsealed) {
co_await _sst->_storage->seal(*_sst);
}
co_await _sst->_storage->seal(*_sst);
co_return std::move(_sst);
}
_sst = {};
@@ -4036,7 +4031,7 @@ public:
}
};
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr schema, sstables_manager& sstm, const data_dictionary::storage_options& s_opts, sstable_state state, std::string_view component_filename, sstable_stream_sink_cfg cfg) {
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr schema, sstables_manager& sstm, const data_dictionary::storage_options& s_opts, sstable_state state, std::string_view component_filename, bool last_component) {
auto desc = parse_path(component_filename, schema->ks_name(), schema->cf_name());
auto sst = sstm.make_sstable(schema, s_opts, desc.generation, state, desc.version, desc.format);
@@ -4047,7 +4042,7 @@ std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr schema, sstab
type = component_type::TemporaryTOC;
}
return std::make_unique<sstable_stream_sink_impl>(std::move(sst), type, cfg);
return std::make_unique<sstable_stream_sink_impl>(std::move(sst), type, last_component);
}
generation_type

View File

@@ -109,7 +109,6 @@ struct sstable_writer_config {
size_t promoted_index_auto_scale_threshold;
uint64_t max_sstable_size = std::numeric_limits<uint64_t>::max();
bool backup = false;
bool leave_unsealed = false;
mutation_fragment_stream_validation_level validation_level;
std::optional<db::replay_position> replay_position;
std::optional<int> sstable_level;
@@ -418,8 +417,8 @@ public:
return component_basename(_schema->ks_name(), _schema->cf_name(), _version, _generation, _format, f);
}
component_name get_filename(component_type f = component_type::Data) const {
return component_name(*this, f);
component_name get_filename() const {
return component_name(*this, component_type::Data);
}
component_name toc_filename() const {
@@ -438,7 +437,7 @@ public:
std::vector<std::pair<component_type, sstring>> all_components() const;
future<> snapshot(const sstring& name) const;
future<> snapshot(const sstring& dir) const;
// Delete the sstable by unlinking all sstable files
// Ignores all errors.
@@ -694,7 +693,7 @@ private:
future<> update_info_for_opened_data(sstable_open_config cfg = {});
future<> read_toc(sstable_open_config cfg = {}) noexcept;
future<> read_toc() noexcept;
future<> read_summary() noexcept;
void write_summary() {
@@ -1070,9 +1069,8 @@ public:
future<std::unordered_map<component_type, file>> readable_file_for_all_components() const;
// Clones this sstable with a new generation, under the same location as the original one.
// If leave_unsealed is true, the destination sstable is left unsealed.
// Implementation is underlying storage specific.
future<entry_descriptor> clone(generation_type new_generation, bool leave_unsealed = false) const;
future<entry_descriptor> clone(generation_type new_generation) const;
struct lesser_reclaimed_memory {
// comparator class to be used by the _reclaimed set in sstables manager
@@ -1246,18 +1244,13 @@ public:
// closes this component. If this is the last component in a set (see "last_component" in creating method below)
// the table on disk will be sealed.
// Returns sealed sstable if last, or nullptr otherwise.
virtual future<shared_sstable> close() = 0;
virtual future<shared_sstable> close_and_seal() = 0;
virtual future<> abort() = 0;
};
struct sstable_stream_sink_cfg {
bool last_component = false;
bool leave_unsealed = false;
};
// Creates a sink object which can receive a component file sourced from above source object data.
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr, sstables_manager&, const data_dictionary::storage_options&, sstable_state, std::string_view component_filename, sstable_stream_sink_cfg cfg);
std::unique_ptr<sstable_stream_sink> create_stream_sink(schema_ptr, sstables_manager&, const data_dictionary::storage_options&, sstable_state, std::string_view component_filename, bool last_component);
} // namespace sstables

View File

@@ -11,7 +11,6 @@
#include <unordered_map>
#include "utils/log.hh"
#include "sstables/sstables_manager.hh"
#include "sstables/sstable_directory.hh"
#include "sstables/sstables_registry.hh"
#include "sstables/partition_index_cache.hh"
#include "sstables/sstables.hh"
@@ -21,7 +20,6 @@
#include "gms/feature.hh"
#include "gms/feature_service.hh"
#include "utils/assert.hh"
#include "utils/http.hh"
#include "exceptions/exceptions.hh"
namespace sstables {
@@ -81,7 +79,7 @@ storage_manager::object_storage_endpoint::object_storage_endpoint(db::object_sto
storage_manager::storage_manager(const db::config& cfg, config stm_cfg)
: _object_storage_clients_memory(stm_cfg.object_storage_clients_memory)
, _config_updater(std::make_unique<config_updater>(cfg, *this))
, _config_updater(this_shard_id() == 0 ? std::make_unique<config_updater>(cfg, *this) : nullptr)
{
for (auto& e : cfg.object_storage_endpoints()) {
_object_storage_endpoints.emplace(std::make_pair(e.key(), e));
@@ -139,16 +137,6 @@ future<> storage_manager::update_config(const db::config& cfg) {
auto storage_manager::get_endpoint(const sstring& endpoint) -> object_storage_endpoint& {
auto found = _object_storage_endpoints.find(endpoint);
if (found == _object_storage_endpoints.end() && maybe_legacy_endpoint_name(endpoint)) {
found = _object_storage_endpoints.begin();
while (found != _object_storage_endpoints.end()) {
auto uri = utils::http::parse_simple_url(found->first);
if (uri.host == endpoint) {
break;
}
found++;
}
}
if (found == _object_storage_endpoints.end()) {
smlogger.error("unable to find {} in configured object-storage endpoints", endpoint);
throw std::invalid_argument(format("endpoint {} not found", endpoint));
@@ -171,20 +159,7 @@ sstring storage_manager::get_endpoint_type(sstring endpoint) {
}
bool storage_manager::is_known_endpoint(sstring endpoint) const {
if (_object_storage_endpoints.contains(endpoint)) {
return true;
}
if (maybe_legacy_endpoint_name(endpoint)) {
for (auto ep : _object_storage_endpoints) {
auto uri = utils::http::parse_simple_url(ep.first);
if (uri.host == endpoint) {
return true;
}
}
}
return false;
return _object_storage_endpoints.contains(endpoint);
}
std::vector<sstring> storage_manager::endpoints(sstring type) const noexcept {
@@ -195,7 +170,9 @@ std::vector<sstring> storage_manager::endpoints(sstring type) const noexcept {
storage_manager::config_updater::config_updater(const db::config& cfg, storage_manager& sstm)
: action([&sstm, &cfg] () mutable {
return sstm.update_config(cfg);
return sstm.container().invoke_on_all([&cfg](auto& sstm) -> future<> {
co_await sstm.update_config(cfg);
});
})
, observer(cfg.object_storage_endpoints.observe(action.make_observer()))
{}
@@ -428,19 +405,6 @@ future<> sstables_manager::delete_atomically(std::vector<shared_sstable> ssts) {
co_await storage.atomic_delete_complete(std::move(ctx));
}
future<std::unordered_set<sstring>> sstables_manager::take_snapshot(std::vector<shared_sstable> ssts, sstring name) {
std::unordered_set<sstring> table_names;
co_await _dir_semaphore.parallel_for_each(ssts, [&name, &table_names] (sstables::shared_sstable sstable) {
table_names.insert(sstable->component_basename(sstables::component_type::Data));
return io_check([sstable, &name] {
return sstable->snapshot(name);
});
});
co_return table_names;
}
future<> sstables_manager::close() {
_closing = true;
maybe_done();

View File

@@ -248,7 +248,6 @@ public:
}
future<> delete_atomically(std::vector<shared_sstable> ssts);
future<std::unordered_set<sstring>> take_snapshot(std::vector<shared_sstable> ssts, sstring jsondir);
future<lw_shared_ptr<const data_dictionary::storage_options>> init_table_storage(const schema& s, const data_dictionary::storage_options& so);
future<> destroy_table_storage(const data_dictionary::storage_options& so);
future<> init_keyspace_storage(const data_dictionary::storage_options& so, sstring dir);

View File

@@ -50,11 +50,7 @@ class filesystem_storage final : public sstables::storage {
std::optional<std::filesystem::path> _temp_dir; // Valid while the sstable is being created, until sealed
private:
enum class link_mode {
default_mode,
mark_for_removal,
leave_unsealed,
};
using mark_for_removal = bool_class<class mark_for_removal_tag>;
template <typename Comp>
requires std::is_same_v<Comp, component_type> || std::is_same_v<Comp, sstring>
@@ -65,7 +61,8 @@ private:
future<> check_create_links_replay(const sstable& sst, const sstring& dst_dir, generation_type dst_gen, const std::vector<std::pair<sstables::component_type, sstring>>& comps) const;
future<> remove_temp_dir();
virtual future<> create_links(const sstable& sst, const std::filesystem::path& dir) const override;
future<> create_links_common(const sstable& sst, sstring dst_dir, generation_type dst_gen, link_mode mode) const;
future<> create_links_common(const sstable& sst, sstring dst_dir, generation_type dst_gen, mark_for_removal mark_for_removal) const;
future<> create_links_common(const sstable& sst, const std::filesystem::path& dir, std::optional<generation_type> dst_gen) const;
future<> touch_temp_dir(const sstable& sst);
future<> move(const sstable& sst, sstring new_dir, generation_type generation, delayed_commit_changes* delay) override;
future<> rename_new_file(const sstable& sst, sstring from_name, sstring to_name) const;
@@ -86,8 +83,7 @@ public:
{}
virtual future<> seal(const sstable& sst) override;
virtual future<> snapshot(const sstable& sst, sstring name) const override;
virtual future<> clone(const sstable& sst, generation_type gen, bool leave_unsealed) const override;
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen) const override;
virtual future<> change_state(const sstable& sst, sstable_state state, generation_type generation, delayed_commit_changes* delay) override;
// runs in async context
virtual void open(sstable& sst) override;
@@ -360,13 +356,8 @@ future<> filesystem_storage::check_create_links_replay(const sstable& sst, const
/// \param sst - the sstable to work on
/// \param dst_dir - the destination directory.
/// \param generation - the generation of the destination sstable
/// \param mode - what will be done after all components were linked
/// mark_for_removal - mark the sstable for removal after linking it to the destination dst_dir
/// leave_unsealed - leaves the destination sstable unsealed
future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst_dir, generation_type generation, link_mode mode) const {
// They're mutually exclusive, so we can assume only one is set.
bool mark_for_removal = mode == link_mode::mark_for_removal;
bool leave_unsealed = mode == link_mode::leave_unsealed;
/// \param mark_for_removal - mark the sstable for removal after linking it to the destination dst_dir
future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst_dir, generation_type generation, mark_for_removal mark_for_removal) const {
sstlog.trace("create_links: {} -> {} generation={} mark_for_removal={}", sst.get_filename(), dst_dir, generation, mark_for_removal);
auto comps = sst.all_components();
co_await check_create_links_replay(sst, dst_dir, generation, comps);
@@ -375,11 +366,7 @@ future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst
co_await sst.sstable_write_io_check(idempotent_link_file, fmt::to_string(sst.filename(component_type::TOC)), std::move(dst));
auto dir = opened_directory(dst_dir);
co_await dir.sync(sst._write_error_handler);
co_await parallel_for_each(comps, [this, &sst, &dst_dir, generation, leave_unsealed] (auto p) {
// Skips the linking of TOC file if the destination will be left unsealed.
if (leave_unsealed && p.first == component_type::TOC) {
return make_ready_future<>();
}
co_await parallel_for_each(comps, [this, &sst, &dst_dir, generation] (auto p) {
auto src = filename(sst, _dir.native(), sst._generation, p.second);
auto dst = filename(sst, dst_dir, generation, p.second);
return sst.sstable_write_io_check(idempotent_link_file, std::move(src), std::move(dst));
@@ -392,10 +379,9 @@ future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst
auto src_temp_toc = filename(sst, _dir.native(), sst._generation, component_type::TemporaryTOC);
co_await sst.sstable_write_io_check(rename_file, std::move(dst_temp_toc), std::move(src_temp_toc));
co_await _dir.sync(sst._write_error_handler);
} else if (!leave_unsealed) {
} else {
// Now that the source sstable is linked to dir, remove
// the TemporaryTOC file at the destination.
// This is bypassed if destination will be left unsealed.
co_await sst.sstable_write_io_check(remove_file, std::move(dst_temp_toc));
}
co_await dir.sync(sst._write_error_handler);
@@ -403,17 +389,23 @@ future<> filesystem_storage::create_links_common(const sstable& sst, sstring dst
sstlog.trace("create_links: {} -> {} generation={}: done", sst.get_filename(), dst_dir, generation);
}
future<> filesystem_storage::create_links(const sstable& sst, const std::filesystem::path& dir) const {
return create_links_common(sst, dir.native(), sst._generation, link_mode::default_mode);
future<> filesystem_storage::create_links_common(const sstable& sst, const std::filesystem::path& dir, std::optional<generation_type> gen) const {
return create_links_common(sst, dir.native(), gen.value_or(sst._generation), mark_for_removal::no);
}
future<> filesystem_storage::snapshot(const sstable& sst, sstring name) const {
std::filesystem::path snapshot_dir = _base_dir.path() / name;
co_await sst.sstable_touch_directory_io_check(snapshot_dir);
co_await create_links_common(sst, snapshot_dir.native(), sst._generation, link_mode::default_mode);
future<> filesystem_storage::create_links(const sstable& sst, const std::filesystem::path& dir) const {
return create_links_common(sst, dir.native(), sst._generation, mark_for_removal::no);
}
future<> filesystem_storage::clone(const sstable& sst, generation_type gen, bool leave_unsealed) const {
co_await create_links_common(sst, _dir.path().native(), std::move(gen), leave_unsealed ? link_mode::leave_unsealed : link_mode::default_mode);
future<> filesystem_storage::snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen) const {
std::filesystem::path snapshot_dir;
if (abs) {
snapshot_dir = dir;
} else {
snapshot_dir = _dir.path() / dir;
}
co_await sst.sstable_touch_directory_io_check(snapshot_dir);
co_await create_links_common(sst, snapshot_dir, std::move(gen));
}
future<> filesystem_storage::move(const sstable& sst, sstring new_dir, generation_type new_generation, delayed_commit_changes* delay_commit) {
@@ -421,7 +413,7 @@ future<> filesystem_storage::move(const sstable& sst, sstring new_dir, generatio
sstring old_dir = _dir.native();
sstlog.debug("Moving {} old_generation={} to {} new_generation={} do_sync_dirs={}",
sst.get_filename(), sst._generation, new_dir, new_generation, delay_commit == nullptr);
co_await create_links_common(sst, new_dir, new_generation, link_mode::mark_for_removal);
co_await create_links_common(sst, new_dir, new_generation, mark_for_removal::yes);
co_await change_dir(new_dir);
generation_type old_generation = sst._generation;
co_await coroutine::parallel_for_each(sst.all_components(), [&sst, old_generation, old_dir] (auto p) {
@@ -606,8 +598,7 @@ public:
{}
future<> seal(const sstable& sst) override;
future<> snapshot(const sstable& sst, sstring name) const override;
future<> clone(const sstable& sst, generation_type gen, bool leave_unsealed) const override;
future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type>) const override;
future<> change_state(const sstable& sst, sstable_state state, generation_type generation, delayed_commit_changes* delay) override;
// runs in async context
void open(sstable& sst) override;
@@ -824,16 +815,11 @@ future<> object_storage_base::unlink_component(const sstable& sst, component_typ
}
}
future<> object_storage_base::snapshot(const sstable& sst, sstring name) const {
future<> object_storage_base::snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen) const {
on_internal_error(sstlog, "Snapshotting S3 objects not implemented");
co_return;
}
future<> object_storage_base::clone(const sstable& sst, generation_type gen, bool leave_unsealed) const {
on_internal_error(sstlog, "Cloning S3 objects not implemented");
co_return;
}
std::unique_ptr<sstables::storage> make_storage(sstables_manager& manager, const data_dictionary::storage_options& s_opts, sstable_state state) {
return std::visit(overloaded_functor {
[state] (const data_dictionary::storage_options::local& loc) mutable -> std::unique_ptr<sstables::storage> {

View File

@@ -95,11 +95,11 @@ class storage {
public:
virtual ~storage() {}
using absolute_path = bool_class<class absolute_path_tag>; // FIXME -- should go away eventually
using sync_dir = bool_class<struct sync_dir_tag>; // meaningful only to filesystem storage
virtual future<> seal(const sstable& sst) = 0;
virtual future<> snapshot(const sstable& sst, sstring name) const = 0;
virtual future<> clone(const sstable& sst, generation_type gen, bool leave_unsealed) const = 0;
virtual future<> snapshot(const sstable& sst, sstring dir, absolute_path abs, std::optional<generation_type> gen = {}) const = 0;
virtual future<> change_state(const sstable& sst, sstable_state to, generation_type generation, delayed_commit_changes* delay) = 0;
// runs in async context
virtual void open(sstable& sst) = 0;

View File

@@ -63,45 +63,30 @@ mutation_reader_consumer make_streaming_consumer(sstring origin,
}
schema_ptr s = reader.schema();
// SSTable will be only sealed when added to the sstable set, so we make sure unsplit sstables aren't
// left sealed on the table directory.
auto cfg = cf->get_sstables_manager().configure_writer(origin);
cfg.leave_unsealed = true;
return sst->write_components(std::move(reader), adjusted_estimated_partitions, s,
cfg, encoding_stats{}).then([sst] {
return sst->open_data();
}).then([cf, sst, offstrategy, origin, repaired_at, sstable_list_to_mark_as_repaired, frozen_guard, cfg] -> future<std::vector<sstables::shared_sstable>> {
auto on_add = [sst, origin, repaired_at, sstable_list_to_mark_as_repaired, frozen_guard, cfg] (sstables::shared_sstable loading_sst) -> future<> {
if (repaired_at && sstables::repair_origin == origin) {
loading_sst->being_repaired = frozen_guard;
if (sstable_list_to_mark_as_repaired) {
sstable_list_to_mark_as_repaired->insert(loading_sst);
}
}).then([cf, sst, offstrategy, origin, repaired_at, sstable_list_to_mark_as_repaired, frozen_guard] -> future<> {
if (repaired_at && sstables::repair_origin == origin) {
sst->being_repaired = frozen_guard;
if (sstable_list_to_mark_as_repaired) {
sstable_list_to_mark_as_repaired->insert(sst);
}
if (loading_sst == sst) {
co_await loading_sst->seal_sstable(cfg.backup);
}
co_return;
};
}
if (offstrategy && sstables::repair_origin == origin) {
sstables::sstlog.debug("Enabled automatic off-strategy trigger for table {}.{}",
cf->schema()->ks_name(), cf->schema()->cf_name());
cf->enable_off_strategy_trigger();
}
co_return co_await cf->add_new_sstable_and_update_cache(sst, on_add, offstrategy);
}).then([cf, s, sst, use_view_update_path, &vb, &vbw] (std::vector<sstables::shared_sstable> new_sstables) mutable -> future<> {
auto& vb_ = vb;
auto new_sstables_ = std::move(new_sstables);
auto table = cf;
co_await cf->add_sstable_and_update_cache(sst, offstrategy);
}).then([cf, s, sst, use_view_update_path, &vb, &vbw]() mutable -> future<> {
if (use_view_update_path == db::view::sstable_destination_decision::staging_managed_by_vbc) {
co_return co_await vbw.local().register_staging_sstable_tasks(new_sstables_, cf->schema()->id());
return vbw.local().register_staging_sstable_tasks({sst}, cf->schema()->id());
} else if (use_view_update_path == db::view::sstable_destination_decision::staging_directly_to_generator) {
co_await coroutine::parallel_for_each(new_sstables_, [&vb_, &table] (sstables::shared_sstable sst) -> future<> {
return vb_.local().register_staging_sstable(sst, table);
});
return vb.local().register_staging_sstable(sst, std::move(cf));
}
co_return;
return make_ready_future<>();
});
};
if (!offstrategy) {

View File

@@ -52,16 +52,8 @@ static future<> load_sstable_for_tablet(const file_stream_id& ops_id, replica::d
auto erm = t.get_effective_replication_map();
auto& sstm = t.get_sstables_manager();
auto sst = sstm.make_sstable(t.schema(), t.get_storage_options(), desc.generation, state, desc.version, desc.format);
sstables::sstable_open_config cfg { .unsealed_sstable = true };
co_await sst->load(erm->get_sharder(*t.schema()), cfg);
auto on_add = [sst, &sstm] (sstables::shared_sstable loading_sst) -> future<> {
if (loading_sst == sst) {
auto cfg = sstm.configure_writer(sst->get_origin());
co_await loading_sst->seal_sstable(cfg.backup);
}
co_return;
};
auto new_sstables = co_await t.add_new_sstable_and_update_cache(sst, on_add);
co_await sst->load(erm->get_sharder(*t.schema()));
co_await t.add_sstable_and_update_cache(sst);
blogger.info("stream_sstables[{}] Loaded sstable {} successfully", ops_id, sst->toc_filename());
if (state == sstables::sstable_state::staging) {
@@ -72,7 +64,7 @@ static future<> load_sstable_for_tablet(const file_stream_id& ops_id, replica::d
// so then, the view building coordinator can decide to process it once the migration
// is finished.
// (Instead of registering the sstable to view update generator which may process it immediately.)
co_await sharded_vbw.local().register_staging_sstable_tasks(new_sstables, t.schema()->id());
co_await sharded_vbw.local().register_staging_sstable_tasks({sst}, t.schema()->id());
}
});
}
@@ -351,11 +343,7 @@ future<> stream_blob_handler(replica::database& db, db::view::view_building_work
auto& table = db.find_column_family(meta.table);
auto& sstm = table.get_sstables_manager();
// The SSTable will only be sealed once it is added to the sstable set, so we make sure unsplit sstables aren't
// left sealed in the table directory.
sstables::sstable_stream_sink_cfg cfg { .last_component = meta.fops == file_ops::load_sstables,
.leave_unsealed = true };
auto sstable_sink = sstables::create_stream_sink(table.schema(), sstm, table.get_storage_options(), sstable_state(meta), meta.filename, cfg);
auto sstable_sink = sstables::create_stream_sink(table.schema(), sstm, table.get_storage_options(), sstable_state(meta), meta.filename, meta.fops == file_ops::load_sstables);
auto out = co_await sstable_sink->output(foptions, stream_options);
co_return output_result{
[sstable_sink = std::move(sstable_sink), &meta, &db, &vbw](store_result res) -> future<> {
@@ -363,7 +351,7 @@ future<> stream_blob_handler(replica::database& db, db::view::view_building_work
co_await sstable_sink->abort();
co_return;
}
auto sst = co_await sstable_sink->close();
auto sst = co_await sstable_sink->close_and_seal();
if (sst) {
blogger.debug("stream_sstables[{}] Loading sstable {} on shard {}", meta.ops_id, sst->toc_filename(), meta.dst_shard_id);
auto desc = sst->get_descriptor(sstables::component_type::TOC);

View File

@@ -6,6 +6,9 @@ import run
import os
import requests
import time
import cassandra.cluster
import cassandra.auth
# When tests are to be run against AWS (the "--aws" option), it is not
# necessary to start Scylla at all. All we need to do is to run pytest.

View File

@@ -33,6 +33,7 @@ import botocore
import gzip
import requests
import pytest
from contextlib import contextmanager
from .util import random_string
from .test_manual_requests import get_signed_request

View File

@@ -21,7 +21,7 @@ from functools import cache
import re
from .util import unique_table_name, random_string, new_test_table
from .util import is_aws, unique_table_name, random_string, new_test_table
from .test_gsi_updatetable import wait_for_gsi, wait_for_gsi_gone
from .test_gsi import assert_index_query

View File

@@ -17,10 +17,8 @@
import pytest
from botocore.exceptions import ClientError
import re
import time
from test.alternator.util import multiset, new_test_table, random_string, scylla_config_temporary
import requests
import time, datetime
from test.alternator.util import multiset, create_test_table, new_test_table
# Test that DescribeTable correctly returns the table's name and state
def test_describe_table_basic(test_table):
@@ -77,7 +75,7 @@ def test_describe_table_creation_time(dynamodb):
with new_test_table(dynamodb, **schema) as table1:
# Sleep a few ms so that table2's creation time is always later than table1's, as we now return CreationDateTime with ms precision
time.sleep(0.002)
time.sleep(0.002)
with new_test_table(dynamodb, **schema) as table2:
got1 = table1.meta.client.describe_table(TableName=table1.name)['Table']
got2 = table2.meta.client.describe_table(TableName=table2.name)['Table']
@@ -116,54 +114,10 @@ def test_describe_table_item_count(test_table):
# Similar test for estimated size in bytes - TableSizeBytes - which again,
# may reflect the size as long as six hours ago.
# Checks that DescribeTable reports a 'TableSizeBytes' field holding a
# non-negative value. Marked xfail with the reason that DescribeTable does
# not return the table size.
@pytest.mark.xfail(reason="DescribeTable does not return table size")
def test_describe_table_size(test_table):
got = test_table.meta.client.describe_table(TableName=test_table.name)['Table']
# Only the presence and non-negativity of the value are asserted here,
# not any exact size.
assert 'TableSizeBytes' in got
assert got['TableSizeBytes'] >= 0
# This is a Scylla-only test - it uses a Scylla configuration option
# (alternator_describe_table_info_cache_validity_in_seconds) to make the
# result deterministic. We also don't have any guarantees about the sizes
# returned by DynamoDB.
# The test runs twice: once with a cache validity of 0 seconds and once with
# 3600 seconds.
@pytest.mark.parametrize("cache_validity_in_seconds", [0, 3600])
def test_describe_table_size_with_N_timeout(scylla_only, dynamodb, rest_api, cache_validity_in_seconds):
'''
This tests side effect of how describe table works in ScyllaDB - it caches calculated
values. The cache validity is set to 1 hour (long enough to cover whole test duration).
We call DescribeTable twice, expecting second value to be the same as first one due to the cache.
'''
# NOTE: the docstring above describes the 3600-second case only. With a
# validity of 0 the test instead expects the second size to be strictly
# larger than the first (see the final assertions).
schema = {
'KeySchema': [ { 'AttributeName': 'p', 'KeyType': 'HASH' },
{ 'AttributeName': 'c', 'KeyType': 'RANGE' }
],
'AttributeDefinitions': [
{ 'AttributeName': 'p', 'AttributeType': 'S' },
{ 'AttributeName': 'c', 'AttributeType': 'S' },
],
}
# The configuration value is passed as a string and applies only inside
# this "with" block.
with scylla_config_temporary(dynamodb, 'alternator_describe_table_info_cache_validity_in_seconds', str(cache_validity_in_seconds)):
with new_test_table(dynamodb, **schema) as test_table:
# First DescribeTable call, made before any item is written.
got1 = test_table.meta.client.describe_table(TableName=test_table.name)['Table']
assert 'TableSizeBytes' in got1
assert got1['TableSizeBytes'] >= 0
# Write a single item with random key and value strings.
p = random_string()
c = random_string()
v = random_string()
test_table.put_item(Item={'p': p, 'c': c, 'v': v})
# The keyspace name is built as 'alternator_' + table name, and the
# table name itself is passed as the 'cf' parameter of the flush request.
ks = 'alternator_' + test_table.name
cf = test_table.name
# We need to flush memtables to make sure size is updated, as current implementation
# calculates size based on sstables only
response = requests.post(rest_api+f'/storage_service/keyspace_flush/{ks}', params={'cf' : cf})
assert response.ok
# Second DescribeTable call, made after the write and the flush.
got2 = test_table.meta.client.describe_table(TableName=test_table.name)['Table']
assert 'TableSizeBytes' in got2
# Validity 0: the size must have grown. Validity 3600: the size must be
# unchanged from the first call.
if cache_validity_in_seconds == 0:
assert got2['TableSizeBytes'] > got1['TableSizeBytes']
else:
assert got2['TableSizeBytes'] == got1['TableSizeBytes']
# Test the ProvisionedThroughput attribute returned by DescribeTable.
# This is a very partial test: Our test table is configured without

View File

@@ -9,6 +9,7 @@
# various cases of that issue.
import pytest
import time
from botocore.exceptions import ClientError
from .util import random_string, full_scan, full_query, multiset, \
new_test_table, wait_for_gsi, wait_for_gsi_gone

View File

@@ -8,7 +8,8 @@
import pytest
from botocore.exceptions import ClientError
from test.alternator.util import new_test_table
from decimal import Decimal
from test.alternator.util import random_string, random_bytes, new_test_table
# When creating a table with PROVISIONED billing mode, ProvisionedThroughput must be explicitly set,
# and the same values should be reflected when the table is described.

View File

@@ -7,6 +7,7 @@
import pytest
from botocore.exceptions import ClientError
from test.alternator.util import random_string, random_bytes, new_test_table
import decimal
from decimal import Decimal
KB = 1024

View File

@@ -155,11 +155,11 @@ def test_delete_item_returnvalues(test_table_s):
# Other ReturnValues options - UPDATED_OLD, ALL_NEW, UPDATED_NEW -
# are supported by other operations but not by DeleteItem:
with pytest.raises(ClientError, match='ValidationException'):
test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATED_OLD')
test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATE_OLD')
with pytest.raises(ClientError, match='ValidationException'):
test_table_s.delete_item(Key={'p': p}, ReturnValues='ALL_NEW')
with pytest.raises(ClientError, match='ValidationException'):
test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATED_NEW')
test_table_s.delete_item(Key={'p': p}, ReturnValues='UPDATE_NEW')
# Also, obviously, an unsupported setting "DOG" also results in an error:
with pytest.raises(ClientError, match='ValidationException'):
test_table_s.delete_item(Key={'p': p}, ReturnValues='DOG')

View File

@@ -5,6 +5,7 @@
# Tests for the Scan operation
import pytest
import time
from boto3.dynamodb.conditions import Attr
from botocore.exceptions import ClientError

View File

@@ -5,9 +5,13 @@
import pytest
from test.alternator.test_metrics import metrics, get_metrics, check_increases_metric
from contextlib import contextmanager
from cassandra.auth import PlainTextAuthProvider
from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT, ConsistencyLevel
from cassandra.policies import RoundRobinPolicy
import time
import re
from .util import random_string, unique_table_name
from .util import random_string, is_aws, unique_table_name
from .test_cql_rbac import new_role, new_dynamodb
# new_service_level() is a context manager for temporarily creating a new

View File

@@ -11,7 +11,7 @@ import requests
from botocore.exceptions import ClientError
from boto3.dynamodb.conditions import Key
from .util import full_scan, scylla_config_read, scylla_config_temporary
from .util import full_scan, scylla_config_read, scylla_config_write, scylla_config_temporary
internal_prefix = '.scylla.alternator.'

View File

@@ -14,7 +14,7 @@ from re import fullmatch
import pytest
from botocore.exceptions import ClientError
from .util import list_tables, unique_table_name, create_test_table, random_string, new_test_table, is_aws, scylla_config_read
from test.alternator.util import list_tables, multiset, unique_table_name, create_test_table, random_string, new_test_table, is_aws, scylla_config_read
# Utility function for creating a table with a given name and some valid

View File

@@ -14,9 +14,10 @@
# will probably go away eventually.
import pytest
import boto3
from botocore.exceptions import ClientError
from .util import new_test_table, scylla_config_read, scylla_config_temporary
from .util import new_test_table, wait_for_gsi, random_string, full_scan, full_query, multiset, scylla_config_read, scylla_config_temporary
# All tests in this file are scylla-only
@pytest.fixture(scope="function", autouse=True)

View File

@@ -998,7 +998,7 @@ def test_transact_get_items_projection_expression(test_table_s):
# ProjectionExpression also supports ExpressionAttributeNames.
@pytest.mark.xfail(reason="#5064 - transactions not yet supported")
def test_transact_get_items_projection_expression_attribute_names(test_table_s):
def test_transact_get_items_projection_expression(test_table_s):
p = random_string()
item = {'p': p, 'x': 1, 'y': 2, 'z': 3}
test_table_s.put_item(Item=item)

View File

@@ -8,6 +8,7 @@ import string
import random
import collections
import time
import re
import requests
import json
import pytest

View File

@@ -365,7 +365,6 @@ add_scylla_test(combined_tests
schema_registry_test.cc
secondary_index_test.cc
sessions_test.cc
simple_value_with_expiry_test.cc
sstable_compaction_test.cc
sstable_compressor_factory_test.cc
sstable_compression_config_test.cc

View File

@@ -110,7 +110,7 @@ public:
virtual compaction::compaction_strategy_state& get_compaction_strategy_state() noexcept override { return _compaction_strategy_state; }
virtual reader_permit make_compaction_reader_permit() const override { return _semaphore.make_permit(); }
virtual sstables::sstables_manager& get_sstables_manager() noexcept override { return _sst_man; }
virtual sstables::shared_sstable make_sstable(sstables::sstable_state) const override { return _sstable_factory(); }
virtual sstables::shared_sstable make_sstable() const override { return _sstable_factory(); }
virtual sstables::sstable_writer_config configure_writer(sstring origin) const override { return _sst_man.configure_writer(std::move(origin)); }
virtual api::timestamp_type min_memtable_timestamp() const override { return api::min_timestamp; }
virtual api::timestamp_type min_memtable_live_timestamp() const override { return api::min_timestamp; }

View File

@@ -387,27 +387,4 @@ SEASTAR_TEST_CASE(select_from_vector_indexed_table) {
enable_tablets(db_config_with_auth()));
}
// Checks access to system.group0_history and system.versions for user bob:
// before any grant, a SELECT on either table must fail with
// unauthorized_exception whose message mentions the
// (VECTOR_SEARCH_INDEXING, SELECT) permissions; after granting
// VECTOR_SEARCH_INDEXING on all keyspaces, both SELECTs must succeed.
SEASTAR_TEST_CASE(select_from_vector_search_system_table) {
return do_with_cql_env_thread(
[](auto&& env) {
create_user_if_not_exists(env, bob);
// Before the grant: both SELECTs issued as bob are expected to be rejected.
with_user(env, bob, [&env] {
BOOST_REQUIRE_EXCEPTION(env.execute_cql("SELECT * FROM system.group0_history").get(), exceptions::unauthorized_exception,
exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
});
with_user(env, bob, [&env] {
BOOST_REQUIRE_EXCEPTION(env.execute_cql("SELECT * FROM system.versions").get(), exceptions::unauthorized_exception,
exception_predicate::message_contains("User bob has none of the permissions (VECTOR_SEARCH_INDEXING, SELECT) on"));
});
// The grant is issued outside with_user(), i.e. not as bob.
cquery_nofail(env, "GRANT VECTOR_SEARCH_INDEXING ON ALL KEYSPACES TO bob");
// After the grant: the same SELECTs issued as bob must not fail.
with_user(env, bob, [&env] {
cquery_nofail(env, "SELECT * FROM system.group0_history");
});
with_user(env, bob, [&env] {
cquery_nofail(env, "SELECT * FROM system.versions");
});
},
// The environment is created from db_config_with_auth().
db_config_with_auth());
}
BOOST_AUTO_TEST_SUITE_END()

View File

@@ -152,14 +152,9 @@ SEASTAR_TEST_CASE(test_inject_future_disabled) {
utils::error_injection<true> errinj;
auto start_time = steady_clock::now();
static constexpr milliseconds long_sleep_msec(10000);
return errinj.inject("futid", long_sleep_msec).then([start_time] {
return errinj.inject("futid", sleep_msec).then([start_time] {
auto wait_time = steady_clock::now() - start_time;
// Because the injection "futid" was not enabled, we expect the
// sleep to have not happened. If we measure the time that passed,
// it's obviously not zero (especially in a slow debug build on a
// busy test machine), but certainly not the full long_sleep_msec.
BOOST_REQUIRE_LT(wait_time, long_sleep_msec);
BOOST_REQUIRE_LT(wait_time, sleep_msec);
return make_ready_future<>();
});
}

View File

@@ -234,13 +234,13 @@ SEASTAR_THREAD_TEST_CASE(test_load_sketch) {
std::vector<unsigned> node3_shards(node3_shard_count, 0);
for (unsigned i = 0; i < node1_shard_count * 3; ++i) {
node1_shards[load.next_shard(host1, 1, service::default_target_tablet_size)] += 1;
node1_shards[load.next_shard(host1)] += 1;
}
for (unsigned i = 0; i < node2_shard_count * 3; ++i) {
node2_shards[load.next_shard(host2, 1, service::default_target_tablet_size)] += 1;
node2_shards[load.next_shard(host2)] += 1;
}
for (unsigned i = 0; i < node3_shard_count * 3; ++i) {
node3_shards[load.next_shard(host3, 1, service::default_target_tablet_size)] += 1;
node3_shards[load.next_shard(host3)] += 1;
}
for (unsigned i = 1; i < node1_shard_count; ++i) {
@@ -300,7 +300,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_sketch) {
// host3 has max shard load of 3 and 3 shards, and 4 tablets allocated.
// So to achieve even load we need to allocate 3 * 3 - 4 = 5 more tablets.
for (int i = 0; i < 5; ++i) {
auto s = load.next_shard(host3, 1, service::default_target_tablet_size);
auto s = load.next_shard(host3);
node3_shards[s] += 1;
}

View File

@@ -7,9 +7,7 @@
*/
#include "utils/assert.hh"
#include <seastar/core/sstring.hh>
#include <fmt/ranges.h>
#include <fmt/format.h>
#include <seastar/core/future.hh>
#include <seastar/testing/test_case.hh>
@@ -29,7 +27,7 @@ namespace fs = std::filesystem;
using namespace sstables;
using namespace tests;
#if 1
static future<> create_file_of_size(fs::path file, size_t dest_size) {
auto f = co_await seastar::open_file_dma(file.string(), open_flags::wo|open_flags::create);
auto os = co_await make_file_output_stream(std::move(f));
@@ -48,7 +46,7 @@ static future<> create_file_of_size(fs::path file, size_t dest_size) {
co_await os.flush();
co_await os.close();
}
#endif
static future<> compare_streams(input_stream<char>& is1, input_stream<char>& is2, size_t total) {
uint64_t read = 0;
while (!is1.eof()) {
@@ -60,7 +58,6 @@ static future<> compare_streams(input_stream<char>& is1, input_stream<char>& is2
BOOST_REQUIRE_EQUAL(buf, buf2);
read += buf.size();
}
BOOST_REQUIRE((co_await is1.read()).empty());
BOOST_REQUIRE((co_await is2.read()).empty());
BOOST_REQUIRE_EQUAL(read, total);
}
@@ -73,10 +70,8 @@ future<> test_file_upload(test_env_config cfg, size_t size) {
auto client = env.manager().get_endpoint_client(ep);
tmpdir tmp;
// For multiple tests that may run in CI at the same time, we want a different file to be used in each test
sstring test_file_name = fmt::format("testfile-{}-{}", std::chrono::system_clock::now().time_since_epoch().count(), tests::random::get_int(0, 1000000));
auto path = tmp.path() / test_file_name;
object_name name(bucket, test_file_name);
auto path = tmp.path() / "testfile";
object_name name(bucket, "testfile");
utils::upload_progress up;
create_file_of_size(path, size).get();

Some files were not shown because too many files have changed in this diff Show More