Files
scylla/sstable_dict_autotrainer.hh
Avi Kivity 0ae22a09d4 LICENSE: Update to version 1.1
Updated terms of non-commercial use (must be a never-customer).
2026-04-12 19:46:33 +03:00

68 lines
2.3 KiB
C++

/*
* Copyright (C) 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.1
*/
#pragma once
#include "replica/database.hh"
#include "service/raft/raft_group0_client.hh"
// Responsible for re-training the SSTable compression dicts
// (for dict-aware tables) periodically.
//
// As of this writing, it works like this:
// every $tick_period (15 minutes), if we are the current Raft leader,
// we check for dict-aware tables which have no dict, or a dict older
// than $retrain_period.
// For those tables, if they have enough data (>1GiB) for a training,
// we train a new dict and check if it's significantly better
// than the current one (provides ratio smaller than 95% of current ratio),
// and if so, we update the dict.
class sstable_dict_autotrainer {
public:
// Tunables of the autotrainer.
// Every field is a utils::updateable_value<>; presumably this lets the
// autotrainer observe live configuration changes without being
// reconstructed -- confirm against utils::updateable_value.
struct config {
// Period between two checks, in seconds
// (the $tick_period mentioned in the class comment).
utils::updateable_value<float> tick_period_in_seconds;
// Age, in seconds, after which an existing dict is considered
// due for re-training (the $retrain_period mentioned in the class comment).
utils::updateable_value<float> retrain_period_in_seconds;
// Presumably the minimum amount of table data, in bytes, required
// before a training is attempted (the class comment mentions >1GiB)
// -- confirm against the implementation.
utils::updateable_value<uint64_t> min_dataset_bytes;
// Presumably the threshold for accepting a newly trained dict:
// it is adopted only if its ratio is smaller than this factor times
// the current ratio (the class comment mentions 95%)
// -- confirm against the implementation.
utils::updateable_value<float> min_improvement_factor;
};
private:
// Held by reference (not owned): both services must outlive this object.
service::storage_service& _ss;
service::raft_group0_client& _group0_client;
// Configuration stored by value; presumably initialized from the
// constructor's `config` argument.
config _cfg;
// NOTE(review): presumably triggered by stop() to interrupt the
// background fiber -- confirm in the implementation.
abort_source _as;
// NOTE(review): presumably the future of the background run() loop,
// awaited by stop() -- confirm in the implementation.
future<> _fiber;
// NOTE(review): presumably a single check/re-train pass as described
// in the class comment -- confirm in the implementation.
future<> tick();
// NOTE(review): presumably the periodic loop which invokes tick()
// once per tick period -- confirm in the implementation.
future<> run();
public:
// Must be constructed and run on shard 0.
// Takes references to the storage service and the Raft group 0 client,
// plus the configuration (by value).
sstable_dict_autotrainer(service::storage_service&, service::raft_group0_client&, config);
// Stops the autotrainer.
// NOTE(review): presumably the returned future resolves once the
// background fiber has terminated -- confirm in the implementation.
future<> stop();
};
// Computes the compression ratio of the given compressor
// (provided by the `factory` based on `initial_schema` with overwritten `params`),
// on the given set of samples.
//
// Parameters:
//   factory            - supplies the compressor to be evaluated.
//   initial_schema     - schema the compressor is derived from; its
//                        compression parameters are overridden by `params`.
//   params             - the compression parameters to evaluate.
//   validation_samples - the buffers on which the ratio is measured.
//                        Taken by reference while the result is asynchronous,
//                        so presumably the samples must stay alive until the
//                        returned future resolves -- confirm against the definition.
//
// Returns a future resolving to the measured compression ratio.
// NOTE(review): presumably compressed size / uncompressed size (smaller is
// better), which would match how the autotrainer's class comment compares
// ratios -- confirm against the definition.
future<float> try_one_compression_config(
sstable_compressor_factory& factory,
schema_ptr initial_schema,
const compression_parameters& params,
const utils::chunked_vector<temporary_buffer<char>>& validation_samples
);
// Computes the compression ratio of the given compressor
// (based on `initial_schema` with overwritten `params`,
// and with recommended dict set to `dict`),
// on the given set of samples.
//
// Unlike the overload taking a `sstable_compressor_factory&`, this one has
// no `factory` parameter.
// NOTE(review): presumably it sets up a compressor factory internally and
// hands it `dict` as the recommended dictionary -- confirm against the definition.
//
// Parameters:
//   dict               - the dictionary contents to evaluate.
//                        NOTE(review): this is a view of mutable bytes; if the
//                        definition only reads it, `std::span<const std::byte>`
//                        would express that more precisely -- confirm before changing.
//   initial_schema     - schema the compressor is derived from; its
//                        compression parameters are overridden by `params`.
//   params             - the compression parameters to evaluate.
//   validation_samples - the buffers on which the ratio is measured.
//                        `dict` is a non-owning view and the samples are taken
//                        by reference, so presumably both must stay alive until
//                        the returned future resolves -- confirm against the definition.
//
// Returns a future resolving to the measured compression ratio.
// NOTE(review): presumably compressed size / uncompressed size (smaller is
// better) -- confirm against the definition.
future<float> try_one_compression_config(
std::span<std::byte> dict,
schema_ptr initial_schema,
const compression_parameters& params,
const utils::chunked_vector<temporary_buffer<char>>& validation_samples
);