Files
scylla/sstable_dict_autotrainer.hh
Michał Chojnowski 4f0d453acf dict_autotrainer: introduce sstable_dict_autotrainer
Add a fiber responsible for periodic re-training of compression dictionaries
(for tables which opted into dict-aware compression).

As of this patch, it works like this:
every `$tick_period` (15 minutes), if we are the current Raft leader,
we check for dict-aware tables which have no dict, or a dict older
than `$retrain_period`.

For those tables, if they have enough data (>1GiB) for a training,
we train a new dict and check if it's significantly better
than the current one (provides ratio smaller than 95% of current ratio),
and if so, we update the dict.
2025-04-01 00:07:30 +02:00

68 lines
2.3 KiB
C++

/*
* Copyright (C) 2025-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include "replica/database.hh"
#include "service/raft/raft_group0_client.hh"
// Responsible for re-training the SSTable compression dicts
// (for dict-aware tables) periodically.
//
// As of this writing, it works like this:
// every $tick_period (15 minutes), if we are the current Raft leader,
// we check for dict-aware tables which have no dict, or a dict older
// than $retrain_period.
// For those tables, if they have enough data (>1GiB) for a training,
// we train a new dict and check if it's significantly better
// than the current one (provides ratio smaller than 95% of current ratio),
// and if so, we update the dict.
class sstable_dict_autotrainer {
public:
struct config {
utils::updateable_value<float> tick_period_in_seconds;
utils::updateable_value<float> retrain_period_in_seconds;
utils::updateable_value<uint64_t> min_dataset_bytes;
utils::updateable_value<float> min_improvement_factor;
};
private:
service::storage_service& _ss;
service::raft_group0_client& _group0_client;
config _cfg;
abort_source _as;
future<> _fiber;
future<> tick();
future<> run();
public:
// Must be constructed and run on shard 0.
sstable_dict_autotrainer(service::storage_service&, service::raft_group0_client&, config);
future<> stop();
};
// Computes the compression ratio of the given compressor
// (provided by the `factory` based on `initial_schema` with overwritten `params`),
// on the given set of samples.
future<float> try_one_compression_config(
sstable_compressor_factory& factory,
schema_ptr initial_schema,
const compression_parameters& params,
const utils::chunked_vector<temporary_buffer<char>>& validation_samples
);
// Computes the compression ratio of the given compressor
// (provided by the `factory` based on `initial_schema` with overwritten `params`,
// and with recommended dict set to `dict`),
// on the given set of samples.
future<float> try_one_compression_config(
std::span<std::byte> dict,
schema_ptr initial_schema,
const compression_parameters& params,
const utils::chunked_vector<temporary_buffer<char>>& validation_samples
);