sstables: Fix update of tombstone GC settings to have immediate effect

After "repair: Get rid of the gc_grace_seconds", the sstable's schema (mode,
gc period if applicable, etc) is used to estimate the amount of droppable
data (or determine full expiration = max_deletion_time < gc_before).
It could happen that the user switched from timeout to repair mode, but
sstables will still use the old mode, despite the user asked for a new one.
Another example is when you play with value of grace period, to prevent
data resurrection if repair won't be able to run in a timely manner.
The problem persists until all sstables using old GC settings are recompacted
or node is restarted.
To fix this, we have to feed latest schema into sstable procedures used
for expiration purposes.

Fixes #15643.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>

Closes scylladb/scylladb#15746

(cherry picked from commit fded314e46)
This commit is contained in:
Raphael S. Carvalho
2023-10-17 16:03:32 -03:00
committed by Botond Dénes
parent ac7ed6857a
commit 7288bdfe09
8 changed files with 14 additions and 16 deletions

View File

@@ -1800,7 +1800,7 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable
int64_t min_timestamp = std::numeric_limits<int64_t>::max();
for (auto& sstable : overlapping) {
auto gc_before = sstable->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state());
auto gc_before = sstable->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
if (sstable->get_max_local_deletion_time() >= gc_before) {
min_timestamp = std::min(min_timestamp, sstable->get_stats_metadata().min_timestamp);
}
@@ -1819,7 +1819,7 @@ get_fully_expired_sstables(const table_state& table_s, const std::vector<sstable
// SStables that do not contain live data is added to list of possibly expired sstables.
for (auto& candidate : compacting) {
auto gc_before = candidate->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state());
auto gc_before = candidate->get_gc_before_for_fully_expire(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
clogger.debug("Checking if candidate of generation {} and max_deletion_time {} is expired, gc_before is {}",
candidate->generation(), candidate->get_stats_metadata().max_local_deletion_time, gc_before);
// A fully expired sstable which has an ancestor undeleted shouldn't be compacted because

View File

@@ -51,7 +51,7 @@ std::vector<compaction_descriptor> compaction_strategy_impl::get_cleanup_compact
}));
}
bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const tombstone_gc_state& gc_state) {
bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const table_state& t) {
if (_disable_tombstone_compaction) {
return false;
}
@@ -62,7 +62,7 @@ bool compaction_strategy_impl::worth_dropping_tombstones(const shared_sstable& s
if (db_clock::now()-_tombstone_compaction_interval < sst->data_file_write_time()) {
return false;
}
auto gc_before = sst->get_gc_before_for_drop_estimation(compaction_time, gc_state);
auto gc_before = sst->get_gc_before_for_drop_estimation(compaction_time, t.get_tombstone_gc_state(), t.schema());
return sst->estimate_droppable_tombstone_ratio(gc_before) >= _tombstone_threshold;
}

View File

@@ -64,7 +64,7 @@ public:
// Check if a given sstable is entitled for tombstone compaction based on its
// droppable tombstone histogram and gc_before.
bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const tombstone_gc_state& gc_state);
bool worth_dropping_tombstones(const shared_sstable& sst, gc_clock::time_point compaction_time, const table_state& t);
virtual std::unique_ptr<compaction_backlog_tracker::impl> make_backlog_tracker() const = 0;

View File

@@ -51,15 +51,15 @@ compaction_descriptor leveled_compaction_strategy::get_sstables_for_compaction(t
auto& sstables = manifest.get_level(level);
// filter out sstables which droppable tombstone ratio isn't greater than the defined threshold.
auto e = boost::range::remove_if(sstables, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
return !worth_dropping_tombstones(sst, compaction_time, table_s.get_tombstone_gc_state());
return !worth_dropping_tombstones(sst, compaction_time, table_s);
});
sstables.erase(e, sstables.end());
if (sstables.empty()) {
continue;
}
auto& sst = *std::max_element(sstables.begin(), sstables.end(), [&] (auto& i, auto& j) {
auto gc_before1 = i->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state());
auto gc_before2 = j->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state());
auto gc_before1 = i->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
auto gc_before2 = j->get_gc_before_for_drop_estimation(compaction_time, table_s.get_tombstone_gc_state(), table_s.schema());
return i->estimate_droppable_tombstone_ratio(gc_before1) < j->estimate_droppable_tombstone_ratio(gc_before2);
});
return sstables::compaction_descriptor({ sst }, sst->get_sstable_level());

View File

@@ -243,7 +243,7 @@ size_tiered_compaction_strategy::get_sstables_for_compaction(table_state& table_
for (auto&& sstables : buckets | boost::adaptors::reversed) {
// filter out sstables which droppable tombstone ratio isn't greater than the defined threshold.
auto e = boost::range::remove_if(sstables, [this, compaction_time, &table_s] (const sstables::shared_sstable& sst) -> bool {
return !worth_dropping_tombstones(sst, compaction_time, table_s.get_tombstone_gc_state());
return !worth_dropping_tombstones(sst, compaction_time, table_s);
});
sstables.erase(e, sstables.end());
if (sstables.empty()) {

View File

@@ -366,7 +366,7 @@ time_window_compaction_strategy::get_next_non_expired_sstables(table_state& tabl
// if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
// ratio is greater than threshold.
auto e = boost::range::remove_if(non_expiring_sstables, [this, compaction_time, &table_s] (const shared_sstable& sst) -> bool {
return !worth_dropping_tombstones(sst, compaction_time, table_s.get_tombstone_gc_state());
return !worth_dropping_tombstones(sst, compaction_time, table_s);
});
non_expiring_sstables.erase(e, non_expiring_sstables.end());
if (non_expiring_sstables.empty()) {

View File

@@ -3033,8 +3033,7 @@ std::optional<large_data_stats_entry> sstable::get_large_data_stat(large_data_ty
// gc_before for all the partitions that have record in repair history map. It
// is fine that some of the partitions inside the sstable does not have a
// record.
gc_clock::time_point sstable::get_gc_before_for_drop_estimation(const gc_clock::time_point& compaction_time, const tombstone_gc_state& gc_state) const {
auto s = get_schema();
gc_clock::time_point sstable::get_gc_before_for_drop_estimation(const gc_clock::time_point& compaction_time, const tombstone_gc_state& gc_state, const schema_ptr& s) const {
auto start = get_first_decorated_key().token();
auto end = get_last_decorated_key().token();
auto range = dht::token_range(dht::token_range::bound(start, true), dht::token_range::bound(end, true));
@@ -3050,9 +3049,8 @@ gc_clock::time_point sstable::get_gc_before_for_drop_estimation(const gc_clock::
// in the repair history map, we can not drop the sstable, in such case we
// return gc_clock::time_point::min() as gc_before. Otherwise, return the
// gc_before from the repair history map.
gc_clock::time_point sstable::get_gc_before_for_fully_expire(const gc_clock::time_point& compaction_time, const tombstone_gc_state& gc_state) const {
gc_clock::time_point sstable::get_gc_before_for_fully_expire(const gc_clock::time_point& compaction_time, const tombstone_gc_state& gc_state, const schema_ptr& s) const {
auto deletion_time = get_max_local_deletion_time();
auto s = get_schema();
// No need to query gc_before for the sstable if the max_deletion_time is max()
if (deletion_time == gc_clock::time_point(gc_clock::duration(std::numeric_limits<int>::max()))) {
sstlog.trace("sstable={}, ks={}, cf={}, get_max_local_deletion_time={}, min_timestamp={}, gc_grace_seconds={}, shortcut",

View File

@@ -935,8 +935,8 @@ public:
friend std::unique_ptr<DataConsumeRowsContext>
data_consume_rows(const schema&, shared_sstable, typename DataConsumeRowsContext::consumer&);
friend void lw_shared_ptr_deleter<sstables::sstable>::dispose(sstable* s);
gc_clock::time_point get_gc_before_for_drop_estimation(const gc_clock::time_point& compaction_time, const tombstone_gc_state& gc_state) const;
gc_clock::time_point get_gc_before_for_fully_expire(const gc_clock::time_point& compaction_time, const tombstone_gc_state& gc_state) const;
gc_clock::time_point get_gc_before_for_drop_estimation(const gc_clock::time_point& compaction_time, const tombstone_gc_state& gc_state, const schema_ptr& s) const;
gc_clock::time_point get_gc_before_for_fully_expire(const gc_clock::time_point& compaction_time, const tombstone_gc_state& gc_state, const schema_ptr& s) const;
future<uint32_t> read_digest();
future<checksum> read_checksum();