Replace LRU with W-TinyLFU cache eviction policy

Add Count-Min Sketch for frequency estimation and replace the single LRU
list with a W-TinyLFU policy using window, probation, and protected segments.
Update cache_tracker::touch() to use the new touch() method that handles
segment-aware promotion.

Co-authored-by: dorlaor <1735237+dorlaor@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2026-03-13 21:09:52 +00:00
parent 76c6354f9f
commit 481d5ae2e5
4 changed files with 375 additions and 37 deletions

View File

@@ -214,11 +214,7 @@ void cache_tracker::clear() {
}
// Records an access to `e` with the tracker's eviction policy.
//
// lru::touch() deals with every segment state itself, including an entry that
// is not linked at all (the last dummy may have been unlinked by eviction), so
// no explicit remove()/add() is done here: doing both in addition to touch()
// would push the entry back into the window and count the access twice.
//
// NOTE(review): lru::touch() decides "not linked" from the entry's segment tag
// rather than from is_linked(); confirm every unlink path resets that tag.
void cache_tracker::touch(rows_entry& e) {
    _lru.touch(e);
}
void cache_tracker::insert(cache_entry& entry) {

View File

@@ -10,7 +10,27 @@ Cache is always paired with its underlying mutation source which it mirrors. Tha
Eviction is about removing parts of the data from memory and recording the fact that information about those parts is missing. Eviction doesn't change the set of writes represented by cache as part of its `mutation_source` interface.
The smallest object which can be evicted, called eviction unit, is currently a single row (`rows_entry`). Eviction units are linked in an LRU owned by a `cache_tracker`. The LRU determines eviction order. The LRU is shared among many tables. Currently, there is one per `database`.
The smallest object which can be evicted, called eviction unit, is currently a single row (`rows_entry`). Eviction units are managed by a W-TinyLFU policy owned by a `cache_tracker`. The W-TinyLFU policy determines eviction order. It is shared among many tables. Currently, there is one per `database`.
### W-TinyLFU Eviction Policy
The cache uses a W-TinyLFU (Window Tiny Least Frequently Used) eviction policy,
which combines recency and frequency information for better hit rates than plain LRU.
The policy organizes entries into three segments:
- **Window** (~1% of cache): A small LRU that admits all new entries. This allows
new entries to build up frequency information before competing for main cache space.
- **Probation** (~19% of cache): Part of the main SLRU cache. Entries from the window
compete with probation victims for admission using a TinyLFU frequency filter.
- **Protected** (~80% of cache): The other part of the main SLRU cache. Entries are
promoted here from probation when accessed again.
The TinyLFU frequency filter uses a Count-Min Sketch to compactly estimate access
frequency. When eviction is needed and the window is over its target size, the
window victim competes with the probation victim: the entry with the higher
estimated frequency survives in probation (a tie favours the window victim) while
the other is evicted. The sketch is periodically aged (all counts halved) to adapt
to changing access patterns.
All `rows_entry` objects which are owned by a `cache_tracker` are assumed to be either contained in a cache (in some `row_cache::partitions_type`) or
be owned by a (detached) `partition_snapshot`. When the last row from a `partition_entry` is evicted, the containing `cache_entry` is evicted from the cache.

105
utils/count_min_sketch.hh Normal file
View File

@@ -0,0 +1,105 @@
/*
* Copyright (C) 2024-present ScyllaDB
*/
/*
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
*/
#pragma once
#include <cstdint>
#include <cstddef>
#include <vector>
#include <algorithm>
namespace utils {
/// A Count-Min Sketch with 4-bit counters for frequency estimation.
///
/// Used by the W-TinyLFU cache admission policy to estimate access frequency.
/// Each counter is 4 bits (it saturates at 15), and counters are packed 16 per
/// uint64_t word. The sketch keeps `depth` (4) rows, each indexed by a
/// differently seeded hash of the key, and reports the minimum counter across
/// all rows as the frequency estimate.
class count_min_sketch {
    static constexpr size_t depth = 4;
    static constexpr size_t counter_bits = 4;
    static constexpr size_t counters_per_word = 16;
    static constexpr uint8_t counter_max = 15;
    // Clears the top bit of every nibble, so that a whole-word right shift by
    // one halves each counter without pulling in a bit from its neighbour.
    static constexpr uint64_t reset_mask = 0x7777777777777777ULL;
    // One multiplicative seed per row.
    static constexpr uint64_t seeds[depth] = {
        0x9e3779b97f4a7c15ULL,
        0xbf58476d1ce4e5b9ULL,
        0x94d049bb133111ebULL,
        0xd6e8feb86659fd93ULL,
    };

    std::vector<uint64_t> _table;  // depth rows of _words_per_row words each
    size_t _width;                 // counters per row (a power of two)
    size_t _width_mask;            // _width - 1, used to reduce a hash to a column
    size_t _words_per_row;         // words needed to hold _width counters

    // Hashes `key` with the row-specific `seed`.
    static uint64_t mix(uint64_t key, uint64_t seed) noexcept {
        uint64_t h = key * seed;
        h ^= h >> 32;
        h *= 0xd6e8feb86659fd93ULL;
        h ^= h >> 32;
        return h;
    }

    // Column of `key` within `row`.
    size_t counter_index(size_t row, uint64_t key) const noexcept {
        return mix(key, seeds[row]) & _width_mask;
    }

    // Extracts the counter at nibble position `pos` (0..15) from `word`.
    static uint8_t get_counter(uint64_t word, size_t pos) noexcept {
        return static_cast<uint8_t>((word >> (pos * counter_bits)) & 0x0FULL);
    }

    // Index into _table of the word holding column `col` of `row`.
    size_t word_index(size_t row, size_t col) const noexcept {
        return row * _words_per_row + col / counters_per_word;
    }

public:
    /// Construct a sketch with the given number of counters per row.
    /// \param width_log2 Log base 2 of the number of counters per row.
    ///        Must be smaller than the bit width of size_t.
    ///        Total memory is approximately depth * 2^width_log2 / 2 bytes,
    ///        with a minimum of one 8-byte word per row.
    explicit count_min_sketch(size_t width_log2 = 16)
        : _width(size_t(1) << width_log2)
        , _width_mask(_width - 1)
        // Round up: a row narrower than one word (width_log2 < 4) still needs
        // a word to live in, otherwise the table would be empty and every
        // access would index out of bounds.
        , _words_per_row((_width + counters_per_word - 1) / counters_per_word)
    {
        _table.resize(depth * _words_per_row, 0);
    }

    /// Adds one occurrence of `key`. Counters already at 15 are left unchanged.
    void increment(uint64_t key) noexcept {
        for (size_t row = 0; row < depth; ++row) {
            const size_t col = counter_index(row, key);
            const size_t wi = word_index(row, col);
            const size_t pos = col % counters_per_word;
            if (get_counter(_table[wi], pos) < counter_max) {
                // The nibble is below 15, so adding one cannot carry into the
                // neighbouring counter.
                _table[wi] += uint64_t(1) << (pos * counter_bits);
            }
        }
    }

    /// Returns the estimated number of occurrences of `key`, in [0, 15]:
    /// the smallest of the key's counters across all rows.
    uint8_t estimate(uint64_t key) const noexcept {
        uint8_t min_val = counter_max;
        for (size_t row = 0; row < depth; ++row) {
            const size_t col = counter_index(row, key);
            const size_t wi = word_index(row, col);
            const size_t pos = col % counters_per_word;
            min_val = std::min(min_val, get_counter(_table[wi], pos));
        }
        return min_val;
    }

    /// Halve all counters, rounding down (aging/decay).
    void reset() noexcept {
        for (auto& word : _table) {
            word = (word >> 1) & reset_mask;
        }
    }

    /// Number of counters per row.
    size_t width() const noexcept { return _width; }
};
} // namespace utils

View File

@@ -9,8 +9,18 @@
#pragma once
#include "utils/assert.hh"
#include "utils/count_min_sketch.hh"
#include <boost/intrusive/list.hpp>
#include <seastar/core/memory.hh>
#include <algorithm>
// Tags an evictable with the W-TinyLFU segment list that currently links it.
enum class lru_segment : uint8_t {
    none = 0,        // not linked into any segment
    window = 1,      // admission window
    probation = 2,   // main cache, probation part
    protected_ = 3,  // main cache, protected part
};
class evictable {
friend class lru;
@@ -32,6 +42,7 @@ protected:
static_assert(std::is_nothrow_constructible_v<lru_link_type, lru_link_type&&>);
private:
lru_link_type _lru_link;
lru_segment _segment = lru_segment::none;
protected:
// Prevent destruction via evictable pointer. LRU is not aware of allocation strategy.
// Prevent destruction of a linked evictable. While we could unlink the evictable here
@@ -54,6 +65,7 @@ public:
// Exchanges the list links of this entry and `o`, and exchanges their segment
// tags along with them.
void swap(evictable& o) noexcept {
    _lru_link.swap_nodes(o._lru_link);
    const lru_segment previous = _segment;
    _segment = o._segment;
    o._segment = previous;
}
virtual bool is_index() const noexcept {
@@ -76,13 +88,27 @@ class index_evictable : public evictable {
}
};
// Implements LRU cache replacement for row cache and sstable index cache.
// Implements W-TinyLFU cache replacement for row cache and sstable index cache.
//
// W-TinyLFU uses a small admission window backed by an LRU and a main cache
// organized as a Segmented LRU (SLRU) with probation and protected segments.
// Admission to the main cache is controlled by a TinyLFU frequency filter
// implemented via a Count-Min Sketch.
//
// New entries enter the window. When eviction is needed and the window is
// over its target size, the window victim competes with the probation victim:
// the entry with the higher estimated frequency survives in probation (a tie
// favours the window victim) while the other is evicted.
// Touching an entry in probation promotes it to the protected segment.
// When the protected segment exceeds its target size, its least-recently-used
// entries are demoted back to probation; this is done lazily, during the next
// non-index eviction.
class lru {
private:
using lru_type = boost::intrusive::list<evictable,
boost::intrusive::member_hook<evictable, evictable::lru_link_type, &evictable::_lru_link>,
boost::intrusive::constant_time_size<false>>; // we need this to have bi::auto_unlink on hooks.
lru_type _list;
lru_type _window;
lru_type _probation;
lru_type _protected;
// See the comment to index_evictable.
using index_lru_type = boost::intrusive::list<index_evictable,
@@ -92,24 +118,201 @@ private:
using reclaiming_result = seastar::memory::reclaiming_result;
public:
~lru() {
while (!_list.empty()) {
evictable& e = _list.front();
remove(e);
e.on_evicted();
static constexpr size_t sketch_width_log2 = 16;
static constexpr size_t sketch_width = size_t(1) << sketch_width_log2;
static constexpr size_t sample_threshold = sketch_width * 10;
utils::count_min_sketch _sketch{sketch_width_log2};
size_t _window_size = 0;
size_t _probation_size = 0;
size_t _protected_size = 0;
size_t _sample_count = 0;
// Number of entries counted across the window, probation and protected segments.
size_t total_size() const noexcept {
    const size_t main_size = _probation_size + _protected_size;
    return _window_size + main_size;
}
// Target window occupancy: 1% of all counted entries, but never less than one.
size_t max_window_size() const noexcept {
    const size_t one_percent = total_size() / 100;
    return one_percent == 0 ? size_t(1) : one_percent;
}
// Target protected occupancy: 80% of all counted entries (rounded down).
size_t max_protected_size() const noexcept {
    const size_t tracked = total_size();
    return tracked * 80 / 100;
}
// Sketch key of an entry: the numeric value of the entry's address.
// NOTE(review): because the key is an address, an entry later placed at the
// same address would see the counts of the previous occupant - confirm this
// is acceptable.
static uint64_t entry_key(const evictable& e) noexcept {
    const auto address = reinterpret_cast<uintptr_t>(&e);
    return static_cast<uint64_t>(address);
}
// Counts one access to `e` in the frequency sketch. Once sample_threshold
// accesses have been recorded since the last aging, halves all sketch counters
// and restarts the sample count.
void record_access(const evictable& e) noexcept {
    _sketch.increment(entry_key(e));
    ++_sample_count;
    if (_sample_count < sample_threshold) {
        return;
    }
    _sketch.reset();
    _sample_count = 0;
}
// Maps a segment tag to the list that backs it. Passing lru_segment::none
// (or any other value) trips the assertion.
lru_type& segment_list(lru_segment seg) noexcept {
    if (seg == lru_segment::window) {
        return _window;
    }
    if (seg == lru_segment::probation) {
        return _probation;
    }
    if (seg == lru_segment::protected_) {
        return _protected;
    }
    SCYLLA_ASSERT(false && "invalid segment");
    __builtin_unreachable();
}
// Bumps the entry counter of `seg`. Tags without a counter
// (lru_segment::none) are ignored.
void increment_size(lru_segment seg) noexcept {
    if (seg == lru_segment::window) {
        ++_window_size;
    } else if (seg == lru_segment::probation) {
        ++_probation_size;
    } else if (seg == lru_segment::protected_) {
        ++_protected_size;
    }
}
// Lowers the entry counter of `seg`. Tags without a counter
// (lru_segment::none) are ignored. No underflow check is performed.
void decrement_size(lru_segment seg) noexcept {
    if (seg == lru_segment::window) {
        --_window_size;
    } else if (seg == lru_segment::probation) {
        --_probation_size;
    } else if (seg == lru_segment::protected_) {
        --_protected_size;
    }
}
// Unlinks `e` from the segment list named by its tag, updates that segment's
// counter and marks the entry as belonging to no segment.
// The index list is not touched. `e` must currently carry a valid segment tag.
void remove_from_segment(evictable& e) noexcept {
    const lru_segment current = e._segment;
    lru_type& owner = segment_list(current);
    owner.erase(owner.iterator_to(e));
    decrement_size(current);
    e._segment = lru_segment::none;
}
// Tags `e` with `seg`, appends it at the back of that segment's list and
// updates the segment's counter.
void add_to_segment(evictable& e, lru_segment seg) noexcept {
    e._segment = seg;
    lru_type& target = segment_list(seg);
    target.push_back(e);
    increment_size(seg);
}
// Tags `e` with `seg`, links it at the front of that segment's list (the end
// eviction takes victims from) and updates the segment's counter.
void add_to_segment_front(evictable& e, lru_segment seg) noexcept {
    e._segment = seg;
    lru_type& target = segment_list(seg);
    target.push_front(e);
    increment_size(seg);
}
// Move excess protected entries to probation.
void rebalance_protected() noexcept {
size_t max_prot = max_protected_size();
while (_protected_size > max_prot && !_protected.empty()) {
evictable& victim = _protected.front();
remove_from_segment(victim);
add_to_segment(victim, lru_segment::probation);
}
}
// Evicts a single element using W-TinyLFU policy.
//
// Decision order:
//  1. If should_evict_index is set and an index entry is linked, the front of
//     the index list is evicted.
//  2. Otherwise the protected segment is rebalanced and, while the window is
//     over its target size, the window front competes with the probation
//     front on estimated frequency. The loser is evicted; on a tie the window
//     entry is kept. If probation is empty the window front is simply moved
//     to probation and the check repeats.
//  3. With the window within target, the front of the first non-empty list
//     among probation, window and protected is evicted.
//
// Shallow selects on_evicted_shallow() instead of on_evicted() as the
// notification. Returns reclaimed_nothing when no segment holds any entry.
//
// NOTE(review): the tie rule (>=) admits a window entry over an equally
// frequent probation entry; confirm that this is the intended admission rule.
template <bool Shallow = false>
reclaiming_result do_evict(bool should_evict_index) noexcept {
    // Unlinks `e` from the policy, then fires the notification chosen by Shallow.
    const auto evict_entry = [this](evictable& e) noexcept {
        remove(e);
        if constexpr (Shallow) {
            e.on_evicted_shallow();
        } else {
            e.on_evicted();
        }
    };

    if (should_evict_index && !_index_list.empty()) {
        evict_entry(_index_list.front());
        return reclaiming_result::reclaimed_something;
    }

    const bool nothing_linked = _window.empty() && _probation.empty() && _protected.empty();
    if (nothing_linked) {
        return reclaiming_result::reclaimed_nothing;
    }

    rebalance_protected();

    // Drain excess from window using TinyLFU admission.
    while (_window_size > max_window_size() && !_window.empty()) {
        evictable& candidate = _window.front();

        if (_probation.empty()) {
            // Nothing to compete against: hand the candidate to probation and
            // re-check the window.
            remove_from_segment(candidate);
            add_to_segment(candidate, lru_segment::probation);
            continue;
        }

        evictable& incumbent = _probation.front();
        const uint8_t candidate_freq = _sketch.estimate(entry_key(candidate));
        const uint8_t incumbent_freq = _sketch.estimate(entry_key(incumbent));

        if (candidate_freq < incumbent_freq) {
            // Candidate rejected.
            evict_entry(candidate);
        } else {
            // Candidate admitted to probation; the incumbent makes room for it.
            remove_from_segment(candidate);
            add_to_segment(candidate, lru_segment::probation);
            evict_entry(incumbent);
        }
        return reclaiming_result::reclaimed_something;
    }

    // Window is within target. Evict from probation, then window, then protected.
    lru_type* source = !_probation.empty() ? &_probation
                     : !_window.empty()    ? &_window
                     : !_protected.empty() ? &_protected
                     : nullptr;
    if (source == nullptr) {
        return reclaiming_result::reclaimed_nothing;
    }
    evict_entry(source->front());
    return reclaiming_result::reclaimed_something;
}
public:
// Evicts every remaining entry, segment by segment (window, probation,
// protected), unlinking each one and calling on_evicted() on it.
~lru() {
    lru_type* const segments[] = {&_window, &_probation, &_protected};
    for (lru_type* segment : segments) {
        while (!segment->empty()) {
            evictable& oldest = segment->front();
            remove(oldest);
            oldest.on_evicted();
        }
    }
}
// Unlinks `e` from the policy: removes it from its segment list, updates that
// segment's counter, marks the entry as belonging to no segment and, for
// index entries, also removes it from the index list.
//
// `e` must currently be linked in a segment; a tag of lru_segment::none trips
// the assertion in segment_list().
//
// The entry is erased exactly once, through the segment that owns it. Erasing
// it through another list first would unlink the shared hook and leave the
// segment erase operating on an unlinked node.
void remove(evictable& e) noexcept {
    remove_from_segment(e);
    if (e.is_index()) {
        _index_list.erase(_index_list.iterator_to(static_cast<index_evictable&>(e)));
    }
}
void add(evictable& e) noexcept {
_list.push_back(e);
record_access(e);
add_to_segment(e, lru_segment::window);
if (e.is_index()) {
_index_list.push_back(static_cast<index_evictable&>(e));
}
@@ -117,36 +320,50 @@ public:
// Like add(e) but makes sure that e is evicted right before "more_recent" in the absence of later touches.
//
// e is linked immediately in front of more_recent, in whichever segment
// currently holds more_recent, and inherits that segment's tag. The access is
// recorded in the frequency sketch. more_recent must be linked in a segment.
//
// e is linked exactly once; inserting the same hook into a second list would
// corrupt both lists.
//
// NOTE(review): unlike add(), this does not link index entries into the index
// list - confirm index entries are never added through this path.
void add_before(evictable& more_recent, evictable& e) noexcept {
    record_access(e);
    const lru_segment seg = more_recent._segment;
    auto& list = segment_list(seg);
    list.insert(list.iterator_to(more_recent), e);
    e._segment = seg;
    increment_size(seg);
}
// Handles access to an entry:
// - In window: moves to back of window.
// - In probation: promotes to protected.
// - In protected: moves to back of protected.
// - Not linked: adds to window.
//
// The access is recorded in the frequency sketch exactly once. The entry is
// not unconditionally removed and re-added, since that would send every
// touched entry back to the window and defeat promotion.
//
// Index entries are also moved to the back of the index list (or linked into
// it when the entry was not linked), so that index eviction, which takes the
// front of that list, keeps following recency of use.
//
// The protected segment may temporarily exceed its target size after a
// promotion; the excess is demoted by rebalance_protected() during eviction.
void touch(evictable& e) noexcept {
    record_access(e);

    const lru_segment current = e._segment;
    const bool was_linked = current != lru_segment::none;

    switch (current) {
    case lru_segment::none:
        add_to_segment(e, lru_segment::window);
        break;
    case lru_segment::window:
        _window.erase(_window.iterator_to(e));
        _window.push_back(e);
        break;
    case lru_segment::probation:
        remove_from_segment(e);
        add_to_segment(e, lru_segment::protected_);
        break;
    case lru_segment::protected_:
        _protected.erase(_protected.iterator_to(e));
        _protected.push_back(e);
        break;
    }

    if (e.is_index()) {
        auto& index_entry = static_cast<index_evictable&>(e);
        if (was_linked) {
            // Same assumption as remove(): a linked index entry is also
            // linked in the index list.
            _index_list.erase(_index_list.iterator_to(index_entry));
        }
        _index_list.push_back(index_entry);
    }
}
// Evicts a single element from the LRU
//
// Picks the front of the index list when should_evict_index is set and that
// list is not empty, otherwise the front of `_list`; unlinks the pick and
// notifies it with on_evicted() (or on_evicted_shallow() when Shallow).
// Returns reclaimed_nothing when `_list` is empty.
//
// NOTE(review): this looks like the single-list implementation that the
// segment-aware do_evict defined earlier in this class replaces - it has the
// same signature and still reads `_list`. Confirm that only one of the two
// definitions is meant to remain.
template <bool Shallow = false>
reclaiming_result do_evict(bool should_evict_index) noexcept {
if (_list.empty()) {
return reclaiming_result::reclaimed_nothing;
}
evictable& e = (should_evict_index && !_index_list.empty()) ? _index_list.front() : _list.front();
remove(e);
if constexpr (!Shallow) {
e.on_evicted();
} else {
e.on_evicted_shallow();
}
return reclaiming_result::reclaimed_something;
}
// Evicts a single element using the W-TinyLFU policy and notifies it with on_evicted().
// should_evict_index is forwarded to do_evict().
reclaiming_result evict(bool should_evict_index = false) noexcept {
return do_evict<false>(should_evict_index);
}
// Evicts a single element from the LRU.
// Evicts a single element using the W-TinyLFU policy.
// Will call on_evicted_shallow() instead of on_evicted().
reclaiming_result evict_shallow() noexcept {
return do_evict<true>(false);