Merge "Fix use-after-free when destroying partition_snapshots in the background"from Tomasz
"
partition_snapshots created in the memtable will keep a reference to
the memtable (as region*) and to memtable::_cleaner. As long as the
reader is alive, the memtable will be kept alive by
partition_snapshot_flat_reader::_container_guard. But after that
nothing prevents it from being destroyed. The snapshot can outlive the
read if mutation_cleaner::merge_and_destroy() defers its destruction
for later. When the read ends after memtable was flushed, the snapshot
will be queued in the cache's cleaner, but internally will reference
memtable's region and cleaner. This will result in a use-after-free
when the snapshot resumes destruction.
The fix is to update snapshots's region and cleaner references at the
time of queueing to point to the cache's region and cleaner.
When memtable is destroyed without being moved to cache there is no
problem because the snapshot would be queued into memtable's cleaner,
which will be drained on destruction from all snapshots.
Introduced in f3da043 (in >= 3.0-rc1)
Fixes #4030.
Tests:
- mvcc_test (debug)
"
* tag 'fix-snapshot-merging-use-after-free-v1.1' of github.com:tgrabiec/scylla:
tests: mvcc: Add test_snapshot_merging_after_container_is_destroyed
tests: mvcc: Introduce mvcc_container::migrate()
tests: mvcc: Make mvcc_partition move-constructible
tests: mvcc: Introduce mvcc_container::make_not_evictable()
tests: mvcc: Allow constructing mvcc_container without a cache_tracker
mutation_cleaner: Migrate partition_snapshots when queueing for background cleanup
mvcc: partition_snapshot: Introduce migrate()
mutation_cleaner: impl: Store a back-reference to the owning mutation_cleaner
This commit is contained in:
@@ -26,6 +26,8 @@
|
||||
|
||||
#include "utils/logalloc.hh"
|
||||
|
||||
class mutation_cleaner;
|
||||
|
||||
class mutation_cleaner_impl final {
|
||||
using snapshot_list = boost::intrusive::slist<partition_snapshot,
|
||||
boost::intrusive::member_hook<partition_snapshot, boost::intrusive::slist_member_hook<>, &partition_snapshot::_cleaner_hook>>;
|
||||
@@ -38,6 +40,7 @@ class mutation_cleaner_impl final {
|
||||
private:
|
||||
logalloc::region& _region;
|
||||
cache_tracker* _tracker;
|
||||
mutation_cleaner* _cleaner;
|
||||
partition_version_list _versions;
|
||||
lw_shared_ptr<worker> _worker_state;
|
||||
seastar::scheduling_group _scheduling_group;
|
||||
@@ -46,9 +49,10 @@ private:
|
||||
stop_iteration merge_some() noexcept;
|
||||
void start_worker();
|
||||
public:
|
||||
mutation_cleaner_impl(logalloc::region& r, cache_tracker* t, seastar::scheduling_group sg = seastar::current_scheduling_group())
|
||||
mutation_cleaner_impl(logalloc::region& r, cache_tracker* t, mutation_cleaner* cleaner, seastar::scheduling_group sg = seastar::current_scheduling_group())
|
||||
: _region(r)
|
||||
, _tracker(t)
|
||||
, _cleaner(cleaner)
|
||||
, _worker_state(make_lw_shared<worker>())
|
||||
, _scheduling_group(sg)
|
||||
{
|
||||
@@ -91,6 +95,7 @@ void mutation_cleaner_impl::merge_and_destroy(partition_snapshot& ps) noexcept {
|
||||
} else {
|
||||
// The snapshot must not be reachable by partitino_entry::read() after this,
|
||||
// which is ensured by slide_to_oldest() == stop_iteration::no.
|
||||
ps.migrate(&_region, _cleaner);
|
||||
_worker_state->snapshots.push_front(ps);
|
||||
_worker_state->cv.signal();
|
||||
}
|
||||
@@ -109,9 +114,12 @@ class mutation_cleaner final {
|
||||
lw_shared_ptr<mutation_cleaner_impl> _impl;
|
||||
public:
|
||||
mutation_cleaner(logalloc::region& r, cache_tracker* t, seastar::scheduling_group sg = seastar::current_scheduling_group())
|
||||
: _impl(make_lw_shared<mutation_cleaner_impl>(r, t, sg)) {
|
||||
: _impl(make_lw_shared<mutation_cleaner_impl>(r, t, this, sg)) {
|
||||
}
|
||||
|
||||
mutation_cleaner(mutation_cleaner&&) = delete;
|
||||
mutation_cleaner(const mutation_cleaner&) = delete;
|
||||
|
||||
void set_scheduling_group(seastar::scheduling_group sg) {
|
||||
_impl->set_scheduling_group(sg);
|
||||
}
|
||||
|
||||
@@ -2415,6 +2415,9 @@ memory::reclaiming_result mutation_cleaner_impl::clear_some() noexcept {
|
||||
|
||||
void mutation_cleaner_impl::merge(mutation_cleaner_impl& r) noexcept {
|
||||
_versions.splice(r._versions);
|
||||
for (partition_snapshot& snp : r._worker_state->snapshots) {
|
||||
snp.migrate(&_region, _cleaner);
|
||||
}
|
||||
_worker_state->snapshots.splice(_worker_state->snapshots.end(), r._worker_state->snapshots);
|
||||
if (!_worker_state->snapshots.empty()) {
|
||||
_worker_state->cv.signal();
|
||||
|
||||
@@ -171,11 +171,11 @@ tombstone partition_entry::partition_tombstone() const {
|
||||
}
|
||||
|
||||
partition_snapshot::~partition_snapshot() {
|
||||
with_allocator(_region.allocator(), [this] {
|
||||
with_allocator(region().allocator(), [this] {
|
||||
if (_version && _version.is_unique_owner()) {
|
||||
auto v = &*_version;
|
||||
_version = {};
|
||||
remove_or_mark_as_unique_owner(v, &_cleaner);
|
||||
remove_or_mark_as_unique_owner(v, _cleaner);
|
||||
} else if (_entry) {
|
||||
_entry->_snapshot = nullptr;
|
||||
}
|
||||
@@ -199,7 +199,7 @@ stop_iteration partition_snapshot::merge_partition_versions() {
|
||||
_version = partition_version_ref(*current);
|
||||
}
|
||||
while (auto prev = current->prev()) {
|
||||
_region.allocator().invalidate_references();
|
||||
region().allocator().invalidate_references();
|
||||
if (current->partition().apply_monotonically(*schema(), std::move(prev->partition()), _tracker, is_preemptible::yes) == stop_iteration::no) {
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
@@ -299,8 +299,8 @@ private:
|
||||
partition_version_ref _version;
|
||||
partition_entry* _entry;
|
||||
phase_type _phase;
|
||||
logalloc::region& _region;
|
||||
mutation_cleaner& _cleaner;
|
||||
logalloc::region* _region;
|
||||
mutation_cleaner* _cleaner;
|
||||
cache_tracker* _tracker;
|
||||
boost::intrusive::slist_member_hook<> _cleaner_hook;
|
||||
friend class partition_entry;
|
||||
@@ -312,7 +312,7 @@ public:
|
||||
partition_entry* entry,
|
||||
cache_tracker* tracker, // non-null for evictable snapshots
|
||||
phase_type phase = default_phase)
|
||||
: _schema(std::move(s)), _entry(entry), _phase(phase), _region(region), _cleaner(cleaner), _tracker(tracker) { }
|
||||
: _schema(std::move(s)), _entry(entry), _phase(phase), _region(®ion), _cleaner(&cleaner), _tracker(tracker) { }
|
||||
partition_snapshot(const partition_snapshot&) = delete;
|
||||
partition_snapshot(partition_snapshot&&) = delete;
|
||||
partition_snapshot& operator=(const partition_snapshot&) = delete;
|
||||
@@ -344,12 +344,19 @@ public:
|
||||
// to the latest version.
|
||||
stop_iteration slide_to_oldest() noexcept;
|
||||
|
||||
// Must be called after snapshot's original region is merged into a different region
|
||||
// before the original region is destroyed, unless the snapshot is destroyed earlier.
|
||||
void migrate(logalloc::region* region, mutation_cleaner* cleaner) noexcept {
|
||||
_region = region;
|
||||
_cleaner = cleaner;
|
||||
}
|
||||
|
||||
~partition_snapshot();
|
||||
|
||||
partition_version_ref& version();
|
||||
|
||||
change_mark get_change_mark() {
|
||||
return {_region.reclaim_counter(), version_count()};
|
||||
return {_region->reclaim_counter(), version_count()};
|
||||
}
|
||||
|
||||
const partition_version_ref& version() const;
|
||||
@@ -365,9 +372,9 @@ public:
|
||||
}
|
||||
|
||||
const schema_ptr& schema() const { return _schema; }
|
||||
logalloc::region& region() const { return _region; }
|
||||
logalloc::region& region() const { return *_region; }
|
||||
cache_tracker* tracker() const { return _tracker; }
|
||||
mutation_cleaner& cleaner() { return _cleaner; }
|
||||
mutation_cleaner& cleaner() { return *_cleaner; }
|
||||
|
||||
tombstone partition_tombstone() const;
|
||||
::static_row static_row(bool digest_requested) const;
|
||||
|
||||
@@ -153,29 +153,56 @@ class mvcc_partition;
|
||||
|
||||
// Together with mvcc_partition abstracts memory management details of dealing with MVCC.
|
||||
class mvcc_container {
|
||||
cache_tracker _tracker;
|
||||
schema_ptr _schema;
|
||||
std::optional<cache_tracker> _tracker;
|
||||
std::optional<logalloc::region> _region_holder;
|
||||
std::optional<mutation_cleaner> _cleaner_holder;
|
||||
partition_snapshot::phase_type _phase = partition_snapshot::min_phase;
|
||||
dirty_memory_manager _mgr;
|
||||
real_dirty_memory_accounter _acc{_mgr, _tracker, 0};
|
||||
std::optional<real_dirty_memory_accounter> _acc;
|
||||
logalloc::region* _region;
|
||||
mutation_cleaner* _cleaner;
|
||||
public:
|
||||
mvcc_container(schema_ptr s) : _schema(s) {}
|
||||
struct no_tracker {};
|
||||
mvcc_container(schema_ptr s)
|
||||
: _schema(s)
|
||||
, _tracker(std::make_optional<cache_tracker>())
|
||||
, _acc(std::make_optional<real_dirty_memory_accounter>(_mgr, *_tracker, 0))
|
||||
, _region(&_tracker->region())
|
||||
, _cleaner(&_tracker->cleaner())
|
||||
{ }
|
||||
mvcc_container(schema_ptr s, no_tracker)
|
||||
: _schema(s)
|
||||
, _region_holder(std::make_optional<logalloc::region>())
|
||||
, _cleaner_holder(std::make_optional<mutation_cleaner>(*_region_holder, nullptr))
|
||||
, _region(&*_region_holder)
|
||||
, _cleaner(&*_cleaner_holder)
|
||||
{ }
|
||||
mvcc_container(mvcc_container&&) = delete;
|
||||
|
||||
// Call only when this container was constructed with a tracker
|
||||
mvcc_partition make_evictable(const mutation_partition& mp);
|
||||
logalloc::region& region() { return _tracker.region(); }
|
||||
cache_tracker& tracker() { return _tracker; }
|
||||
mutation_cleaner& cleaner() { return _tracker.cleaner(); }
|
||||
// Call only when this container was constructed without a tracker
|
||||
mvcc_partition make_not_evictable(const mutation_partition& mp);
|
||||
logalloc::region& region() { return *_region; }
|
||||
cache_tracker* tracker() { return &*_tracker; }
|
||||
mutation_cleaner& cleaner() { return *_cleaner; }
|
||||
partition_snapshot::phase_type next_phase() { return ++_phase; }
|
||||
partition_snapshot::phase_type phase() const { return _phase; }
|
||||
real_dirty_memory_accounter& accounter() { return _acc; }
|
||||
real_dirty_memory_accounter& accounter() { return *_acc; }
|
||||
|
||||
mutation_partition squashed(partition_snapshot_ptr& snp) {
|
||||
logalloc::allocating_section as;
|
||||
return as(_tracker.region(), [&] {
|
||||
return as(region(), [&] {
|
||||
return snp->squashed();
|
||||
});
|
||||
}
|
||||
|
||||
// Merges other into this
|
||||
void merge(mvcc_container& other) {
|
||||
_region->merge(*other._region);
|
||||
_cleaner->merge(*other._cleaner);
|
||||
}
|
||||
};
|
||||
|
||||
class mvcc_partition {
|
||||
@@ -191,6 +218,8 @@ public:
|
||||
: _s(s), _e(std::move(e)), _container(container), _evictable(evictable) {
|
||||
}
|
||||
|
||||
mvcc_partition(mvcc_partition&&) = default;
|
||||
|
||||
~mvcc_partition() {
|
||||
with_allocator(region().allocator(), [&] {
|
||||
_e = {};
|
||||
@@ -215,7 +244,7 @@ public:
|
||||
logalloc::allocating_section as;
|
||||
with_allocator(region().allocator(), [&] {
|
||||
as(region(), [&] {
|
||||
_e.upgrade(_s, new_schema, _container.cleaner(), &_container.tracker());
|
||||
_e.upgrade(_s, new_schema, _container.cleaner(), _container.tracker());
|
||||
_s = new_schema;
|
||||
});
|
||||
});
|
||||
@@ -224,7 +253,7 @@ public:
|
||||
partition_snapshot_ptr read() {
|
||||
logalloc::allocating_section as;
|
||||
return as(region(), [&] {
|
||||
return _e.read(region(), _container.cleaner(), schema(), &_container.tracker(), _container.phase());
|
||||
return _e.read(region(), _container.cleaner(), schema(), _container.tracker(), _container.phase());
|
||||
});
|
||||
}
|
||||
|
||||
@@ -241,7 +270,7 @@ void mvcc_partition::apply_to_evictable(partition_entry&& src, schema_ptr src_sc
|
||||
mutation_cleaner src_cleaner(region(), no_cache_tracker);
|
||||
auto c = as(region(), [&] {
|
||||
return _e.apply_to_incomplete(*schema(), std::move(src), *src_schema, src_cleaner, as, region(),
|
||||
_container.tracker(), _container.next_phase(), _container.accounter());
|
||||
*_container.tracker(), _container.next_phase(), _container.accounter());
|
||||
});
|
||||
repeat([&] {
|
||||
return c.run();
|
||||
@@ -284,6 +313,15 @@ mvcc_partition mvcc_container::make_evictable(const mutation_partition& mp) {
|
||||
});
|
||||
}
|
||||
|
||||
mvcc_partition mvcc_container::make_not_evictable(const mutation_partition& mp) {
|
||||
return with_allocator(region().allocator(), [&] {
|
||||
logalloc::allocating_section as;
|
||||
return as(region(), [&] {
|
||||
return mvcc_partition(_schema, partition_entry(mutation_partition(*_schema, mp)), *this, false);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(test_apply_to_incomplete) {
|
||||
return seastar::async([] {
|
||||
simple_schema table;
|
||||
@@ -901,3 +939,39 @@ SEASTAR_TEST_CASE(test_versions_are_merged_when_snapshots_go_away) {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Reproducer of #4030
|
||||
SEASTAR_TEST_CASE(test_snapshot_merging_after_container_is_destroyed) {
|
||||
return seastar::async([] {
|
||||
random_mutation_generator gen(random_mutation_generator::generate_counters::no);
|
||||
auto s = gen.schema();
|
||||
|
||||
mutation m1 = gen();
|
||||
m1.partition().make_fully_continuous();
|
||||
|
||||
mutation m2 = gen();
|
||||
m2.partition().make_fully_continuous();
|
||||
|
||||
auto c1 = std::make_unique<mvcc_container>(s, mvcc_container::no_tracker{});
|
||||
auto c2 = std::make_unique<mvcc_container>(s, mvcc_container::no_tracker{});
|
||||
|
||||
auto e = std::make_unique<mvcc_partition>(c1->make_not_evictable(m1.partition()));
|
||||
auto snap1 = e->read();
|
||||
|
||||
*e += m2;
|
||||
|
||||
auto snap2 = e->read();
|
||||
|
||||
while (!need_preempt()) {} // Ensure need_preempt() to force snapshot destruction to defer
|
||||
|
||||
snap1 = {};
|
||||
|
||||
c2->merge(*c1);
|
||||
|
||||
snap2 = {};
|
||||
e.reset();
|
||||
c1 = {};
|
||||
|
||||
c2->cleaner().drain().get();
|
||||
});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user