In that level no io_priority_class-es exist. Instead, all the IO happens in the context of the current sched-group. The file API no longer accepts a prio class argument (and makes the io_intent arg mandatory for impls). So the change consists of - removing all usage of io_priority_class - patching file_impl's inheritants to the updated API - priority manager goes away altogether - IO bandwidth update is performed on the respective sched group - tune-up of the scylla-gdb.py io_queues command The first change is huge and was made semi-automatically by: - grep io_priority_class | default_priority_class - remove all calls, found methods' args and class' fields Patching file_impl-s is smaller, but also mechanical: - replace io_priority_class& argument with io_intent* one - pass intent to lower file (if applicable) Dropping the priority manager is: - git-rm .cc and .hh - sed out all the #include-s - fix configure.py and cmakefile The scylla-gdb.py update is a bit hairy -- it needs to use the task queues list for IO class names and shares, but to detect whether it should do so, it checks whether the "commitlog" group is present. Signed-off-by: Pavel Emelyanov <xemul@scylladb.com> Closes #13963
570 lines
25 KiB
C++
570 lines
25 KiB
C++
/*
|
|
* Copyright (C) 2020-present ScyllaDB
|
|
*/
|
|
|
|
/*
|
|
* SPDX-License-Identifier: AGPL-3.0-or-later
|
|
*/
|
|
|
|
#pragma once
|
|
|
|
#include "sstables/index_entry.hh"
|
|
#include "sstables/column_translation.hh"
|
|
#include "sstables/promoted_index_blocks_reader.hh"
|
|
#include "parsers.hh"
|
|
#include "schema/schema.hh"
|
|
#include "utils/cached_file.hh"
|
|
|
|
#include <seastar/core/byteorder.hh>
|
|
#include <seastar/core/on_internal_error.hh>
|
|
|
|
#include <optional>
|
|
|
|
namespace sstables {
|
|
|
|
extern logging::logger sstlog;
|
|
|
|
}
|
|
|
|
namespace sstables::mc {
|
|
|
|
/// A read-through cache of promoted index entries.
|
|
///
|
|
/// Designed for a single user. Methods must not be invoked concurrently.
|
|
///
|
|
/// All methods provide basic exception guarantee.
|
|
class cached_promoted_index {
|
|
public:
|
|
using pi_index_type = uint32_t; // promoted index block sequence number, 0 .. _blocks_count
|
|
using pi_offset_type = uint32_t; // Offset into the promoted index region in the index file, relative
|
|
// to the start of the promoted index.
|
|
|
|
// Can be in one of the three states, with increasing number of fields being valid:
|
|
//
|
|
// l0) start is not engaged: only index and offset is valid
|
|
// l1) start is engaged: in addition to the above, start is valid
|
|
// l2) end is engaged: all fields are valid
|
|
//
|
|
// This is in order to save on CPU by avoiding parsing the whole block during binary search,
|
|
// which only needs the "start" field.
|
|
struct promoted_index_block {
|
|
pi_index_type index;
|
|
pi_offset_type offset;
|
|
std::optional<position_in_partition> start;
|
|
std::optional<position_in_partition> end;
|
|
std::optional<deletion_time> end_open_marker;
|
|
uint64_t data_file_offset;
|
|
uint64_t width;
|
|
|
|
promoted_index_block(pi_index_type index, pi_offset_type offset)
|
|
: index(index)
|
|
, offset(offset)
|
|
{}
|
|
|
|
bool operator<(const promoted_index_block& other) const { return index < other.index; }
|
|
bool operator==(const promoted_index_block& other) const { return index == other.index; }
|
|
|
|
friend std::ostream& operator<<(std::ostream& out, const promoted_index_block& b) {
|
|
return out << "{idx=" << b.index
|
|
<< ", offset=" << b.offset
|
|
<< ", start=" << b.start
|
|
<< ", end=" << b.end
|
|
<< ", end_open_marker=" << b.end_open_marker
|
|
<< ", datafile_offset=" << b.data_file_offset
|
|
<< ", width=" << b.width << "}";
|
|
}
|
|
|
|
/// \brief Returns the amount of memory occupied by this object and all its contents.
|
|
size_t memory_usage() const {
|
|
size_t result = sizeof(promoted_index_block);
|
|
if (!start) {
|
|
return result;
|
|
}
|
|
result += start->external_memory_usage();
|
|
if (!end) {
|
|
return result;
|
|
}
|
|
result += end->external_memory_usage();
|
|
return result;
|
|
}
|
|
};
|
|
|
|
// Counters describing the activity of the promoted index block cache.
// Hits and misses are counted separately for each promoted_index_block state (l0, l1, l2).
struct metrics {
    uint64_t hits_l0 = 0; // Number of requests for promoted_index_block in state l0
                          // which didn't have to go to the page cache
    uint64_t hits_l1 = 0; // Number of requests for promoted_index_block in state l1
                          // which didn't have to go to the page cache
    uint64_t hits_l2 = 0; // Number of requests for promoted_index_block in state l2
                          // which didn't have to go to the page cache
    uint64_t misses_l0 = 0; // Number of requests for promoted_index_block in state l0
                            // which had to go to the page cache
    uint64_t misses_l1 = 0; // Number of requests for promoted_index_block in state l1
                            // which had to go to the page cache
    uint64_t misses_l2 = 0; // Number of requests for promoted_index_block in state l2
                            // which had to go to the page cache

    uint64_t evictions = 0; // Number of promoted_index_blocks which got evicted
    uint64_t populations = 0; // Number of promoted_index_blocks which got inserted
    uint64_t block_count = 0; // Number of promoted_index_blocks currently cached
    uint64_t used_bytes = 0; // Number of bytes currently used by promoted_index_blocks
};
|
|
|
|
struct block_comparator {
|
|
using is_transparent = void; // for std::set to see the lower_bound() overload which takes any key.
|
|
|
|
const schema& _s;
|
|
|
|
bool operator()(const promoted_index_block& lhs, position_in_partition_view rhs) const {
|
|
assert(lhs.start);
|
|
position_in_partition::less_compare less(_s);
|
|
return less(*lhs.start, rhs);
|
|
}
|
|
|
|
bool operator()(position_in_partition_view lhs, const promoted_index_block& rhs) const {
|
|
assert(rhs.start);
|
|
position_in_partition::less_compare less(_s);
|
|
return less(lhs, *rhs.start);
|
|
}
|
|
|
|
bool operator()(const promoted_index_block& lhs, const promoted_index_block& rhs) const {
|
|
return lhs < rhs;
|
|
}
|
|
|
|
bool operator()(const promoted_index_block& lhs, pi_index_type rhs) const {
|
|
return lhs.index < rhs;
|
|
}
|
|
|
|
bool operator()(pi_index_type lhs, const promoted_index_block& rhs) const {
|
|
return lhs < rhs.index;
|
|
}
|
|
};
|
|
private:
|
|
// Cache of the parsed promoted index blocks.
|
|
//
|
|
// Why have it? We could have cached only at the cached_file level
|
|
// and parse on each get_block_*(). Caching parsed blocks is still
|
|
// useful for implementing upper_bound_cache_only(), which can leverage
|
|
// materialized _blocks to find approximation of the upper bound.
|
|
//
|
|
// Index lookups are I/O bound and amortized by reading from the data file,
|
|
// so extra CPU overhead to maintain the blocks is not noticeable and
|
|
// savings in CPU time from less over-reads more than compensate
|
|
// for it.
|
|
//
|
|
using block_set_type = std::set<promoted_index_block, block_comparator>;
|
|
block_set_type _blocks;
|
|
public:
|
|
const schema& _s;
|
|
uint64_t _promoted_index_start;
|
|
uint64_t _promoted_index_size;
|
|
metrics& _metrics;
|
|
const pi_index_type _blocks_count;
|
|
cached_file& _cached_file;
|
|
data_consumer::primitive_consumer _primitive_parser;
|
|
clustering_parser _clustering_parser;
|
|
promoted_index_block_parser _block_parser;
|
|
reader_permit _permit;
|
|
cached_file::stream _stream;
|
|
logalloc::allocating_section _as;
|
|
private:
|
|
// Feeds the stream into the consumer until the consumer is satisfied.
|
|
// Does not give unconsumed data back to the stream.
|
|
template <typename Consumer>
|
|
future<> consume_stream(cached_file::stream& s, Consumer& c) {
|
|
return repeat([&] {
|
|
return s.next_page_view().then([&] (cached_file::page_view&& page) {
|
|
if (!page) {
|
|
on_internal_error(sstlog, "End of stream while parsing");
|
|
}
|
|
return _as(_cached_file.region(), [&] {
|
|
auto buf = page.get_buf();
|
|
return stop_iteration(c.consume(buf) == data_consumer::read_status::ready);
|
|
});
|
|
});
|
|
});
|
|
}
|
|
|
|
// Returns offset of the entry in the offset map for the promoted index block of the index idx.
|
|
// The offset is relative to the promoted index start in the index file.
|
|
// idx must be in the range 0..(_blocks_count-1)
|
|
pi_offset_type get_offset_entry_pos(pi_index_type idx) const {
|
|
return _promoted_index_size - (_blocks_count - idx) * sizeof(pi_offset_type);
|
|
}
|
|
|
|
// Reads the offset map entry of the block number idx and resolves with its value,
// that is, with the offset of the block relative to the promoted index start.
// Repositions _stream.
future<pi_offset_type> read_block_offset(pi_index_type idx, tracing::trace_state_ptr trace_state) {
    const auto entry_pos = _promoted_index_start + get_offset_entry_pos(idx);
    _stream = _cached_file.read(entry_pos, _permit, trace_state);
    return _stream.next_page_view().then([this] (cached_file::page_view first_page) {
        temporary_buffer<char> buf = first_page.get_buf();
        static_assert(noexcept(std::declval<data_consumer::primitive_consumer>().read_32(buf)));
        const auto status = _primitive_parser.read_32(buf);
        if (__builtin_expect(status == data_consumer::read_status::ready, true)) {
            // Common case: the whole entry was available in the first page.
            return make_ready_future<pi_offset_type>(_primitive_parser._u32);
        }
        // The first page did not satisfy the parser, keep feeding it with the following pages.
        return consume_stream(_stream, _primitive_parser).then([this] {
            return _primitive_parser._u32;
        });
    });
}
|
|
|
|
// Postconditions:
|
|
// - block.start is engaged and valid.
|
|
future<> read_block_start(promoted_index_block& block, tracing::trace_state_ptr trace_state) {
|
|
_stream = _cached_file.read(_promoted_index_start + block.offset, _permit, trace_state);
|
|
_clustering_parser.reset();
|
|
return consume_stream(_stream, _clustering_parser).then([this, &block] {
|
|
auto mem_before = block.memory_usage();
|
|
block.start.emplace(_clustering_parser.get_and_reset());
|
|
_metrics.used_bytes += block.memory_usage() - mem_before;
|
|
});
|
|
}
|
|
|
|
// Postconditions:
|
|
// - block.end is engaged, all fields in the block are valid
|
|
future<> read_block(promoted_index_block& block, tracing::trace_state_ptr trace_state) {
|
|
_stream = _cached_file.read(_promoted_index_start + block.offset, _permit, trace_state);
|
|
_block_parser.reset();
|
|
return consume_stream(_stream, _block_parser).then([this, &block] {
|
|
auto mem_before = block.memory_usage();
|
|
block.start.emplace(std::move(_block_parser.start()));
|
|
block.end.emplace(std::move(_block_parser.end()));
|
|
block.end_open_marker = _block_parser.end_open_marker();
|
|
block.data_file_offset = _block_parser.offset();
|
|
block.width = _block_parser.width();
|
|
_metrics.used_bytes += block.memory_usage() - mem_before;
|
|
});
|
|
}
|
|
|
|
/// \brief Returns a pointer to promoted_index_block entry which has at least offset and index fields valid.
|
|
future<promoted_index_block*> get_block_only_offset(pi_index_type idx, tracing::trace_state_ptr trace_state) {
|
|
auto i = _blocks.lower_bound(idx);
|
|
if (i != _blocks.end() && i->index == idx) {
|
|
++_metrics.hits_l0;
|
|
return make_ready_future<promoted_index_block*>(const_cast<promoted_index_block*>(&*i));
|
|
}
|
|
++_metrics.misses_l0;
|
|
return read_block_offset(idx, trace_state).then([this, idx, hint = i] (pi_offset_type offset) {
|
|
auto i = this->_blocks.emplace_hint(hint, idx, offset);
|
|
_metrics.used_bytes += sizeof(promoted_index_block);
|
|
++_metrics.block_count;
|
|
++_metrics.populations;
|
|
return const_cast<promoted_index_block*>(&*i);
|
|
});
|
|
}
|
|
|
|
// Drops the blocks in [begin, end) from _blocks, updating the eviction metrics for each of them.
void erase_range(block_set_type::iterator begin, block_set_type::iterator end) {
    for (auto it = begin; it != end; it = _blocks.erase(it)) {
        _metrics.used_bytes -= it->memory_usage();
        ++_metrics.evictions;
        --_metrics.block_count;
    }
}
|
|
public:
|
|
cached_promoted_index(const schema& s,
|
|
uint64_t promoted_index_start,
|
|
uint64_t promoted_index_size,
|
|
metrics& m,
|
|
reader_permit permit,
|
|
column_values_fixed_lengths cvfl,
|
|
cached_file& f,
|
|
pi_index_type blocks_count)
|
|
: _blocks(block_comparator{s})
|
|
, _s(s)
|
|
, _promoted_index_start(promoted_index_start)
|
|
, _promoted_index_size(promoted_index_size)
|
|
, _metrics(m)
|
|
, _blocks_count(blocks_count)
|
|
, _cached_file(f)
|
|
, _primitive_parser(permit)
|
|
, _clustering_parser(s, permit, cvfl, true)
|
|
, _block_parser(s, permit, std::move(cvfl))
|
|
, _permit(std::move(permit))
|
|
{ }
|
|
|
|
// Accounts all the blocks which are still cached as evicted.
~cached_promoted_index() {
    const auto remaining = _blocks.size();
    _metrics.block_count -= remaining;
    _metrics.evictions += remaining;
    for (const auto& blk : _blocks) {
        _metrics.used_bytes -= blk.memory_usage();
    }
}
|
|
|
|
/// \brief Returns a pointer to promoted_index_block entry which has at least offset, index and start fields valid.
|
|
future<promoted_index_block*> get_block_with_start(pi_index_type idx, tracing::trace_state_ptr trace_state) {
|
|
return get_block_only_offset(idx, trace_state).then([this, trace_state] (promoted_index_block* block) {
|
|
if (block->start) {
|
|
++_metrics.hits_l1;
|
|
return make_ready_future<promoted_index_block*>(block);
|
|
}
|
|
++_metrics.misses_l1;
|
|
return read_block_start(*block, trace_state).then([block] { return block; });
|
|
});
|
|
}
|
|
|
|
/// \brief Returns a pointer to promoted_index_block entry which has all the fields valid.
|
|
future<promoted_index_block*> get_block(pi_index_type idx, tracing::trace_state_ptr trace_state) {
|
|
return get_block_only_offset(idx, trace_state).then([this, trace_state] (promoted_index_block* block) {
|
|
if (block->end) {
|
|
++_metrics.hits_l2;
|
|
return make_ready_future<promoted_index_block*>(block);
|
|
}
|
|
++_metrics.misses_l2;
|
|
return read_block(*block, trace_state).then([block] { return block; });
|
|
});
|
|
}
|
|
|
|
/// \brief Returns a data file offset into the partition such that all fragments
|
|
/// that follow have strictly greater positions than pos.
|
|
///
|
|
/// The returned information can be useful in determining the I/O boundary for read-ahead.
|
|
///
|
|
/// \note There may be still elements with positions > pos before the returned position,
|
|
/// so this does not return an exact upper bound.
|
|
///
|
|
/// Resolving with std::nullopt means the position is not known. The caller should
|
|
/// use the end of the partition as the upper bound.
|
|
future<std::optional<uint64_t>> upper_bound_cache_only(position_in_partition_view pos, tracing::trace_state_ptr trace_state) {
|
|
auto i = _blocks.upper_bound(pos);
|
|
if (i == _blocks.end()) {
|
|
return make_ready_future<std::optional<uint64_t>>(std::nullopt);
|
|
}
|
|
auto& block = const_cast<promoted_index_block&>(*i);
|
|
if (!block.end) {
|
|
return read_block(block, trace_state).then([&block] {
|
|
return make_ready_future<std::optional<uint64_t>>(block.data_file_offset);
|
|
});
|
|
}
|
|
return make_ready_future<std::optional<uint64_t>>(block.data_file_offset);
|
|
}
|
|
|
|
// Invalidates information about blocks with smaller indexes than a given block.
|
|
void invalidate_prior(promoted_index_block* block, tracing::trace_state_ptr trace_state) {
|
|
erase_range(_blocks.begin(), _blocks.lower_bound(block->index));
|
|
}
|
|
|
|
// Gives access to the cached file which the promoted index is read from.
cached_file& file() {
    return _cached_file;
}
|
|
};
|
|
|
|
/// Cursor implementation which does binary search over index entries.
|
|
///
|
|
/// Memory consumption: O(log(N))
|
|
///
|
|
/// Worst-case lookup cost:
|
|
///
|
|
/// comparisons: O(log(N))
|
|
/// I/O: O(log(N))
|
|
///
|
|
/// N = number of index entries
|
|
///
|
|
class bsearch_clustered_cursor : public clustered_index_cursor {
|
|
using pi_offset_type = cached_promoted_index::pi_offset_type;
|
|
using pi_index_type = cached_promoted_index::pi_index_type;
|
|
using promoted_index_block = cached_promoted_index::promoted_index_block;
|
|
|
|
const schema& _s;
|
|
const pi_index_type _blocks_count;
|
|
seastar::shared_ptr<cached_file> _cached_file;
|
|
cached_promoted_index _promoted_index;
|
|
|
|
// Points to the block whose start is greater than the position of the cursor (its upper bound).
|
|
pi_index_type _current_idx = 0;
|
|
|
|
// Used internally by advance_to_upper_bound() to avoid allocating state.
|
|
pi_index_type _upper_idx;
|
|
|
|
// Points to the upper bound of the cursor.
|
|
std::optional<position_in_partition> _current_pos;
|
|
|
|
tracing::trace_state_ptr _trace_state;
|
|
private:
|
|
// Advances the cursor to the nearest block whose start position is > pos.
|
|
//
|
|
// upper_idx should be the index of the block which is known to have start position > pos.
|
|
// upper_idx can be set to _blocks_count if no such entry is known.
|
|
//
|
|
// Async calls must be serialized.
|
|
future<> advance_to_upper_bound(position_in_partition_view pos) {
|
|
// Binary search over blocks.
|
|
//
|
|
// Post conditions:
|
|
//
|
|
// pos < get_block_start(_current_idx)
|
|
// For each i < _current_idx: pos >= get_block_start(i)
|
|
//
|
|
// Invariants:
|
|
//
|
|
// pos < get_block_start(_upper_idx) [*]
|
|
// pos >= get_block_start(_current_idx)
|
|
//
|
|
// [*] Assuming get_block_start(_blocks_count) == position_in_partition::after_all_clustered_rows().
|
|
//
|
|
// get_block_start(x) = *_promoted_index.get_block_with_start(x).start
|
|
//
|
|
// Eventually _current_idx will reach _upper_idx.
|
|
|
|
_upper_idx = _blocks_count;
|
|
return repeat([this, pos] {
|
|
if (_current_idx >= _upper_idx) {
|
|
if (_current_idx == _blocks_count) {
|
|
_current_pos = position_in_partition::after_all_clustered_rows();
|
|
}
|
|
tracing::trace(_trace_state, "mc_bsearch_clustered_cursor: bisecting done, current=[{}] .start={}", _current_idx, _current_pos);
|
|
sstlog.trace("mc_bsearch_clustered_cursor {}: bisecting done, current=[{}] .start={}", fmt::ptr(this), _current_idx, _current_pos);
|
|
return make_ready_future<stop_iteration>(stop_iteration::yes);
|
|
}
|
|
|
|
auto mid = _current_idx + (_upper_idx - _current_idx) / 2;
|
|
tracing::trace(_trace_state, "mc_bsearch_clustered_cursor: bisecting range [{}, {}], mid={}", _current_idx, _upper_idx, mid);
|
|
sstlog.trace("mc_bsearch_clustered_cursor {}: bisecting range [{}, {}], mid={}", fmt::ptr(this), _current_idx, _upper_idx, mid);
|
|
return _promoted_index.get_block_with_start(mid, _trace_state).then([this, mid, pos] (promoted_index_block* block) {
|
|
position_in_partition::less_compare less(_s);
|
|
sstlog.trace("mc_bsearch_clustered_cursor {}: compare with [{}] .start={}", fmt::ptr(this), mid, block->start);
|
|
if (less(pos, *block->start)) {
|
|
// Eventually _current_idx will reach _upper_idx, so _current_pos only needs to be
|
|
// updated whenever _upper_idx changes.
|
|
_current_pos = *block->start;
|
|
_upper_idx = mid;
|
|
} else {
|
|
_current_idx = mid + 1;
|
|
}
|
|
return stop_iteration::no;
|
|
});
|
|
});
|
|
}
|
|
public:
|
|
bsearch_clustered_cursor(const schema& s,
|
|
uint64_t promoted_index_start,
|
|
uint64_t promoted_index_size,
|
|
cached_promoted_index::metrics& metrics,
|
|
reader_permit permit,
|
|
column_values_fixed_lengths cvfl,
|
|
seastar::shared_ptr<cached_file> f,
|
|
pi_index_type blocks_count,
|
|
tracing::trace_state_ptr trace_state)
|
|
: _s(s)
|
|
, _blocks_count(blocks_count)
|
|
, _cached_file(std::move(f))
|
|
, _promoted_index(s,
|
|
promoted_index_start,
|
|
promoted_index_size,
|
|
metrics,
|
|
std::move(permit),
|
|
std::move(cvfl),
|
|
*_cached_file,
|
|
blocks_count)
|
|
, _trace_state(std::move(trace_state))
|
|
{ }
|
|
|
|
/// \brief Advances the cursor so that _current_idx is the first block whose start is > pos.
///
/// Resolves with std::nullopt when the cursor stays within the same block, so no skip
/// is needed. Otherwise resolves with skip_info which carries the data file offset of the
/// block preceding _current_idx, together with the tombstone and position taken from
/// the block before that one, if it ends with an open marker.
future<std::optional<skip_info>> advance_to(position_in_partition_view pos) override {
    position_in_partition::less_compare less(_s);

    sstlog.trace("mc_bsearch_clustered_cursor {}: advance_to({}), _current_pos={}, _current_idx={}, cached={}",
        fmt::ptr(this), pos, _current_pos, _current_idx, _promoted_index.file().cached_bytes());

    if (_current_pos) {
        if (less(pos, *_current_pos)) {
            sstlog.trace("mc_bsearch_clustered_cursor {}: same block", fmt::ptr(this));
            return make_ready_future<std::optional<skip_info>>(std::nullopt);
        }
        if (_current_idx >= _blocks_count) {
            // The cursor is already past the last block. Incrementing _current_idx any further
            // would make the code below ask for a block which does not exist, and read
            // its offset from outside of the promoted index.
            sstlog.trace("mc_bsearch_clustered_cursor {}: already past the last block", fmt::ptr(this));
            return make_ready_future<std::optional<skip_info>>(std::nullopt);
        }
        // pos is not before the start of block _current_idx, so the upper bound is beyond that block.
        ++_current_idx;
    }

    return advance_to_upper_bound(pos).then([this] {
        if (_current_idx == 0) {
            sstlog.trace("mc_bsearch_clustered_cursor {}: same block", fmt::ptr(this));
            return make_ready_future<std::optional<skip_info>>(std::nullopt);
        }
        return _promoted_index.get_block(_current_idx - 1, _trace_state).then([this] (promoted_index_block* block) {
            sstlog.trace("mc_bsearch_clustered_cursor {}: [{}] = {}", fmt::ptr(this), _current_idx - 1, *block);
            offset_in_partition datafile_offset = block->data_file_offset;
            sstlog.trace("mc_bsearch_clustered_cursor {}: datafile_offset={}", fmt::ptr(this), datafile_offset);
            if (_current_idx < 2) {
                // There is no preceding block to take the active tombstone from.
                return make_ready_future<std::optional<skip_info>>(
                    skip_info{datafile_offset, tombstone(), position_in_partition::before_all_clustered_rows()});
            }
            // _current_idx points to the block whose start is > than the cursor (upper bound).
            // The cursor is in _current_idx - 1. We will tell the data file reader to skip to
            // the beginning of _current_idx - 1. The index block contains tombstone which was
            // active at the end of the block, not at the beginning of the block, so we need
            // to read the active tombstone from the preceding block, _current_idx - 2.
            return _promoted_index.get_block(_current_idx - 2, _trace_state)
                    .then([this, datafile_offset] (promoted_index_block* block) -> std::optional<skip_info> {
                sstlog.trace("mc_bsearch_clustered_cursor {}: [{}] = {}", fmt::ptr(this), _current_idx - 2, *block);
                // XXX: Until we have automatic eviction, we need to invalidate cached index blocks
                // as we walk so that memory footprint is not O(N) but O(log(N)).
                _promoted_index.invalidate_prior(block, _trace_state);
                if (!block->end_open_marker) {
                    return skip_info{datafile_offset, tombstone(), position_in_partition::before_all_clustered_rows()};
                }
                auto tomb = tombstone(*block->end_open_marker);
                sstlog.trace("mc_bsearch_clustered_cursor {}: tombstone={}, pos={}", fmt::ptr(this), tomb, *block->end);
                return skip_info{datafile_offset, tomb, *block->end};
            });
        });
    });
}
|
|
|
|
// Forwards to cached_promoted_index::upper_bound_cache_only(), using the cursor's trace state.
future<std::optional<offset_in_partition>> probe_upper_bound(position_in_partition_view pos) override {
    auto& index = _promoted_index;
    return index.upper_bound_cache_only(pos, _trace_state);
}
|
|
|
|
// Resolves with the start, end and data file offset of the block number _current_idx
// and moves the cursor to the next block.
// Resolves with std::nullopt when _current_idx is equal to _blocks_count.
future<std::optional<entry_info>> next_entry() override {
    if (_current_idx != _blocks_count) {
        return _promoted_index.get_block(_current_idx, _trace_state)
                .then([this] (promoted_index_block* blk) -> std::optional<entry_info> {
            sstlog.trace("mc_bsearch_clustered_cursor {}: block {}: start={}, end={}, offset={}", fmt::ptr(this), _current_idx,
                    *blk->start, *blk->end, blk->data_file_offset);
            ++_current_idx;
            return entry_info{*blk->start, *blk->end, blk->data_file_offset};
        });
    }
    return make_ready_future<std::optional<entry_info>>(std::nullopt);
}
|
|
|
|
// Advances the cursor to the first promoted index block whose start position is greater than `pos`
|
|
// or to the end if there is no such block.
|
|
//
|
|
// If the block existed and advancing was successful (i.e. we weren't already at this block),
|
|
// returns `skip_info` describing this block. Otherwise returns nullopt.
|
|
future<std::optional<skip_info>> advance_past(position_in_partition_view pos) {
|
|
return advance_to_upper_bound(pos).then([this] {
|
|
if (_current_idx == _blocks_count) {
|
|
return make_ready_future<std::optional<skip_info>>(std::nullopt);
|
|
}
|
|
return _promoted_index.get_block(_current_idx, _trace_state).then([this] (promoted_index_block* block) {
|
|
offset_in_partition datafile_offset = block->data_file_offset;
|
|
if (_current_idx == 0) {
|
|
return make_ready_future<std::optional<skip_info>>(skip_info{datafile_offset, tombstone(), position_in_partition::before_all_clustered_rows()});
|
|
}
|
|
return _promoted_index.get_block(_current_idx - 1, _trace_state)
|
|
.then([this, datafile_offset] (promoted_index_block* block) -> std::optional<skip_info> {
|
|
_promoted_index.invalidate_prior(block, _trace_state);
|
|
if (!block->end_open_marker) {
|
|
return skip_info{datafile_offset, tombstone(), position_in_partition::before_all_clustered_rows()};
|
|
}
|
|
auto tomb = tombstone(*block->end_open_marker);
|
|
return skip_info{datafile_offset, tomb, *block->end};
|
|
});
|
|
});
|
|
});
|
|
}
|
|
|
|
// Returns the offset in the data file of the first row in the last promoted index block
|
|
// (shortly: the offset of the last promoted index block in the data file), or nullopt
|
|
// if there are no blocks.
|
|
future<std::optional<uint64_t>> last_block_offset() {
|
|
if (_blocks_count == 0) {
|
|
return make_ready_future<std::optional<uint64_t>>();
|
|
}
|
|
return _promoted_index.get_block(_blocks_count - 1, _trace_state)
|
|
.then([] (promoted_index_block* block) {
|
|
return std::optional<uint64_t>(block->data_file_offset);
|
|
});
|
|
}
|
|
|
|
// Does no work of its own: resolves immediately.
future<> close() noexcept override {
    auto done = make_ready_future<>();
    return done;
}
|
|
};
|
|
|
|
}
|