Merge "Improve LSA descriptor encoding" from Pavel

"
The LSA small objects allocation latency is greatly affected by
the way this allocator encodes the object descriptor in front of
each allocated slot.

Nowadays it's one of the VLE variants, implemented with the help of a
loop. Re-implementing this piece with fewer instructions and without
a loop greatly reduces the allocation latency.

The speed-up mostly comes from loop-less code that doesn't confuse
the branch predictor. Also, the express encoder seems to benefit from
writing 8 bytes of the encoded value in one go, rather than
byte-by-byte.

Perf measurements:

1. (new) logalloc test shows ~40% smaller times

2. perf_mutation in release mode shows ~2% increase in tps

3. the encoder itself is 2 - 4 times faster on x86_64 and
   1.05 - 3 times faster on aarch64. The speed-up depends on
   the 'encoded length': the old encoder takes linear time,
   the new one takes constant time

tests: unit(dev), perf(release), just encoder on Aarch64
"

* 'br-lsa-alloc-latency-4' of https://github.com/xemul/scylla:
  lsa: Use express encoder
  uleb64: Add express encoding
  lsa: Extract uleb64 code into header
  test: LSA allocation perf test
This commit is contained in:
Avi Kivity
2021-06-16 18:07:13 +03:00
4 changed files with 252 additions and 60 deletions

View File

@@ -526,6 +526,7 @@ scylla_tests = set([
'test/perf/perf_collection',
'test/perf/perf_row_cache_update',
'test/perf/perf_row_cache_reads',
'test/perf/logalloc',
'test/perf/perf_simple_query',
'test/perf/perf_sstable',
'test/unit/lsa_async_eviction_test',
@@ -1179,6 +1180,7 @@ tests_not_using_seastar_test_framework = set([
'test/perf/perf_mutation',
'test/perf/perf_collection',
'test/perf/perf_row_cache_update',
'test/perf/logalloc',
'test/unit/lsa_async_eviction_test',
'test/unit/lsa_sync_eviction_test',
'test/unit/row_cache_alloc_stress_test',

92
test/perf/logalloc.cc Normal file
View File

@@ -0,0 +1,92 @@
/*
* Copyright (C) 2021 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#include <seastar/core/distributed.hh>
#include <seastar/core/app-template.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/thread.hh>
#include <seastar/core/reactor.hh>
#include <fmt/core.h>
#include <random>
#include "utils/allocation_strategy.hh"
#include "utils/logalloc.hh"
#include "log.hh"
#include "test/perf/perf.hh"
#include "test/lib/reader_permit.hh"
// Minimal payload type for the allocation benchmark: a fixed-size header
// that remembers how many extra bytes were requested on top of it.
class piggie {
    // Bytes requested in addition to sizeof(piggie) when the object was allocated.
    size_t _extra_size;
public:
    // explicit: a bare size_t must never silently convert into a piggie.
    explicit piggie(size_t sz) noexcept : _extra_size(sz) {}

    // Full footprint of the object: the header plus the extra tail bytes.
    // NOTE(review): presumably queried by the allocation strategy when the
    // object is destroyed or migrated -- confirm against allocation_strategy.hh.
    size_t storage_size() const noexcept { return sizeof(piggie) + _extra_size; }
};
// Objects allocated back-to-back within one iteration.
constexpr unsigned nr_seq_allocations = 1024;
// How many allocate / destroy / compact rounds the benchmark performs.
constexpr unsigned nr_iterations = 20'000;
// Number of distinct extra-size values (0 .. nr_sizes - 1) cycled through.
constexpr unsigned nr_sizes = 32;
int main(int argc, char** argv) {
app_template app;
return app.run(argc, argv, [&app] {
return seastar::async([&] {
logalloc::prime_segment_pool(memory::stats().total_memory(), memory::min_free_memory()).get();
logalloc::region reg;
std::array<piggie*, nr_seq_allocations> objects;
std::array<size_t, nr_sizes> sizes;
std::random_device rd;
std::mt19937 g(rd());
for (int i = 0; i < nr_sizes; i++) {
sizes[i] = i;
}
auto& allocator = reg.allocator();
std::chrono::duration<double> total;
for (int iter = 0; iter < nr_iterations; iter++) {
std::shuffle(sizes.begin(), sizes.end(), g);
void* mem = allocator.alloc<piggie>(sizeof(piggie) + sizes[0]);
objects[0] = new (mem) piggie(sizes[0]);
auto start = std::chrono::steady_clock::now();
for (int i = 1; i < nr_seq_allocations; i++) {
void* mem = allocator.alloc<piggie>(sizeof(piggie) + sizes[i % nr_sizes]);
objects[i] = new (mem) piggie(sizes[i % nr_sizes]);
}
total += std::chrono::steady_clock::now() - start;
for (int i = 0; i < nr_seq_allocations; i++) {
allocator.destroy(objects[i]);
}
reg.full_compaction();
}
fmt::print("Total time: {} s\n", total.count());
});
});
}

View File

@@ -45,6 +45,7 @@
#include "utils/dynamic_bitset.hh"
#include "utils/log_heap.hh"
#include "utils/preempt.hh"
#include "utils/vle.hh"
#include <random>
#include <chrono>
@@ -210,6 +211,8 @@ uint32_t
migrate_fn_type::register_migrator(migrate_fn_type* m) {
// Adds `m` to the static migrators registry and returns the index the
// registry assigned to it.
auto& migrators = *debug::static_migrators;
auto idx = migrators.add(m);
// object_descriptor encodes 2 * index() + 1
// That encoded value has to stay below utils::uleb64_express_supreme; the
// assert below enforces the bound at registration time.
assert(idx * 2 + 1 < utils::uleb64_express_supreme);
// Store in the migrator the handle obtained from the registry's shared_from_this().
m->_migrators = migrators.shared_from_this();
return idx;
}
@@ -1150,77 +1153,26 @@ class region_impl final : public basic_region_impl {
}
segment::size_type encoded_size() const {
return log2floor(_n) / 6 + 1; // 0 is illegal
return utils::uleb64_encoded_size(_n); // 0 is illegal
}
void encode(char*& pos) const {
uint64_t b = 64;
auto n = _n;
auto start = pos;
do {
b |= n & 63;
n >>= 6;
if (!n) {
b |= 128;
}
unpoison(pos, 1);
*pos++ = b;
b = 0;
} while (n);
poison(start, pos - start);
utils::uleb64_encode(pos, _n, poison<char>, unpoison);
}
// non-canonical encoding to allow padding (for alignment); encoded_size must be
// sufficient (greater than this->encoded_size())
void encode(char*& pos, size_t encoded_size) const {
uint64_t b = 64;
auto start = pos;
unpoison(start, encoded_size);
auto n = _n;
do {
b |= n & 63;
n >>= 6;
if (!--encoded_size) {
b |= 128;
}
*pos++ = b;
b = 0;
} while (encoded_size);
poison(start, pos - start);
// sufficient (greater than this->encoded_size()), _n must be the migrator's
// index() (i.e. -- suitable for express encoding)
void encode(char*& pos, size_t encoded_size, size_t size) const {
utils::uleb64_express_encode(pos, _n, encoded_size, size, poison<char>, unpoison);
}
static object_descriptor decode_forwards(const char*& pos) {
unsigned n = 0;
unsigned shift = 0;
auto p = pos; // avoid aliasing; p++ doesn't touch memory
uint8_t b;
do {
unpoison(p, 1);
b = *p++;
if (shift < 32) {
// non-canonical encoding can cause large shift; undefined in C++
n |= uint32_t(b & 63) << shift;
}
shift += 6;
} while ((b & 128) == 0);
poison(pos, p - pos);
pos = p;
return object_descriptor(n);
return object_descriptor(utils::uleb64_decode_forwards(pos, poison<char>, unpoison));
}
static object_descriptor decode_backwards(const char*& pos) {
unsigned n = 0;
uint8_t b;
auto p = pos; // avoid aliasing; --p doesn't touch memory
do {
--p;
unpoison(p, 1);
b = *p;
n = (n << 6) | (b & 63);
} while ((b & 64) == 0);
poison(p, pos - p);
pos = p;
return object_descriptor(n);
return object_descriptor(utils::uleb64_decode_bacwards(pos, poison<char>, unpoison));
}
friend std::ostream& operator<<(std::ostream& out, const object_descriptor& desc) {
@@ -1292,7 +1244,7 @@ private:
auto old_active_offset = _active_offset;
auto pos = _active->at<char>(_active_offset);
// Use non-canonical encoding to allow for alignment pad
desc.encode(pos, obj_offset - _active_offset);
desc.encode(pos, obj_offset - _active_offset, size);
unpoison(pos, size);
_active_offset = obj_offset + size;

146
utils/vle.hh Normal file
View File

@@ -0,0 +1,146 @@
/*
* Copyright (C) 2021 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <type_traits>
#include <seastar/core/bitops.hh>
#include <seastar/core/byteorder.hh>
namespace utils {
/*
 * The express encoder below is optimized for values whose set bits
 * all lie within the lowest 12 bits.
 */
static constexpr size_t uleb64_express_bits = 12;
// Exclusive upper bound of the values the express encoder accepts (2^12).
static constexpr uint32_t uleb64_express_supreme = uint32_t(1) << uleb64_express_bits;
// Number of bytes the canonical encoding of val occupies: one byte per
// 6-bit group, up to and including the group holding the highest set bit.
// val must be non-zero; this is not checked.
static inline size_t uleb64_encoded_size(uint32_t val) noexcept {
    const auto highest_set_bit = seastar::log2floor(val);
    return 1 + highest_set_bit / 6;
}
// Writes the canonical (shortest) encoding of val at pos and advances pos
// past it. Each byte carries 6 payload bits, least significant group first;
// bit 6 (value 64) tags the first byte and bit 7 (value 128) tags the last.
// Every byte is unpoisoned right before it is written and the whole
// encoding is poisoned again once it is complete.
template <typename Poison, typename Unpoison>
requires std::is_invocable<Poison, const char*, size_t>::value && std::is_invocable<Unpoison, const char*, size_t>::value
static inline void uleb64_encode(char*& pos, uint32_t val, Poison&& poison, Unpoison&& unpoison) noexcept {
    char* first = pos;
    uint64_t octet = 64; // only the first byte carries the "start" tag
    for (;;) {
        octet |= val & 63;
        val >>= 6;
        const bool last = (val == 0);
        if (last) {
            octet |= 128;
        }
        unpoison(pos, 1);
        *pos++ = octet;
        if (last) {
            break;
        }
        octet = 0;
    }
    poison(first, pos - first);
}
// Non-canonical variant: encodes val into exactly encoded_size bytes at pos,
// padding with zero payload groups, and advances pos past them. The first
// byte gets the start tag (64), the last one the end tag (128).
// encoded_size must be non-zero and large enough to hold val; neither is
// checked here. The whole range is unpoisoned up front and poisoned again
// after the last byte is written.
template <typename Poison, typename Unpoison>
requires std::is_invocable<Poison, const char*, size_t>::value && std::is_invocable<Unpoison, const char*, size_t>::value
static inline void uleb64_encode(char*& pos, uint32_t val, size_t encoded_size, Poison&& poison, Unpoison&& unpoison) noexcept {
    char* first = pos;
    unpoison(first, encoded_size);
    uint64_t tag = 64; // start marker, applied to the first byte only
    for (size_t remaining = encoded_size; ; ) {
        const uint64_t octet = tag | (val & 63);
        val >>= 6;
        if (--remaining == 0) {
            *pos++ = octet | 128;
            break;
        }
        *pos++ = octet;
        tag = 0;
    }
    poison(first, pos - first);
}
#if !defined(SEASTAR_ASAN_ENABLED)
// Loop-free encoder for values below uleb64_express_supreme (12 bits).
//
// Produces the same bytes as the padded uleb64_encode(): byte 0 carries the
// low 6 bits plus the start tag (64), byte 1 the next 6 bits, all further
// bytes are zero, and the last of the `size` bytes gets the end tag (128).
// pos is advanced by `size`.
//
// Preconditions (not checked):
//  - val has no bits set above bit 11 (higher bits are masked away);
//  - size is non-zero and large enough to hold val;
//  - at least sizeof(uint64_t) bytes are writable at pos, because the head
//    is always stored as one 8-byte little-endian word, whatever `size` is.
static inline void uleb64_express_encode_impl(char*& pos, uint64_t val, size_t size) noexcept {
    static_assert(uleb64_express_bits == 12);
    constexpr size_t word = sizeof(uint64_t);

    if (size > word) {
        if (size > 2 * word) {
            // The two word-sized stores below cover 16 bytes at most. For
            // longer paddings the bytes in between must be cleared as well,
            // otherwise stale bytes carrying tag bits would break decoding.
            std::fill_n(pos + word, size - 2 * word, char(0));
        }
        // Clear the trailing word; it may overlap the head word written next.
        std::fill_n(pos + size - word, word, char(0));
    }

    // Bits 0-5 go to byte 0 (together with the start tag), bits 6-11 to byte 1.
    seastar::write_le(pos, uint64_t(((val & 0xfc0) << 2) | ((val & 0x3f) | 64)));
    pos += size;
    pos[-1] |= 0x80; // end tag on the final byte
}
// Encodes val into exactly encoded_size bytes at pos and advances pos.
// `gap` is the number of bytes that follow the encoded value and may be
// scribbled over: when encoded_size + gap exceeds one 64-bit word the
// word-at-once express implementation is used (it does not call the
// poison hooks); otherwise the byte-by-byte padded encoder runs.
template <typename Poison, typename Unpoison>
requires std::is_invocable<Poison, const char*, size_t>::value && std::is_invocable<Unpoison, const char*, size_t>::value
static inline void uleb64_express_encode(char*& pos, uint32_t val, size_t encoded_size, size_t gap, Poison&& poison, Unpoison&& unpoison) noexcept {
    const bool room_for_word_store = encoded_size + gap > sizeof(uint64_t);
    if (!room_for_word_store) {
        uleb64_encode(pos, val, encoded_size, poison, unpoison);
        return;
    }
    uleb64_express_encode_impl(pos, val, encoded_size);
}
#else
// Variant used when SEASTAR_ASAN_ENABLED is defined: always takes the
// byte-by-byte padded encoder, which calls the poison/unpoison hooks.
// `gap` is accepted only to keep the signature identical to the other variant.
template <typename Poison, typename Unpoison>
requires std::is_invocable<Poison, const char*, size_t>::value && std::is_invocable<Unpoison, const char*, size_t>::value
static inline void uleb64_express_encode(char*& pos, uint32_t val, size_t encoded_size, size_t gap, Poison&& poison, Unpoison&& unpoison) noexcept {
    static_cast<void>(gap);
    uleb64_encode(pos, val, encoded_size, poison, unpoison);
}
#endif
// Decodes one value starting at pos, reading towards higher addresses until
// a byte with the end tag (128) is met, and leaves pos just past that byte.
// Each byte is unpoisoned before it is read; the consumed range is poisoned
// again before returning.
template <typename Poison, typename Unpoison>
requires std::is_invocable<Poison, const char*, size_t>::value && std::is_invocable<Unpoison, const char*, size_t>::value
static inline uint32_t uleb64_decode_forwards(const char*& pos, Poison&& poison, Unpoison&& unpoison) noexcept {
    const char* cursor = pos; // local copy avoids aliasing; advancing it doesn't touch memory
    uint32_t value = 0;
    for (unsigned bit_offset = 0; ; bit_offset += 6) {
        unpoison(cursor, 1);
        const uint8_t octet = *cursor++;
        if (bit_offset < 32) {
            // padded (non-canonical) encodings can be long; shifting by 32 or
            // more would be undefined in C++, so those groups are skipped
            value |= uint32_t(octet & 63) << bit_offset;
        }
        if ((octet & 128) != 0) {
            break;
        }
    }
    poison(pos, cursor - pos);
    pos = cursor;
    return value;
}
// Decodes the value whose encoding ends right before pos, reading towards
// lower addresses until a byte with the start tag (64) is met, and leaves
// pos pointing at that first byte. Each byte is unpoisoned before it is
// read; the consumed range is poisoned again before returning.
template <typename Poison, typename Unpoison>
requires std::is_invocable<Poison, const char*, size_t>::value && std::is_invocable<Unpoison, const char*, size_t>::value
static inline uint32_t uleb64_decode_backwards(const char*& pos, Poison&& poison, Unpoison&& unpoison) noexcept {
    uint32_t n = 0;
    uint8_t b;
    auto p = pos; // avoid aliasing; --p doesn't touch memory
    do {
        --p;
        unpoison(p, 1);
        b = *p;
        // bytes are visited most significant group first, so shift what has
        // been collected so far and append the new 6 bits at the bottom
        n = (n << 6) | (b & 63);
    } while ((b & 64) == 0);
    poison(p, pos - p);
    pos = p;
    return n;
}

// Historical, misspelled name of uleb64_decode_backwards(); kept so that
// existing callers keep compiling. New code should use the correct spelling.
template <typename Poison, typename Unpoison>
requires std::is_invocable<Poison, const char*, size_t>::value && std::is_invocable<Unpoison, const char*, size_t>::value
static inline uint32_t uleb64_decode_bacwards(const char*& pos, Poison&& poison, Unpoison&& unpoison) noexcept {
    return uleb64_decode_backwards(pos, poison, unpoison);
}
} // namespace utils