Merge 'utils: crc: generate crc barrett fold tables at compile time' from Avi Kivity
We use Barrett tables (misspelled in the code unfortunately) to fold crc computations of multiple buffers into a single crc. This is important because it turns out to be faster to compute crc of three different buffers in parallel rather than compute the crc of one large buffer, since the crc instruction has latency 3. Currently, we have a separate code generation step to compute the fold tables. The step generates a new C++ source files with the tables. But modern C++ allows us to do this computation at compile time, avoiding the code generation step. This simplifies the build. This series does that. There is some complication in that the code uses compiler intrinsics for the computation, and these are not constexpr friendly. So we first introduce constexpr-friendly alternatives and use them. To prove the transformation is correct, I compared the generated code from before the series and from just before the last step (where we use constexpr evaluation but still retain the generated file) and saw no difference in the values. Note that constexpr is not strictly needed - we could have run the code in the global variables' initializer. But that would cause a crash if we run on a pre-clmul machine, and is not as fun. Closes #11957 * github.com:scylladb/scylladb: test: crc: add unit tests for constexpr clmul and barrett fold utils: crc combine table: generate at compile time utils: barrett: inline functions in header utils: crc combine table: generate tables at compile time utils: crc combine table: extract table generation into a constexpr function utils: crc combine table: extract "pow table" code into constexpr function utils: crc combine table: store tables std::arrray rather than C array utils: barrett: make the barrett reduction constexpr friendly utils: clmul: add 64-bit constexpr clmul utils: barrett: extract barrett reduction constants utils: barrett: reorder functions utils: make clmul() constexpr
This commit is contained in:
10
configure.py
10
configure.py
@@ -910,6 +910,7 @@ scylla_core = (['message/messaging_service.cc',
|
||||
'utils/config_file.cc',
|
||||
'utils/multiprecision_int.cc',
|
||||
'utils/gz/crc_combine.cc',
|
||||
'utils/gz/crc_combine_table.cc',
|
||||
'gms/version_generator.cc',
|
||||
'gms/versioned_value.cc',
|
||||
'gms/gossiper.cc',
|
||||
@@ -1324,8 +1325,6 @@ deps['test/raft/discovery_test'] = ['test/raft/discovery_test.cc',
|
||||
'test/lib/log.cc',
|
||||
'service/raft/discovery.cc'] + scylla_raft_dependencies
|
||||
|
||||
deps['utils/gz/gen_crc_combine_table'] = ['utils/gz/gen_crc_combine_table.cc']
|
||||
|
||||
|
||||
warnings = [
|
||||
'-Wall',
|
||||
@@ -1954,7 +1953,6 @@ with open(buildfile, 'w') as f:
|
||||
] + [
|
||||
'abseil/' + x for x in abseil_libs
|
||||
]])
|
||||
objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
|
||||
if binary in tests:
|
||||
local_libs = '$seastar_libs_{} $libs'.format(mode)
|
||||
if binary in pure_boost_tests:
|
||||
@@ -2003,12 +2001,6 @@ with open(buildfile, 'w') as f:
|
||||
rust_libs[staticlib] = src
|
||||
else:
|
||||
raise Exception('No rule for ' + src)
|
||||
compiles['$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o'] = '$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc'
|
||||
compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
|
||||
f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
|
||||
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
|
||||
f.write('build {}: link_build.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
|
||||
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
|
||||
f.write(' libs = $seastar_libs_{}\n'.format(mode))
|
||||
f.write(
|
||||
'build {mode}-objects: phony {objs}\n'.format(
|
||||
|
||||
@@ -10,8 +10,28 @@
|
||||
|
||||
#include <boost/test/unit_test.hpp>
|
||||
#include "utils/crc.hh"
|
||||
#include "utils/clmul.hh"
|
||||
#include "utils/gz/barett.hh"
|
||||
#include <seastar/core/print.hh>
|
||||
|
||||
constexpr uint32_t input_32_1_c = 0x12345678;
|
||||
uint32_t input_32_1 = input_32_1_c; // NOT constexpr
|
||||
|
||||
constexpr uint32_t input_32_2_c = 0xabcdef12;
|
||||
uint32_t input_32_2 = input_32_2_c; // NOT constexpr
|
||||
|
||||
constexpr uint64_t input_64_1_c = 0x1234567890abcdef;
|
||||
uint64_t input_64_1 = input_64_1_c; // NOT constexpr
|
||||
|
||||
BOOST_AUTO_TEST_CASE(clmul_u32_constexpr_equals_native) {
|
||||
constexpr auto constexpr_result = clmul(input_32_1_c, input_32_2_c);
|
||||
BOOST_REQUIRE_EQUAL(clmul(input_32_1, input_32_2), constexpr_result);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE(barrett_fold_constexpr_equals_native) {
|
||||
constexpr auto constexpr_result = crc32_fold_barett_u64(input_64_1_c);
|
||||
BOOST_REQUIRE_EQUAL(crc32_fold_barett_u64(input_64_1), constexpr_result);
|
||||
}
|
||||
inline
|
||||
uint32_t
|
||||
do_compute_crc(utils::crc32& c) {
|
||||
|
||||
@@ -10,6 +10,26 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <type_traits>
|
||||
|
||||
inline
|
||||
constexpr uint64_t clmul_u32_constexpr(uint32_t p1, uint32_t p2) {
|
||||
uint64_t result = 0;
|
||||
for (unsigned i = 0; i < 32; ++i) {
|
||||
result ^= (((p1 >> i) & 1) * uint64_t(p2)) << i;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// returns the low half of the result
|
||||
inline
|
||||
constexpr uint64_t clmul_u64_low_constexpr(uint64_t p1, uint64_t p2) {
|
||||
uint64_t result = 0;
|
||||
for (unsigned i = 0; i < 64; ++i) {
|
||||
result ^= (((p1 >> i) & 1) * p2) << i;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
#if defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
@@ -24,9 +44,10 @@ uint64_t clmul_u32(uint32_t p1, uint32_t p2) {
|
||||
return _mm_extract_epi64(p, 0);
|
||||
}
|
||||
|
||||
constexpr
|
||||
inline
|
||||
uint64_t clmul(uint32_t p1, uint32_t p2) {
|
||||
return clmul_u32(p1, p2);
|
||||
return std::is_constant_evaluated() ? clmul_u32_constexpr(p1, p2) : clmul_u32(p1, p2);
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__)
|
||||
@@ -39,9 +60,10 @@ uint64_t clmul_u32(uint32_t p1, uint32_t p2) {
|
||||
return vmull_p64(p1, p2);
|
||||
}
|
||||
|
||||
constexpr
|
||||
inline
|
||||
uint64_t clmul(uint32_t p1, uint32_t p2) {
|
||||
return clmul_u32(p1, p2);
|
||||
return std::is_constant_evaluated() ? clmul_u32_constexpr(p1, p2) : clmul_u32(p1, p2);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -29,6 +29,10 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include "utils/clmul.hh"
|
||||
|
||||
inline
|
||||
constexpr uint64_t barrett_reduction_constants[2] = { 0x00000001F7011641, 0x00000001DB710641 };
|
||||
|
||||
/*
|
||||
* Calculates representation of p(x) mod G(x) using Barett reduction.
|
||||
@@ -38,23 +42,25 @@
|
||||
* The parameter p is a bit-reversed representation of the polynomial,
|
||||
* the least significant bit corresponds to the coefficient of x^63.
|
||||
*/
|
||||
inline uint32_t crc32_fold_barett_u64(uint64_t p);
|
||||
inline constexpr uint32_t crc32_fold_barett_u64_constexpr(uint64_t p) {
|
||||
auto x0 = p;
|
||||
auto x1 = x0;
|
||||
uint64_t mask32 = 0xffff'ffff;
|
||||
x0 = clmul_u64_low_constexpr(x0 & mask32, barrett_reduction_constants[0]);
|
||||
x0 = clmul_u64_low_constexpr(x0 & mask32, barrett_reduction_constants[1]);
|
||||
return (x0 ^ x1) >> 32;
|
||||
}
|
||||
|
||||
#if defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
#include <wmmintrin.h>
|
||||
|
||||
inline uint32_t crc32_fold_barett_u64_in_m128(__m128i);
|
||||
|
||||
uint32_t crc32_fold_barett_u64(uint64_t p) {
|
||||
return crc32_fold_barett_u64_in_m128(_mm_set_epi64x(0, p));
|
||||
}
|
||||
|
||||
inline
|
||||
uint32_t crc32_fold_barett_u64_in_m128(__m128i x0) {
|
||||
__m128i x1;
|
||||
const __m128i mask32 = (__m128i)(__v4si){ int32_t(0xFFFFFFFF) };
|
||||
const __v2di barrett_reduction_constants =
|
||||
(__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
|
||||
const __v2di brc =
|
||||
(__v2di){ barrett_reduction_constants[0], barrett_reduction_constants[1] };
|
||||
|
||||
/*
|
||||
* Reduce 64 => 32 bits using Barrett reduction.
|
||||
@@ -100,26 +106,25 @@ uint32_t crc32_fold_barett_u64_in_m128(__m128i x0) {
|
||||
*
|
||||
*/
|
||||
x1 = x0;
|
||||
x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
|
||||
x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
|
||||
x0 = _mm_clmulepi64_si128(x0 & mask32, brc, 0x00);
|
||||
x0 = _mm_clmulepi64_si128(x0 & mask32, brc, 0x10);
|
||||
return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
|
||||
}
|
||||
|
||||
inline
|
||||
uint32_t crc32_fold_barett_u64_native(uint64_t p) {
|
||||
return crc32_fold_barett_u64_in_m128(_mm_set_epi64x(0, p));
|
||||
}
|
||||
|
||||
#elif defined(__aarch64__)
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
inline uint32_t crc32_fold_barett_u64_in_u64x2(uint64x2_t);
|
||||
|
||||
uint32_t crc32_fold_barett_u64(uint64_t p) {
|
||||
return crc32_fold_barett_u64_in_u64x2(
|
||||
vcombine_u64((uint64x1_t)p, (uint64x1_t)0UL));
|
||||
}
|
||||
|
||||
inline
|
||||
uint32_t crc32_fold_barett_u64_in_u64x2(uint64x2_t x0) {
|
||||
uint64x2_t x1;
|
||||
const uint64_t barrett_reduction_constant_lo = 0x00000001F7011641;
|
||||
const uint64_t barrett_reduction_constant_hi = 0x00000001DB710641;
|
||||
const uint64_t barrett_reduction_constant_lo = barrett_reduction_constants[0];
|
||||
const uint64_t barrett_reduction_constant_hi = barrett_reduction_constants[1];
|
||||
|
||||
x1 = x0;
|
||||
x0 = vreinterpretq_u64_p128(
|
||||
@@ -131,8 +136,20 @@ uint32_t crc32_fold_barett_u64_in_u64x2(uint64x2_t x0) {
|
||||
return vgetq_lane_u64(vshrq_n_u64(x0 ^ x1, 32), 0);
|
||||
}
|
||||
|
||||
inline
|
||||
uint32_t crc32_fold_barett_u64_native(uint64_t p) {
|
||||
return crc32_fold_barett_u64_in_u64x2(
|
||||
vcombine_u64((uint64x1_t)p, (uint64x1_t)0UL));
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#error "Not implemented for this arch"
|
||||
|
||||
#endif
|
||||
|
||||
inline
|
||||
constexpr
|
||||
uint32_t crc32_fold_barett_u64(uint64_t p) {
|
||||
return std::is_constant_evaluated() ? crc32_fold_barett_u64_constexpr(p) : crc32_fold_barett_u64_native(p);
|
||||
}
|
||||
|
||||
65
utils/gz/crc_combine_table.cc
Normal file
65
utils/gz/crc_combine_table.cc
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
* Copyright (C) 2018-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*
|
||||
*/
|
||||
|
||||
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
|
||||
|
||||
#include <array>
|
||||
|
||||
#include "crc_combine_table.hh"
|
||||
#include "utils/clmul.hh"
|
||||
#include "barett.hh"
|
||||
|
||||
template <int bits>
|
||||
static
|
||||
constexpr
|
||||
std::array<uint32_t, bits>
|
||||
make_crc32_power_table() {
|
||||
std::array<uint32_t, bits> pows;
|
||||
pows[0] = 0x00800000; // x^8
|
||||
for (int i = 1; i < bits; ++i) {
|
||||
// x^(2*N) mod G(x)
|
||||
// = (x^N)*(x^N) mod G(x)
|
||||
// = (x^N mod G(x))^2 mod G(x)
|
||||
pows[i] = crc32_fold_barett_u64(clmul(pows[i - 1], pows[i - 1]) << 1);
|
||||
}
|
||||
return pows;
|
||||
}
|
||||
|
||||
static
|
||||
constexpr
|
||||
std::array<uint32_t, 256>
|
||||
make_crc32_table(int base, int radix_bits, uint32_t one, std::array<uint32_t, 32> pows) {
|
||||
std::array<uint32_t, 256> table;
|
||||
for (int i = 0; i < (1 << radix_bits); ++i) {
|
||||
uint32_t product = one;
|
||||
for (int j = 0; j < radix_bits; ++j) {
|
||||
if (i & (1 << j)) {
|
||||
product = crc32_fold_barett_u64(clmul(product, pows[base + j]) << 1);
|
||||
}
|
||||
}
|
||||
table[i] = product;
|
||||
}
|
||||
return table;
|
||||
}
|
||||
|
||||
static constexpr int bits = 32;
|
||||
static constexpr int radix_bits = 8;
|
||||
static constexpr uint32_t one = 0x80000000; // x^0
|
||||
static constexpr auto pows = make_crc32_power_table<bits>(); // pows[i] = x^(2^i*8) mod G(x)
|
||||
|
||||
constinit std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_0 = make_crc32_table(0, radix_bits, one, pows);
|
||||
constinit std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_8 = make_crc32_table(8, radix_bits, one, pows);
|
||||
constinit std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_16 = make_crc32_table(16, radix_bits, one, pows);
|
||||
constinit std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_24 = make_crc32_table(24, radix_bits, one, pows);
|
||||
|
||||
#else
|
||||
|
||||
#error "Not implemented for this CPU architecture."
|
||||
|
||||
#endif
|
||||
@@ -10,6 +10,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <array>
|
||||
|
||||
/*
|
||||
* Let t_i be the following polynomial depending on i and u:
|
||||
@@ -36,7 +37,7 @@
|
||||
* (u >> 6) & 1,
|
||||
* (u >> 7) & 1)
|
||||
*/
|
||||
extern uint32_t crc32_x_pow_radix_8_table_base_0[256];
|
||||
extern uint32_t crc32_x_pow_radix_8_table_base_8[256];
|
||||
extern uint32_t crc32_x_pow_radix_8_table_base_16[256];
|
||||
extern uint32_t crc32_x_pow_radix_8_table_base_24[256];
|
||||
extern std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_0;
|
||||
extern std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_8;
|
||||
extern std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_16;
|
||||
extern std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_24;
|
||||
|
||||
@@ -1,74 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2018-present ScyllaDB
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: AGPL-3.0-or-later
|
||||
*
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
|
||||
|
||||
#include "utils/clmul.hh"
|
||||
#include "barett.hh"
|
||||
|
||||
#include <seastar/core/print.hh>
|
||||
|
||||
int main() {
|
||||
const int bits = 32;
|
||||
const int radix_bits = 8;
|
||||
const uint32_t one = 0x80000000; // x^0
|
||||
|
||||
std::cout << "/*\n"
|
||||
" * Generated with gen_crc_combine_table.cc\n"
|
||||
" * DO NOT EDIT!\n"
|
||||
" */\n"
|
||||
"\n"
|
||||
"#include \"utils/gz/crc_combine_table.hh\"\n"
|
||||
"\n";
|
||||
|
||||
uint32_t pows[bits]; // pows[i] = x^(2^i*8) mod G(x)
|
||||
pows[0] = 0x00800000; // x^8
|
||||
for (int i = 1; i < bits; ++i) {
|
||||
// x^(2*N) mod G(x)
|
||||
// = (x^N)*(x^N) mod G(x)
|
||||
// = (x^N mod G(x))^2 mod G(x)
|
||||
pows[i] = crc32_fold_barett_u64(clmul(pows[i - 1], pows[i - 1]) << 1);
|
||||
}
|
||||
|
||||
for (int base = 0; base < bits; base += radix_bits) {
|
||||
std::cout << "uint32_t crc32_x_pow_radix_8_table_base_" << base << "[" << (1<<radix_bits) << "] = {";
|
||||
|
||||
for (int i = 0; i < (1 << radix_bits); ++i) {
|
||||
uint32_t product = one;
|
||||
for (int j = 0; j < radix_bits; ++j) {
|
||||
if (i & (1 << j)) {
|
||||
product = crc32_fold_barett_u64(clmul(product, pows[base + j]) << 1);
|
||||
}
|
||||
}
|
||||
if (i % 4 == 0) {
|
||||
std::cout << "\n ";
|
||||
}
|
||||
std::cout << seastar::format(" 0x{:0>8x},", product);
|
||||
}
|
||||
|
||||
std::cout << "\n};\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
int main() {
|
||||
std::cout << "/*\n"
|
||||
" * Generated with gen_crc_combine_table.cc\n"
|
||||
" * DO NOT EDIT!\n"
|
||||
" */\n"
|
||||
"\n"
|
||||
"/* Not implemented for this CPU architecture. */\n"
|
||||
"\n";
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user