Merge 'utils: crc: generate crc barrett fold tables at compile time' from Avi Kivity

We use Barrett tables (misspelled in the code unfortunately) to fold
crc computations of multiple buffers into a single crc. This is important
because it turns out to be faster to compute crc of three different buffers
in parallel rather than compute the crc of one large buffer, since the crc
instruction has latency 3.

Currently, we have a separate code generation step to compute the
fold tables. The step generates a new C++ source files with the tables.
But modern C++ allows us to do this computation at compile time, avoiding
the code generation step. This simplifies the build.

This series does that. There is some complication in that the code uses
compiler intrinsics for the computation, and these are not constexpr friendly.
So we first introduce constexpr-friendly alternatives and use them.

To prove the transformation is correct, I compared the generated code from
before the series and from just before the last step (where we use constexpr
evaluation but still retain the generated file) and saw no difference in the values.

Note that constexpr is not strictly needed - we could have run the code in the
global variables' initializer. But that would cause a crash if we run on a pre-clmul
machine, and is not as fun.

Closes #11957

* github.com:scylladb/scylladb:
  test: crc: add unit tests for constexpr clmul and barrett fold
  utils: crc combine table: generate at compile time
  utils: barrett: inline functions in header
  utils: crc combine table: generate tables at compile time
  utils: crc combine table: extract table generation into a constexpr function
  utils: crc combine table: extract "pow table" code into constexpr function
  utils: crc combine table: store tables std::arrray rather than C array
  utils: barrett: make the barrett reduction constexpr friendly
  utils: clmul: add 64-bit constexpr clmul
  utils: barrett: extract barrett reduction constants
  utils: barrett: reorder functions
  utils: make clmul() constexpr
This commit is contained in:
Nadav Har'El
2022-11-14 18:01:52 +02:00
committed by Kamil Braun
7 changed files with 152 additions and 109 deletions

View File

@@ -910,6 +910,7 @@ scylla_core = (['message/messaging_service.cc',
'utils/config_file.cc',
'utils/multiprecision_int.cc',
'utils/gz/crc_combine.cc',
'utils/gz/crc_combine_table.cc',
'gms/version_generator.cc',
'gms/versioned_value.cc',
'gms/gossiper.cc',
@@ -1324,8 +1325,6 @@ deps['test/raft/discovery_test'] = ['test/raft/discovery_test.cc',
'test/lib/log.cc',
'service/raft/discovery.cc'] + scylla_raft_dependencies
deps['utils/gz/gen_crc_combine_table'] = ['utils/gz/gen_crc_combine_table.cc']
warnings = [
'-Wall',
@@ -1954,7 +1953,6 @@ with open(buildfile, 'w') as f:
] + [
'abseil/' + x for x in abseil_libs
]])
objs.append('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o')
if binary in tests:
local_libs = '$seastar_libs_{} $libs'.format(mode)
if binary in pure_boost_tests:
@@ -2003,12 +2001,6 @@ with open(buildfile, 'w') as f:
rust_libs[staticlib] = src
else:
raise Exception('No rule for ' + src)
compiles['$builddir/' + mode + '/gen/utils/gz/crc_combine_table.o'] = '$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc'
compiles['$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'] = 'utils/gz/gen_crc_combine_table.cc'
f.write('build {}: run {}\n'.format('$builddir/' + mode + '/gen/utils/gz/crc_combine_table.cc',
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table'))
f.write('build {}: link_build.{} {}\n'.format('$builddir/' + mode + '/utils/gz/gen_crc_combine_table', mode,
'$builddir/' + mode + '/utils/gz/gen_crc_combine_table.o'))
f.write(' libs = $seastar_libs_{}\n'.format(mode))
f.write(
'build {mode}-objects: phony {objs}\n'.format(

View File

@@ -10,8 +10,28 @@
#include <boost/test/unit_test.hpp>
#include "utils/crc.hh"
#include "utils/clmul.hh"
#include "utils/gz/barett.hh"
#include <seastar/core/print.hh>
constexpr uint32_t input_32_1_c = 0x12345678;
uint32_t input_32_1 = input_32_1_c; // NOT constexpr
constexpr uint32_t input_32_2_c = 0xabcdef12;
uint32_t input_32_2 = input_32_2_c; // NOT constexpr
constexpr uint64_t input_64_1_c = 0x1234567890abcdef;
uint64_t input_64_1 = input_64_1_c; // NOT constexpr
BOOST_AUTO_TEST_CASE(clmul_u32_constexpr_equals_native) {
constexpr auto constexpr_result = clmul(input_32_1_c, input_32_2_c);
BOOST_REQUIRE_EQUAL(clmul(input_32_1, input_32_2), constexpr_result);
}
BOOST_AUTO_TEST_CASE(barrett_fold_constexpr_equals_native) {
constexpr auto constexpr_result = crc32_fold_barett_u64(input_64_1_c);
BOOST_REQUIRE_EQUAL(crc32_fold_barett_u64(input_64_1), constexpr_result);
}
inline
uint32_t
do_compute_crc(utils::crc32& c) {

View File

@@ -10,6 +10,26 @@
#pragma once
#include <cstdint>
#include <type_traits>
inline
constexpr uint64_t clmul_u32_constexpr(uint32_t p1, uint32_t p2) {
uint64_t result = 0;
for (unsigned i = 0; i < 32; ++i) {
result ^= (((p1 >> i) & 1) * uint64_t(p2)) << i;
}
return result;
}
// returns the low half of the result
inline
constexpr uint64_t clmul_u64_low_constexpr(uint64_t p1, uint64_t p2) {
uint64_t result = 0;
for (unsigned i = 0; i < 64; ++i) {
result ^= (((p1 >> i) & 1) * p2) << i;
}
return result;
}
#if defined(__x86_64__) || defined(__i386__)
@@ -24,9 +44,10 @@ uint64_t clmul_u32(uint32_t p1, uint32_t p2) {
return _mm_extract_epi64(p, 0);
}
constexpr
inline
uint64_t clmul(uint32_t p1, uint32_t p2) {
return clmul_u32(p1, p2);
return std::is_constant_evaluated() ? clmul_u32_constexpr(p1, p2) : clmul_u32(p1, p2);
}
#elif defined(__aarch64__)
@@ -39,9 +60,10 @@ uint64_t clmul_u32(uint32_t p1, uint32_t p2) {
return vmull_p64(p1, p2);
}
constexpr
inline
uint64_t clmul(uint32_t p1, uint32_t p2) {
return clmul_u32(p1, p2);
return std::is_constant_evaluated() ? clmul_u32_constexpr(p1, p2) : clmul_u32(p1, p2);
}
#endif

View File

@@ -29,6 +29,10 @@
#pragma once
#include <cstdint>
#include "utils/clmul.hh"
inline
constexpr uint64_t barrett_reduction_constants[2] = { 0x00000001F7011641, 0x00000001DB710641 };
/*
* Calculates representation of p(x) mod G(x) using Barett reduction.
@@ -38,23 +42,25 @@
* The parameter p is a bit-reversed representation of the polynomial,
* the least significant bit corresponds to the coefficient of x^63.
*/
inline uint32_t crc32_fold_barett_u64(uint64_t p);
inline constexpr uint32_t crc32_fold_barett_u64_constexpr(uint64_t p) {
auto x0 = p;
auto x1 = x0;
uint64_t mask32 = 0xffff'ffff;
x0 = clmul_u64_low_constexpr(x0 & mask32, barrett_reduction_constants[0]);
x0 = clmul_u64_low_constexpr(x0 & mask32, barrett_reduction_constants[1]);
return (x0 ^ x1) >> 32;
}
#if defined(__x86_64__) || defined(__i386__)
#include <wmmintrin.h>
inline uint32_t crc32_fold_barett_u64_in_m128(__m128i);
uint32_t crc32_fold_barett_u64(uint64_t p) {
return crc32_fold_barett_u64_in_m128(_mm_set_epi64x(0, p));
}
inline
uint32_t crc32_fold_barett_u64_in_m128(__m128i x0) {
__m128i x1;
const __m128i mask32 = (__m128i)(__v4si){ int32_t(0xFFFFFFFF) };
const __v2di barrett_reduction_constants =
(__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
const __v2di brc =
(__v2di){ barrett_reduction_constants[0], barrett_reduction_constants[1] };
/*
* Reduce 64 => 32 bits using Barrett reduction.
@@ -100,26 +106,25 @@ uint32_t crc32_fold_barett_u64_in_m128(__m128i x0) {
*
*/
x1 = x0;
x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
x0 = _mm_clmulepi64_si128(x0 & mask32, brc, 0x00);
x0 = _mm_clmulepi64_si128(x0 & mask32, brc, 0x10);
return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
}
inline
uint32_t crc32_fold_barett_u64_native(uint64_t p) {
return crc32_fold_barett_u64_in_m128(_mm_set_epi64x(0, p));
}
#elif defined(__aarch64__)
#include <arm_neon.h>
inline uint32_t crc32_fold_barett_u64_in_u64x2(uint64x2_t);
uint32_t crc32_fold_barett_u64(uint64_t p) {
return crc32_fold_barett_u64_in_u64x2(
vcombine_u64((uint64x1_t)p, (uint64x1_t)0UL));
}
inline
uint32_t crc32_fold_barett_u64_in_u64x2(uint64x2_t x0) {
uint64x2_t x1;
const uint64_t barrett_reduction_constant_lo = 0x00000001F7011641;
const uint64_t barrett_reduction_constant_hi = 0x00000001DB710641;
const uint64_t barrett_reduction_constant_lo = barrett_reduction_constants[0];
const uint64_t barrett_reduction_constant_hi = barrett_reduction_constants[1];
x1 = x0;
x0 = vreinterpretq_u64_p128(
@@ -131,8 +136,20 @@ uint32_t crc32_fold_barett_u64_in_u64x2(uint64x2_t x0) {
return vgetq_lane_u64(vshrq_n_u64(x0 ^ x1, 32), 0);
}
inline
uint32_t crc32_fold_barett_u64_native(uint64_t p) {
return crc32_fold_barett_u64_in_u64x2(
vcombine_u64((uint64x1_t)p, (uint64x1_t)0UL));
}
#else
#error "Not implemented for this arch"
#endif
inline
constexpr
uint32_t crc32_fold_barett_u64(uint64_t p) {
return std::is_constant_evaluated() ? crc32_fold_barett_u64_constexpr(p) : crc32_fold_barett_u64_native(p);
}

View File

@@ -0,0 +1,65 @@
/*
* Copyright (C) 2018-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*
*/
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
#include <array>
#include "crc_combine_table.hh"
#include "utils/clmul.hh"
#include "barett.hh"
template <int bits>
static
constexpr
std::array<uint32_t, bits>
make_crc32_power_table() {
std::array<uint32_t, bits> pows;
pows[0] = 0x00800000; // x^8
for (int i = 1; i < bits; ++i) {
// x^(2*N) mod G(x)
// = (x^N)*(x^N) mod G(x)
// = (x^N mod G(x))^2 mod G(x)
pows[i] = crc32_fold_barett_u64(clmul(pows[i - 1], pows[i - 1]) << 1);
}
return pows;
}
static
constexpr
std::array<uint32_t, 256>
make_crc32_table(int base, int radix_bits, uint32_t one, std::array<uint32_t, 32> pows) {
std::array<uint32_t, 256> table;
for (int i = 0; i < (1 << radix_bits); ++i) {
uint32_t product = one;
for (int j = 0; j < radix_bits; ++j) {
if (i & (1 << j)) {
product = crc32_fold_barett_u64(clmul(product, pows[base + j]) << 1);
}
}
table[i] = product;
}
return table;
}
static constexpr int bits = 32;
static constexpr int radix_bits = 8;
static constexpr uint32_t one = 0x80000000; // x^0
static constexpr auto pows = make_crc32_power_table<bits>(); // pows[i] = x^(2^i*8) mod G(x)
constinit std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_0 = make_crc32_table(0, radix_bits, one, pows);
constinit std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_8 = make_crc32_table(8, radix_bits, one, pows);
constinit std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_16 = make_crc32_table(16, radix_bits, one, pows);
constinit std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_24 = make_crc32_table(24, radix_bits, one, pows);
#else
#error "Not implemented for this CPU architecture."
#endif

View File

@@ -10,6 +10,7 @@
#pragma once
#include <cstdint>
#include <array>
/*
* Let t_i be the following polynomial depending on i and u:
@@ -36,7 +37,7 @@
* (u >> 6) & 1,
* (u >> 7) & 1)
*/
extern uint32_t crc32_x_pow_radix_8_table_base_0[256];
extern uint32_t crc32_x_pow_radix_8_table_base_8[256];
extern uint32_t crc32_x_pow_radix_8_table_base_16[256];
extern uint32_t crc32_x_pow_radix_8_table_base_24[256];
extern std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_0;
extern std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_8;
extern std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_16;
extern std::array<uint32_t, 256> crc32_x_pow_radix_8_table_base_24;

View File

@@ -1,74 +0,0 @@
/*
* Copyright (C) 2018-present ScyllaDB
*/
/*
* SPDX-License-Identifier: AGPL-3.0-or-later
*
*/
#include <iostream>
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
#include "utils/clmul.hh"
#include "barett.hh"
#include <seastar/core/print.hh>
int main() {
const int bits = 32;
const int radix_bits = 8;
const uint32_t one = 0x80000000; // x^0
std::cout << "/*\n"
" * Generated with gen_crc_combine_table.cc\n"
" * DO NOT EDIT!\n"
" */\n"
"\n"
"#include \"utils/gz/crc_combine_table.hh\"\n"
"\n";
uint32_t pows[bits]; // pows[i] = x^(2^i*8) mod G(x)
pows[0] = 0x00800000; // x^8
for (int i = 1; i < bits; ++i) {
// x^(2*N) mod G(x)
// = (x^N)*(x^N) mod G(x)
// = (x^N mod G(x))^2 mod G(x)
pows[i] = crc32_fold_barett_u64(clmul(pows[i - 1], pows[i - 1]) << 1);
}
for (int base = 0; base < bits; base += radix_bits) {
std::cout << "uint32_t crc32_x_pow_radix_8_table_base_" << base << "[" << (1<<radix_bits) << "] = {";
for (int i = 0; i < (1 << radix_bits); ++i) {
uint32_t product = one;
for (int j = 0; j < radix_bits; ++j) {
if (i & (1 << j)) {
product = crc32_fold_barett_u64(clmul(product, pows[base + j]) << 1);
}
}
if (i % 4 == 0) {
std::cout << "\n ";
}
std::cout << seastar::format(" 0x{:0>8x},", product);
}
std::cout << "\n};\n\n";
}
}
#else
int main() {
std::cout << "/*\n"
" * Generated with gen_crc_combine_table.cc\n"
" * DO NOT EDIT!\n"
" */\n"
"\n"
"/* Not implemented for this CPU architecture. */\n"
"\n";
return 0;
}
#endif