vint: optimise deserialisation routine

At the moment, vint deserialisation uses a naive approach, reading each byte separately. In practice, vints will most often appear inside larger buffers. That means we can read 8 bytes at a time and then figure out the unneeded parts and mask them out. This way we avoid a loop and do fewer memory loads, which are much more expensive than arithmetic operations (even if they hit the cache).
2019-02-21 17:00:39 +00:00
parent 57de2c26b3
commit 552fc0c6b9
1 changed files with 18 additions and 2 deletions
--- a/vint-serialization.cc
+++ b/vint-serialization.cc
@@ -136,7 +136,9 @@ vint_size_type unsigned_vint::serialized_size(uint64_t value) noexcept {
 }

 uint64_t unsigned_vint::deserialize(bytes_view v) {
-    const int8_t first_byte = v[0];
+    auto src = v.data();
+    auto len = v.size();
+    const int8_t first_byte = *src;

    // No additional bytes, since the most significant bit is not set.
    if (first_byte >= 0) {
@@ -148,11 +150,25 @@ uint64_t unsigned_vint::deserialize(bytes_view v) {
    // Extract the bits not used for counting bytes.
    auto result = uint64_t(first_byte) & first_byte_value_mask(extra_bytes_size);

+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    uint64_t value;
+    // If we can overread do that. It is cheaper to have a single 64-bit read and
+    // then mask out the unneeded part than to do 8x 1 byte reads.
+    if (__builtin_expect(len >= sizeof(uint64_t) + 1, true)) {
+        std::copy_n(src + 1, sizeof(uint64_t), reinterpret_cast<int8_t*>(&value));
+    } else {
+        value = 0;
+        std::copy_n(src + 1, extra_bytes_size, reinterpret_cast<int8_t*>(&value));
+    }
+    value = be_to_cpu(value << (64 - (extra_bytes_size * 8)));
+    result <<= (extra_bytes_size * 8) % 64;
+    result |= value;
+#else
    for (vint_size_type index = 0; index < extra_bytes_size; ++index) {
        result <<= 8;
        result |= (uint64_t(v[index + 1]) & uint64_t(0xff));
    }
-
+#endif
    return result;
 }