From e93c54e83798190a0d0bbc80c11fb37dcb604132 Mon Sep 17 00:00:00 2001 From: Piotr Sarna Date: Wed, 12 Feb 2020 17:37:22 +0100 Subject: [PATCH] db,view: fix generating view updates for partition tombstones The update generation path must track and apply all tombstones, both from the existing base row (if read-before-write was needed) and for the new row. One such path contained an error, because it assumed that if the existing row is empty, then the update can be simply generated from the new row. However, lack of the existing row can also be the result of a partition/range tombstone. If that's the case, it needs to be applied, because it's entirely possible that this partition row also hides the new row. Without taking the partition tombstone into account, creating a future tombstone and inserting an out-of-order write before it in the base table can result in ghost rows in the view table. This patch comes with a test which was proven to fail before the changes. Branches 3.1,3.2,3.3 Fixes #5793 Tests: unit(dev) Message-Id: <8d3b2abad31572668693ab585f37f4af5bb7577a.1581525398.git.sarna@scylladb.com> --- db/view/view.cc | 6 +++++- test/boost/cql_query_test.cc | 42 ++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/db/view/view.cc b/db/view/view.cc index 9b52ae5d2e..5101b3b7b2 100644 --- a/db/view/view.cc +++ b/db/view/view.cc @@ -910,7 +910,11 @@ future view_update_builder::on_results() { if (_update && !_update->is_end_of_partition()) { if (_update->is_clustering_row()) { apply_tracked_tombstones(_update_tombstone_tracker, _update->as_mutable_clustering_row()); - generate_update(std::move(*_update).as_clustering_row(), { }); + auto existing_tombstone = _existing_tombstone_tracker.current_tombstone(); + auto existing = existing_tombstone + ? std::optional(std::in_place, _update->as_clustering_row().key(), row_tombstone(std::move(existing_tombstone)), row_marker(), ::row()) + : std::nullopt; + generate_update(std::move(*_update).as_clustering_row(), std::move(existing)); } return advance_updates(); } diff --git a/test/boost/cql_query_test.cc b/test/boost/cql_query_test.cc index 71ae2c802d..1a2b0a3373 100644 --- a/test/boost/cql_query_test.cc +++ b/test/boost/cql_query_test.cc @@ -4996,3 +4996,45 @@ SEASTAR_TEST_CASE(test_time_uuid_fcts_result) { require_timestamp_timeuuid_or_date("tounixtimestamp"); }); } + +// Test that tombstones with future timestamps work correctly +// when a write with lower timestamp arrives - in such case, +// if the base row is covered by such a tombstone, a view update +// needs to take it into account. Refs #5793 +SEASTAR_TEST_CASE(test_views_with_future_tombstones) { + return do_with_cql_env_thread([] (auto& e) { + cquery_nofail(e, "CREATE TABLE t (a int, b int, c int, d int, e int, PRIMARY KEY (a,b,c));"); + cquery_nofail(e, "CREATE MATERIALIZED VIEW tv AS SELECT * FROM t" + " WHERE a IS NOT NULL AND b IS NOT NULL AND c IS NOT NULL PRIMARY KEY (b,a,c);"); + + // Partition tombstone + cquery_nofail(e, "delete from t using timestamp 10 where a=1;"); + auto msg = cquery_nofail(e, "select * from t;"); + assert_that(msg).is_rows().with_size(0); + cquery_nofail(e, "insert into t (a,b,c,d,e) values (1,2,3,4,5) using timestamp 8;"); + msg = cquery_nofail(e, "select * from t;"); + assert_that(msg).is_rows().with_size(0); + msg = cquery_nofail(e, "select * from tv;"); + assert_that(msg).is_rows().with_size(0); + + // Range tombstone + cquery_nofail(e, "delete from t using timestamp 16 where a=2 and b > 1 and b < 4;"); + msg = cquery_nofail(e, "select * from t;"); + assert_that(msg).is_rows().with_size(0); + cquery_nofail(e, "insert into t (a,b,c,d,e) values (2,3,4,5,6) using timestamp 12;"); + msg = cquery_nofail(e, "select * from t;"); + assert_that(msg).is_rows().with_size(0); + msg = cquery_nofail(e, "select * from tv;"); + assert_that(msg).is_rows().with_size(0); + + // Row tombstone + cquery_nofail(e, "delete from t using timestamp 24 where a=3 and b=4 and c=5;"); + msg = cquery_nofail(e, "select * from t;"); + assert_that(msg).is_rows().with_size(0); + cquery_nofail(e, "insert into t (a,b,c,d,e) values (3,4,5,6,7) using timestamp 18;"); + msg = cquery_nofail(e, "select * from t;"); + assert_that(msg).is_rows().with_size(0); + msg = cquery_nofail(e, "select * from tv;"); + assert_that(msg).is_rows().with_size(0); + }); +}