Merge 'vector_index: allow recreating vector indexes on the same column' from Dawid Pawlik
This series allows creating multiple vector indexes on the same column so users can rebuild an index without losing query availability. The intended flow is: 1. Create a new vector index on a column that already has one. 2. Keep serving ANN queries from the old index while the new one is being built. 3. Verify the new index is ready. 4. Automatically switch to the remaining index. 5. Drop the old index. To make that deterministic, `index_version` is changed from the base table schema version to a real creation timeuuid. When multiple vector indexes exist on the same column, ANN query planning now picks the index according to the routing implemented in Vector Store (newest serving index). This keeps queries on the old index until the new one is up and ready. This patch also removes the create-time restriction that rejected a second vector index on the same column. Name collisions are still rejected as before. Test coverage is updated accordingly: - Scylla now verifies that two vector indexes can coexist on the same column. - Cassandra/SAI behavior is still covered and is still expected to reject duplicate indexes on the same column. Fixes: VECTOR-610 Closes scylladb/scylladb#29407 * github.com:scylladb/scylladb: docs: document vector index metadata and duplicate handling test/cqlpy: cover vector index duplicate creation rules vector_index: allow multiple named indexes on one column vector_index: store `index_version` as creation timeuuid
This commit is contained in:
@@ -675,7 +675,22 @@ create_index_statement::build_index_schema(data_dictionary::database db, locator
|
||||
}
|
||||
auto index = make_index_metadata(targets, accepted_name, kind, index_options);
|
||||
auto existing_index = schema->find_index_noname(index);
|
||||
if (existing_index) {
|
||||
bool is_vector = _idx_properties->custom_class && _idx_properties->custom_class == "vector_index";
|
||||
// For vector indexes:
|
||||
// - unnamed ones are blocked by the duplicate check on the same column;
|
||||
// - named ones are only checked for name uniqueness — allowing multiple named indexes on the same column.
|
||||
// For all other indexes:
|
||||
// - always block duplicates on the same column.
|
||||
//
|
||||
// Name uniqueness without IF NOT EXISTS is enforced before.
|
||||
// The name check here handles IF NOT EXISTS when the index with same name
|
||||
// exists in the same keyspace (on the same or different table) - needed because
|
||||
// vector indexes have no backing view table, so the `has_schema()` check
|
||||
// below cannot catch this case (issue #26672).
|
||||
bool duplicate = (is_vector && !_index_name.empty())
|
||||
? db.existing_index_names(keyspace()).contains(_index_name)
|
||||
: existing_index.has_value();
|
||||
if (duplicate) {
|
||||
if (_if_not_exists) {
|
||||
return std::make_pair(std::nullopt, std::move(warnings));
|
||||
} else {
|
||||
@@ -683,15 +698,6 @@ create_index_statement::build_index_schema(data_dictionary::database db, locator
|
||||
format("Index {} is a duplicate of existing index {}", index.name(), existing_index.value().name()));
|
||||
}
|
||||
}
|
||||
bool existing_vector_index = _idx_properties->custom_class && _idx_properties->custom_class == "vector_index" && secondary_index::vector_index::has_vector_index_on_column(*schema, targets[0]->column_name());
|
||||
bool custom_index_with_same_name = _idx_properties->custom_class && db.existing_index_names(keyspace()).contains(_index_name);
|
||||
if (existing_vector_index || custom_index_with_same_name) {
|
||||
if (_if_not_exists) {
|
||||
return std::make_pair(std::nullopt, std::move(warnings));
|
||||
} else {
|
||||
throw exceptions::invalid_request_exception("There exists a duplicate custom index");
|
||||
}
|
||||
}
|
||||
auto index_table_name = secondary_index::index_table_name(accepted_name);
|
||||
if (db.has_schema(keyspace(), index_table_name)) {
|
||||
// We print this error even if _if_not_exists - in this case the user
|
||||
|
||||
@@ -53,7 +53,7 @@ index_specific_prop_defs::get_options() const {
|
||||
auto options = get_raw_options();
|
||||
options.emplace(db::index::secondary_index::custom_class_option_name, *custom_class);
|
||||
if (index_version.has_value()) {
|
||||
options.emplace(db::index::secondary_index::index_version_option_name, index_version->to_sstring());
|
||||
options.emplace(db::index::secondary_index::index_version_option_name, fmt::to_string(*index_version));
|
||||
}
|
||||
return options;
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ public:
|
||||
bool is_custom = false;
|
||||
std::optional<sstring> custom_class;
|
||||
// The only assumption about the value of `index_version` should be that it is different for every index.
|
||||
std::optional<table_schema_version> index_version;
|
||||
std::optional<utils::UUID> index_version;
|
||||
|
||||
void validate() const;
|
||||
index_options_map get_raw_options() const;
|
||||
|
||||
@@ -151,6 +151,11 @@ columns are treated as filtering columns. The local vector index requires that t
|
||||
of the base table is also the partition key of the index and the vector column is the first one
|
||||
from the following columns.
|
||||
|
||||
ScyllaDB allows creating multiple **named** vector indexes on the same vector column.
|
||||
This can be used to create a replacement index before dropping an older one.
|
||||
Unnamed duplicate vector index definitions are still rejected, and index names
|
||||
must remain unique within a keyspace.
|
||||
|
||||
Example of a simple index:
|
||||
|
||||
.. code-block:: cql
|
||||
|
||||
@@ -8,3 +8,22 @@ Vector indexes are custom indexes (USING 'vector\_index'). Their `target` option
|
||||
- Local vector index with filtering columns `((p1, p2), v, f1, f2)`: JSON with `tc`, `pk`, and `fc`: `{"tc":"v","pk":["p1","p2"],"fc":["f1","f2"]}`
|
||||
|
||||
The `target` option acts as the interface for the vector-store service, providing the metadata necessary to determine which columns are indexed and how they are structured.
|
||||
|
||||
## Metadata semantics
|
||||
|
||||
Vector indexes also store an `index_version` option in `system_schema.indexes`.
|
||||
It is an auto-generated timeuuid created by `CREATE INDEX`, and it identifies a specific index creation time.
|
||||
|
||||
In particular:
|
||||
|
||||
- dropping and recreating the same vector index definition generates a new `index_version`;
|
||||
- altering the base table does not change an existing vector index's `index_version`;
|
||||
- when multiple named vector indexes exist on the same target column, the Vector Store routes queries to the most recently created index that is already serving (the serving index with the highest `index_version`).
|
||||
|
||||
## Duplicate detection
|
||||
|
||||
Scylla allows multiple **named** vector indexes on the same target column.
|
||||
This is useful for zero-downtime recreation: create a replacement index, let it build, and then drop the older one.
|
||||
|
||||
Unnamed duplicate vector indexes are still rejected. For identity detection, `index_version` is ignored,
|
||||
because it changes on every `CREATE INDEX` even when the logical index definition stays the same.
|
||||
|
||||
@@ -106,7 +106,7 @@ public:
|
||||
virtual void validate(const schema &schema, const cql3::statements::index_specific_prop_defs &properties,
|
||||
const std::vector<::shared_ptr<cql3::statements::index_target>> &targets, const gms::feature_service& fs,
|
||||
const data_dictionary::database& db) const = 0;
|
||||
virtual table_schema_version index_version(const schema& schema) = 0;
|
||||
virtual utils::UUID index_version(const schema& schema) = 0;
|
||||
};
|
||||
|
||||
struct stats {
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
#include "index/secondary_index_manager.hh"
|
||||
#include "index/target_parser.hh"
|
||||
#include "types/concrete_types.hh"
|
||||
#include "utils/UUID_gen.hh"
|
||||
#include "types/types.hh"
|
||||
#include "utils/managed_string.hh"
|
||||
#include <ranges>
|
||||
@@ -484,11 +485,11 @@ bool vector_index::is_vector_index_on_column(const index_metadata& im, const sst
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Returns the schema version of the base table at which the index was created.
|
||||
/// This is used to determine if the index needs to be rebuilt after a schema change.
|
||||
/// The CREATE INDEX and DROP INDEX statements does change the schema version.
|
||||
table_schema_version vector_index::index_version(const schema& schema) {
|
||||
return schema.version();
|
||||
/// Returns a timeuuid representing the time at which the index was created.
|
||||
/// This is used to determine if the index needs to be rebuilt, and to enable
|
||||
/// routing by creation time when multiple vector indexes exist on the same column.
|
||||
utils::UUID vector_index::index_version(const schema& schema) {
|
||||
return utils::UUID_gen::get_time_UUID();
|
||||
}
|
||||
|
||||
std::unique_ptr<secondary_index::custom_index> vector_index_factory() {
|
||||
|
||||
@@ -31,7 +31,7 @@ public:
|
||||
void validate(const schema &schema, const cql3::statements::index_specific_prop_defs &properties,
|
||||
const std::vector<::shared_ptr<cql3::statements::index_target>> &targets, const gms::feature_service& fs,
|
||||
const data_dictionary::database& db) const override;
|
||||
table_schema_version index_version(const schema& schema) override;
|
||||
utils::UUID index_version(const schema& schema) override;
|
||||
static bool has_vector_index(const schema& s);
|
||||
static bool has_vector_index_on_column(const schema& s, const sstring& target_name);
|
||||
static bool is_vector_index_on_column(const index_metadata& im, const sstring& target_name);
|
||||
|
||||
@@ -731,7 +731,20 @@ bool index_metadata::operator==(const index_metadata& other) const {
|
||||
}
|
||||
|
||||
bool index_metadata::equals_noname(const index_metadata& other) const {
|
||||
return _kind == other._kind && _options == other._options;
|
||||
if (_kind != other._kind || _options.size() != other._options.size()) {
|
||||
return false;
|
||||
}
|
||||
for (const auto& [key, value] : _options) {
|
||||
// The index_version is unique for each index creation
|
||||
if (key == "index_version") {
|
||||
continue;
|
||||
}
|
||||
auto it = other._options.find(key);
|
||||
if (it == other._options.end() || it->second != value) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
const table_id& index_metadata::id() const {
|
||||
|
||||
@@ -286,26 +286,21 @@ def test_vector_index_version_on_recreate(cql, test_keyspace, scylla_only, skip_
|
||||
schema = 'p int primary key, v vector<float, 3>'
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
_, table_name = table.split('.')
|
||||
base_table_version_query = f"SELECT version FROM system_schema.scylla_tables WHERE keyspace_name = '{test_keyspace}' AND table_name = '{table_name}'"
|
||||
index_version_query = f"SELECT * FROM system_schema.indexes WHERE keyspace_name = '{test_keyspace}' AND table_name = '{table_name}' AND index_name = 'abc'"
|
||||
|
||||
# Fetch the base table version.
|
||||
version = str(cql.execute(base_table_version_query).one().version)
|
||||
|
||||
# Create the vector index.
|
||||
cql.execute(f"CREATE CUSTOM INDEX abc ON {table}(v) USING 'vector_index'")
|
||||
|
||||
# Fetch the index version.
|
||||
# It should be the same as the base table version before the index was created.
|
||||
# Fetch the index version (a timeuuid generated at index creation time).
|
||||
result = cql.execute(index_version_query)
|
||||
assert len(result.current_rows) == 1
|
||||
assert result.current_rows[0].options['index_version'] == version
|
||||
version = result.current_rows[0].options['index_version']
|
||||
|
||||
# Drop and create new index with the same parameters.
|
||||
cql.execute(f"DROP INDEX {test_keyspace}.abc")
|
||||
cql.execute(f"CREATE CUSTOM INDEX abc ON {table}(v) USING 'vector_index'")
|
||||
|
||||
# Check if the index version changed.
|
||||
# Check that the index version changed after recreation.
|
||||
result = cql.execute(index_version_query)
|
||||
assert len(result.current_rows) == 1
|
||||
assert result.current_rows[0].options['index_version'] != version
|
||||
@@ -315,25 +310,20 @@ def test_vector_index_version_unaffected_by_alter(cql, test_keyspace, scylla_onl
|
||||
schema = 'p int primary key, v vector<float, 3>'
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
_, table_name = table.split('.')
|
||||
base_table_version_query = f"SELECT version FROM system_schema.scylla_tables WHERE keyspace_name = '{test_keyspace}' AND table_name = '{table_name}'"
|
||||
index_version_query = f"SELECT * FROM system_schema.indexes WHERE keyspace_name = '{test_keyspace}' AND table_name = '{table_name}' AND index_name = 'abc'"
|
||||
|
||||
# Fetch the base table version.
|
||||
version = str(cql.execute(base_table_version_query).one().version)
|
||||
|
||||
# Create the vector index.
|
||||
cql.execute(f"CREATE CUSTOM INDEX abc ON {table}(v) USING 'vector_index'")
|
||||
|
||||
# Fetch the index version.
|
||||
# It should be the same as the base table version before the index was created.
|
||||
# Fetch the index version (a timeuuid generated at index creation time).
|
||||
result = cql.execute(index_version_query)
|
||||
assert len(result.current_rows) == 1
|
||||
assert result.current_rows[0].options['index_version'] == version
|
||||
version = result.current_rows[0].options['index_version']
|
||||
|
||||
# ALTER the base table.
|
||||
cql.execute(f"ALTER TABLE {table} ADD v2 vector<float, 3>")
|
||||
|
||||
# Check if the index version is still the same.
|
||||
# Check that the index version is still the same after ALTER.
|
||||
result = cql.execute(index_version_query)
|
||||
assert len(result.current_rows) == 1
|
||||
assert result.current_rows[0].options['index_version'] == version
|
||||
@@ -404,22 +394,115 @@ def test_vector_index_target_serialization_local_index_with_filtering_columns(cq
|
||||
assert len(res) == 1
|
||||
assert json.loads(res[0].options['target']) == {"tc": "v", "pk": ["p1", "p2"], "fc": ["f1", "f2"]}
|
||||
|
||||
def test_one_vector_index_on_column(cql, test_keyspace, skip_on_scylla_vnodes):
|
||||
def test_no_duplicate_named_vector_index_on_column(cql, test_keyspace, skip_on_scylla_vnodes):
|
||||
schema = "p int primary key, v vector<float, 3>"
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
cql.execute(f"CREATE CUSTOM INDEX idx1 ON {table}(v) USING 'sai'")
|
||||
with pytest.raises(InvalidRequest, match=r"already exists"):
|
||||
cql.execute(f"CREATE CUSTOM INDEX idx1 ON {table}(v) USING 'sai'")
|
||||
|
||||
|
||||
def test_no_duplicate_unnamed_vector_index_on_column(cql, test_keyspace, skip_on_scylla_vnodes):
|
||||
schema = "p int primary key, v vector<float, 3>"
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
cql.execute(f"CREATE CUSTOM INDEX ON {table}(v) USING 'sai'")
|
||||
with pytest.raises(InvalidRequest, match=r"There exists a duplicate custom index|Cannot create more than one storage-attached index on the same column|is a duplicate of existing index"):
|
||||
with pytest.raises(InvalidRequest, match=r"duplicate of existing index"):
|
||||
cql.execute(f"CREATE CUSTOM INDEX ON {table}(v) USING 'sai'")
|
||||
|
||||
|
||||
def test_no_duplicate_unnamed_vector_index_with_if_not_exists(cql, test_keyspace, skip_on_scylla_vnodes):
|
||||
schema = "p int primary key, v vector<float, 3>"
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
ks, cf = table.split(".")
|
||||
cql.execute(f"CREATE CUSTOM INDEX IF NOT EXISTS ON {table}(v) USING 'sai'")
|
||||
# Should succeed silently without creating a duplicate
|
||||
cql.execute(f"CREATE CUSTOM INDEX IF NOT EXISTS ON {table}(v) USING 'sai'")
|
||||
rows = list(cql.execute(f"SELECT index_name FROM system_schema.indexes WHERE keyspace_name='{ks}' AND table_name='{cf}'"))
|
||||
index_names = [row.index_name for row in rows]
|
||||
assert len(index_names) == 1
|
||||
|
||||
|
||||
def test_no_duplicate_named_vector_index_with_if_not_exists(cql, test_keyspace, skip_on_scylla_vnodes):
|
||||
schema = "p int primary key, v vector<float, 3>"
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
ks, cf = table.split(".")
|
||||
cql.execute(f"CREATE CUSTOM INDEX IF NOT EXISTS idx1 ON {table}(v) USING 'sai'")
|
||||
# Should silently succeed without creating a duplicate
|
||||
cql.execute(f"CREATE CUSTOM INDEX IF NOT EXISTS idx1 ON {table}(v) USING 'sai'")
|
||||
rows = list(cql.execute(f"SELECT index_name FROM system_schema.indexes WHERE keyspace_name='{ks}' AND table_name='{cf}'"))
|
||||
assert len(rows) == 1
|
||||
assert rows[0].index_name == 'idx1'
|
||||
|
||||
|
||||
# Scylla allows creating multiple vector indexes with different names on the same column.
|
||||
# Cassandra does not - it rejects the second index with "Cannot create more than one storage-attached index on the same column".
|
||||
def test_multiple_vector_indexes_different_names_on_column(cql, test_keyspace, scylla_only, skip_without_tablets):
|
||||
schema = "p int primary key, v vector<float, 3>"
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
ks, cf = table.split(".")
|
||||
cql.execute(f"CREATE CUSTOM INDEX idx1 ON {table}(v) USING 'vector_index'")
|
||||
cql.execute(f"CREATE CUSTOM INDEX idx2 ON {table}(v) USING 'vector_index'")
|
||||
# Both indexes should exist in system_schema.indexes
|
||||
rows = list(cql.execute(f"SELECT index_name FROM system_schema.indexes WHERE keyspace_name='{ks}' AND table_name='{cf}'"))
|
||||
index_names = {row.index_name for row in rows}
|
||||
assert 'idx1' in index_names
|
||||
assert 'idx2' in index_names
|
||||
|
||||
|
||||
def test_named_vector_index_after_unnamed_on_same_column(cql, test_keyspace, scylla_only, skip_without_tablets):
|
||||
schema = "p int primary key, v vector<float, 3>"
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
ks, cf = table.split(".")
|
||||
cql.execute(f"CREATE CUSTOM INDEX ON {table}(v) USING 'vector_index'")
|
||||
cql.execute(f"CREATE CUSTOM INDEX idx1 ON {table}(v) USING 'vector_index'")
|
||||
rows = list(cql.execute(f"SELECT index_name FROM system_schema.indexes WHERE keyspace_name='{ks}' AND table_name='{cf}'"))
|
||||
index_names = {row.index_name for row in rows}
|
||||
assert 'idx1' in index_names
|
||||
assert len(index_names) == 2
|
||||
|
||||
|
||||
def test_unnamed_vector_index_after_named_on_same_column(cql, test_keyspace, skip_on_scylla_vnodes):
|
||||
schema = "p int primary key, v vector<float, 3>"
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
cql.execute(f"CREATE CUSTOM INDEX idx1 ON {table}(v) USING 'sai'")
|
||||
with pytest.raises(InvalidRequest, match=r"duplicate of existing index"):
|
||||
cql.execute(f"CREATE CUSTOM INDEX ON {table}(v) USING 'sai'")
|
||||
|
||||
# Validates fix for issue #26672
|
||||
def test_two_same_name_indexes_on_different_tables_with_if_not_exists(cql, test_keyspace, scylla_only, skip_without_tablets):
|
||||
def test_two_same_name_indexes_on_different_tables_with_if_not_exists(cql, test_keyspace, skip_on_scylla_vnodes):
|
||||
schema = "p int primary key, v vector<float, 3>"
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
schema = "p int primary key, v vector<float, 3>"
|
||||
with new_test_table(cql, test_keyspace, schema) as table2:
|
||||
cql.execute(f"CREATE CUSTOM INDEX IF NOT EXISTS ann_index ON {table}(v) USING 'vector_index'")
|
||||
cql.execute(f"CREATE CUSTOM INDEX IF NOT EXISTS ann_index ON {table2}(v) USING 'vector_index'")
|
||||
ks, cf1 = table.split(".")
|
||||
_, cf2 = table2.split(".")
|
||||
cql.execute(f"CREATE CUSTOM INDEX IF NOT EXISTS ann_index ON {table}(v) USING 'sai'")
|
||||
# The query below succeeds although silently does not create a new index.
|
||||
# This is because the IF NOT EXISTS check looks for an existing index with the same name
|
||||
# within the whole keyspace, not just the same table.
|
||||
# Issue: VECTOR-641
|
||||
cql.execute(f"CREATE CUSTOM INDEX IF NOT EXISTS ann_index ON {table2}(v) USING 'sai'")
|
||||
rows1 = list(cql.execute(f"SELECT index_name FROM system_schema.indexes WHERE keyspace_name='{ks}' AND table_name='{cf1}'"))
|
||||
rows2 = list(cql.execute(f"SELECT index_name FROM system_schema.indexes WHERE keyspace_name='{ks}' AND table_name='{cf2}'"))
|
||||
assert len(rows1) == 1
|
||||
assert rows1[0].index_name == 'ann_index'
|
||||
assert len(rows2) == 0
|
||||
|
||||
|
||||
def test_two_same_name_indexes_on_different_columns_with_if_not_exists(cql, test_keyspace, skip_on_scylla_vnodes):
|
||||
schema = "p int primary key, v vector<float, 3>, v2 vector<float, 3>"
|
||||
with new_test_table(cql, test_keyspace, schema) as table:
|
||||
ks, cf = table.split(".")
|
||||
cql.execute(f"CREATE CUSTOM INDEX IF NOT EXISTS ann_index ON {table}(v) USING 'sai'")
|
||||
# The query below succeeds although silently does not create a new index.
|
||||
# This is because the IF NOT EXISTS check looks for an existing index with the same name
|
||||
# within the table, not just the same column.
|
||||
# Issue: VECTOR-641
|
||||
cql.execute(f"CREATE CUSTOM INDEX IF NOT EXISTS ann_index ON {table}(v2) USING 'sai'")
|
||||
rows = list(cql.execute(f"SELECT index_name FROM system_schema.indexes WHERE keyspace_name='{ks}' AND table_name='{cf}'"))
|
||||
assert len(rows) == 1
|
||||
assert rows[0].index_name == 'ann_index'
|
||||
|
||||
|
||||
###############################################################################
|
||||
# SAI (StorageAttachedIndex) compatibility tests
|
||||
|
||||
Reference in New Issue
Block a user