diff --git a/db/view/regular_column_transformation.hh b/db/view/regular_column_transformation.hh index e1271c3ea8..33f8de3a6c 100644 --- a/db/view/regular_column_transformation.hh +++ b/db/view/regular_column_transformation.hh @@ -44,6 +44,10 @@ class row_marker; class regular_column_transformation : public column_computation { public: struct result { + // We can use "bytes" instead of "managed_bytes" here because we know + // that a column_computation is only used for generating a key value, + // and that is limited to 64K. This limitation is enforced below - + // we never linearize a cell's value if its size is more than 64K. std::optional _value; // _ttl and _expiry are only defined if _value is set. @@ -92,6 +96,18 @@ public: result(atomic_cell_view cell, Func f = {}) { _ts = cell.timestamp(); if (cell.is_live()) { + // If the cell is larger than what a key can hold (64KB), + // return a missing value. This lets us skip this item during + // view building and avoid hanging the view build as described + // in #8627. But it doesn't prevent later inserting such a item + // to the base table, nor does it implement front-end specific + // limits (such as Alternator's 1K or 2K limits - see #10347). + // Those stricter limits should be validated in the base-table + // write code, not here - deep inside the view update code. + // Note also we assume that f() doesn't grow the value further. + if (cell.value().size() >= 65536) { + return; + } _value = f(to_bytes(cell.value())); if (_value) { if (cell.is_live_and_has_ttl()) { diff --git a/test/alternator/test_gsi.py b/test/alternator/test_gsi.py index b86a1f33b5..42534e3b99 100644 --- a/test/alternator/test_gsi.py +++ b/test/alternator/test_gsi.py @@ -615,6 +615,122 @@ def test_gsi_wrong_type_attribute_batch(test_table_gsi_2): for p in [p1, p2, p3]: assert not 'Item' in test_table_gsi_2.get_item(Key={'p': p}, ConsistentRead=True) +# Test when a table has a GSI, if the indexed attribute is a partition key +# in the GSI and its value is 2048 bytes, the update operation is rejected, +# and is added to neither base table nor index. DynamoDB limits partition +# keys to that length (see test_limits.py::test_limit_partition_key_len_2048) +# so wants to limit the GSI keys as well. +# Note that in test_gsi_updatetable.py we have a similar test for when adding +# a pre-existing table. In that case we can't reject the base-table update +# because the oversized attribute is already there - but can just drop this +# item from the GSI. +@pytest.mark.xfail(reason="issue #10347: key length limits not enforced") +def test_gsi_limit_partition_key_len_2048(test_table_gsi_2): + # A value for 'x' (the GSI's partition key) of length 2048 is fine: + p = random_string() + x = 'a'*2048 + test_table_gsi_2.put_item(Item={'p': p, 'x': x}) + assert_index_query(test_table_gsi_2, 'hello', [{'p': p, 'x': x}], + KeyConditions={ + 'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}}) + # PutItem with oversized for 'x' is rejected, item isn't created even + # in the base table. + p = random_string() + x = 'a'*2049 + with pytest.raises(ClientError, match='ValidationException.*2048'): + test_table_gsi_2.put_item(Item={'p': p, 'x': x}) + assert not 'Item' in test_table_gsi_2.get_item(Key={'p': p}, ConsistentRead=True) + +# This is a variant of the above test, where we don't insist that the +# partition key length limit must be exactly 2048 bytes as in DynamoDB, +# but that it be *at least* 2408. I.e., we verify that 2048-byte values +# are allowed for GSI partition keys, while very long keys that surpass +# Scylla's low-level key-length limit (64 KB) are forbidden with an +# appropriate error message and not an "internal server error". This test +# should pass even if Alternator decides to adopt a different key length +# limits from DynamoDB. We do have to adopt *some* limit because the +# internal Scylla implementation has a 64 KB limit on key lengths. +@pytest.mark.xfail(reason="issue #10347: key length limits not enforced") +def test_gsi_limit_partition_key_len(test_table_gsi_2): + # A value for 'x' (the GSI's partition key) of length 2048 is fine: + p = random_string() + x = 'a'*2048 + test_table_gsi_2.put_item(Item={'p': p, 'x': x}) + assert_index_query(test_table_gsi_2, 'hello', [{'p': p, 'x': x}], + KeyConditions={ + 'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}}) + # Attribute, that is a GSI partition key, of length 64 KB + 1 is forbidden: + # it obviously exceeds DynamoDB's limit (2048 bytes), but also exceeds + # Scylla's internal limit on key length (64 KB - 1). We except to get a + # reasonable error on request validation - not some "internal server error". + # We actually used to get this "internal server error" for 64 KB - 2 + # (this is probably related to issue #16772). + p = random_string() + x = 'a'*65536 + with pytest.raises(ClientError, match='ValidationException.*limit'): + test_table_gsi_2.put_item(Item={'p': p, 'x': x}) + assert not 'Item' in test_table_gsi_2.get_item(Key={'p': p}, ConsistentRead=True) + +# Test when a table has a GSI, if the indexed attribute is a partition key +# in the GSI and its value is 1024 bytes, the update operation is rejected, +# and is added to neither base table nor index. DynamoDB limits partition +# keys to that length (see test_limits.py::test_limit_partition_key_len_1024) +# so wants to limit the GSI keys as well. +# Note that in test_gsi_updatetable.py we have a similar test for when adding +# a pre-existing table. In that case we can't reject the base-table update +# because the oversized attribute is already there - but can just drop this +# item from the GSI. +@pytest.mark.xfail(reason="issue #10347: key length limits not enforced") +def test_gsi_limit_sort_key_len_1024(test_table_gsi_5): + # A value for 'x' (the GSI's partition key) of length 1024 is fine: + p = random_string() + c = random_string() + x = 'a'*1024 + test_table_gsi_5.put_item(Item={'p': p, 'c': c, 'x': x}) + assert_index_query(test_table_gsi_5, 'hello', [{'p': p, 'c': c, 'x': x}], + KeyConditions={ + 'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}, + 'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}}) + # PutItem with oversized for 'x' is rejected, item isn't created even + # in the base table. + p = random_string() + x = 'a'*1025 + with pytest.raises(ClientError, match='ValidationException.*1024'): + test_table_gsi_5.put_item(Item={'p': p, 'c': c, 'x': x}) + assert not 'Item' in test_table_gsi_5.get_item(Key={'p': p, 'c': c}, ConsistentRead=True) + +# This is a variant of the above test, where we don't insist that the +# partition key length limit must be exactly 1024 bytes as in DynamoDB, +# but that it be *at least* 1024. I.e., we verify that 1024-byte values +# are allowed for GSI partition keys, while very long keys that surpass +# Scylla's low-level key-length limit (64 KB) are forbidden with an +# appropriate error message and not an "internal server error". This test +# should pass even if Alternator decides to adopt a different key length +# limits from DynamoDB. We do have to adopt *some* limit because the +# internal Scylla implementation has a 64 KB limit on key lengths. +@pytest.mark.xfail(reason="issue #10347: key length limits not enforced") +def test_gsi_limit_sort_key_len(test_table_gsi_5): + # A value for 'x' (the GSI's partition key) of length 1024 is fine: + p = random_string() + c = random_string() + x = 'a'*1024 + test_table_gsi_5.put_item(Item={'p': p, 'c': c, 'x': x}) + assert_index_query(test_table_gsi_5, 'hello', [{'p': p, 'c': c, 'x': x}], + KeyConditions={ + 'p': {'AttributeValueList': [p], 'ComparisonOperator': 'EQ'}, + 'x': {'AttributeValueList': [x], 'ComparisonOperator': 'EQ'}}) + # Attribute, that is a GSI partition key, of length 64 KB + 1 is forbidden: + # it obviously exceeds DynamoDB's limit (1024 bytes), but also exceeds + # Scylla's internal limit on key length (64 KB - 1). We except to get a + # reasonable error on request validation - not some "internal server error". + # We actually used to get this "internal server error" for 64 KB - 2 + # (this is probably related to issue #16772). + p = random_string() + x = 'a'*65536 + with pytest.raises(ClientError, match='ValidationException.*limit'): + test_table_gsi_5.put_item(Item={'p': p, 'c': c, 'x': x}) + assert not 'Item' in test_table_gsi_5.get_item(Key={'p': p, 'c': c}, ConsistentRead=True) + # A third scenario of GSI. Index has a hash key and a sort key, both are # non-key attributes from the base table. This scenario may be very # difficult to implement in Alternator because Scylla's materialized-views diff --git a/test/alternator/test_gsi_updatetable.py b/test/alternator/test_gsi_updatetable.py index 8f0d3d15aa..a693fbab96 100644 --- a/test/alternator/test_gsi_updatetable.py +++ b/test/alternator/test_gsi_updatetable.py @@ -603,3 +603,127 @@ def test_updatetable_delete_missing_gsi(dynamodb, table1): dynamodb.meta.client.update_table(TableName=table1.name, GlobalSecondaryIndexUpdates=[{ 'Delete': { 'IndexName': 'nonexistent' } }]) + +# Whereas DynamoDB allows attribute values to reach a generous length (they +# are only limited by the item's size limit, 400 KB), an attribute which is +# a *key* has much stricter limits - 2048 bytes for a partition key, 1024 +# bytes for a sort key. This means that if a table has a GSI or LSI and +# one of the attributes serves as a key in that GSI and LSI, DynamoDB +# limits its length. In the tests test_gsi.py::test_gsi_limit_* we verified +# that attempts to write an oversized value to an attribute which is a +# GSI key are rejected. Here we test what happens when adding a GSI to +# a table with pre-existing data, which already includes items with oversized +# values for the key attribute. These items can't be "rejected" - they +# are already in the base table - but should be skipped while filling the +# GSI. What we don't want to happen is to see the view building hang, +# as described in issue #8627 and #10347. +# The first test here, test_gsi_backfill_oversized_key(), doesn't check the +# specific limits of 2048 and 1024 bytes, it only checks that an item with +# a 65 KB attribute (above Scylla's internal limitations for keys) are +# cleanly skipped and don't cause view build hangs. The following test +# test_gsi_backfill_key_limits will check the specific limits. +def test_gsi_backfill_oversized_key(dynamodb): + # First create, and fill, a table without GSI: + with new_test_table(dynamodb, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, + { 'AttributeName': 'c', 'KeyType': 'RANGE' } ], + AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' }, + { 'AttributeName': 'c', 'AttributeType': 'S' } ]) as table: + p1 = random_string() + p2 = random_string() + c = random_string() + # Create two items, one has a small "x" attribute, the other has + # a 65 KB "x" attribute. + table.put_item(Item={'p': p1, 'c': c, 'x': 'hello'}) + table.put_item(Item={'p': p2, 'c': c, 'x': 'a'*66500}) + # Now use UpdateTable to create two GSIs. In one of them "x" will be + # the partition key, and in the other "x" will be a sort key. + # DynamoDB limits the number of indexes that can be added in one + # UpdateTable command to just one, so we need to do it in two separate + # commands and wait for each to complete. + dynamodb.meta.client.update_table(TableName=table.name, + AttributeDefinitions=[{ 'AttributeName': 'x', 'AttributeType': 'S' }], + GlobalSecondaryIndexUpdates=[ + { 'Create': { 'IndexName': 'index1', + 'KeySchema': [{ 'AttributeName': 'x', 'KeyType': 'HASH' }], + 'Projection': { 'ProjectionType': 'ALL' }} + } + ]) + wait_for_gsi(table, 'index1') + dynamodb.meta.client.update_table(TableName=table.name, + AttributeDefinitions=[{ 'AttributeName': 'x', 'AttributeType': 'S' }, + { 'AttributeName': 'c', 'AttributeType': 'S' }], + GlobalSecondaryIndexUpdates=[ + { 'Create': { 'IndexName': 'index2', + 'KeySchema': [{ 'AttributeName': 'c', 'KeyType': 'HASH' }, + { 'AttributeName': 'x', 'KeyType': 'RANGE' }], + 'Projection': { 'ProjectionType': 'ALL' }} + } + ]) + wait_for_gsi(table, 'index2') + # Verify that the items with the oversized x are missing from both + # GSIs, so only the one item with x = hello should appear in both. + # Note that we don't need to retry the reads here (i.e., use the + # assert_index_scan() or assert_index_query() functions) because after + # we waited for backfilling to complete, we know all the pre-existing + # data is already in the index. + assert [{'p': p1, 'c': c, 'x': 'hello'}] == full_scan(table, ConsistentRead=False, IndexName='index1') + assert [{'p': p1, 'c': c, 'x': 'hello'}] == full_scan(table, ConsistentRead=False, IndexName='index2') + +# The previous test, test_gsi_backfill_oversized_key(), checked that a +# grossly oversized GSI key attribute (over Scylla's internal key limit +# of 64 KB) doesn't hang the view building process. This test verifies +# more specifically that DynamoDB's documented limits - 2048 bytes for +# a GSI partition key and 1024 for a GSI sort key - are implemented. An +# item that has an attribute longer than that should simply be skipped +# during view building. +# Reproduces issue #10347. +@pytest.mark.xfail(reason="issue #10347: key length limits not enforced") +def test_gsi_backfill_key_limits(dynamodb): + # First create, and fill, a table without GSI: + with new_test_table(dynamodb, + KeySchema=[ { 'AttributeName': 'p', 'KeyType': 'HASH' }, + { 'AttributeName': 'c', 'KeyType': 'RANGE' } ], + AttributeDefinitions=[ { 'AttributeName': 'p', 'AttributeType': 'S' }, + { 'AttributeName': 'c', 'AttributeType': 'S' } ]) as table: + # Create four items, with 'x' attribute sizes of 1024, 1025, 2048 + # and 2049. Only one item (1024) has x suitable for a sort key, + # and three (1024, 1025 and 2048) have length suitable for a partition + # key. The unsuitable items will be missing from the indexes. + lengths = [1024, 1025, 2048, 2049] + p = [random_string() for length in lengths] + x = ['a'*length for length in lengths] + c = random_string() + for i in range(len(lengths)): + table.put_item(Item={'p': p[i], 'c': c, 'x': x[i]}) + # Now use UpdateTable to create two GSIs. In one of them "x" will be + # the partition key, and in the other "x" will be a sort key. + # DynamoDB limits the number of indexes that can be added in one + # UpdateTable command to just one, so we need to do it in two separate + # commands and wait for each to complete. + dynamodb.meta.client.update_table(TableName=table.name, + AttributeDefinitions=[{ 'AttributeName': 'x', 'AttributeType': 'S' }], + GlobalSecondaryIndexUpdates=[ + { 'Create': { 'IndexName': 'index1', + 'KeySchema': [{ 'AttributeName': 'x', 'KeyType': 'HASH' }], + 'Projection': { 'ProjectionType': 'ALL' }} + } + ]) + wait_for_gsi(table, 'index1') + dynamodb.meta.client.update_table(TableName=table.name, + AttributeDefinitions=[{ 'AttributeName': 'x', 'AttributeType': 'S' }, + { 'AttributeName': 'c', 'AttributeType': 'S' }], + GlobalSecondaryIndexUpdates=[ + { 'Create': { 'IndexName': 'index2', + 'KeySchema': [{ 'AttributeName': 'c', 'KeyType': 'HASH' }, + { 'AttributeName': 'x', 'KeyType': 'RANGE' }], + 'Projection': { 'ProjectionType': 'ALL' }} + } + ]) + wait_for_gsi(table, 'index2') + # Verify that the items with the oversized x are missing from both + # GSIs. For index1 (x is a partition key, limited to 2048 bytes) + # items 0,1,2 should appear, for index2 (x is a sort key, limited + # to 1024 bytes), only item 0 should appear. + assert multiset([{'p': p[i], 'c': c, 'x': x[i]} for i in range(3)]) == multiset(full_scan(table, ConsistentRead=False, IndexName='index1')) + assert [{'p': p[0], 'c': c, 'x': x[0]}] == full_scan(table, ConsistentRead=False, IndexName='index2')