cql3/select_statement: reject PER PARTITION LIMIT with SELECT DISTINCT

Before this patch we silently allowed and ignored PER PARTITION LIMIT. SELECT DISTINCT requires all the partition key columns, which means that setting PER PARTITION LIMIT is redundant - only one result will be returned from every partition anyway. Cassandra behaves the same way, so this patch also ensures compatibility. Fixes scylladb/scylladb#15109 Closes scylladb/scylladb#22950
2025-02-20 01:04:56 +01:00
parent e6227f9a25
commit 854d2917a1
4 changed files with 16 additions and 6 deletions
--- a/cql3/statements/raw/select_statement.hh
+++ b/cql3/statements/raw/select_statement.hh
@@ -135,9 +135,9 @@ private:
        selection::selection& selection,
        const restrictions::statement_restrictions& restrictions);

-    static void validate_distinct_selection(const schema& schema,
+    void validate_distinct_selection(const schema& schema,
        const selection::selection& selection,
-        const restrictions::statement_restrictions& restrictions);
+        const restrictions::statement_restrictions& restrictions) const;

    /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
    void check_needs_filtering(
--- a/cql3/statements/select_statement.cc
+++ b/cql3/statements/select_statement.cc
@@ -2383,8 +2383,12 @@ select_statement::ordering_comparator_type select_statement::get_ordering_compar

 void select_statement::validate_distinct_selection(const schema& schema,
                                                   const selection::selection& selection,
-                                                   const restrictions::statement_restrictions& restrictions)
+                                                   const restrictions::statement_restrictions& restrictions) const
 {
+    if (_per_partition_limit) {
+        throw exceptions::invalid_request_exception("PER PARTITION LIMIT is not allowed with SELECT DISTINCT queries");
+    }
+
    if (restrictions.has_non_primary_key_restriction() || restrictions.has_clustering_columns_restriction()) {
        throw exceptions::invalid_request_exception(
            "SELECT DISTINCT with WHERE clause only supports restriction by partition key.");
--- a/test/cqlpy/cassandra_tests/validation/operations/select_limit_test.py
+++ b/test/cqlpy/cassandra_tests/validation/operations/select_limit_test.py
@@ -16,7 +16,7 @@ def testSparseTable(cql, test_keyspace):
                execute(cql, table, "INSERT INTO %s (userid, url, day, month, year) VALUES (?, ?, 1, 'jan', 2012)", i, f"http://foo.{tld}")
        assertRowCount(execute(cql, table, "SELECT * FROM %s LIMIT 4"), 4)

-@pytest.mark.xfail(reason="issues #9879, #15099, #15109")
+@pytest.mark.xfail(reason="issues #9879, #15099")
 def testPerPartitionLimit(cql, test_keyspace):
    with create_table(cql, test_keyspace, "(a int, b int, c int, PRIMARY KEY (a, b))") as table:
        for i in range(5):
@@ -110,7 +110,7 @@ def testPerPartitionLimit(cql, test_keyspace):
        assertInvalidMessage(cql, table, "PER PARTITION LIMIT is not allowed with aggregate queries.",
                             "SELECT COUNT(*) FROM %s PER PARTITION LIMIT ?", 3)

-@pytest.mark.xfail(reason="issues #9879, #15109")
+@pytest.mark.xfail(reason="issues #9879")
 def testPerPartitionLimitWithStaticDataAndPaging(cql, test_keyspace):
    with create_table(cql, test_keyspace, "(a int, b int, s int static, c int, PRIMARY KEY (a, b))") as table:
        for i in range(5):
--- a/test/cqlpy/test_distinct.py
+++ b/test/cqlpy/test_distinct.py
@@ -74,7 +74,13 @@ def test_distinct_limit(cql, test_keyspace):
            assert n == len(results)
            assert set(results).issubset(set(all))

-# Test combination of SELECT DISTINCT, COUNT, GROUP BY and LIMIT. 
+# Test that SELECT DISTINCT and PER PARTITION LIMIT is not allowed in the same query
+# reproduces #15109
+def test_distinct_and_per_partition_limit(cql, table1):
+    with pytest.raises(InvalidRequest, match='PER PARTITION LIMIT is not allowed with SELECT DISTINCT queries'):
+        cql.execute(f"select distinct p from {table1} per partition limit 1")
+
+# Test combination of SELECT DISTINCT, COUNT, GROUP BY and LIMIT.
 # COUNT + GROUP BY means generate one count per partition, and the
 # SELECT DISTINCT simply hands the counter just one row per partition
 # to count, so all the counts come up 1. Adding a LIMIT to all of