This PR adds a simple test for fencing. It exercises the data plane: if a node somehow ends up with a stale topology version, requests from that node get a 'stale topology' error. The test simply decrements the node's version manually through CQL, so it is quite artificial. To test a more realistic scenario we need to allow the topology change fiber to sometimes skip unavailable nodes; currently the algorithm fails and retries indefinitely in this case. The PR also adds some logs and removes one seemingly redundant topology version increment; see the commit messages for details. Closes #14901 * github.com:scylladb/scylladb: test_fencing: add test_fence_hints; test.py: output the skipped tests; test.py: add skip_mode decorator and fixture; test.py: add mode fixture; hints: add debug log for dropped hints; hints: send_one_hint: extend the scope of file_send_gate holder; pylib: add ScyllaMetrics; hints manager: add send_errors counter; token_metadata: add debug logs; fencing: add simple data plane test; random_tables.py: add counter column type; raft topology: don't increment version when transitioning to node_state::normal
219 lines
11 KiB
Python
219 lines
11 KiB
Python
#
|
|
# Copyright (C) 2022-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
#
|
|
# This file configures pytest for all tests in this directory, and also
|
|
# defines common test fixtures for all of them to use
|
|
|
|
import ssl
|
|
from typing import List
|
|
from test.pylib.random_tables import RandomTables
|
|
from test.pylib.util import unique_name
|
|
from test.pylib.manager_client import ManagerClient, IPAddress
|
|
from test.pylib.async_cql import event_loop, run_async
|
|
import logging
|
|
import pytest
|
|
from cassandra.cluster import Session # type: ignore # pylint: disable=no-name-in-module
|
|
from cassandra.cluster import Cluster, ConsistencyLevel # type: ignore # pylint: disable=no-name-in-module
|
|
from cassandra.cluster import ExecutionProfile, EXEC_PROFILE_DEFAULT # type: ignore # pylint: disable=no-name-in-module
|
|
from cassandra.policies import ExponentialReconnectionPolicy # type: ignore
|
|
from cassandra.policies import RoundRobinPolicy # type: ignore
|
|
from cassandra.policies import TokenAwarePolicy # type: ignore
|
|
from cassandra.policies import WhiteListRoundRobinPolicy # type: ignore
|
|
from cassandra.connection import DRIVER_NAME # type: ignore # pylint: disable=no-name-in-module
|
|
from cassandra.connection import DRIVER_VERSION # type: ignore # pylint: disable=no-name-in-module
|
|
|
|
|
|
# Convenience monkey-patch: attach the `run_async` helper to the driver's
# Session class so it can be invoked as a method on session objects.
setattr(Session, "run_async", run_async)

logger = logging.getLogger(__name__)

# Printed at import time so the driver name and version show up in the output.
print(f"Driver name {DRIVER_NAME}, version {DRIVER_VERSION}")
|
|
|
|
|
|
def pytest_addoption(parser):
    """Register the command-line options used by the tests in this directory.

    `--manager-api` and `--mode` are mandatory; `--host`, `--port` and `--ssl`
    describe how to reach the CQL server and have defaults.
    """
    # (flag, keyword arguments) pairs, registered in this exact order.
    option_table = (
        ('--manager-api', {'action': 'store', 'required': True,
                           'help': 'Manager unix socket path'}),
        ('--mode', {'action': 'store', 'required': True,
                    'help': 'Scylla build mode. Tests can use it to adjust their behavior.'}),
        ('--host', {'action': 'store', 'default': 'localhost',
                    'help': 'CQL server host to connect to'}),
        ('--port', {'action': 'store', 'default': '9042',
                    'help': 'CQL server port to connect to'}),
        ('--ssl', {'action': 'store_true',
                   'help': 'Connect to CQL via an encrypted TLSv1.2 connection'}),
    )
    for flag, settings in option_table:
        parser.addoption(flag, **settings)
|
|
|
|
|
|
# Stash key under which `pytest_runtest_makereport` below records a boolean
# "test failed" flag, so that fixtures can read it back later.
FAILED_KEY = pytest.StashKey[bool]()


@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Post-test hook executed by the pytest library.

    Wraps report creation, inspects the resulting report and stores in
    `item.stash` whether it is a failed report of the "call" phase. Fixtures
    such as `manager` retrieve the flag afterwards.

    `item.stash` is the same stash as `request.node.stash` (with `request`
    being the fixture provided by pytest).
    """
    wrapped = yield
    test_report = wrapped.get_result()
    failed_in_call = test_report.when == "call" and test_report.failed
    item.stash[FAILED_KEY] = failed_in_call
|
|
|
|
|
|
# cluster_con helper: set up client object for communicating with the CQL API.
|
|
def cluster_con(hosts: List[IPAddress], port: int, use_ssl: bool):
    """Build a CQL driver Cluster object for the given contact points.

    The object is only configured here; `.connect()` is not called.

    Two execution profiles are registered: the default one, which uses plain
    round-robin load balancing, and one named 'whitelist', which is token
    aware on top of a round-robin restricted to `hosts`. Both use the same
    consistency levels and request timeout.
    """
    assert len(hosts) > 0, "python driver connection needs at least one host to connect to"

    # Settings common to both execution profiles.
    # The driver's default timeouts should be plenty, but a very slow debug
    # build on a slow or heavily loaded machine can exceed them: tests were
    # observed to take up to 160 seconds, hence 200 seconds. See issue #11289.
    # NOTE: request_timeout is the main cause of timeouts, even if logs say heartbeat
    shared_profile_settings = dict(
        consistency_level=ConsistencyLevel.LOCAL_QUORUM,
        serial_consistency_level=ConsistencyLevel.LOCAL_SERIAL,
        request_timeout=200,
    )
    default_profile = ExecutionProfile(
        load_balancing_policy=RoundRobinPolicy(),
        **shared_profile_settings)
    whitelist_profile = ExecutionProfile(
        load_balancing_policy=TokenAwarePolicy(WhiteListRoundRobinPolicy(hosts)),
        **shared_profile_settings)

    # Scylla does not support any TLS protocol earlier than 1.2. Trying one
    # results in mysterious EOF errors (see issue #6971).
    ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2) if use_ssl else None

    profiles = {EXEC_PROFILE_DEFAULT: default_profile, 'whitelist': whitelist_profile}
    return Cluster(
        execution_profiles=profiles,
        contact_points=hosts,
        port=port,
        # TODO: make the protocol version an option, to allow testing with
        # different versions. Dropping this setting completely would mean
        # picking the latest version supported by both client and server.
        protocol_version=4,
        # NOTE: No auth provider, as the auth keyspace has RF=1 and topology
        # tests take nodes down, which would cause errors. If auth is ever
        # needed for topology tests, they should bump up auth RF and run repair.
        ssl_context=ssl_context,
        # Same reasoning as for request_timeout above: 200 seconds to survive
        # very slow debug builds on busy machines. See issue #11289.
        connect_timeout=200,
        control_connection_timeout=200,
        # NOTE: max_schema_agreement_wait must be 2x or 3x smaller than
        # request_timeout, else the driver can't handle a server being down.
        max_schema_agreement_wait=20,
        idle_heartbeat_timeout=200,
        # The default reconnection policy has a large maximum interval
        # between retries (600 seconds). In tests that restart/replace nodes,
        # where a node can be unavailable for an extended period of time,
        # the retry interval could grow longer than a test timeout.
        reconnection_policy=ExponentialReconnectionPolicy(1.0, 4.0),
    )
|
|
|
|
|
|
@pytest.fixture(scope="session")
async def manager_internal(event_loop, request):
    """Session-wide fixture providing the client object for the Cluster API.

    The client is given the Unix socket path on which the Manager server API
    listens (option `manager_api`), the CQL port and SSL flag, and the
    function used to create driver connections (`cluster_con`).

    Test cases (functions) should not use this fixture.
    """
    options = request.config
    cql_port = int(options.getoption('port'))
    ssl_enabled = bool(options.getoption('ssl'))
    api_socket = options.getoption('manager_api')
    client = ManagerClient(api_socket, cql_port, ssl_enabled, cluster_con)
    yield client
    # After the last test: stop the client session and close the driver.
    await client.stop()
|
|
|
|
@pytest.fixture(scope="function")
async def manager(request, manager_internal):
    """Per-test fixture wrapping the session-wide Manager client.

    Notifies the client before the test begins, so it can perform checks for
    cluster state, and again after the test ends, passing whether it succeeded.
    """
    test_name = request.node.name
    await manager_internal.before_test(test_name)
    yield manager_internal
    # The failure flag was put into `request.node.stash` by the
    # `pytest_runtest_makereport` hook.
    test_failed = request.node.stash[FAILED_KEY]
    succeeded = not test_failed
    await manager_internal.after_test(test_name, succeeded)
|
|
|
|
# "cql" fixture: set up client object for communicating with the CQL API.
|
|
# Since connection is managed by manager just return that object
|
|
@pytest.fixture(scope="function")
def cql(manager):
    """Provide the CQL client object, which is owned by `manager`."""
    session = manager.cql
    yield session
|
|
|
|
# Consistent schema change feature is optionally enabled and
|
|
# some tests are expected to fail on Scylla without this
|
|
# option enabled, and pass with it enabled (and also pass on Cassandra).
|
|
# These tests should use the "fails_without_consistent_cluster_management"
|
|
# fixture. When consistent mode becomes the default, this fixture can be removed.
|
|
@pytest.fixture(scope="function")
def check_pre_consistent_cluster_management(manager):
    """Tell whether the server is Scylla without consistent cluster management.

    Returns False when no table in the `system` keyspace has 'scylla' in its
    name (i.e. the server is not Scylla). Otherwise returns True when the
    `consistent_cluster_management` config entry is absent or set to "false".
    """
    session = manager.cql

    system_rows = session.execute(
        "SELECT * FROM system_schema.tables WHERE keyspace_name = 'system'")
    table_names = [row.table_name for row in system_rows]
    if all('scylla' not in table_name for table_name in table_names):
        # Not running on Scylla.
        return False

    # On Scylla, the Raft mode is found by inspecting the configuration via CQL.
    config_rows = list(session.execute(
        "SELECT value FROM system.config WHERE name = 'consistent_cluster_management'"))
    if len(config_rows) == 0:
        return True
    return config_rows[0].value == "false"
|
|
|
|
|
|
@pytest.fixture(scope="function")
def fails_without_consistent_cluster_management(request, check_pre_consistent_cluster_management):
    """Mark the requesting test as xfail when `check_pre_consistent_cluster_management` is true."""
    if not check_pre_consistent_cluster_management:
        return
    xfail_reason = ("Test expected to fail without consistent cluster management "
                    "feature on")
    request.node.add_marker(pytest.mark.xfail(reason=xfail_reason))
|
|
|
|
|
|
# "random_tables" fixture: Creates and returns a temporary RandomTables object
|
|
# used in tests to make schema changes. Tables are dropped after test finishes
|
|
# unless the cluster is dirty or the test has failed.
|
|
@pytest.fixture(scope="function")
async def random_tables(request, manager):
    """Create and yield a temporary RandomTables object for schema-change tests.

    The replication factor comes from the test's `replication_factor` marker
    when present and defaults to 3. After the test the tables are dropped,
    unless the test failed or the cluster is dirty.
    """
    marker = request.node.get_closest_marker("replication_factor")
    if marker is None:
        rf = 3  # Default replication factor
    else:
        rf = marker.args[0]
    random_tables_obj = RandomTables(request.node.name, manager, unique_name(), rf)
    yield random_tables_obj

    # Skip the cleanup if the test failed or the cluster is dirty: dropping
    # may be impossible (e.g. the cluster is completely dead) and is pointless
    # anyway, because such a cluster won't be reused.
    # A failed test also marks the cluster dirty, but only at the end of the
    # `manager` fixture we depend on, whose teardown runs after ours. So the
    # failure flag has to be checked here explicitly as well.
    test_failed = request.node.stash[FAILED_KEY]
    if test_failed:
        return
    if await manager.is_dirty():
        return
    random_tables_obj.drop_all()
|
|
|
|
@pytest.fixture(scope="function")
def mode(request):
    """Return the value of the `mode` command-line option."""
    build_mode = request.config.getoption('mode')
    return build_mode
|
|
|
|
# Registry filled in by the `skip_mode` decorator:
# maps (test function, mode) -> reason for skipping.
skipped_funcs = {}


def skip_mode(mode: str, reason: str):
    """Decorator factory recording that a function is to be skipped in `mode`.

    The decorated function is returned unchanged; only an entry keyed by
    (function, mode) and holding `reason` is added to `skipped_funcs`.
    """
    def register(func):
        key = (func, mode)
        skipped_funcs[key] = reason
        return func

    return register
|
|
|
|
@pytest.fixture(scope="function", autouse=True)
def skip_mode_fixture(request, mode):
    """Skip the current test if it was registered in `skipped_funcs` for `mode`.

    Applied automatically to every test (autouse).
    """
    lookup_key = (request.function, mode)
    skip_reason = skipped_funcs.get(lookup_key)
    if skip_reason is None:
        return
    pytest.skip(f'{request.node.name} skipped, reason: {skip_reason}')
|