test/pylib: mark cluster as dirty after a failed test

We don't expect the cluster to be functioning at all after a failed test. The whole cluster might have crashed, for example. In these situations the framework would report multiple errors (one for the actual failure, another for a failed post-condition check because the cluster was down) which would only obscure the report and make debugging harder. It's also not safe in general to reuse the cluster in another test - if the test previous failed, we should not assume that it's in a valid state. Therefore, mark the cluster as dirty after a failed test. This will let us recycle the cluster based on the dirty flag and it will disable post-condition check after a failed test (which is only done on non-dirty clusters). To implement this in topology tests, we use the `pytest_runtest_makereport` hook which executes after a test finishes but before fixtures finish. There we store a test-failed flag in a stash provided by pytest, then access the flag in the `manager` fixture.
2023-01-26 17:33:32 +01:00
parent 977375d13f
commit a9dbd89478
4 changed files with 40 additions and 11 deletions
--- a/test.py
+++ b/test.py
@@ -706,7 +706,7 @@ class CQLApprovalTest(Test):
                self.is_before_test_ok = True
                cluster.take_log_savepoint()
                self.is_executed_ok = await run_test(self, options, env=self.env)
-                cluster.after_test(self.uname)
+                cluster.after_test(self.uname, self.is_executed_ok)
                self.is_after_test_ok = True

                if self.is_executed_ok is False:
@@ -860,7 +860,7 @@ class PythonTest(Test):
            self.is_before_test_ok = True
            cluster.take_log_savepoint()
            status = await run_test(self, options)
-            cluster.after_test(self.uname)
+            cluster.after_test(self.uname, status)
            self.is_after_test_ok = True
            self.success = status
        except Exception as e:
--- a/test/pylib/manager_client.py
+++ b/test/pylib/manager_client.py
@@ -86,10 +86,10 @@ class ManagerClient():
            # await self._wait_for_cluster()
            await self.driver_connect()  # Connect driver to new cluster

-    async def after_test(self, test_case_name: str) -> None:
+    async def after_test(self, test_case_name: str, success: bool) -> None:
        """Tell harness this test finished"""
-        logger.debug("after_test for %s", test_case_name)
-        cluster_str = await self.client.get_text(f"/cluster/after-test")
+        logger.debug("after_test for %s (success: %s)", test_case_name, success)
+        cluster_str = await self.client.get_text(f"/cluster/after-test/{success}")
        logger.info("Cluster after test %s: %s", test_case_name, cluster_str)

    async def is_manager_up(self) -> bool:
--- a/test/pylib/scylla_cluster.py
+++ b/test/pylib/scylla_cluster.py
@@ -740,10 +740,14 @@ class ScyllaCluster:
        for server in self.running.values():
            server.write_log_marker(f"------ Starting test {name} ------\n")

-    def after_test(self, name) -> None:
-        """If the cluster is not dirty, check that it's still alive and the test
+    def after_test(self, name: str, success: bool) -> None:
+        """Mark the cluster as dirty after a failed test.
+        If the cluster is not dirty, check that it's still alive and the test
        hasn't left any garbage."""
        assert self.start_exception is None
+        if not success:
+            self.logger.debug(f"Test failed using cluster {self.name}, marking the cluster as dirty")
+            self.is_dirty = True
        if self.is_dirty:
            self.logger.info(f"The cluster {self.name} is dirty, not checking"
                             f" keyspace count post-condition")
@@ -939,7 +943,7 @@ class ScyllaClusterManager:
        add_get('/cluster/host-ip/{server_id}', self._cluster_server_ip_addr)
        add_get('/cluster/host-id/{server_id}', self._cluster_host_id)
        add_get('/cluster/before-test/{test_case_name}', self._before_test_req)
-        add_get('/cluster/after-test', self._after_test)
+        add_get('/cluster/after-test/{success}', self._after_test)
        add_get('/cluster/mark-dirty', self._mark_dirty)
        add_get('/cluster/server/{server_id}/stop', self._cluster_server_stop)
        add_get('/cluster/server/{server_id}/stop_gracefully', self._cluster_server_stop_gracefully)
@@ -991,9 +995,11 @@ class ScyllaClusterManager:
    async def _after_test(self, _request) -> aiohttp.web.Response:
        assert self.cluster is not None
        assert self.current_test_case_full_name
-        self.logger.info("Finished test %s, cluster: %s", self.current_test_case_full_name, self.cluster)
+        success = _request.match_info["success"] == "True"
+        self.logger.info("Test %s %s, cluster: %s", self.current_test_case_full_name,
+                         "SUCCEEDED" if success else "FAILED", self.cluster)
        try:
-            self.cluster.after_test(self.current_test_case_full_name)
+            self.cluster.after_test(self.current_test_case_full_name, success)
        finally:
            self.current_test_case_full_name = ''
        self.is_after_test_ok = True
--- a/test/topology/conftest.py
+++ b/test/topology/conftest.py
@@ -37,6 +37,26 @@ def pytest_addoption(parser):
    parser.addoption('--ssl', action='store_true',
                     help='Connect to CQL via an encrypted TLSv1.2 connection')

+
+# This is a constant used in `pytest_runtest_makereport` below to store a flag
+# indicating test failure in a stash which can then be accessed from fixtures.
+FAILED_KEY = pytest.StashKey[bool]()
+
+
+@pytest.hookimpl(tryfirst=True, hookwrapper=True)
+def pytest_runtest_makereport(item, call):
+    """This is a post-test hook execucted by the pytest library.
+    Use it to access the test result and store a flag indicating failure
+    so we can later retrieve it in our fixtures like `manager`.
+
+    `item.stash` is the same stash as `request.node.stash` (in the `request`
+    fixture provided by pytest).
+    """
+    outcome = yield
+    report = outcome.get_result()
+    item.stash[FAILED_KEY] = report.when == "call" and report.failed
+
+
 # Change default pytest-asyncio event_loop fixture scope to session to
 # allow async fixtures with scope larger than function. (e.g. manager fixture)
 # See https://github.com/pytest-dev/pytest-asyncio/issues/68
@@ -159,7 +179,10 @@ async def manager(request, manager_internal):
    test_case_name = request.node.name
    await manager_internal.before_test(test_case_name)
    yield manager_internal
-    await manager_internal.after_test(test_case_name)
+    # `request.node.stash` contains a flag stored in `pytest_runtest_makereport`
+    # that indicates test failure.
+    failed = request.node.stash[FAILED_KEY]
+    await manager_internal.after_test(test_case_name, not failed)

 # "cql" fixture: set up client object for communicating with the CQL API.
 # Since connection is managed by manager just return that object