From f26bd6ff21df2f1d155ca17eef360423b706ab7f Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 11 Nov 2021 09:37:15 -0500 Subject: [PATCH 1/5] python/aqmp: Fix disconnect during capabilities negotiation If we receive ConnectionResetError (ECONNRESET) while attempting to perform capabilities negotiation -- prior to the establishment of the async reader/writer tasks -- the disconnect function is not aware that we are in an error pathway. As a result, when attempting to close the StreamWriter, we'll see the same ConnectionResetError that caused us to initiate a disconnect in the first place, which will cause the disconnect task itself to fail, which emits a CRITICAL logging event. I still don't know if there's a smarter way to check to see if an exception received at this point is "the same" exception as the one that caused the initial disconnect, but for now the problem can be avoided by improving the error pathway detection in the exit path. Reported-by: Thomas Huth Signed-off-by: John Snow Tested-by: Thomas Huth Message-id: 20211111143719.2162525-2-jsnow@redhat.com Signed-off-by: John Snow --- python/qemu/aqmp/protocol.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/qemu/aqmp/protocol.py b/python/qemu/aqmp/protocol.py index ae1df24026..860b79512d 100644 --- a/python/qemu/aqmp/protocol.py +++ b/python/qemu/aqmp/protocol.py @@ -623,13 +623,21 @@ class AsyncProtocol(Generic[T]): def _done(task: Optional['asyncio.Future[Any]']) -> bool: return task is not None and task.done() - # NB: We can't rely on _bh_tasks being done() here, it may not - # yet have had a chance to run and gather itself. + # Are we already in an error pathway? If either of the tasks are + # already done, or if we have no tasks but a reader/writer; we + # must be. + # + # NB: We can't use _bh_tasks to check for premature task + # completion, because it may not yet have had a chance to run + # and gather itself. tasks = tuple(filter(None, (self._writer_task, self._reader_task))) error_pathway = _done(self._reader_task) or _done(self._writer_task) + if not tasks: + error_pathway |= bool(self._reader) or bool(self._writer) try: - # Try to flush the writer, if possible: + # Try to flush the writer, if possible. + # This *may* cause an error and force us over into the error path. if not error_pathway: await self._bh_flush_writer() except BaseException as err: @@ -639,7 +647,7 @@ class AsyncProtocol(Generic[T]): self.logger.debug("%s:\n%s\n", emsg, pretty_traceback()) raise finally: - # Cancel any still-running tasks: + # Cancel any still-running tasks (Won't raise): if self._writer_task is not None and not self._writer_task.done(): self.logger.debug("Cancelling writer task.") self._writer_task.cancel() @@ -652,7 +660,7 @@ class AsyncProtocol(Generic[T]): self.logger.debug("Waiting for tasks to complete ...") await asyncio.wait(tasks) - # Lastly, close the stream itself. (May raise): + # Lastly, close the stream itself. (*May raise*!): await self._bh_close_stream(error_pathway) self.logger.debug("Disconnected.") From 25de7f50121f251c45d111582d786db7ce0768d3 Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 11 Nov 2021 09:37:16 -0500 Subject: [PATCH 2/5] python/aqmp: fix ConnectError string method When ConnectError is used to wrap an Exception that was initialized without an error message, we are treated to a traceback with a rubbish line like this: ... ConnectError: Failed to establish session: Correct this to use the name of an exception as a fallback message: ... ConnectError: Failed to establish session: EOFError Better! Signed-off-by: John Snow Reported-by: Thomas Huth Tested-by: Thomas Huth Message-id: 20211111143719.2162525-3-jsnow@redhat.com Signed-off-by: John Snow --- python/qemu/aqmp/protocol.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/qemu/aqmp/protocol.py b/python/qemu/aqmp/protocol.py index 860b79512d..5190b33b13 100644 --- a/python/qemu/aqmp/protocol.py +++ b/python/qemu/aqmp/protocol.py @@ -79,7 +79,11 @@ class ConnectError(AQMPError): self.exc: Exception = exc def __str__(self) -> str: - return f"{self.error_message}: {self.exc!s}" + cause = str(self.exc) + if not cause: + # If there's no error string, use the exception name. + cause = exception_summary(self.exc) + return f"{self.error_message}: {cause}" class StateError(AQMPError): From 47b43acd57e6e0b2910683f00a758adbc418dde6 Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 11 Nov 2021 09:37:17 -0500 Subject: [PATCH 3/5] scripts/device-crash-test: simplify Exception handling We don't need to handle KeyboardInterruptError specifically; we can instead tighten the scope of the broad Exception handlers to only catch "Exception", which has the effect of allowing all BaseException classes that do not inherit from Exception to be raised through. KeyboardInterruptError and a few other important ones are BaseExceptions, so this does the same thing with less code. Signed-off-by: John Snow Reported-by: Thomas Huth Tested-by: Thomas Huth Message-id: 20211111143719.2162525-4-jsnow@redhat.com Signed-off-by: John Snow --- scripts/device-crash-test | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/device-crash-test b/scripts/device-crash-test index 8331c057b8..d91e8616ef 100755 --- a/scripts/device-crash-test +++ b/scripts/device-crash-test @@ -317,9 +317,7 @@ class QemuBinaryInfo(object): try: vm.launch() mi['runnable'] = True - except KeyboardInterrupt: - raise - except: + except Exception: dbg("exception trying to run binary=%s machine=%s", self.binary, machine, exc_info=sys.exc_info()) dbg("log: %r", vm.get_log()) mi['runnable'] = False @@ -360,9 +358,7 @@ def checkOneCase(args, testcase): exc_traceback = None try: vm.launch() - except KeyboardInterrupt: - raise - except: + except Exception: exc_traceback = traceback.format_exc() dbg("Exception while running test case") finally: From 76f86e78b255d17aad2ebd1177069c86f08039ef Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 11 Nov 2021 09:37:18 -0500 Subject: [PATCH 4/5] scripts/device-crash-test: don't emit AQMP connection errors to stdout These errors are expected, so they shouldn't clog up terminal output. In the event that they're *not* expected, we'll be seeing an awful lot more output concerning the nature of the failure. Reported-by: Thomas Huth Signed-off-by: John Snow Tested-by: Thomas Huth Message-id: 20211111143719.2162525-5-jsnow@redhat.com Signed-off-by: John Snow --- scripts/device-crash-test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/device-crash-test b/scripts/device-crash-test index d91e8616ef..49bcd61b4f 100755 --- a/scripts/device-crash-test +++ b/scripts/device-crash-test @@ -499,6 +499,12 @@ def main(): lvl = logging.WARN logging.basicConfig(stream=sys.stdout, level=lvl, format='%(levelname)s: %(message)s') + if not args.debug: + # Async QMP, when in use, is chatty about connection failures. + # This script knowingly generates a ton of connection errors. + # Silence this logger. + logging.getLogger('qemu.aqmp.qmp_client').setLevel(logging.CRITICAL) + fatal_failures = [] wl_stats = {} skipped = 0 From c398a241ec4138e0b995a0215dea84ca93b0384f Mon Sep 17 00:00:00 2001 From: John Snow Date: Thu, 11 Nov 2021 09:37:19 -0500 Subject: [PATCH 5/5] scripts/device-crash-test: hide tracebacks for QMP connect errors Generally, the traceback for a connection failure is uninteresting and all we need to know is that the connection attempt failed. Reduce the verbosity in these cases, except when debugging. Signed-off-by: John Snow Reported-by: Thomas Huth Tested-by: Thomas Huth Message-id: 20211111143719.2162525-6-jsnow@redhat.com Signed-off-by: John Snow --- scripts/device-crash-test | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/scripts/device-crash-test b/scripts/device-crash-test index 49bcd61b4f..3db0ffe5b8 100755 --- a/scripts/device-crash-test +++ b/scripts/device-crash-test @@ -36,6 +36,7 @@ from itertools import chain sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'python')) from qemu.machine import QEMUMachine +from qemu.aqmp import ConnectError logger = logging.getLogger('device-crash-test') dbg = logger.debug @@ -355,10 +356,12 @@ def checkOneCase(args, testcase): dbg("will launch QEMU: %s", cmdline) vm = QEMUMachine(binary=binary, args=args) + exc = None exc_traceback = None try: vm.launch() - except Exception: + except Exception as this_exc: + exc = this_exc exc_traceback = traceback.format_exc() dbg("Exception while running test case") finally: @@ -366,8 +369,9 @@ def checkOneCase(args, testcase): ec = vm.exitcode() log = vm.get_log() - if exc_traceback is not None or ec != 0: - return {'exc_traceback':exc_traceback, + if exc is not None or ec != 0: + return {'exc': exc, + 'exc_traceback':exc_traceback, 'exitcode':ec, 'log':log, 'testcase':testcase, @@ -455,6 +459,17 @@ def logFailure(f, level): for l in f['log'].strip().split('\n'): logger.log(level, "log: %s", l) logger.log(level, "exit code: %r", f['exitcode']) + + # If the Exception is merely a QMP connect error, + # reduce the logging level for its traceback to + # improve visual clarity. + if isinstance(f.get('exc'), ConnectError): + logger.log(level, "%s.%s: %s", + type(f['exc']).__module__, + type(f['exc']).__qualname__, + str(f['exc'])) + level = logging.DEBUG + if f['exc_traceback']: logger.log(level, "exception:") for l in f['exc_traceback'].split('\n'):