From f26bd6ff21df2f1d155ca17eef360423b706ab7f Mon Sep 17 00:00:00 2001
From: John Snow <jsnow@redhat.com>
Date: Thu, 11 Nov 2021 09:37:15 -0500
Subject: [PATCH 1/5] python/aqmp: Fix disconnect during capabilities
 negotiation

If we receive ConnectionResetError (ECONNRESET) while attempting to
perform capabilities negotiation -- prior to the establishment of the
async reader/writer tasks -- the disconnect function is not aware that
we are in an error pathway.

As a result, when attempting to close the StreamWriter, we'll see the
same ConnectionResetError that caused us to initiate a disconnect in the
first place, which will cause the disconnect task itself to fail, which
emits a CRITICAL logging event.

I still don't know if there's a smarter way to check to see if an
exception received at this point is "the same" exception as the one that
caused the initial disconnect, but for now the problem can be avoided by
improving the error pathway detection in the exit path.

Reported-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: John Snow <jsnow@redhat.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Message-id: 20211111143719.2162525-2-jsnow@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
---
 python/qemu/aqmp/protocol.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/python/qemu/aqmp/protocol.py b/python/qemu/aqmp/protocol.py
index ae1df24026..860b79512d 100644
--- a/python/qemu/aqmp/protocol.py
+++ b/python/qemu/aqmp/protocol.py
@@ -623,13 +623,21 @@ class AsyncProtocol(Generic[T]):
         def _done(task: Optional['asyncio.Future[Any]']) -> bool:
             return task is not None and task.done()
 
-        # NB: We can't rely on _bh_tasks being done() here, it may not
-        #     yet have had a chance to run and gather itself.
+        # Are we already in an error pathway? If either of the tasks are
+        # already done, or if we have no tasks but a reader/writer; we
+        # must be.
+        #
+        # NB: We can't use _bh_tasks to check for premature task
+        # completion, because it may not yet have had a chance to run
+        # and gather itself.
         tasks = tuple(filter(None, (self._writer_task, self._reader_task)))
         error_pathway = _done(self._reader_task) or _done(self._writer_task)
+        if not tasks:
+            error_pathway |= bool(self._reader) or bool(self._writer)
 
         try:
-            # Try to flush the writer, if possible:
+            # Try to flush the writer, if possible.
+            # This *may* cause an error and force us over into the error path.
             if not error_pathway:
                 await self._bh_flush_writer()
         except BaseException as err:
@@ -639,7 +647,7 @@ class AsyncProtocol(Generic[T]):
             self.logger.debug("%s:\n%s\n", emsg, pretty_traceback())
             raise
         finally:
-            # Cancel any still-running tasks:
+            # Cancel any still-running tasks (Won't raise):
             if self._writer_task is not None and not self._writer_task.done():
                 self.logger.debug("Cancelling writer task.")
                 self._writer_task.cancel()
@@ -652,7 +660,7 @@ class AsyncProtocol(Generic[T]):
                 self.logger.debug("Waiting for tasks to complete ...")
                 await asyncio.wait(tasks)
 
-            # Lastly, close the stream itself. (May raise):
+            # Lastly, close the stream itself. (*May raise*!):
             await self._bh_close_stream(error_pathway)
             self.logger.debug("Disconnected.")
 

From 25de7f50121f251c45d111582d786db7ce0768d3 Mon Sep 17 00:00:00 2001
From: John Snow <jsnow@redhat.com>
Date: Thu, 11 Nov 2021 09:37:16 -0500
Subject: [PATCH 2/5] python/aqmp: fix ConnectError string method

When ConnectError is used to wrap an Exception that was initialized
without an error message, we are treated to a traceback with a rubbish
line like this:

... ConnectError: Failed to establish session:

Correct this to use the name of an exception as a fallback message:

... ConnectError: Failed to establish session: EOFError

Better!

Signed-off-by: John Snow <jsnow@redhat.com>
Reported-by: Thomas Huth <thuth@redhat.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Message-id: 20211111143719.2162525-3-jsnow@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
---
 python/qemu/aqmp/protocol.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/qemu/aqmp/protocol.py b/python/qemu/aqmp/protocol.py
index 860b79512d..5190b33b13 100644
--- a/python/qemu/aqmp/protocol.py
+++ b/python/qemu/aqmp/protocol.py
@@ -79,7 +79,11 @@ class ConnectError(AQMPError):
         self.exc: Exception = exc
 
     def __str__(self) -> str:
-        return f"{self.error_message}: {self.exc!s}"
+        cause = str(self.exc)
+        if not cause:
+            # If there's no error string, use the exception name.
+            cause = exception_summary(self.exc)
+        return f"{self.error_message}: {cause}"
 
 
 class StateError(AQMPError):

From 47b43acd57e6e0b2910683f00a758adbc418dde6 Mon Sep 17 00:00:00 2001
From: John Snow <jsnow@redhat.com>
Date: Thu, 11 Nov 2021 09:37:17 -0500
Subject: [PATCH 3/5] scripts/device-crash-test: simplify Exception handling

We don't need to handle KeyboardInterruptError specifically; we can
instead tighten the scope of the broad Exception handlers to only catch
"Exception", which has the effect of allowing all BaseException classes
that do not inherit from Exception to be raised through.

KeyboardInterruptError and a few other important ones are
BaseExceptions, so this does the same thing with less code.

Signed-off-by: John Snow <jsnow@redhat.com>
Reported-by: Thomas Huth <thuth@redhat.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Message-id: 20211111143719.2162525-4-jsnow@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
---
 scripts/device-crash-test | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/scripts/device-crash-test b/scripts/device-crash-test
index 8331c057b8..d91e8616ef 100755
--- a/scripts/device-crash-test
+++ b/scripts/device-crash-test
@@ -317,9 +317,7 @@ class QemuBinaryInfo(object):
         try:
             vm.launch()
             mi['runnable'] = True
-        except KeyboardInterrupt:
-            raise
-        except:
+        except Exception:
             dbg("exception trying to run binary=%s machine=%s", self.binary, machine, exc_info=sys.exc_info())
             dbg("log: %r", vm.get_log())
             mi['runnable'] = False
@@ -360,9 +358,7 @@ def checkOneCase(args, testcase):
     exc_traceback = None
     try:
         vm.launch()
-    except KeyboardInterrupt:
-        raise
-    except:
+    except Exception:
         exc_traceback = traceback.format_exc()
         dbg("Exception while running test case")
     finally:

From 76f86e78b255d17aad2ebd1177069c86f08039ef Mon Sep 17 00:00:00 2001
From: John Snow <jsnow@redhat.com>
Date: Thu, 11 Nov 2021 09:37:18 -0500
Subject: [PATCH 4/5] scripts/device-crash-test: don't emit AQMP connection
 errors to stdout

These errors are expected, so they shouldn't clog up terminal output. In
the event that they're *not* expected, we'll be seeing an awful lot more
output concerning the nature of the failure.

Reported-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: John Snow <jsnow@redhat.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Message-id: 20211111143719.2162525-5-jsnow@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
---
 scripts/device-crash-test | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/device-crash-test b/scripts/device-crash-test
index d91e8616ef..49bcd61b4f 100755
--- a/scripts/device-crash-test
+++ b/scripts/device-crash-test
@@ -499,6 +499,12 @@ def main():
         lvl = logging.WARN
     logging.basicConfig(stream=sys.stdout, level=lvl, format='%(levelname)s: %(message)s')
 
+    if not args.debug:
+        # Async QMP, when in use, is chatty about connection failures.
+        # This script knowingly generates a ton of connection errors.
+        # Silence this logger.
+        logging.getLogger('qemu.aqmp.qmp_client').setLevel(logging.CRITICAL)
+
     fatal_failures = []
     wl_stats = {}
     skipped = 0

From c398a241ec4138e0b995a0215dea84ca93b0384f Mon Sep 17 00:00:00 2001
From: John Snow <jsnow@redhat.com>
Date: Thu, 11 Nov 2021 09:37:19 -0500
Subject: [PATCH 5/5] scripts/device-crash-test: hide tracebacks for QMP
 connect errors

Generally, the traceback for a connection failure is uninteresting and
all we need to know is that the connection attempt failed.

Reduce the verbosity in these cases, except when debugging.

Signed-off-by: John Snow <jsnow@redhat.com>
Reported-by: Thomas Huth <thuth@redhat.com>
Tested-by: Thomas Huth <thuth@redhat.com>
Message-id: 20211111143719.2162525-6-jsnow@redhat.com
Signed-off-by: John Snow <jsnow@redhat.com>
---
 scripts/device-crash-test | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/scripts/device-crash-test b/scripts/device-crash-test
index 49bcd61b4f..3db0ffe5b8 100755
--- a/scripts/device-crash-test
+++ b/scripts/device-crash-test
@@ -36,6 +36,7 @@ from itertools import chain
 
 sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'python'))
 from qemu.machine import QEMUMachine
+from qemu.aqmp import ConnectError
 
 logger = logging.getLogger('device-crash-test')
 dbg = logger.debug
@@ -355,10 +356,12 @@ def checkOneCase(args, testcase):
     dbg("will launch QEMU: %s", cmdline)
     vm = QEMUMachine(binary=binary, args=args)
 
+    exc = None
     exc_traceback = None
     try:
         vm.launch()
-    except Exception:
+    except Exception as this_exc:
+        exc = this_exc
         exc_traceback = traceback.format_exc()
         dbg("Exception while running test case")
     finally:
@@ -366,8 +369,9 @@ def checkOneCase(args, testcase):
         ec = vm.exitcode()
         log = vm.get_log()
 
-    if exc_traceback is not None or ec != 0:
-        return {'exc_traceback':exc_traceback,
+    if exc is not None or ec != 0:
+        return {'exc': exc,
+                'exc_traceback':exc_traceback,
                 'exitcode':ec,
                 'log':log,
                 'testcase':testcase,
@@ -455,6 +459,17 @@ def logFailure(f, level):
     for l in f['log'].strip().split('\n'):
         logger.log(level, "log: %s", l)
     logger.log(level, "exit code: %r", f['exitcode'])
+
+    # If the Exception is merely a QMP connect error,
+    # reduce the logging level for its traceback to
+    # improve visual clarity.
+    if isinstance(f.get('exc'), ConnectError):
+        logger.log(level, "%s.%s: %s",
+                   type(f['exc']).__module__,
+                   type(f['exc']).__qualname__,
+                   str(f['exc']))
+        level = logging.DEBUG
+
     if f['exc_traceback']:
         logger.log(level, "exception:")
         for l in f['exc_traceback'].split('\n'):