diff mbox series

[1/2] qemurunner: Impove stdout logging handling

Message ID 20231218152951.3398964-1-richard.purdie@linuxfoundation.org
State New
Headers show
Series [1/2] qemurunner: Impove stdout logging handling | expand

Commit Message

Richard Purdie Dec. 18, 2023, 3:29 p.m. UTC
We need to ensure we read from and log the output from qemu stdout
as otherwise the buffers can fill and block, leading qemu to hand.
Use our existing logging thread to do this.

Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
---
 meta/lib/oeqa/utils/qemurunner.py | 54 ++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 22 deletions(-)

Comments

Fabio Estevam Dec. 18, 2023, 4:04 p.m. UTC | #1
On Mon, Dec 18, 2023 at 12:30 PM Richard Purdie
<richard.purdie@linuxfoundation.org> wrote:
>
> We need to ensure we read from and log the output from qemu stdout
> as otherwise the buffers can fill and block, leading qemu to hand.
> Use our existing logging thread to do this.

s/Impove/Improve on the Subject line of both patches.
Richard Purdie Dec. 18, 2023, 5:30 p.m. UTC | #2
On Mon, 2023-12-18 at 13:04 -0300, Fabio Estevam wrote:
> On Mon, Dec 18, 2023 at 12:30 PM Richard Purdie
> <richard.purdie@linuxfoundation.org> wrote:
> > 
> > We need to ensure we read from and log the output from qemu stdout
> > as otherwise the buffers can fill and block, leading qemu to hand.
> > Use our existing logging thread to do this.
> 
> s/Impove/Improve on the Subject line of both patches.

Ah, yes, thanks. I've tweaked it on master-next.

Cheers,

Richard
Khem Raj Dec. 18, 2023, 5:45 p.m. UTC | #3
I tried the two patches in this series. It did improve the situation
but I am still getting SSH timeouts. But this time its 13 tests
earlier it used to be 40+
btw. my images are using systemd. So it might be good to see if we see
this with poky-altcfg as well or not.

On Mon, Dec 18, 2023 at 7:29 AM Richard Purdie
<richard.purdie@linuxfoundation.org> wrote:
>
> We need to ensure we read from and log the output from qemu stdout
> as otherwise the buffers can fill and block, leading qemu to hand.
> Use our existing logging thread to do this.
>
> Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org>
> ---
>  meta/lib/oeqa/utils/qemurunner.py | 54 ++++++++++++++++++-------------
>  1 file changed, 32 insertions(+), 22 deletions(-)
>
> diff --git a/meta/lib/oeqa/utils/qemurunner.py b/meta/lib/oeqa/utils/qemurunner.py
> index ee367078000..36bdfb96ae6 100644
> --- a/meta/lib/oeqa/utils/qemurunner.py
> +++ b/meta/lib/oeqa/utils/qemurunner.py
> @@ -29,6 +29,15 @@ control_chars = [chr(x) for x in control_range
>                  if chr(x) not in string.printable]
>  re_control_char = re.compile('[%s]' % re.escape("".join(control_chars)))
>
> +def getOutput(o):
> +    import fcntl
> +    fl = fcntl.fcntl(o, fcntl.F_GETFL)
> +    fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK)
> +    try:
> +        return os.read(o.fileno(), 1000000).decode("utf-8")
> +    except BlockingIOError:
> +        return ""
> +
>  class QemuRunner:
>
>      def __init__(self, machine, rootfs, display, tmpdir, deploy_dir_image, logfile, boottime, dump_dir, use_kvm, logger, use_slirp=False,
> @@ -55,6 +64,7 @@ class QemuRunner:
>          self.boottime = boottime
>          self.logged = False
>          self.thread = None
> +        self.threadsock = None
>          self.use_kvm = use_kvm
>          self.use_ovmf = use_ovmf
>          self.use_slirp = use_slirp
> @@ -119,21 +129,11 @@ class QemuRunner:
>                  f.write(msg)
>          self.msg += self.decode_qemulog(msg)
>
> -    def getOutput(self, o):
> -        import fcntl
> -        fl = fcntl.fcntl(o, fcntl.F_GETFL)
> -        fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK)
> -        try:
> -            return os.read(o.fileno(), 1000000).decode("utf-8")
> -        except BlockingIOError:
> -            return ""
> -
> -
>      def handleSIGCHLD(self, signum, frame):
>          if self.runqemu and self.runqemu.poll():
>              if self.runqemu.returncode:
>                  self.logger.error('runqemu exited with code %d' % self.runqemu.returncode)
> -                self.logger.error('Output from runqemu:\n%s' % self.getOutput(self.runqemu.stdout))
> +                self.logger.error('Output from runqemu:\n%s' % getOutput(self.runqemu.stdout))
>                  self.stop()
>
>      def start(self, qemuparams = None, get_ip = True, extra_bootparams = None, runqemuparams='', launch_cmd=None, discard_writes=True):
> @@ -282,7 +282,7 @@ class QemuRunner:
>                  if self.runqemu.returncode:
>                      # No point waiting any longer
>                      self.logger.warning('runqemu exited with code %d' % self.runqemu.returncode)
> -                    self.logger.warning("Output from runqemu:\n%s" % self.getOutput(output))
> +                    self.logger.warning("Output from runqemu:\n%s" % getOutput(output))
>                      self.stop()
>                      return False
>              time.sleep(0.5)
> @@ -309,7 +309,7 @@ class QemuRunner:
>              ps = subprocess.Popen(['ps', 'axww', '-o', 'pid,ppid,pri,ni,command '], stdout=subprocess.PIPE).communicate()[0]
>              processes = ps.decode("utf-8")
>              self.logger.debug("Running processes:\n%s" % processes)
> -            op = self.getOutput(output)
> +            op = getOutput(output)
>              self.stop()
>              if op:
>                  self.logger.error("Output from runqemu:\n%s" % op)
> @@ -387,7 +387,7 @@ class QemuRunner:
>                             time.time() - connect_time))
>
>          # We are alive: qemu is running
> -        out = self.getOutput(output)
> +        out = getOutput(output)
>          netconf = False # network configuration is not required by default
>          self.logger.debug("qemu started in %.2f seconds - qemu procces pid is %s (%s)" %
>                            (time.time() - (endtime - self.runqemutime),
> @@ -430,9 +430,10 @@ class QemuRunner:
>          self.logger.debug("Target IP: %s" % self.ip)
>          self.logger.debug("Server IP: %s" % self.server_ip)
>
> +        self.thread = LoggingThread(self.log, self.threadsock, self.logger, self.runqemu.stdout)
> +        self.thread.start()
> +
>          if self.serial_ports >= 2:
> -            self.thread = LoggingThread(self.log, self.threadsock, self.logger)
> -            self.thread.start()
>              if not self.thread.connection_established.wait(self.boottime):
>                  self.logger.error("Didn't receive a console connection from qemu. "
>                               "Here is the qemu command line used:\n%s\nand "
> @@ -444,7 +445,7 @@ class QemuRunner:
>          self.logger.debug("Waiting at most %d seconds for login banner (%s)" %
>                            (self.boottime, time.strftime("%D %H:%M:%S")))
>          endtime = time.time() + self.boottime
> -        filelist = [self.server_socket, self.runqemu.stdout]
> +        filelist = [self.server_socket]
>          reachedlogin = False
>          stopread = False
>          qemusock = None
> @@ -564,7 +565,7 @@ class QemuRunner:
>                  self.logger.debug("Sending SIGKILL to runqemu")
>                  os.killpg(os.getpgid(self.runqemu.pid), signal.SIGKILL)
>              if not self.runqemu.stdout.closed:
> -                self.logger.info("Output from runqemu:\n%s" % self.getOutput(self.runqemu.stdout))
> +                self.logger.info("Output from runqemu:\n%s" % getOutput(self.runqemu.stdout))
>              self.runqemu.stdin.close()
>              self.runqemu.stdout.close()
>              self.runqemu_exited = True
> @@ -700,9 +701,11 @@ class QemuRunner:
>  # event loop. The mechanism for stopping the thread is a simple pipe which
>  # will wake up the poll and allow for tearing everything down.
>  class LoggingThread(threading.Thread):
> -    def __init__(self, logfunc, sock, logger):
> +    def __init__(self, logfunc, sock, logger, qemuoutput):
>          self.connection_established = threading.Event()
> +
>          self.serversock = sock
> +        self.qemuoutput = qemuoutput
>          self.logfunc = logfunc
>          self.logger = logger
>          self.readsock = None
> @@ -732,7 +735,8 @@ class LoggingThread(threading.Thread):
>
>      def teardown(self):
>          self.logger.debug("Tearing down logging thread")
> -        self.close_socket(self.serversock)
> +        if self.serversock:
> +            self.close_socket(self.serversock)
>
>          if self.readsock is not None:
>              self.close_socket(self.readsock)
> @@ -747,7 +751,9 @@ class LoggingThread(threading.Thread):
>      def eventloop(self):
>          poll = select.poll()
>          event_read_mask = self.errorevents | self.readevents
> -        poll.register(self.serversock.fileno())
> +        if self.serversock:
> +            poll.register(self.serversock.fileno())
> +        poll.register(self.qemuoutput.fileno())
>          poll.register(self.readpipe, event_read_mask)
>
>          breakout = False
> @@ -767,7 +773,7 @@ class LoggingThread(threading.Thread):
>                      break
>
>                  # A connection request was received
> -                elif self.serversock.fileno() == event[0]:
> +                elif self.serversock and self.serversock.fileno() == event[0]:
>                      self.logger.debug("Connection request received")
>                      self.readsock, _ = self.serversock.accept()
>                      self.readsock.setblocking(0)
> @@ -781,6 +787,10 @@ class LoggingThread(threading.Thread):
>                  elif self.readsock.fileno() == event[0]:
>                      data = self.recv(1024)
>                      self.logfunc(data)
> +                elif self.qemuoutput.fileno() == event[0]:
> +                    data = getOutput(self.qemuoutput)
> +                    self.logger.debug("Data received on qemu stdout %s" % data)
> +                    self.logfunc(data, ".stdout")
>
>      # Since the socket is non-blocking make sure to honor EAGAIN
>      # and EWOULDBLOCK.
> --
> 2.39.2
>
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#192655): https://lists.openembedded.org/g/openembedded-core/message/192655
> Mute This Topic: https://lists.openembedded.org/mt/103244414/1997914
> Group Owner: openembedded-core+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [raj.khem@gmail.com]
> -=-=-=-=-=-=-=-=-=-=-=-
>
Richard Purdie Dec. 18, 2023, 5:58 p.m. UTC | #4
On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> I tried the two patches in this series. It did improve the situation
> but I am still getting SSH timeouts. But this time its 13 tests
> earlier it used to be 40+
> btw. my images are using systemd. So it might be good to see if we see
> this with poky-altcfg as well or not.

Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?

Did you still see rcu stalls in the logs?

Cheers,

Richard
Khem Raj Dec. 18, 2023, 6:07 p.m. UTC | #5
On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie
<richard.purdie@linuxfoundation.org> wrote:
>
> On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> > I tried the two patches in this series. It did improve the situation
> > but I am still getting SSH timeouts. But this time its 13 tests
> > earlier it used to be 40+
> > btw. my images are using systemd. So it might be good to see if we see
> > this with poky-altcfg as well or not.
>
> Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?

yes, further I ran the failing tests in loop one after another still
one test gzip fails with ssh timeouts

https://busybox.net/~kraj/log.do_testimage.503
https://busybox.net/~kraj/testimage/

there are two runs in the testimages folder. In one you see the RCU
stall and in second you do not
but it fails with same ssh timeout issue.

>
> Did you still see rcu stalls in the logs?
>
> Cheers,
>
> Richard
Alex Bennée Dec. 18, 2023, 10:27 p.m. UTC | #6
"Richard Purdie" <richard.purdie@linuxfoundation.org> writes:

> On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
>> I tried the two patches in this series. It did improve the situation
>> but I am still getting SSH timeouts. But this time its 13 tests
>> earlier it used to be 40+
>> btw. my images are using systemd. So it might be good to see if we see
>> this with poky-altcfg as well or not.
>
> Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
>
> Did you still see rcu stalls in the logs?

rcu stalls in the kernel is just a sign of QEMU's relative inefficiency
compared to real HW. However multiple -smp will help alleviate busy
vCPUs if you can. If you want guest time to be a bit more realistic you
can use icount but actual wall clock time will be longer as you can only
have one thread.

>
> Cheers,
>
> Richard
>
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#192674): https://lists.openembedded.org/g/openembedded-core/message/192674
> Mute This Topic: https://lists.openembedded.org/mt/103244414/5085169
> Group Owner: openembedded-core+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [alex.bennee@linaro.org]
> -=-=-=-=-=-=-=-=-=-=-=-
Richard Purdie Dec. 18, 2023, 10:54 p.m. UTC | #7
On Mon, 2023-12-18 at 22:27 +0000, Alex Bennée wrote:
> "Richard Purdie" <richard.purdie@linuxfoundation.org> writes:
> 
> > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> > > I tried the two patches in this series. It did improve the situation
> > > but I am still getting SSH timeouts. But this time its 13 tests
> > > earlier it used to be 40+
> > > btw. my images are using systemd. So it might be good to see if we see
> > > this with poky-altcfg as well or not.
> > 
> > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
> > 
> > Did you still see rcu stalls in the logs?
> 
> rcu stalls in the kernel is just a sign of QEMU's relative inefficiency
> compared to real HW. However multiple -smp will help alleviate busy
> vCPUs if you can. If you want guest time to be a bit more realistic you
> can use icount but actual wall clock time will be longer as you can only
> have one thread.

We have thought that but I realised rcu stalls in the 8250 serial TX
path could mean something quite different, related to 2/2 in this
series.

Cheers,

Richard
Richard Purdie Dec. 18, 2023, 11:01 p.m. UTC | #8
On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote:
> On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie
> <richard.purdie@linuxfoundation.org> wrote:
> > 
> > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> > > I tried the two patches in this series. It did improve the situation
> > > but I am still getting SSH timeouts. But this time its 13 tests
> > > earlier it used to be 40+
> > > btw. my images are using systemd. So it might be good to see if we see
> > > this with poky-altcfg as well or not.
> > 
> > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
> 
> yes, further I ran the failing tests in loop one after another still
> one test gzip fails with ssh timeouts
> 
> https://busybox.net/~kraj/log.do_testimage.503
> https://busybox.net/~kraj/testimage/
> 
> there are two runs in the testimages folder. In one you see the RCU
> stall and in second you do not
> but it fails with same ssh timeout issue.
> 
> > 
> > Did you still see rcu stalls in the logs?

What is interesting is there is ~3MB of nulls in the .2 serial log. The
rcu stall is also:

[   88.261687]  serial8250_tx_chars+0xea/0x2b0
[   88.261689]  serial8250_handle_irq+0x1e9/0x330
[   88.261691]  serial8250_default_handle_irq+0x4a/0x90
[   88.261693]  serial8250_interrupt+0x66/0xc0
[   88.261696]  __handle_irq_event_percpu+0x54/0x1c0
[   88.261701]  handle_irq_event+0x3d/0x80

i.e. it is stalled in the serial TX path.

The big question is why is there so many nulls on the serial port. I
see a few on my local x86 test runs but only ~4kb, not megabytes of
them. I hadn't worked out where/what they are from yet.

I suspect something in the serial/kernel/qemu space isn't interacting
correctly.

Find the cause of the nulls and we might make progress on this.

Cheers,

Richard
Richard Purdie Dec. 19, 2023, 12:03 p.m. UTC | #9
On Mon, 2023-12-18 at 23:01 +0000, Richard Purdie via
lists.openembedded.org wrote:
> On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote:
> > On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie
> > <richard.purdie@linuxfoundation.org> wrote:
> > > 
> > > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> > > > I tried the two patches in this series. It did improve the situation
> > > > but I am still getting SSH timeouts. But this time its 13 tests
> > > > earlier it used to be 40+
> > > > btw. my images are using systemd. So it might be good to see if we see
> > > > this with poky-altcfg as well or not.
> > > 
> > > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
> > 
> > yes, further I ran the failing tests in loop one after another still
> > one test gzip fails with ssh timeouts
> > 
> > https://busybox.net/~kraj/log.do_testimage.503
> > https://busybox.net/~kraj/testimage/
> > 
> > there are two runs in the testimages folder. In one you see the RCU
> > stall and in second you do not
> > but it fails with same ssh timeout issue.
> > 
> > > 
> > > Did you still see rcu stalls in the logs?
> 
> What is interesting is there is ~3MB of nulls in the .2 serial log. The
> rcu stall is also:
> 
> [   88.261687]  serial8250_tx_chars+0xea/0x2b0
> [   88.261689]  serial8250_handle_irq+0x1e9/0x330
> [   88.261691]  serial8250_default_handle_irq+0x4a/0x90
> [   88.261693]  serial8250_interrupt+0x66/0xc0
> [   88.261696]  __handle_irq_event_percpu+0x54/0x1c0
> [   88.261701]  handle_irq_event+0x3d/0x80
> 
> i.e. it is stalled in the serial TX path.
> 
> The big question is why is there so many nulls on the serial port. I
> see a few on my local x86 test runs but only ~4kb, not megabytes of
> them. I hadn't worked out where/what they are from yet.
> 
> I suspect something in the serial/kernel/qemu space isn't interacting
> correctly.
> 
> Find the cause of the nulls and we might make progress on this.

I went digging and it is a mess. That 3MB file is actually a wtmp file
from /var/log. The reason is a failure of the "target_dumper" commands
and that the simplistic serial command handling doesn't like binary
files.

I've a few patches which should improve the situation but we're clearly
multiple issues at play here where issues are stacking on top of other
issues. My rough list of issues:

a) stdout logging from qemu wasn't being read and could overflow
buffers locking qemu
b) serial data from commands wasn't being fully read and could overflow
buffers locking qemu
c) the dumper log reading command is reading binary files like wtmp
d) the dumper logs overwrite each other so are useless anyway
e) the dumper logs run on every failed command
f) qmp monitor dump command also run on every failed command

I think we'd be much better off if we drop the dumper/monitor stuff and
qemu runs might be quite a bit faster too.

Cheers,

Richard
Mikko Rapeli Dec. 19, 2023, 12:12 p.m. UTC | #10
Hi,

On Tue, Dec 19, 2023 at 12:03:07PM +0000, Richard Purdie wrote:
> On Mon, 2023-12-18 at 23:01 +0000, Richard Purdie via
> lists.openembedded.org wrote:
> > On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote:
> > > On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie
> > > <richard.purdie@linuxfoundation.org> wrote:
> > > > 
> > > > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
> > > > > I tried the two patches in this series. It did improve the situation
> > > > > but I am still getting SSH timeouts. But this time its 13 tests
> > > > > earlier it used to be 40+
> > > > > btw. my images are using systemd. So it might be good to see if we see
> > > > > this with poky-altcfg as well or not.
> > > > 
> > > > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
> > > 
> > > yes, further I ran the failing tests in loop one after another still
> > > one test gzip fails with ssh timeouts
> > > 
> > > https://busybox.net/~kraj/log.do_testimage.503
> > > https://busybox.net/~kraj/testimage/
> > > 
> > > there are two runs in the testimages folder. In one you see the RCU
> > > stall and in second you do not
> > > but it fails with same ssh timeout issue.
> > > 
> > > > 
> > > > Did you still see rcu stalls in the logs?
> > 
> > What is interesting is there is ~3MB of nulls in the .2 serial log. The
> > rcu stall is also:
> > 
> > [   88.261687]  serial8250_tx_chars+0xea/0x2b0
> > [   88.261689]  serial8250_handle_irq+0x1e9/0x330
> > [   88.261691]  serial8250_default_handle_irq+0x4a/0x90
> > [   88.261693]  serial8250_interrupt+0x66/0xc0
> > [   88.261696]  __handle_irq_event_percpu+0x54/0x1c0
> > [   88.261701]  handle_irq_event+0x3d/0x80
> > 
> > i.e. it is stalled in the serial TX path.
> > 
> > The big question is why is there so many nulls on the serial port. I
> > see a few on my local x86 test runs but only ~4kb, not megabytes of
> > them. I hadn't worked out where/what they are from yet.
> > 
> > I suspect something in the serial/kernel/qemu space isn't interacting
> > correctly.
> > 
> > Find the cause of the nulls and we might make progress on this.
> 
> I went digging and it is a mess. That 3MB file is actually a wtmp file
> from /var/log. The reason is a failure of the "target_dumper" commands
> and that the simplistic serial command handling doesn't like binary
> files.
> 
> I've a few patches which should improve the situation but we're clearly
> multiple issues at play here where issues are stacking on top of other
> issues. My rough list of issues:
> 
> a) stdout logging from qemu wasn't being read and could overflow
> buffers locking qemu
> b) serial data from commands wasn't being fully read and could overflow
> buffers locking qemu
> c) the dumper log reading command is reading binary files like wtmp
> d) the dumper logs overwrite each other so are useless anyway
> e) the dumper logs run on every failed command
> f) qmp monitor dump command also run on every failed command
> 
> I think we'd be much better off if we drop the dumper/monitor stuff and
> qemu runs might be quite a bit faster too.

I agree to this. If target has serious issues like a full lockup, then dumper and
monitor commands don't help anyway. Capturing the full serial logs would be sufficient,
though I'd like to see a live capture of /var/log/message and systemd journal in case
services started by init have major issues.

Cheers,

-Mikko
Alex Bennée Dec. 19, 2023, 3:25 p.m. UTC | #11
"Richard Purdie" <richard.purdie@linuxfoundation.org> writes:

> On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote:
>> On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie
>> <richard.purdie@linuxfoundation.org> wrote:
>> > 
>> > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote:
>> > > I tried the two patches in this series. It did improve the situation
>> > > but I am still getting SSH timeouts. But this time its 13 tests
>> > > earlier it used to be 40+
>> > > btw. my images are using systemd. So it might be good to see if we see
>> > > this with poky-altcfg as well or not.
>> > 
>> > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files?
>> 
>> yes, further I ran the failing tests in loop one after another still
>> one test gzip fails with ssh timeouts
>> 
>> https://busybox.net/~kraj/log.do_testimage.503
>> https://busybox.net/~kraj/testimage/
>> 
>> there are two runs in the testimages folder. In one you see the RCU
>> stall and in second you do not
>> but it fails with same ssh timeout issue.
>> 
>> > 
>> > Did you still see rcu stalls in the logs?
>
> What is interesting is there is ~3MB of nulls in the .2 serial log. The
> rcu stall is also:
>
> [   88.261687]  serial8250_tx_chars+0xea/0x2b0
> [   88.261689]  serial8250_handle_irq+0x1e9/0x330
> [   88.261691]  serial8250_default_handle_irq+0x4a/0x90
> [   88.261693]  serial8250_interrupt+0x66/0xc0
> [   88.261696]  __handle_irq_event_percpu+0x54/0x1c0
> [   88.261701]  handle_irq_event+0x3d/0x80
>
> i.e. it is stalled in the serial TX path.
>
> The big question is why is there so many nulls on the serial port. I
> see a few on my local x86 test runs but only ~4kb, not megabytes of
> them. I hadn't worked out where/what they are from yet.
>
> I suspect something in the serial/kernel/qemu space isn't interacting
> correctly.

We generally don't model FIFOs in QEMU and also we will block the main
thread if we can't write data out.

>
> Find the cause of the nulls and we might make progress on this.
>
> Cheers,
>
> Richard
>
>
>
> -=-=-=-=-=-=-=-=-=-=-=-
> Links: You receive all messages sent to this group.
> View/Reply Online (#192681): https://lists.openembedded.org/g/openembedded-core/message/192681
> Mute This Topic: https://lists.openembedded.org/mt/103244414/5085169
> Group Owner: openembedded-core+owner@lists.openembedded.org
> Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [alex.bennee@linaro.org]
> -=-=-=-=-=-=-=-=-=-=-=-
diff mbox series

Patch

diff --git a/meta/lib/oeqa/utils/qemurunner.py b/meta/lib/oeqa/utils/qemurunner.py
index ee367078000..36bdfb96ae6 100644
--- a/meta/lib/oeqa/utils/qemurunner.py
+++ b/meta/lib/oeqa/utils/qemurunner.py
@@ -29,6 +29,15 @@  control_chars = [chr(x) for x in control_range
                 if chr(x) not in string.printable]
 re_control_char = re.compile('[%s]' % re.escape("".join(control_chars)))
 
+def getOutput(o):
+    import fcntl
+    fl = fcntl.fcntl(o, fcntl.F_GETFL)
+    fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK)
+    try:
+        return os.read(o.fileno(), 1000000).decode("utf-8")
+    except BlockingIOError:
+        return ""
+
 class QemuRunner:
 
     def __init__(self, machine, rootfs, display, tmpdir, deploy_dir_image, logfile, boottime, dump_dir, use_kvm, logger, use_slirp=False,
@@ -55,6 +64,7 @@  class QemuRunner:
         self.boottime = boottime
         self.logged = False
         self.thread = None
+        self.threadsock = None
         self.use_kvm = use_kvm
         self.use_ovmf = use_ovmf
         self.use_slirp = use_slirp
@@ -119,21 +129,11 @@  class QemuRunner:
                 f.write(msg)
         self.msg += self.decode_qemulog(msg)
 
-    def getOutput(self, o):
-        import fcntl
-        fl = fcntl.fcntl(o, fcntl.F_GETFL)
-        fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK)
-        try:
-            return os.read(o.fileno(), 1000000).decode("utf-8")
-        except BlockingIOError:
-            return ""
-
-
     def handleSIGCHLD(self, signum, frame):
         if self.runqemu and self.runqemu.poll():
             if self.runqemu.returncode:
                 self.logger.error('runqemu exited with code %d' % self.runqemu.returncode)
-                self.logger.error('Output from runqemu:\n%s' % self.getOutput(self.runqemu.stdout))
+                self.logger.error('Output from runqemu:\n%s' % getOutput(self.runqemu.stdout))
                 self.stop()
 
     def start(self, qemuparams = None, get_ip = True, extra_bootparams = None, runqemuparams='', launch_cmd=None, discard_writes=True):
@@ -282,7 +282,7 @@  class QemuRunner:
                 if self.runqemu.returncode:
                     # No point waiting any longer
                     self.logger.warning('runqemu exited with code %d' % self.runqemu.returncode)
-                    self.logger.warning("Output from runqemu:\n%s" % self.getOutput(output))
+                    self.logger.warning("Output from runqemu:\n%s" % getOutput(output))
                     self.stop()
                     return False
             time.sleep(0.5)
@@ -309,7 +309,7 @@  class QemuRunner:
             ps = subprocess.Popen(['ps', 'axww', '-o', 'pid,ppid,pri,ni,command '], stdout=subprocess.PIPE).communicate()[0]
             processes = ps.decode("utf-8")
             self.logger.debug("Running processes:\n%s" % processes)
-            op = self.getOutput(output)
+            op = getOutput(output)
             self.stop()
             if op:
                 self.logger.error("Output from runqemu:\n%s" % op)
@@ -387,7 +387,7 @@  class QemuRunner:
                            time.time() - connect_time))
 
         # We are alive: qemu is running
-        out = self.getOutput(output)
+        out = getOutput(output)
         netconf = False # network configuration is not required by default
         self.logger.debug("qemu started in %.2f seconds - qemu procces pid is %s (%s)" %
                           (time.time() - (endtime - self.runqemutime),
@@ -430,9 +430,10 @@  class QemuRunner:
         self.logger.debug("Target IP: %s" % self.ip)
         self.logger.debug("Server IP: %s" % self.server_ip)
 
+        self.thread = LoggingThread(self.log, self.threadsock, self.logger, self.runqemu.stdout)
+        self.thread.start()
+
         if self.serial_ports >= 2:
-            self.thread = LoggingThread(self.log, self.threadsock, self.logger)
-            self.thread.start()
             if not self.thread.connection_established.wait(self.boottime):
                 self.logger.error("Didn't receive a console connection from qemu. "
                              "Here is the qemu command line used:\n%s\nand "
@@ -444,7 +445,7 @@  class QemuRunner:
         self.logger.debug("Waiting at most %d seconds for login banner (%s)" %
                           (self.boottime, time.strftime("%D %H:%M:%S")))
         endtime = time.time() + self.boottime
-        filelist = [self.server_socket, self.runqemu.stdout]
+        filelist = [self.server_socket]
         reachedlogin = False
         stopread = False
         qemusock = None
@@ -564,7 +565,7 @@  class QemuRunner:
                 self.logger.debug("Sending SIGKILL to runqemu")
                 os.killpg(os.getpgid(self.runqemu.pid), signal.SIGKILL)
             if not self.runqemu.stdout.closed:
-                self.logger.info("Output from runqemu:\n%s" % self.getOutput(self.runqemu.stdout))
+                self.logger.info("Output from runqemu:\n%s" % getOutput(self.runqemu.stdout))
             self.runqemu.stdin.close()
             self.runqemu.stdout.close()
             self.runqemu_exited = True
@@ -700,9 +701,11 @@  class QemuRunner:
 # event loop. The mechanism for stopping the thread is a simple pipe which
 # will wake up the poll and allow for tearing everything down.
 class LoggingThread(threading.Thread):
-    def __init__(self, logfunc, sock, logger):
+    def __init__(self, logfunc, sock, logger, qemuoutput):
         self.connection_established = threading.Event()
+
         self.serversock = sock
+        self.qemuoutput = qemuoutput
         self.logfunc = logfunc
         self.logger = logger
         self.readsock = None
@@ -732,7 +735,8 @@  class LoggingThread(threading.Thread):
 
     def teardown(self):
         self.logger.debug("Tearing down logging thread")
-        self.close_socket(self.serversock)
+        if self.serversock:
+            self.close_socket(self.serversock)
 
         if self.readsock is not None:
             self.close_socket(self.readsock)
@@ -747,7 +751,9 @@  class LoggingThread(threading.Thread):
     def eventloop(self):
         poll = select.poll()
         event_read_mask = self.errorevents | self.readevents
-        poll.register(self.serversock.fileno())
+        if self.serversock:
+            poll.register(self.serversock.fileno())
+        poll.register(self.qemuoutput.fileno())
         poll.register(self.readpipe, event_read_mask)
 
         breakout = False
@@ -767,7 +773,7 @@  class LoggingThread(threading.Thread):
                     break
 
                 # A connection request was received
-                elif self.serversock.fileno() == event[0]:
+                elif self.serversock and self.serversock.fileno() == event[0]:
                     self.logger.debug("Connection request received")
                     self.readsock, _ = self.serversock.accept()
                     self.readsock.setblocking(0)
@@ -781,6 +787,10 @@  class LoggingThread(threading.Thread):
                 elif self.readsock.fileno() == event[0]:
                     data = self.recv(1024)
                     self.logfunc(data)
+                elif self.qemuoutput.fileno() == event[0]:
+                    data = getOutput(self.qemuoutput)
+                    self.logger.debug("Data received on qemu stdout %s" % data)
+                    self.logfunc(data, ".stdout")
 
     # Since the socket is non-blocking make sure to honor EAGAIN
     # and EWOULDBLOCK.