Message ID | 20231218152951.3398964-1-richard.purdie@linuxfoundation.org |
---|---|
State | New |
Headers | show |
Series | [1/2] qemurunner: Impove stdout logging handling | expand |
On Mon, Dec 18, 2023 at 12:30 PM Richard Purdie <richard.purdie@linuxfoundation.org> wrote: > > We need to ensure we read from and log the output from qemu stdout > as otherwise the buffers can fill and block, leading qemu to hand. > Use our existing logging thread to do this. s/Impove/Improve on the Subject line of both patches.
On Mon, 2023-12-18 at 13:04 -0300, Fabio Estevam wrote: > On Mon, Dec 18, 2023 at 12:30 PM Richard Purdie > <richard.purdie@linuxfoundation.org> wrote: > > > > We need to ensure we read from and log the output from qemu stdout > > as otherwise the buffers can fill and block, leading qemu to hand. > > Use our existing logging thread to do this. > > s/Impove/Improve on the Subject line of both patches. Ah, yes, thanks. I've tweaked it on master-next. Cheers, Richard
I tried the two patches in this series. It did improve the situation but I am still getting SSH timeouts. But this time its 13 tests earlier it used to be 40+ btw. my images are using systemd. So it might be good to see if we see this with poky-altcfg as well or not. On Mon, Dec 18, 2023 at 7:29 AM Richard Purdie <richard.purdie@linuxfoundation.org> wrote: > > We need to ensure we read from and log the output from qemu stdout > as otherwise the buffers can fill and block, leading qemu to hand. > Use our existing logging thread to do this. > > Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org> > --- > meta/lib/oeqa/utils/qemurunner.py | 54 ++++++++++++++++++------------- > 1 file changed, 32 insertions(+), 22 deletions(-) > > diff --git a/meta/lib/oeqa/utils/qemurunner.py b/meta/lib/oeqa/utils/qemurunner.py > index ee367078000..36bdfb96ae6 100644 > --- a/meta/lib/oeqa/utils/qemurunner.py > +++ b/meta/lib/oeqa/utils/qemurunner.py > @@ -29,6 +29,15 @@ control_chars = [chr(x) for x in control_range > if chr(x) not in string.printable] > re_control_char = re.compile('[%s]' % re.escape("".join(control_chars))) > > +def getOutput(o): > + import fcntl > + fl = fcntl.fcntl(o, fcntl.F_GETFL) > + fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK) > + try: > + return os.read(o.fileno(), 1000000).decode("utf-8") > + except BlockingIOError: > + return "" > + > class QemuRunner: > > def __init__(self, machine, rootfs, display, tmpdir, deploy_dir_image, logfile, boottime, dump_dir, use_kvm, logger, use_slirp=False, > @@ -55,6 +64,7 @@ class QemuRunner: > self.boottime = boottime > self.logged = False > self.thread = None > + self.threadsock = None > self.use_kvm = use_kvm > self.use_ovmf = use_ovmf > self.use_slirp = use_slirp > @@ -119,21 +129,11 @@ class QemuRunner: > f.write(msg) > self.msg += self.decode_qemulog(msg) > > - def getOutput(self, o): > - import fcntl > - fl = fcntl.fcntl(o, fcntl.F_GETFL) > - fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK) > - try: > - return os.read(o.fileno(), 1000000).decode("utf-8") > - except BlockingIOError: > - return "" > - > - > def handleSIGCHLD(self, signum, frame): > if self.runqemu and self.runqemu.poll(): > if self.runqemu.returncode: > self.logger.error('runqemu exited with code %d' % self.runqemu.returncode) > - self.logger.error('Output from runqemu:\n%s' % self.getOutput(self.runqemu.stdout)) > + self.logger.error('Output from runqemu:\n%s' % getOutput(self.runqemu.stdout)) > self.stop() > > def start(self, qemuparams = None, get_ip = True, extra_bootparams = None, runqemuparams='', launch_cmd=None, discard_writes=True): > @@ -282,7 +282,7 @@ class QemuRunner: > if self.runqemu.returncode: > # No point waiting any longer > self.logger.warning('runqemu exited with code %d' % self.runqemu.returncode) > - self.logger.warning("Output from runqemu:\n%s" % self.getOutput(output)) > + self.logger.warning("Output from runqemu:\n%s" % getOutput(output)) > self.stop() > return False > time.sleep(0.5) > @@ -309,7 +309,7 @@ class QemuRunner: > ps = subprocess.Popen(['ps', 'axww', '-o', 'pid,ppid,pri,ni,command '], stdout=subprocess.PIPE).communicate()[0] > processes = ps.decode("utf-8") > self.logger.debug("Running processes:\n%s" % processes) > - op = self.getOutput(output) > + op = getOutput(output) > self.stop() > if op: > self.logger.error("Output from runqemu:\n%s" % op) > @@ -387,7 +387,7 @@ class QemuRunner: > time.time() - connect_time)) > > # We are alive: qemu is running > - out = self.getOutput(output) > + out = getOutput(output) > netconf = False # network configuration is not required by default > self.logger.debug("qemu started in %.2f seconds - qemu procces pid is %s (%s)" % > (time.time() - (endtime - self.runqemutime), > @@ -430,9 +430,10 @@ class QemuRunner: > self.logger.debug("Target IP: %s" % self.ip) > self.logger.debug("Server IP: %s" % self.server_ip) > > + self.thread = LoggingThread(self.log, self.threadsock, self.logger, self.runqemu.stdout) > + self.thread.start() > + > if self.serial_ports >= 2: > - self.thread = LoggingThread(self.log, self.threadsock, self.logger) > - self.thread.start() > if not self.thread.connection_established.wait(self.boottime): > self.logger.error("Didn't receive a console connection from qemu. " > "Here is the qemu command line used:\n%s\nand " > @@ -444,7 +445,7 @@ class QemuRunner: > self.logger.debug("Waiting at most %d seconds for login banner (%s)" % > (self.boottime, time.strftime("%D %H:%M:%S"))) > endtime = time.time() + self.boottime > - filelist = [self.server_socket, self.runqemu.stdout] > + filelist = [self.server_socket] > reachedlogin = False > stopread = False > qemusock = None > @@ -564,7 +565,7 @@ class QemuRunner: > self.logger.debug("Sending SIGKILL to runqemu") > os.killpg(os.getpgid(self.runqemu.pid), signal.SIGKILL) > if not self.runqemu.stdout.closed: > - self.logger.info("Output from runqemu:\n%s" % self.getOutput(self.runqemu.stdout)) > + self.logger.info("Output from runqemu:\n%s" % getOutput(self.runqemu.stdout)) > self.runqemu.stdin.close() > self.runqemu.stdout.close() > self.runqemu_exited = True > @@ -700,9 +701,11 @@ class QemuRunner: > # event loop. The mechanism for stopping the thread is a simple pipe which > # will wake up the poll and allow for tearing everything down. > class LoggingThread(threading.Thread): > - def __init__(self, logfunc, sock, logger): > + def __init__(self, logfunc, sock, logger, qemuoutput): > self.connection_established = threading.Event() > + > self.serversock = sock > + self.qemuoutput = qemuoutput > self.logfunc = logfunc > self.logger = logger > self.readsock = None > @@ -732,7 +735,8 @@ class LoggingThread(threading.Thread): > > def teardown(self): > self.logger.debug("Tearing down logging thread") > - self.close_socket(self.serversock) > + if self.serversock: > + self.close_socket(self.serversock) > > if self.readsock is not None: > self.close_socket(self.readsock) > @@ -747,7 +751,9 @@ class LoggingThread(threading.Thread): > def eventloop(self): > poll = select.poll() > event_read_mask = self.errorevents | self.readevents > - poll.register(self.serversock.fileno()) > + if self.serversock: > + poll.register(self.serversock.fileno()) > + poll.register(self.qemuoutput.fileno()) > poll.register(self.readpipe, event_read_mask) > > breakout = False > @@ -767,7 +773,7 @@ class LoggingThread(threading.Thread): > break > > # A connection request was received > - elif self.serversock.fileno() == event[0]: > + elif self.serversock and self.serversock.fileno() == event[0]: > self.logger.debug("Connection request received") > self.readsock, _ = self.serversock.accept() > self.readsock.setblocking(0) > @@ -781,6 +787,10 @@ class LoggingThread(threading.Thread): > elif self.readsock.fileno() == event[0]: > data = self.recv(1024) > self.logfunc(data) > + elif self.qemuoutput.fileno() == event[0]: > + data = getOutput(self.qemuoutput) > + self.logger.debug("Data received on qemu stdout %s" % data) > + self.logfunc(data, ".stdout") > > # Since the socket is non-blocking make sure to honor EAGAIN > # and EWOULDBLOCK. > -- > 2.39.2 > > > -=-=-=-=-=-=-=-=-=-=-=- > Links: You receive all messages sent to this group. > View/Reply Online (#192655): https://lists.openembedded.org/g/openembedded-core/message/192655 > Mute This Topic: https://lists.openembedded.org/mt/103244414/1997914 > Group Owner: openembedded-core+owner@lists.openembedded.org > Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [raj.khem@gmail.com] > -=-=-=-=-=-=-=-=-=-=-=- >
On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote: > I tried the two patches in this series. It did improve the situation > but I am still getting SSH timeouts. But this time its 13 tests > earlier it used to be 40+ > btw. my images are using systemd. So it might be good to see if we see > this with poky-altcfg as well or not. Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files? Did you still see rcu stalls in the logs? Cheers, Richard
On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie <richard.purdie@linuxfoundation.org> wrote: > > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote: > > I tried the two patches in this series. It did improve the situation > > but I am still getting SSH timeouts. But this time its 13 tests > > earlier it used to be 40+ > > btw. my images are using systemd. So it might be good to see if we see > > this with poky-altcfg as well or not. > > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files? yes, further I ran the failing tests in loop one after another still one test gzip fails with ssh timeouts https://busybox.net/~kraj/log.do_testimage.503 https://busybox.net/~kraj/testimage/ there are two runs in the testimages folder. In one you see the RCU stall and in second you do not but it fails with same ssh timeout issue. > > Did you still see rcu stalls in the logs? > > Cheers, > > Richard
"Richard Purdie" <richard.purdie@linuxfoundation.org> writes: > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote: >> I tried the two patches in this series. It did improve the situation >> but I am still getting SSH timeouts. But this time its 13 tests >> earlier it used to be 40+ >> btw. my images are using systemd. So it might be good to see if we see >> this with poky-altcfg as well or not. > > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files? > > Did you still see rcu stalls in the logs? rcu stalls in the kernel is just a sign of QEMU's relative inefficiency compared to real HW. However multiple -smp will help alleviate busy vCPUs if you can. If you want guest time to be a bit more realistic you can use icount but actual wall clock time will be longer as you can only have one thread. > > Cheers, > > Richard > > > -=-=-=-=-=-=-=-=-=-=-=- > Links: You receive all messages sent to this group. > View/Reply Online (#192674): https://lists.openembedded.org/g/openembedded-core/message/192674 > Mute This Topic: https://lists.openembedded.org/mt/103244414/5085169 > Group Owner: openembedded-core+owner@lists.openembedded.org > Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [alex.bennee@linaro.org] > -=-=-=-=-=-=-=-=-=-=-=-
On Mon, 2023-12-18 at 22:27 +0000, Alex Bennée wrote: > "Richard Purdie" <richard.purdie@linuxfoundation.org> writes: > > > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote: > > > I tried the two patches in this series. It did improve the situation > > > but I am still getting SSH timeouts. But this time its 13 tests > > > earlier it used to be 40+ > > > btw. my images are using systemd. So it might be good to see if we see > > > this with poky-altcfg as well or not. > > > > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files? > > > > Did you still see rcu stalls in the logs? > > rcu stalls in the kernel is just a sign of QEMU's relative inefficiency > compared to real HW. However multiple -smp will help alleviate busy > vCPUs if you can. If you want guest time to be a bit more realistic you > can use icount but actual wall clock time will be longer as you can only > have one thread. We have thought that but I realised rcu stalls in the 8250 serial TX path could mean something quite different, related to 2/2 in this series. Cheers, Richard
On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote: > On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie > <richard.purdie@linuxfoundation.org> wrote: > > > > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote: > > > I tried the two patches in this series. It did improve the situation > > > but I am still getting SSH timeouts. But this time its 13 tests > > > earlier it used to be 40+ > > > btw. my images are using systemd. So it might be good to see if we see > > > this with poky-altcfg as well or not. > > > > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files? > > yes, further I ran the failing tests in loop one after another still > one test gzip fails with ssh timeouts > > https://busybox.net/~kraj/log.do_testimage.503 > https://busybox.net/~kraj/testimage/ > > there are two runs in the testimages folder. In one you see the RCU > stall and in second you do not > but it fails with same ssh timeout issue. > > > > > Did you still see rcu stalls in the logs? What is interesting is there is ~3MB of nulls in the .2 serial log. The rcu stall is also: [ 88.261687] serial8250_tx_chars+0xea/0x2b0 [ 88.261689] serial8250_handle_irq+0x1e9/0x330 [ 88.261691] serial8250_default_handle_irq+0x4a/0x90 [ 88.261693] serial8250_interrupt+0x66/0xc0 [ 88.261696] __handle_irq_event_percpu+0x54/0x1c0 [ 88.261701] handle_irq_event+0x3d/0x80 i.e. it is stalled in the serial TX path. The big question is why is there so many nulls on the serial port. I see a few on my local x86 test runs but only ~4kb, not megabytes of them. I hadn't worked out where/what they are from yet. I suspect something in the serial/kernel/qemu space isn't interacting correctly. Find the cause of the nulls and we might make progress on this. Cheers, Richard
On Mon, 2023-12-18 at 23:01 +0000, Richard Purdie via lists.openembedded.org wrote: > On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote: > > On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie > > <richard.purdie@linuxfoundation.org> wrote: > > > > > > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote: > > > > I tried the two patches in this series. It did improve the situation > > > > but I am still getting SSH timeouts. But this time its 13 tests > > > > earlier it used to be 40+ > > > > btw. my images are using systemd. So it might be good to see if we see > > > > this with poky-altcfg as well or not. > > > > > > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files? > > > > yes, further I ran the failing tests in loop one after another still > > one test gzip fails with ssh timeouts > > > > https://busybox.net/~kraj/log.do_testimage.503 > > https://busybox.net/~kraj/testimage/ > > > > there are two runs in the testimages folder. In one you see the RCU > > stall and in second you do not > > but it fails with same ssh timeout issue. > > > > > > > > Did you still see rcu stalls in the logs? > > What is interesting is there is ~3MB of nulls in the .2 serial log. The > rcu stall is also: > > [ 88.261687] serial8250_tx_chars+0xea/0x2b0 > [ 88.261689] serial8250_handle_irq+0x1e9/0x330 > [ 88.261691] serial8250_default_handle_irq+0x4a/0x90 > [ 88.261693] serial8250_interrupt+0x66/0xc0 > [ 88.261696] __handle_irq_event_percpu+0x54/0x1c0 > [ 88.261701] handle_irq_event+0x3d/0x80 > > i.e. it is stalled in the serial TX path. > > The big question is why is there so many nulls on the serial port. I > see a few on my local x86 test runs but only ~4kb, not megabytes of > them. I hadn't worked out where/what they are from yet. > > I suspect something in the serial/kernel/qemu space isn't interacting > correctly. > > Find the cause of the nulls and we might make progress on this. I went digging and it is a mess. That 3MB file is actually a wtmp file from /var/log. The reason is a failure of the "target_dumper" commands and that the simplistic serial command handling doesn't like binary files. I've a few patches which should improve the situation but we're clearly multiple issues at play here where issues are stacking on top of other issues. My rough list of issues: a) stdout logging from qemu wasn't being read and could overflow buffers locking qemu b) serial data from commands wasn't being fully read and could overflow buffers locking qemu c) the dumper log reading command is reading binary files like wtmp d) the dumper logs overwrite each other so are useless anyway e) the dumper logs run on every failed command f) qmp monitor dump command also run on every failed command I think we'd be much better off if we drop the dumper/monitor stuff and qemu runs might be quite a bit faster too. Cheers, Richard
Hi, On Tue, Dec 19, 2023 at 12:03:07PM +0000, Richard Purdie wrote: > On Mon, 2023-12-18 at 23:01 +0000, Richard Purdie via > lists.openembedded.org wrote: > > On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote: > > > On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie > > > <richard.purdie@linuxfoundation.org> wrote: > > > > > > > > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote: > > > > > I tried the two patches in this series. It did improve the situation > > > > > but I am still getting SSH timeouts. But this time its 13 tests > > > > > earlier it used to be 40+ > > > > > btw. my images are using systemd. So it might be good to see if we see > > > > > this with poky-altcfg as well or not. > > > > > > > > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files? > > > > > > yes, further I ran the failing tests in loop one after another still > > > one test gzip fails with ssh timeouts > > > > > > https://busybox.net/~kraj/log.do_testimage.503 > > > https://busybox.net/~kraj/testimage/ > > > > > > there are two runs in the testimages folder. In one you see the RCU > > > stall and in second you do not > > > but it fails with same ssh timeout issue. > > > > > > > > > > > Did you still see rcu stalls in the logs? > > > > What is interesting is there is ~3MB of nulls in the .2 serial log. The > > rcu stall is also: > > > > [ 88.261687] serial8250_tx_chars+0xea/0x2b0 > > [ 88.261689] serial8250_handle_irq+0x1e9/0x330 > > [ 88.261691] serial8250_default_handle_irq+0x4a/0x90 > > [ 88.261693] serial8250_interrupt+0x66/0xc0 > > [ 88.261696] __handle_irq_event_percpu+0x54/0x1c0 > > [ 88.261701] handle_irq_event+0x3d/0x80 > > > > i.e. it is stalled in the serial TX path. > > > > The big question is why is there so many nulls on the serial port. I > > see a few on my local x86 test runs but only ~4kb, not megabytes of > > them. I hadn't worked out where/what they are from yet. > > > > I suspect something in the serial/kernel/qemu space isn't interacting > > correctly. > > > > Find the cause of the nulls and we might make progress on this. > > I went digging and it is a mess. That 3MB file is actually a wtmp file > from /var/log. The reason is a failure of the "target_dumper" commands > and that the simplistic serial command handling doesn't like binary > files. > > I've a few patches which should improve the situation but we're clearly > multiple issues at play here where issues are stacking on top of other > issues. My rough list of issues: > > a) stdout logging from qemu wasn't being read and could overflow > buffers locking qemu > b) serial data from commands wasn't being fully read and could overflow > buffers locking qemu > c) the dumper log reading command is reading binary files like wtmp > d) the dumper logs overwrite each other so are useless anyway > e) the dumper logs run on every failed command > f) qmp monitor dump command also run on every failed command > > I think we'd be much better off if we drop the dumper/monitor stuff and > qemu runs might be quite a bit faster too. I agree to this. If target has serious issues like a full lockup, then dumper and monitor commands don't help anyway. Capturing the full serial logs would be sufficient, though I'd like to see a live capture of /var/log/message and systemd journal in case services started by init have major issues. Cheers, -Mikko
"Richard Purdie" <richard.purdie@linuxfoundation.org> writes: > On Mon, 2023-12-18 at 10:07 -0800, Khem Raj wrote: >> On Mon, Dec 18, 2023 at 9:58 AM Richard Purdie >> <richard.purdie@linuxfoundation.org> wrote: >> > >> > On Mon, 2023-12-18 at 09:45 -0800, Khem Raj wrote: >> > > I tried the two patches in this series. It did improve the situation >> > > but I am still getting SSH timeouts. But this time its 13 tests >> > > earlier it used to be 40+ >> > > btw. my images are using systemd. So it might be good to see if we see >> > > this with poky-altcfg as well or not. >> > >> > Do you have the log.do_testimage and the ${WORKDIR}/testimage/ files? >> >> yes, further I ran the failing tests in loop one after another still >> one test gzip fails with ssh timeouts >> >> https://busybox.net/~kraj/log.do_testimage.503 >> https://busybox.net/~kraj/testimage/ >> >> there are two runs in the testimages folder. In one you see the RCU >> stall and in second you do not >> but it fails with same ssh timeout issue. >> >> > >> > Did you still see rcu stalls in the logs? > > What is interesting is there is ~3MB of nulls in the .2 serial log. The > rcu stall is also: > > [ 88.261687] serial8250_tx_chars+0xea/0x2b0 > [ 88.261689] serial8250_handle_irq+0x1e9/0x330 > [ 88.261691] serial8250_default_handle_irq+0x4a/0x90 > [ 88.261693] serial8250_interrupt+0x66/0xc0 > [ 88.261696] __handle_irq_event_percpu+0x54/0x1c0 > [ 88.261701] handle_irq_event+0x3d/0x80 > > i.e. it is stalled in the serial TX path. > > The big question is why is there so many nulls on the serial port. I > see a few on my local x86 test runs but only ~4kb, not megabytes of > them. I hadn't worked out where/what they are from yet. > > I suspect something in the serial/kernel/qemu space isn't interacting > correctly. We generally don't model FIFOs in QEMU and also we will block the main thread if we can't write data out. > > Find the cause of the nulls and we might make progress on this. > > Cheers, > > Richard > > > > -=-=-=-=-=-=-=-=-=-=-=- > Links: You receive all messages sent to this group. > View/Reply Online (#192681): https://lists.openembedded.org/g/openembedded-core/message/192681 > Mute This Topic: https://lists.openembedded.org/mt/103244414/5085169 > Group Owner: openembedded-core+owner@lists.openembedded.org > Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [alex.bennee@linaro.org] > -=-=-=-=-=-=-=-=-=-=-=-
diff --git a/meta/lib/oeqa/utils/qemurunner.py b/meta/lib/oeqa/utils/qemurunner.py index ee367078000..36bdfb96ae6 100644 --- a/meta/lib/oeqa/utils/qemurunner.py +++ b/meta/lib/oeqa/utils/qemurunner.py @@ -29,6 +29,15 @@ control_chars = [chr(x) for x in control_range if chr(x) not in string.printable] re_control_char = re.compile('[%s]' % re.escape("".join(control_chars))) +def getOutput(o): + import fcntl + fl = fcntl.fcntl(o, fcntl.F_GETFL) + fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK) + try: + return os.read(o.fileno(), 1000000).decode("utf-8") + except BlockingIOError: + return "" + class QemuRunner: def __init__(self, machine, rootfs, display, tmpdir, deploy_dir_image, logfile, boottime, dump_dir, use_kvm, logger, use_slirp=False, @@ -55,6 +64,7 @@ class QemuRunner: self.boottime = boottime self.logged = False self.thread = None + self.threadsock = None self.use_kvm = use_kvm self.use_ovmf = use_ovmf self.use_slirp = use_slirp @@ -119,21 +129,11 @@ class QemuRunner: f.write(msg) self.msg += self.decode_qemulog(msg) - def getOutput(self, o): - import fcntl - fl = fcntl.fcntl(o, fcntl.F_GETFL) - fcntl.fcntl(o, fcntl.F_SETFL, fl | os.O_NONBLOCK) - try: - return os.read(o.fileno(), 1000000).decode("utf-8") - except BlockingIOError: - return "" - - def handleSIGCHLD(self, signum, frame): if self.runqemu and self.runqemu.poll(): if self.runqemu.returncode: self.logger.error('runqemu exited with code %d' % self.runqemu.returncode) - self.logger.error('Output from runqemu:\n%s' % self.getOutput(self.runqemu.stdout)) + self.logger.error('Output from runqemu:\n%s' % getOutput(self.runqemu.stdout)) self.stop() def start(self, qemuparams = None, get_ip = True, extra_bootparams = None, runqemuparams='', launch_cmd=None, discard_writes=True): @@ -282,7 +282,7 @@ class QemuRunner: if self.runqemu.returncode: # No point waiting any longer self.logger.warning('runqemu exited with code %d' % self.runqemu.returncode) - self.logger.warning("Output from runqemu:\n%s" % self.getOutput(output)) + self.logger.warning("Output from runqemu:\n%s" % getOutput(output)) self.stop() return False time.sleep(0.5) @@ -309,7 +309,7 @@ class QemuRunner: ps = subprocess.Popen(['ps', 'axww', '-o', 'pid,ppid,pri,ni,command '], stdout=subprocess.PIPE).communicate()[0] processes = ps.decode("utf-8") self.logger.debug("Running processes:\n%s" % processes) - op = self.getOutput(output) + op = getOutput(output) self.stop() if op: self.logger.error("Output from runqemu:\n%s" % op) @@ -387,7 +387,7 @@ class QemuRunner: time.time() - connect_time)) # We are alive: qemu is running - out = self.getOutput(output) + out = getOutput(output) netconf = False # network configuration is not required by default self.logger.debug("qemu started in %.2f seconds - qemu procces pid is %s (%s)" % (time.time() - (endtime - self.runqemutime), @@ -430,9 +430,10 @@ class QemuRunner: self.logger.debug("Target IP: %s" % self.ip) self.logger.debug("Server IP: %s" % self.server_ip) + self.thread = LoggingThread(self.log, self.threadsock, self.logger, self.runqemu.stdout) + self.thread.start() + if self.serial_ports >= 2: - self.thread = LoggingThread(self.log, self.threadsock, self.logger) - self.thread.start() if not self.thread.connection_established.wait(self.boottime): self.logger.error("Didn't receive a console connection from qemu. " "Here is the qemu command line used:\n%s\nand " @@ -444,7 +445,7 @@ class QemuRunner: self.logger.debug("Waiting at most %d seconds for login banner (%s)" % (self.boottime, time.strftime("%D %H:%M:%S"))) endtime = time.time() + self.boottime - filelist = [self.server_socket, self.runqemu.stdout] + filelist = [self.server_socket] reachedlogin = False stopread = False qemusock = None @@ -564,7 +565,7 @@ class QemuRunner: self.logger.debug("Sending SIGKILL to runqemu") os.killpg(os.getpgid(self.runqemu.pid), signal.SIGKILL) if not self.runqemu.stdout.closed: - self.logger.info("Output from runqemu:\n%s" % self.getOutput(self.runqemu.stdout)) + self.logger.info("Output from runqemu:\n%s" % getOutput(self.runqemu.stdout)) self.runqemu.stdin.close() self.runqemu.stdout.close() self.runqemu_exited = True @@ -700,9 +701,11 @@ class QemuRunner: # event loop. The mechanism for stopping the thread is a simple pipe which # will wake up the poll and allow for tearing everything down. class LoggingThread(threading.Thread): - def __init__(self, logfunc, sock, logger): + def __init__(self, logfunc, sock, logger, qemuoutput): self.connection_established = threading.Event() + self.serversock = sock + self.qemuoutput = qemuoutput self.logfunc = logfunc self.logger = logger self.readsock = None @@ -732,7 +735,8 @@ class LoggingThread(threading.Thread): def teardown(self): self.logger.debug("Tearing down logging thread") - self.close_socket(self.serversock) + if self.serversock: + self.close_socket(self.serversock) if self.readsock is not None: self.close_socket(self.readsock) @@ -747,7 +751,9 @@ class LoggingThread(threading.Thread): def eventloop(self): poll = select.poll() event_read_mask = self.errorevents | self.readevents - poll.register(self.serversock.fileno()) + if self.serversock: + poll.register(self.serversock.fileno()) + poll.register(self.qemuoutput.fileno()) poll.register(self.readpipe, event_read_mask) breakout = False @@ -767,7 +773,7 @@ class LoggingThread(threading.Thread): break # A connection request was received - elif self.serversock.fileno() == event[0]: + elif self.serversock and self.serversock.fileno() == event[0]: self.logger.debug("Connection request received") self.readsock, _ = self.serversock.accept() self.readsock.setblocking(0) @@ -781,6 +787,10 @@ class LoggingThread(threading.Thread): elif self.readsock.fileno() == event[0]: data = self.recv(1024) self.logfunc(data) + elif self.qemuoutput.fileno() == event[0]: + data = getOutput(self.qemuoutput) + self.logger.debug("Data received on qemu stdout %s" % data) + self.logfunc(data, ".stdout") # Since the socket is non-blocking make sure to honor EAGAIN # and EWOULDBLOCK.
We need to ensure we read from and log the output from qemu stdout as otherwise the buffers can fill and block, leading qemu to hand. Use our existing logging thread to do this. Signed-off-by: Richard Purdie <richard.purdie@linuxfoundation.org> --- meta/lib/oeqa/utils/qemurunner.py | 54 ++++++++++++++++++------------- 1 file changed, 32 insertions(+), 22 deletions(-)