diff mbox series

chardev: race condition with tcp_chr_disconnect

Message ID 1562959033-223586-1-git-send-email-andrey.shinkevich@virtuozzo.com
State New
Headers show
Series chardev: race condition with tcp_chr_disconnect | expand

Commit Message

Andrey Shinkevich July 12, 2019, 7:17 p.m. UTC
When tcp_chr_disconnect() is called, other thread may be still writing
to the channel. This patch protects only read operations that initiate
the disconnection.

Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
---
The segmentation fault occurred because of the race condition when the
write operation was interrupted by the closed channel. The issue can be
reproduced by running the iotest 093 with the parameter read-zeroes=on,
self.vm.add_drive(self.test_img, "file.read-zeroes=on")
The back trace is below:

Thread 3 (Thread 0x7fabe105a700 (LWP 22443)):
#0  0x000055631de53b69 in tcp_chr_free_connection (chr=0x55631f754360) at chardev/char-socket.c:418
#1  0x000055631de53e32 in tcp_chr_disconnect (chr=0x55631f754360) at chardev/char-socket.c:478
#2  0x000055631de53ffa in tcp_chr_read (chan=0x55631f754630, cond=(G_IO_IN | G_IO_HUP), opaque=0x55631f754360) at chardev/char-socket.c:511
#3  0x000055631de6eb6e in qio_channel_fd_source_dispatch (source=0x7fabdc004200, callback=0x55631de53eb4 <tcp_chr_read>, user_data=0x55631f754360) at io/channel-watch.c:84
#4  0x00007fac0481d049 in g_main_context_dispatch () at /lib64/libglib-2.0.so.0
#5  0x00007fac0481d3a8 in g_main_context_iterate.isra.19 () at /lib64/libglib-2.0.so.0
#6  0x00007fac0481d67a in g_main_loop_run () at /lib64/libglib-2.0.so.0
#7  0x000055631dac72cb in iothread_run (opaque=0x55631f784180) at iothread.c:82
#8  0x000055631deda136 in qemu_thread_start (args=0x55631f77bf40) at util/qemu-thread-posix.c:502
#9  0x00007fabec2a9dd5 in start_thread (arg=0x7fabe105a700) at pthread_create.c:307
#10 0x00007fabebfd2ead in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111

Thread 1 (Thread 0x7fac05316d80 (LWP 22373)):
#0  0x000055631dd815e3 in object_get_class (obj=0x0) at qom/object.c:905
#1  0x000055631de6817d in qio_channel_writev_full (ioc=0x0, iov=0x7fff982b6540, niov=1, fds=0x0, nfds=0, errp=0x0) at io/channel.c:77
#2  0x000055631de4f71c in io_channel_send_full (ioc=0x0, buf=0x55631f784e00, len=137, fds=0x0, nfds=0) at chardev/char-io.c:123
#3  0x000055631de5316b in tcp_chr_write (chr=0x55631f754360, buf=0x55631f784e00 "{\"timestamp\": {\"seconds\": 1562851228, \"microseconds\": 27216}, \"event\": \"SHUTDOWN\", \"data\": {\"guest\": false, \"reason\": \"host-qmp-quit\"}}\r\n", len=137) at chardev/char-socket.c:161
#4  0x000055631de4b1e5 in qemu_chr_write_buffer (s=0x55631f754360, buf=0x55631f784e00 "{\"timestamp\": {\"seconds\": 1562851228, \"microseconds\": 27216}, \"event\": \"SHUTDOWN\", \"data\": {\"guest\": false, \"reason\": \"host-qmp-quit\"}}\r\n", len=137, offset=0x7fff982b6630, write_all=false) at chardev/char.c:114
#5  0x000055631de4b354 in qemu_chr_write (s=0x55631f754360, buf=0x55631f784e00 "{\"timestamp\": {\"seconds\": 1562851228, \"microseconds\": 27216}, \"event\": \"SHUTDOWN\", \"data\": {\"guest\": false, \"reason\": \"host-qmp-quit\"}}\r\n", len=137, write_all=false) at chardev/char.c:149
#6  0x000055631de4e358 in qemu_chr_fe_write (be=0x55631f784010, buf=0x55631f784e00 "{\"timestamp\": {\"seconds\": 1562851228, \"microseconds\": 27216}, \"event\": \"SHUTDOWN\", \"data\": {\"guest\": false, \"reason\": \"host-qmp-quit\"}}\r\n", len=137) at chardev/char-fe.c:42
#7  0x000055631dd013f8 in monitor_flush_locked (mon=0x55631f784010) at monitor/monitor.c:124
#8  0x000055631dd015f7 in monitor_puts (mon=0x55631f784010, str=0x5563200c7780 "{\"timestamp\": {\"seconds\": 1562851228, \"microseconds\": 27216}, \"event\": \"SHUTDOWN\", \"data\": {\"guest\": false, \"reason\": \"host-qmp-quit\"}}\n") at monitor/monitor.c:166
#9  0x000055631dd02c76 in qmp_send_response (mon=0x55631f784010, rsp=0x556320136020) at monitor/qmp.c:94
#10 0x000055631dd018ef in monitor_qapi_event_emit (event=QAPI_EVENT_SHUTDOWN, qdict=0x556320136020) at monitor/monitor.c:266
#11 0x000055631dd019e9 in monitor_qapi_event_queue_no_reenter (event=QAPI_EVENT_SHUTDOWN, qdict=0x556320136020) at monitor/monitor.c:291
#12 0x000055631dd01d9d in qapi_event_emit (event=QAPI_EVENT_SHUTDOWN, qdict=0x556320136020) at monitor/monitor.c:366
#13 0x000055631dec4304 in qapi_event_send_shutdown (guest=false, reason=SHUTDOWN_CAUSE_HOST_QMP_QUIT) at qapi/qapi-events-run-state.c:43
#14 0x000055631dad02f2 in qemu_system_shutdown (cause=SHUTDOWN_CAUSE_HOST_QMP_QUIT) at vl.c:1779
#15 0x000055631dad0406 in main_loop_should_exit () at vl.c:1827
#16 0x000055631dad050b in main_loop () at vl.c:1866
#17 0x000055631dad7e27 in main (argc=22, argv=0x7fff982b6c88, envp=0x7fff982b6d40) at vl.c:4547

 chardev/char-socket.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

Comments

Max Reitz July 15, 2019, 5:23 p.m. UTC | #1
On 12.07.19 21:17, Andrey Shinkevich wrote:
> When tcp_chr_disconnect() is called, other thread may be still writing
> to the channel. This patch protects only read operations that initiate
> the disconnection.
> 
> Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
> ---

Have you looked at
https://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg06174.html
already?  From a glance, it looks like that series supersedes this one.

(No, I don’t know why the other series is delayed.  I keep reminding
Paolo of it.)

Max
Paolo Bonzini July 15, 2019, 6:27 p.m. UTC | #2
On 15/07/19 19:23, Max Reitz wrote:
> On 12.07.19 21:17, Andrey Shinkevich wrote:
>> When tcp_chr_disconnect() is called, other thread may be still writing
>> to the channel. This patch protects only read operations that initiate
>> the disconnection.
>>
>> Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
>> ---
> 
> Have you looked at
> https://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg06174.html
> already?  From a glance, it looks like that series supersedes this one.
> 
> (No, I don’t know why the other series is delayed.

Because it broke some testcases in tests/vhost-user-test.  They are
disabled by default, because AFAIR they broke on some CI environment,
but they are supposed to work.

Paolo

> I keep reminding
> Paolo of it.)
> 
> Max
>
Max Reitz July 16, 2019, 7:47 a.m. UTC | #3
On 15.07.19 20:27, Paolo Bonzini wrote:
> On 15/07/19 19:23, Max Reitz wrote:
>> On 12.07.19 21:17, Andrey Shinkevich wrote:
>>> When tcp_chr_disconnect() is called, other thread may be still writing
>>> to the channel. This patch protects only read operations that initiate
>>> the disconnection.
>>>
>>> Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
>>> ---
>>
>> Have you looked at
>> https://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg06174.html
>> already?  From a glance, it looks like that series supersedes this one.
>>
>> (No, I don’t know why the other series is delayed.
> 
> Because it broke some testcases in tests/vhost-user-test.  They are
> disabled by default, because AFAIR they broke on some CI environment,
> but they are supposed to work.

Ah, OK.  Thanks!

Max
Andrey Shinkevich July 16, 2019, 1:02 p.m. UTC | #4
On 15/07/2019 20:23, Max Reitz wrote:
> On 12.07.19 21:17, Andrey Shinkevich wrote:
>> When tcp_chr_disconnect() is called, other thread may be still writing
>> to the channel. This patch protects only read operations that initiate
>> the disconnection.
>>
>> Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
>> ---
> 
> Have you looked at
> https://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg06174.html
> already?  From a glance, it looks like that series supersedes this one.
> 

No, I had missed that series when I tackled to resolve the issue.
Thank you Max for letting me know. Please include me to cc there.
My series resolves the particular case. The tests pass.

Andrey

> (No, I don’t know why the other series is delayed.  I keep reminding
> Paolo of it.)
> 
> Max
>
Andrey Shinkevich July 16, 2019, 1:08 p.m. UTC | #5
On 15/07/2019 21:27, Paolo Bonzini wrote:
> On 15/07/19 19:23, Max Reitz wrote:
>> On 12.07.19 21:17, Andrey Shinkevich wrote:
>>> When tcp_chr_disconnect() is called, other thread may be still writing
>>> to the channel. This patch protects only read operations that initiate
>>> the disconnection.
>>>
>>> Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
>>> ---
>>
>> Have you looked at
>> https://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg06174.html
>> already?  From a glance, it looks like that series supersedes this one.
>>
>> (No, I don’t know why the other series is delayed.
> 
> Because it broke some testcases in tests/vhost-user-test.  They are
> disabled by default, because AFAIR they broke on some CI environment,
> but they are supposed to work.
> 
> Paolo
> 
>> I keep reminding
>> Paolo of it.)
>>
>> Max
>>
> 

The test check-qtest-x86_64: tests/qos-test hangs with the 
QTEST_VHOST_USER_FIXME set even without applying the series:

Thread 4 (Thread 0x2ade7a2bb700 (LWP 492566)):
#0  0x00002ade6f5431c9 in syscall () at 
../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
#1  0x00005599dec08bb6 in qemu_futex_wait (f=0x5599df6651d4 
<rcu_call_ready_event>, val=4294967295) at 
/home/andrey/git/qemu/include/qemu/futex.h:29
#2  0x00005599dec08d7f in qemu_event_wait (ev=0x5599df6651d4 
<rcu_call_ready_event>) at util/qemu-thread-posix.c:442
#3  0x00005599dec21ea1 in call_rcu_thread (opaque=0x0) at util/rcu.c:260
#4  0x00005599dec08f2c in qemu_thread_start (args=0x5599e10568f0) at 
util/qemu-thread-posix.c:502
#5  0x00002ade6f236dd5 in start_thread (arg=0x2ade7a2bb700) at 
pthread_create.c:307
#6  0x00002ade6f548ead in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:111

Thread 3 (Thread 0x2ade7a4bc700 (LWP 492567)):
#0  0x00002ade6f53e20d in poll () at ../sysdeps/unix/syscall-template.S:81
#1  0x00002ade56e7c32c in g_main_context_iterate.isra.19 () at 
/lib64/libglib-2.0.so.0
#2  0x00002ade56e7c67a in g_main_loop_run () at /lib64/libglib-2.0.so.0
#3  0x00005599de7f6772 in iothread_run (opaque=0x5599e1196a30) at 
iothread.c:82
#4  0x00005599dec08f2c in qemu_thread_start (args=0x5599e11a87a0) at 
util/qemu-thread-posix.c:502
#5  0x00002ade6f236dd5 in start_thread (arg=0x2ade7a4bc700) at 
pthread_create.c:307
#6  0x00002ade6f548ead in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:111

Thread 2 (Thread 0x2ade7a7ae700 (LWP 492568)):
#0  0x00002ade6f23e361 in __sigwait (sig=0x2ade7a7ab750, set=<optimized 
out>) at ../sysdeps/unix/sysv/linux/sigwait.c:60
#1  0x00002ade6f23e361 in __sigwait (set=0x2ade7a7ab760, 
sig=0x2ade7a7ab750) at ../sysdeps/unix/sysv/linux/sigwait.c:95
#2  0x00005599de655fee in qemu_dummy_cpu_thread_fn (arg=0x5599e11a9eb0) 
at /home/andrey/git/qemu/cpus.c:1331
#3  0x00005599dec08f2c in qemu_thread_start (args=0x5599e11cd140) at 
util/qemu-thread-posix.c:502
#4  0x00002ade6f236dd5 in start_thread (arg=0x2ade7a7ae700) at 
pthread_create.c:307
#5  0x00002ade6f548ead in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:111

Thread 1 (Thread 0x2ade5661c280 (LWP 492564)):
#0  0x00002ade6f53e2cf in __GI_ppoll (fds=0x5599e20e1360, nfds=9, 
timeout=<optimized out>, sigmask=0x0) at 
../sysdeps/unix/sysv/linux/ppoll.c:56
#1  0x00005599dec0232d in qemu_poll_ns (fds=0x5599e20e1360, nfds=9, 
timeout=43793998964000) at util/qemu-timer.c:334
#2  0x00005599dec03510 in os_host_main_loop_wait 
(timeout=43793998964000) at util/main-loop.c:240
#3  0x00005599dec03634 in main_loop_wait (nonblocking=0) at 
util/main-loop.c:521
#4  0x00005599de7ff439 in main_loop () at vl.c:1791
#5  0x00005599de806dca in main (argc=19, argv=0x7ffe5b3b66a8, 
envp=0x7ffe5b3b6748) at vl.c:4473

Andrey
Paolo Bonzini July 16, 2019, 2:24 p.m. UTC | #6
On 16/07/19 15:08, Andrey Shinkevich wrote:
> The test check-qtest-x86_64: tests/qos-test hangs with the 
> QTEST_VHOST_USER_FIXME set even without applying the series:

Hmm it must have bitrot. :((  I hope I can look at it on Thursday.

Paolo
Andrey Shinkevich July 17, 2019, 8:28 a.m. UTC | #7
On 15/07/2019 21:27, Paolo Bonzini wrote:
> On 15/07/19 19:23, Max Reitz wrote:
>> On 12.07.19 21:17, Andrey Shinkevich wrote:
>>> When tcp_chr_disconnect() is called, other thread may be still writing
>>> to the channel. This patch protects only read operations that initiate
>>> the disconnection.
>>>
>>> Signed-off-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
>>> ---
>>
>> Have you looked at
>> https://lists.nongnu.org/archive/html/qemu-devel/2019-02/msg06174.html
>> already?  From a glance, it looks like that series supersedes this one.
>>
>> (No, I don’t know why the other series is delayed.
> 
> Because it broke some testcases in tests/vhost-user-test.  They are
> disabled by default, because AFAIR they broke on some CI environment,
> but they are supposed to work.
> 
> Paolo
> 
>> I keep reminding
>> Paolo of it.)
>>
>> Max
>>
> 

The test check-qtest-x86_64: tests/qos-test hangs with the 
QTEST_VHOST_USER_FIXME set even without applying the series:

Thread 4 (Thread 0x2ade7a2bb700 (LWP 492566)):
#0  0x00002ade6f5431c9 in syscall () at 
../sysdeps/unix/sysv/linux/x86_64/syscall.S:38
#1  0x00005599dec08bb6 in qemu_futex_wait (f=0x5599df6651d4 
<rcu_call_ready_event>, val=4294967295) at 
/home/andrey/git/qemu/include/qemu/futex.h:29
#2  0x00005599dec08d7f in qemu_event_wait (ev=0x5599df6651d4 
<rcu_call_ready_event>) at util/qemu-thread-posix.c:442
#3  0x00005599dec21ea1 in call_rcu_thread (opaque=0x0) at util/rcu.c:260
#4  0x00005599dec08f2c in qemu_thread_start (args=0x5599e10568f0) at 
util/qemu-thread-posix.c:502
#5  0x00002ade6f236dd5 in start_thread (arg=0x2ade7a2bb700) at 
pthread_create.c:307
#6  0x00002ade6f548ead in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:111

Thread 3 (Thread 0x2ade7a4bc700 (LWP 492567)):
#0  0x00002ade6f53e20d in poll () at ../sysdeps/unix/syscall-template.S:81
#1  0x00002ade56e7c32c in g_main_context_iterate.isra.19 () at 
/lib64/libglib-2.0.so.0
#2  0x00002ade56e7c67a in g_main_loop_run () at /lib64/libglib-2.0.so.0
#3  0x00005599de7f6772 in iothread_run (opaque=0x5599e1196a30) at 
iothread.c:82
#4  0x00005599dec08f2c in qemu_thread_start (args=0x5599e11a87a0) at 
util/qemu-thread-posix.c:502
#5  0x00002ade6f236dd5 in start_thread (arg=0x2ade7a4bc700) at 
pthread_create.c:307
#6  0x00002ade6f548ead in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:111

Thread 2 (Thread 0x2ade7a7ae700 (LWP 492568)):
#0  0x00002ade6f23e361 in __sigwait (sig=0x2ade7a7ab750, set=<optimized 
out>) at ../sysdeps/unix/sysv/linux/sigwait.c:60
#1  0x00002ade6f23e361 in __sigwait (set=0x2ade7a7ab760, 
sig=0x2ade7a7ab750) at ../sysdeps/unix/sysv/linux/sigwait.c:95
#2  0x00005599de655fee in qemu_dummy_cpu_thread_fn (arg=0x5599e11a9eb0) 
at /home/andrey/git/qemu/cpus.c:1331
#3  0x00005599dec08f2c in qemu_thread_start (args=0x5599e11cd140) at 
util/qemu-thread-posix.c:502
#4  0x00002ade6f236dd5 in start_thread (arg=0x2ade7a7ae700) at 
pthread_create.c:307
#5  0x00002ade6f548ead in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:111

Thread 1 (Thread 0x2ade5661c280 (LWP 492564)):
#0  0x00002ade6f53e2cf in __GI_ppoll (fds=0x5599e20e1360, nfds=9, 
timeout=<optimized out>, sigmask=0x0) at 
../sysdeps/unix/sysv/linux/ppoll.c:56
#1  0x00005599dec0232d in qemu_poll_ns (fds=0x5599e20e1360, nfds=9, 
timeout=43793998964000) at util/qemu-timer.c:334
#2  0x00005599dec03510 in os_host_main_loop_wait 
(timeout=43793998964000) at util/main-loop.c:240
#3  0x00005599dec03634 in main_loop_wait (nonblocking=0) at 
util/main-loop.c:521
#4  0x00005599de7ff439 in main_loop () at vl.c:1791
#5  0x00005599de806dca in main (argc=19, argv=0x7ffe5b3b66a8, 
envp=0x7ffe5b3b6748) at vl.c:4473

Andrey
Andrey Shinkevich July 17, 2019, 8:44 a.m. UTC | #8
On 16/07/2019 17:24, Paolo Bonzini wrote:
> On 16/07/19 15:08, Andrey Shinkevich wrote:
>> The test check-qtest-x86_64: tests/qos-test hangs with the
>> QTEST_VHOST_USER_FIXME set even without applying the series:
> 
> Hmm it must have bitrot. :((  I hope I can look at it on Thursday.
> 
> Paolo
> 

Thank you Paolo.
I'm sorry about sending the same message again. There was a mess in my 
Thunderbird with extracting the thread "char-socket: Fix race condition" 
from the archive yesterday. Some messages were failed to download and to 
send.

Andrey
diff mbox series

Patch

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index 7ca5d97..14eb7c0 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -490,6 +490,16 @@  static void tcp_chr_disconnect(Chardev *chr)
     }
 }
 
+/*
+ * Call to allow the write operation to complete
+ */
+static void tcp_chr_disconnect_locked(Chardev *chr)
+{
+    qemu_mutex_lock(&chr->chr_write_lock);
+    tcp_chr_disconnect(chr);
+    qemu_mutex_unlock(&chr->chr_write_lock);
+}
+
 static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
 {
     Chardev *chr = CHARDEV(opaque);
@@ -508,7 +518,7 @@  static gboolean tcp_chr_read(QIOChannel *chan, GIOCondition cond, void *opaque)
     size = tcp_chr_recv(chr, (void *)buf, len);
     if (size == 0 || (size == -1 && errno != EAGAIN)) {
         /* connection closed */
-        tcp_chr_disconnect(chr);
+        tcp_chr_disconnect_locked(chr);
     } else if (size > 0) {
         if (s->do_telnetopt) {
             tcp_chr_process_IAC_bytes(chr, s, buf, &size);
@@ -544,7 +554,7 @@  static int tcp_chr_sync_read(Chardev *chr, const uint8_t *buf, int len)
     qio_channel_set_blocking(s->ioc, false, NULL);
     if (size == 0) {
         /* connection closed */
-        tcp_chr_disconnect(chr);
+        tcp_chr_disconnect_locked(chr);
     }
 
     return size;