diff mbox

[LEDE-DEV] uloop: use a waker for notifying sigchld and loop cancel events

Message ID 1465438832-20133-1-git-send-email-yszhou4tech@gmail.com
State Accepted
Headers show

Commit Message

Yousong Zhou June 9, 2016, 2:20 a.m. UTC
Fix a race condition when do_sigchld, uloop_cancelled were set just
before epoll_wait(timeout=-1), resulting the loop stuck in the syscall
without noticing the events just happened

Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
---
 uloop-epoll.c  |  2 +-
 uloop-kqueue.c |  2 +-
 uloop.c        | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 63 insertions(+), 6 deletions(-)

Comments

Conor O'Gorman June 15, 2016, 9:39 a.m. UTC | #1
On 09/06/16 03:20, Yousong Zhou wrote:
> Fix a race condition when do_sigchld, uloop_cancelled were set just
> before epoll_wait(timeout=-1), resulting the loop stuck in the syscall
> without noticing the events just happened
>
> Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
> ---
>   uloop-epoll.c  |  2 +-
>   uloop-kqueue.c |  2 +-
>   uloop.c        | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>   3 files changed, 63 insertions(+), 6 deletions(-)
>
Why not change the timeout?
Felix Fietkau June 15, 2016, 9:58 a.m. UTC | #2
On 2016-06-09 04:20, Yousong Zhou wrote:
> Fix a race condition when do_sigchld, uloop_cancelled were set just
> before epoll_wait(timeout=-1), resulting the loop stuck in the syscall
> without noticing the events just happened
> 
> Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
I found a few bugs and inconsistencies in that patch, and pushed a
fixed version here:
https://git.lede-project.org/?p=project/libubox.git;a=commitdiff;h=4e3a47a4cb438866bc4b9cb2f5d16226ffb48502

Please review and test it some more before I push an update to LEDE.

Thanks,

- Felix
Felix Fietkau June 15, 2016, 9:58 a.m. UTC | #3
On 2016-06-15 11:39, Conor O'Gorman wrote:
> On 09/06/16 03:20, Yousong Zhou wrote:
>> Fix a race condition when do_sigchld, uloop_cancelled were set just
>> before epoll_wait(timeout=-1), resulting the loop stuck in the syscall
>> without noticing the events just happened
>>
>> Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
>> ---
>>   uloop-epoll.c  |  2 +-
>>   uloop-kqueue.c |  2 +-
>>   uloop.c        | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>>   3 files changed, 63 insertions(+), 6 deletions(-)
>>
> Why not change the timeout?
Changing the timeout would not fix the race, and it would lead to
unnecessary process wakeups and extra latency for signal processing.

- Felix
Yousong Zhou June 15, 2016, 11:20 a.m. UTC | #4
On 15 June 2016 at 17:58, Felix Fietkau <nbd@nbd.name> wrote:
> On 2016-06-09 04:20, Yousong Zhou wrote:
>> Fix a race condition when do_sigchld, uloop_cancelled were set just
>> before epoll_wait(timeout=-1), resulting the loop stuck in the syscall
>> without noticing the events just happened
>>
>> Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
> I found a few bugs and inconsistencies in that patch, and pushed a
> fixed version here:
> https://git.lede-project.org/?p=project/libubox.git;a=commitdiff;h=4e3a47a4cb438866bc4b9cb2f5d16226ffb48502
>
> Please review and test it some more before I push an update to LEDE.
>
> Thanks,
>
> - Felix

Ack from me.

The patch looks better.  Thanks

                yousong
Mats Karrman June 15, 2016, 1:07 p.m. UTC | #5
On 2016-06-15 13:20, Yousong Zhou wrote:
> On 15 June 2016 at 17:58, Felix Fietkau <nbd@nbd.name> wrote:
>> On 2016-06-09 04:20, Yousong Zhou wrote:
>>> Fix a race condition when do_sigchld, uloop_cancelled were set just
>>> before epoll_wait(timeout=-1), resulting the loop stuck in the syscall
>>> without noticing the events just happened
>>>
>>> Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
>> I found a few bugs and inconsistencies in that patch, and pushed a
>> fixed version here:
>> https://git.lede-project.org/?p=project/libubox.git;a=commitdiff;h=4e3a47a4cb438866bc4b9cb2f5d16226ffb48502
>>
>> Please review and test it some more before I push an update to LEDE.
>>
>> Thanks,
>>
>> - Felix
> Ack from me.
>
> The patch looks better.  Thanks
>
>                  yousong
Tried this version together with procd-2016-03-09 
(b12bb150ed38a4409bef5127c77b060ee616b860)
but I still see the same behavior/error.

BR // Mats
Hans Dedecker June 15, 2016, 1:13 p.m. UTC | #6
On Wed, Jun 15, 2016 at 11:58 AM, Felix Fietkau <nbd@nbd.name> wrote:
> On 2016-06-15 11:39, Conor O'Gorman wrote:
>> On 09/06/16 03:20, Yousong Zhou wrote:
>>> Fix a race condition when do_sigchld, uloop_cancelled were set just
>>> before epoll_wait(timeout=-1), resulting the loop stuck in the syscall
>>> without noticing the events just happened
>>>
>>> Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
>>> ---
>>>   uloop-epoll.c  |  2 +-
>>>   uloop-kqueue.c |  2 +-
>>>   uloop.c        | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>>>   3 files changed, 63 insertions(+), 6 deletions(-)
>>>
>> Why not change the timeout?
> Changing the timeout would not fix the race, and it would lead to
> unnecessary process wakeups and extra latency for signal processing.
>
> - Felix
>
> _______________________________________________
> Lede-dev mailing list
> Lede-dev@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/lede-dev
Could it be that the wiker_pipe does not get properly initialized as
waker_pipe never is assigned a fd value ?

Br,
Hans
Felix Fietkau June 15, 2016, 1:16 p.m. UTC | #7
On 2016-06-15 15:13, Hans Dedecker wrote:
> On Wed, Jun 15, 2016 at 11:58 AM, Felix Fietkau <nbd@nbd.name> wrote:
>> On 2016-06-15 11:39, Conor O'Gorman wrote:
>>> On 09/06/16 03:20, Yousong Zhou wrote:
>>>> Fix a race condition when do_sigchld, uloop_cancelled were set just
>>>> before epoll_wait(timeout=-1), resulting the loop stuck in the syscall
>>>> without noticing the events just happened
>>>>
>>>> Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com>
>>>> ---
>>>>   uloop-epoll.c  |  2 +-
>>>>   uloop-kqueue.c |  2 +-
>>>>   uloop.c        | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>>>>   3 files changed, 63 insertions(+), 6 deletions(-)
>>>>
>>> Why not change the timeout?
>> Changing the timeout would not fix the race, and it would lead to
>> unnecessary process wakeups and extra latency for signal processing.
>>
>> - Felix
>>
>> _______________________________________________
>> Lede-dev mailing list
>> Lede-dev@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/lede-dev
> Could it be that the wiker_pipe does not get properly initialized as
> waker_pipe never is assigned a fd value ?
I accidentally dropped that in a copy&paste screwup. Fix pushed to git.

Thanks,

- Felix
Mats Karrman June 15, 2016, 2:54 p.m. UTC | #8
On 2016-06-15 15:16, Felix Fietkau wrote:
> On 2016-06-15 15:13, Hans Dedecker wrote:  >> On Wed, Jun 15, 2016 at 11:58 AM, Felix Fietkau <nbd@nbd.name> >> 
wrote: >>> On 2016-06-15 11:39, Conor O'Gorman wrote: >>>> On 09/06/16 
03:20, Yousong Zhou wrote: >>>>> Fix a race condition when do_sigchld, 
uloop_cancelled were >>>>> set just before epoll_wait(timeout=-1), 
resulting the loop >>>>> stuck in the syscall without noticing the 
events just >>>>> happened >>>>> >>>>> Signed-off-by: Yousong Zhou 
<yszhou4tech@gmail.com> --- >>>>> uloop-epoll.c  |  2 +- uloop-kqueue.c 
|  2 +- uloop.c >>>>> | 65 >>>>> 
++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 3 >>>>> files 
changed, 63 insertions(+), 6 deletions(-) >>>>> >>>> Why not change the 
timeout? >>> Changing the timeout would not fix the race, and it would 
lead >>> to unnecessary process wakeups and extra latency for signal >>> 
processing. >>> >>> - Felix >>> >>> 
_______________________________________________ Lede-dev mailing >>> 
list Lede-dev@lists.infradead.org >>> 
http://lists.infradead.org/mailman/listinfo/lede-dev >> Could it be that 
the wiker_pipe does not get properly initialized >> as waker_pipe never 
is assigned a fd value ? > I accidentally dropped that in a copy&paste 
screwup. Fix pushed to > git. > > Thanks, > > - Felix
Ran 118 reboots with this fix without any problem so there is a good 
chance that
the latest version of libubox (c2f2c47f3e9a2d709ec82a79f6fadd3124c18781)
finally fixed the issue.
:-) // Mats
Felix Fietkau June 15, 2016, 3:01 p.m. UTC | #9
On 2016-06-15 16:54, Mats Karrman wrote:
> Ran 118 reboots with this fix without any problem so there is a good 
> chance that
> the latest version of libubox (c2f2c47f3e9a2d709ec82a79f6fadd3124c18781)
> finally fixed the issue.
> :-) // Mats
Pushed out the update to the LEDE tree, thanks for testing.

- Felix
Hans Dedecker June 16, 2016, 7:26 a.m. UTC | #10
On Wed, Jun 15, 2016 at 4:54 PM, Mats Karrman <mats.dev.list@gmail.com> wrote:
>
> On 2016-06-15 15:16, Felix Fietkau wrote:
>>
>> On 2016-06-15 15:13, Hans Dedecker wrote:  >> On Wed, Jun 15, 2016 at
>> 11:58 AM, Felix Fietkau <nbd@nbd.name> >>
>
> wrote: >>> On 2016-06-15 11:39, Conor O'Gorman wrote: >>>> On 09/06/16
> 03:20, Yousong Zhou wrote: >>>>> Fix a race condition when do_sigchld,
> uloop_cancelled were >>>>> set just before epoll_wait(timeout=-1), resulting
> the loop >>>>> stuck in the syscall without noticing the events just >>>>>
> happened >>>>> >>>>> Signed-off-by: Yousong Zhou <yszhou4tech@gmail.com> ---
>>>>>> uloop-epoll.c  |  2 +- uloop-kqueue.c |  2 +- uloop.c >>>>> | 65 >>>>>
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 3 >>>>> files
> changed, 63 insertions(+), 6 deletions(-) >>>>> >>>> Why not change the
> timeout? >>> Changing the timeout would not fix the race, and it would lead
>>>> to unnecessary process wakeups and extra latency for signal >>>
> processing. >>> >>> - Felix >>> >>>
> _______________________________________________ Lede-dev mailing >>> list
> Lede-dev@lists.infradead.org >>>
> http://lists.infradead.org/mailman/listinfo/lede-dev >> Could it be that the
> wiker_pipe does not get properly initialized >> as waker_pipe never is
> assigned a fd value ? > I accidentally dropped that in a copy&paste screwup.
> Fix pushed to > git. > > Thanks, > > - Felix
> Ran 118 reboots with this fix without any problem so there is a good chance
> that
> the latest version of libubox (c2f2c47f3e9a2d709ec82a79f6fadd3124c18781)
> finally fixed the issue.
> :-) // Mats
>
Same result here after a night of testing; no hanging reboots observed anymore.

Thx,
Hans
diff mbox

Patch

diff --git a/uloop-epoll.c b/uloop-epoll.c
index bb652fd..6014bea 100644
--- a/uloop-epoll.c
+++ b/uloop-epoll.c
@@ -23,7 +23,7 @@ 
 #define EPOLLRDHUP 0x2000
 #endif
 
-int uloop_init(void)
+static int uloop_init_pollfd(void)
 {
 	if (poll_fd >= 0)
 		return 0;
diff --git a/uloop-kqueue.c b/uloop-kqueue.c
index 0cb1c14..ba5595b 100644
--- a/uloop-kqueue.c
+++ b/uloop-kqueue.c
@@ -15,7 +15,7 @@ 
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
-int uloop_init(void)
+static int uloop_init_pollfd(void)
 {
 	struct timespec timeout = { 0, 0 };
 	struct kevent ev = {};
diff --git a/uloop.c b/uloop.c
index cd3de85..b59c31f 100644
--- a/uloop.c
+++ b/uloop.c
@@ -63,6 +63,11 @@  static bool do_sigchld = false;
 static struct uloop_fd_event cur_fds[ULOOP_MAX_EVENTS];
 static int cur_fd, cur_nfds;
 
+static int waker_pipe[2] = {-1, -1};
+static struct uloop_fd waker_fd;
+
+int uloop_fd_add(struct uloop_fd *sock, unsigned int flags);
+
 #ifdef USE_KQUEUE
 #include "uloop-kqueue.c"
 #endif
@@ -71,6 +76,45 @@  static int cur_fd, cur_nfds;
 #include "uloop-epoll.c"
 #endif
 
+static void waker_consume(struct uloop_fd *fd, unsigned int events)
+{
+	char buf[4];
+
+	while (read(fd->fd, buf, 4) > 0)
+		;
+}
+
+static int waker_init(void)
+{
+	if (waker_pipe[0] >= 0 && waker_pipe[1] >= 0)
+		return 0;
+
+	if (pipe(waker_pipe) < 0)
+		return -1;
+
+	fcntl(waker_pipe[0], F_SETFD, fcntl(waker_pipe[0], F_GETFD) | O_NONBLOCK);
+	fcntl(waker_pipe[1], F_SETFD, fcntl(waker_pipe[1], F_GETFD) | O_NONBLOCK);
+
+	waker_fd.fd = waker_pipe[0];
+	waker_fd.cb = waker_consume;
+	uloop_fd_add(&waker_fd, ULOOP_READ);
+
+	return 0;
+}
+
+int uloop_init(void)
+{
+	if (uloop_init_pollfd() < 0)
+		return -1;
+
+	if (waker_init() < 0) {
+		close(poll_fd);
+		poll_fd = -1;
+		return -1;
+	}
+	return 0;
+}
+
 static bool uloop_fd_stack_event(struct uloop_fd *fd, int events)
 {
 	struct uloop_fd_stack *cur;
@@ -330,12 +374,18 @@  static void uloop_handle_processes(void)
 
 static void uloop_handle_sigint(int signo)
 {
+	char buf[1] = {'w'};
+
 	uloop_cancelled = true;
+	write(waker_pipe[1], buf, 1);
 }
 
 static void uloop_sigchld(int signo)
 {
+	char buf[1] = {'w'};
+
 	do_sigchld = true;
+	write(waker_pipe[1], buf, 1);
 }
 
 static void uloop_install_handler(int signum, void (*handler)(int), struct sigaction* old, bool add)
@@ -477,11 +527,18 @@  void uloop_run(void)
 
 void uloop_done(void)
 {
-	if (poll_fd < 0)
-		return;
+	int i;
 
-	close(poll_fd);
-	poll_fd = -1;
+	if (poll_fd >= 0) {
+		close(poll_fd);
+		poll_fd = -1;
+	}
+	for (i = 0; i < ARRAY_SIZE(waker_pipe); i++) {
+		if (waker_pipe[i] >= 0) {
+			close(waker_pipe[i]);
+			waker_pipe[i] = -1;
+		}
+	}
 
 	uloop_clear_timeouts();
 	uloop_clear_processes();