
net: qrtr: ns: Protect radix_tree_deref_slot() using rcu read locks

Message ID 20200926165625.11660-1-manivannan.sadhasivam@linaro.org
State Accepted
Delegated to: David Miller
Series net: qrtr: ns: Protect radix_tree_deref_slot() using rcu read locks

Commit Message

Manivannan Sadhasivam Sept. 26, 2020, 4:56 p.m. UTC
The rcu read locks are needed to avoid a potential race condition while
dereferencing the radix tree from multiple threads. The issue was identified
by syzbot. Below is the crash report:

=============================
WARNING: suspicious RCU usage
5.7.0-syzkaller #0 Not tainted
-----------------------------
include/linux/radix-tree.h:176 suspicious rcu_dereference_check() usage!

other info that might help us debug this:

rcu_scheduler_active = 2, debug_locks = 1
2 locks held by kworker/u4:1/21:
 #0: ffff88821b097938 ((wq_completion)qrtr_ns_handler){+.+.}-{0:0}, at: spin_unlock_irq include/linux/spinlock.h:403 [inline]
 #0: ffff88821b097938 ((wq_completion)qrtr_ns_handler){+.+.}-{0:0}, at: process_one_work+0x6df/0xfd0 kernel/workqueue.c:2241
 #1: ffffc90000dd7d80 ((work_completion)(&qrtr_ns.work)){+.+.}-{0:0}, at: process_one_work+0x71e/0xfd0 kernel/workqueue.c:2243

stack backtrace:
CPU: 0 PID: 21 Comm: kworker/u4:1 Not tainted 5.7.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Workqueue: qrtr_ns_handler qrtr_ns_worker
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1e9/0x30e lib/dump_stack.c:118
 radix_tree_deref_slot include/linux/radix-tree.h:176 [inline]
 ctrl_cmd_new_lookup net/qrtr/ns.c:558 [inline]
 qrtr_ns_worker+0x2aff/0x4500 net/qrtr/ns.c:674
 process_one_work+0x76e/0xfd0 kernel/workqueue.c:2268
 worker_thread+0xa7f/0x1450 kernel/workqueue.c:2414
 kthread+0x353/0x380 kernel/kthread.c:268
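
radix_tree_deref_slot() is documented to require rcu_read_lock() to be
held by the caller. As a minimal sketch, the pattern this patch applies
to each radix tree walk looks like this (variable names as in the diff
below):

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
		srv = radix_tree_deref_slot(slot);
		/* ... use srv while still under the RCU read lock ... */
	}
	rcu_read_unlock();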

Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace")
Reported-and-tested-by: syzbot+0f84f6eed90503da72fc@syzkaller.appspotmail.com
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 net/qrtr/ns.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

Comments

David Miller Sept. 28, 2020, 10:56 p.m. UTC | #1
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Sat, 26 Sep 2020 22:26:25 +0530

> The rcu read locks are needed to avoid a potential race condition while
> dereferencing the radix tree from multiple threads. The issue was identified
> by syzbot. Below is the crash report:
 ...
> Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace")
> Reported-and-tested-by: syzbot+0f84f6eed90503da72fc@syzkaller.appspotmail.com
> Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>

Applied and queued up for -stable, thank you.
Doug Anderson Oct. 1, 2020, 10:53 p.m. UTC | #2
Hi,

On Mon, Sep 28, 2020 at 4:15 PM David Miller <davem@davemloft.net> wrote:
>
> From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> Date: Sat, 26 Sep 2020 22:26:25 +0530
>
> > The rcu read locks are needed to avoid a potential race condition while
> > dereferencing the radix tree from multiple threads. The issue was identified
> > by syzbot. Below is the crash report:
>  ...
> > Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace")
> > Reported-and-tested-by: syzbot+0f84f6eed90503da72fc@syzkaller.appspotmail.com
> > Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
>
> Applied and queued up for -stable, thank you.

The cure is worse than the disease.  I tested by picking this patch back
to a 5.4-based kernel and got this crash.  I expect the crash would also
be present on mainline:

 BUG: sleeping function called from invalid context at net/core/sock.c:3000
 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 7, name: kworker/u16:0
 3 locks held by kworker/u16:0/7:
  #0: ffffff81b65a7b28 ((wq_completion)qrtr_ns_handler){+.+.}, at:
process_one_work+0x1bc/0x614
  #1: ffffff81b6edfd58 ((work_completion)(&qrtr_ns.work)){+.+.}, at:
process_one_work+0x1e4/0x614
  #2: ffffffd01144c328 (rcu_read_lock){....}, at: rcu_lock_acquire+0x8/0x38
 CPU: 6 PID: 7 Comm: kworker/u16:0 Not tainted 5.4.68 #33
 Hardware name: Google Lazor (rev0) with LTE (DT)
 Workqueue: qrtr_ns_handler qrtr_ns_worker
 Call trace:
  dump_backtrace+0x0/0x158
  show_stack+0x20/0x2c
  dump_stack+0xdc/0x180
  ___might_sleep+0x1c0/0x1d0
  __might_sleep+0x50/0x88
  lock_sock_nested+0x34/0x94
  qrtr_sendmsg+0x7c/0x260
  sock_sendmsg+0x44/0x5c
  kernel_sendmsg+0x50/0x64
  lookup_notify+0xa8/0x118
  qrtr_ns_worker+0x8d8/0x1050
  process_one_work+0x338/0x614
  worker_thread+0x29c/0x46c
  kthread+0x150/0x160
  ret_from_fork+0x10/0x18

I'll give the stack crawl from kgdb too since inlining makes things
less obvious with the above...

(gdb) bt
#0  arch_kgdb_breakpoint ()
    at .../arch/arm64/include/asm/kgdb.h:21
#1  kgdb_breakpoint ()
    at .../kernel/debug/debug_core.c:1183
#2  0xffffffd010131058 in ___might_sleep (
    file=file@entry=0xffffffd010efec42 "net/core/sock.c",
    line=line@entry=3000, preempt_offset=preempt_offset@entry=0)
    at .../kernel/sched/core.c:7994
#3  0xffffffd010130ee0 in __might_sleep (
    file=0xffffffd010efec42 "net/core/sock.c", line=3000,
    preempt_offset=0)
    at .../kernel/sched/core.c:7965
#4  0xffffffd01094d1c8 in lock_sock_nested (
    sk=sk@entry=0xffffff8147e457c0, subclass=0)
    at .../net/core/sock.c:3000
#5  0xffffffd010b26028 in lock_sock (sk=0xffffff8147e457c0)
    at .../include/net/sock.h:1536
#6  qrtr_sendmsg (sock=0xffffff8148c4b240, msg=0xffffff81422afab8,
    len=20)
    at .../net/qrtr/qrtr.c:891
#7  0xffffffd01093f8f4 in sock_sendmsg_nosec (
    sock=0xffffff8148c4b240, msg=0xffffff81422afab8)
    at .../net/socket.c:638
#8  sock_sendmsg (sock=sock@entry=0xffffff8148c4b240,
    msg=msg@entry=0xffffff81422afab8)
    at .../net/socket.c:658
#9  0xffffffd01093f95c in kernel_sendmsg (sock=0x1,
    msg=msg@entry=0xffffff81422afab8, vec=<optimized out>,
    vec@entry=0xffffff81422afaa8, num=<optimized out>, num@entry=1,
    size=<optimized out>, size@entry=20)
    at .../net/socket.c:678
#10 0xffffffd010b28be0 in service_announce_new (
    dest=dest@entry=0xffffff81422afc20,
    srv=srv@entry=0xffffff81370f6380)
    at .../net/qrtr/ns.c:127
#11 0xffffffd010b279f4 in announce_servers (sq=0xffffff81422afc20)
    at .../net/qrtr/ns.c:207
#12 ctrl_cmd_hello (sq=0xffffff81422afc20)
    at .../net/qrtr/ns.c:328
#13 qrtr_ns_worker (work=<optimized out>)
    at .../net/qrtr/ns.c:661
#14 0xffffffd010119a94 in process_one_work (
    worker=worker@entry=0xffffff8142267900,
    work=0xffffffd0128ddaf8 <qrtr_ns+48>)
    at .../kernel/workqueue.c:2272
#15 0xffffffd01011a16c in worker_thread (
    __worker=__worker@entry=0xffffff8142267900)
    at .../kernel/workqueue.c:2418
#16 0xffffffd01011fb78 in kthread (_create=0xffffff8142269200)
    at .../kernel/kthread.c:268
#17 0xffffffd01008645c in ret_from_fork ()
    at .../arch/arm64/kernel/entry.S:1169
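
Put differently, the patch now holds the RCU read lock across a path
that can sleep. A minimal sketch of the invalid pattern, with function
names taken from the traces above:

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
		srv = radix_tree_deref_slot(slot);
		/*
		 * service_announce_new() -> kernel_sendmsg() ->
		 * qrtr_sendmsg() -> lock_sock(), which may sleep.
		 * Sleeping inside an RCU read-side critical section
		 * is not allowed, hence the splat.
		 */
		service_announce_new(sq, srv);
	}
	rcu_read_unlock();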


-Doug
Manivannan Sadhasivam Oct. 2, 2020, 7:10 a.m. UTC | #3
Hi Doug,

On Thu, Oct 01, 2020 at 03:53:12PM -0700, Doug Anderson wrote:
> Hi,
> 
> On Mon, Sep 28, 2020 at 4:15 PM David Miller <davem@davemloft.net> wrote:
> >
> > From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> > Date: Sat, 26 Sep 2020 22:26:25 +0530
> >
> > > The rcu read locks are needed to avoid a potential race condition while
> > > dereferencing the radix tree from multiple threads. The issue was identified
> > > by syzbot. Below is the crash report:
> >  ...
> > > Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace")
> > > Reported-and-tested-by: syzbot+0f84f6eed90503da72fc@syzkaller.appspotmail.com
> > > Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
> >
> > Applied and queued up for -stable, thank you.
> 
> The cure is worse than the disease.  I tested by picking this patch back
> to a 5.4-based kernel and got this crash.  I expect the crash would also
> be present on mainline:
>

Thanks for the report! I intended to fix the issue reported by syzbot but
failed to notice the lock_sock() call in the qrtr_sendmsg() function. That
function must not be called inside an RCU read-side critical section, as it
might sleep.

I'll submit a patch to fix this issue asap.
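
One possible shape for such a fix, as a sketch only: finish the radix
tree walk under the RCU read lock and only then call the sleeping send
path. This assumes the entries can be kept alive outside the lock
(e.g. via a reference count); the 'qli' list member is hypothetical:

	struct qrtr_server *srv;
	LIST_HEAD(notify_list);

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
		srv = radix_tree_deref_slot(slot);
		list_add_tail(&srv->qli, &notify_list);	/* hypothetical member */
	}
	rcu_read_unlock();

	/* Outside the RCU section, sleeping is fine again. */
	list_for_each_entry(srv, &notify_list, qli)
		lookup_notify(from, srv, true);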

Thanks,
Mani
 

Patch

diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c
index d8252fdab851..934999b56d60 100644
--- a/net/qrtr/ns.c
+++ b/net/qrtr/ns.c
@@ -193,12 +193,13 @@  static int announce_servers(struct sockaddr_qrtr *sq)
 	struct qrtr_server *srv;
 	struct qrtr_node *node;
 	void __rcu **slot;
-	int ret;
+	int ret = 0;
 
 	node = node_get(qrtr_ns.local_node);
 	if (!node)
 		return 0;
 
+	rcu_read_lock();
 	/* Announce the list of servers registered in this node */
 	radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
 		srv = radix_tree_deref_slot(slot);
@@ -206,11 +207,14 @@  static int announce_servers(struct sockaddr_qrtr *sq)
 		ret = service_announce_new(sq, srv);
 		if (ret < 0) {
 			pr_err("failed to announce new service\n");
-			return ret;
+			goto err_out;
 		}
 	}
 
-	return 0;
+err_out:
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static struct qrtr_server *server_add(unsigned int service,
@@ -335,7 +339,7 @@  static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
 	struct qrtr_node *node;
 	void __rcu **slot;
 	struct kvec iv;
-	int ret;
+	int ret = 0;
 
 	iv.iov_base = &pkt;
 	iv.iov_len = sizeof(pkt);
@@ -344,11 +348,13 @@  static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
 	if (!node)
 		return 0;
 
+	rcu_read_lock();
 	/* Advertise removal of this client to all servers of remote node */
 	radix_tree_for_each_slot(slot, &node->servers, &iter, 0) {
 		srv = radix_tree_deref_slot(slot);
 		server_del(node, srv->port);
 	}
+	rcu_read_unlock();
 
 	/* Advertise the removal of this client to all local servers */
 	local_node = node_get(qrtr_ns.local_node);
@@ -359,6 +365,7 @@  static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
 	pkt.cmd = cpu_to_le32(QRTR_TYPE_BYE);
 	pkt.client.node = cpu_to_le32(from->sq_node);
 
+	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) {
 		srv = radix_tree_deref_slot(slot);
 
@@ -372,11 +379,14 @@  static int ctrl_cmd_bye(struct sockaddr_qrtr *from)
 		ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
 		if (ret < 0) {
 			pr_err("failed to send bye cmd\n");
-			return ret;
+			goto err_out;
 		}
 	}
 
-	return 0;
+err_out:
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
@@ -394,7 +404,7 @@  static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
 	struct list_head *li;
 	void __rcu **slot;
 	struct kvec iv;
-	int ret;
+	int ret = 0;
 
 	iv.iov_base = &pkt;
 	iv.iov_len = sizeof(pkt);
@@ -434,6 +444,7 @@  static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
 	pkt.client.node = cpu_to_le32(node_id);
 	pkt.client.port = cpu_to_le32(port);
 
+	rcu_read_lock();
 	radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) {
 		srv = radix_tree_deref_slot(slot);
 
@@ -447,11 +458,14 @@  static int ctrl_cmd_del_client(struct sockaddr_qrtr *from,
 		ret = kernel_sendmsg(qrtr_ns.sock, &msg, &iv, 1, sizeof(pkt));
 		if (ret < 0) {
 			pr_err("failed to send del client cmd\n");
-			return ret;
+			goto err_out;
 		}
 	}
 
-	return 0;
+err_out:
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static int ctrl_cmd_new_server(struct sockaddr_qrtr *from,
@@ -554,6 +568,7 @@  static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from,
 	filter.service = service;
 	filter.instance = instance;
 
+	rcu_read_lock();
 	radix_tree_for_each_slot(node_slot, &nodes, &node_iter, 0) {
 		node = radix_tree_deref_slot(node_slot);
 
@@ -568,6 +583,7 @@  static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from,
 			lookup_notify(from, srv, true);
 		}
 	}
+	rcu_read_unlock();
 
 	/* Empty notification, to indicate end of listing */
 	lookup_notify(from, NULL, true);