diff mbox

[0/3,v4] macvtap driver

Message ID 1265655334.31760.9.camel@w-sridhar.beaverton.ibm.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Sridhar Samudrala Feb. 8, 2010, 6:55 p.m. UTC
On Mon, 2010-02-08 at 09:14 -0800, Ed Swierk wrote:
> > From: Arnd Bergmann <arnd@arndb.de>
> > Date: Sat, 30 Jan 2010 23:22:15 +0100
> >
> >> This is the fourth version of the macvtap driver,
> >> based on the comments I got for the last version
> >> I got a few days ago. Very few changes:
> >>
> >> * release netdev in chardev open function so
> >>   we can destroy it properly.
> >> * Implement TUNSETSNDBUF
> >> * fix sleeping call in rcu_read_lock
> >> * Fix comment in namespace isolation patch
> >> * Fix small context difference to make it apply
> >>   to net-next
> >>
> >> I can't really test here while travelling, so please
> >> give it a go if you're interested in this driver.
> 
> I'm seeing complaints from might_sleep():
> 
> Feb  8 16:21:06 ti102 kernel: BUG: sleeping function called from
> invalid context at include/linux/kernel.h:155
> Feb  8 16:21:06 ti102 kernel: in_atomic(): 1, irqs_disabled(): 0, pid:
> 2881, name: qemu-kvm
> Feb  8 16:21:06 ti102 kernel: Pid: 2881, comm: qemu-kvm Not tainted
> 2.6.29.6.Ar-224527.2009eswierk8 #1
> Feb  8 16:21:06 ti102 kernel: Call Trace:
> Feb  8 16:21:06 ti102 kernel: [<c0119250>] __might_sleep+0xdc/0xe3
> Feb  8 16:21:06 ti102 kernel: [<c0210f7c>] copy_to_user+0x36/0x106
> Feb  8 16:21:06 ti102 kernel: [<c02af568>] memcpy_toiovec+0x2c/0x50
> Feb  8 16:21:06 ti102 kernel: [<c02afbb3>] skb_copy_datagram_iovec+0x47/0x184
> Feb  8 16:21:06 ti102 kernel: [<c034bd07>] ? _spin_unlock_irqrestore+0x17/0x2c
> Feb  8 16:21:06 ti102 kernel: [<f829a776>]
> macvtap_aio_read+0x102/0x158 [macvtap]
> Feb  8 16:21:06 ti102 kernel: [<c011eaf7>] ? default_wake_function+0x0/0xd
> Feb  8 16:21:06 ti102 kernel: [<c016c75f>] do_sync_read+0xab/0xe9
> Feb  8 16:21:06 ti102 kernel: [<c0133933>] ? autoremove_wake_function+0x0/0x33
> Feb  8 16:21:06 ti102 kernel: [<c019211f>] ? eventfd_read+0x121/0x156
> Feb  8 16:21:06 ti102 kernel: [<c011eaf7>] ? default_wake_function+0x0/0xd
> Feb  8 16:21:06 ti102 kernel: [<c016d101>] vfs_read+0xb5/0x129
> Feb  8 16:21:06 ti102 kernel: [<c016d20e>] sys_read+0x3b/0x60
> Feb  8 16:21:06 ti102 kernel: [<c0102e71>] sysenter_do_call+0x12/0x25

I am also seeing this issue with net-next-2.6.
Basically macvtap_put_user() and macvtap_get_user() call copy_to/from_user
from within a RCU read-side critical section.

The following patch fixes this issue by releasing the RCU read lock before
calling these routines, but instead hold a reference to q->sk.

Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>





--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Arnd Bergmann Feb. 10, 2010, 2:48 p.m. UTC | #1
On Monday 08 February 2010, Sridhar Samudrala wrote:
> I am also seeing this issue with net-next-2.6.
> Basically macvtap_put_user() and macvtap_get_user() call copy_to/from_user
> from within a RCU read-side critical section.
> 
> The following patch fixes this issue by releasing the RCU read lock before
> calling these routines, but instead hold a reference to q->sk.
> 
> Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>

Yes, we need something like this, but we also need to protect the
device from going away. The concept right now is to use file_get_queue
to protect both the macvtap_queue and the macvlan_dev from going
away. The sock_hold will keep the macvtap_queue around, but
as far as I can tell, a user could still destroy the macvlan_dev
using netlink at the same time, which still breaks.

>  /* Get packet from user space buffer */
> -static ssize_t macvtap_get_user(struct macvtap_queue *q,
> +static ssize_t macvtap_get_user(struct macvlan_dev *vlan, struct sock *sk,
>                                 const struct iovec *iv, size_t count,
>                                 int noblock)
>  {
> @@ -331,10 +331,10 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
>         if (unlikely(len < ETH_HLEN))
>                 return -EINVAL;
>  
> -       skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
> +       skb = sock_alloc_send_skb(sk, NET_IP_ALIGN + len, noblock, &err);
>  
>         if (!skb) {
> -               macvlan_count_rx(q->vlan, 0, false, false);
> +               macvlan_count_rx(vlan, 0, false, false);
>                 return err;
>         }
>  
> @@ -342,14 +342,14 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q,
>         skb_put(skb, count);
>  
>         if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
> -               macvlan_count_rx(q->vlan, 0, false, false);
> +               macvlan_count_rx(vlan, 0, false, false);
>                 kfree_skb(skb);
>                 return -EFAULT;
>         }
>  
>         skb_set_network_header(skb, ETH_HLEN);
>  
> -       macvlan_start_xmit(skb, q->vlan->dev);
> +       macvlan_start_xmit(skb, vlan->dev);
>  
>         return count;
>  }

What are these changes for? The lifetime of q is the same as &q->sk, so
it won't change anything, right?
Moving the macvlan_count_rx and maxclan_start_xmit under the lock
should be fine though, but we'd have to take it twice then for
each transmit.

I'd hope that this could get simpler by adding zero-copy transmit,
where we first get_user() the whole buffer and do the rest under
rcu_read_lock_bh().

> @@ -393,15 +399,20 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
>  {
>         struct file *file = iocb->ki_filp;
>         struct macvtap_queue *q = macvtap_file_get_queue(file);
> +       struct macvlan_dev *vlan;
> +       struct sock *sk;
>  
>         DECLARE_WAITQUEUE(wait, current);
>         struct sk_buff *skb;
>         ssize_t len, ret = 0;
>  
> -       if (!q) {
> -               ret = -ENOLINK;
> -               goto out;
> -       }
> +       if (!q)
> +               return -ENOLINK;
> +
> +       vlan = q->vlan;
> +       sk = &q->sk;
> +       sock_hold(sk);
> +       macvtap_file_put_queue();

Here, we probably need to prevent vlan from going away by dev_hold(),
not just sock_hold(). Or is one implied by the other?

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sridhar Samudrala Feb. 10, 2010, 6:05 p.m. UTC | #2
On Wed, 2010-02-10 at 15:48 +0100, Arnd Bergmann wrote:
> On Monday 08 February 2010, Sridhar Samudrala wrote:
> > I am also seeing this issue with net-next-2.6.
> > Basically macvtap_put_user() and macvtap_get_user() call copy_to/from_user
> > from within a RCU read-side critical section.
> > 
> > The following patch fixes this issue by releasing the RCU read lock before
> > calling these routines, but instead hold a reference to q->sk.
> > 
> > Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
> 
> Yes, we need something like this, but we also need to protect the
> device from going away. The concept right now is to use file_get_queue
> to protect both the macvtap_queue and the macvlan_dev from going
> away. The sock_hold will keep the macvtap_queue around, but
> as far as I can tell, a user could still destroy the macvlan_dev
> using netlink at the same time, which still breaks.

may be we should do a dev_hold() in macvtap_set_queue() and dev_put()
in macvtap_del_queue() so that the underlying device cannot go away as
long the macvtap fd is open.

Thanks
Sridhar

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Patrick McHardy Feb. 10, 2010, 6:10 p.m. UTC | #3
Sridhar Samudrala wrote:
> On Wed, 2010-02-10 at 15:48 +0100, Arnd Bergmann wrote:
>> On Monday 08 February 2010, Sridhar Samudrala wrote:
>>> I am also seeing this issue with net-next-2.6.
>>> Basically macvtap_put_user() and macvtap_get_user() call copy_to/from_user
>>> from within a RCU read-side critical section.
>>>
>>> The following patch fixes this issue by releasing the RCU read lock before
>>> calling these routines, but instead hold a reference to q->sk.
>>>
>>> Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
>> Yes, we need something like this, but we also need to protect the
>> device from going away. The concept right now is to use file_get_queue
>> to protect both the macvtap_queue and the macvlan_dev from going
>> away. The sock_hold will keep the macvtap_queue around, but
>> as far as I can tell, a user could still destroy the macvlan_dev
>> using netlink at the same time, which still breaks.
> 
> may be we should do a dev_hold() in macvtap_set_queue() and dev_put()
> in macvtap_del_queue() so that the underlying device cannot go away as
> long the macvtap fd is open.

You either need some kind of loose binding (f.i. using the ifindex)
or need to handle the case that the device goes away asynchronously
by indicating an error to the socket and unbinding it.

But you can't make the lifetime of the device dependant on the socket.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index ad1f6ef..e3102ab 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -320,7 +320,7 @@  out:
 }
 
 /* Get packet from user space buffer */
-static ssize_t macvtap_get_user(struct macvtap_queue *q,
+static ssize_t macvtap_get_user(struct macvlan_dev *vlan, struct sock *sk,
 				const struct iovec *iv, size_t count,
 				int noblock)
 {
@@ -331,10 +331,10 @@  static ssize_t macvtap_get_user(struct macvtap_queue *q,
 	if (unlikely(len < ETH_HLEN))
 		return -EINVAL;
 
-	skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err);
+	skb = sock_alloc_send_skb(sk, NET_IP_ALIGN + len, noblock, &err);
 
 	if (!skb) {
-		macvlan_count_rx(q->vlan, 0, false, false);
+		macvlan_count_rx(vlan, 0, false, false);
 		return err;
 	}
 
@@ -342,14 +342,14 @@  static ssize_t macvtap_get_user(struct macvtap_queue *q,
 	skb_put(skb, count);
 
 	if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) {
-		macvlan_count_rx(q->vlan, 0, false, false);
+		macvlan_count_rx(vlan, 0, false, false);
 		kfree_skb(skb);
 		return -EFAULT;
 	}
 
 	skb_set_network_header(skb, ETH_HLEN);
 
-	macvlan_start_xmit(skb, q->vlan->dev);
+	macvlan_start_xmit(skb, vlan->dev);
 
 	return count;
 }
@@ -360,23 +360,29 @@  static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
 	struct file *file = iocb->ki_filp;
 	ssize_t result = -ENOLINK;
 	struct macvtap_queue *q = macvtap_file_get_queue(file);
+	struct macvlan_dev *vlan;
+	struct sock *sk;
 
 	if (!q)
 		goto out;
 
-	result = macvtap_get_user(q, iv, iov_length(iv, count),
+	vlan = q->vlan;
+	sk = &q->sk;
+	sock_hold(sk);
+	macvtap_file_put_queue();
+
+	result = macvtap_get_user(vlan, sk, iv, iov_length(iv, count),
 			      file->f_flags & O_NONBLOCK);
+	sock_put(sk);
 out:
-	macvtap_file_put_queue();
 	return result;
 }
 
 /* Put packet to the user space buffer */
-static ssize_t macvtap_put_user(struct macvtap_queue *q,
+static ssize_t macvtap_put_user(struct macvlan_dev *vlan,
 				const struct sk_buff *skb,
 				const struct iovec *iv, int len)
 {
-	struct macvlan_dev *vlan = q->vlan;
 	int ret;
 
 	len = min_t(int, skb->len, len);
@@ -393,15 +399,20 @@  static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
 {
 	struct file *file = iocb->ki_filp;
 	struct macvtap_queue *q = macvtap_file_get_queue(file);
+	struct macvlan_dev *vlan;
+	struct sock *sk;
 
 	DECLARE_WAITQUEUE(wait, current);
 	struct sk_buff *skb;
 	ssize_t len, ret = 0;
 
-	if (!q) {
-		ret = -ENOLINK;
-		goto out;
-	}
+	if (!q)
+		return -ENOLINK;
+
+	vlan = q->vlan;
+	sk = &q->sk;
+	sock_hold(sk);
+	macvtap_file_put_queue();
 
 	len = iov_length(iv, count);
 	if (len < 0) {
@@ -409,12 +420,12 @@  static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	add_wait_queue(q->sk.sk_sleep, &wait);
+	add_wait_queue(sk->sk_sleep, &wait);
 	while (len) {
 		current->state = TASK_INTERRUPTIBLE;
 
 		/* Read frames from the queue */
-		skb = skb_dequeue(&q->sk.sk_receive_queue);
+		skb = skb_dequeue(&sk->sk_receive_queue);
 		if (!skb) {
 			if (file->f_flags & O_NONBLOCK) {
 				ret = -EAGAIN;
@@ -428,16 +439,16 @@  static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
 			schedule();
 			continue;
 		}
-		ret = macvtap_put_user(q, skb, iv, len);
+		ret = macvtap_put_user(vlan, skb, iv, len);
 		kfree_skb(skb);
 		break;
 	}
 
 	current->state = TASK_RUNNING;
-	remove_wait_queue(q->sk.sk_sleep, &wait);
+	remove_wait_queue(sk->sk_sleep, &wait);
 
 out:
-	macvtap_file_put_queue();
+	sock_put(sk);
 	return ret;
 }