[net-next,RFC,2/2] tun: enable napi_gro_frags() for TUN/TAP driver

Message ID 20170905223551.27925-3-ppenkov@google.com
State RFC, archived
Delegated to: David Miller
Series: Improve code coverage of syzkaller

Commit Message

Petar Penkov Sept. 5, 2017, 10:35 p.m. UTC
Add a TUN/TAP receive mode that exercises the napi_gro_frags()
interface. This mode is available only in TAP mode, as the interface
expects packets with Ethernet headers.

Furthermore, packets follow the layout of the iov_iter that was
received. The first iovec is the linear data, and every one after the
first is a fragment. If there are more fragments than the max number,
drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
dissector code and to verify that the header resides in the linear data.

The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
This requirement is imposed because the mode is intended for testing via tools like
syzkaller and packetdrill, and the increased flexibility it provides can
introduce security vulnerabilities.
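
For illustration, here is a minimal userspace sketch of how a fuzzer or
test tool might drive this mode. It is not part of the patch: the device
name "tap0" and the payload bytes are placeholders, and the writev()
only succeeds once the interface has been brought up.

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/uio.h>
	#include <unistd.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>

	int main(void)
	{
		struct ifreq ifr = { 0 };
		unsigned char hdr[14] = { 0 };	/* Ethernet header -> linear data */
		unsigned char frag[64] = { 0 };	/* payload -> one skb fragment */
		struct iovec iov[2] = {
			{ .iov_base = hdr,  .iov_len = sizeof(hdr) },
			{ .iov_base = frag, .iov_len = sizeof(frag) },
		};
		int fd = open("/dev/net/tun", O_RDWR);

		if (fd < 0)
			return 1;
		strcpy(ifr.ifr_name, "tap0");
		/* Without CONFIG_TUN_NAPI, or in IFF_TUN mode, the kernel
		 * silently clears IFF_NAPI_FRAGS rather than failing.
		 */
		ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI_FRAGS;
		if (ioctl(fd, TUNSETIFF, &ifr) < 0)
			return 1;
		/* Every iovec after the first becomes a separate skb
		 * fragment, so this write reaches napi_gro_frags() with
		 * one fragment.
		 */
		writev(fd, iov, 2);
		close(fd);
		return 0;
	}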

Signed-off-by: Petar Penkov <ppenkov@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: davem@davemloft.net
Cc: ppenkov@stanford.edu
---
 drivers/net/tun.c           | 135 ++++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 130 insertions(+), 6 deletions(-)

Comments

Eric Dumazet Sept. 6, 2017, 6:01 p.m. UTC | #1
On Tue, 2017-09-05 at 15:35 -0700, Petar Penkov wrote:
> Add a TUN/TAP receive mode that exercises the napi_gro_frags()
> interface. This mode is available only in TAP mode, as the interface
> expects packets with Ethernet headers.
> 


Hi Petar, thanks a lot for this work.

I must confess I have to retract a piece of feedback I gave while
reviewing your patches.


> +		local_bh_disable();
> +		data = napi_alloc_frag(fragsz);
> +		local_bh_enable();
> +		if (!data) {
> +			err = -ENOMEM;
> +			goto free;
> +		}
> +
> +		page = virt_to_page(data);
> +		offset = offset_in_page(data);

The two lines above indeed trigger too many problems in the kernel
(like the one you tried to address here:
https://patchwork.kernel.org/patch/9927927/).

For your next submission, please use the code you originally had:

	page = virt_to_head_page(data);
	offset = data - page_address(page);


Thanks!
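
For context: napi_alloc_frag() typically carves fragments out of a
compound (higher-order) page. For an address past the first PAGE_SIZE
bytes of such an allocation, virt_to_page() resolves to a tail page,
whereas the skb fragment machinery expects a reference on the head
page. A rough sketch of the recommended pattern, using the variable
names from the hunk quoted above:

	/* data was returned by napi_alloc_frag() and may point into
	 * the middle of a compound allocation.
	 */
	page = virt_to_head_page(data);		/* always the head page */
	offset = data - page_address(page);	/* may exceed PAGE_SIZE */
	skb_fill_page_desc(skb, i - 1, page, offset, fragsz);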

Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d5c824e3ec42..2ba9809ab6cd 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,7 @@ 
 #include <linux/skb_array.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/mutex.h>
 
 #include <linux/uaccess.h>
 
@@ -120,8 +121,15 @@  do {								\
 #define TUN_VNET_LE     0x80000000
 #define TUN_VNET_BE     0x40000000
 
+#if IS_ENABLED(CONFIG_TUN_NAPI)
+#define TUN_FEATURES_EXTRA IFF_NAPI_FRAGS
+#else
+#define TUN_FEATURES_EXTRA 0
+#endif
+
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
-		      IFF_MULTI_QUEUE)
+		      IFF_MULTI_QUEUE | TUN_FEATURES_EXTRA)
+
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -173,6 +181,7 @@  struct tun_file {
 		unsigned int ifindex;
 	};
 	struct napi_struct napi;
+	struct mutex napi_mutex;	/* Protects access to the above napi */
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
@@ -276,6 +285,7 @@  static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile)
 		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
 			       NAPI_POLL_WEIGHT);
 		napi_enable(&tfile->napi);
+		mutex_init(&tfile->napi_mutex);
 	}
 }
 
@@ -291,6 +301,11 @@  static void tun_napi_del(struct tun_file *tfile)
 		netif_napi_del(&tfile->napi);
 }
 
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+	return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -1034,7 +1049,8 @@  static void tun_poll_controller(struct net_device *dev)
 	 * supports polling, which enables bridge devices in virt setups to
 	 * still use netconsole
 	 * If NAPI is enabled, however, we need to schedule polling for all
-	 * queues.
+	 * queues unless we are using napi_gro_frags(), which we call in
+	 * process context and not in NAPI context.
 	 */
 
 	if (IS_ENABLED(CONFIG_TUN_NAPI)) {
@@ -1042,6 +1058,9 @@  static void tun_poll_controller(struct net_device *dev)
 		struct tun_file *tfile;
 		int i;
 
+		if (tun_napi_frags_enabled(tun))
+			return;
+
 		rcu_read_lock();
 		for (i = 0; i < tun->numqueues; i++) {
 			tfile = rcu_dereference(tun->tfiles[i]);
@@ -1264,6 +1283,64 @@  static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+					    size_t len,
+					    const struct iov_iter *it)
+{
+	struct sk_buff *skb;
+	size_t linear;
+	int err;
+	int i;
+
+	if (it->nr_segs > MAX_SKB_FRAGS + 1)
+		return ERR_PTR(-ENOMEM);
+
+	local_bh_disable();
+	skb = napi_get_frags(&tfile->napi);
+	local_bh_enable();
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	linear = iov_iter_single_seg_count(it);
+	err = __skb_grow(skb, linear);
+	if (err)
+		goto free;
+
+	skb->len = len;
+	skb->data_len = len - linear;
+	skb->truesize += skb->data_len;
+
+	for (i = 1; i < it->nr_segs; i++) {
+		size_t fragsz = it->iov[i].iov_len;
+		unsigned long offset;
+		struct page *page;
+		void *data;
+
+		if (fragsz == 0 || fragsz > PAGE_SIZE) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		local_bh_disable();
+		data = napi_alloc_frag(fragsz);
+		local_bh_enable();
+		if (!data) {
+			err = -ENOMEM;
+			goto free;
+		}
+
+		page = virt_to_page(data);
+		offset = offset_in_page(data);
+		skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+	}
+
+	return skb;
+free:
+	/* frees skb and all frags allocated with napi_alloc_frag() */
+	napi_free_frags(&tfile->napi);
+	return ERR_PTR(err);
+}
+
 /* prepad is the amount to reserve at front.  len is length after that.
  * linear is a hint as to how much to copy (usually headers). */
 static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
@@ -1466,6 +1543,7 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int err;
 	u32 rxhash;
 	int generic_xdp = 1;
+	bool frags = tun_napi_frags_enabled(tun);
 
 	if (!(tun->dev->flags & IFF_UP))
 		return -EIO;
@@ -1523,7 +1601,7 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			zerocopy = true;
 	}
 
-	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+	if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
 		skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
 		if (IS_ERR(skb)) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
@@ -1540,10 +1618,24 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 				linear = tun16_to_cpu(tun, gso.hdr_len);
 		}
 
-		skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+		if (frags) {
+			mutex_lock(&tfile->napi_mutex);
+			skb = tun_napi_alloc_frags(tfile, copylen, from);
+			/* tun_napi_alloc_frags() enforces a layout for the skb.
+			 * If zerocopy is enabled, then this layout will be
+			 * overwritten by zerocopy_sg_from_iter().
+			 */
+			zerocopy = false;
+		} else {
+			skb = tun_alloc_skb(tfile, align, copylen, linear,
+					    noblock);
+		}
+
 		if (IS_ERR(skb)) {
 			if (PTR_ERR(skb) != -EAGAIN)
 				this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			if (frags)
+				mutex_unlock(&tfile->napi_mutex);
 			return PTR_ERR(skb);
 		}
 
@@ -1555,6 +1647,11 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		if (err) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
 			kfree_skb(skb);
+			if (frags) {
+				tfile->napi.skb = NULL;
+				mutex_unlock(&tfile->napi_mutex);
+			}
+
 			return -EFAULT;
 		}
 	}
@@ -1562,6 +1659,11 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
 		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
 		kfree_skb(skb);
+		if (frags) {
+			tfile->napi.skb = NULL;
+			mutex_unlock(&tfile->napi_mutex);
+		}
+
 		return -EINVAL;
 	}
 
@@ -1587,7 +1689,8 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		skb->dev = tun->dev;
 		break;
 	case IFF_TAP:
-		skb->protocol = eth_type_trans(skb, tun->dev);
+		if (!frags)
+			skb->protocol = eth_type_trans(skb, tun->dev);
 		break;
 	}
 
@@ -1622,7 +1725,23 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
 	rxhash = __skb_get_hash_symmetric(skb);
 
-	if (IS_ENABLED(CONFIG_TUN_NAPI)) {
+	if (frags) {
+		/* Exercise flow dissector code path. */
+		u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+
+		if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
+			this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			napi_free_frags(&tfile->napi);
+			mutex_unlock(&tfile->napi_mutex);
+			WARN_ON(1);
+			return -ENOMEM;
+		}
+
+		local_bh_disable();
+		napi_gro_frags(&tfile->napi);
+		local_bh_enable();
+		mutex_unlock(&tfile->napi_mutex);
+	} else if (IS_ENABLED(CONFIG_TUN_NAPI)) {
 		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
 		int queue_len;
 
@@ -2168,6 +2287,10 @@  static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	tun->flags = (tun->flags & ~TUN_FEATURES) |
 		(ifr->ifr_flags & TUN_FEATURES);
 
+	if (!IS_ENABLED(CONFIG_TUN_NAPI) ||
+	    (tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+		tun->flags = tun->flags & ~IFF_NAPI_FRAGS;
+
 	/* Make sure persistent devices do not get stuck in
 	 * xoff state.
 	 */
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 3cb5e1d85ddd..1eb1eb42f151 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -60,6 +60,7 @@ 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
+#define IFF_NAPI_FRAGS	0x0010
 #define IFF_NO_PI	0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE	0x2000