diff mbox

[1/4] net: add support for per-paged-fragment destructors

Message ID 1320850927-30240-1-git-send-email-ian.campbell@citrix.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Ian Campbell Nov. 9, 2011, 3:02 p.m. UTC
Entities which care about the complete lifecycle of pages which they inject
into the network stack via an skb paged fragment can choose to set this
destructor in order to receive a callback when the stack is really finished
with a page (including all clones, retransmits, pull-ups etc etc).

This destructor will always be propagated alongside the struct page when
copying skb_frag_t->page. This is the reason I chose to embed the destructor in
a "struct { } page" within the skb_frag_t, rather than as a separate field,
since it allows existing code which propagates ->frags[N].page to Just
Work(tm).

When the destructor is present the page reference counting is done slightly

Comments

Michał Mirosław Nov. 9, 2011, 3:33 p.m. UTC | #1
On Wed, Nov 09, 2011 at 03:02:04PM +0000, Ian Campbell wrote:
> Entities which care about the complete lifecycle of pages which they inject
> into the network stack via an skb paged fragment can choose to set this
> destructor in order to receive a callback when the stack is really finished
> with a page (including all clones, retransmits, pull-ups etc etc).
[...]
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -139,9 +139,16 @@ struct sk_buff;
>  
>  typedef struct skb_frag_struct skb_frag_t;
>  
> +struct skb_frag_destructor {
> +	atomic_t ref;
> +	int (*destroy)(void *data);
> +	void *data;
> +};
> +
>  struct skb_frag_struct {
>  	struct {
>  		struct page *p;
> +		struct skb_frag_destructor *destructor;
>  	} page;

You can get rid of the data field of skb_frag_destructor: if destroy() gets
pointer to the destroyed struct skb_frag_set_destructor, its users can
get at containing struct via container_of() if needed and the memory
pointed to by data won't have to be managed separately.

Nice work, BTW!

Best Regards,
Michał Mirosław
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ian Campbell Nov. 9, 2011, 4:25 p.m. UTC | #2
On Wed, 2011-11-09 at 15:33 +0000, Michał Mirosław wrote:
> On Wed, Nov 09, 2011 at 03:02:04PM +0000, Ian Campbell wrote:
> > Entities which care about the complete lifecycle of pages which they inject
> > into the network stack via an skb paged fragment can choose to set this
> > destructor in order to receive a callback when the stack is really finished
> > with a page (including all clones, retransmits, pull-ups etc etc).
> [...]
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -139,9 +139,16 @@ struct sk_buff;
> >  
> >  typedef struct skb_frag_struct skb_frag_t;
> >  
> > +struct skb_frag_destructor {
> > +	atomic_t ref;
> > +	int (*destroy)(void *data);
> > +	void *data;
> > +};
> > +
> >  struct skb_frag_struct {
> >  	struct {
> >  		struct page *p;
> > +		struct skb_frag_destructor *destructor;
> >  	} page;
> 
> You can get rid of the data field of skb_frag_destructor: if destroy() gets
> pointer to the destroyed struct skb_frag_set_destructor, its users can
> get at containing struct via container_of() if needed and the memory
> pointed to by data won't have to be managed separately.

At the moment you can share one destructor between all the frags,
whereas data is specific to the frag. At the moment the only user is the
rpc patch which doesn't make use of this capability (i.e. it could use
container_of) but I'm not sure about other potential users yet.

> Nice work, BTW!

Thanks.

> 
> Best Regards,
> Michał Mirosław


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michał Mirosław Nov. 9, 2011, 5:24 p.m. UTC | #3
On Wed, Nov 09, 2011 at 04:25:14PM +0000, Ian Campbell wrote:
> On Wed, 2011-11-09 at 15:33 +0000, Michał Mirosław wrote:
> > On Wed, Nov 09, 2011 at 03:02:04PM +0000, Ian Campbell wrote:
> > > Entities which care about the complete lifecycle of pages which they inject
> > > into the network stack via an skb paged fragment can choose to set this
> > > destructor in order to receive a callback when the stack is really finished
> > > with a page (including all clones, retransmits, pull-ups etc etc).
> > [...]
> > > --- a/include/linux/skbuff.h
> > > +++ b/include/linux/skbuff.h
> > > @@ -139,9 +139,16 @@ struct sk_buff;
> > >  
> > >  typedef struct skb_frag_struct skb_frag_t;
> > >  
> > > +struct skb_frag_destructor {
> > > +	atomic_t ref;
> > > +	int (*destroy)(void *data);
> > > +	void *data;
> > > +};
> > > +
> > >  struct skb_frag_struct {
> > >  	struct {
> > >  		struct page *p;
> > > +		struct skb_frag_destructor *destructor;
> > >  	} page;
> > 
> > You can get rid of the data field of skb_frag_destructor: if destroy() gets
> > pointer to the destroyed struct skb_frag_set_destructor, its users can
> > get at containing struct via container_of() if needed and the memory
> > pointed to by data won't have to be managed separately.
> At the moment you can share one destructor between all the frags,
> whereas data is specific to the frag.
[...]

If you want distinct data pointers then you need to also have per-frag
skb_frag_destructor as you wrote in this patch. So removing 'data' field
saves memory but doesn't change anything else except the way to reference
the data (container_of() instead of pointer dereference).

Best Regards,
Michał Mirosław
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ian Campbell Nov. 9, 2011, 5:28 p.m. UTC | #4
On Wed, 2011-11-09 at 17:24 +0000, Michał Mirosław wrote:
> On Wed, Nov 09, 2011 at 04:25:14PM +0000, Ian Campbell wrote:
> > On Wed, 2011-11-09 at 15:33 +0000, Michał Mirosław wrote:
> > > On Wed, Nov 09, 2011 at 03:02:04PM +0000, Ian Campbell wrote:
> > > > Entities which care about the complete lifecycle of pages which they inject
> > > > into the network stack via an skb paged fragment can choose to set this
> > > > destructor in order to receive a callback when the stack is really finished
> > > > with a page (including all clones, retransmits, pull-ups etc etc).
> > > [...]
> > > > --- a/include/linux/skbuff.h
> > > > +++ b/include/linux/skbuff.h
> > > > @@ -139,9 +139,16 @@ struct sk_buff;
> > > >  
> > > >  typedef struct skb_frag_struct skb_frag_t;
> > > >  
> > > > +struct skb_frag_destructor {
> > > > +	atomic_t ref;
> > > > +	int (*destroy)(void *data);
> > > > +	void *data;
> > > > +};
> > > > +
> > > >  struct skb_frag_struct {
> > > >  	struct {
> > > >  		struct page *p;
> > > > +		struct skb_frag_destructor *destructor;
> > > >  	} page;
> > > 
> > > You can get rid of the data field of skb_frag_destructor: if destroy() gets
> > > pointer to the destroyed struct skb_frag_set_destructor, its users can
> > > get at containing struct via container_of() if needed and the memory
> > > pointed to by data won't have to be managed separately.
> > At the moment you can share one destructor between all the frags,
> > whereas data is specific to the frag.
> [...]
> 
> If you want distinct data pointers then you need to also have per-frag
> skb_frag_destructor as you wrote in this patch. So removing 'data' field
> saves memory but doesn't change anything else except the way to reference
> the data (container_of() instead of pointer dereference).

Oh yes, you are absolutely right.

Ian.

> 
> Best Regards,
> Michał Mirosław


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

differently. No references are held by the network stack on the struct page (it
is up to the caller to manage this as necessary) instead the network stack will
track references via the count embedded in the destructor structure. When this
reference count reaches zero then the destructor will be called and the caller
can take the necesary steps to release the page (i.e. release the struct page
reference itself).

The intention is that callers can use this callback to delay completion to
_their_ callers until the network stack has completely released the page, in
order to prevent use-after-free or modification of data pages which are still
in use by the stack.

It is allowable (indeed expected) for a caller to share a single destructor
instance between multiple pages injected into the stack e.g. a group of pages
included in a single higher level operation might share a destructor which is
used to complete that higher level operation.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: netdev@vger.kernel.org
---
 include/linux/skbuff.h |   44 ++++++++++++++++++++++++++++++++++++++++++++
 net/core/skbuff.c      |   17 +++++++++++++++++
 2 files changed, 61 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6a6b352..5bdb74d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -139,9 +139,16 @@  struct sk_buff;
 
 typedef struct skb_frag_struct skb_frag_t;
 
+struct skb_frag_destructor {
+	atomic_t ref;
+	int (*destroy)(void *data);
+	void *data;
+};
+
 struct skb_frag_struct {
 	struct {
 		struct page *p;
+		struct skb_frag_destructor *destructor;
 	} page;
 #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
 	__u32 page_offset;
@@ -1160,6 +1167,31 @@  static inline int skb_pagelen(const struct sk_buff *skb)
 }
 
 /**
+ * skb_frag_set_destructor - set destructor for a paged fragment
+ * @skb: buffer containing fragment to be initialised
+ * @i: paged fragment index to initialise
+ * @destroy: the destructor to use for this fragment
+ *
+ * Sets @destroy as the destructor to be called when all references to
+ * the frag @i in @skb (tracked over skb_clone, retransmit, pull-ups,
+ * etc) are released.
+ *
+ * When a destructor is set then reference counting is performed on
+ * @destroy->ref. When the ref reaches zero then @destroy->destroy
+ * will be called. The caller is responsible for holding and managing
+ * any other references (such a the struct page reference count).
+ *
+ * This function must be called before any use of skb_frag_ref() or
+ * skb_frag_unref().
+ */
+static inline void skb_frag_set_destructor(struct sk_buff *skb, int i,
+					   struct skb_frag_destructor *destroy)
+{
+	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+	frag->page.destructor = destroy;
+}
+
+/**
  * __skb_fill_page_desc - initialise a paged fragment in an skb
  * @skb: buffer containing fragment to be initialised
  * @i: paged fragment index to initialise
@@ -1178,6 +1210,7 @@  static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
 	frag->page.p		  = page;
+	frag->page.destructor     = NULL;
 	frag->page_offset	  = off;
 	skb_frag_size_set(frag, size);
 }
@@ -1704,6 +1737,9 @@  static inline struct page *skb_frag_page(const skb_frag_t *frag)
 	return frag->page.p;
 }
 
+extern void skb_frag_destructor_ref(struct skb_frag_destructor *destroy);
+extern void skb_frag_destructor_unref(struct skb_frag_destructor *destroy);
+
 /**
  * __skb_frag_ref - take an addition reference on a paged fragment.
  * @frag: the paged fragment
@@ -1712,6 +1748,10 @@  static inline struct page *skb_frag_page(const skb_frag_t *frag)
  */
 static inline void __skb_frag_ref(skb_frag_t *frag)
 {
+	if (unlikely(frag->page.destructor)) {
+		skb_frag_destructor_ref(frag->page.destructor);
+		return;
+	}
 	get_page(skb_frag_page(frag));
 }
 
@@ -1735,6 +1775,10 @@  static inline void skb_frag_ref(struct sk_buff *skb, int f)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag)
 {
+	if (unlikely(frag->page.destructor)) {
+		skb_frag_destructor_unref(frag->page.destructor);
+		return;
+	}
 	put_page(skb_frag_page(frag));
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ca4db40..1c5cc05 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -302,6 +302,23 @@  struct sk_buff *dev_alloc_skb(unsigned int length)
 }
 EXPORT_SYMBOL(dev_alloc_skb);
 
+void skb_frag_destructor_ref(struct skb_frag_destructor *destroy)
+{
+	BUG_ON(destroy == NULL);
+	atomic_inc(&destroy->ref);
+}
+EXPORT_SYMBOL(skb_frag_destructor_ref);
+
+void skb_frag_destructor_unref(struct skb_frag_destructor *destroy)
+{
+	if (destroy == NULL)
+		return;
+
+	if (atomic_dec_and_test(&destroy->ref))
+		destroy->destroy(destroy->data);
+}
+EXPORT_SYMBOL(skb_frag_destructor_unref);
+
 static void skb_drop_list(struct sk_buff **listp)
 {
 	struct sk_buff *list = *listp;