
[0/4] skb paged fragment destructors

Message ID 1323172634.23681.73.camel@zakaz.uk.xensource.com
State RFC, archived
Delegated to: David Miller

Commit Message

Ian Campbell Dec. 6, 2011, 11:57 a.m. UTC
On Wed, 2011-11-09 at 15:01 +0000, Ian Campbell wrote:
>       * split linear data allocation and shinfo allocation into two. I
>         suspect this will have its own performance implications? On the
>         positive side skb_shared_info could come from its own fixed size
>         pool/cache which might have some benefits

I played with this to see how it would look. Illustrative patch below. 

I figure that lots of small frames is the interesting workload for a
change such as this but I don't know if iperf is necessarily the best
benchmark for measuring that.
Before changing things I got:
        iperf -c qarun -m -t 60 -u -b 10000M -l 64
        ------------------------------------------------------------
        Client connecting to qarun, UDP port 5001
        Sending 64 byte datagrams
        UDP buffer size:   224 KByte (default)
        ------------------------------------------------------------
        [  3] local 10.80.225.63 port 45857 connected with 10.80.224.22 port 5001
        [ ID] Interval       Transfer     Bandwidth
        [  3]  0.0-60.0 sec    844 MBytes    118 Mbits/sec
        [  3] Sent 13820376 datagrams
        [  3] Server Report:
        [  3]  0.0-60.0 sec    844 MBytes    118 Mbits/sec  0.005 ms    0/13820375 (0%)
        [  3]  0.0-60.0 sec  1 datagrams received out-of-order
whereas with the patch:
        # iperf -c qarun -m -t 60 -u -b 10000M -l 64
        ------------------------------------------------------------
        Client connecting to qarun, UDP port 5001
        Sending 64 byte datagrams
        UDP buffer size:   224 KByte (default)
        ------------------------------------------------------------
        [  3] local 10.80.225.63 port 42504 connected with 10.80.224.22 port 5001
        [ ID] Interval       Transfer     Bandwidth
        [  3]  0.0-60.0 sec    833 MBytes    116 Mbits/sec
        [  3] Sent 13645857 datagrams
        [  3] Server Report:
        [  3]  0.0-60.0 sec    833 MBytes    116 Mbits/sec  0.005 ms    0/13645856 (0%)
        [  3]  0.0-60.0 sec  1 datagrams received out-of-order

With 1200 byte datagrams I get basically identical throughput.

(nb: none of the skb destructor stuff was present in either case)

>       * steal a bit of the pointer to indicate destructor pointer vs
>         regular struct page pointer (moving the struct page into the
>         destructor data structure for that case). Stops us sharing a
>         single destructor between multiple pages, but that isn't critical

I also implemented this and, compared to the above, the patch is pretty
fugly, especially in its impact on the SUNRPC patches which actually use
the feature. It needs a bit more cleanup before I can post it for
comparison, though.

Ian.

 drivers/net/ethernet/broadcom/bnx2.c        |    5 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h |    4 +-
 drivers/net/ethernet/broadcom/tg3.c         |    3 +-
 include/linux/skbuff.h                      |    9 ++-
 net/core/skbuff.c                           |   67 ++++++++++++++++----------
 5 files changed, 51 insertions(+), 37 deletions(-)






Comments

Eric Dumazet Dec. 6, 2011, 1:24 p.m. UTC | #1
On Tue, 2011-12-06 at 11:57 +0000, Ian Campbell wrote:
> On Wed, 2011-11-09 at 15:01 +0000, Ian Campbell wrote:
> >       * split linear data allocation and shinfo allocation into two. I
> >         suspect this will have its own performance implications? On the
> >         positive side skb_shared_info could come from its own fixed size
> >         pool/cache which might have some benefits
> 
> I played with this to see how it would look. Illustrative patch below. 
> 
> I figure that lots of small frames is the interesting workload for a
> change such as this but I don't know if iperf is necessarily the best
> benchmark for measuring that.
> Before changing things I got:
>         iperf -c qarun -m -t 60 -u -b 10000M -l 64
>         ------------------------------------------------------------
>         Client connecting to qarun, UDP port 5001
>         Sending 64 byte datagrams
>         UDP buffer size:   224 KByte (default)
>         ------------------------------------------------------------
>         [  3] local 10.80.225.63 port 45857 connected with 10.80.224.22 port 5001
>         [ ID] Interval       Transfer     Bandwidth
>         [  3]  0.0-60.0 sec    844 MBytes    118 Mbits/sec
>         [  3] Sent 13820376 datagrams
>         [  3] Server Report:
>         [  3]  0.0-60.0 sec    844 MBytes    118 Mbits/sec  0.005 ms    0/13820375 (0%)
>         [  3]  0.0-60.0 sec  1 datagrams received out-of-order
> whereas with the patch:
>         # iperf -c qarun -m -t 60 -u -b 10000M -l 64
>         ------------------------------------------------------------
>         Client connecting to qarun, UDP port 5001
>         Sending 64 byte datagrams
>         UDP buffer size:   224 KByte (default)
>         ------------------------------------------------------------
>         [  3] local 10.80.225.63 port 42504 connected with 10.80.224.22 port 5001
>         [ ID] Interval       Transfer     Bandwidth
>         [  3]  0.0-60.0 sec    833 MBytes    116 Mbits/sec
>         [  3] Sent 13645857 datagrams
>         [  3] Server Report:
>         [  3]  0.0-60.0 sec    833 MBytes    116 Mbits/sec  0.005 ms    0/13645856 (0%)
>         [  3]  0.0-60.0 sec  1 datagrams received out-of-order
> 
> With 1200 byte datagrams I get basically identical throughput.
> 
> (nb: none of the skb destructor stuff was present in either case)

Sorry, but the real problem is that if skb producer and consumer are not
on same CPU, each skb will now hit SLUB slowpath three times instead of
two.
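
As a concrete sketch (hypothetical helper, using the cache names from
the patch below): after the split, freeing an skb releases three
separately allocated objects, and each free can take the SLUB slowpath
when the freeing CPU is not the allocating one.

        static void example_skb_free_memory(struct sk_buff *skb)
        {
        	kmem_cache_free(skbuff_shinfo_cache, skb_shinfo(skb)); /* new third object */
        	kfree(skb->head);                        /* linear data */
        	kmem_cache_free(skbuff_head_cache, skb); /* struct sk_buff itself */
        }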

Some workloads are : One cpu fully handling IRQ from device, dispatching
skbs to consumers on other cpus.

Plus skb->truesize is wrong after your patch.
Not sure if cloning is correct either...

Anyway, do we _really_ need 16 frags per skb? I don't know....

This gives problems when/if skb must be linearized and we hit
PAGE_ALLOC_COSTLY_ORDER

Alternatively, we could use order-1 or order-2 pages on x86 to get
8192/16384 bytes frags. (fallback to order-0 pages in case of allocation
failures)
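
A minimal sketch of that fallback (hypothetical helper, not from any
posted patch):

        /* Try an order-2 page for 16384-byte frags, falling back to an
         * order-0 page when higher-order allocation fails. */
        static struct page *alloc_skb_frag_page(gfp_t gfp, unsigned int *order)
        {
        	struct page *page;

        	page = alloc_pages(gfp | __GFP_NOWARN | __GFP_NORETRY, 2);
        	if (page) {
        		*order = 2;	/* 16384-byte fragment */
        		return page;
        	}
        	*order = 0;		/* 4096-byte fragment */
        	return alloc_pages(gfp, 0);
        }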



Ian Campbell Dec. 7, 2011, 1:35 p.m. UTC | #2
On Tue, 2011-12-06 at 13:24 +0000, Eric Dumazet wrote:
> On Tue, 2011-12-06 at 11:57 +0000, Ian Campbell wrote:
> > On Wed, 2011-11-09 at 15:01 +0000, Ian Campbell wrote:
> > >       * split linear data allocation and shinfo allocation into two. I
> > >         suspect this will have its own performance implications? On the
> > >         positive side skb_shared_info could come from its own fixed size
> > >         pool/cache which might have some benefits
> > 
> > I played with this to see how it would look. Illustrative patch below. 
> > 
> > I figure that lots of small frames is the interesting workload for a
> > change such as this but I don't know if iperf is necessarily the best
> > benchmark for measuring that.
> > Before changing things I got:
> >         iperf -c qarun -m -t 60 -u -b 10000M -l 64
> >         ------------------------------------------------------------
> >         Client connecting to qarun, UDP port 5001
> >         Sending 64 byte datagrams
> >         UDP buffer size:   224 KByte (default)
> >         ------------------------------------------------------------
> >         [  3] local 10.80.225.63 port 45857 connected with 10.80.224.22 port 5001
> >         [ ID] Interval       Transfer     Bandwidth
> >         [  3]  0.0-60.0 sec    844 MBytes    118 Mbits/sec
> >         [  3] Sent 13820376 datagrams
> >         [  3] Server Report:
> >         [  3]  0.0-60.0 sec    844 MBytes    118 Mbits/sec  0.005 ms    0/13820375 (0%)
> >         [  3]  0.0-60.0 sec  1 datagrams received out-of-order
> > whereas with the patch:
> >         # iperf -c qarun -m -t 60 -u -b 10000M -l 64
> >         ------------------------------------------------------------
> >         Client connecting to qarun, UDP port 5001
> >         Sending 64 byte datagrams
> >         UDP buffer size:   224 KByte (default)
> >         ------------------------------------------------------------
> >         [  3] local 10.80.225.63 port 42504 connected with 10.80.224.22 port 5001
> >         [ ID] Interval       Transfer     Bandwidth
> >         [  3]  0.0-60.0 sec    833 MBytes    116 Mbits/sec
> >         [  3] Sent 13645857 datagrams
> >         [  3] Server Report:
> >         [  3]  0.0-60.0 sec    833 MBytes    116 Mbits/sec  0.005 ms    0/13645856 (0%)
> >         [  3]  0.0-60.0 sec  1 datagrams received out-of-order
> > 
> > With 1200 byte datagrams I get basically identical throughput.
> > 
> > (nb: none of the skb destructor stuff was present in either case)
> 
> Sorry, but the real problem is that if skb producer and consumer are not
> on same CPU, each skb will now hit SLUB slowpath three times instead of
> two.
> 
> Some workloads are : One cpu fully handling IRQ from device, dispatching
> skbs to consumers on other cpus.

So something like a multithreaded apache benchmark would be interesting?

> Plus skb->truesize is wrong after your patch.
> Not sure if cloning is correct either...

Me neither; the patch was just one which happened to run well enough to
run some numbers on. I'll be sure to double-check/correct that stuff if
I persist with the approach.

> Anyway, do we _really_ need 16 frags per skb? I don't know....

MAX_SKB_FRAGS is:
        /* To allow 64K frame to be packed as single skb without frag_list. Since
         * GRO uses frags we allocate at least 16 regardless of page size.
         */
        #if (65536/PAGE_SIZE + 2) < 16
        #define MAX_SKB_FRAGS 16UL
        #else
        #define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2)
        #endif
        
So I think actually it turns out to be 18 on systems with 4k pages. If
we reduced that to be 16 that would save 48 bytes on amd64 which pulls
the shinfo size down to 440 and then

        NET_SKB_PAD (64) + 1500 + 14 + 440 = 2018

which does fit in half a page.
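
If the 440-byte estimate holds for a given configuration, the fit could
even be checked at build time, e.g. in skb_init() with something like:

        /* Hypothetical: 64 (NET_SKB_PAD) + 1500 (MTU) + 14 (ETH_HLEN) +
         * 440 (estimated shinfo with 16 frags) = 2018 <= 2048. */
        BUILD_BUG_ON(NET_SKB_PAD + 1500 + ETH_HLEN + 440 > PAGE_SIZE / 2);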

Using 16 still means that a 64k frame fits precisely in frags on a 4096
page size system. I don't know what the extra 2 was for (it predates
git), perhaps just to allow for some slop due to misalignment in the
first page of data?

> This gives problems when/if skb must be linearized and we hit
> PAGE_ALLOC_COSTLY_ORDER
> 
> Alternatively, we could use order-1 or order-2 pages on x86 to get
> 8192/16384 bytes frags. (fallback to order-0 pages in case of allocation
> failures)

Or fall back to separate allocation of shinfo?

Ian.



Ian Campbell Dec. 9, 2011, 1:47 p.m. UTC | #3
On Wed, 2011-12-07 at 13:35 +0000, Ian Campbell wrote:
> On Tue, 2011-12-06 at 13:24 +0000, Eric Dumazet wrote:
> > On Tue, 2011-12-06 at 11:57 +0000, Ian Campbell wrote:
> > > On Wed, 2011-11-09 at 15:01 +0000, Ian Campbell wrote:
> > > >       * split linear data allocation and shinfo allocation into two. I
> > > >         suspect this will have its own performance implications? On the
> > > >         positive side skb_shared_info could come from its own fixed size
> > > >         pool/cache which might have some benefits
> > > 
> > > I played with this to see how it would look. Illustrative patch below. 
> > > 
> > > I figure that lots of small frames is the interesting workload for a
> > > change such as this but I don't know if iperf is necessarily the best
> > > benchmark for measuring that.
> > > Before changing things I got:
> > >         iperf -c qarun -m -t 60 -u -b 10000M -l 64
> > >         ------------------------------------------------------------
> > >         Client connecting to qarun, UDP port 5001
> > >         Sending 64 byte datagrams
> > >         UDP buffer size:   224 KByte (default)
> > >         ------------------------------------------------------------
> > >         [  3] local 10.80.225.63 port 45857 connected with 10.80.224.22 port 5001
> > >         [ ID] Interval       Transfer     Bandwidth
> > >         [  3]  0.0-60.0 sec    844 MBytes    118 Mbits/sec
> > >         [  3] Sent 13820376 datagrams
> > >         [  3] Server Report:
> > >         [  3]  0.0-60.0 sec    844 MBytes    118 Mbits/sec  0.005 ms    0/13820375 (0%)
> > >         [  3]  0.0-60.0 sec  1 datagrams received out-of-order
> > > whereas with the patch:
> > >         # iperf -c qarun -m -t 60 -u -b 10000M -l 64
> > >         ------------------------------------------------------------
> > >         Client connecting to qarun, UDP port 5001
> > >         Sending 64 byte datagrams
> > >         UDP buffer size:   224 KByte (default)
> > >         ------------------------------------------------------------
> > >         [  3] local 10.80.225.63 port 42504 connected with 10.80.224.22 port 5001
> > >         [ ID] Interval       Transfer     Bandwidth
> > >         [  3]  0.0-60.0 sec    833 MBytes    116 Mbits/sec
> > >         [  3] Sent 13645857 datagrams
> > >         [  3] Server Report:
> > >         [  3]  0.0-60.0 sec    833 MBytes    116 Mbits/sec  0.005 ms    0/13645856 (0%)
> > >         [  3]  0.0-60.0 sec  1 datagrams received out-of-order
> > > 
> > > With 1200 byte datagrams I get basically identical throughput.
> > > 
> > > (nb: none of the skb destructor stuff was present in either case)
> > 
> > Sorry, but the real problem is that if skb producer and consumer are not
> > on same CPU, each skb will now hit SLUB slowpath three times instead of
> > two.
> > 
> > Some workloads are : One cpu fully handling IRQ from device, dispatching
> > skbs to consumers on other cpus.
> 
> So something like a multithreaded apache benchmark would be interesting?
> 
> > Plus skb->truesize is wrong after your patch.
> > Not sure if cloning is correct either...
> 
> Me neither; the patch was just one which happened to run well enough to
> run some numbers on. I'll be sure to double-check/correct that stuff if
> I persist with the approach.
> 
> > Anyway, do we _really_ need 16 frags per skb? I don't know....
> 
> MAX_SKB_FRAGS is:
>         /* To allow 64K frame to be packed as single skb without frag_list. Since
>          * GRO uses frags we allocate at least 16 regardless of page size.
>          */
>         #if (65536/PAGE_SIZE + 2) < 16
>         #define MAX_SKB_FRAGS 16UL
>         #else
>         #define MAX_SKB_FRAGS (65536/PAGE_SIZE + 2)
>         #endif
>         
> So I think actually it turns out to be 18 on systems with 4k pages. If
> we reduced that to be 16 that would save 48 bytes on amd64 which pulls
> the shinfo size down to 440 and then
> 
>         NET_SKB_PAD (64) + 1500 + 14 + 440 = 2018
> 
> which does fit in half a page.
> 
> Using 16 still means that a 64k frame fits precisely in frags on a 4096
> page size system. I don't know what the extra 2 was for (it predates
> git), perhaps just to allow for some slop due to misalignment in the
> first page of data?

Tracked it down in TGLX's historic tree as:
http://git.kernel.org/?p=linux/kernel/git/tglx/history.git;a=commitdiff;h=80223d5186f73bf42a7e260c66c9cb9f7d8ec9cf

Having found the title I did a bit of searching around for discussion
but all I could find was other folks not having any idea where the + 2
comes from either...

http://kerneltrap.org/mailarchive/linux-netdev/2008/4/21/1529914 sort of
wonders if the + 2 might be including a head which crosses a page
boundary. I'm not sure it is sane/useful to account for that in
MAX_SKB_FRAGS. If we had a concept like MAX_SKB_PAGES then it would
perhaps make sense to have + 2 there, but AFAICT drivers etc are already
accounting for this appropriately by adding a further + 2 (or sometimes
+ 1) to MAX_SKB_FRAGS.

Ian.

> 
> > This gives problems when/if skb must be linearized and we hit
> > PAGE_ALLOC_COSTLY_ORDER
> > 
> > Alternatively, we could use order-1 or order-2 pages on x86 to get
> > 8192/16384 bytes frags. (fallback to order-0 pages in case of allocation
> > failures)
> 
> Or fall back to separate allocation of shinfo?
> 
> Ian.
> 
> 
> 


David Miller Dec. 9, 2011, 6:34 p.m. UTC | #4
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Fri, 9 Dec 2011 13:47:07 +0000

> If we had a concept like MAX_SKB_PAGES then it would perhaps make
> sense to have + 2 there, but AFAICT drivers etc are already
> accounting for this appropriately by adding a further + 2 (or
> sometimes + 1) to MAX_SKB_FRAGS.

Any kind of code like this, including the "+ 2" in the skbuff header,
should be coded to use some kind of macro so we can track this
dependency instead of stumbling onto it and accidentally breaking lots
of stuff if we want to change this "2" value.
Ian Campbell Dec. 21, 2011, 11:03 a.m. UTC | #5
On Fri, 2011-12-09 at 18:34 +0000, David Miller wrote:
> From: Ian Campbell <Ian.Campbell@citrix.com>
> Date: Fri, 9 Dec 2011 13:47:07 +0000
> 
> > If we had a concept like MAX_SKB_PAGES then it would perhaps make
> > sense to have + 2 there, but AFAICT drivers etc are already
> > accounting for this appropriately by adding a further + 2 (or
> > sometimes + 1) to MAX_SKB_FRAGS.
> 
> Any kind of code like this, including the "+ 2" in the skbuff header,
> should be coded to use some kind of macro so we can track this
> dependency instead of stumbling onto it and accidentally breaking lots
> of stuff if we want to change this "2" value.

Agreed.

Part of the problem is that no one seems to have any idea what this
particular + 2 means. My best hypothesis is that it accounts for the
pages used by the linear area (which potentially crosses a page
boundary).

On that basis I propose to do the following, plus a sweep of the tree
to determine who means which. There are quite a lot of MAX_SKB_FRAGS
uses in the tree, but based on quick inspection many relate
specifically to the bounds of the frags list and so don't need to change.

        /* To allow 64K frame to be packed as single skb without frag_list. Since
         * GRO uses frags we allocate at least 16 regardless of page size.
         */
        #if (65536/PAGE_SIZE) < 16
        #define MAX_SKB_FRAGS 16UL
        #else
        #define MAX_SKB_FRAGS (65536/PAGE_SIZE)
        #endif
        
        /* The linear area can cross a page boundary */
        #define MAX_SKB_HEAD_PAGES 2UL
        #define MAX_SKB_PAGES (MAX_SKB_FRAGS + MAX_SKB_HEAD_PAGES)

This means that MAX_SKB_FRAGS will shrink by 2. The new symbol
MAX_SKB_PAGES will have the old value of MAX_SKB_FRAGS.

I'm also considering getting rid of the #if since I don't think we
support pages < 4K on any arch, do we?

To aid sequencing the transition it might be best to go with
MAX_SKB_FRAGS (16 + 2) and MAX_SKB_HEAD_PAGES (0) first, then convert
everyone to use the one they really meant, and finally move the 2 from
MAX_SKB_FRAGS to MAX_SKB_HEAD_PAGES. I need to think about that.

Some drivers currently use MAX_SKB_FRAGS + 1, which I think accounts
for the linear area in cases where the hardware cannot receive a buffer
which crosses a page boundary. I'm considering defining
MIN_SKB_HEAD_PAGES to use in these circumstances.
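
To illustrate the intended split (hypothetical driver code, all names
invented):

        /* Budget DMA descriptors with MAX_SKB_PAGES so that the possibly
         * page-crossing linear area is counted, while iteration over the
         * frag array stays bounded by MAX_SKB_FRAGS. */
        #define MYDEV_DESC_PER_SKB	MAX_SKB_PAGES

        static bool mydev_can_xmit(struct mydev_ring *ring,
        			   const struct sk_buff *skb)
        {
        	BUG_ON(skb_shinfo(skb)->nr_frags > MAX_SKB_FRAGS);
        	return mydev_free_descs(ring) >= MYDEV_DESC_PER_SKB;
        }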

Ian.



Eric Dumazet Dec. 21, 2011, 11:08 a.m. UTC | #6
On Wed, 2011-12-21 at 11:03 +0000, Ian Campbell wrote:
> On Fri, 2011-12-09 at 18:34 +0000, David Miller wrote:
> > From: Ian Campbell <Ian.Campbell@citrix.com>
> > Date: Fri, 9 Dec 2011 13:47:07 +0000
> > 
> > > If we had a concept like MAX_SKB_PAGES then it would perhaps make
> > > sense to have + 2 there, but AFAICT drivers etc are already
> > > accounting for this appropriately by adding a further + 2 (or
> > > sometimes + 1) to MAX_SKB_FRAGS.
> > 
> > Any kind of code like this, including the "+ 2" in the skbuff header,
> > should be coded to use some kind of macro so we can track this
> > dependency instead of stumbling onto it and accidentally breaking lots
> > of stuff if we want to change this "2" value.
> 
> Agreed.
> 
> Part of the problem is that no one seems to have any idea what this
> particular + 2 means. My best hypothesis is that it accounts for the
> pages used by the linear area (which potentially crosses a page
> boundary).

I don't understand the point.

linear data is allocated with kmalloc(), so technically speaking it's
located in a single page, but page order can be 0, 1, 2, 3, ...
MAX_ORDER.
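
For example (a sketch, assuming SLUB backs large kmalloc sizes with a
single compound page):

        unsigned char *data = kmalloc(16384, GFP_KERNEL);
        struct page *page = virt_to_head_page(data); /* one head page */
        unsigned int order = compound_order(page);   /* 2 with 4k pages */

so the linear area is physically contiguous however big it is, which is
what driver DMA mapping relies on.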



Ian Campbell Dec. 21, 2011, 11:18 a.m. UTC | #7
On Wed, 2011-12-21 at 11:08 +0000, Eric Dumazet wrote:
> On Wed, 2011-12-21 at 11:03 +0000, Ian Campbell wrote:
> > On Fri, 2011-12-09 at 18:34 +0000, David Miller wrote:
> > > From: Ian Campbell <Ian.Campbell@citrix.com>
> > > Date: Fri, 9 Dec 2011 13:47:07 +0000
> > > 
> > > > If we had a concept like MAX_SKB_PAGES then it would perhaps make
> > > > sense to have + 2 there, but AFAICT drivers etc are already
> > > > accounting for this appropriately by adding a further + 2 (or
> > > > sometimes + 1) to MAX_SKB_FRAGS.
> > > 
> > > Any kind of code like this, including the "+ 2" in the skbuff header,
> > > should be coded to use some kind of macro so we can track this
> > > dependency instead of stumbling onto it and accidently breaking lots
> > > of stuff if we want to change this "2" value.
> > 
> > Agreed.
> > 
> > Part of the problem is that no one seems to have any idea what this
> > particular + 2 means. My best hypothesis is that it accounts for the
> > pages used by the linear area (which potentially crosses a page
> > boundary).
> 
> I don't understand the point.

I did say hypothesis ;-) Do you know what that + 2 is actually all
about?

If no one knows what it is for maybe we should just remove it directly
instead of what I proposed?

> linear data is allocated with kmalloc(), so technically speaking it's
> located in a single page, but page order can be 0, 1, 2, 3, ...
> MAX_ORDER.

I think I must misunderstand the terminology.

An order 1 allocation is in multiples of PAGE_SIZE, isn't it, even
though the pages happen to be contiguous?

Or are you considering the possibility of kmalloc returning a super page
of some description? Isn't that to some extent transparent to the caller
who (assuming PAGE_SIZE 4096) doesn't know if kmalloc(16384) returned
4*4096 contiguous pages or 1*16384 page?

Anyway I take your underlying point that 2*PAGE_SIZE is no kind of limit
on the size of the linear region.

Ian.

Eric Dumazet Dec. 21, 2011, 12:30 p.m. UTC | #8
On Wed, 2011-12-21 at 11:18 +0000, Ian Campbell wrote:

> 
> An order 1 allocation is in multiples of PAGE_SIZE, isn't it, even
> though the pages happen to be contiguous?

Really an order-1 allocation allocates one page, a compound one.

> 
> Or are you considering the possibility of kmalloc returning a super page
> of some description? Isn't that to some extent transparent to the caller
> who (assuming PAGE_SIZE 4096) doesn't know if kmalloc(16384) returned
> 4*4096 contiguous pages or 1*16384 page?
> 

It's part of a _single_ page, or the whole of it.

It's mandatory, or else many drivers would break their DMA handling.

> Anyway I take your underlying point that 2*PAGE_SIZE is no kind of limit
> on the size of the linear region.
> 

Sure, since net/unix/af_unix.c can happily allocate huge linear skbs, if
memory is not too fragmented of course...

(Stream unix sockets limit their skbs to SKB_MAX_ALLOC (up to 4
pages), but datagram ones have no limit.)
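
For reference, the definition behind "up to 4 pages" (from
include/linux/skbuff.h of this era):

        #define SKB_MAX_ORDER(X, ORDER) \
        	SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
        #define SKB_MAX_ALLOC	(SKB_MAX_ORDER(0, 2))	/* 4 pages, less overhead */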



Ian Campbell Dec. 21, 2011, 1:48 p.m. UTC | #9
On Wed, 2011-12-21 at 12:30 +0000, Eric Dumazet wrote:
> On Wed, 2011-12-21 at 11:18 +0000, Ian Campbell wrote:
> 
> > 
> > An order 1 allocation is in multiples of PAGE_SIZE, isn't it, even
> > though the pages happen to be contiguous?
> 
> Really an order-1 allocation allocates one page, a compound one.

Oh, right, I see what you mean.

You snipped the question about the + 2; I take it you have no idea what
it is all about either?

Ian.

Eric Dumazet Dec. 21, 2011, 2:02 p.m. UTC | #10
On Wed, 2011-12-21 at 13:48 +0000, Ian Campbell wrote:
> On Wed, 2011-12-21 at 12:30 +0000, Eric Dumazet wrote:
> > On Wed, 2011-12-21 at 11:18 +0000, Ian Campbell wrote:
> > 
> > > 
> > > An order 1 allocation is in multiples of PAGE_SIZE, isn't it, even
> > > though the pages happen to be contiguous?
> > 
> > Really an order-1 allocation allocates one page, a compound one.
> 
> Oh, right, I see what you mean.
> 
> You snipped the question about the + 2; I take it you have no idea what
> it is all about either?
> 

No idea on this +2 point.

I know some hardware has limits on fragment length (for example, IGB
uses the IGB_MAX_DATA_PER_TXD limit), but I don't know why we add extra
frags to the generic skb.



David Miller Dec. 21, 2011, 7:28 p.m. UTC | #11
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 21 Dec 2011 15:02:18 +0100

> No idea on this +2 point.

I think I know, and I believe I instructed Alexey Kuznetsov to do
this.

When sendfile() is performed, we might start the SKB with the last few
bytes of one page, and end the SKB with the first few bytes of another
page.

In order to fit a full 64K frame into an SKB in this situation we have
to accommodate this case.
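
A worked example, assuming 4096-byte pages:

        /* Worst case: the frame starts at the last byte of a page, so a
         * 64K frame spans 16 full pages plus a partial page at each end. */
        unsigned int off = PAGE_SIZE - 1;                          /* 4095 */
        unsigned int pages = DIV_ROUND_UP(off + 65536, PAGE_SIZE); /* 18 = 16 + 2 */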

Patch

diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index 83d8cef..4e37e05 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -5320,8 +5320,7 @@  bnx2_set_rx_ring_size(struct bnx2 *bp, u32 size)
 	/* 8 for CRC and VLAN */
 	rx_size = bp->dev->mtu + ETH_HLEN + BNX2_RX_OFFSET + 8;
 
-	rx_space = SKB_DATA_ALIGN(rx_size + BNX2_RX_ALIGN) + NET_SKB_PAD +
-		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	rx_space = SKB_DATA_ALIGN(rx_size + BNX2_RX_ALIGN) + NET_SKB_PAD;
 
 	bp->rx_copy_thresh = BNX2_RX_COPY_THRESH;
 	bp->rx_pg_ring_size = 0;
@@ -5345,7 +5344,7 @@  bnx2_set_rx_ring_size(struct bnx2 *bp, u32 size)
 	bp->rx_buf_use_size = rx_size;
 	/* hw alignment + build_skb() overhead*/
 	bp->rx_buf_size = SKB_DATA_ALIGN(bp->rx_buf_use_size + BNX2_RX_ALIGN) +
-		NET_SKB_PAD + SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+		NET_SKB_PAD;
 	bp->rx_jumbo_thresh = rx_size - BNX2_RX_OFFSET;
 	bp->rx_ring_size = size;
 	bp->rx_max_ring = bnx2_find_max_ring(size, MAX_RX_RINGS);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 0f7b7a4..b3de327 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -1203,9 +1203,7 @@  struct bnx2x {
 	 */
 #define BNX2X_FW_RX_ALIGN_START	(1UL << BNX2X_RX_ALIGN_SHIFT)
 
-#define BNX2X_FW_RX_ALIGN_END					\
-	max(1UL << BNX2X_RX_ALIGN_SHIFT, 			\
-	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+#define BNX2X_FW_RX_ALIGN_END	(1UL << BNX2X_RX_ALIGN_SHIFT)
 
 #define BNX2X_PXP_DRAM_ALIGN		(BNX2X_RX_ALIGN_SHIFT - 5)
 
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index d9e9c8c..dbea57b 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -5426,8 +5426,7 @@  static int tg3_alloc_rx_data(struct tg3 *tp, struct tg3_rx_prodring_set *tpr,
 	 * Callers depend upon this behavior and assume that
 	 * we leave everything unchanged if we fail.
 	 */
-	skb_size = SKB_DATA_ALIGN(data_size + TG3_RX_OFFSET(tp)) +
-		   SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	skb_size = SKB_DATA_ALIGN(data_size + TG3_RX_OFFSET(tp));
 	data = kmalloc(skb_size, GFP_ATOMIC);
 	if (!data)
 		return -ENOMEM;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 09b7ea5..48e91a0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -40,8 +40,7 @@ 
 
 #define SKB_DATA_ALIGN(X)	(((X) + (SMP_CACHE_BYTES - 1)) & \
 				 ~(SMP_CACHE_BYTES - 1))
-#define SKB_WITH_OVERHEAD(X)	\
-	((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+#define SKB_WITH_OVERHEAD(X)	((X))
 #define SKB_MAX_ORDER(X, ORDER) \
 	SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
 #define SKB_MAX_HEAD(X)		(SKB_MAX_ORDER((X), 0))
@@ -476,6 +475,7 @@  struct sk_buff {
 	sk_buff_data_t		end;
 	unsigned char		*head,
 				*data;
+	struct skb_shared_info	*shinfo;
 	unsigned int		truesize;
 	atomic_t		users;
 };
@@ -634,7 +634,10 @@  static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
 #endif
 
 /* Internal */
-#define skb_shinfo(SKB)	((struct skb_shared_info *)(skb_end_pointer(SKB)))
+static inline struct skb_shared_info *skb_shinfo(const struct sk_buff *skb)
+{
+	return skb->shinfo;
+}
 
 static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
 {
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7dc05ec..cfbe8e5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -73,6 +73,7 @@ 
 
 static struct kmem_cache *skbuff_head_cache __read_mostly;
 static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+static struct kmem_cache *skbuff_shinfo_cache __read_mostly;
 
 static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
 				  struct pipe_buffer *buf)
@@ -184,22 +185,21 @@  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		goto out;
 	prefetchw(skb);
 
+	shinfo = kmem_cache_alloc_node(skbuff_shinfo_cache,
+				       gfp_mask & ~__GFP_DMA, node);
+	if (!shinfo)
+		goto noshinfo;
+	prefetchw(shinfo);
+
 	/* We do our best to align skb_shared_info on a separate cache
 	 * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
 	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
 	 * Both skb->head and skb_shared_info are cache line aligned.
 	 */
 	size = SKB_DATA_ALIGN(size);
-	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 	data = kmalloc_node_track_caller(size, gfp_mask, node);
 	if (!data)
 		goto nodata;
-	/* kmalloc(size) might give us more room than requested.
-	 * Put skb_shared_info exactly at the end of allocated zone,
-	 * to allow max possible filling before reallocation.
-	 */
-	size = SKB_WITH_OVERHEAD(ksize(data));
-	prefetchw(data + size);
 
 	/*
 	 * Only clear those fields we need to clear, not those that we will
@@ -210,6 +210,7 @@  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	/* Account for allocated memory : skb + skb->head */
 	skb->truesize = SKB_TRUESIZE(size);
 	atomic_set(&skb->users, 1);
+	skb->shinfo = shinfo;
 	skb->head = data;
 	skb->data = data;
 	skb_reset_tail_pointer(skb);
@@ -219,7 +220,7 @@  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 #endif
 
 	/* make sure we initialize shinfo sequentially */
-	shinfo = skb_shinfo(skb);
+	BUG_ON(shinfo != skb_shinfo(skb));
 	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 	atomic_set(&shinfo->dataref, 1);
 	kmemcheck_annotate_variable(shinfo->destructor_arg);
@@ -238,6 +239,8 @@  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 out:
 	return skb;
 nodata:
+	kmem_cache_free(skbuff_shinfo_cache, shinfo);
+noshinfo:
 	kmem_cache_free(cache, skb);
 	skb = NULL;
 	goto out;
@@ -254,8 +257,8 @@  EXPORT_SYMBOL(__alloc_skb);
  * On a failure the return is %NULL, and @data is not freed.
  * Notes :
  *  Before IO, driver allocates only data buffer where NIC put incoming frame
- *  Driver should add room at head (NET_SKB_PAD) and
- *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
+ *  Driver should add room at head (NET_SKB_PAD) and need not
+ *  add room at tail (SKB_DATA_ALIGN(skb_shared_info))
  *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
  *  before giving packet to stack.
  *  RX rings only contains data buffers, not full skbs.
@@ -270,11 +273,17 @@  struct sk_buff *build_skb(void *data)
 	if (!skb)
 		return NULL;
 
-	size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	shinfo = kmem_cache_alloc(skbuff_shinfo_cache, GFP_ATOMIC);
+	if (!shinfo)
+		goto noshinfo;
+	prefetchw(shinfo);
+
+	size = ksize(data);
 
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	skb->truesize = SKB_TRUESIZE(size);
 	atomic_set(&skb->users, 1);
+	skb->shinfo = shinfo;
 	skb->head = data;
 	skb->data = data;
 	skb_reset_tail_pointer(skb);
@@ -284,12 +293,17 @@  struct sk_buff *build_skb(void *data)
 #endif
 
 	/* make sure we initialize shinfo sequentially */
-	shinfo = skb_shinfo(skb);
+	BUG_ON(shinfo != skb_shinfo(skb));
 	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 	atomic_set(&shinfo->dataref, 1);
 	kmemcheck_annotate_variable(shinfo->destructor_arg);
 
+out:
 	return skb;
+noshinfo:
+	kmem_cache_free(skbuff_head_cache, skb);
+	skb = NULL;
+	goto out;
 }
 EXPORT_SYMBOL(build_skb);
 
@@ -378,7 +392,7 @@  static void skb_clone_fraglist(struct sk_buff *skb)
 		skb_get(list);
 }
 
-static void skb_release_data(struct sk_buff *skb)
+static void skb_release_data(struct sk_buff *skb, int free_shinfo)
 {
 	if (!skb->cloned ||
 	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
@@ -404,6 +418,8 @@  static void skb_release_data(struct sk_buff *skb)
 		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 
+		if (free_shinfo)
+			kmem_cache_free(skbuff_shinfo_cache, skb_shinfo(skb));
 		kfree(skb->head);
 	}
 }
@@ -474,7 +490,7 @@  static void skb_release_head_state(struct sk_buff *skb)
 static void skb_release_all(struct sk_buff *skb)
 {
 	skb_release_head_state(skb);
-	skb_release_data(skb);
+	skb_release_data(skb, 1);
 }
 
 /**
@@ -646,6 +662,7 @@  static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	C(tail);
 	C(end);
 	C(head);
+	C(shinfo);
 	C(data);
 	C(truesize);
 	atomic_set(&n->users, 1);
@@ -941,18 +958,14 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
 	}
 
-	if (fastpath &&
-	    size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
-		memmove(skb->head + size, skb_shinfo(skb),
-			offsetof(struct skb_shared_info,
-				 frags[skb_shinfo(skb)->nr_frags]));
+	if (fastpath && size <= ksize(skb->head)) {
 		memmove(skb->head + nhead, skb->head,
 			skb_tail_pointer(skb) - skb->head);
 		off = nhead;
 		goto adjust_others;
 	}
 
-	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+	data = kmalloc(size, gfp_mask);
 	if (!data)
 		goto nodata;
 
@@ -961,10 +974,6 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	 */
 	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
 
-	memcpy((struct skb_shared_info *)(data + size),
-	       skb_shinfo(skb),
-	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
-
 	if (fastpath) {
 		kfree(skb->head);
 	} else {
@@ -979,7 +988,7 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
 
-		skb_release_data(skb);
+		skb_release_data(skb, 0);
 	}
 	off = (data + nhead) - skb->head;
 
@@ -2956,6 +2965,12 @@  void __init skb_init(void)
 						0,
 						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
 						NULL);
+
+	skbuff_shinfo_cache = kmem_cache_create("skbuff_shinfo_cache",
+						sizeof(struct skb_shared_info),
+						0,
+						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+						NULL);
 }
 
 /**