Message ID | 20200825091629.12949-2-bjorn.topel@gmail.com |
---|---|
State | Awaiting Upstream |
Delegated to: | David Miller |
Headers | show |
Series | Avoid premature Rx buffer reuse for XDP_REDIRECT | expand |
On Tue, Aug 25, 2020 at 11:16:27AM +0200, Björn Töpel wrote: > From: Björn Töpel <bjorn.topel@intel.com> > > The page recycle code, incorrectly, relied on that a page fragment > could not be freed inside xdp_do_redirect(). This assumption leads to > that page fragments that are used by the stack/XDP redirect can be > reused and overwritten. > > To avoid this, store the page count prior invoking xdp_do_redirect(). > > Longer explanation: > > Intel NICs have a recycle mechanism. The main idea is that a page is > split into two parts. One part is owned by the driver, one part might > be owned by someone else, such as the stack. > > t0: Page is allocated, and put on the Rx ring > +--------------- > used by NIC ->| upper buffer > (rx_buffer) +--------------- > | lower buffer > +--------------- > page count == USHRT_MAX > rx_buffer->pagecnt_bias == USHRT_MAX > > t1: Buffer is received, and passed to the stack (e.g.) > +--------------- > | upper buff (skb) > +--------------- > used by NIC ->| lower buffer > (rx_buffer) +--------------- > page count == USHRT_MAX > rx_buffer->pagecnt_bias == USHRT_MAX - 1 > > t2: Buffer is received, and redirected > +--------------- > | upper buff (skb) > +--------------- > used by NIC ->| lower buffer > (rx_buffer) +--------------- > > Now, prior calling xdp_do_redirect(): > page count == USHRT_MAX > rx_buffer->pagecnt_bias == USHRT_MAX - 2 > > This means that buffer *cannot* be flipped/reused, because the skb is > still using it. > > The problem arises when xdp_do_redirect() actually frees the > segment. Then we get: > page count == USHRT_MAX - 1 > rx_buffer->pagecnt_bias == USHRT_MAX - 2 > > From a recycle perspective, the buffer can be flipped and reused, > which means that the skb data area is passed to the Rx HW ring! > > To work around this, the page count is stored prior calling > xdp_do_redirect(). > > Note that this is not optimal, since the NIC could actually reuse the > "lower buffer" again. 
However, then we need to track whether > XDP_REDIRECT consumed the buffer or not. > > Fixes: d9314c474d4f ("i40e: add support for XDP_REDIRECT") > Reported-by: Li RongQing <lirongqing@baidu.com> > Signed-off-by: Björn Töpel <bjorn.topel@intel.com> > --- > drivers/net/ethernet/intel/i40e/i40e_txrx.c | 28 +++++++++++++++------ > 1 file changed, 21 insertions(+), 7 deletions(-) > > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c > index 3e5c566ceb01..5e446dc39190 100644 > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c > @@ -1873,7 +1873,8 @@ static inline bool i40e_page_is_reusable(struct page *page) > * > * In either case, if the page is reusable its refcount is increased. > **/ > -static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer) > +static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer, > + int rx_buffer_pgcnt) > { > unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; > struct page *page = rx_buffer->page; > @@ -1884,7 +1885,7 @@ static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer) > > #if (PAGE_SIZE < 8192) > /* if we are only owner of page we can reuse it */ > - if (unlikely((page_count(page) - pagecnt_bias) > 1)) > + if (unlikely((rx_buffer_pgcnt - pagecnt_bias) > 1)) > return false; > #else > #define I40E_LAST_OFFSET \ > @@ -1939,6 +1940,15 @@ static void i40e_add_rx_frag(struct i40e_ring *rx_ring, > #endif > } > > +static int i40e_rx_buffer_page_count(struct i40e_rx_buffer *rx_buffer) > +{ > +#if (PAGE_SIZE < 8192) > + return page_count(rx_buffer->page); > +#else > + return 0; > +#endif > +} > + > /** > * i40e_get_rx_buffer - Fetch Rx buffer and synchronize data for use > * @rx_ring: rx descriptor ring to transact packets on > @@ -1948,11 +1958,13 @@ static void i40e_add_rx_frag(struct i40e_ring *rx_ring, > * for use by the CPU. 
> */ > static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring, > - const unsigned int size) > + const unsigned int size, > + int *rx_buffer_pgcnt) > { > struct i40e_rx_buffer *rx_buffer; > > rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); > + *rx_buffer_pgcnt = i40e_rx_buffer_page_count(rx_buffer); What i previously meant was: #if (PAGE_SIZE < 8192) *rx_buffer_pgcnt = page_count(rx_buffer->page); #endif and see below > prefetchw(rx_buffer->page); > > /* we are reusing so sync this buffer for CPU use */ > @@ -2112,9 +2124,10 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, > * either recycle the buffer or unmap it and free the associated resources. > */ > static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, > - struct i40e_rx_buffer *rx_buffer) > + struct i40e_rx_buffer *rx_buffer, > + int rx_buffer_pgcnt) > { > - if (i40e_can_reuse_rx_page(rx_buffer)) { > + if (i40e_can_reuse_rx_page(rx_buffer, rx_buffer_pgcnt)) { > /* hand second half of page back to the ring */ > i40e_reuse_rx_page(rx_ring, rx_buffer); > } else { > @@ -2319,6 +2332,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) > unsigned int xdp_xmit = 0; > bool failure = false; > struct xdp_buff xdp; > + int rx_buffer_pgcnt; you could move scope this variable only for the while (likely(total_rx_packets < (unsigned int)budget)) loop and init this to 0. then you could drop the helper function you've added. and BTW the page_count is not being used for big pages but i agree that it's better to have it set to 0. 
> > #if (PAGE_SIZE < 8192) > xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0); > @@ -2370,7 +2384,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) > break; > > i40e_trace(clean_rx_irq, rx_ring, rx_desc, skb); > - rx_buffer = i40e_get_rx_buffer(rx_ring, size); > + rx_buffer = i40e_get_rx_buffer(rx_ring, size, &rx_buffer_pgcnt); > > /* retrieve a buffer from the ring */ > if (!skb) { > @@ -2413,7 +2427,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) > break; > } > > - i40e_put_rx_buffer(rx_ring, rx_buffer); > + i40e_put_rx_buffer(rx_ring, rx_buffer, rx_buffer_pgcnt); > cleaned_count++; > > if (i40e_is_non_eop(rx_ring, rx_desc, skb)) > -- > 2.25.1 >
On 2020-08-25 13:13, Maciej Fijalkowski wrote: > On Tue, Aug 25, 2020 at 11:16:27AM +0200, Björn Töpel wrote: [...] >> struct i40e_rx_buffer *rx_buffer; >> >> rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); >> + *rx_buffer_pgcnt = i40e_rx_buffer_page_count(rx_buffer); > > What i previously meant was: > > #if (PAGE_SIZE < 8192) > *rx_buffer_pgcnt = page_count(rx_buffer->page); > #endif > > and see below > Right... >> prefetchw(rx_buffer->page); >> >> /* we are reusing so sync this buffer for CPU use */ >> @@ -2112,9 +2124,10 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, >> * either recycle the buffer or unmap it and free the associated resources. >> */ >> static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, >> - struct i40e_rx_buffer *rx_buffer) >> + struct i40e_rx_buffer *rx_buffer, >> + int rx_buffer_pgcnt) >> { >> - if (i40e_can_reuse_rx_page(rx_buffer)) { >> + if (i40e_can_reuse_rx_page(rx_buffer, rx_buffer_pgcnt)) { >> /* hand second half of page back to the ring */ >> i40e_reuse_rx_page(rx_ring, rx_buffer); >> } else { >> @@ -2319,6 +2332,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) >> unsigned int xdp_xmit = 0; >> bool failure = false; >> struct xdp_buff xdp; >> + int rx_buffer_pgcnt; > > you could move scope this variable only for the > > while (likely(total_rx_packets < (unsigned int)budget)) > > loop and init this to 0. then you could drop the helper function you've > added. and BTW the page_count is not being used for big pages but i agree > that it's better to have it set to 0. > ...but isn't it a bit nasty with an output parameter that relies on the fact that the input was set to zero. I guess it's a matter of taste, but I find that more error prone. Let me know if you have strong feelings about this, and I'll respin (but I'd rather not!). 
Björn >> >> #if (PAGE_SIZE < 8192) >> xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0); >> @@ -2370,7 +2384,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) >> break; >> >> i40e_trace(clean_rx_irq, rx_ring, rx_desc, skb); >> - rx_buffer = i40e_get_rx_buffer(rx_ring, size); >> + rx_buffer = i40e_get_rx_buffer(rx_ring, size, &rx_buffer_pgcnt); >> >> /* retrieve a buffer from the ring */ >> if (!skb) { >> @@ -2413,7 +2427,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) >> break; >> } >> >> - i40e_put_rx_buffer(rx_ring, rx_buffer); >> + i40e_put_rx_buffer(rx_ring, rx_buffer, rx_buffer_pgcnt); >> cleaned_count++; >> >> if (i40e_is_non_eop(rx_ring, rx_desc, skb)) >> -- >> 2.25.1 >>
On Tue, Aug 25, 2020 at 01:25:16PM +0200, Björn Töpel wrote: > On 2020-08-25 13:13, Maciej Fijalkowski wrote: > > On Tue, Aug 25, 2020 at 11:16:27AM +0200, Björn Töpel wrote: > [...] > > > struct i40e_rx_buffer *rx_buffer; > > > rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); > > > + *rx_buffer_pgcnt = i40e_rx_buffer_page_count(rx_buffer); > > > > What i previously meant was: > > > > #if (PAGE_SIZE < 8192) > > *rx_buffer_pgcnt = page_count(rx_buffer->page); > > #endif > > > > and see below > > > > Right... > > > > prefetchw(rx_buffer->page); > > > /* we are reusing so sync this buffer for CPU use */ > > > @@ -2112,9 +2124,10 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, > > > * either recycle the buffer or unmap it and free the associated resources. > > > */ > > > static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, > > > - struct i40e_rx_buffer *rx_buffer) > > > + struct i40e_rx_buffer *rx_buffer, > > > + int rx_buffer_pgcnt) > > > { > > > - if (i40e_can_reuse_rx_page(rx_buffer)) { > > > + if (i40e_can_reuse_rx_page(rx_buffer, rx_buffer_pgcnt)) { > > > /* hand second half of page back to the ring */ > > > i40e_reuse_rx_page(rx_ring, rx_buffer); > > > } else { > > > @@ -2319,6 +2332,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) > > > unsigned int xdp_xmit = 0; > > > bool failure = false; > > > struct xdp_buff xdp; > > > + int rx_buffer_pgcnt; > > > > you could move scope this variable only for the > > > > while (likely(total_rx_packets < (unsigned int)budget)) > > > > loop and init this to 0. then you could drop the helper function you've > > added. and BTW the page_count is not being used for big pages but i agree > > that it's better to have it set to 0. > > > > ...but isn't it a bit nasty with an output parameter that relies on the that > the input was set to zero. I guess it's a matter of taste, but I find that > more error prone. 
> > Let me know if you have strong feelings about this, and I'll respin (but I > rather not!). Up to you. No strong feelings, I just think that i40e_rx_buffer_page_count is not needed. But if you want to keep it, then I was usually asking people to provide the doxygen descriptions for newly introduced functions... :P but scoping it still makes sense to me, static analysis tools would agree with me I guess. > > > Björn > > > > > #if (PAGE_SIZE < 8192) > > > xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0); > > > @@ -2370,7 +2384,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) > > > break; > > > i40e_trace(clean_rx_irq, rx_ring, rx_desc, skb); > > > - rx_buffer = i40e_get_rx_buffer(rx_ring, size); > > > + rx_buffer = i40e_get_rx_buffer(rx_ring, size, &rx_buffer_pgcnt); > > > /* retrieve a buffer from the ring */ > > > if (!skb) { > > > @@ -2413,7 +2427,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) > > > break; > > > } > > > - i40e_put_rx_buffer(rx_ring, rx_buffer); > > > + i40e_put_rx_buffer(rx_ring, rx_buffer, rx_buffer_pgcnt); > > > cleaned_count++; > > > if (i40e_is_non_eop(rx_ring, rx_desc, skb)) > > > -- > > > 2.25.1 > > >
On 2020-08-25 13:29, Maciej Fijalkowski wrote: > On Tue, Aug 25, 2020 at 01:25:16PM +0200, Björn Töpel wrote: >> On 2020-08-25 13:13, Maciej Fijalkowski wrote: >>> On Tue, Aug 25, 2020 at 11:16:27AM +0200, Björn Töpel wrote: >> [...] >>>> struct i40e_rx_buffer *rx_buffer; >>>> rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); >>>> + *rx_buffer_pgcnt = i40e_rx_buffer_page_count(rx_buffer); >>> >>> What i previously meant was: >>> >>> #if (PAGE_SIZE < 8192) >>> *rx_buffer_pgcnt = page_count(rx_buffer->page); >>> #endif >>> >>> and see below >>> >> >> Right... >> >>>> prefetchw(rx_buffer->page); >>>> /* we are reusing so sync this buffer for CPU use */ >>>> @@ -2112,9 +2124,10 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, >>>> * either recycle the buffer or unmap it and free the associated resources. >>>> */ >>>> static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, >>>> - struct i40e_rx_buffer *rx_buffer) >>>> + struct i40e_rx_buffer *rx_buffer, >>>> + int rx_buffer_pgcnt) >>>> { >>>> - if (i40e_can_reuse_rx_page(rx_buffer)) { >>>> + if (i40e_can_reuse_rx_page(rx_buffer, rx_buffer_pgcnt)) { >>>> /* hand second half of page back to the ring */ >>>> i40e_reuse_rx_page(rx_ring, rx_buffer); >>>> } else { >>>> @@ -2319,6 +2332,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) >>>> unsigned int xdp_xmit = 0; >>>> bool failure = false; >>>> struct xdp_buff xdp; >>>> + int rx_buffer_pgcnt; >>> >>> you could move scope this variable only for the >>> >>> while (likely(total_rx_packets < (unsigned int)budget)) >>> >>> loop and init this to 0. then you could drop the helper function you've >>> added. and BTW the page_count is not being used for big pages but i agree >>> that it's better to have it set to 0. >>> >> >> ...but isn't it a bit nasty with an output parameter that relies on the that >> the input was set to zero. I guess it's a matter of taste, but I find that >> more error prone. 
>> >> Let me know if you have strong feelings about this, and I'll respin (but I >> rather not!). > > Up to you. No strong feelings, i just think that i40e_rx_buffer_page_count > is not needed. But if you want to keep it, then i was usually asking > people to provide the doxygen descriptions for newly introduced > functions... :P > > but scoping it still makes sense to me, static analysis tools would agree > with me I guess. > Fair enough! I'll spin a v2! Thanks for taking a look! Björn >> >> >> Björn >> >> >>>> #if (PAGE_SIZE < 8192) >>>> xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0); >>>> @@ -2370,7 +2384,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) >>>> break; >>>> i40e_trace(clean_rx_irq, rx_ring, rx_desc, skb); >>>> - rx_buffer = i40e_get_rx_buffer(rx_ring, size); >>>> + rx_buffer = i40e_get_rx_buffer(rx_ring, size, &rx_buffer_pgcnt); >>>> /* retrieve a buffer from the ring */ >>>> if (!skb) { >>>> @@ -2413,7 +2427,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) >>>> break; >>>> } >>>> - i40e_put_rx_buffer(rx_ring, rx_buffer); >>>> + i40e_put_rx_buffer(rx_ring, rx_buffer, rx_buffer_pgcnt); >>>> cleaned_count++; >>>> if (i40e_is_non_eop(rx_ring, rx_desc, skb)) >>>> -- >>>> 2.25.1 >>>>
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 3e5c566ceb01..5e446dc39190 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1873,7 +1873,8 @@ static inline bool i40e_page_is_reusable(struct page *page) * * In either case, if the page is reusable its refcount is increased. **/ -static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer) +static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer, + int rx_buffer_pgcnt) { unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; struct page *page = rx_buffer->page; @@ -1884,7 +1885,7 @@ static bool i40e_can_reuse_rx_page(struct i40e_rx_buffer *rx_buffer) #if (PAGE_SIZE < 8192) /* if we are only owner of page we can reuse it */ - if (unlikely((page_count(page) - pagecnt_bias) > 1)) + if (unlikely((rx_buffer_pgcnt - pagecnt_bias) > 1)) return false; #else #define I40E_LAST_OFFSET \ @@ -1939,6 +1940,15 @@ static void i40e_add_rx_frag(struct i40e_ring *rx_ring, #endif } +static int i40e_rx_buffer_page_count(struct i40e_rx_buffer *rx_buffer) +{ +#if (PAGE_SIZE < 8192) + return page_count(rx_buffer->page); +#else + return 0; +#endif +} + /** * i40e_get_rx_buffer - Fetch Rx buffer and synchronize data for use * @rx_ring: rx descriptor ring to transact packets on @@ -1948,11 +1958,13 @@ static void i40e_add_rx_frag(struct i40e_ring *rx_ring, * for use by the CPU. 
*/ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring, - const unsigned int size) + const unsigned int size, + int *rx_buffer_pgcnt) { struct i40e_rx_buffer *rx_buffer; rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); + *rx_buffer_pgcnt = i40e_rx_buffer_page_count(rx_buffer); prefetchw(rx_buffer->page); /* we are reusing so sync this buffer for CPU use */ @@ -2112,9 +2124,10 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, * either recycle the buffer or unmap it and free the associated resources. */ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, - struct i40e_rx_buffer *rx_buffer) + struct i40e_rx_buffer *rx_buffer, + int rx_buffer_pgcnt) { - if (i40e_can_reuse_rx_page(rx_buffer)) { + if (i40e_can_reuse_rx_page(rx_buffer, rx_buffer_pgcnt)) { /* hand second half of page back to the ring */ i40e_reuse_rx_page(rx_ring, rx_buffer); } else { @@ -2319,6 +2332,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) unsigned int xdp_xmit = 0; bool failure = false; struct xdp_buff xdp; + int rx_buffer_pgcnt; #if (PAGE_SIZE < 8192) xdp.frame_sz = i40e_rx_frame_truesize(rx_ring, 0); @@ -2370,7 +2384,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) break; i40e_trace(clean_rx_irq, rx_ring, rx_desc, skb); - rx_buffer = i40e_get_rx_buffer(rx_ring, size); + rx_buffer = i40e_get_rx_buffer(rx_ring, size, &rx_buffer_pgcnt); /* retrieve a buffer from the ring */ if (!skb) { @@ -2413,7 +2427,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget) break; } - i40e_put_rx_buffer(rx_ring, rx_buffer); + i40e_put_rx_buffer(rx_ring, rx_buffer, rx_buffer_pgcnt); cleaned_count++; if (i40e_is_non_eop(rx_ring, rx_desc, skb))