[v8,06/11] net/mlx4_en: add page recycle to prepare rx ring for tx support

Message ID 1468309894-26258-7-git-send-email-bblanco@plumgrid.com
State Changes Requested, archived
Delegated to: David Miller

Commit Message

Brenden Blanco July 12, 2016, 7:51 a.m. UTC
The mlx4 driver by default allocates order-3 pages for the ring to
consume in multiple fragments. When the device has an xdp program, this
behavior will prevent tx actions since the page must be re-mapped in
TODEVICE mode, which cannot be done if the page is still shared.

Start by making the allocator configurable based on whether xdp is
running, such that order-0 pages are always used and never shared.

Since this will stress the page allocator, add a simple page cache to
each rx ring. Pages in the cache are left dma-mapped, and in drop-only
stress tests the page allocator is eliminated from the perf report.

Note that setting an xdp program will now require the rings to be
reconfigured.

Before:
 26.91%  ksoftirqd/0  [mlx4_en]         [k] mlx4_en_process_rx_cq
 17.88%  ksoftirqd/0  [mlx4_en]         [k] mlx4_en_alloc_frags
  6.00%  ksoftirqd/0  [mlx4_en]         [k] mlx4_en_free_frag
  4.49%  ksoftirqd/0  [kernel.vmlinux]  [k] get_page_from_freelist
  3.21%  swapper      [kernel.vmlinux]  [k] intel_idle
  2.73%  ksoftirqd/0  [kernel.vmlinux]  [k] bpf_map_lookup_elem
  2.57%  swapper      [mlx4_en]         [k] mlx4_en_process_rx_cq

After:
 31.72%  swapper      [kernel.vmlinux]       [k] intel_idle
  8.79%  swapper      [mlx4_en]              [k] mlx4_en_process_rx_cq
  7.54%  swapper      [kernel.vmlinux]       [k] poll_idle
  6.36%  swapper      [mlx4_core]            [k] mlx4_eq_int
  4.21%  swapper      [kernel.vmlinux]       [k] tasklet_action
  4.03%  swapper      [kernel.vmlinux]       [k] cpuidle_enter_state
  3.43%  swapper      [mlx4_en]              [k] mlx4_en_prepare_rx_desc
  2.18%  swapper      [kernel.vmlinux]       [k] native_irq_return_iret
  1.37%  swapper      [kernel.vmlinux]       [k] menu_select
  1.09%  swapper      [kernel.vmlinux]       [k] bpf_map_lookup_elem

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  | 46 +++++++++++++++--
 drivers/net/ethernet/mellanox/mlx4/en_rx.c      | 69 ++++++++++++++++++++++---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h    | 12 ++++-
 4 files changed, 115 insertions(+), 14 deletions(-)

Comments

Tariq Toukan July 12, 2016, 12:09 p.m. UTC | #1
[...]

Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
David Miller July 12, 2016, 9:18 p.m. UTC | #2
From: Brenden Blanco <bblanco@plumgrid.com>
Date: Tue, 12 Jul 2016 00:51:29 -0700

> +	mlx4_en_free_resources(priv);
> +
>  	old_prog = xchg(&priv->prog, prog);
>  	if (old_prog)
>  		bpf_prog_put(old_prog);
>  
> -	return 0;
> +	err = mlx4_en_alloc_resources(priv);
> +	if (err) {
> +		en_err(priv, "Failed reallocating port resources\n");
> +		goto out;
> +	}
> +	if (port_up) {
> +		err = mlx4_en_start_port(dev);
> +		if (err)
> +			en_err(priv, "Failed starting port\n");

A failed configuration operation should _NEVER_ leave the interface in
an inoperative state like these error paths do.

You must instead preallocate the necessary resources, and only change
the chip's configuration and commit to the new settings once you have
successfully allocated those resources.
Brenden Blanco July 13, 2016, 12:54 a.m. UTC | #3
On Tue, Jul 12, 2016 at 02:18:32PM -0700, David Miller wrote:
> From: Brenden Blanco <bblanco@plumgrid.com>
> Date: Tue, 12 Jul 2016 00:51:29 -0700
> 
> > +	mlx4_en_free_resources(priv);
> > +
> >  	old_prog = xchg(&priv->prog, prog);
> >  	if (old_prog)
> >  		bpf_prog_put(old_prog);
> >  
> > -	return 0;
> > +	err = mlx4_en_alloc_resources(priv);
> > +	if (err) {
> > +		en_err(priv, "Failed reallocating port resources\n");
> > +		goto out;
> > +	}
> > +	if (port_up) {
> > +		err = mlx4_en_start_port(dev);
> > +		if (err)
> > +			en_err(priv, "Failed starting port\n");
> 
> A failed configuration operation should _NEVER_ leave the interface in
> an inoperative state like these error paths do.
> 
> You must instead preallocate the necessary resources, and only change
> the chip's configuration and commit to the new settings once you have
> successfully allocated those resources.
I'll see what I can do here.
Tariq Toukan July 13, 2016, 7:17 a.m. UTC | #4
On 13/07/2016 3:54 AM, Brenden Blanco wrote:
> On Tue, Jul 12, 2016 at 02:18:32PM -0700, David Miller wrote:
>> From: Brenden Blanco <bblanco@plumgrid.com>
>> Date: Tue, 12 Jul 2016 00:51:29 -0700
>>
>>> +	mlx4_en_free_resources(priv);
>>> +
>>>   	old_prog = xchg(&priv->prog, prog);
>>>   	if (old_prog)
>>>   		bpf_prog_put(old_prog);
>>>   
>>> -	return 0;
>>> +	err = mlx4_en_alloc_resources(priv);
>>> +	if (err) {
>>> +		en_err(priv, "Failed reallocating port resources\n");
>>> +		goto out;
>>> +	}
>>> +	if (port_up) {
>>> +		err = mlx4_en_start_port(dev);
>>> +		if (err)
>>> +			en_err(priv, "Failed starting port\n");
>> A failed configuration operation should _NEVER_ leave the interface in
>> an inoperative state like these error paths do.
>>
>> You must instead preallocate the necessary resources, and only change
>> the chip's configuration and commit to the new settings once you have
>> successfully allocated those resources.
> I'll see what I can do here.
That's exactly what we're doing in a patchset that will be submitted to 
net very soon (this week).
It fixes/refactors these failure flows just like Dave described, 
something like:

     err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof);
     if (err)
         goto out;

     if (priv->port_up) {
         port_up = 1;
         mlx4_en_stop_port(dev, 1);
     }

     mlx4_en_safe_replace_resources(priv, tmp);

     if (port_up) {
         err = mlx4_en_start_port(dev);
         if (err)
             en_err(priv, "Failed starting port\n");
     }

I suggest you keep your code aligned with the current net-next driver, and 
later I will take it and fix it (once merged with net).

Regards,
Tariq
Brenden Blanco July 13, 2016, 3:40 p.m. UTC | #5
On Wed, Jul 13, 2016 at 10:17:26AM +0300, Tariq Toukan wrote:
> 
> On 13/07/2016 3:54 AM, Brenden Blanco wrote:
> >On Tue, Jul 12, 2016 at 02:18:32PM -0700, David Miller wrote:
> >>From: Brenden Blanco <bblanco@plumgrid.com>
> >>Date: Tue, 12 Jul 2016 00:51:29 -0700
> >>
> >>>+	mlx4_en_free_resources(priv);
> >>>+
> >>>  	old_prog = xchg(&priv->prog, prog);
> >>>  	if (old_prog)
> >>>  		bpf_prog_put(old_prog);
> >>>-	return 0;
> >>>+	err = mlx4_en_alloc_resources(priv);
> >>>+	if (err) {
> >>>+		en_err(priv, "Failed reallocating port resources\n");
> >>>+		goto out;
> >>>+	}
> >>>+	if (port_up) {
> >>>+		err = mlx4_en_start_port(dev);
> >>>+		if (err)
> >>>+			en_err(priv, "Failed starting port\n");
> >>A failed configuration operation should _NEVER_ leave the interface in
> >>an inoperative state like these error paths do.
> >>
> >>You must instead preallocate the necessary resources, and only change
> >>the chip's configuration and commit to the new settings once you have
> >>successfully allocated those resources.
> >I'll see what I can do here.
> That's exactly what we're doing in a patchset that will be submitted
> to net very soon (this week).
Thanks Tariq!
As an example, I had originally tried to integrate this code into
mlx4_en_set_channels, which seems to have the same problem.
> It fixes/refactors these failure flows just like Dave described,
> something like:
> 
>     err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof);
>     if (err)
>         goto out;
> 
>     if (priv->port_up) {
>         port_up = 1;
>         mlx4_en_stop_port(dev, 1);
>     }
> 
>     mlx4_en_safe_replace_resources(priv, tmp);
> 
>     if (port_up) {
>         err = mlx4_en_start_port(dev);
>         if (err)
>             en_err(priv, "Failed starting port\n");
>     }
> 
> I suggest you keep your code aligned with the current net-next driver,
> and later I will take it and fix it (once merged with net).
Another option is to avoid the tx_ring_num change entirely, so as to
keep the majority of the initialized state valid. We would only allocate
a new set of pages and refill the rx rings once we have confirmed there
are enough resources.

So that others can follow the discussion: there are multiple reasons to
reconfigure the rings.
1. The rx frags should be page-per-packet
2. The pages should be mapped DMA_BIDIRECTIONAL
3. Each rx ring should have a dedicated tx ring, which is off limits
from the upper stack
4. The dedicated tx ring will have a pointer back to its rx ring for
recycling

#1 and #2 can be done to the side ahead of time, as you are also
suggesting.
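
For #2, the hook added in mlx4_en_calc_rx_buf() by this patch would
make that a one-line change. A minimal sketch, assuming the tx patch
simply flips the mapping direction in the existing branch:

	if (priv->prog) {
		align = PAGE_SIZE;
		order = 0;
		/* tx will re-map these same pages towards the device */
		dma_dir = PCI_DMA_BIDIRECTIONAL;
	}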

Currently, to achieve #3, we increase tx_ring_num while keeping
num_tx_rings_p_up the same. This precipitates a round of
free/alloc_resources, which takes some time and has many opportunities
for failure.
However, we could resurrect an earlier approach that keeps the
tx_ring_num unchanged, and instead just do a
netif_set_real_num_tx_queues(tx_ring_num - rsv_tx_rings) to hide the
reserved rings from the stack. This would require that there be enough
rings ahead of time, with a simple bounds check like:
if (tx_ring_num < rsv_tx_rings + MLX4_EN_MAX_TX_RING_P_UP) {
	en_err(priv, "XDP requires minimum %d + %d rings\n", rsv_tx_rings,
		MLX4_EN_MAX_TX_RING_P_UP);
	return -EINVAL;
}
The default values for tx_ring_num and rx_ring_num will only hit this
case when operating in a low memory environment, in which case the user
must increase the number of channels manually. I think that is a fair
tradeoff.
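
A minimal sketch of that hiding step (the exact call site and error
handling are assumptions here, not code from this patch):

	/* the stack only sees the first tx_ring_num - rsv_tx_rings
	 * queues; the reserved tail is kept for xdp tx
	 */
	err = netif_set_real_num_tx_queues(dev,
					   priv->tx_ring_num - rsv_tx_rings);
	if (err)
		return err;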

The rest of #1, #2, and #4 can be done in a guaranteed fashion once the
buffers are allocated, since it would just be a few loops to refresh the
rx_desc and recycle_ring.
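
For #4, that refresh could be as small as the following sketch, with
recycle_ring as an assumed back-pointer field on the tx ring:

	/* pair each reserved tx ring with the rx ring it recycles to */
	for (i = 0; i < priv->rx_ring_num; i++) {
		int t = priv->tx_ring_num - rsv_tx_rings + i;

		priv->tx_ring[t]->recycle_ring = priv->rx_ring[i];
	}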
> 
> Regards,
> Tariq
Brenden Blanco July 15, 2016, 9:52 p.m. UTC | #6
On Wed, Jul 13, 2016 at 08:40:59AM -0700, Brenden Blanco wrote:
> On Wed, Jul 13, 2016 at 10:17:26AM +0300, Tariq Toukan wrote:
> > 
> > On 13/07/2016 3:54 AM, Brenden Blanco wrote:
> > >On Tue, Jul 12, 2016 at 02:18:32PM -0700, David Miller wrote:
> > >>From: Brenden Blanco <bblanco@plumgrid.com>
> > >>Date: Tue, 12 Jul 2016 00:51:29 -0700
> > >>
> > >>>+	mlx4_en_free_resources(priv);
> > >>>+
> > >>>  	old_prog = xchg(&priv->prog, prog);
> > >>>  	if (old_prog)
> > >>>  		bpf_prog_put(old_prog);
> > >>>-	return 0;
> > >>>+	err = mlx4_en_alloc_resources(priv);
> > >>>+	if (err) {
> > >>>+		en_err(priv, "Failed reallocating port resources\n");
> > >>>+		goto out;
> > >>>+	}
> > >>>+	if (port_up) {
> > >>>+		err = mlx4_en_start_port(dev);
> > >>>+		if (err)
> > >>>+			en_err(priv, "Failed starting port\n");
> > >>A failed configuration operation should _NEVER_ leave the interface in
> > >>an inoperative state like these error paths do.
> > >>
> > >>You must instead preallocate the necessary resources, and only change
> > >>the chip's configuration and commit to the new settings once you have
> > >>successfully allocated those resources.
> > >I'll see what I can do here.
> > That's exactly what we're doing in a patchset that will be submitted
> > to net very soon (this week).
> Thanks Tariq!
> As an example, I had originally tried to integrate this code into
> mlx4_en_set_channels, which seems to have the same problem.
> > It fixes/refactors these failure flows just like Dave described,
> > something like:
> > 
> >     err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof);
> >     if (err)
> >         goto out;
> > 
> >     if (priv->port_up) {
> >         port_up = 1;
> >         mlx4_en_stop_port(dev, 1);
> >     }
> > 
> >     mlx4_en_safe_replace_resources(priv, tmp);
> > 
> >     if (port_up) {
> >         err = mlx4_en_start_port(dev);
> >         if (err)
> >             en_err(priv, "Failed starting port\n");
> >     }
> > 
> > I suggest you keep your code aligned with the current net-next driver,
> > and later I will take it and fix it (once merged with net).
So, I took Dave's suggestion to heart, and spent the last 2 days seeing
what was possible to implement with just xdp as the focus, rather than
an overall cleanup which Tariq will be looking at.

Unfortunately, this turned out to be a bit of a rat hole.

What I wanted to do was to pre-allocate all the required pages before
reaching the point of no return. Doing this isn't all that hard, since
it should just be a few loops. However, I ended up with a bit more
duplicated code than one would like, since I had to tease out the
various sections that assume exclusive access to hardware.

But more than that, I don't see a way to fill these pages into the
rings safely while the hardware still has the ability to write into the
old ones. There was no "pause" API that I could find besides
mlx4_en_stop_port(). That function is fairly destructive and requires
the resource allocation in mlx4_en_start_port() to succeed to recover
the port status.

One option that I considered would be to drain buffers from the rx ring,
and just let mlx4_en_recover_from_oom() do its job once we update the
page template in frag_info[]. This, however, also requires the queues to
be paused safely, so we again have to rely on mlx4_en_stop_port().

One change I can make is to avoid allocating additional tx rings, which
means that we can skip the calls to mlx4_en_free/alloc_resources().

The resulting code would then mirror what mlx4_en_change_mtu() does:

	if (port_up) {
		err = mlx4_en_start_port(dev);
		if (err)
			queue_work(mdev->workqueue, &priv->watchdog_task);
	}

I intend to respin the patchset with this approach, and a few other
changes as requested elsewhere. If the above is still unacceptable, feel
free to let me know and I will avoid spamming the list.
> Another option is to avoid the tx_ring_num change entirely, so as to
> keep the majority of the initialized state valid. We would only allocate
> a new set of pages and refill the rx rings once we have confirmed there
> are enough resources.
> 
> So that others can follow the discussion: there are multiple reasons to
> reconfigure the rings.
> 1. The rx frags should be page-per-packet
> 2. The pages should be mapped DMA_BIDIRECTIONAL
> 3. Each rx ring should have a dedicated tx ring, which is off limits
> from the upper stack
> 4. The dedicated tx ring will have a pointer back to its rx ring for
> recycling
> 
> #1 and #2 can be done to the side ahead of time, as you are also
> suggesting.
> 
> Currently, to achieve #3, we increase tx_ring_num while keeping
> num_tx_rings_p_up the same. This precipitates a round of
> free/alloc_resources, which takes some time and has many opportunities
> for failure.
> However, we could resurrect an earlier approach that keeps the
> tx_ring_num unchanged, and instead just do a
> netif_set_real_num_tx_queues(tx_ring_num - rsv_tx_rings) to hide the
> reserved rings from the stack. This would require that there be enough
> rings ahead of time, with a simple bounds check like:
> if (tx_ring_num < rsv_tx_rings + MLX4_EN_MAX_TX_RING_P_UP) {
> 	en_err(priv, "XDP requires minimum %d + %d rings\n", rsv_tx_rings,
> 		MLX4_EN_MAX_TX_RING_P_UP);
> 	return -EINVAL;
> }
> The default values for tx_ring_num and rx_ring_num will only hit this
> case when operating in a low memory environment, in which case the user
> must increase the number of channels manually. I think that is a fair
> tradeoff.
> 
> The rest of #1, #2, and #4 can be done in a guaranteed fashion once the
> buffers are allocated, since it would just be a few loops to refresh the
> rx_desc and recycle_ring.
> > 
> > Regards,
> > Tariq
Thanks,
Brenden
Brenden Blanco July 19, 2016, 5:41 p.m. UTC | #7
On Tue, Jul 19, 2016 at 04:33:28PM +0300, Tariq Toukan wrote:
[...]
> >So, I took Dave's suggestion to heart, and spent the last 2 days seeing
> >what was possible to implement with just xdp as the focus, rather than
> >an overall cleanup which Tariq will be looking at.
> >
> >Unfortunately, this turned out to be a bit of a rat hole.
> >
> >What I wanted to do was to pre-allocate all the required pages before
> >reaching the point of no return. Doing this isn't all that hard, since
> >it should just be a few loops. However, I ended up with a bit more
> >duplicated code than one would like, since I had to tease out the
> >various sections that assume exclusive access to hardware.
> >
> >But more than that, I don't see a way to fill these pages into the
> >rings safely while the hardware still has the ability to write into
> >the old ones. There was no "pause" API that I could find besides
> >mlx4_en_stop_port(). That function is fairly destructive and requires
> >the resource allocation in mlx4_en_start_port() to succeed to recover
> >the port status.
> >
> >One option that I considered would be to drain buffers from the rx ring,
> >and just let mlx4_en_recover_from_oom() do its job once we update the
> >page template in frag_info[]. This, however, also requires the queues to
> >be paused safely, so we again have to rely on mlx4_en_stop_port().
> >
> >One change I can make is to avoid allocating additional tx rings, which
> >means that we can skip the calls to mlx4_en_free/alloc_resources().
> I think it's more natural to manage the two types of tx rings
> without converting one type to the other dynamically, just to avoid
> re-allocation.
> I don't see the re-allocation of resources as a serious issue here,
> especially after I submitted patches that add resiliency and make it
> safer (see them in [1]).
Yes, I saw those; they would be helpful for the v8 version, but from v9
onwards I guess there is no conflict, since no reallocation is done.
> We just need to make sure we don't end up with an inoperative port;
> I will take care of it once net and net-next are merged.
Thanks
> 
> I'd like to keep the tx ring separation, also when it comes to
> resource allocation, so that we allocate xdp rings when needed, and
> not borrow them from the other type.
> I have two possible solutions in mind:
> 1) Based on the suggestion you have in v8, in which the rsv tx rings
> grow beyond the regular tx array, while maintaining the
> accounting.
> One improvement to make the code clearer and more readable is to
> explicitly define two fields for accounting the number of tx rings:
> - rename rsv_tx_rings to xdp_tx_rings.
I'll incorporate this one, since the code is changing a bit anyway to
accommodate prog-per-ring.
> - have a new priv->netdev_num_tx_rings = priv->num_tx_rings - xdp_tx_rings.
> and then modify driver rings iterators accordingly.
The ring iteration is sprinkled in too many places to incorporate
easily, so this makes sense as its own cleanup, with either (1) or (2)
from your proposal.
> 
> 2) Another solution I have in mind goes as follows.
> Expand the current tx_ring array to be two dimensional, where
> tx_ring[0] points to the regular tx array, and tx_ring[1] points to
> the xdp rings.
> We aim to keep using the same napi handlers, avoid code duplication,
> and make only minimal adjustments that do not harm performance.
> The idea is to mark each CQ's type at creation time, so that we know
> for each CQ whether it's for a regular tx ring or an XDP tx ring.
> When we get a completion, we can just read the CQ type, and then
> choose the correct tx_ring according to cq type and cq->ring,
> something like tx_ring[cq->cq_type][cq->ring]; (we can do it so that
> we avoid using an if statement).
> It is possible to use the existing enum cq_type field "cq->is_tx",
> rename it (say to 'type'), and expand the enum cq_type with an
> additional value for TX_XDP.
> In addition, similarly to the previous suggestion, here too we
> would want to use two separate fields for the number of tx rings,
> one for netdev and one for xdp, and modify the ring iterators
> accordingly.
I think (2) makes sense, but the proof will be in the code, depending on
how easy it is to review.
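
As a rough sketch of (2), where the TX_XDP value and two-dimensional
array layout are assumptions rather than code from this patch:

	enum cq_type {
		/* keep tx types first for the 2-d ring array */
		TX,
		TX_XDP,
		RX,
	};

	/* in the tx completion handler, pick the ring with no branch */
	struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->type][cq->ring];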
> 
> [1]:
> https://patchwork.ozlabs.org/patch/649620/
> https://patchwork.ozlabs.org/patch/649619/
> 
> Another (unrelated) nit: as you're already working on the next V,
> I suggest we rename priv->prog to priv->bpf_prog.
I think xdp_prog is better, but point taken. Will update.
> 
> Regards,
> Tariq

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 51a2e82..d3d51fa 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -47,7 +47,7 @@ 
 #define EN_ETHTOOL_SHORT_MASK cpu_to_be16(0xffff)
 #define EN_ETHTOOL_WORD_MASK  cpu_to_be32(0xffffffff)
 
-static int mlx4_en_moderation_update(struct mlx4_en_priv *priv)
+int mlx4_en_moderation_update(struct mlx4_en_priv *priv)
 {
 	int i;
 	int err = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 369a2ef..252c893d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2532,21 +2532,59 @@  static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 m
 static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
 	struct bpf_prog *old_prog;
+	int port_up = 0;
+	int err;
+
+	/* No need to reconfigure buffers when simply swapping the
+	 * program for a new one.
+	 */
+	if (READ_ONCE(priv->prog) && prog) {
+		/* This xchg is paired with READ_ONCE in the fast path, but is
+		 * also protected from itself via rtnl lock
+		 */
+		old_prog = xchg(&priv->prog, prog);
+		if (old_prog)
+			bpf_prog_put(old_prog);
+		return 0;
+	}
 
 	if (priv->num_frags > 1) {
 		en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
 		return -EOPNOTSUPP;
 	}
 
-	/* This xchg is paired with READ_ONCE in the fast path, but is
-	 * also protected from itself via rtnl lock
-	 */
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
+	mlx4_en_free_resources(priv);
+
 	old_prog = xchg(&priv->prog, prog);
 	if (old_prog)
 		bpf_prog_put(old_prog);
 
-	return 0;
+	err = mlx4_en_alloc_resources(priv);
+	if (err) {
+		en_err(priv, "Failed reallocating port resources\n");
+		goto out;
+	}
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err)
+			en_err(priv, "Failed starting port\n");
+	}
+
+	err = mlx4_en_moderation_update(priv);
+
+out:
+	if (err)
+		priv->prog = NULL;
+	mutex_unlock(&mdev->state_lock);
+	return err;
 }
 
 static bool mlx4_xdp_attached(struct net_device *dev)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index adfa123..f26306c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -57,7 +57,7 @@  static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
 	struct page *page;
 	dma_addr_t dma;
 
-	for (order = MLX4_EN_ALLOC_PREFER_ORDER; ;) {
+	for (order = frag_info->order; ;) {
 		gfp_t gfp = _gfp;
 
 		if (order)
@@ -70,7 +70,7 @@  static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
 			return -ENOMEM;
 	}
 	dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
-			   PCI_DMA_FROMDEVICE);
+			   frag_info->dma_dir);
 	if (dma_mapping_error(priv->ddev, dma)) {
 		put_page(page);
 		return -ENOMEM;
@@ -124,7 +124,8 @@  out:
 	while (i--) {
 		if (page_alloc[i].page != ring_alloc[i].page) {
 			dma_unmap_page(priv->ddev, page_alloc[i].dma,
-				page_alloc[i].page_size, PCI_DMA_FROMDEVICE);
+				page_alloc[i].page_size,
+				priv->frag_info[i].dma_dir);
 			page = page_alloc[i].page;
 			/* Revert changes done by mlx4_alloc_pages */
 			page_ref_sub(page, page_alloc[i].page_size /
@@ -145,7 +146,7 @@  static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
 
 	if (next_frag_end > frags[i].page_size)
 		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
-			       PCI_DMA_FROMDEVICE);
+			       frag_info->dma_dir);
 
 	if (frags[i].page)
 		put_page(frags[i].page);
@@ -176,7 +177,8 @@  out:
 
 		page_alloc = &ring->page_alloc[i];
 		dma_unmap_page(priv->ddev, page_alloc->dma,
-			       page_alloc->page_size, PCI_DMA_FROMDEVICE);
+			       page_alloc->page_size,
+			       priv->frag_info[i].dma_dir);
 		page = page_alloc->page;
 		/* Revert changes done by mlx4_alloc_pages */
 		page_ref_sub(page, page_alloc->page_size /
@@ -201,7 +203,7 @@  static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
 		       i, page_count(page_alloc->page));
 
 		dma_unmap_page(priv->ddev, page_alloc->dma,
-				page_alloc->page_size, PCI_DMA_FROMDEVICE);
+				page_alloc->page_size, frag_info->dma_dir);
 		while (page_alloc->page_offset + frag_info->frag_stride <
 		       page_alloc->page_size) {
 			put_page(page_alloc->page);
@@ -244,6 +246,12 @@  static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
 	struct mlx4_en_rx_alloc *frags = ring->rx_info +
 					(index << priv->log_rx_info);
 
+	if (ring->page_cache.index > 0) {
+		frags[0] = ring->page_cache.buf[--ring->page_cache.index];
+		rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
+		return 0;
+	}
+
 	return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp);
 }
 
@@ -502,12 +510,39 @@  void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv)
 	}
 }
 
+/* When the rx ring is running in page-per-packet mode, a released frame can go
+ * directly into a small cache, to avoid unmapping or touching the page
+ * allocator. In bpf prog performance scenarios, buffers are either forwarded
+ * or dropped, never converted to skbs, so every page can come directly from
+ * this cache when it is sized to be a multiple of the napi budget.
+ */
+bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
+			struct mlx4_en_rx_alloc *frame)
+{
+	struct mlx4_en_page_cache *cache = &ring->page_cache;
+
+	if (cache->index >= MLX4_EN_CACHE_SIZE)
+		return false;
+
+	cache->buf[cache->index++] = *frame;
+	return true;
+}
+
 void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
 			     struct mlx4_en_rx_ring **pring,
 			     u32 size, u16 stride)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_rx_ring *ring = *pring;
+	int i;
+
+	for (i = 0; i < ring->page_cache.index; i++) {
+		struct mlx4_en_rx_alloc *frame = &ring->page_cache.buf[i];
+
+		dma_unmap_page(priv->ddev, frame->dma, frame->page_size,
+			       priv->frag_info[0].dma_dir);
+		put_page(frame->page);
+	}
 
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
 	vfree(ring->rx_info);
@@ -863,6 +898,8 @@  int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 				bpf_warn_invalid_xdp_action(act);
 			case XDP_ABORTED:
 			case XDP_DROP:
+				if (mlx4_en_rx_recycle(ring, frags))
+					goto consumed;
 				goto next;
 			}
 		}
@@ -1018,6 +1055,7 @@  next:
 		for (nr = 0; nr < priv->num_frags; nr++)
 			mlx4_en_free_frag(priv, frags, nr);
 
+consumed:
 		++cq->mcq.cons_index;
 		index = (cq->mcq.cons_index) & ring->size_mask;
 		cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
@@ -1093,19 +1131,34 @@  static const int frag_sizes[] = {
 
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
+	enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE;
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
+	int order = MLX4_EN_ALLOC_PREFER_ORDER;
+	u32 align = SMP_CACHE_BYTES;
 	int buf_size = 0;
 	int i = 0;
 
+	/* bpf requires buffers to be set up as 1 packet per page.
+	 * This only works when num_frags == 1.
+	 */
+	if (priv->prog) {
+		/* This will gain efficient xdp frame recycling at the expense
+		 * of more costly truesize accounting
+		 */
+		align = PAGE_SIZE;
+		order = 0;
+	}
+
 	while (buf_size < eff_mtu) {
+		priv->frag_info[i].order = order;
 		priv->frag_info[i].frag_size =
 			(eff_mtu > buf_size + frag_sizes[i]) ?
 				frag_sizes[i] : eff_mtu - buf_size;
 		priv->frag_info[i].frag_prefix_size = buf_size;
 		priv->frag_info[i].frag_stride =
-				ALIGN(priv->frag_info[i].frag_size,
-				      SMP_CACHE_BYTES);
+				ALIGN(priv->frag_info[i].frag_size, align);
+		priv->frag_info[i].dma_dir = dma_dir;
 		buf_size += priv->frag_info[i].frag_size;
 		i++;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 35ecfa2..0e0ecd1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -259,6 +259,12 @@  struct mlx4_en_rx_alloc {
 	u32		page_size;
 };
 
+#define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)
+struct mlx4_en_page_cache {
+	u32 index;
+	struct mlx4_en_rx_alloc buf[MLX4_EN_CACHE_SIZE];
+};
+
 struct mlx4_en_tx_ring {
 	/* cache line used and dirtied in tx completion
 	 * (mlx4_en_free_tx_buf())
@@ -323,6 +329,7 @@  struct mlx4_en_rx_ring {
 	u8  fcs_del;
 	void *buf;
 	void *rx_info;
+	struct mlx4_en_page_cache page_cache;
 	unsigned long bytes;
 	unsigned long packets;
 	unsigned long csum_ok;
@@ -442,7 +449,9 @@  struct mlx4_en_mc_list {
 struct mlx4_en_frag_info {
 	u16 frag_size;
 	u16 frag_prefix_size;
-	u16 frag_stride;
+	u32 frag_stride;
+	enum dma_data_direction dma_dir;
+	int order;
 };
 
 #ifdef CONFIG_MLX4_EN_DCB
@@ -654,6 +663,7 @@  void mlx4_en_set_stats_bitmap(struct mlx4_dev *dev,
 
 void mlx4_en_free_resources(struct mlx4_en_priv *priv);
 int mlx4_en_alloc_resources(struct mlx4_en_priv *priv);
+int mlx4_en_moderation_update(struct mlx4_en_priv *priv);
 
 int mlx4_en_create_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq **pcq,
 		      int entries, int ring, enum cq_type mode, int node);