[RFC,net-next,1/3] ixgbe: support netdev_ops->ndo_xmit_flush()

Message ID 1408887738-7661-2-git-send-email-dborkman@redhat.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Daniel Borkmann Aug. 24, 2014, 1:42 p.m. UTC
This implements the deferred tail pointer flush API for the ixgbe
driver. A similar version was also proposed some time ago by
Alexander Duyck.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 28 +++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)
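
For context, the core side of this API calls the new hook right after
->ndo_start_xmit() succeeds, which is what lets the driver defer the
tail MMIO write.  A rough sketch of the calling convention (simplified;
the exact helper in the core net-next patch may differ):

static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb,
                                            struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        u16 queue = skb->queue_mapping; /* saved before skb may be consumed */
        netdev_tx_t rc;

        rc = ops->ndo_start_xmit(skb, dev); /* queues descriptors, no tail write */
        if (rc == NETDEV_TX_OK && ops->ndo_xmit_flush)
                ops->ndo_xmit_flush(dev, queue); /* one MMIO tail bump */

        return rc;
}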

Comments

David Miller Aug. 25, 2014, 5:55 a.m. UTC | #1
From: Daniel Borkmann <dborkman@redhat.com>
Date: Sun, 24 Aug 2014 15:42:16 +0200

> +	/* we need this if more than one processor can write to our tail
> +	 * at a time, it synchronizes IO on IA64/Altix systems
> +	 */
> +	mmiowb();

Unlike for IGB, this doesn't exist in the IXGBE driver; please do not
add it.
Jesper Dangaard Brouer Aug. 25, 2014, 12:07 p.m. UTC | #2
On Sun, 24 Aug 2014 15:42:16 +0200
Daniel Borkmann <dborkman@redhat.com> wrote:

> This implements the deferred tail pointer flush API for the ixgbe
> driver. A similar version was also proposed some time ago by
> Alexander Duyck.

I've run some benchmarks with only this patch applied, and they
actually show a performance regression.

Using trafgen with QDISC_BYPASS and mmap mode, via cmdline:
 trafgen --cpp  --dev eth5 --conf udp_example01.trafgen -V --cpus 1

BASELINE(no-patch): trafgen QDISC_BYPASS and mmap:
 - tx:1562539 pps

(This patch only): ixgbe use of .ndo_xmit_flush.
 - tx:1532299 pps

Regression: -30240 pps
 * In nanosec: (1/1562539*10^9)-(1/1532299*10^9) = -12.63 ns


As DaveM points out, we might not need the mmiowb().
Result when not performing the mmiowb():
 - tx:1548352 pps

Still a small regression: -14187 pps
 * In nanosec: (1/1562539*10^9)-(1/1548352*10^9) = -5.86 ns


I was not expecting this "slowdown" with this rather simple use of the
new ndo_xmit_flush API.  Can anyone explain why this is happening?
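
(The pps figures convert to per-packet nanoseconds as in the formulas
above; a minimal standalone check of the arithmetic, using the first
pair of numbers:)

#include <stdio.h>

int main(void)
{
        double base    = 1562539.0; /* baseline pps */
        double patched = 1532299.0; /* pps with .ndo_xmit_flush + mmiowb() */

        /* nanoseconds per packet = 10^9 / pps */
        printf("delta: %.2f ns/pkt\n", 1e9 / base - 1e9 / patched);
        return 0; /* prints: delta: -12.63 ns/pkt */
}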

> Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 28 +++++++++++++++++++++++----
>  1 file changed, 24 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> index 87bd53f..4e073cf 100644
> --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
> @@ -6957,10 +6957,6 @@ static void ixgbe_tx_map(struct ixgbe_ring *tx_ring,
>  		i = 0;
>  
>  	tx_ring->next_to_use = i;
> -
> -	/* notify HW of packet */
> -	ixgbe_write_tail(tx_ring, i);
> -
>  	return;
>  dma_error:
>  	dev_err(tx_ring->dev, "TX DMA map failed\n");
> @@ -7301,6 +7297,29 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
>  	return __ixgbe_xmit_frame(skb, netdev, NULL);
>  }
>  
> +static inline struct ixgbe_ring *
> +__ixgb_tx_queue_mapping(struct ixgbe_adapter *adapter, u16 queue)
> +{
> +	if (unlikely(queue >= adapter->num_tx_queues))
> +		queue = queue % adapter->num_tx_queues;
> +
> +	return adapter->tx_ring[queue];
> +}
> +
> +static void ixgbe_xmit_flush(struct net_device *netdev, u16 queue)
> +{
> +	struct ixgbe_adapter *adapter = netdev_priv(netdev);
> +	struct ixgbe_ring *tx_ring;
> +
> +	tx_ring = __ixgb_tx_queue_mapping(adapter, queue);
> +	ixgbe_write_tail(tx_ring, tx_ring->next_to_use);
> +
> +	/* we need this if more than one processor can write to our tail
> +	 * at a time, it synchronizes IO on IA64/Altix systems
> +	 */
> +	mmiowb();

This is the mmiowb() which is avoided in above measurement.

> +}
> +
>  /**
>   * ixgbe_set_mac - Change the Ethernet Address of the NIC
>   * @netdev: network interface device structure
> @@ -7914,6 +7933,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
>  	.ndo_open		= ixgbe_open,
>  	.ndo_stop		= ixgbe_close,
>  	.ndo_start_xmit		= ixgbe_xmit_frame,
> +	.ndo_xmit_flush		= ixgbe_xmit_flush,
>  	.ndo_select_queue	= ixgbe_select_queue,
>  	.ndo_set_rx_mode	= ixgbe_set_rx_mode,
>  	.ndo_validate_addr	= eth_validate_addr,
Duyck, Alexander H Aug. 25, 2014, 10:51 p.m. UTC | #3
On 08/25/2014 05:07 AM, Jesper Dangaard Brouer wrote:
> On Sun, 24 Aug 2014 15:42:16 +0200
> Daniel Borkmann <dborkman@redhat.com> wrote:
> 
>> This implements the deferred tail pointer flush API for the ixgbe
>> driver. A similar version was also proposed some time ago by
>> Alexander Duyck.
> 
> I've run some benchmarks with only this patch applied, and they
> actually show a performance regression.
> 
> Using trafgen with QDISC_BYPASS and mmap mode, via cmdline:
>  trafgen --cpp  --dev eth5 --conf udp_example01.trafgen -V --cpus 1
> 
> BASELINE(no-patch): trafgen QDISC_BYPASS and mmap:
>  - tx:1562539 pps
> 
> (This patch only): ixgbe use of .ndo_xmit_flush.
>  - tx:1532299 pps
> 
> Regression: -30240 pps
>  * In nanosec: (1/1562539*10^9)-(1/1532299*10^9) = -12.63 ns
> 
> 
> As DaveM points out, we might not need the mmiowb().
> Result when not performing the mmiowb():
>  - tx:1548352 pps
> 
> Still a small regression: -14187 pps
>  * In nanosec: (1/1562539*10^9)-(1/1548352*10^9) = -5.86 ns
> 
> 
> I was not expecting this "slowdown" with this rather simple use of the
> new ndo_xmit_flush API.  Can anyone explain why this is happening?

One possibility is that we are now doing less stuff between the time we
write tail and when we grab the qdisc lock (locked transactions are
stalled by MMIO) so that we are spending more time stuck waiting for the
write to complete and doing nothing.

Then of course there are always the funny oddball quirks, such as the
code changes altering the alignment of a loop and thereby making Tx
cleanup more expensive than it was before.

Thanks,

Alex
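
A toy sketch of the stall hypothesis above; these function names are
placeholders, not real kernel or driver symbols:

/* Old flow: the tail write is posted inside ndo_start_xmit(), and the
 * bookkeeping that follows overlaps with the posted write draining. */
write_tail_mmio(ring);   /* posted MMIO write */
unmap_and_count(ring);   /* useful work overlaps the drain */
spin_lock(&txq->lock);   /* write has likely completed by now */

/* New flow: the tail write moves to ndo_xmit_flush(), which runs
 * immediately before the caller re-takes the lock, so the locked
 * operation stalls behind the still-pending MMIO write. */
unmap_and_count(ring);
write_tail_mmio(ring);   /* posted MMIO write */
spin_lock(&txq->lock);   /* stalls until the MMIO drains */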
Jesper Dangaard Brouer Aug. 26, 2014, 6:44 a.m. UTC | #4
On Mon, 25 Aug 2014 15:51:50 -0700
Alexander Duyck <alexander.h.duyck@intel.com> wrote:

> On 08/25/2014 05:07 AM, Jesper Dangaard Brouer wrote:
> > On Sun, 24 Aug 2014 15:42:16 +0200
> > Daniel Borkmann <dborkman@redhat.com> wrote:
> > 
> >> This implements the deferred tail pointer flush API for the ixgbe
> >> driver. A similar version was also proposed some time ago by
> >> Alexander Duyck.
> > 
> > I've run some benchmarks with only this patch applied, and they
> > actually show a performance regression.
> > 
> > Using trafgen with QDISC_BYPASS and mmap mode, via cmdline:
> >  trafgen --cpp  --dev eth5 --conf udp_example01.trafgen -V --cpus 1
> > 
> > BASELINE(no-patch): trafgen QDISC_BYPASS and mmap:
> >  - tx:1562539 pps
> > 
> > (This patch only): ixgbe use of .ndo_xmit_flush.
> >  - tx:1532299 pps
> > 
> > Regression: -30240 pps
> >  * In nanosec: (1/1562539*10^9)-(1/1532299*10^9) = -12.63 ns
> > 
> > 
> > As DaveM points out, we might not need the mmiowb().
> > Result when not performing the mmiowb():
> >  - tx:1548352 pps
> > 
> > Still a small regression: -14187 pps
> >  * In nanosec: (1/1562539*10^9)-(1/1548352*10^9) = -5.86 ns
> > 
> > 
> > I was not expecting this "slowdown" with this rather simple use of the
> > new ndo_xmit_flush API.  Can anyone explain why this is happening?
> 
> One possibility is that we are now doing less stuff between the time we
> write tail and when we grab the qdisc lock (locked transactions are
> stalled by MMIO) so that we are spending more time stuck waiting for the
> write to complete and doing nothing.

In this testcase we are bypassing the qdisc code path, but still taking
the HARD_TX_LOCK.  I was only expecting something in the area of -2 ns
due to the extra function call overhead.

But when we include the qdisc code path, the performance regression
gets even worse.  I would like an explanation for that; see:

 http://thread.gmane.org/gmane.linux.network/327254/focus=327431


> Then of course there are always the funny oddball quirks, such as the
> code changes altering the alignment of a loop and thereby making Tx
> cleanup more expensive than it was before.

Yes, this is when it gets hairy!
Jesper Dangaard Brouer Aug. 27, 2014, 11:34 a.m. UTC | #5
On Mon, 25 Aug 2014 14:07:21 +0200
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> On Sun, 24 Aug 2014 15:42:16 +0200
> Daniel Borkmann <dborkman@redhat.com> wrote:
> 
> > This implements the deferred tail pointer flush API for the ixgbe
> > driver. A similar version was also proposed some time ago by
> > Alexander Duyck.
> 
> I've run some benchmarks with only this patch applied, and they
> actually show a performance regression.
> 
[...]
>
> Still a small regression: -14187 pps
>  * In nanosec: (1/1562539*10^9)-(1/1548352*10^9) = -5.86 ns
>  
> I was not expecting this "slowdown" with this rather simple use of the
> new ndo_xmit_flush API.  Can anyone explain why this is happening?

I've re-run this experiment with more accuracy (C-state tuning, no
Hyper-Threading, and using pktgen); see the description in the thread
with subject "Get rid of ndo_xmit_flush" [1].

DaveM was right in reverting this API; according to my new, more
accurate measurements, the conclusion is the same: this API hurts
performance.

Compared to baseline, with this patch applied (but without the mmiowb()):
 * (1/5609929*10^9)-(1/5388719*10^9) = -7.32 ns

Details below signature.

[1] http://thread.gmane.org/gmane.linux.network/327502/focus=327803

Patch

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 87bd53f..4e073cf 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -6957,10 +6957,6 @@  static void ixgbe_tx_map(struct ixgbe_ring *tx_ring,
 		i = 0;
 
 	tx_ring->next_to_use = i;
-
-	/* notify HW of packet */
-	ixgbe_write_tail(tx_ring, i);
-
 	return;
 dma_error:
 	dev_err(tx_ring->dev, "TX DMA map failed\n");
@@ -7301,6 +7297,29 @@  static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
 	return __ixgbe_xmit_frame(skb, netdev, NULL);
 }
 
+static inline struct ixgbe_ring *
+__ixgb_tx_queue_mapping(struct ixgbe_adapter *adapter, u16 queue)
+{
+	if (unlikely(queue >= adapter->num_tx_queues))
+		queue = queue % adapter->num_tx_queues;
+
+	return adapter->tx_ring[queue];
+}
+
+static void ixgbe_xmit_flush(struct net_device *netdev, u16 queue)
+{
+	struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_ring *tx_ring;
+
+	tx_ring = __ixgb_tx_queue_mapping(adapter, queue);
+	ixgbe_write_tail(tx_ring, tx_ring->next_to_use);
+
+	/* we need this if more than one processor can write to our tail
+	 * at a time, it synchronizes IO on IA64/Altix systems
+	 */
+	mmiowb();
+}
+
 /**
  * ixgbe_set_mac - Change the Ethernet Address of the NIC
  * @netdev: network interface device structure
@@ -7914,6 +7933,7 @@  static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_open		= ixgbe_open,
 	.ndo_stop		= ixgbe_close,
 	.ndo_start_xmit		= ixgbe_xmit_frame,
+	.ndo_xmit_flush		= ixgbe_xmit_flush,
 	.ndo_select_queue	= ixgbe_select_queue,
 	.ndo_set_rx_mode	= ixgbe_set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,