
[net-next,V3,1/2] cxgb4: Add support for FW_ETH_TX_PKT_VM_WR

Message ID 1530001483-17816-1-git-send-email-ganeshgr@chelsio.com
State Changes Requested, archived
Delegated to: David Miller
Series [net-next,V3,1/2] cxgb4: Add support for FW_ETH_TX_PKT_VM_WR

Commit Message

Ganesh Goudar June 26, 2018, 8:24 a.m. UTC
From: Arjun Vynipadath <arjun@chelsio.com>

The present TX work request (FW_ETH_TX_PKT_WR) can't be used for
host->VF communication, since it doesn't loop outgoing packets back
to the virtual interfaces on the same port. FW_ETH_TX_PKT_VM_WR does
provide that loopback.
The TX path therefore consults an ethtool private flag to determine
which WR to use. Support for setting this flag from user space is
added in the next commit.
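Concretely, the new t4_start_xmit() entry point (excerpted from the
sge.c hunk below; the comment is added here for clarity) dispatches
per port:

	netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		struct port_info *pi = netdev_priv(dev);

		/* Ports with PRIV_FLAG_PORT_TX_VM set use the VM work
		 * request so outgoing packets are looped back to VFs on
		 * the same port.
		 */
		if (unlikely(pi->eth_flags & PRIV_FLAG_PORT_TX_VM))
			return cxgb4_vf_eth_xmit(skb, dev);

		return cxgb4_eth_xmit(skb, dev);
	}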

Based on the original work by Casey Leedom <leedom@chelsio.com>

V3
- Made eth_flags type consistent across struct adapter and
  struct port_info.
V2
- Renamed t4_eth_xmit() and t4vf_eth_xmit(), since some compilers
  were warning about conflicting definitions in the cxgb4vf driver.

Signed-off-by: Casey Leedom <leedom@chelsio.com>
Signed-off-by: Arjun Vynipadath <arjun@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |  13 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   2 +-
 drivers/net/ethernet/chelsio/cxgb4/sge.c        | 372 +++++++++++++++++++++++-
 3 files changed, 383 insertions(+), 4 deletions(-)

Comments

kernel test robot June 26, 2018, 11:23 a.m. UTC | #1
Hi Arjun,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Ganesh-Goudar/cxgb4-Add-support-for-FW_ETH_TX_PKT_VM_WR/20180626-163628
config: x86_64-randconfig-x001-201825 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   drivers/net/ethernet/chelsio/cxgb4/sge.c: In function 'cxgb4_vf_eth_xmit':
>> drivers/net/ethernet/chelsio/cxgb4/sge.c:1646:18: error: assignment of read-only variable 'fw_hdr_copy_len'
     fw_hdr_copy_len = (sizeof(wr->ethmacdst) + sizeof(wr->ethmacsrc) +
                     ^
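
The error is because fw_hdr_copy_len is declared const but assigned
only later in the function body, which C does not allow. One possible
fix (a sketch, not taken from this submission) is to initialize the
variable at its declaration; sizeof() does not evaluate its operand,
so referencing fields through the not-yet-initialized wr pointer is
safe:

	/* Illustrative fix: give the const its value at declaration. */
	const size_t fw_hdr_copy_len = sizeof(wr->ethmacdst) +
				       sizeof(wr->ethmacsrc) +
				       sizeof(wr->ethtype) +
				       sizeof(wr->vlantci);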

vim +/fw_hdr_copy_len +1646 drivers/net/ethernet/chelsio/cxgb4/sge.c

  1622	
  1623	/**
  1624	 *	cxgb4_vf_eth_xmit - add a packet to an Ethernet TX queue
  1625	 *	@skb: the packet
  1626	 *	@dev: the egress net device
  1627	 *
  1628	 *	Add a packet to an SGE Ethernet TX queue.  Runs with softirqs disabled.
  1629	 */
  1630	static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb,
  1631					     struct net_device *dev)
  1632	{
  1633		dma_addr_t addr[MAX_SKB_FRAGS + 1];
  1634		const struct skb_shared_info *ssi;
  1635		struct fw_eth_tx_pkt_vm_wr *wr;
  1636		int qidx, credits, max_pkt_len;
  1637		const size_t fw_hdr_copy_len;
  1638		struct cpl_tx_pkt_core *cpl;
  1639		const struct port_info *pi;
  1640		unsigned int flits, ndesc;
  1641		struct sge_eth_txq *txq;
  1642		struct adapter *adapter;
  1643		u64 cntrl, *end;
  1644		u32 wr_mid;
  1645	
> 1646		fw_hdr_copy_len = (sizeof(wr->ethmacdst) + sizeof(wr->ethmacsrc) +
  1647				   sizeof(wr->ethtype) + sizeof(wr->vlantci));
  1648	
  1649		/* The chip minimum packet length is 10 octets but the firmware
  1650		 * command that we are using requires that we copy the Ethernet header
  1651		 * (including the VLAN tag) into the header so we reject anything
  1652		 * smaller than that ...
  1653		 */
  1654		if (unlikely(skb->len < fw_hdr_copy_len))
  1655			goto out_free;
  1656	
  1657		/* Discard the packet if the length is greater than mtu */
  1658		max_pkt_len = ETH_HLEN + dev->mtu;
  1659		if (skb_vlan_tag_present(skb))
  1660			max_pkt_len += VLAN_HLEN;
  1661		if (!skb_shinfo(skb)->gso_size && (unlikely(skb->len > max_pkt_len)))
  1662			goto out_free;
  1663	
  1664		/* Figure out which TX Queue we're going to use. */
  1665		pi = netdev_priv(dev);
  1666		adapter = pi->adapter;
  1667		qidx = skb_get_queue_mapping(skb);
  1668		WARN_ON(qidx >= pi->nqsets);
  1669		txq = &adapter->sge.ethtxq[pi->first_qset + qidx];
  1670	
  1671		/* Take this opportunity to reclaim any TX Descriptors whose DMA
  1672		 * transfers have completed.
  1673		 */
  1674		cxgb4_reclaim_completed_tx(adapter, &txq->q, true);
  1675	
  1676		/* Calculate the number of flits and TX Descriptors we're going to
  1677		 * need along with how many TX Descriptors will be left over after
  1678		 * we inject our Work Request.
  1679		 */
  1680		flits = t4vf_calc_tx_flits(skb);
  1681		ndesc = flits_to_desc(flits);
  1682		credits = txq_avail(&txq->q) - ndesc;
  1683	
  1684		if (unlikely(credits < 0)) {
  1685			/* Not enough room for this packet's Work Request.  Stop the
  1686			 * TX Queue and return a "busy" condition.  The queue will get
  1687			 * started later on when the firmware informs us that space
  1688			 * has opened up.
  1689			 */
  1690			eth_txq_stop(txq);
  1691			dev_err(adapter->pdev_dev,
  1692				"%s: TX ring %u full while queue awake!\n",
  1693				dev->name, qidx);
  1694			return NETDEV_TX_BUSY;
  1695		}
  1696	
  1697		if (!t4vf_is_eth_imm(skb) &&
  1698		    unlikely(cxgb4_map_skb(adapter->pdev_dev, skb, addr) < 0)) {
  1699			/* We need to map the skb into PCI DMA space (because it can't
  1700			 * be in-lined directly into the Work Request) and the mapping
  1701			 * operation failed.  Record the error and drop the packet.
  1702			 */
  1703			txq->mapping_err++;
  1704			goto out_free;
  1705		}
  1706	
  1707		wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
  1708		if (unlikely(credits < ETHTXQ_STOP_THRES)) {
  1709			/* After we're done injecting the Work Request for this
  1710			 * packet, we'll be below our "stop threshold" so stop the TX
  1711			 * Queue now and schedule a request for an SGE Egress Queue
  1712			 * Update message.  The queue will get started later on when
  1713			 * the firmware processes this Work Request and sends us an
  1714			 * Egress Queue Status Update message indicating that space
  1715			 * has opened up.
  1716			 */
  1717			eth_txq_stop(txq);
  1718			wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
  1719		}
  1720	
  1721		/* Start filling in our Work Request.  Note that we do _not_ handle
  1722		 * the WR Header wrapping around the TX Descriptor Ring.  If our
  1723		 * maximum header size ever exceeds one TX Descriptor, we'll need to
  1724		 * do something else here.
  1725		 */
  1726		WARN_ON(DIV_ROUND_UP(T4VF_ETHTXQ_MAX_HDR, TXD_PER_EQ_UNIT) > 1);
  1727		wr = (void *)&txq->q.desc[txq->q.pidx];
  1728		wr->equiq_to_len16 = cpu_to_be32(wr_mid);
  1729		wr->r3[0] = cpu_to_be32(0);
  1730		wr->r3[1] = cpu_to_be32(0);
  1731		skb_copy_from_linear_data(skb, (void *)wr->ethmacdst, fw_hdr_copy_len);
  1732		end = (u64 *)wr + flits;
  1733	
  1734		/* If this is a Large Send Offload packet we'll put in an LSO CPL
  1735		 * message with an encapsulated TX Packet CPL message.  Otherwise we
  1736		 * just use a TX Packet CPL message.
  1737		 */
  1738		ssi = skb_shinfo(skb);
  1739		if (ssi->gso_size) {
  1740			struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
  1741			bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0;
  1742			int l3hdr_len = skb_network_header_len(skb);
  1743			int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
  1744	
  1745			wr->op_immdlen =
  1746				cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
  1747					    FW_WR_IMMDLEN_V(sizeof(*lso) +
  1748							    sizeof(*cpl)));
  1749			 /* Fill in the LSO CPL message. */
  1750			lso->lso_ctrl =
  1751				cpu_to_be32(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
  1752					    LSO_FIRST_SLICE_F |
  1753					    LSO_LAST_SLICE_F |
  1754					    LSO_IPV6_V(v6) |
  1755					    LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
  1756					    LSO_IPHDR_LEN_V(l3hdr_len / 4) |
  1757					    LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
  1758			lso->ipid_ofst = cpu_to_be16(0);
  1759			lso->mss = cpu_to_be16(ssi->gso_size);
  1760			lso->seqno_offset = cpu_to_be32(0);
  1761			if (is_t4(adapter->params.chip))
  1762				lso->len = cpu_to_be32(skb->len);
  1763			else
  1764				lso->len = cpu_to_be32(LSO_T5_XFER_SIZE_V(skb->len));
  1765	
  1766			/* Set up TX Packet CPL pointer, control word and perform
  1767			 * accounting.
  1768			 */
  1769			cpl = (void *)(lso + 1);
  1770	
  1771			if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5)
  1772				cntrl = TXPKT_ETHHDR_LEN_V(eth_xtra_len);
  1773			else
  1774				cntrl = T6_TXPKT_ETHHDR_LEN_V(eth_xtra_len);
  1775	
  1776			cntrl |= TXPKT_CSUM_TYPE_V(v6 ?
  1777						   TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) |
  1778				 TXPKT_IPHDR_LEN_V(l3hdr_len);
  1779			txq->tso++;
  1780			txq->tx_cso += ssi->gso_segs;
  1781		} else {
  1782			int len;
  1783	
  1784			len = (t4vf_is_eth_imm(skb)
  1785			       ? skb->len + sizeof(*cpl)
  1786			       : sizeof(*cpl));
  1787			wr->op_immdlen =
  1788				cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
  1789					    FW_WR_IMMDLEN_V(len));
  1790	
  1791			/* Set up TX Packet CPL pointer, control word and perform
  1792			 * accounting.
  1793			 */
  1794			cpl = (void *)(wr + 1);
  1795			if (skb->ip_summed == CHECKSUM_PARTIAL) {
  1796				cntrl = hwcsum(adapter->params.chip, skb) |
  1797					TXPKT_IPCSUM_DIS_F;
  1798				txq->tx_cso++;
  1799			} else {
  1800				cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
  1801			}
  1802		}
  1803	
  1804		/* If there's a VLAN tag present, add that to the list of things to
  1805		 * do in this Work Request.
  1806		 */
  1807		if (skb_vlan_tag_present(skb)) {
  1808			txq->vlan_ins++;
  1809			cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
  1810		}
  1811	
  1812		 /* Fill in the TX Packet CPL message header. */
  1813		cpl->ctrl0 = cpu_to_be32(TXPKT_OPCODE_V(CPL_TX_PKT_XT) |
  1814					 TXPKT_INTF_V(pi->port_id) |
  1815					 TXPKT_PF_V(0));
  1816		cpl->pack = cpu_to_be16(0);
  1817		cpl->len = cpu_to_be16(skb->len);
  1818		cpl->ctrl1 = cpu_to_be64(cntrl);
  1819	
  1820		/* Fill in the body of the TX Packet CPL message with either in-lined
  1821		 * data or a Scatter/Gather List.
  1822		 */
  1823		if (t4vf_is_eth_imm(skb)) {
  1824			/* In-line the packet's data and free the skb since we don't
  1825			 * need it any longer.
  1826			 */
  1827			cxgb4_inline_tx_skb(skb, &txq->q, cpl + 1);
  1828			dev_consume_skb_any(skb);
  1829		} else {
  1830			/* Write the skb's Scatter/Gather list into the TX Packet CPL
  1831			 * message and retain a pointer to the skb so we can free it
  1832			 * later when its DMA completes.  (We store the skb pointer
  1833			 * in the Software Descriptor corresponding to the last TX
  1834			 * Descriptor used by the Work Request.)
  1835			 *
  1836			 * The retained skb will be freed when the corresponding TX
  1837			 * Descriptors are reclaimed after their DMAs complete.
  1838			 * However, this could take quite a while since, in general,
  1839			 * the hardware is set up to be lazy about sending DMA
  1840			 * completion notifications to us and we mostly perform TX
  1841			 * reclaims in the transmit routine.
  1842			 *
  1843		 * This is good for performance but means that we rely on new
  1844			 * TX packets arriving to run the destructors of completed
  1845			 * packets, which open up space in their sockets' send queues.
  1846			 * Sometimes we do not get such new packets causing TX to
  1847			 * stall.  A single UDP transmitter is a good example of this
  1848			 * situation.  We have a clean up timer that periodically
  1849			 * reclaims completed packets but it doesn't run often enough
  1850			 * (nor do we want it to) to prevent lengthy stalls.  A
  1851			 * solution to this problem is to run the destructor early,
  1852			 * after the packet is queued but before it's DMAd.  A con is
  1853			 * that we lie to socket memory accounting, but the amount of
  1854			 * extra memory is reasonable (limited by the number of TX
  1855			 * descriptors), the packets do actually get freed quickly by
  1856			 * new packets almost always, and for protocols like TCP that
  1857			 * wait for acks to really free up the data the extra memory
  1858			 * is even less.  On the positive side we run the destructors
  1859			 * on the sending CPU rather than on a potentially different
  1860			 * completing CPU, usually a good thing.
  1861			 *
  1862			 * Run the destructor before telling the DMA engine about the
  1863			 * packet to make sure it doesn't complete and get freed
  1864			 * prematurely.
  1865			 */
  1866			struct ulptx_sgl *sgl = (struct ulptx_sgl *)(cpl + 1);
  1867			struct sge_txq *tq = &txq->q;
  1868			int last_desc;
  1869	
  1870			/* If the Work Request header was an exact multiple of our TX
  1871			 * Descriptor length, then it's possible that the starting SGL
  1872			 * pointer lines up exactly with the end of our TX Descriptor
  1873			 * ring.  If that's the case, wrap around to the beginning
  1874			 * here ...
  1875			 */
  1876			if (unlikely((void *)sgl == (void *)tq->stat)) {
  1877				sgl = (void *)tq->desc;
  1878				end = (void *)((void *)tq->desc +
  1879					       ((void *)end - (void *)tq->stat));
  1880			}
  1881	
  1882			cxgb4_write_sgl(skb, tq, sgl, end, 0, addr);
  1883			skb_orphan(skb);
  1884	
  1885			last_desc = tq->pidx + ndesc - 1;
  1886			if (last_desc >= tq->size)
  1887				last_desc -= tq->size;
  1888			tq->sdesc[last_desc].skb = skb;
  1889			tq->sdesc[last_desc].sgl = sgl;
  1890		}
  1891	
  1892		/* Advance our internal TX Queue state, tell the hardware about
  1893		 * the new TX descriptors and return success.
  1894		 */
  1895		txq_advance(&txq->q, ndesc);
  1896	
  1897		cxgb4_ring_tx_db(adapter, &txq->q, ndesc);
  1898		return NETDEV_TX_OK;
  1899	
  1900	out_free:
  1901		/* An error of some sort happened.  Free the TX skb and tell the
  1902		 * OS that we've "dealt" with the packet ...
  1903		 */
  1904		dev_kfree_skb_any(skb);
  1905		return NETDEV_TX_OK;
  1906	}
  1907	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

Patch

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 1adb968..a4ea53d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -522,6 +522,15 @@  enum {
 	MAX_INGQ = MAX_ETH_QSETS + INGQ_EXTRAS,
 };
 
+enum {
+	PRIV_FLAG_PORT_TX_VM_BIT,
+};
+
+#define PRIV_FLAG_PORT_TX_VM		BIT(PRIV_FLAG_PORT_TX_VM_BIT)
+
+#define PRIV_FLAGS_ADAP			0
+#define PRIV_FLAGS_PORT			PRIV_FLAG_PORT_TX_VM
+
 struct adapter;
 struct sge_rspq;
 
@@ -558,6 +567,7 @@  struct port_info {
 	struct hwtstamp_config tstamp_config;
 	bool ptp_enable;
 	struct sched_table *sched_tbl;
+	u32 eth_flags;
 };
 
 struct dentry;
@@ -868,6 +878,7 @@  struct adapter {
 	unsigned int flags;
 	unsigned int adap_idx;
 	enum chip_type chip;
+	u32 eth_flags;
 
 	int msg_enable;
 	__be16 vxlan_port;
@@ -1334,7 +1345,7 @@  void t4_os_link_changed(struct adapter *adap, int port_id, int link_stat);
 void t4_free_sge_resources(struct adapter *adap);
 void t4_free_ofld_rxqs(struct adapter *adap, int n, struct sge_ofld_rxq *q);
 irq_handler_t t4_intr_handler(struct adapter *adap);
-netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev);
+netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev);
 int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
 		     const struct pkt_gl *gl);
 int t4_mgmt_tx(struct adapter *adap, struct sk_buff *skb);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index bc03c17..d3b0f9c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3217,7 +3217,7 @@  static netdev_features_t cxgb_fix_features(struct net_device *dev,
 static const struct net_device_ops cxgb4_netdev_ops = {
 	.ndo_open             = cxgb_open,
 	.ndo_stop             = cxgb_close,
-	.ndo_start_xmit       = t4_eth_xmit,
+	.ndo_start_xmit       = t4_start_xmit,
 	.ndo_select_queue     =	cxgb_select_queue,
 	.ndo_get_stats64      = cxgb_get_stats,
 	.ndo_set_rx_mode      = cxgb_set_rxmode,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 395e2a0..f1311fd 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -1288,13 +1288,13 @@  static inline void t6_fill_tnl_lso(struct sk_buff *skb,
 }
 
 /**
- *	t4_eth_xmit - add a packet to an Ethernet Tx queue
+ *	cxgb4_eth_xmit - add a packet to an Ethernet Tx queue
  *	@skb: the packet
  *	@dev: the egress net device
  *
  *	Add a packet to an SGE Ethernet Tx queue.  Runs with softirqs disabled.
  */
-netdev_tx_t t4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	u32 wr_mid, ctrl0, op;
 	u64 cntrl, *end, *sgl;
@@ -1547,6 +1547,374 @@  out_free:	dev_kfree_skb_any(skb);
 	return NETDEV_TX_OK;
 }
 
+/* Constants ... */
+enum {
+	/* Egress Queue sizes, producer and consumer indices are all in units
+	 * of Egress Context Units bytes.  Note that as far as the hardware is
+	 * concerned, the free list is an Egress Queue (the host produces free
+	 * buffers which the hardware consumes) and free list entries are
+	 * 64-bit PCI DMA addresses.
+	 */
+	EQ_UNIT = SGE_EQ_IDXSIZE,
+	FL_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64),
+	TXD_PER_EQ_UNIT = EQ_UNIT / sizeof(__be64),
+
+	T4VF_ETHTXQ_MAX_HDR = (sizeof(struct fw_eth_tx_pkt_vm_wr) +
+			       sizeof(struct cpl_tx_pkt_lso_core) +
+			       sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64),
+};
+
+/**
+ *	t4vf_is_eth_imm - can an Ethernet packet be sent as immediate data?
+ *	@skb: the packet
+ *
+ *	Returns whether an Ethernet packet is small enough to fit completely as
+ *	immediate data.
+ */
+static inline int t4vf_is_eth_imm(const struct sk_buff *skb)
+{
+	/* The VF Driver uses the FW_ETH_TX_PKT_VM_WR firmware Work Request
+	 * which does not accommodate immediate data.  We could dike out all
+	 * of the support code for immediate data but that would tie our hands
+	 * too much if we ever want to enhance the firmware.  It would also
+	 * create more differences between the PF and VF Drivers.
+	 */
+	return false;
+}
+
+/**
+ *	t4vf_calc_tx_flits - calculate the number of flits for a packet TX WR
+ *	@skb: the packet
+ *
+ *	Returns the number of flits needed for a TX Work Request for the
+ *	given Ethernet packet, including the needed WR and CPL headers.
+ */
+static inline unsigned int t4vf_calc_tx_flits(const struct sk_buff *skb)
+{
+	unsigned int flits;
+
+	/* If the skb is small enough, we can pump it out as a work request
+	 * with only immediate data.  In that case we just have to have the
+	 * TX Packet header plus the skb data in the Work Request.
+	 */
+	if (t4vf_is_eth_imm(skb))
+		return DIV_ROUND_UP(skb->len + sizeof(struct cpl_tx_pkt),
+				    sizeof(__be64));
+
+	/* Otherwise, we're going to have to construct a Scatter gather list
+	 * of the skb body and fragments.  We also include the flits necessary
+	 * for the TX Packet Work Request and CPL.  We always have a firmware
+	 * Write Header (incorporated as part of the cpl_tx_pkt_lso and
+	 * cpl_tx_pkt structures), followed by either a TX Packet Write CPL
+	 * message or, if we're doing a Large Send Offload, an LSO CPL message
+	 * with an embedded TX Packet Write CPL message.
+	 */
+	flits = sgl_len(skb_shinfo(skb)->nr_frags + 1);
+	if (skb_shinfo(skb)->gso_size)
+		flits += (sizeof(struct fw_eth_tx_pkt_vm_wr) +
+			  sizeof(struct cpl_tx_pkt_lso_core) +
+			  sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
+	else
+		flits += (sizeof(struct fw_eth_tx_pkt_vm_wr) +
+			  sizeof(struct cpl_tx_pkt_core)) / sizeof(__be64);
+	return flits;
+}
+
+/**
+ *	cxgb4_vf_eth_xmit - add a packet to an Ethernet TX queue
+ *	@skb: the packet
+ *	@dev: the egress net device
+ *
+ *	Add a packet to an SGE Ethernet TX queue.  Runs with softirqs disabled.
+ */
+static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb,
+				     struct net_device *dev)
+{
+	dma_addr_t addr[MAX_SKB_FRAGS + 1];
+	const struct skb_shared_info *ssi;
+	struct fw_eth_tx_pkt_vm_wr *wr;
+	int qidx, credits, max_pkt_len;
+	const size_t fw_hdr_copy_len;
+	struct cpl_tx_pkt_core *cpl;
+	const struct port_info *pi;
+	unsigned int flits, ndesc;
+	struct sge_eth_txq *txq;
+	struct adapter *adapter;
+	u64 cntrl, *end;
+	u32 wr_mid;
+
+	fw_hdr_copy_len = (sizeof(wr->ethmacdst) + sizeof(wr->ethmacsrc) +
+			   sizeof(wr->ethtype) + sizeof(wr->vlantci));
+
+	/* The chip minimum packet length is 10 octets but the firmware
+	 * command that we are using requires that we copy the Ethernet header
+	 * (including the VLAN tag) into the header so we reject anything
+	 * smaller than that ...
+	 */
+	if (unlikely(skb->len < fw_hdr_copy_len))
+		goto out_free;
+
+	/* Discard the packet if the length is greater than mtu */
+	max_pkt_len = ETH_HLEN + dev->mtu;
+	if (skb_vlan_tag_present(skb))
+		max_pkt_len += VLAN_HLEN;
+	if (!skb_shinfo(skb)->gso_size && (unlikely(skb->len > max_pkt_len)))
+		goto out_free;
+
+	/* Figure out which TX Queue we're going to use. */
+	pi = netdev_priv(dev);
+	adapter = pi->adapter;
+	qidx = skb_get_queue_mapping(skb);
+	WARN_ON(qidx >= pi->nqsets);
+	txq = &adapter->sge.ethtxq[pi->first_qset + qidx];
+
+	/* Take this opportunity to reclaim any TX Descriptors whose DMA
+	 * transfers have completed.
+	 */
+	cxgb4_reclaim_completed_tx(adapter, &txq->q, true);
+
+	/* Calculate the number of flits and TX Descriptors we're going to
+	 * need along with how many TX Descriptors will be left over after
+	 * we inject our Work Request.
+	 */
+	flits = t4vf_calc_tx_flits(skb);
+	ndesc = flits_to_desc(flits);
+	credits = txq_avail(&txq->q) - ndesc;
+
+	if (unlikely(credits < 0)) {
+		/* Not enough room for this packet's Work Request.  Stop the
+		 * TX Queue and return a "busy" condition.  The queue will get
+		 * started later on when the firmware informs us that space
+		 * has opened up.
+		 */
+		eth_txq_stop(txq);
+		dev_err(adapter->pdev_dev,
+			"%s: TX ring %u full while queue awake!\n",
+			dev->name, qidx);
+		return NETDEV_TX_BUSY;
+	}
+
+	if (!t4vf_is_eth_imm(skb) &&
+	    unlikely(cxgb4_map_skb(adapter->pdev_dev, skb, addr) < 0)) {
+		/* We need to map the skb into PCI DMA space (because it can't
+		 * be in-lined directly into the Work Request) and the mapping
+		 * operation failed.  Record the error and drop the packet.
+		 */
+		txq->mapping_err++;
+		goto out_free;
+	}
+
+	wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
+	if (unlikely(credits < ETHTXQ_STOP_THRES)) {
+		/* After we're done injecting the Work Request for this
+		 * packet, we'll be below our "stop threshold" so stop the TX
+		 * Queue now and schedule a request for an SGE Egress Queue
+		 * Update message.  The queue will get started later on when
+		 * the firmware processes this Work Request and sends us an
+		 * Egress Queue Status Update message indicating that space
+		 * has opened up.
+		 */
+		eth_txq_stop(txq);
+		wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
+	}
+
+	/* Start filling in our Work Request.  Note that we do _not_ handle
+	 * the WR Header wrapping around the TX Descriptor Ring.  If our
+	 * maximum header size ever exceeds one TX Descriptor, we'll need to
+	 * do something else here.
+	 */
+	WARN_ON(DIV_ROUND_UP(T4VF_ETHTXQ_MAX_HDR, TXD_PER_EQ_UNIT) > 1);
+	wr = (void *)&txq->q.desc[txq->q.pidx];
+	wr->equiq_to_len16 = cpu_to_be32(wr_mid);
+	wr->r3[0] = cpu_to_be32(0);
+	wr->r3[1] = cpu_to_be32(0);
+	skb_copy_from_linear_data(skb, (void *)wr->ethmacdst, fw_hdr_copy_len);
+	end = (u64 *)wr + flits;
+
+	/* If this is a Large Send Offload packet we'll put in an LSO CPL
+	 * message with an encapsulated TX Packet CPL message.  Otherwise we
+	 * just use a TX Packet CPL message.
+	 */
+	ssi = skb_shinfo(skb);
+	if (ssi->gso_size) {
+		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
+		bool v6 = (ssi->gso_type & SKB_GSO_TCPV6) != 0;
+		int l3hdr_len = skb_network_header_len(skb);
+		int eth_xtra_len = skb_network_offset(skb) - ETH_HLEN;
+
+		wr->op_immdlen =
+			cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
+				    FW_WR_IMMDLEN_V(sizeof(*lso) +
+						    sizeof(*cpl)));
+		 /* Fill in the LSO CPL message. */
+		lso->lso_ctrl =
+			cpu_to_be32(LSO_OPCODE_V(CPL_TX_PKT_LSO) |
+				    LSO_FIRST_SLICE_F |
+				    LSO_LAST_SLICE_F |
+				    LSO_IPV6_V(v6) |
+				    LSO_ETHHDR_LEN_V(eth_xtra_len / 4) |
+				    LSO_IPHDR_LEN_V(l3hdr_len / 4) |
+				    LSO_TCPHDR_LEN_V(tcp_hdr(skb)->doff));
+		lso->ipid_ofst = cpu_to_be16(0);
+		lso->mss = cpu_to_be16(ssi->gso_size);
+		lso->seqno_offset = cpu_to_be32(0);
+		if (is_t4(adapter->params.chip))
+			lso->len = cpu_to_be32(skb->len);
+		else
+			lso->len = cpu_to_be32(LSO_T5_XFER_SIZE_V(skb->len));
+
+		/* Set up TX Packet CPL pointer, control word and perform
+		 * accounting.
+		 */
+		cpl = (void *)(lso + 1);
+
+		if (CHELSIO_CHIP_VERSION(adapter->params.chip) <= CHELSIO_T5)
+			cntrl = TXPKT_ETHHDR_LEN_V(eth_xtra_len);
+		else
+			cntrl = T6_TXPKT_ETHHDR_LEN_V(eth_xtra_len);
+
+		cntrl |= TXPKT_CSUM_TYPE_V(v6 ?
+					   TX_CSUM_TCPIP6 : TX_CSUM_TCPIP) |
+			 TXPKT_IPHDR_LEN_V(l3hdr_len);
+		txq->tso++;
+		txq->tx_cso += ssi->gso_segs;
+	} else {
+		int len;
+
+		len = (t4vf_is_eth_imm(skb)
+		       ? skb->len + sizeof(*cpl)
+		       : sizeof(*cpl));
+		wr->op_immdlen =
+			cpu_to_be32(FW_WR_OP_V(FW_ETH_TX_PKT_VM_WR) |
+				    FW_WR_IMMDLEN_V(len));
+
+		/* Set up TX Packet CPL pointer, control word and perform
+		 * accounting.
+		 */
+		cpl = (void *)(wr + 1);
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			cntrl = hwcsum(adapter->params.chip, skb) |
+				TXPKT_IPCSUM_DIS_F;
+			txq->tx_cso++;
+		} else {
+			cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
+		}
+	}
+
+	/* If there's a VLAN tag present, add that to the list of things to
+	 * do in this Work Request.
+	 */
+	if (skb_vlan_tag_present(skb)) {
+		txq->vlan_ins++;
+		cntrl |= TXPKT_VLAN_VLD_F | TXPKT_VLAN_V(skb_vlan_tag_get(skb));
+	}
+
+	 /* Fill in the TX Packet CPL message header. */
+	cpl->ctrl0 = cpu_to_be32(TXPKT_OPCODE_V(CPL_TX_PKT_XT) |
+				 TXPKT_INTF_V(pi->port_id) |
+				 TXPKT_PF_V(0));
+	cpl->pack = cpu_to_be16(0);
+	cpl->len = cpu_to_be16(skb->len);
+	cpl->ctrl1 = cpu_to_be64(cntrl);
+
+	/* Fill in the body of the TX Packet CPL message with either in-lined
+	 * data or a Scatter/Gather List.
+	 */
+	if (t4vf_is_eth_imm(skb)) {
+		/* In-line the packet's data and free the skb since we don't
+		 * need it any longer.
+		 */
+		cxgb4_inline_tx_skb(skb, &txq->q, cpl + 1);
+		dev_consume_skb_any(skb);
+	} else {
+		/* Write the skb's Scatter/Gather list into the TX Packet CPL
+		 * message and retain a pointer to the skb so we can free it
+		 * later when its DMA completes.  (We store the skb pointer
+		 * in the Software Descriptor corresponding to the last TX
+		 * Descriptor used by the Work Request.)
+		 *
+		 * The retained skb will be freed when the corresponding TX
+		 * Descriptors are reclaimed after their DMAs complete.
+		 * However, this could take quite a while since, in general,
+		 * the hardware is set up to be lazy about sending DMA
+		 * completion notifications to us and we mostly perform TX
+		 * reclaims in the transmit routine.
+		 *
+		 * This is good for performance but means that we rely on new
+		 * TX packets arriving to run the destructors of completed
+		 * packets, which open up space in their sockets' send queues.
+		 * Sometimes we do not get such new packets causing TX to
+		 * stall.  A single UDP transmitter is a good example of this
+		 * situation.  We have a clean up timer that periodically
+		 * reclaims completed packets but it doesn't run often enough
+		 * (nor do we want it to) to prevent lengthy stalls.  A
+		 * solution to this problem is to run the destructor early,
+		 * after the packet is queued but before it's DMAd.  A con is
+		 * that we lie to socket memory accounting, but the amount of
+		 * extra memory is reasonable (limited by the number of TX
+		 * descriptors), the packets do actually get freed quickly by
+		 * new packets almost always, and for protocols like TCP that
+		 * wait for acks to really free up the data the extra memory
+		 * is even less.  On the positive side we run the destructors
+		 * on the sending CPU rather than on a potentially different
+		 * completing CPU, usually a good thing.
+		 *
+		 * Run the destructor before telling the DMA engine about the
+		 * packet to make sure it doesn't complete and get freed
+		 * prematurely.
+		 */
+		struct ulptx_sgl *sgl = (struct ulptx_sgl *)(cpl + 1);
+		struct sge_txq *tq = &txq->q;
+		int last_desc;
+
+		/* If the Work Request header was an exact multiple of our TX
+		 * Descriptor length, then it's possible that the starting SGL
+		 * pointer lines up exactly with the end of our TX Descriptor
+		 * ring.  If that's the case, wrap around to the beginning
+		 * here ...
+		 */
+		if (unlikely((void *)sgl == (void *)tq->stat)) {
+			sgl = (void *)tq->desc;
+			end = (void *)((void *)tq->desc +
+				       ((void *)end - (void *)tq->stat));
+		}
+
+		cxgb4_write_sgl(skb, tq, sgl, end, 0, addr);
+		skb_orphan(skb);
+
+		last_desc = tq->pidx + ndesc - 1;
+		if (last_desc >= tq->size)
+			last_desc -= tq->size;
+		tq->sdesc[last_desc].skb = skb;
+		tq->sdesc[last_desc].sgl = sgl;
+	}
+
+	/* Advance our internal TX Queue state, tell the hardware about
+	 * the new TX descriptors and return success.
+	 */
+	txq_advance(&txq->q, ndesc);
+
+	cxgb4_ring_tx_db(adapter, &txq->q, ndesc);
+	return NETDEV_TX_OK;
+
+out_free:
+	/* An error of some sort happened.  Free the TX skb and tell the
+	 * OS that we've "dealt" with the packet ...
+	 */
+	dev_kfree_skb_any(skb);
+	return NETDEV_TX_OK;
+}
+
+netdev_tx_t t4_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+
+	if (unlikely(pi->eth_flags & PRIV_FLAG_PORT_TX_VM))
+		return cxgb4_vf_eth_xmit(skb, dev);
+
+	return cxgb4_eth_xmit(skb, dev);
+}
+
 /**
  *	reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
  *	@q: the SGE control Tx queue