diff mbox series

[net-next,RFC,v1,05/10] nvme-tcp: Add DDP offload control path

Message ID 20200930162010.21610-6-borisp@mellanox.com
State RFC
Delegated to: David Miller
Headers show
Series nvme-tcp receive offloads | expand

Commit Message

Boris Pismenny Sept. 30, 2020, 4:20 p.m. UTC
This commit introduces direct data placement offload for NVMe-TCP.
There is a context per queue, which is established after the handshake
using the tcp_ddp_sk_add/del NDOs.

Additionally, a resynchronization routine is used to assist hardware
recovery from TCP out-of-order (OOO) reception and to resume the
offload. Resynchronization operates as follows:
1. TCP OOO causes the NIC HW to stop the offload.
2. The NIC HW identifies a PDU header at some TCP sequence number
and asks NVMe-TCP to confirm it.
This request is delivered from the NIC driver to NVMe-TCP by first
finding the socket for the packet that triggered the request, and
then finding the nvme_tcp_queue that uses this socket.
Finally, the request is recorded in the nvme_tcp_queue.
3. When NVMe-TCP observes the requested TCP sequence, it compares it
with the PDU header TCP sequence and reports the result to the
NIC driver (tcp_ddp_resync), which updates the HW and resumes the
offload when all is successful (see the sketch below).
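
For illustration, here is a minimal sketch of this exchange (not part
of the patch; names mirror the driver code below, and it assumes
TCP_DDP_RESYNC_REQ is the low bit of the packed 64-bit resync_req value):

/* NIC driver -> NVMe-TCP: record the sequence number the HW asks about */
bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
{
	struct nvme_tcp_queue *queue = sk->sk_user_data;

	/* sequence number in the upper 32 bits, request flag in the lower */
	atomic64_set(&queue->resync_req, ((u64)seq << 32) | flags);
	return true;
}

/* NVMe-TCP -> NIC driver: answer once a PDU header is observed at or
 * after the requested sequence
 */
static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue, u32 pdu_seq)
{
	struct net_device *netdev = queue->ctrl->offloading_netdev;
	u64 resync_val = atomic64_read(&queue->resync_req);
	u32 resync_seq = resync_val >> 32;

	/* no request pending, or this PDU precedes the requested sequence */
	if (!(resync_val & TCP_DDP_RESYNC_REQ) || before(pdu_seq, resync_seq))
		return;

	/* clear the request flag (low bit) and tell the driver where a PDU
	 * header was actually seen, so it can re-arm the offload
	 */
	if (atomic64_cmpxchg(&queue->resync_req, resync_val, resync_val - 1))
		netdev->tcp_ddp_ops->tcp_ddp_resync(netdev, queue->sock->sk,
						    pdu_seq);
}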

Furthermore, we let the offloading driver advertise the maximum hw
sectors/segments it supports via tcp_ddp_limits.
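
For example (illustrative arithmetic only, assuming each DDP SGL element
maps a 4KB page, as the shift below implies): a 4KB element covers
SZ_4K / 512 = 8 sectors, so an advertised max_ddp_sgl_len of 64 yields
max_hw_sectors = 64 << 3 = 512, i.e. 256KB per command:

	/* sketch of the limits-to-block-layer mapping used in the patch */
	ctrl->max_segments   = limits->max_ddp_sgl_len;
	ctrl->max_hw_sectors = limits->max_ddp_sgl_len << (ilog2(SZ_4K) - 9);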

A follow-up patch introduces the data-path changes required for this
offload.

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Ben Ben-Ishay <benishay@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Yoray Zack <yorayz@mellanox.com>
---
 drivers/nvme/host/tcp.c  | 188 +++++++++++++++++++++++++++++++++++++++
 include/linux/nvme-tcp.h |   2 +
 2 files changed, 190 insertions(+)

Comments

Sagi Grimberg Oct. 8, 2020, 10:19 p.m. UTC | #1
On 9/30/20 9:20 AM, Boris Pismenny wrote:
> This commit introduces direct data placement offload to NVME
> TCP. There is a context per queue, which is established after the
> handshake
> using the tcp_ddp_sk_add/del NDOs.
> 
> Additionally, a resynchronization routine is used to assist
> hardware recovery from TCP OOO, and continue the offload.
> Resynchronization operates as follows:
> 1. TCP OOO causes the NIC HW to stop the offload
> 2. NIC HW identifies a PDU header at some TCP sequence number,
> and asks NVMe-TCP to confirm it.
> This request is delivered from the NIC driver to NVMe-TCP by first
> finding the socket for the packet that triggered the request, and
> then finding the nvme_tcp_queue that is used by this routine.
> Finally, the request is recorded in the nvme_tcp_queue.
> 3. When NVMe-TCP observes the requested TCP sequence, it will compare
> it with the PDU header TCP sequence, and report the result to the
> NIC driver (tcp_ddp_resync), which will update the HW,
> and resume offload when all is successful.
> 
> Furthermore, we let the offloading driver advertise what is the max hw
> sectors/segments via tcp_ddp_limits.
> 
> A follow-up patch introduces the data-path changes required for this
> offload.
> 
> Signed-off-by: Boris Pismenny <borisp@mellanox.com>
> Signed-off-by: Ben Ben-Ishay <benishay@mellanox.com>
> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
> Signed-off-by: Yoray Zack <yorayz@mellanox.com>
> ---
>   drivers/nvme/host/tcp.c  | 188 +++++++++++++++++++++++++++++++++++++++
>   include/linux/nvme-tcp.h |   2 +
>   2 files changed, 190 insertions(+)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 8f4f29f18b8c..06711ac095f2 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -62,6 +62,7 @@ enum nvme_tcp_queue_flags {
>   	NVME_TCP_Q_ALLOCATED	= 0,
>   	NVME_TCP_Q_LIVE		= 1,
>   	NVME_TCP_Q_POLLING	= 2,
> +	NVME_TCP_Q_OFFLOADS     = 3,
>   };
>   
>   enum nvme_tcp_recv_state {
> @@ -110,6 +111,8 @@ struct nvme_tcp_queue {
>   	void (*state_change)(struct sock *);
>   	void (*data_ready)(struct sock *);
>   	void (*write_space)(struct sock *);
> +
> +	atomic64_t  resync_req;
>   };
>   
>   struct nvme_tcp_ctrl {
> @@ -129,6 +132,8 @@ struct nvme_tcp_ctrl {
>   	struct delayed_work	connect_work;
>   	struct nvme_tcp_request async_req;
>   	u32			io_queues[HCTX_MAX_TYPES];
> +
> +	struct net_device       *offloading_netdev;
>   };
>   
>   static LIST_HEAD(nvme_tcp_ctrl_list);
> @@ -223,6 +228,159 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
>   	return nvme_tcp_pdu_data_left(req) <= len;
>   }
>   
> +#ifdef CONFIG_TCP_DDP
> +
> +bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
> +const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops __read_mostly = {
> +	.resync_request		= nvme_tcp_resync_request,
> +};
> +
> +static
> +int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
> +			    struct nvme_tcp_config *config)
> +{
> +	struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true);
> +	struct tcp_ddp_config *ddp_config = (struct tcp_ddp_config *)config;
> +	int ret;
> +
> +	if (unlikely(!netdev)) {

Let's remove unlikely from non-datapath routines, it's slightly
confusing.

> +		pr_info_ratelimited("%s: netdev not found\n", __func__);

dev_info_ratelimited with queue->ctrl->ctrl.device ?
Also, let's remove __func__. This usually is not very helpful.

> +		return -EINVAL;
> +	}
> +
> +	if (!(netdev->features & NETIF_F_HW_TCP_DDP)) {
> +		dev_put(netdev);
> +		return -EINVAL;

EINVAL or ENODEV?

> +	}
> +
> +	ret = netdev->tcp_ddp_ops->tcp_ddp_sk_add(netdev,
> +						 queue->sock->sk,
> +						 ddp_config);
> +	if (!ret)
> +		inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = &nvme_tcp_ddp_ulp_ops;
> +	else
> +		dev_put(netdev);
> +	return ret;
> +}
> +
> +static
> +void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
> +{
> +	struct net_device *netdev = queue->ctrl->offloading_netdev;
> +
> +	if (unlikely(!netdev)) {
> +		pr_info_ratelimited("%s: netdev not found\n", __func__);

Same here.

> +		return;
> +	}
> +
> +	netdev->tcp_ddp_ops->tcp_ddp_sk_del(netdev, queue->sock->sk);
> +
> +	inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = NULL;

Just a general question, why is this needed?

> +	dev_put(netdev); /* put the queue_init get_netdev_for_sock() */
> +}
> +
> +static
> +int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue,
> +			    struct tcp_ddp_limits *limits)
> +{
> +	struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true);
> +	int ret = 0;
> +
> +	if (unlikely(!netdev)) {
> +		pr_info_ratelimited("%s: netdev not found\n", __func__);
> +		return -EINVAL;

Same here

> +	}
> +
> +	if (netdev->features & NETIF_F_HW_TCP_DDP &&
> +	    netdev->tcp_ddp_ops &&
> +	    netdev->tcp_ddp_ops->tcp_ddp_limits)
> +			ret = netdev->tcp_ddp_ops->tcp_ddp_limits(netdev, limits);
> +	else
> +			ret = -EOPNOTSUPP;
> +
> +	if (!ret) {
> +		queue->ctrl->offloading_netdev = netdev;
> +		pr_info("%s netdev %s offload limits: max_ddp_sgl_len %d\n",
> +			__func__, netdev->name, limits->max_ddp_sgl_len);

dev_info, and given that it is per-queue, please make it dev_dbg.

> +		queue->ctrl->ctrl.max_segments = limits->max_ddp_sgl_len;
> +		queue->ctrl->ctrl.max_hw_sectors =
> +			limits->max_ddp_sgl_len << (ilog2(SZ_4K) - 9);
> +	} else {
> +		queue->ctrl->offloading_netdev = NULL;

Maybe nullify in the controller setup instead?

> +	}
> +
> +	dev_put(netdev);
> +
> +	return ret;
> +}
> +
> +static
> +void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
> +			      unsigned int pdu_seq)
> +{
> +	struct net_device *netdev = queue->ctrl->offloading_netdev;
> +	u64 resync_val;
> +	u32 resync_seq;
> +
> +	if (unlikely(!netdev)) {
> +		pr_info_ratelimited("%s: netdev not found\n", __func__);
> +		return;

What happens now, fallback to SW? Maybe dev_warn then..
will the SW keep seeing these responses after one failed?

> +	}
> +
> +	resync_val = atomic64_read(&queue->resync_req);
> +	if ((resync_val & TCP_DDP_RESYNC_REQ) == 0)
> +		return;
> +
> +	resync_seq = resync_val >> 32;
> +	if (before(pdu_seq, resync_seq))
> +		return;

I think it will be better to pass the skb to this func and keep the
pdu_seq contained locally.

> +
> +	if (atomic64_cmpxchg(&queue->resync_req, resync_val, (resync_val - 1)))
> +		netdev->tcp_ddp_ops->tcp_ddp_resync(netdev, queue->sock->sk, pdu_seq);

A small comment on this manipulation may help the reader.

> +}
> +
> +bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
> +{
> +	struct nvme_tcp_queue *queue = sk->sk_user_data;
> +
> +	atomic64_set(&queue->resync_req,
> +		     (((uint64_t)seq << 32) | flags));
> +
> +	return true;
> +}
> +
> +#else
> +
> +static
> +int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
> +			    struct nvme_tcp_config *config)
> +{
> +	return -EINVAL;
> +}
> +
> +static
> +void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
> +{}
> +
> +static
> +int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue,
> +			    struct tcp_ddp_limits *limits)
> +{
> +	return -EINVAL;
> +}
> +
> +static
> +void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
> +			      unsigned int pdu_seq)
> +{}
> +
> +bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
> +{
> +	return false;
> +}
> +
> +#endif
> +
>   static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
>   		unsigned int dir)
>   {
> @@ -628,6 +786,11 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>   	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>   	int ret;
>   
> +	u64 pdu_seq = TCP_SKB_CB(skb)->seq + *offset - queue->pdu_offset;
> +
> +	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags))
> +		nvme_tcp_resync_response(queue, pdu_seq);

Here, just pass in (queue, skb)

> +
>   	ret = skb_copy_bits(skb, *offset,
>   		&pdu[queue->pdu_offset], rcv_len);
>   	if (unlikely(ret))
> @@ -1370,6 +1533,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
>   {
>   	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
>   	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
> +	struct nvme_tcp_config config;
> +	struct tcp_ddp_limits limits;
>   	int ret, rcv_pdu_size;
>   
>   	queue->ctrl = ctrl;
> @@ -1487,6 +1652,26 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
>   #endif
>   	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
>   
> +	if (nvme_tcp_queue_id(queue) != 0) {

	if (!nvme_tcp_admin_queue(queue)) {

> +		config.cfg.type		= TCP_DDP_NVME;
> +		config.pfv		= NVME_TCP_PFV_1_0;
> +		config.cpda		= 0;
> +		config.dgst		= queue->hdr_digest ?
> +						NVME_TCP_HDR_DIGEST_ENABLE : 0;
> +		config.dgst		|= queue->data_digest ?
> +						NVME_TCP_DATA_DIGEST_ENABLE : 0;
> +		config.queue_size	= queue->queue_size;
> +		config.queue_id		= nvme_tcp_queue_id(queue);
> +		config.io_cpu		= queue->io_cpu;

Can the config initialization move to nvme_tcp_offload_socket?

> +
> +		ret = nvme_tcp_offload_socket(queue, &config);
> +		if (!ret)
> +			set_bit(NVME_TCP_Q_OFFLOADS, &queue->flags);
> +	} else {
> +		ret = nvme_tcp_offload_limits(queue, &limits);
> +	}

I'm thinking that instead of this conditional, we want to place the
limits query in nvme_tcp_alloc_admin_queue, and also move the socket
offload setup into the __nvme_tcp_alloc_io_queues loop.

> +	/* offload is opportunistic - failure is non-critical */

Then make it void...

> +
>   	return 0;
>   
>   err_init_connect:
> @@ -1519,6 +1704,9 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
>   	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
>   	nvme_tcp_restore_sock_calls(queue);
>   	cancel_work_sync(&queue->io_work);
> +
> +	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags))
> +		nvme_tcp_unoffload_socket(queue);

Why not in nvme_tcp_free_queue, symmetric to the alloc?

>   }
>   
>   static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h
> index 959e0bd9a913..65df64c34ecd 100644
> --- a/include/linux/nvme-tcp.h
> +++ b/include/linux/nvme-tcp.h
> @@ -8,6 +8,8 @@
>   #define _LINUX_NVME_TCP_H
>   
>   #include <linux/nvme.h>
> +#include <net/sock.h>
> +#include <net/tcp_ddp.h>

Why is this needed? I think we want to place this in tcp.c no?
Boris Pismenny Oct. 19, 2020, 6:28 p.m. UTC | #2
On 09/10/2020 1:19, Sagi Grimberg wrote:
> On 9/30/20 9:20 AM, Boris Pismenny wrote:
>> This commit introduces direct data placement offload to NVME
>> TCP. There is a context per queue, which is established after the
>> handshake
>> using the tcp_ddp_sk_add/del NDOs.
>>
>> Additionally, a resynchronization routine is used to assist
>> hardware recovery from TCP OOO, and continue the offload.
>> Resynchronization operates as follows:
>> 1. TCP OOO causes the NIC HW to stop the offload
>> 2. NIC HW identifies a PDU header at some TCP sequence number,
>> and asks NVMe-TCP to confirm it.
>> This request is delivered from the NIC driver to NVMe-TCP by first
>> finding the socket for the packet that triggered the request, and
>> then finding the nvme_tcp_queue that is used by this routine.
>> Finally, the request is recorded in the nvme_tcp_queue.
>> 3. When NVMe-TCP observes the requested TCP sequence, it will compare
>> it with the PDU header TCP sequence, and report the result to the
>> NIC driver (tcp_ddp_resync), which will update the HW,
>> and resume offload when all is successful.
>>
>> Furthermore, we let the offloading driver advertise what is the max hw
>> sectors/segments via tcp_ddp_limits.
>>
>> A follow-up patch introduces the data-path changes required for this
>> offload.
>>
>> Signed-off-by: Boris Pismenny <borisp@mellanox.com>
>> Signed-off-by: Ben Ben-Ishay <benishay@mellanox.com>
>> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
>> Signed-off-by: Yoray Zack <yorayz@mellanox.com>
>> ---
>>   drivers/nvme/host/tcp.c  | 188 +++++++++++++++++++++++++++++++++++++++
>>   include/linux/nvme-tcp.h |   2 +
>>   2 files changed, 190 insertions(+)
>>
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index 8f4f29f18b8c..06711ac095f2 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -62,6 +62,7 @@ enum nvme_tcp_queue_flags {
>>   	NVME_TCP_Q_ALLOCATED	= 0,
>>   	NVME_TCP_Q_LIVE		= 1,
>>   	NVME_TCP_Q_POLLING	= 2,
>> +	NVME_TCP_Q_OFFLOADS     = 3,
>>   };
>>   
>>   enum nvme_tcp_recv_state {
>> @@ -110,6 +111,8 @@ struct nvme_tcp_queue {
>>   	void (*state_change)(struct sock *);
>>   	void (*data_ready)(struct sock *);
>>   	void (*write_space)(struct sock *);
>> +
>> +	atomic64_t  resync_req;
>>   };
>>   
>>   struct nvme_tcp_ctrl {
>> @@ -129,6 +132,8 @@ struct nvme_tcp_ctrl {
>>   	struct delayed_work	connect_work;
>>   	struct nvme_tcp_request async_req;
>>   	u32			io_queues[HCTX_MAX_TYPES];
>> +
>> +	struct net_device       *offloading_netdev;
>>   };
>>   
>>   static LIST_HEAD(nvme_tcp_ctrl_list);
>> @@ -223,6 +228,159 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
>>   	return nvme_tcp_pdu_data_left(req) <= len;
>>   }
>>   
>> +#ifdef CONFIG_TCP_DDP
>> +
>> +bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
>> +const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops __read_mostly = {
>> +	.resync_request		= nvme_tcp_resync_request,
>> +};
>> +
>> +static
>> +int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
>> +			    struct nvme_tcp_config *config)
>> +{
>> +	struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true);
>> +	struct tcp_ddp_config *ddp_config = (struct tcp_ddp_config *)config;
>> +	int ret;
>> +
>> +	if (unlikely(!netdev)) {
> Let's remove unlikely from non-datapath routines, it's slightly
> confusing.
>
>> +		pr_info_ratelimited("%s: netdev not found\n", __func__);
> dev_info_ratelimited with queue->ctrl->ctrl.device ?
> Also, let's remove __func__. This usually is not very helpful.
>
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (!(netdev->features & NETIF_F_HW_TCP_DDP)) {
>> +		dev_put(netdev);
>> +		return -EINVAL;
> EINVAL or ENODEV?
ENODEV seems more appropriate, we'll use it. Thanks
>> +	}
>> +
>> +	ret = netdev->tcp_ddp_ops->tcp_ddp_sk_add(netdev,
>> +						 queue->sock->sk,
>> +						 ddp_config);
>> +	if (!ret)
>> +		inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = &nvme_tcp_ddp_ulp_ops;
>> +	else
>> +		dev_put(netdev);
>> +	return ret;
>> +}
>> +
>> +static
>> +void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
>> +{
>> +	struct net_device *netdev = queue->ctrl->offloading_netdev;
>> +
>> +	if (unlikely(!netdev)) {
>> +		pr_info_ratelimited("%s: netdev not found\n", __func__);
> Same here.
>
>> +		return;
>> +	}
>> +
>> +	netdev->tcp_ddp_ops->tcp_ddp_sk_del(netdev, queue->sock->sk);
>> +
>> +	inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = NULL;
> Just a general question, why is this needed?
This assignment is symmetric with the nvme_tcp_offload_socket assignment. The idea was to ensure that the functions established during offload cannot be used from this point on.

>> +	dev_put(netdev); /* put the queue_init get_netdev_for_sock() */
>> +}
>> +
>> +static
>> +int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue,
>> +			    struct tcp_ddp_limits *limits)
>> +{
>> +	struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true);
>> +	int ret = 0;
>> +
>> +	if (unlikely(!netdev)) {
>> +		pr_info_ratelimited("%s: netdev not found\n", __func__);
>> +		return -EINVAL;
> Same here
>
>> +	}
>> +
>> +	if (netdev->features & NETIF_F_HW_TCP_DDP &&
>> +	    netdev->tcp_ddp_ops &&
>> +	    netdev->tcp_ddp_ops->tcp_ddp_limits)
>> +			ret = netdev->tcp_ddp_ops->tcp_ddp_limits(netdev, limits);
>> +	else
>> +			ret = -EOPNOTSUPP;
>> +
>> +	if (!ret) {
>> +		queue->ctrl->offloading_netdev = netdev;
>> +		pr_info("%s netdev %s offload limits: max_ddp_sgl_len %d\n",
>> +			__func__, netdev->name, limits->max_ddp_sgl_len);
> dev_info, and given that it is per-queue, please make it dev_dbg.
>
>> +		queue->ctrl->ctrl.max_segments = limits->max_ddp_sgl_len;
>> +		queue->ctrl->ctrl.max_hw_sectors =
>> +			limits->max_ddp_sgl_len << (ilog2(SZ_4K) - 9);
>> +	} else {
>> +		queue->ctrl->offloading_netdev = NULL;
> Maybe nullify in the controller setup instead?
It is already set to zero after allocation (i.e., kzalloc). The goal here is to ensure it is zero in case there is a reset and it was non-zero because offload was used in the past.

>> +	}
>> +
>> +	dev_put(netdev);
>> +
>> +	return ret;
>> +}
>> +
>> +static
>> +void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
>> +			      unsigned int pdu_seq)
>> +{
>> +	struct net_device *netdev = queue->ctrl->offloading_netdev;
>> +	u64 resync_val;
>> +	u32 resync_seq;
>> +
>> +	if (unlikely(!netdev)) {
>> +		pr_info_ratelimited("%s: netdev not found\n", __func__);
>> +		return;
> What happens now, fallback to SW? Maybe dev_warn then..
> will the SW keep seeing these responses after one failed?
As long as there is no resync response, the system falls back to software, and this message is emitted. I'll move it below so that it is displayed only when relevant, not every time this function is called.

>> +	}
>> +
>> +	resync_val = atomic64_read(&queue->resync_req);
>> +	if ((resync_val & TCP_DDP_RESYNC_REQ) == 0)
>> +		return;
>> +
>> +	resync_seq = resync_val >> 32;
>> +	if (before(pdu_seq, resync_seq))
>> +		return;
> I think it will be better to pass the skb to this func and keep the
> pdu_seq contained locally.

This requires passing the offset to obtain the sequence:
u64 pdu_seq = TCP_SKB_CB(skb)->seq + *offset - queue->pdu_offset;

It makes the interface a bit ugly, IMO.

>> +
>> +	if (atomic64_cmpxchg(&queue->resync_req, resync_val, (resync_val - 1)))
>> +		netdev->tcp_ddp_ops->tcp_ddp_resync(netdev, queue->sock->sk, pdu_seq);
> A small comment on this manipulation may help the reader.
>
>> +}
>> +
>> +bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
>> +{
>> +	struct nvme_tcp_queue *queue = sk->sk_user_data;
>> +
>> +	atomic64_set(&queue->resync_req,
>> +		     (((uint64_t)seq << 32) | flags));
>> +
>> +	return true;
>> +}
>> +
>> +#else
>> +
>> +static
>> +int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
>> +			    struct nvme_tcp_config *config)
>> +{
>> +	return -EINVAL;
>> +}
>> +
>> +static
>> +void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
>> +{}
>> +
>> +static
>> +int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue,
>> +			    struct tcp_ddp_limits *limits)
>> +{
>> +	return -EINVAL;
>> +}
>> +
>> +static
>> +void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
>> +			      unsigned int pdu_seq)
>> +{}
>> +
>> +bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
>> +{
>> +	return false;
>> +}
>> +
>> +#endif
>> +
>>   static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
>>   		unsigned int dir)
>>   {
>> @@ -628,6 +786,11 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>>   	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>>   	int ret;
>>   
>> +	u64 pdu_seq = TCP_SKB_CB(skb)->seq + *offset - queue->pdu_offset;
>> +
>> +	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags))
>> +		nvme_tcp_resync_response(queue, pdu_seq);
> Here, just pass in (queue, skb)

Do you mean (queue, skb, offset)?
We need the offset for the pdu_seq calculation above.

>> +
>>   	ret = skb_copy_bits(skb, *offset,
>>   		&pdu[queue->pdu_offset], rcv_len);
>>   	if (unlikely(ret))
>> @@ -1370,6 +1533,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
>>   {
>>   	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
>>   	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
>> +	struct nvme_tcp_config config;
>> +	struct tcp_ddp_limits limits;
>>   	int ret, rcv_pdu_size;
>>   
>>   	queue->ctrl = ctrl;
>> @@ -1487,6 +1652,26 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
>>   #endif
>>   	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
>>   
>> +	if (nvme_tcp_queue_id(queue) != 0) {
> 	if (!nvme_tcp_admin_queue(queue)) {
>
>> +		config.cfg.type		= TCP_DDP_NVME;
>> +		config.pfv		= NVME_TCP_PFV_1_0;
>> +		config.cpda		= 0;
>> +		config.dgst		= queue->hdr_digest ?
>> +						NVME_TCP_HDR_DIGEST_ENABLE : 0;
>> +		config.dgst		|= queue->data_digest ?
>> +						NVME_TCP_DATA_DIGEST_ENABLE : 0;
>> +		config.queue_size	= queue->queue_size;
>> +		config.queue_id		= nvme_tcp_queue_id(queue);
>> +		config.io_cpu		= queue->io_cpu;
> Can the config initialization move to nvme_tcp_offload_socket?

Definitely. The original idea of placing it here was that the nvme-tcp handshake may influence the parameters, but at this point I realize it is orthogonal.
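
Something like this, perhaps (untested sketch, only to illustrate the move; the rest of the function body would stay as in this patch):

static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue)
{
	/* build the DDP config from the queue state instead of passing it in */
	struct nvme_tcp_config config = {
		.cfg.type	= TCP_DDP_NVME,
		.pfv		= NVME_TCP_PFV_1_0,
		.cpda		= 0,
		.dgst		= (queue->hdr_digest ? NVME_TCP_HDR_DIGEST_ENABLE : 0) |
				  (queue->data_digest ? NVME_TCP_DATA_DIGEST_ENABLE : 0),
		.queue_size	= queue->queue_size,
		.queue_id	= nvme_tcp_queue_id(queue),
		.io_cpu		= queue->io_cpu,
	};

	/* ... get_netdev_for_sock(), NETIF_F_HW_TCP_DDP check, tcp_ddp_sk_add()
	 * and the icsk_ulp_ddp_ops assignment unchanged from this patch ...
	 */
	return 0;
}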

>> +
>> +		ret = nvme_tcp_offload_socket(queue, &config);
>> +		if (!ret)
>> +			set_bit(NVME_TCP_Q_OFFLOADS, &queue->flags);
>> +	} else {
>> +		ret = nvme_tcp_offload_limits(queue, &limits);
>> +	}
> I'm thinking that instead of this conditional, we want to place the
> limits query in nvme_tcp_alloc_admin_queue, and also move the socket
> offload setup into the __nvme_tcp_alloc_io_queues loop.

We can do it only after the socket is connected, so that we can obtain the 5-tuple and the appropriate netdev, and preferably after the protocol handshake, as there is nothing to offload during the handshake itself.

>> +	/* offload is opportunistic - failure is non-critical */
> Then make it void...
>
>> +
>>   	return 0;
>>   
>>   err_init_connect:
>> @@ -1519,6 +1704,9 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
>>   	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
>>   	nvme_tcp_restore_sock_calls(queue);
>>   	cancel_work_sync(&queue->io_work);
>> +
>> +	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags))
>> +		nvme_tcp_unoffload_socket(queue);
> Why not in nvme_tcp_free_queue, symmetric to the alloc?

Sure. I've tried to keep it close to the socket disconnect, which isn't symmetrical for some reason; does that suggest these are interchangeable?!

>>   }
>>   
>>   static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
>> diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h
>> index 959e0bd9a913..65df64c34ecd 100644
>> --- a/include/linux/nvme-tcp.h
>> +++ b/include/linux/nvme-tcp.h
>> @@ -8,6 +8,8 @@
>>   #define _LINUX_NVME_TCP_H
>>   
>>   #include <linux/nvme.h>
>> +#include <net/sock.h>
>> +#include <net/tcp_ddp.h>
> Why is this needed? I think we want to place this in tcp.c no?
Not needed. It's probably a leftover from previous iterations of the code. Removed for the next iteration of the patchset.
Shai Malin Nov. 8, 2020, 6:51 a.m. UTC | #3
On 09/10/2020 1:19, Sagi Grimberg wrote:
> On 9/30/20 9:20 AM, Boris Pismenny wrote:
> > This commit introduces direct data placement offload to NVME TCP.
> > There is a context per queue, which is established after the 
> > handshake using the tcp_ddp_sk_add/del NDOs.
> >
> > Additionally, a resynchronization routine is used to assist hardware 
> > recovery from TCP OOO, and continue the offload.
> > Resynchronization operates as follows:
> > 1. TCP OOO causes the NIC HW to stop the offload 2. NIC HW 
> > identifies a PDU header at some TCP sequence number, and asks 
> > NVMe-TCP to
> confirm
> > it.
> > This request is delivered from the NIC driver to NVMe-TCP by first 
> > finding the socket for the packet that triggered the request, and 
> > then finding the nvme_tcp_queue that is used by this routine.
> > Finally, the request is recorded in the nvme_tcp_queue.
> > 3. When NVMe-TCP observes the requested TCP sequence, it will 
> > compare it with the PDU header TCP sequence, and report the result 
> > to the NIC driver (tcp_ddp_resync), which will update the HW, and 
> > resume offload when all is successful.
> >
> > Furthermore, we let the offloading driver advertise what is the max 
> > hw sectors/segments via tcp_ddp_limits.
> >
> > A follow-up patch introduces the data-path changes required for this 
> > offload.
> >
> > Signed-off-by: Boris Pismenny <borisp@mellanox.com>
> > Signed-off-by: Ben Ben-Ishay <benishay@mellanox.com>
> > Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
> > Signed-off-by: Yoray Zack <yorayz@mellanox.com>
> > ---
> >   drivers/nvme/host/tcp.c  | 188
> +++++++++++++++++++++++++++++++++++++++
> >   include/linux/nvme-tcp.h |   2 +
> >   2 files changed, 190 insertions(+)
> >
> > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index
> > 8f4f29f18b8c..06711ac095f2 100644
> > --- a/drivers/nvme/host/tcp.c
> > +++ b/drivers/nvme/host/tcp.c
> > @@ -62,6 +62,7 @@ enum nvme_tcp_queue_flags {
> >   	NVME_TCP_Q_ALLOCATED	= 0,
> >   	NVME_TCP_Q_LIVE		= 1,
> >   	NVME_TCP_Q_POLLING	= 2,
> > +	NVME_TCP_Q_OFFLOADS     = 3,

Sagi - following our discussion and your suggestions regarding the NVMeTCP Offload ULP module that we are working on at Marvell, in which a TCP_OFFLOAD transport type would be added: we are concerned that using the generic term "offload" both for the transport type (the Marvell work) and for the DDP and CRC offload queue (the Mellanox work) may be misleading and confusing to developers and users. Perhaps the naming should be "direct data placement", e.g. NVME_TCP_Q_DDP or NVME_TCP_Q_DIRECT?
Also, no need to quote the entire patch. Just a few lines above your response like I did here.
Sagi Grimberg Nov. 9, 2020, 11:23 p.m. UTC | #4
>>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index
>>> 8f4f29f18b8c..06711ac095f2 100644
>>> --- a/drivers/nvme/host/tcp.c
>>> +++ b/drivers/nvme/host/tcp.c
>>> @@ -62,6 +62,7 @@ enum nvme_tcp_queue_flags {
>>>    	NVME_TCP_Q_ALLOCATED	= 0,
>>>    	NVME_TCP_Q_LIVE		= 1,
>>>    	NVME_TCP_Q_POLLING	= 2,
>>> +	NVME_TCP_Q_OFFLOADS     = 3,
> 
> Sagi - following our discussion and your suggestions regarding the NVMeTCP Offload ULP module that we are working on at Marvell in which a TCP_OFFLOAD transport type would be added,

We still need to see how this pans out.. it's hard to predict if this is
the best approach before seeing the code. I'd suggest to share some code
so others can share their input.

> we are concerned that perhaps the generic term "offload" for both the transport type (for the Marvell work) and for the DDP and CRC offload queue (for the Mellanox work) may be misleading and confusing to developers and to users. Perhaps the naming should be "direct data placement", e.g. NVME_TCP_Q_DDP or NVME_TCP_Q_DIRECT?

We can call this NVME_TCP_Q_DDP, no issues with that.
Shai Malin Nov. 11, 2020, 5:12 a.m. UTC | #5
On 11/10/2020 1:24 AM, Sagi Grimberg wrote: 

> >>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index
> >>> 8f4f29f18b8c..06711ac095f2 100644
> >>> --- a/drivers/nvme/host/tcp.c
> >>> +++ b/drivers/nvme/host/tcp.c
> >>> @@ -62,6 +62,7 @@ enum nvme_tcp_queue_flags {
> >>>    	NVME_TCP_Q_ALLOCATED	= 0,
> >>>    	NVME_TCP_Q_LIVE		= 1,
> >>>    	NVME_TCP_Q_POLLING	= 2,
> >>> +	NVME_TCP_Q_OFFLOADS     = 3,
> >
> > Sagi - following our discussion and your suggestions regarding the
> > NVMeTCP Offload ULP module that we are working on at Marvell in which
> > a TCP_OFFLOAD transport type would be added,
> 
> We still need to see how this pans out.. it's hard to predict if this is the best
> approach before seeing the code. I'd suggest to share some code so others
> can share their input.
> 

We plan to do this soon.

> > we are concerned that perhaps the generic term "offload" for both the
> transport type (for the Marvell work) and for the DDP and CRC offload queue
> (for the Mellanox work) may be misleading and confusing to developers and
> to users. Perhaps the naming should be "direct data placement", e.g.
> NVME_TCP_Q_DDP or NVME_TCP_Q_DIRECT?
> 
> We can call this NVME_TCP_Q_DDP, no issues with that.
> 

Great. Thanks.
diff mbox series

Patch

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 8f4f29f18b8c..06711ac095f2 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -62,6 +62,7 @@  enum nvme_tcp_queue_flags {
 	NVME_TCP_Q_ALLOCATED	= 0,
 	NVME_TCP_Q_LIVE		= 1,
 	NVME_TCP_Q_POLLING	= 2,
+	NVME_TCP_Q_OFFLOADS     = 3,
 };
 
 enum nvme_tcp_recv_state {
@@ -110,6 +111,8 @@  struct nvme_tcp_queue {
 	void (*state_change)(struct sock *);
 	void (*data_ready)(struct sock *);
 	void (*write_space)(struct sock *);
+
+	atomic64_t  resync_req;
 };
 
 struct nvme_tcp_ctrl {
@@ -129,6 +132,8 @@  struct nvme_tcp_ctrl {
 	struct delayed_work	connect_work;
 	struct nvme_tcp_request async_req;
 	u32			io_queues[HCTX_MAX_TYPES];
+
+	struct net_device       *offloading_netdev;
 };
 
 static LIST_HEAD(nvme_tcp_ctrl_list);
@@ -223,6 +228,159 @@  static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
 	return nvme_tcp_pdu_data_left(req) <= len;
 }
 
+#ifdef CONFIG_TCP_DDP
+
+bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
+const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops __read_mostly = {
+	.resync_request		= nvme_tcp_resync_request,
+};
+
+static
+int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
+			    struct nvme_tcp_config *config)
+{
+	struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true);
+	struct tcp_ddp_config *ddp_config = (struct tcp_ddp_config *)config;
+	int ret;
+
+	if (unlikely(!netdev)) {
+		pr_info_ratelimited("%s: netdev not found\n", __func__);
+		return -EINVAL;
+	}
+
+	if (!(netdev->features & NETIF_F_HW_TCP_DDP)) {
+		dev_put(netdev);
+		return -EINVAL;
+	}
+
+	ret = netdev->tcp_ddp_ops->tcp_ddp_sk_add(netdev,
+						 queue->sock->sk,
+						 ddp_config);
+	if (!ret)
+		inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = &nvme_tcp_ddp_ulp_ops;
+	else
+		dev_put(netdev);
+	return ret;
+}
+
+static
+void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
+{
+	struct net_device *netdev = queue->ctrl->offloading_netdev;
+
+	if (unlikely(!netdev)) {
+		pr_info_ratelimited("%s: netdev not found\n", __func__);
+		return;
+	}
+
+	netdev->tcp_ddp_ops->tcp_ddp_sk_del(netdev, queue->sock->sk);
+
+	inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = NULL;
+	dev_put(netdev); /* put the queue_init get_netdev_for_sock() */
+}
+
+static
+int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue,
+			    struct tcp_ddp_limits *limits)
+{
+	struct net_device *netdev = get_netdev_for_sock(queue->sock->sk, true);
+	int ret = 0;
+
+	if (unlikely(!netdev)) {
+		pr_info_ratelimited("%s: netdev not found\n", __func__);
+		return -EINVAL;
+	}
+
+	if (netdev->features & NETIF_F_HW_TCP_DDP &&
+	    netdev->tcp_ddp_ops &&
+	    netdev->tcp_ddp_ops->tcp_ddp_limits)
+			ret = netdev->tcp_ddp_ops->tcp_ddp_limits(netdev, limits);
+	else
+			ret = -EOPNOTSUPP;
+
+	if (!ret) {
+		queue->ctrl->offloading_netdev = netdev;
+		pr_info("%s netdev %s offload limits: max_ddp_sgl_len %d\n",
+			__func__, netdev->name, limits->max_ddp_sgl_len);
+		queue->ctrl->ctrl.max_segments = limits->max_ddp_sgl_len;
+		queue->ctrl->ctrl.max_hw_sectors =
+			limits->max_ddp_sgl_len << (ilog2(SZ_4K) - 9);
+	} else {
+		queue->ctrl->offloading_netdev = NULL;
+	}
+
+	dev_put(netdev);
+
+	return ret;
+}
+
+static
+void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
+			      unsigned int pdu_seq)
+{
+	struct net_device *netdev = queue->ctrl->offloading_netdev;
+	u64 resync_val;
+	u32 resync_seq;
+
+	if (unlikely(!netdev)) {
+		pr_info_ratelimited("%s: netdev not found\n", __func__);
+		return;
+	}
+
+	resync_val = atomic64_read(&queue->resync_req);
+	if ((resync_val & TCP_DDP_RESYNC_REQ) == 0)
+		return;
+
+	resync_seq = resync_val >> 32;
+	if (before(pdu_seq, resync_seq))
+		return;
+
+	if (atomic64_cmpxchg(&queue->resync_req, resync_val, (resync_val - 1)))
+		netdev->tcp_ddp_ops->tcp_ddp_resync(netdev, queue->sock->sk, pdu_seq);
+}
+
+bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
+{
+	struct nvme_tcp_queue *queue = sk->sk_user_data;
+
+	atomic64_set(&queue->resync_req,
+		     (((uint64_t)seq << 32) | flags));
+
+	return true;
+}
+
+#else
+
+static
+int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
+			    struct nvme_tcp_config *config)
+{
+	return -EINVAL;
+}
+
+static
+void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
+{}
+
+static
+int nvme_tcp_offload_limits(struct nvme_tcp_queue *queue,
+			    struct tcp_ddp_limits *limits)
+{
+	return -EINVAL;
+}
+
+static
+void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
+			      unsigned int pdu_seq)
+{}
+
+bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
+{
+	return false;
+}
+
+#endif
+
 static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
 		unsigned int dir)
 {
@@ -628,6 +786,11 @@  static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
 	int ret;
 
+	u64 pdu_seq = TCP_SKB_CB(skb)->seq + *offset - queue->pdu_offset;
+
+	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags))
+		nvme_tcp_resync_response(queue, pdu_seq);
+
 	ret = skb_copy_bits(skb, *offset,
 		&pdu[queue->pdu_offset], rcv_len);
 	if (unlikely(ret))
@@ -1370,6 +1533,8 @@  static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 {
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
 	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+	struct nvme_tcp_config config;
+	struct tcp_ddp_limits limits;
 	int ret, rcv_pdu_size;
 
 	queue->ctrl = ctrl;
@@ -1487,6 +1652,26 @@  static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 #endif
 	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
 
+	if (nvme_tcp_queue_id(queue) != 0) {
+		config.cfg.type		= TCP_DDP_NVME;
+		config.pfv		= NVME_TCP_PFV_1_0;
+		config.cpda		= 0;
+		config.dgst		= queue->hdr_digest ?
+						NVME_TCP_HDR_DIGEST_ENABLE : 0;
+		config.dgst		|= queue->data_digest ?
+						NVME_TCP_DATA_DIGEST_ENABLE : 0;
+		config.queue_size	= queue->queue_size;
+		config.queue_id		= nvme_tcp_queue_id(queue);
+		config.io_cpu		= queue->io_cpu;
+
+		ret = nvme_tcp_offload_socket(queue, &config);
+		if (!ret)
+			set_bit(NVME_TCP_Q_OFFLOADS, &queue->flags);
+	} else {
+		ret = nvme_tcp_offload_limits(queue, &limits);
+	}
+	/* offload is opportunistic - failure is non-critical */
+
 	return 0;
 
 err_init_connect:
@@ -1519,6 +1704,9 @@  static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
 	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
 	nvme_tcp_restore_sock_calls(queue);
 	cancel_work_sync(&queue->io_work);
+
+	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags))
+		nvme_tcp_unoffload_socket(queue);
 }
 
 static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h
index 959e0bd9a913..65df64c34ecd 100644
--- a/include/linux/nvme-tcp.h
+++ b/include/linux/nvme-tcp.h
@@ -8,6 +8,8 @@ 
 #define _LINUX_NVME_TCP_H
 
 #include <linux/nvme.h>
+#include <net/sock.h>
+#include <net/tcp_ddp.h>
 
 #define NVME_TCP_DISC_PORT	8009
 #define NVME_TCP_ADMIN_CCSZ	SZ_8K