
[net-next,RFC,v1,06/10] nvme-tcp: Add DDP data-path

Message ID: 20200930162010.21610-7-borisp@mellanox.com
State: RFC
Delegated to: David Miller
Series: nvme-tcp receive offloads

Commit Message

Boris Pismenny Sept. 30, 2020, 4:20 p.m. UTC
Introduce the NVMe-TCP DDP data-path offload.
Using this interface, the NIC hardware scatters the TCP payload directly
into the BIO pages, according to the command_id in the PDU.
To maintain the correctness of the network stack, the NIC driver is
expected to construct SKBs that point to these BIO pages.

The data-path interface contains two routines: tcp_ddp_setup/teardown.
The setup provides the mapping from command_id to the request buffers,
while the teardown removes this mapping.

For efficiency, we introduce an asynchronous NVMe completion that is
split between NVMe-TCP and the NIC driver as follows:
NVMe-TCP performs the NVMe-specific completion, while the NIC driver
performs the generic blk-mq completion.
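
A condensed view of the resulting read-path flow (function names as
introduced by the patch below; error handling omitted):

    nvme_tcp_try_send_cmd_pdu()
      nvme_tcp_setup_ddp()          /* map command_id to the request's BIO pages in HW */

    nvme_tcp_process_nvme_cqe() / nvme_tcp_end_request()
      req->status = ...; req->result = ...;
      nvme_tcp_teardown_ddp()       /* ask HW to remove the mapping; completes asynchronously */

    nvme_tcp_ddp_teardown_done()    /* invoked by the NIC driver once the mapping is gone */
      nvme_try_complete_req() / nvme_complete_rq()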

Signed-off-by: Boris Pismenny <borisp@mellanox.com>
Signed-off-by: Ben Ben-Ishay <benishay@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Yoray Zack <yorayz@mellanox.com>
---
 drivers/nvme/host/tcp.c | 121 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 117 insertions(+), 4 deletions(-)

Comments

Sagi Grimberg Oct. 8, 2020, 10:29 p.m. UTC | #1
>   bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
> +void nvme_tcp_ddp_teardown_done(void *ddp_ctx);
>   const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops __read_mostly = {
> +
>   	.resync_request		= nvme_tcp_resync_request,
> +	.ddp_teardown_done	= nvme_tcp_ddp_teardown_done,
>   };
>   
> +static
> +int nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue,
> +			  uint16_t command_id,
> +			  struct request *rq)
> +{
> +	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> +	struct net_device *netdev = queue->ctrl->offloading_netdev;
> +	int ret;
> +
> +	if (unlikely(!netdev)) {
> +		pr_info_ratelimited("%s: netdev not found\n", __func__);

dev_info_ratelimited
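
One way to apply this suggestion is sketched below; it assumes the
controller device is reachable as queue->ctrl->ctrl.device, the pattern
used for other messages in this driver:

	if (unlikely(!netdev)) {
		dev_info_ratelimited(queue->ctrl->ctrl.device,
				     "%s: netdev not found\n", __func__);
		return -EINVAL;
	}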

> +		return -EINVAL;
> +	}
> +
> +	ret = netdev->tcp_ddp_ops->tcp_ddp_teardown(netdev, queue->sock->sk,
> +						    &req->ddp, rq);
> +	sg_free_table_chained(&req->ddp.sg_table, SG_CHUNK_SIZE);
> +	req->offloaded = false;
> +	return ret;
> +}
> +
> +void nvme_tcp_ddp_teardown_done(void *ddp_ctx)
> +{
> +	struct request *rq = ddp_ctx;
> +	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> +
> +	if (!nvme_try_complete_req(rq, cpu_to_le16(req->status << 1), req->result))
> +		nvme_complete_rq(rq);
> +}
> +
> +static
> +int nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
> +		       uint16_t command_id,
> +		       struct request *rq)
> +{
> +	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> +	struct net_device *netdev = queue->ctrl->offloading_netdev;
> +	int ret;
> +
> +	req->offloaded = false;
> +
> +	if (unlikely(!netdev)) {
> +		pr_info_ratelimited("%s: netdev not found\n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	req->ddp.command_id = command_id;
> +	req->ddp.sg_table.sgl = req->ddp.first_sgl;
> +	ret = sg_alloc_table_chained(&req->ddp.sg_table,
> +		blk_rq_nr_phys_segments(rq), req->ddp.sg_table.sgl,
> +		SG_CHUNK_SIZE);
> +	if (ret)
> +		return -ENOMEM;

newline here

> +	req->ddp.nents = blk_rq_map_sg(rq->q, rq, req->ddp.sg_table.sgl);
> +
> +	ret = netdev->tcp_ddp_ops->tcp_ddp_setup(netdev,
> +						 queue->sock->sk,
> +						 &req->ddp);
> +	if (!ret)
> +		req->offloaded = true;
> +	return ret;
> +}
> +
>   static
>   int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
>   			    struct nvme_tcp_config *config)
> @@ -351,6 +422,25 @@ bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
>   
>   #else
>   
> +static
> +int nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
> +		       uint16_t command_id,
> +		       struct request *rq)
> +{
> +	return -EINVAL;
> +}
> +
> +static
> +int nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue,
> +			  uint16_t command_id,
> +			  struct request *rq)
> +{
> +	return -EINVAL;
> +}
> +
> +void nvme_tcp_ddp_teardown_done(void *ddp_ctx)
> +{}
> +
>   static
>   int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
>   			    struct nvme_tcp_config *config)
> @@ -630,6 +720,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
>   static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>   		struct nvme_completion *cqe)
>   {
> +	struct nvme_tcp_request *req;
>   	struct request *rq;
>   
>   	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
> @@ -641,8 +732,15 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>   		return -EINVAL;
>   	}
>   
> -	if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
> -		nvme_complete_rq(rq);
> +	req = blk_mq_rq_to_pdu(rq);
> +	if (req->offloaded) {
> +		req->status = cqe->status;
> +		req->result = cqe->result;
> +		nvme_tcp_teardown_ddp(queue, cqe->command_id, rq);
> +	} else {
> +		if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
> +			nvme_complete_rq(rq);
> +	}
>   	queue->nr_cqe++;
>   
>   	return 0;
> @@ -836,9 +934,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>   static inline void nvme_tcp_end_request(struct request *rq, u16 status)
>   {
>   	union nvme_result res = {};
> +	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> +	struct nvme_tcp_queue *queue = req->queue;
> +	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
>   
> -	if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
> -		nvme_complete_rq(rq);
> +	if (req->offloaded) {
> +		req->status = cpu_to_le16(status << 1);
> +		req->result = res;
> +		nvme_tcp_teardown_ddp(queue, pdu->command_id, rq);
> +	} else {
> +		if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
> +			nvme_complete_rq(rq);
> +	}
>   }
>   
>   static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
> @@ -1115,6 +1222,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>   	bool inline_data = nvme_tcp_has_inline_data(req);
>   	u8 hdgst = nvme_tcp_hdgst_len(queue);
>   	int len = sizeof(*pdu) + hdgst - req->offset;
> +	struct request *rq = blk_mq_rq_from_pdu(req);
>   	int flags = MSG_DONTWAIT;
>   	int ret;
>   
> @@ -1123,6 +1231,10 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>   	else
>   		flags |= MSG_EOR;
>   
> +	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) &&
> +	    blk_rq_nr_phys_segments(rq) && rq_data_dir(rq) == READ)
> +		nvme_tcp_setup_ddp(queue, pdu->cmd.common.command_id, rq);

I'd assume that this is something we want to setup in
nvme_tcp_setup_cmd_pdu. Why do it here?
Sagi Grimberg Oct. 8, 2020, 11 p.m. UTC | #2
>>   static
>>   int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
>>                   struct nvme_tcp_config *config)
>> @@ -630,6 +720,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
>>   static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>>           struct nvme_completion *cqe)
>>   {
>> +    struct nvme_tcp_request *req;
>>       struct request *rq;
>>       rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
>> @@ -641,8 +732,15 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>>           return -EINVAL;
>>       }
>> -    if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
>> -        nvme_complete_rq(rq);
>> +    req = blk_mq_rq_to_pdu(rq);
>> +    if (req->offloaded) {
>> +        req->status = cqe->status;
>> +        req->result = cqe->result;
>> +        nvme_tcp_teardown_ddp(queue, cqe->command_id, rq);
>> +    } else {
>> +        if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
>> +            nvme_complete_rq(rq);
>> +    }

Oh forgot to ask,

We have places in the driver that we may complete (cancel) one
or more requests from the error recovery or timeout flow. We
first prevent future incoming RX on the socket such that we
can safely cancel requests. This may break with the deferred
completion in ddp_teardown_done.

If I have a request that is waiting for ddp_teardown_done, do
I have a way to tell the HW to never call ddp_teardown_done
on a specific socket?

If so, the place to do it is in nvme_tcp_stop_queue.
Boris Pismenny Nov. 8, 2020, 9:44 a.m. UTC | #3
On 09/10/2020 1:29, Sagi Grimberg wrote:
>
>>   static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>> @@ -1115,6 +1222,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>>   	bool inline_data = nvme_tcp_has_inline_data(req);
>>   	u8 hdgst = nvme_tcp_hdgst_len(queue);
>>   	int len = sizeof(*pdu) + hdgst - req->offset;
>> +	struct request *rq = blk_mq_rq_from_pdu(req);
>>   	int flags = MSG_DONTWAIT;
>>   	int ret;
>>   
>> @@ -1123,6 +1231,10 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>>   	else
>>   		flags |= MSG_EOR;
>>   
>> +	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) &&
>> +	    blk_rq_nr_phys_segments(rq) && rq_data_dir(rq) == READ)
>> +		nvme_tcp_setup_ddp(queue, pdu->cmd.common.command_id, rq);
> I'd assume that this is something we want to setup in
> nvme_tcp_setup_cmd_pdu. Why do it here?
Our goal in placing it here is to keep both setup and teardown in the same thread.
This enables drivers to avoid locking for per-queue operations.
Boris Pismenny Nov. 8, 2020, 1:59 p.m. UTC | #4
On 09/10/2020 2:00, Sagi Grimberg wrote:
>>>   static
>>>   int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
>>>                   struct nvme_tcp_config *config)
>>> @@ -630,6 +720,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
>>>   static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>>>           struct nvme_completion *cqe)
>>>   {
>>> +    struct nvme_tcp_request *req;
>>>       struct request *rq;
>>>       rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
>>> @@ -641,8 +732,15 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>>>           return -EINVAL;
>>>       }
>>> -    if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
>>> -        nvme_complete_rq(rq);
>>> +    req = blk_mq_rq_to_pdu(rq);
>>> +    if (req->offloaded) {
>>> +        req->status = cqe->status;
>>> +        req->result = cqe->result;
>>> +        nvme_tcp_teardown_ddp(queue, cqe->command_id, rq);
>>> +    } else {
>>> +        if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
>>> +            nvme_complete_rq(rq);
>>> +    }
> Oh forgot to ask,
>
> We have places in the driver that we may complete (cancel) one
> or more requests from the error recovery or timeout flow. We
> first prevent future incoming RX on the socket such that we
> can safely cancel requests. This may break with the deferred
> completion in ddp_teardown_done.
>
> If I have a request that is waiting for ddp_teardown_done, do
> I have a way to tell the HW to never call ddp_teardown_done
> on a specific socket?
>
> If so, the place to do it is in nvme_tcp_stop_queue.
Interesting, and indeed it is a problem that we haven't considered.
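
To make the gap concrete, a purely hypothetical sketch of such a hook at
the top of nvme_tcp_stop_queue(), before the socket is shut down and
requests are cancelled; the tcp_ddp_quiesce op does not exist in this
series and is named here only for illustration:

	struct net_device *netdev = queue->ctrl->offloading_netdev;

	/* Hypothetical hook: ask the HW to drop any pending DDP contexts on
	 * this socket so no further ddp_teardown_done callbacks can arrive
	 * for requests that are about to be cancelled.
	 */
	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) && netdev)
		netdev->tcp_ddp_ops->tcp_ddp_quiesce(netdev, queue->sock->sk);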
Sagi Grimberg Nov. 9, 2020, 11:18 p.m. UTC | #5
>>>    static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>>> @@ -1115,6 +1222,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>>>    	bool inline_data = nvme_tcp_has_inline_data(req);
>>>    	u8 hdgst = nvme_tcp_hdgst_len(queue);
>>>    	int len = sizeof(*pdu) + hdgst - req->offset;
>>> +	struct request *rq = blk_mq_rq_from_pdu(req);
>>>    	int flags = MSG_DONTWAIT;
>>>    	int ret;
>>>    
>>> @@ -1123,6 +1231,10 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>>>    	else
>>>    		flags |= MSG_EOR;
>>>    
>>> +	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) &&
>>> +	    blk_rq_nr_phys_segments(rq) && rq_data_dir(rq) == READ)
>>> +		nvme_tcp_setup_ddp(queue, pdu->cmd.common.command_id, rq);
>> I'd assume that this is something we want to setup in
>> nvme_tcp_setup_cmd_pdu. Why do it here?
> Our goal in placing it here is to keep both setup and teardown in the same thread.
> This enables drivers to avoid locking for per-queue operations.

I also think that it is cleaner when setting up the PDU. Do note that if
queues match 1:1 with CPU cores, then any synchronization is pretty
lightweight, and if not, we have other synchronization anyway...
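
For reference, a sketch of that placement, moving the call into the hunk
this patch already adds to nvme_tcp_setup_cmd_pdu(), after nvme_setup_cmd()
has filled in the command_id (whether the per-queue locking concern above
permits this is still open):

	req->data_len = blk_rq_nr_phys_segments(rq) ?
				blk_rq_payload_bytes(rq) : 0;
	req->curr_bio = rq->bio;
	req->offloaded = false;

	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) &&
	    blk_rq_nr_phys_segments(rq) && rq_data_dir(rq) == READ)
		nvme_tcp_setup_ddp(queue, pdu->cmd.common.command_id, rq);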

Patch

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 06711ac095f2..7bd97f856677 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -56,6 +56,11 @@  struct nvme_tcp_request {
 	size_t			offset;
 	size_t			data_sent;
 	enum nvme_tcp_send_state state;
+
+	bool			offloaded;
+	struct tcp_ddp_io	ddp;
+	__le16			status;
+	union nvme_result	result;
 };
 
 enum nvme_tcp_queue_flags {
@@ -231,10 +236,76 @@  static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
 #ifdef CONFIG_TCP_DDP
 
 bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
+void nvme_tcp_ddp_teardown_done(void *ddp_ctx);
 const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops __read_mostly = {
+
 	.resync_request		= nvme_tcp_resync_request,
+	.ddp_teardown_done	= nvme_tcp_ddp_teardown_done,
 };
 
+static
+int nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue,
+			  uint16_t command_id,
+			  struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct net_device *netdev = queue->ctrl->offloading_netdev;
+	int ret;
+
+	if (unlikely(!netdev)) {
+		pr_info_ratelimited("%s: netdev not found\n", __func__);
+		return -EINVAL;
+	}
+
+	ret = netdev->tcp_ddp_ops->tcp_ddp_teardown(netdev, queue->sock->sk,
+						    &req->ddp, rq);
+	sg_free_table_chained(&req->ddp.sg_table, SG_CHUNK_SIZE);
+	req->offloaded = false;
+	return ret;
+}
+
+void nvme_tcp_ddp_teardown_done(void *ddp_ctx)
+{
+	struct request *rq = ddp_ctx;
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+	if (!nvme_try_complete_req(rq, cpu_to_le16(req->status << 1), req->result))
+		nvme_complete_rq(rq);
+}
+
+static
+int nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
+		       uint16_t command_id,
+		       struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct net_device *netdev = queue->ctrl->offloading_netdev;
+	int ret;
+
+	req->offloaded = false;
+
+	if (unlikely(!netdev)) {
+		pr_info_ratelimited("%s: netdev not found\n", __func__);
+		return -EINVAL;
+	}
+
+	req->ddp.command_id = command_id;
+	req->ddp.sg_table.sgl = req->ddp.first_sgl;
+	ret = sg_alloc_table_chained(&req->ddp.sg_table,
+		blk_rq_nr_phys_segments(rq), req->ddp.sg_table.sgl,
+		SG_CHUNK_SIZE);
+	if (ret)
+		return -ENOMEM;
+	req->ddp.nents = blk_rq_map_sg(rq->q, rq, req->ddp.sg_table.sgl);
+
+	ret = netdev->tcp_ddp_ops->tcp_ddp_setup(netdev,
+						 queue->sock->sk,
+						 &req->ddp);
+	if (!ret)
+		req->offloaded = true;
+	return ret;
+}
+
 static
 int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
 			    struct nvme_tcp_config *config)
@@ -351,6 +422,25 @@  bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
 
 #else
 
+static
+int nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
+		       uint16_t command_id,
+		       struct request *rq)
+{
+	return -EINVAL;
+}
+
+static
+int nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue,
+			  uint16_t command_id,
+			  struct request *rq)
+{
+	return -EINVAL;
+}
+
+void nvme_tcp_ddp_teardown_done(void *ddp_ctx)
+{}
+
 static
 int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
 			    struct nvme_tcp_config *config)
@@ -630,6 +720,7 @@  static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 		struct nvme_completion *cqe)
 {
+	struct nvme_tcp_request *req;
 	struct request *rq;
 
 	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
@@ -641,8 +732,15 @@  static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 		return -EINVAL;
 	}
 
-	if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
-		nvme_complete_rq(rq);
+	req = blk_mq_rq_to_pdu(rq);
+	if (req->offloaded) {
+		req->status = cqe->status;
+		req->result = cqe->result;
+		nvme_tcp_teardown_ddp(queue, cqe->command_id, rq);
+	} else {
+		if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
+			nvme_complete_rq(rq);
+	}
 	queue->nr_cqe++;
 
 	return 0;
@@ -836,9 +934,18 @@  static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 {
 	union nvme_result res = {};
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_queue *queue = req->queue;
+	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 
-	if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
-		nvme_complete_rq(rq);
+	if (req->offloaded) {
+		req->status = cpu_to_le16(status << 1);
+		req->result = res;
+		nvme_tcp_teardown_ddp(queue, pdu->command_id, rq);
+	} else {
+		if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
+			nvme_complete_rq(rq);
+	}
 }
 
 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
@@ -1115,6 +1222,7 @@  static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 	bool inline_data = nvme_tcp_has_inline_data(req);
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) + hdgst - req->offset;
+	struct request *rq = blk_mq_rq_from_pdu(req);
 	int flags = MSG_DONTWAIT;
 	int ret;
 
@@ -1123,6 +1231,10 @@  static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 	else
 		flags |= MSG_EOR;
 
+	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) &&
+	    blk_rq_nr_phys_segments(rq) && rq_data_dir(rq) == READ)
+		nvme_tcp_setup_ddp(queue, pdu->cmd.common.command_id, rq);
+
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 
@@ -2448,6 +2560,7 @@  static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
 	req->data_len = blk_rq_nr_phys_segments(rq) ?
 				blk_rq_payload_bytes(rq) : 0;
 	req->curr_bio = rq->bio;
+	req->offloaded = false;
 
 	if (rq_data_dir(rq) == WRITE &&
 	    req->data_len <= nvme_tcp_inline_data_size(queue))