Message ID | 20200930162010.21610-7-borisp@mellanox.com |
---|---|
State | RFC |
Delegated to: | David Miller |
Series | nvme-tcp receive offloads |
>  bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
> +void nvme_tcp_ddp_teardown_done(void *ddp_ctx);
>  const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops __read_mostly = {
> +
>  	.resync_request		= nvme_tcp_resync_request,
> +	.ddp_teardown_done	= nvme_tcp_ddp_teardown_done,
>  };
>
> +static
> +int nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue,
> +			  uint16_t command_id,
> +			  struct request *rq)
> +{
> +	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> +	struct net_device *netdev = queue->ctrl->offloading_netdev;
> +	int ret;
> +
> +	if (unlikely(!netdev)) {
> +		pr_info_ratelimited("%s: netdev not found\n", __func__);

dev_info_ratelimited

> +		return -EINVAL;
> +	}
> +
> +	ret = netdev->tcp_ddp_ops->tcp_ddp_teardown(netdev, queue->sock->sk,
> +						    &req->ddp, rq);
> +	sg_free_table_chained(&req->ddp.sg_table, SG_CHUNK_SIZE);
> +	req->offloaded = false;
> +	return ret;
> +}
> +
> +void nvme_tcp_ddp_teardown_done(void *ddp_ctx)
> +{
> +	struct request *rq = ddp_ctx;
> +	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> +
> +	if (!nvme_try_complete_req(rq, cpu_to_le16(req->status << 1), req->result))
> +		nvme_complete_rq(rq);
> +}
> +
> +static
> +int nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
> +		       uint16_t command_id,
> +		       struct request *rq)
> +{
> +	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> +	struct net_device *netdev = queue->ctrl->offloading_netdev;
> +	int ret;
> +
> +	req->offloaded = false;
> +
> +	if (unlikely(!netdev)) {
> +		pr_info_ratelimited("%s: netdev not found\n", __func__);
> +		return -EINVAL;
> +	}
> +
> +	req->ddp.command_id = command_id;
> +	req->ddp.sg_table.sgl = req->ddp.first_sgl;
> +	ret = sg_alloc_table_chained(&req->ddp.sg_table,
> +		blk_rq_nr_phys_segments(rq), req->ddp.sg_table.sgl,
> +		SG_CHUNK_SIZE);
> +	if (ret)
> +		return -ENOMEM;

newline here

> +	req->ddp.nents = blk_rq_map_sg(rq->q, rq, req->ddp.sg_table.sgl);
> +
> +	ret = netdev->tcp_ddp_ops->tcp_ddp_setup(netdev,
> +						 queue->sock->sk,
> +						 &req->ddp);
> +	if (!ret)
> +		req->offloaded = true;
> +	return ret;
> +}
> +
>  static
>  int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
>  			    struct nvme_tcp_config *config)
> @@ -351,6 +422,25 @@ bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
>
>  #else
>
> +static
> +int nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
> +		       uint16_t command_id,
> +		       struct request *rq)
> +{
> +	return -EINVAL;
> +}
> +
> +static
> +int nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue,
> +			  uint16_t command_id,
> +			  struct request *rq)
> +{
> +	return -EINVAL;
> +}
> +
> +void nvme_tcp_ddp_teardown_done(void *ddp_ctx)
> +{}
> +
>  static
>  int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
>  			    struct nvme_tcp_config *config)
> @@ -630,6 +720,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
>  static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>  		struct nvme_completion *cqe)
>  {
> +	struct nvme_tcp_request *req;
>  	struct request *rq;
>
>  	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
> @@ -641,8 +732,15 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>  		return -EINVAL;
>  	}
>
> -	if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
> -		nvme_complete_rq(rq);
> +	req = blk_mq_rq_to_pdu(rq);
> +	if (req->offloaded) {
> +		req->status = cqe->status;
> +		req->result = cqe->result;
> +		nvme_tcp_teardown_ddp(queue, cqe->command_id, rq);
> +	} else {
> +		if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
> +			nvme_complete_rq(rq);
> +	}
>  	queue->nr_cqe++;
>
>  	return 0;
> @@ -836,9 +934,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>  static inline void nvme_tcp_end_request(struct request *rq, u16 status)
>  {
>  	union nvme_result res = {};
> +	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> +	struct nvme_tcp_queue *queue = req->queue;
> +	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
>
> -	if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
> -		nvme_complete_rq(rq);
> +	if (req->offloaded) {
> +		req->status = cpu_to_le16(status << 1);
> +		req->result = res;
> +		nvme_tcp_teardown_ddp(queue, pdu->command_id, rq);
> +	} else {
> +		if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
> +			nvme_complete_rq(rq);
> +	}
>  }
>
>  static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
> @@ -1115,6 +1222,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>  	bool inline_data = nvme_tcp_has_inline_data(req);
>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
>  	int len = sizeof(*pdu) + hdgst - req->offset;
> +	struct request *rq = blk_mq_rq_from_pdu(req);
>  	int flags = MSG_DONTWAIT;
>  	int ret;
>
> @@ -1123,6 +1231,10 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>  	else
>  		flags |= MSG_EOR;
>
> +	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) &&
> +	    blk_rq_nr_phys_segments(rq) && rq_data_dir(rq) == READ)
> +		nvme_tcp_setup_ddp(queue, pdu->cmd.common.command_id, rq);

I'd assume that this is something we want to setup in
nvme_tcp_setup_cmd_pdu. Why do it here?
>>  static
>>  int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
>>  			    struct nvme_tcp_config *config)
>> @@ -630,6 +720,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
>>  static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>>  		struct nvme_completion *cqe)
>>  {
>> +	struct nvme_tcp_request *req;
>>  	struct request *rq;
>>
>>  	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
>> @@ -641,8 +732,15 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>>  		return -EINVAL;
>>  	}
>>
>> -	if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
>> -		nvme_complete_rq(rq);
>> +	req = blk_mq_rq_to_pdu(rq);
>> +	if (req->offloaded) {
>> +		req->status = cqe->status;
>> +		req->result = cqe->result;
>> +		nvme_tcp_teardown_ddp(queue, cqe->command_id, rq);
>> +	} else {
>> +		if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
>> +			nvme_complete_rq(rq);
>> +	}

Oh, forgot to ask:

We have places in the driver where we may complete (cancel) one or more
requests from the error recovery or timeout flow. We first prevent future
incoming RX on the socket so that we can safely cancel requests. This may
break with the deferred completion in ddp_teardown_done.

If I have a request that is waiting for ddp_teardown_done, do I have a way
to tell the HW to never call ddp_teardown_done on a specific socket?

If so, the place to do it is in nvme_tcp_stop_queue.
On 09/10/2020 1:29, Sagi Grimberg wrote:
>
>>  static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>> @@ -1115,6 +1222,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>>  	bool inline_data = nvme_tcp_has_inline_data(req);
>>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
>>  	int len = sizeof(*pdu) + hdgst - req->offset;
>> +	struct request *rq = blk_mq_rq_from_pdu(req);
>>  	int flags = MSG_DONTWAIT;
>>  	int ret;
>>
>> @@ -1123,6 +1231,10 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>>  	else
>>  		flags |= MSG_EOR;
>>
>> +	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) &&
>> +	    blk_rq_nr_phys_segments(rq) && rq_data_dir(rq) == READ)
>> +		nvme_tcp_setup_ddp(queue, pdu->cmd.common.command_id, rq);
> I'd assume that this is something we want to setup in
> nvme_tcp_setup_cmd_pdu. Why do it here?

Our goal in placing it here is to keep both setup and teardown in the same
thread. This enables drivers to avoid locking for per-queue operations.
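To make the same-thread argument concrete, here is a minimal sketch, assuming the DDP hooks from this patch. It is a deliberately simplified view of the driver's existing per-queue nvme_tcp_io_work() (the real handler loops under a time budget, requeues itself, and sends may also be attempted from queue_rq context on the same queue):

```c
/*
 * Simplified sketch, not part of the patch: both DDP call sites are driven
 * from the same per-queue io context, so req->offloaded and req->ddp are not
 * accessed concurrently and need no lock.
 */
static void nvme_tcp_io_work(struct work_struct *w)
{
	struct nvme_tcp_queue *queue =
		container_of(w, struct nvme_tcp_queue, io_work);

	/* TX path: nvme_tcp_try_send_cmd_pdu() -> nvme_tcp_setup_ddp() */
	nvme_tcp_try_send(queue);

	/* RX path: nvme_tcp_process_nvme_cqe() -> nvme_tcp_teardown_ddp() */
	nvme_tcp_try_recv(queue);
}
```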
On 09/10/2020 2:00, Sagi Grimberg wrote:
>>>  static
>>>  int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
>>>  			    struct nvme_tcp_config *config)
>>> @@ -630,6 +720,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
>>>  static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>>>  		struct nvme_completion *cqe)
>>>  {
>>> +	struct nvme_tcp_request *req;
>>>  	struct request *rq;
>>>
>>>  	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
>>> @@ -641,8 +732,15 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
>>>  		return -EINVAL;
>>>  	}
>>>
>>> -	if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
>>> -		nvme_complete_rq(rq);
>>> +	req = blk_mq_rq_to_pdu(rq);
>>> +	if (req->offloaded) {
>>> +		req->status = cqe->status;
>>> +		req->result = cqe->result;
>>> +		nvme_tcp_teardown_ddp(queue, cqe->command_id, rq);
>>> +	} else {
>>> +		if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
>>> +			nvme_complete_rq(rq);
>>> +	}
> Oh, forgot to ask:
>
> We have places in the driver where we may complete (cancel) one or more
> requests from the error recovery or timeout flow. We first prevent future
> incoming RX on the socket so that we can safely cancel requests. This may
> break with the deferred completion in ddp_teardown_done.
>
> If I have a request that is waiting for ddp_teardown_done, do I have a way
> to tell the HW to never call ddp_teardown_done on a specific socket?
>
> If so, the place to do it is in nvme_tcp_stop_queue.

Interesting, and indeed it is a problem that we haven't considered.
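A rough sketch of the direction Sagi suggests, under the assumption that the device exposes some way to fence teardown completions. The tcp_ddp_quiesce() op below is hypothetical and not part of this series; nvme_tcp_stop_queue(), __nvme_tcp_stop_queue() and NVME_TCP_Q_LIVE are from the upstream driver, while NVME_TCP_Q_OFFLOADS and offloading_netdev come from earlier patches in the series:

```c
/*
 * Hypothetical sketch only: tcp_ddp_quiesce() does not exist in this series.
 * It stands in for "tell the HW to stop delivering ddp_teardown_done for this
 * socket", called from nvme_tcp_stop_queue() before error recovery or the
 * timeout handler may cancel requests.
 */
static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
	struct net_device *netdev = queue->ctrl->offloading_netdev;

	/* hypothetical fence for outstanding DDP teardown completions */
	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) && netdev)
		netdev->tcp_ddp_ops->tcp_ddp_quiesce(netdev, queue->sock->sk);

	if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
		return;
	__nvme_tcp_stop_queue(queue);
}
```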
>>>  static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>>> @@ -1115,6 +1222,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>>>  	bool inline_data = nvme_tcp_has_inline_data(req);
>>>  	u8 hdgst = nvme_tcp_hdgst_len(queue);
>>>  	int len = sizeof(*pdu) + hdgst - req->offset;
>>> +	struct request *rq = blk_mq_rq_from_pdu(req);
>>>  	int flags = MSG_DONTWAIT;
>>>  	int ret;
>>>
>>> @@ -1123,6 +1231,10 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>>>  	else
>>>  		flags |= MSG_EOR;
>>>
>>> +	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) &&
>>> +	    blk_rq_nr_phys_segments(rq) && rq_data_dir(rq) == READ)
>>> +		nvme_tcp_setup_ddp(queue, pdu->cmd.common.command_id, rq);
>> I'd assume that this is something we want to setup in
>> nvme_tcp_setup_cmd_pdu. Why do it here?
> Our goal in placing it here is to keep both setup and teardown in the same
> thread. This enables drivers to avoid locking for per-queue operations.

I also think that it is cleaner to do this when setting up the PDU. Do note
that if queues map 1:1 to CPU cores, any synchronization is pretty
lightweight, and if not, we have other synchronization anyway...
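For comparison, a sketch of what the suggested placement might look like: the guard condition and nvme_tcp_setup_ddp() call are lifted from this patch into the upstream nvme_tcp_setup_cmd_pdu(); the rest of that function's body is elided and the snippet is illustrative only, not part of the posted series:

```c
/*
 * Sketch only: hook the DDP mapping into nvme_tcp_setup_cmd_pdu() (the
 * per-command setup path, called from queue_rq) instead of into
 * nvme_tcp_try_send_cmd_pdu(). Existing header/SGL setup is elided.
 */
static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
		struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_queue *queue = req->queue;
	struct nvme_tcp_cmd_pdu *pdu = req->pdu;

	/* ... existing command, data_len and inline-data setup ... */

	req->offloaded = false;
	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) &&
	    blk_rq_nr_phys_segments(rq) && rq_data_dir(rq) == READ)
		nvme_tcp_setup_ddp(queue, pdu->cmd.common.command_id, rq);

	return BLK_STS_OK;
}
```

The trade-off the thread is weighing is that this path runs from queue_rq context rather than from the queue's io context, which is why the patch as posted keeps setup next to the send and relies on same-thread setup/teardown instead of locking.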
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 06711ac095f2..7bd97f856677 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -56,6 +56,11 @@ struct nvme_tcp_request {
 	size_t			offset;
 	size_t			data_sent;
 	enum nvme_tcp_send_state state;
+
+	bool			offloaded;
+	struct tcp_ddp_io	ddp;
+	__le16			status;
+	union nvme_result	result;
 };
 
 enum nvme_tcp_queue_flags {
@@ -231,10 +236,76 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
 #ifdef CONFIG_TCP_DDP
 
 bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags);
+void nvme_tcp_ddp_teardown_done(void *ddp_ctx);
 const struct tcp_ddp_ulp_ops nvme_tcp_ddp_ulp_ops __read_mostly = {
+
 	.resync_request		= nvme_tcp_resync_request,
+	.ddp_teardown_done	= nvme_tcp_ddp_teardown_done,
 };
 
+static
+int nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue,
+			  uint16_t command_id,
+			  struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct net_device *netdev = queue->ctrl->offloading_netdev;
+	int ret;
+
+	if (unlikely(!netdev)) {
+		pr_info_ratelimited("%s: netdev not found\n", __func__);
+		return -EINVAL;
+	}
+
+	ret = netdev->tcp_ddp_ops->tcp_ddp_teardown(netdev, queue->sock->sk,
+						    &req->ddp, rq);
+	sg_free_table_chained(&req->ddp.sg_table, SG_CHUNK_SIZE);
+	req->offloaded = false;
+	return ret;
+}
+
+void nvme_tcp_ddp_teardown_done(void *ddp_ctx)
+{
+	struct request *rq = ddp_ctx;
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+	if (!nvme_try_complete_req(rq, cpu_to_le16(req->status << 1), req->result))
+		nvme_complete_rq(rq);
+}
+
+static
+int nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
+		       uint16_t command_id,
+		       struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct net_device *netdev = queue->ctrl->offloading_netdev;
+	int ret;
+
+	req->offloaded = false;
+
+	if (unlikely(!netdev)) {
+		pr_info_ratelimited("%s: netdev not found\n", __func__);
+		return -EINVAL;
+	}
+
+	req->ddp.command_id = command_id;
+	req->ddp.sg_table.sgl = req->ddp.first_sgl;
+	ret = sg_alloc_table_chained(&req->ddp.sg_table,
+		blk_rq_nr_phys_segments(rq), req->ddp.sg_table.sgl,
+		SG_CHUNK_SIZE);
+	if (ret)
+		return -ENOMEM;
+	req->ddp.nents = blk_rq_map_sg(rq->q, rq, req->ddp.sg_table.sgl);
+
+	ret = netdev->tcp_ddp_ops->tcp_ddp_setup(netdev,
+						 queue->sock->sk,
+						 &req->ddp);
+	if (!ret)
+		req->offloaded = true;
+	return ret;
+}
+
 static
 int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
 			    struct nvme_tcp_config *config)
@@ -351,6 +422,25 @@ bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
 
 #else
 
+static
+int nvme_tcp_setup_ddp(struct nvme_tcp_queue *queue,
+		       uint16_t command_id,
+		       struct request *rq)
+{
+	return -EINVAL;
+}
+
+static
+int nvme_tcp_teardown_ddp(struct nvme_tcp_queue *queue,
+			  uint16_t command_id,
+			  struct request *rq)
+{
+	return -EINVAL;
+}
+
+void nvme_tcp_ddp_teardown_done(void *ddp_ctx)
+{}
+
 static
 int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue,
 			    struct nvme_tcp_config *config)
@@ -630,6 +720,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 		struct nvme_completion *cqe)
 {
+	struct nvme_tcp_request *req;
 	struct request *rq;
 
 	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
@@ -641,8 +732,15 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 		return -EINVAL;
 	}
 
-	if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
-		nvme_complete_rq(rq);
+	req = blk_mq_rq_to_pdu(rq);
+	if (req->offloaded) {
+		req->status = cqe->status;
+		req->result = cqe->result;
+		nvme_tcp_teardown_ddp(queue, cqe->command_id, rq);
+	} else {
+		if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
+			nvme_complete_rq(rq);
+	}
 	queue->nr_cqe++;
 
 	return 0;
@@ -836,9 +934,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 {
 	union nvme_result res = {};
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_queue *queue = req->queue;
+	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 
-	if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
-		nvme_complete_rq(rq);
+	if (req->offloaded) {
+		req->status = cpu_to_le16(status << 1);
+		req->result = res;
+		nvme_tcp_teardown_ddp(queue, pdu->command_id, rq);
+	} else {
+		if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
+			nvme_complete_rq(rq);
+	}
 }
 
 static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
@@ -1115,6 +1222,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 	bool inline_data = nvme_tcp_has_inline_data(req);
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) + hdgst - req->offset;
+	struct request *rq = blk_mq_rq_from_pdu(req);
 	int flags = MSG_DONTWAIT;
 	int ret;
 
@@ -1123,6 +1231,10 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 	else
 		flags |= MSG_EOR;
 
+	if (test_bit(NVME_TCP_Q_OFFLOADS, &queue->flags) &&
+	    blk_rq_nr_phys_segments(rq) && rq_data_dir(rq) == READ)
+		nvme_tcp_setup_ddp(queue, pdu->cmd.common.command_id, rq);
+
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
@@ -2448,6 +2560,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
 	req->data_len = blk_rq_nr_phys_segments(rq) ?
 				blk_rq_payload_bytes(rq) : 0;
 	req->curr_bio = rq->bio;
+	req->offloaded = false;
 
 	if (rq_data_dir(rq) == WRITE &&
 	    req->data_len <= nvme_tcp_inline_data_size(queue))