diff mbox series

[v2,bpf-next,04/13] tcp: Introduce inet_csk_reqsk_queue_migrate().

Message ID 20201207132456.65472-5-kuniyu@amazon.co.jp
State Superseded
Headers show
Series Socket migration for SO_REUSEPORT. | expand

Commit Message

Kuniyuki Iwashima Dec. 7, 2020, 1:24 p.m. UTC
This patch defines a new function to migrate ESTABLISHED/SYN_RECV sockets.

Listening sockets hold incoming connections as a linked list of struct
request_sock in the accept queue, and each request has reference to its
full socket and listener. In inet_csk_reqsk_queue_migrate(), we only unlink
the requests from the closing listener's queue and relink them to the head
of the new listener's queue. We do not process each request and its
reference to the listener, so the migration completes in O(1) time
complexity.

Moreover, if TFO requests caused RST before 3WHS has completed, they are
held in the listener's TFO queue to prevent DDoS attack. Thus, we also
migrate the requests in the TFO queue in the same way.

After 3WHS has completed, there are three access patterns to incoming
sockets:

  (1) access to the full socket instead of request_sock
  (2) access to request_sock from access queue
  (3) access to request_sock from TFO queue

In the first case, the full socket does not have a reference to its request
socket and listener, so we do not need the correct listener set in the
request socket. In the second case, we always have the correct listener and
currently do not use req->rsk_listener. However, in the third case of
TCP_SYN_RECV sockets, we take special care in the next commit.

Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
---
 include/net/inet_connection_sock.h |  1 +
 net/ipv4/inet_connection_sock.c    | 68 ++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)
diff mbox series

Patch

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 7338b3865a2a..2ea2d743f8fc 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -260,6 +260,7 @@  struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
 struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
 				      struct request_sock *req,
 				      struct sock *child);
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk);
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 				   unsigned long timeout);
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1451aa9712b0..5da38a756e4c 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -992,6 +992,74 @@  struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 
+void inet_csk_reqsk_queue_migrate(struct sock *sk, struct sock *nsk)
+{
+	struct request_sock_queue *old_accept_queue, *new_accept_queue;
+	struct fastopen_queue *old_fastopenq, *new_fastopenq;
+	spinlock_t *l1, *l2, *l3, *l4;
+
+	old_accept_queue = &inet_csk(sk)->icsk_accept_queue;
+	new_accept_queue = &inet_csk(nsk)->icsk_accept_queue;
+	old_fastopenq = &old_accept_queue->fastopenq;
+	new_fastopenq = &new_accept_queue->fastopenq;
+
+	l1 = &old_accept_queue->rskq_lock;
+	l2 = &new_accept_queue->rskq_lock;
+	l3 = &old_fastopenq->lock;
+	l4 = &new_fastopenq->lock;
+
+	/* sk is never selected as the new listener from reuse->socks[],
+	 * so inversion deadlock does not happen here,
+	 * but change the order to avoid the warning of lockdep.
+	 */
+	if (sk < nsk) {
+		swap(l1, l2);
+		swap(l3, l4);
+	}
+
+	spin_lock(l1);
+	spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
+
+	if (old_accept_queue->rskq_accept_head) {
+		if (new_accept_queue->rskq_accept_head)
+			old_accept_queue->rskq_accept_tail->dl_next =
+				new_accept_queue->rskq_accept_head;
+		else
+			new_accept_queue->rskq_accept_tail = old_accept_queue->rskq_accept_tail;
+
+		new_accept_queue->rskq_accept_head = old_accept_queue->rskq_accept_head;
+		old_accept_queue->rskq_accept_head = NULL;
+		old_accept_queue->rskq_accept_tail = NULL;
+
+		WRITE_ONCE(nsk->sk_ack_backlog, nsk->sk_ack_backlog + sk->sk_ack_backlog);
+		WRITE_ONCE(sk->sk_ack_backlog, 0);
+	}
+
+	spin_unlock(l2);
+	spin_unlock(l1);
+
+	spin_lock_bh(l3);
+	spin_lock_bh_nested(l4, SINGLE_DEPTH_NESTING);
+
+	new_fastopenq->qlen += old_fastopenq->qlen;
+	old_fastopenq->qlen = 0;
+
+	if (old_fastopenq->rskq_rst_head) {
+		if (new_fastopenq->rskq_rst_head)
+			old_fastopenq->rskq_rst_tail->dl_next = new_fastopenq->rskq_rst_head;
+		else
+			old_fastopenq->rskq_rst_tail = new_fastopenq->rskq_rst_tail;
+
+		new_fastopenq->rskq_rst_head = old_fastopenq->rskq_rst_head;
+		old_fastopenq->rskq_rst_head = NULL;
+		old_fastopenq->rskq_rst_tail = NULL;
+	}
+
+	spin_unlock_bh(l4);
+	spin_unlock_bh(l3);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_migrate);
+
 struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
 					 struct request_sock *req, bool own_req)
 {