
[bpf,v2,6/6] bpf: sockmap/tls, close can race with map free

Message ID: 156261331866.31108.6405316261950259075.stgit@ubuntu3-kvm1
State: Changes Requested
Delegated to: BPF Maintainers
Series: [bpf,v2,1/6] tls: remove close callback sock unlock/lock and flush_sync

Commit Message

John Fastabend July 8, 2019, 7:15 p.m. UTC
When a map free is called and in parallel a socket is closed, we
have two paths that can potentially reset the socket prot ops: the
bpf close() path and the map free path. This creates the problem
of which prot ops should be used from the socket close side.

If the map_free side completes first then we want to call the
original lowest level ops. However, if the tls path runs first
we want to call the sockmap ops. Additionally, there was no locking
around prot updates in the TLS code paths, so the prot ops could
be changed multiple times, once from the TLS path and again from the
sockmap side, potentially leaving the ops pointing at either TLS or
sockmap when the psock and/or tls context had already been destroyed.

To fix this race, first only update the ops inside the callback lock
so that TLS, sockmap and the lowest level all agree on the prot state.
Second, add a ULP callback update() so that lower layers can
inform the upper layer when they are being removed, allowing the
upper layer to reset its prot ops.

This gets us close to allowing sockmap and tls to be stacked
in arbitrary order, but we will save that patch for the *-next trees.

Reported-by: syzbot+06537213db7ba2745c4a@syzkaller.appspotmail.com
Fixes: 02c558b2d5d6 ("bpf: sockmap, support for msg_peek in sk_msg with redirect ingress")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/linux/skmsg.h |    8 +++++++-
 include/net/tcp.h     |    3 +++
 net/core/skmsg.c      |    4 ++--
 net/ipv4/tcp_ulp.c    |   13 +++++++++++++
 net/tls/tls_main.c    |   35 +++++++++++++++++++++++++++++------
 5 files changed, 54 insertions(+), 9 deletions(-)
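
For orientation, here is a condensed C sketch of the flow the patch introduces,
stitched together from the hunks below; the names (tcp_update_ulp, tls_update,
ctx->sk_proto) are taken from the diff, not invented. Sockmap hands the restored
proto to the new tcp_update_ulp() dispatcher instead of writing sk->sk_prot
directly, and the TLS update() callback absorbs it into its saved pointers.

/* Dispatcher added in net/ipv4/tcp_ulp.c: if a ULP is attached, let it
 * decide how to absorb the proto handed back by sockmap; otherwise fall
 * back to a direct assignment.
 */
void tcp_update_ulp(struct sock *sk, struct proto *proto)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (!icsk->icsk_ulp_ops) {
		sk->sk_prot = proto;
		return;
	}

	if (icsk->icsk_ulp_ops->update)
		icsk->icsk_ulp_ops->update(sk, proto);
}

/* TLS side of the contract in net/tls/tls_main.c: keep pointing at the
 * proto below us rather than letting sockmap clobber sk->sk_prot while
 * the TLS context is still live.
 */
static void tls_update(struct sock *sk, struct proto *p)
{
	struct tls_context *ctx = tls_get_ctx(sk);

	if (likely(ctx)) {
		ctx->sk_proto_close = p->close;
		ctx->unhash = p->unhash;
		ctx->sk_proto = p;
	} else {
		sk->sk_prot = p;
	}
}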

Comments

Jakub Kicinski July 10, 2019, 2:36 a.m. UTC | #1
On Mon, 08 Jul 2019 19:15:18 +0000, John Fastabend wrote:
> @@ -836,22 +841,39 @@ static int tls_init(struct sock *sk)

There is a goto out above this which has to be turned into a
return 0, since out now releases the lock.

>  	if (sk->sk_state != TCP_ESTABLISHED)
>  		return -ENOTSUPP;
>  
> +	tls_build_proto(sk);
> +
>  	/* allocate tls context */
> +	write_lock_bh(&sk->sk_callback_lock);
>  	ctx = create_ctx(sk);
>  	if (!ctx) {
>  		rc = -ENOMEM;
>  		goto out;
>  	}
>  
> -	tls_build_proto(sk);
>  	ctx->tx_conf = TLS_BASE;
>  	ctx->rx_conf = TLS_BASE;
>  	ctx->sk_proto = sk->sk_prot;
>  	update_sk_prot(sk, ctx);
>  out:
> +	write_unlock_bh(&sk->sk_callback_lock);
>  	return rc;
>  }
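
A minimal sketch of the adjustment being asked for, assuming the earlier jump is
the tls_hw_prot() fast path at the top of tls_init() in this version of
net/tls/tls_main.c (that check is not shown in the hunk above): any goto out
taken before the lock is acquired must become a plain return, because out: now
drops sk_callback_lock.

static int tls_init(struct sock *sk)
{
	struct tls_context *ctx;
	int rc = 0;

	/* Early exit path: with out: now doing write_unlock_bh(), jumping
	 * there before taking the lock would unlock a lock never taken.
	 */
	if (tls_hw_prot(sk))
		return 0;

	/* The TLS ulp is currently supported only for TCP sockets
	 * in ESTABLISHED state.
	 */
	if (sk->sk_state != TCP_ESTABLISHED)
		return -ENOTSUPP;

	tls_build_proto(sk);

	/* allocate tls context under the callback lock */
	write_lock_bh(&sk->sk_callback_lock);
	ctx = create_ctx(sk);
	if (!ctx) {
		rc = -ENOMEM;
		goto out;
	}

	ctx->tx_conf = TLS_BASE;
	ctx->rx_conf = TLS_BASE;
	ctx->sk_proto = sk->sk_prot;
	update_sk_prot(sk, ctx);
out:
	write_unlock_bh(&sk->sk_callback_lock);
	return rc;
}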
Jakub Kicinski July 10, 2019, 2:38 a.m. UTC | #2
On Mon, 08 Jul 2019 19:15:18 +0000, John Fastabend wrote:
> @@ -352,15 +354,18 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
>  	if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)
>  		goto skip_tx_cleanup;
>  
> -	sk->sk_prot = ctx->sk_proto;
>  	tls_sk_proto_cleanup(sk, ctx, timeo);
>  
>  skip_tx_cleanup:
> +	write_lock_bh(&sk->sk_callback_lock);
> +	icsk->icsk_ulp_data = NULL;

Is ulp_data pointer now supposed to be updated under the
sk_callback_lock?

> +	if (sk->sk_prot->close == tls_sk_proto_close)
> +		sk->sk_prot = ctx->sk_proto;
> +	write_unlock_bh(&sk->sk_callback_lock);
>  	release_sock(sk);
>  	if (ctx->rx_conf == TLS_SW)
>  		tls_sw_release_strp_rx(ctx);
> -	sk_proto_close(sk, timeout);
> -
> +	ctx->sk_proto_close(sk, timeout);
>  	if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW &&
>  	    ctx->tx_conf != TLS_HW_RECORD && ctx->rx_conf != TLS_HW_RECORD)
>  		tls_ctx_free(ctx);
John Fastabend July 10, 2019, 3:33 a.m. UTC | #3
Jakub Kicinski wrote:
> On Mon, 08 Jul 2019 19:15:18 +0000, John Fastabend wrote:
> > @@ -352,15 +354,18 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
> >  	if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)
> >  		goto skip_tx_cleanup;
> >  
> > -	sk->sk_prot = ctx->sk_proto;
> >  	tls_sk_proto_cleanup(sk, ctx, timeo);
> >  
> >  skip_tx_cleanup:
> > +	write_lock_bh(&sk->sk_callback_lock);
> > +	icsk->icsk_ulp_data = NULL;
> 
> Is ulp_data pointer now supposed to be updated under the
> sk_callback_lock?

Yes otherwise it can race with tls_update(). I didn't remove the
ulp pointer null set from tcp_ulp.c though. Could be done in this
patch or as a follow up.
> 
> > +	if (sk->sk_prot->close == tls_sk_proto_close)
> > +		sk->sk_prot = ctx->sk_proto;
> > +	write_unlock_bh(&sk->sk_callback_lock);
> >  	release_sock(sk);
> >  	if (ctx->rx_conf == TLS_SW)
> >  		tls_sw_release_strp_rx(ctx);
> > -	sk_proto_close(sk, timeout);
> > -
> > +	ctx->sk_proto_close(sk, timeout);
> >  	if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW &&
> >  	    ctx->tx_conf != TLS_HW_RECORD && ctx->rx_conf != TLS_HW_RECORD)
> >  		tls_ctx_free(ctx);
Jakub Kicinski July 10, 2019, 7:35 p.m. UTC | #4
On Tue, 09 Jul 2019 20:33:58 -0700, John Fastabend wrote:
> Jakub Kicinski wrote:
> > On Mon, 08 Jul 2019 19:15:18 +0000, John Fastabend wrote:  
> > > @@ -352,15 +354,18 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
> > >  	if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)
> > >  		goto skip_tx_cleanup;
> > >  
> > > -	sk->sk_prot = ctx->sk_proto;
> > >  	tls_sk_proto_cleanup(sk, ctx, timeo);
> > >  
> > >  skip_tx_cleanup:
> > > +	write_lock_bh(&sk->sk_callback_lock);
> > > +	icsk->icsk_ulp_data = NULL;  
> > 
> > Is ulp_data pointer now supposed to be updated under the
> > sk_callback_lock?  
> 
> Yes otherwise it can race with tls_update(). I didn't remove the
> ulp pointer null set from tcp_ulp.c though. Could be done in this
> patch or as a follow up.

Do we need to hold the lock in unhash, too, or is unhash called with
sk_callback_lock held?

> > > +	if (sk->sk_prot->close == tls_sk_proto_close)
> > > +		sk->sk_prot = ctx->sk_proto;
> > > +	write_unlock_bh(&sk->sk_callback_lock);
> > >  	release_sock(sk);
> > >  	if (ctx->rx_conf == TLS_SW)
> > >  		tls_sw_release_strp_rx(ctx);
> > > -	sk_proto_close(sk, timeout);
> > > -
> > > +	ctx->sk_proto_close(sk, timeout);
> > >  	if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW &&
> > >  	    ctx->tx_conf != TLS_HW_RECORD && ctx->rx_conf != TLS_HW_RECORD)
> > >  		tls_ctx_free(ctx);
John Fastabend July 11, 2019, 4:39 p.m. UTC | #5
Jakub Kicinski wrote:
> On Tue, 09 Jul 2019 20:33:58 -0700, John Fastabend wrote:
> > Jakub Kicinski wrote:
> > > On Mon, 08 Jul 2019 19:15:18 +0000, John Fastabend wrote:  
> > > > @@ -352,15 +354,18 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
> > > >  	if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)
> > > >  		goto skip_tx_cleanup;
> > > >  
> > > > -	sk->sk_prot = ctx->sk_proto;
> > > >  	tls_sk_proto_cleanup(sk, ctx, timeo);
> > > >  
> > > >  skip_tx_cleanup:
> > > > +	write_lock_bh(&sk->sk_callback_lock);
> > > > +	icsk->icsk_ulp_data = NULL;  
> > > 
> > > Is ulp_data pointer now supposed to be updated under the
> > > sk_callback_lock?  
> > 
> > Yes otherwise it can race with tls_update(). I didn't remove the
> > ulp pointer null set from tcp_ulp.c though. Could be done in this
> > patch or as a follow up.
> 
> Do we need to hold the lock in unhash, too, or is unhash called with
> sk_callback_lock held?
> 

We should hold the lock here. Also we should reset sk_prot, similarly
to the other paths, in case we get here without a close() call. syzbot
hasn't found that path yet but I'll add some tests for it.

	write_lock_bh(&sk->sk_callback_lock);
	icsk->icsk_ulp_data = NULL;
	sk->sk_prot = ctx->sk_proto;
	write_unlock_bh(&sk->sk_callback_lock);

Thanks
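
One possible shape of the unhash path described above, sketched against the
close() hunk below; it assumes the handler is tls_sk_proto_unhash() and that the
series stores the original handler in ctx->unhash (as tls_update() does), so
treat it as an illustration rather than the actual follow-up patch:

static void tls_sk_proto_unhash(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tls_context *ctx = tls_get_ctx(sk);
	void (*sk_proto_unhash)(struct sock *sk);

	if (unlikely(!ctx)) {
		if (sk->sk_prot->unhash)
			sk->sk_prot->unhash(sk);
		return;
	}

	sk_proto_unhash = ctx->unhash;

	/* Same rule as close(): detach the ULP and restore the proto under
	 * sk_callback_lock so this cannot race with tls_update() or the
	 * sockmap restore path.
	 */
	write_lock_bh(&sk->sk_callback_lock);
	icsk->icsk_ulp_data = NULL;
	if (sk->sk_prot->unhash == tls_sk_proto_unhash)
		sk->sk_prot = ctx->sk_proto;
	write_unlock_bh(&sk->sk_callback_lock);

	if (sk_proto_unhash)
		sk_proto_unhash(sk);
}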

Patch

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 50ced8aba9db..e4b3fb4bb77c 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -354,7 +354,13 @@  static inline void sk_psock_restore_proto(struct sock *sk,
 	sk->sk_write_space = psock->saved_write_space;
 
 	if (psock->sk_proto) {
-		sk->sk_prot = psock->sk_proto;
+		struct inet_connection_sock *icsk = inet_csk(sk);
+		bool has_ulp = !!icsk->icsk_ulp_data;
+
+		if (has_ulp)
+			tcp_update_ulp(sk, psock->sk_proto);
+		else
+			sk->sk_prot = psock->sk_proto;
 		psock->sk_proto = NULL;
 	}
 }
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9d36cc88d043..123cac4c96f2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2102,6 +2102,8 @@  struct tcp_ulp_ops {
 
 	/* initialize ulp */
 	int (*init)(struct sock *sk);
+	/* update ulp */
+	void (*update)(struct sock *sk, struct proto *p);
 	/* cleanup ulp */
 	void (*release)(struct sock *sk);
 
@@ -2113,6 +2115,7 @@  void tcp_unregister_ulp(struct tcp_ulp_ops *type);
 int tcp_set_ulp(struct sock *sk, const char *name);
 void tcp_get_available_ulp(char *buf, size_t len);
 void tcp_cleanup_ulp(struct sock *sk);
+void tcp_update_ulp(struct sock *sk, struct proto *p);
 
 #define MODULE_ALIAS_TCP_ULP(name)				\
 	__MODULE_INFO(alias, alias_userspace, name);		\
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 93bffaad2135..6832eeb4b785 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -585,12 +585,12 @@  EXPORT_SYMBOL_GPL(sk_psock_destroy);
 
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
-	rcu_assign_sk_user_data(sk, NULL);
 	sk_psock_cork_free(psock);
 	sk_psock_zap_ingress(psock);
-	sk_psock_restore_proto(sk, psock);
 
 	write_lock_bh(&sk->sk_callback_lock);
+	sk_psock_restore_proto(sk, psock);
+	rcu_assign_sk_user_data(sk, NULL);
 	if (psock->progs.skb_parser)
 		sk_psock_stop_strp(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 3d8a1d835471..4849edb62d52 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -96,6 +96,19 @@  void tcp_get_available_ulp(char *buf, size_t maxlen)
 	rcu_read_unlock();
 }
 
+void tcp_update_ulp(struct sock *sk, struct proto *proto)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (!icsk->icsk_ulp_ops) {
+		sk->sk_prot = proto;
+		return;
+	}
+
+	if (icsk->icsk_ulp_ops->update)
+		icsk->icsk_ulp_ops->update(sk, proto);
+}
+
 void tcp_cleanup_ulp(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index e8418456ee24..4ba5476fbc5f 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -336,15 +336,17 @@  static void tls_sk_proto_unhash(struct sock *sk)
 
 static void tls_sk_proto_close(struct sock *sk, long timeout)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tls_context *ctx = tls_get_ctx(sk);
 	long timeo = sock_sndtimeo(sk, 0);
-	void (*sk_proto_close)(struct sock *sk, long timeout);
+
+	if (unlikely(!ctx))
+		return;
 
 	if (ctx->tx_conf == TLS_SW)
 		tls_sw_cancel_work_tx(ctx);
 
 	lock_sock(sk);
-	sk_proto_close = ctx->sk_proto_close;
 
 	if (ctx->tx_conf == TLS_HW_RECORD && ctx->rx_conf == TLS_HW_RECORD)
 		goto skip_tx_cleanup;
@@ -352,15 +354,18 @@  static void tls_sk_proto_close(struct sock *sk, long timeout)
 	if (ctx->tx_conf == TLS_BASE && ctx->rx_conf == TLS_BASE)
 		goto skip_tx_cleanup;
 
-	sk->sk_prot = ctx->sk_proto;
 	tls_sk_proto_cleanup(sk, ctx, timeo);
 
 skip_tx_cleanup:
+	write_lock_bh(&sk->sk_callback_lock);
+	icsk->icsk_ulp_data = NULL;
+	if (sk->sk_prot->close == tls_sk_proto_close)
+		sk->sk_prot = ctx->sk_proto;
+	write_unlock_bh(&sk->sk_callback_lock);
 	release_sock(sk);
 	if (ctx->rx_conf == TLS_SW)
 		tls_sw_release_strp_rx(ctx);
-	sk_proto_close(sk, timeout);
-
+	ctx->sk_proto_close(sk, timeout);
 	if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW &&
 	    ctx->tx_conf != TLS_HW_RECORD && ctx->rx_conf != TLS_HW_RECORD)
 		tls_ctx_free(ctx);
@@ -836,22 +841,39 @@  static int tls_init(struct sock *sk)
 	if (sk->sk_state != TCP_ESTABLISHED)
 		return -ENOTSUPP;
 
+	tls_build_proto(sk);
+
 	/* allocate tls context */
+	write_lock_bh(&sk->sk_callback_lock);
 	ctx = create_ctx(sk);
 	if (!ctx) {
 		rc = -ENOMEM;
 		goto out;
 	}
 
-	tls_build_proto(sk);
 	ctx->tx_conf = TLS_BASE;
 	ctx->rx_conf = TLS_BASE;
 	ctx->sk_proto = sk->sk_prot;
 	update_sk_prot(sk, ctx);
 out:
+	write_unlock_bh(&sk->sk_callback_lock);
 	return rc;
 }
 
+static void tls_update(struct sock *sk, struct proto *p)
+{
+	struct tls_context *ctx;
+
+	ctx = tls_get_ctx(sk);
+	if (likely(ctx)) {
+		ctx->sk_proto_close = p->close;
+		ctx->unhash = p->unhash;
+		ctx->sk_proto = p;
+	} else {
+		sk->sk_prot = p;
+	}
+}
+
 void tls_register_device(struct tls_device *device)
 {
 	spin_lock_bh(&device_spinlock);
@@ -872,6 +894,7 @@  static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
 	.owner			= THIS_MODULE,
 	.init			= tls_init,
+	.update			= tls_update,
 };
 
 static int __init tls_register(void)