Message ID | 1592748544-41555-1-git-send-email-tariqt@mellanox.com |
---|---|
State | Superseded |
Delegated to: | David Miller |
Headers | show |
Series | [net] net: Do not clear the socket TX queue in sock_orphan() | expand |
On 6/21/20 7:09 AM, Tariq Toukan wrote: > sock_orphan() call to sk_set_socket() implies clearing the sock TX queue. > This might cause unexpected out-of-order transmit, as outstanding packets > can pick a different TX queue and bypass the ones already queued. > This is undesired in general. More specifically, it breaks the in-order > scheduling property guarantee for device-offloaded TLS sockets. > > Introduce a function variation __sk_set_socket() that does not clear > the TX queue, and call it from sock_orphan(). > All other callers of sk_set_socket() do not operate on an active socket, > so they do not need this change. > > Fixes: e022f0b4a03f ("net: Introduce sk_tx_queue_mapping") > Signed-off-by: Tariq Toukan <tariqt@mellanox.com> > Reviewed-by: Boris Pismenny <borisp@mellanox.com> > --- > include/net/sock.h | 9 +++++++-- > 1 file changed, 7 insertions(+), 2 deletions(-) > > Please queue for -stable. > > diff --git a/include/net/sock.h b/include/net/sock.h > index c53cc42b5ab9..23e43f3d79f0 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -1846,10 +1846,15 @@ static inline int sk_rx_queue_get(const struct sock *sk) > } > #endif > > +static inline void __sk_set_socket(struct sock *sk, struct socket *sock) > +{ > + sk->sk_socket = sock; > +} > + > static inline void sk_set_socket(struct sock *sk, struct socket *sock) > { > sk_tx_queue_clear(sk); > - sk->sk_socket = sock; > + __sk_set_socket(sk, sock); > } > Hmm... I think we should have a single sk_set_socket() call, and remove the sk_tx_queue_clear() from it. sk_tx_queue_clear() should be put where it is needed, instead of being hidden in sk_set_socket() diff --git a/include/net/sock.h b/include/net/sock.h index c53cc42b5ab92d0062519e60435b85c75564a967..3428619faae4340485b200f49d9cce4fb09086b3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1848,7 +1848,6 @@ static inline int sk_rx_queue_get(const struct sock *sk) static inline void sk_set_socket(struct sock *sk, struct socket *sock) { - sk_tx_queue_clear(sk); sk->sk_socket = sock; } diff --git a/net/core/sock.c b/net/core/sock.c index 6c4acf1f0220b1f925ebcfaa847632ec0dbe0b9b..134de0d37f77ba781b2b3341c94a97a1b2d57a2d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1767,6 +1767,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, cgroup_sk_alloc(&sk->sk_cgrp_data); sock_update_classid(&sk->sk_cgrp_data); sock_update_netprioidx(&sk->sk_cgrp_data); + sk_tx_queue_clear(sk); } return sk; @@ -1990,6 +1991,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ sk_refcnt_debug_inc(newsk); sk_set_socket(newsk, NULL); + sk_tx_queue_clear(newsk); RCU_INIT_POINTER(newsk->sk_wq, NULL); if (newsk->sk_prot->sockets_allocated)
On 6/22/2020 6:53 PM, Eric Dumazet wrote: > > > On 6/21/20 7:09 AM, Tariq Toukan wrote: >> sock_orphan() call to sk_set_socket() implies clearing the sock TX queue. >> This might cause unexpected out-of-order transmit, as outstanding packets >> can pick a different TX queue and bypass the ones already queued. >> This is undesired in general. More specifically, it breaks the in-order >> scheduling property guarantee for device-offloaded TLS sockets. >> >> Introduce a function variation __sk_set_socket() that does not clear >> the TX queue, and call it from sock_orphan(). >> All other callers of sk_set_socket() do not operate on an active socket, >> so they do not need this change. >> >> Fixes: e022f0b4a03f ("net: Introduce sk_tx_queue_mapping") >> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> >> Reviewed-by: Boris Pismenny <borisp@mellanox.com> >> --- >> include/net/sock.h | 9 +++++++-- >> 1 file changed, 7 insertions(+), 2 deletions(-) >> >> Please queue for -stable. >> >> diff --git a/include/net/sock.h b/include/net/sock.h >> index c53cc42b5ab9..23e43f3d79f0 100644 >> --- a/include/net/sock.h >> +++ b/include/net/sock.h >> @@ -1846,10 +1846,15 @@ static inline int sk_rx_queue_get(const struct sock *sk) >> } >> #endif >> >> +static inline void __sk_set_socket(struct sock *sk, struct socket *sock) >> +{ >> + sk->sk_socket = sock; >> +} >> + >> static inline void sk_set_socket(struct sock *sk, struct socket *sock) >> { >> sk_tx_queue_clear(sk); >> - sk->sk_socket = sock; >> + __sk_set_socket(sk, sock); >> } >> > > Hmm... > > I think we should have a single sk_set_socket() call, and remove > the sk_tx_queue_clear() from it. > > sk_tx_queue_clear() should be put where it is needed, instead of being hidden > in sk_set_socket() > Thanks Eric, sounds good to me, I will respin, just have some questions below. > diff --git a/include/net/sock.h b/include/net/sock.h > index c53cc42b5ab92d0062519e60435b85c75564a967..3428619faae4340485b200f49d9cce4fb09086b3 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -1848,7 +1848,6 @@ static inline int sk_rx_queue_get(const struct sock *sk) > > static inline void sk_set_socket(struct sock *sk, struct socket *sock) > { > - sk_tx_queue_clear(sk); > sk->sk_socket = sock; > } > > diff --git a/net/core/sock.c b/net/core/sock.c > index 6c4acf1f0220b1f925ebcfaa847632ec0dbe0b9b..134de0d37f77ba781b2b3341c94a97a1b2d57a2d 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -1767,6 +1767,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, > cgroup_sk_alloc(&sk->sk_cgrp_data); > sock_update_classid(&sk->sk_cgrp_data); > sock_update_netprioidx(&sk->sk_cgrp_data); > + sk_tx_queue_clear(sk); Why add it here? I don't see a call to sk_set_socket(). > } > > return sk; > @@ -1990,6 +1991,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) > */ > sk_refcnt_debug_inc(newsk); > sk_set_socket(newsk, NULL); > + sk_tx_queue_clear(newsk); > RCU_INIT_POINTER(newsk->sk_wq, NULL); > > if (newsk->sk_prot->sockets_allocated) > So in addition to sock_orphan(), now sk_tx_queue_clear() won't be called also from: 1. sock_graft(): Looks fine to me. 2. sock_init_data(): I think we should add an explicit call to sk_tx_queue_clear() here. The one for RX sk_rx_queue_clear() is already there. 3. mptcp_sock_graft(): Looks fine to me. Regards, Tariq
On 6/22/20 9:24 AM, Tariq Toukan wrote: > > > On 6/22/2020 6:53 PM, Eric Dumazet wrote: >> >> >> On 6/21/20 7:09 AM, Tariq Toukan wrote: >>> sock_orphan() call to sk_set_socket() implies clearing the sock TX queue. >>> This might cause unexpected out-of-order transmit, as outstanding packets >>> can pick a different TX queue and bypass the ones already queued. >>> This is undesired in general. More specifically, it breaks the in-order >>> scheduling property guarantee for device-offloaded TLS sockets. >>> >>> Introduce a function variation __sk_set_socket() that does not clear >>> the TX queue, and call it from sock_orphan(). >>> All other callers of sk_set_socket() do not operate on an active socket, >>> so they do not need this change. >>> >>> Fixes: e022f0b4a03f ("net: Introduce sk_tx_queue_mapping") >>> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> >>> Reviewed-by: Boris Pismenny <borisp@mellanox.com> >>> --- >>> include/net/sock.h | 9 +++++++-- >>> 1 file changed, 7 insertions(+), 2 deletions(-) >>> >>> Please queue for -stable. >>> >>> diff --git a/include/net/sock.h b/include/net/sock.h >>> index c53cc42b5ab9..23e43f3d79f0 100644 >>> --- a/include/net/sock.h >>> +++ b/include/net/sock.h >>> @@ -1846,10 +1846,15 @@ static inline int sk_rx_queue_get(const struct sock *sk) >>> } >>> #endif >>> +static inline void __sk_set_socket(struct sock *sk, struct socket *sock) >>> +{ >>> + sk->sk_socket = sock; >>> +} >>> + >>> static inline void sk_set_socket(struct sock *sk, struct socket *sock) >>> { >>> sk_tx_queue_clear(sk); >>> - sk->sk_socket = sock; >>> + __sk_set_socket(sk, sock); >>> } >>> >> >> Hmm... >> >> I think we should have a single sk_set_socket() call, and remove >> the sk_tx_queue_clear() from it. >> >> sk_tx_queue_clear() should be put where it is needed, instead of being hidden >> in sk_set_socket() >> > > Thanks Eric, sounds good to me, I will respin, just have some questions below. > >> diff --git a/include/net/sock.h b/include/net/sock.h >> index c53cc42b5ab92d0062519e60435b85c75564a967..3428619faae4340485b200f49d9cce4fb09086b3 100644 >> --- a/include/net/sock.h >> +++ b/include/net/sock.h >> @@ -1848,7 +1848,6 @@ static inline int sk_rx_queue_get(const struct sock *sk) >> static inline void sk_set_socket(struct sock *sk, struct socket *sock) >> { >> - sk_tx_queue_clear(sk); >> sk->sk_socket = sock; >> } >> diff --git a/net/core/sock.c b/net/core/sock.c >> index 6c4acf1f0220b1f925ebcfaa847632ec0dbe0b9b..134de0d37f77ba781b2b3341c94a97a1b2d57a2d 100644 >> --- a/net/core/sock.c >> +++ b/net/core/sock.c >> @@ -1767,6 +1767,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, >> cgroup_sk_alloc(&sk->sk_cgrp_data); >> sock_update_classid(&sk->sk_cgrp_data); >> sock_update_netprioidx(&sk->sk_cgrp_data); >> + sk_tx_queue_clear(sk); > > Why add it here? > I don't see a call to sk_set_socket(). Yes, but the intent is to set the initial value of sk->sk_tx_queue_mapping (USHRT_MAX) when socket object is allocated/populated, not later in some unrelated paths. We move into one location all the initializers. (Most fields initial value is 0, so we do not clear them a second time) > >> } >> return sk; >> @@ -1990,6 +1991,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) >> */ >> sk_refcnt_debug_inc(newsk); >> sk_set_socket(newsk, NULL); >> + sk_tx_queue_clear(newsk); >> RCU_INIT_POINTER(newsk->sk_wq, NULL); >> if (newsk->sk_prot->sockets_allocated) >> > > So in addition to sock_orphan(), now sk_tx_queue_clear() won't be called also from: > 1. sock_graft(): Looks fine to me. > 2. sock_init_data(): I think we should add an explicit call to sk_tx_queue_clear() here. The one for RX sk_rx_queue_clear() is already there. Why ? Initial value found is socket should be just fine. If not, normal skb->ooo_okay should prevail, if protocols really care about OOO. > 3. mptcp_sock_graft(): Looks fine to me. > > Regards, > Tariq
On 6/22/2020 7:56 PM, Eric Dumazet wrote: > > > On 6/22/20 9:24 AM, Tariq Toukan wrote: >> >> >> On 6/22/2020 6:53 PM, Eric Dumazet wrote: >>> >>> >>> On 6/21/20 7:09 AM, Tariq Toukan wrote: >>>> sock_orphan() call to sk_set_socket() implies clearing the sock TX queue. >>>> This might cause unexpected out-of-order transmit, as outstanding packets >>>> can pick a different TX queue and bypass the ones already queued. >>>> This is undesired in general. More specifically, it breaks the in-order >>>> scheduling property guarantee for device-offloaded TLS sockets. >>>> >>>> Introduce a function variation __sk_set_socket() that does not clear >>>> the TX queue, and call it from sock_orphan(). >>>> All other callers of sk_set_socket() do not operate on an active socket, >>>> so they do not need this change. >>>> >>>> Fixes: e022f0b4a03f ("net: Introduce sk_tx_queue_mapping") >>>> Signed-off-by: Tariq Toukan <tariqt@mellanox.com> >>>> Reviewed-by: Boris Pismenny <borisp@mellanox.com> >>>> --- >>>> include/net/sock.h | 9 +++++++-- >>>> 1 file changed, 7 insertions(+), 2 deletions(-) >>>> >>>> Please queue for -stable. >>>> >>>> diff --git a/include/net/sock.h b/include/net/sock.h >>>> index c53cc42b5ab9..23e43f3d79f0 100644 >>>> --- a/include/net/sock.h >>>> +++ b/include/net/sock.h >>>> @@ -1846,10 +1846,15 @@ static inline int sk_rx_queue_get(const struct sock *sk) >>>> } >>>> #endif >>>> +static inline void __sk_set_socket(struct sock *sk, struct socket *sock) >>>> +{ >>>> + sk->sk_socket = sock; >>>> +} >>>> + >>>> static inline void sk_set_socket(struct sock *sk, struct socket *sock) >>>> { >>>> sk_tx_queue_clear(sk); >>>> - sk->sk_socket = sock; >>>> + __sk_set_socket(sk, sock); >>>> } >>>> >>> >>> Hmm... >>> >>> I think we should have a single sk_set_socket() call, and remove >>> the sk_tx_queue_clear() from it. >>> >>> sk_tx_queue_clear() should be put where it is needed, instead of being hidden >>> in sk_set_socket() >>> >> >> Thanks Eric, sounds good to me, I will respin, just have some questions below. >> >>> diff --git a/include/net/sock.h b/include/net/sock.h >>> index c53cc42b5ab92d0062519e60435b85c75564a967..3428619faae4340485b200f49d9cce4fb09086b3 100644 >>> --- a/include/net/sock.h >>> +++ b/include/net/sock.h >>> @@ -1848,7 +1848,6 @@ static inline int sk_rx_queue_get(const struct sock *sk) >>> static inline void sk_set_socket(struct sock *sk, struct socket *sock) >>> { >>> - sk_tx_queue_clear(sk); >>> sk->sk_socket = sock; >>> } >>> diff --git a/net/core/sock.c b/net/core/sock.c >>> index 6c4acf1f0220b1f925ebcfaa847632ec0dbe0b9b..134de0d37f77ba781b2b3341c94a97a1b2d57a2d 100644 >>> --- a/net/core/sock.c >>> +++ b/net/core/sock.c >>> @@ -1767,6 +1767,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, >>> cgroup_sk_alloc(&sk->sk_cgrp_data); >>> sock_update_classid(&sk->sk_cgrp_data); >>> sock_update_netprioidx(&sk->sk_cgrp_data); >>> + sk_tx_queue_clear(sk); >> >> Why add it here? >> I don't see a call to sk_set_socket(). > > Yes, but the intent is to set the initial value of sk->sk_tx_queue_mapping (USHRT_MAX) > when socket object is allocated/populated, not later in some unrelated paths. > > We move into one location all the initializers. > (Most fields initial value is 0, so we do not clear them a second time) > >> >>> } >>> return sk; >>> @@ -1990,6 +1991,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) >>> */ >>> sk_refcnt_debug_inc(newsk); >>> sk_set_socket(newsk, NULL); >>> + sk_tx_queue_clear(newsk); >>> RCU_INIT_POINTER(newsk->sk_wq, NULL); >>> if (newsk->sk_prot->sockets_allocated) >>> >> >> So in addition to sock_orphan(), now sk_tx_queue_clear() won't be called also from: >> 1. sock_graft(): Looks fine to me. >> 2. sock_init_data(): I think we should add an explicit call to sk_tx_queue_clear() here. The one for RX sk_rx_queue_clear() is already there. > > Why ? Initial value found is socket should be just fine. > > If not, normal skb->ooo_okay should prevail, if protocols really care about OOO. > >> 3. mptcp_sock_graft(): Looks fine to me. >> >> Regards, >> Tariq Thanks. I prepared the patch, will test it against the bug repro I have and submit. Tariq
diff --git a/include/net/sock.h b/include/net/sock.h index c53cc42b5ab9..23e43f3d79f0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1846,10 +1846,15 @@ static inline int sk_rx_queue_get(const struct sock *sk) } #endif +static inline void __sk_set_socket(struct sock *sk, struct socket *sock) +{ + sk->sk_socket = sock; +} + static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk_tx_queue_clear(sk); - sk->sk_socket = sock; + __sk_set_socket(sk, sock); } static inline wait_queue_head_t *sk_sleep(struct sock *sk) @@ -1868,7 +1873,7 @@ static inline void sock_orphan(struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); - sk_set_socket(sk, NULL); + __sk_set_socket(sk, NULL); sk->sk_wq = NULL; write_unlock_bh(&sk->sk_callback_lock); }