
[bisected] xfrm: TCP connection initiating PMTU discovery stalls on v3.

Message ID 2630078.LdJXSsPB40@h2o.as.studentenwerk.mhn.de
State RFC, archived
Delegated to: David Miller

Commit Message

Wolfgang Walter Dec. 12, 2014, 4:58 p.m. UTC
Hello Eric,

my plan to use e114a710aa5058c0ba4aa1dfb105132aefeb5e04 to make it easier to apply tcp-refine-TSO-autosizing was a bad idea as that needs further patches :-). Backporting

tcp-refine-TSO-autosizing
https://patchwork.ozlabs.org/patch/418506/

seems to be the easier solution.

I tested whether 3.18 and 3.17.6 are affected, and they are. I applied tcp-refine-TSO-autosizing and the one-liner below, and then both work fine.

Attached is

1) my backport of tcp-refine-TSO-autosizing for 3.14.26
2) my backport of tcp-refine-TSO-autosizing for 3.17.6

For 3.18 tcp-refine-TSO-autosizing just applies fine.

I tested 3.14.26 + patches, 3.17.6 + patches and 3.18.2 + patches.

And here again the one-liner:

Comments

Eric Dumazet Dec. 12, 2014, 5:27 p.m. UTC | #1
On Fri, 2014-12-12 at 17:58 +0100, Wolfgang Walter wrote:

> This fixes hangs of local tcp connections over ipsec tunnels where
> pmtu is lower than the mtu of the interface.

Again, this is a workaround; the one-liner is not a clean patch.

Something is wrong; we should fix the GSO path instead.

Why? Because we can queue a GSO packet in a qdisc, then later call
sk_setup_caps() (too late)


Wolfgang Walter Dec. 12, 2014, 8:31 p.m. UTC | #2
On Friday, 12 December 2014, 09:27:05, Eric Dumazet wrote:
> On Fri, 2014-12-12 at 17:58 +0100, Wolfgang Walter wrote:
> > This fixes hangs of local tcp connections over ipsec tunnels where
> > pmtu is lower than the mtu of the interface.
> 
> Again, this is a workaround; the one-liner is not a clean patch.
> 
> Something is wrong; we should fix the GSO path instead.
> 
> Why? Because we can queue a GSO packet in a qdisc, then later call
> sk_setup_caps() (too late)

Hmm. Do you think that sk_setup_caps should not disable GSO later at all? Or 
only that it should not influence already queued packets?

In the case of an ipsec tunnel, GSO gets disabled later (for TCP connections)
because dst->header_len is zero at first and non-zero later. TCP connections
initiated not too long afterwards start with GSO disabled from the beginning.


dst->header_len being non-zero for ipsec tunnels seems to be the case with or
without an adjustment via PMTU. It seems that routing just does not know it
from the beginning (for incoming connections) :-).

Why does a non-zero dst->header_len disable GSO? Is it just an optimization, or
does GSO not work in that case? If dst->header_len being non-zero signals that
ipsec tunnel mode can't handle GSO at all, then the problem would not lie in
the GSO path.
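
For reference, this seems to be what sk_setup_caps() in net/core/sock.c does in
kernels of this era; a simplified sketch from my reading of the sources, so
please double-check against the actual tree:

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			/* non-zero header_len (e.g. xfrm tunnel): drop GSO */
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
		}
	}
}

So a non-zero dst->header_len simply clears the GSO capability bits on the
socket; it does not touch packets that are already queued.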


There is one thing which indicates that there is also a problem in the GSO
path, though. As my tests demonstrated, disabling GSO on the physical interface
serving the ipsec tunnel does not prevent the hangs. It does prevent them,
though, if sk_setup_caps() sets sk->sk_gso_max_segs = 1 when sk_can_gso(sk)
returns false. This was the first variation of the patch, where
sk->sk_gso_max_segs was not set if dst->header_len is non-zero.


Concerning qdisc: an ipsec tunnel doesn't have a "tunnel device", so it does
not have a qdisc? The ipsec tunnel transforms packets, and the results are (in
case of an ipsec ESP tunnel) ESP packets which cannot be resegmented. Only
these transformed packets are finally queued to a qdisc of a network device.


Regards,
Thomas Jarosch Dec. 12, 2014, 9:30 p.m. UTC | #3
Wolfgang,

On 12/12/2014 09:31 PM, Wolfgang Walter wrote:
> There is one thing which indicates that there is also a problem in the GSO
> path, though. As my tests demonstrated, disabling GSO on the physical
> interface serving the ipsec tunnel does not prevent the hangs. It does
> prevent them, though, if sk_setup_caps() sets sk->sk_gso_max_segs = 1 when
> sk_can_gso(sk) returns false. This was the first variation of the patch,
> where sk->sk_gso_max_segs was not set if dst->header_len is non-zero.

if it's any help: disabling TX checksumming prevents the hang even on an
unpatched 3.14.x kernel. With your debug statements in place, you could compare
the path of the packets with and without TX checksumming.

HTH,
Thomas

Eric Dumazet Dec. 12, 2014, 10:31 p.m. UTC | #4
On Fri, 2014-12-12 at 22:30 +0100, Thomas Jarosch wrote:

> if it's any help: disabling TX checksumming prevents the hang even on an
> unpatched 3.14.x kernel. With your debug statements in place, you could
> compare the path of the packets with and without TX checksumming.

Disabling TX checksum automatically disables GSO.

Disabling TSO/GSO is the real 'cure' for the time being, you can keep TX
checksums.
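
(Roughly, the feature dependencies are handled in netdev_fix_features() in
net/core/dev.c; the excerpt below is a simplified sketch and the exact clauses
vary by kernel version, so treat it as illustration only.)

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	/* TSO also requires checksum offload for the protocol in question. */
	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
					!(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}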


Wolfgang Walter Dec. 12, 2014, 11:47 p.m. UTC | #5
On Friday, 12 December 2014, 14:31:49, you wrote:
> On Fri, 2014-12-12 at 22:30 +0100, Thomas Jarosch wrote:
> > if it's any help: disabling TX checksumming prevents the hang even on an
> > unpatched 3.14.x kernel. With your debug statements in place, you could
> > compare the path of the packets with and without TX checksumming.

I can't disable it as the driver will not allow it:
# ethtool -K eth0 tx off
Cannot change tx-checksumming
Could not change any device features

> Disabling TX checksum automatically disables GSO.
> 
> Disabling TSO/GSO is the real 'cure' for the time being, you can keep TX
> checksums.

This does not help here. With GSO disabled (for the network device serving the
ipsec traffic, here e.g. eth0) the hang still occurs. tcp_set_skb_tso_segs()
gets called and the else branch of the skb->len <= mss_now check is taken.

I tested this several times.
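
For context, tcp_set_skb_tso_segs() in net/ipv4/tcp_output.c looks roughly like
this in 3.14 (simplified from my reading of the sources, so take it with a
grain of salt):

static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
				 unsigned int mss_now)
{
	if (skb->len <= mss_now || !sk_can_gso(sk) ||
	    skb->ip_summed == CHECKSUM_NONE) {
		/* plain packet: one segment, no gso_size */
		skb_shinfo(skb)->gso_segs = 1;
		skb_shinfo(skb)->gso_size = 0;
		skb_shinfo(skb)->gso_type = 0;
	} else {
		/* mark the skb as a GSO packet at the current MSS */
		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss_now);
		skb_shinfo(skb)->gso_size = mss_now;
		skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	}
}

Note that the decision here depends on sk_can_gso(sk) and skb->ip_summed, not
on the features of the device the packet eventually leaves through.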

Regards,
Eric Dumazet Dec. 13, 2014, 12:15 a.m. UTC | #6
On Sat, 2014-12-13 at 00:47 +0100, Wolfgang Walter wrote:

> I can't disable it as the driver will not allow it:
> # ethtool -K eth0 tx off
> Cannot change tx-checksumming
> Could not change any device features

Sounds like a bug in itself :(


Wolfgang Walter Dec. 13, 2014, 12:43 a.m. UTC | #7
On Friday, 12 December 2014, 16:15:40, Eric Dumazet wrote:
> On Sat, 2014-12-13 at 00:47 +0100, Wolfgang Walter wrote:
> > I can't disable it as the driver will not allow it:
> > # ethtool -K eth0 tx off
> > Cannot change tx-checksumming
> > Could not change any device features
> 
> Sounds like a bug in itself :(
> 

Yes, that is what I meant in my email: there also seems to be a bug in the GSO
path.


I don't understand how tcp_sendmsg() marks gso/non-gso skbs.

tcp_send_mss() calls tcp_xmit_size_goal(), and there is a check for GSO
(sk_can_gso(sk)). If sk_can_gso() returns false, then tcp_xmit_size_goal()
returns mss_now. gso_segs seems to be set to zero. In tcp_output.c,
gso_segs == 0 seems to allow GSO.
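
(tcp_send_mss() itself is tiny; roughly, from my reading of the 3.14 sources:)

static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
	int mss_now;

	mss_now = tcp_current_mss(sk);
	*size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));

	return mss_now;
}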

What if the MSS later shrinks? Couldn't it be that tcp_set_skb_tso_segs() then
gets called and the GSO branch is taken?

Regards,
Wolfgang Walter Dec. 15, 2014, 6:04 p.m. UTC | #8
Hello Eric!

On Friday, 12 December 2014, 16:15:40, Eric Dumazet wrote:
> On Sat, 2014-12-13 at 00:47 +0100, Wolfgang Walter wrote:
> > I can't disable it as the driver will not allow it:
> > # ethtool -K eth0 tx off
> > Cannot change tx-checksumming
> > Could not change any device features
> 
> Sounds like a bug in itself :(

I think the "gso disabled for interface" case is broken because:

tcp_sendmsg() sets gso_segs to zero

tcp_sendmsg() calls tcp_push_one() or __tcp_push_pending_frames()

tcp_push_one() and  __tcp_push_pending_frames() call tcp_write_xmit()

tcp_write_xmit() calls tcp_init_tso_segs()

tcp_init_tso_segs() calls tcp_set_skb_tso_segs() because !tso_segs is true

tcp_set_skb_tso_segs() creates a GSO packet if the packet size is larger than
mss_now.


I think it was assumed that tcp_init_tso_segs() checks whether the socket
supports GSO.

I didn't check the other callers of tcp_init_tso_segs().
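
For reference, tcp_init_tso_segs() is roughly the following (again simplified
from my reading of the 3.14 sources):

static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
			     unsigned int mss_now)
{
	int tso_segs = tcp_skb_pcount(skb);

	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
		tcp_set_skb_tso_segs(sk, skb, mss_now);
		tso_segs = tcp_skb_pcount(skb);
	}
	return tso_segs;
}

So with gso_segs still zero it always falls through to tcp_set_skb_tso_segs(),
and whether the skb ends up marked as a GSO packet is decided entirely there.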


Regards,

Patch

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -131,7 +131,7 @@  struct tcp_sock {
 	/* inet_connection_sock has to be the first member of tcp_sock */
 	struct inet_connection_sock	inet_conn;
 	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
-	u16	xmit_size_goal_segs; /* Goal for segmenting output packets */
+	u16	gso_segs;	/* Max number of segs per GSO packet	*/
 
 /*
  *	Header prediction flags
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -836,47 +836,29 @@  static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 				       int large_allowed)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 xmit_size_goal, old_size_goal;
-
-	xmit_size_goal = mss_now;
-
-	if (large_allowed && sk_can_gso(sk)) {
-		u32 gso_size, hlen;
-
-		/* Maybe we should/could use sk->sk_prot->max_header here ? */
-		hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
-		       inet_csk(sk)->icsk_ext_hdr_len +
-		       tp->tcp_header_len;
-
-		/* Goal is to send at least one packet per ms,
-		 * not one big TSO packet every 100 ms.
-		 * This preserves ACK clocking and is consistent
-		 * with tcp_tso_should_defer() heuristic.
-		 */
-		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
-		gso_size = max_t(u32, gso_size,
-				 sysctl_tcp_min_tso_segs * mss_now);
-
-		xmit_size_goal = min_t(u32, gso_size,
-				       sk->sk_gso_max_size - 1 - hlen);
-
-		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
-
-		/* We try hard to avoid divides here */
-		old_size_goal = tp->xmit_size_goal_segs * mss_now;
-
-		if (likely(old_size_goal <= xmit_size_goal &&
-			   old_size_goal + mss_now > xmit_size_goal)) {
-			xmit_size_goal = old_size_goal;
-		} else {
-			tp->xmit_size_goal_segs =
-				min_t(u16, xmit_size_goal / mss_now,
-				      sk->sk_gso_max_segs);
-			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
-		}
+	u32 new_size_goal, size_goal, hlen;
+
+	if (!large_allowed || !sk_can_gso(sk))
+		return mss_now;
+
+	/* Maybe we should/could use sk->sk_prot->max_header here ? */
+	hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+	       inet_csk(sk)->icsk_ext_hdr_len +
+	       tp->tcp_header_len;
+
+	new_size_goal = sk->sk_gso_max_size - 1 - hlen;
+	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+
+	/* We try hard to avoid divides here */
+	size_goal = tp->gso_segs * mss_now;
+	if (unlikely(new_size_goal < size_goal ||
+		     new_size_goal >= size_goal + mss_now)) {
+		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+				     sk->sk_gso_max_segs);
+		size_goal = tp->gso_segs * mss_now;
 	}
 
-	return max(xmit_size_goal, mss_now);
+	return max(size_goal, mss_now);
 }
 
 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1484,6 +1484,27 @@  static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
 		((nonagle & TCP_NAGLE_CORK) ||
 		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+	u32 bytes, segs;
+
+	bytes = min(sk->sk_pacing_rate >> 10,
+		    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+	/* Goal is to send at least one packet per ms,
+	 * not one big TSO packet every 100 ms.
+	 * This preserves ACK clocking and is consistent
+	 * with tcp_tso_should_defer() heuristic.
+	 */
+	segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+	return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
 /* Returns the portion of skb which can be sent right away */
 static unsigned int tcp_mss_split_point(const struct sock *sk,
 					const struct sk_buff *skb,
@@ -1687,7 +1708,7 @@  static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  * This algorithm is from John Heffner.
  */
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
-				 bool *is_cwnd_limited)
+				 bool *is_cwnd_limited, u32 max_segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1717,8 +1738,7 @@  static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	limit = min(send_win, cong_win);
 
 	/* If a full-sized TSO skb can be sent, do it. */
-	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-			   tp->xmit_size_goal_segs * tp->mss_cache))
+	if (limit >= max_segs * tp->mss_cache)
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
@@ -1915,6 +1935,7 @@  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	int cwnd_quota;
 	int result;
 	bool is_cwnd_limited = false;
+	u32 max_segs;
 
 	sent_pkts = 0;
 
@@ -1928,6 +1949,7 @@  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		}
 	}
 
+	max_segs = tcp_tso_autosize(sk, mss_now);
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
@@ -1960,10 +1982,23 @@  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		} else {
 			if (!push_one &&
-			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+						 max_segs))
 				break;
 		}
 
+		limit = mss_now;
+		if (tso_segs > 1 && !tcp_urg_mode(tp))
+			limit = tcp_mss_split_point(sk, skb, mss_now,
+						    min_t(unsigned int,
+							  cwnd_quota,
+							  max_segs),
+						    nonagle);
+
+		if (skb->len > limit &&
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+			break;
+
 		/* TCP Small Queues :
 		 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 		 * This allows for :
@@ -1974,8 +2009,8 @@  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		 * of queued bytes to ensure line rate.
 		 * One example is wifi aggregation (802.11 AMPDU)
 		 */
-		limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
-			      sk->sk_pacing_rate >> 10);
+		limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+		limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
 
 		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
@@ -1988,18 +2023,6 @@  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 				break;
 		}
 
-		limit = mss_now;
-		if (tso_segs > 1 && !tcp_urg_mode(tp))
-			limit = tcp_mss_split_point(sk, skb, mss_now,
-						    min_t(unsigned int,
-							  cwnd_quota,
-							  sk->sk_gso_max_segs),
-						    nonagle);
-
-		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
-			break;
-
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
 		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
-- 
1.7.10.4