From patchwork Fri Mar 6 03:18:24 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Fan Du X-Patchwork-Id: 446989 X-Patchwork-Delegate: davem@davemloft.net Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 8AEA91400DE for ; Fri, 6 Mar 2015 14:23:30 +1100 (AEDT) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754122AbbCFDX0 (ORCPT ); Thu, 5 Mar 2015 22:23:26 -0500 Received: from mga03.intel.com ([134.134.136.65]:38531 "EHLO mga03.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754121AbbCFDXW (ORCPT ); Thu, 5 Mar 2015 22:23:22 -0500 Received: from fmsmga001.fm.intel.com ([10.253.24.23]) by orsmga103.jf.intel.com with ESMTP; 05 Mar 2015 19:20:47 -0800 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.11,351,1422950400"; d="scan'208";a="675963544" Received: from dufan-optiplex-9010.bj.intel.com ([10.238.155.116]) by fmsmga001.fm.intel.com with ESMTP; 05 Mar 2015 19:23:21 -0800 From: Fan Du To: jheffner@psc.edu Cc: davem@davemloft.net, edumazet@google.com, netdev@vger.kernel.org, fengyuleidian0615@gmail.com Subject: [PATCHv5 net-next 3/4] ipv4: Create probe timer for tcp PMTU as per RFC4821 Date: Fri, 6 Mar 2015 11:18:24 +0800 Message-Id: <1425611905-28503-4-git-send-email-fan.du@intel.com> X-Mailer: git-send-email 1.7.9.5 In-Reply-To: <1425611905-28503-1-git-send-email-fan.du@intel.com> References: <1425611905-28503-1-git-send-email-fan.du@intel.com> Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org As per RFC4821 7.3. Selecting Probe Size, a probe timer should be armed once probing has converged. Once this timer expired, probing again to take advantage of any path PMTU change. The recommended probing interval is 10 minutes per RFC1981. Probing interval could be sysctled by sysctl_tcp_probe_interval. Eric Dumazet suggested to implement pseudo timer based on 32bits jiffies tcp_time_stamp instead of using classic timer for such rare event. Signed-off-by: Fan Du --- v5: - Zero probe_size before resetting search range. v4: - No changes v3: - Fix pseudo timer delta caculation. v2: - Create a pseudo timer based on 32bits jiffies tcp_time_stamp to control when to reprobing as suggested by Eric. --- include/net/inet_connection_sock.h | 2 + include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 ++ net/ipv4/sysctl_net_ipv4.c | 7 ++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 38 ++++++++++++++++++++++++++++++++++- net/ipv4/tcp_timer.c | 1 + 7 files changed, 51 insertions(+), 2 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 5976bde..b9a6b0a 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -126,6 +126,8 @@ struct inet_connection_sock { /* Information on the current probe. */ int probe_size; + + u32 probe_timestamp; } icsk_mtup; u32 icsk_ca_priv[16]; u32 icsk_user_timeout; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 374bf3f..6323a22 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -86,6 +86,7 @@ struct netns_ipv4 { int sysctl_tcp_mtu_probing; int sysctl_tcp_base_mss; int sysctl_tcp_probe_threshold; + u32 sysctl_tcp_probe_interval; struct ping_group_range ping_group_range; diff --git a/include/net/tcp.h b/include/net/tcp.h index d269c91..dd6adbc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -67,6 +67,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* The least MTU to use for probing */ #define TCP_BASE_MSS 1024 +/* probing interval, default to 10 minutes as per RFC4821 */ +#define TCP_PROBE_INTERVAL 600 + /* Specify interval when tcp mtu probing will stop */ #define TCP_PROBE_THRESHOLD 8 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d3c09c1..fdf8991 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -890,6 +890,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_probe_interval", + .data = &init_net.ipv4.sysctl_tcp_probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 35790d9..f0c6fc3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2461,6 +2461,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; + net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; return 0; fail: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 46acddc..83e3132 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1354,6 +1354,8 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_af_ops->net_header_len; icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss); icsk->icsk_mtup.probe_size = 0; + if (icsk->icsk_mtup.enabled) + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; } EXPORT_SYMBOL(tcp_mtup_init); @@ -1823,6 +1825,31 @@ send_now: return false; } +static inline void tcp_mtu_check_reprobe(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + u32 interval; + s32 delta; + + interval = net->ipv4.sysctl_tcp_probe_interval; + delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp; + if (unlikely(delta >= interval * HZ)) { + int mss = tcp_current_mss(sk); + + /* Update current search range */ + icsk->icsk_mtup.probe_size = 0; + icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + + sizeof(struct tcphdr) + + icsk->icsk_af_ops->net_header_len; + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); + + /* Update probe time stamp */ + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; + } +} + /* Create a new MTU probe if we are ready. * MTU probe is regularly attempting to increase the path MTU by * deliberately sending larger packets. This discovers routing @@ -1865,9 +1892,16 @@ static int tcp_mtu_probe(struct sock *sk) icsk->icsk_mtup.search_low) >> 1); size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low; + /* When misfortune happens, we are reprobing actively, + * and then reprobe timer has expired. We stick with current + * probing process by not resetting search range to its orignal. + */ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) || - interval < max(1, net->ipv4.sysctl_tcp_probe_threshold)) { - /* TODO: set timer for probe_converge_event */ + interval < net->ipv4.sysctl_tcp_probe_threshold) { + /* Check whether enough time has elaplased for + * another round of probing. + */ + tcp_mtu_check_reprobe(sk); return -1; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0732b78..1550593 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -107,6 +107,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) if (net->ipv4.sysctl_tcp_mtu_probing) { if (!icsk->icsk_mtup.enabled) { icsk->icsk_mtup.enabled = 1; + icsk->icsk_mtup.probe_timestamp = tcp_time_stamp; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct net *net = sock_net(sk);