From patchwork Wed Jul 11 02:18:04 2012 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Li Yu X-Patchwork-Id: 170345 X-Patchwork-Delegate: davem@davemloft.net Return-Path: X-Original-To: patchwork-incoming@ozlabs.org Delivered-To: patchwork-incoming@ozlabs.org Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id C997E2C020A for ; Wed, 11 Jul 2012 12:18:39 +1000 (EST) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754910Ab2GKCSg (ORCPT ); Tue, 10 Jul 2012 22:18:36 -0400 Received: from mail-yx0-f174.google.com ([209.85.213.174]:35258 "EHLO mail-yx0-f174.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754682Ab2GKCSK (ORCPT ); Tue, 10 Jul 2012 22:18:10 -0400 Received: by yenl2 with SMTP id l2so745376yen.19 for ; Tue, 10 Jul 2012 19:18:10 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20120113; h=message-id:date:from:user-agent:mime-version:to:subject:references :in-reply-to:content-type:content-transfer-encoding; bh=Gq3+uOe+67RefsXZTzOlvXVlobJuwzNn9tazEomnh8o=; b=kxOHh1iyvwlbQ7gPBreAPSWuHT8XYs7PSW144PlSAIp7RjOnxr6VDG+nfX1pyRceYD v6d6aGSQ3M/lFDxazpmy8YLbGAV7PApYpHdiazWgm09jIv5MQO7WC4zozE6tmMWONmtT 6xVVVKKP/iHhzip/Zte6AmPGx5M/HEOCYgW74Evl6kBMYIU8o/oo3OmnynSZZAfRCQZV Upel9smHD5fXLD7it1oFPfGwSSvsv4VQe+0dPpZmaokZW4QeqLU1Gaiczl6oDlKgQ4Ag nSOFLaeKA8uDyDE3OqWi+HAaNLHU+8bRsBqU3As/dPs2yKZLDRZnF5SBZph1WmaEfSB3 vfhA== Received: by 10.66.83.200 with SMTP id s8mr78143760pay.10.1341973089542; Tue, 10 Jul 2012 19:18:09 -0700 (PDT) Received: from [10.32.101.204] ([202.55.20.10]) by mx.google.com with ESMTPS id vz9sm687037pbc.12.2012.07.10.19.18.06 (version=SSLv3 cipher=OTHER); Tue, 10 Jul 2012 19:18:08 -0700 (PDT) Message-ID: <4FFCE25C.5080309@gmail.com> Date: Wed, 11 Jul 2012 10:18:04 +0800 From: Li Yu User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:13.0) Gecko/20120615 
Thunderbird/13.0.1 MIME-Version: 1.0 To: Linux Netdev List Subject: [RFC][PATCH 4/4] skbtrace: four TCP/IP tracepoints tcp/icsk_connection, tcp_sendlim, tcp_congestion References: <4FFBC6B6.2000600@gmail.com> In-Reply-To: <4FFBC6B6.2000600@gmail.com> Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org From: Li Yu This implements four skbtrace tracepoints for TCP. (1) tcp/icsk_connection trace basic state transitions of the TCP protocol, e.g. SYN_RECV -> ESTABLISHED. (2) tcp_sendlim traces TCP send limitations, e.g. the congestion window limiting how many segments can be sent. (3) tcp_congestion traces TCP congestion events, e.g. Loss, FRTO, etc. Thanks. Signed-off-by: Li Yu --- include/linux/skbtrace.h | 3 include/linux/skbtrace_api.h | 1 include/net/skbtrace_api_ipv4.h | 124 ++++++++++++ include/trace/events/skbtrace.h | 1 include/trace/events/skbtrace_ipv4.h | 49 ++++ net/core/net-traces.c | 4 net/ipv4/Kconfig | 8 net/ipv4/Makefile | 1 net/ipv4/inet_connection_sock.c | 2 net/ipv4/inet_timewait_sock.c | 3 net/ipv4/skbtrace-ipv4.c | 345 +++++++++++++++++++++++++++++++++++ net/ipv4/tcp.c | 5 net/ipv4/tcp_input.c | 12 + net/ipv4/tcp_ipv4.c | 4 net/ipv4/tcp_minisocks.c | 4 net/ipv4/tcp_output.c | 61 ++++-- 16 files changed, 610 insertions(+), 17 deletions(-) @@ -1777,6 +1783,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } } + sndlim = skbtrace_tcp_sndlim_ok; + result = 0; while ((skb = tcp_send_head(sk))) { unsigned int limit; @@ -1784,20 +1792,27 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, BUG_ON(!tso_segs); cwnd_quota = tcp_cwnd_test(tp, skb); - if (!cwnd_quota) + if (!cwnd_quota) { + sndlim = skbtrace_tcp_sndlim_cwnd; break; + } - if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { + sndlim = skbtrace_tcp_sndlim_swnd; break; - + } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, 
mss_now, - (tcp_skb_is_last(sk, skb) ? - nonagle : TCP_NAGLE_PUSH)))) + (tcp_skb_is_last(sk, skb) ? + nonagle : TCP_NAGLE_PUSH)))) { + sndlim = skbtrace_tcp_sndlim_nagle; break; + } } else { - if (!push_one && tcp_tso_should_defer(sk, skb)) + if (!push_one && tcp_tso_should_defer(sk, skb)) { + sndlim = skbtrace_tcp_sndlim_tso; break; + } } limit = mss_now; @@ -1806,14 +1821,18 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, cwnd_quota); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) { + sndlim = skbtrace_tcp_sndlim_frag; break; + } TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) + result = tcp_transmit_skb(sk, skb, 1, gfp); + if (unlikely(result)) { + sndlim = skbtrace_tcp_sndlim_other; break; - + } /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ @@ -1822,17 +1841,25 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, tcp_minshall_update(tp, mss_now, skb); sent_pkts += tcp_skb_pcount(skb); - if (push_one) + if (push_one) { + sndlim = skbtrace_tcp_sndlim_pushone; break; + } } if (inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery) tp->prr_out += sent_pkts; if (likely(sent_pkts)) { + trace_tcp_sendlimit(sk, skbtrace_tcp_sndlim_ok, sent_pkts); tcp_cwnd_validate(sk); - return false; - } - return !tp->packets_out && tcp_send_head(sk); + retval = false; + } else + retval = !tp->packets_out && tcp_send_head(sk); + + if (skbtrace_tcp_sndlim_ok != sndlim) + trace_tcp_sendlimit(sk, sndlim, result); + + return retval; } /* Push out any pending frames which were held back due to -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html diff --git a/include/linux/skbtrace.h b/include/linux/skbtrace.h index 
34b9144..b35d7b3 100644 --- a/include/linux/skbtrace.h +++ b/include/linux/skbtrace.h @@ -67,6 +67,9 @@ extern atomic64_t skbtrace_event_seq; struct skbtrace_context { union { struct skbtrace_block blk; + struct skbtrace_tcp_cong_blk tcp_cong; + struct skbtrace_tcp_conn_blk tcp_conn; + struct skbtrace_tcp_sendlim_blk tcp_sendlim; }; }; diff --git a/include/linux/skbtrace_api.h b/include/linux/skbtrace_api.h index 7489856..281a868 100644 --- a/include/linux/skbtrace_api.h +++ b/include/linux/skbtrace_api.h @@ -68,5 +68,6 @@ struct skbtrace_block { } __packed; #include +#include #endif diff --git a/include/net/skbtrace_api_ipv4.h b/include/net/skbtrace_api_ipv4.h new file mode 100644 index 0000000..a3e6462 --- /dev/null +++ b/include/net/skbtrace_api_ipv4.h @@ -0,0 +1,124 @@ +/* + * skbtrace - sk_buff trace utilty + * + * User/Kernel Interface + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * 2012 Li Yu + * + */ +#ifndef _NET_SKBTRACE_API_IPV4_H +#define _NET_SKBTRACE_API_IPV4_H + +#include + +#ifdef __KERNEL__ +#include +#include +#endif + +/********************* TCP section *********************/ + +/* skbtrace_block->action */ +enum { + skbtrace_action_tcp_min = 101, + skbtrace_action_tcp_congestion = 101, + skbtrace_action_tcp_connection = 102, + skbtrace_action_tcp_sendlimit = 103, + skbtrace_action_tcp_max = 199, +}; + +/* TCP congestion event (101) */ + +/* flags */ +enum { + skbtrace_tcp_cong_cwr = 4, + skbtrace_tcp_cong_loss = 5, + skbtrace_tcp_cong_fastrtx = 6, + skbtrace_tcp_cong_frto = 7, + skbtrace_tcp_cong_frto_loss = 8, + skbtrace_tcp_cong_leave = 9, +}; + +struct skbtrace_tcp_cong_blk { + struct skbtrace_block blk; + __u32 rcv_rtt; + __u32 rto; + __u32 cwnd; + __u32 sndnxt; + __u32 snduna; +} __packed; + +/* TCP basic connection events (102) */ +struct skbtrace_tcp_conn_blk { + struct skbtrace_block blk; + union { + struct { + struct sockaddr local; + struct sockaddr peer; + }; + struct { + struct sockaddr_in local; + struct sockaddr_in peer; + } inet; + struct { + struct sockaddr_in6 local; + struct sockaddr_in6 peer; + } inet6; + } addr; +} __packed; + +/* TCP send limit event (103) */ +enum { + skbtrace_tcp_sndlim_cwnd = 4, + skbtrace_tcp_sndlim_swnd = 5, + skbtrace_tcp_sndlim_nagle = 6, + skbtrace_tcp_sndlim_tso = 7, + skbtrace_tcp_sndlim_frag = 8, /* most likely ENOMEM errors */ + skbtrace_tcp_sndlim_pushone = 9, + skbtrace_tcp_sndlim_other = 10, + skbtrace_tcp_sndlim_ok = 11, +}; + + +/* val member: + * skbtrace_tcp_sndlim_other: the return value of tcp_transmit_skb() + * skbtrace_tcp_sndlim_ok: total sent pkts + * other cases: send limit occurs under MTU probe if 1, otherwise, it is 0 + */ +struct skbtrace_tcp_sendlim_blk { + struct skbtrace_block blk; + __u32 val; + __u32 count; + struct timespec begin; + __u32 snd_ssthresh; + __u32 snd_cwnd; + __u32 snd_cwnd_cnt; + __u32 snd_wnd; +} __packed; + +/********************* 
icsk section *********************/ + +/* skbtrace_block->action */ +enum { + skbtrace_action_icsk_min = 201, + skbtrace_action_icsk_connection = 201, + skbtrace_action_icsk_max = 299, +}; + +/* Use skbtrace_tcp_conn_blk */ + +#endif diff --git a/include/trace/events/skbtrace.h b/include/trace/events/skbtrace.h index bf8c2cb..91567bf 100644 --- a/include/trace/events/skbtrace.h +++ b/include/trace/events/skbtrace.h @@ -27,5 +27,6 @@ #include #include +#include #endif diff --git a/include/trace/events/skbtrace_ipv4.h b/include/trace/events/skbtrace_ipv4.h new file mode 100644 index 0000000..73a9fb0 --- /dev/null +++ b/include/trace/events/skbtrace_ipv4.h @@ -0,0 +1,49 @@ + /* + * skbtrace - sk_buff trace utilty + * + * The IPv4 related skbtrace events + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Thanks for Web10G project here, some sources reference to it. 
+ * + * 2012 Li Yu + * + */ + +#if !defined(_TRACE_EVENTS_SKBTRACE_IPV4_H) +#define _TRACE_EVENTS_SKBTRACE_IPV4_H + +#include + +struct sock; + +DECLARE_TRACE(icsk_connection, + TP_PROTO(struct sock *sk, __u32 state), + TP_ARGS(sk, state)); + +DECLARE_TRACE(tcp_congestion, + TP_PROTO(struct sock *sk, int reason, int prior_state), + TP_ARGS(sk, reason, prior_state)); + +DECLARE_TRACE(tcp_connection, + TP_PROTO(void *sk, __u32 state), + TP_ARGS(sk, state)); + +DECLARE_TRACE(tcp_sendlimit, + TP_PROTO(struct sock *sk, int reason, int val), + TP_ARGS(sk, reason, val)); + +#endif diff --git a/net/core/net-traces.c b/net/core/net-traces.c index d86a58b..95ad083 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -45,5 +45,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(name); NEW_SKBTRACE_TP(skb_rps_info); +NEW_SKBTRACE_TP(tcp_congestion); +NEW_SKBTRACE_TP(tcp_connection); +NEW_SKBTRACE_TP(icsk_connection); +NEW_SKBTRACE_TP(tcp_sendlimit); #endif diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 20f1cb5..feb5e28 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -415,6 +415,14 @@ config INET_UDP_DIAG Support for UDP socket monitoring interface used by the ss tool. If unsure, say Y. +config SKBTRACE_IPV4 + tristate "TCP/IPv4 protocol suite support for skbtrace" + depends on SKBTRACE + default m + ---help--- + Support for IPv4 part of skbtrace. which only contains TCP/IPv4 + specific events. 
+ menuconfig TCP_CONG_ADVANCED bool "TCP: advanced congestion control" ---help--- diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ff75d3b..4b03aef 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o +obj-${CONFIG_SKBTRACE_IPV4} += skbtrace-ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 034ddbe..a69becb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -702,6 +703,7 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) sk_dst_reset(sk); sk->sk_prot->hash(sk); + trace_icsk_connection(sk, TCP_LISTEN); return 0; } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2784db3..9363a6b 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -205,6 +207,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat atomic_set(&tw->tw_refcnt, 0); inet_twsk_dead_node_init(tw); __module_get(tw->tw_prot->owner); + trace_tcp_connection(tw, state + TCP_MAX_STATES); } return tw; diff --git a/net/ipv4/skbtrace-ipv4.c b/net/ipv4/skbtrace-ipv4.c new file mode 100644 index 0000000..ed486be --- /dev/null +++ b/net/ipv4/skbtrace-ipv4.c @@ -0,0 +1,345 @@ +/* + * skbtrace - sk_buff trace for TCP/IPv4 protocol suite support + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any 
later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * 2012 Li Yu + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static void skbtrace_tcp_congestion(struct skbtrace_tracepoint *t, + struct sock *sk, int reason, int prior_state) +SKBTRACE_SOCK_EVENT_BEGIN + struct skbtrace_context *ctx; + struct skbtrace_tcp_cong_blk blk, *b; + struct tcp_sock *tp; + + if (skbtrace_tcp_cong_leave == reason && + inet_csk(sk)->icsk_ca_state == TCP_CA_Open) + return; + + local_bh_disable(); + ctx = skbtrace_context_get(sk); + if (ctx) { + if (skbtrace_action_tcp_congestion != ctx->blk.action) + skbtrace_probe(&ctx->blk); + b = &ctx->tcp_cong; + } else + b = &blk; + + tp = tcp_sk(sk); + INIT_SKBTRACE_BLOCK(&b->blk, tp, + skbtrace_action_tcp_congestion, + 1 << reason, + sizeof(*b)); + b->cwnd = tp->snd_cwnd * tp->mss_cache; + b->rcv_rtt = tp->rcv_rtt_est.rtt; + b->rto = inet_csk(sk)->icsk_rto; + b->snduna = tp->snd_una; + b->sndnxt = tp->snd_nxt; + skbtrace_probe(&b->blk); + local_bh_enable(); +SKBTRACE_SOCK_EVENT_END + +static void skbtrace_tcp_connection(struct skbtrace_tracepoint *t, + void *ptr, u32 state) +{ + struct sock *sk = ptr; + struct inet_timewait_sock *tw = inet_twsk(ptr); + + switch (state) { + case TCP_TIME_WAIT + TCP_MAX_STATES: + case TCP_FIN_WAIT2 + TCP_MAX_STATES: + { + struct skbtrace_tcp_conn_blk blk; + + state -= TCP_MAX_STATES; + INIT_SKBTRACE_BLOCK(&blk.blk, tw, + skbtrace_action_tcp_connection, + 1 << (state + skbtrace_flags_reserved_max), + 
sizeof(blk)); + blk.addr.inet.local.sin_family = AF_INET; + blk.addr.inet.local.sin_port = tw->tw_sport; + blk.addr.inet.local.sin_addr.s_addr = tw->tw_rcv_saddr; + blk.addr.inet.peer.sin_family = AF_INET; + blk.addr.inet.peer.sin_port = tw->tw_dport; + blk.addr.inet.peer.sin_addr.s_addr = tw->tw_daddr; + skbtrace_probe(&blk.blk); + break; + } + case TCP_ESTABLISHED: + case TCP_FIN_WAIT1: + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: + case TCP_SYN_SENT: + case TCP_SYN_RECV: + case TCP_CLOSING: + { + struct skbtrace_context *ctx; + struct skbtrace_tcp_conn_blk blk, *b; + + local_bh_disable(); + b = &blk; + ctx = skbtrace_context_get(sk); + if (ctx) { + if (skbtrace_action_tcp_connection + != ctx->blk.action) + skbtrace_probe(&ctx->blk); + b = &ctx->tcp_conn; + } + INIT_SKBTRACE_BLOCK(&b->blk, ptr, + skbtrace_action_tcp_connection, + 1 << (state + skbtrace_flags_reserved_max), + sizeof(blk)); + __inet_sock_getname(sk, &b->addr.local, NULL, 0); + if (TCP_LISTEN != state) + __inet_sock_getname(sk, &b->addr.peer, NULL, 1); + skbtrace_probe(&b->blk); + local_bh_enable(); + break; + } + } +} + +static void skbtrace_icsk_connection(struct skbtrace_tracepoint *t, + struct sock *sk, u32 state) +SKBTRACE_SOCK_EVENT_BEGIN + struct skbtrace_context *ctx; + struct skbtrace_tcp_conn_blk blk, *b; + + if (TCP_LISTEN != state) + return; + + local_bh_disable(); + ctx = skbtrace_context_get(sk); + if (ctx) { + if (skbtrace_action_icsk_connection != ctx->blk.action) + skbtrace_probe(&ctx->blk); + b = &ctx->tcp_conn; + } else + b = &blk; + INIT_SKBTRACE_BLOCK(&b->blk, sk, + skbtrace_action_icsk_connection, + 1 << (state + skbtrace_flags_reserved_max), + sizeof(blk)); + __inet_sock_getname(sk, &b->addr.local, NULL, 0); + skbtrace_probe(&b->blk); + local_bh_enable(); +SKBTRACE_SOCK_EVENT_END + +static const char * const skbtrace_tcp_sendlimit_options[] = { + "cwnd", + "swnd", + "nagle", + "tso", + "frag", + "pushone", + "other", + "ok", +}; + +static const int 
skbtrace_tcp_sendlimit_masks[] = { + skbtrace_tcp_sndlim_cwnd, + skbtrace_tcp_sndlim_swnd, + skbtrace_tcp_sndlim_nagle, + skbtrace_tcp_sndlim_tso, + skbtrace_tcp_sndlim_frag, + skbtrace_tcp_sndlim_pushone, + skbtrace_tcp_sndlim_other, + skbtrace_tcp_sndlim_ok, +}; + +static int skbtrace_tcp_sendlimit_setopt(struct skbtrace_tracepoint *t, + char *name, char *options) +{ + unsigned long mask = 0UL; + char *cur; + int ret = 0; + + if (options) { + if (strncmp(options, "skip=", sizeof("skip=") - 1)) { + options = NULL; + ret = -EINVAL; + } else + options += sizeof("skip=") - 1; + } + + if (!options || '\x0' == *options) + goto quit; + + mask = 0UL; + cur = strsep(&options, ":"); + while (cur) { + int i, nr_options; + + nr_options = sizeof(skbtrace_tcp_sendlimit_masks)/sizeof(int); + for (i = 0; i < nr_options; i++) { + if (!strcmp(cur, skbtrace_tcp_sendlimit_options[i])) { + mask |= (1 << skbtrace_tcp_sendlimit_masks[i]); + break; + } + } + if (i >= nr_options) { + mask = 0UL; + ret = -EINVAL; + } + cur = strsep(&options, ":"); + } + +quit: + t->private = (void *)(mask); + return ret; +} + +static char *skbtrace_tcp_sendlimit_desc(struct skbtrace_tracepoint *t) +{ + char *desc; + unsigned long mask = (unsigned long)t->private; + int i, nr_options, copied; + + desc = kmalloc(strlen(t->name) + 128, GFP_KERNEL); + if (!desc) + return NULL; + + copied = sprintf(desc, "%s enabled:%d skip=", t->name, t->enabled); + nr_options = sizeof(skbtrace_tcp_sendlimit_masks)/sizeof(int); + for (i = 0; i < nr_options; i++) { + int this_n; + const char *this_p; + + this_n = skbtrace_tcp_sendlimit_masks[i]; + this_p = skbtrace_tcp_sendlimit_options[i]; + if (t->enabled && (mask & (1 << this_n))) + copied += sprintf(desc + copied, "%s,", this_p); + else if (!t->enabled) + copied += sprintf(desc + copied, "%s,", this_p); + } + + sprintf(desc + copied, "\n"); + return desc; +} + +static inline void tcp_sendlimit_block_setup(struct skbtrace_tcp_sendlim_blk *b, + struct sock *sk, int reason, 
int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + INIT_SKBTRACE_BLOCK(&b->blk, tp, + skbtrace_action_tcp_sendlimit, + 1 << reason, + sizeof(*b)); + + b->val = val; + b->count = 1; + b->begin = current_kernel_time(); + + b->snd_ssthresh = tp->snd_ssthresh; + b->snd_cwnd = tp->snd_cwnd; + b->snd_cwnd_cnt = tp->snd_cwnd_cnt; + b->snd_wnd = tp->snd_wnd; +} + +static void skbtrace_tcp_sendlimit(struct skbtrace_tracepoint *t, + struct sock *sk, int reason, int val) +SKBTRACE_SOCK_EVENT_BEGIN + struct skbtrace_context *ctx; + unsigned long mask = (unsigned long)t->private; + + if (mask & (1<blk.action == skbtrace_action_tcp_sendlimit && + (ctx->blk.flags & (1 << reason)) && + ctx->tcp_sendlim.val == val && + current_kernel_time().tv_sec == ctx->blk.ts.tv_sec) { + /* same event happens continuously */ + ++ctx->tcp_sendlim.count; + local_bh_enable(); + return; + } + + /* fire up last event or the same but delayed too much event */ + skbtrace_probe(&ctx->blk); + + /* initialize new context */ + tcp_sendlimit_block_setup(&ctx->tcp_sendlim, sk, reason, val); + local_bh_enable(); +SKBTRACE_SOCK_EVENT_END + +static struct skbtrace_tracepoint af_inet4[] = { + { + .name = "tcp_congestion", + .probe = skbtrace_tcp_congestion, + }, + { + .name = "tcp_connection", + .probe = skbtrace_tcp_connection, + }, + { + .name = "icsk_connection", + .probe = skbtrace_icsk_connection, + }, + { + .name = "tcp_sendlimit", + .probe = skbtrace_tcp_sendlimit, + .setup_options = skbtrace_tcp_sendlimit_setopt, + .desc = skbtrace_tcp_sendlimit_desc, + }, + EMPTY_SKBTRACE_TP +}; + +static int skbtrace_ipv4_init(void) +{ + return skbtrace_register_tracepoints(AF_INET, af_inet4); +} + +static void skbtrace_ipv4_cleanup(void) +{ + skbtrace_unregister_tracepoints(AF_INET); +} + +module_init(skbtrace_ipv4_init); +module_exit(skbtrace_ipv4_cleanup); +MODULE_ALIAS("skbtrace-af-" __stringify(AF_INET)); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3ba605f..d85c8d7 100644 --- 
a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,9 @@ #include #include +#include +#include + int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; struct percpu_counter tcp_orphan_count; @@ -1925,6 +1928,8 @@ void tcp_set_state(struct sock *sk, int state) TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } + trace_tcp_connection(sk, state); + /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ca0d0e7..8f8b5f5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -74,6 +74,8 @@ #include #include #include +#include +#include int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -861,6 +863,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) tcp_set_ca_state(sk, TCP_CA_CWR); } + trace_tcp_congestion(sk, skbtrace_tcp_cong_cwr, 0); } /* @@ -2151,6 +2154,8 @@ void tcp_enter_frto(struct sock *sk) tcp_set_ca_state(sk, TCP_CA_Disorder); tp->high_seq = tp->snd_nxt; tp->frto_counter = 1; + + trace_tcp_congestion(sk, skbtrace_tcp_cong_frto, 0); } /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, @@ -2218,6 +2223,8 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) TCP_ECN_queue_cwr(tp); tcp_clear_all_retrans_hints(tp); + + trace_tcp_congestion(sk, skbtrace_tcp_cong_frto_loss, 0); } static void tcp_clear_retrans_partial(struct tcp_sock *tp) @@ -2247,6 +2254,8 @@ void tcp_enter_loss(struct sock *sk, int how) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + trace_tcp_congestion(sk, skbtrace_tcp_cong_loss, 0); + /* Reduce ssthresh if it has not yet been made inside this window. 
*/ if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { @@ -3217,6 +3226,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, /* Otherwise enter Recovery state */ tcp_enter_recovery(sk, (flag & FLAG_ECE)); fast_rexmit = 1; + trace_tcp_congestion(sk, skbtrace_tcp_cong_fastrtx, 0); } if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) @@ -3770,6 +3780,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 prior_fackets; int prior_packets; int prior_sacked = tp->sacked_out; + int prior_state = icsk->icsk_ca_state; int pkts_acked = 0; int newly_acked_sacked = 0; bool frto_cwnd = false; @@ -3864,6 +3875,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, is_dupack, flag); + trace_tcp_congestion(sk, skbtrace_tcp_cong_leave, prior_state); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, prior_in_flight); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 64568fa..505e4fd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -85,6 +85,9 @@ #include #include +#include +#include + int sysctl_tcp_tw_reuse __read_mostly; int sysctl_tcp_low_latency __read_mostly; EXPORT_SYMBOL(sysctl_tcp_low_latency); @@ -1528,6 +1531,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; __inet_hash_nolisten(newsk, NULL); + trace_tcp_connection(newsk, TCP_SYN_RECV); return newsk; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 72b7c63..0a8b4be 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -23,10 +23,13 @@ #include #include #include +#include #include #include #include +#include + int sysctl_tcp_syncookies __read_mostly = 1; 
EXPORT_SYMBOL(sysctl_tcp_syncookies); @@ -189,6 +192,7 @@ kill_with_rst: /* FIN arrived, enter true time-wait state. */ tw->tw_substate = TCP_TIME_WAIT; + trace_tcp_connection(tw, TCP_TIME_WAIT + TCP_MAX_STATES); tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent_stamp = get_seconds(); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c465d3e..a7c0488 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -42,6 +42,9 @@ #include #include +#include +#include + /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse __read_mostly = 1; @@ -1660,15 +1663,18 @@ static int tcp_mtu_probe(struct sock *sk) if (tp->snd_wnd < size_needed) return -1; - if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) + if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) { + trace_tcp_sendlimit(sk, skbtrace_tcp_sndlim_swnd, 1); return 0; - + } /* Do we need to wait to drain cwnd? With none in flight, don't stall */ if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) { if (!tcp_packets_in_flight(tp)) return -1; - else + else { + trace_tcp_sendlimit(sk, skbtrace_tcp_sndlim_cwnd, 1); return 0; + } } /* We're allowed to probe. Build it now. */ @@ -1763,7 +1769,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, struct sk_buff *skb; unsigned int tso_segs, sent_pkts; int cwnd_quota; - int result; + int retval, result, sndlim; sent_pkts = 0;