From patchwork Fri Feb 1 07:03:59 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Martin KaFai Lau X-Patchwork-Id: 1034607 X-Patchwork-Delegate: bpf@iogearbox.net Return-Path: X-Original-To: patchwork-incoming-netdev@ozlabs.org Delivered-To: patchwork-incoming-netdev@ozlabs.org Authentication-Results: ozlabs.org; spf=none (mailfrom) smtp.mailfrom=vger.kernel.org (client-ip=209.132.180.67; helo=vger.kernel.org; envelope-from=netdev-owner@vger.kernel.org; receiver=) Authentication-Results: ozlabs.org; dmarc=pass (p=none dis=none) header.from=fb.com Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; unprotected) header.d=fb.com header.i=@fb.com header.b="gV+A+qGc"; dkim-atps=neutral Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by ozlabs.org (Postfix) with ESMTP id 43rSkq5gGWz9s9G for ; Fri, 1 Feb 2019 18:04:03 +1100 (AEDT) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1726700AbfBAHEC (ORCPT ); Fri, 1 Feb 2019 02:04:02 -0500 Received: from mx0a-00082601.pphosted.com ([67.231.145.42]:45708 "EHLO mx0a-00082601.pphosted.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1726598AbfBAHEC (ORCPT ); Fri, 1 Feb 2019 02:04:02 -0500 Received: from pps.filterd (m0109333.ppops.net [127.0.0.1]) by mx0a-00082601.pphosted.com (8.16.0.27/8.16.0.27) with SMTP id x116xahj008499 for ; Thu, 31 Jan 2019 23:04:00 -0800 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=fb.com; h=from : to : cc : subject : date : message-id : in-reply-to : references : mime-version : content-type; s=facebook; bh=pAo27L3CUlhxuHaIwnlHdT92uofHi+N27F+BeGhPiME=; b=gV+A+qGcfpuJpGZSeGVYOHC5uFeXvMsqUwXj8o6fs7EEh62YTAkSGTtLpZrWsOqf6nqv Q9afYYZ7mZqH8h6W5nxGPnMce3mxqILbusdmybOFS7L5kF2zfilREUfLwR39YoprvNdV DLuiJtpwAKLRlAM7C7OrG7A9LTRbQ5J8+54= Received: from maileast.thefacebook.com ([199.201.65.23]) by mx0a-00082601.pphosted.com with ESMTP id 2qcfgj88k3-1 (version=TLSv1.2 cipher=ECDHE-RSA-AES256-SHA384 bits=256 verify=NOT) for ; Thu, 31 Jan 2019 23:04:00 -0800 Received: from mx-out.facebook.com (2620:10d:c0a1:3::13) by mail.thefacebook.com (2620:10d:c021:18::176) with Microsoft SMTP Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA) id 15.1.1531.3; Thu, 31 Jan 2019 23:03:59 -0800 Received: by devbig005.ftw2.facebook.com (Postfix, from userid 6611) id 10B85294256F; Thu, 31 Jan 2019 23:03:59 -0800 (PST) Smtp-Origin-Hostprefix: devbig From: Martin KaFai Lau Smtp-Origin-Hostname: devbig005.ftw2.facebook.com To: CC: Alexei Starovoitov , Daniel Borkmann , , Lawrence Brakmo Smtp-Origin-Cluster: ftw2c04 Subject: [PATCH bpf-next 3/6] bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock Date: Thu, 31 Jan 2019 23:03:59 -0800 Message-ID: <20190201070359.4148496-1-kafai@fb.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20190201070356.4148189-1-kafai@fb.com> References: <20190201070356.4148189-1-kafai@fb.com> X-FB-Internal: Safe MIME-Version: 1.0 X-Proofpoint-Virus-Version: vendor=fsecure engine=2.50.10434:, , definitions=2019-02-01_05:, , signatures=0 X-Proofpoint-Spam-Reason: safe X-FB-Internal: Safe Sender: netdev-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org This patch adds a helper function BPF_FUNC_tcp_sock and it is currently available for cg_skb and sched_(cls|act): struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk); int cg_skb_foo(struct __sk_buff *skb) { struct bpf_tcp_sock *tp; struct bpf_sock *sk; __u32 snd_cwnd; sk = skb->sk; if (!sk) return 1; tp = bpf_tcp_sock(sk); if (!tp) return 1; snd_cwnd = tp->snd_cwnd; /* ... */ return 1; } A 'struct bpf_tcp_sock' is also added to the uapi bpf.h to provide read-only access. bpf_tcp_sock has all the existing tcp_sock's fields that has already been exposed by the bpf_sock_ops. i.e. no new tcp_sock's fields are exposed in bpf.h. This helper ensures the sk is a tcp_sock. If it is not a tcp_sock, it returns NULL. Hence, the caller needs to check for NULL before accessing bpf_tcp_sock. The current use case is to expose members from tcp_sock to allow a cg_skb_bpf_prog to provide per cgroup traffic policing/shaping. Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 30 ++++++++++++++++ include/uapi/linux/bpf.h | 51 +++++++++++++++++++++++++- kernel/bpf/verifier.c | 31 ++++++++++++++-- net/core/filter.c | 77 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 186 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 72022a0c442d..cc3667f8c003 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -172,6 +172,7 @@ enum bpf_return_type { RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ + RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -227,6 +228,8 @@ enum bpf_reg_type { PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ + PTR_TO_TCP_SOCK, /* reg points to struct bpf_tcp_sock */ + PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct bpf_tcp_sock or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -923,4 +926,31 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, } #endif +#ifdef CONFIG_INET +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size); +#else +static inline bool bpf_tcp_sock_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return false; +} + +static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + return 0; +} +#endif /* CONFIG_INET */ + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b7a5312c4ccf..5f6c947abdd8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2336,6 +2336,15 @@ union bpf_attr { * Return * The same pointer if *sk* is a fullsock. Otherwise, * NULL is returned. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * Obtain a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * + * Return + * Pointer to **struct bpf_tcp_sock**, or **NULL** in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2431,7 +2440,8 @@ union bpf_attr { FN(msg_push_data), \ FN(msg_pop_data), \ FN(rc_pointer_rel), \ - FN(sk_fullsock), + FN(sk_fullsock), \ + FN(tcp_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2614,6 +2624,45 @@ struct bpf_sock { */ }; +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. */ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. + */ + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. + */ +}; + struct bpf_sock_tuple { union { struct { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b4500f29dff1..f8ba690dad80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -333,14 +333,16 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) static bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || - type == PTR_TO_SOCK_COMMON; + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK; } static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL; + type == PTR_TO_SOCK_COMMON_OR_NULL || + type == PTR_TO_TCP_SOCK_OR_NULL; } static bool type_is_refcounted(enum bpf_reg_type type) @@ -400,6 +402,8 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", [PTR_TO_SOCK_COMMON] = "sock_common", [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", }; static char slot_type_char[] = { @@ -1201,6 +1205,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return true; default: return false; @@ -1638,6 +1644,9 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, case PTR_TO_SOCKET: valid = bpf_sock_is_valid_access(off, size, t, &info); break; + case PTR_TO_TCP_SOCK: + valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); + break; default: valid = false; } @@ -1795,6 +1804,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_SOCK_COMMON: pointer_desc = "sock_common "; break; + case PTR_TO_TCP_SOCK: + pointer_desc = "tcp_sock "; + break; default: break; } @@ -3021,6 +3033,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn /* For mark_ptr_or_null_reg() */ regs[BPF_REG_0].id = ++env->id_gen; } + } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -3282,6 +3298,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -4517,6 +4535,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCKET; } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { reg->type = PTR_TO_SOCK_COMMON; + } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { + reg->type = PTR_TO_TCP_SOCK; } if (is_null || !reg_is_refcounted(reg)) { @@ -5704,6 +5724,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6023,6 +6045,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return false; default: return true; @@ -6997,6 +7021,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; + case PTR_TO_TCP_SOCK: + convert_ctx_access = bpf_tcp_sock_convert_ctx_access; + break; default: continue; } diff --git a/net/core/filter.c b/net/core/filter.c index b3b4b5faede4..bdc39c3081ff 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5305,6 +5305,77 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, }; +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + case offsetof(struct bpf_tcp_sock, bytes_received): + case offsetof(struct bpf_tcp_sock, bytes_acked): + return size == sizeof(__u64); + default: + return size == sizeof(__u32); + } +} + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_TCP_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) > \ + FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(FIELD_SIZEOF(struct tcp_sock, FIELD), \ + si->dst_reg, si->src_reg, \ + offsetof(struct tcp_sock, FIELD)); \ + } while (0) + + CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock, + BPF_TCP_SOCK_GET_COMMON); + + if (insn > insn_buf) + return insn - insn_buf; + + switch (si->off) { + case offsetof(struct bpf_tcp_sock, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct tcp_sock, rtt_min) + + offsetof(struct minmax_sample, v)); + break; + } + + return insn - insn_buf; +} + +BPF_CALL_1(bpf_tcp_sock, const struct sock *, sk) +{ + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_tcp_sock_proto = { + .func = bpf_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5450,6 +5521,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif default: return sk_filter_func_proto(func_id, prog); } @@ -5540,6 +5615,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; #endif default: return bpf_base_func_proto(func_id);