diff mbox series

[RFC,bpf-next,07/11] bpf: Add helper to retrieve socket in BPF

Message ID 20180509210709.7201-8-joe@wand.net.nz
State RFC, archived
Delegated to: BPF Maintainers
Headers show
Series Add socket lookup support | expand

Commit Message

Joe Stringer May 9, 2018, 9:07 p.m. UTC
This patch adds a new BPF helper function, sk_lookup() which allows BPF
programs to find out if there is a socket listening on this host, and
returns a socket pointer which the BPF program can then access to
determine, for instance, whether to forward or drop traffic. sk_lookup()
takes a reference on the socket, so when a BPF program makes use of this
function, it must subsequently pass the returned pointer into the newly
added sk_release() to return the reference.

By way of example, the following pseudocode would filter inbound
connections at XDP if there is no corresponding service listening for
the traffic:

  struct bpf_sock_tuple tuple;
  struct bpf_sock_ops *sk;

  populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
  sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
  if (!sk) {
    // Couldn't find a socket listening for this traffic. Drop.
    return TC_ACT_SHOT;
  }
  bpf_sk_release(sk, 0);
  return TC_ACT_OK;

Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
 include/uapi/linux/bpf.h                  |  39 +++++++++++-
 kernel/bpf/verifier.c                     |   8 ++-
 net/core/filter.c                         | 102 ++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            |  40 +++++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |   7 ++
 5 files changed, 193 insertions(+), 3 deletions(-)

Comments

Martin KaFai Lau May 11, 2018, 5 a.m. UTC | #1
On Wed, May 09, 2018 at 02:07:05PM -0700, Joe Stringer wrote:
> This patch adds a new BPF helper function, sk_lookup() which allows BPF
> programs to find out if there is a socket listening on this host, and
> returns a socket pointer which the BPF program can then access to
> determine, for instance, whether to forward or drop traffic. sk_lookup()
> takes a reference on the socket, so when a BPF program makes use of this
> function, it must subsequently pass the returned pointer into the newly
> added sk_release() to return the reference.
> 
> By way of example, the following pseudocode would filter inbound
> connections at XDP if there is no corresponding service listening for
> the traffic:
> 
>   struct bpf_sock_tuple tuple;
>   struct bpf_sock_ops *sk;
> 
>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
>   sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
>   if (!sk) {
>     // Couldn't find a socket listening for this traffic. Drop.
>     return TC_ACT_SHOT;
>   }
>   bpf_sk_release(sk, 0);
>   return TC_ACT_OK;
> 
> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> ---
>  include/uapi/linux/bpf.h                  |  39 +++++++++++-
>  kernel/bpf/verifier.c                     |   8 ++-
>  net/core/filter.c                         | 102 ++++++++++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h            |  40 +++++++++++-
>  tools/testing/selftests/bpf/bpf_helpers.h |   7 ++
>  5 files changed, 193 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d615c777b573..29f38838dbca 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1828,6 +1828,25 @@ union bpf_attr {
>   * 	Return
>   * 		0 on success, or a negative error in case of failure.
>   *
> + * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
> + * 	Description
> + * 		Look for socket matching 'tuple'. The return value must be checked,
> + * 		and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@flags: flags value
> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + *  int bpf_sk_release(sock, flags)
> + * 	Description
> + * 		Release the reference held by 'sock'.
> + * 		@sock: Pointer reference to release. Must be found via bpf_sk_lookup().
> + * 		@flags: flags value
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -1898,7 +1917,9 @@ union bpf_attr {
>  	FN(xdp_adjust_tail),		\
>  	FN(skb_get_xfrm_state),		\
>  	FN(get_stack),			\
> -	FN(skb_load_bytes_relative),
> +	FN(skb_load_bytes_relative),	\
> +	FN(sk_lookup),			\
> +	FN(sk_release),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2060,6 +2081,22 @@ struct bpf_sock {
>  				 */
>  };
>  
> +struct bpf_sock_tuple {
> +	union {
> +		__be32 ipv6[4];
> +		__be32 ipv4;
> +	} saddr;
> +	union {
> +		__be32 ipv6[4];
> +		__be32 ipv4;
> +	} daddr;
> +	__be16 sport;
> +	__be16 dport;
> +	__u32 dst_if;
> +	__u8 family;
> +	__u8 proto;
> +};
> +
>  #define XDP_PACKET_HEADROOM 256
>  
>  /* User return codes for XDP prog type.
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 92b9a5dc465a..579012c483e4 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
>   * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
>   * passes through a NULL-check conditional. For the branch wherein the state is
>   * changed to CONST_IMM, the verifier releases the reference.
> + *
> + * For each helper function that allocates a reference, such as bpf_sk_lookup(),
> + * there is a corresponding release function, such as bpf_sk_release(). When
> + * a reference type passes into the release function, the verifier also releases
> + * the reference. If any unchecked or unreleased reference remains at the end of
> + * the program, the verifier rejects it.
>   */
>  
>  /* verifier_state + insn_idx are pushed to stack when branch is encountered */
> @@ -277,7 +283,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type)
>   */
>  static bool is_release_function(enum bpf_func_id func_id)
>  {
> -	return false;
> +	return func_id == BPF_FUNC_sk_release;
>  }
>  
>  /* string representation of 'enum bpf_reg_type' */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 4c35152fb3a8..751c255d17d3 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -58,8 +58,12 @@
>  #include <net/busy_poll.h>
>  #include <net/tcp.h>
>  #include <net/xfrm.h>
> +#include <net/udp.h>
>  #include <linux/bpf_trace.h>
>  #include <net/xdp_sock.h>
> +#include <net/inet_hashtables.h>
> +#include <net/inet6_hashtables.h>
> +#include <net/net_namespace.h>
>  
>  /**
>   *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>  };
>  #endif
>  
> +struct sock *
> +sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
Would it be possible to have another version that
returns a sk without taking its refcnt?
It may have performance benefit.

> +	int dst_if = (int)tuple->dst_if;
> +	struct in6_addr *src6;
> +	struct in6_addr *dst6;
> +
> +	if (tuple->family == AF_INET6) {
> +		src6 = (struct in6_addr *)&tuple->saddr.ipv6;
> +		dst6 = (struct in6_addr *)&tuple->daddr.ipv6;
> +	} else if (tuple->family != AF_INET) {
> +		return ERR_PTR(-EOPNOTSUPP);
> +	}
> +
> +	if (tuple->proto == IPPROTO_TCP) {
> +		if (tuple->family == AF_INET)
> +			return inet_lookup(net, &tcp_hashinfo, NULL, 0,
> +					   tuple->saddr.ipv4, tuple->sport,
> +					   tuple->daddr.ipv4, tuple->dport,
> +					   dst_if);
> +		else
> +			return inet6_lookup(net, &tcp_hashinfo, NULL, 0,
> +					    src6, tuple->sport,
> +					    dst6, tuple->dport, dst_if);
> +	} else if (tuple->proto == IPPROTO_UDP) {
> +		if (tuple->family == AF_INET)
> +			return udp4_lib_lookup(net, tuple->saddr.ipv4,
> +					       tuple->sport, tuple->daddr.ipv4,
> +					       tuple->dport, dst_if);
> +		else
> +			return udp6_lib_lookup(net, src6, tuple->sport,
> +					       dst6, tuple->dport, dst_if);
> +	} else {
> +		return ERR_PTR(-EOPNOTSUPP);
> +	}
> +
> +	return NULL;
> +}
> +
> +BPF_CALL_5(bpf_sk_lookup, struct sk_buff *, skb,
> +	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
> +{
> +	struct net *caller_net = dev_net(skb->dev);
> +	struct sock *sk = NULL;
> +	struct net *net;
> +
> +	/* XXX: Perform verification-time checking of tuple size? */
> +	if (unlikely(len != sizeof(struct bpf_sock_tuple) || flags))
> +		goto out;
> +
> +	net = get_net_ns_by_id(caller_net, netns_id);
> +	if (unlikely(!net))
> +		goto out;
> +
> +	sk = sk_lookup(net, tuple);
> +	put_net(net);
> +	if (IS_ERR_OR_NULL(sk))
> +		sk = NULL;
> +	else
> +		sk = sk_to_full_sk(sk);
> +out:
> +	return (unsigned long) sk;
> +}
> +
> +static const struct bpf_func_proto bpf_sk_lookup_proto = {
> +	.func		= bpf_sk_lookup,
> +	.gpl_only	= false,
> +	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_PTR_TO_MEM,
> +	.arg3_type	= ARG_CONST_SIZE,
> +	.arg4_type	= ARG_ANYTHING,
> +	.arg5_type	= ARG_ANYTHING,
> +};
> +
> +BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
> +{
> +	sock_gen_put(sk);
> +	if (unlikely(flags))
> +		return -EINVAL;
> +	return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_sk_release_proto = {
> +	.func		= bpf_sk_release,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_SOCKET,
> +	.arg2_type	= ARG_ANYTHING,
> +};
> +
>  static const struct bpf_func_proto *
>  bpf_base_func_proto(enum bpf_func_id func_id)
>  {
> @@ -4181,6 +4275,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  	case BPF_FUNC_skb_get_xfrm_state:
>  		return &bpf_skb_get_xfrm_state_proto;
>  #endif
> +	case BPF_FUNC_sk_lookup:
> +		return &bpf_sk_lookup_proto;
> +	case BPF_FUNC_sk_release:
> +		return &bpf_sk_release_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> @@ -4292,6 +4390,10 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>  		return &bpf_get_socket_uid_proto;
>  	case BPF_FUNC_sk_redirect_map:
>  		return &bpf_sk_redirect_map_proto;
> +	case BPF_FUNC_sk_lookup:
> +		return &bpf_sk_lookup_proto;
> +	case BPF_FUNC_sk_release:
> +		return &bpf_sk_release_proto;
>  	default:
>  		return bpf_base_func_proto(func_id);
>  	}
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index fff51c187d1e..29f38838dbca 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -117,6 +117,7 @@ enum bpf_map_type {
>  	BPF_MAP_TYPE_DEVMAP,
>  	BPF_MAP_TYPE_SOCKMAP,
>  	BPF_MAP_TYPE_CPUMAP,
> +	BPF_MAP_TYPE_XSKMAP,
>  };
>  
>  enum bpf_prog_type {
> @@ -1827,6 +1828,25 @@ union bpf_attr {
>   * 	Return
>   * 		0 on success, or a negative error in case of failure.
>   *
> + * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
> + * 	Description
> + * 		Look for socket matching 'tuple'. The return value must be checked,
> + * 		and if non-NULL, released via bpf_sk_release().
> + * 		@ctx: pointer to ctx
> + * 		@tuple: pointer to struct bpf_sock_tuple
> + * 		@tuple_size: size of the tuple
> + * 		@flags: flags value
> + * 	Return
> + * 		pointer to socket ops on success, or
> + * 		NULL in case of failure
> + *
> + *  int bpf_sk_release(sock, flags)
> + * 	Description
> + * 		Release the reference held by 'sock'.
> + * 		@sock: Pointer reference to release. Must be found via bpf_sk_lookup().
> + * 		@flags: flags value
> + * 	Return
> + * 		0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -1897,7 +1917,9 @@ union bpf_attr {
>  	FN(xdp_adjust_tail),		\
>  	FN(skb_get_xfrm_state),		\
>  	FN(get_stack),			\
> -	FN(skb_load_bytes_relative),
> +	FN(skb_load_bytes_relative),	\
> +	FN(sk_lookup),			\
> +	FN(sk_release),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2059,6 +2081,22 @@ struct bpf_sock {
>  				 */
>  };
>  
> +struct bpf_sock_tuple {
> +	union {
> +		__be32 ipv6[4];
> +		__be32 ipv4;
> +	} saddr;
> +	union {
> +		__be32 ipv6[4];
> +		__be32 ipv4;
> +	} daddr;
> +	__be16 sport;
> +	__be16 dport;
> +	__u32 dst_if;
> +	__u8 family;
> +	__u8 proto;
> +};
> +
>  #define XDP_PACKET_HEADROOM 256
>  
>  /* User return codes for XDP prog type.
> diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
> index 265f8e0e8ada..4dc311ea0c16 100644
> --- a/tools/testing/selftests/bpf/bpf_helpers.h
> +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> @@ -103,6 +103,13 @@ static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
>  	(void *) BPF_FUNC_skb_get_xfrm_state;
>  static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
>  	(void *) BPF_FUNC_get_stack;
> +static struct bpf_sock *(*bpf_sk_lookup)(void *ctx,
> +					 struct bpf_sock_tuple *tuple,
> +					 int size, unsigned int netns_id,
> +					 unsigned long long flags) =
> +	(void *) BPF_FUNC_sk_lookup;
> +static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) =
> +	(void *) BPF_FUNC_sk_release;
>  
>  /* llvm builtin functions that eBPF C program may use to
>   * emit BPF_LD_ABS and BPF_LD_IND instructions
> -- 
> 2.14.1
>
Joe Stringer May 11, 2018, 9:08 p.m. UTC | #2
On 10 May 2018 at 22:00, Martin KaFai Lau <kafai@fb.com> wrote:
> On Wed, May 09, 2018 at 02:07:05PM -0700, Joe Stringer wrote:
>> This patch adds a new BPF helper function, sk_lookup() which allows BPF
>> programs to find out if there is a socket listening on this host, and
>> returns a socket pointer which the BPF program can then access to
>> determine, for instance, whether to forward or drop traffic. sk_lookup()
>> takes a reference on the socket, so when a BPF program makes use of this
>> function, it must subsequently pass the returned pointer into the newly
>> added sk_release() to return the reference.
>>
>> By way of example, the following pseudocode would filter inbound
>> connections at XDP if there is no corresponding service listening for
>> the traffic:
>>
>>   struct bpf_sock_tuple tuple;
>>   struct bpf_sock_ops *sk;
>>
>>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
>>   sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
>>   if (!sk) {
>>     // Couldn't find a socket listening for this traffic. Drop.
>>     return TC_ACT_SHOT;
>>   }
>>   bpf_sk_release(sk, 0);
>>   return TC_ACT_OK;
>>
>> Signed-off-by: Joe Stringer <joe@wand.net.nz>
>> ---

...

>> @@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>>  };
>>  #endif
>>
>> +struct sock *
>> +sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
> Would it be possible to have another version that
> returns a sk without taking its refcnt?
> It may have performance benefit.

Not really. The sockets are not RCU-protected, and established sockets
may be torn down without notice. If we don't take a reference, there's
no guarantee that the socket will continue to exist for the duration
of running the BPF program.

From what I follow, the comment below has a hidden implication which
is that sockets without SOCK_RCU_FREE, eg established sockets, may be
directly freed regardless of RCU.

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
...

Therefore without the refcount, it won't be safe.
Martin KaFai Lau May 11, 2018, 9:41 p.m. UTC | #3
On Fri, May 11, 2018 at 02:08:01PM -0700, Joe Stringer wrote:
> On 10 May 2018 at 22:00, Martin KaFai Lau <kafai@fb.com> wrote:
> > On Wed, May 09, 2018 at 02:07:05PM -0700, Joe Stringer wrote:
> >> This patch adds a new BPF helper function, sk_lookup() which allows BPF
> >> programs to find out if there is a socket listening on this host, and
> >> returns a socket pointer which the BPF program can then access to
> >> determine, for instance, whether to forward or drop traffic. sk_lookup()
> >> takes a reference on the socket, so when a BPF program makes use of this
> >> function, it must subsequently pass the returned pointer into the newly
> >> added sk_release() to return the reference.
> >>
> >> By way of example, the following pseudocode would filter inbound
> >> connections at XDP if there is no corresponding service listening for
> >> the traffic:
> >>
> >>   struct bpf_sock_tuple tuple;
> >>   struct bpf_sock_ops *sk;
> >>
> >>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
> >>   sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
> >>   if (!sk) {
> >>     // Couldn't find a socket listening for this traffic. Drop.
> >>     return TC_ACT_SHOT;
> >>   }
> >>   bpf_sk_release(sk, 0);
> >>   return TC_ACT_OK;
> >>
> >> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> >> ---
> 
> ...
> 
> >> @@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
> >>  };
> >>  #endif
> >>
> >> +struct sock *
> >> +sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
> > Would it be possible to have another version that
> > returns a sk without taking its refcnt?
> > It may have performance benefit.
> 
> Not really. The sockets are not RCU-protected, and established sockets
> may be torn down without notice. If we don't take a reference, there's
> no guarantee that the socket will continue to exist for the duration
> of running the BPF program.
> 
> From what I follow, the comment below has a hidden implication which
> is that sockets without SOCK_RCU_FREE, eg established sockets, may be
> directly freed regardless of RCU.
Right, SOCK_RCU_FREE sk is the one I am concerned about.
For example, TCP_LISTEN socket does not require taking a refcnt
now.  Doing a bpf_sk_lookup() may have a rather big
impact on handling TCP syn flood.  Or is the usual intention
to redirect instead of passing it up to the stack?


> 
> /* Sockets having SOCK_RCU_FREE will call this function after one RCU
>  * grace period. This is the case for UDP sockets and TCP listeners.
>  */
> static void __sk_destruct(struct rcu_head *head)
> ...
> 
> Therefore without the refcount, it won't be safe.
Joe Stringer May 12, 2018, 12:54 a.m. UTC | #4
On 11 May 2018 at 14:41, Martin KaFai Lau <kafai@fb.com> wrote:
> On Fri, May 11, 2018 at 02:08:01PM -0700, Joe Stringer wrote:
>> On 10 May 2018 at 22:00, Martin KaFai Lau <kafai@fb.com> wrote:
>> > On Wed, May 09, 2018 at 02:07:05PM -0700, Joe Stringer wrote:
>> >> This patch adds a new BPF helper function, sk_lookup() which allows BPF
>> >> programs to find out if there is a socket listening on this host, and
>> >> returns a socket pointer which the BPF program can then access to
>> >> determine, for instance, whether to forward or drop traffic. sk_lookup()
>> >> takes a reference on the socket, so when a BPF program makes use of this
>> >> function, it must subsequently pass the returned pointer into the newly
>> >> added sk_release() to return the reference.
>> >>
>> >> By way of example, the following pseudocode would filter inbound
>> >> connections at XDP if there is no corresponding service listening for
>> >> the traffic:
>> >>
>> >>   struct bpf_sock_tuple tuple;
>> >>   struct bpf_sock_ops *sk;
>> >>
>> >>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
>> >>   sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
>> >>   if (!sk) {
>> >>     // Couldn't find a socket listening for this traffic. Drop.
>> >>     return TC_ACT_SHOT;
>> >>   }
>> >>   bpf_sk_release(sk, 0);
>> >>   return TC_ACT_OK;
>> >>
>> >> Signed-off-by: Joe Stringer <joe@wand.net.nz>
>> >> ---
>>
>> ...
>>
>> >> @@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>> >>  };
>> >>  #endif
>> >>
>> >> +struct sock *
>> >> +sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
>> > Would it be possible to have another version that
>> > returns a sk without taking its refcnt?
>> > It may have performance benefit.
>>
>> Not really. The sockets are not RCU-protected, and established sockets
>> may be torn down without notice. If we don't take a reference, there's
>> no guarantee that the socket will continue to exist for the duration
>> of running the BPF program.
>>
>> From what I follow, the comment below has a hidden implication which
>> is that sockets without SOCK_RCU_FREE, eg established sockets, may be
>> directly freed regardless of RCU.
> Right, SOCK_RCU_FREE sk is the one I am concern about.
> For example, TCP_LISTEN socket does not require taking a refcnt
> now.  Doing a bpf_sk_lookup() may have a rather big
> impact on handling TCP syn flood.  or the usual intention
> is to redirect instead of passing it up to the stack?

I see, if you're only interested in listen sockets then probably this
series could be extended with a new flag, eg something like
BPF_F_SK_FIND_LISTENERS which restricts the set of possible sockets
found to only listen sockets, then the implementation would call into
__inet_lookup_listener() instead of inet_lookup(). The presence of
that flag in the relevant register during CALL instruction would show
that the verifier should not reference-track the result, then there'd
need to be a check on the release to ensure that this unreferenced
socket is never released. Just a thought, completely untested and I
could still be missing some detail..

That said, I don't completely follow how you would expect to handle
the traffic for sockets that are already established - the helper
would no longer find those sockets, so you wouldn't know whether to
pass the traffic up the stack for established traffic or not.
Alexei Starovoitov May 15, 2018, 3:16 a.m. UTC | #5
On Fri, May 11, 2018 at 05:54:33PM -0700, Joe Stringer wrote:
> On 11 May 2018 at 14:41, Martin KaFai Lau <kafai@fb.com> wrote:
> > On Fri, May 11, 2018 at 02:08:01PM -0700, Joe Stringer wrote:
> >> On 10 May 2018 at 22:00, Martin KaFai Lau <kafai@fb.com> wrote:
> >> > On Wed, May 09, 2018 at 02:07:05PM -0700, Joe Stringer wrote:
> >> >> This patch adds a new BPF helper function, sk_lookup() which allows BPF
> >> >> programs to find out if there is a socket listening on this host, and
> >> >> returns a socket pointer which the BPF program can then access to
> >> >> determine, for instance, whether to forward or drop traffic. sk_lookup()
> >> >> takes a reference on the socket, so when a BPF program makes use of this
> >> >> function, it must subsequently pass the returned pointer into the newly
> >> >> added sk_release() to return the reference.
> >> >>
> >> >> By way of example, the following pseudocode would filter inbound
> >> >> connections at XDP if there is no corresponding service listening for
> >> >> the traffic:
> >> >>
> >> >>   struct bpf_sock_tuple tuple;
> >> >>   struct bpf_sock_ops *sk;
> >> >>
> >> >>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
> >> >>   sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
> >> >>   if (!sk) {
> >> >>     // Couldn't find a socket listening for this traffic. Drop.
> >> >>     return TC_ACT_SHOT;
> >> >>   }
> >> >>   bpf_sk_release(sk, 0);
> >> >>   return TC_ACT_OK;
> >> >>
> >> >> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> >> >> ---
> >>
> >> ...
> >>
> >> >> @@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
> >> >>  };
> >> >>  #endif
> >> >>
> >> >> +struct sock *
> >> >> +sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
> >> > Would it be possible to have another version that
> >> > returns a sk without taking its refcnt?
> >> > It may have performance benefit.
> >>
> >> Not really. The sockets are not RCU-protected, and established sockets
> >> may be torn down without notice. If we don't take a reference, there's
> >> no guarantee that the socket will continue to exist for the duration
> >> of running the BPF program.
> >>
> >> From what I follow, the comment below has a hidden implication which
> >> is that sockets without SOCK_RCU_FREE, eg established sockets, may be
> >> directly freed regardless of RCU.
> > Right, SOCK_RCU_FREE sk is the one I am concern about.
> > For example, TCP_LISTEN socket does not require taking a refcnt
> > now.  Doing a bpf_sk_lookup() may have a rather big
> > impact on handling TCP syn flood.  or the usual intention
> > is to redirect instead of passing it up to the stack?
> 
> I see, if you're only interested in listen sockets then probably this
> series could be extended with a new flag, eg something like
> BPF_F_SK_FIND_LISTENERS which restricts the set of possible sockets
> found to only listen sockets, then the implementation would call into
> __inet_lookup_listener() instead of inet_lookup(). The presence of
> that flag in the relevant register during CALL instruction would show
> that the verifier should not reference-track the result, then there'd
> need to be a check on the release to ensure that this unreferenced
> socket is never released. Just a thought, completely untested and I
> could still be missing some detail..
> 
> That said, I don't completely follow how you would expect to handle
> the traffic for sockets that are already established - the helper
> would no longer find those sockets, so you wouldn't know whether to
> pass the traffic up the stack for established traffic or not.

I think Martin has a valid concern here that if somebody starts using
this helper on the rx traffic the bpf program (via these two new
helpers) will be doing refcnt++ and refcnt-- even for listener
sockets which will cause synflood to suffer.
One can argue that this is bpf author mistake, but without fixes
(and api changes) to the helper the programmer doesn't really have a way
of avoiding this situation.
Also udp sockets don't need refcnt at all.
How about we split this single helper into three:
- bpf_sk_lookup_tcp_established() that will return refcnt-ed socket
and has to be bpf_sk_release()d by the program.
- bpf_sk_lookup_tcp_listener() that doesn't refcnt, since progs
run in rcu.
- bpf_sk_lookup_udp() that also doesn't refcnt.
The logic you want to put into this helper can be easily
replicated with these three helpers and the whole thing will
be much faster.
Thoughts?
Martin KaFai Lau May 15, 2018, 4:48 p.m. UTC | #6
On Mon, May 14, 2018 at 08:16:59PM -0700, Alexei Starovoitov wrote:
> On Fri, May 11, 2018 at 05:54:33PM -0700, Joe Stringer wrote:
> > On 11 May 2018 at 14:41, Martin KaFai Lau <kafai@fb.com> wrote:
> > > On Fri, May 11, 2018 at 02:08:01PM -0700, Joe Stringer wrote:
> > >> On 10 May 2018 at 22:00, Martin KaFai Lau <kafai@fb.com> wrote:
> > >> > On Wed, May 09, 2018 at 02:07:05PM -0700, Joe Stringer wrote:
> > >> >> This patch adds a new BPF helper function, sk_lookup() which allows BPF
> > >> >> programs to find out if there is a socket listening on this host, and
> > >> >> returns a socket pointer which the BPF program can then access to
> > >> >> determine, for instance, whether to forward or drop traffic. sk_lookup()
> > >> >> takes a reference on the socket, so when a BPF program makes use of this
> > >> >> function, it must subsequently pass the returned pointer into the newly
> > >> >> added sk_release() to return the reference.
> > >> >>
> > >> >> By way of example, the following pseudocode would filter inbound
> > >> >> connections at XDP if there is no corresponding service listening for
> > >> >> the traffic:
> > >> >>
> > >> >>   struct bpf_sock_tuple tuple;
> > >> >>   struct bpf_sock_ops *sk;
> > >> >>
> > >> >>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
> > >> >>   sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
> > >> >>   if (!sk) {
> > >> >>     // Couldn't find a socket listening for this traffic. Drop.
> > >> >>     return TC_ACT_SHOT;
> > >> >>   }
> > >> >>   bpf_sk_release(sk, 0);
> > >> >>   return TC_ACT_OK;
> > >> >>
> > >> >> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> > >> >> ---
> > >>
> > >> ...
> > >>
> > >> >> @@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
> > >> >>  };
> > >> >>  #endif
> > >> >>
> > >> >> +struct sock *
> > >> >> +sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
> > >> > Would it be possible to have another version that
> > >> > returns a sk without taking its refcnt?
> > >> > It may have performance benefit.
> > >>
> > >> Not really. The sockets are not RCU-protected, and established sockets
> > >> may be torn down without notice. If we don't take a reference, there's
> > >> no guarantee that the socket will continue to exist for the duration
> > >> of running the BPF program.
> > >>
> > >> From what I follow, the comment below has a hidden implication which
> > >> is that sockets without SOCK_RCU_FREE, eg established sockets, may be
> > >> directly freed regardless of RCU.
> > > Right, SOCK_RCU_FREE sk is the one I am concern about.
> > > For example, TCP_LISTEN socket does not require taking a refcnt
> > > now.  Doing a bpf_sk_lookup() may have a rather big
> > > impact on handling TCP syn flood.  or the usual intention
> > > is to redirect instead of passing it up to the stack?
> > 
> > I see, if you're only interested in listen sockets then probably this
> > series could be extended with a new flag, eg something like
> > BPF_F_SK_FIND_LISTENERS which restricts the set of possible sockets
> > found to only listen sockets, then the implementation would call into
> > __inet_lookup_listener() instead of inet_lookup(). The presence of
> > that flag in the relevant register during CALL instruction would show
> > that the verifier should not reference-track the result, then there'd
> > need to be a check on the release to ensure that this unreferenced
> > socket is never released. Just a thought, completely untested and I
> > could still be missing some detail..
> > 
> > That said, I don't completely follow how you would expect to handle
> > the traffic for sockets that are already established - the helper
> > would no longer find those sockets, so you wouldn't know whether to
> > pass the traffic up the stack for established traffic or not.
> 
> I think Martin has a valid concern here that if somebody starts using
> this helper on the rx traffic the bpf program (via these two new
> helpers) will be doing refcnt++ and refcnt-- even for listener
> sockets which will cause synflood to suffer.
> One can argue that this is bpf author mistake, but without fixes
> (and api changes) to the helper the programmer doesn't really have a way
> of avoiding this situation.
> Also udp sockets don't need refcnt at all.
> How about we split this single helper into three:
> - bpf_sk_lookup_tcp_established() that will return refcnt-ed socket
> and has to be bpf_sk_release()d by the program.
> - bpf_sk_lookup_tcp_listener() that doesn't refcnt, since progs
> run in rcu.
> - bpf_sk_lookup_udp() that also doesn't refcnt.
> The logic you want to put into this helper can be easily
> replicated with these three helpers and the whole thing will
> be much faster.
> Thoughts?
Just came to my mind.

or can we explore something like:

On the bpf_sk_lookup() side, use __inet[6]_lookup()
and __udp[46]_lib_lookup() instead.  That should
only take refcnt if it has to.

On the bpf_sk_release() side, it skips refcnt--
if sk is SOCK_RCU_FREE.
Joe Stringer May 16, 2018, 6:55 p.m. UTC | #7
On 15 May 2018 at 09:48, Martin KaFai Lau <kafai@fb.com> wrote:
> On Mon, May 14, 2018 at 08:16:59PM -0700, Alexei Starovoitov wrote:
>> On Fri, May 11, 2018 at 05:54:33PM -0700, Joe Stringer wrote:
>> > On 11 May 2018 at 14:41, Martin KaFai Lau <kafai@fb.com> wrote:
>> > > On Fri, May 11, 2018 at 02:08:01PM -0700, Joe Stringer wrote:
>> > >> On 10 May 2018 at 22:00, Martin KaFai Lau <kafai@fb.com> wrote:
>> > >> > On Wed, May 09, 2018 at 02:07:05PM -0700, Joe Stringer wrote:
>> > >> >> This patch adds a new BPF helper function, sk_lookup() which allows BPF
>> > >> >> programs to find out if there is a socket listening on this host, and
>> > >> >> returns a socket pointer which the BPF program can then access to
>> > >> >> determine, for instance, whether to forward or drop traffic. sk_lookup()
>> > >> >> takes a reference on the socket, so when a BPF program makes use of this
>> > >> >> function, it must subsequently pass the returned pointer into the newly
>> > >> >> added sk_release() to return the reference.
>> > >> >>
>> > >> >> By way of example, the following pseudocode would filter inbound
>> > >> >> connections at XDP if there is no corresponding service listening for
>> > >> >> the traffic:
>> > >> >>
>> > >> >>   struct bpf_sock_tuple tuple;
>> > >> >>   struct bpf_sock_ops *sk;
>> > >> >>
>> > >> >>   populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet
>> > >> >>   sk = bpf_sk_lookup(ctx, &tuple, sizeof tuple, netns, 0);
>> > >> >>   if (!sk) {
>> > >> >>     // Couldn't find a socket listening for this traffic. Drop.
>> > >> >>     return TC_ACT_SHOT;
>> > >> >>   }
>> > >> >>   bpf_sk_release(sk, 0);
>> > >> >>   return TC_ACT_OK;
>> > >> >>
>> > >> >> Signed-off-by: Joe Stringer <joe@wand.net.nz>
>> > >> >> ---
>> > >>
>> > >> ...
>> > >>
>> > >> >> @@ -4032,6 +4036,96 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
>> > >> >>  };
>> > >> >>  #endif
>> > >> >>
>> > >> >> +struct sock *
>> > >> >> +sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
>> > >> > Would it be possible to have another version that
>> > >> > returns a sk without taking its refcnt?
>> > >> > It may have performance benefit.
>> > >>
>> > >> Not really. The sockets are not RCU-protected, and established sockets
>> > >> may be torn down without notice. If we don't take a reference, there's
>> > >> no guarantee that the socket will continue to exist for the duration
>> > >> of running the BPF program.
>> > >>
>> > >> From what I follow, the comment below has a hidden implication which
>> > >> is that sockets without SOCK_RCU_FREE, eg established sockets, may be
>> > >> directly freed regardless of RCU.
>> > > Right, SOCK_RCU_FREE sk is the one I am concern about.
>> > > For example, TCP_LISTEN socket does not require taking a refcnt
>> > > now.  Doing a bpf_sk_lookup() may have a rather big
>> > > impact on handling TCP syn flood.  or the usual intention
>> > > is to redirect instead of passing it up to the stack?
>> >
>> > I see, if you're only interested in listen sockets then probably this
>> > series could be extended with a new flag, eg something like
>> > BPF_F_SK_FIND_LISTENERS which restricts the set of possible sockets
>> > found to only listen sockets, then the implementation would call into
>> > __inet_lookup_listener() instead of inet_lookup(). The presence of
>> > that flag in the relevant register during CALL instruction would show
>> > that the verifier should not reference-track the result, then there'd
>> > need to be a check on the release to ensure that this unreferenced
>> > socket is never released. Just a thought, completely untested and I
>> > could still be missing some detail..
>> >
>> > That said, I don't completely follow how you would expect to handle
>> > the traffic for sockets that are already established - the helper
>> > would no longer find those sockets, so you wouldn't know whether to
>> > pass the traffic up the stack for established traffic or not.
>>
>> I think Martin has a valid concern here that if somebody starts using
>> this helper on the rx traffic the bpf program (via these two new
>> helpers) will be doing refcnt++ and refcnt-- even for listener
>> sockets which will cause synflood to suffer.
>> One can argue that this is bpf author mistake, but without fixes
>> (and api changes) to the helper the programmer doesn't really have a way
>> of avoiding this situation.
>> Also udp sockets don't need refcnt at all.
>> How about we split this single helper into three:
>> - bpf_sk_lookup_tcp_established() that will return refcnt-ed socket
>> and has to be bpf_sk_release()d by the program.
>> - bpf_sk_lookup_tcp_listener() that doesn't refcnt, since progs
>> run in rcu.
>> - bpf_sk_lookup_udp() that also doesn't refcnt.
>> The logic you want to put into this helper can be easily
>> replicated with these three helpers and the whole thing will
>> be much faster.
>> Thoughts?
> Just came to my mind.
>
> or can we explore something like:
>
> On the bpf_sk_lookup() side, use __inet[6]_lookup()
> and __udp[46]_lib_lookup() instead.  That should
> only take refcnt if it has to.
>
> On the bpf_sk_release() side, it skips refcnt--
> if sk is SOCK_RCU_FREE.

Reflecting the discussion from IOVisor call:

I voiced a concern with the above proposal by Alexei that it leaks
kernel implementation detail (established sockets are refcnted) into
the BPF API.

Martin's proposal here addresses this concern. We can force all
sk_lookup()s to match with a bpf_sk_release(), then inside the
bpf_sk_release() we can deal with the details of whether any freeing
is actually required.

It's still useful to split the helpers out into bpf_sk_lookup_tcp()
and bpf_sk_lookup_udp() because then we don't need to deal with the
forward-compatibility concern of adding support for different socket
types (eg SCTP). That said, the TCP established/listener split does
not have an immediate user, so we don't need to split these at this
time. If there is a use case for only finding the listener sockets, we
can always add a flag to the bpf_sk_lookup_tcp() helper to only find
the listener sockets.

I'll respin, thanks for the feedback all.
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d615c777b573..29f38838dbca 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1828,6 +1828,25 @@  union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
+ * 	Description
+ * 		Look for socket matching 'tuple'. The return value must be checked,
+ * 		and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ * int bpf_sk_release(sock, flags)
+ * 	Description
+ * 		Release the reference held by 'sock'.
+ * 		@sock: Pointer reference to release. Must be found via bpf_sk_lookup().
+ * 		@flags: flags value
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1898,7 +1917,9 @@  union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),	\
+	FN(sk_lookup),			\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2060,6 +2081,22 @@  struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} saddr;
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} daddr;
+	__be16 sport;
+	__be16 dport;
+	__u32 dst_if;
+	__u8 family;
+	__u8 proto;
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 92b9a5dc465a..579012c483e4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,6 +153,12 @@  static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
  * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type
  * passes through a NULL-check conditional. For the branch wherein the state is
  * changed to CONST_IMM, the verifier releases the reference.
+ *
+ * For each helper function that allocates a reference, such as bpf_sk_lookup(),
+ * there is a corresponding release function, such as bpf_sk_release(). When
+ * a reference type passes into the release function, the verifier also releases
+ * the reference. If any unchecked or unreleased reference remains at the end of
+ * the program, the verifier rejects it.
  */
 
 /* verifier_state + insn_idx are pushed to stack when branch is encountered */
@@ -277,7 +283,7 @@  static bool arg_type_is_refcounted(enum bpf_arg_type type)
  */
 static bool is_release_function(enum bpf_func_id func_id)
 {
-	return false;
+	return func_id == BPF_FUNC_sk_release;
 }
 
 /* string representation of 'enum bpf_reg_type' */
diff --git a/net/core/filter.c b/net/core/filter.c
index 4c35152fb3a8..751c255d17d3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -58,8 +58,12 @@ 
 #include <net/busy_poll.h>
 #include <net/tcp.h>
 #include <net/xfrm.h>
+#include <net/udp.h>
 #include <linux/bpf_trace.h>
 #include <net/xdp_sock.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/net_namespace.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4032,6 +4036,96 @@  static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
 };
 #endif
 
+struct sock *
+sk_lookup(struct net *net, struct bpf_sock_tuple *tuple) {
+	int dst_if = (int)tuple->dst_if;
+	struct in6_addr *src6;
+	struct in6_addr *dst6;
+
+	if (tuple->family == AF_INET6) {
+		src6 = (struct in6_addr *)&tuple->saddr.ipv6;
+		dst6 = (struct in6_addr *)&tuple->daddr.ipv6;
+	} else if (tuple->family != AF_INET) {
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	if (tuple->proto == IPPROTO_TCP) {
+		if (tuple->family == AF_INET)
+			return inet_lookup(net, &tcp_hashinfo, NULL, 0,
+					   tuple->saddr.ipv4, tuple->sport,
+					   tuple->daddr.ipv4, tuple->dport,
+					   dst_if);
+		else
+			return inet6_lookup(net, &tcp_hashinfo, NULL, 0,
+					    src6, tuple->sport,
+					    dst6, tuple->dport, dst_if);
+	} else if (tuple->proto == IPPROTO_UDP) {
+		if (tuple->family == AF_INET)
+			return udp4_lib_lookup(net, tuple->saddr.ipv4,
+					       tuple->sport, tuple->daddr.ipv4,
+					       tuple->dport, dst_if);
+		else
+			return udp6_lib_lookup(net, src6, tuple->sport,
+					       dst6, tuple->dport, dst_if);
+	} else {
+		return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	return NULL;
+}
+
+BPF_CALL_5(bpf_sk_lookup, struct sk_buff *, skb,
+	   struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+	struct net *caller_net = dev_net(skb->dev);
+	struct sock *sk = NULL;
+	struct net *net;
+
+	/* XXX: Perform verification-time checking of tuple size? */
+	if (unlikely(len != sizeof(struct bpf_sock_tuple) || flags))
+		goto out;
+
+	net = get_net_ns_by_id(caller_net, netns_id);
+	if (unlikely(!net))
+		goto out;
+
+	sk = sk_lookup(net, tuple);
+	put_net(net);
+	if (IS_ERR_OR_NULL(sk))
+		sk = NULL;
+	else
+		sk = sk_to_full_sk(sk);
+out:
+	return (unsigned long) sk;
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_proto = {
+	.func		= bpf_sk_lookup,
+	.gpl_only	= false,
+	.ret_type	= RET_PTR_TO_SOCKET_OR_NULL,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type	= ARG_ANYTHING,
+	.arg5_type	= ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_sk_release, struct sock *, sk, u64, flags)
+{
+	sock_gen_put(sk);
+	if (unlikely(flags))
+		return -EINVAL;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_release_proto = {
+	.func		= bpf_sk_release,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_SOCKET,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -4181,6 +4275,10 @@  tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_skb_get_xfrm_state:
 		return &bpf_skb_get_xfrm_state_proto;
 #endif
+	case BPF_FUNC_sk_lookup:
+		return &bpf_sk_lookup_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4292,6 +4390,10 @@  sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_socket_uid_proto;
 	case BPF_FUNC_sk_redirect_map:
 		return &bpf_sk_redirect_map_proto;
+	case BPF_FUNC_sk_lookup:
+		return &bpf_sk_lookup_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index fff51c187d1e..29f38838dbca 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -117,6 +117,7 @@  enum bpf_map_type {
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
 	BPF_MAP_TYPE_CPUMAP,
+	BPF_MAP_TYPE_XSKMAP,
 };
 
 enum bpf_prog_type {
@@ -1827,6 +1828,25 @@  union bpf_attr {
  * 	Return
  * 		0 on success, or a negative error in case of failure.
  *
+ * struct bpf_sock_ops *bpf_sk_lookup(ctx, tuple, tuple_size, netns, flags)
+ * 	Description
+ * 		Look for socket matching 'tuple'. The return value must be checked,
+ * 		and if non-NULL, released via bpf_sk_release().
+ * 		@ctx: pointer to ctx
+ * 		@tuple: pointer to struct bpf_sock_tuple
+ * 		@tuple_size: size of the tuple
+ * 		@flags: flags value
+ * 	Return
+ * 		pointer to socket ops on success, or
+ * 		NULL in case of failure
+ *
+ * int bpf_sk_release(sock, flags)
+ * 	Description
+ * 		Release the reference held by 'sock'.
+ * 		@sock: Pointer reference to release. Must be found via bpf_sk_lookup().
+ * 		@flags: flags value
+ * 	Return
+ * 		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -1897,7 +1917,9 @@  union bpf_attr {
 	FN(xdp_adjust_tail),		\
 	FN(skb_get_xfrm_state),		\
 	FN(get_stack),			\
-	FN(skb_load_bytes_relative),
+	FN(skb_load_bytes_relative),	\
+	FN(sk_lookup),			\
+	FN(sk_release),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2059,6 +2081,22 @@  struct bpf_sock {
 				 */
 };
 
+struct bpf_sock_tuple {
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} saddr;
+	union {
+		__be32 ipv6[4];
+		__be32 ipv4;
+	} daddr;
+	__be16 sport;
+	__be16 dport;
+	__u32 dst_if;
+	__u8 family;
+	__u8 proto;
+};
+
 #define XDP_PACKET_HEADROOM 256
 
 /* User return codes for XDP prog type.
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 265f8e0e8ada..4dc311ea0c16 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -103,6 +103,13 @@  static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
 	(void *) BPF_FUNC_skb_get_xfrm_state;
 static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
 	(void *) BPF_FUNC_get_stack;
+static struct bpf_sock *(*bpf_sk_lookup)(void *ctx,
+					 struct bpf_sock_tuple *tuple,
+					 int size, unsigned int netns_id,
+					 unsigned long long flags) =
+	(void *) BPF_FUNC_sk_lookup;
+static int (*bpf_sk_release)(struct bpf_sock *sk, unsigned long long flags) =
+	(void *) BPF_FUNC_sk_release;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions