[bpf-next,1/3] net: Allow iterating sockmap and sockhash

Message ID: 20200828094834.23290-2-lmb@cloudflare.com
State: Changes Requested
Delegated to: BPF Maintainers
Series: Sockmap iterator

Commit Message

Lorenz Bauer Aug. 28, 2020, 9:48 a.m. UTC
Add bpf_iter support for sockmap / sockhash, based on the bpf_sk_storage and
hashtable implementations. sockmap and sockhash share the same iteration
context: a pointer to an arbitrary key and a pointer to a socket. Both
pointers may be NULL, so BPF programs have to perform a NULL check before
accessing them. Technically it's not possible for sockhash iteration to yield
a NULL socket, but we ignore this to be able to use a single iteration point.

Iteration will visit all keys that remain unmodified during the lifetime of
the iterator. It may or may not visit newly added ones.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
---
 net/core/sock_map.c | 284 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 284 insertions(+)
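
For illustration, a minimal iterator program against this context could look
like the sketch below. This is not part of the patch: it assumes a vmlinux.h
(or an equivalent hand-written definition) that provides struct
bpf_iter__sockmap, and the program and counter names are made up.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

__u64 elems = 0;

SEC("iter/sockmap")
int count_elems(struct bpf_iter__sockmap *ctx)
{
	__u32 *key = ctx->key;
	struct bpf_sock *sk = ctx->sk;

	/* both pointers may be NULL; the verifier requires these checks
	 * before any dereference
	 */
	if (!key || !sk)
		return 0;

	elems++;
	return 0;
}

char _license[] SEC("license") = "GPL";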

Comments

Jakub Sitnicki Aug. 31, 2020, 10:03 a.m. UTC | #1
On Fri, Aug 28, 2020 at 11:48 AM CEST, Lorenz Bauer wrote:
> Add bpf_iter support for sockmap / sockhash, based on the bpf_sk_storage and
> hashtable implementations. sockmap and sockhash share the same iteration
> context: a pointer to an arbitrary key and a pointer to a socket. Both
> pointers may be NULL, so BPF programs have to perform a NULL check before
> accessing them. Technically it's not possible for sockhash iteration to yield
> a NULL socket, but we ignore this to be able to use a single iteration point.
>
> Iteration will visit all keys that remain unmodified during the lifetime of
> the iterator. It may or may not visit newly added ones.
>
> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> ---
>  net/core/sock_map.c | 284 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 284 insertions(+)
>
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index d6c6e1e312fc..31c4332f06e4 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -703,6 +703,116 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
>  	.arg4_type      = ARG_ANYTHING,
>  };
>
> +struct sock_map_seq_info {
> +	struct bpf_map *map;
> +	struct sock *sk;
> +	u32 index;
> +};
> +
> +struct bpf_iter__sockmap {
> +	__bpf_md_ptr(struct bpf_iter_meta *, meta);
> +	__bpf_md_ptr(struct bpf_map *, map);
> +	__bpf_md_ptr(void *, key);
> +	__bpf_md_ptr(struct bpf_sock *, sk);
> +};
> +
> +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
> +		     struct bpf_map *map, void *key,
> +		     struct sock *sk)
> +
> +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
> +{
> +	if (unlikely(info->index >= info->map->max_entries))
> +		return NULL;
> +
> +	info->sk = __sock_map_lookup_elem(info->map, info->index);
> +	if (!info->sk || !sk_fullsock(info->sk))

As we've talked off-line, we expect neither timewait nor request sockets
in sockmap, so the sk_fullsock() check is likely not needed.
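
With the check dropped, the lookup would reduce to something like this
sketch (the suggested simplification, not the posted code):

	info->sk = __sock_map_lookup_elem(info->map, info->index);

	/* can't return sk directly: it may be NULL for an empty slot,
	 * and BPF then sees ctx.sk == NULL; continue iterating
	 */
	return info;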

> +		info->sk = NULL;
> +
> +	/* continue iterating */
> +	return info;
> +}
> +
> +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	struct sock_map_seq_info *info = seq->private;
> +
> +	if (*pos == 0)
> +		++*pos;
> +
> +	/* pairs with sock_map_seq_stop */
> +	rcu_read_lock();
> +	return sock_map_seq_lookup_elem(info);
> +}
> +
> +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct sock_map_seq_info *info = seq->private;
> +
> +	++*pos;
> +	++info->index;
> +
> +	return sock_map_seq_lookup_elem(info);
> +}
> +
> +static int __sock_map_seq_show(struct seq_file *seq, void *v)
> +{
> +	struct sock_map_seq_info *info = seq->private;
> +	struct bpf_iter__sockmap ctx = {};
> +	struct bpf_iter_meta meta;
> +	struct bpf_prog *prog;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, !v);
> +	if (!prog)
> +		return 0;
> +
> +	ctx.meta = &meta;
> +	ctx.map = info->map;
> +	if (v) {
> +		ctx.key = &info->index;
> +		ctx.sk = (struct bpf_sock *)info->sk;
> +	}
> +
> +	return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int sock_map_seq_show(struct seq_file *seq, void *v)
> +{
> +	return __sock_map_seq_show(seq, v);
> +}
> +
> +static void sock_map_seq_stop(struct seq_file *seq, void *v)
> +{
> +	if (!v)
> +		(void)__sock_map_seq_show(seq, NULL);
> +
> +	/* pairs with sock_map_seq_start */
> +	rcu_read_unlock();
> +}
> +
> +static const struct seq_operations sock_map_seq_ops = {
> +	.start	= sock_map_seq_start,
> +	.next	= sock_map_seq_next,
> +	.stop	= sock_map_seq_stop,
> +	.show	= sock_map_seq_show,
> +};
> +
> +static int sock_map_init_seq_private(void *priv_data,
> +				     struct bpf_iter_aux_info *aux)
> +{
> +	struct sock_map_seq_info *info = priv_data;
> +
> +	info->map = aux->map;
> +	return 0;
> +}
> +
> +static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
> +	.seq_ops		= &sock_map_seq_ops,
> +	.init_seq_private	= sock_map_init_seq_private,
> +	.seq_priv_size		= sizeof(struct sock_map_seq_info),
> +};
> +
>  static int sock_map_btf_id;
>  const struct bpf_map_ops sock_map_ops = {
>  	.map_alloc		= sock_map_alloc,

[...]

> @@ -1198,6 +1309,121 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
>  	.arg4_type      = ARG_ANYTHING,
>  };
>
> +struct sock_hash_seq_info {
> +	struct bpf_map *map;
> +	struct bpf_shtab *htab;
> +	u32 bucket_id;
> +};
> +
> +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
> +				     struct bpf_shtab_elem *prev_elem)
> +{
> +	const struct bpf_shtab *htab = info->htab;
> +	struct bpf_shtab_bucket *bucket;
> +	struct bpf_shtab_elem *elem;
> +	struct hlist_node *node;
> +
> +	/* try to find next elem in the same bucket */
> +	if (prev_elem) {
> +		node = rcu_dereference_raw(hlist_next_rcu(&prev_elem->node));

I'm not convinced we need to go for the rcu_dereference_raw()
variant. The access happens inside a read-side critical section, which we
entered with rcu_read_lock() in sock_hash_seq_start().

That's typical, and rcu_dereference() seems appropriate. I'm basing this on
what I read in Documentation/RCU/rcu_dereference.rst.
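
Concretely, the suggestion amounts to swapping in the lockdep-checked
variant, e.g.:

	node = rcu_dereference(hlist_next_rcu(&prev_elem->node));

and the same for the hlist_first_rcu() load in the bucket loop below.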

> +		elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
> +		if (elem)
> +			return elem;
> +
> +		/* no more elements, continue in the next bucket */
> +		info->bucket_id++;
> +	}
> +
> +	for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
> +		bucket = &htab->buckets[info->bucket_id];
> +		node = rcu_dereference_raw(hlist_first_rcu(&bucket->head));
> +		elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
> +		if (elem)
> +			return elem;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> +	struct sock_hash_seq_info *info = seq->private;
> +
> +	if (*pos == 0)
> +		++*pos;
> +
> +	/* pairs with sock_hash_seq_stop */
> +	rcu_read_lock();
> +	return sock_hash_seq_find_next(info, NULL);
> +}
> +
> +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> +	struct sock_hash_seq_info *info = seq->private;
> +
> +	++*pos;
> +	return sock_hash_seq_find_next(info, v);
> +}
> +
> +static int __sock_hash_seq_show(struct seq_file *seq, struct bpf_shtab_elem *elem)
> +{
> +	struct sock_hash_seq_info *info = seq->private;
> +	struct bpf_iter__sockmap ctx = {};
> +	struct bpf_iter_meta meta;
> +	struct bpf_prog *prog;
> +
> +	meta.seq = seq;
> +	prog = bpf_iter_get_info(&meta, !elem);
> +	if (!prog)
> +		return 0;
> +
> +	ctx.meta = &meta;
> +	ctx.map = info->map;
> +	if (elem) {
> +		ctx.key = elem->key;
> +		ctx.sk = (struct bpf_sock *)elem->sk;
> +	}
> +
> +	return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int sock_hash_seq_show(struct seq_file *seq, void *v)
> +{
> +	return __sock_hash_seq_show(seq, v);
> +}
> +
> +static void sock_hash_seq_stop(struct seq_file *seq, void *v)
> +{
> +	if (!v)
> +		(void)__sock_hash_seq_show(seq, NULL);
> +
> +	/* pairs with sock_hash_seq_start */
> +	rcu_read_unlock();
> +}
> +
> +static const struct seq_operations sock_hash_seq_ops = {
> +	.start	= sock_hash_seq_start,
> +	.next	= sock_hash_seq_next,
> +	.stop	= sock_hash_seq_stop,
> +	.show	= sock_hash_seq_show,
> +};
> +
> +static int sock_hash_init_seq_private(void *priv_data,
> +				     struct bpf_iter_aux_info *aux)
> +{
> +	struct sock_hash_seq_info *info = priv_data;
> +
> +	info->map = aux->map;
> +	info->htab = container_of(aux->map, struct bpf_shtab, map);
> +	return 0;
> +}
> +
> +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
> +	.seq_ops		= &sock_hash_seq_ops,
> +	.init_seq_private	= sock_hash_init_seq_private,
> +	.seq_priv_size		= sizeof(struct sock_hash_seq_info),
> +};
> +
>  static int sock_hash_map_btf_id;
>  const struct bpf_map_ops sock_hash_ops = {
>  	.map_alloc		= sock_hash_alloc,

[...]
Lorenz Bauer Sept. 1, 2020, 8:59 a.m. UTC | #2
On Mon, 31 Aug 2020 at 11:04, Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> On Fri, Aug 28, 2020 at 11:48 AM CEST, Lorenz Bauer wrote:
> > Add bpf_iter support for sockmap / sockhash, based on the bpf_sk_storage and
> > hashtable implementations. sockmap and sockhash share the same iteration
> > context: a pointer to an arbitrary key and a pointer to a socket. Both
> > pointers may be NULL, so BPF programs have to perform a NULL check before
> > accessing them. Technically it's not possible for sockhash iteration to yield
> > a NULL socket, but we ignore this to be able to use a single iteration point.
> >
> > Iteration will visit all keys that remain unmodified during the lifetime of
> > the iterator. It may or may not visit newly added ones.
> >
> > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > ---
> >  net/core/sock_map.c | 284 ++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 284 insertions(+)
> >
> > diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> > index d6c6e1e312fc..31c4332f06e4 100644
> > --- a/net/core/sock_map.c
> > +++ b/net/core/sock_map.c
> > @@ -703,6 +703,116 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
> >       .arg4_type      = ARG_ANYTHING,
> >  };
> >
> > +struct sock_map_seq_info {
> > +     struct bpf_map *map;
> > +     struct sock *sk;
> > +     u32 index;
> > +};
> > +
> > +struct bpf_iter__sockmap {
> > +     __bpf_md_ptr(struct bpf_iter_meta *, meta);
> > +     __bpf_md_ptr(struct bpf_map *, map);
> > +     __bpf_md_ptr(void *, key);
> > +     __bpf_md_ptr(struct bpf_sock *, sk);
> > +};
> > +
> > +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
> > +                  struct bpf_map *map, void *key,
> > +                  struct sock *sk)
> > +
> > +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
> > +{
> > +     if (unlikely(info->index >= info->map->max_entries))
> > +             return NULL;
> > +
> > +     info->sk = __sock_map_lookup_elem(info->map, info->index);
> > +     if (!info->sk || !sk_fullsock(info->sk))
>
> As we've talked off-line, we expect neither timewait nor request sockets
> in sockmap, so the sk_fullsock() check is likely not needed.

Ack.

>
> > +             info->sk = NULL;
> > +
> > +     /* continue iterating */
> > +     return info;
> > +}
> > +
> > +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
> > +{
> > +     struct sock_map_seq_info *info = seq->private;
> > +
> > +     if (*pos == 0)
> > +             ++*pos;
> > +
> > +     /* pairs with sock_map_seq_stop */
> > +     rcu_read_lock();
> > +     return sock_map_seq_lookup_elem(info);
> > +}
> > +
> > +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > +{
> > +     struct sock_map_seq_info *info = seq->private;
> > +
> > +     ++*pos;
> > +     ++info->index;
> > +
> > +     return sock_map_seq_lookup_elem(info);
> > +}
> > +
> > +static int __sock_map_seq_show(struct seq_file *seq, void *v)
> > +{
> > +     struct sock_map_seq_info *info = seq->private;
> > +     struct bpf_iter__sockmap ctx = {};
> > +     struct bpf_iter_meta meta;
> > +     struct bpf_prog *prog;
> > +
> > +     meta.seq = seq;
> > +     prog = bpf_iter_get_info(&meta, !v);
> > +     if (!prog)
> > +             return 0;
> > +
> > +     ctx.meta = &meta;
> > +     ctx.map = info->map;
> > +     if (v) {
> > +             ctx.key = &info->index;
> > +             ctx.sk = (struct bpf_sock *)info->sk;
> > +     }
> > +
> > +     return bpf_iter_run_prog(prog, &ctx);
> > +}
> > +
> > +static int sock_map_seq_show(struct seq_file *seq, void *v)
> > +{
> > +     return __sock_map_seq_show(seq, v);
> > +}
> > +
> > +static void sock_map_seq_stop(struct seq_file *seq, void *v)
> > +{
> > +     if (!v)
> > +             (void)__sock_map_seq_show(seq, NULL);
> > +
> > +     /* pairs with sock_map_seq_start */
> > +     rcu_read_unlock();
> > +}
> > +
> > +static const struct seq_operations sock_map_seq_ops = {
> > +     .start  = sock_map_seq_start,
> > +     .next   = sock_map_seq_next,
> > +     .stop   = sock_map_seq_stop,
> > +     .show   = sock_map_seq_show,
> > +};
> > +
> > +static int sock_map_init_seq_private(void *priv_data,
> > +                                  struct bpf_iter_aux_info *aux)
> > +{
> > +     struct sock_map_seq_info *info = priv_data;
> > +
> > +     info->map = aux->map;
> > +     return 0;
> > +}
> > +
> > +static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
> > +     .seq_ops                = &sock_map_seq_ops,
> > +     .init_seq_private       = sock_map_init_seq_private,
> > +     .seq_priv_size          = sizeof(struct sock_map_seq_info),
> > +};
> > +
> >  static int sock_map_btf_id;
> >  const struct bpf_map_ops sock_map_ops = {
> >       .map_alloc              = sock_map_alloc,
>
> [...]
>
> > @@ -1198,6 +1309,121 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
> >       .arg4_type      = ARG_ANYTHING,
> >  };
> >
> > +struct sock_hash_seq_info {
> > +     struct bpf_map *map;
> > +     struct bpf_shtab *htab;
> > +     u32 bucket_id;
> > +};
> > +
> > +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
> > +                                  struct bpf_shtab_elem *prev_elem)
> > +{
> > +     const struct bpf_shtab *htab = info->htab;
> > +     struct bpf_shtab_bucket *bucket;
> > +     struct bpf_shtab_elem *elem;
> > +     struct hlist_node *node;
> > +
> > +     /* try to find next elem in the same bucket */
> > +     if (prev_elem) {
> > +             node = rcu_dereference_raw(hlist_next_rcu(&prev_elem->node));
>
> I'm not convinced we need to go for the rcu_dereference_raw()
> variant. The access happens inside a read-side critical section, which we
> entered with rcu_read_lock() in sock_hash_seq_start().
>
> That's typical, and rcu_dereference() seems appropriate. I'm basing this on
> what I read in Documentation/RCU/rcu_dereference.rst.

Yeah, that makes sense to me. However, sock_hash_get_next_key() also
uses rcu_dereference_raw(). John, can you shed some light on why that
is? Can we replace it with plain rcu_dereference() as well?

>
> > +             elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
> > +             if (elem)
> > +                     return elem;
> > +
> > +             /* no more elements, continue in the next bucket */
> > +             info->bucket_id++;
> > +     }
> > +
> > +     for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
> > +             bucket = &htab->buckets[info->bucket_id];
> > +             node = rcu_dereference_raw(hlist_first_rcu(&bucket->head));
> > +             elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
> > +             if (elem)
> > +                     return elem;
> > +     }
> > +
> > +     return NULL;
> > +}
> > +
> > +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
> > +{
> > +     struct sock_hash_seq_info *info = seq->private;
> > +
> > +     if (*pos == 0)
> > +             ++*pos;
> > +
> > +     /* pairs with sock_hash_seq_stop */
> > +     rcu_read_lock();
> > +     return sock_hash_seq_find_next(info, NULL);
> > +}
> > +
> > +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > +{
> > +     struct sock_hash_seq_info *info = seq->private;
> > +
> > +     ++*pos;
> > +     return sock_hash_seq_find_next(info, v);
> > +}
> > +
> > +static int __sock_hash_seq_show(struct seq_file *seq, struct bpf_shtab_elem *elem)
> > +{
> > +     struct sock_hash_seq_info *info = seq->private;
> > +     struct bpf_iter__sockmap ctx = {};
> > +     struct bpf_iter_meta meta;
> > +     struct bpf_prog *prog;
> > +
> > +     meta.seq = seq;
> > +     prog = bpf_iter_get_info(&meta, !elem);
> > +     if (!prog)
> > +             return 0;
> > +
> > +     ctx.meta = &meta;
> > +     ctx.map = info->map;
> > +     if (elem) {
> > +             ctx.key = elem->key;
> > +             ctx.sk = (struct bpf_sock *)elem->sk;
> > +     }
> > +
> > +     return bpf_iter_run_prog(prog, &ctx);
> > +}
> > +
> > +static int sock_hash_seq_show(struct seq_file *seq, void *v)
> > +{
> > +     return __sock_hash_seq_show(seq, v);
> > +}
> > +
> > +static void sock_hash_seq_stop(struct seq_file *seq, void *v)
> > +{
> > +     if (!v)
> > +             (void)__sock_hash_seq_show(seq, NULL);
> > +
> > +     /* pairs with sock_hash_seq_start */
> > +     rcu_read_unlock();
> > +}
> > +
> > +static const struct seq_operations sock_hash_seq_ops = {
> > +     .start  = sock_hash_seq_start,
> > +     .next   = sock_hash_seq_next,
> > +     .stop   = sock_hash_seq_stop,
> > +     .show   = sock_hash_seq_show,
> > +};
> > +
> > +static int sock_hash_init_seq_private(void *priv_data,
> > +                                  struct bpf_iter_aux_info *aux)
> > +{
> > +     struct sock_hash_seq_info *info = priv_data;
> > +
> > +     info->map = aux->map;
> > +     info->htab = container_of(aux->map, struct bpf_shtab, map);
> > +     return 0;
> > +}
> > +
> > +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
> > +     .seq_ops                = &sock_hash_seq_ops,
> > +     .init_seq_private       = sock_hash_init_seq_private,
> > +     .seq_priv_size          = sizeof(struct sock_hash_seq_info),
> > +};
> > +
> >  static int sock_hash_map_btf_id;
> >  const struct bpf_map_ops sock_hash_ops = {
> >       .map_alloc              = sock_hash_alloc,
>
> [...]

Patch

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index d6c6e1e312fc..31c4332f06e4 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -703,6 +703,116 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+struct sock_map_seq_info {
+	struct bpf_map *map;
+	struct sock *sk;
+	u32 index;
+};
+
+struct bpf_iter__sockmap {
+	__bpf_md_ptr(struct bpf_iter_meta *, meta);
+	__bpf_md_ptr(struct bpf_map *, map);
+	__bpf_md_ptr(void *, key);
+	__bpf_md_ptr(struct bpf_sock *, sk);
+};
+
+DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
+		     struct bpf_map *map, void *key,
+		     struct sock *sk)
+
+static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
+{
+	if (unlikely(info->index >= info->map->max_entries))
+		return NULL;
+
+	info->sk = __sock_map_lookup_elem(info->map, info->index);
+	if (!info->sk || !sk_fullsock(info->sk))
+		info->sk = NULL;
+
+	/* continue iterating */
+	return info;
+}
+
+static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct sock_map_seq_info *info = seq->private;
+
+	if (*pos == 0)
+		++*pos;
+
+	/* pairs with sock_map_seq_stop */
+	rcu_read_lock();
+	return sock_map_seq_lookup_elem(info);
+}
+
+static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock_map_seq_info *info = seq->private;
+
+	++*pos;
+	++info->index;
+
+	return sock_map_seq_lookup_elem(info);
+}
+
+static int __sock_map_seq_show(struct seq_file *seq, void *v)
+{
+	struct sock_map_seq_info *info = seq->private;
+	struct bpf_iter__sockmap ctx = {};
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, !v);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.map = info->map;
+	if (v) {
+		ctx.key = &info->index;
+		ctx.sk = (struct bpf_sock *)info->sk;
+	}
+
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int sock_map_seq_show(struct seq_file *seq, void *v)
+{
+	return __sock_map_seq_show(seq, v);
+}
+
+static void sock_map_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!v)
+		(void)__sock_map_seq_show(seq, NULL);
+
+	/* pairs with sock_map_seq_start */
+	rcu_read_unlock();
+}
+
+static const struct seq_operations sock_map_seq_ops = {
+	.start	= sock_map_seq_start,
+	.next	= sock_map_seq_next,
+	.stop	= sock_map_seq_stop,
+	.show	= sock_map_seq_show,
+};
+
+static int sock_map_init_seq_private(void *priv_data,
+				     struct bpf_iter_aux_info *aux)
+{
+	struct sock_map_seq_info *info = priv_data;
+
+	info->map = aux->map;
+	return 0;
+}
+
+static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
+	.seq_ops		= &sock_map_seq_ops,
+	.init_seq_private	= sock_map_init_seq_private,
+	.seq_priv_size		= sizeof(struct sock_map_seq_info),
+};
+
 static int sock_map_btf_id;
 const struct bpf_map_ops sock_map_ops = {
 	.map_alloc		= sock_map_alloc,
@@ -716,6 +826,7 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_check_btf		= map_check_no_btf,
 	.map_btf_name		= "bpf_stab",
 	.map_btf_id		= &sock_map_btf_id,
+	.iter_seq_info		= &sock_map_iter_seq_info,
 };
 
 struct bpf_shtab_elem {
@@ -1198,6 +1309,121 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+struct sock_hash_seq_info {
+	struct bpf_map *map;
+	struct bpf_shtab *htab;
+	u32 bucket_id;
+};
+
+static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
+				     struct bpf_shtab_elem *prev_elem)
+{
+	const struct bpf_shtab *htab = info->htab;
+	struct bpf_shtab_bucket *bucket;
+	struct bpf_shtab_elem *elem;
+	struct hlist_node *node;
+
+	/* try to find next elem in the same bucket */
+	if (prev_elem) {
+		node = rcu_dereference_raw(hlist_next_rcu(&prev_elem->node));
+		elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+		if (elem)
+			return elem;
+
+		/* no more elements, continue in the next bucket */
+		info->bucket_id++;
+	}
+
+	for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
+		bucket = &htab->buckets[info->bucket_id];
+		node = rcu_dereference_raw(hlist_first_rcu(&bucket->head));
+		elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+		if (elem)
+			return elem;
+	}
+
+	return NULL;
+}
+
+static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct sock_hash_seq_info *info = seq->private;
+
+	if (*pos == 0)
+		++*pos;
+
+	/* pairs with sock_hash_seq_stop */
+	rcu_read_lock();
+	return sock_hash_seq_find_next(info, NULL);
+}
+
+static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock_hash_seq_info *info = seq->private;
+
+	++*pos;
+	return sock_hash_seq_find_next(info, v);
+}
+
+static int __sock_hash_seq_show(struct seq_file *seq, struct bpf_shtab_elem *elem)
+{
+	struct sock_hash_seq_info *info = seq->private;
+	struct bpf_iter__sockmap ctx = {};
+	struct bpf_iter_meta meta;
+	struct bpf_prog *prog;
+
+	meta.seq = seq;
+	prog = bpf_iter_get_info(&meta, !elem);
+	if (!prog)
+		return 0;
+
+	ctx.meta = &meta;
+	ctx.map = info->map;
+	if (elem) {
+		ctx.key = elem->key;
+		ctx.sk = (struct bpf_sock *)elem->sk;
+	}
+
+	return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int sock_hash_seq_show(struct seq_file *seq, void *v)
+{
+	return __sock_hash_seq_show(seq, v);
+}
+
+static void sock_hash_seq_stop(struct seq_file *seq, void *v)
+{
+	if (!v)
+		(void)__sock_hash_seq_show(seq, NULL);
+
+	/* pairs with sock_hash_seq_start */
+	rcu_read_unlock();
+}
+
+static const struct seq_operations sock_hash_seq_ops = {
+	.start	= sock_hash_seq_start,
+	.next	= sock_hash_seq_next,
+	.stop	= sock_hash_seq_stop,
+	.show	= sock_hash_seq_show,
+};
+
+static int sock_hash_init_seq_private(void *priv_data,
+				     struct bpf_iter_aux_info *aux)
+{
+	struct sock_hash_seq_info *info = priv_data;
+
+	info->map = aux->map;
+	info->htab = container_of(aux->map, struct bpf_shtab, map);
+	return 0;
+}
+
+static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
+	.seq_ops		= &sock_hash_seq_ops,
+	.init_seq_private	= sock_hash_init_seq_private,
+	.seq_priv_size		= sizeof(struct sock_hash_seq_info),
+};
+
 static int sock_hash_map_btf_id;
 const struct bpf_map_ops sock_hash_ops = {
 	.map_alloc		= sock_hash_alloc,
@@ -1211,6 +1437,7 @@ const struct bpf_map_ops sock_hash_ops = {
 	.map_check_btf		= map_check_no_btf,
 	.map_btf_name		= "bpf_shtab",
 	.map_btf_id		= &sock_hash_map_btf_id,
+	.iter_seq_info		= &sock_hash_iter_seq_info,
 };
 
 static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
@@ -1321,3 +1548,60 @@ void sock_map_close(struct sock *sk, long timeout)
 	release_sock(sk);
 	saved_close(sk, timeout);
 }
+
+static int sock_map_iter_attach_target(struct bpf_prog *prog,
+				       union bpf_iter_link_info *linfo,
+				       struct bpf_iter_aux_info *aux)
+{
+	struct bpf_map *map;
+	int err = -EINVAL;
+
+	if (!linfo->map.map_fd)
+		return -EBADF;
+
+	map = bpf_map_get_with_uref(linfo->map.map_fd);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	if (map->map_type != BPF_MAP_TYPE_SOCKMAP)
+		goto put_map;
+
+	if (prog->aux->max_rdonly_access > map->key_size) {
+		err = -EACCES;
+		goto put_map;
+	}
+
+	aux->map = map;
+	return 0;
+
+put_map:
+	bpf_map_put_with_uref(map);
+	return err;
+}
+
+static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux)
+{
+	bpf_map_put_with_uref(aux->map);
+}
+
+static struct bpf_iter_reg sock_map_iter_reg = {
+	.target			= "sockmap",
+	.attach_target		= sock_map_iter_attach_target,
+	.detach_target		= sock_map_iter_detach_target,
+	.show_fdinfo		= bpf_iter_map_show_fdinfo,
+	.fill_link_info		= bpf_iter_map_fill_link_info,
+	.ctx_arg_info_size	= 2,
+	.ctx_arg_info		= {
+		{ offsetof(struct bpf_iter__sockmap, key),
+		  PTR_TO_RDONLY_BUF_OR_NULL },
+		{ offsetof(struct bpf_iter__sockmap, sk),
+		  PTR_TO_SOCKET_OR_NULL },
+	},
+	.seq_info		= &sock_map_iter_seq_info,
+};
+
+static int __init bpf_sockmap_iter_init(void)
+{
+	return bpf_iter_reg_target(&sock_map_iter_reg);
+}
+late_initcall(bpf_sockmap_iter_init);
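
For completeness, attaching this iterator from userspace could look roughly
like the sketch below. It is not part of the patch: it assumes contemporary
libbpf (bpf_program__attach_iter(), bpf_iter_create()), the function name is
made up, and error handling is abbreviated.

#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int dump_sockmap(struct bpf_program *prog, int map_fd)
{
	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
	union bpf_iter_link_info linfo = {};
	struct bpf_link *link;
	char buf[256];
	int iter_fd, len;

	/* tell the kernel which map to walk, cf. sock_map_iter_attach_target() */
	linfo.map.map_fd = map_fd;
	opts.link_info = &linfo;
	opts.link_info_len = sizeof(linfo);

	link = bpf_program__attach_iter(prog, &opts);
	if (libbpf_get_error(link))
		return -1;

	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0) {
		bpf_link__destroy(link);
		return -1;
	}

	/* each read() drives the seq_file above, running the program per element */
	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
		;

	close(iter_fd);
	bpf_link__destroy(link);
	return len;
}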