diff mbox series

[bpf-next,v2,11/11] bpf: Add bpf_dctcp example

Message ID 20191221062620.1184118-1-kafai@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series Introduce BPF STRUCT_OPS | expand

Commit Message

Martin KaFai Lau Dec. 21, 2019, 6:26 a.m. UTC
This patch adds a bpf_dctcp example.  It currently does not do
no-ECN fallback but the same could be done through the cgrp2-bpf.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
 .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
 tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
 3 files changed, 656 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c

Comments

Andrii Nakryiko Dec. 23, 2019, 11:26 p.m. UTC | #1
On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> This patch adds a bpf_dctcp example.  It currently does not do
> no-ECN fallback but the same could be done through the cgrp2-bpf.
>
> Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> ---
>  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
>  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
>  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
>  3 files changed, 656 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
>
> diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> new file mode 100644
> index 000000000000..7ba8c1b4157a
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> @@ -0,0 +1,228 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __BPF_TCP_HELPERS_H
> +#define __BPF_TCP_HELPERS_H
> +
> +#include <stdbool.h>
> +#include <linux/types.h>
> +#include <bpf_helpers.h>
> +#include <bpf_core_read.h>
> +#include "bpf_trace_helpers.h"
> +
> +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)

Should we try to put those BPF programs into some section that would
indicate they are used with struct ops? libbpf doesn't use or enforce
that (even though it could derive and enforce that they are
STRUCT_OPS programs). So something like
SEC("struct_ops/<ideally-operation-name-here>"). I think having this
convention is very useful for consistency and to do a quick ELF dump
and see what is where. WDYT?

> +
> +struct sock_common {
> +       unsigned char   skc_state;
> +} __attribute__((preserve_access_index));
> +
> +struct sock {
> +       struct sock_common      __sk_common;
> +} __attribute__((preserve_access_index));
> +
> +struct inet_sock {
> +       struct sock             sk;
> +} __attribute__((preserve_access_index));
> +
> +struct inet_connection_sock {
> +       struct inet_sock          icsk_inet;
> +       __u8                      icsk_ca_state:6,
> +                                 icsk_ca_setsockopt:1,
> +                                 icsk_ca_dst_locked:1;
> +       struct {
> +               __u8              pending;
> +       } icsk_ack;
> +       __u64                     icsk_ca_priv[104 / sizeof(__u64)];
> +} __attribute__((preserve_access_index));
> +
> +struct tcp_sock {
> +       struct inet_connection_sock     inet_conn;
> +
> +       __u32   rcv_nxt;
> +       __u32   snd_nxt;
> +       __u32   snd_una;
> +       __u8    ecn_flags;
> +       __u32   delivered;
> +       __u32   delivered_ce;
> +       __u32   snd_cwnd;
> +       __u32   snd_cwnd_cnt;
> +       __u32   snd_cwnd_clamp;
> +       __u32   snd_ssthresh;
> +       __u8    syn_data:1,     /* SYN includes data */
> +               syn_fastopen:1, /* SYN includes Fast Open option */
> +               syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
> +               syn_fastopen_ch:1, /* Active TFO re-enabling probe */
> +               syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
> +               save_syn:1,     /* Save headers of SYN packet */
> +               is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
> +               syn_smc:1;      /* SYN includes SMC */
> +       __u32   max_packets_out;
> +       __u32   lsndtime;
> +       __u32   prior_cwnd;
> +} __attribute__((preserve_access_index));
> +
> +static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
> +{
> +       return (struct inet_connection_sock *)sk;
> +}
> +
> +static __always_inline void *inet_csk_ca(const struct sock *sk)
> +{
> +       return (void *)inet_csk(sk)->icsk_ca_priv;
> +}
> +
> +static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
> +{
> +       return (struct tcp_sock *)sk;
> +}
> +
> +static __always_inline bool before(__u32 seq1, __u32 seq2)
> +{
> +       return (__s32)(seq1-seq2) < 0;
> +}
> +#define after(seq2, seq1)      before(seq1, seq2)
> +
> +#define        TCP_ECN_OK              1
> +#define        TCP_ECN_QUEUE_CWR       2
> +#define        TCP_ECN_DEMAND_CWR      4
> +#define        TCP_ECN_SEEN            8
> +
> +enum inet_csk_ack_state_t {
> +       ICSK_ACK_SCHED  = 1,
> +       ICSK_ACK_TIMER  = 2,
> +       ICSK_ACK_PUSHED = 4,
> +       ICSK_ACK_PUSHED2 = 8,
> +       ICSK_ACK_NOW = 16       /* Send the next ACK immediately (once) */
> +};
> +
> +enum tcp_ca_event {
> +       CA_EVENT_TX_START = 0,
> +       CA_EVENT_CWND_RESTART = 1,
> +       CA_EVENT_COMPLETE_CWR = 2,
> +       CA_EVENT_LOSS = 3,
> +       CA_EVENT_ECN_NO_CE = 4,
> +       CA_EVENT_ECN_IS_CE = 5,
> +};
> +
> +enum tcp_ca_state {
> +       TCP_CA_Open = 0,
> +       TCP_CA_Disorder = 1,
> +       TCP_CA_CWR = 2,
> +       TCP_CA_Recovery = 3,
> +       TCP_CA_Loss = 4
> +};
> +
> +struct ack_sample {
> +       __u32 pkts_acked;
> +       __s32 rtt_us;
> +       __u32 in_flight;
> +} __attribute__((preserve_access_index));
> +
> +struct rate_sample {
> +       __u64  prior_mstamp; /* starting timestamp for interval */
> +       __u32  prior_delivered; /* tp->delivered at "prior_mstamp" */
> +       __s32  delivered;               /* number of packets delivered over interval */
> +       long interval_us;       /* time for tp->delivered to incr "delivered" */
> +       __u32 snd_interval_us;  /* snd interval for delivered packets */
> +       __u32 rcv_interval_us;  /* rcv interval for delivered packets */
> +       long rtt_us;            /* RTT of last (S)ACKed packet (or -1) */
> +       int  losses;            /* number of packets marked lost upon ACK */
> +       __u32  acked_sacked;    /* number of packets newly (S)ACKed upon ACK */
> +       __u32  prior_in_flight; /* in flight before this ACK */
> +       bool is_app_limited;    /* is sample from packet with bubble in pipe? */
> +       bool is_retrans;        /* is sample from retransmission? */
> +       bool is_ack_delayed;    /* is this (likely) a delayed ACK? */
> +} __attribute__((preserve_access_index));
> +
> +#define TCP_CA_NAME_MAX                16
> +#define TCP_CONG_NEEDS_ECN     0x2
> +
> +struct tcp_congestion_ops {
> +       __u32 flags;
> +
> +       /* initialize private data (optional) */
> +       void (*init)(struct sock *sk);
> +       /* cleanup private data  (optional) */
> +       void (*release)(struct sock *sk);
> +
> +       /* return slow start threshold (required) */
> +       __u32 (*ssthresh)(struct sock *sk);
> +       /* do new cwnd calculation (required) */
> +       void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
> +       /* call before changing ca_state (optional) */
> +       void (*set_state)(struct sock *sk, __u8 new_state);
> +       /* call when cwnd event occurs (optional) */
> +       void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
> +       /* call when ack arrives (optional) */
> +       void (*in_ack_event)(struct sock *sk, __u32 flags);
> +       /* new value of cwnd after loss (required) */
> +       __u32  (*undo_cwnd)(struct sock *sk);
> +       /* hook for packet ack accounting (optional) */
> +       void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
> +       /* override sysctl_tcp_min_tso_segs */
> +       __u32 (*min_tso_segs)(struct sock *sk);
> +       /* returns the multiplier used in tcp_sndbuf_expand (optional) */
> +       __u32 (*sndbuf_expand)(struct sock *sk);
> +       /* call when packets are delivered to update cwnd and pacing rate,
> +        * after all the ca_state processing. (optional)
> +        */
> +       void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
> +
> +       char            name[TCP_CA_NAME_MAX];
> +};

Can all of these types come from vmlinux.h instead of being duplicated here?

> +
> +#define min(a, b) ((a) < (b) ? (a) : (b))
> +#define max(a, b) ((a) > (b) ? (a) : (b))
> +#define min_not_zero(x, y) ({                  \
> +       typeof(x) __x = (x);                    \
> +       typeof(y) __y = (y);                    \
> +       __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
> +

[...]

> +static struct bpf_object *load(const char *filename, const char *map_name,
> +                              struct bpf_link **link)
> +{
> +       struct bpf_object *obj;
> +       struct bpf_map *map;
> +       struct bpf_link *l;
> +       int err;
> +
> +       obj = bpf_object__open(filename);
> +       if (CHECK(IS_ERR(obj), "bpf_obj__open_file", "obj:%ld\n",
> +                 PTR_ERR(obj)))
> +               return obj;
> +
> +       err = bpf_object__load(obj);
> +       if (CHECK(err, "bpf_object__load", "err:%d\n", err)) {
> +               bpf_object__close(obj);
> +               return ERR_PTR(err);
> +       }
> +
> +       map = bpf_object__find_map_by_name(obj, map_name);
> +       if (CHECK(!map, "bpf_object__find_map_by_name", "%s not found\n",
> +                   map_name)) {
> +               bpf_object__close(obj);
> +               return ERR_PTR(-ENOENT);
> +       }
> +

use skeleton instead?

> +       l = bpf_map__attach_struct_ops(map);
> +       if (CHECK(IS_ERR(l), "bpf_struct_ops_map__attach", "err:%ld\n",
> +                 PTR_ERR(l))) {
> +               bpf_object__close(obj);
> +               return (void *)l;
> +       }
> +
> +       *link = l;
> +
> +       return obj;
> +}
> +
> +static void test_dctcp(void)
> +{
> +       struct bpf_object *obj;
> +       /* compiler warning... */
> +       struct bpf_link *link = NULL;
> +
> +       obj = load("bpf_dctcp.o", "dctcp", &link);
> +       if (IS_ERR(obj))
> +               return;
> +
> +       do_test("bpf_dctcp");
> +
> +       bpf_link__destroy(link);
> +       bpf_object__close(obj);
> +}
> +
> +void test_bpf_tcp_ca(void)
> +{
> +       if (test__start_subtest("dctcp"))
> +               test_dctcp();
> +}

[...]
Martin KaFai Lau Dec. 24, 2019, 1:31 a.m. UTC | #2
On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > This patch adds a bpf_dctcp example.  It currently does not do
> > no-ECN fallback but the same could be done through the cgrp2-bpf.
> >
> > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > ---
> >  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> >  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
> >  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> >  3 files changed, 656 insertions(+)
> >  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> >  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> >
> > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > new file mode 100644
> > index 000000000000..7ba8c1b4157a
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > @@ -0,0 +1,228 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef __BPF_TCP_HELPERS_H
> > +#define __BPF_TCP_HELPERS_H
> > +
> > +#include <stdbool.h>
> > +#include <linux/types.h>
> > +#include <bpf_helpers.h>
> > +#include <bpf_core_read.h>
> > +#include "bpf_trace_helpers.h"
> > +
> > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
> 
> Should we try to put those BPF programs into some section that would
> indicate they are used with struct opts? libbpf doesn't use or enforce
> that (even though it could to derive and enforce that they are
> STRUCT_OPS programs). So something like
> SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> convention is very useful for consistency and to do a quick ELF dump
> and see what is where. WDYT?
I did not use it here because I don't want any misperception that it is
a required convention by libbpf.

Sure, I can prefix it here and comment that it is just a
convention but not a libbpf requirement.

> 
> > +
> > +struct sock_common {
> > +       unsigned char   skc_state;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct sock {
> > +       struct sock_common      __sk_common;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct inet_sock {
> > +       struct sock             sk;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct inet_connection_sock {
> > +       struct inet_sock          icsk_inet;
> > +       __u8                      icsk_ca_state:6,
> > +                                 icsk_ca_setsockopt:1,
> > +                                 icsk_ca_dst_locked:1;
> > +       struct {
> > +               __u8              pending;
> > +       } icsk_ack;
> > +       __u64                     icsk_ca_priv[104 / sizeof(__u64)];
> > +} __attribute__((preserve_access_index));
> > +
> > +struct tcp_sock {
> > +       struct inet_connection_sock     inet_conn;
> > +
> > +       __u32   rcv_nxt;
> > +       __u32   snd_nxt;
> > +       __u32   snd_una;
> > +       __u8    ecn_flags;
> > +       __u32   delivered;
> > +       __u32   delivered_ce;
> > +       __u32   snd_cwnd;
> > +       __u32   snd_cwnd_cnt;
> > +       __u32   snd_cwnd_clamp;
> > +       __u32   snd_ssthresh;
> > +       __u8    syn_data:1,     /* SYN includes data */
> > +               syn_fastopen:1, /* SYN includes Fast Open option */
> > +               syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
> > +               syn_fastopen_ch:1, /* Active TFO re-enabling probe */
> > +               syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
> > +               save_syn:1,     /* Save headers of SYN packet */
> > +               is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
> > +               syn_smc:1;      /* SYN includes SMC */
> > +       __u32   max_packets_out;
> > +       __u32   lsndtime;
> > +       __u32   prior_cwnd;
> > +} __attribute__((preserve_access_index));
> > +
> > +static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
> > +{
> > +       return (struct inet_connection_sock *)sk;
> > +}
> > +
> > +static __always_inline void *inet_csk_ca(const struct sock *sk)
> > +{
> > +       return (void *)inet_csk(sk)->icsk_ca_priv;
> > +}
> > +
> > +static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
> > +{
> > +       return (struct tcp_sock *)sk;
> > +}
> > +
> > +static __always_inline bool before(__u32 seq1, __u32 seq2)
> > +{
> > +       return (__s32)(seq1-seq2) < 0;
> > +}
> > +#define after(seq2, seq1)      before(seq1, seq2)
> > +
> > +#define        TCP_ECN_OK              1
> > +#define        TCP_ECN_QUEUE_CWR       2
> > +#define        TCP_ECN_DEMAND_CWR      4
> > +#define        TCP_ECN_SEEN            8
> > +
> > +enum inet_csk_ack_state_t {
> > +       ICSK_ACK_SCHED  = 1,
> > +       ICSK_ACK_TIMER  = 2,
> > +       ICSK_ACK_PUSHED = 4,
> > +       ICSK_ACK_PUSHED2 = 8,
> > +       ICSK_ACK_NOW = 16       /* Send the next ACK immediately (once) */
> > +};
> > +
> > +enum tcp_ca_event {
> > +       CA_EVENT_TX_START = 0,
> > +       CA_EVENT_CWND_RESTART = 1,
> > +       CA_EVENT_COMPLETE_CWR = 2,
> > +       CA_EVENT_LOSS = 3,
> > +       CA_EVENT_ECN_NO_CE = 4,
> > +       CA_EVENT_ECN_IS_CE = 5,
> > +};
> > +
> > +enum tcp_ca_state {
> > +       TCP_CA_Open = 0,
> > +       TCP_CA_Disorder = 1,
> > +       TCP_CA_CWR = 2,
> > +       TCP_CA_Recovery = 3,
> > +       TCP_CA_Loss = 4
> > +};
> > +
> > +struct ack_sample {
> > +       __u32 pkts_acked;
> > +       __s32 rtt_us;
> > +       __u32 in_flight;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct rate_sample {
> > +       __u64  prior_mstamp; /* starting timestamp for interval */
> > +       __u32  prior_delivered; /* tp->delivered at "prior_mstamp" */
> > +       __s32  delivered;               /* number of packets delivered over interval */
> > +       long interval_us;       /* time for tp->delivered to incr "delivered" */
> > +       __u32 snd_interval_us;  /* snd interval for delivered packets */
> > +       __u32 rcv_interval_us;  /* rcv interval for delivered packets */
> > +       long rtt_us;            /* RTT of last (S)ACKed packet (or -1) */
> > +       int  losses;            /* number of packets marked lost upon ACK */
> > +       __u32  acked_sacked;    /* number of packets newly (S)ACKed upon ACK */
> > +       __u32  prior_in_flight; /* in flight before this ACK */
> > +       bool is_app_limited;    /* is sample from packet with bubble in pipe? */
> > +       bool is_retrans;        /* is sample from retransmission? */
> > +       bool is_ack_delayed;    /* is this (likely) a delayed ACK? */
> > +} __attribute__((preserve_access_index));
> > +
> > +#define TCP_CA_NAME_MAX                16
> > +#define TCP_CONG_NEEDS_ECN     0x2
> > +
> > +struct tcp_congestion_ops {
> > +       __u32 flags;
> > +
> > +       /* initialize private data (optional) */
> > +       void (*init)(struct sock *sk);
> > +       /* cleanup private data  (optional) */
> > +       void (*release)(struct sock *sk);
> > +
> > +       /* return slow start threshold (required) */
> > +       __u32 (*ssthresh)(struct sock *sk);
> > +       /* do new cwnd calculation (required) */
> > +       void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
> > +       /* call before changing ca_state (optional) */
> > +       void (*set_state)(struct sock *sk, __u8 new_state);
> > +       /* call when cwnd event occurs (optional) */
> > +       void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
> > +       /* call when ack arrives (optional) */
> > +       void (*in_ack_event)(struct sock *sk, __u32 flags);
> > +       /* new value of cwnd after loss (required) */
> > +       __u32  (*undo_cwnd)(struct sock *sk);
> > +       /* hook for packet ack accounting (optional) */
> > +       void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
> > +       /* override sysctl_tcp_min_tso_segs */
> > +       __u32 (*min_tso_segs)(struct sock *sk);
> > +       /* returns the multiplier used in tcp_sndbuf_expand (optional) */
> > +       __u32 (*sndbuf_expand)(struct sock *sk);
> > +       /* call when packets are delivered to update cwnd and pacing rate,
> > +        * after all the ca_state processing. (optional)
> > +        */
> > +       void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
> > +
> > +       char            name[TCP_CA_NAME_MAX];
> > +};
> 
> Can all of these types come from vmlinux.h instead of being duplicated here?
It can but I prefer leaving it as is in bpf_tcp_helpers.h like another
existing test in kfree_skb.c.  Without directly using the same struct in
vmlinux.h,  I think it is a good test for libbpf.
That reminds me to shuffle the member ordering a little in tcp_congestion_ops
here.

> 
> > +
> > +#define min(a, b) ((a) < (b) ? (a) : (b))
> > +#define max(a, b) ((a) > (b) ? (a) : (b))
> > +#define min_not_zero(x, y) ({                  \
> > +       typeof(x) __x = (x);                    \
> > +       typeof(y) __y = (y);                    \
> > +       __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
> > +
> 
> [...]
> 
> > +static struct bpf_object *load(const char *filename, const char *map_name,
> > +                              struct bpf_link **link)
> > +{
> > +       struct bpf_object *obj;
> > +       struct bpf_map *map;
> > +       struct bpf_link *l;
> > +       int err;
> > +
> > +       obj = bpf_object__open(filename);
> > +       if (CHECK(IS_ERR(obj), "bpf_obj__open_file", "obj:%ld\n",
> > +                 PTR_ERR(obj)))
> > +               return obj;
> > +
> > +       err = bpf_object__load(obj);
> > +       if (CHECK(err, "bpf_object__load", "err:%d\n", err)) {
> > +               bpf_object__close(obj);
> > +               return ERR_PTR(err);
> > +       }
> > +
> > +       map = bpf_object__find_map_by_name(obj, map_name);
> > +       if (CHECK(!map, "bpf_object__find_map_by_name", "%s not found\n",
> > +                   map_name)) {
> > +               bpf_object__close(obj);
> > +               return ERR_PTR(-ENOENT);
> > +       }
> > +
> 
> use skeleton instead?
Will give it a spin.

> 
> > +       l = bpf_map__attach_struct_ops(map);
> > +       if (CHECK(IS_ERR(l), "bpf_struct_ops_map__attach", "err:%ld\n",
> > +                 PTR_ERR(l))) {
> > +               bpf_object__close(obj);
> > +               return (void *)l;
> > +       }
> > +
> > +       *link = l;
> > +
> > +       return obj;
> > +}
> > +
Andrii Nakryiko Dec. 24, 2019, 7:01 a.m. UTC | #3
On Mon, Dec 23, 2019 at 5:31 PM Martin Lau <kafai@fb.com> wrote:
>
> On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> > On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > >
> > > This patch adds a bpf_dctcp example.  It currently does not do
> > > no-ECN fallback but the same could be done through the cgrp2-bpf.
> > >
> > > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > > ---
> > >  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> > >  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
> > >  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> > >  3 files changed, 656 insertions(+)
> > >  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > >  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> > >  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> > >
> > > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > new file mode 100644
> > > index 000000000000..7ba8c1b4157a
> > > --- /dev/null
> > > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > @@ -0,0 +1,228 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +#ifndef __BPF_TCP_HELPERS_H
> > > +#define __BPF_TCP_HELPERS_H
> > > +
> > > +#include <stdbool.h>
> > > +#include <linux/types.h>
> > > +#include <bpf_helpers.h>
> > > +#include <bpf_core_read.h>
> > > +#include "bpf_trace_helpers.h"
> > > +
> > > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
> >
> > Should we try to put those BPF programs into some section that would
> > indicate they are used with struct opts? libbpf doesn't use or enforce
> > that (even though it could to derive and enforce that they are
> > STRUCT_OPS programs). So something like
> > SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> > convention is very useful for consistency and to do a quick ELF dump
> > and see what is where. WDYT?
> I did not use it here because I don't want any misperception that it is
> a required convention by libbpf.
>
> Sure, I can prefix it here and comment that it is just a
> convention but not a libbpf's requirement.

Well, we can actually make it a requirement of sorts. Currently your
code expects that BPF program's type is UNSPEC and then it sets it to
STRUCT_OPS. Alternatively we can say that any BPF program in
SEC("struct_ops/<whatever>") will be automatically assigned
the STRUCT_OPS BPF program type (which is done generically in
bpf_object__open()), and then as .struct_ops section is parsed, all
those programs will be "assembled" by the code you added into a
struct_ops map.

It's a requirement "of sorts", because even if user doesn't do that,
stuff will still work, if user manually will call
bpf_program__set_struct_ops(prog). Which actually reminds me that it
would be good to add bpf_program__set_struct_ops() and
bpf_program__is_struct_ops() APIs for completeness, similarly to how
KP's LSM patch set does.

BTW, libbpf will emit debug message for every single BPF program it
doesn't recognize section for, so it is still nice to have it be
something more or less standardized and recognizable by libbpf.

>
> >
> > > +

[...]

> >
> > Can all of these types come from vmlinux.h instead of being duplicated here?
> It can but I prefer leaving it as is in bpf_tcp_helpers.h like another
> existing test in kfree_skb.c.  Without directly using the same struct in
> vmlinux.h,  I think it is a good test for libbpf.
> That remind me to shuffle the member ordering a little in tcp_congestion_ops
> here.

Sure no problem. When I looked at this it was a bit discouraging
how many types I'd need to duplicate, but surely we don't want to make
an impression that vmlinux.h is the only way to achieve this.

>
> >
> > > +
> > > +#define min(a, b) ((a) < (b) ? (a) : (b))
> > > +#define max(a, b) ((a) > (b) ? (a) : (b))
> > > +#define min_not_zero(x, y) ({                  \
> > > +       typeof(x) __x = (x);                    \
> > > +       typeof(y) __y = (y);                    \
> > > +       __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
> > > +
> >
> > [...]
> >
> > > +static struct bpf_object *load(const char *filename, const char *map_name,
> > > +                              struct bpf_link **link)
> > > +{
> > > +       struct bpf_object *obj;
> > > +       struct bpf_map *map;
> > > +       struct bpf_link *l;
> > > +       int err;
> > > +
> > > +       obj = bpf_object__open(filename);
> > > +       if (CHECK(IS_ERR(obj), "bpf_obj__open_file", "obj:%ld\n",
> > > +                 PTR_ERR(obj)))
> > > +               return obj;
> > > +
> > > +       err = bpf_object__load(obj);
> > > +       if (CHECK(err, "bpf_object__load", "err:%d\n", err)) {
> > > +               bpf_object__close(obj);
> > > +               return ERR_PTR(err);
> > > +       }
> > > +
> > > +       map = bpf_object__find_map_by_name(obj, map_name);
> > > +       if (CHECK(!map, "bpf_object__find_map_by_name", "%s not found\n",
> > > +                   map_name)) {
> > > +               bpf_object__close(obj);
> > > +               return ERR_PTR(-ENOENT);
> > > +       }
> > > +
> >
> > use skeleton instead?
> Will give it a spin.
>
> >
> > > +       l = bpf_map__attach_struct_ops(map);
> > > +       if (CHECK(IS_ERR(l), "bpf_struct_ops_map__attach", "err:%ld\n",
> > > +                 PTR_ERR(l))) {
> > > +               bpf_object__close(obj);
> > > +               return (void *)l;
> > > +       }
> > > +
> > > +       *link = l;
> > > +
> > > +       return obj;
> > > +}
> > > +
Martin KaFai Lau Dec. 24, 2019, 7:32 a.m. UTC | #4
On Mon, Dec 23, 2019 at 11:01:55PM -0800, Andrii Nakryiko wrote:
> > > Can all of these types come from vmlinux.h instead of being duplicated here?
> > It can but I prefer leaving it as is in bpf_tcp_helpers.h like another
> > existing test in kfree_skb.c.  Without directly using the same struct in
> > vmlinux.h,  I think it is a good test for libbpf.
> > That remind me to shuffle the member ordering a little in tcp_congestion_ops
> > here.
> 
> Sure no problem. When I looked at this it was a bit discouraging on
> how much types I'd need to duplicate, but surely we don't want to make
> an impression that vmlinux.h is the only way to achieve this.
IMO, it is a very compact set of fields that work for both dctcp and cubic.
Also, duplication is not a concern here.  The deviation between
the kernel and bpf_tcp_helpers.h is not a problem with CO-RE.
Martin KaFai Lau Dec. 24, 2019, 4:50 p.m. UTC | #5
On Mon, Dec 23, 2019 at 11:01:55PM -0800, Andrii Nakryiko wrote:
> On Mon, Dec 23, 2019 at 5:31 PM Martin Lau <kafai@fb.com> wrote:
> >
> > On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> > > On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > >
> > > > This patch adds a bpf_dctcp example.  It currently does not do
> > > > no-ECN fallback but the same could be done through the cgrp2-bpf.
> > > >
> > > > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > > > ---
> > > >  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> > > >  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
> > > >  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> > > >  3 files changed, 656 insertions(+)
> > > >  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > >  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> > > >  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> > > >
> > > > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > new file mode 100644
> > > > index 000000000000..7ba8c1b4157a
> > > > --- /dev/null
> > > > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > @@ -0,0 +1,228 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > +#ifndef __BPF_TCP_HELPERS_H
> > > > +#define __BPF_TCP_HELPERS_H
> > > > +
> > > > +#include <stdbool.h>
> > > > +#include <linux/types.h>
> > > > +#include <bpf_helpers.h>
> > > > +#include <bpf_core_read.h>
> > > > +#include "bpf_trace_helpers.h"
> > > > +
> > > > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > >
> > > Should we try to put those BPF programs into some section that would
> > > indicate they are used with struct opts? libbpf doesn't use or enforce
> > > that (even though it could to derive and enforce that they are
> > > STRUCT_OPS programs). So something like
> > > SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> > > convention is very useful for consistency and to do a quick ELF dump
> > > and see what is where. WDYT?
> > I did not use it here because I don't want any misperception that it is
> > a required convention by libbpf.
> >
> > Sure, I can prefix it here and comment that it is just a
> > convention but not a libbpf's requirement.
> 
> Well, we can actually make it a requirement of sorts. Currently your
> code expects that BPF program's type is UNSPEC and then it sets it to
> STRUCT_OPS. Alternatively we can say that any BPF program in
> SEC("struct_ops/<whatever>") will be automatically assigned
> STRUCT_OPS BPF program type (which is done generically in
> bpf_object__open()), and then as .struct_ops section is parsed, all
> those programs will be "assembled" by the code you added into a
> struct_ops map.
Setting BPF_PROG_TYPE_STRUCT_OPS can be done automatically at open
phase (during collect_reloc time).  I will make this change.

> 
> It's a requirement "of sorts", because even if user doesn't do that,
> stuff will still work, if user manually will call
> bpf_program__set_struct_ops(prog). Which actually reminds me that it
> would be good to add bpf_program__set_struct_ops() and
Although there is BPF_PROG_TYPE_FNS macro, 
I don't see how moving bpf_prog__set_struct_ops(prog) to LIBBPF_API is useful,
while it actually may cause confusion and error.  How could __set_struct_ops(),
which sets a prog to the struct_ops prog_type, help a program that is not used in
SEC(".struct_ops") to be loaded successfully as a struct_ops prog?

Assigning a bpf_prog to a function ptr under the SEC(".struct_ops")
is the only way for a program to be successfully loaded as
struct_ops prog type.  Extra way to allow a prog to be changed to
struct_ops prog_type is either useless or redundant.

If it is really necessary to have __set_struct_ops() as a API
for completeness, it can be added...

> bpf_program__is_struct_ops() APIs for completeness, similarly to how
is_struct_ops() makes sense.

> BTW, libbpf will emit debug message for every single BPF program it
> doesn't recognize section for, so it is still nice to have it be
> something more or less standardized and recognizable by libbpf.
I can make this debug (not error) message go away too after
setting the BPF_PROG_TYPE_STRUCT_OPS automatically at open time.
Andrii Nakryiko Dec. 26, 2019, 7:02 p.m. UTC | #6
On Tue, Dec 24, 2019 at 8:50 AM Martin Lau <kafai@fb.com> wrote:
>
> On Mon, Dec 23, 2019 at 11:01:55PM -0800, Andrii Nakryiko wrote:
> > On Mon, Dec 23, 2019 at 5:31 PM Martin Lau <kafai@fb.com> wrote:
> > >
> > > On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> > > > On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > >
> > > > > This patch adds a bpf_dctcp example.  It currently does not do
> > > > > no-ECN fallback but the same could be done through the cgrp2-bpf.
> > > > >
> > > > > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > > > > ---
> > > > >  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> > > > >  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
> > > > >  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> > > > >  3 files changed, 656 insertions(+)
> > > > >  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > >  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> > > > >  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> > > > >
> > > > > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > new file mode 100644
> > > > > index 000000000000..7ba8c1b4157a
> > > > > --- /dev/null
> > > > > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > @@ -0,0 +1,228 @@
> > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > +#ifndef __BPF_TCP_HELPERS_H
> > > > > +#define __BPF_TCP_HELPERS_H
> > > > > +
> > > > > +#include <stdbool.h>
> > > > > +#include <linux/types.h>
> > > > > +#include <bpf_helpers.h>
> > > > > +#include <bpf_core_read.h>
> > > > > +#include "bpf_trace_helpers.h"
> > > > > +
> > > > > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > >
> > > > Should we try to put those BPF programs into some section that would
> > > > indicate they are used with struct opts? libbpf doesn't use or enforce
> > > > that (even though it could to derive and enforce that they are
> > > > STRUCT_OPS programs). So something like
> > > > SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> > > > convention is very useful for consistency and to do a quick ELF dump
> > > > and see what is where. WDYT?
> > > I did not use it here because I don't want any misperception that it is
> > > a required convention by libbpf.
> > >
> > > Sure, I can prefix it here and comment that it is just a
> > > convention but not a libbpf's requirement.
> >
> > Well, we can actually make it a requirement of sorts. Currently your
> > code expects that BPF program's type is UNSPEC and then it sets it to
> > STRUCT_OPS. Alternatively we can say that any BPF program in
> > SEC("struct_ops/<whatever>") will be automatically assigned
> > STRUCT_OPS BPF program type (which is done generically in
> > bpf_object__open()), and then as .struct_ops section is parsed, all
> > those programs will be "assembled" by the code you added into a
> > struct_ops map.
> Setting BPF_PROG_TYPE_STRUCT_OPS can be done automatically at open
> phase (during collect_reloc time).  I will make this change.
>

Can you please extend the existing logic in __bpf_object__open() to do
this? See how libbpf_prog_type_by_name() is used for that.

> >
> > It's a requirement "of sorts", because even if user doesn't do that,
> > stuff will still work, if user manually will call
> > bpf_program__set_struct_ops(prog). Which actually reminds me that it
> > would be good to add bpf_program__set_struct_ops() and
> Although there is BPF_PROG_TYPE_FNS macro,
> I don't see moving bpf_prog__set_struct_ops(prog) to LIBBPF_API is useful
> while actually may cause confusion and error.  How could __set_struct_ops()
> a prog to struct_ops prog_type help a program, which is not used in
> SEC(".struct_ops"), to be loaded successfully as a struct_ops prog?
>
> Assigning a bpf_prog to a function ptr under the SEC(".struct_ops")
> is the only way for a program to be successfully loaded as
> struct_ops prog type.  Extra way to allow a prog to be changed to
> struct_ops prog_type is either useless or redundant.

Well, first of all, just for consistency with everything else. We have
such methods for all prog_types, so I'd like to avoid a special
snowflake one that doesn't.

Second, while high-level libbpf API provides all the magic to
construct STRUCT_OPS map based on .struct_ops section types,
technically, user might decide to do that using low-level map creation
API, right? So not making unnecessary assumptions and providing
complete APIs is a good thing, IMO. Especially if it costs basically
nothing in terms of code and maintenance.

>
> If it is really necessary to have __set_struct_ops() as a API
> for completeness, it can be added...
>
> > bpf_program__is_struct_ops() APIs for completeness, similarly to how
> is_struct_ops() makes sense.
>
> > BTW, libbpf will emit debug message for every single BPF program it
> > doesn't recognize section for, so it is still nice to have it be
> > something more or less standardized and recognizable by libbpf.
> I can make this debug (not error) message go away too after
> setting the BPF_PROG_TYPE_STRUCT_OPS automatically at open time.

Yep, that would be great.
Martin KaFai Lau Dec. 26, 2019, 8:25 p.m. UTC | #7
On Thu, Dec 26, 2019 at 11:02:26AM -0800, Andrii Nakryiko wrote:
> On Tue, Dec 24, 2019 at 8:50 AM Martin Lau <kafai@fb.com> wrote:
> >
> > On Mon, Dec 23, 2019 at 11:01:55PM -0800, Andrii Nakryiko wrote:
> > > On Mon, Dec 23, 2019 at 5:31 PM Martin Lau <kafai@fb.com> wrote:
> > > >
> > > > On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> > > > > On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > > >
> > > > > > This patch adds a bpf_dctcp example.  It currently does not do
> > > > > > no-ECN fallback but the same could be done through the cgrp2-bpf.
> > > > > >
> > > > > > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > > > > > ---
> > > > > >  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> > > > > >  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
> > > > > >  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> > > > > >  3 files changed, 656 insertions(+)
> > > > > >  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > >  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> > > > > >  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> > > > > >
> > > > > > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > new file mode 100644
> > > > > > index 000000000000..7ba8c1b4157a
> > > > > > --- /dev/null
> > > > > > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > @@ -0,0 +1,228 @@
> > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > +#ifndef __BPF_TCP_HELPERS_H
> > > > > > +#define __BPF_TCP_HELPERS_H
> > > > > > +
> > > > > > +#include <stdbool.h>
> > > > > > +#include <linux/types.h>
> > > > > > +#include <bpf_helpers.h>
> > > > > > +#include <bpf_core_read.h>
> > > > > > +#include "bpf_trace_helpers.h"
> > > > > > +
> > > > > > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > >
> > > > > Should we try to put those BPF programs into some section that would
> > > > > indicate they are used with struct opts? libbpf doesn't use or enforce
> > > > > that (even though it could to derive and enforce that they are
> > > > > STRUCT_OPS programs). So something like
> > > > > SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> > > > > convention is very useful for consistency and to do a quick ELF dump
> > > > > and see what is where. WDYT?
> > > > I did not use it here because I don't want any misperception that it is
> > > > a required convention by libbpf.
> > > >
> > > > Sure, I can prefix it here and comment that it is just a
> > > > convention but not a libbpf's requirement.
> > >
> > > Well, we can actually make it a requirement of sorts. Currently your
> > > code expects that BPF program's type is UNSPEC and then it sets it to
> > > STRUCT_OPS. Alternatively we can say that any BPF program in
> > > SEC("struct_ops/<whatever>") will be automatically assigned
> > > STRUCT_OPS BPF program type (which is done generically in
> > > bpf_object__open()), and then as .struct_ops section is parsed, all
> > > those programs will be "assembled" by the code you added into a
> > > struct_ops map.
> > Setting BPF_PROG_TYPE_STRUCT_OPS can be done automatically at open
> > phase (during collect_reloc time).  I will make this change.
> >
> 
> Can you please extend the existing logic in __bpf_object__open() to do
> this? See how libbpf_prog_type_by_name() is used for that.
Does it have to call libbpf_prog_type_by_name() if everything
has already been decided by the earlier
bpf_object__collect_struct_ops_map_reloc()?

> 
> > >
> > > It's a requirement "of sorts", because even if user doesn't do that,
> > > stuff will still work, if user manually will call
> > > bpf_program__set_struct_ops(prog). Which actually reminds me that it
> > > would be good to add bpf_program__set_struct_ops() and
> > Although there is BPF_PROG_TYPE_FNS macro,
> > I don't see moving bpf_prog__set_struct_ops(prog) to LIBBPF_API is useful
> > while actually may cause confusion and error.  How could __set_struct_ops()
> > a prog to struct_ops prog_type help a program, which is not used in
> > SEC(".struct_ops"), to be loaded successfully as a struct_ops prog?
> >
> > Assigning a bpf_prog to a function ptr under the SEC(".struct_ops")
> > is the only way for a program to be successfully loaded as
> > struct_ops prog type.  Extra way to allow a prog to be changed to
> > struct_ops prog_type is either useless or redundant.
> 
> Well, first of all, just for consistency with everything else. We have
> such methods for all prog_types, so I'd like to avoid a special
> snowflake one that doesn't.
Yes, for consistency is fine as I mentioned in the earlier reply,
as long as it is understood the usefulness of it.

> Second, while high-level libbpf API provides all the magic to
> construct STRUCT_OPS map based on .struct_ops section types,
> technically, user might decide to do that using low-level map creation
> API, right?
How?

Correct that the map api is reused as is in SEC(".struct_ops").

For prog, AFAICT, it is not possible to create struct_ops
prog from raw and use it in struct_ops map unless more LIBBPF_API
is added.  Lets put aside the need to find the btf_vmlinux
and its btf-types...etc.  At least, there is no LIBBPF_API to
set prog->attach_btf_id.  Considering the amount of preparation
is needed to create a struct_ops map from raw,  I would like
to see a real use case first before even considering what else
is needed and add another LIBBPF_API that may not be used.
Andrii Nakryiko Dec. 26, 2019, 8:48 p.m. UTC | #8
On Thu, Dec 26, 2019 at 12:25 PM Martin Lau <kafai@fb.com> wrote:
>
> On Thu, Dec 26, 2019 at 11:02:26AM -0800, Andrii Nakryiko wrote:
> > On Tue, Dec 24, 2019 at 8:50 AM Martin Lau <kafai@fb.com> wrote:
> > >
> > > On Mon, Dec 23, 2019 at 11:01:55PM -0800, Andrii Nakryiko wrote:
> > > > On Mon, Dec 23, 2019 at 5:31 PM Martin Lau <kafai@fb.com> wrote:
> > > > >
> > > > > On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> > > > > > On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > > > >
> > > > > > > This patch adds a bpf_dctcp example.  It currently does not do
> > > > > > > no-ECN fallback but the same could be done through the cgrp2-bpf.
> > > > > > >
> > > > > > > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > > > > > > ---
> > > > > > >  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> > > > > > >  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
> > > > > > >  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> > > > > > >  3 files changed, 656 insertions(+)
> > > > > > >  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > >  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> > > > > > >  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> > > > > > >
> > > > > > > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > > new file mode 100644
> > > > > > > index 000000000000..7ba8c1b4157a
> > > > > > > --- /dev/null
> > > > > > > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > > @@ -0,0 +1,228 @@
> > > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > > +#ifndef __BPF_TCP_HELPERS_H
> > > > > > > +#define __BPF_TCP_HELPERS_H
> > > > > > > +
> > > > > > > +#include <stdbool.h>
> > > > > > > +#include <linux/types.h>
> > > > > > > +#include <bpf_helpers.h>
> > > > > > > +#include <bpf_core_read.h>
> > > > > > > +#include "bpf_trace_helpers.h"
> > > > > > > +
> > > > > > > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > >
> > > > > > Should we try to put those BPF programs into some section that would
> > > > > > indicate they are used with struct opts? libbpf doesn't use or enforce
> > > > > > that (even though it could to derive and enforce that they are
> > > > > > STRUCT_OPS programs). So something like
> > > > > > SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> > > > > > convention is very useful for consistency and to do a quick ELF dump
> > > > > > and see what is where. WDYT?
> > > > > I did not use it here because I don't want any misperception that it is
> > > > > a required convention by libbpf.
> > > > >
> > > > > Sure, I can prefix it here and comment that it is just a
> > > > > convention but not a libbpf's requirement.
> > > >
> > > > Well, we can actually make it a requirement of sorts. Currently your
> > > > code expects that BPF program's type is UNSPEC and then it sets it to
> > > > STRUCT_OPS. Alternatively we can say that any BPF program in
> > > > SEC("struct_ops/<whatever>") will be automatically assigned
> > > > STRUCT_OPS BPF program type (which is done generically in
> > > > bpf_object__open()), and then as .struct_ops section is parsed, all
> > > > those programs will be "assembled" by the code you added into a
> > > > struct_ops map.
> > > Setting BPF_PROG_TYPE_STRUCT_OPS can be done automatically at open
> > > phase (during collect_reloc time).  I will make this change.
> > >
> >
> > Can you please extend the existing logic in __bpf_object__open() to do
> > this? See how libbpf_prog_type_by_name() is used for that.
> Does it have to call libbpf_prog_type_by_name() if everything
> has already been decided by the earlier
> bpf_object__collect_struct_ops_map_reloc()?

We can certainly change the logic to omit guessing program type if
it's already set to something else than UNSPEC.

But all I'm asking is that instead of using #fname"_sec" section name,
is to use "struct_ops/"#fname, because it's consistent with all other
program types. If you do that, then you don't have to do anything
extra (well, add single entry to section_defs, of course), it will
just work as is.

>
> >
> > > >
> > > > It's a requirement "of sorts", because even if user doesn't do that,
> > > > stuff will still work, if user manually will call
> > > > bpf_program__set_struct_ops(prog). Which actually reminds me that it
> > > > would be good to add bpf_program__set_struct_ops() and
> > > Although there is BPF_PROG_TYPE_FNS macro,
> > > I don't see moving bpf_prog__set_struct_ops(prog) to LIBBPF_API is useful
> > > while actually may cause confusion and error.  How could __set_struct_ops()
> > > a prog to struct_ops prog_type help a program, which is not used in
> > > SEC(".struct_ops"), to be loaded successfully as a struct_ops prog?
> > >
> > > Assigning a bpf_prog to a function ptr under the SEC(".struct_ops")
> > > is the only way for a program to be successfully loaded as
> > > struct_ops prog type.  Extra way to allow a prog to be changed to
> > > struct_ops prog_type is either useless or redundant.
> >
> > Well, first of all, just for consistency with everything else. We have
> > such methods for all prog_types, so I'd like to avoid a special
> > snowflake one that doesn't.
> Yes, for consistency is fine as I mentioned in the earlier reply,
> as long as it is understood the usefulness of it.
>
> > Second, while high-level libbpf API provides all the magic to
> > construct STRUCT_OPS map based on .struct_ops section types,
> > technically, user might decide to do that using low-level map creation
> > API, right?
> How?
>
> Correct that the map api is reused as is in SEC(".struct_ops").
>
> For prog, AFAICT, it is not possible to create struct_ops
> prog from raw and use it in struct_ops map unless more LIBBPF_API
> is added.  Lets put aside the need to find the btf_vmlinux
> and its btf-types...etc.  At least, there is no LIBBPF_API to
> set prog->attach_btf_id.  Considering the amount of preparation
> is needed to create a struct_ops map from raw,  I would like
> to see a real use case first before even considering what else
> is needed and add another LIBBPF_API that may not be used.

To be clear, I don't think anyone in their right mind should do this
by hand. I'm just saying that in the end it's not magic, just calls to
low-level map APIs. See above, though, all I care is consistent
pattern for sections names: "program_type/<whatever-makes-sense>".
Martin KaFai Lau Dec. 26, 2019, 10:20 p.m. UTC | #9
On Thu, Dec 26, 2019 at 12:48:09PM -0800, Andrii Nakryiko wrote:
> On Thu, Dec 26, 2019 at 12:25 PM Martin Lau <kafai@fb.com> wrote:
> >
> > On Thu, Dec 26, 2019 at 11:02:26AM -0800, Andrii Nakryiko wrote:
> > > On Tue, Dec 24, 2019 at 8:50 AM Martin Lau <kafai@fb.com> wrote:
> > > >
> > > > On Mon, Dec 23, 2019 at 11:01:55PM -0800, Andrii Nakryiko wrote:
> > > > > On Mon, Dec 23, 2019 at 5:31 PM Martin Lau <kafai@fb.com> wrote:
> > > > > >
> > > > > > On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> > > > > > > On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > > > > >
> > > > > > > > This patch adds a bpf_dctcp example.  It currently does not do
> > > > > > > > no-ECN fallback but the same could be done through the cgrp2-bpf.
> > > > > > > >
> > > > > > > > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > > > > > > > ---
> > > > > > > >  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> > > > > > > >  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
> > > > > > > >  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> > > > > > > >  3 files changed, 656 insertions(+)
> > > > > > > >  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > > >  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> > > > > > > >  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> > > > > > > >
> > > > > > > > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > > > new file mode 100644
> > > > > > > > index 000000000000..7ba8c1b4157a
> > > > > > > > --- /dev/null
> > > > > > > > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > > > @@ -0,0 +1,228 @@
> > > > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > > > +#ifndef __BPF_TCP_HELPERS_H
> > > > > > > > +#define __BPF_TCP_HELPERS_H
> > > > > > > > +
> > > > > > > > +#include <stdbool.h>
> > > > > > > > +#include <linux/types.h>
> > > > > > > > +#include <bpf_helpers.h>
> > > > > > > > +#include <bpf_core_read.h>
> > > > > > > > +#include "bpf_trace_helpers.h"
> > > > > > > > +
> > > > > > > > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > >
> > > > > > > Should we try to put those BPF programs into some section that would
> > > > > > > indicate they are used with struct opts? libbpf doesn't use or enforce
> > > > > > > that (even though it could to derive and enforce that they are
> > > > > > > STRUCT_OPS programs). So something like
> > > > > > > SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> > > > > > > convention is very useful for consistency and to do a quick ELF dump
> > > > > > > and see what is where. WDYT?
> > > > > > I did not use it here because I don't want any misperception that it is
> > > > > > a required convention by libbpf.
> > > > > >
> > > > > > Sure, I can prefix it here and comment that it is just a
> > > > > > convention but not a libbpf's requirement.
> > > > >
> > > > > Well, we can actually make it a requirement of sorts. Currently your
> > > > > code expects that BPF program's type is UNSPEC and then it sets it to
> > > > > STRUCT_OPS. Alternatively we can say that any BPF program in
> > > > > SEC("struct_ops/<whatever>") will be automatically assigned
> > > > > STRUCT_OPS BPF program type (which is done generically in
> > > > > bpf_object__open()), and then as .struct_ops section is parsed, all
> > > > > those programs will be "assembled" by the code you added into a
> > > > > struct_ops map.
> > > > Setting BPF_PROG_TYPE_STRUCT_OPS can be done automatically at open
> > > > phase (during collect_reloc time).  I will make this change.
> > > >
> > >
> > > Can you please extend the existing logic in __bpf_object__open() to do
> > > this? See how libbpf_prog_type_by_name() is used for that.
> > Does it have to call libbpf_prog_type_by_name() if everything
> > has already been decided by the earlier
> > bpf_object__collect_struct_ops_map_reloc()?
> 
> We can certainly change the logic to omit guessing program type if
> it's already set to something else than UNSPEC.
> 
> But all I'm asking is that instead of using #fname"_sec" section name,
> is to use "struct_ops/"#fname, because it's consistent with all other
> program types. If you do that, then you don't have to do anything
> extra (well, add single entry to section_defs, of course), it will
> just work as is.
Re: adding "struct_ops/" to section_defs,
Sure. as long as SEC(".struct_ops") can use prog that
libbpf_prog_type_by_name() concluded it is either -ESRCH or
STRUCT_OPS.

It is not the only change though.  Other changes are still
needed in collect_reloc (e.g. check prog type mismatch).
They won't be much though.
Andrii Nakryiko Dec. 26, 2019, 10:25 p.m. UTC | #10
On Thu, Dec 26, 2019 at 2:20 PM Martin Lau <kafai@fb.com> wrote:
>
> On Thu, Dec 26, 2019 at 12:48:09PM -0800, Andrii Nakryiko wrote:
> > On Thu, Dec 26, 2019 at 12:25 PM Martin Lau <kafai@fb.com> wrote:
> > >
> > > On Thu, Dec 26, 2019 at 11:02:26AM -0800, Andrii Nakryiko wrote:
> > > > On Tue, Dec 24, 2019 at 8:50 AM Martin Lau <kafai@fb.com> wrote:
> > > > >
> > > > > On Mon, Dec 23, 2019 at 11:01:55PM -0800, Andrii Nakryiko wrote:
> > > > > > On Mon, Dec 23, 2019 at 5:31 PM Martin Lau <kafai@fb.com> wrote:
> > > > > > >
> > > > > > > On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> > > > > > > > On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > > > > > > >
> > > > > > > > > This patch adds a bpf_dctcp example.  It currently does not do
> > > > > > > > > no-ECN fallback but the same could be done through the cgrp2-bpf.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Martin KaFai Lau <kafai@fb.com>
> > > > > > > > > ---
> > > > > > > > >  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> > > > > > > > >  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
> > > > > > > > >  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> > > > > > > > >  3 files changed, 656 insertions(+)
> > > > > > > > >  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > > > >  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> > > > > > > > >  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> > > > > > > > >
> > > > > > > > > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > > > > new file mode 100644
> > > > > > > > > index 000000000000..7ba8c1b4157a
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > > > > > > > > @@ -0,0 +1,228 @@
> > > > > > > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > > > > > > +#ifndef __BPF_TCP_HELPERS_H
> > > > > > > > > +#define __BPF_TCP_HELPERS_H
> > > > > > > > > +
> > > > > > > > > +#include <stdbool.h>
> > > > > > > > > +#include <linux/types.h>
> > > > > > > > > +#include <bpf_helpers.h>
> > > > > > > > > +#include <bpf_core_read.h>
> > > > > > > > > +#include "bpf_trace_helpers.h"
> > > > > > > > > +
> > > > > > > > > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > > > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > > > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > > > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > > > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > > > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > > > > > > >
> > > > > > > > Should we try to put those BPF programs into some section that would
> > > > > > > > indicate they are used with struct opts? libbpf doesn't use or enforce
> > > > > > > > that (even though it could to derive and enforce that they are
> > > > > > > > STRUCT_OPS programs). So something like
> > > > > > > > SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> > > > > > > > convention is very useful for consistency and to do a quick ELF dump
> > > > > > > > and see what is where. WDYT?
> > > > > > > I did not use it here because I don't want any misperception that it is
> > > > > > > a required convention by libbpf.
> > > > > > >
> > > > > > > Sure, I can prefix it here and comment that it is just a
> > > > > > > convention but not a libbpf's requirement.
> > > > > >
> > > > > > Well, we can actually make it a requirement of sorts. Currently your
> > > > > > code expects that BPF program's type is UNSPEC and then it sets it to
> > > > > > STRUCT_OPS. Alternatively we can say that any BPF program in
> > > > > > SEC("struct_ops/<whatever>") will be automatically assigned
> > > > > > STRUCT_OPS BPF program type (which is done generically in
> > > > > > bpf_object__open()), and then as .struct_ops section is parsed, all
> > > > > > those programs will be "assembled" by the code you added into a
> > > > > > struct_ops map.
> > > > > Setting BPF_PROG_TYPE_STRUCT_OPS can be done automatically at open
> > > > > phase (during collect_reloc time).  I will make this change.
> > > > >
> > > >
> > > > Can you please extend the existing logic in __bpf_object__open() to do
> > > > this? See how libbpf_prog_type_by_name() is used for that.
> > > Does it have to call libbpf_prog_type_by_name() if everything
> > > has already been decided by the earlier
> > > bpf_object__collect_struct_ops_map_reloc()?
> >
> > We can certainly change the logic to omit guessing program type if
> > it's already set to something else than UNSPEC.
> >
> > But all I'm asking is that instead of using #fname"_sec" section name,
> > is to use "struct_ops/"#fname, because it's consistent with all other
> > program types. If you do that, then you don't have to do anything
> > extra (well, add single entry to section_defs, of course), it will
> > just work as is.
> Re: adding "struct_ops/" to section_defs,
> Sure. as long as SEC(".struct_ops") can use prog that
> libbpf_prog_type_by_name() concluded it is either -ESRCH or
> STRUCT_OPS.
>
> It is not the only change though.  Other changes are still
> needed in collect_reloc (e.g. check prog type mismatch).
> They won't be much though.

sounds good, thanks!
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
new file mode 100644
index 000000000000..7ba8c1b4157a
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
@@ -0,0 +1,228 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_TCP_HELPERS_H
+#define __BPF_TCP_HELPERS_H
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <bpf_helpers.h>
+#include <bpf_core_read.h>
+#include "bpf_trace_helpers.h"
+
+#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
+#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
+#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
+#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
+#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
+#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
+
+/* Minimal CO-RE mirrors of the kernel socket structures.  Only the
+ * members the selftest BPF programs actually touch are declared;
+ * preserve_access_index makes libbpf relocate each field access
+ * against the running kernel's real struct layout at load time.
+ */
+struct sock_common {
+	unsigned char	skc_state;
+} __attribute__((preserve_access_index));
+
+struct sock {
+	struct sock_common	__sk_common;
+} __attribute__((preserve_access_index));
+
+struct inet_sock {
+	struct sock		sk;
+} __attribute__((preserve_access_index));
+
+struct inet_connection_sock {
+	struct inet_sock	  icsk_inet;
+	__u8			  icsk_ca_state:6,
+				  icsk_ca_setsockopt:1,
+				  icsk_ca_dst_locked:1;
+	struct {
+		__u8		  pending;
+	} icsk_ack;
+	/* congestion-control private area; 104 bytes presumably matches
+	 * the kernel's ICSK_MAX_PRIV -- confirm against include/net/inet_connection_sock.h
+	 */
+	__u64			  icsk_ca_priv[104 / sizeof(__u64)];
+} __attribute__((preserve_access_index));
+
+/* Subset of the kernel's tcp_sock used by the congestion-control
+ * programs below; CO-RE maps the fields onto real offsets.
+ */
+struct tcp_sock {
+	struct inet_connection_sock	inet_conn;
+
+	__u32	rcv_nxt;
+	__u32	snd_nxt;
+	__u32	snd_una;
+	__u8	ecn_flags;
+	__u32	delivered;
+	__u32	delivered_ce;
+	__u32	snd_cwnd;
+	__u32	snd_cwnd_cnt;
+	__u32	snd_cwnd_clamp;
+	__u32	snd_ssthresh;
+	__u8	syn_data:1,	/* SYN includes data */
+		syn_fastopen:1,	/* SYN includes Fast Open option */
+		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
+		syn_fastopen_ch:1, /* Active TFO re-enabling probe */
+		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
+		save_syn:1,	/* Save headers of SYN packet */
+		is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
+		syn_smc:1;	/* SYN includes SMC */
+	__u32	max_packets_out;
+	__u32	lsndtime;
+	__u32	prior_cwnd;
+} __attribute__((preserve_access_index));
+
+/* Downcast helpers mirroring the kernel's: struct sock is the first
+ * member of inet_connection_sock/tcp_sock (see the declarations
+ * above), so these casts are layout-safe.
+ */
+static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
+{
+	return (struct inet_connection_sock *)sk;
+}
+
+/* Per-connection private storage for the CC algorithm (icsk_ca_priv). */
+static __always_inline void *inet_csk_ca(const struct sock *sk)
+{
+	return (void *)inet_csk(sk)->icsk_ca_priv;
+}
+
+static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
+{
+	return (struct tcp_sock *)sk;
+}
+
+/* Sequence-number compare that stays correct across 32-bit wraparound. */
+static __always_inline bool before(__u32 seq1, __u32 seq2)
+{
+	return (__s32)(seq1-seq2) < 0;
+}
+#define after(seq2, seq1) 	before(seq1, seq2)
+
+/* tp->ecn_flags bits (same values as the kernel's TCP_ECN_*). */
+#define	TCP_ECN_OK		1
+#define	TCP_ECN_QUEUE_CWR	2
+#define	TCP_ECN_DEMAND_CWR	4
+#define	TCP_ECN_SEEN		8
+
+/* icsk_ack.pending flag bits (kernel enum of the same name). */
+enum inet_csk_ack_state_t {
+	ICSK_ACK_SCHED	= 1,
+	ICSK_ACK_TIMER  = 2,
+	ICSK_ACK_PUSHED = 4,
+	ICSK_ACK_PUSHED2 = 8,
+	ICSK_ACK_NOW = 16	/* Send the next ACK immediately (once) */
+};
+
+/* Congestion events delivered to the .cwnd_event callback. */
+enum tcp_ca_event {
+	CA_EVENT_TX_START = 0,
+	CA_EVENT_CWND_RESTART = 1,
+	CA_EVENT_COMPLETE_CWR = 2,
+	CA_EVENT_LOSS = 3,
+	CA_EVENT_ECN_NO_CE = 4,
+	CA_EVENT_ECN_IS_CE = 5,
+};
+
+/* Sender states stored in icsk_ca_state. */
+enum tcp_ca_state {
+	TCP_CA_Open = 0,
+	TCP_CA_Disorder = 1,
+	TCP_CA_CWR = 2,
+	TCP_CA_Recovery = 3,
+	TCP_CA_Loss = 4
+};
+
+/* Per-ACK sample handed to the .pkts_acked callback. */
+struct ack_sample {
+	__u32 pkts_acked;
+	__s32 rtt_us;
+	__u32 in_flight;
+} __attribute__((preserve_access_index));
+
+/* Delivery-rate sample handed to the .cong_control callback. */
+struct rate_sample {
+	__u64  prior_mstamp; /* starting timestamp for interval */
+	__u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
+	__s32  delivered;		/* number of packets delivered over interval */
+	long interval_us;	/* time for tp->delivered to incr "delivered" */
+	__u32 snd_interval_us;	/* snd interval for delivered packets */
+	__u32 rcv_interval_us;	/* rcv interval for delivered packets */
+	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
+	int  losses;		/* number of packets marked lost upon ACK */
+	__u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
+	__u32  prior_in_flight;	/* in flight before this ACK */
+	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
+	bool is_retrans;	/* is sample from retransmission? */
+	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
+} __attribute__((preserve_access_index));
+
+#define TCP_CA_NAME_MAX		16
+#define TCP_CONG_NEEDS_ECN	0x2
+
+/* BPF-side mirror of the kernel's struct tcp_congestion_ops.  A
+ * SEC(".struct_ops") variable of this type gets registered as a TCP
+ * congestion-control algorithm selectable by .name.
+ */
+struct tcp_congestion_ops {
+	__u32 flags;
+
+	/* initialize private data (optional) */
+	void (*init)(struct sock *sk);
+	/* cleanup private data  (optional) */
+	void (*release)(struct sock *sk);
+
+	/* return slow start threshold (required) */
+	__u32 (*ssthresh)(struct sock *sk);
+	/* do new cwnd calculation (required) */
+	void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
+	/* call before changing ca_state (optional) */
+	void (*set_state)(struct sock *sk, __u8 new_state);
+	/* call when cwnd event occurs (optional) */
+	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+	/* call when ack arrives (optional) */
+	void (*in_ack_event)(struct sock *sk, __u32 flags);
+	/* new value of cwnd after loss (required) */
+	__u32  (*undo_cwnd)(struct sock *sk);
+	/* hook for packet ack accounting (optional) */
+	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
+	/* override sysctl_tcp_min_tso_segs */
+	__u32 (*min_tso_segs)(struct sock *sk);
+	/* returns the multiplier used in tcp_sndbuf_expand (optional) */
+	__u32 (*sndbuf_expand)(struct sock *sk);
+	/* call when packets are delivered to update cwnd and pacing rate,
+	 * after all the ca_state processing. (optional)
+	 */
+	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
+
+	char 		name[TCP_CA_NAME_MAX];
+};
+
+/* NOTE: macro arguments are evaluated more than once -- avoid side
+ * effects in the operands.
+ */
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#define min_not_zero(x, y) ({			\
+	typeof(x) __x = (x);			\
+	typeof(y) __y = (y);			\
+	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
+
+/* Grow cwnd by the number of ACKed packets while below ssthresh;
+ * returns how many ACKed packets are left over once ssthresh is hit
+ * (mirrors the kernel's tcp_slow_start()).
+ */
+static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked)
+{
+	__u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
+
+	acked -= cwnd - tp->snd_cwnd;
+	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+
+	return acked;
+}
+
+static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp)
+{
+	return tp->snd_cwnd < tp->snd_ssthresh;
+}
+
+/* True when cwnd is the limiting factor, i.e. it is useful to grow it. */
+static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* If in slow start, ensure cwnd grows to twice what was ACKed. */
+	if (tcp_in_slow_start(tp))
+		return tp->snd_cwnd < 2 * tp->max_packets_out;
+
+	/* bitfield access needs the CO-RE read helper */
+	return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
+}
+
+/* Additive increase: grow cwnd by roughly acked/w packets, carrying
+ * the remainder in snd_cwnd_cnt (mirrors kernel tcp_cong_avoid_ai()).
+ */
+static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked)
+{
+	/* If credits accumulated at a higher w, apply them gently now. */
+	if (tp->snd_cwnd_cnt >= w) {
+		tp->snd_cwnd_cnt = 0;
+		tp->snd_cwnd++;
+	}
+
+	tp->snd_cwnd_cnt += acked;
+	if (tp->snd_cwnd_cnt >= w) {
+		__u32 delta = tp->snd_cwnd_cnt / w;
+
+		tp->snd_cwnd_cnt -= delta * w;
+		tp->snd_cwnd += delta;
+	}
+	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+
+#endif
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
new file mode 100644
index 000000000000..7fc05d990f4d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -0,0 +1,218 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/err.h>
+#include <test_progs.h>
+
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+static const unsigned int total_bytes = 10 * 1024 * 1024;
+static const struct timeval timeo_sec = { .tv_sec = 10 };
+static const size_t timeo_optlen = sizeof(timeo_sec);
+static int stop, duration;
+
+/* Apply the 10s send/recv timeouts so a stalled transfer fails the
+ * test instead of hanging it.  Returns 0 on success, -1 after
+ * reporting via CHECK().
+ */
+static int settimeo(int fd)
+{
+	int err;
+
+	err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
+			 timeo_optlen);
+	if (CHECK(err == -1, "setsockopt(fd, SO_RCVTIMEO)", "errno:%d\n",
+		  errno))
+		return -1;
+
+	err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec,
+			 timeo_optlen);
+	if (CHECK(err == -1, "setsockopt(fd, SO_SNDTIMEO)", "errno:%d\n",
+		  errno))
+		return -1;
+
+	return 0;
+}
+
+/* Select the congestion-control algorithm registered under tcp_ca
+ * for this socket.  Returns 0 on success, -1 after CHECK().
+ */
+static int settcpca(int fd, const char *tcp_ca)
+{
+	int err;
+
+	err = setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, tcp_ca, strlen(tcp_ca));
+	if (CHECK(err == -1, "setsockopt(fd, TCP_CONGESTION)", "errno:%d\n",
+		  errno))
+		return -1;
+
+	return 0;
+}
+
+/* Server thread entry: accept one connection on the listening fd
+ * passed via arg and stream total_bytes of dummy data to it.  Any
+ * failure raises the shared 'stop' flag (so the receiving side bails
+ * out too) and is reported to pthread_join() via ERR_PTR().
+ */
+static void *server(void *arg)
+{
+	int lfd = (int)(long)arg, err = 0, fd;
+	ssize_t nr_sent = 0, bytes = 0;
+	char batch[1500];
+
+	/* Retry accept() on signal interruption.  The previous
+	 * "fd = accept(); while (fd == -1) { if EINTR continue; }"
+	 * form never re-issued accept(), so an EINTR spun forever.
+	 */
+	do {
+		fd = accept(lfd, NULL, NULL);
+	} while (fd == -1 && errno == EINTR);
+	if (fd == -1) {
+		err = -errno;
+		goto done;
+	}
+
+	if (settimeo(fd)) {
+		err = -errno;
+		goto done;
+	}
+
+	/* send in MTU-ish chunks until done or the peer flagged stop */
+	while (bytes < total_bytes && !READ_ONCE(stop)) {
+		nr_sent = send(fd, &batch,
+			       min(total_bytes - bytes, sizeof(batch)), 0);
+		if (nr_sent == -1 && errno == EINTR)
+			continue;
+		if (nr_sent == -1) {
+			err = -errno;
+			break;
+		}
+		bytes += nr_sent;
+	}
+
+	CHECK(bytes != total_bytes, "send", "%zd != %u nr_sent:%zd errno:%d\n",
+	      bytes, total_bytes, nr_sent, errno);
+
+done:
+	if (fd != -1)
+		close(fd);
+	if (err) {
+		WRITE_ONCE(stop, 1);
+		return ERR_PTR(err);
+	}
+	return NULL;
+}
+
+/* End-to-end transfer with the given congestion control: build a
+ * loopback TCP pair, have the server thread send total_bytes and
+ * receive them here, CHECK()ing every step along the way.
+ */
+static void do_test(const char *tcp_ca)
+{
+	struct sockaddr_in6 sa6 = {};
+	ssize_t nr_recv = 0, bytes = 0;
+	int lfd = -1, fd = -1;
+	pthread_t srv_thread;
+	socklen_t addrlen = sizeof(sa6);
+	void *thread_ret;
+	char batch[1500];
+	int err;
+
+	WRITE_ONCE(stop, 0);
+
+	lfd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (CHECK(lfd == -1, "socket", "errno:%d\n", errno))
+		return;
+	fd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (CHECK(fd == -1, "socket", "errno:%d\n", errno)) {
+		close(lfd);
+		return;
+	}
+
+	/* both ends use the CC under test and the 10s timeouts */
+	if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) ||
+	    settimeo(lfd) || settimeo(fd))
+		goto done;
+
+	/* bind, listen and start server thread to accept */
+	sa6.sin6_family = AF_INET6;
+	sa6.sin6_addr = in6addr_loopback;
+	err = bind(lfd, (struct sockaddr *)&sa6, addrlen);
+	if (CHECK(err == -1, "bind", "errno:%d\n", errno))
+		goto done;
+	/* recover the kernel-chosen ephemeral port for connect() below */
+	err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen);
+	if (CHECK(err == -1, "getsockname", "errno:%d\n", errno))
+		goto done;
+	err = listen(lfd, 1);
+	if (CHECK(err == -1, "listen", "errno:%d\n", errno))
+		goto done;
+	err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd);
+	if (CHECK(err != 0, "pthread_create", "err:%d\n", err))
+		goto done;
+
+	/* connect to server */
+	err = connect(fd, (struct sockaddr *)&sa6, addrlen);
+	if (CHECK(err == -1, "connect", "errno:%d\n", errno))
+		goto wait_thread;
+
+	/* recv total_bytes */
+	while (bytes < total_bytes && !READ_ONCE(stop)) {
+		nr_recv = recv(fd, &batch,
+			       min(total_bytes - bytes, sizeof(batch)), 0);
+		if (nr_recv == -1 && errno == EINTR)
+			continue;
+		if (nr_recv == -1)
+			break;
+		bytes += nr_recv;
+	}
+
+	CHECK(bytes != total_bytes, "recv", "%zd != %u nr_recv:%zd errno:%d\n",
+	      bytes, total_bytes, nr_recv, errno);
+
+wait_thread:
+	/* tell the server thread to stop, then reap its exit status */
+	WRITE_ONCE(stop, 1);
+	pthread_join(srv_thread, &thread_ret);
+	CHECK(IS_ERR(thread_ret), "pthread_join", "thread_ret:%ld",
+	      PTR_ERR(thread_ret));
+done:
+	close(lfd);
+	close(fd);
+}
+
+/* Open and load the BPF object 'filename', look up the struct_ops
+ * map 'map_name' and attach it (registering the CC algorithm).  On
+ * success returns the object and stores the link in *link; on any
+ * failure the object is closed and an ERR_PTR() is returned with
+ * *link left untouched.
+ */
+static struct bpf_object *load(const char *filename, const char *map_name,
+			       struct bpf_link **link)
+{
+	struct bpf_object *obj;
+	struct bpf_map *map;
+	struct bpf_link *l;
+	int err;
+
+	obj = bpf_object__open(filename);
+	if (CHECK(IS_ERR(obj), "bpf_obj__open_file", "obj:%ld\n",
+		  PTR_ERR(obj)))
+		return obj;
+
+	err = bpf_object__load(obj);
+	if (CHECK(err, "bpf_object__load", "err:%d\n", err)) {
+		bpf_object__close(obj);
+		return ERR_PTR(err);
+	}
+
+	map = bpf_object__find_map_by_name(obj, map_name);
+	if (CHECK(!map, "bpf_object__find_map_by_name", "%s not found\n",
+		    map_name)) {
+		bpf_object__close(obj);
+		return ERR_PTR(-ENOENT);
+	}
+
+	l = bpf_map__attach_struct_ops(map);
+	if (CHECK(IS_ERR(l), "bpf_struct_ops_map__attach", "err:%ld\n",
+		  PTR_ERR(l))) {
+		bpf_object__close(obj);
+		return (void *)l;
+	}
+
+	*link = l;
+
+	return obj;
+}
+
+/* Subtest: register bpf_dctcp via struct_ops, run a transfer with it,
+ * then detach and unload.
+ */
+static void test_dctcp(void)
+{
+	struct bpf_object *obj;
+	/* NULL init silences a maybe-uninitialized compiler warning;
+	 * load() only sets it on success.
+	 */
+	struct bpf_link *link = NULL;
+
+	obj = load("bpf_dctcp.o", "dctcp", &link);
+	if (IS_ERR(obj))
+		return;
+
+	do_test("bpf_dctcp");
+
+	bpf_link__destroy(link);
+	bpf_object__close(obj);
+}
+
+void test_bpf_tcp_ca(void)
+{
+	if (test__start_subtest("dctcp"))
+		test_dctcp();
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
new file mode 100644
index 000000000000..5f9b613663e5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
@@ -0,0 +1,210 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+/* WARNING: This implementation is not necessarily the same
+ * as the tcp_dctcp.c.  The purpose is mainly for testing
+ * the kernel BPF logic.
+ */
+
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* alpha is a fixed-point fraction scaled by DCTCP_MAX_ALPHA (1024). */
+#define DCTCP_MAX_ALPHA	1024U
+
+/* Per-connection DCTCP state, kept in the socket's icsk_ca_priv. */
+struct dctcp {
+	__u32 old_delivered;
+	__u32 old_delivered_ce;
+	__u32 prior_rcv_nxt;
+	__u32 dctcp_alpha;
+	__u32 next_seq;
+	__u32 ce_state;
+	__u32 loss_cwnd;
+};
+
+static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */
+static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA;
+
+/* Start a new alpha measurement window ending at the current snd_nxt. */
+static __always_inline void dctcp_reset(const struct tcp_sock *tp,
+					struct dctcp *ca)
+{
+	ca->next_seq = tp->snd_nxt;
+
+	ca->old_delivered = tp->delivered;
+	ca->old_delivered_ce = tp->delivered_ce;
+}
+
+/* .init callback: seed the per-connection DCTCP state. */
+BPF_TCP_OPS_1(dctcp_init, void, struct sock *, sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct dctcp *ca = inet_csk_ca(sk);
+
+	ca->prior_rcv_nxt = tp->rcv_nxt;
+	ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
+	ca->loss_cwnd = 0;
+	ca->ce_state = 0;
+
+	dctcp_reset(tp, ca);
+}
+
+/* .ssthresh callback: reduce cwnd by alpha/2 (cwnd * alpha / 2^11
+ * with alpha scaled by 1024), never below 2 packets.
+ */
+BPF_TCP_OPS_1(dctcp_ssthresh, __u32, struct sock *, sk)
+{
+	struct dctcp *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	ca->loss_cwnd = tp->snd_cwnd;
+	return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
+}
+
+/* .in_ack_event callback: once per RTT (when snd_una passes next_seq),
+ * fold the fraction F of CE-marked deliveries into the EWMA
+ * alpha = (1 - g) * alpha + g * F, then open a new window.
+ */
+BPF_TCP_OPS_2(dctcp_update_alpha, void,
+	      struct sock *, sk, __u32, flags)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct dctcp *ca = inet_csk_ca(sk);
+
+	/* Expired RTT */
+	if (!before(tp->snd_una, ca->next_seq)) {
+		__u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
+		__u32 alpha = ca->dctcp_alpha;
+
+		/* alpha = (1 - g) * alpha + g * F */
+
+		alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
+		if (delivered_ce) {
+			__u32 delivered = tp->delivered - ca->old_delivered;
+
+			/* If dctcp_shift_g == 1, a 32bit value would overflow
+			 * after 8 M packets.
+			 */
+			delivered_ce <<= (10 - dctcp_shift_g);
+			delivered_ce /= max(1U, delivered);
+
+			alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
+		}
+		ca->dctcp_alpha = alpha;
+		dctcp_reset(tp, ca);
+	}
+}
+
+/* Remember cwnd for undo and halve ssthresh on a loss event. */
+static __always_inline void dctcp_react_to_loss(struct sock *sk)
+{
+	struct dctcp *ca = inet_csk_ca(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	ca->loss_cwnd = tp->snd_cwnd;
+	tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U);
+}
+
+/* .set_state callback: entering Recovery counts as a loss. */
+BPF_TCP_OPS_2(dctcp_state, void, struct sock *, sk, __u8, new_state)
+{
+	if (new_state == TCP_CA_Recovery &&
+	    new_state != BPF_CORE_READ_BITFIELD(inet_csk(sk), icsk_ca_state))
+		dctcp_react_to_loss(sk);
+	/* We handle RTO in dctcp_cwnd_event to ensure that we perform only
+	 * one loss-adjustment per RTT.
+	 */
+}
+
+/* Demand CWR from the peer while the last packet was CE-marked. */
+static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (ce_state == 1)
+		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+	else
+		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+/* Minimal DCTCP CE state machine:
+ *
+ * S:	0 <- last pkt was non-CE
+ *	1 <- last pkt was CE
+ */
+/* Advance the one-bit CE state machine; on a transition, flush any
+ * delayed ACK for the old state first and force an immediate ACK so
+ * the sender learns about the CE change promptly.
+ */
+static __always_inline
+void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
+			  __u32 *prior_rcv_nxt, __u32 *ce_state)
+{
+	__u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;
+
+	if (*ce_state != new_ce_state) {
+		/* CE state has changed, force an immediate ACK to
+		 * reflect the new CE state. If an ACK was delayed,
+		 * send that first to reflect the prior CE state.
+		 */
+		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+			dctcp_ece_ack_cwr(sk, *ce_state);
+			bpf_tcp_send_ack(sk, *prior_rcv_nxt);
+		}
+		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+	}
+	*prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
+	*ce_state = new_ce_state;
+	dctcp_ece_ack_cwr(sk, new_ce_state);
+}
+
+/* .cwnd_event callback: route CE transitions to the ECE/ACK state
+ * machine and loss events to dctcp_react_to_loss().
+ */
+BPF_TCP_OPS_2(dctcp_cwnd_event, void,
+	      struct sock *, sk, enum tcp_ca_event, ev)
+{
+	struct dctcp *ca = inet_csk_ca(sk);
+
+	switch (ev) {
+	case CA_EVENT_ECN_IS_CE:
+	case CA_EVENT_ECN_NO_CE:
+		dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
+		break;
+	case CA_EVENT_LOSS:
+		dctcp_react_to_loss(sk);
+		break;
+	default:
+		/* Don't care for the rest. */
+		break;
+	}
+}
+
+/* .undo_cwnd callback: restore at least the cwnd saved at loss time. */
+BPF_TCP_OPS_1(dctcp_cwnd_undo, __u32, struct sock *, sk)
+{
+	const struct dctcp *ca = inet_csk_ca(sk);
+
+	return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+}
+
+/* .cong_avoid callback: standard Reno -- slow start below ssthresh,
+ * then additive increase of one packet per cwnd of ACKs (mirrors the
+ * kernel's tcp_reno_cong_avoid()).
+ */
+BPF_TCP_OPS_3(tcp_reno_cong_avoid, void,
+	      struct sock *, sk, __u32, ack, __u32, acked)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tcp_is_cwnd_limited(sk))
+		return;
+
+	/* In "safe" area, increase. */
+	if (tcp_in_slow_start(tp)) {
+		acked = tcp_slow_start(tp, acked);
+		if (!acked)
+			return;
+	}
+	/* In dangerous area, increase slowly. */
+	tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
+}
+
+/* Registered under "bpf_dctcp_nouse" but never selected by the test;
+ * presumably here to exercise registering more than one struct_ops
+ * from a single object -- confirm with the test harness.
+ */
+SEC(".struct_ops")
+struct tcp_congestion_ops dctcp_nouse = {
+	.init		= (void *)dctcp_init,
+	.set_state	= (void *)dctcp_state,
+	.flags		= TCP_CONG_NEEDS_ECN,
+	.name		= "bpf_dctcp_nouse",
+};
+
+/* The CC algorithm the test actually selects via TCP_CONGESTION. */
+SEC(".struct_ops")
+struct tcp_congestion_ops dctcp = {
+	.init		= (void *)dctcp_init,
+	.in_ack_event   = (void *)dctcp_update_alpha,
+	.cwnd_event	= (void *)dctcp_cwnd_event,
+	.ssthresh	= (void *)dctcp_ssthresh,
+	.cong_avoid	= (void *)tcp_reno_cong_avoid,
+	.undo_cwnd	= (void *)dctcp_cwnd_undo,
+	.set_state	= (void *)dctcp_state,
+	.flags		= TCP_CONG_NEEDS_ECN,
+	.name		= "bpf_dctcp",
+};