Message ID | 20200325055745.10710-6-joe@wand.net.nz |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Headers | show |
Series | Add bpf_sk_assign eBPF helper | expand |
On Wed, 25 Mar 2020 at 05:58, Joe Stringer <joe@wand.net.nz> wrote: > > From: Lorenz Bauer <lmb@cloudflare.com> > > Attach a tc direct-action classifier to lo in a fresh network > namespace, and rewrite all connection attempts to localhost:4321 > to localhost:1234 (for port tests) and connections to unreachable > IPv4/IPv6 IPs to the local socket (for address tests). Can you extend this to cover UDP as well? > > Keep in mind that both client to server and server to client traffic > passes the classifier. > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > Co-authored-by: Joe Stringer <joe@wand.net.nz> > Signed-off-by: Joe Stringer <joe@wand.net.nz> > --- > v2: Rebase onto test_progs infrastructure > v1: Initial commit > --- > tools/testing/selftests/bpf/Makefile | 2 +- > .../selftests/bpf/prog_tests/sk_assign.c | 244 ++++++++++++++++++ > .../selftests/bpf/progs/test_sk_assign.c | 127 +++++++++ > 3 files changed, 372 insertions(+), 1 deletion(-) > create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c > create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c > > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile > index 7729892e0b04..4f7f83d059ca 100644 > --- a/tools/testing/selftests/bpf/Makefile > +++ b/tools/testing/selftests/bpf/Makefile > @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ > # Compile but not part of 'make run_tests' > TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ > flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ > - test_lirc_mode2_user xdping test_cpp runqslower > + test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign > > TEST_CUSTOM_PROGS = urandom_read > > diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c > new file mode 100644 > index 000000000000..1f0afcc20c48 > --- /dev/null > +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c > @@ -0,0 +1,244 @@ > +// SPDX-License-Identifier: GPL-2.0 > +// Copyright (c) 2018 Facebook > +// Copyright (c) 2019 Cloudflare > +// Copyright (c) 2020 Isovalent, Inc. > +/* > + * Test that the socket assign program is able to redirect traffic towards a > + * socket, regardless of whether the port or address destination of the traffic > + * matches the port. > + */ > + > +#define _GNU_SOURCE > +#include <fcntl.h> > +#include <signal.h> > +#include <stdlib.h> > +#include <unistd.h> > + > +#include "test_progs.h" > + > +#define TEST_DPORT 4321 > +#define TEST_DADDR (0xC0A80203) > +#define NS_SELF "/proc/self/ns/net" > + > +static __u32 duration; > + > +static bool configure_stack(int self_net) > +{ > + /* Move to a new networking namespace */ > + if (CHECK_FAIL(unshare(CLONE_NEWNET))) > + return false; > + > + /* Configure necessary links, routes */ > + if (CHECK_FAIL(system("ip link set dev lo up"))) > + return false; > + if (CHECK_FAIL(system("ip route add local default dev lo"))) > + return false; > + if (CHECK_FAIL(system("ip -6 route add local default dev lo"))) > + return false; > + > + /* Load qdisc, BPF program */ > + if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) > + return false; > + if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action " > + "object-file ./test_sk_assign.o section sk_assign_test"))) > + return false; > + > + return true; > +} > + > +static int start_server(const struct sockaddr *addr, socklen_t len) > +{ > + int fd; > + > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > + if (CHECK_FAIL(fd == -1)) > + goto out; > + if (CHECK_FAIL(bind(fd, addr, len) == -1)) > + goto close_out; > + if (CHECK_FAIL(listen(fd, 128) == -1)) > + goto close_out; > + > + goto out; > + > +close_out: > + close(fd); > + fd = -1; > +out: > + return fd; > +} > + > +static void handle_timeout(int signum) > +{ > + if (signum == SIGALRM) > + fprintf(stderr, "Timed out while connecting to server\n"); > + kill(0, SIGKILL); > +} > + > +static struct sigaction timeout_action = { > + .sa_handler = handle_timeout, > +}; > + > +static int connect_to_server(const struct sockaddr *addr, socklen_t len) > +{ > + int fd = -1; > + > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > + if (CHECK_FAIL(fd == -1)) > + goto out; > + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) > + goto out; > + alarm(3); > + if (CHECK_FAIL(connect(fd, addr, len) == -1)) > + goto close_out; > + > + goto out; > + > +close_out: > + close(fd); > + fd = -1; > +out: > + return fd; > +} > + > +static in_port_t get_port(int fd) > +{ > + struct sockaddr_storage name; > + socklen_t len; > + in_port_t port = 0; > + > + len = sizeof(name); > + if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len))) > + return port; > + > + switch (name.ss_family) { > + case AF_INET: > + port = ((struct sockaddr_in *)&name)->sin_port; > + break; > + case AF_INET6: > + port = ((struct sockaddr_in6 *)&name)->sin6_port; > + break; > + default: > + CHECK(1, "Invalid address family", "%d\n", name.ss_family); > + } > + return port; > +} > + > +static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len) > +{ > + int client = -1, srv_client = -1; > + char buf[] = "testing"; > + in_port_t port; > + int ret = 1; > + > + client = connect_to_server(addr, len); > + if (client == -1) { > + perror("Cannot connect to server"); > + goto out; > + } > + > + srv_client = accept(server_fd, NULL, NULL); > + if (CHECK_FAIL(srv_client == -1)) { > + perror("Can't accept connection"); > + goto out; > + } > + if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) { > + perror("Can't write on client"); > + goto out; > + } > + if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) { > + perror("Can't read on server"); > + goto out; > + } > + > + port = get_port(srv_client); > + if (CHECK_FAIL(!port)) > + goto out; > + if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u", > + TEST_DPORT, ntohs(port))) > + goto out; > + > + ret = 0; > +out: > + close(client); > + close(srv_client); > + return ret; > +} > + > +static int do_sk_assign(void) > +{ > + struct sockaddr_in addr4; > + struct sockaddr_in6 addr6; > + int server = -1; > + int server_v6 = -1; > + int err = 1; > + > + memset(&addr4, 0, sizeof(addr4)); > + addr4.sin_family = AF_INET; > + addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); > + addr4.sin_port = htons(1234); > + > + memset(&addr6, 0, sizeof(addr6)); > + addr6.sin6_family = AF_INET6; > + addr6.sin6_addr = in6addr_loopback; > + addr6.sin6_port = htons(1234); > + > + server = start_server((const struct sockaddr *)&addr4, sizeof(addr4)); > + if (server == -1) > + goto out; > + > + server_v6 = start_server((const struct sockaddr *)&addr6, > + sizeof(addr6)); > + if (server_v6 == -1) > + goto out; > + > + /* Connect to unbound ports */ > + addr4.sin_port = htons(TEST_DPORT); > + addr6.sin6_port = htons(TEST_DPORT); > + > + test__start_subtest("ipv4 port redir"); > + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) > + goto out; > + > + test__start_subtest("ipv6 port redir"); > + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) > + goto out; > + > + /* Connect to unbound addresses */ > + addr4.sin_addr.s_addr = htonl(TEST_DADDR); > + addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR); > + > + test__start_subtest("ipv4 addr redir"); > + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) > + goto out; > + > + test__start_subtest("ipv6 addr redir"); > + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) > + goto out; > + > + err = 0; > +out: > + close(server); > + close(server_v6); > + return err; > +} > + > +void test_sk_assign(void) > +{ > + int self_net; > + > + self_net = open(NS_SELF, O_RDONLY); > + if (CHECK_FAIL(self_net < 0)) { > + perror("Unable to open "NS_SELF); > + return; > + } > + > + if (!configure_stack(self_net)) { > + perror("configure_stack"); > + goto cleanup; > + } > + > + do_sk_assign(); > + > +cleanup: > + close(self_net); > +} > diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c > new file mode 100644 > index 000000000000..7de30ad3f594 > --- /dev/null > +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c > @@ -0,0 +1,127 @@ > +// SPDX-License-Identifier: GPL-2.0 > +// Copyright (c) 2019 Cloudflare Ltd. > + > +#include <stddef.h> > +#include <stdbool.h> > +#include <string.h> > +#include <linux/bpf.h> > +#include <linux/if_ether.h> > +#include <linux/in.h> > +#include <linux/ip.h> > +#include <linux/ipv6.h> > +#include <linux/pkt_cls.h> > +#include <linux/tcp.h> > +#include <sys/socket.h> > +#include <bpf/bpf_helpers.h> > +#include <bpf/bpf_endian.h> > + > +int _version SEC("version") = 1; > +char _license[] SEC("license") = "GPL"; > + > +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */ > +static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off, > + void *data_end, __u16 eth_proto, > + bool *ipv4) > +{ > + struct bpf_sock_tuple *result; > + __u8 proto = 0; > + __u64 ihl_len; > + > + if (eth_proto == bpf_htons(ETH_P_IP)) { > + struct iphdr *iph = (struct iphdr *)(data + nh_off); > + > + if (iph + 1 > data_end) > + return NULL; > + if (iph->ihl != 5) > + /* Options are not supported */ > + return NULL; > + ihl_len = iph->ihl * 4; > + proto = iph->protocol; > + *ipv4 = true; > + result = (struct bpf_sock_tuple *)&iph->saddr; > + } else if (eth_proto == bpf_htons(ETH_P_IPV6)) { > + struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off); > + > + if (ip6h + 1 > data_end) > + return NULL; > + ihl_len = sizeof(*ip6h); > + proto = ip6h->nexthdr; > + *ipv4 = false; > + result = (struct bpf_sock_tuple *)&ip6h->saddr; > + } else { > + return NULL; > + } > + > + if (result + 1 > data_end || proto != IPPROTO_TCP) > + return NULL; > + > + return result; > +} > + > +SEC("sk_assign_test") > +int bpf_sk_assign_test(struct __sk_buff *skb) > +{ > + void *data_end = (void *)(long)skb->data_end; > + void *data = (void *)(long)skb->data; > + struct ethhdr *eth = (struct ethhdr *)(data); > + struct bpf_sock_tuple *tuple, ln = {0}; > + struct bpf_sock *sk; > + int tuple_len; > + bool ipv4; > + int ret; > + > + if (eth + 1 > data_end) > + return TC_ACT_SHOT; > + > + tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4); > + if (!tuple) > + return TC_ACT_SHOT; > + > + tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); > + sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); > + if (sk) { > + if (sk->state != BPF_TCP_LISTEN) > + goto assign; > + > + bpf_sk_release(sk); > + } > + > + if (ipv4) { > + if (tuple->ipv4.dport != bpf_htons(4321)) > + return TC_ACT_OK; > + > + ln.ipv4.daddr = bpf_htonl(0x7f000001); > + ln.ipv4.dport = bpf_htons(1234); > + > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), > + BPF_F_CURRENT_NETNS, 0); > + } else { > + if (tuple->ipv6.dport != bpf_htons(4321)) > + return TC_ACT_OK; > + > + /* Upper parts of daddr are already zero. */ > + ln.ipv6.daddr[3] = bpf_htonl(0x1); > + ln.ipv6.dport = bpf_htons(1234); > + > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), > + BPF_F_CURRENT_NETNS, 0); > + } > + > + /* We can't do a single skc_lookup_tcp here, because then the compiler > + * will likely spill tuple_len to the stack. This makes it lose all > + * bounds information in the verifier, which then rejects the call as > + * unsafe. > + */ > + if (!sk) > + return TC_ACT_SHOT; > + > + if (sk->state != BPF_TCP_LISTEN) { > + bpf_sk_release(sk); > + return TC_ACT_SHOT; > + } > + > +assign: > + ret = bpf_sk_assign(skb, sk, 0); > + bpf_sk_release(sk); > + return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT; > +} > -- > 2.20.1 >
On 3/24/20 10:57 PM, Joe Stringer wrote: > From: Lorenz Bauer <lmb@cloudflare.com> > > Attach a tc direct-action classifier to lo in a fresh network > namespace, and rewrite all connection attempts to localhost:4321 > to localhost:1234 (for port tests) and connections to unreachable > IPv4/IPv6 IPs to the local socket (for address tests). > > Keep in mind that both client to server and server to client traffic > passes the classifier. > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > Co-authored-by: Joe Stringer <joe@wand.net.nz> > Signed-off-by: Joe Stringer <joe@wand.net.nz> > --- > v2: Rebase onto test_progs infrastructure > v1: Initial commit > --- > tools/testing/selftests/bpf/Makefile | 2 +- > .../selftests/bpf/prog_tests/sk_assign.c | 244 ++++++++++++++++++ > .../selftests/bpf/progs/test_sk_assign.c | 127 +++++++++ > 3 files changed, 372 insertions(+), 1 deletion(-) > create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c > create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c > > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile > index 7729892e0b04..4f7f83d059ca 100644 > --- a/tools/testing/selftests/bpf/Makefile > +++ b/tools/testing/selftests/bpf/Makefile > @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ > # Compile but not part of 'make run_tests' > TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ > flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ > - test_lirc_mode2_user xdping test_cpp runqslower > + test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign No test_sk_assign any more as the test is integrated into test_progs, right? > > TEST_CUSTOM_PROGS = urandom_read > > diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c > new file mode 100644 > index 000000000000..1f0afcc20c48 > --- /dev/null > +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c > @@ -0,0 +1,244 @@ > +// SPDX-License-Identifier: GPL-2.0 > +// Copyright (c) 2018 Facebook > +// Copyright (c) 2019 Cloudflare > +// Copyright (c) 2020 Isovalent, Inc. > +/* > + * Test that the socket assign program is able to redirect traffic towards a > + * socket, regardless of whether the port or address destination of the traffic > + * matches the port. > + */ > + > +#define _GNU_SOURCE > +#include <fcntl.h> > +#include <signal.h> > +#include <stdlib.h> > +#include <unistd.h> > + > +#include "test_progs.h" > + > +#define TEST_DPORT 4321 > +#define TEST_DADDR (0xC0A80203) > +#define NS_SELF "/proc/self/ns/net" > + > +static __u32 duration; > + > +static bool configure_stack(int self_net) self_net parameter is not used. > +{ > + /* Move to a new networking namespace */ > + if (CHECK_FAIL(unshare(CLONE_NEWNET))) > + return false; You can use CHECK to encode better error messages. Thhis is what most test_progs tests are using. > + /* Configure necessary links, routes */ > + if (CHECK_FAIL(system("ip link set dev lo up"))) > + return false; > + if (CHECK_FAIL(system("ip route add local default dev lo"))) > + return false; > + if (CHECK_FAIL(system("ip -6 route add local default dev lo"))) > + return false; > + > + /* Load qdisc, BPF program */ > + if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) > + return false; > + if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action " > + "object-file ./test_sk_assign.o section sk_assign_test"))) > + return false; > + > + return true; > +} > + > +static int start_server(const struct sockaddr *addr, socklen_t len) > +{ > + int fd; > + > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > + if (CHECK_FAIL(fd == -1)) > + goto out; > + if (CHECK_FAIL(bind(fd, addr, len) == -1)) > + goto close_out; > + if (CHECK_FAIL(listen(fd, 128) == -1)) > + goto close_out; > + > + goto out; > + > +close_out: > + close(fd); > + fd = -1; > +out: > + return fd; > +} > + > +static void handle_timeout(int signum) > +{ > + if (signum == SIGALRM) > + fprintf(stderr, "Timed out while connecting to server\n"); > + kill(0, SIGKILL); > +} > + > +static struct sigaction timeout_action = { > + .sa_handler = handle_timeout, > +}; > + > +static int connect_to_server(const struct sockaddr *addr, socklen_t len) > +{ > + int fd = -1; > + > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > + if (CHECK_FAIL(fd == -1)) > + goto out; > + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) > + goto out; should this goto close_out? > + alarm(3); > + if (CHECK_FAIL(connect(fd, addr, len) == -1)) > + goto close_out; > + > + goto out; > + > +close_out: > + close(fd); > + fd = -1; > +out: > + return fd; > +} > + > +static in_port_t get_port(int fd) > +{ > + struct sockaddr_storage name; > + socklen_t len; > + in_port_t port = 0; > + > + len = sizeof(name); > + if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len))) > + return port; > + > + switch (name.ss_family) { > + case AF_INET: > + port = ((struct sockaddr_in *)&name)->sin_port; > + break; > + case AF_INET6: > + port = ((struct sockaddr_in6 *)&name)->sin6_port; > + break; > + default: > + CHECK(1, "Invalid address family", "%d\n", name.ss_family); > + } > + return port; > +} > + > +static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len) > +{ > + int client = -1, srv_client = -1; > + char buf[] = "testing"; > + in_port_t port; > + int ret = 1; > + > + client = connect_to_server(addr, len); > + if (client == -1) { > + perror("Cannot connect to server"); > + goto out; > + } > + > + srv_client = accept(server_fd, NULL, NULL); > + if (CHECK_FAIL(srv_client == -1)) { > + perror("Can't accept connection"); > + goto out; > + } > + if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) { > + perror("Can't write on client"); > + goto out; > + } > + if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) { > + perror("Can't read on server"); > + goto out; > + } > + > + port = get_port(srv_client); > + if (CHECK_FAIL(!port)) > + goto out; > + if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u", > + TEST_DPORT, ntohs(port))) > + goto out; > + > + ret = 0; > +out: > + close(client); > + close(srv_client); > + return ret; > +} > + > +static int do_sk_assign(void) > +{ > + struct sockaddr_in addr4; > + struct sockaddr_in6 addr6; > + int server = -1; > + int server_v6 = -1; > + int err = 1; > + > + memset(&addr4, 0, sizeof(addr4)); > + addr4.sin_family = AF_INET; > + addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); > + addr4.sin_port = htons(1234); > + > + memset(&addr6, 0, sizeof(addr6)); > + addr6.sin6_family = AF_INET6; > + addr6.sin6_addr = in6addr_loopback; > + addr6.sin6_port = htons(1234); > + > + server = start_server((const struct sockaddr *)&addr4, sizeof(addr4)); > + if (server == -1) > + goto out; > + > + server_v6 = start_server((const struct sockaddr *)&addr6, > + sizeof(addr6)); > + if (server_v6 == -1) > + goto out; > + > + /* Connect to unbound ports */ > + addr4.sin_port = htons(TEST_DPORT); > + addr6.sin6_port = htons(TEST_DPORT); > + > + test__start_subtest("ipv4 port redir"); > + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) > + goto out; > + > + test__start_subtest("ipv6 port redir"); > + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) > + goto out; > + > + /* Connect to unbound addresses */ > + addr4.sin_addr.s_addr = htonl(TEST_DADDR); > + addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR); > + > + test__start_subtest("ipv4 addr redir"); > + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) > + goto out; > + > + test__start_subtest("ipv6 addr redir"); > + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) > + goto out; > + > + err = 0; > +out: > + close(server); > + close(server_v6); > + return err; > +} > + > +void test_sk_assign(void) > +{ > + int self_net; > + > + self_net = open(NS_SELF, O_RDONLY); > + if (CHECK_FAIL(self_net < 0)) { > + perror("Unable to open "NS_SELF); > + return; > + } > + > + if (!configure_stack(self_net)) { > + perror("configure_stack"); > + goto cleanup; > + } > + > + do_sk_assign(); > + > +cleanup: > + close(self_net); Did we exit the newly unshared net namespace and restored the previous namespace? > +} > diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c > new file mode 100644 > index 000000000000..7de30ad3f594 > --- /dev/null > +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c > @@ -0,0 +1,127 @@ > +// SPDX-License-Identifier: GPL-2.0 > +// Copyright (c) 2019 Cloudflare Ltd. > + > +#include <stddef.h> > +#include <stdbool.h> > +#include <string.h> > +#include <linux/bpf.h> > +#include <linux/if_ether.h> > +#include <linux/in.h> > +#include <linux/ip.h> > +#include <linux/ipv6.h> > +#include <linux/pkt_cls.h> > +#include <linux/tcp.h> > +#include <sys/socket.h> > +#include <bpf/bpf_helpers.h> > +#include <bpf/bpf_endian.h> > + > +int _version SEC("version") = 1; > +char _license[] SEC("license") = "GPL"; > + > +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */ > +static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off, > + void *data_end, __u16 eth_proto, > + bool *ipv4) > +{ > + struct bpf_sock_tuple *result; > + __u8 proto = 0; > + __u64 ihl_len; > + > + if (eth_proto == bpf_htons(ETH_P_IP)) { > + struct iphdr *iph = (struct iphdr *)(data + nh_off); > + > + if (iph + 1 > data_end) > + return NULL; > + if (iph->ihl != 5) > + /* Options are not supported */ > + return NULL; > + ihl_len = iph->ihl * 4; > + proto = iph->protocol; > + *ipv4 = true; > + result = (struct bpf_sock_tuple *)&iph->saddr; > + } else if (eth_proto == bpf_htons(ETH_P_IPV6)) { > + struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off); > + > + if (ip6h + 1 > data_end) > + return NULL; > + ihl_len = sizeof(*ip6h); > + proto = ip6h->nexthdr; > + *ipv4 = false; > + result = (struct bpf_sock_tuple *)&ip6h->saddr; > + } else { > + return NULL; > + } > + > + if (result + 1 > data_end || proto != IPPROTO_TCP) > + return NULL; > + > + return result; > +} > + > +SEC("sk_assign_test") > +int bpf_sk_assign_test(struct __sk_buff *skb) > +{ > + void *data_end = (void *)(long)skb->data_end; > + void *data = (void *)(long)skb->data; > + struct ethhdr *eth = (struct ethhdr *)(data); > + struct bpf_sock_tuple *tuple, ln = {0}; > + struct bpf_sock *sk; > + int tuple_len; > + bool ipv4; > + int ret; > + > + if (eth + 1 > data_end) > + return TC_ACT_SHOT; > + > + tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4); > + if (!tuple) > + return TC_ACT_SHOT; > + > + tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); > + sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); You can get rid of tuple_len with if (ipv4) sk = bpf_skc_lookup_tcp(..., sizeof(tuple->ipv4), ...); else sk = bpf_skc_lookup_tcp(..., sizeof(tuple->ipv6), ...); and later on you can do common bpf_skc_lookup_tcp. But it may not be worthwhile to do it, as you have two separate calls in the above instead. > + if (sk) { > + if (sk->state != BPF_TCP_LISTEN) > + goto assign; > + > + bpf_sk_release(sk); > + } > + > + if (ipv4) { > + if (tuple->ipv4.dport != bpf_htons(4321)) > + return TC_ACT_OK; > + > + ln.ipv4.daddr = bpf_htonl(0x7f000001); > + ln.ipv4.dport = bpf_htons(1234); > + > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), > + BPF_F_CURRENT_NETNS, 0); > + } else { > + if (tuple->ipv6.dport != bpf_htons(4321)) > + return TC_ACT_OK; > + > + /* Upper parts of daddr are already zero. */ > + ln.ipv6.daddr[3] = bpf_htonl(0x1); > + ln.ipv6.dport = bpf_htons(1234); > + > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), > + BPF_F_CURRENT_NETNS, 0); > + } > + > + /* We can't do a single skc_lookup_tcp here, because then the compiler > + * will likely spill tuple_len to the stack. This makes it lose all > + * bounds information in the verifier, which then rejects the call as > + * unsafe. > + */ This is a known issue. For scalars, only constant is restored properly in verifier at this moment. I did some hacking before to enable any scalars. The fear is this will make pruning performs worse. More study is needed here. > + if (!sk) > + return TC_ACT_SHOT; > + > + if (sk->state != BPF_TCP_LISTEN) { > + bpf_sk_release(sk); > + return TC_ACT_SHOT; > + } > + > +assign: > + ret = bpf_sk_assign(skb, sk, 0); > + bpf_sk_release(sk); > + return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT; > +} >
On Wed, Mar 25, 2020 at 3:35 AM Lorenz Bauer <lmb@cloudflare.com> wrote: > > On Wed, 25 Mar 2020 at 05:58, Joe Stringer <joe@wand.net.nz> wrote: > > > > From: Lorenz Bauer <lmb@cloudflare.com> > > > > Attach a tc direct-action classifier to lo in a fresh network > > namespace, and rewrite all connection attempts to localhost:4321 > > to localhost:1234 (for port tests) and connections to unreachable > > IPv4/IPv6 IPs to the local socket (for address tests). > > Can you extend this to cover UDP as well? I'm working on a follow-up series for UDP, we need this too.
On Wed, Mar 25, 2020 at 11:18 AM Yonghong Song <yhs@fb.com> wrote: > > > > On 3/24/20 10:57 PM, Joe Stringer wrote: > > From: Lorenz Bauer <lmb@cloudflare.com> > > > > Attach a tc direct-action classifier to lo in a fresh network > > namespace, and rewrite all connection attempts to localhost:4321 > > to localhost:1234 (for port tests) and connections to unreachable > > IPv4/IPv6 IPs to the local socket (for address tests). > > > > Keep in mind that both client to server and server to client traffic > > passes the classifier. > > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > > Co-authored-by: Joe Stringer <joe@wand.net.nz> > > Signed-off-by: Joe Stringer <joe@wand.net.nz> > > --- > > v2: Rebase onto test_progs infrastructure > > v1: Initial commit > > --- > > tools/testing/selftests/bpf/Makefile | 2 +- > > .../selftests/bpf/prog_tests/sk_assign.c | 244 ++++++++++++++++++ > > .../selftests/bpf/progs/test_sk_assign.c | 127 +++++++++ > > 3 files changed, 372 insertions(+), 1 deletion(-) > > create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c > > create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c > > > > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile > > index 7729892e0b04..4f7f83d059ca 100644 > > --- a/tools/testing/selftests/bpf/Makefile > > +++ b/tools/testing/selftests/bpf/Makefile > > @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ > > # Compile but not part of 'make run_tests' > > TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ > > flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ > > - test_lirc_mode2_user xdping test_cpp runqslower > > + test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign > > No test_sk_assign any more as the test is integrated into test_progs, right? I'll fix it up. > > +static __u32 duration; > > + > > +static bool configure_stack(int self_net) > > self_net parameter is not used. Hrm, why didn't the compiler tell me this..? Will fix. > > +{ > > + /* Move to a new networking namespace */ > > + if (CHECK_FAIL(unshare(CLONE_NEWNET))) > > + return false; > > You can use CHECK to encode better error messages. Thhis is what > most test_progs tests are using. I was going back and forth on this when I was writing this bit. CHECK_FAIL() already prints the line that fails, so when debugging it's pretty clear what call went wrong if you dig into the code. Combine with perror() and you actually get a readable string of the error, whereas the common form for CHECK() seems to be just printing the error code which the developer then has to do symbol lookup to interpret.. if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) Example output with CHECK_FAIL / perror approach: # ./test_progs -t assign ... Timed out while connecting to server connect_to_server:FAIL:90 Cannot connect to server: Interrupted system call #46/1 ipv4 port redir:FAIL #46 sk_assign:FAIL Summary: 0/0 PASSED, 0 SKIPPED, 2 FAILED Diff to make this happen is just connect to a port that the BPF program doesn't redirect: $ git diff diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c index 1f0afcc20c48..ba661145518a 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -192,7 +191,7 @@ static int do_sk_assign(void) goto out; /* Connect to unbound ports */ - addr4.sin_port = htons(TEST_DPORT); + addr4.sin_port = htons(666); addr6.sin6_port = htons(TEST_DPORT); test__start_subtest("ipv4 port redir"); (I had to drop the kill() call as well, but that's part of the next revision of this series..) > > +static int connect_to_server(const struct sockaddr *addr, socklen_t len) > > +{ > > + int fd = -1; > > + > > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > > + if (CHECK_FAIL(fd == -1)) > > + goto out; > > + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) > > + goto out; > > should this goto close_out? Will fix. > > +void test_sk_assign(void) > > +{ > > + int self_net; > > + > > + self_net = open(NS_SELF, O_RDONLY); > > + if (CHECK_FAIL(self_net < 0)) { > > + perror("Unable to open "NS_SELF); > > + return; > > + } > > + > > + if (!configure_stack(self_net)) { > > + perror("configure_stack"); > > + goto cleanup; > > + } > > + > > + do_sk_assign(); > > + > > +cleanup: > > + close(self_net); > > Did we exit the newly unshared net namespace and restored the previous > namespace? Ah I've mainly just been running this test so it didn't affect me but I realise now I dropped the hunks that were intended to do this cleanup. Will fix. > > + /* We can't do a single skc_lookup_tcp here, because then the compiler > > + * will likely spill tuple_len to the stack. This makes it lose all > > + * bounds information in the verifier, which then rejects the call as > > + * unsafe. > > + */ > > This is a known issue. For scalars, only constant is restored properly > in verifier at this moment. I did some hacking before to enable any > scalars. The fear is this will make pruning performs worse. More > study is needed here. Thanks for the background. Do you want me to refer to any specific release version or date or commit for this comment or it's fine to leave as-is?
On 3/25/20 2:20 PM, Joe Stringer wrote: > On Wed, Mar 25, 2020 at 11:18 AM Yonghong Song <yhs@fb.com> wrote: >> >> >> >> On 3/24/20 10:57 PM, Joe Stringer wrote: >>> From: Lorenz Bauer <lmb@cloudflare.com> >>> >>> Attach a tc direct-action classifier to lo in a fresh network >>> namespace, and rewrite all connection attempts to localhost:4321 >>> to localhost:1234 (for port tests) and connections to unreachable >>> IPv4/IPv6 IPs to the local socket (for address tests). >>> >>> Keep in mind that both client to server and server to client traffic >>> passes the classifier. >>> >>> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> >>> Co-authored-by: Joe Stringer <joe@wand.net.nz> >>> Signed-off-by: Joe Stringer <joe@wand.net.nz> >>> --- >>> v2: Rebase onto test_progs infrastructure >>> v1: Initial commit >>> --- >>> tools/testing/selftests/bpf/Makefile | 2 +- >>> .../selftests/bpf/prog_tests/sk_assign.c | 244 ++++++++++++++++++ >>> .../selftests/bpf/progs/test_sk_assign.c | 127 +++++++++ >>> 3 files changed, 372 insertions(+), 1 deletion(-) >>> create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c >>> create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c >>> >>> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile >>> index 7729892e0b04..4f7f83d059ca 100644 >>> --- a/tools/testing/selftests/bpf/Makefile >>> +++ b/tools/testing/selftests/bpf/Makefile >>> @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ >>> # Compile but not part of 'make run_tests' >>> TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ >>> flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ >>> - test_lirc_mode2_user xdping test_cpp runqslower >>> + test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign >> >> No test_sk_assign any more as the test is integrated into test_progs, right? > > I'll fix it up. > >>> +static __u32 duration; >>> + >>> +static bool configure_stack(int self_net) >> >> self_net parameter is not used. > > Hrm, why didn't the compiler tell me this..? Will fix. > >>> +{ >>> + /* Move to a new networking namespace */ >>> + if (CHECK_FAIL(unshare(CLONE_NEWNET))) >>> + return false; >> >> You can use CHECK to encode better error messages. Thhis is what >> most test_progs tests are using. > > I was going back and forth on this when I was writing this bit. > CHECK_FAIL() already prints the line that fails, so when debugging > it's pretty clear what call went wrong if you dig into the code. > Combine with perror() and you actually get a readable string of the > error, whereas the common form for CHECK() seems to be just printing > the error code which the developer then has to do symbol lookup to > interpret.. > > if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) > > Example output with CHECK_FAIL / perror approach: > > # ./test_progs -t assign > ... > Timed out while connecting to server > connect_to_server:FAIL:90 > Cannot connect to server: Interrupted system call > #46/1 ipv4 port redir:FAIL > #46 sk_assign:FAIL > Summary: 0/0 PASSED, 0 SKIPPED, 2 FAILED I won't insist since CHECK_FAIL should roughly provide enough information for failure. CHECK might be more useful if you want to provide more context, esp. if the same routine is called in multiple places and you can have a marker to differentiate which call site caused the problem. But again, just a suggestion. CHECK_FAIL is okay to me. > > Diff to make this happen is just connect to a port that the BPF > program doesn't redirect: > > $ git diff > diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c > b/tools/testing/selftests/bpf/prog_tests/sk_assign.c > index 1f0afcc20c48..ba661145518a 100644 > --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c > +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c > @@ -192,7 +191,7 @@ static int do_sk_assign(void) > goto out; > > /* Connect to unbound ports */ > - addr4.sin_port = htons(TEST_DPORT); > + addr4.sin_port = htons(666); > addr6.sin6_port = htons(TEST_DPORT); > > test__start_subtest("ipv4 port redir"); > > (I had to drop the kill() call as well, but that's part of the next > revision of this series..) > >>> +static int connect_to_server(const struct sockaddr *addr, socklen_t len) >>> +{ >>> + int fd = -1; >>> + >>> + fd = socket(addr->sa_family, SOCK_STREAM, 0); >>> + if (CHECK_FAIL(fd == -1)) >>> + goto out; >>> + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) >>> + goto out; >> >> should this goto close_out? > > Will fix. > >>> +void test_sk_assign(void) >>> +{ >>> + int self_net; >>> + >>> + self_net = open(NS_SELF, O_RDONLY); >>> + if (CHECK_FAIL(self_net < 0)) { >>> + perror("Unable to open "NS_SELF); >>> + return; >>> + } >>> + >>> + if (!configure_stack(self_net)) { >>> + perror("configure_stack"); >>> + goto cleanup; >>> + } >>> + >>> + do_sk_assign(); >>> + >>> +cleanup: >>> + close(self_net); >> >> Did we exit the newly unshared net namespace and restored the previous >> namespace? > > Ah I've mainly just been running this test so it didn't affect me but > I realise now I dropped the hunks that were intended to do this > cleanup. Will fix. > >>> + /* We can't do a single skc_lookup_tcp here, because then the compiler >>> + * will likely spill tuple_len to the stack. This makes it lose all >>> + * bounds information in the verifier, which then rejects the call as >>> + * unsafe. >>> + */ >> >> This is a known issue. For scalars, only constant is restored properly >> in verifier at this moment. I did some hacking before to enable any >> scalars. The fear is this will make pruning performs worse. More >> study is needed here. > > Thanks for the background. Do you want me to refer to any specific > release version or date or commit for this comment or it's fine to > leave as-is? Maybe add a "workaround:" marker in the comments so later we can search and find these examples if we have compiler/verifier improvements. -bash-4.4$ egrep -ri workaround test_get_stack_rawtp.c: * This is an acceptable workaround since there is one entry here. test_seg6_loop.c: // workaround: define induction variable "i" as "long" instead test_sysctl_loop1.c: /* a workaround to prevent compiler from generating -bash-4.4$
On Wed, Mar 25, 2020 at 3:01 PM Yonghong Song <yhs@fb.com> wrote: > > > > On 3/25/20 2:20 PM, Joe Stringer wrote: > > On Wed, Mar 25, 2020 at 11:18 AM Yonghong Song <yhs@fb.com> wrote: > >> > >> > >> > >> On 3/24/20 10:57 PM, Joe Stringer wrote: > >>> From: Lorenz Bauer <lmb@cloudflare.com> > >>> > >>> Attach a tc direct-action classifier to lo in a fresh network > >>> namespace, and rewrite all connection attempts to localhost:4321 > >>> to localhost:1234 (for port tests) and connections to unreachable > >>> IPv4/IPv6 IPs to the local socket (for address tests). > >>> > >>> Keep in mind that both client to server and server to client traffic > >>> passes the classifier. > >>> > >>> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > >>> Co-authored-by: Joe Stringer <joe@wand.net.nz> > >>> Signed-off-by: Joe Stringer <joe@wand.net.nz> > >>> --- > >>> v2: Rebase onto test_progs infrastructure > >>> v1: Initial commit > >>> --- > >>> tools/testing/selftests/bpf/Makefile | 2 +- > >>> .../selftests/bpf/prog_tests/sk_assign.c | 244 ++++++++++++++++++ > >>> .../selftests/bpf/progs/test_sk_assign.c | 127 +++++++++ > >>> 3 files changed, 372 insertions(+), 1 deletion(-) > >>> create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c > >>> create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c > >>> > >>> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile > >>> index 7729892e0b04..4f7f83d059ca 100644 > >>> --- a/tools/testing/selftests/bpf/Makefile > >>> +++ b/tools/testing/selftests/bpf/Makefile > >>> @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ > >>> # Compile but not part of 'make run_tests' > >>> TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ > >>> flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ > >>> - test_lirc_mode2_user xdping test_cpp runqslower > >>> + test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign > >> > >> No test_sk_assign any more as the test is integrated into test_progs, right? > > > > I'll fix it up. > > > >>> +static __u32 duration; > >>> + > >>> +static bool configure_stack(int self_net) > >> > >> self_net parameter is not used. > > > > Hrm, why didn't the compiler tell me this..? Will fix. > > > >>> +{ > >>> + /* Move to a new networking namespace */ > >>> + if (CHECK_FAIL(unshare(CLONE_NEWNET))) > >>> + return false; > >> > >> You can use CHECK to encode better error messages. Thhis is what > >> most test_progs tests are using. > > > > I was going back and forth on this when I was writing this bit. > > CHECK_FAIL() already prints the line that fails, so when debugging > > it's pretty clear what call went wrong if you dig into the code. > > Combine with perror() and you actually get a readable string of the > > error, whereas the common form for CHECK() seems to be just printing > > the error code which the developer then has to do symbol lookup to > > interpret.. > > > > if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno)) > > > > Example output with CHECK_FAIL / perror approach: > > > > # ./test_progs -t assign > > ... > > Timed out while connecting to server > > connect_to_server:FAIL:90 > > Cannot connect to server: Interrupted system call > > #46/1 ipv4 port redir:FAIL > > #46 sk_assign:FAIL > > Summary: 0/0 PASSED, 0 SKIPPED, 2 FAILED > > I won't insist since CHECK_FAIL should roughly provide enough > information for failure. CHECK might be more useful if you want > to provide more context, esp. if the same routine is called > in multiple places and you can have a marker to differentiate > which call site caused the problem. Good point, maybe for extra context the subtests can use CHECK() in addition to the CHECK_FAIL. > But again, just a suggestion. CHECK_FAIL is okay to me. <snip> > >>> + /* We can't do a single skc_lookup_tcp here, because then the compiler > >>> + * will likely spill tuple_len to the stack. This makes it lose all > >>> + * bounds information in the verifier, which then rejects the call as > >>> + * unsafe. > >>> + */ > >> > >> This is a known issue. For scalars, only constant is restored properly > >> in verifier at this moment. I did some hacking before to enable any > >> scalars. The fear is this will make pruning performs worse. More > >> study is needed here. > > > > Thanks for the background. Do you want me to refer to any specific > > release version or date or commit for this comment or it's fine to > > leave as-is? > > Maybe add a "workaround:" marker in the comments so later we can search > and find these examples if we have compiler/verifier improvements. > > -bash-4.4$ egrep -ri workaround > test_get_stack_rawtp.c: * This is an acceptable workaround since there > is one entry here. > test_seg6_loop.c: // workaround: define induction variable "i" as > "long" instead > test_sysctl_loop1.c: /* a workaround to prevent compiler from generating > -bash-4.4$ SGTM, Will roll that in thanks.
On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote: > > From: Lorenz Bauer <lmb@cloudflare.com> > > Attach a tc direct-action classifier to lo in a fresh network > namespace, and rewrite all connection attempts to localhost:4321 > to localhost:1234 (for port tests) and connections to unreachable > IPv4/IPv6 IPs to the local socket (for address tests). > > Keep in mind that both client to server and server to client traffic > passes the classifier. > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > Co-authored-by: Joe Stringer <joe@wand.net.nz> > Signed-off-by: Joe Stringer <joe@wand.net.nz> > --- Can you please check that you test fails (instead of getting stuck) when there is something wrong with network. We went through this exercise with tcp_rtt and sockmap_listen, where a bunch of stuff was blocking. This works fine when everything works, but horribly breaks when something is not working. Given this is part of test_progs, let's please make sure we don't deadlock anywhere. > v2: Rebase onto test_progs infrastructure > v1: Initial commit > --- > tools/testing/selftests/bpf/Makefile | 2 +- > .../selftests/bpf/prog_tests/sk_assign.c | 244 ++++++++++++++++++ > .../selftests/bpf/progs/test_sk_assign.c | 127 +++++++++ > 3 files changed, 372 insertions(+), 1 deletion(-) > create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c > create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c > [...]
On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote: > > From: Lorenz Bauer <lmb@cloudflare.com> > > Attach a tc direct-action classifier to lo in a fresh network > namespace, and rewrite all connection attempts to localhost:4321 > to localhost:1234 (for port tests) and connections to unreachable > IPv4/IPv6 IPs to the local socket (for address tests). > > Keep in mind that both client to server and server to client traffic > passes the classifier. > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > Co-authored-by: Joe Stringer <joe@wand.net.nz> > Signed-off-by: Joe Stringer <joe@wand.net.nz> > --- > v2: Rebase onto test_progs infrastructure > v1: Initial commit > --- > tools/testing/selftests/bpf/Makefile | 2 +- > .../selftests/bpf/prog_tests/sk_assign.c | 244 ++++++++++++++++++ > .../selftests/bpf/progs/test_sk_assign.c | 127 +++++++++ > 3 files changed, 372 insertions(+), 1 deletion(-) > create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c > create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c > > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile > index 7729892e0b04..4f7f83d059ca 100644 > --- a/tools/testing/selftests/bpf/Makefile > +++ b/tools/testing/selftests/bpf/Makefile > @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ > # Compile but not part of 'make run_tests' > TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ > flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ > - test_lirc_mode2_user xdping test_cpp runqslower > + test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign > > TEST_CUSTOM_PROGS = urandom_read > > diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c > new file mode 100644 > index 000000000000..1f0afcc20c48 > --- /dev/null > +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c > @@ -0,0 +1,244 @@ > +// SPDX-License-Identifier: GPL-2.0 > +// Copyright (c) 2018 Facebook > +// Copyright (c) 2019 Cloudflare > +// Copyright (c) 2020 Isovalent, Inc. > +/* > + * Test that the socket assign program is able to redirect traffic towards a > + * socket, regardless of whether the port or address destination of the traffic > + * matches the port. > + */ > + > +#define _GNU_SOURCE > +#include <fcntl.h> > +#include <signal.h> > +#include <stdlib.h> > +#include <unistd.h> > + > +#include "test_progs.h" > + > +#define TEST_DPORT 4321 > +#define TEST_DADDR (0xC0A80203) > +#define NS_SELF "/proc/self/ns/net" > + > +static __u32 duration; > + > +static bool configure_stack(int self_net) > +{ > + /* Move to a new networking namespace */ > + if (CHECK_FAIL(unshare(CLONE_NEWNET))) > + return false; > + > + /* Configure necessary links, routes */ > + if (CHECK_FAIL(system("ip link set dev lo up"))) > + return false; > + if (CHECK_FAIL(system("ip route add local default dev lo"))) > + return false; > + if (CHECK_FAIL(system("ip -6 route add local default dev lo"))) > + return false; > + > + /* Load qdisc, BPF program */ > + if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) > + return false; > + if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action " > + "object-file ./test_sk_assign.o section sk_assign_test"))) > + return false; > + > + return true; > +} > + > +static int start_server(const struct sockaddr *addr, socklen_t len) > +{ > + int fd; > + > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > + if (CHECK_FAIL(fd == -1)) > + goto out; > + if (CHECK_FAIL(bind(fd, addr, len) == -1)) > + goto close_out; > + if (CHECK_FAIL(listen(fd, 128) == -1)) > + goto close_out; > + > + goto out; > + > +close_out: > + close(fd); > + fd = -1; > +out: > + return fd; > +} > + > +static void handle_timeout(int signum) > +{ > + if (signum == SIGALRM) > + fprintf(stderr, "Timed out while connecting to server\n"); > + kill(0, SIGKILL); > +} > + > +static struct sigaction timeout_action = { > + .sa_handler = handle_timeout, > +}; > + > +static int connect_to_server(const struct sockaddr *addr, socklen_t len) > +{ > + int fd = -1; > + > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > + if (CHECK_FAIL(fd == -1)) > + goto out; > + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) > + goto out; no-no-no, we are not doing this. It's part of prog_tests and shouldn't install its own signal handlers and sending asynchronous signals to itself. Please find another way to have a timeout. > + alarm(3); > + if (CHECK_FAIL(connect(fd, addr, len) == -1)) > + goto close_out; > + > + goto out; > + > +close_out: > + close(fd); > + fd = -1; > +out: > + return fd; > +} > + > +static in_port_t get_port(int fd) > +{ > + struct sockaddr_storage name; > + socklen_t len; > + in_port_t port = 0; > + > + len = sizeof(name); > + if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len))) > + return port; > + > + switch (name.ss_family) { > + case AF_INET: > + port = ((struct sockaddr_in *)&name)->sin_port; > + break; > + case AF_INET6: > + port = ((struct sockaddr_in6 *)&name)->sin6_port; > + break; > + default: > + CHECK(1, "Invalid address family", "%d\n", name.ss_family); > + } > + return port; > +} > + > +static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len) > +{ > + int client = -1, srv_client = -1; > + char buf[] = "testing"; > + in_port_t port; > + int ret = 1; > + > + client = connect_to_server(addr, len); > + if (client == -1) { > + perror("Cannot connect to server"); > + goto out; > + } > + > + srv_client = accept(server_fd, NULL, NULL); > + if (CHECK_FAIL(srv_client == -1)) { > + perror("Can't accept connection"); > + goto out; > + } > + if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) { > + perror("Can't write on client"); > + goto out; > + } > + if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) { > + perror("Can't read on server"); > + goto out; > + } > + > + port = get_port(srv_client); > + if (CHECK_FAIL(!port)) > + goto out; > + if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u", > + TEST_DPORT, ntohs(port))) > + goto out; > + > + ret = 0; > +out: > + close(client); > + close(srv_client); > + return ret; > +} > + > +static int do_sk_assign(void) > +{ > + struct sockaddr_in addr4; > + struct sockaddr_in6 addr6; > + int server = -1; > + int server_v6 = -1; > + int err = 1; > + > + memset(&addr4, 0, sizeof(addr4)); > + addr4.sin_family = AF_INET; > + addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); > + addr4.sin_port = htons(1234); > + > + memset(&addr6, 0, sizeof(addr6)); > + addr6.sin6_family = AF_INET6; > + addr6.sin6_addr = in6addr_loopback; > + addr6.sin6_port = htons(1234); > + > + server = start_server((const struct sockaddr *)&addr4, sizeof(addr4)); > + if (server == -1) > + goto out; > + > + server_v6 = start_server((const struct sockaddr *)&addr6, > + sizeof(addr6)); > + if (server_v6 == -1) > + goto out; > + > + /* Connect to unbound ports */ > + addr4.sin_port = htons(TEST_DPORT); > + addr6.sin6_port = htons(TEST_DPORT); > + > + test__start_subtest("ipv4 port redir"); > + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) > + goto out; > + > + test__start_subtest("ipv6 port redir"); > + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) > + goto out; > + > + /* Connect to unbound addresses */ > + addr4.sin_addr.s_addr = htonl(TEST_DADDR); > + addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR); > + > + test__start_subtest("ipv4 addr redir"); > + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) > + goto out; > + > + test__start_subtest("ipv6 addr redir"); test__start_subtest() returns false if subtest is supposed to be skipped. If you ignore that, then test_progs -t and -n filtering won't work properly. > + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) > + goto out; > + > + err = 0; > +out: > + close(server); > + close(server_v6); > + return err; > +} > + > +void test_sk_assign(void) > +{ > + int self_net; > + > + self_net = open(NS_SELF, O_RDONLY); I'm not familiar with what this does. Can you please explain briefly what are the side effects of this? Asking because of shared test_progs environment worries, of course. > + if (CHECK_FAIL(self_net < 0)) { > + perror("Unable to open "NS_SELF); > + return; > + } > + > + if (!configure_stack(self_net)) { > + perror("configure_stack"); > + goto cleanup; > + } > + > + do_sk_assign(); > + > +cleanup: > + close(self_net); > +} > diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c > new file mode 100644 > index 000000000000..7de30ad3f594 > --- /dev/null > +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c > @@ -0,0 +1,127 @@ > +// SPDX-License-Identifier: GPL-2.0 > +// Copyright (c) 2019 Cloudflare Ltd. > + > +#include <stddef.h> > +#include <stdbool.h> > +#include <string.h> > +#include <linux/bpf.h> > +#include <linux/if_ether.h> > +#include <linux/in.h> > +#include <linux/ip.h> > +#include <linux/ipv6.h> > +#include <linux/pkt_cls.h> > +#include <linux/tcp.h> > +#include <sys/socket.h> > +#include <bpf/bpf_helpers.h> > +#include <bpf/bpf_endian.h> > + > +int _version SEC("version") = 1; > +char _license[] SEC("license") = "GPL"; > + > +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */ > +static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off, > + void *data_end, __u16 eth_proto, > + bool *ipv4) > +{ > + struct bpf_sock_tuple *result; > + __u8 proto = 0; > + __u64 ihl_len; > + > + if (eth_proto == bpf_htons(ETH_P_IP)) { > + struct iphdr *iph = (struct iphdr *)(data + nh_off); > + > + if (iph + 1 > data_end) > + return NULL; > + if (iph->ihl != 5) > + /* Options are not supported */ > + return NULL; > + ihl_len = iph->ihl * 4; > + proto = iph->protocol; > + *ipv4 = true; > + result = (struct bpf_sock_tuple *)&iph->saddr; > + } else if (eth_proto == bpf_htons(ETH_P_IPV6)) { > + struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off); > + > + if (ip6h + 1 > data_end) > + return NULL; > + ihl_len = sizeof(*ip6h); > + proto = ip6h->nexthdr; > + *ipv4 = false; > + result = (struct bpf_sock_tuple *)&ip6h->saddr; > + } else { > + return NULL; > + } > + > + if (result + 1 > data_end || proto != IPPROTO_TCP) > + return NULL; > + > + return result; > +} > + > +SEC("sk_assign_test") Please use "canonical" section name that libbpf recognizes. This woulde be "action/" or "classifier/", right? > +int bpf_sk_assign_test(struct __sk_buff *skb) > +{ > + void *data_end = (void *)(long)skb->data_end; > + void *data = (void *)(long)skb->data; > + struct ethhdr *eth = (struct ethhdr *)(data); > + struct bpf_sock_tuple *tuple, ln = {0}; > + struct bpf_sock *sk; > + int tuple_len; > + bool ipv4; > + int ret; > + > + if (eth + 1 > data_end) > + return TC_ACT_SHOT; > + > + tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4); > + if (!tuple) > + return TC_ACT_SHOT; > + > + tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); > + sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); > + if (sk) { > + if (sk->state != BPF_TCP_LISTEN) > + goto assign; > + > + bpf_sk_release(sk); > + } > + > + if (ipv4) { > + if (tuple->ipv4.dport != bpf_htons(4321)) > + return TC_ACT_OK; > + > + ln.ipv4.daddr = bpf_htonl(0x7f000001); > + ln.ipv4.dport = bpf_htons(1234); > + > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), > + BPF_F_CURRENT_NETNS, 0); > + } else { > + if (tuple->ipv6.dport != bpf_htons(4321)) > + return TC_ACT_OK; > + > + /* Upper parts of daddr are already zero. */ > + ln.ipv6.daddr[3] = bpf_htonl(0x1); > + ln.ipv6.dport = bpf_htons(1234); > + > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), > + BPF_F_CURRENT_NETNS, 0); > + } > + > + /* We can't do a single skc_lookup_tcp here, because then the compiler > + * will likely spill tuple_len to the stack. This makes it lose all > + * bounds information in the verifier, which then rejects the call as > + * unsafe. > + */ > + if (!sk) > + return TC_ACT_SHOT; > + > + if (sk->state != BPF_TCP_LISTEN) { > + bpf_sk_release(sk); > + return TC_ACT_SHOT; > + } > + > +assign: > + ret = bpf_sk_assign(skb, sk, 0); > + bpf_sk_release(sk); > + return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT; > +} > -- > 2.20.1 >
On Wed, Mar 25, 2020 at 7:16 PM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote: > > > > From: Lorenz Bauer <lmb@cloudflare.com> > > > > Attach a tc direct-action classifier to lo in a fresh network > > namespace, and rewrite all connection attempts to localhost:4321 > > to localhost:1234 (for port tests) and connections to unreachable > > IPv4/IPv6 IPs to the local socket (for address tests). > > > > Keep in mind that both client to server and server to client traffic > > passes the classifier. > > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > > Co-authored-by: Joe Stringer <joe@wand.net.nz> > > Signed-off-by: Joe Stringer <joe@wand.net.nz> > > --- <snip> > > +static void handle_timeout(int signum) > > +{ > > + if (signum == SIGALRM) > > + fprintf(stderr, "Timed out while connecting to server\n"); > > + kill(0, SIGKILL); > > +} > > + > > +static struct sigaction timeout_action = { > > + .sa_handler = handle_timeout, > > +}; > > + > > +static int connect_to_server(const struct sockaddr *addr, socklen_t len) > > +{ > > + int fd = -1; > > + > > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > > + if (CHECK_FAIL(fd == -1)) > > + goto out; > > + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) > > + goto out; > > no-no-no, we are not doing this. It's part of prog_tests and shouldn't > install its own signal handlers and sending asynchronous signals to > itself. Please find another way to have a timeout. I realise it didn't clean up after itself. How about signal(SIGALRM, SIG_DFL); just like other existing tests do? > > + test__start_subtest("ipv4 addr redir"); > > + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) > > + goto out; > > + > > + test__start_subtest("ipv6 addr redir"); > > test__start_subtest() returns false if subtest is supposed to be > skipped. If you ignore that, then test_progs -t and -n filtering won't > work properly. Will fix. > > + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) > > + goto out; > > + > > + err = 0; > > +out: > > + close(server); > > + close(server_v6); > > + return err; > > +} > > + > > +void test_sk_assign(void) > > +{ > > + int self_net; > > + > > + self_net = open(NS_SELF, O_RDONLY); > > I'm not familiar with what this does. Can you please explain briefly > what are the side effects of this? Asking because of shared test_progs > environment worries, of course. This one is opening an fd to the current program's netns path on the filesystem. The intention was to use it to switch back to the current netns after the unshare() call elsewhere which switches to a new netns. As per the other feedback the bit where it switches back to this netns was dropped along the way so I'll fix it up in the next revision. > > +SEC("sk_assign_test") > > Please use "canonical" section name that libbpf recognizes. This > woulde be "action/" or "classifier/", right? Will fix.
On Wed, Mar 25, 2020 at 01:55:59PM -0700, Joe Stringer wrote: > On Wed, Mar 25, 2020 at 3:35 AM Lorenz Bauer <lmb@cloudflare.com> wrote: > > > > On Wed, 25 Mar 2020 at 05:58, Joe Stringer <joe@wand.net.nz> wrote: > > > > > > From: Lorenz Bauer <lmb@cloudflare.com> > > > > > > Attach a tc direct-action classifier to lo in a fresh network > > > namespace, and rewrite all connection attempts to localhost:4321 > > > to localhost:1234 (for port tests) and connections to unreachable > > > IPv4/IPv6 IPs to the local socket (for address tests). > > > > Can you extend this to cover UDP as well? > > I'm working on a follow-up series for UDP, we need this too. Other than selftests, what are the changes for UDP in patch 1 - 4?
On Wed, Mar 25, 2020 at 10:28:11PM -0700, Joe Stringer wrote: > On Wed, Mar 25, 2020 at 7:16 PM Andrii Nakryiko > <andrii.nakryiko@gmail.com> wrote: > > > > On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote: > > > > > > From: Lorenz Bauer <lmb@cloudflare.com> > > > > > > Attach a tc direct-action classifier to lo in a fresh network > > > namespace, and rewrite all connection attempts to localhost:4321 > > > to localhost:1234 (for port tests) and connections to unreachable > > > IPv4/IPv6 IPs to the local socket (for address tests). > > > > > > Keep in mind that both client to server and server to client traffic > > > passes the classifier. > > > > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > > > Co-authored-by: Joe Stringer <joe@wand.net.nz> > > > Signed-off-by: Joe Stringer <joe@wand.net.nz> > > > --- > > <snip> > > > > +static void handle_timeout(int signum) > > > +{ > > > + if (signum == SIGALRM) > > > + fprintf(stderr, "Timed out while connecting to server\n"); > > > + kill(0, SIGKILL); > > > +} > > > + > > > +static struct sigaction timeout_action = { > > > + .sa_handler = handle_timeout, > > > +}; > > > + > > > +static int connect_to_server(const struct sockaddr *addr, socklen_t len) > > > +{ > > > + int fd = -1; > > > + > > > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > > > + if (CHECK_FAIL(fd == -1)) > > > + goto out; > > > + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) > > > + goto out; > > > > no-no-no, we are not doing this. It's part of prog_tests and shouldn't > > install its own signal handlers and sending asynchronous signals to > > itself. Please find another way to have a timeout. > > I realise it didn't clean up after itself. How about signal(SIGALRM, > SIG_DFL); just like other existing tests do? or setsockopt SO_SNDTIMEO?
On Wed, Mar 25, 2020 at 11:25 PM Martin KaFai Lau <kafai@fb.com> wrote: > > On Wed, Mar 25, 2020 at 01:55:59PM -0700, Joe Stringer wrote: > > On Wed, Mar 25, 2020 at 3:35 AM Lorenz Bauer <lmb@cloudflare.com> wrote: > > > > > > On Wed, 25 Mar 2020 at 05:58, Joe Stringer <joe@wand.net.nz> wrote: > > > > > > > > From: Lorenz Bauer <lmb@cloudflare.com> > > > > > > > > Attach a tc direct-action classifier to lo in a fresh network > > > > namespace, and rewrite all connection attempts to localhost:4321 > > > > to localhost:1234 (for port tests) and connections to unreachable > > > > IPv4/IPv6 IPs to the local socket (for address tests). > > > > > > Can you extend this to cover UDP as well? > > > > I'm working on a follow-up series for UDP, we need this too. > Other than selftests, what are the changes for UDP in patch 1 - 4? Nothing in those patches, I have refactoring of all of the socket helpers, skc_lookup_udp() and adding flags to the socket lookup functions to support only looking for a certain type of sockets - established or listen. This helps to avoid multiple lookups in these cases where you really just want to look up established sockets with the packet tuple first then look up the listener socket with the unrelated/tproxy tuple. For UDP it makes it easier to find the correct socket and in general (including TCP) helps to avoid up to two socket hashtable lookups for this use case. This part is because the current helpers all look up the established socket first then the listener socket, so for the first packet that hits these we perform both of these lookups for the packet tuple (which finds nothing), then look up an established socket for the target tuple (which finds nothing) then finally a listen socket for the target tuple. It's about another 300+ / 250- changes overall, of which a large chunk is one patch that refactors the code into macros. I haven't narrowed down for sure whether the lookup flags patch is required for UDP cases yet.
On Wed, 25 Mar 2020 at 18:17, Yonghong Song <yhs@fb.com> wrote: > > > > On 3/24/20 10:57 PM, Joe Stringer wrote: > > From: Lorenz Bauer <lmb@cloudflare.com> > > > > Attach a tc direct-action classifier to lo in a fresh network > > namespace, and rewrite all connection attempts to localhost:4321 > > to localhost:1234 (for port tests) and connections to unreachable > > IPv4/IPv6 IPs to the local socket (for address tests). > > > > Keep in mind that both client to server and server to client traffic > > passes the classifier. > > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > > Co-authored-by: Joe Stringer <joe@wand.net.nz> > > Signed-off-by: Joe Stringer <joe@wand.net.nz> > > --- > > v2: Rebase onto test_progs infrastructure > > v1: Initial commit > > --- > > tools/testing/selftests/bpf/Makefile | 2 +- > > .../selftests/bpf/prog_tests/sk_assign.c | 244 ++++++++++++++++++ > > .../selftests/bpf/progs/test_sk_assign.c | 127 +++++++++ > > 3 files changed, 372 insertions(+), 1 deletion(-) > > create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c > > create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c > > > > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile > > index 7729892e0b04..4f7f83d059ca 100644 > > --- a/tools/testing/selftests/bpf/Makefile > > +++ b/tools/testing/selftests/bpf/Makefile > > @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ > > # Compile but not part of 'make run_tests' > > TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ > > flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ > > - test_lirc_mode2_user xdping test_cpp runqslower > > + test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign > > No test_sk_assign any more as the test is integrated into test_progs, right? > > > > > TEST_CUSTOM_PROGS = urandom_read > > > > diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c > > new file mode 100644 > > index 000000000000..1f0afcc20c48 > > --- /dev/null > > +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c > > @@ -0,0 +1,244 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +// Copyright (c) 2018 Facebook > > +// Copyright (c) 2019 Cloudflare > > +// Copyright (c) 2020 Isovalent, Inc. > > +/* > > + * Test that the socket assign program is able to redirect traffic towards a > > + * socket, regardless of whether the port or address destination of the traffic > > + * matches the port. > > + */ > > + > > +#define _GNU_SOURCE > > +#include <fcntl.h> > > +#include <signal.h> > > +#include <stdlib.h> > > +#include <unistd.h> > > + > > +#include "test_progs.h" > > + > > +#define TEST_DPORT 4321 > > +#define TEST_DADDR (0xC0A80203) > > +#define NS_SELF "/proc/self/ns/net" > > + > > +static __u32 duration; > > + > > +static bool configure_stack(int self_net) > > self_net parameter is not used. > > > +{ > > + /* Move to a new networking namespace */ > > + if (CHECK_FAIL(unshare(CLONE_NEWNET))) > > + return false; > > You can use CHECK to encode better error messages. Thhis is what > most test_progs tests are using. > > > + /* Configure necessary links, routes */ > > + if (CHECK_FAIL(system("ip link set dev lo up"))) > > + return false; > > + if (CHECK_FAIL(system("ip route add local default dev lo"))) > > + return false; > > + if (CHECK_FAIL(system("ip -6 route add local default dev lo"))) > > + return false; > > + > > + /* Load qdisc, BPF program */ > > + if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) > > + return false; > > + if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action " > > + "object-file ./test_sk_assign.o section sk_assign_test"))) > > + return false; > > + > > + return true; > > +} > > + > > +static int start_server(const struct sockaddr *addr, socklen_t len) > > +{ > > + int fd; > > + > > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > > + if (CHECK_FAIL(fd == -1)) > > + goto out; > > + if (CHECK_FAIL(bind(fd, addr, len) == -1)) > > + goto close_out; > > + if (CHECK_FAIL(listen(fd, 128) == -1)) > > + goto close_out; > > + > > + goto out; > > + > > +close_out: > > + close(fd); > > + fd = -1; > > +out: > > + return fd; > > +} > > + > > +static void handle_timeout(int signum) > > +{ > > + if (signum == SIGALRM) > > + fprintf(stderr, "Timed out while connecting to server\n"); > > + kill(0, SIGKILL); > > +} > > + > > +static struct sigaction timeout_action = { > > + .sa_handler = handle_timeout, > > +}; > > + > > +static int connect_to_server(const struct sockaddr *addr, socklen_t len) > > +{ > > + int fd = -1; > > + > > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > > + if (CHECK_FAIL(fd == -1)) > > + goto out; > > + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) > > + goto out; > > should this goto close_out? > > > + alarm(3); > > + if (CHECK_FAIL(connect(fd, addr, len) == -1)) > > + goto close_out; > > + > > + goto out; > > + > > +close_out: > > + close(fd); > > + fd = -1; > > +out: > > + return fd; > > +} > > + > > +static in_port_t get_port(int fd) > > +{ > > + struct sockaddr_storage name; > > + socklen_t len; > > + in_port_t port = 0; > > + > > + len = sizeof(name); > > + if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len))) > > + return port; > > + > > + switch (name.ss_family) { > > + case AF_INET: > > + port = ((struct sockaddr_in *)&name)->sin_port; > > + break; > > + case AF_INET6: > > + port = ((struct sockaddr_in6 *)&name)->sin6_port; > > + break; > > + default: > > + CHECK(1, "Invalid address family", "%d\n", name.ss_family); > > + } > > + return port; > > +} > > + > > +static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len) > > +{ > > + int client = -1, srv_client = -1; > > + char buf[] = "testing"; > > + in_port_t port; > > + int ret = 1; > > + > > + client = connect_to_server(addr, len); > > + if (client == -1) { > > + perror("Cannot connect to server"); > > + goto out; > > + } > > + > > + srv_client = accept(server_fd, NULL, NULL); > > + if (CHECK_FAIL(srv_client == -1)) { > > + perror("Can't accept connection"); > > + goto out; > > + } > > + if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) { > > + perror("Can't write on client"); > > + goto out; > > + } > > + if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) { > > + perror("Can't read on server"); > > + goto out; > > + } > > + > > + port = get_port(srv_client); > > + if (CHECK_FAIL(!port)) > > + goto out; > > + if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u", > > + TEST_DPORT, ntohs(port))) > > + goto out; > > + > > + ret = 0; > > +out: > > + close(client); > > + close(srv_client); > > + return ret; > > +} > > + > > +static int do_sk_assign(void) > > +{ > > + struct sockaddr_in addr4; > > + struct sockaddr_in6 addr6; > > + int server = -1; > > + int server_v6 = -1; > > + int err = 1; > > + > > + memset(&addr4, 0, sizeof(addr4)); > > + addr4.sin_family = AF_INET; > > + addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); > > + addr4.sin_port = htons(1234); > > + > > + memset(&addr6, 0, sizeof(addr6)); > > + addr6.sin6_family = AF_INET6; > > + addr6.sin6_addr = in6addr_loopback; > > + addr6.sin6_port = htons(1234); > > + > > + server = start_server((const struct sockaddr *)&addr4, sizeof(addr4)); > > + if (server == -1) > > + goto out; > > + > > + server_v6 = start_server((const struct sockaddr *)&addr6, > > + sizeof(addr6)); > > + if (server_v6 == -1) > > + goto out; > > + > > + /* Connect to unbound ports */ > > + addr4.sin_port = htons(TEST_DPORT); > > + addr6.sin6_port = htons(TEST_DPORT); > > + > > + test__start_subtest("ipv4 port redir"); > > + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) > > + goto out; > > + > > + test__start_subtest("ipv6 port redir"); > > + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) > > + goto out; > > + > > + /* Connect to unbound addresses */ > > + addr4.sin_addr.s_addr = htonl(TEST_DADDR); > > + addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR); > > + > > + test__start_subtest("ipv4 addr redir"); > > + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) > > + goto out; > > + > > + test__start_subtest("ipv6 addr redir"); > > + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) > > + goto out; > > + > > + err = 0; > > +out: > > + close(server); > > + close(server_v6); > > + return err; > > +} > > + > > +void test_sk_assign(void) > > +{ > > + int self_net; > > + > > + self_net = open(NS_SELF, O_RDONLY); > > + if (CHECK_FAIL(self_net < 0)) { > > + perror("Unable to open "NS_SELF); > > + return; > > + } > > + > > + if (!configure_stack(self_net)) { > > + perror("configure_stack"); > > + goto cleanup; > > + } > > + > > + do_sk_assign(); > > + > > +cleanup: > > + close(self_net); > > Did we exit the newly unshared net namespace and restored the previous > namespace? > > > +} > > diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c > > new file mode 100644 > > index 000000000000..7de30ad3f594 > > --- /dev/null > > +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c > > @@ -0,0 +1,127 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +// Copyright (c) 2019 Cloudflare Ltd. > > + > > +#include <stddef.h> > > +#include <stdbool.h> > > +#include <string.h> > > +#include <linux/bpf.h> > > +#include <linux/if_ether.h> > > +#include <linux/in.h> > > +#include <linux/ip.h> > > +#include <linux/ipv6.h> > > +#include <linux/pkt_cls.h> > > +#include <linux/tcp.h> > > +#include <sys/socket.h> > > +#include <bpf/bpf_helpers.h> > > +#include <bpf/bpf_endian.h> > > + > > +int _version SEC("version") = 1; > > +char _license[] SEC("license") = "GPL"; > > + > > +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */ > > +static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off, > > + void *data_end, __u16 eth_proto, > > + bool *ipv4) > > +{ > > + struct bpf_sock_tuple *result; > > + __u8 proto = 0; > > + __u64 ihl_len; > > + > > + if (eth_proto == bpf_htons(ETH_P_IP)) { > > + struct iphdr *iph = (struct iphdr *)(data + nh_off); > > + > > + if (iph + 1 > data_end) > > + return NULL; > > + if (iph->ihl != 5) > > + /* Options are not supported */ > > + return NULL; > > + ihl_len = iph->ihl * 4; > > + proto = iph->protocol; > > + *ipv4 = true; > > + result = (struct bpf_sock_tuple *)&iph->saddr; > > + } else if (eth_proto == bpf_htons(ETH_P_IPV6)) { > > + struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off); > > + > > + if (ip6h + 1 > data_end) > > + return NULL; > > + ihl_len = sizeof(*ip6h); > > + proto = ip6h->nexthdr; > > + *ipv4 = false; > > + result = (struct bpf_sock_tuple *)&ip6h->saddr; > > + } else { > > + return NULL; > > + } > > + > > + if (result + 1 > data_end || proto != IPPROTO_TCP) > > + return NULL; > > + > > + return result; > > +} > > + > > +SEC("sk_assign_test") > > +int bpf_sk_assign_test(struct __sk_buff *skb) > > +{ > > + void *data_end = (void *)(long)skb->data_end; > > + void *data = (void *)(long)skb->data; > > + struct ethhdr *eth = (struct ethhdr *)(data); > > + struct bpf_sock_tuple *tuple, ln = {0}; > > + struct bpf_sock *sk; > > + int tuple_len; > > + bool ipv4; > > + int ret; > > + > > + if (eth + 1 > data_end) > > + return TC_ACT_SHOT; > > + > > + tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4); > > + if (!tuple) > > + return TC_ACT_SHOT; > > + > > + tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); > > + sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); > > You can get rid of tuple_len with > if (ipv4) > sk = bpf_skc_lookup_tcp(..., sizeof(tuple->ipv4), ...); > else > sk = bpf_skc_lookup_tcp(..., sizeof(tuple->ipv6), ...); > > and later on you can do common bpf_skc_lookup_tcp. > But it may not be worthwhile to do it, as you have two separate calls > in the above instead. > > > + if (sk) { > > + if (sk->state != BPF_TCP_LISTEN) > > + goto assign; > > + > > + bpf_sk_release(sk); > > + } > > + > > + if (ipv4) { > > + if (tuple->ipv4.dport != bpf_htons(4321)) > > + return TC_ACT_OK; > > + > > + ln.ipv4.daddr = bpf_htonl(0x7f000001); > > + ln.ipv4.dport = bpf_htons(1234); > > + > > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), > > + BPF_F_CURRENT_NETNS, 0); > > + } else { > > + if (tuple->ipv6.dport != bpf_htons(4321)) > > + return TC_ACT_OK; > > + > > + /* Upper parts of daddr are already zero. */ > > + ln.ipv6.daddr[3] = bpf_htonl(0x1); > > + ln.ipv6.dport = bpf_htons(1234); > > + > > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), > > + BPF_F_CURRENT_NETNS, 0); > > + } > > + > > + /* We can't do a single skc_lookup_tcp here, because then the compiler > > + * will likely spill tuple_len to the stack. This makes it lose all > > + * bounds information in the verifier, which then rejects the call as > > + * unsafe. > > + */ > > This is a known issue. For scalars, only constant is restored properly > in verifier at this moment. I did some hacking before to enable any > scalars. The fear is this will make pruning performs worse. More > study is needed here. Of topic, but: this is actually one of the most challenging issues for us when writing BPF. It forces us to have very deep call graphs to hopefully avoid clang spilling the constants. Please let me know if I can help in any way. > > > + if (!sk) > > + return TC_ACT_SHOT; > > + > > + if (sk->state != BPF_TCP_LISTEN) { > > + bpf_sk_release(sk); > > + return TC_ACT_SHOT; > > + } > > + > > +assign: > > + ret = bpf_sk_assign(skb, sk, 0); > > + bpf_sk_release(sk); > > + return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT; > > +} > >
On Wed, Mar 25, 2020 at 10:28 PM Joe Stringer <joe@wand.net.nz> wrote: > > On Wed, Mar 25, 2020 at 7:16 PM Andrii Nakryiko > <andrii.nakryiko@gmail.com> wrote: > > > > On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote: > > > > > > From: Lorenz Bauer <lmb@cloudflare.com> > > > > > > Attach a tc direct-action classifier to lo in a fresh network > > > namespace, and rewrite all connection attempts to localhost:4321 > > > to localhost:1234 (for port tests) and connections to unreachable > > > IPv4/IPv6 IPs to the local socket (for address tests). > > > > > > Keep in mind that both client to server and server to client traffic > > > passes the classifier. > > > > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > > > Co-authored-by: Joe Stringer <joe@wand.net.nz> > > > Signed-off-by: Joe Stringer <joe@wand.net.nz> > > > --- > > <snip> > > > > +static void handle_timeout(int signum) > > > +{ > > > + if (signum == SIGALRM) > > > + fprintf(stderr, "Timed out while connecting to server\n"); > > > + kill(0, SIGKILL); > > > +} > > > + > > > +static struct sigaction timeout_action = { > > > + .sa_handler = handle_timeout, > > > +}; > > > + > > > +static int connect_to_server(const struct sockaddr *addr, socklen_t len) > > > +{ > > > + int fd = -1; > > > + > > > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > > > + if (CHECK_FAIL(fd == -1)) > > > + goto out; > > > + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) > > > + goto out; > > > > no-no-no, we are not doing this. It's part of prog_tests and shouldn't > > install its own signal handlers and sending asynchronous signals to > > itself. Please find another way to have a timeout. > > I realise it didn't clean up after itself. How about signal(SIGALRM, > SIG_DFL); just like other existing tests do? You have alarm(3), which will deliver SIGALARM later, when other test is running. Once you clean up this custom signal handler it will cause test_progs to coredump. So still no, please find another way to do timeouts. > > > > + test__start_subtest("ipv4 addr redir"); > > > + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) > > > + goto out; > > > + > > > + test__start_subtest("ipv6 addr redir"); > > > > test__start_subtest() returns false if subtest is supposed to be > > skipped. If you ignore that, then test_progs -t and -n filtering won't > > work properly. > > Will fix. > > > > + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) > > > + goto out; > > > + > > > + err = 0; > > > +out: > > > + close(server); > > > + close(server_v6); > > > + return err; > > > +} > > > + > > > +void test_sk_assign(void) > > > +{ > > > + int self_net; > > > + > > > + self_net = open(NS_SELF, O_RDONLY); > > > > I'm not familiar with what this does. Can you please explain briefly > > what are the side effects of this? Asking because of shared test_progs > > environment worries, of course. > > This one is opening an fd to the current program's netns path on the > filesystem. The intention was to use it to switch back to the current > netns after the unshare() call elsewhere which switches to a new > netns. As per the other feedback the bit where it switches back to > this netns was dropped along the way so I'll fix it up in the next > revision. > > > > +SEC("sk_assign_test") > > > > Please use "canonical" section name that libbpf recognizes. This > > woulde be "action/" or "classifier/", right? > > Will fix.
On Thu, Mar 26, 2020 at 10:13:31AM +0000, Lorenz Bauer wrote: > > > + > > > + if (ipv4) { > > > + if (tuple->ipv4.dport != bpf_htons(4321)) > > > + return TC_ACT_OK; > > > + > > > + ln.ipv4.daddr = bpf_htonl(0x7f000001); > > > + ln.ipv4.dport = bpf_htons(1234); > > > + > > > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), > > > + BPF_F_CURRENT_NETNS, 0); > > > + } else { > > > + if (tuple->ipv6.dport != bpf_htons(4321)) > > > + return TC_ACT_OK; > > > + > > > + /* Upper parts of daddr are already zero. */ > > > + ln.ipv6.daddr[3] = bpf_htonl(0x1); > > > + ln.ipv6.dport = bpf_htons(1234); > > > + > > > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), > > > + BPF_F_CURRENT_NETNS, 0); > > > + } > > > + > > > + /* We can't do a single skc_lookup_tcp here, because then the compiler > > > + * will likely spill tuple_len to the stack. This makes it lose all > > > + * bounds information in the verifier, which then rejects the call as > > > + * unsafe. > > > + */ > > > > This is a known issue. For scalars, only constant is restored properly > > in verifier at this moment. I did some hacking before to enable any > > scalars. The fear is this will make pruning performs worse. More > > study is needed here. > > Of topic, but: this is actually one of the most challenging issues for > us when writing > BPF. It forces us to have very deep call graphs to hopefully avoid clang > spilling the constants. Please let me know if I can help in any way. Thanks for bringing this up. Yonghong, please correct me if I'm wrong. I think you've experimented with tracking spilled constants. The first issue came with spilling of 4 byte constant. The verifier tracks 8 byte slots and lots of places assume that slot granularity. It's not clear yet how to refactor the verifier. Ideas, help are greatly appreciated. The second concern was pruning, but iirc the experiments were inconclusive. selftests/bpf only has old fb progs. Hence, I think, the step zero is for everyone to contribute their bpf programs written in C. If we have both cilium and cloudflare progs as selftests it will help a lot to guide such long lasting verifier decisions.
On Thu, Mar 26, 2020 at 12:37 PM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Wed, Mar 25, 2020 at 10:28 PM Joe Stringer <joe@wand.net.nz> wrote: > > > > On Wed, Mar 25, 2020 at 7:16 PM Andrii Nakryiko > > <andrii.nakryiko@gmail.com> wrote: > > > > > > On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote: > > > > > > > > From: Lorenz Bauer <lmb@cloudflare.com> > > > > > > > > Attach a tc direct-action classifier to lo in a fresh network > > > > namespace, and rewrite all connection attempts to localhost:4321 > > > > to localhost:1234 (for port tests) and connections to unreachable > > > > IPv4/IPv6 IPs to the local socket (for address tests). > > > > > > > > Keep in mind that both client to server and server to client traffic > > > > passes the classifier. > > > > > > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com> > > > > Co-authored-by: Joe Stringer <joe@wand.net.nz> > > > > Signed-off-by: Joe Stringer <joe@wand.net.nz> > > > > --- > > > > <snip> > > > > > > +static void handle_timeout(int signum) > > > > +{ > > > > + if (signum == SIGALRM) > > > > + fprintf(stderr, "Timed out while connecting to server\n"); > > > > + kill(0, SIGKILL); > > > > +} > > > > + > > > > +static struct sigaction timeout_action = { > > > > + .sa_handler = handle_timeout, > > > > +}; > > > > + > > > > +static int connect_to_server(const struct sockaddr *addr, socklen_t len) > > > > +{ > > > > + int fd = -1; > > > > + > > > > + fd = socket(addr->sa_family, SOCK_STREAM, 0); > > > > + if (CHECK_FAIL(fd == -1)) > > > > + goto out; > > > > + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) > > > > + goto out; > > > > > > no-no-no, we are not doing this. It's part of prog_tests and shouldn't > > > install its own signal handlers and sending asynchronous signals to > > > itself. Please find another way to have a timeout. > > > > I realise it didn't clean up after itself. How about signal(SIGALRM, > > SIG_DFL); just like other existing tests do? > > You have alarm(3), which will deliver SIGALARM later, when other test > is running. Once you clean up this custom signal handler it will cause > test_progs to coredump. So still no, please find another way to do > timeouts. FWIW I hit this issue and alarm(0) afterwards fixed it up. That said the SO_SNDTIMEO / SO_RCVTIMEO alternative should work fine for this use case instead so I'll switch to that, it's marginally cleaner.
On 3/26/20 2:07 PM, Alexei Starovoitov wrote: > On Thu, Mar 26, 2020 at 10:13:31AM +0000, Lorenz Bauer wrote: >>>> + >>>> + if (ipv4) { >>>> + if (tuple->ipv4.dport != bpf_htons(4321)) >>>> + return TC_ACT_OK; >>>> + >>>> + ln.ipv4.daddr = bpf_htonl(0x7f000001); >>>> + ln.ipv4.dport = bpf_htons(1234); >>>> + >>>> + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), >>>> + BPF_F_CURRENT_NETNS, 0); >>>> + } else { >>>> + if (tuple->ipv6.dport != bpf_htons(4321)) >>>> + return TC_ACT_OK; >>>> + >>>> + /* Upper parts of daddr are already zero. */ >>>> + ln.ipv6.daddr[3] = bpf_htonl(0x1); >>>> + ln.ipv6.dport = bpf_htons(1234); >>>> + >>>> + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), >>>> + BPF_F_CURRENT_NETNS, 0); >>>> + } >>>> + >>>> + /* We can't do a single skc_lookup_tcp here, because then the compiler >>>> + * will likely spill tuple_len to the stack. This makes it lose all >>>> + * bounds information in the verifier, which then rejects the call as >>>> + * unsafe. >>>> + */ >>> >>> This is a known issue. For scalars, only constant is restored properly >>> in verifier at this moment. I did some hacking before to enable any >>> scalars. The fear is this will make pruning performs worse. More >>> study is needed here. >> >> Of topic, but: this is actually one of the most challenging issues for >> us when writing >> BPF. It forces us to have very deep call graphs to hopefully avoid clang >> spilling the constants. Please let me know if I can help in any way. > > Thanks for bringing this up. > Yonghong, please correct me if I'm wrong. Yes. The summary below is correct. For reference, the below bcc issue documents some of my investigation: https://github.com/iovisor/bcc/issues/2463 > I think you've experimented with tracking spilled constants. The first issue > came with spilling of 4 byte constant. The verifier tracks 8 byte slots and > lots of places assume that slot granularity. It's not clear yet how to refactor > the verifier. Ideas, help are greatly appreciated. I cannot remember exactly what I did then. Probably remember the spilled size too. Since the hack is never peer reviewed, maybe my approach has bugs. > The second concern was pruning, but iirc the experiments were inconclusive. > selftests/bpf only has old fb progs. Hence, I think, the step zero is for > everyone to contribute their bpf programs written in C. If we have both > cilium and cloudflare progs as selftests it will help a lot to guide such long > lasting verifier decisions. Yes, this is inconclusive and I did not do any active investigation here since just enhancing the non-const spill won't resolve the above issue. But totally agree that if we had an implementation, we should measure its impact on verifier speed.
On Wed, Mar 25, 2020 at 11:38 PM Joe Stringer <joe@wand.net.nz> wrote: > > On Wed, Mar 25, 2020 at 11:25 PM Martin KaFai Lau <kafai@fb.com> wrote: > > > > On Wed, Mar 25, 2020 at 01:55:59PM -0700, Joe Stringer wrote: > > > On Wed, Mar 25, 2020 at 3:35 AM Lorenz Bauer <lmb@cloudflare.com> wrote: > > > > > > > > On Wed, 25 Mar 2020 at 05:58, Joe Stringer <joe@wand.net.nz> wrote: > > > > > > > > > > From: Lorenz Bauer <lmb@cloudflare.com> > > > > > > > > > > Attach a tc direct-action classifier to lo in a fresh network > > > > > namespace, and rewrite all connection attempts to localhost:4321 > > > > > to localhost:1234 (for port tests) and connections to unreachable > > > > > IPv4/IPv6 IPs to the local socket (for address tests). > > > > > > > > Can you extend this to cover UDP as well? > > > > > > I'm working on a follow-up series for UDP, we need this too. > > Other than selftests, what are the changes for UDP in patch 1 - 4? > > Nothing in those patches, I have refactoring of all of the socket > helpers, skc_lookup_udp() and adding flags to the socket lookup > functions to support only looking for a certain type of sockets - > established or listen. This helps to avoid multiple lookups in these > cases where you really just want to look up established sockets with > the packet tuple first then look up the listener socket with the > unrelated/tproxy tuple. For UDP it makes it easier to find the correct > socket and in general (including TCP) helps to avoid up to two socket > hashtable lookups for this use case. This part is because the current > helpers all look up the established socket first then the listener > socket, so for the first packet that hits these we perform both of > these lookups for the packet tuple (which finds nothing), then look up > an established socket for the target tuple (which finds nothing) then > finally a listen socket for the target tuple. It's about another 300+ > / 250- changes overall, of which a large chunk is one patch that > refactors the code into macros. I haven't narrowed down for sure > whether the lookup flags patch is required for UDP cases yet. FWIW I did some more testing and it was not apparent that skc_lookup_udp is at all necessary, I was able to roll in UDP support in the next revision of this series with no special extra patches. I'll keep working on those other optimizations in the background though.
On Thu, 26 Mar 2020 at 21:07, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > On Thu, Mar 26, 2020 at 10:13:31AM +0000, Lorenz Bauer wrote: > > > > + > > > > + if (ipv4) { > > > > + if (tuple->ipv4.dport != bpf_htons(4321)) > > > > + return TC_ACT_OK; > > > > + > > > > + ln.ipv4.daddr = bpf_htonl(0x7f000001); > > > > + ln.ipv4.dport = bpf_htons(1234); > > > > + > > > > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), > > > > + BPF_F_CURRENT_NETNS, 0); > > > > + } else { > > > > + if (tuple->ipv6.dport != bpf_htons(4321)) > > > > + return TC_ACT_OK; > > > > + > > > > + /* Upper parts of daddr are already zero. */ > > > > + ln.ipv6.daddr[3] = bpf_htonl(0x1); > > > > + ln.ipv6.dport = bpf_htons(1234); > > > > + > > > > + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), > > > > + BPF_F_CURRENT_NETNS, 0); > > > > + } > > > > + > > > > + /* We can't do a single skc_lookup_tcp here, because then the compiler > > > > + * will likely spill tuple_len to the stack. This makes it lose all > > > > + * bounds information in the verifier, which then rejects the call as > > > > + * unsafe. > > > > + */ > > > > > > This is a known issue. For scalars, only constant is restored properly > > > in verifier at this moment. I did some hacking before to enable any > > > scalars. The fear is this will make pruning performs worse. More > > > study is needed here. > > > > Of topic, but: this is actually one of the most challenging issues for > > us when writing > > BPF. It forces us to have very deep call graphs to hopefully avoid clang > > spilling the constants. Please let me know if I can help in any way. > > Thanks for bringing this up. > Yonghong, please correct me if I'm wrong. > I think you've experimented with tracking spilled constants. The first issue > came with spilling of 4 byte constant. The verifier tracks 8 byte slots and > lots of places assume that slot granularity. It's not clear yet how to refactor > the verifier. Ideas, help are greatly appreciated. > The second concern was pruning, but iirc the experiments were inconclusive. > selftests/bpf only has old fb progs. Hence, I think, the step zero is for > everyone to contribute their bpf programs written in C. If we have both > cilium and cloudflare progs as selftests it will help a lot to guide such long > lasting verifier decisions. Ok, I'll try to get something sorted out. We have a TC classifier that would be suitable, and I've been meaning to get it open sourced. Does the integration into the test suite have to involve running packets through it, or is compile and load enough?
On Fri, Mar 27, 2020 at 3:03 AM Lorenz Bauer <lmb@cloudflare.com> wrote: > > > > Thanks for bringing this up. > > Yonghong, please correct me if I'm wrong. > > I think you've experimented with tracking spilled constants. The first issue > > came with spilling of 4 byte constant. The verifier tracks 8 byte slots and > > lots of places assume that slot granularity. It's not clear yet how to refactor > > the verifier. Ideas, help are greatly appreciated. > > The second concern was pruning, but iirc the experiments were inconclusive. > > selftests/bpf only has old fb progs. Hence, I think, the step zero is for > > everyone to contribute their bpf programs written in C. If we have both > > cilium and cloudflare progs as selftests it will help a lot to guide such long > > lasting verifier decisions. > > Ok, I'll try to get something sorted out. We have a TC classifier that > would be suitable, > and I've been meaning to get it open sourced. Does the integration into the > test suite have to involve running packets through it, or is compile > and load enough? It would be great if you can add it as part of test_progs and run it with one or two packets via prog_test_run like all the tests do. Thanks!
On Thu, Mar 26, 2020 at 2:07 PM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > The second concern was pruning, but iirc the experiments were inconclusive. > selftests/bpf only has old fb progs. Hence, I think, the step zero is for > everyone to contribute their bpf programs written in C. If we have both > cilium and cloudflare progs as selftests it will help a lot to guide such long > lasting verifier decisions. How would you like to handle program changes over time for this? In Cilium community we periodically rebuild bpf-next VM images for testing, and run every pull request against those images[0]. We also test against specific older kernels, currently 4.9 and 4.19. This allows us to get some sense for the impact of upstream changes while developing Cilium features, but unfortunately doesn't allow everyone using kernel selftests to get that feedback at least from the kernel tree. We also have a verifier complexity test script where we compile with the maximum number of features (to ideally generate the most complex programs possible) then attempt to load all of the various programs, and output the complexity count that the kernel reports[1,2] which we can track over time. However Cilium BPF programs are actively developing and even if we merge these programs into the kernel tree, they will get out-of-date quickly. Up until recently everything was verifying fine compiling with LLVM7 and loading into bpf-next. Over the past month we started noticing new issues not with the existing implementation, but in *new* BPF features. As we increased complexity, our CI started failing against bpf-next[3] while they loaded fine on older kernels. We ended up mitigating by upgrading to LLVM-10. Long story short, there's several moving parts; changing BPF program implementations, changing the compiler toolchain, changing the kernel verifier. So my question is basically, where's the line of responsibility for what the kernel selftests are responsible for vs integration tests? How do we maintain those over time as the BPF programs and compiler changes? Do we just parachute the ~11K LoC of Cilium datapath into the kernel tree once per cycle? Or should Cilium autobuild a verifier-test docker image that kernel testing scripts can pull & run? Or would it be helpful to have a separate GitHub project similar to libbpf that pulls out kernel selftests, Cilium progs, fb progs, cloudflare progs, etc automatically and centralizes a generic suite of BPF verifier integration tests? Some other option? [0] https://github.com/cilium/packer-ci-build [1] https://github.com/cilium/cilium/blob/master/test/bpf/check-complexity.sh [2] https://github.com/cilium/cilium/blob/master/test/bpf/verifier-test.sh [3] https://github.com/cilium/cilium/issues/10517
On 3/27/20 8:06 PM, Joe Stringer wrote: > On Thu, Mar 26, 2020 at 2:07 PM Alexei Starovoitov > <alexei.starovoitov@gmail.com> wrote: >> The second concern was pruning, but iirc the experiments were inconclusive. >> selftests/bpf only has old fb progs. Hence, I think, the step zero is for >> everyone to contribute their bpf programs written in C. If we have both >> cilium and cloudflare progs as selftests it will help a lot to guide such long >> lasting verifier decisions. > > How would you like to handle program changes over time for this? > > In Cilium community we periodically rebuild bpf-next VM images for > testing, and run every pull request against those images[0]. We also > test against specific older kernels, currently 4.9 and 4.19. This > allows us to get some sense for the impact of upstream changes while > developing Cilium features, but unfortunately doesn't allow everyone > using kernel selftests to get that feedback at least from the kernel > tree. We also have a verifier complexity test script where we compile > with the maximum number of features (to ideally generate the most > complex programs possible) then attempt to load all of the various > programs, and output the complexity count that the kernel reports[1,2] > which we can track over time. > > However Cilium BPF programs are actively developing and even if we > merge these programs into the kernel tree, they will get out-of-date > quickly. Up until recently everything was verifying fine compiling > with LLVM7 and loading into bpf-next. Over the past month we started > noticing new issues not with the existing implementation, but in *new* > BPF features. As we increased complexity, our CI started failing > against bpf-next[3] while they loaded fine on older kernels. We ended > up mitigating by upgrading to LLVM-10. Long story short, there's > several moving parts; changing BPF program implementations, changing > the compiler toolchain, changing the kernel verifier. So my question > is basically, where's the line of responsibility for what the kernel > selftests are responsible for vs integration tests? How do we maintain > those over time as the BPF programs and compiler changes? I wonder whether it would make sense to create test cases which are based on our cilium/cilium:latest docker image. Those would be run as part of BPF kernel selftests as well as part of our own CI for every PR. I think it could be some basic connectivity test, service mapping, etc with a bunch of application containers. One nice thing that just came to mind is that it would actually allow for easy testing of latest clang/llvm git by rerunning the test script and remapping them as a volume, e.g.: docker run -it -v /root/clang+llvm-7.0.1-x86_64-linux-gnu-ubuntu-18.04/bin/clang:/bin/clang \ -v /root/clang+llvm-7.0.1-x86_64-linux-gnu-ubuntu-18.04/bin/llc:/bin/llc \ cilium/cilium:latest /bin/sh Perhaps that would be more useful and always up to date than a copy of the code base that would get stale next day? In the end in this context kernel changes and/or llvm changes might be of interest to check whether anything potentially blows up, so having a self-contained packaging might be useful. Thoughts? > Do we just parachute the ~11K LoC of Cilium datapath into the kernel > tree once per cycle? Or should Cilium autobuild a verifier-test docker > image that kernel testing scripts can pull & run? Or would it be > helpful to have a separate GitHub project similar to libbpf that pulls > out kernel selftests, Cilium progs, fb progs, cloudflare progs, etc > automatically and centralizes a generic suite of BPF verifier > integration tests? Some other option? > > [0] https://github.com/cilium/packer-ci-build > [1] https://github.com/cilium/cilium/blob/master/test/bpf/check-complexity.sh > [2] https://github.com/cilium/cilium/blob/master/test/bpf/verifier-test.sh > [3] https://github.com/cilium/cilium/issues/10517 >
On Fri, Mar 27, 2020 at 09:16:52PM +0100, Daniel Borkmann wrote: > > Perhaps that would be more useful and always up to date than a copy of the > code base that would get stale next day? In the end in this context kernel > changes and/or llvm changes might be of interest to check whether anything > potentially blows up, so having a self-contained packaging might be useful. > Thoughts? Right now we have zero cilium progs in selftest :) so any number of progs is better than nothing. > > Do we just parachute the ~11K LoC of Cilium datapath into the kernel > > tree once per cycle? I don't think 11k progs updated every kernel release will catch much more than basic reduced set. selftests/bpf is not a substitute for cilium CI. I would prefer .c tests to be developed once and frozen. More tests can be added every 6 month or so. I think copy-paste is ok-ish too, but would be much better to think through about aspects of the code first. It worked well for fb xdp/tc progs. I took some of them, refactored them to selftest/Makefile and they were kept as-is for the last 3 years. While real progs kept being actively changed. For example: progs/test_l4lb.c is about 1/10th of the real deal here: https://github.com/facebookincubator/katran/tree/master/katran/lib/bpf In terms of lines code, number of includes and so on. While hacking katran into selftest I tried to capture the _style_ of C code. Like: bool get_packet_dst(struct real_definition **real, struct packet_description *pckt, struct vip_meta *vip_info, bool is_ipv6) I wouldn't have written the prototype this way. Passing double pointer as a return value as a first argument is not my style :) The style of nested 'if', etc. Those are the things that make clang generate specific code patterns that I was trying to preserve in selftests. Example 2: progs/strobemeta.h is about 1/4 of real thing. Yet frozen it was super useful for the work on bounded loops. Example 3: progs/test_get_stack_rawtp.c is one specific code pattern that used in our profiling prog. This one was immensely helpful to track down jmp32/alu32 bugs. It's the one that we're still fixing (see John's jmp32 work). and so on. selftests/bpf/progs don't need to be full copy-paste. Ideally the capture the essence of the progs in the short form. clang bpf backend and verifier smartness keep changing. So having frozen selftests is necessary to see the trend and capture bugs in the backend and in the verifier.
On Fri, Mar 27, 2020 at 12:07 PM Joe Stringer <joe@wand.net.nz> wrote: > > On Thu, Mar 26, 2020 at 2:07 PM Alexei Starovoitov > <alexei.starovoitov@gmail.com> wrote: > > The second concern was pruning, but iirc the experiments were inconclusive. > > selftests/bpf only has old fb progs. Hence, I think, the step zero is for > > everyone to contribute their bpf programs written in C. If we have both > > cilium and cloudflare progs as selftests it will help a lot to guide such long > > lasting verifier decisions. > > How would you like to handle program changes over time for this? > > In Cilium community we periodically rebuild bpf-next VM images for > testing, and run every pull request against those images[0]. We also > test against specific older kernels, currently 4.9 and 4.19. This > allows us to get some sense for the impact of upstream changes while > developing Cilium features, but unfortunately doesn't allow everyone > using kernel selftests to get that feedback at least from the kernel > tree. We also have a verifier complexity test script where we compile > with the maximum number of features (to ideally generate the most > complex programs possible) then attempt to load all of the various > programs, and output the complexity count that the kernel reports[1,2] > which we can track over time. > > However Cilium BPF programs are actively developing and even if we > merge these programs into the kernel tree, they will get out-of-date > quickly. Up until recently everything was verifying fine compiling > with LLVM7 and loading into bpf-next. Over the past month we started > noticing new issues not with the existing implementation, but in *new* > BPF features. As we increased complexity, our CI started failing > against bpf-next[3] while they loaded fine on older kernels. We ended > up mitigating by upgrading to LLVM-10. Long story short, there's > several moving parts; changing BPF program implementations, changing > the compiler toolchain, changing the kernel verifier. So my question > is basically, where's the line of responsibility for what the kernel > selftests are responsible for vs integration tests? How do we maintain > those over time as the BPF programs and compiler changes? Just wanted to point out that libbpf's Github CI has multi-kernel testing, so we'll be able to capture regressions on old kernels that are caused by libbpf and/or nightly clang (we are currently pulling clang-11 from nightly packages). We are also testing against latest kernel as well, so if they break, we'll need to fix them. Which is why I'd like those programs to be manageable in size and complexity and a simple part of test_progs, not in some Docker container :) > > Do we just parachute the ~11K LoC of Cilium datapath into the kernel > tree once per cycle? Or should Cilium autobuild a verifier-test docker > image that kernel testing scripts can pull & run? Or would it be > helpful to have a separate GitHub project similar to libbpf that pulls > out kernel selftests, Cilium progs, fb progs, cloudflare progs, etc > automatically and centralizes a generic suite of BPF verifier > integration tests? Some other option? > > [0] https://github.com/cilium/packer-ci-build > [1] https://github.com/cilium/cilium/blob/master/test/bpf/check-complexity.sh > [2] https://github.com/cilium/cilium/blob/master/test/bpf/verifier-test.sh > [3] https://github.com/cilium/cilium/issues/10517
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7729892e0b04..4f7f83d059ca 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \ # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ - test_lirc_mode2_user xdping test_cpp runqslower + test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign TEST_CUSTOM_PROGS = urandom_read diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c new file mode 100644 index 000000000000..1f0afcc20c48 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook +// Copyright (c) 2019 Cloudflare +// Copyright (c) 2020 Isovalent, Inc. +/* + * Test that the socket assign program is able to redirect traffic towards a + * socket, regardless of whether the port or address destination of the traffic + * matches the port. + */ + +#define _GNU_SOURCE +#include <fcntl.h> +#include <signal.h> +#include <stdlib.h> +#include <unistd.h> + +#include "test_progs.h" + +#define TEST_DPORT 4321 +#define TEST_DADDR (0xC0A80203) +#define NS_SELF "/proc/self/ns/net" + +static __u32 duration; + +static bool configure_stack(int self_net) +{ + /* Move to a new networking namespace */ + if (CHECK_FAIL(unshare(CLONE_NEWNET))) + return false; + + /* Configure necessary links, routes */ + if (CHECK_FAIL(system("ip link set dev lo up"))) + return false; + if (CHECK_FAIL(system("ip route add local default dev lo"))) + return false; + if (CHECK_FAIL(system("ip -6 route add local default dev lo"))) + return false; + + /* Load qdisc, BPF program */ + if (CHECK_FAIL(system("tc qdisc add dev lo clsact"))) + return false; + if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action " + "object-file ./test_sk_assign.o section sk_assign_test"))) + return false; + + return true; +} + +static int start_server(const struct sockaddr *addr, socklen_t len) +{ + int fd; + + fd = socket(addr->sa_family, SOCK_STREAM, 0); + if (CHECK_FAIL(fd == -1)) + goto out; + if (CHECK_FAIL(bind(fd, addr, len) == -1)) + goto close_out; + if (CHECK_FAIL(listen(fd, 128) == -1)) + goto close_out; + + goto out; + +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static void handle_timeout(int signum) +{ + if (signum == SIGALRM) + fprintf(stderr, "Timed out while connecting to server\n"); + kill(0, SIGKILL); +} + +static struct sigaction timeout_action = { + .sa_handler = handle_timeout, +}; + +static int connect_to_server(const struct sockaddr *addr, socklen_t len) +{ + int fd = -1; + + fd = socket(addr->sa_family, SOCK_STREAM, 0); + if (CHECK_FAIL(fd == -1)) + goto out; + if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL))) + goto out; + alarm(3); + if (CHECK_FAIL(connect(fd, addr, len) == -1)) + goto close_out; + + goto out; + +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static in_port_t get_port(int fd) +{ + struct sockaddr_storage name; + socklen_t len; + in_port_t port = 0; + + len = sizeof(name); + if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len))) + return port; + + switch (name.ss_family) { + case AF_INET: + port = ((struct sockaddr_in *)&name)->sin_port; + break; + case AF_INET6: + port = ((struct sockaddr_in6 *)&name)->sin6_port; + break; + default: + CHECK(1, "Invalid address family", "%d\n", name.ss_family); + } + return port; +} + +static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len) +{ + int client = -1, srv_client = -1; + char buf[] = "testing"; + in_port_t port; + int ret = 1; + + client = connect_to_server(addr, len); + if (client == -1) { + perror("Cannot connect to server"); + goto out; + } + + srv_client = accept(server_fd, NULL, NULL); + if (CHECK_FAIL(srv_client == -1)) { + perror("Can't accept connection"); + goto out; + } + if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) { + perror("Can't write on client"); + goto out; + } + if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) { + perror("Can't read on server"); + goto out; + } + + port = get_port(srv_client); + if (CHECK_FAIL(!port)) + goto out; + if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u", + TEST_DPORT, ntohs(port))) + goto out; + + ret = 0; +out: + close(client); + close(srv_client); + return ret; +} + +static int do_sk_assign(void) +{ + struct sockaddr_in addr4; + struct sockaddr_in6 addr6; + int server = -1; + int server_v6 = -1; + int err = 1; + + memset(&addr4, 0, sizeof(addr4)); + addr4.sin_family = AF_INET; + addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + addr4.sin_port = htons(1234); + + memset(&addr6, 0, sizeof(addr6)); + addr6.sin6_family = AF_INET6; + addr6.sin6_addr = in6addr_loopback; + addr6.sin6_port = htons(1234); + + server = start_server((const struct sockaddr *)&addr4, sizeof(addr4)); + if (server == -1) + goto out; + + server_v6 = start_server((const struct sockaddr *)&addr6, + sizeof(addr6)); + if (server_v6 == -1) + goto out; + + /* Connect to unbound ports */ + addr4.sin_port = htons(TEST_DPORT); + addr6.sin6_port = htons(TEST_DPORT); + + test__start_subtest("ipv4 port redir"); + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) + goto out; + + test__start_subtest("ipv6 port redir"); + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) + goto out; + + /* Connect to unbound addresses */ + addr4.sin_addr.s_addr = htonl(TEST_DADDR); + addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR); + + test__start_subtest("ipv4 addr redir"); + if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4))) + goto out; + + test__start_subtest("ipv6 addr redir"); + if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6))) + goto out; + + err = 0; +out: + close(server); + close(server_v6); + return err; +} + +void test_sk_assign(void) +{ + int self_net; + + self_net = open(NS_SELF, O_RDONLY); + if (CHECK_FAIL(self_net < 0)) { + perror("Unable to open "NS_SELF); + return; + } + + if (!configure_stack(self_net)) { + perror("configure_stack"); + goto cleanup; + } + + do_sk_assign(); + +cleanup: + close(self_net); +} diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c new file mode 100644 index 000000000000..7de30ad3f594 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2019 Cloudflare Ltd. + +#include <stddef.h> +#include <stdbool.h> +#include <string.h> +#include <linux/bpf.h> +#include <linux/if_ether.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/pkt_cls.h> +#include <linux/tcp.h> +#include <sys/socket.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> + +int _version SEC("version") = 1; +char _license[] SEC("license") = "GPL"; + +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */ +static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off, + void *data_end, __u16 eth_proto, + bool *ipv4) +{ + struct bpf_sock_tuple *result; + __u8 proto = 0; + __u64 ihl_len; + + if (eth_proto == bpf_htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(data + nh_off); + + if (iph + 1 > data_end) + return NULL; + if (iph->ihl != 5) + /* Options are not supported */ + return NULL; + ihl_len = iph->ihl * 4; + proto = iph->protocol; + *ipv4 = true; + result = (struct bpf_sock_tuple *)&iph->saddr; + } else if (eth_proto == bpf_htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off); + + if (ip6h + 1 > data_end) + return NULL; + ihl_len = sizeof(*ip6h); + proto = ip6h->nexthdr; + *ipv4 = false; + result = (struct bpf_sock_tuple *)&ip6h->saddr; + } else { + return NULL; + } + + if (result + 1 > data_end || proto != IPPROTO_TCP) + return NULL; + + return result; +} + +SEC("sk_assign_test") +int bpf_sk_assign_test(struct __sk_buff *skb) +{ + void *data_end = (void *)(long)skb->data_end; + void *data = (void *)(long)skb->data; + struct ethhdr *eth = (struct ethhdr *)(data); + struct bpf_sock_tuple *tuple, ln = {0}; + struct bpf_sock *sk; + int tuple_len; + bool ipv4; + int ret; + + if (eth + 1 > data_end) + return TC_ACT_SHOT; + + tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4); + if (!tuple) + return TC_ACT_SHOT; + + tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6); + sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0); + if (sk) { + if (sk->state != BPF_TCP_LISTEN) + goto assign; + + bpf_sk_release(sk); + } + + if (ipv4) { + if (tuple->ipv4.dport != bpf_htons(4321)) + return TC_ACT_OK; + + ln.ipv4.daddr = bpf_htonl(0x7f000001); + ln.ipv4.dport = bpf_htons(1234); + + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4), + BPF_F_CURRENT_NETNS, 0); + } else { + if (tuple->ipv6.dport != bpf_htons(4321)) + return TC_ACT_OK; + + /* Upper parts of daddr are already zero. */ + ln.ipv6.daddr[3] = bpf_htonl(0x1); + ln.ipv6.dport = bpf_htons(1234); + + sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6), + BPF_F_CURRENT_NETNS, 0); + } + + /* We can't do a single skc_lookup_tcp here, because then the compiler + * will likely spill tuple_len to the stack. This makes it lose all + * bounds information in the verifier, which then rejects the call as + * unsafe. + */ + if (!sk) + return TC_ACT_SHOT; + + if (sk->state != BPF_TCP_LISTEN) { + bpf_sk_release(sk); + return TC_ACT_SHOT; + } + +assign: + ret = bpf_sk_assign(skb, sk, 0); + bpf_sk_release(sk); + return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT; +}