diff mbox series

[PATCHv2,bpf-next,5/5] selftests: bpf: add test for sk_assign

Message ID 20200325055745.10710-6-joe@wand.net.nz
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series Add bpf_sk_assign eBPF helper | expand

Commit Message

Joe Stringer March 25, 2020, 5:57 a.m. UTC
From: Lorenz Bauer <lmb@cloudflare.com>

Attach a tc direct-action classifier to lo in a fresh network
namespace, and rewrite all connection attempts to localhost:4321
to localhost:1234 (for port tests) and connections to unreachable
IPv4/IPv6 IPs to the local socket (for address tests).

Keep in mind that both client to server and server to client traffic
passes the classifier.

Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Co-authored-by: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Joe Stringer <joe@wand.net.nz>
---
v2: Rebase onto test_progs infrastructure
v1: Initial commit
---
 tools/testing/selftests/bpf/Makefile          |   2 +-
 .../selftests/bpf/prog_tests/sk_assign.c      | 244 ++++++++++++++++++
 .../selftests/bpf/progs/test_sk_assign.c      | 127 +++++++++
 3 files changed, 372 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c

Comments

Lorenz Bauer March 25, 2020, 10:35 a.m. UTC | #1
On Wed, 25 Mar 2020 at 05:58, Joe Stringer <joe@wand.net.nz> wrote:
>
> From: Lorenz Bauer <lmb@cloudflare.com>
>
> Attach a tc direct-action classifier to lo in a fresh network
> namespace, and rewrite all connection attempts to localhost:4321
> to localhost:1234 (for port tests) and connections to unreachable
> IPv4/IPv6 IPs to the local socket (for address tests).

Can you extend this to cover UDP as well?

>
> Keep in mind that both client to server and server to client traffic
> passes the classifier.
>
> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> Co-authored-by: Joe Stringer <joe@wand.net.nz>
> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> ---
> v2: Rebase onto test_progs infrastructure
> v1: Initial commit
> ---
>  tools/testing/selftests/bpf/Makefile          |   2 +-
>  .../selftests/bpf/prog_tests/sk_assign.c      | 244 ++++++++++++++++++
>  .../selftests/bpf/progs/test_sk_assign.c      | 127 +++++++++
>  3 files changed, 372 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c
>  create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c
>
> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> index 7729892e0b04..4f7f83d059ca 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
>  # Compile but not part of 'make run_tests'
>  TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
>         flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
> -       test_lirc_mode2_user xdping test_cpp runqslower
> +       test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign
>
>  TEST_CUSTOM_PROGS = urandom_read
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> new file mode 100644
> index 000000000000..1f0afcc20c48
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> @@ -0,0 +1,244 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2018 Facebook
> +// Copyright (c) 2019 Cloudflare
> +// Copyright (c) 2020 Isovalent, Inc.
> +/*
> + * Test that the socket assign program is able to redirect traffic towards a
> + * socket, regardless of whether the port or address destination of the traffic
> + * matches the port.
> + */
> +
> +#define _GNU_SOURCE
> +#include <fcntl.h>
> +#include <signal.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +
> +#include "test_progs.h"
> +
> +#define TEST_DPORT 4321
> +#define TEST_DADDR (0xC0A80203)
> +#define NS_SELF "/proc/self/ns/net"
> +
> +static __u32 duration;
> +
> +static bool configure_stack(int self_net)
> +{
> +       /* Move to a new networking namespace */
> +       if (CHECK_FAIL(unshare(CLONE_NEWNET)))
> +               return false;
> +
> +       /* Configure necessary links, routes */
> +       if (CHECK_FAIL(system("ip link set dev lo up")))
> +               return false;
> +       if (CHECK_FAIL(system("ip route add local default dev lo")))
> +               return false;
> +       if (CHECK_FAIL(system("ip -6 route add local default dev lo")))
> +               return false;
> +
> +       /* Load qdisc, BPF program */
> +       if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
> +               return false;
> +       if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action "
> +                    "object-file ./test_sk_assign.o section sk_assign_test")))
> +               return false;
> +
> +       return true;
> +}
> +
> +static int start_server(const struct sockaddr *addr, socklen_t len)
> +{
> +       int fd;
> +
> +       fd = socket(addr->sa_family, SOCK_STREAM, 0);
> +       if (CHECK_FAIL(fd == -1))
> +               goto out;
> +       if (CHECK_FAIL(bind(fd, addr, len) == -1))
> +               goto close_out;
> +       if (CHECK_FAIL(listen(fd, 128) == -1))
> +               goto close_out;
> +
> +       goto out;
> +
> +close_out:
> +       close(fd);
> +       fd = -1;
> +out:
> +       return fd;
> +}
> +
> +static void handle_timeout(int signum)
> +{
> +       if (signum == SIGALRM)
> +               fprintf(stderr, "Timed out while connecting to server\n");
> +       kill(0, SIGKILL);
> +}
> +
> +static struct sigaction timeout_action = {
> +       .sa_handler = handle_timeout,
> +};
> +
> +static int connect_to_server(const struct sockaddr *addr, socklen_t len)
> +{
> +       int fd = -1;
> +
> +       fd = socket(addr->sa_family, SOCK_STREAM, 0);
> +       if (CHECK_FAIL(fd == -1))
> +               goto out;
> +       if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
> +               goto out;
> +       alarm(3);
> +       if (CHECK_FAIL(connect(fd, addr, len) == -1))
> +               goto close_out;
> +
> +       goto out;
> +
> +close_out:
> +       close(fd);
> +       fd = -1;
> +out:
> +       return fd;
> +}
> +
> +static in_port_t get_port(int fd)
> +{
> +       struct sockaddr_storage name;
> +       socklen_t len;
> +       in_port_t port = 0;
> +
> +       len = sizeof(name);
> +       if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len)))
> +               return port;
> +
> +       switch (name.ss_family) {
> +       case AF_INET:
> +               port = ((struct sockaddr_in *)&name)->sin_port;
> +               break;
> +       case AF_INET6:
> +               port = ((struct sockaddr_in6 *)&name)->sin6_port;
> +               break;
> +       default:
> +               CHECK(1, "Invalid address family", "%d\n", name.ss_family);
> +       }
> +       return port;
> +}
> +
> +static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len)
> +{
> +       int client = -1, srv_client = -1;
> +       char buf[] = "testing";
> +       in_port_t port;
> +       int ret = 1;
> +
> +       client = connect_to_server(addr, len);
> +       if (client == -1) {
> +               perror("Cannot connect to server");
> +               goto out;
> +       }
> +
> +       srv_client = accept(server_fd, NULL, NULL);
> +       if (CHECK_FAIL(srv_client == -1)) {
> +               perror("Can't accept connection");
> +               goto out;
> +       }
> +       if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) {
> +               perror("Can't write on client");
> +               goto out;
> +       }
> +       if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) {
> +               perror("Can't read on server");
> +               goto out;
> +       }
> +
> +       port = get_port(srv_client);
> +       if (CHECK_FAIL(!port))
> +               goto out;
> +       if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u",
> +                 TEST_DPORT, ntohs(port)))
> +               goto out;
> +
> +       ret = 0;
> +out:
> +       close(client);
> +       close(srv_client);
> +       return ret;
> +}
> +
> +static int do_sk_assign(void)
> +{
> +       struct sockaddr_in addr4;
> +       struct sockaddr_in6 addr6;
> +       int server = -1;
> +       int server_v6 = -1;
> +       int err = 1;
> +
> +       memset(&addr4, 0, sizeof(addr4));
> +       addr4.sin_family = AF_INET;
> +       addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
> +       addr4.sin_port = htons(1234);
> +
> +       memset(&addr6, 0, sizeof(addr6));
> +       addr6.sin6_family = AF_INET6;
> +       addr6.sin6_addr = in6addr_loopback;
> +       addr6.sin6_port = htons(1234);
> +
> +       server = start_server((const struct sockaddr *)&addr4, sizeof(addr4));
> +       if (server == -1)
> +               goto out;
> +
> +       server_v6 = start_server((const struct sockaddr *)&addr6,
> +                                sizeof(addr6));
> +       if (server_v6 == -1)
> +               goto out;
> +
> +       /* Connect to unbound ports */
> +       addr4.sin_port = htons(TEST_DPORT);
> +       addr6.sin6_port = htons(TEST_DPORT);
> +
> +       test__start_subtest("ipv4 port redir");
> +       if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
> +               goto out;
> +
> +       test__start_subtest("ipv6 port redir");
> +       if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
> +               goto out;
> +
> +       /* Connect to unbound addresses */
> +       addr4.sin_addr.s_addr = htonl(TEST_DADDR);
> +       addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR);
> +
> +       test__start_subtest("ipv4 addr redir");
> +       if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
> +               goto out;
> +
> +       test__start_subtest("ipv6 addr redir");
> +       if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
> +               goto out;
> +
> +       err = 0;
> +out:
> +       close(server);
> +       close(server_v6);
> +       return err;
> +}
> +
> +void test_sk_assign(void)
> +{
> +       int self_net;
> +
> +       self_net = open(NS_SELF, O_RDONLY);
> +       if (CHECK_FAIL(self_net < 0)) {
> +               perror("Unable to open "NS_SELF);
> +               return;
> +       }
> +
> +       if (!configure_stack(self_net)) {
> +               perror("configure_stack");
> +               goto cleanup;
> +       }
> +
> +       do_sk_assign();
> +
> +cleanup:
> +       close(self_net);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c
> new file mode 100644
> index 000000000000..7de30ad3f594
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
> @@ -0,0 +1,127 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2019 Cloudflare Ltd.
> +
> +#include <stddef.h>
> +#include <stdbool.h>
> +#include <string.h>
> +#include <linux/bpf.h>
> +#include <linux/if_ether.h>
> +#include <linux/in.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <linux/pkt_cls.h>
> +#include <linux/tcp.h>
> +#include <sys/socket.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_endian.h>
> +
> +int _version SEC("version") = 1;
> +char _license[] SEC("license") = "GPL";
> +
> +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
> +static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off,
> +                                       void *data_end, __u16 eth_proto,
> +                                       bool *ipv4)
> +{
> +       struct bpf_sock_tuple *result;
> +       __u8 proto = 0;
> +       __u64 ihl_len;
> +
> +       if (eth_proto == bpf_htons(ETH_P_IP)) {
> +               struct iphdr *iph = (struct iphdr *)(data + nh_off);
> +
> +               if (iph + 1 > data_end)
> +                       return NULL;
> +               if (iph->ihl != 5)
> +                       /* Options are not supported */
> +                       return NULL;
> +               ihl_len = iph->ihl * 4;
> +               proto = iph->protocol;
> +               *ipv4 = true;
> +               result = (struct bpf_sock_tuple *)&iph->saddr;
> +       } else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
> +               struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
> +
> +               if (ip6h + 1 > data_end)
> +                       return NULL;
> +               ihl_len = sizeof(*ip6h);
> +               proto = ip6h->nexthdr;
> +               *ipv4 = false;
> +               result = (struct bpf_sock_tuple *)&ip6h->saddr;
> +       } else {
> +               return NULL;
> +       }
> +
> +       if (result + 1 > data_end || proto != IPPROTO_TCP)
> +               return NULL;
> +
> +       return result;
> +}
> +
> +SEC("sk_assign_test")
> +int bpf_sk_assign_test(struct __sk_buff *skb)
> +{
> +       void *data_end = (void *)(long)skb->data_end;
> +       void *data = (void *)(long)skb->data;
> +       struct ethhdr *eth = (struct ethhdr *)(data);
> +       struct bpf_sock_tuple *tuple, ln = {0};
> +       struct bpf_sock *sk;
> +       int tuple_len;
> +       bool ipv4;
> +       int ret;
> +
> +       if (eth + 1 > data_end)
> +               return TC_ACT_SHOT;
> +
> +       tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4);
> +       if (!tuple)
> +               return TC_ACT_SHOT;
> +
> +       tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
> +       sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
> +       if (sk) {
> +               if (sk->state != BPF_TCP_LISTEN)
> +                       goto assign;
> +
> +               bpf_sk_release(sk);
> +       }
> +
> +       if (ipv4) {
> +               if (tuple->ipv4.dport != bpf_htons(4321))
> +                       return TC_ACT_OK;
> +
> +               ln.ipv4.daddr = bpf_htonl(0x7f000001);
> +               ln.ipv4.dport = bpf_htons(1234);
> +
> +               sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
> +                                       BPF_F_CURRENT_NETNS, 0);
> +       } else {
> +               if (tuple->ipv6.dport != bpf_htons(4321))
> +                       return TC_ACT_OK;
> +
> +               /* Upper parts of daddr are already zero. */
> +               ln.ipv6.daddr[3] = bpf_htonl(0x1);
> +               ln.ipv6.dport = bpf_htons(1234);
> +
> +               sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
> +                                       BPF_F_CURRENT_NETNS, 0);
> +       }
> +
> +       /* We can't do a single skc_lookup_tcp here, because then the compiler
> +        * will likely spill tuple_len to the stack. This makes it lose all
> +        * bounds information in the verifier, which then rejects the call as
> +        * unsafe.
> +        */
> +       if (!sk)
> +               return TC_ACT_SHOT;
> +
> +       if (sk->state != BPF_TCP_LISTEN) {
> +               bpf_sk_release(sk);
> +               return TC_ACT_SHOT;
> +       }
> +
> +assign:
> +       ret = bpf_sk_assign(skb, sk, 0);
> +       bpf_sk_release(sk);
> +       return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT;
> +}
> --
> 2.20.1
>
Yonghong Song March 25, 2020, 6:17 p.m. UTC | #2
On 3/24/20 10:57 PM, Joe Stringer wrote:
> From: Lorenz Bauer <lmb@cloudflare.com>
> 
> Attach a tc direct-action classifier to lo in a fresh network
> namespace, and rewrite all connection attempts to localhost:4321
> to localhost:1234 (for port tests) and connections to unreachable
> IPv4/IPv6 IPs to the local socket (for address tests).
> 
> Keep in mind that both client to server and server to client traffic
> passes the classifier.
> 
> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> Co-authored-by: Joe Stringer <joe@wand.net.nz>
> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> ---
> v2: Rebase onto test_progs infrastructure
> v1: Initial commit
> ---
>   tools/testing/selftests/bpf/Makefile          |   2 +-
>   .../selftests/bpf/prog_tests/sk_assign.c      | 244 ++++++++++++++++++
>   .../selftests/bpf/progs/test_sk_assign.c      | 127 +++++++++
>   3 files changed, 372 insertions(+), 1 deletion(-)
>   create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c
>   create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c
> 
> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> index 7729892e0b04..4f7f83d059ca 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
>   # Compile but not part of 'make run_tests'
>   TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
>   	flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
> -	test_lirc_mode2_user xdping test_cpp runqslower
> +	test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign

No test_sk_assign any more as the test is integrated into test_progs, right?

>   
>   TEST_CUSTOM_PROGS = urandom_read
>   
> diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> new file mode 100644
> index 000000000000..1f0afcc20c48
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> @@ -0,0 +1,244 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2018 Facebook
> +// Copyright (c) 2019 Cloudflare
> +// Copyright (c) 2020 Isovalent, Inc.
> +/*
> + * Test that the socket assign program is able to redirect traffic towards a
> + * socket, regardless of whether the port or address destination of the traffic
> + * matches the port.
> + */
> +
> +#define _GNU_SOURCE
> +#include <fcntl.h>
> +#include <signal.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +
> +#include "test_progs.h"
> +
> +#define TEST_DPORT 4321
> +#define TEST_DADDR (0xC0A80203)
> +#define NS_SELF "/proc/self/ns/net"
> +
> +static __u32 duration;
> +
> +static bool configure_stack(int self_net)

self_net parameter is not used.

> +{
> +	/* Move to a new networking namespace */
> +	if (CHECK_FAIL(unshare(CLONE_NEWNET)))
> +		return false;

You can use CHECK to encode better error messages. Thhis is what
most test_progs tests are using.

> +	/* Configure necessary links, routes */
> +	if (CHECK_FAIL(system("ip link set dev lo up")))
> +		return false;
> +	if (CHECK_FAIL(system("ip route add local default dev lo")))
> +		return false;
> +	if (CHECK_FAIL(system("ip -6 route add local default dev lo")))
> +		return false;
> +
> +	/* Load qdisc, BPF program */
> +	if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
> +		return false;
> +	if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action "
> +		     "object-file ./test_sk_assign.o section sk_assign_test")))
> +		return false;
> +
> +	return true;
> +}
> +
> +static int start_server(const struct sockaddr *addr, socklen_t len)
> +{
> +	int fd;
> +
> +	fd = socket(addr->sa_family, SOCK_STREAM, 0);
> +	if (CHECK_FAIL(fd == -1))
> +		goto out;
> +	if (CHECK_FAIL(bind(fd, addr, len) == -1))
> +		goto close_out;
> +	if (CHECK_FAIL(listen(fd, 128) == -1))
> +		goto close_out;
> +
> +	goto out;
> +
> +close_out:
> +	close(fd);
> +	fd = -1;
> +out:
> +	return fd;
> +}
> +
> +static void handle_timeout(int signum)
> +{
> +	if (signum == SIGALRM)
> +		fprintf(stderr, "Timed out while connecting to server\n");
> +	kill(0, SIGKILL);
> +}
> +
> +static struct sigaction timeout_action = {
> +	.sa_handler = handle_timeout,
> +};
> +
> +static int connect_to_server(const struct sockaddr *addr, socklen_t len)
> +{
> +	int fd = -1;
> +
> +	fd = socket(addr->sa_family, SOCK_STREAM, 0);
> +	if (CHECK_FAIL(fd == -1))
> +		goto out;
> +	if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
> +		goto out;

should this goto close_out?

> +	alarm(3);
> +	if (CHECK_FAIL(connect(fd, addr, len) == -1))
> +		goto close_out;
> +
> +	goto out;
> +
> +close_out:
> +	close(fd);
> +	fd = -1;
> +out:
> +	return fd;
> +}
> +
> +static in_port_t get_port(int fd)
> +{
> +	struct sockaddr_storage name;
> +	socklen_t len;
> +	in_port_t port = 0;
> +
> +	len = sizeof(name);
> +	if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len)))
> +		return port;
> +
> +	switch (name.ss_family) {
> +	case AF_INET:
> +		port = ((struct sockaddr_in *)&name)->sin_port;
> +		break;
> +	case AF_INET6:
> +		port = ((struct sockaddr_in6 *)&name)->sin6_port;
> +		break;
> +	default:
> +		CHECK(1, "Invalid address family", "%d\n", name.ss_family);
> +	}
> +	return port;
> +}
> +
> +static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len)
> +{
> +	int client = -1, srv_client = -1;
> +	char buf[] = "testing";
> +	in_port_t port;
> +	int ret = 1;
> +
> +	client = connect_to_server(addr, len);
> +	if (client == -1) {
> +		perror("Cannot connect to server");
> +		goto out;
> +	}
> +
> +	srv_client = accept(server_fd, NULL, NULL);
> +	if (CHECK_FAIL(srv_client == -1)) {
> +		perror("Can't accept connection");
> +		goto out;
> +	}
> +	if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) {
> +		perror("Can't write on client");
> +		goto out;
> +	}
> +	if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) {
> +		perror("Can't read on server");
> +		goto out;
> +	}
> +
> +	port = get_port(srv_client);
> +	if (CHECK_FAIL(!port))
> +		goto out;
> +	if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u",
> +		  TEST_DPORT, ntohs(port)))
> +		goto out;
> +
> +	ret = 0;
> +out:
> +	close(client);
> +	close(srv_client);
> +	return ret;
> +}
> +
> +static int do_sk_assign(void)
> +{
> +	struct sockaddr_in addr4;
> +	struct sockaddr_in6 addr6;
> +	int server = -1;
> +	int server_v6 = -1;
> +	int err = 1;
> +
> +	memset(&addr4, 0, sizeof(addr4));
> +	addr4.sin_family = AF_INET;
> +	addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
> +	addr4.sin_port = htons(1234);
> +
> +	memset(&addr6, 0, sizeof(addr6));
> +	addr6.sin6_family = AF_INET6;
> +	addr6.sin6_addr = in6addr_loopback;
> +	addr6.sin6_port = htons(1234);
> +
> +	server = start_server((const struct sockaddr *)&addr4, sizeof(addr4));
> +	if (server == -1)
> +		goto out;
> +
> +	server_v6 = start_server((const struct sockaddr *)&addr6,
> +				 sizeof(addr6));
> +	if (server_v6 == -1)
> +		goto out;
> +
> +	/* Connect to unbound ports */
> +	addr4.sin_port = htons(TEST_DPORT);
> +	addr6.sin6_port = htons(TEST_DPORT);
> +
> +	test__start_subtest("ipv4 port redir");
> +	if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
> +		goto out;
> +
> +	test__start_subtest("ipv6 port redir");
> +	if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
> +		goto out;
> +
> +	/* Connect to unbound addresses */
> +	addr4.sin_addr.s_addr = htonl(TEST_DADDR);
> +	addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR);
> +
> +	test__start_subtest("ipv4 addr redir");
> +	if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
> +		goto out;
> +
> +	test__start_subtest("ipv6 addr redir");
> +	if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
> +		goto out;
> +
> +	err = 0;
> +out:
> +	close(server);
> +	close(server_v6);
> +	return err;
> +}
> +
> +void test_sk_assign(void)
> +{
> +	int self_net;
> +
> +	self_net = open(NS_SELF, O_RDONLY);
> +	if (CHECK_FAIL(self_net < 0)) {
> +		perror("Unable to open "NS_SELF);
> +		return;
> +	}
> +
> +	if (!configure_stack(self_net)) {
> +		perror("configure_stack");
> +		goto cleanup;
> +	}
> +
> +	do_sk_assign();
> +
> +cleanup:
> +	close(self_net);

Did we exit the newly unshared net namespace and restored the previous 
namespace?

> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c
> new file mode 100644
> index 000000000000..7de30ad3f594
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
> @@ -0,0 +1,127 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2019 Cloudflare Ltd.
> +
> +#include <stddef.h>
> +#include <stdbool.h>
> +#include <string.h>
> +#include <linux/bpf.h>
> +#include <linux/if_ether.h>
> +#include <linux/in.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <linux/pkt_cls.h>
> +#include <linux/tcp.h>
> +#include <sys/socket.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_endian.h>
> +
> +int _version SEC("version") = 1;
> +char _license[] SEC("license") = "GPL";
> +
> +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
> +static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off,
> +					void *data_end, __u16 eth_proto,
> +					bool *ipv4)
> +{
> +	struct bpf_sock_tuple *result;
> +	__u8 proto = 0;
> +	__u64 ihl_len;
> +
> +	if (eth_proto == bpf_htons(ETH_P_IP)) {
> +		struct iphdr *iph = (struct iphdr *)(data + nh_off);
> +
> +		if (iph + 1 > data_end)
> +			return NULL;
> +		if (iph->ihl != 5)
> +			/* Options are not supported */
> +			return NULL;
> +		ihl_len = iph->ihl * 4;
> +		proto = iph->protocol;
> +		*ipv4 = true;
> +		result = (struct bpf_sock_tuple *)&iph->saddr;
> +	} else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
> +		struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
> +
> +		if (ip6h + 1 > data_end)
> +			return NULL;
> +		ihl_len = sizeof(*ip6h);
> +		proto = ip6h->nexthdr;
> +		*ipv4 = false;
> +		result = (struct bpf_sock_tuple *)&ip6h->saddr;
> +	} else {
> +		return NULL;
> +	}
> +
> +	if (result + 1 > data_end || proto != IPPROTO_TCP)
> +		return NULL;
> +
> +	return result;
> +}
> +
> +SEC("sk_assign_test")
> +int bpf_sk_assign_test(struct __sk_buff *skb)
> +{
> +	void *data_end = (void *)(long)skb->data_end;
> +	void *data = (void *)(long)skb->data;
> +	struct ethhdr *eth = (struct ethhdr *)(data);
> +	struct bpf_sock_tuple *tuple, ln = {0};
> +	struct bpf_sock *sk;
> +	int tuple_len;
> +	bool ipv4;
> +	int ret;
> +
> +	if (eth + 1 > data_end)
> +		return TC_ACT_SHOT;
> +
> +	tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4);
> +	if (!tuple)
> +		return TC_ACT_SHOT;
> +
> +	tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
> +	sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);

You can get rid of tuple_len with
	if (ipv4)
		sk = bpf_skc_lookup_tcp(..., sizeof(tuple->ipv4), ...);
	else
		sk = bpf_skc_lookup_tcp(..., sizeof(tuple->ipv6), ...);

and later on you can do common bpf_skc_lookup_tcp.
But it may not be worthwhile to do it, as you have two separate calls
in the above instead.

> +	if (sk) {
> +		if (sk->state != BPF_TCP_LISTEN)
> +			goto assign;
> +
> +		bpf_sk_release(sk);
> +	}
> +
> +	if (ipv4) {
> +		if (tuple->ipv4.dport != bpf_htons(4321))
> +			return TC_ACT_OK;
> +
> +		ln.ipv4.daddr = bpf_htonl(0x7f000001);
> +		ln.ipv4.dport = bpf_htons(1234);
> +
> +		sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
> +					BPF_F_CURRENT_NETNS, 0);
> +	} else {
> +		if (tuple->ipv6.dport != bpf_htons(4321))
> +			return TC_ACT_OK;
> +
> +		/* Upper parts of daddr are already zero. */
> +		ln.ipv6.daddr[3] = bpf_htonl(0x1);
> +		ln.ipv6.dport = bpf_htons(1234);
> +
> +		sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
> +					BPF_F_CURRENT_NETNS, 0);
> +	}
> +
> +	/* We can't do a single skc_lookup_tcp here, because then the compiler
> +	 * will likely spill tuple_len to the stack. This makes it lose all
> +	 * bounds information in the verifier, which then rejects the call as
> +	 * unsafe.
> +	 */

This is a known issue. For scalars, only constant is restored properly
in verifier at this moment. I did some hacking before to enable any
scalars. The fear is this will make pruning performs worse. More
study is needed here.

> +	if (!sk)
> +		return TC_ACT_SHOT;
> +
> +	if (sk->state != BPF_TCP_LISTEN) {
> +		bpf_sk_release(sk);
> +		return TC_ACT_SHOT;
> +	}
> +
> +assign:
> +	ret = bpf_sk_assign(skb, sk, 0);
> +	bpf_sk_release(sk);
> +	return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT;
> +}
>
Joe Stringer March 25, 2020, 8:55 p.m. UTC | #3
On Wed, Mar 25, 2020 at 3:35 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
>
> On Wed, 25 Mar 2020 at 05:58, Joe Stringer <joe@wand.net.nz> wrote:
> >
> > From: Lorenz Bauer <lmb@cloudflare.com>
> >
> > Attach a tc direct-action classifier to lo in a fresh network
> > namespace, and rewrite all connection attempts to localhost:4321
> > to localhost:1234 (for port tests) and connections to unreachable
> > IPv4/IPv6 IPs to the local socket (for address tests).
>
> Can you extend this to cover UDP as well?

I'm working on a follow-up series for UDP, we need this too.
Joe Stringer March 25, 2020, 9:20 p.m. UTC | #4
On Wed, Mar 25, 2020 at 11:18 AM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 3/24/20 10:57 PM, Joe Stringer wrote:
> > From: Lorenz Bauer <lmb@cloudflare.com>
> >
> > Attach a tc direct-action classifier to lo in a fresh network
> > namespace, and rewrite all connection attempts to localhost:4321
> > to localhost:1234 (for port tests) and connections to unreachable
> > IPv4/IPv6 IPs to the local socket (for address tests).
> >
> > Keep in mind that both client to server and server to client traffic
> > passes the classifier.
> >
> > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > Co-authored-by: Joe Stringer <joe@wand.net.nz>
> > Signed-off-by: Joe Stringer <joe@wand.net.nz>
> > ---
> > v2: Rebase onto test_progs infrastructure
> > v1: Initial commit
> > ---
> >   tools/testing/selftests/bpf/Makefile          |   2 +-
> >   .../selftests/bpf/prog_tests/sk_assign.c      | 244 ++++++++++++++++++
> >   .../selftests/bpf/progs/test_sk_assign.c      | 127 +++++++++
> >   3 files changed, 372 insertions(+), 1 deletion(-)
> >   create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c
> >   create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c
> >
> > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> > index 7729892e0b04..4f7f83d059ca 100644
> > --- a/tools/testing/selftests/bpf/Makefile
> > +++ b/tools/testing/selftests/bpf/Makefile
> > @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
> >   # Compile but not part of 'make run_tests'
> >   TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
> >       flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
> > -     test_lirc_mode2_user xdping test_cpp runqslower
> > +     test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign
>
> No test_sk_assign any more as the test is integrated into test_progs, right?

I'll fix it up.

> > +static __u32 duration;
> > +
> > +static bool configure_stack(int self_net)
>
> self_net parameter is not used.

Hrm, why didn't the compiler tell me this..? Will fix.

> > +{
> > +     /* Move to a new networking namespace */
> > +     if (CHECK_FAIL(unshare(CLONE_NEWNET)))
> > +             return false;
>
> You can use CHECK to encode better error messages. Thhis is what
> most test_progs tests are using.

I was going back and forth on this when I was writing this bit.
CHECK_FAIL() already prints the line that fails, so when debugging
it's pretty clear what call went wrong if you dig into the code.
Combine with perror() and you actually get a readable string of the
error, whereas the common form for CHECK() seems to be just printing
the error code which the developer then has to do symbol lookup to
interpret..

    if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))

Example output with CHECK_FAIL / perror approach:

    # ./test_progs -t assign
    ...
    Timed out while connecting to server
    connect_to_server:FAIL:90
    Cannot connect to server: Interrupted system call
    #46/1 ipv4 port redir:FAIL
    #46 sk_assign:FAIL
    Summary: 0/0 PASSED, 0 SKIPPED, 2 FAILED

Diff to make this happen is just connect to a port that the BPF
program doesn't redirect:

$ git diff
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c
b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
index 1f0afcc20c48..ba661145518a 100644
--- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c
+++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
@@ -192,7 +191,7 @@ static int do_sk_assign(void)
                goto out;

        /* Connect to unbound ports */
-       addr4.sin_port = htons(TEST_DPORT);
+       addr4.sin_port = htons(666);
        addr6.sin6_port = htons(TEST_DPORT);

        test__start_subtest("ipv4 port redir");

(I had to drop the kill() call as well, but that's part of the next
revision of this series..)

> > +static int connect_to_server(const struct sockaddr *addr, socklen_t len)
> > +{
> > +     int fd = -1;
> > +
> > +     fd = socket(addr->sa_family, SOCK_STREAM, 0);
> > +     if (CHECK_FAIL(fd == -1))
> > +             goto out;
> > +     if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
> > +             goto out;
>
> should this goto close_out?

Will fix.

> > +void test_sk_assign(void)
> > +{
> > +     int self_net;
> > +
> > +     self_net = open(NS_SELF, O_RDONLY);
> > +     if (CHECK_FAIL(self_net < 0)) {
> > +             perror("Unable to open "NS_SELF);
> > +             return;
> > +     }
> > +
> > +     if (!configure_stack(self_net)) {
> > +             perror("configure_stack");
> > +             goto cleanup;
> > +     }
> > +
> > +     do_sk_assign();
> > +
> > +cleanup:
> > +     close(self_net);
>
> Did we exit the newly unshared net namespace and restored the previous
> namespace?

Ah I've mainly just been running this test so it didn't affect me but
I realise now I dropped the hunks that were intended to do this
cleanup. Will fix.

> > +     /* We can't do a single skc_lookup_tcp here, because then the compiler
> > +      * will likely spill tuple_len to the stack. This makes it lose all
> > +      * bounds information in the verifier, which then rejects the call as
> > +      * unsafe.
> > +      */
>
> This is a known issue. For scalars, only constant is restored properly
> in verifier at this moment. I did some hacking before to enable any
> scalars. The fear is this will make pruning performs worse. More
> study is needed here.

Thanks for the background. Do you want me to refer to any specific
release version or date or commit for this comment or it's fine to
leave as-is?
Yonghong Song March 25, 2020, 10 p.m. UTC | #5
On 3/25/20 2:20 PM, Joe Stringer wrote:
> On Wed, Mar 25, 2020 at 11:18 AM Yonghong Song <yhs@fb.com> wrote:
>>
>>
>>
>> On 3/24/20 10:57 PM, Joe Stringer wrote:
>>> From: Lorenz Bauer <lmb@cloudflare.com>
>>>
>>> Attach a tc direct-action classifier to lo in a fresh network
>>> namespace, and rewrite all connection attempts to localhost:4321
>>> to localhost:1234 (for port tests) and connections to unreachable
>>> IPv4/IPv6 IPs to the local socket (for address tests).
>>>
>>> Keep in mind that both client to server and server to client traffic
>>> passes the classifier.
>>>
>>> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
>>> Co-authored-by: Joe Stringer <joe@wand.net.nz>
>>> Signed-off-by: Joe Stringer <joe@wand.net.nz>
>>> ---
>>> v2: Rebase onto test_progs infrastructure
>>> v1: Initial commit
>>> ---
>>>    tools/testing/selftests/bpf/Makefile          |   2 +-
>>>    .../selftests/bpf/prog_tests/sk_assign.c      | 244 ++++++++++++++++++
>>>    .../selftests/bpf/progs/test_sk_assign.c      | 127 +++++++++
>>>    3 files changed, 372 insertions(+), 1 deletion(-)
>>>    create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c
>>>    create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c
>>>
>>> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
>>> index 7729892e0b04..4f7f83d059ca 100644
>>> --- a/tools/testing/selftests/bpf/Makefile
>>> +++ b/tools/testing/selftests/bpf/Makefile
>>> @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
>>>    # Compile but not part of 'make run_tests'
>>>    TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
>>>        flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
>>> -     test_lirc_mode2_user xdping test_cpp runqslower
>>> +     test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign
>>
>> No test_sk_assign any more as the test is integrated into test_progs, right?
> 
> I'll fix it up.
> 
>>> +static __u32 duration;
>>> +
>>> +static bool configure_stack(int self_net)
>>
>> self_net parameter is not used.
> 
> Hrm, why didn't the compiler tell me this..? Will fix.
> 
>>> +{
>>> +     /* Move to a new networking namespace */
>>> +     if (CHECK_FAIL(unshare(CLONE_NEWNET)))
>>> +             return false;
>>
>> You can use CHECK to encode better error messages. Thhis is what
>> most test_progs tests are using.
> 
> I was going back and forth on this when I was writing this bit.
> CHECK_FAIL() already prints the line that fails, so when debugging
> it's pretty clear what call went wrong if you dig into the code.
> Combine with perror() and you actually get a readable string of the
> error, whereas the common form for CHECK() seems to be just printing
> the error code which the developer then has to do symbol lookup to
> interpret..
> 
>      if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
> 
> Example output with CHECK_FAIL / perror approach:
> 
>      # ./test_progs -t assign
>      ...
>      Timed out while connecting to server
>      connect_to_server:FAIL:90
>      Cannot connect to server: Interrupted system call
>      #46/1 ipv4 port redir:FAIL
>      #46 sk_assign:FAIL
>      Summary: 0/0 PASSED, 0 SKIPPED, 2 FAILED

I won't insist since CHECK_FAIL should roughly provide enough
information for failure. CHECK might be more useful if you want
to provide more context, esp. if the same routine is called
in multiple places and you can have a marker to differentiate
which call site caused the problem.

But again, just a suggestion. CHECK_FAIL is okay to me.

> 
> Diff to make this happen is just connect to a port that the BPF
> program doesn't redirect:
> 
> $ git diff
> diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> index 1f0afcc20c48..ba661145518a 100644
> --- a/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> @@ -192,7 +191,7 @@ static int do_sk_assign(void)
>                  goto out;
> 
>          /* Connect to unbound ports */
> -       addr4.sin_port = htons(TEST_DPORT);
> +       addr4.sin_port = htons(666);
>          addr6.sin6_port = htons(TEST_DPORT);
> 
>          test__start_subtest("ipv4 port redir");
> 
> (I had to drop the kill() call as well, but that's part of the next
> revision of this series..)
> 
>>> +static int connect_to_server(const struct sockaddr *addr, socklen_t len)
>>> +{
>>> +     int fd = -1;
>>> +
>>> +     fd = socket(addr->sa_family, SOCK_STREAM, 0);
>>> +     if (CHECK_FAIL(fd == -1))
>>> +             goto out;
>>> +     if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
>>> +             goto out;
>>
>> should this goto close_out?
> 
> Will fix.
> 
>>> +void test_sk_assign(void)
>>> +{
>>> +     int self_net;
>>> +
>>> +     self_net = open(NS_SELF, O_RDONLY);
>>> +     if (CHECK_FAIL(self_net < 0)) {
>>> +             perror("Unable to open "NS_SELF);
>>> +             return;
>>> +     }
>>> +
>>> +     if (!configure_stack(self_net)) {
>>> +             perror("configure_stack");
>>> +             goto cleanup;
>>> +     }
>>> +
>>> +     do_sk_assign();
>>> +
>>> +cleanup:
>>> +     close(self_net);
>>
>> Did we exit the newly unshared net namespace and restored the previous
>> namespace?
> 
> Ah I've mainly just been running this test so it didn't affect me but
> I realise now I dropped the hunks that were intended to do this
> cleanup. Will fix.
> 
>>> +     /* We can't do a single skc_lookup_tcp here, because then the compiler
>>> +      * will likely spill tuple_len to the stack. This makes it lose all
>>> +      * bounds information in the verifier, which then rejects the call as
>>> +      * unsafe.
>>> +      */
>>
>> This is a known issue. For scalars, only constant is restored properly
>> in verifier at this moment. I did some hacking before to enable any
>> scalars. The fear is this will make pruning performs worse. More
>> study is needed here.
> 
> Thanks for the background. Do you want me to refer to any specific
> release version or date or commit for this comment or it's fine to
> leave as-is?

Maybe add a "workaround:" marker in the comments so later we can search
and find these examples if we have compiler/verifier improvements.

-bash-4.4$ egrep -ri workaround
test_get_stack_rawtp.c: * This is an acceptable workaround since there 
is one entry here.
test_seg6_loop.c:       // workaround: define induction variable "i" as 
"long" instead
test_sysctl_loop1.c:    /* a workaround to prevent compiler from generating
-bash-4.4$
Joe Stringer March 25, 2020, 11:07 p.m. UTC | #6
On Wed, Mar 25, 2020 at 3:01 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 3/25/20 2:20 PM, Joe Stringer wrote:
> > On Wed, Mar 25, 2020 at 11:18 AM Yonghong Song <yhs@fb.com> wrote:
> >>
> >>
> >>
> >> On 3/24/20 10:57 PM, Joe Stringer wrote:
> >>> From: Lorenz Bauer <lmb@cloudflare.com>
> >>>
> >>> Attach a tc direct-action classifier to lo in a fresh network
> >>> namespace, and rewrite all connection attempts to localhost:4321
> >>> to localhost:1234 (for port tests) and connections to unreachable
> >>> IPv4/IPv6 IPs to the local socket (for address tests).
> >>>
> >>> Keep in mind that both client to server and server to client traffic
> >>> passes the classifier.
> >>>
> >>> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> >>> Co-authored-by: Joe Stringer <joe@wand.net.nz>
> >>> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> >>> ---
> >>> v2: Rebase onto test_progs infrastructure
> >>> v1: Initial commit
> >>> ---
> >>>    tools/testing/selftests/bpf/Makefile          |   2 +-
> >>>    .../selftests/bpf/prog_tests/sk_assign.c      | 244 ++++++++++++++++++
> >>>    .../selftests/bpf/progs/test_sk_assign.c      | 127 +++++++++
> >>>    3 files changed, 372 insertions(+), 1 deletion(-)
> >>>    create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c
> >>>    create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c
> >>>
> >>> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> >>> index 7729892e0b04..4f7f83d059ca 100644
> >>> --- a/tools/testing/selftests/bpf/Makefile
> >>> +++ b/tools/testing/selftests/bpf/Makefile
> >>> @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
> >>>    # Compile but not part of 'make run_tests'
> >>>    TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
> >>>        flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
> >>> -     test_lirc_mode2_user xdping test_cpp runqslower
> >>> +     test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign
> >>
> >> No test_sk_assign any more as the test is integrated into test_progs, right?
> >
> > I'll fix it up.
> >
> >>> +static __u32 duration;
> >>> +
> >>> +static bool configure_stack(int self_net)
> >>
> >> self_net parameter is not used.
> >
> > Hrm, why didn't the compiler tell me this..? Will fix.
> >
> >>> +{
> >>> +     /* Move to a new networking namespace */
> >>> +     if (CHECK_FAIL(unshare(CLONE_NEWNET)))
> >>> +             return false;
> >>
> >> You can use CHECK to encode better error messages. Thhis is what
> >> most test_progs tests are using.
> >
> > I was going back and forth on this when I was writing this bit.
> > CHECK_FAIL() already prints the line that fails, so when debugging
> > it's pretty clear what call went wrong if you dig into the code.
> > Combine with perror() and you actually get a readable string of the
> > error, whereas the common form for CHECK() seems to be just printing
> > the error code which the developer then has to do symbol lookup to
> > interpret..
> >
> >      if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
> >
> > Example output with CHECK_FAIL / perror approach:
> >
> >      # ./test_progs -t assign
> >      ...
> >      Timed out while connecting to server
> >      connect_to_server:FAIL:90
> >      Cannot connect to server: Interrupted system call
> >      #46/1 ipv4 port redir:FAIL
> >      #46 sk_assign:FAIL
> >      Summary: 0/0 PASSED, 0 SKIPPED, 2 FAILED
>
> I won't insist since CHECK_FAIL should roughly provide enough
> information for failure. CHECK might be more useful if you want
> to provide more context, esp. if the same routine is called
> in multiple places and you can have a marker to differentiate
> which call site caused the problem.

Good point, maybe for extra context the subtests can use CHECK() in
addition to the CHECK_FAIL.

> But again, just a suggestion. CHECK_FAIL is okay to me.

<snip>

> >>> +     /* We can't do a single skc_lookup_tcp here, because then the compiler
> >>> +      * will likely spill tuple_len to the stack. This makes it lose all
> >>> +      * bounds information in the verifier, which then rejects the call as
> >>> +      * unsafe.
> >>> +      */
> >>
> >> This is a known issue. For scalars, only constant is restored properly
> >> in verifier at this moment. I did some hacking before to enable any
> >> scalars. The fear is this will make pruning performs worse. More
> >> study is needed here.
> >
> > Thanks for the background. Do you want me to refer to any specific
> > release version or date or commit for this comment or it's fine to
> > leave as-is?
>
> Maybe add a "workaround:" marker in the comments so later we can search
> and find these examples if we have compiler/verifier improvements.
>
> -bash-4.4$ egrep -ri workaround
> test_get_stack_rawtp.c: * This is an acceptable workaround since there
> is one entry here.
> test_seg6_loop.c:       // workaround: define induction variable "i" as
> "long" instead
> test_sysctl_loop1.c:    /* a workaround to prevent compiler from generating
> -bash-4.4$

SGTM, Will roll that in thanks.
Andrii Nakryiko March 26, 2020, 2:04 a.m. UTC | #7
On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote:
>
> From: Lorenz Bauer <lmb@cloudflare.com>
>
> Attach a tc direct-action classifier to lo in a fresh network
> namespace, and rewrite all connection attempts to localhost:4321
> to localhost:1234 (for port tests) and connections to unreachable
> IPv4/IPv6 IPs to the local socket (for address tests).
>
> Keep in mind that both client to server and server to client traffic
> passes the classifier.
>
> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> Co-authored-by: Joe Stringer <joe@wand.net.nz>
> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> ---

Can you please check that you test fails (instead of getting stuck)
when there is something wrong with network. We went through this
exercise with tcp_rtt and sockmap_listen, where a bunch of stuff was
blocking. This works fine when everything works, but horribly breaks
when something is not working. Given this is part of test_progs, let's
please make sure we don't deadlock anywhere.

> v2: Rebase onto test_progs infrastructure
> v1: Initial commit
> ---
>  tools/testing/selftests/bpf/Makefile          |   2 +-
>  .../selftests/bpf/prog_tests/sk_assign.c      | 244 ++++++++++++++++++
>  .../selftests/bpf/progs/test_sk_assign.c      | 127 +++++++++
>  3 files changed, 372 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c
>  create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c
>

[...]
Andrii Nakryiko March 26, 2020, 2:16 a.m. UTC | #8
On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote:
>
> From: Lorenz Bauer <lmb@cloudflare.com>
>
> Attach a tc direct-action classifier to lo in a fresh network
> namespace, and rewrite all connection attempts to localhost:4321
> to localhost:1234 (for port tests) and connections to unreachable
> IPv4/IPv6 IPs to the local socket (for address tests).
>
> Keep in mind that both client to server and server to client traffic
> passes the classifier.
>
> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> Co-authored-by: Joe Stringer <joe@wand.net.nz>
> Signed-off-by: Joe Stringer <joe@wand.net.nz>
> ---
> v2: Rebase onto test_progs infrastructure
> v1: Initial commit
> ---
>  tools/testing/selftests/bpf/Makefile          |   2 +-
>  .../selftests/bpf/prog_tests/sk_assign.c      | 244 ++++++++++++++++++
>  .../selftests/bpf/progs/test_sk_assign.c      | 127 +++++++++
>  3 files changed, 372 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c
>  create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c
>
> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> index 7729892e0b04..4f7f83d059ca 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
>  # Compile but not part of 'make run_tests'
>  TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
>         flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
> -       test_lirc_mode2_user xdping test_cpp runqslower
> +       test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign
>
>  TEST_CUSTOM_PROGS = urandom_read
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> new file mode 100644
> index 000000000000..1f0afcc20c48
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> @@ -0,0 +1,244 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2018 Facebook
> +// Copyright (c) 2019 Cloudflare
> +// Copyright (c) 2020 Isovalent, Inc.
> +/*
> + * Test that the socket assign program is able to redirect traffic towards a
> + * socket, regardless of whether the port or address destination of the traffic
> + * matches the port.
> + */
> +
> +#define _GNU_SOURCE
> +#include <fcntl.h>
> +#include <signal.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +
> +#include "test_progs.h"
> +
> +#define TEST_DPORT 4321
> +#define TEST_DADDR (0xC0A80203)
> +#define NS_SELF "/proc/self/ns/net"
> +
> +static __u32 duration;
> +
> +static bool configure_stack(int self_net)
> +{
> +       /* Move to a new networking namespace */
> +       if (CHECK_FAIL(unshare(CLONE_NEWNET)))
> +               return false;
> +
> +       /* Configure necessary links, routes */
> +       if (CHECK_FAIL(system("ip link set dev lo up")))
> +               return false;
> +       if (CHECK_FAIL(system("ip route add local default dev lo")))
> +               return false;
> +       if (CHECK_FAIL(system("ip -6 route add local default dev lo")))
> +               return false;
> +
> +       /* Load qdisc, BPF program */
> +       if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
> +               return false;
> +       if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action "
> +                    "object-file ./test_sk_assign.o section sk_assign_test")))
> +               return false;
> +
> +       return true;
> +}
> +
> +static int start_server(const struct sockaddr *addr, socklen_t len)
> +{
> +       int fd;
> +
> +       fd = socket(addr->sa_family, SOCK_STREAM, 0);
> +       if (CHECK_FAIL(fd == -1))
> +               goto out;
> +       if (CHECK_FAIL(bind(fd, addr, len) == -1))
> +               goto close_out;
> +       if (CHECK_FAIL(listen(fd, 128) == -1))
> +               goto close_out;
> +
> +       goto out;
> +
> +close_out:
> +       close(fd);
> +       fd = -1;
> +out:
> +       return fd;
> +}
> +
> +static void handle_timeout(int signum)
> +{
> +       if (signum == SIGALRM)
> +               fprintf(stderr, "Timed out while connecting to server\n");
> +       kill(0, SIGKILL);
> +}
> +
> +static struct sigaction timeout_action = {
> +       .sa_handler = handle_timeout,
> +};
> +
> +static int connect_to_server(const struct sockaddr *addr, socklen_t len)
> +{
> +       int fd = -1;
> +
> +       fd = socket(addr->sa_family, SOCK_STREAM, 0);
> +       if (CHECK_FAIL(fd == -1))
> +               goto out;
> +       if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
> +               goto out;

no-no-no, we are not doing this. It's part of prog_tests and shouldn't
install its own signal handlers and sending asynchronous signals to
itself. Please find another way to have a timeout.

> +       alarm(3);
> +       if (CHECK_FAIL(connect(fd, addr, len) == -1))
> +               goto close_out;
> +
> +       goto out;
> +
> +close_out:
> +       close(fd);
> +       fd = -1;
> +out:
> +       return fd;
> +}
> +
> +static in_port_t get_port(int fd)
> +{
> +       struct sockaddr_storage name;
> +       socklen_t len;
> +       in_port_t port = 0;
> +
> +       len = sizeof(name);
> +       if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len)))
> +               return port;
> +
> +       switch (name.ss_family) {
> +       case AF_INET:
> +               port = ((struct sockaddr_in *)&name)->sin_port;
> +               break;
> +       case AF_INET6:
> +               port = ((struct sockaddr_in6 *)&name)->sin6_port;
> +               break;
> +       default:
> +               CHECK(1, "Invalid address family", "%d\n", name.ss_family);
> +       }
> +       return port;
> +}
> +
> +static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len)
> +{
> +       int client = -1, srv_client = -1;
> +       char buf[] = "testing";
> +       in_port_t port;
> +       int ret = 1;
> +
> +       client = connect_to_server(addr, len);
> +       if (client == -1) {
> +               perror("Cannot connect to server");
> +               goto out;
> +       }
> +
> +       srv_client = accept(server_fd, NULL, NULL);
> +       if (CHECK_FAIL(srv_client == -1)) {
> +               perror("Can't accept connection");
> +               goto out;
> +       }
> +       if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) {
> +               perror("Can't write on client");
> +               goto out;
> +       }
> +       if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) {
> +               perror("Can't read on server");
> +               goto out;
> +       }
> +
> +       port = get_port(srv_client);
> +       if (CHECK_FAIL(!port))
> +               goto out;
> +       if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u",
> +                 TEST_DPORT, ntohs(port)))
> +               goto out;
> +
> +       ret = 0;
> +out:
> +       close(client);
> +       close(srv_client);
> +       return ret;
> +}
> +
> +static int do_sk_assign(void)
> +{
> +       struct sockaddr_in addr4;
> +       struct sockaddr_in6 addr6;
> +       int server = -1;
> +       int server_v6 = -1;
> +       int err = 1;
> +
> +       memset(&addr4, 0, sizeof(addr4));
> +       addr4.sin_family = AF_INET;
> +       addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
> +       addr4.sin_port = htons(1234);
> +
> +       memset(&addr6, 0, sizeof(addr6));
> +       addr6.sin6_family = AF_INET6;
> +       addr6.sin6_addr = in6addr_loopback;
> +       addr6.sin6_port = htons(1234);
> +
> +       server = start_server((const struct sockaddr *)&addr4, sizeof(addr4));
> +       if (server == -1)
> +               goto out;
> +
> +       server_v6 = start_server((const struct sockaddr *)&addr6,
> +                                sizeof(addr6));
> +       if (server_v6 == -1)
> +               goto out;
> +
> +       /* Connect to unbound ports */
> +       addr4.sin_port = htons(TEST_DPORT);
> +       addr6.sin6_port = htons(TEST_DPORT);
> +
> +       test__start_subtest("ipv4 port redir");
> +       if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
> +               goto out;
> +
> +       test__start_subtest("ipv6 port redir");
> +       if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
> +               goto out;
> +
> +       /* Connect to unbound addresses */
> +       addr4.sin_addr.s_addr = htonl(TEST_DADDR);
> +       addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR);
> +
> +       test__start_subtest("ipv4 addr redir");
> +       if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
> +               goto out;
> +
> +       test__start_subtest("ipv6 addr redir");

test__start_subtest() returns false if subtest is supposed to be
skipped. If you ignore that, then test_progs -t and -n filtering won't
work properly.

> +       if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
> +               goto out;
> +
> +       err = 0;
> +out:
> +       close(server);
> +       close(server_v6);
> +       return err;
> +}
> +
> +void test_sk_assign(void)
> +{
> +       int self_net;
> +
> +       self_net = open(NS_SELF, O_RDONLY);

I'm not familiar with what this does. Can you please explain briefly
what are the side effects of this? Asking because of shared test_progs
environment worries, of course.

> +       if (CHECK_FAIL(self_net < 0)) {
> +               perror("Unable to open "NS_SELF);
> +               return;
> +       }
> +
> +       if (!configure_stack(self_net)) {
> +               perror("configure_stack");
> +               goto cleanup;
> +       }
> +
> +       do_sk_assign();
> +
> +cleanup:
> +       close(self_net);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c
> new file mode 100644
> index 000000000000..7de30ad3f594
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
> @@ -0,0 +1,127 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (c) 2019 Cloudflare Ltd.
> +
> +#include <stddef.h>
> +#include <stdbool.h>
> +#include <string.h>
> +#include <linux/bpf.h>
> +#include <linux/if_ether.h>
> +#include <linux/in.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <linux/pkt_cls.h>
> +#include <linux/tcp.h>
> +#include <sys/socket.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_endian.h>
> +
> +int _version SEC("version") = 1;
> +char _license[] SEC("license") = "GPL";
> +
> +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
> +static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off,
> +                                       void *data_end, __u16 eth_proto,
> +                                       bool *ipv4)
> +{
> +       struct bpf_sock_tuple *result;
> +       __u8 proto = 0;
> +       __u64 ihl_len;
> +
> +       if (eth_proto == bpf_htons(ETH_P_IP)) {
> +               struct iphdr *iph = (struct iphdr *)(data + nh_off);
> +
> +               if (iph + 1 > data_end)
> +                       return NULL;
> +               if (iph->ihl != 5)
> +                       /* Options are not supported */
> +                       return NULL;
> +               ihl_len = iph->ihl * 4;
> +               proto = iph->protocol;
> +               *ipv4 = true;
> +               result = (struct bpf_sock_tuple *)&iph->saddr;
> +       } else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
> +               struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
> +
> +               if (ip6h + 1 > data_end)
> +                       return NULL;
> +               ihl_len = sizeof(*ip6h);
> +               proto = ip6h->nexthdr;
> +               *ipv4 = false;
> +               result = (struct bpf_sock_tuple *)&ip6h->saddr;
> +       } else {
> +               return NULL;
> +       }
> +
> +       if (result + 1 > data_end || proto != IPPROTO_TCP)
> +               return NULL;
> +
> +       return result;
> +}
> +
> +SEC("sk_assign_test")

Please use "canonical" section name that libbpf recognizes. This
woulde be "action/" or "classifier/", right?

> +int bpf_sk_assign_test(struct __sk_buff *skb)
> +{
> +       void *data_end = (void *)(long)skb->data_end;
> +       void *data = (void *)(long)skb->data;
> +       struct ethhdr *eth = (struct ethhdr *)(data);
> +       struct bpf_sock_tuple *tuple, ln = {0};
> +       struct bpf_sock *sk;
> +       int tuple_len;
> +       bool ipv4;
> +       int ret;
> +
> +       if (eth + 1 > data_end)
> +               return TC_ACT_SHOT;
> +
> +       tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4);
> +       if (!tuple)
> +               return TC_ACT_SHOT;
> +
> +       tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
> +       sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
> +       if (sk) {
> +               if (sk->state != BPF_TCP_LISTEN)
> +                       goto assign;
> +
> +               bpf_sk_release(sk);
> +       }
> +
> +       if (ipv4) {
> +               if (tuple->ipv4.dport != bpf_htons(4321))
> +                       return TC_ACT_OK;
> +
> +               ln.ipv4.daddr = bpf_htonl(0x7f000001);
> +               ln.ipv4.dport = bpf_htons(1234);
> +
> +               sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
> +                                       BPF_F_CURRENT_NETNS, 0);
> +       } else {
> +               if (tuple->ipv6.dport != bpf_htons(4321))
> +                       return TC_ACT_OK;
> +
> +               /* Upper parts of daddr are already zero. */
> +               ln.ipv6.daddr[3] = bpf_htonl(0x1);
> +               ln.ipv6.dport = bpf_htons(1234);
> +
> +               sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
> +                                       BPF_F_CURRENT_NETNS, 0);
> +       }
> +
> +       /* We can't do a single skc_lookup_tcp here, because then the compiler
> +        * will likely spill tuple_len to the stack. This makes it lose all
> +        * bounds information in the verifier, which then rejects the call as
> +        * unsafe.
> +        */
> +       if (!sk)
> +               return TC_ACT_SHOT;
> +
> +       if (sk->state != BPF_TCP_LISTEN) {
> +               bpf_sk_release(sk);
> +               return TC_ACT_SHOT;
> +       }
> +
> +assign:
> +       ret = bpf_sk_assign(skb, sk, 0);
> +       bpf_sk_release(sk);
> +       return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT;
> +}
> --
> 2.20.1
>
Joe Stringer March 26, 2020, 5:28 a.m. UTC | #9
On Wed, Mar 25, 2020 at 7:16 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote:
> >
> > From: Lorenz Bauer <lmb@cloudflare.com>
> >
> > Attach a tc direct-action classifier to lo in a fresh network
> > namespace, and rewrite all connection attempts to localhost:4321
> > to localhost:1234 (for port tests) and connections to unreachable
> > IPv4/IPv6 IPs to the local socket (for address tests).
> >
> > Keep in mind that both client to server and server to client traffic
> > passes the classifier.
> >
> > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > Co-authored-by: Joe Stringer <joe@wand.net.nz>
> > Signed-off-by: Joe Stringer <joe@wand.net.nz>
> > ---

<snip>

> > +static void handle_timeout(int signum)
> > +{
> > +       if (signum == SIGALRM)
> > +               fprintf(stderr, "Timed out while connecting to server\n");
> > +       kill(0, SIGKILL);
> > +}
> > +
> > +static struct sigaction timeout_action = {
> > +       .sa_handler = handle_timeout,
> > +};
> > +
> > +static int connect_to_server(const struct sockaddr *addr, socklen_t len)
> > +{
> > +       int fd = -1;
> > +
> > +       fd = socket(addr->sa_family, SOCK_STREAM, 0);
> > +       if (CHECK_FAIL(fd == -1))
> > +               goto out;
> > +       if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
> > +               goto out;
>
> no-no-no, we are not doing this. It's part of prog_tests and shouldn't
> install its own signal handlers and sending asynchronous signals to
> itself. Please find another way to have a timeout.

I realise it didn't clean up after itself. How about signal(SIGALRM,
SIG_DFL); just like other existing tests do?

> > +       test__start_subtest("ipv4 addr redir");
> > +       if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
> > +               goto out;
> > +
> > +       test__start_subtest("ipv6 addr redir");
>
> test__start_subtest() returns false if subtest is supposed to be
> skipped. If you ignore that, then test_progs -t and -n filtering won't
> work properly.

Will fix.

> > +       if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
> > +               goto out;
> > +
> > +       err = 0;
> > +out:
> > +       close(server);
> > +       close(server_v6);
> > +       return err;
> > +}
> > +
> > +void test_sk_assign(void)
> > +{
> > +       int self_net;
> > +
> > +       self_net = open(NS_SELF, O_RDONLY);
>
> I'm not familiar with what this does. Can you please explain briefly
> what are the side effects of this? Asking because of shared test_progs
> environment worries, of course.

This one is opening an fd to the current program's netns path on the
filesystem. The intention was to use it to switch back to the current
netns after the unshare() call elsewhere which switches to a new
netns. As per the other feedback the bit where it switches back to
this netns was dropped along the way so I'll fix it up in the next
revision.

> > +SEC("sk_assign_test")
>
> Please use "canonical" section name that libbpf recognizes. This
> woulde be "action/" or "classifier/", right?

Will fix.
Martin KaFai Lau March 26, 2020, 6:25 a.m. UTC | #10
On Wed, Mar 25, 2020 at 01:55:59PM -0700, Joe Stringer wrote:
> On Wed, Mar 25, 2020 at 3:35 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
> >
> > On Wed, 25 Mar 2020 at 05:58, Joe Stringer <joe@wand.net.nz> wrote:
> > >
> > > From: Lorenz Bauer <lmb@cloudflare.com>
> > >
> > > Attach a tc direct-action classifier to lo in a fresh network
> > > namespace, and rewrite all connection attempts to localhost:4321
> > > to localhost:1234 (for port tests) and connections to unreachable
> > > IPv4/IPv6 IPs to the local socket (for address tests).
> >
> > Can you extend this to cover UDP as well?
> 
> I'm working on a follow-up series for UDP, we need this too.
Other than selftests, what are the changes for UDP in patch 1 - 4?
Martin KaFai Lau March 26, 2020, 6:31 a.m. UTC | #11
On Wed, Mar 25, 2020 at 10:28:11PM -0700, Joe Stringer wrote:
> On Wed, Mar 25, 2020 at 7:16 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote:
> > >
> > > From: Lorenz Bauer <lmb@cloudflare.com>
> > >
> > > Attach a tc direct-action classifier to lo in a fresh network
> > > namespace, and rewrite all connection attempts to localhost:4321
> > > to localhost:1234 (for port tests) and connections to unreachable
> > > IPv4/IPv6 IPs to the local socket (for address tests).
> > >
> > > Keep in mind that both client to server and server to client traffic
> > > passes the classifier.
> > >
> > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > > Co-authored-by: Joe Stringer <joe@wand.net.nz>
> > > Signed-off-by: Joe Stringer <joe@wand.net.nz>
> > > ---
> 
> <snip>
> 
> > > +static void handle_timeout(int signum)
> > > +{
> > > +       if (signum == SIGALRM)
> > > +               fprintf(stderr, "Timed out while connecting to server\n");
> > > +       kill(0, SIGKILL);
> > > +}
> > > +
> > > +static struct sigaction timeout_action = {
> > > +       .sa_handler = handle_timeout,
> > > +};
> > > +
> > > +static int connect_to_server(const struct sockaddr *addr, socklen_t len)
> > > +{
> > > +       int fd = -1;
> > > +
> > > +       fd = socket(addr->sa_family, SOCK_STREAM, 0);
> > > +       if (CHECK_FAIL(fd == -1))
> > > +               goto out;
> > > +       if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
> > > +               goto out;
> >
> > no-no-no, we are not doing this. It's part of prog_tests and shouldn't
> > install its own signal handlers and sending asynchronous signals to
> > itself. Please find another way to have a timeout.
> 
> I realise it didn't clean up after itself. How about signal(SIGALRM,
> SIG_DFL); just like other existing tests do?
or setsockopt SO_SNDTIMEO?
Joe Stringer March 26, 2020, 6:38 a.m. UTC | #12
On Wed, Mar 25, 2020 at 11:25 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> On Wed, Mar 25, 2020 at 01:55:59PM -0700, Joe Stringer wrote:
> > On Wed, Mar 25, 2020 at 3:35 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
> > >
> > > On Wed, 25 Mar 2020 at 05:58, Joe Stringer <joe@wand.net.nz> wrote:
> > > >
> > > > From: Lorenz Bauer <lmb@cloudflare.com>
> > > >
> > > > Attach a tc direct-action classifier to lo in a fresh network
> > > > namespace, and rewrite all connection attempts to localhost:4321
> > > > to localhost:1234 (for port tests) and connections to unreachable
> > > > IPv4/IPv6 IPs to the local socket (for address tests).
> > >
> > > Can you extend this to cover UDP as well?
> >
> > I'm working on a follow-up series for UDP, we need this too.
> Other than selftests, what are the changes for UDP in patch 1 - 4?

Nothing in those patches, I have refactoring of all of the socket
helpers, skc_lookup_udp() and adding flags to the socket lookup
functions to support only looking for a certain type of sockets -
established or listen. This helps to avoid multiple lookups in these
cases where you really just want to look up established sockets with
the packet tuple first then look up the listener socket with the
unrelated/tproxy tuple. For UDP it makes it easier to find the correct
socket and in general (including TCP) helps to avoid up to two socket
hashtable lookups for this use case. This part is because the current
helpers all look up the established socket first then the listener
socket, so for the first packet that hits these we perform both of
these lookups for the packet tuple (which finds nothing), then look up
an established socket for the target tuple (which finds nothing) then
finally a listen socket for the target tuple. It's about another 300+
/ 250- changes overall, of which a large chunk is one patch that
refactors the code into macros. I haven't narrowed down for sure
whether the lookup flags patch is required for UDP cases yet.
Lorenz Bauer March 26, 2020, 10:13 a.m. UTC | #13
On Wed, 25 Mar 2020 at 18:17, Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 3/24/20 10:57 PM, Joe Stringer wrote:
> > From: Lorenz Bauer <lmb@cloudflare.com>
> >
> > Attach a tc direct-action classifier to lo in a fresh network
> > namespace, and rewrite all connection attempts to localhost:4321
> > to localhost:1234 (for port tests) and connections to unreachable
> > IPv4/IPv6 IPs to the local socket (for address tests).
> >
> > Keep in mind that both client to server and server to client traffic
> > passes the classifier.
> >
> > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > Co-authored-by: Joe Stringer <joe@wand.net.nz>
> > Signed-off-by: Joe Stringer <joe@wand.net.nz>
> > ---
> > v2: Rebase onto test_progs infrastructure
> > v1: Initial commit
> > ---
> >   tools/testing/selftests/bpf/Makefile          |   2 +-
> >   .../selftests/bpf/prog_tests/sk_assign.c      | 244 ++++++++++++++++++
> >   .../selftests/bpf/progs/test_sk_assign.c      | 127 +++++++++
> >   3 files changed, 372 insertions(+), 1 deletion(-)
> >   create mode 100644 tools/testing/selftests/bpf/prog_tests/sk_assign.c
> >   create mode 100644 tools/testing/selftests/bpf/progs/test_sk_assign.c
> >
> > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> > index 7729892e0b04..4f7f83d059ca 100644
> > --- a/tools/testing/selftests/bpf/Makefile
> > +++ b/tools/testing/selftests/bpf/Makefile
> > @@ -76,7 +76,7 @@ TEST_PROGS_EXTENDED := with_addr.sh \
> >   # Compile but not part of 'make run_tests'
> >   TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
> >       flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
> > -     test_lirc_mode2_user xdping test_cpp runqslower
> > +     test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign
>
> No test_sk_assign any more as the test is integrated into test_progs, right?
>
> >
> >   TEST_CUSTOM_PROGS = urandom_read
> >
> > diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> > new file mode 100644
> > index 000000000000..1f0afcc20c48
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
> > @@ -0,0 +1,244 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +// Copyright (c) 2018 Facebook
> > +// Copyright (c) 2019 Cloudflare
> > +// Copyright (c) 2020 Isovalent, Inc.
> > +/*
> > + * Test that the socket assign program is able to redirect traffic towards a
> > + * socket, regardless of whether the port or address destination of the traffic
> > + * matches the port.
> > + */
> > +
> > +#define _GNU_SOURCE
> > +#include <fcntl.h>
> > +#include <signal.h>
> > +#include <stdlib.h>
> > +#include <unistd.h>
> > +
> > +#include "test_progs.h"
> > +
> > +#define TEST_DPORT 4321
> > +#define TEST_DADDR (0xC0A80203)
> > +#define NS_SELF "/proc/self/ns/net"
> > +
> > +static __u32 duration;
> > +
> > +static bool configure_stack(int self_net)
>
> self_net parameter is not used.
>
> > +{
> > +     /* Move to a new networking namespace */
> > +     if (CHECK_FAIL(unshare(CLONE_NEWNET)))
> > +             return false;
>
> You can use CHECK to encode better error messages. Thhis is what
> most test_progs tests are using.
>
> > +     /* Configure necessary links, routes */
> > +     if (CHECK_FAIL(system("ip link set dev lo up")))
> > +             return false;
> > +     if (CHECK_FAIL(system("ip route add local default dev lo")))
> > +             return false;
> > +     if (CHECK_FAIL(system("ip -6 route add local default dev lo")))
> > +             return false;
> > +
> > +     /* Load qdisc, BPF program */
> > +     if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
> > +             return false;
> > +     if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action "
> > +                  "object-file ./test_sk_assign.o section sk_assign_test")))
> > +             return false;
> > +
> > +     return true;
> > +}
> > +
> > +static int start_server(const struct sockaddr *addr, socklen_t len)
> > +{
> > +     int fd;
> > +
> > +     fd = socket(addr->sa_family, SOCK_STREAM, 0);
> > +     if (CHECK_FAIL(fd == -1))
> > +             goto out;
> > +     if (CHECK_FAIL(bind(fd, addr, len) == -1))
> > +             goto close_out;
> > +     if (CHECK_FAIL(listen(fd, 128) == -1))
> > +             goto close_out;
> > +
> > +     goto out;
> > +
> > +close_out:
> > +     close(fd);
> > +     fd = -1;
> > +out:
> > +     return fd;
> > +}
> > +
> > +static void handle_timeout(int signum)
> > +{
> > +     if (signum == SIGALRM)
> > +             fprintf(stderr, "Timed out while connecting to server\n");
> > +     kill(0, SIGKILL);
> > +}
> > +
> > +static struct sigaction timeout_action = {
> > +     .sa_handler = handle_timeout,
> > +};
> > +
> > +static int connect_to_server(const struct sockaddr *addr, socklen_t len)
> > +{
> > +     int fd = -1;
> > +
> > +     fd = socket(addr->sa_family, SOCK_STREAM, 0);
> > +     if (CHECK_FAIL(fd == -1))
> > +             goto out;
> > +     if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
> > +             goto out;
>
> should this goto close_out?
>
> > +     alarm(3);
> > +     if (CHECK_FAIL(connect(fd, addr, len) == -1))
> > +             goto close_out;
> > +
> > +     goto out;
> > +
> > +close_out:
> > +     close(fd);
> > +     fd = -1;
> > +out:
> > +     return fd;
> > +}
> > +
> > +static in_port_t get_port(int fd)
> > +{
> > +     struct sockaddr_storage name;
> > +     socklen_t len;
> > +     in_port_t port = 0;
> > +
> > +     len = sizeof(name);
> > +     if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len)))
> > +             return port;
> > +
> > +     switch (name.ss_family) {
> > +     case AF_INET:
> > +             port = ((struct sockaddr_in *)&name)->sin_port;
> > +             break;
> > +     case AF_INET6:
> > +             port = ((struct sockaddr_in6 *)&name)->sin6_port;
> > +             break;
> > +     default:
> > +             CHECK(1, "Invalid address family", "%d\n", name.ss_family);
> > +     }
> > +     return port;
> > +}
> > +
> > +static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len)
> > +{
> > +     int client = -1, srv_client = -1;
> > +     char buf[] = "testing";
> > +     in_port_t port;
> > +     int ret = 1;
> > +
> > +     client = connect_to_server(addr, len);
> > +     if (client == -1) {
> > +             perror("Cannot connect to server");
> > +             goto out;
> > +     }
> > +
> > +     srv_client = accept(server_fd, NULL, NULL);
> > +     if (CHECK_FAIL(srv_client == -1)) {
> > +             perror("Can't accept connection");
> > +             goto out;
> > +     }
> > +     if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) {
> > +             perror("Can't write on client");
> > +             goto out;
> > +     }
> > +     if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) {
> > +             perror("Can't read on server");
> > +             goto out;
> > +     }
> > +
> > +     port = get_port(srv_client);
> > +     if (CHECK_FAIL(!port))
> > +             goto out;
> > +     if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u",
> > +               TEST_DPORT, ntohs(port)))
> > +             goto out;
> > +
> > +     ret = 0;
> > +out:
> > +     close(client);
> > +     close(srv_client);
> > +     return ret;
> > +}
> > +
> > +static int do_sk_assign(void)
> > +{
> > +     struct sockaddr_in addr4;
> > +     struct sockaddr_in6 addr6;
> > +     int server = -1;
> > +     int server_v6 = -1;
> > +     int err = 1;
> > +
> > +     memset(&addr4, 0, sizeof(addr4));
> > +     addr4.sin_family = AF_INET;
> > +     addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
> > +     addr4.sin_port = htons(1234);
> > +
> > +     memset(&addr6, 0, sizeof(addr6));
> > +     addr6.sin6_family = AF_INET6;
> > +     addr6.sin6_addr = in6addr_loopback;
> > +     addr6.sin6_port = htons(1234);
> > +
> > +     server = start_server((const struct sockaddr *)&addr4, sizeof(addr4));
> > +     if (server == -1)
> > +             goto out;
> > +
> > +     server_v6 = start_server((const struct sockaddr *)&addr6,
> > +                              sizeof(addr6));
> > +     if (server_v6 == -1)
> > +             goto out;
> > +
> > +     /* Connect to unbound ports */
> > +     addr4.sin_port = htons(TEST_DPORT);
> > +     addr6.sin6_port = htons(TEST_DPORT);
> > +
> > +     test__start_subtest("ipv4 port redir");
> > +     if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
> > +             goto out;
> > +
> > +     test__start_subtest("ipv6 port redir");
> > +     if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
> > +             goto out;
> > +
> > +     /* Connect to unbound addresses */
> > +     addr4.sin_addr.s_addr = htonl(TEST_DADDR);
> > +     addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR);
> > +
> > +     test__start_subtest("ipv4 addr redir");
> > +     if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
> > +             goto out;
> > +
> > +     test__start_subtest("ipv6 addr redir");
> > +     if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
> > +             goto out;
> > +
> > +     err = 0;
> > +out:
> > +     close(server);
> > +     close(server_v6);
> > +     return err;
> > +}
> > +
> > +void test_sk_assign(void)
> > +{
> > +     int self_net;
> > +
> > +     self_net = open(NS_SELF, O_RDONLY);
> > +     if (CHECK_FAIL(self_net < 0)) {
> > +             perror("Unable to open "NS_SELF);
> > +             return;
> > +     }
> > +
> > +     if (!configure_stack(self_net)) {
> > +             perror("configure_stack");
> > +             goto cleanup;
> > +     }
> > +
> > +     do_sk_assign();
> > +
> > +cleanup:
> > +     close(self_net);
>
> Did we exit the newly unshared net namespace and restored the previous
> namespace?
>
> > +}
> > diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c
> > new file mode 100644
> > index 000000000000..7de30ad3f594
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
> > @@ -0,0 +1,127 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +// Copyright (c) 2019 Cloudflare Ltd.
> > +
> > +#include <stddef.h>
> > +#include <stdbool.h>
> > +#include <string.h>
> > +#include <linux/bpf.h>
> > +#include <linux/if_ether.h>
> > +#include <linux/in.h>
> > +#include <linux/ip.h>
> > +#include <linux/ipv6.h>
> > +#include <linux/pkt_cls.h>
> > +#include <linux/tcp.h>
> > +#include <sys/socket.h>
> > +#include <bpf/bpf_helpers.h>
> > +#include <bpf/bpf_endian.h>
> > +
> > +int _version SEC("version") = 1;
> > +char _license[] SEC("license") = "GPL";
> > +
> > +/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
> > +static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off,
> > +                                     void *data_end, __u16 eth_proto,
> > +                                     bool *ipv4)
> > +{
> > +     struct bpf_sock_tuple *result;
> > +     __u8 proto = 0;
> > +     __u64 ihl_len;
> > +
> > +     if (eth_proto == bpf_htons(ETH_P_IP)) {
> > +             struct iphdr *iph = (struct iphdr *)(data + nh_off);
> > +
> > +             if (iph + 1 > data_end)
> > +                     return NULL;
> > +             if (iph->ihl != 5)
> > +                     /* Options are not supported */
> > +                     return NULL;
> > +             ihl_len = iph->ihl * 4;
> > +             proto = iph->protocol;
> > +             *ipv4 = true;
> > +             result = (struct bpf_sock_tuple *)&iph->saddr;
> > +     } else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
> > +             struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
> > +
> > +             if (ip6h + 1 > data_end)
> > +                     return NULL;
> > +             ihl_len = sizeof(*ip6h);
> > +             proto = ip6h->nexthdr;
> > +             *ipv4 = false;
> > +             result = (struct bpf_sock_tuple *)&ip6h->saddr;
> > +     } else {
> > +             return NULL;
> > +     }
> > +
> > +     if (result + 1 > data_end || proto != IPPROTO_TCP)
> > +             return NULL;
> > +
> > +     return result;
> > +}
> > +
> > +SEC("sk_assign_test")
> > +int bpf_sk_assign_test(struct __sk_buff *skb)
> > +{
> > +     void *data_end = (void *)(long)skb->data_end;
> > +     void *data = (void *)(long)skb->data;
> > +     struct ethhdr *eth = (struct ethhdr *)(data);
> > +     struct bpf_sock_tuple *tuple, ln = {0};
> > +     struct bpf_sock *sk;
> > +     int tuple_len;
> > +     bool ipv4;
> > +     int ret;
> > +
> > +     if (eth + 1 > data_end)
> > +             return TC_ACT_SHOT;
> > +
> > +     tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4);
> > +     if (!tuple)
> > +             return TC_ACT_SHOT;
> > +
> > +     tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
> > +     sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
>
> You can get rid of tuple_len with
>         if (ipv4)
>                 sk = bpf_skc_lookup_tcp(..., sizeof(tuple->ipv4), ...);
>         else
>                 sk = bpf_skc_lookup_tcp(..., sizeof(tuple->ipv6), ...);
>
> and later on you can do common bpf_skc_lookup_tcp.
> But it may not be worthwhile to do it, as you have two separate calls
> in the above instead.
>
> > +     if (sk) {
> > +             if (sk->state != BPF_TCP_LISTEN)
> > +                     goto assign;
> > +
> > +             bpf_sk_release(sk);
> > +     }
> > +
> > +     if (ipv4) {
> > +             if (tuple->ipv4.dport != bpf_htons(4321))
> > +                     return TC_ACT_OK;
> > +
> > +             ln.ipv4.daddr = bpf_htonl(0x7f000001);
> > +             ln.ipv4.dport = bpf_htons(1234);
> > +
> > +             sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
> > +                                     BPF_F_CURRENT_NETNS, 0);
> > +     } else {
> > +             if (tuple->ipv6.dport != bpf_htons(4321))
> > +                     return TC_ACT_OK;
> > +
> > +             /* Upper parts of daddr are already zero. */
> > +             ln.ipv6.daddr[3] = bpf_htonl(0x1);
> > +             ln.ipv6.dport = bpf_htons(1234);
> > +
> > +             sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
> > +                                     BPF_F_CURRENT_NETNS, 0);
> > +     }
> > +
> > +     /* We can't do a single skc_lookup_tcp here, because then the compiler
> > +      * will likely spill tuple_len to the stack. This makes it lose all
> > +      * bounds information in the verifier, which then rejects the call as
> > +      * unsafe.
> > +      */
>
> This is a known issue. For scalars, only constant is restored properly
> in verifier at this moment. I did some hacking before to enable any
> scalars. The fear is this will make pruning performs worse. More
> study is needed here.

Of topic, but: this is actually one of the most challenging issues for
us when writing
BPF. It forces us to have very deep call graphs to hopefully avoid clang
spilling the constants. Please let me know if I can help in any way.

>
> > +     if (!sk)
> > +             return TC_ACT_SHOT;
> > +
> > +     if (sk->state != BPF_TCP_LISTEN) {
> > +             bpf_sk_release(sk);
> > +             return TC_ACT_SHOT;
> > +     }
> > +
> > +assign:
> > +     ret = bpf_sk_assign(skb, sk, 0);
> > +     bpf_sk_release(sk);
> > +     return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT;
> > +}
> >
Andrii Nakryiko March 26, 2020, 7:36 p.m. UTC | #14
On Wed, Mar 25, 2020 at 10:28 PM Joe Stringer <joe@wand.net.nz> wrote:
>
> On Wed, Mar 25, 2020 at 7:16 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote:
> > >
> > > From: Lorenz Bauer <lmb@cloudflare.com>
> > >
> > > Attach a tc direct-action classifier to lo in a fresh network
> > > namespace, and rewrite all connection attempts to localhost:4321
> > > to localhost:1234 (for port tests) and connections to unreachable
> > > IPv4/IPv6 IPs to the local socket (for address tests).
> > >
> > > Keep in mind that both client to server and server to client traffic
> > > passes the classifier.
> > >
> > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > > Co-authored-by: Joe Stringer <joe@wand.net.nz>
> > > Signed-off-by: Joe Stringer <joe@wand.net.nz>
> > > ---
>
> <snip>
>
> > > +static void handle_timeout(int signum)
> > > +{
> > > +       if (signum == SIGALRM)
> > > +               fprintf(stderr, "Timed out while connecting to server\n");
> > > +       kill(0, SIGKILL);
> > > +}
> > > +
> > > +static struct sigaction timeout_action = {
> > > +       .sa_handler = handle_timeout,
> > > +};
> > > +
> > > +static int connect_to_server(const struct sockaddr *addr, socklen_t len)
> > > +{
> > > +       int fd = -1;
> > > +
> > > +       fd = socket(addr->sa_family, SOCK_STREAM, 0);
> > > +       if (CHECK_FAIL(fd == -1))
> > > +               goto out;
> > > +       if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
> > > +               goto out;
> >
> > no-no-no, we are not doing this. It's part of prog_tests and shouldn't
> > install its own signal handlers and sending asynchronous signals to
> > itself. Please find another way to have a timeout.
>
> I realise it didn't clean up after itself. How about signal(SIGALRM,
> SIG_DFL); just like other existing tests do?

You have alarm(3), which will deliver SIGALARM later, when other test
is running. Once you clean up this custom signal handler it will cause
test_progs to coredump. So still no, please find another way to do
timeouts.


>
> > > +       test__start_subtest("ipv4 addr redir");
> > > +       if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
> > > +               goto out;
> > > +
> > > +       test__start_subtest("ipv6 addr redir");
> >
> > test__start_subtest() returns false if subtest is supposed to be
> > skipped. If you ignore that, then test_progs -t and -n filtering won't
> > work properly.
>
> Will fix.
>
> > > +       if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
> > > +               goto out;
> > > +
> > > +       err = 0;
> > > +out:
> > > +       close(server);
> > > +       close(server_v6);
> > > +       return err;
> > > +}
> > > +
> > > +void test_sk_assign(void)
> > > +{
> > > +       int self_net;
> > > +
> > > +       self_net = open(NS_SELF, O_RDONLY);
> >
> > I'm not familiar with what this does. Can you please explain briefly
> > what are the side effects of this? Asking because of shared test_progs
> > environment worries, of course.
>
> This one is opening an fd to the current program's netns path on the
> filesystem. The intention was to use it to switch back to the current
> netns after the unshare() call elsewhere which switches to a new
> netns. As per the other feedback the bit where it switches back to
> this netns was dropped along the way so I'll fix it up in the next
> revision.
>
> > > +SEC("sk_assign_test")
> >
> > Please use "canonical" section name that libbpf recognizes. This
> > woulde be "action/" or "classifier/", right?
>
> Will fix.
Alexei Starovoitov March 26, 2020, 9:07 p.m. UTC | #15
On Thu, Mar 26, 2020 at 10:13:31AM +0000, Lorenz Bauer wrote:
> > > +
> > > +     if (ipv4) {
> > > +             if (tuple->ipv4.dport != bpf_htons(4321))
> > > +                     return TC_ACT_OK;
> > > +
> > > +             ln.ipv4.daddr = bpf_htonl(0x7f000001);
> > > +             ln.ipv4.dport = bpf_htons(1234);
> > > +
> > > +             sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
> > > +                                     BPF_F_CURRENT_NETNS, 0);
> > > +     } else {
> > > +             if (tuple->ipv6.dport != bpf_htons(4321))
> > > +                     return TC_ACT_OK;
> > > +
> > > +             /* Upper parts of daddr are already zero. */
> > > +             ln.ipv6.daddr[3] = bpf_htonl(0x1);
> > > +             ln.ipv6.dport = bpf_htons(1234);
> > > +
> > > +             sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
> > > +                                     BPF_F_CURRENT_NETNS, 0);
> > > +     }
> > > +
> > > +     /* We can't do a single skc_lookup_tcp here, because then the compiler
> > > +      * will likely spill tuple_len to the stack. This makes it lose all
> > > +      * bounds information in the verifier, which then rejects the call as
> > > +      * unsafe.
> > > +      */
> >
> > This is a known issue. For scalars, only constant is restored properly
> > in verifier at this moment. I did some hacking before to enable any
> > scalars. The fear is this will make pruning performs worse. More
> > study is needed here.
> 
> Of topic, but: this is actually one of the most challenging issues for
> us when writing
> BPF. It forces us to have very deep call graphs to hopefully avoid clang
> spilling the constants. Please let me know if I can help in any way.

Thanks for bringing this up.
Yonghong, please correct me if I'm wrong.
I think you've experimented with tracking spilled constants. The first issue
came with spilling of 4 byte constant. The verifier tracks 8 byte slots and
lots of places assume that slot granularity. It's not clear yet how to refactor
the verifier. Ideas, help are greatly appreciated.
The second concern was pruning, but iirc the experiments were inconclusive.
selftests/bpf only has old fb progs. Hence, I think, the step zero is for
everyone to contribute their bpf programs written in C. If we have both
cilium and cloudflare progs as selftests it will help a lot to guide such long
lasting verifier decisions.
Joe Stringer March 26, 2020, 9:38 p.m. UTC | #16
On Thu, Mar 26, 2020 at 12:37 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Wed, Mar 25, 2020 at 10:28 PM Joe Stringer <joe@wand.net.nz> wrote:
> >
> > On Wed, Mar 25, 2020 at 7:16 PM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Tue, Mar 24, 2020 at 10:58 PM Joe Stringer <joe@wand.net.nz> wrote:
> > > >
> > > > From: Lorenz Bauer <lmb@cloudflare.com>
> > > >
> > > > Attach a tc direct-action classifier to lo in a fresh network
> > > > namespace, and rewrite all connection attempts to localhost:4321
> > > > to localhost:1234 (for port tests) and connections to unreachable
> > > > IPv4/IPv6 IPs to the local socket (for address tests).
> > > >
> > > > Keep in mind that both client to server and server to client traffic
> > > > passes the classifier.
> > > >
> > > > Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> > > > Co-authored-by: Joe Stringer <joe@wand.net.nz>
> > > > Signed-off-by: Joe Stringer <joe@wand.net.nz>
> > > > ---
> >
> > <snip>
> >
> > > > +static void handle_timeout(int signum)
> > > > +{
> > > > +       if (signum == SIGALRM)
> > > > +               fprintf(stderr, "Timed out while connecting to server\n");
> > > > +       kill(0, SIGKILL);
> > > > +}
> > > > +
> > > > +static struct sigaction timeout_action = {
> > > > +       .sa_handler = handle_timeout,
> > > > +};
> > > > +
> > > > +static int connect_to_server(const struct sockaddr *addr, socklen_t len)
> > > > +{
> > > > +       int fd = -1;
> > > > +
> > > > +       fd = socket(addr->sa_family, SOCK_STREAM, 0);
> > > > +       if (CHECK_FAIL(fd == -1))
> > > > +               goto out;
> > > > +       if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
> > > > +               goto out;
> > >
> > > no-no-no, we are not doing this. It's part of prog_tests and shouldn't
> > > install its own signal handlers and sending asynchronous signals to
> > > itself. Please find another way to have a timeout.
> >
> > I realise it didn't clean up after itself. How about signal(SIGALRM,
> > SIG_DFL); just like other existing tests do?
>
> You have alarm(3), which will deliver SIGALARM later, when other test
> is running. Once you clean up this custom signal handler it will cause
> test_progs to coredump. So still no, please find another way to do
> timeouts.

FWIW I hit this issue and alarm(0) afterwards fixed it up.

That said the SO_SNDTIMEO / SO_RCVTIMEO alternative should work fine
for this use case instead so I'll switch to that, it's marginally
cleaner.
Yonghong Song March 26, 2020, 11:14 p.m. UTC | #17
On 3/26/20 2:07 PM, Alexei Starovoitov wrote:
> On Thu, Mar 26, 2020 at 10:13:31AM +0000, Lorenz Bauer wrote:
>>>> +
>>>> +     if (ipv4) {
>>>> +             if (tuple->ipv4.dport != bpf_htons(4321))
>>>> +                     return TC_ACT_OK;
>>>> +
>>>> +             ln.ipv4.daddr = bpf_htonl(0x7f000001);
>>>> +             ln.ipv4.dport = bpf_htons(1234);
>>>> +
>>>> +             sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
>>>> +                                     BPF_F_CURRENT_NETNS, 0);
>>>> +     } else {
>>>> +             if (tuple->ipv6.dport != bpf_htons(4321))
>>>> +                     return TC_ACT_OK;
>>>> +
>>>> +             /* Upper parts of daddr are already zero. */
>>>> +             ln.ipv6.daddr[3] = bpf_htonl(0x1);
>>>> +             ln.ipv6.dport = bpf_htons(1234);
>>>> +
>>>> +             sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
>>>> +                                     BPF_F_CURRENT_NETNS, 0);
>>>> +     }
>>>> +
>>>> +     /* We can't do a single skc_lookup_tcp here, because then the compiler
>>>> +      * will likely spill tuple_len to the stack. This makes it lose all
>>>> +      * bounds information in the verifier, which then rejects the call as
>>>> +      * unsafe.
>>>> +      */
>>>
>>> This is a known issue. For scalars, only constant is restored properly
>>> in verifier at this moment. I did some hacking before to enable any
>>> scalars. The fear is this will make pruning performs worse. More
>>> study is needed here.
>>
>> Of topic, but: this is actually one of the most challenging issues for
>> us when writing
>> BPF. It forces us to have very deep call graphs to hopefully avoid clang
>> spilling the constants. Please let me know if I can help in any way.
> 
> Thanks for bringing this up.
> Yonghong, please correct me if I'm wrong.

Yes. The summary below is correct. For reference, the below bcc issue
documents some of my investigation:
   https://github.com/iovisor/bcc/issues/2463

> I think you've experimented with tracking spilled constants. The first issue
> came with spilling of 4 byte constant. The verifier tracks 8 byte slots and
> lots of places assume that slot granularity. It's not clear yet how to refactor
> the verifier. Ideas, help are greatly appreciated.

I cannot remember exactly what I did then. Probably remember the spilled 
size too. Since the hack is never peer reviewed, maybe my approach has bugs.

> The second concern was pruning, but iirc the experiments were inconclusive.
> selftests/bpf only has old fb progs. Hence, I think, the step zero is for
> everyone to contribute their bpf programs written in C. If we have both
> cilium and cloudflare progs as selftests it will help a lot to guide such long
> lasting verifier decisions.

Yes, this is inconclusive and I did not do any active investigation here
since just enhancing the non-const spill won't resolve the above issue.
But totally agree that if we had an implementation, we should measure
its impact on verifier speed.
Joe Stringer March 26, 2020, 11:39 p.m. UTC | #18
On Wed, Mar 25, 2020 at 11:38 PM Joe Stringer <joe@wand.net.nz> wrote:
>
> On Wed, Mar 25, 2020 at 11:25 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> > On Wed, Mar 25, 2020 at 01:55:59PM -0700, Joe Stringer wrote:
> > > On Wed, Mar 25, 2020 at 3:35 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
> > > >
> > > > On Wed, 25 Mar 2020 at 05:58, Joe Stringer <joe@wand.net.nz> wrote:
> > > > >
> > > > > From: Lorenz Bauer <lmb@cloudflare.com>
> > > > >
> > > > > Attach a tc direct-action classifier to lo in a fresh network
> > > > > namespace, and rewrite all connection attempts to localhost:4321
> > > > > to localhost:1234 (for port tests) and connections to unreachable
> > > > > IPv4/IPv6 IPs to the local socket (for address tests).
> > > >
> > > > Can you extend this to cover UDP as well?
> > >
> > > I'm working on a follow-up series for UDP, we need this too.
> > Other than selftests, what are the changes for UDP in patch 1 - 4?
>
> Nothing in those patches, I have refactoring of all of the socket
> helpers, skc_lookup_udp() and adding flags to the socket lookup
> functions to support only looking for a certain type of sockets -
> established or listen. This helps to avoid multiple lookups in these
> cases where you really just want to look up established sockets with
> the packet tuple first then look up the listener socket with the
> unrelated/tproxy tuple. For UDP it makes it easier to find the correct
> socket and in general (including TCP) helps to avoid up to two socket
> hashtable lookups for this use case. This part is because the current
> helpers all look up the established socket first then the listener
> socket, so for the first packet that hits these we perform both of
> these lookups for the packet tuple (which finds nothing), then look up
> an established socket for the target tuple (which finds nothing) then
> finally a listen socket for the target tuple. It's about another 300+
> / 250- changes overall, of which a large chunk is one patch that
> refactors the code into macros. I haven't narrowed down for sure
> whether the lookup flags patch is required for UDP cases yet.

FWIW I did some more testing and it was not apparent that
skc_lookup_udp is at all necessary, I was able to roll in UDP support
in the next revision of this series with no special extra patches.

I'll keep working on those other optimizations in the background though.
Lorenz Bauer March 27, 2020, 10:02 a.m. UTC | #19
On Thu, 26 Mar 2020 at 21:07, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 26, 2020 at 10:13:31AM +0000, Lorenz Bauer wrote:
> > > > +
> > > > +     if (ipv4) {
> > > > +             if (tuple->ipv4.dport != bpf_htons(4321))
> > > > +                     return TC_ACT_OK;
> > > > +
> > > > +             ln.ipv4.daddr = bpf_htonl(0x7f000001);
> > > > +             ln.ipv4.dport = bpf_htons(1234);
> > > > +
> > > > +             sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
> > > > +                                     BPF_F_CURRENT_NETNS, 0);
> > > > +     } else {
> > > > +             if (tuple->ipv6.dport != bpf_htons(4321))
> > > > +                     return TC_ACT_OK;
> > > > +
> > > > +             /* Upper parts of daddr are already zero. */
> > > > +             ln.ipv6.daddr[3] = bpf_htonl(0x1);
> > > > +             ln.ipv6.dport = bpf_htons(1234);
> > > > +
> > > > +             sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
> > > > +                                     BPF_F_CURRENT_NETNS, 0);
> > > > +     }
> > > > +
> > > > +     /* We can't do a single skc_lookup_tcp here, because then the compiler
> > > > +      * will likely spill tuple_len to the stack. This makes it lose all
> > > > +      * bounds information in the verifier, which then rejects the call as
> > > > +      * unsafe.
> > > > +      */
> > >
> > > This is a known issue. For scalars, only constant is restored properly
> > > in verifier at this moment. I did some hacking before to enable any
> > > scalars. The fear is this will make pruning performs worse. More
> > > study is needed here.
> >
> > Of topic, but: this is actually one of the most challenging issues for
> > us when writing
> > BPF. It forces us to have very deep call graphs to hopefully avoid clang
> > spilling the constants. Please let me know if I can help in any way.
>
> Thanks for bringing this up.
> Yonghong, please correct me if I'm wrong.
> I think you've experimented with tracking spilled constants. The first issue
> came with spilling of 4 byte constant. The verifier tracks 8 byte slots and
> lots of places assume that slot granularity. It's not clear yet how to refactor
> the verifier. Ideas, help are greatly appreciated.
> The second concern was pruning, but iirc the experiments were inconclusive.
> selftests/bpf only has old fb progs. Hence, I think, the step zero is for
> everyone to contribute their bpf programs written in C. If we have both
> cilium and cloudflare progs as selftests it will help a lot to guide such long
> lasting verifier decisions.

Ok, I'll try to get something sorted out. We have a TC classifier that
would be suitable,
and I've been meaning to get it open sourced. Does the integration into the
test suite have to involve running packets through it, or is compile
and load enough?
Alexei Starovoitov March 27, 2020, 4:08 p.m. UTC | #20
On Fri, Mar 27, 2020 at 3:03 AM Lorenz Bauer <lmb@cloudflare.com> wrote:
> >
> > Thanks for bringing this up.
> > Yonghong, please correct me if I'm wrong.
> > I think you've experimented with tracking spilled constants. The first issue
> > came with spilling of 4 byte constant. The verifier tracks 8 byte slots and
> > lots of places assume that slot granularity. It's not clear yet how to refactor
> > the verifier. Ideas, help are greatly appreciated.
> > The second concern was pruning, but iirc the experiments were inconclusive.
> > selftests/bpf only has old fb progs. Hence, I think, the step zero is for
> > everyone to contribute their bpf programs written in C. If we have both
> > cilium and cloudflare progs as selftests it will help a lot to guide such long
> > lasting verifier decisions.
>
> Ok, I'll try to get something sorted out. We have a TC classifier that
> would be suitable,
> and I've been meaning to get it open sourced. Does the integration into the
> test suite have to involve running packets through it, or is compile
> and load enough?

It would be great if you can add it as part of test_progs and run it
with one or two packets via prog_test_run like all the tests do.
Thanks!
Joe Stringer March 27, 2020, 7:06 p.m. UTC | #21
On Thu, Mar 26, 2020 at 2:07 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> The second concern was pruning, but iirc the experiments were inconclusive.
> selftests/bpf only has old fb progs. Hence, I think, the step zero is for
> everyone to contribute their bpf programs written in C. If we have both
> cilium and cloudflare progs as selftests it will help a lot to guide such long
> lasting verifier decisions.

How would you like to handle program changes over time for this?

In Cilium community we periodically rebuild bpf-next VM images for
testing, and run every pull request against those images[0]. We also
test against specific older kernels, currently 4.9 and 4.19. This
allows us to get some sense for the impact of upstream changes while
developing Cilium features, but unfortunately doesn't allow everyone
using kernel selftests to get that feedback at least from the kernel
tree. We also have a verifier complexity test script where we compile
with the maximum number of features (to ideally generate the most
complex programs possible) then attempt to load all of the various
programs, and output the complexity count that the kernel reports[1,2]
which we can track over time.

However Cilium BPF programs are actively developing and even if we
merge these programs into the kernel tree, they will get out-of-date
quickly. Up until recently everything was verifying fine compiling
with LLVM7 and loading into bpf-next. Over the past month we started
noticing new issues not with the existing implementation, but in *new*
BPF features. As we increased complexity, our CI started failing
against bpf-next[3] while they loaded fine on older kernels. We ended
up mitigating by upgrading to LLVM-10. Long story short, there's
several moving parts; changing BPF program implementations, changing
the compiler toolchain, changing the kernel verifier. So my question
is basically, where's the line of responsibility for what the kernel
selftests are responsible for vs integration tests? How do we maintain
those over time as the BPF programs and compiler changes?

Do we just parachute the ~11K LoC of Cilium datapath into the kernel
tree once per cycle? Or should Cilium autobuild a verifier-test docker
image that kernel testing scripts can pull & run? Or would it be
helpful to have a separate GitHub project similar to libbpf that pulls
out kernel selftests, Cilium progs, fb progs, cloudflare progs, etc
automatically and centralizes a generic suite of BPF verifier
integration tests? Some other option?

[0] https://github.com/cilium/packer-ci-build
[1] https://github.com/cilium/cilium/blob/master/test/bpf/check-complexity.sh
[2] https://github.com/cilium/cilium/blob/master/test/bpf/verifier-test.sh
[3] https://github.com/cilium/cilium/issues/10517
Daniel Borkmann March 27, 2020, 8:16 p.m. UTC | #22
On 3/27/20 8:06 PM, Joe Stringer wrote:
> On Thu, Mar 26, 2020 at 2:07 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>> The second concern was pruning, but iirc the experiments were inconclusive.
>> selftests/bpf only has old fb progs. Hence, I think, the step zero is for
>> everyone to contribute their bpf programs written in C. If we have both
>> cilium and cloudflare progs as selftests it will help a lot to guide such long
>> lasting verifier decisions.
> 
> How would you like to handle program changes over time for this?
> 
> In Cilium community we periodically rebuild bpf-next VM images for
> testing, and run every pull request against those images[0]. We also
> test against specific older kernels, currently 4.9 and 4.19. This
> allows us to get some sense for the impact of upstream changes while
> developing Cilium features, but unfortunately doesn't allow everyone
> using kernel selftests to get that feedback at least from the kernel
> tree. We also have a verifier complexity test script where we compile
> with the maximum number of features (to ideally generate the most
> complex programs possible) then attempt to load all of the various
> programs, and output the complexity count that the kernel reports[1,2]
> which we can track over time.
> 
> However Cilium BPF programs are actively developing and even if we
> merge these programs into the kernel tree, they will get out-of-date
> quickly. Up until recently everything was verifying fine compiling
> with LLVM7 and loading into bpf-next. Over the past month we started
> noticing new issues not with the existing implementation, but in *new*
> BPF features. As we increased complexity, our CI started failing
> against bpf-next[3] while they loaded fine on older kernels. We ended
> up mitigating by upgrading to LLVM-10. Long story short, there's
> several moving parts; changing BPF program implementations, changing
> the compiler toolchain, changing the kernel verifier. So my question
> is basically, where's the line of responsibility for what the kernel
> selftests are responsible for vs integration tests? How do we maintain
> those over time as the BPF programs and compiler changes?

I wonder whether it would make sense to create test cases which are based
on our cilium/cilium:latest docker image. Those would be run as part of
BPF kernel selftests as well as part of our own CI for every PR. I think
it could be some basic connectivity test, service mapping, etc with a
bunch of application containers.

One nice thing that just came to mind is that it would actually allow for
easy testing of latest clang/llvm git by rerunning the test script and
remapping them as a volume, e.g.:

   docker run -it -v /root/clang+llvm-7.0.1-x86_64-linux-gnu-ubuntu-18.04/bin/clang:/bin/clang \
                  -v /root/clang+llvm-7.0.1-x86_64-linux-gnu-ubuntu-18.04/bin/llc:/bin/llc \
          cilium/cilium:latest /bin/sh

Perhaps that would be more useful and always up to date than a copy of the
code base that would get stale next day? In the end in this context kernel
changes and/or llvm changes might be of interest to check whether anything
potentially blows up, so having a self-contained packaging might be useful.
Thoughts?

> Do we just parachute the ~11K LoC of Cilium datapath into the kernel
> tree once per cycle? Or should Cilium autobuild a verifier-test docker
> image that kernel testing scripts can pull & run? Or would it be
> helpful to have a separate GitHub project similar to libbpf that pulls
> out kernel selftests, Cilium progs, fb progs, cloudflare progs, etc
> automatically and centralizes a generic suite of BPF verifier
> integration tests? Some other option?
> 
> [0] https://github.com/cilium/packer-ci-build
> [1] https://github.com/cilium/cilium/blob/master/test/bpf/check-complexity.sh
> [2] https://github.com/cilium/cilium/blob/master/test/bpf/verifier-test.sh
> [3] https://github.com/cilium/cilium/issues/10517
>
Alexei Starovoitov March 27, 2020, 10:24 p.m. UTC | #23
On Fri, Mar 27, 2020 at 09:16:52PM +0100, Daniel Borkmann wrote:
> 
> Perhaps that would be more useful and always up to date than a copy of the
> code base that would get stale next day? In the end in this context kernel
> changes and/or llvm changes might be of interest to check whether anything
> potentially blows up, so having a self-contained packaging might be useful.
> Thoughts?

Right now we have zero cilium progs in selftest :)
so any number of progs is better than nothing.

> > Do we just parachute the ~11K LoC of Cilium datapath into the kernel
> > tree once per cycle? 

I don't think 11k progs updated every kernel release will catch
much more than basic reduced set.
selftests/bpf is not a substitute for cilium CI.
I would prefer .c tests to be developed once and frozen.
More tests can be added every 6 month or so.
I think copy-paste is ok-ish too, but would be much better
to think through about aspects of the code first.
It worked well for fb xdp/tc progs. I took some of them,
refactored them to selftest/Makefile and they were kept as-is for
the last 3 years. While real progs kept being actively changed.
For example: progs/test_l4lb.c is about 1/10th of the real deal here:
https://github.com/facebookincubator/katran/tree/master/katran/lib/bpf
In terms of lines code, number of includes and so on.
While hacking katran into selftest I tried to capture the _style_ of C code.
Like:
 bool get_packet_dst(struct real_definition **real,
                     struct packet_description *pckt,
                     struct vip_meta *vip_info,
                     bool is_ipv6)
I wouldn't have written the prototype this way.
Passing double pointer as a return value as a first argument is not my style :)
The style of nested 'if', etc. Those are the things that make clang
generate specific code patterns that I was trying to preserve in selftests.

Example 2: progs/strobemeta.h is about 1/4 of real thing.
Yet frozen it was super useful for the work on bounded loops.

Example 3: progs/test_get_stack_rawtp.c is one specific code pattern
that used in our profiling prog.
This one was immensely helpful to track down jmp32/alu32 bugs.
It's the one that we're still fixing (see John's jmp32 work).

and so on.
selftests/bpf/progs don't need to be full copy-paste. Ideally the capture
the essence of the progs in the short form.

clang bpf backend and verifier smartness keep changing. So having
frozen selftests is necessary to see the trend and capture bugs
in the backend and in the verifier.
Andrii Nakryiko March 28, 2020, 12:17 a.m. UTC | #24
On Fri, Mar 27, 2020 at 12:07 PM Joe Stringer <joe@wand.net.nz> wrote:
>
> On Thu, Mar 26, 2020 at 2:07 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> > The second concern was pruning, but iirc the experiments were inconclusive.
> > selftests/bpf only has old fb progs. Hence, I think, the step zero is for
> > everyone to contribute their bpf programs written in C. If we have both
> > cilium and cloudflare progs as selftests it will help a lot to guide such long
> > lasting verifier decisions.
>
> How would you like to handle program changes over time for this?
>
> In Cilium community we periodically rebuild bpf-next VM images for
> testing, and run every pull request against those images[0]. We also
> test against specific older kernels, currently 4.9 and 4.19. This
> allows us to get some sense for the impact of upstream changes while
> developing Cilium features, but unfortunately doesn't allow everyone
> using kernel selftests to get that feedback at least from the kernel
> tree. We also have a verifier complexity test script where we compile
> with the maximum number of features (to ideally generate the most
> complex programs possible) then attempt to load all of the various
> programs, and output the complexity count that the kernel reports[1,2]
> which we can track over time.
>
> However Cilium BPF programs are actively developing and even if we
> merge these programs into the kernel tree, they will get out-of-date
> quickly. Up until recently everything was verifying fine compiling
> with LLVM7 and loading into bpf-next. Over the past month we started
> noticing new issues not with the existing implementation, but in *new*
> BPF features. As we increased complexity, our CI started failing
> against bpf-next[3] while they loaded fine on older kernels. We ended
> up mitigating by upgrading to LLVM-10. Long story short, there's
> several moving parts; changing BPF program implementations, changing
> the compiler toolchain, changing the kernel verifier. So my question
> is basically, where's the line of responsibility for what the kernel
> selftests are responsible for vs integration tests? How do we maintain
> those over time as the BPF programs and compiler changes?

Just wanted to point out that libbpf's Github CI has multi-kernel
testing, so we'll be able to capture regressions on old kernels that
are caused by libbpf and/or nightly clang (we are currently pulling
clang-11 from nightly packages). We are also testing against latest
kernel as well, so if they break, we'll need to fix them. Which is why
I'd like those programs to be manageable in size and complexity and a
simple part of test_progs, not in some Docker container :)

>
> Do we just parachute the ~11K LoC of Cilium datapath into the kernel
> tree once per cycle? Or should Cilium autobuild a verifier-test docker
> image that kernel testing scripts can pull & run? Or would it be
> helpful to have a separate GitHub project similar to libbpf that pulls
> out kernel selftests, Cilium progs, fb progs, cloudflare progs, etc
> automatically and centralizes a generic suite of BPF verifier
> integration tests? Some other option?
>
> [0] https://github.com/cilium/packer-ci-build
> [1] https://github.com/cilium/cilium/blob/master/test/bpf/check-complexity.sh
> [2] https://github.com/cilium/cilium/blob/master/test/bpf/verifier-test.sh
> [3] https://github.com/cilium/cilium/issues/10517
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7729892e0b04..4f7f83d059ca 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -76,7 +76,7 @@  TEST_PROGS_EXTENDED := with_addr.sh \
 # Compile but not part of 'make run_tests'
 TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
 	flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
-	test_lirc_mode2_user xdping test_cpp runqslower
+	test_lirc_mode2_user xdping test_cpp runqslower test_sk_assign
 
 TEST_CUSTOM_PROGS = urandom_read
 
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
new file mode 100644
index 000000000000..1f0afcc20c48
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
@@ -0,0 +1,244 @@ 
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+// Copyright (c) 2019 Cloudflare
+// Copyright (c) 2020 Isovalent, Inc.
+/*
+ * Test that the socket assign program is able to redirect traffic towards a
+ * socket, regardless of whether the port or address destination of the traffic
+ * matches the port.
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "test_progs.h"
+
+#define TEST_DPORT 4321
+#define TEST_DADDR (0xC0A80203)
+#define NS_SELF "/proc/self/ns/net"
+
+static __u32 duration;
+
+static bool configure_stack(int self_net)
+{
+	/* Move to a new networking namespace */
+	if (CHECK_FAIL(unshare(CLONE_NEWNET)))
+		return false;
+
+	/* Configure necessary links, routes */
+	if (CHECK_FAIL(system("ip link set dev lo up")))
+		return false;
+	if (CHECK_FAIL(system("ip route add local default dev lo")))
+		return false;
+	if (CHECK_FAIL(system("ip -6 route add local default dev lo")))
+		return false;
+
+	/* Load qdisc, BPF program */
+	if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
+		return false;
+	if (CHECK_FAIL(system("tc filter add dev lo ingress bpf direct-action "
+		     "object-file ./test_sk_assign.o section sk_assign_test")))
+		return false;
+
+	return true;
+}
+
+static int start_server(const struct sockaddr *addr, socklen_t len)
+{
+	int fd;
+
+	fd = socket(addr->sa_family, SOCK_STREAM, 0);
+	if (CHECK_FAIL(fd == -1))
+		goto out;
+	if (CHECK_FAIL(bind(fd, addr, len) == -1))
+		goto close_out;
+	if (CHECK_FAIL(listen(fd, 128) == -1))
+		goto close_out;
+
+	goto out;
+
+close_out:
+	close(fd);
+	fd = -1;
+out:
+	return fd;
+}
+
+static void handle_timeout(int signum)
+{
+	if (signum == SIGALRM)
+		fprintf(stderr, "Timed out while connecting to server\n");
+	kill(0, SIGKILL);
+}
+
+static struct sigaction timeout_action = {
+	.sa_handler = handle_timeout,
+};
+
+static int connect_to_server(const struct sockaddr *addr, socklen_t len)
+{
+	int fd = -1;
+
+	fd = socket(addr->sa_family, SOCK_STREAM, 0);
+	if (CHECK_FAIL(fd == -1))
+		goto out;
+	if (CHECK_FAIL(sigaction(SIGALRM, &timeout_action, NULL)))
+		goto out;
+	alarm(3);
+	if (CHECK_FAIL(connect(fd, addr, len) == -1))
+		goto close_out;
+
+	goto out;
+
+close_out:
+	close(fd);
+	fd = -1;
+out:
+	return fd;
+}
+
+static in_port_t get_port(int fd)
+{
+	struct sockaddr_storage name;
+	socklen_t len;
+	in_port_t port = 0;
+
+	len = sizeof(name);
+	if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&name, &len)))
+		return port;
+
+	switch (name.ss_family) {
+	case AF_INET:
+		port = ((struct sockaddr_in *)&name)->sin_port;
+		break;
+	case AF_INET6:
+		port = ((struct sockaddr_in6 *)&name)->sin6_port;
+		break;
+	default:
+		CHECK(1, "Invalid address family", "%d\n", name.ss_family);
+	}
+	return port;
+}
+
+static int run_test(int server_fd, const struct sockaddr *addr, socklen_t len)
+{
+	int client = -1, srv_client = -1;
+	char buf[] = "testing";
+	in_port_t port;
+	int ret = 1;
+
+	client = connect_to_server(addr, len);
+	if (client == -1) {
+		perror("Cannot connect to server");
+		goto out;
+	}
+
+	srv_client = accept(server_fd, NULL, NULL);
+	if (CHECK_FAIL(srv_client == -1)) {
+		perror("Can't accept connection");
+		goto out;
+	}
+	if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) {
+		perror("Can't write on client");
+		goto out;
+	}
+	if (CHECK_FAIL(read(srv_client, buf, sizeof(buf)) != sizeof(buf))) {
+		perror("Can't read on server");
+		goto out;
+	}
+
+	port = get_port(srv_client);
+	if (CHECK_FAIL(!port))
+		goto out;
+	if (CHECK(port != htons(TEST_DPORT), "Expected", "port %u but got %u",
+		  TEST_DPORT, ntohs(port)))
+		goto out;
+
+	ret = 0;
+out:
+	close(client);
+	close(srv_client);
+	return ret;
+}
+
+static int do_sk_assign(void)
+{
+	struct sockaddr_in addr4;
+	struct sockaddr_in6 addr6;
+	int server = -1;
+	int server_v6 = -1;
+	int err = 1;
+
+	memset(&addr4, 0, sizeof(addr4));
+	addr4.sin_family = AF_INET;
+	addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+	addr4.sin_port = htons(1234);
+
+	memset(&addr6, 0, sizeof(addr6));
+	addr6.sin6_family = AF_INET6;
+	addr6.sin6_addr = in6addr_loopback;
+	addr6.sin6_port = htons(1234);
+
+	server = start_server((const struct sockaddr *)&addr4, sizeof(addr4));
+	if (server == -1)
+		goto out;
+
+	server_v6 = start_server((const struct sockaddr *)&addr6,
+				 sizeof(addr6));
+	if (server_v6 == -1)
+		goto out;
+
+	/* Connect to unbound ports */
+	addr4.sin_port = htons(TEST_DPORT);
+	addr6.sin6_port = htons(TEST_DPORT);
+
+	test__start_subtest("ipv4 port redir");
+	if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
+		goto out;
+
+	test__start_subtest("ipv6 port redir");
+	if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
+		goto out;
+
+	/* Connect to unbound addresses */
+	addr4.sin_addr.s_addr = htonl(TEST_DADDR);
+	addr6.sin6_addr.s6_addr32[3] = htonl(TEST_DADDR);
+
+	test__start_subtest("ipv4 addr redir");
+	if (run_test(server, (const struct sockaddr *)&addr4, sizeof(addr4)))
+		goto out;
+
+	test__start_subtest("ipv6 addr redir");
+	if (run_test(server_v6, (const struct sockaddr *)&addr6, sizeof(addr6)))
+		goto out;
+
+	err = 0;
+out:
+	close(server);
+	close(server_v6);
+	return err;
+}
+
+void test_sk_assign(void)
+{
+	int self_net;
+
+	self_net = open(NS_SELF, O_RDONLY);
+	if (CHECK_FAIL(self_net < 0)) {
+		perror("Unable to open "NS_SELF);
+		return;
+	}
+
+	if (!configure_stack(self_net)) {
+		perror("configure_stack");
+		goto cleanup;
+	}
+
+	do_sk_assign();
+
+cleanup:
+	close(self_net);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c
new file mode 100644
index 000000000000..7de30ad3f594
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
@@ -0,0 +1,127 @@ 
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Cloudflare Ltd.
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Fill 'tuple' with L3 info, and attempt to find L4. On fail, return NULL. */
+static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off,
+					void *data_end, __u16 eth_proto,
+					bool *ipv4)
+{
+	struct bpf_sock_tuple *result;
+	__u8 proto = 0;
+	__u64 ihl_len;
+
+	if (eth_proto == bpf_htons(ETH_P_IP)) {
+		struct iphdr *iph = (struct iphdr *)(data + nh_off);
+
+		if (iph + 1 > data_end)
+			return NULL;
+		if (iph->ihl != 5)
+			/* Options are not supported */
+			return NULL;
+		ihl_len = iph->ihl * 4;
+		proto = iph->protocol;
+		*ipv4 = true;
+		result = (struct bpf_sock_tuple *)&iph->saddr;
+	} else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
+
+		if (ip6h + 1 > data_end)
+			return NULL;
+		ihl_len = sizeof(*ip6h);
+		proto = ip6h->nexthdr;
+		*ipv4 = false;
+		result = (struct bpf_sock_tuple *)&ip6h->saddr;
+	} else {
+		return NULL;
+	}
+
+	if (result + 1 > data_end || proto != IPPROTO_TCP)
+		return NULL;
+
+	return result;
+}
+
+SEC("sk_assign_test")
+int bpf_sk_assign_test(struct __sk_buff *skb)
+{
+	void *data_end = (void *)(long)skb->data_end;
+	void *data = (void *)(long)skb->data;
+	struct ethhdr *eth = (struct ethhdr *)(data);
+	struct bpf_sock_tuple *tuple, ln = {0};
+	struct bpf_sock *sk;
+	int tuple_len;
+	bool ipv4;
+	int ret;
+
+	if (eth + 1 > data_end)
+		return TC_ACT_SHOT;
+
+	tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4);
+	if (!tuple)
+		return TC_ACT_SHOT;
+
+	tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
+	sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
+	if (sk) {
+		if (sk->state != BPF_TCP_LISTEN)
+			goto assign;
+
+		bpf_sk_release(sk);
+	}
+
+	if (ipv4) {
+		if (tuple->ipv4.dport != bpf_htons(4321))
+			return TC_ACT_OK;
+
+		ln.ipv4.daddr = bpf_htonl(0x7f000001);
+		ln.ipv4.dport = bpf_htons(1234);
+
+		sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv4),
+					BPF_F_CURRENT_NETNS, 0);
+	} else {
+		if (tuple->ipv6.dport != bpf_htons(4321))
+			return TC_ACT_OK;
+
+		/* Upper parts of daddr are already zero. */
+		ln.ipv6.daddr[3] = bpf_htonl(0x1);
+		ln.ipv6.dport = bpf_htons(1234);
+
+		sk = bpf_skc_lookup_tcp(skb, &ln, sizeof(ln.ipv6),
+					BPF_F_CURRENT_NETNS, 0);
+	}
+
+	/* We can't do a single skc_lookup_tcp here, because then the compiler
+	 * will likely spill tuple_len to the stack. This makes it lose all
+	 * bounds information in the verifier, which then rejects the call as
+	 * unsafe.
+	 */
+	if (!sk)
+		return TC_ACT_SHOT;
+
+	if (sk->state != BPF_TCP_LISTEN) {
+		bpf_sk_release(sk);
+		return TC_ACT_SHOT;
+	}
+
+assign:
+	ret = bpf_sk_assign(skb, sk, 0);
+	bpf_sk_release(sk);
+	return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT;
+}