diff mbox series

[iproute2-next] Add tc-BPF example for a TCP pure ack recognizer

Message ID 1525199561-9302-1-git-send-email-dave.taht@gmail.com
State Changes Requested, archived
Delegated to: David Ahern
Headers show
Series [iproute2-next] Add tc-BPF example for a TCP pure ack recognizer | expand

Commit Message

Dave Taht May 1, 2018, 6:32 p.m. UTC
ack_recognize can shift pure tcp acks into another flowid.
---
 examples/bpf/ack_recognize.c | 98 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 examples/bpf/ack_recognize.c

Comments

David Ahern May 2, 2018, 4:12 a.m. UTC | #1
On 5/1/18 12:32 PM, Dave Taht wrote:
> ack_recognize can shift pure tcp acks into another flowid.
> ---
>  examples/bpf/ack_recognize.c | 98 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 98 insertions(+)
>  create mode 100644 examples/bpf/ack_recognize.c
> 
> diff --git a/examples/bpf/ack_recognize.c b/examples/bpf/ack_recognize.c
> new file mode 100644
> index 0000000..5da620c
> --- /dev/null
> +++ b/examples/bpf/ack_recognize.c
> @@ -0,0 +1,98 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright 2017 Google Inc.

2017?? it's 2018. Did you write this last year and just now posting?

> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> + * 02110-1301, USA.
> + */
> +
> +/*
> + * Author: dave.taht@gmail.com (Dave Taht)
> + *
> + * ack_recognizer: An eBPF program that correctly recognizes modern TCP ACKs,
> + * with tcp option fields like SACK and timestamps, and no additional data.
> + *
> + * ack_match call: Recognize "pure acks" with no data payload
> + *
> + */
> +
> +#include "bpf_api.h"
> +#include "linux/if_ether.h"
> +#include "linux/ip.h"
> +#include "linux/in.h"
> +#include "linux/ipv6.h"
> +#include "linux/tcp.h"
> +
> +/*
> + * A pure ack contains the ip header, the tcp header + options, flags with the
> + * ack field set, and no additional payload. That last bit is what every prior
> + * ack filter gets wrong, they typically assume an obsolete 64 bytes, and don't
> + * calculate the options (like sack or timestamps) to subtract from the payload.
> + */
> +
> +__section_cls_entry
> +int ack_match(struct __sk_buff *skb)
> +{
> +	void *data = (void *)(long)skb->data;
> +	void *data_end = (void *)(long)skb->data_end;
> +	struct ethhdr *eth = data;
> +	struct iphdr *iph = data + sizeof(*eth);
> +	struct tcphdr *tcp;
> +
> +	if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*tcp) > data_end)
> +		return 0;
> +
> +	if (eth->h_proto == htons(ETH_P_IP) &&
> +	    iph->version == 4) {

Why not make the version == 4 under the proto check?
	if (eth->h_proto == htons(ETH_P_IP) {
		if (iph->version != 4)
			return 0;
if the eth proto is ETH_P_IP, and version != 4 something is really wrong

> +		if(iph->protocol == IPPROTO_TCP &&
> +		   iph->ihl == 5 &&
> +		   data + sizeof(*eth) + 20 + sizeof(*tcp) <= data_end) {
> +			tcp = data + sizeof(*eth) + 20;
> +			if (tcp->ack &&
> +			    htons(iph->tot_len) == 20 + tcp->doff*4)
> +				return -1;

And then here why the magic '20' and limits on ihl = 5? both are trivial
to accommodate programmatically.

> +		}
> +	} else if (eth->h_proto == htons(ETH_P_IPV6) &&
> +		   iph->version == 6) {
> +		struct ipv6hdr *iph6 = (struct ipv6hdr *) iph;
> +		if(iph6->nexthdr == IPPROTO_TCP &&
> +		   data + sizeof(*eth) + 40 + sizeof(*tcp) <= data_end ) {
> +			tcp = data + sizeof(*eth) + 40;
> +			if (tcp->ack &&
> +			    tcp->doff*4 == htons(iph6->payload_len))
> +				return -1;

ditto here.

> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +/* Example: Move acks into a priority queue:
> +
> +IFACE=eth0
> +tc qdisc del dev $IFACE root 2> /dev/null
> +tc qdisc add dev $IFACE root handle 1: prio bands 3 \
> +	priomap 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
> +tc qdisc add dev $IFACE parent 1:1 handle 10:1 sfq headdrop # acks only
> +tc qdisc add dev $IFACE parent 1:2 handle 20:1 fq_codel # all other traffic
> +tc qdisc add dev $IFACE parent 1:3 handle 30:1 fq_codel # unused
> +tc filter add dev $IFACE parent 1: prio 1 bpf \
> +	object-file ack_recognize.o flowid 1:1
> +
> +Please note that a strict priority queue is not a good idea (drr would be
> +better), nor is doing any level of prioritization on acks at all....
> +*/
> +
> +BPF_LICENSE("GPL");
>
Dave Taht May 2, 2018, 4:51 a.m. UTC | #2
On Tue, May 1, 2018 at 9:12 PM, David Ahern <dsahern@gmail.com> wrote:
> On 5/1/18 12:32 PM, Dave Taht wrote:
>> ack_recognize can shift pure tcp acks into another flowid.
>> ---
>>  examples/bpf/ack_recognize.c | 98 ++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 98 insertions(+)
>>  create mode 100644 examples/bpf/ack_recognize.c
>>
>> diff --git a/examples/bpf/ack_recognize.c b/examples/bpf/ack_recognize.c
>> new file mode 100644
>> index 0000000..5da620c
>> --- /dev/null
>> +++ b/examples/bpf/ack_recognize.c
>> @@ -0,0 +1,98 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * Copyright 2017 Google Inc.
>
> 2017?? it's 2018. Did you write this last year and just now posting?

November, 2017? Shortly prior to you taking iproute2-next off of steven's hands.

It was the first stage of a proof of concept showing (eventually) that
a simple ack decimator/filter (like "drop every other ack", or simple
"drop head" in the supplied example) had a ton of problems, and that
to filter out acks correctly to any extent, it needed to peer back
into the queue. (see the sch_cake ack-filter controversy on-going on
this list)

While it's better than what is in wondershaper, I wouldn't recommend
anyone use it. It was, however, useful in acquiring a gut feel for
what several other broken ack filters might be doing in the field. I
submitted it as a possibly useful example for showing off tc-bpf and
to add fuel to the ack-filter fire under cake.

I can clean it up, SPF it, etc, if you want it. Otherwise, sorry for the noise.

>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License
>> + * as published by the Free Software Foundation; either version 2
>> + * of the License, or (at your option) any later version.
>> + *
>> + * This program is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * GNU General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> + * 02110-1301, USA.
>> + */
>> +
>> +/*
>> + * Author: dave.taht@gmail.com (Dave Taht)
>> + *
>> + * ack_recognizer: An eBPF program that correctly recognizes modern TCP ACKs,
>> + * with tcp option fields like SACK and timestamps, and no additional data.
>> + *
>> + * ack_match call: Recognize "pure acks" with no data payload
>> + *
>> + */
>> +
>> +#include "bpf_api.h"
>> +#include "linux/if_ether.h"
>> +#include "linux/ip.h"
>> +#include "linux/in.h"
>> +#include "linux/ipv6.h"
>> +#include "linux/tcp.h"
>> +
>> +/*
>> + * A pure ack contains the ip header, the tcp header + options, flags with the
>> + * ack field set, and no additional payload. That last bit is what every prior
>> + * ack filter gets wrong, they typically assume an obsolete 64 bytes, and don't
>> + * calculate the options (like sack or timestamps) to subtract from the payload.
>> + */
>> +
>> +__section_cls_entry
>> +int ack_match(struct __sk_buff *skb)
>> +{
>> +     void *data = (void *)(long)skb->data;
>> +     void *data_end = (void *)(long)skb->data_end;
>> +     struct ethhdr *eth = data;
>> +     struct iphdr *iph = data + sizeof(*eth);
>> +     struct tcphdr *tcp;
>> +
>> +     if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*tcp) > data_end)
>> +             return 0;
>> +
>> +     if (eth->h_proto == htons(ETH_P_IP) &&
>> +         iph->version == 4) {
>
> Why not make the version == 4 under the proto check?
>         if (eth->h_proto == htons(ETH_P_IP) {
>                 if (iph->version != 4)
>                         return 0;
> if the eth proto is ETH_P_IP, and version != 4 something is really wrong
>
>> +             if(iph->protocol == IPPROTO_TCP &&
>> +                iph->ihl == 5 &&
>> +                data + sizeof(*eth) + 20 + sizeof(*tcp) <= data_end) {
>> +                     tcp = data + sizeof(*eth) + 20;
>> +                     if (tcp->ack &&
>> +                         htons(iph->tot_len) == 20 + tcp->doff*4)
>> +                             return -1;
>
> And then here why the magic '20' and limits on ihl = 5? both are trivial
> to accommodate programmatically.
>
>> +             }
>> +     } else if (eth->h_proto == htons(ETH_P_IPV6) &&
>> +                iph->version == 6) {
>> +             struct ipv6hdr *iph6 = (struct ipv6hdr *) iph;
>> +             if(iph6->nexthdr == IPPROTO_TCP &&
>> +                data + sizeof(*eth) + 40 + sizeof(*tcp) <= data_end ) {
>> +                     tcp = data + sizeof(*eth) + 40;
>> +                     if (tcp->ack &&
>> +                         tcp->doff*4 == htons(iph6->payload_len))
>> +                             return -1;
>
> ditto here.
>
>> +             }
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>> +/* Example: Move acks into a priority queue:
>> +
>> +IFACE=eth0
>> +tc qdisc del dev $IFACE root 2> /dev/null
>> +tc qdisc add dev $IFACE root handle 1: prio bands 3 \
>> +     priomap 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
>> +tc qdisc add dev $IFACE parent 1:1 handle 10:1 sfq headdrop # acks only
>> +tc qdisc add dev $IFACE parent 1:2 handle 20:1 fq_codel # all other traffic
>> +tc qdisc add dev $IFACE parent 1:3 handle 30:1 fq_codel # unused
>> +tc filter add dev $IFACE parent 1: prio 1 bpf \
>> +     object-file ack_recognize.o flowid 1:1
>> +
>> +Please note that a strict priority queue is not a good idea (drr would be
>> +better), nor is doing any level of prioritization on acks at all....
>> +*/
>> +
>> +BPF_LICENSE("GPL");
>>
>
David Ahern May 2, 2018, 3:22 p.m. UTC | #3
On 5/1/18 10:51 PM, Dave Taht wrote:
> On Tue, May 1, 2018 at 9:12 PM, David Ahern <dsahern@gmail.com> wrote:
>> On 5/1/18 12:32 PM, Dave Taht wrote:
>>> ack_recognize can shift pure tcp acks into another flowid.
>>> ---
>>>  examples/bpf/ack_recognize.c | 98 ++++++++++++++++++++++++++++++++++++++++++++
>>>  1 file changed, 98 insertions(+)
>>>  create mode 100644 examples/bpf/ack_recognize.c
>>>
>>> diff --git a/examples/bpf/ack_recognize.c b/examples/bpf/ack_recognize.c
>>> new file mode 100644
>>> index 0000000..5da620c
>>> --- /dev/null
>>> +++ b/examples/bpf/ack_recognize.c
>>> @@ -0,0 +1,98 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +/*
>>> + * Copyright 2017 Google Inc.
>>
>> 2017?? it's 2018. Did you write this last year and just now posting?
> 
> November, 2017? Shortly prior to you taking iproute2-next off of steven's hands.

ok, just checking

> 
> It was the first stage of a proof of concept showing (eventually) that
> a simple ack decimator/filter (like "drop every other ack", or simple
> "drop head" in the supplied example) had a ton of problems, and that
> to filter out acks correctly to any extent, it needed to peer back
> into the queue. (see the sch_cake ack-filter controversy on-going on
> this list)
> 
> While it's better than what is in wondershaper, I wouldn't recommend
> anyone use it. It was, however, useful in acquiring a gut feel for
> what several other broken ack filters might be doing in the field. I
> submitted it as a possibly useful example for showing off tc-bpf and
> to add fuel to the ack-filter fire under cake.
> 
> I can clean it up, SPF it, etc, if you want it. Otherwise, sorry for the noise.

I'm fine with the example, just prefer magic numbers to be clarified.
Stephen Hemminger Oct. 21, 2019, 11:47 p.m. UTC | #4
On Tue,  1 May 2018 11:32:41 -0700
Dave Taht <dave.taht@gmail.com> wrote:

> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright 2017 Google Inc.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> + * 02110-1301, USA.
> + */
> +

SPDX is enough don't add boilerplate.
Dave Taht Oct. 23, 2019, 4:59 a.m. UTC | #5
On Mon, Oct 21, 2019 at 4:48 PM Stephen Hemminger
<stephen@networkplumber.org> wrote:
>
> On Tue,  1 May 2018 11:32:41 -0700
> Dave Taht <dave.taht@gmail.com> wrote:
>
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * Copyright 2017 Google Inc.
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU General Public License
> > + * as published by the Free Software Foundation; either version 2
> > + * of the License, or (at your option) any later version.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU General Public License
> > + * along with this program; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> > + * 02110-1301, USA.
> > + */
> > +
>
> SPDX is enough don't add boilerplate.

I had posted this example code 2+ years ago, and, um, this was rather
a long time to await code review.

Do y'all actually want this?
diff mbox series

Patch

diff --git a/examples/bpf/ack_recognize.c b/examples/bpf/ack_recognize.c
new file mode 100644
index 0000000..5da620c
--- /dev/null
+++ b/examples/bpf/ack_recognize.c
@@ -0,0 +1,98 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ */
+
+/*
+ * Author: dave.taht@gmail.com (Dave Taht)
+ *
+ * ack_recognizer: An eBPF program that correctly recognizes modern TCP ACKs,
+ * with tcp option fields like SACK and timestamps, and no additional data.
+ *
+ * ack_match call: Recognize "pure acks" with no data payload
+ *
+ */
+
+#include "bpf_api.h"
+#include "linux/if_ether.h"
+#include "linux/ip.h"
+#include "linux/in.h"
+#include "linux/ipv6.h"
+#include "linux/tcp.h"
+
+/*
+ * A pure ack contains the ip header, the tcp header + options, flags with the
+ * ack field set, and no additional payload. That last bit is what every prior
+ * ack filter gets wrong, they typically assume an obsolete 64 bytes, and don't
+ * calculate the options (like sack or timestamps) to subtract from the payload.
+ */
+
+__section_cls_entry
+int ack_match(struct __sk_buff *skb)
+{
+	void *data = (void *)(long)skb->data;
+	void *data_end = (void *)(long)skb->data_end;
+	struct ethhdr *eth = data;
+	struct iphdr *iph = data + sizeof(*eth);
+	struct tcphdr *tcp;
+
+	if (data + sizeof(*eth) + sizeof(*iph) + sizeof(*tcp) > data_end)
+		return 0;
+
+	if (eth->h_proto == htons(ETH_P_IP) &&
+	    iph->version == 4) {
+		if(iph->protocol == IPPROTO_TCP &&
+		   iph->ihl == 5 &&
+		   data + sizeof(*eth) + 20 + sizeof(*tcp) <= data_end) {
+			tcp = data + sizeof(*eth) + 20;
+			if (tcp->ack &&
+			    htons(iph->tot_len) == 20 + tcp->doff*4)
+				return -1;
+		}
+	} else if (eth->h_proto == htons(ETH_P_IPV6) &&
+		   iph->version == 6) {
+		struct ipv6hdr *iph6 = (struct ipv6hdr *) iph;
+		if(iph6->nexthdr == IPPROTO_TCP &&
+		   data + sizeof(*eth) + 40 + sizeof(*tcp) <= data_end ) {
+			tcp = data + sizeof(*eth) + 40;
+			if (tcp->ack &&
+			    tcp->doff*4 == htons(iph6->payload_len))
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Example: Move acks into a priority queue:
+
+IFACE=eth0
+tc qdisc del dev $IFACE root 2> /dev/null
+tc qdisc add dev $IFACE root handle 1: prio bands 3 \
+	priomap 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+tc qdisc add dev $IFACE parent 1:1 handle 10:1 sfq headdrop # acks only
+tc qdisc add dev $IFACE parent 1:2 handle 20:1 fq_codel # all other traffic
+tc qdisc add dev $IFACE parent 1:3 handle 30:1 fq_codel # unused
+tc filter add dev $IFACE parent 1: prio 1 bpf \
+	object-file ack_recognize.o flowid 1:1
+
+Please note that a strict priority queue is not a good idea (drr would be
+better), nor is doing any level of prioritization on acks at all....
+*/
+
+BPF_LICENSE("GPL");