cls_flow: Add tunnel support to the flow classifier

Message ID: 1318806373.7169.35.camel@ganymede
State: Changes Requested, archived
Delegated to: David Miller

Commit Message

Dan Siemon Oct. 16, 2011, 11:06 p.m. UTC
When used on an interface carrying tunneled traffic, the flow classifier
can't look inside the tunnels, so all of the traffic within a tunnel is
treated as a single flow. This does not allow any kind of intelligent
queuing to occur. This patch adds new keys to the flow classifier which
look inside the tunnel. Presently IP-IP, IP-IPv6, IPv6-IPv6 and IPv6-IP
tunnels are supported.

If you are interested I have posted some background and experimental
results at:
http://www.coverfire.com/archives/2011/10/16/making-the-linux-flow-classifier-tunnel-aware/

The related iproute2 patch can be found at the above URL as well.

Signed-off-by: Dan Siemon <dan@coverfire.com>

Comments

Eric Dumazet Oct. 17, 2011, 6:40 a.m. UTC | #1
On Sunday, October 16, 2011 at 19:06 -0400, Dan Siemon wrote:
> When used on an interface carrying tunneled traffic, the flow classifier
> can't look inside the tunnels, so all of the traffic within a tunnel is
> treated as a single flow. This does not allow any kind of intelligent
> queuing to occur. This patch adds new keys to the flow classifier which
> look inside the tunnel. Presently IP-IP, IP-IPv6, IPv6-IPv6 and IPv6-IP
> tunnels are supported.
> 
> If you are interested I have posted some background and experimental
> results at:
> http://www.coverfire.com/archives/2011/10/16/making-the-linux-flow-classifier-tunnel-aware/
> 
> The related iproute2 patch can be found at the above URL as well.
> 
> Signed-off-by: Dan Siemon <dan@coverfire.com>
> 

Hi Dan

You're adding a lot of code (while omitting the diffstat :( ) for a
specific use case, yet GRE tunnels are not supported.

The IPv6 part is also a bit limited: it assumes the TCP/UDP header
immediately follows the fixed IPv6 header. Maybe it's time to use
ipv6_skip_exthdr()?
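
Something along these lines, perhaps (a completely untested sketch; the
helper name is invented here, and it assumes the current three-argument
ipv6_skip_exthdr() signature):

/* Walk the extension header chain of an IPv6 header sitting at offset
 * nhoff from skb->data, instead of assuming TCP/UDP immediately
 * follows the fixed 40-byte header. nexthdr is that header's nexthdr
 * field. Returns the transport header offset and stores the real
 * transport protocol in *proto, or returns -1 on a malformed or
 * truncated chain.
 */
static int flow_ipv6_thoff(const struct sk_buff *skb, unsigned int nhoff,
			   u8 nexthdr, u8 *proto)
{
	int thoff = ipv6_skip_exthdr(skb, nhoff + sizeof(struct ipv6hdr),
				     &nexthdr);

	if (thoff < 0)
		return -1;

	*proto = nexthdr;
	return thoff;
}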

Note also that if we pull too many bytes (with pskb_network_may_pull()),
we kill routing performance on devices that receive packets in paged
frags, which are now becoming very common.

Adding tunnel support and deep packet inspection might require the use
of skb_header_pointer(), which copies the needed data without an
expensive reallocation of the skb head.
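
The inner ports could then be read without any pull at all, something
like this (again a completely untested sketch; thoff, the offset of the
inner transport header from skb->data, is left to the caller):

/* Fetch both inner ports with skb_header_pointer(): bytes living in
 * paged frags are copied into the on-stack buffer instead of forcing
 * a reallocation of skb->head.
 */
static u32 tunnel_inner_port(const struct sk_buff *skb, int thoff,
			     int poff, bool dst)
{
	__be16 ports[2], *p;

	p = skb_header_pointer(skb, thoff + poff, sizeof(ports), ports);
	if (!p)
		return 0;	/* truncated packet */

	return ntohs(dst ? p[1] : p[0]);
}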



Dan Siemon Oct. 24, 2011, 1:21 a.m. UTC | #2
On Mon, 2011-10-17 at 08:40 +0200, Eric Dumazet wrote:
> On Sunday, October 16, 2011 at 19:06 -0400, Dan Siemon wrote:
> > When used on an interface carrying tunneled traffic, the flow classifier
> > can't look inside the tunnels, so all of the traffic within a tunnel is
> > treated as a single flow. This does not allow any kind of intelligent
> > queuing to occur. This patch adds new keys to the flow classifier which
> > look inside the tunnel. Presently IP-IP, IP-IPv6, IPv6-IPv6 and IPv6-IP
> > tunnels are supported.
> > 
> > If you are interested I have posted some background and experimental
> > results at:
> > http://www.coverfire.com/archives/2011/10/16/making-the-linux-flow-classifier-tunnel-aware/
> > 
> > The related iproute2 patch can be found at the above URL as well.
> > 
> > Signed-off-by: Dan Siemon <dan@coverfire.com>
> > 
> 
> Hi Dan
> 
> You're adding a lot of code (while omitting the diffstat :( ) for a
> specific use case, yet GRE tunnels are not supported.

Thanks for the review.

Are you arguing that this use case isn't worth addressing, or that
there is a more efficient way to implement it with less code?

> The IPv6 part is also a bit limited: it assumes the TCP/UDP header
> immediately follows the fixed IPv6 header. Maybe it's time to use
> ipv6_skip_exthdr()?

I noticed this too, but the existing src-proto and dst-proto keys don't
handle this case either. Maybe I can look into fixing those as well.

> Note also that if we pull too many bytes (with pskb_network_may_pull()),
> we kill routing performance on devices that receive packets in paged
> frags, which are now becoming very common.

I don't know what paged frags are, but I trust you are correct :)

The existing keys also use pskb_network_may_pull(). Should they be changed as well?

> Adding tunnel support and deep packet inspection might require the use
> of skb_header_pointer(), which copies the needed data without an
> expensive reallocation of the skb head.

I'll look into this but it may be a while before I have an updated
patch.

Patch

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index defbde2..2f80fa0 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -333,6 +333,11 @@  enum {
 	FLOW_KEY_SKGID,
 	FLOW_KEY_VLAN_TAG,
 	FLOW_KEY_RXHASH,
+	FLOW_KEY_TUNNEL_SRC,
+	FLOW_KEY_TUNNEL_DST,
+	FLOW_KEY_TUNNEL_PROTO,
+	FLOW_KEY_TUNNEL_PROTO_SRC,
+	FLOW_KEY_TUNNEL_PROTO_DST,
 	__FLOW_KEY_MAX,
 };
 
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 6994214..f0bd3ad 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -311,6 +311,310 @@  static u32 flow_get_rxhash(struct sk_buff *skb)
 	return skb_get_rxhash(skb);
 }
 
+static u32 tunnel_inner_ip_src(struct sk_buff *skb)
+{
+	if (pskb_network_may_pull(skb, skb_network_header_len(skb) +
+							sizeof(struct iphdr))) {
+		return ntohl(ipip_hdr(skb)->saddr);
+	}
+
+	return 0;
+}
+
+static u32 tunnel_inner_ipv6_src(struct sk_buff *skb)
+{
+	if (pskb_network_may_pull(skb, skb_network_header_len(skb) +
+						sizeof(struct ipv6hdr))) {
+		struct ipv6hdr *iph = (struct ipv6hdr *)
+					skb_transport_header(skb);
+		return ntohl(iph->saddr.s6_addr32[3]);
+	}
+
+	return 0;
+}
+
+static u32 flow_get_tunnel_src(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		if (pskb_network_may_pull(skb, sizeof(struct iphdr))) {
+			if (ip_hdr(skb)->protocol == IPPROTO_IPIP) {
+				return tunnel_inner_ip_src(skb);
+			} else if (ip_hdr(skb)->protocol == IPPROTO_IPV6) {
+				return tunnel_inner_ipv6_src(skb);
+			}
+		}
+		break;
+	case htons(ETH_P_IPV6):
+		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) {
+			if (ipv6_hdr(skb)->nexthdr == IPPROTO_IPIP) {
+				return tunnel_inner_ip_src(skb);
+			} else if (ipv6_hdr(skb)->nexthdr == IPPROTO_IPV6) {
+				return tunnel_inner_ipv6_src(skb);
+			}
+		}
+		break;
+	}
+
+	return 0;
+}
+
+static u32 tunnel_inner_ip_dst(struct sk_buff *skb)
+{
+	if (pskb_network_may_pull(skb, skb_network_header_len(skb) +
+							sizeof(struct iphdr))) {
+		return ntohl(ipip_hdr(skb)->daddr);
+	}
+
+	return 0;
+}
+
+static u32 tunnel_inner_ipv6_dst(struct sk_buff *skb)
+{
+	if (pskb_network_may_pull(skb, skb_network_header_len(skb) +
+						sizeof(struct ipv6hdr))) {
+		struct ipv6hdr *iph = (struct ipv6hdr *)
+					skb_transport_header(skb);
+		return ntohl(iph->daddr.s6_addr32[3]);
+	}
+
+	return 0;
+}
+
+static u32 flow_get_tunnel_dst(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		if (pskb_network_may_pull(skb, sizeof(struct iphdr))) {
+			if (ip_hdr(skb)->protocol == IPPROTO_IPIP) {
+				return tunnel_inner_ip_dst(skb);
+			} else if (ip_hdr(skb)->protocol == IPPROTO_IPV6) {
+				return tunnel_inner_ipv6_dst(skb);
+			}
+		}
+		break;
+	case htons(ETH_P_IPV6):
+		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) {
+			if (ipv6_hdr(skb)->nexthdr == IPPROTO_IPIP) {
+				return tunnel_inner_ip_dst(skb);
+			} else if (ipv6_hdr(skb)->nexthdr == IPPROTO_IPV6) {
+				return tunnel_inner_ipv6_dst(skb);
+			}
+		}
+		break;
+	}
+
+	return 0;
+}
+
+static u32 tunnel_inner_ip_proto(struct sk_buff *skb)
+{
+	struct iphdr *iph;
+
+	if (!pskb_network_may_pull(skb, skb_network_header_len(skb) +
+							sizeof(struct iphdr))) {
+		return 0;
+	}
+
+	iph = ipip_hdr(skb);
+
+	return iph->protocol;
+}
+
+static u32 tunnel_inner_ipv6_proto(struct sk_buff *skb)
+{
+	struct ipv6hdr *ipv6h;
+
+	if (!pskb_network_may_pull(skb, skb_network_header_len(skb) +
+						sizeof(struct ipv6hdr))) {
+		return 0;
+	}
+
+	ipv6h = (struct ipv6hdr *)skb_transport_header(skb);
+
+	return ipv6h->nexthdr;
+}
+
+static u32 flow_get_tunnel_proto(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		if (pskb_network_may_pull(skb, sizeof(struct iphdr))) {
+			if (ip_hdr(skb)->protocol == IPPROTO_IPIP) {
+				return tunnel_inner_ip_proto(skb);
+			} else if (ip_hdr(skb)->protocol == IPPROTO_IPV6) {
+				return tunnel_inner_ipv6_proto(skb);
+			}
+		}
+		break;
+	case htons(ETH_P_IPV6):
+		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) {
+			if (ipv6_hdr(skb)->nexthdr == IPPROTO_IPIP) {
+				return tunnel_inner_ip_proto(skb);
+			} else if (ipv6_hdr(skb)->nexthdr == IPPROTO_IPV6) {
+				return tunnel_inner_ipv6_proto(skb);
+			}
+		}
+		break;
+	}
+
+	return 0;
+}
+
+static u32 tunnel_inner_ip_proto_src(struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	int poff;
+
+	if (!pskb_network_may_pull(skb, skb_network_header_len(skb) +
+							sizeof(struct iphdr))) {
+		return 0;
+	}
+
+	iph = ipip_hdr(skb);
+
+	if (ip_is_fragment(iph))
+		return 0;
+
+	poff = proto_ports_offset(iph->protocol);
+	if (poff >= 0 && pskb_network_may_pull(skb, skb_network_header_len(skb)
+						+ iph->ihl * 4 + 2 + poff)) {
+		/* pskb_network_may_pull() may reallocate skb->head */
+		iph = ipip_hdr(skb);
+		return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + poff));
+	}
+
+	return 0;
+}
+
+static u32 tunnel_inner_ipv6_proto_src(struct sk_buff *skb)
+{
+	struct ipv6hdr *ipv6h;
+	int poff;
+
+	if (!pskb_network_may_pull(skb, skb_network_header_len(skb) +
+						sizeof(struct ipv6hdr))) {
+		return 0;
+	}
+
+	ipv6h = (struct ipv6hdr *)skb_transport_header(skb);
+
+	poff = proto_ports_offset(ipv6h->nexthdr);
+	if (poff >= 0 &&
+	    pskb_network_may_pull(skb, skb_network_header_len(skb) +
+					sizeof(*ipv6h) + poff + 2)) {
+		/* pskb_network_may_pull() may reallocate skb->head */
+		ipv6h = (struct ipv6hdr *)skb_transport_header(skb);
+		return ntohs(*(__be16 *)((void *)ipv6h + sizeof(*ipv6h) +
+					 poff));
+	}
+
+	return 0;
+}
+
+static u32 flow_get_tunnel_proto_src(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		if (pskb_network_may_pull(skb, sizeof(struct iphdr))) {
+			if (ip_hdr(skb)->protocol == IPPROTO_IPIP) {
+				return tunnel_inner_ip_proto_src(skb);
+			} else if (ip_hdr(skb)->protocol == IPPROTO_IPV6) {
+				return tunnel_inner_ipv6_proto_src(skb);
+			}
+		}
+		break;
+	case htons(ETH_P_IPV6):
+		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) {
+			if (ipv6_hdr(skb)->nexthdr == IPPROTO_IPIP) {
+				return tunnel_inner_ip_proto_src(skb);
+			} else if (ipv6_hdr(skb)->nexthdr == IPPROTO_IPV6) {
+				return tunnel_inner_ipv6_proto_src(skb);
+			}
+		}
+		break;
+	}
+
+	return 0;
+}
+
+static u32 tunnel_inner_ip_proto_dst(struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	int poff;
+
+	if (!pskb_network_may_pull(skb, skb_network_header_len(skb) +
+							sizeof(struct iphdr))) {
+		return 0;
+	}
+
+	iph = ipip_hdr(skb);
+
+	if (ip_is_fragment(iph))
+		return 0;
+
+	poff = proto_ports_offset(iph->protocol);
+	if (poff >= 0 && pskb_network_may_pull(skb, skb_network_header_len(skb)
+						+ iph->ihl * 4 + 4 + poff)) {
+		/* pskb_network_may_pull() may reallocate skb->head */
+		iph = ipip_hdr(skb);
+		return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2 + poff));
+	}
+
+	return 0;
+}
+
+static u32 tunnel_inner_ipv6_proto_dst(struct sk_buff *skb)
+{
+	struct ipv6hdr *ipv6h;
+	int poff;
+
+	if (!pskb_network_may_pull(skb, skb_network_header_len(skb) +
+						sizeof(struct ipv6hdr))) {
+		return 0;
+	}
+
+	ipv6h = (struct ipv6hdr *)skb_transport_header(skb);
+
+	poff = proto_ports_offset(ipv6h->nexthdr);
+	if (poff >= 0 &&
+	    pskb_network_may_pull(skb, skb_network_header_len(skb) +
+					sizeof(*ipv6h) + poff + 4)) {
+		/* pskb_network_may_pull() may reallocate skb->head */
+		ipv6h = (struct ipv6hdr *)skb_transport_header(skb);
+		return ntohs(*(__be16 *)((void *)ipv6h + sizeof(*ipv6h) +
+					 poff + 2));
+	}
+
+	return 0;
+}
+
+static u32 flow_get_tunnel_proto_dst(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		if (pskb_network_may_pull(skb, sizeof(struct iphdr))) {
+			if (ip_hdr(skb)->protocol == IPPROTO_IPIP) {
+				return tunnel_inner_ip_proto_dst(skb);
+			} else if (ip_hdr(skb)->protocol == IPPROTO_IPV6) {
+				return tunnel_inner_ipv6_proto_dst(skb);
+			}
+		}
+		break;
+	case htons(ETH_P_IPV6):
+		if (pskb_network_may_pull(skb, sizeof(struct ipv6hdr))) {
+			if (ipv6_hdr(skb)->nexthdr == IPPROTO_IPIP) {
+				return tunnel_inner_ip_proto_dst(skb);
+			} else if (ipv6_hdr(skb)->nexthdr == IPPROTO_IPV6) {
+				return tunnel_inner_ipv6_proto_dst(skb);
+			}
+		}
+		break;
+	}
+
+	return 0;
+}
+
 static u32 flow_key_get(struct sk_buff *skb, int key)
 {
 	switch (key) {
@@ -350,6 +654,16 @@  static u32 flow_key_get(struct sk_buff *skb, int key)
 		return flow_get_vlan_tag(skb);
 	case FLOW_KEY_RXHASH:
 		return flow_get_rxhash(skb);
+	case FLOW_KEY_TUNNEL_SRC:
+		return flow_get_tunnel_src(skb);
+	case FLOW_KEY_TUNNEL_DST:
+		return flow_get_tunnel_dst(skb);
+	case FLOW_KEY_TUNNEL_PROTO:
+		return flow_get_tunnel_proto(skb);
+	case FLOW_KEY_TUNNEL_PROTO_SRC:
+		return flow_get_tunnel_proto_src(skb);
+	case FLOW_KEY_TUNNEL_PROTO_DST:
+		return flow_get_tunnel_proto_dst(skb);
 	default:
 		WARN_ON(1);
 		return 0;