Message ID | 1322213787-25796-2-git-send-email-hans@schillstrom.com |
---|---|
State | Not Applicable, archived |
Delegated to: | David Miller |
Headers | show |
> + addr1 = (__force u32) ip6->saddr.s6_addr32[3]; > + addr2 = (__force u32) ip6->daddr.s6_addr32[3]; ... > + ports.v32 = * (__force u32 *) (skb->data + nhoff); Is this code even vaguely portable?? I suspect the 'ports' bit has serious endianness problems. I'm also not sure whether linux guarantees the alignment of skb->data here. David -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le vendredi 25 novembre 2011 à 14:19 +0000, David Laight a écrit : > > + addr1 = (__force u32) ip6->saddr.s6_addr32[3]; > > + addr2 = (__force u32) ip6->daddr.s6_addr32[3]; > ... > > + ports.v32 = * (__force u32 *) (skb->data + nhoff); > > Is this code even vaguely portable?? Yes it is. > I suspect the 'ports' bit has serious endianness problems. We don't care about endianness here, and we document it with the (__force u32) cast. > I'm also not sure whether linux guarantees the alignment > of skb->data here. It is guaranteed in the whole Linux stack. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le vendredi 25 novembre 2011 à 10:36 +0100, Hans Schillstrom a écrit : > From: Hans Schillstrom <hans.schillstrom@ericsson.com> > > The target allows you to create rules in the "raw" and "mangle" tables > which alter the netfilter mark (nfmark) field within a given range. > First a 32 bit hash value is generated then modulus by <limit> and > finally an offset is added before it's written to nfmark. > Prior to routing, the nfmark can influence the routing method (see > "Use netfilter MARK value as routing key") and can also be used by > other subsystems to change their behavior. > Oh well, yet another duplicated flow dissector ... > +/* > + * Calc hash value, special casre is taken on icmp and fragmented messages > + * i.e. fragmented messages don't use ports. > + */ > +__u32 hmark_v6(struct sk_buff *skb, const struct xt_action_param *par) > +{ > + struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo; > +no6ports: > + nexthdr &= info->prmask; > + /* get a consistent hash (same value on both flow directions) */ > + if (addr2 < addr1) > + swap(addr1, addr2); > + hash = jhash_3words(addr1, addr2, ports.v32, info->hashrnd) ^ nexthdr; What's the point of computing the hash if info->hmod is zero, since we don't set skb->mark in that case? > + if (info->hmod) > + skb->mark = (hash % info->hmod) + info->hoffs; > + > + return XT_CONTINUE; > +} > +#endif > + Same problem/question on hmark_v4() -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Nov 25, 2011 at 10:36:26AM +0100, Hans Schillstrom wrote: > diff --git a/include/net/ipv6.h b/include/net/ipv6.h > index 3f0258d..9e4d4f9 100644 > --- a/include/net/ipv6.h > +++ b/include/net/ipv6.h > @@ -39,6 +39,7 @@ > #define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ > #define NEXTHDR_NONE 59 /* No next header */ > #define NEXTHDR_DEST 60 /* Destination options header. */ > +#define NEXTHDR_SCTP 132 /* Stream Control Transport Protocol */ > #define NEXTHDR_MOBILITY 135 /* Mobility header. */ > > #define NEXTHDR_MAX 255 This has to go in a separated patch. Please, send it to netdev. I think davem can pick that for 3.2-rc > diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig > index 8260b13..41bee43 100644 > --- a/net/netfilter/Kconfig > +++ b/net/netfilter/Kconfig > @@ -471,6 +471,23 @@ config NETFILTER_XT_TARGET_HL > since you can easily create immortal packets that loop > forever on the network. > > +config NETFILTER_XT_TARGET_HMARK > + tristate '"HMARK" target support' > + depends on NETFILTER_ADVANCED > + ---help--- > + This option adds the "HMARK" target. > + > + The target allows you to create rules in the "raw" and "mangle" tables > + which alter the netfilter mark (nfmark) field within a given range. > + First a 32 bit hash value is generated then modulus by <limit> and > + finally an offset is added before it's written to nfmark. > + > + Prior to routing, the nfmark can influence the routing method (see > + "Use netfilter MARK value as routing key") and can also be used by > + other subsystems to change their behavior. > + > + The mark match can also be used to match nfmark produced by this module. 
> + > config NETFILTER_XT_TARGET_IDLETIMER > tristate "IDLETIMER target support" > depends on NETFILTER_ADVANCED > diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile > index 1a02853..359eeb6 100644 > --- a/net/netfilter/Makefile > +++ b/net/netfilter/Makefile > @@ -56,6 +56,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o > obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o > obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o > obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o > +obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_hmark.o > obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o > obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o > obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o > diff --git a/net/netfilter/xt_hmark.c b/net/netfilter/xt_hmark.c > new file mode 100644 > index 0000000..ae33293 > --- /dev/null > +++ b/net/netfilter/xt_hmark.c > @@ -0,0 +1,327 @@ > +/* > + * xt_hmark - Netfilter module to set mark as hash value > + * > + * (C) 2011 Hans Schillstrom <hans.schillstrom@ericsson.com> > + * > + * Description: > + * This module calculates a hash value that can be modified by modulus > + * and an offset. The hash value is based on a direction independent > + * five tuple: src & dst addr src & dst ports and protocol. > + * However src & dst port can be masked and are not used for fragmented > + * packets, ESP and AH don't have ports so SPI will be used instead. > + * For ICMP error messages the hash mark values will be calculated on > + * the source packet i.e. the packet caused the error (If sufficient > + * amount of data exists). > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. 
> + */ > + > +#include <linux/module.h> > +#include <linux/skbuff.h> > +#include <net/ip.h> > +#include <linux/icmp.h> > + > +#include <linux/netfilter/xt_hmark.h> > +#include <linux/netfilter/x_tables.h> > +#include <net/netfilter/nf_nat.h> > + > +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) > +# define WITH_IPV6 1 > +#include <net/ipv6.h> > +#include <linux/netfilter_ipv6/ip6_tables.h> > +#endif > + > + Cosmetic: unnecessary extra line. > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>"); > +MODULE_DESCRIPTION("Xtables: packet range mark operations by hash value"); > +MODULE_ALIAS("ipt_HMARK"); > +MODULE_ALIAS("ip6t_HMARK"); > + > +/* > + * ICMP, get inner header so calc can be made on the source message > + * not the icmp header, i.e. same hash mark must be produced > + * on an icmp error message. > + */ > +static int get_inner_hdr(struct sk_buff *skb, int iphsz, int nhoff) > +{ > + const struct icmphdr *icmph; > + struct icmphdr _ih; > + struct iphdr *iph = NULL; > + > + /* Not enough header? */ > + icmph = skb_header_pointer(skb, nhoff + iphsz, sizeof(_ih), &_ih); > + if (icmph == NULL) > + return nhoff; > + > + if (icmph->type > NR_ICMP_TYPES) > + return nhoff; > + > + /* Error message? */ > + if (icmph->type != ICMP_DEST_UNREACH && > + icmph->type != ICMP_SOURCE_QUENCH && > + icmph->type != ICMP_TIME_EXCEEDED && > + icmph->type != ICMP_PARAMETERPROB && > + icmph->type != ICMP_REDIRECT) > + return nhoff; > + /* Checkin full IP header plus 8 bytes of protocol to > + * avoid additional coding at protocol handlers. > + */ > + if (!pskb_may_pull(skb, nhoff + iphsz + sizeof(_ih) + 8)) > + return nhoff; Use skb_header_pointer again here; if conntrack is enabled, we can benefit from handling fragments. 
> + iph = (struct iphdr *)(skb->data + nhoff + iphsz + sizeof(_ih)); > + return nhoff + iphsz + sizeof(_ih); > +} > +/* > + * ICMPv6 > + * Input nhoff Offset into network header > + * offset where ICMPv6 header starts > + * Returns true if it's a icmp error and updates nhoff > + */ > +#ifdef WITH_IPV6 > +static int get_inner6_hdr(struct sk_buff *skb, int *offset, int hdrlen) > +{ > + struct icmp6hdr *icmp6h; > + struct icmp6hdr _ih6; > + > + icmp6h = skb_header_pointer(skb, *offset + hdrlen, sizeof(_ih6), &_ih6); > + if (icmp6h == NULL) > + return 0; > + > + if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) { > + *offset += hdrlen + sizeof(_ih6); > + return 1; > + } > + return 0; > +} > +/* > + * Calc hash value, special casre is taken on icmp and fragmented messages > + * i.e. fragmented messages don't use ports. > + */ > +__u32 hmark_v6(struct sk_buff *skb, const struct xt_action_param *par) > +{ > + struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo; > + int nhoff, poff, hdrlen; > + u32 addr1, addr2, hash; > + struct ipv6hdr *ip6; > + u8 nexthdr; > + int frag = 0, ip6hdrlvl = 0; /* Header level */ > + struct ipv6_opt_hdr _hdr, *hp; > + union { > + u32 v32; > + u16 v16[2]; > + } ports; > + > + ports.v32 = 0; > + nhoff = skb_network_offset(skb); > + > +hdr_new: > + /* Get header info */ > + ip6 = (struct ipv6hdr *) (skb->data + nhoff); > + nexthdr = ip6->nexthdr; > + hdrlen = sizeof(struct ipv6hdr); > + hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr), &_hdr); you have to check return value of skb_header_pointer here. 
> + while (nexthdr) { > + switch (nexthdr) { > + case IPPROTO_ICMPV6: > + /* ICMP Error then move ptr to inner header */ > + if (get_inner6_hdr(skb, &nhoff, hdrlen)) { > + ip6hdrlvl++; > + if (!pskb_may_pull(skb, sizeof(_hdr) + nhoff)) > + return XT_CONTINUE; > + goto hdr_new; > + } > + nhoff += hdrlen; > + goto hdr_rdy; > + > + case NEXTHDR_FRAGMENT: > + if (!ip6hdrlvl) /* Do not use ports if fragmented */ > + frag = 1; > + break; > + > + /* End of hdr traversing cont. with ports and hash calc. */ > + case NEXTHDR_IPV6: /* Do not process tunnels */ > + case NEXTHDR_TCP: > + case NEXTHDR_UDP: > + case NEXTHDR_ESP: > + case NEXTHDR_AUTH: > + case NEXTHDR_SCTP: > + case NEXTHDR_NONE: /* Last hdr of something unknown */ > + nhoff += hdrlen; > + goto hdr_rdy; > + default: > + return XT_CONTINUE; > + } > + if (!hp) > + return XT_CONTINUE; > + nhoff += hdrlen; /* eat current header */ > + nexthdr = hp->nexthdr; /* Next header */ > + hdrlen = ipv6_optlen(hp); > + hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr), > + &_hdr); same here. > + if (!pskb_may_pull(skb, nhoff)) why this after skb_header_pointer? [... trimmed off ...] 
> poff = proto_ports_offset(ip_proto); > nhoff += ip->ihl * 4 + poff; > if (frag || poff < 0 || !pskb_may_pull(skb, nhoff + 4)) > goto noports; > > ports.v32 = * (__force u32 *) (skb->data + nhoff); > if (ip_proto == IPPROTO_ESP || ip_proto == IPPROTO_AH) { > ports.v32 = (ports.v32 & info->spimask) | info->spiset; > } else { > if (snatport) /* Replace nat'ed port(s) */ > ports.v16[1] = snatport; > if (dnatport) > ports.v16[0] = dnatport; > ports.v32 = (ports.v32 & info->pmask.v32) | > info->pset.v32; > if (ports.v16[1] < ports.v16[0]) > swap(ports.v16[0], ports.v16[1]); > } > >noports: > ip_proto &= info->prmask; > /* get a consistent hash (same value on both flow directions) */ > if (addr2 < addr1) > swap(addr1, addr2); > > hash = jhash_3words(addr1, addr2, ports.v32, info->hashrnd) ^ ip_proto; > if (info->hmod) > skb->mark = (hash % info->hmod) + info->hoffs; > return XT_CONTINUE; > } Hm, I think the fragmentation handling is broken. Say that the first fragment contains the transport header, then the mark is calculated based on the address and ports. Then, later fragments will receive the mark based on the network header only. They may have different marks. If you don't want to use conntrack in your setup and you want to handle fragments, then you have to configure HMARK to calculate the hashing based on the network addresses. If you want to fully support fragments, then enable conntrack and you can configure HMARK to calculate the hashing based on network address + transport bits. Fix this by removing the fragmentation handling, then assume that people can select between two hashing configurations for HMARK. One based on the network address, which is fragment-safe, and one that uses the transport layer information, which requires conntrack. Otherwise, I don't see a sane way to handle this situation. I think this has to be documented in the iptables manpage for HMARK. 
-- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Friday 2011-11-25 18:36, Pablo Neira Ayuso wrote: >On Fri, Nov 25, 2011 at 10:36:26AM +0100, Hans Schillstrom wrote: >> diff --git a/include/net/ipv6.h b/include/net/ipv6.h >> index 3f0258d..9e4d4f9 100644 >> --- a/include/net/ipv6.h >> +++ b/include/net/ipv6.h >> @@ -39,6 +39,7 @@ >> #define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ >> #define NEXTHDR_NONE 59 /* No next header */ >> #define NEXTHDR_DEST 60 /* Destination options header. */ >> +#define NEXTHDR_SCTP 132 /* Stream Control Transport Protocol */ >> #define NEXTHDR_MOBILITY 135 /* Mobility header. */ >> >> #define NEXTHDR_MAX 255 > >This has to go in a separated patch. Please, send it to netdev. I >think davem can pick that for 3.2-rc I do have to wonder a little why we need the l4proto values twice (IPPROTO_SCTP plus NEXTHDR_SCTP). Has nobody ever thought of doing one foobar_<PROTOCOL>? >> + icmph->type != ICMP_REDIRECT) >> + return nhoff; >> + /* Checkin full IP header plus 8 bytes of protocol to >> + * avoid additional coding at protocol handlers. >> + */ >> + if (!pskb_may_pull(skb, nhoff + iphsz + sizeof(_ih) + 8)) >> + return nhoff; NB:I point out that the preferred long comment style begins with /*\n (to match the trailing \n*/, naturally) like in >> +/* >> + * ICMPv6 >> + * Input nhoff Offset into network header >> + * offset where ICMPv6 header starts >> + * Returns true if it's a icmp error and updates nhoff >> + */ -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/25/2011 10:36 AM, Hans Schillstrom wrote: > +__u32 hmark_v6(struct sk_buff *skb, const struct xt_action_param *par) > +{ > + struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo; > + int nhoff, poff, hdrlen; > + u32 addr1, addr2, hash; > + struct ipv6hdr *ip6; > + u8 nexthdr; > + int frag = 0, ip6hdrlvl = 0; /* Header level */ > + struct ipv6_opt_hdr _hdr, *hp; > + union { > + u32 v32; > + u16 v16[2]; > + } ports; > + > + ports.v32 = 0; > + nhoff = skb_network_offset(skb); > + > +hdr_new: > + /* Get header info */ > + ip6 = (struct ipv6hdr *) (skb->data + nhoff); > + nexthdr = ip6->nexthdr; > + hdrlen = sizeof(struct ipv6hdr); > + hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr),&_hdr); > + > + while (nexthdr) { > + switch (nexthdr) { > + case IPPROTO_ICMPV6: > + /* ICMP Error then move ptr to inner header */ > + if (get_inner6_hdr(skb,&nhoff, hdrlen)) { This doesn't look right. You assume the ICMPv6 header is following the IPv6 header without any other headers in between. If there are other headers, hdrlen will contain the length of the last header. > + ip6hdrlvl++; > + if (!pskb_may_pull(skb, sizeof(_hdr) + nhoff)) > + return XT_CONTINUE; > + goto hdr_new; > + } > + nhoff += hdrlen; > + goto hdr_rdy; > + > + case NEXTHDR_FRAGMENT: > + if (!ip6hdrlvl) /* Do not use ports if fragmented */ > + frag = 1; Shouldn't you also check for fragment offset == 0 here? The fragment header also doesn't include the length, so using ipv6_optlen() below is incorrect. > + break; > + > + /* End of hdr traversing cont. with ports and hash calc. */ > + case NEXTHDR_IPV6: /* Do not process tunnels */ That comment looks misleading, you do seem to process them? > + case NEXTHDR_TCP: > + case NEXTHDR_UDP: > + case NEXTHDR_ESP: > + case NEXTHDR_AUTH: Don't you want to use the port numbers if only authentication without encryption is used? 
> + case NEXTHDR_SCTP: > + case NEXTHDR_NONE: /* Last hdr of something unknown */ > + nhoff += hdrlen; > + goto hdr_rdy; > + default: > + return XT_CONTINUE; > + } > + if (!hp) > + return XT_CONTINUE; > + nhoff += hdrlen; /* eat current header */ > + nexthdr = hp->nexthdr; /* Next header */ > + hdrlen = ipv6_optlen(hp); > + hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr), > + &_hdr); > + > + if (!pskb_may_pull(skb, nhoff)) > + return XT_CONTINUE; > + } And final question, why not simply use ipv6_skip_exthdr()? > +hdr_rdy: > +... -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wednesday, November 30, 2011 16:51:35 Patrick McHardy wrote: > On 11/25/2011 10:36 AM, Hans Schillstrom wrote: > > +__u32 hmark_v6(struct sk_buff *skb, const struct xt_action_param *par) > > +{ > > + struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo; > > + int nhoff, poff, hdrlen; > > + u32 addr1, addr2, hash; > > + struct ipv6hdr *ip6; > > + u8 nexthdr; > > + int frag = 0, ip6hdrlvl = 0; /* Header level */ > > + struct ipv6_opt_hdr _hdr, *hp; > > + union { > > + u32 v32; > > + u16 v16[2]; > > + } ports; > > + > > + ports.v32 = 0; > > + nhoff = skb_network_offset(skb); > > + > > +hdr_new: > > + /* Get header info */ > > + ip6 = (struct ipv6hdr *) (skb->data + nhoff); > > + nexthdr = ip6->nexthdr; > > + hdrlen = sizeof(struct ipv6hdr); > > + hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr),&_hdr); > > + > > + while (nexthdr) { > > + switch (nexthdr) { > > + case IPPROTO_ICMPV6: > > + /* ICMP Error then move ptr to inner header */ > > + if (get_inner6_hdr(skb,&nhoff, hdrlen)) { > > This doesn't look right. You assume the ICMPv6 header is following > the IPv6 header with any other headers in between. If there are > other headers, hdrlen will contain the length of the last header. RFC-4443 "Every ICMPv6 message is preceded by an IPv6 header and zero or more IPv6 extension headers." hdrlen is actually previous header length in bytes, to be correct. nhoff is the sum of processed headers. So in case of an icmp the nhoff will be updated, and hdrlen preset to ipv6hdr size > > + ip6hdrlvl++; > > + if (!pskb_may_pull(skb, sizeof(_hdr) + nhoff)) > > + return XT_CONTINUE; > > + goto hdr_new; > > + } > > + nhoff += hdrlen; > > + goto hdr_rdy; > > + > > + case NEXTHDR_FRAGMENT: > > + if (!ip6hdrlvl) /* Do not use ports if fragmented */ > > + frag = 1; > > Shouldn't you also check for fragment offset == 0 here? 
According to the RFC "Initialized to zero for transmission; ignored on reception" > The fragment header also doesn't include the length, so > using ipv6_optlen() below is incorrect. True, it has a fixed size, of 8 octets I'll fix that. (as long as it is zero it will work :-) > > + break; > > + > > + /* End of hdr traversing cont. with ports and hash calc. */ > > + case NEXTHDR_IPV6: /* Do not process tunnels */ > > That comment looks misleading, you do seem to process them? Ooops a "return XT_CONTINUE;" seems to be missing here. > > > + case NEXTHDR_TCP: > > + case NEXTHDR_UDP: > > + case NEXTHDR_ESP: > > + case NEXTHDR_AUTH: > > Don't you want to use the port numbers if only authentication > without encryption is used? with esp or ah the SPI will be used instead of ports. Useful or not I don't know since they are asymmetric in terms of a flow. > > > + case NEXTHDR_SCTP: > > + case NEXTHDR_NONE: /* Last hdr of something unknown */ > > + nhoff += hdrlen; > > + goto hdr_rdy; > > + default: > > + return XT_CONTINUE; > > + } > > + if (!hp) > > + return XT_CONTINUE; > > + nhoff += hdrlen; /* eat current header */ > > + nexthdr = hp->nexthdr; /* Next header */ > > + hdrlen = ipv6_optlen(hp); > > + hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr), > > + &_hdr); > > + > > + if (!pskb_may_pull(skb, nhoff)) > > + return XT_CONTINUE; > > + } > > And final question, why not simply use ipv6_skip_exthdr()? problems with fragments... But when looking into ipv6_skip_exthdr() again I realize that handling of NEXTHDR_HOP NEXTHDR_ROUTING, and NEXTHDR_DEST is wrong. It think I need to rewrite this part a bit just skip this headers; case NEXTHDR_HOP: case NEXTHDR_ROUTING: case NEXTHDR_DEST: break; and exit on this: case NEXTHDR_IPV6: case NEXTHDR_NONE: default: return XT_CONTINUE; > > > +hdr_rdy: > > +... 
> > Thanks Hans -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 12/01/2011 01:25 AM, Hans Schillstrom wrote: > On Wednesday, November 30, 2011 16:51:35 Patrick McHardy wrote: >> On 11/25/2011 10:36 AM, Hans Schillstrom wrote: >>> + >>> +hdr_new: >>> + /* Get header info */ >>> + ip6 = (struct ipv6hdr *) (skb->data + nhoff); >>> + nexthdr = ip6->nexthdr; >>> + hdrlen = sizeof(struct ipv6hdr); >>> + hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr),&_hdr); >>> + >>> + while (nexthdr) { >>> + switch (nexthdr) { >>> + case IPPROTO_ICMPV6: >>> + /* ICMP Error then move ptr to inner header */ >>> + if (get_inner6_hdr(skb,&nhoff, hdrlen)) { >> This doesn't look right. You assume the ICMPv6 header is following >> the IPv6 header with any other headers in between. If there are >> other headers, hdrlen will contain the length of the last header. > > RFC-4443 "Every ICMPv6 message is preceded by an IPv6 header and zero or more IPv6 extension headers." > hdrlen is actually previous header length in bytes, to be correct. > nhoff is the sum of processed headers. > So in case of an icmp the nhoff will be updated, and hdrlen preset to ipv6hdr size Right, I missed that you're using nhoff + hdrlen in get_inner6_hdr(). >>> + ip6hdrlvl++; >>> + if (!pskb_may_pull(skb, sizeof(_hdr) + nhoff)) >>> + return XT_CONTINUE; >>> + goto hdr_new; >>> + } >>> + nhoff += hdrlen; >>> + goto hdr_rdy; >>> + >>> + case NEXTHDR_FRAGMENT: >>> + if (!ip6hdrlvl) /* Do not use ports if fragmented */ >>> + frag = 1; >> Shouldn't you also check for fragment offset == 0 here? > According to the RFC "Initialized to zero for transmission; ignored on reception" No, what I meant is that for the first fragment, you do have the upper layer header available. But as we already discussed for a stable identifier you want to ignore it anyways. >>> + case NEXTHDR_TCP: >>> + case NEXTHDR_UDP: >>> + case NEXTHDR_ESP: >>> + case NEXTHDR_AUTH: >> Don't you want to use the port numbers if only authentication >> without encryption is used? 
> with esp or ah the SPI will be used instead of ports. > Useful or not I don't know since they are asymmetric in terms of a flow. Yes, but with AH you could either use the ESP SPI or if no ESP is used the port numbers of the upper layer protocol. >> And final question, why not simply use ipv6_skip_exthdr()? > problems with fragments... So the problem is that it will return the transport layer protocol header for fragments with frag_off == 0? We also have ipv6_find_hdr() which we could modify to indicate this in the frag_off pointer. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/netfilter/xt_hmark.h b/include/linux/netfilter/xt_hmark.h new file mode 100644 index 0000000..6c1436a --- /dev/null +++ b/include/linux/netfilter/xt_hmark.h @@ -0,0 +1,48 @@ +#ifndef XT_HMARK_H_ +#define XT_HMARK_H_ + +#include <linux/types.h> + +/* + * Flags must not start at 0, since it's used as none. + */ +enum { + XT_HMARK_SADR_AND = 1, /* SNAT & DNAT are used by the kernel module */ + XT_HMARK_DADR_AND, + XT_HMARK_SPI_AND, + XT_HMARK_SPI_OR, + XT_HMARK_SPORT_AND, + XT_HMARK_DPORT_AND, + XT_HMARK_SPORT_OR, + XT_HMARK_DPORT_OR, + XT_HMARK_PROTO_AND, + XT_HMARK_RND, + XT_HMARK_MODULUS, + XT_HMARK_OFFSET, + XT_HMARK_USE_SNAT, + XT_HMARK_USE_DNAT, +}; + +union ports { + struct { + __u16 src; + __u16 dst; + } p16; + __u32 v32; +}; + +struct xt_hmark_info { + __u32 smask; /* Source address mask */ + __u32 dmask; /* Dest address mask */ + union ports pmask; + union ports pset; + __u32 spimask; + __u32 spiset; + __u16 flags; /* Print out only */ + __u16 prmask; /* L4 Proto mask */ + __u32 hashrnd; + __u32 hmod; /* Modulus */ + __u32 hoffs; /* Offset */ +}; + +#endif /* XT_HMARK_H_ */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3f0258d..9e4d4f9 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -39,6 +39,7 @@ #define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ #define NEXTHDR_NONE 59 /* No next header */ #define NEXTHDR_DEST 60 /* Destination options header. */ +#define NEXTHDR_SCTP 132 /* Stream Control Transport Protocol */ #define NEXTHDR_MOBILITY 135 /* Mobility header. */ #define NEXTHDR_MAX 255 diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 8260b13..41bee43 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -471,6 +471,23 @@ config NETFILTER_XT_TARGET_HL since you can easily create immortal packets that loop forever on the network. +config NETFILTER_XT_TARGET_HMARK + tristate '"HMARK" target support' + depends on NETFILTER_ADVANCED + ---help--- + This option adds the "HMARK" target. 
+ + The target allows you to create rules in the "raw" and "mangle" tables + which alter the netfilter mark (nfmark) field within a given range. + First a 32 bit hash value is generated then modulus by <limit> and + finally an offset is added before it's written to nfmark. + + Prior to routing, the nfmark can influence the routing method (see + "Use netfilter MARK value as routing key") and can also be used by + other subsystems to change their behavior. + + The mark match can also be used to match nfmark produced by this module. + config NETFILTER_XT_TARGET_IDLETIMER tristate "IDLETIMER target support" depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 1a02853..359eeb6 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o +obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_hmark.o obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o diff --git a/net/netfilter/xt_hmark.c b/net/netfilter/xt_hmark.c new file mode 100644 index 0000000..ae33293 --- /dev/null +++ b/net/netfilter/xt_hmark.c @@ -0,0 +1,327 @@ +/* + * xt_hmark - Netfilter module to set mark as hash value + * + * (C) 2011 Hans Schillstrom <hans.schillstrom@ericsson.com> + * + * Description: + * This module calculates a hash value that can be modified by modulus + * and an offset. The hash value is based on a direction independent + * five tuple: src & dst addr src & dst ports and protocol. + * However src & dst port can be masked and are not used for fragmented + * packets, ESP and AH don't have ports so SPI will be used instead. 
+ * For ICMP error messages the hash mark values will be calculated on + * the source packet i.e. the packet caused the error (If sufficient + * amount of data exists). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/ip.h> +#include <linux/icmp.h> + +#include <linux/netfilter/xt_hmark.h> +#include <linux/netfilter/x_tables.h> +#include <net/netfilter/nf_nat.h> + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +# define WITH_IPV6 1 +#include <net/ipv6.h> +#include <linux/netfilter_ipv6/ip6_tables.h> +#endif + + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>"); +MODULE_DESCRIPTION("Xtables: packet range mark operations by hash value"); +MODULE_ALIAS("ipt_HMARK"); +MODULE_ALIAS("ip6t_HMARK"); + +/* + * ICMP, get inner header so calc can be made on the source message + * not the icmp header, i.e. same hash mark must be produced + * on an icmp error message. + */ +static int get_inner_hdr(struct sk_buff *skb, int iphsz, int nhoff) +{ + const struct icmphdr *icmph; + struct icmphdr _ih; + struct iphdr *iph = NULL; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, nhoff + iphsz, sizeof(_ih), &_ih); + if (icmph == NULL) + return nhoff; + + if (icmph->type > NR_ICMP_TYPES) + return nhoff; + + /* Error message? */ + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_SOURCE_QUENCH && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB && + icmph->type != ICMP_REDIRECT) + return nhoff; + /* Checkin full IP header plus 8 bytes of protocol to + * avoid additional coding at protocol handlers. 
+ */ + if (!pskb_may_pull(skb, nhoff + iphsz + sizeof(_ih) + 8)) + return nhoff; + + iph = (struct iphdr *)(skb->data + nhoff + iphsz + sizeof(_ih)); + return nhoff + iphsz + sizeof(_ih); +} +/* + * ICMPv6 + * Input nhoff Offset into network header + * offset where ICMPv6 header starts + * Returns true if it's a icmp error and updates nhoff + */ +#ifdef WITH_IPV6 +static int get_inner6_hdr(struct sk_buff *skb, int *offset, int hdrlen) +{ + struct icmp6hdr *icmp6h; + struct icmp6hdr _ih6; + + icmp6h = skb_header_pointer(skb, *offset + hdrlen, sizeof(_ih6), &_ih6); + if (icmp6h == NULL) + return 0; + + if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) { + *offset += hdrlen + sizeof(_ih6); + return 1; + } + return 0; +} +/* + * Calc hash value, special casre is taken on icmp and fragmented messages + * i.e. fragmented messages don't use ports. + */ +__u32 hmark_v6(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo; + int nhoff, poff, hdrlen; + u32 addr1, addr2, hash; + struct ipv6hdr *ip6; + u8 nexthdr; + int frag = 0, ip6hdrlvl = 0; /* Header level */ + struct ipv6_opt_hdr _hdr, *hp; + union { + u32 v32; + u16 v16[2]; + } ports; + + ports.v32 = 0; + nhoff = skb_network_offset(skb); + +hdr_new: + /* Get header info */ + ip6 = (struct ipv6hdr *) (skb->data + nhoff); + nexthdr = ip6->nexthdr; + hdrlen = sizeof(struct ipv6hdr); + hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr), &_hdr); + + while (nexthdr) { + switch (nexthdr) { + case IPPROTO_ICMPV6: + /* ICMP Error then move ptr to inner header */ + if (get_inner6_hdr(skb, &nhoff, hdrlen)) { + ip6hdrlvl++; + if (!pskb_may_pull(skb, sizeof(_hdr) + nhoff)) + return XT_CONTINUE; + goto hdr_new; + } + nhoff += hdrlen; + goto hdr_rdy; + + case NEXTHDR_FRAGMENT: + if (!ip6hdrlvl) /* Do not use ports if fragmented */ + frag = 1; + break; + + /* End of hdr traversing cont. with ports and hash calc. 
*/ + case NEXTHDR_IPV6: /* Do not process tunnels */ + case NEXTHDR_TCP: + case NEXTHDR_UDP: + case NEXTHDR_ESP: + case NEXTHDR_AUTH: + case NEXTHDR_SCTP: + case NEXTHDR_NONE: /* Last hdr of something unknown */ + nhoff += hdrlen; + goto hdr_rdy; + default: + return XT_CONTINUE; + } + if (!hp) + return XT_CONTINUE; + nhoff += hdrlen; /* eat current header */ + nexthdr = hp->nexthdr; /* Next header */ + hdrlen = ipv6_optlen(hp); + hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr), + &_hdr); + + if (!pskb_may_pull(skb, nhoff)) + return XT_CONTINUE; + } +hdr_rdy: + + addr1 = (__force u32) ip6->saddr.s6_addr32[3]; + addr2 = (__force u32) ip6->daddr.s6_addr32[3]; + poff = proto_ports_offset(nexthdr); + nhoff += poff; + if (frag || poff < 0 || !pskb_may_pull(skb, nhoff + 4)) + goto no6ports; + + ports.v32 = * (__force u32 *) (skb->data + nhoff); + if (nexthdr == IPPROTO_ESP || nexthdr == IPPROTO_AH) { + ports.v32 = (ports.v32 & info->spimask) | info->spiset; + } else { + ports.v32 = (ports.v32 & info->pmask.v32) | + info->pset.v32; + /* get a consistent hash (same value on both flow directions) */ + if (ports.v16[1] < ports.v16[0]) + swap(ports.v16[0], ports.v16[1]); + } + +no6ports: + nexthdr &= info->prmask; + /* get a consistent hash (same value on both flow directions) */ + if (addr2 < addr1) + swap(addr1, addr2); + hash = jhash_3words(addr1, addr2, ports.v32, info->hashrnd) ^ nexthdr; + if (info->hmod) + skb->mark = (hash % info->hmod) + info->hoffs; + + return XT_CONTINUE; +} +#endif + +/* + * Calc hash value, special case is taken on icmp and fragmented messages + * i.e. fragmented messages don't use ports. 
+ */ +unsigned int hmark_v4(struct sk_buff *skb, const struct xt_action_param *par) +{ + struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo; + int nhoff, poff, frag = 0; + struct iphdr *ip; + u8 ip_proto; + u32 addr1, addr2, hash; + u16 snatport = 0, dnatport = 0; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + union { + u32 v32; + u16 v16[2]; + } ports; + + nhoff = skb_network_offset(skb); + ports.v32 = 0; + + ip = (struct iphdr *) (skb->data + nhoff); + if (ip->protocol == IPPROTO_ICMP) { + /* calc hash on inner header if right type */ + nhoff = get_inner_hdr(skb, ip->ihl * 4, nhoff); + ip = (struct iphdr *) (skb->data + nhoff); + } + + ip_proto = ip->protocol; + if (ip->frag_off & htons(IP_MF | IP_OFFSET)) + frag = 1; + + addr1 = (__force u32) ip->saddr & info->smask; + addr2 = (__force u32) ip->daddr & info->dmask; + + if (ct && test_bit(IP_CT_IS_REPLY, &ct->status)) { + struct nf_conntrack_tuple *otuple; + + otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + /* On the "return flow", to get the original address + * i,e, replace the source address. */ + if ((ct->status & IPS_DST_NAT) && + (info->flags & XT_HMARK_USE_DNAT)) { + addr1 = (__force u32) otuple->dst.u3.in.s_addr; + dnatport = otuple->dst.u.udp.port; + } + /* On the "return flow", to get the original address + * i,e, replace the destination address. 
*/ + if ((ct->status & IPS_SRC_NAT) && + (info->flags & XT_HMARK_USE_SNAT)) { + addr2 = (__force u32) otuple->src.u3.in.s_addr; + snatport = otuple->src.u.udp.port; + } + } + + poff = proto_ports_offset(ip_proto); + nhoff += ip->ihl * 4 + poff; + if (frag || poff < 0 || !pskb_may_pull(skb, nhoff + 4)) + goto noports; + + ports.v32 = * (__force u32 *) (skb->data + nhoff); + if (ip_proto == IPPROTO_ESP || ip_proto == IPPROTO_AH) { + ports.v32 = (ports.v32 & info->spimask) | info->spiset; + } else { + if (snatport) /* Replace nat'ed port(s) */ + ports.v16[1] = snatport; + if (dnatport) + ports.v16[0] = dnatport; + ports.v32 = (ports.v32 & info->pmask.v32) | + info->pset.v32; + if (ports.v16[1] < ports.v16[0]) + swap(ports.v16[0], ports.v16[1]); + } + +noports: + ip_proto &= info->prmask; + /* get a consistent hash (same value on both flow directions) */ + if (addr2 < addr1) + swap(addr1, addr2); + + hash = jhash_3words(addr1, addr2, ports.v32, info->hashrnd) ^ ip_proto; + if (info->hmod) + skb->mark = (hash % info->hmod) + info->hoffs; + return XT_CONTINUE; +} + +static struct xt_target hmark_tg_reg[] __read_mostly = { + { + .name = "HMARK", + .revision = 0, + .family = NFPROTO_IPV4, + .target = hmark_v4, + .targetsize = sizeof(struct xt_hmark_info), + .me = THIS_MODULE, + }, +#ifdef WITH_IPV6 + { + .name = "HMARK", + .revision = 0, + .family = NFPROTO_IPV6, + .target = hmark_v6, + .targetsize = sizeof(struct xt_hmark_info), + .me = THIS_MODULE, + }, +#endif +}; + +static int __init hmark_mt_init(void) +{ + int ret; + + ret = xt_register_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg)); + if (ret < 0) + return ret; + return 0; +} + +static void __exit hmark_mt_exit(void) +{ + xt_unregister_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg)); +} + +module_init(hmark_mt_init); +module_exit(hmark_mt_exit);