Message ID | 3900291.2831326843194846.JavaMail.root@5-MeO-DMT.ynet.sk |
---|---|
State | Rejected, archived |
Delegated to: | David Miller |
Headers | show |
2012/1/18 Stefan Gula <steweg@ynet.sk>: > From: Stefan Gula <steweg@gmail.com> > > This patch is an extension for current Ethernet over GRE > implementation, which allows user to create virtual bridge (multipoint > VPN) and forward traffic based on Ethernet MAC address information in > it. It simulates the Bridge behavior learning mechanism, but instead > of learning port ID from which given MAC address comes, it learns IP > address of peer which encapsulated given packet. Multicast, Broadcast > and unknown-multicast traffic is send over network as multicast > encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be > represented as one single virtual switch on logical level and be also > represented as one multicast IPv4 address on network level. > > Signed-off-by: Stefan Gula <steweg@gmail.com> > > --- > > code was merged with Eric Dumazet proposal (all except the reordering of orig_source as that needed to be previous value), tested and fixed with additional lines in ipgre_tap_netdev_ops struct, orig_source line was moved before pskb_may_pull > > diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/include/net/ipip.h linux-3.2.1-my/include/net/ipip.h > --- linux-3.2.1-orig/include/net/ipip.h 2012-01-12 20:42:45.000000000 +0100 > +++ linux-3.2.1-my/include/net/ipip.h 2012-01-16 11:17:01.000000000 +0100 > @@ -27,6 +27,14 @@ struct ip_tunnel { > __u32 o_seqno; /* The last output seqno */ > int hlen; /* Precalculated GRE header length */ > int mlink; > +#ifdef CONFIG_NET_IPGRE_BRIDGE > +#define GRETAP_BR_HASH_BITS 8 > +#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS) > + struct hlist_head hash[GRETAP_BR_HASH_SIZE]; > + spinlock_t hash_lock; > + unsigned long ageing_time; > + struct timer_list gc_timer; > +#endif > > struct ip_tunnel_parm parms; > > diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/Kconfig linux-3.2.1-my/net/ipv4/Kconfig > --- linux-3.2.1-orig/net/ipv4/Kconfig 2012-01-12 
20:42:45.000000000 +0100 > +++ linux-3.2.1-my/net/ipv4/Kconfig 2012-01-16 12:37:00.000000000 +0100 > @@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST > Network), but can be distributed all over the Internet. If you want > to do that, say Y here and to "IP multicast routing" below. > > +config NET_IPGRE_BRIDGE > + bool "IP: Ethernet over multipoint GRE over IP" > + depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST > + help > + Allows you to use multipoint GRE VPN as virtual switch and interconnect > + several L2 endpoints over L3 routed infrastructure. It is useful for > + creating multipoint L2 VPNs which can be later used inside bridge > + interfaces If you want to use. GRE multipoint L2 VPN feature say Y. > + > config IP_MROUTE > bool "IP: multicast routing" > depends on IP_MULTICAST > diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/ip_gre.c linux-3.2.1-my/net/ipv4/ip_gre.c > --- linux-3.2.1-orig/net/ipv4/ip_gre.c 2012-01-12 20:42:45.000000000 +0100 > +++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-18 00:33:55.000000000 +0100 > @@ -52,6 +52,11 @@ > #include <net/ip6_route.h> > #endif > > +#ifdef CONFIG_NET_IPGRE_BRIDGE > +#include <linux/jhash.h> > +#include <asm/unaligned.h> > +#endif > + > /* > Problems & solutions > -------------------- > @@ -134,6 +139,172 @@ struct ipgre_net { > struct net_device *fb_tunnel_dev; > }; > > +#ifdef CONFIG_NET_IPGRE_BRIDGE > + /* > + * This part of code includes codes to enable L2 ethernet > + * switch virtualization over IP routed infrastructure with > + * utilization of multicast capable endpoint using Ethernet > + * over GRE > + * > + * Author: Stefan Gula > + * Signed-off-by: Stefan Gula <steweg@gmail.com> > + */ > +struct ipgre_tap_bridge_entry { > + struct hlist_node hlist; > + __be32 raddr; > + unsigned char addr[ETH_ALEN]; > + unsigned long updated; > + struct rcu_head rcu; > +}; > + > +static u32 ipgre_salt __read_mostly; > + > +static inline int ipgre_tap_bridge_hash(const unsigned 
char *mac) > +{ > + u32 key = get_unaligned((u32 *)(mac + 2)); > + > + return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1); > +} > + > +static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel, > + const struct ipgre_tap_bridge_entry *entry) > +{ > + return time_before_eq(entry->updated + tunnel->ageing_time, > + jiffies); > +} > + > +static inline void ipgre_tap_bridge_delete(struct ipgre_tap_bridge_entry *entry) > +{ > + hlist_del_rcu(&entry->hlist); > + kfree_rcu(entry, rcu); > +} > + > +static void ipgre_tap_bridge_cleanup(unsigned long _data) > +{ > + struct ip_tunnel *tunnel = (struct ip_tunnel *)_data; > + unsigned long delay = tunnel->ageing_time; > + unsigned long next_timer = jiffies + tunnel->ageing_time; > + int i; > + > + spin_lock(&tunnel->hash_lock); > + for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) { > + struct ipgre_tap_bridge_entry *entry; > + struct hlist_node *h, *n; > + > + hlist_for_each_entry_safe(entry, h, n, > + &tunnel->hash[i], hlist) > + { > + unsigned long this_timer; > + this_timer = entry->updated + delay; > + if (time_before_eq(this_timer, jiffies)) > + ipgre_tap_bridge_delete(entry); > + else if (time_before(this_timer, next_timer)) > + next_timer = this_timer; > + } > + } > + spin_unlock(&tunnel->hash_lock); > + mod_timer(&tunnel->gc_timer, round_jiffies_up(next_timer)); > +} > + > +static void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel) > +{ > + int i; > + > + spin_lock_bh(&tunnel->hash_lock); > + for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) { > + struct ipgre_tap_bridge_entry *entry; > + struct hlist_node *h, *n; > + > + hlist_for_each_entry_safe(entry, h, n, > + &tunnel->hash[i], hlist) > + { > + ipgre_tap_bridge_delete(entry); > + } > + } > + spin_unlock_bh(&tunnel->hash_lock); > +} > + > +static struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get( > + struct ip_tunnel *tunnel, const unsigned char *addr) > +{ > + struct hlist_node *h; > + struct ipgre_tap_bridge_entry *entry; > + > + 
hlist_for_each_entry_rcu(entry, h, > + &tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist) { > + if (!compare_ether_addr(entry->addr, addr)) { > + if (unlikely(ipgre_tap_bridge_has_expired(tunnel, > + entry))) > + break; > + return entry; > + } > + } > + > + return NULL; > +} > + > +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find( > + struct hlist_head *head, > + const unsigned char *addr) > +{ > + struct hlist_node *h; > + struct ipgre_tap_bridge_entry *entry; > + > + hlist_for_each_entry(entry, h, head, hlist) { > + if (!compare_ether_addr(entry->addr, addr)) > + return entry; > + } > + return NULL; > +} > + > + > +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find_rcu( > + struct hlist_head *head, > + const unsigned char *addr) > +{ > + struct hlist_node *h; > + struct ipgre_tap_bridge_entry *entry; > + > + hlist_for_each_entry_rcu(entry, h, head, hlist) { > + if (!compare_ether_addr(entry->addr, addr)) > + return entry; > + } > + return NULL; > +} > + > +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_create( > + struct hlist_head *head, > + __be32 source, > + const unsigned char *addr) > +{ > + struct ipgre_tap_bridge_entry *entry; > + > + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); > + if (entry) { > + memcpy(entry->addr, addr, ETH_ALEN); > + entry->raddr = source; > + entry->updated = jiffies; > + hlist_add_head_rcu(&entry->hlist, head); > + } > + return entry; > +} > + > +static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel, > + const unsigned char *addr) > +{ > + __be32 raddr = 0; > + struct ipgre_tap_bridge_entry *entry; > + > + rcu_read_lock(); > + entry = __ipgre_tap_bridge_get(tunnel, addr); > + if (entry) > + raddr = entry->raddr; > + rcu_read_unlock(); > + > + return raddr; > +} > + > +#endif > /* Tunnel hash table */ > > /* > @@ -562,6 +733,12 @@ static int ipgre_rcv(struct sk_buff *skb > struct ip_tunnel *tunnel; > int offset = 4; > __be16 gre_proto; > +#ifdef CONFIG_NET_IPGRE_BRIDGE > + __be32 orig_source; 
> + struct hlist_head *head; > + struct ipgre_tap_bridge_entry *entry; > + const struct ethhdr *tethhdr; > +#endif > > if (!pskb_may_pull(skb, 16)) > goto drop_nolock; > @@ -654,6 +831,9 @@ static int ipgre_rcv(struct sk_buff *skb > > /* Warning: All skb pointers will be invalidated! */ > if (tunnel->dev->type == ARPHRD_ETHER) { > +#ifdef CONFIG_NET_IPGRE_BRIDGE > + orig_source = iph->saddr; > +#endif > if (!pskb_may_pull(skb, ETH_HLEN)) { > tunnel->dev->stats.rx_length_errors++; > tunnel->dev->stats.rx_errors++; > @@ -663,6 +843,32 @@ static int ipgre_rcv(struct sk_buff *skb > iph = ip_hdr(skb); > skb->protocol = eth_type_trans(skb, tunnel->dev); > skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); > +#ifdef CONFIG_NET_IPGRE_BRIDGE > + if (ipv4_is_multicast(tunnel->parms.iph.daddr)) { > + tethhdr = eth_hdr(skb); > + if (!is_multicast_ether_addr( > + tethhdr->h_source)) { > + head = &tunnel->hash[ > + ipgre_tap_bridge_hash( > + tethhdr->h_source)]; > + entry = ipgre_tap_bridge_find_rcu(head, > + tethhdr->h_source); > + if (likely(entry)) { > + entry->raddr = orig_source; > + entry->updated = jiffies; > + } else { > + spin_lock(&tunnel->hash_lock); > + if (!ipgre_tap_bridge_find(head, > + tethhdr->h_source)) > + ipgre_tap_bridge_create( > + head, > + orig_source, > + tethhdr->h_source); > + spin_unlock(&tunnel->hash_lock); > + } > + } > + } > +#endif > } > > tstats = this_cpu_ptr(tunnel->dev->tstats); > @@ -702,7 +908,7 @@ static netdev_tx_t ipgre_tunnel_xmit(str > struct iphdr *iph; /* Our new IP header */ > unsigned int max_headroom; /* The extra header space needed */ > int gre_hlen; > - __be32 dst; > + __be32 dst = 0; > int mtu; > > if (dev->type == ARPHRD_ETHER) > @@ -716,7 +922,15 @@ static netdev_tx_t ipgre_tunnel_xmit(str > tiph = &tunnel->parms.iph; > } > > - if ((dst = tiph->daddr) == 0) { > +#ifdef CONFIG_NET_IPGRE_BRIDGE > + if ((dev->type == ARPHRD_ETHER) && > + ipv4_is_multicast(tunnel->parms.iph.daddr)) > + dst = ipgre_tap_bridge_get_raddr(tunnel, > + 
((struct ethhdr *)skb->data)->h_dest); > +#endif > + if (dst == 0) > + dst = tiph->daddr; > + if (dst == 0) { > /* NBMA tunnel */ > > if (skb_dst(skb) == NULL) { > @@ -1209,6 +1423,16 @@ static int ipgre_open(struct net_device > return -EADDRNOTAVAIL; > t->mlink = dev->ifindex; > ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); > +#ifdef CONFIG_NET_IPGRE_BRIDGE > + if (t->dev->type == ARPHRD_ETHER) { > + INIT_HLIST_HEAD(t->hash); > + spin_lock_init(&t->hash_lock); > + t->ageing_time = 300 * HZ; > + setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup, > + (unsigned long) t); > + mod_timer(&t->gc_timer, jiffies + t->ageing_time); > + } > +#endif > } > return 0; > } > @@ -1219,6 +1443,12 @@ static int ipgre_close(struct net_device > > if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { > struct in_device *in_dev; > +#ifdef CONFIG_NET_IPGRE_BRIDGE > + if (t->dev->type == ARPHRD_ETHER) { > + ipgre_tap_bridge_flush(t); > + del_timer_sync(&t->gc_timer); > + } > +#endif > in_dev = inetdev_by_index(dev_net(dev), t->mlink); > if (in_dev) > ip_mc_dec_group(in_dev, t->parms.iph.daddr); > @@ -1488,6 +1718,10 @@ static int ipgre_tap_init(struct net_dev > static const struct net_device_ops ipgre_tap_netdev_ops = { > .ndo_init = ipgre_tap_init, > .ndo_uninit = ipgre_tunnel_uninit, > +#ifdef CONFIG_NET_IPGRE_BRIDGE > + .ndo_open = ipgre_open, > + .ndo_stop = ipgre_close, > +#endif > .ndo_start_xmit = ipgre_tunnel_xmit, > .ndo_set_mac_address = eth_mac_addr, > .ndo_validate_addr = eth_validate_addr, > @@ -1705,6 +1939,9 @@ static int __init ipgre_init(void) > > printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); > > +#ifdef CONFIG_NET_IPGRE_BRIDGE > + get_random_bytes(&ipgre_salt, sizeof(ipgre_salt)); > +#endif > err = register_pernet_device(&ipgre_net_ops); > if (err < 0) > return err; is there anything else needed from my side to get this code into the kernel or should I only wait for the maintainers to check it? 
-- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le lundi 23 janvier 2012 à 14:12 +0100, Štefan Gula a écrit : > is there anything else needed from my side to get this code into the > kernel or should I only wait for the maintainers to check it? net-next is/was not yet opened, you shall wait for the good time frame. Dont copy/paste a big message only to add one sentence at its end, its really a waste of time for many of us. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Eric Dumazet <eric.dumazet@gmail.com> Date: Mon, 23 Jan 2012 15:13:19 +0100 > Le lundi 23 janvier 2012 à 14:12 +0100, Štefan Gula a écrit : > >> is there anything else needed from my side to get this code into the >> kernel or should I only wait for the maintainers to check it? > > net-next is/was not yet opened, you shall wait for the good time frame. > > Dont copy/paste a big message only to add one sentence at its end, its > really a waste of time for many of us. Also you were told of other ways to achieve the things you want to do, and I haven't seen any reasonable response that supports still adding this new code. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
2012/1/23 David Miller <davem@davemloft.net>: > From: Eric Dumazet <eric.dumazet@gmail.com> > Date: Mon, 23 Jan 2012 15:13:19 +0100 > >> Le lundi 23 janvier 2012 à 14:12 +0100, Štefan Gula a écrit : >> >>> is there anything else needed from my side to get this code into the >>> kernel or should I only wait for the maintainers to check it? >> >> net-next is/was not yet opened, you shall wait for the good time frame. >> >> Dont copy/paste a big message only to add one sentence at its end, its >> really a waste of time for many of us. > > Also you were told of other ways to achieve the things you want to do, > and I haven't seen any reasonable response that supports still adding > this new code. I am sorry about the mistakes I did (still new to kernel submitting processes). I believe that I also explain to everybody who says something similar that it is not entirely true and probably never will be as none of the current SW allows you to create L2 Ethernet Multipoint VPN solution in such simple way as almost all requires to run special kind of software in user-space to provide control-plane for such feature to work. This software must be able to run on various platforms/architectures which makes it sometimes problematic. My solution doesn't need this at all and extends the capability of gretap interfaces in proper manner to be aligned with other forwarding engines of gretap interfaces. I already have at least one successful test done in other environment other than my own. So I assume some people like this approach better than other solutions. For them and also for myself, I believe it should be added. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi, I have conducted testing on this patch series and have found it to be stable. It applies cleanly and doesn't introduce any noticeable performance impact over standard gretap interfaces. The reason why this patch is useful is that it stands to be the only true multipoint L2 VPN with a kernel space forwarding plane. Almost all other VPN solutions use the TAP/TUN module - resulting in barely 300-400mbits of performance even on powerful cores. Many of these solutions implemented in userspace still don't implement MAC learning or any method of tunneling traffic directly to endpoints rather than through some central server. I measured this patch (and gretap in general) peaking at 4.7gbits on my test hardware (reasonable i5 cores). The iperf benchmark results are at the end of this email. The merits of the solution are as follows: - No userspace utilities required (other than ip) - Stateless (gre) tunnels don't cause issues like TCP in TCP and are well supported in hardware (unlike custom UDP encapsulation) - Forwarding table automatically updated via multicast learning - If included the Linux kernel is fully capable of being a multisite Ethernet bridge with learning and unicast tunneling. 
In contrast to other software being able to implement this solution: - OpenVSwitch currently doesn't have NVGRE or VXLAN support (I'm working on getting VXLAN support stable in the 1.4 branch however) - as a subpoint you can implement this using an OpenFlow controller and OVS gre interfaces but it's well outside the effort scope this patch is in - Userspace solutions like OpenVPN that use the TUN interface are very inefficient for high speed tunneling use and again are hard to configure in multipoint mode without resulting in a star topology (very inefficient for high b/w use) - Other kernelspace tunneling isn't capable of forming a loop free arbitrary topology without STP or other protocol to ensure an acyclic forwarding graph Having this in the kernel eliminates the need for much more complicated userspace utilities and the ip command makes for an effective user interface. Lastly here are the promised benchmarks, I can post test rig stats if anyone is interested. ------------------------------------------------------------ Server listening on TCP port 5001 TCP window size: 85.3 KByte (default) ------------------------------------------------------------ [ 4] local 192.168.1.2 port 5001 connected with 192.168.1.1 port 59819 [ ID] Interval Transfer Bandwidth [ 4] 0.0-10.0 sec 5.50 GBytes 4.72 Gbits/sec Kind regards, Joseph. 2012/1/24 Štefan Gula <steweg@ynet.sk>: > 2012/1/23 David Miller <davem@davemloft.net>: >> From: Eric Dumazet <eric.dumazet@gmail.com> >> Date: Mon, 23 Jan 2012 15:13:19 +0100 >> >>> Le lundi 23 janvier 2012 à 14:12 +0100, Štefan Gula a écrit : >>> >>>> is there anything else needed from my side to get this code into the >>>> kernel or should I only wait for the maintainers to check it? >>> >>> net-next is/was not yet opened, you shall wait for the good time frame. >>> >>> Dont copy/paste a big message only to add one sentence at its end, its >>> really a waste of time for many of us. 
>> >> Also you were told of other ways to achieve the things you want to do, >> and I haven't seen any reasonable response that supports still adding >> this new code. > I am sorry about the mistakes I did (still new to kernel submitting > processes). I believe that I also explain to everybody who says > something similar that it is not entirely true and probably never will > be as none of the current SW allows you to create L2 Ethernet > Multipoint VPN solution in such simple way as almost all requires to > run special kind of software in user-space to provide control-plane > for such feature to work. This software must be able to run on various > platforms/architectures which makes it sometimes problematic. My > solution doesn't need this at all and extends the capability of gretap > interfaces in proper manner to be aligned with other forwarding > engines of gretap interfaces. I already have at least one successful > test done in other environment other than my own. So I assume some > people like this approach better than other solutions. For them and > also for myself, I believe it should be added. > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/
From: Joseph Glanville <joseph.glanville@orionvm.com.au> Date: Wed, 25 Jan 2012 14:48:37 +1100 > The reason why this patch is useful is that it stands to be the only > true mulitpoint L2 VPN with a kernel space forwarding plane. So what you're telling me is that I added this huge openvswitch thing essentially for nothing? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le mercredi 18 janvier 2012 à 00:33 +0100, Stefan Gula a écrit : > From: Stefan Gula <steweg@gmail.com> > > This patch is an extension for current Ethernet over GRE > implementation, which allows user to create virtual bridge (multipoint > VPN) and forward traffic based on Ethernet MAC address information in > it. It simulates the Bridge behavior learning mechanism, but instead > of learning port ID from which given MAC address comes, it learns IP > address of peer which encapsulated given packet. Multicast, Broadcast > and unknown-multicast traffic is send over network as multicast > encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be > represented as one single virtual switch on logical level and be also > represented as one multicast IPv4 address on network level. > > Signed-off-by: Stefan Gula <steweg@gmail.com> > > --- > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> I guess remaining performance point is to get a refcount less ip_route_output_gre(), aka ip_route_output_gre_noref() :) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote: > From: Joseph Glanville <joseph.glanville@orionvm.com.au> > Date: Wed, 25 Jan 2012 14:48:37 +1100 > >> The reason why this patch is useful is that it stands to be the only >> true mulitpoint L2 VPN with a kernel space forwarding plane. > > So what you're telling me is that I added this huge openvswitch > thing essentially for nothing? I think it's actually the opposite - Open vSwitch can be used to implement this type of thing as well as for many other use cases. On the other hand, even when implementing a multipoint L2 solution it can be useful to have additional levels of control but you can't do that with this patch because it essentially statically glues together tunneling and bridging. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
2012/1/25 Jesse Gross <jesse@nicira.com>: > On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote: >> From: Joseph Glanville <joseph.glanville@orionvm.com.au> >> Date: Wed, 25 Jan 2012 14:48:37 +1100 >> >>> The reason why this patch is useful is that it stands to be the only >>> true mulitpoint L2 VPN with a kernel space forwarding plane. >> >> So what you're telling me is that I added this huge openvswitch >> thing essentially for nothing? > > I think it's actually the opposite - Open vSwitch can be used to > implement this type of thing as well as for many other use cases. On > the other hand, even when implementing a multipoint L2 solution it can > be useful to have additional levels of control but you can't do that > with this patch because it essentially statically glues together > tunneling and bridging. Yes, those methods are glued together. If you are speaking about additional level of controls. What kind of control is missing? As if I compare it to standard bridge, it is missing: - STP code, which is not relevant here as the topology inside the gretap bridge never reaches loops - it represent more one shared link than box with multiple links from STP point of view. On the other hand STP can be tunneled inside of that tunnel by putting gretap interface as part of some bridge e.g. "brctl addif br0 gretap0". - IGMP/MLD snooping. IGMP/MLD snooping are useful features, but due the encapsulation rules, the only one optimalization can be done and that's if in ONLY ONE gretap enabled nodes requires to join some multicast group inside the gretap and one node has source behind. In that case those frames can be forwarded to only that gretap node. In case of two or more, the encapsulation process will result in using multicast as underlying technology so any one of the gretap nodes will received the frames regardless of state if IGMP/MLD. 
On the other hand such multicast optimizations are missing from the whole GRE tunnels code (PIM/MLD/IGMP snooping, using something like Cisco Multicast Domain Trees/MDT....), so if somebody wants to optimize that feel free, but don't blame this patch for missing those. - did I miss something? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le mardi 24 janvier 2012 à 23:11 -0800, Jesse Gross a écrit : > I think it's actually the opposite - Open vSwitch can be used to > implement this type of thing as well as for many other use cases. On > the other hand, even when implementing a multipoint L2 solution it can > be useful to have additional levels of control but you can't do that > with this patch because it essentially statically glues together > tunneling and bridging. Unless you can provide a working solution in a very short time, this patch is a pragmatic one. Code is not perfect and could be improved (for example using a helper function to keep ipgre_rcv() shorter and reduce indentation level) Stefan, could you move this code out of ipgre_rcv() ? +#ifdef CONFIG_NET_IPGRE_BRIDGE + if (ipv4_is_multicast(tunnel->parms.iph.daddr)) { + tethhdr = eth_hdr(skb); + if (!is_multicast_ether_addr( + tethhdr->h_source)) { + head = &tunnel->hash[ + ipgre_tap_bridge_hash( + tethhdr->h_source)]; + entry = ipgre_tap_bridge_find_rcu(head, + tethhdr->h_source); + if (likely(entry)) { + entry->raddr = orig_source; + entry->updated = jiffies; + } else { + spin_lock(&tunnel->hash_lock); + if (!ipgre_tap_bridge_find(head, + tethhdr->h_source)) + ipgre_tap_bridge_create( + head, + orig_source, + tethhdr->h_source); + spin_unlock(&tunnel->hash_lock); + } + } + } +#endif -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
2012/1/25 Eric Dumazet <eric.dumazet@gmail.com>: > Le mardi 24 janvier 2012 à 23:11 -0800, Jesse Gross a écrit : > >> I think it's actually the opposite - Open vSwitch can be used to >> implement this type of thing as well as for many other use cases. On >> the other hand, even when implementing a multipoint L2 solution it can >> be useful to have additional levels of control but you can't do that >> with this patch because it essentially statically glues together >> tunneling and bridging. > > Unless you can provide a working solution in a very short time, this > patch is a pragmatic one. > > Code is not perfect and could be improved (for example using a helper > function to keep ipgre_rcv() shorter and reduce indentation level) > > Stefan, could you move this code out of ipgre_rcv() ? > > > +#ifdef CONFIG_NET_IPGRE_BRIDGE > + if (ipv4_is_multicast(tunnel->parms.iph.daddr)) { > + tethhdr = eth_hdr(skb); > + if (!is_multicast_ether_addr( > + tethhdr->h_source)) { > + head = &tunnel->hash[ > + ipgre_tap_bridge_hash( > + tethhdr->h_source)]; > + entry = ipgre_tap_bridge_find_rcu(head, > + tethhdr->h_source); > + if (likely(entry)) { > + entry->raddr = orig_source; > + entry->updated = jiffies; > + } else { > + spin_lock(&tunnel->hash_lock); > + if (!ipgre_tap_bridge_find(head, > + tethhdr->h_source)) > + ipgre_tap_bridge_create( > + head, > + orig_source, > + tethhdr->h_source); > + spin_unlock(&tunnel->hash_lock); > + } > + } > + } > +#endif > > > Ok
Hi, Sorry David, I think I might have failed to convey my meaning. What I meant by only true L2 multipoint VPN is that Open vSwitch cannot fulfill this on its own at this time. Without NVGRE or VXLAN support it's required to implement some logic in an OpenFlow controller to achieve this functionality. However once NVGRE/VXLAN is complete advanced users can set up a VXLAN domain to accomplish this. Open vSwitch fulfils a different role to this simplistic VPN. The point of OVS is to be a generic L2 forwarding plane for Software Defined Networking (SDN) solutions, an effective cornerstone of network virtualisation. If anything it's more analogous to Solaris's Crossbow or FreeBSD's VIMAGE in that it's more of a holistic solution to the problem capable of solving arbitrarily difficult forwarding problems with the assistance of userspace logic. As such given a controller that implemented the tunnel endpoint learning, establishment of tunnels between every endpoint and a broadcast tunnel you could build a solution that is on par with this patch if not better. The 2 are fundamentally different usecases however - many people don't need or couldn't care less about SDN and just want a multipoint VPN or L3 encapsulation of L2 between bridges, this is an ideal solution for these simpler usecases. The primary users of Open vSwitch are likely to be large virtual environments with complex network topologies and well defined virtual networking needs. For this class of users the Linux Bridge module + static tunneling isn't going to cut it without some very clever control software - which is why OVS exists. I think there is quite some way to go yet integrating OVS into the kernel and making use of all of its advanced features but I would say it's far from useless and that in the long run its inclusion will be very rewarding. Kind regards, Joseph. 
On 25 January 2012 18:11, Jesse Gross <jesse@nicira.com> wrote: > On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote: >> From: Joseph Glanville <joseph.glanville@orionvm.com.au> >> Date: Wed, 25 Jan 2012 14:48:37 +1100 >> >>> The reason why this patch is useful is that it stands to be the only >>> true mulitpoint L2 VPN with a kernel space forwarding plane. >> >> So what you're telling me is that I added this huge openvswitch >> thing essentially for nothing? > > I think it's actually the opposite - Open vSwitch can be used to > implement this type of thing as well as for many other use cases. On > the other hand, even when implementing a multipoint L2 solution it can > be useful to have additional levels of control but you can't do that > with this patch because it essentially statically glues together > tunneling and bridging.
On Wed, 18 Jan 2012 00:33:14 +0100 (CET) Stefan Gula <steweg@ynet.sk> wrote: > From: Stefan Gula <steweg@gmail.com> > > This patch is an extension for current Ethernet over GRE > implementation, which allows user to create virtual bridge (multipoint > VPN) and forward traffic based on Ethernet MAC address information in > it. It simulates the Bridge behavior learning mechanism, but instead > of learning port ID from which given MAC address comes, it learns IP > address of peer which encapsulated given packet. Multicast, Broadcast > and unknown-multicast traffic is send over network as multicast > encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be > represented as one single virtual switch on logical level and be also > represented as one multicast IPv4 address on network level. > > Signed-off-by: Stefan Gula <steweg@gmail.com> Will this break normal usages of GRE? Compile time options are not acceptable for a standard distribution if it will break it for the other case. It is fine to add the additional data structure elements, but the code in the receive path might get broken? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
2012/1/25 Stephen Hemminger <shemminger@vyatta.com>: > On Wed, 18 Jan 2012 00:33:14 +0100 (CET) > Stefan Gula <steweg@ynet.sk> wrote: > >> From: Stefan Gula <steweg@gmail.com> >> >> This patch is an extension for current Ethernet over GRE >> implementation, which allows user to create virtual bridge (multipoint >> VPN) and forward traffic based on Ethernet MAC address information in >> it. It simulates the Bridge behavior learning mechanism, but instead >> of learning port ID from which given MAC address comes, it learns IP >> address of peer which encapsulated given packet. Multicast, Broadcast >> and unknown-multicast traffic is send over network as multicast >> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be >> represented as one single virtual switch on logical level and be also >> represented as one multicast IPv4 address on network level. >> >> Signed-off-by: Stefan Gula <steweg@gmail.com> > > Will this break normal usages of GRE? > Compile time options are not acceptable for a standard distribution if > it will break it for the other case. It is fine to add the additional data > structure elements, but the code in the receive path might get broken? > > No, if you don't decide to compile with given kernel option.. nothing happens and original GRE code is leaved intact. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Štefan Gula <steweg@ynet.sk> Date: Wed, 25 Jan 2012 22:01:08 +0100 > 2012/1/25 Stephen Hemminger <shemminger@vyatta.com>: >> On Wed, 18 Jan 2012 00:33:14 +0100 (CET) >> Stefan Gula <steweg@ynet.sk> wrote: >> >>> From: Stefan Gula <steweg@gmail.com> >>> >>> This patch is an extension for current Ethernet over GRE >>> implementation, which allows user to create virtual bridge (multipoint >>> VPN) and forward traffic based on Ethernet MAC address information in >>> it. It simulates the Bridge behavior learning mechanism, but instead >>> of learning port ID from which given MAC address comes, it learns IP >>> address of peer which encapsulated given packet. Multicast, Broadcast >>> and unknown-multicast traffic is send over network as multicast >>> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be >>> represented as one single virtual switch on logical level and be also >>> represented as one multicast IPv4 address on network level. >>> >>> Signed-off-by: Stefan Gula <steweg@gmail.com> >> >> Will this break normal usages of GRE? >> Compile time options are not acceptable for a standard distribution if >> it will break it for the other case. It is fine to add the additional data >> structure elements, but the code in the receive path might get broken? >> >> > No, if you don't decide to compile with given kernel option.. nothing > happens and original GRE code is leaved intact. Every distribution is going to want to turn the option on to provide the feature to their users, so this line of reasoning is not acceptable. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Jesse Gross <jesse@nicira.com> Date: Tue, 24 Jan 2012 23:11:06 -0800 > On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote: >> From: Joseph Glanville <joseph.glanville@orionvm.com.au> >> Date: Wed, 25 Jan 2012 14:48:37 +1100 >> >>> The reason why this patch is useful is that it stands to be the only >>> true mulitpoint L2 VPN with a kernel space forwarding plane. >> >> So what you're telling me is that I added this huge openvswitch >> thing essentially for nothing? > > I think it's actually the opposite - Open vSwitch can be used to > implement this type of thing as well as for many other use cases. Then openvswitch is exactly where you should be prototyping and implementing support for this sort of stuff. And only if you cannot obtain reasonable performance using openvswitch should you be even entertaining the notion of a static implementation. That's the whole premise behind putting openvswitch into the tree, so that guys like you can play around in userspace without having to make any kernel changes at all. I am not applying these patches, the more things you say the more I am convinced they are not appropriate. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
2012/1/25 David Miller <davem@davemloft.net>: > From: Štefan Gula <steweg@ynet.sk> > Date: Wed, 25 Jan 2012 22:01:08 +0100 > >> 2012/1/25 Stephen Hemminger <shemminger@vyatta.com>: >>> On Wed, 18 Jan 2012 00:33:14 +0100 (CET) >>> Stefan Gula <steweg@ynet.sk> wrote: >>> >>>> From: Stefan Gula <steweg@gmail.com> >>>> >>>> This patch is an extension for current Ethernet over GRE >>>> implementation, which allows user to create virtual bridge (multipoint >>>> VPN) and forward traffic based on Ethernet MAC address information in >>>> it. It simulates the Bridge behavior learning mechanism, but instead >>>> of learning port ID from which given MAC address comes, it learns IP >>>> address of peer which encapsulated given packet. Multicast, Broadcast >>>> and unknown-multicast traffic is send over network as multicast >>>> encapsulated GRE packet, so one Ethernet multipoint GRE tunnel can be >>>> represented as one single virtual switch on logical level and be also >>>> represented as one multicast IPv4 address on network level. >>>> >>>> Signed-off-by: Stefan Gula <steweg@gmail.com> >>> >>> Will this break normal usages of GRE? >>> Compile time options are not acceptable for a standard distribution if >>> it will break it for the other case. It is fine to add the additional data >>> structure elements, but the code in the receive path might get broken? >>> >>> >> No, if you don't decide to compile with given kernel option.. nothing >> happens and original GRE code is leaved intact. > > Every distribution is going to want to turn the option on to provide > the feature to their users, so this line of reasoning is not > acceptable. 
On the other hand, if you enable this compile option, it modifies the behavior only when you are using this command to create the interface: ip link add vpn0 type gretap local 1.2.3.4 remote 239.192.0.1 key 12345 ttl 10 please notice two major keywords: - gretap - 239.192.0.1 My patch modifies the code behavior only if you used "gretap" and the remote IP must be a valid multicast IP address. In all other combinations it is still using the plain original GRE code. So enabling it doesn't break anything else. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
2012/1/25 David Miller <davem@davemloft.net>: > From: Jesse Gross <jesse@nicira.com> > Date: Tue, 24 Jan 2012 23:11:06 -0800 > >> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote: >>> From: Joseph Glanville <joseph.glanville@orionvm.com.au> >>> Date: Wed, 25 Jan 2012 14:48:37 +1100 >>> >>>> The reason why this patch is useful is that it stands to be the only >>>> true mulitpoint L2 VPN with a kernel space forwarding plane. >>> >>> So what you're telling me is that I added this huge openvswitch >>> thing essentially for nothing? >> >> I think it's actually the opposite - Open vSwitch can be used to >> implement this type of thing as well as for many other use cases. > > Then openvswitch is exactly where you should be prototyping and > implementing support for this sort of stuff. > > And only if you cannot obtain reasonable performance using openvswitch > should you be even entertaining the notion of a static implementation. > > That's the whole premise behind putting openvswitch into the tree, so > that guys like you can play around in userspace without having to make > any kernel changes at all. > > I am not applying these patches, the more things you say the more I am > convinced they are not appropriate. > The performance is one of the most critical thing why I have chosen to build kernel patch in the first place instead of some user-space app. If I used this approach, I would probably end up with patch for OpenVPN project instead in that time. I am not telling that openvswitch is not a good place for prototyping, but I believe that this patch is beyond that border as it successfully run in environment with more 98 linux-based APs, used for 4K+ users, with no issue for more than 2 years. The performance results from Joseph Glanville even adds value to it. So I still don't get the point, why my patch and openvswitch cannot coexists in the kernel together and let user/admin to choose to correct solution for him/her. 
-- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
2012/1/25 Štefan Gula <steweg@ynet.sk>: > 2012/1/25 David Miller <davem@davemloft.net>: >> From: Jesse Gross <jesse@nicira.com> >> Date: Tue, 24 Jan 2012 23:11:06 -0800 >> >>> On Tue, Jan 24, 2012 at 8:02 PM, David Miller <davem@davemloft.net> wrote: >>>> From: Joseph Glanville <joseph.glanville@orionvm.com.au> >>>> Date: Wed, 25 Jan 2012 14:48:37 +1100 >>>> >>>>> The reason why this patch is useful is that it stands to be the only >>>>> true mulitpoint L2 VPN with a kernel space forwarding plane. >>>> >>>> So what you're telling me is that I added this huge openvswitch >>>> thing essentially for nothing? >>> >>> I think it's actually the opposite - Open vSwitch can be used to >>> implement this type of thing as well as for many other use cases. >> >> Then openvswitch is exactly where you should be prototyping and >> implementing support for this sort of stuff. >> >> And only if you cannot obtain reasonable performance using openvswitch >> should you be even entertaining the notion of a static implementation. >> >> That's the whole premise behind putting openvswitch into the tree, so >> that guys like you can play around in userspace without having to make >> any kernel changes at all. >> >> I am not applying these patches, the more things you say the more I am >> convinced they are not appropriate. >> > The performance is one of the most critical thing why I have chosen to > build kernel patch in the first place instead of some user-space app. I am very interested in testing this patch, for the kinds of environments I care about (embedded, cpe, etc) but I won't be able to get around to it for a week or so. I found the overall simplicity of this approach vs the complexity of the alternatives, appealing, and the performance numbers also.
From: Štefan Gula <steweg@ynet.sk> Date: Wed, 25 Jan 2012 23:57:18 +0100 > The performance is one of the most critical thing why I have chosen to > build kernel patch in the first place instead of some user-space app. > If I used this approach, I would probably end up with patch for > OpenVPN project instead in that time. I am not telling that > openvswitch is not a good place for prototyping, but I believe that > this patch is beyond that border as it successfully run in environment > with more 98 linux-based APs, used for 4K+ users, with no issue for > more than 2 years. The performance results from Joseph Glanville even > adds value to it. So I still don't get the point, why my patch and > openvswitch cannot coexists in the kernel together and let user/admin > to choose to correct solution for him/her. You don't even know if openvswitch could provide acceptable levels of performance, because you haven't even tried. I'm not applying your patch. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
2012/1/26 David Miller <davem@davemloft.net>: > From: Štefan Gula <steweg@ynet.sk> > Date: Wed, 25 Jan 2012 23:57:18 +0100 > >> The performance is one of the most critical thing why I have chosen to >> build kernel patch in the first place instead of some user-space app. >> If I used this approach, I would probably end up with patch for >> OpenVPN project instead in that time. I am not telling that >> openvswitch is not a good place for prototyping, but I believe that >> this patch is beyond that border as it successfully run in environment >> with more 98 linux-based APs, used for 4K+ users, with no issue for >> more than 2 years. The performance results from Joseph Glanville even >> adds value to it. So I still don't get the point, why my patch and >> openvswitch cannot coexists in the kernel together and let user/admin >> to choose to correct solution for him/her. > > You don't even know if openvswitch could provide acceptable levels > of performance, because you haven't even tried. > > I'm not applying your patch. Performance of any user-space application is lower than performance of something running purely inside the kernel-space only. So still don't see any valid reason, why it simply cannot coexists as it doesn't breaks any existing functionality at all? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Štefan Gula <steweg@ynet.sk> Date: Thu, 26 Jan 2012 11:57:30 +0100 > 2012/1/26 David Miller <davem@davemloft.net>: >> From: Štefan Gula <steweg@ynet.sk> >> Date: Wed, 25 Jan 2012 23:57:18 +0100 >> >>> The performance is one of the most critical thing why I have chosen to >>> build kernel patch in the first place instead of some user-space app. >>> If I used this approach, I would probably end up with patch for >>> OpenVPN project instead in that time. I am not telling that >>> openvswitch is not a good place for prototyping, but I believe that >>> this patch is beyond that border as it successfully run in environment >>> with more 98 linux-based APs, used for 4K+ users, with no issue for >>> more than 2 years. The performance results from Joseph Glanville even >>> adds value to it. So I still don't get the point, why my patch and >>> openvswitch cannot coexists in the kernel together and let user/admin >>> to choose to correct solution for him/her. >> >> You don't even know if openvswitch could provide acceptable levels >> of performance, because you haven't even tried. >> >> I'm not applying your patch. > Performance of any user-space application is lower than performance of > something running purely inside the kernel-space only. So still don't > see any valid reason, why it simply cannot coexists as it doesn't > breaks any existing functionality at all? The only userspace component is setting up the rules, the actual packet processing occurs in the openvswitch kernel code. Are you really unable to understand this? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
David is correct, the forwarding speed of Open vSwitch is at parity with the Linux Bridging module and its tunneling speed is actually slightly faster than the in kernel GRE implementation. I have tested this across a variety of configurations. Joseph. On 27 January 2012 05:30, David Miller <davem@davemloft.net> wrote: > From: Štefan Gula <steweg@ynet.sk> > Date: Thu, 26 Jan 2012 11:57:30 +0100 > >> 2012/1/26 David Miller <davem@davemloft.net>: >>> From: Štefan Gula <steweg@ynet.sk> >>> Date: Wed, 25 Jan 2012 23:57:18 +0100 >>> >>>> The performance is one of the most critical thing why I have chosen to >>>> build kernel patch in the first place instead of some user-space app. >>>> If I used this approach, I would probably end up with patch for >>>> OpenVPN project instead in that time. I am not telling that >>>> openvswitch is not a good place for prototyping, but I believe that >>>> this patch is beyond that border as it successfully run in environment >>>> with more 98 linux-based APs, used for 4K+ users, with no issue for >>>> more than 2 years. The performance results from Joseph Glanville even >>>> adds value to it. So I still don't get the point, why my patch and >>>> openvswitch cannot coexists in the kernel together and let user/admin >>>> to choose to correct solution for him/her. >>> >>> You don't even know if openvswitch could provide acceptable levels >>> of performance, because you haven't even tried. >>> >>> I'm not applying your patch. >> Performance of any user-space application is lower than performance of >> something running purely inside the kernel-space only. So still don't >> see any valid reason, why it simply cannot coexists as it doesn't >> breaks any existing functionality at all? > > The only userspace component is setting up the rules, the actual > packet processing occurs in the openvswitch kernel code. > > Are you really unable to understand this?
Le vendredi 27 janvier 2012 à 09:24 +1100, Joseph Glanville a écrit : > David is correct, the forwarding speed of Open vSwitch is at parity > with the Linux Bridging module and its tunneling speed is actually > slightly faster than the in kernel GRE implementation. I have tested > this across a variety of configurations. Thanks for this input ! When was this tested exactly, and do you have some "perf tool" reports to provide ? GRE is lockless since one year or so (modulo how the tunnel is set up, as discovered recently) Also please note that Open vSwitch is said to be fast path only on established flows. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Eric, I have been testing on the 3.2.X series of kernels and the 3.0.X and 3.1.X prior to that but most of my results are for 3.2.1. My test hardware includes pairs of hardware (two or more of each) based on the following chips. AMD 6140 Intel X5650 Intel Core i5 2400 (desktop) I conducted throughput and PPS benchmarks using iperf and pktgen. All tests were performed over a real IP network (IP over Infiniband) that can operate at a much greater speed than the software is capable of forwarding at. If the list is interested I will re-run and post all of my benchmarks or at least send them to you personally. Unfortunately I didn't store the benchmarks (much to my own stupidity) and I repurposed the hardware for other things. Yes I was going to mention this in my last email but deleted it on lack of relevance at the time. Under pathological load OVS suffers in benchmarks, continual establishment of new flows is really not good for it - I haven't observed this personally though. It does however worry me that this could be used as a viable DoS.. I don't really know what could be done to mitigate this however. That aside its GRE implementation using loopback (internal) interfaces performs very well and is like I said on par with Linux Bridge + GRE. Are there any specific things you would like to see? I'm not a networking guru and would welcome any assistance you could provide on improving my test methodology. Someone suggested netperf to me the other day, I intend on running some benchmarks with it next week. Joseph. On 27 January 2012 16:59, Eric Dumazet <eric.dumazet@gmail.com> wrote: > Le vendredi 27 janvier 2012 à 09:24 +1100, Joseph Glanville a écrit : >> David is correct, the forwarding speed of Open vSwitch is at parity >> with the Linux Bridging module and its tunneling speed is actually >> slightly faster than the in kernel GRE implementation. I have tested >> this across a variety of configurations. > > Thanks for this input ! 
When was this tested exactly, and do you have > some "perf tool" reports to provide ? > > GRE is lockless since one year or so (modulo how is setup the tunnel as > discovered recently) > > Also please note that openvSwitch is said to be fast path only on > established flows. > > >
On Thu, Jan 26, 2012 at 9:59 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > Le vendredi 27 janvier 2012 à 09:24 +1100, Joseph Glanville a écrit : >> David is correct, the forwarding speed of Open vSwitch is at parity >> with the Linux Bridging module and its tunneling speed is actually >> slightly faster than the in kernel GRE implementation. I have tested >> this across a variety of configurations. > > Thanks for this input ! When was this tested exactly, and do you have > some "perf tool" reports to provide ? > > GRE is lockless since one year or so (modulo how is setup the tunnel as > discovered recently) The out-of-tree OVS GRE stack (which is separate for practical and historical reasons but I would like to combine with the in-kernel one in the future) uses a few dirty tricks to help performance. I don't know the parameters that Joseph used to test but one that he might be benefiting from here is where we cache the full set of headers to be pushed (GRE/IP/Ethernet) as well any routing/switching decisions. This turns a GRE encapsulation into a simple memcpy if there is enough headroom. It's not something that I would want to propose for more general usage but I think that GSO/GRO for GRE would get at least some of that benefit and would also make things easier when NICs with support for encapsulated offloads start showing up. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jan 27, 2012 at 2:54 AM, Joseph Glanville <joseph.glanville@orionvm.com.au> wrote: > Under pathological load OVS suffers in benchmarks, continual > establishment of new flows is really not good for it - I haven't > observed this personally though. > It does however worry me that this could be used as a viable DoS.. I > don't really know what could be done to mitigate this however. OVS allows each input port to have a separate genl socket associated with it for the purpose of sending flow misses to userspace. Currently userspace uses this to round-robin around these sockets when handling flow setup requests in order to prevent one port from DoSing the others. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/include/net/ipip.h linux-3.2.1-my/include/net/ipip.h --- linux-3.2.1-orig/include/net/ipip.h 2012-01-12 20:42:45.000000000 +0100 +++ linux-3.2.1-my/include/net/ipip.h 2012-01-16 11:17:01.000000000 +0100 @@ -27,6 +27,14 @@ struct ip_tunnel { __u32 o_seqno; /* The last output seqno */ int hlen; /* Precalculated GRE header length */ int mlink; +#ifdef CONFIG_NET_IPGRE_BRIDGE +#define GRETAP_BR_HASH_BITS 8 +#define GRETAP_BR_HASH_SIZE (1 << GRETAP_BR_HASH_BITS) + struct hlist_head hash[GRETAP_BR_HASH_SIZE]; + spinlock_t hash_lock; + unsigned long ageing_time; + struct timer_list gc_timer; +#endif struct ip_tunnel_parm parms; diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/Kconfig linux-3.2.1-my/net/ipv4/Kconfig --- linux-3.2.1-orig/net/ipv4/Kconfig 2012-01-12 20:42:45.000000000 +0100 +++ linux-3.2.1-my/net/ipv4/Kconfig 2012-01-16 12:37:00.000000000 +0100 @@ -211,6 +211,15 @@ config NET_IPGRE_BROADCAST Network), but can be distributed all over the Internet. If you want to do that, say Y here and to "IP multicast routing" below. +config NET_IPGRE_BRIDGE + bool "IP: Ethernet over multipoint GRE over IP" + depends on IP_MULTICAST && NET_IPGRE && NET_IPGRE_BROADCAST + help + Allows you to use multipoint GRE VPN as virtual switch and interconnect + several L2 endpoints over L3 routed infrastructure. It is useful for + creating multipoint L2 VPNs which can be later used inside bridge + interfaces If you want to use. GRE multipoint L2 VPN feature say Y. 
+ config IP_MROUTE bool "IP: multicast routing" depends on IP_MULTICAST diff -uprN -X linux-3.2.1-orig/Documentation/dontdiff linux-3.2.1-orig/net/ipv4/ip_gre.c linux-3.2.1-my/net/ipv4/ip_gre.c --- linux-3.2.1-orig/net/ipv4/ip_gre.c 2012-01-12 20:42:45.000000000 +0100 +++ linux-3.2.1-my/net/ipv4/ip_gre.c 2012-01-18 00:33:55.000000000 +0100 @@ -52,6 +52,11 @@ #include <net/ip6_route.h> #endif +#ifdef CONFIG_NET_IPGRE_BRIDGE +#include <linux/jhash.h> +#include <asm/unaligned.h> +#endif + /* Problems & solutions -------------------- @@ -134,6 +139,172 @@ struct ipgre_net { struct net_device *fb_tunnel_dev; }; +#ifdef CONFIG_NET_IPGRE_BRIDGE + /* + * This part of code includes codes to enable L2 ethernet + * switch virtualization over IP routed infrastructure with + * utilization of multicast capable endpoint using Ethernet + * over GRE + * + * Author: Stefan Gula + * Signed-off-by: Stefan Gula <steweg@gmail.com> + */ +struct ipgre_tap_bridge_entry { + struct hlist_node hlist; + __be32 raddr; + unsigned char addr[ETH_ALEN]; + unsigned long updated; + struct rcu_head rcu; +}; + +static u32 ipgre_salt __read_mostly; + +static inline int ipgre_tap_bridge_hash(const unsigned char *mac) +{ + u32 key = get_unaligned((u32 *)(mac + 2)); + + return jhash_1word(key, ipgre_salt) & (GRETAP_BR_HASH_SIZE - 1); +} + +static inline int ipgre_tap_bridge_has_expired(const struct ip_tunnel *tunnel, + const struct ipgre_tap_bridge_entry *entry) +{ + return time_before_eq(entry->updated + tunnel->ageing_time, + jiffies); +} + +static inline void ipgre_tap_bridge_delete(struct ipgre_tap_bridge_entry *entry) +{ + hlist_del_rcu(&entry->hlist); + kfree_rcu(entry, rcu); +} + +static void ipgre_tap_bridge_cleanup(unsigned long _data) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel *)_data; + unsigned long delay = tunnel->ageing_time; + unsigned long next_timer = jiffies + tunnel->ageing_time; + int i; + + spin_lock(&tunnel->hash_lock); + for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) { + struct 
ipgre_tap_bridge_entry *entry; + struct hlist_node *h, *n; + + hlist_for_each_entry_safe(entry, h, n, + &tunnel->hash[i], hlist) + { + unsigned long this_timer; + this_timer = entry->updated + delay; + if (time_before_eq(this_timer, jiffies)) + ipgre_tap_bridge_delete(entry); + else if (time_before(this_timer, next_timer)) + next_timer = this_timer; + } + } + spin_unlock(&tunnel->hash_lock); + mod_timer(&tunnel->gc_timer, round_jiffies_up(next_timer)); +} + +static void ipgre_tap_bridge_flush(struct ip_tunnel *tunnel) +{ + int i; + + spin_lock_bh(&tunnel->hash_lock); + for (i = 0; i < GRETAP_BR_HASH_SIZE; i++) { + struct ipgre_tap_bridge_entry *entry; + struct hlist_node *h, *n; + + hlist_for_each_entry_safe(entry, h, n, + &tunnel->hash[i], hlist) + { + ipgre_tap_bridge_delete(entry); + } + } + spin_unlock_bh(&tunnel->hash_lock); +} + +static struct ipgre_tap_bridge_entry *__ipgre_tap_bridge_get( + struct ip_tunnel *tunnel, const unsigned char *addr) +{ + struct hlist_node *h; + struct ipgre_tap_bridge_entry *entry; + + hlist_for_each_entry_rcu(entry, h, + &tunnel->hash[ipgre_tap_bridge_hash(addr)], hlist) { + if (!compare_ether_addr(entry->addr, addr)) { + if (unlikely(ipgre_tap_bridge_has_expired(tunnel, + entry))) + break; + return entry; + } + } + + return NULL; +} + +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find( + struct hlist_head *head, + const unsigned char *addr) +{ + struct hlist_node *h; + struct ipgre_tap_bridge_entry *entry; + + hlist_for_each_entry(entry, h, head, hlist) { + if (!compare_ether_addr(entry->addr, addr)) + return entry; + } + return NULL; +} + + +static struct ipgre_tap_bridge_entry *ipgre_tap_bridge_find_rcu( + struct hlist_head *head, + const unsigned char *addr) +{ + struct hlist_node *h; + struct ipgre_tap_bridge_entry *entry; + + hlist_for_each_entry_rcu(entry, h, head, hlist) { + if (!compare_ether_addr(entry->addr, addr)) + return entry; + } + return NULL; +} + +static struct ipgre_tap_bridge_entry 
*ipgre_tap_bridge_create( + struct hlist_head *head, + __be32 source, + const unsigned char *addr) +{ + struct ipgre_tap_bridge_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (entry) { + memcpy(entry->addr, addr, ETH_ALEN); + entry->raddr = source; + entry->updated = jiffies; + hlist_add_head_rcu(&entry->hlist, head); + } + return entry; +} + +static __be32 ipgre_tap_bridge_get_raddr(struct ip_tunnel *tunnel, + const unsigned char *addr) +{ + __be32 raddr = 0; + struct ipgre_tap_bridge_entry *entry; + + rcu_read_lock(); + entry = __ipgre_tap_bridge_get(tunnel, addr); + if (entry) + raddr = entry->raddr; + rcu_read_unlock(); + + return raddr; +} + +#endif /* Tunnel hash table */ /* @@ -562,6 +733,12 @@ static int ipgre_rcv(struct sk_buff *skb struct ip_tunnel *tunnel; int offset = 4; __be16 gre_proto; +#ifdef CONFIG_NET_IPGRE_BRIDGE + __be32 orig_source; + struct hlist_head *head; + struct ipgre_tap_bridge_entry *entry; + const struct ethhdr *tethhdr; +#endif if (!pskb_may_pull(skb, 16)) goto drop_nolock; @@ -654,6 +831,9 @@ static int ipgre_rcv(struct sk_buff *skb /* Warning: All skb pointers will be invalidated! 
*/ if (tunnel->dev->type == ARPHRD_ETHER) { +#ifdef CONFIG_NET_IPGRE_BRIDGE + orig_source = iph->saddr; +#endif if (!pskb_may_pull(skb, ETH_HLEN)) { tunnel->dev->stats.rx_length_errors++; tunnel->dev->stats.rx_errors++; @@ -663,6 +843,32 @@ static int ipgre_rcv(struct sk_buff *skb iph = ip_hdr(skb); skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); +#ifdef CONFIG_NET_IPGRE_BRIDGE + if (ipv4_is_multicast(tunnel->parms.iph.daddr)) { + tethhdr = eth_hdr(skb); + if (!is_multicast_ether_addr( + tethhdr->h_source)) { + head = &tunnel->hash[ + ipgre_tap_bridge_hash( + tethhdr->h_source)]; + entry = ipgre_tap_bridge_find_rcu(head, + tethhdr->h_source); + if (likely(entry)) { + entry->raddr = orig_source; + entry->updated = jiffies; + } else { + spin_lock(&tunnel->hash_lock); + if (!ipgre_tap_bridge_find(head, + tethhdr->h_source)) + ipgre_tap_bridge_create( + head, + orig_source, + tethhdr->h_source); + spin_unlock(&tunnel->hash_lock); + } + } + } +#endif } tstats = this_cpu_ptr(tunnel->dev->tstats); @@ -702,7 +908,7 @@ static netdev_tx_t ipgre_tunnel_xmit(str struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int gre_hlen; - __be32 dst; + __be32 dst = 0; int mtu; if (dev->type == ARPHRD_ETHER) @@ -716,7 +922,15 @@ static netdev_tx_t ipgre_tunnel_xmit(str tiph = &tunnel->parms.iph; } - if ((dst = tiph->daddr) == 0) { +#ifdef CONFIG_NET_IPGRE_BRIDGE + if ((dev->type == ARPHRD_ETHER) && + ipv4_is_multicast(tunnel->parms.iph.daddr)) + dst = ipgre_tap_bridge_get_raddr(tunnel, + ((struct ethhdr *)skb->data)->h_dest); +#endif + if (dst == 0) + dst = tiph->daddr; + if (dst == 0) { /* NBMA tunnel */ if (skb_dst(skb) == NULL) { @@ -1209,6 +1423,16 @@ static int ipgre_open(struct net_device return -EADDRNOTAVAIL; t->mlink = dev->ifindex; ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr); +#ifdef CONFIG_NET_IPGRE_BRIDGE + if (t->dev->type == ARPHRD_ETHER) { + 
INIT_HLIST_HEAD(t->hash); + spin_lock_init(&t->hash_lock); + t->ageing_time = 300 * HZ; + setup_timer(&t->gc_timer, ipgre_tap_bridge_cleanup, + (unsigned long) t); + mod_timer(&t->gc_timer, jiffies + t->ageing_time); + } +#endif } return 0; } @@ -1219,6 +1443,12 @@ static int ipgre_close(struct net_device if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; +#ifdef CONFIG_NET_IPGRE_BRIDGE + if (t->dev->type == ARPHRD_ETHER) { + ipgre_tap_bridge_flush(t); + del_timer_sync(&t->gc_timer); + } +#endif in_dev = inetdev_by_index(dev_net(dev), t->mlink); if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); @@ -1488,6 +1718,10 @@ static int ipgre_tap_init(struct net_dev static const struct net_device_ops ipgre_tap_netdev_ops = { .ndo_init = ipgre_tap_init, .ndo_uninit = ipgre_tunnel_uninit, +#ifdef CONFIG_NET_IPGRE_BRIDGE + .ndo_open = ipgre_open, + .ndo_stop = ipgre_close, +#endif .ndo_start_xmit = ipgre_tunnel_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, @@ -1705,6 +1939,9 @@ static int __init ipgre_init(void) printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); +#ifdef CONFIG_NET_IPGRE_BRIDGE + get_random_bytes(&ipgre_salt, sizeof(ipgre_salt)); +#endif err = register_pernet_device(&ipgre_net_ops); if (err < 0) return err;