[SRU,N,2/4] UBUNTU: SAUCE: fan: add VXLAN implementation

Message ID	20240501124023.683940-3-andrea.righi@canonical.com
State	New
Headers	show Return-Path: <kernel-team-bounces@lists.ubuntu.com> From: Andrea Righi <andrea.righi@canonical.com> To: kernel-team@lists.ubuntu.com Subject: [SRU][N][PATCH 2/4] UBUNTU: SAUCE: fan: add VXLAN implementation Date: Wed, 1 May 2024 14:34:58 +0200 Message-ID: <20240501124023.683940-3-andrea.righi@canonical.com> In-Reply-To: <20240501124023.683940-1-andrea.righi@canonical.com> References: <20240501124023.683940-1-andrea.righi@canonical.com> MIME-Version: 1.0 Precedence: list Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: base64 Errors-To: kernel-team-bounces@lists.ubuntu.com Sender: "kernel-team" <kernel-team-bounces@lists.ubuntu.com>
Series	re-enable Ubuntu FAN in the Noble kernel \| expand [SRU,N,0/4] re-enable Ubuntu FAN in the Noble kernel [SRU,N,1/4] UBUNTU: SAUCE: fan: tunnel multiple mapping mode (v3) [SRU,N,2/4] UBUNTU: SAUCE: fan: add VXLAN implementation [SRU,N,3/4] UBUNTU: SAUCE: fan: Fix NULL pointer dereference [SRU,N,4/4] UBUNTU: SAUCE: fan: support vxlan strict length validation

diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 16106e088c63..f16a4679e5ee 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/udp.h> #include <linux/igmp.h> +#include <linux/inetdevice.h> #include <linux/if_ether.h> #include <linux/ethtool.h> #include <net/arp.h> @@ -71,6 +72,167 @@ static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) ip_tunnel_collect_metadata(); } +static struct ip_fan_map *vxlan_fan_find_map(struct vxlan_dev *vxlan, __be32 daddr) +{ + struct ip_fan_map *fan_map; + + rcu_read_lock(); + list_for_each_entry_rcu(fan_map, &vxlan->fan.fan_maps, list) { + if (fan_map->overlay == + (daddr & inet_make_mask(fan_map->overlay_prefix))) { + rcu_read_unlock(); + return fan_map; + } + } + rcu_read_unlock(); + + return NULL; +} + +static void vxlan_fan_flush_map(struct vxlan_dev *vxlan) +{ + struct ip_fan_map *fan_map; + + list_for_each_entry_rcu(fan_map, &vxlan->fan.fan_maps, list) { + list_del_rcu(&fan_map->list); + kfree_rcu(fan_map, rcu); + } +} + +static int vxlan_fan_del_map(struct vxlan_dev *vxlan, __be32 overlay) +{ + struct ip_fan_map *fan_map; + + fan_map = vxlan_fan_find_map(vxlan, overlay); + if (!fan_map) + return -ENOENT; + + list_del_rcu(&fan_map->list); + kfree_rcu(fan_map, rcu); + + return 0; +} + +static int vxlan_fan_add_map(struct vxlan_dev *vxlan, struct ifla_fan_map *map) +{ + __be32 overlay_mask, underlay_mask; + struct ip_fan_map *fan_map; + + overlay_mask = inet_make_mask(map->overlay_prefix); + underlay_mask = inet_make_mask(map->underlay_prefix); + + netdev_dbg(vxlan->dev, "vfam: map: o %x/%d u %x/%d om %x um %x\n", + map->overlay, map->overlay_prefix, + map->underlay, map->underlay_prefix, + overlay_mask, underlay_mask); + + if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask)) + return -EINVAL; + + if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask)) + return 
-EINVAL; + + /* Special case: overlay 0 and underlay 0: flush all mappings */ + if (!map->overlay && !map->underlay) { + vxlan_fan_flush_map(vxlan); + return 0; + } + + /* Special case: overlay set and underlay 0: clear map for overlay */ + if (!map->underlay) + return vxlan_fan_del_map(vxlan, map->overlay); + + if (vxlan_fan_find_map(vxlan, map->overlay)) + return -EEXIST; + + fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL); + fan_map->underlay = map->underlay; + fan_map->overlay = map->overlay; + fan_map->underlay_prefix = map->underlay_prefix; + fan_map->overlay_mask = ntohl(overlay_mask); + fan_map->overlay_prefix = map->overlay_prefix; + + list_add_tail_rcu(&fan_map->list, &vxlan->fan.fan_maps); + + return 0; +} + +static int vxlan_parse_fan_map(struct nlattr *data[], struct vxlan_dev *vxlan) +{ + struct ifla_fan_map *map; + struct nlattr *attr; + int rem, rv; + + nla_for_each_nested(attr, data[IFLA_IPTUN_FAN_MAP], rem) { + map = nla_data(attr); + rv = vxlan_fan_add_map(vxlan, map); + if (rv) + return rv; + } + + return 0; +} + +static int vxlan_fan_build_rdst(struct vxlan_dev *vxlan, struct sk_buff *skb, + struct vxlan_rdst *fan_rdst) +{ + struct ip_fan_map *f_map; + union vxlan_addr *va; + u32 daddr, underlay; + struct arphdr *arp; + void *arp_ptr; + struct ethhdr *eth; + struct iphdr *iph; + + eth = eth_hdr(skb); + switch (eth->h_proto) { + case htons(ETH_P_IP): + iph = ip_hdr(skb); + if (!iph) + return -EINVAL; + daddr = iph->daddr; + break; + case htons(ETH_P_ARP): + arp = arp_hdr(skb); + if (!arp) + return -EINVAL; + arp_ptr = arp + 1; + netdev_dbg(vxlan->dev, + "vfbr: arp sha %pM sip %pI4 tha %pM tip %pI4\n", + arp_ptr, arp_ptr + skb->dev->addr_len, + arp_ptr + skb->dev->addr_len + 4, + arp_ptr + (skb->dev->addr_len * 2) + 4); + arp_ptr += (skb->dev->addr_len * 2) + 4; + memcpy(&daddr, arp_ptr, 4); + break; + default: + netdev_dbg(vxlan->dev, "vfbr: unknown eth p %x\n", eth->h_proto); + return -EINVAL; + } + + f_map = vxlan_fan_find_map(vxlan, daddr); + 
if (!f_map) + return -EINVAL; + + daddr = ntohl(daddr); + underlay = ntohl(f_map->underlay); + if (!underlay) + return -EINVAL; + + memset(fan_rdst, 0, sizeof(*fan_rdst)); + va = &fan_rdst->remote_ip; + va->sa.sa_family = AF_INET; + fan_rdst->remote_vni = vxlan->default_dst.remote_vni; + va->sin.sin_addr.s_addr = htonl(underlay | + ((daddr & ~f_map->overlay_mask) >> + (32 - f_map->overlay_prefix - + (32 - f_map->underlay_prefix)))); + netdev_dbg(vxlan->dev, "vfbr: daddr %x ul %x dst %x\n", + daddr, underlay, va->sin.sin_addr.s_addr); + + return 0; +} + /* Find VXLAN socket based on network namespace, address family, UDP port, * enabled unshareable flags and socket device binding (see l3mdev with * non-default VRF). @@ -2433,6 +2595,13 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto tx_error; } + if (fan_has_map(&vxlan->fan) && rt->rt_flags & RTCF_LOCAL) { + netdev_dbg(dev, "discard fan to localhost %pI4\n", + &rdst->remote_ip.sin.sin_addr.s_addr); + ip_rt_put(rt); + goto tx_free; + } + if (!info) { /* Bypass encapsulation if the destination is local */ err = encap_bypass_if_local(skb, dev, vxlan, AF_INET, @@ -2569,6 +2738,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_release(ndst); dev->stats.tx_errors++; vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); +tx_free: kfree_skb(skb); } @@ -2716,6 +2886,20 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) rcu_read_unlock(); } + if (fan_has_map(&vxlan->fan)) { + struct vxlan_rdst fan_rdst; + + netdev_dbg(vxlan->dev, "vxlan_xmit p %x d %pM\n", + eth->h_proto, eth->h_dest); + if (vxlan_fan_build_rdst(vxlan, skb, &fan_rdst)) { + dev->stats.tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; + } + vxlan_xmit_one(skb, dev, vni, &fan_rdst, 0); + return NETDEV_TX_OK; + } + eth = eth_hdr(skb); f = vxlan_find_mac(vxlan, eth->h_dest, vni); did_rsc = false; @@ -3325,6 +3509,8 @@ static void vxlan_setup(struct net_device *dev) 
spin_lock_init(&vxlan->hash_lock[h]); INIT_HLIST_HEAD(&vxlan->fdb_head[h]); } + + INIT_LIST_HEAD(&vxlan->fan.fan_maps); } static void vxlan_ether_setup(struct net_device *dev) @@ -4055,6 +4241,12 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], conf->remote_ip.sa.sa_family = AF_INET6; } + if (data[IFLA_VXLAN_FAN_MAP]) { + err = vxlan_parse_fan_map(data, vxlan); + if (err) + return err; + } + if (data[IFLA_VXLAN_LOCAL]) { if (changelink && (conf->saddr.sa.sa_family != AF_INET)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old"); @@ -4440,6 +4632,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(0) + /* IFLA_VXLAN_GPE */ nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */ + nla_total_size(sizeof(struct ip_fan_map) * 256) + 0; } @@ -4486,6 +4679,26 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) } } + if (fan_has_map(&vxlan->fan)) { + struct nlattr *fan_nest; + struct ip_fan_map *fan_map; + + fan_nest = nla_nest_start(skb, IFLA_VXLAN_FAN_MAP); + if (!fan_nest) + goto nla_put_failure; + list_for_each_entry_rcu(fan_map, &vxlan->fan.fan_maps, list) { + struct ifla_fan_map map; + + map.underlay = fan_map->underlay; + map.underlay_prefix = fan_map->underlay_prefix; + map.overlay = fan_map->overlay; + map.overlay_prefix = fan_map->overlay_prefix; + if (nla_put(skb, IFLA_FAN_MAPPING, sizeof(map), &map)) + goto nla_put_failure; + } + nla_nest_end(skb, fan_nest); + } + if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT, !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) || @@ -4826,6 +5039,22 @@ static __net_init int vxlan_init_net(struct net *net) NULL); } +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *vxlan_fan_header; +static unsigned int vxlan_fan_version = 4; + +static struct ctl_table vxlan_fan_sysctls[] = { + { + .procname = "vxlan", + 
.data = &vxlan_fan_version, + .maxlen = sizeof(vxlan_fan_version), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + {}, +}; +#endif /* CONFIG_SYSCTL */ + static void vxlan_destroy_tunnels(struct net *net, struct list_head *head) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); @@ -4903,7 +5132,20 @@ static int __init vxlan_init_module(void) vxlan_vnifilter_init(); +#ifdef CONFIG_SYSCTL + vxlan_fan_header = register_net_sysctl(&init_net, "net/fan", + vxlan_fan_sysctls); + if (!vxlan_fan_header) { + rc = -ENOMEM; + goto sysctl_failed; + } +#endif /* CONFIG_SYSCTL */ + return 0; +#ifdef CONFIG_SYSCTL +sysctl_failed: + rtnl_link_unregister(&vxlan_link_ops); +#endif /* CONFIG_SYSCTL */ out4: unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); out3: @@ -4917,6 +5159,9 @@ late_initcall(vxlan_init_module); static void __exit vxlan_cleanup_module(void) { +#ifdef CONFIG_SYSCTL + unregister_net_sysctl_table(vxlan_fan_header); +#endif /* CONFIG_SYSCTL */ vxlan_vnifilter_uninit(); rtnl_link_unregister(&vxlan_link_ops); unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 98a3d8ad9415..8c53e4179854 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -118,9 +118,18 @@ struct metadata_dst; */ #define FAN_OVERLAY_CNT 256 +struct ip_fan_map { + __be32 underlay; + __be32 overlay; + u16 underlay_prefix; + u16 overlay_prefix; + u32 overlay_mask; + struct list_head list; + struct rcu_head rcu; +}; + struct ip_tunnel_fan { -/* u32 __rcu *map;*/ - u32 map[FAN_OVERLAY_CNT]; + struct list_head fan_maps; }; struct ip_tunnel { @@ -170,6 +179,11 @@ struct ip_tunnel { bool ignore_df; }; +static inline int fan_has_map(const struct ip_tunnel_fan *fan) +{ + return !list_empty(&fan->fan_maps); +} + struct tnl_ptk_info { __be16 flags; __be16 proto; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 33ba6fc151cf..e55d5b1483db 100644 --- a/include/net/vxlan.h +++ 
b/include/net/vxlan.h @@ -294,6 +294,8 @@ struct vxlan_dev { struct net *net; /* netns for packet i/o */ struct vxlan_rdst default_dst; /* default destination */ + struct ip_tunnel_fan fan; + struct timer_list age_timer; spinlock_t hash_lock[FDB_HASH_SIZE]; unsigned int addrcnt; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ab9bcff96e4d..4345ceae5d99 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1378,6 +1378,7 @@ enum { IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ IFLA_VXLAN_LOCALBYPASS, IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ + IFLA_VXLAN_FAN_MAP = 33, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index a862f0c483b7..f1401060d5b5 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -186,8 +186,6 @@ enum { (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT | \ TUNNEL_GTP_OPT) -#define TUNNEL_FAN __cpu_to_be16(0x8000) - enum { IFLA_FAN_UNSPEC, IFLA_FAN_MAPPING, @@ -196,7 +194,7 @@ enum { #define IFLA_FAN_MAX (__IFLA_FAN_MAX - 1) -struct ip_tunnel_fan_map { +struct ifla_fan_map { __be32 underlay; __be32 overlay; __u16 underlay_prefix; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 5faebe94d071..b6ca5742fc46 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1227,11 +1227,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], } EXPORT_SYMBOL_GPL(ip_tunnel_newlink); -static int ip_tunnel_is_fan(struct ip_tunnel *tunnel) -{ - return tunnel->parms.i_flags & TUNNEL_FAN; -} - int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark) { @@ -1241,7 +1236,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); if (dev == 
itn->fb_tunnel_dev) - return ip_tunnel_is_fan(tunnel) ? 0 : -EINVAL; + return fan_has_map(&tunnel->fan) ? 0 : -EINVAL; t = ip_tunnel_find(itn, p, dev->type); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 2fbe16a33bc1..a044da845559 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -102,6 +102,7 @@ #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <linux/inetdevice.h> +#include <linux/rculist.h> #include <net/sock.h> #include <net/ip.h> @@ -268,37 +269,144 @@ static int mplsip_rcv(struct sk_buff *skb) } #endif -static int ipip_tunnel_is_fan(struct ip_tunnel *tunnel) +static struct ip_fan_map *ipip_fan_find_map(struct ip_tunnel *t, __be32 daddr) { - return tunnel->parms.i_flags & TUNNEL_FAN; + struct ip_fan_map *fan_map; + + rcu_read_lock(); + list_for_each_entry_rcu(fan_map, &t->fan.fan_maps, list) { + if (fan_map->overlay == + (daddr & inet_make_mask(fan_map->overlay_prefix))) { + rcu_read_unlock(); + return fan_map; + } + } + rcu_read_unlock(); + + return NULL; } -/* - * Determine fan tunnel endpoint to send packet to, based on the inner IP - * address. For an overlay (inner) address Y.A.B.C, the transformation is - * F.G.A.B, where "F" and "G" are the first two octets of the underlay - * network (the network portion of a /16), "A" and "B" are the low order - * two octets of the underlay network host (the host portion of a /16), - * and "Y" is a configured first octet of the overlay network. +/* Determine fan tunnel endpoint to send packet to, based on the inner IP + * address. + * + * Given a /8 overlay and /16 underlay, for an overlay (inner) address + * Y.A.B.C, the transformation is F.G.A.B, where "F" and "G" are the first + * two octets of the underlay network (the network portion of a /16), "A" + * and "B" are the low order two octets of the underlay network host (the + * host portion of a /16), and "Y" is a configured first octet of the + * overlay network. 
+ * + * E.g., underlay host 10.88.3.4/16 with an overlay of 99.0.0.0/8 would + * host overlay subnet 99.3.4.0/24. An overlay network datagram from + * 99.3.4.5 to 99.6.7.8, would be directed to underlay host 10.88.6.7, + * which hosts overlay network subnet 99.6.7.0/24. This transformation is + * described in detail further below. + * + * Using netmasks for the overlay and underlay other than /8 and /16, as + * shown above, can yield larger (or smaller) overlay subnets, with the + * trade-off of allowing fewer (or more) underlay hosts to participate. + * + * The size of each overlay network subnet is defined by the total of the + * network mask of the overlay plus the size of host portion of the + * underlay network. In the above example, /8 + /16 = /24. + * + * E.g., consider underlay host 10.99.238.5/20 and overlay 99.0.0.0/8. In + * this case, the network portion of the underlay is 10.99.224.0/20, and + * the host portion is 0.0.14.5 (12 bits). To determine the overlay + * network subnet, the 12 bits of host portion are left shifted 12 bits + * (/20 - /8) and ORed with the overlay subnet prefix. This yields an + * overlay subnet of 99.224.80.0/20, composed of 8 bits overlay, followed by + * 12 bits underlay. This yields 12 bits in the overlay network portion, + * allowing for 4094 addresses in each overlay network subnet. The + * trade-off is that fewer hosts may participate in the underlay network, + * as its host address size has shrunk from 16 bits (65534 addresses) in + * the first example to 12 bits (4094 addresses) here. + * + * For fewer hosts per overlay subnet (permitting a larger number of + * underlay hosts to participate), the underlay netmask may be made + * smaller. + * + * E.g., underlay host 10.111.1.2/12 (network 10.96.0.0/12, host portion + * is 0.15.1.2, 20 bits) with an overlay of 33.0.0.0/8 would left shift + * the 20 bits of host by 4 (so that its highest order bit is adjacent to + * the lowest order bit of the /8 overlay).
This yields an overlay subnet + * of 33.240.16.32/28 (8 bits overlay, 20 bits from the host portion of + * the underlay). This provides more addresses for the underlay network + * (approximately 2^20), but each host's segment of the overlay provides + * only 4 bits of addresses (14 usable). + * + * It is also possible to adjust the overlay subnet. + * + * For an overlay of 240.0.0.0/5 and underlay of 10.88.0.0/20, consider + * underlay host 10.88.129.2; the 12 bits of host, 0.0.1.2, are left + * shifted 15 bits (/20 - /5), yielding an overlay network of + * 240.129.0.0/17. An underlay host of 10.88.244.215 would yield an + * overlay network of 242.107.128.0/17. + * + * For an overlay of 100.64.0.0/10 and underlay of 10.224.220.0/24, for + * underlay host 10.224.220.10, the underlay host portion (.10) is left + * shifted 14 bits, yielding an overlay network subnet of 100.66.128.0/18. + * This would permit 254 addresses on the underlay, with each overlay + * segment providing approximately 2^14 - 2 addresses (16382). + * + * For packets being encapsulated, the overlay network destination IP + * address is deconstructed into its overlay and underlay-derived + * portions. The underlay portion (determined by the overlay mask and + * overlay subnet mask) is right shifted according to the size of the + * underlay network mask. This value is then ORed with the network + * portion of the underlay network to produce the underlay network + * destination for the encapsulated datagram. + * + * For example, consider the initial example of underlay 10.88.3.4/16 and + * overlay 99.0.0.0/8, with underlay host 10.88.3.4/16 providing overlay + * subnet 99.3.4.0/24 with specific host 99.3.4.5. A datagram from + * 99.3.4.5 to 99.6.7.8 would first have the underlay host derived portion + * of the address extracted. This is a number of bits equal to the size of + * the underlay network host portion.
In the destination address, the highest order of + * these bits is one bit lower than the lowest order bit from the overlay + * network mask. * - * E.g., underlay host 10.88.3.4 with an overlay of 99 would host overlay - * subnet 99.3.4.0/24. An overlay network datagram from 99.3.4.5 to - * 99.6.7.8, would be directed to underlay host 10.88.6.7, which hosts - * overlay network 99.6.7.0/24. + * Using the sample value, 99.6.7.8, the overlay mask is /8, and the + * underlay mask is /16 (leaving 16 bits for the host portion). The bits + * to be shifted are the middle two octets, 0.6.7.0, as this is 99.6.7.8 + * ANDed with the mask 0x00ffff00 (which is 16 bits, the highest order of + * which is 1 bit lower than the lowest order overlay address bit). + * + * These octets, 0.6.7.0, are then right shifted 8 bits, yielding 0.0.6.7. + * This value is then ORed with the underlay network portion, + * 10.88.0.0/16, providing 10.88.6.7 as the final underlay destination for + * the encapsulated datagram. + * + * Another transform using the final example: overlay 100.64.0.0/10 and + * underlay 10.224.220.0/24. Consider overlay address 100.66.128.1 + * sending a datagram to 100.66.200.5. In this case, 8 bits (the host + * portion size of 10.224.220.0/24) beginning after the 100.64/10 overlay + * prefix are masked off, yielding 0.2.192.0. This is right shifted 14 + * (32 - 10 - (32 - 24), i.e., the number of bits between the overlay + * network portion and the underlay host portion) bits, yielding 0.0.0.11. + * This is ORed with the underlay network portion, 10.224.220.0/24, giving + * the underlay destination of 10.224.220.11 for overlay destination + * 100.66.200.5.
*/ static int ipip_build_fan_iphdr(struct ip_tunnel *tunnel, struct sk_buff *skb, struct iphdr *iph) { - unsigned int overlay; + struct ip_fan_map *f_map; u32 daddr, underlay; + f_map = ipip_fan_find_map(tunnel, ip_hdr(skb)->daddr); + if (!f_map) + return -ENOENT; + daddr = ntohl(ip_hdr(skb)->daddr); - overlay = daddr >> 24; - underlay = tunnel->fan.map[overlay]; + underlay = ntohl(f_map->underlay); if (!underlay) return -EINVAL; *iph = tunnel->parms.iph; - iph->daddr = htonl(underlay | ((daddr >> 8) & 0x0000ffff)); + iph->daddr = htonl(underlay | + ((daddr & ~f_map->overlay_mask) >> + (32 - f_map->overlay_prefix - + (32 - f_map->underlay_prefix)))); return 0; } @@ -336,7 +444,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; - if (ipip_tunnel_is_fan(tunnel)) { + if (fan_has_map(&tunnel->fan)) { if (ipip_build_fan_iphdr(tunnel, skb, &fiph)) goto tx_error; tiph = &fiph; @@ -407,6 +515,8 @@ static const struct net_device_ops ipip_netdev_ops = { static void ipip_tunnel_setup(struct net_device *dev) { + struct ip_tunnel *t = netdev_priv(dev); + dev->netdev_ops = &ipip_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; @@ -419,6 +529,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->features |= IPIP_FEATURES; dev->hw_features |= IPIP_FEATURES; ip_tunnel_setup(dev, ipip_net_id); + INIT_LIST_HEAD(&t->fan.fan_maps); } static int ipip_tunnel_init(struct net_device *dev) @@ -471,41 +582,65 @@ static void ipip_netlink_parms(struct nlattr *data[], *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static void ipip_fan_free_map(struct ip_tunnel *t) +static void ipip_fan_flush_map(struct ip_tunnel *t) { - memset(&t->fan.map, 0, sizeof(t->fan.map)); + struct ip_fan_map *fan_map; + + list_for_each_entry_rcu(fan_map, &t->fan.fan_maps, list) { + list_del_rcu(&fan_map->list); + kfree_rcu(fan_map, rcu); + } } -static int ipip_fan_set_map(struct ip_tunnel *t, struct ip_tunnel_fan_map *map) 
+static int ipip_fan_del_map(struct ip_tunnel *t, __be32 overlay) { - u32 overlay, overlay_mask, underlay, underlay_mask; + struct ip_fan_map *fan_map; - if ((map->underlay_prefix && map->underlay_prefix != 16) || - (map->overlay_prefix && map->overlay_prefix != 8)) - return -EINVAL; + fan_map = ipip_fan_find_map(t, overlay); + if (!fan_map) + return -ENOENT; - overlay = ntohl(map->overlay); - overlay_mask = ntohl(inet_make_mask(map->overlay_prefix)); + list_del_rcu(&fan_map->list); + kfree_rcu(fan_map, rcu); - underlay = ntohl(map->underlay); - underlay_mask = ntohl(inet_make_mask(map->underlay_prefix)); + return 0; +} - if ((overlay & ~overlay_mask) || (underlay & ~underlay_mask)) - return -EINVAL; +static int ipip_fan_add_map(struct ip_tunnel *t, struct ifla_fan_map *map) +{ + __be32 overlay_mask, underlay_mask; + struct ip_fan_map *fan_map; - if (!(overlay & overlay_mask) && (underlay & underlay_mask)) + overlay_mask = inet_make_mask(map->overlay_prefix); + underlay_mask = inet_make_mask(map->underlay_prefix); + + if ((map->overlay & ~overlay_mask) || (map->underlay & ~underlay_mask)) return -EINVAL; - t->parms.i_flags |= TUNNEL_FAN; + if (!(map->overlay & overlay_mask) && (map->underlay & underlay_mask)) + return -EINVAL; - /* Special case: overlay 0 and underlay 0 clears all mappings */ - if (!overlay && !underlay) { - ipip_fan_free_map(t); + /* Special case: overlay 0 and underlay 0: flush all mappings */ + if (!map->overlay && !map->underlay) { + ipip_fan_flush_map(t); return 0; } + + /* Special case: overlay set and underlay 0: clear map for overlay */ + if (!map->underlay) + return ipip_fan_del_map(t, map->overlay); + + if (ipip_fan_find_map(t, map->overlay)) + return -EEXIST; + + fan_map = kmalloc(sizeof(*fan_map), GFP_KERNEL); + fan_map->underlay = map->underlay; + fan_map->overlay = map->overlay; + fan_map->underlay_prefix = map->underlay_prefix; + fan_map->overlay_mask = ntohl(overlay_mask); + fan_map->overlay_prefix = map->overlay_prefix; - overlay 
>>= (32 - map->overlay_prefix); - t->fan.map[overlay] = underlay; + list_add_tail_rcu(&fan_map->list, &t->fan.fan_maps); return 0; } @@ -513,7 +648,7 @@ static int ipip_fan_set_map(struct ip_tunnel *t, struct ip_tunnel_fan_map *map) static int ipip_netlink_fan(struct nlattr *data[], struct ip_tunnel *t, struct ip_tunnel_parm *parms) { - struct ip_tunnel_fan_map *map; + struct ifla_fan_map *map; struct nlattr *attr; int rem, rv; @@ -525,7 +660,7 @@ static int ipip_netlink_fan(struct nlattr *data[], struct ip_tunnel *t, nla_for_each_nested(attr, data[IFLA_IPTUN_FAN_MAP], rem) { map = nla_data(attr); - rv = ipip_fan_set_map(t, map); + rv = ipip_fan_add_map(t, map); if (rv) return rv; } @@ -619,7 +754,7 @@ static size_t ipip_get_size(const struct net_device *dev) /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + /* IFLA_IPTUN_FAN_MAP */ - nla_total_size(sizeof(struct ip_tunnel_fan_map)) * 256 + + nla_total_size(sizeof(struct ifla_fan_map)) * 256 + 0; } @@ -652,25 +787,22 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) if (tunnel->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; - if (tunnel->parms.i_flags & TUNNEL_FAN) { + if (fan_has_map(&tunnel->fan)) { struct nlattr *fan_nest; - int i; + struct ip_fan_map *fan_map; fan_nest = nla_nest_start(skb, IFLA_IPTUN_FAN_MAP); if (!fan_nest) goto nla_put_failure; - for (i = 0; i < 256; i++) { - if (tunnel->fan.map[i]) { - struct ip_tunnel_fan_map map; - - map.underlay = htonl(tunnel->fan.map[i]); - map.underlay_prefix = 16; - map.overlay = htonl(i << 24); - map.overlay_prefix = 8; - if (nla_put(skb, IFLA_FAN_MAPPING, - sizeof(map), &map)) - goto nla_put_failure; - } + list_for_each_entry_rcu(fan_map, &tunnel->fan.fan_maps, list) { + struct ifla_fan_map map; + + map.underlay = fan_map->underlay; + map.underlay_prefix = fan_map->underlay_prefix; + map.overlay = fan_map->overlay; + map.overlay_prefix = fan_map->overlay_prefix; + if (nla_put(skb, IFLA_FAN_MAPPING, 
sizeof(map), &map)) + goto nla_put_failure; } nla_nest_end(skb, fan_nest); }

[SRU,N,2/4] UBUNTU: SAUCE: fan: add VXLAN implementation

Commit Message

Patch