Message ID | 20181107134859.19896-2-christian@brauner.io |
---|---|
State | Awaiting Upstream, archived |
Delegated to: | David Miller |
Headers | show |
Series | br_netfilter: enable in non-initial netns | expand |
Hi, On Wed, Nov 07, 2018 at 02:48:58PM +0100, Christian Brauner wrote: [...] > diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h > index ca043342c0eb..eedbd1ac940e 100644 > --- a/include/net/netns/netfilter.h > +++ b/include/net/netns/netfilter.h > @@ -35,4 +35,20 @@ struct netns_nf { > bool defrag_ipv6; > #endif > }; > + > +struct netns_brnf { > +#ifdef CONFIG_SYSCTL > + struct ctl_table_header *ctl_hdr; > +#endif > + > + /* default value is 1 */ > + int call_iptables; > + int call_ip6tables; > + int call_arptables; > + > + /* default value is 0 */ > + int filter_vlan_tagged; > + int filter_pppoe_tagged; > + int pass_vlan_indev; > +}; I have spun on this several times, wondering if there's a way to avoid scratching this many bytes per netns to expose these sysctl entries that are plain on/off toggles... You said this: >Currently, the /proc/sys/net/bridge folder is only created in the >initial network namespace I think we can add one single sysctl to expose these as flags from net namespaces. The idea is to keep the existing (legacy) sysctl entries for init_net only, and add a single new one that exposes these as flags (it should also be available in init_net for consistency, I'd suggest). Flags could be mapped in this way, e.g. 0x1 call_iptables 0x2 call_ip6tables 0x4 call_arptables 0x8 filter_vlan_tagged ... Also documentation would be good to have for this. Would this idea fly for you? Thanks.
On Tue, Nov 27, 2018 at 01:20:47AM +0100, Pablo Neira Ayuso wrote: > Hi, > > On Wed, Nov 07, 2018 at 02:48:58PM +0100, Christian Brauner wrote: > [...] > > diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h > > index ca043342c0eb..eedbd1ac940e 100644 > > --- a/include/net/netns/netfilter.h > > +++ b/include/net/netns/netfilter.h > > @@ -35,4 +35,20 @@ struct netns_nf { > > bool defrag_ipv6; > > #endif > > }; > > + > > +struct netns_brnf { > > +#ifdef CONFIG_SYSCTL > > + struct ctl_table_header *ctl_hdr; > > +#endif > > + > > + /* default value is 1 */ > > + int call_iptables; > > + int call_ip6tables; > > + int call_arptables; > > + > > + /* default value is 0 */ > > + int filter_vlan_tagged; > > + int filter_pppoe_tagged; > > + int pass_vlan_indev; > > +}; > > I have spun on this several times, wondering if there's a way to avoid > scratching these many bytes per netns to expose these sysctl entries > that are plain on/off toggles... You said this: > > >Currently, the /proc/sys/net/bridge folder is only created in the > >initial network namespace > > I think we can add one single sysctl to expose these as flags from net > namespaces. Idea is to keep the existing (legacy) sysctl entries for > init_net only, and add a new single new one that exposes these as flags > (should be also available for consistency in init_net I'd suggest). > Flags could be map in this way, eg. > > 0x1 call_iptables > 0x2 call_ip6tables > 0x4 call_arptables > 0x8 filter_vlan_tagged > ... > > Also documentation would be good to have for this. > > Would this idea fly for you? Thanks. 
My suggestion is to keep these files per network namespace but have a single flags field in struct netns_brnf: +struct netns_brnf { +#ifdef CONFIG_SYSCTL + struct ctl_table_header *ctl_hdr; +#endif + + /* default value is 1 */ + unsigned int filter_flags; +}; #define BRNF_CALL_IPTABLES 0x1 #define BRNF_CALL_IP6TABLES 0x2 #define BRNF_CALL_ARPTABLES 0x4 #define BRNF_CALL_VLAN_TAGGED 0x8 A write to the corresponding file would then cause the matching flag to be set or unset in filter_flags. This way we are a) space-efficient internally, not bloating struct net, while b) not breaking running tools in non-initial network namespaces that expect the files to be there. b) is really the important bit here. :) Christian
On Tue, Nov 27, 2018 at 03:20:45AM +0100, Christian Brauner wrote: > On Tue, Nov 27, 2018 at 01:20:47AM +0100, Pablo Neira Ayuso wrote: > > Hi, > > > > On Wed, Nov 07, 2018 at 02:48:58PM +0100, Christian Brauner wrote: > > [...] > > > diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h > > > index ca043342c0eb..eedbd1ac940e 100644 > > > --- a/include/net/netns/netfilter.h > > > +++ b/include/net/netns/netfilter.h > > > @@ -35,4 +35,20 @@ struct netns_nf { > > > bool defrag_ipv6; > > > #endif > > > }; > > > + > > > +struct netns_brnf { > > > +#ifdef CONFIG_SYSCTL > > > + struct ctl_table_header *ctl_hdr; > > > +#endif > > > + > > > + /* default value is 1 */ > > > + int call_iptables; > > > + int call_ip6tables; > > > + int call_arptables; > > > + > > > + /* default value is 0 */ > > > + int filter_vlan_tagged; > > > + int filter_pppoe_tagged; > > > + int pass_vlan_indev; > > > +}; > > > > I have spun on this several times, wondering if there's a way to avoid > > scratching these many bytes per netns to expose these sysctl entries > > that are plain on/off toggles... You said this: > > > > >Currently, the /proc/sys/net/bridge folder is only created in the > > >initial network namespace > > > > I think we can add one single sysctl to expose these as flags from net > > namespaces. Idea is to keep the existing (legacy) sysctl entries for > > init_net only, and add a new single new one that exposes these as flags > > (should be also available for consistency in init_net I'd suggest). > > Flags could be map in this way, eg. > > > > 0x1 call_iptables > > 0x2 call_ip6tables > > 0x4 call_arptables > > 0x8 filter_vlan_tagged > > ... > > > > Also documentation would be good to have for this. > > > > Would this idea fly for you? Thanks. 
> > My suggestion is to keep these files per network namespace but have a > single flag argument in struct netns_brnf: > +struct netns_brnf { > +#ifdef CONFIG_SYSCTL > + struct ctl_table_header *ctl_hdr; > +#endif > + > + /* default value is 1 */ > + unsigned int filter_flags; > +}; > > #define BRNF_CALL_IPTABLES 0x1 > #define BRNF_CALL_IP6TABLES 0x2 > #define BRNF_CALL_ARPTABLES 0x4 > #define BRNF_CALL_VLAN_TAGGED 0x8 > > a write to the corresponding file would then cause the flag to be set or > unset in filter_flags. > This way we are a) space-efficient internally not bloating struct net > while b) not breaking running tools in non-initial network namespaces > that expect the files to be there. b) is really the important bit here. :) OK, please, go explore this space-efficient approach. Thanks.
On Tue, Nov 27, 2018 at 09:23:49AM +0100, Pablo Neira Ayuso wrote: > On Tue, Nov 27, 2018 at 03:20:45AM +0100, Christian Brauner wrote: > > On Tue, Nov 27, 2018 at 01:20:47AM +0100, Pablo Neira Ayuso wrote: > > > Hi, > > > > > > On Wed, Nov 07, 2018 at 02:48:58PM +0100, Christian Brauner wrote: > > > [...] > > > > diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h > > > > index ca043342c0eb..eedbd1ac940e 100644 > > > > --- a/include/net/netns/netfilter.h > > > > +++ b/include/net/netns/netfilter.h > > > > @@ -35,4 +35,20 @@ struct netns_nf { > > > > bool defrag_ipv6; > > > > #endif > > > > }; > > > > + > > > > +struct netns_brnf { > > > > +#ifdef CONFIG_SYSCTL > > > > + struct ctl_table_header *ctl_hdr; > > > > +#endif > > > > + > > > > + /* default value is 1 */ > > > > + int call_iptables; > > > > + int call_ip6tables; > > > > + int call_arptables; > > > > + > > > > + /* default value is 0 */ > > > > + int filter_vlan_tagged; > > > > + int filter_pppoe_tagged; > > > > + int pass_vlan_indev; > > > > +}; > > > > > > I have spun on this several times, wondering if there's a way to avoid > > > scratching these many bytes per netns to expose these sysctl entries > > > that are plain on/off toggles... You said this: > > > > > > >Currently, the /proc/sys/net/bridge folder is only created in the > > > >initial network namespace > > > > > > I think we can add one single sysctl to expose these as flags from net > > > namespaces. Idea is to keep the existing (legacy) sysctl entries for > > > init_net only, and add a new single new one that exposes these as flags > > > (should be also available for consistency in init_net I'd suggest). > > > Flags could be map in this way, eg. > > > > > > 0x1 call_iptables > > > 0x2 call_ip6tables > > > 0x4 call_arptables > > > 0x8 filter_vlan_tagged > > > ... > > > > > > Also documentation would be good to have for this. > > > > > > Would this idea fly for you? Thanks. 
> > > > My suggestion is to keep these files per network namespace but have a > > single flag argument in struct netns_brnf: > > +struct netns_brnf { > > +#ifdef CONFIG_SYSCTL > > + struct ctl_table_header *ctl_hdr; > > +#endif > > + > > + /* default value is 1 */ > > + unsigned int filter_flags; > > +}; > > > > #define BRNF_CALL_IPTABLES 0x1 > > #define BRNF_CALL_IP6TABLES 0x2 > > #define BRNF_CALL_ARPTABLES 0x4 > > #define BRNF_CALL_VLAN_TAGGED 0x8 > > > > a write to the corresponding file would then cause the flag to be set or > > unset in filter_flags. > > This way we are a) space-efficient internally not bloating struct net > > while b) not breaking running tools in non-initial network namespaces > > that expect the files to be there. b) is really the important bit here. :) > > OK, please, go explore this space-efficient approach. Thanks. Will do. I'll try to get to it in the next couple of days and send out a new version. Thanks!
On Tue, Nov 27, 2018 at 09:23:49AM +0100, Pablo Neira Ayuso wrote: > On Tue, Nov 27, 2018 at 03:20:45AM +0100, Christian Brauner wrote: > > On Tue, Nov 27, 2018 at 01:20:47AM +0100, Pablo Neira Ayuso wrote: > > > Hi, > > > > > > On Wed, Nov 07, 2018 at 02:48:58PM +0100, Christian Brauner wrote: > > > [...] > > > > diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h > > > > index ca043342c0eb..eedbd1ac940e 100644 > > > > --- a/include/net/netns/netfilter.h > > > > +++ b/include/net/netns/netfilter.h > > > > @@ -35,4 +35,20 @@ struct netns_nf { > > > > bool defrag_ipv6; > > > > #endif > > > > }; > > > > + > > > > +struct netns_brnf { > > > > +#ifdef CONFIG_SYSCTL > > > > + struct ctl_table_header *ctl_hdr; > > > > +#endif > > > > + > > > > + /* default value is 1 */ > > > > + int call_iptables; > > > > + int call_ip6tables; > > > > + int call_arptables; > > > > + > > > > + /* default value is 0 */ > > > > + int filter_vlan_tagged; > > > > + int filter_pppoe_tagged; > > > > + int pass_vlan_indev; > > > > +}; > > > > > > I have spun on this several times, wondering if there's a way to avoid > > > scratching these many bytes per netns to expose these sysctl entries > > > that are plain on/off toggles... You said this: > > > > > > >Currently, the /proc/sys/net/bridge folder is only created in the > > > >initial network namespace > > > > > > I think we can add one single sysctl to expose these as flags from net > > > namespaces. Idea is to keep the existing (legacy) sysctl entries for > > > init_net only, and add a new single new one that exposes these as flags > > > (should be also available for consistency in init_net I'd suggest). > > > Flags could be map in this way, eg. > > > > > > 0x1 call_iptables > > > 0x2 call_ip6tables > > > 0x4 call_arptables > > > 0x8 filter_vlan_tagged > > > ... > > > > > > Also documentation would be good to have for this. > > > > > > Would this idea fly for you? Thanks. 
> > > > My suggestion is to keep these files per network namespace but have a > > single flag argument in struct netns_brnf: > > +struct netns_brnf { > > +#ifdef CONFIG_SYSCTL > > + struct ctl_table_header *ctl_hdr; > > +#endif > > + > > + /* default value is 1 */ > > + unsigned int filter_flags; > > +}; > > > > #define BRNF_CALL_IPTABLES 0x1 > > #define BRNF_CALL_IP6TABLES 0x2 > > #define BRNF_CALL_ARPTABLES 0x4 > > #define BRNF_CALL_VLAN_TAGGED 0x8 > > > > a write to the corresponding file would then cause the flag to be set or > > unset in filter_flags. > > This way we are a) space-efficient internally not bloating struct net > > while b) not breaking running tools in non-initial network namespaces > > that expect the files to be there. b) is really the important bit here. :) > > OK, please, go explore this space-efficient approach. Thanks. Sorry for the wait. Other patches came up. :) So, I looked into this approach and it is annoying to do: - the sysctl proc parsing infrastructure is not equipped to deal with flags at all, and extending it to do so would be a lot of code; - we would need either an atomic type or locking for filter_flags in the netns_brnf struct if multiple proc sysctl handlers try to raise or lower bits in filter_flags via different files at the same time. So I feel that this is not a feasible solution. We could make netns_brnf a pointer in struct net and allocate it on new network namespace creation if we care about space, but then we take the performance hit of k*alloc(). What I stressed before: for userspace it's important that we don't change the semantics of how br netfilter is configured in a non-initial network namespace, so as not to break existing tools in such environments. Christian
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 99d4148e0f90..bea0474cd3ea 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -125,6 +125,9 @@ struct net { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct netns_ct ct; #endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + struct netns_brnf brnf; +#endif #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE) struct netns_nftables nft; #endif diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index ca043342c0eb..eedbd1ac940e 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -35,4 +35,20 @@ struct netns_nf { bool defrag_ipv6; #endif }; + +struct netns_brnf { +#ifdef CONFIG_SYSCTL + struct ctl_table_header *ctl_hdr; +#endif + + /* default value is 1 */ + int call_iptables; + int call_ip6tables; + int call_arptables; + + /* default value is 0 */ + int filter_vlan_tagged; + int filter_pppoe_tagged; + int pass_vlan_indev; +}; #endif diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index b1b5e8516724..656a084f4825 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -53,23 +53,6 @@ struct brnf_net { bool enabled; }; -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *brnf_sysctl_header; -static int brnf_call_iptables __read_mostly = 1; -static int brnf_call_ip6tables __read_mostly = 1; -static int brnf_call_arptables __read_mostly = 1; -static int brnf_filter_vlan_tagged __read_mostly; -static int brnf_filter_pppoe_tagged __read_mostly; -static int brnf_pass_vlan_indev __read_mostly; -#else -#define brnf_call_iptables 1 -#define brnf_call_ip6tables 1 -#define brnf_call_arptables 1 -#define brnf_filter_vlan_tagged 0 -#define brnf_filter_pppoe_tagged 0 -#define brnf_pass_vlan_indev 0 -#endif - #define IS_IP(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) @@ -91,15 +74,15 @@ static inline 
__be16 vlan_proto(const struct sk_buff *skb) #define IS_VLAN_IP(skb) \ (vlan_proto(skb) == htons(ETH_P_IP) && \ - brnf_filter_vlan_tagged) + init_net.brnf.filter_vlan_tagged) #define IS_VLAN_IPV6(skb) \ (vlan_proto(skb) == htons(ETH_P_IPV6) && \ - brnf_filter_vlan_tagged) + init_net.brnf.filter_vlan_tagged) #define IS_VLAN_ARP(skb) \ (vlan_proto(skb) == htons(ETH_P_ARP) && \ - brnf_filter_vlan_tagged) + init_net.brnf.filter_vlan_tagged) static inline __be16 pppoe_proto(const struct sk_buff *skb) { @@ -110,12 +93,12 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) #define IS_PPPOE_IP(skb) \ (skb->protocol == htons(ETH_P_PPP_SES) && \ pppoe_proto(skb) == htons(PPP_IP) && \ - brnf_filter_pppoe_tagged) + init_net.brnf.filter_pppoe_tagged) #define IS_PPPOE_IPV6(skb) \ (skb->protocol == htons(ETH_P_PPP_SES) && \ pppoe_proto(skb) == htons(PPP_IPV6) && \ - brnf_filter_pppoe_tagged) + init_net.brnf.filter_pppoe_tagged) /* largest possible L2 header, see br_nf_dev_queue_xmit() */ #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) @@ -430,7 +413,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct struct net_device *vlan, *br; br = bridge_parent(dev); - if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) + if (init_net.brnf.pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) return br; vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, @@ -487,7 +470,7 @@ static unsigned int br_nf_pre_routing(void *priv, br = p->br; if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { - if (!brnf_call_ip6tables && + if (!init_net.brnf.call_ip6tables && !br_opt_get(br, BROPT_NF_CALL_IP6TABLES)) return NF_ACCEPT; @@ -495,7 +478,8 @@ static unsigned int br_nf_pre_routing(void *priv, return br_nf_pre_routing_ipv6(priv, skb, state); } - if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) + if (!init_net.brnf.call_iptables && + !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) return NF_ACCEPT; if 
(!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) @@ -637,7 +621,8 @@ static unsigned int br_nf_forward_arp(void *priv, return NF_ACCEPT; br = p->br; - if (!brnf_call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES)) + if (!init_net.brnf.call_arptables && + !br_opt_get(br, BROPT_NF_CALL_ARPTABLES)) return NF_ACCEPT; if (!IS_ARP(skb)) { @@ -1032,42 +1017,42 @@ int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, static struct ctl_table brnf_table[] = { { .procname = "bridge-nf-call-arptables", - .data = &brnf_call_arptables, + .data = &init_net.brnf.call_arptables, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-iptables", - .data = &brnf_call_iptables, + .data = &init_net.brnf.call_iptables, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-call-ip6tables", - .data = &brnf_call_ip6tables, + .data = &init_net.brnf.call_ip6tables, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-vlan-tagged", - .data = &brnf_filter_vlan_tagged, + .data = &init_net.brnf.filter_vlan_tagged, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-filter-pppoe-tagged", - .data = &brnf_filter_pppoe_tagged, + .data = &init_net.brnf.filter_pppoe_tagged, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, }, { .procname = "bridge-nf-pass-vlan-input-dev", - .data = &brnf_pass_vlan_indev, + .data = &init_net.brnf.pass_vlan_indev, .maxlen = sizeof(int), .mode = 0644, .proc_handler = brnf_sysctl_call_tables, @@ -1076,6 +1061,16 @@ static struct ctl_table brnf_table[] = { }; #endif +static inline void br_netfilter_sysctl_default(struct netns_brnf *brnf) +{ + brnf->call_iptables = 1; + brnf->call_ip6tables = 1; + brnf->call_arptables = 1; + brnf->filter_vlan_tagged = 0; + brnf->filter_pppoe_tagged = 0; + 
brnf->pass_vlan_indev = 0; +} + static int __init br_netfilter_init(void) { int ret; @@ -1090,9 +1085,12 @@ static int __init br_netfilter_init(void) return ret; } + /* Always set default values. Even if CONFIG_SYSCTL is not set. */ + br_netfilter_sysctl_default(&init_net.brnf); + #ifdef CONFIG_SYSCTL - brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table); - if (brnf_sysctl_header == NULL) { + init_net.brnf.ctl_hdr = register_net_sysctl(&init_net, "net/bridge", brnf_table); + if (!init_net.brnf.ctl_hdr) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); unregister_netdevice_notifier(&brnf_notifier); @@ -1111,7 +1109,7 @@ static void __exit br_netfilter_fini(void) unregister_netdevice_notifier(&brnf_notifier); unregister_pernet_subsys(&brnf_net_ops); #ifdef CONFIG_SYSCTL - unregister_net_sysctl_table(brnf_sysctl_header); + unregister_net_sysctl_table(init_net.brnf.ctl_hdr); #endif }