Message ID | 1571288584-46449-1-git-send-email-xiangxia.m.yue@gmail.com |
---|---|
State | Awaiting Upstream |
Delegated to: | David Miller |
Headers | show |
Series | [net-next] netfilter: nf_conntrack: introduce conntrack limit per-zone | expand |
xiangxia.m.yue@gmail.com <xiangxia.m.yue@gmail.com> wrote: > nf_conntrack_max is used to limit the maximum number of > conntrack entries in the conntrack table for every network > namespace. For the containers that reside in the same namespace, > they share the same conntrack table, and the total # of conntrack > entries for all containers is limited by nf_conntrack_max. > In this case, if one of the containers abuses the usage of the > conntrack entries, it blocks the others from committing valid > conntrack entries into the conntrack table. > > To address the issue, this patch adds a conntrack counter for zones > and a max count wanted by each zone, so that no zone can consume > all conntrack entries in the conntrack table. > > This feature can be used for openvswitch or iptables. Your approach adds cost for everyone, plus a 256kbyte 'struct net' increase. openvswitch supports per zone limits already, using nf_conncount infrastructure. nftables supports it using ruleset (via 'ct count'). If you need support for iptables, consider extending xt_connlimit.c instead -- looking at the code it might already do all that is needed if userspace passes a 0-length mask for the ip address, i.e. iptables -t mangle -A PREROUTING -m conntrack --ctstate NEW -m connlimit \ --connlimit-above 1000 --connlimit-mask 0 -j REJECT
On Wed, Oct 23, 2019 at 6:31 PM Florian Westphal <fw@strlen.de> wrote: > > xiangxia.m.yue@gmail.com <xiangxia.m.yue@gmail.com> wrote: > > nf_conntrack_max is used to limit the maximum number of > > conntrack entries in the conntrack table for every network > > namespace. For the containers that reside in the same namespace, > > they share the same conntrack table, and the total # of conntrack > > entries for all containers is limited by nf_conntrack_max. > > In this case, if one of the containers abuses the usage of the > > conntrack entries, it blocks the others from committing valid > > conntrack entries into the conntrack table. > > > > To address the issue, this patch adds a conntrack counter for zones > > and a max count wanted by each zone, so that no zone can consume > > all conntrack entries in the conntrack table. > > > > This feature can be used for openvswitch or iptables. > > Your approach adds cost for everyone, plus a 256kbyte 'struct net' > increase. > > openvswitch supports per zone limits already, using nf_conncount > infrastructure. This patch limits the UNREPLIED conntrack entries. If we SYN flood one zone, the zone will consume all entries in the table, which are in state SYN_SENT. The openvswitch limits only the +est conntrack. > nftables supports it using ruleset (via 'ct count'). > > If you need support for iptables, consider extending xt_connlimit.c > instead -- looking at the code it might already do all that is needed > if userspace passes a 0-length mask for the ip address, i.e. > > iptables -t mangle -A PREROUTING -m conntrack --ctstate NEW -m connlimit \ > --connlimit-above 1000 --connlimit-mask 0 -j REJECT
Tonghao Zhang <xiangxia.m.yue@gmail.com> wrote: > > openvswitch supports per zone limits already, using nf_conncount > > infrastructure. > This patch limits the UNREPLIED conntrack entries. If we SYN flood one > zone, the zone will consume all entries in the table, which are in state > SYN_SENT. > The openvswitch limits only the +est conntrack. Why? Can't it be fixed to work properly? > > iptables -t mangle -A PREROUTING -m conntrack --ctstate NEW -m connlimit \ > > --connlimit-above 1000 --connlimit-mask 0 -j REJECT This should work for the synflood case, too.
diff --git a/include/linux/netfilter/nf_conntrack_zones_common.h b/include/linux/netfilter/nf_conntrack_zones_common.h index 8f3905e1..0d50880 100644 --- a/include/linux/netfilter/nf_conntrack_zones_common.h +++ b/include/linux/netfilter/nf_conntrack_zones_common.h @@ -12,11 +12,13 @@ #define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) #define NF_CT_FLAG_MARK 1 +#define NF_CT_ZONE_CONN_MAX 65535 struct nf_conntrack_zone { u16 id; u8 flags; u8 dir; + unsigned int max_wanted; }; extern const struct nf_conntrack_zone nf_ct_zone_dflt; diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 48dbadb..f072374 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -5,6 +5,42 @@ #include <linux/netfilter/nf_conntrack_zones_common.h> #include <net/netfilter/nf_conntrack.h> +static inline void nf_ct_zone_count_init(struct net *net) +{ +#ifdef CONFIG_NF_CONNTRACK_ZONES + int i; + for (i = 0; i < NF_CT_ZONE_CONN_MAX; i ++) + atomic_set(&net->ct.zone_conn_max[i], 0); +#endif +} + +static inline void nf_ct_zone_count_inc(struct net *net, + const struct nf_conntrack_zone *zone) +{ +#ifdef CONFIG_NF_CONNTRACK_ZONES + atomic_inc(&net->ct.zone_conn_max[zone->id]); +#endif +} + +static inline void nf_ct_zone_count_dec(struct net *net, + const struct nf_conntrack_zone *zone) +{ +#ifdef CONFIG_NF_CONNTRACK_ZONES + atomic_dec(&net->ct.zone_conn_max[zone->id]); +#endif +} + +static inline unsigned int +nf_ct_zone_count_read(struct net *net, + const struct nf_conntrack_zone *zone) +{ +#ifdef CONFIG_NF_CONNTRACK_ZONES + return atomic_read(&net->ct.zone_conn_max[zone->id]); +#else + return 0; +#endif +} + static inline const struct nf_conntrack_zone * nf_ct_zone(const struct nf_conn *ct) { diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 806454e..da50d1e 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ 
-6,6 +6,7 @@ #include <linux/list_nulls.h> #include <linux/atomic.h> #include <linux/workqueue.h> +#include <linux/netfilter/nf_conntrack_zones_common.h> #include <linux/netfilter/nf_conntrack_tcp.h> #ifdef CONFIG_NF_CT_PROTO_DCCP #include <linux/netfilter/nf_conntrack_dccp.h> @@ -118,5 +119,9 @@ struct netns_ct { #if defined(CONFIG_NF_CONNTRACK_LABELS) unsigned int labels_used; #endif + +#ifdef CONFIG_NF_CONNTRACK_ZONES + atomic_t zone_conn_max[NF_CT_ZONE_CONN_MAX]; +#endif }; #endif diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0c63120..a2f7c27d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1352,14 +1352,20 @@ static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) /* We don't want any race condition at early drop stage */ atomic_inc(&net->ct.count); + nf_ct_zone_count_inc(net, zone); + + if ((nf_conntrack_max && + unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) || + (zone->max_wanted && + unlikely(nf_ct_zone_count_read(net, zone) > zone->max_wanted))) { - if (nf_conntrack_max && - unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { if (!early_drop(net, hash)) { if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; + atomic_dec(&net->ct.count); - net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); + nf_ct_zone_count_dec(net, zone); + net_warn_ratelimited("nf_conntrack: table or zone full, dropping packet\n"); return ERR_PTR(-ENOMEM); } } @@ -1394,6 +1400,7 @@ static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) return ct; out: atomic_dec(&net->ct.count); + nf_ct_zone_count_dec(net, zone); return ERR_PTR(-ENOMEM); } @@ -1421,6 +1428,7 @@ void nf_conntrack_free(struct nf_conn *ct) kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); + nf_ct_zone_count_dec(net, nf_ct_zone(ct)); } EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -2510,6 +2518,7 @@ int 
nf_conntrack_init_net(struct net *net) BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); atomic_set(&net->ct.count, 0); + nf_ct_zone_count_init(net); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); if (!net->ct.pcpu_lists)