Message ID: 1268655610-7845-1-git-send-email-timo.teras@iki.fi
State: Changes Requested, archived
Delegated to: David Miller
On Mon, Mar 15, 2010 at 02:20:10PM +0200, Timo Teras wrote:
> Instead of doing O(n) xfrm_find_bundle() call per-packet, cache
> the previous lookup results in flow cache. The flow cache is
> updated to be per-netns and more generic.

This only works well if the traffic doesn't switch bundles much. But if that were the case then the number of bundles is likely to be small anyway.

IOW I think if we're doing this then we should go the whole distance and directly cache bundles instead of policies in the flow cache.

Cheers,
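The trade-off being debated can be modeled in a few lines of user-space C: a per-policy linked list of bundles walked by xfrm_find_bundle() is O(n), while a flow cache in front of it answers repeat lookups in O(1). This is a minimal sketch, not the kernel code; the structure names, the direct-mapped cache, and the slot count are all illustrative assumptions.

```c
#include <assert.h>
#include <stddef.h>

/* Toy model of the discussion: xfrm_find_bundle() walks a per-policy
 * linked list (O(n)); a flow cache indexes the last result by flow key
 * so repeat lookups hit in O(1).  All names/sizes are illustrative. */

struct bundle {
    unsigned int flow_key;      /* stands in for the full flowi match */
    struct bundle *next;        /* policy->bundles list linkage */
};

/* O(n): linear walk of the per-policy bundle list. */
static struct bundle *find_bundle(struct bundle *head, unsigned int key)
{
    for (struct bundle *b = head; b; b = b->next)
        if (b->flow_key == key)
            return b;
    return NULL;
}

/* O(1) on a hit: a tiny direct-mapped flow cache in front of the list. */
#define CACHE_SLOTS 64
static struct bundle *flow_cache[CACHE_SLOTS];

static struct bundle *cached_lookup(struct bundle *head, unsigned int key)
{
    unsigned int slot = key % CACHE_SLOTS;
    struct bundle *b = flow_cache[slot];

    if (b && b->flow_key == key)
        return b;               /* cache hit: no list walk at all */
    b = find_bundle(head, key); /* miss: fall back to the O(n) walk */
    flow_cache[slot] = b;
    return b;
}
```

Herbert's point maps onto this sketch directly: if consecutive packets use different keys, every lookup misses the cache and falls back to the linear walk.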
Herbert Xu wrote:
> On Mon, Mar 15, 2010 at 02:20:10PM +0200, Timo Teras wrote:
>> Instead of doing O(n) xfrm_find_bundle() call per-packet, cache
>> the previous lookup results in flow cache. The flow cache is
>> updated to be per-netns and more generic.
>
> This only works well if the traffic doesn't switch bundles much.
> But if that were the case then the number of bundles is likely
> to be small anyway.

The problem is if I have multipoint gre1 and a policy that says "encrypt all gre in transport mode". Thus for each public address, I get one bundle. But xfrm_lookup() is called for each packet because ipgre_tunnel_xmit() calls ip_route_output_key() on a per-packet basis.

For my use case it makes a huge difference.

> IOW I think if we're doing this then we should go the whole
> distance and directly cache bundles instead of policies in the
> flow cache.

Then we cannot maintain the policy use time. But if that's not a requirement, we could drop the policy from the cache.

Also, with this and your recent flowi patch, I'm seeing PMTU issues. It seems that xfrm_bundle_ok uses the original dst which resulted in the creation of the bundle. Somehow that dst does not get updated with the PMTU... but the new dst used in the next xfrm_lookup for the same target does have the proper MTU. I'm debugging right now why this is happening. Any ideas?

- Timo

--
To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Mar 17, 2010 at 04:16:21PM +0200, Timo Teräs wrote:
>
> The problem is if I have multipoint gre1 and policy that says
> "encrypt all gre in transport mode".
>
> Thus for each public address, I get one bundle. But the
> xfrm_lookup() is called for each packet because ipgre_tunnel_xmit()
> calls ip_route_output_key() on per-packet basis.
>
> For my use-case it makes a huge difference.

But if your traffic switches between those tunnels on each packet, we're back to square one, right?

> Then we cannot maintain policy use time. But if it's not a
> requirement, we could drop the policy from cache.

I don't see why we can't maintain the policy use time if we did this; all you need is a back-pointer from the top xfrm_dst.

> Also. With this and your recent flowi patch, I'm seeing pmtu
> issues. Seems like xfrm_bundle_ok uses the original dst which
> resulted in the creation of the bundle. Somehow that dst
> does not get updated with pmtu... but the new dst used in
> next xfrm_lookup for same target does have proper mtu.
> I'm debugging right now why this is happening. Any ideas?

The dynamic MTU is always maintained in a normal dst object in the IPv4 routing cache. Each xfrm_dst points to such a dst through xdst->route.

If you were looking at the xfrm_dst's own MTU then that may well cause problems.

Cheers,
Herbert Xu wrote:
> On Wed, Mar 17, 2010 at 04:16:21PM +0200, Timo Teräs wrote:
>> The problem is if I have multipoint gre1 and policy that says
>> "encrypt all gre in transport mode".
>>
>> Thus for each public address, I get one bundle. But the
>> xfrm_lookup() is called for each packet because ipgre_tunnel_xmit()
>> calls ip_route_output_key() on per-packet basis.
>>
>> For my use-case it makes a huge difference.
>
> But if your traffic switches between those tunnels on each packet,
> we're back to square one, right?

Not to my understanding. Why would it change?

>> Then we cannot maintain policy use time. But if it's not a
>> requirement, we could drop the policy from cache.
>
> I don't see why we can't maintain the policy use time if we did
> this, all you need is a back-pointer from the top xfrm_dst.

Sure.

>> Also. With this and your recent flowi patch, I'm seeing pmtu
>> issues. Seems like xfrm_bundle_ok uses the original dst which
>> resulted in the creation of the bundle. Somehow that dst
>> does not get updated with pmtu... but the new dst used in
>> next xfrm_lookup for same target does have proper mtu.
>> I'm debugging right now why this is happening. Any ideas?
>
> The dynamic MTU is always maintained in a normal dst object in
> the IPv4 routing cache. Each xfrm_dst points to such a dst
> through xdst->route.
>
> If you were looking at the xfrm_dst's own MTU then that may well
> cause problems.

I figured out the root cause. The original dst gets expired when its rt_genid goes old. But the xfrm_dst does not notice that, so it won't create a new bundle. xfrm_bundle_ok calls dst_check, but dst->obsolete is 0, and ipv4_dst_check is a no-op anyway.

Somehow the rtable object should be able to tell xfrm that the dst is no longer good. Any ideas?

- Timo
Timo Teräs wrote:
>>> Also. With this and your recent flowi patch, I'm seeing pmtu
>>> issues. Seems like xfrm_bundle_ok uses the original dst which
>>> resulted in the creation of the bundle. Somehow that dst
>>> does not get updated with pmtu... but the new dst used in
>>> next xfrm_lookup for same target does have proper mtu.
>>> I'm debugging right now why this is happening. Any ideas?
>>
>> The dynamic MTU is always maintained in a normal dst object in
>> the IPv4 routing cache. Each xfrm_dst points to such a dst
>> through xdst->route.
>>
>> If you were looking at the xfrm_dst's own MTU then that may well
>> cause problems.
>
> I figured out the root cause. The original dst gets expired when
> its rt_genid goes old. But the xfrm_dst does not notice that, so
> it won't create a new bundle. xfrm_bundle_ok calls dst_check,
> but dst->obsolete is 0, and ipv4_dst_check is a no-op anyway.
>
> Somehow the rtable object should be able to tell xfrm that the
> dst is no longer good. Any ideas?

I checked IPv6; it does it like xfrm: it sets obsolete to -1 and checks the genid in its dst_check. We need to do the same for IPv4.

I just wrote a hack and tested it. It solves the PMTU issues. I will post a proper patch soon.

- Timo
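The genid scheme described above can be sketched in user space: each cached route snapshots a global generation counter at creation, and a dst_check-style helper returns NULL once the counter has moved on, forcing the caller to rebuild the bundle. This only models the shape of the IPv6-style fix (obsolete set to -1 plus a genid compare); the field names and helper are illustrative, not the actual kernel patch.

```c
#include <assert.h>
#include <stddef.h>

/* User-space sketch of genid-based invalidation: a cached entry
 * records the generation it was created in, and validation fails
 * once the global generation is bumped (e.g. by a route flush). */

static int rt_genid;            /* bumped whenever routes must expire */

struct dst {
    int obsolete;               /* -1: always revalidate via dst_check */
    int genid;                  /* generation this entry was created in */
};

static struct dst make_dst(void)
{
    struct dst d = { -1, rt_genid };
    return d;
}

/* Models an ipv4_dst_check that actually checks something:
 * the entry is valid only while its generation matches. */
static struct dst *dst_check(struct dst *d)
{
    if (d->genid != rt_genid)
        return NULL;            /* stale: caller must rebuild the bundle */
    return d;
}
```

With obsolete left at 0 and a no-op check, as described above, the stale entry would keep validating forever; with the genid compare, one bump of the counter invalidates it.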
Timo Teräs wrote:
> Herbert Xu wrote:
>> On Wed, Mar 17, 2010 at 04:16:21PM +0200, Timo Teräs wrote:
>>> The problem is if I have multipoint gre1 and policy that says
>>> "encrypt all gre in transport mode".
>>>
>>> Thus for each public address, I get one bundle. But the
>>> xfrm_lookup() is called for each packet because ipgre_tunnel_xmit()
>>> calls ip_route_output_key() on per-packet basis.
>>>
>>> For my use-case it makes a huge difference.
>>
>> But if your traffic switches between those tunnels on each packet,
>> we're back to square one, right?
>
> Not to my understanding. Why would it change?

Here's how things go, to my understanding.

When we are forwarding packets, each packet goes through __xfrm_route_forward(). Or, if we are sending from ip_gre (like in my case), the packets go through ip_route_output_key(). Both of these call xfrm_lookup() to get the xfrm_dst instead of the real rtable dst. This is done on a per-packet basis. Basically, the xfrm_dst is never kept referenced directly on either code path; instead it needs to be xfrm'ed per packet.

Now, these created xfrm_dst's get cached in policy->bundles with a ref count of zero and are not deleted until garbage collection is required. That's how xfrm_find_bundle tries to reuse them (but it does an O(n) lookup).

In the gre+esp case (and this should apply to any forward path too) caching the bundle speeds up the output path considerably, as there can be a lot of xfrm_dst's. Especially if it's a wildcard transport-mode policy, which basically gets a bundle for each destination. So there can be a lot of xfrm_dst's, all valid, since they refer to a unique xfrm_state on a per-destination basis. Even more so, since an xfrm_dst needs to be regenerated when the underlying IPv4 rtable entry has expired, there can be a lot of bundles. So the linear search is a major performance killer.

Btw, it looks like the xfrm_dst garbage collection is broken. It's only garbage collected if a network device goes down, or if dst_alloc calls it after the gc threshold is exceeded. Since the gc threshold became dynamic not long ago, it can be very big. This causes stale xfrm_dst's to be kept alive, and, what is worse, their inner rtable dst is kept referenced. But as they were expired and dst_free'd, the dst core's "free still-referenced dst's" loop goes through that list over and over again and will kill the whole system's performance when the list grows long. I think as a minimum we should add a 'do stale_bundle' check on all xfrm_dst's every n minutes or so.

>>> Then we cannot maintain policy use time. But if it's not a
>>> requirement, we could drop the policy from cache.
>>
>> I don't see why we can't maintain the policy use time if we did
>> this, all you need is a back-pointer from the top xfrm_dst.
>
> Sure.

Actually, no. As the PMTU case showed, it's more likely that the xfrm_dst needs to be regenerated while the policy stays the same, since the policy db isn't touched that often. If we keep them separately, we can avoid doing the policy lookup (which is also O(n)) most of the time. Also, the current cache-entry validation needs to check the policy's bundles_genid before allowing the xfrm_dst to be touched. Otherwise we would have to keep a global bundle_genid, and we'd lose the parent pointer on a cache miss.

Caching bundles would be another win too: if we do cache entries like this, we could track how many cache-miss xfrm_dst's we've had and use that to decide when to trigger the xfrm_dst garbage collector, instead of (or in addition to) a fixed timer.

- Timo
On Thu, Mar 18, 2010 at 09:30:58PM +0200, Timo Teräs wrote:
>
> Now, these created xfrm_dst's get cached in policy->bundles
> with a ref count of zero and are not deleted until garbage
> collection is required. That's how xfrm_find_bundle tries to
> reuse them (but it does an O(n) lookup).

Yes, but the way you're caching it in the policy means that it only helps if two consecutive packets happen to match the same bundle. If your traffic mix was such that each packet required a different bundle, then we're back to where we started.

That's why I was asking for you to directly cache the xfrm_dst objects in the flow cache.

Cheers,
On Thu, Mar 18, 2010 at 09:30:58PM +0200, Timo Teräs wrote:
>
>>> I don't see why we can't maintain the policy use time if we did
>>> this, all you need is a back-pointer from the top xfrm_dst.
>>
>> Sure.
>
> Actually, no. As the PMTU case showed, it's more likely that the
> xfrm_dst needs to be regenerated while the policy stays the same,
> since the policy db isn't touched that often. If we keep them
> separately, we can avoid doing the policy lookup (which is also
> O(n)) most of the time. Also, the current cache-entry validation
> needs to check the policy's bundles_genid before allowing the
> xfrm_dst to be touched. Otherwise we would have to keep a global
> bundle_genid, and we'd lose the parent pointer on a cache miss.

A back-pointer does not require an O(n) lookup.

Cheers,
Herbert Xu wrote:
> On Thu, Mar 18, 2010 at 09:30:58PM +0200, Timo Teräs wrote:
>> Now, these created xfrm_dst's get cached in policy->bundles
>> with a ref count of zero and are not deleted until garbage
>> collection is required. That's how xfrm_find_bundle tries to
>> reuse them (but it does an O(n) lookup).
>
> Yes, but the way you're caching it in the policy means that it
> only helps if two consecutive packets happen to match the same
> bundle. If your traffic mix was such that each packet required
> a different bundle, then we're back to where we started.
>
> That's why I was asking for you to directly cache the xfrm_dst
> objects in the flow cache.

But it always matches. The caching happens using the inner flow. The inner flow always matches the same bundle unless the bundle expires or goes stale. What happens is that I get multiple cache entries per inner flow, each referencing the same bundle.

And this is even more useful with gre+esp. No matter what I send inside gre (with private IPs), it ends up being routed to a more limited set of outer public-IP parties. The bundle lookup happens with the public-IP flow, so the speed-up works even better.

>> Actually, no. As the PMTU case showed, it's more likely that the
>> xfrm_dst needs to be regenerated while the policy stays the same,
>> since the policy db isn't touched that often. If we keep them
>> separately, we can avoid doing the policy lookup (which is also
>> O(n)) most of the time. Also, the current cache-entry validation
>> needs to check the policy's bundles_genid before allowing the
>> xfrm_dst to be touched. Otherwise we would have to keep a global
>> bundle_genid, and we'd lose the parent pointer on a cache miss.
>
> A back-pointer does not require an O(n) lookup.

True. But if we go and prune a bundle due to it being bad or needing garbage collection, we need to invalidate all bundle pointers, and we cannot access the back-pointer. Alternatively, we need to keep xfrm_dst references again in the flow cache, requiring an expensive iteration of all flow cache entries whenever an xfrm_dst needs to be deleted (which happens often).

- Timo
On Fri, Mar 19, 2010 at 07:48:57AM +0200, Timo Teräs wrote:
>
> But it always matches. The caching happens using the inner
> flow. The inner flow always matches the same bundle unless
> the bundle expires or goes stale. What happens is that I get
> multiple cache entries per inner flow, each referencing the
> same bundle.

Sorry for being slow, but if it always matches, doesn't that mean you'll only have a single bundle in the policy bundle list? IOW why do we need this at all?

Or have I misread your patch? You *are* proposing to cache the last used bundle in the policy, right?

> True. But if we go and prune a bundle due to it being bad or
> needing garbage collection, we need to invalidate all bundle
> pointers, and we cannot access the back-pointer. Alternatively,

Why can't you access the back-pointer? You should always have a reference held on the policy, either explicit or implicit.

> we need to keep xfrm_dst references again in the flow cache,
> requiring an expensive iteration of all flow cache entries
> whenever an xfrm_dst needs to be deleted (which happens often).

So does the IPv4 routing cache. I think what this reflects is just that the IPsec garbage collection mechanism is broken.

There is no point in doing a GC on every dst_alloc if we know that it isn't going to go below the threshold. It should gain a minimum GC interval like IPv4. Or perhaps we can move the minimum GC interval check into the dst core.

Cheers,
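The minimum-GC-interval idea suggested here is simple rate limiting: skip the expensive collection pass unless enough time has passed since the last one, the way the IPv4 route cache does. A minimal user-space sketch, with time passed in explicitly so it stays testable; the interval value and function names are made up for illustration.

```c
#include <assert.h>

/* Sketch of a rate-limited GC entry point: a full pass (which would
 * walk all bundles calling something like stale_bundle()) only runs
 * if GC_MIN_INTERVAL has elapsed since the previous pass. */

#define GC_MIN_INTERVAL 1       /* seconds between full GC passes */

static long last_gc;            /* timestamp of the last pass that ran */
static int gc_runs;             /* how many passes actually ran */

static int maybe_gc(long now)
{
    if (now - last_gc < GC_MIN_INTERVAL)
        return 0;               /* too soon: skip the expensive walk */
    last_gc = now;
    gc_runs++;                  /* a real pass would prune stale bundles */
    return 1;
}
```

Calling this from every dst_alloc is then harmless: back-to-back allocations within the interval cost one comparison instead of a full bundle-list walk.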
Herbert Xu wrote:
> On Fri, Mar 19, 2010 at 07:48:57AM +0200, Timo Teräs wrote:
>> But it always matches. The caching happens using the inner
>> flow. The inner flow always matches the same bundle unless
>> the bundle expires or goes stale. What happens is that I get
>> multiple cache entries per inner flow, each referencing the
>> same bundle.
>
> Sorry for being slow, but if it always matches, doesn't that mean
> you'll only have a single bundle in the policy bundle list? IOW
> why do we need this at all?

No. The bundle created for a specific flow always matches that flow later. With a transport-mode wildcard policy (e.g. a single policy saying "encrypt traffic with protocol X to all IP addresses") you get a separate bundle per public destination IP. The bundle matches only that specific IP, since it gets a separate xfrm_state. But you can talk to all the hosts on the internet using the same policy, so you can end up with a whole lot of valid bundles in the same policy.

I'm not sure how this works in tunnel mode. It might be that a single bundle can be reused for all packets. But I think the same applies to tunnel mode, since afinfo->fill_dst() puts the inner flow into the bundle's xfrm_dst->u.rt.fl, which is later compared against the inner flow by afinfo->find_bundle(). I think this implies that each flow traveling inside the tunnel gets its own separate xfrm_dst, so again you end up with a whole lot of valid bundles in the same policy.

> Or have I misread your patch? You *are* proposing to cache the last
> used bundle in the policy, right?

Yes and no. The bundle used is cached on a per-flow basis. The flow cache can have a lot of entries, each referring to the same policy but a separate bundle.

>> True. But if we go and prune a bundle due to it being bad or
>> needing garbage collection, we need to invalidate all bundle
>> pointers, and we cannot access the back-pointer. Alternatively,
>
> Why can't you access the back-pointer? You should always have
> a reference held on the policy, either explicit or implicit.
>
>> we need to keep xfrm_dst references again in the flow cache,
>> requiring an expensive iteration of all flow cache entries
>> whenever an xfrm_dst needs to be deleted (which happens often).
>
> So does the IPv4 routing cache. I think what this reflects is
> just that the IPsec garbage collection mechanism is broken.
>
> There is no point in doing a GC on every dst_alloc if we know
> that it isn't going to go below the threshold. It should gain
> a minimum GC interval like IPv4. Or perhaps we can move the
> minimum GC interval check into the dst core.

Yes, I reported the xfrm_dst GC being broken in an earlier mail.

But keeping both the policy and the bundle in the cache is still a win. If we kill the xfrm_dst due to GC, we would otherwise also lose the policy the flow matched. We might need to kill the xfrm_dst due to the inner dst going old, but the flow cache would still give a hit with the policy info (but no bundle) the next time a packet comes in using the same flow.

- Timo
On Fri, Mar 19, 2010 at 08:21:02AM +0200, Timo Teräs wrote:
>
>> Or have I misread your patch? You *are* proposing to cache the last
>> used bundle in the policy, right?
>
> Yes and no. The bundle used is cached on a per-flow basis.
> The flow cache can have a lot of entries, each referring to
> the same policy but a separate bundle.

OK, so I did misread your patch. In fact it is already doing exactly what I was suggesting. I'll review your patch again with this new insight :)

> But keeping both the policy and the bundle in the cache is still
> a win. If we kill the xfrm_dst due to GC, we would otherwise also
> lose the policy the flow matched. We might need to kill the
> xfrm_dst due to the inner dst going old, but the flow cache would
> still give a hit with the policy info (but no bundle) the next
> time a packet comes in using the same flow.

We were in complete agreement all along :)

Cheers,
On Mon, Mar 15, 2010 at 02:20:10PM +0200, Timo Teras wrote:
>
> -	policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
> -				   dir, xfrm_policy_lookup);
> -	err = PTR_ERR(policy);
> -	if (IS_ERR(policy)) {
> -		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
> -		goto dropdst;
> +	fce = flow_cache_lookup(&net->xfrm.flow_cache,
> +				fl, family, dir);
> +	if (fce == NULL)
> +		goto no_cache;
> +
> +	xf = container_of(fce, struct xfrm_flow_cache_entry, fce);
> +	xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce);

This doesn't work.

The flow cache operates without locking as it is a per-cpu cache. To make this work you must ensure that you stay on the same CPU, or use some other form of synchronisation if you write to the object returned.

AFAICS there is no synchronisation here and you're writing to fce.

So you'll need to disable preemption around the bit that touches fce.

Cheers,
Timo Teräs wrote:
> Herbert Xu wrote:
>> On Fri, Mar 19, 2010 at 07:48:57AM +0200, Timo Teräs wrote:
>>> But it always matches. The caching happens using the inner
>>> flow. The inner flow always matches the same bundle unless
>>> the bundle expires or goes stale. What happens is that I get
>>> multiple cache entries per inner flow, each referencing the
>>> same bundle.
>>
>> Sorry for being slow, but if it always matches, doesn't that mean
>> you'll only have a single bundle in the policy bundle list? IOW
>> why do we need this at all?
>
> No. The bundle created for a specific flow always matches
> that flow later.

Just figured it's easier to explain with an example. We have an SPD entry:

  10.1.0.0/16 - 10.2.0.0/16 tunnel 1.2.3.4 - 4.3.2.1

Now we get n+1 clients connecting to a server at 10.2.0.1. They each get a separate bundle, since the xfrm_dst is created and searched for using flow IDs like:

  src 10.1.x.x dst 10.2.0.1

So there's one xfrm_policy and one xfrm_state, but n+1 xfrm_dst's. Since the flow cache caches the result of lookups on the inner flow ("10.1.x.x -> 10.2.0.1"), it always returns a matching valid bundle in O(1) time unless the xfrm_dst has expired. Currently it's looked up with an O(n) search in find_bundle.

The same thing happens with wildcard transport-mode SPD entries, e.g.:

  0.0.0.0/0 - 0.0.0.0/0 proto gre, transport

If we are talking gre to n+1 tunnel destinations, we get n+1 xfrm_dst's in that xfrm_policy. The flow cache works on the inner flow, using flows like:

  src 1.2.3.4 dst 4.3.2.1 proto gre

and can always keep the right policy in the cache, along with the bundle to use for as long as it stays valid.

Hopefully this explains why I think the patch is useful.

- Timo
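The tunnel-mode example above can be modeled concretely: the flow cache is keyed by the *inner* flow, so n+1 client flows produce n+1 cache entries, yet they can all reference the same bundle for the one tunnel. This is a hypothetical user-space model; the structures, the linear-scan cache, and the resolve parameter are illustrative stand-ins for the kernel's flow cache and resolver.

```c
#include <assert.h>
#include <stddef.h>

/* Toy model of the SPD example: cache entries are keyed by the inner
 * flow (src/dst of the private addresses); each entry records which
 * bundle that inner flow resolved to. */

struct policy { const char *selector; };
struct tunnel_bundle { const char *outer; struct policy *pol; };

struct cache_entry {
    unsigned int inner_src, inner_dst;  /* inner flow key */
    struct tunnel_bundle *bundle;
};

#define MAX_ENTRIES 8
static struct cache_entry cache[MAX_ENTRIES];
static int n_entries;

/* Look up by inner flow; on a miss, install whatever the (modeled)
 * resolver produced for this flow. */
static struct tunnel_bundle *flow_lookup(unsigned int src, unsigned int dst,
                                         struct tunnel_bundle *resolve)
{
    for (int i = 0; i < n_entries; i++)
        if (cache[i].inner_src == src && cache[i].inner_dst == dst)
            return cache[i].bundle;     /* per-inner-flow cache hit */
    cache[n_entries].inner_src = src;
    cache[n_entries].inner_dst = dst;
    cache[n_entries].bundle = resolve;
    n_entries++;
    return resolve;
}
```

Three different inner clients produce three cache entries, all pointing at the same policy and outer-tunnel bundle, which is the "one xfrm_policy, one xfrm_state, n+1 entries" shape described in the email.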
Herbert Xu wrote:
> On Mon, Mar 15, 2010 at 02:20:10PM +0200, Timo Teras wrote:
>> -	policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
>> -				   dir, xfrm_policy_lookup);
>> -	err = PTR_ERR(policy);
>> -	if (IS_ERR(policy)) {
>> -		XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
>> -		goto dropdst;
>> +	fce = flow_cache_lookup(&net->xfrm.flow_cache,
>> +				fl, family, dir);
>> +	if (fce == NULL)
>> +		goto no_cache;
>> +
>> +	xf = container_of(fce, struct xfrm_flow_cache_entry, fce);
>> +	xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce);
>
> This doesn't work.
>
> The flow cache operates without locking as it is a per-cpu cache.
> To make this work you must ensure that you stay on the same CPU,
> or use some other form of synchronisation if you write to the
> object returned.
>
> AFAICS there is no synchronisation here and you're writing to fce.
>
> So you'll need to disable preemption around the bit that touches
> fce.

But flow_cache_lookup disables preemption until the corresponding _put is called, so it should work. Would there be a cleaner way?

However, I now realize that we need to make walk.dead atomic, because it's sometimes read without taking the policy lock.

- Timo
On Fri, Mar 19, 2010 at 09:48:17AM +0200, Timo Teräs wrote:
>
> But flow_cache_lookup disables preemption until the corresponding
> _put is called, so it should work. Would there be a cleaner way?

Previously the flow cache returned a policy directly, which works because whenever we modify that policy we'd take the appropriate lock.

Your patch changes it so that it now returns an fce. But nothing is guarding the code that modifies fce. So two CPUs may end up modifying the same fce.

However, it would appear that this race could be harmless, provided that you are careful about dereferencing fce->policy and fce->dst.

IOW, this is not OK:

	if (fce->policy)
		use fce->policy;

and this should work:

	policy = fce->policy;
	if (policy)
		use policy;

Actually, on second thought, even this isn't totally safe. Who is taking a reference count on the policy and dst? I see a ref count on the fce, but nothing on fce->dst and fce->policy. Do you have an implicit reference on them?

Cheers,
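The single-read pattern Herbert sketches can be shown as compilable code: copy the shared pointer into a local once, then test and use only the local, so a concurrent writer clearing the field between the NULL test and the use cannot bite. This is a user-space illustration of the shape only (in the kernel this is the territory of ACCESS_ONCE/READ_ONCE and barriers, which are omitted here); the struct and function names are made up.

```c
#include <assert.h>
#include <stddef.h>

/* Illustration of the safe dereference shape from the email above. */

struct xpolicy { int uses; };

struct fce {
    struct xpolicy *policy;     /* may be cleared by another CPU */
};

/* Unsafe shape (two reads of the shared field; the second read may
 * observe NULL even though the first one didn't):
 *
 *     if (fce->policy)
 *         use(fce->policy);
 */

/* Safe shape: one read into a local, then work only with the copy. */
static int use_policy(struct fce *fce)
{
    struct xpolicy *pol = fce->policy;  /* single read of shared field */

    if (!pol)
        return 0;
    pol->uses++;                        /* local copy can't be yanked away */
    return 1;
}
```

As the email goes on to note, even the safe shape only prevents the torn read; it does nothing about object lifetime, which is why the reference-count question follows immediately.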
Herbert Xu wrote:
> On Fri, Mar 19, 2010 at 09:48:17AM +0200, Timo Teräs wrote:
>> But flow_cache_lookup disables preemption until the corresponding
>> _put is called, so it should work. Would there be a cleaner way?
>
> Previously the flow cache returned a policy directly, which works
> because whenever we modify that policy we'd take the appropriate
> lock.
>
> Your patch changes it so that it now returns an fce. But nothing
> is guarding the code that modifies fce. So two CPUs may end up
> modifying the same fce.

But I changed that: the flow cache now does *not* call local_bh_enable if it returns something. That is deferred until the corresponding _put call, so BHs are disabled while we are touching the lookup results.

It'd probably make sense to remove that and instead require _lookup to be called with BHs disabled, so it's more obvious that BHs are disabled when touching the cache entry.

> However, it would appear that this race could be harmless, provided
> that you are careful about dereferencing fce->policy and fce->dst.
>
> IOW, this is not OK:
>
>	if (fce->policy)
>		use fce->policy;
>
> and this should work:
>
>	policy = fce->policy;
>	if (policy)
>		use policy;

Not a race. We need to keep BHs disabled while touching the fce for various reasons.

> Actually, on second thought, even this isn't totally safe. Who
> is taking a reference count on the policy and dst? I see a ref
> count on the fce, but nothing on fce->dst and fce->policy. Do
> you have an implicit reference on them?

No one. When the policy and dst are in the cache, there's no reference. The cache generation IDs ensure that the objects exist while they are in the cache. It might make sense to add references to both objects and do a BUG_ON if the flow cache flusher ever needs to delete an object. I guess this would be the proper way, since that's how the dst stuff works too.

- Timo
On Fri, Mar 19, 2010 at 10:37:58AM +0200, Timo Teräs wrote:
>
> But I changed that: the flow cache now does *not* call
> local_bh_enable if it returns something. That is deferred until
> the corresponding _put call, so BHs are disabled while we are
> touching the lookup results.

I'm sorry, but making a function like flow_cache_lookup return with BHs disabled is just wrong!

> It'd probably make sense to remove that and instead require
> _lookup to be called with BHs disabled, so it's more obvious
> that BHs are disabled when touching the cache entry.

That would be better, but it's still hacky. Proper reference counting like we had before would be my preference.

> Not a race. We need to keep BHs disabled while touching the fce
> for various reasons.

What are those reasons (apart from this race)?

> No one. When the policy and dst are in the cache, there's no
> reference. The cache generation IDs ensure that the objects exist
> while they are in the cache. It might make sense to add references
> to both objects and do a BUG_ON if the flow cache flusher ever
> needs to delete an object. I guess this would be the proper way,
> since that's how the dst stuff works too.

The cache genid is not enough:

	CPU1			CPU2
	check genid == OK
				update genid
				kill policy
				kfree on policy
	use policy == BOOM

Cheers,
Herbert Xu wrote:
> On Fri, Mar 19, 2010 at 10:37:58AM +0200, Timo Teräs wrote:
>> But I changed that: the flow cache now does *not* call
>> local_bh_enable if it returns something. That is deferred until
>> the corresponding _put call, so BHs are disabled while we are
>> touching the lookup results.
>
> I'm sorry, but making a function like flow_cache_lookup return with
> BHs disabled is just wrong!
>
>> It'd probably make sense to remove that and instead require
>> _lookup to be called with BHs disabled, so it's more obvious
>> that BHs are disabled when touching the cache entry.
>
> That would be better, but it's still hacky. Proper reference
> counting like we had before would be my preference.

Well, the cache entry is still referenced only very briefly; I don't see why keeping BHs disabled while doing it is considered a hack. Refcounting the cache entries is trickier, though it could be used to optimize the update process: we could safely update the entry in place instead of doing a new lookup later.

>> Not a race. We need to keep BHs disabled while touching the fce
>> for various reasons.
>
> What are those reasons (apart from this race)?

This. And the fact that the cache is synchronized by flow_cache_flush executing stuff on the other CPUs, ensuring that none of them is running any protected cache-accessing code. See below.

>> No one. When the policy and dst are in the cache, there's no
>> reference. The cache generation IDs ensure that the objects exist
>> while they are in the cache. It might make sense to add references
>> to both objects and do a BUG_ON if the flow cache flusher ever
>> needs to delete an object. I guess this would be the proper way,
>> since that's how the dst stuff works too.
>
> The cache genid is not enough:
>
>	CPU1			CPU2
>	check genid == OK
>				update genid
>				kill policy
>				kfree on policy
>	use policy == BOOM

The sequence currently goes:

	CPU1				CPU2
	disable_bh
	check genid == OK
					update genid
					call cache_flush, which blocks
	use policy == OK, take refcount
	enable_bh
	cache_flush smp call executes
	and unblocks CPU2
					return from cache_flush
					kill policy
					kfree on policy

- Timo
On Fri, Mar 19, 2010 at 11:12:21AM +0200, Timo Teräs wrote:
>
>>> That would be better, but it's still hacky. Proper reference
>>> counting like we had before would be my preference.
>>
>> Well, the cache entry is still referenced only very briefly;
>> I don't see why keeping BHs disabled while doing it is considered
>> a hack. Refcounting the cache entries is trickier, though it
>> could be used to optimize the update process: we could safely
>> update the entry in place instead of doing a new lookup later.

Well, we had a nicely type-agnostic cache which was self-contained, but your patch is bleeding generic code into xfrm_policy.c; that's why I felt it to be hacky :)

Anyway, I see how your scheme works now as far as object life is concerned, and I agree that it is safe.

However, I wonder if we could do it while still leaving all the object life-cycle management stuff (and the BH-disabling bits) in flow.c.

The crux of the issue is that you now have two objects to track instead of one. As the direction is a key in the lookup, we're really only worried about the outbound case here.

So how about going back to what I suggested earlier, and keeping a back-pointer from the xfrm_dst to the policy? Of course the xfrm_dst would also hold a ref count on the policy. You'd only have to do it for the top-level xfrm_dst.

It does mean that you'll need to write a different resolver for outbound vs. inbound/forward, but that makes sense because we only use bundles for outbound policies.

What do you think?

Cheers,
Herbert Xu wrote: > On Fri, Mar 19, 2010 at 11:12:21AM +0200, Timo Teräs wrote: >>> That would be better but it's still hacky. Proper reference >>> counting like we had before would be my preference. >> Well, the cache entry is still referenced only very shortly, >> I don't see why keeping bh disabled why doing it is considered >> a hack. Refcounting the cache entries is trickier. Though, >> it could be used to optimize the update process: we could safely >> update it instead of doing now lookup later. > > Well we had a nicely type-agnostic cache which is self-contained, > but your patch is bleeding generic code into xfrm_policy.c, that's > why I felt it to be hacky :) > > Anyway I see how your scheme works now as far as object life > is concerned, and I agree that it is safe. > > However, I wonder if we could do it while still leaving all the > object life-cycle management stuff (and the BH disabling bits) > in flow.c > > The crux of the issue is that you now have two objects to track > instead of one. As the direction is a key in the lookup, we're > really only worried about the outbound case here. > > So how about going back to what I suggested earlier, and keeping > a back-pointer from xfrm_dst to the policy? Of course xfrm_dst > would also hold a ref count on the policy. You'd only have to > do it for the top-level xfrm_dst. > > It does mean that you'll need to write a different resolver for > outbound vs. inbound/forward, but that makes sense because we > only use bundles for outbound policies. > > What do you think? When I first started reading the code, I got confused slightly on how the garbage collection is happening. What I did not like in current is the atomic_dec() in flow.c that does not check if it was turned to zero. 
Because on policy objects it means you need to delete it (which would be a bug if it happened in flow.c; the policy gc calls flush before releasing its own reference), but on xfrm_dst it's perfectly ok to do atomic_dec() and the dst core will garbage collect the items. But now, thinking more, it would probably make more sense to just cache xfrm_dst's and keep a ref to the policy on them. So in general I agree with your recommendation. The only immediate problem I can think of now is that the resolver would need to atomically check if the xfrm_dst is valid and, if not, resolve a new one. But creating a new xfrm_dst involves taking locks and can sleep, so it cannot be inside the main resolver. Alternatively, we'd need to: a) still expose flow cache entry structs and do locking or refcounting on them b) have two versions of flow lookup: one that calls the resolver with bh disabled and can atomically look up and update the entry, and a second version that looks up, calls validation, and if not valid calls the resolver with bh enabled and does a new flow cache lookup to update the cache Also, relatedly: is there a way to release xfrm_dst's child and route refs when xfrm_bundle_ok fails? This would improve GC of the ipv4 rtable entries they referenced. - Timo -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Mar 19, 2010 at 11:53:44AM +0200, Timo Teräs wrote: > > But now, thinking more, it would probably make more sense to > just cache xfrm_dst's and keep a ref to the policy on them. So > in general I agree with your recommendation. The only immediate > problem I can think of now is that the resolver would need to > atomically check if the xfrm_dst is valid and, if not, resolve a new one. > But creating a new xfrm_dst involves taking locks and can sleep > so it cannot be inside the main resolver. OK this brings out my favourite topic :) The reason we have to sleep is to resolve the template. So if we had queueing for pending xfrm_dst objects, we wouldn't have to sleep at all when creating the top-level xfrm_dst. Packets using that xfrm_dst can wait in the queue until it is fully resolved. Now obviously this code doesn't exist so this is all just a wet dream. Setting my favourite topic aside, I have to come to the conclusion that your patch still doesn't fully resolve the problem you set out to fix. The crux of the issue is the linked list of all bundles in a policy and the obvious problems stemming from walking a linked list that is unbounded. The reason I think it doesn't fully resolve this is because of the flow cache. Being a per-cpu cache, when you create the xfrm dst the first time around, you'll at most put it in one CPU's cache. The next CPU that comes along will still have to walk that same bundle linked list. So we're back to square one. Now Dave, my impression is that we picked the per-cpu design because it was the best data structure we had back in 2002, right? If so I'd like us to think about the possibility of switching over to a different design, in particular, an RCU-based hash table, similar to the one I just used for bridge multicasting. This would eliminate the need for walking the bundle list apart from the case when we're destroying the policy, which can be done in process context.
Actually I just realised that the other way we can fix this is to make xfrm_dst objects per-cpu just like IPv4 routes. That is, when you fail to find an xfrm_dst object in the per-cpu cache, you don't bother calling xfrm_find_bundle but just make a new bundle. This is probably much easier than replacing the whole flow cache. Can anyone think of any problems with duplicate xfrm_dst objects? Cheers,
Herbert Xu wrote: > On Fri, Mar 19, 2010 at 11:53:44AM +0200, Timo Teräs wrote: >> But now, thinking more, it would probably make more sense to >> just cache xfrm_dst's and keep a ref to the policy on them. So >> in general I agree with your recommendation. The only immediate >> problem I can think of now is that the resolver would need to >> atomically check if the xfrm_dst is valid and, if not, resolve a new one. >> But creating a new xfrm_dst involves taking locks and can sleep >> so it cannot be inside the main resolver. > > OK this brings out my favourite topic :) > > The reason we have to sleep is to resolve the template. So if > we had queueing for pending xfrm_dst objects, we wouldn't have > to sleep at all when creating the top-level xfrm_dst. > > Packets using that xfrm_dst can wait in the queue until it is > fully resolved. Now obviously this code doesn't exist so this > is all just a wet dream. Right. That sounds useful. > Setting my favourite topic aside, I have to come to the conclusion > that your patch still doesn't fully resolve the problem you set out > to fix. > > The crux of the issue is the linked list of all bundles in a > policy and the obvious problems stemming from walking a linked > list that is unbounded. > > The reason I think it doesn't fully resolve this is because of > the flow cache. Being a per-cpu cache, when you create the xfrm > dst the first time around, you'll at most put it in one CPU's > cache. > > The next CPU that comes along will still have to walk that same > bundle linked list. So we're back to square one. Not exactly: each CPU does one slow lookup, after which it finds it fast. But yes, it's not a perfect solution. Especially if the CPU happens to get switched between the initial lookup and the update. > Now Dave, my impression is that we picked the per-cpu design > because it was the best data structure we had back in 2002, > right?
> > If so I'd like us to think about the possibility of switching > over to a different design, in particular, an RCU-based hash > table, similar to the one I just used for bridge multicasting. > > This would eliminate the need for walking the bundle list apart > from the case when we're destroying the policy, which can be > done in process context. Right. This would speed up the bundle lookup in all cases. Except... we can have an override policy on a per-socket basis. We should include the per-socket override in the flow lookups so that those sockets also get a boost from the cache. Though the usual use case is to disable all policies (so e.g. IKE can be talked without policies applying). > Actually I just realised that the other way we can fix this is > to make xfrm_dst objects per-cpu just like IPv4 routes. That > is, when you fail to find an xfrm_dst object in the per-cpu > cache, you don't bother calling xfrm_find_bundle but just make > a new bundle. > > This is probably much easier than replacing the whole flow cache. > Can anyone think of any problems with duplicate xfrm_dst objects? Sounds like a very good idea. If we instantiate a new xfrm_dst, all that it shares with others is the xfrm_state and xfrm_policy (the inner objects will be unique). Since that's what happens anyway I don't see any problem with this. So we should go ahead and: 1. modify the flow cache to be more generic (have virtual put and get for each object; and remove the atomic_t pointer) 2. modify the flow cache to have slow and fast resolvers so we can cope with the current sleeping requirement 3. cache bundles instead of policies for outgoing stuff 4. kill find_bundle and just instantiate new ones if we get a cache miss 5. put all bundles on a global hlist (since the only place that walks through them is gc, and a stale bundle can be dst_free'd right away); use genids for the policy to flush old bundles 6.
dst_free and unlink the bundle immediately if it's found to be stale - Timo
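The genid idea in item 5 of the plan above can be sketched in a few lines of plain C (hypothetical names; the kernel side would use atomic_t and the real policy/bundle structures): each bundle snapshots the policy's generation number when it is built, and flushing is just an O(1) increment that lazily marks every existing bundle stale.

```c
#include <assert.h>

struct policy_sketch {
	unsigned int genid;	/* bumped on any policy change */
};

struct bundle_sketch {
	const struct policy_sketch *pol;
	unsigned int genid;	/* snapshot taken at bundle creation */
};

static void bundle_init(struct bundle_sketch *b, const struct policy_sketch *p)
{
	b->pol = p;
	b->genid = p->genid;
}

/* A bundle is stale as soon as its snapshot no longer matches. */
static int bundle_is_stale(const struct bundle_sketch *b)
{
	return b->genid != b->pol->genid;
}

/* O(1) flush: no list walking; stale bundles are reaped lazily
 * (dst_free'd) the next time they are looked at. */
static void policy_flush_bundles(struct policy_sketch *p)
{
	p->genid++;
}
```

This is what makes item 6 possible: the lookup path can detect and drop a stale bundle on sight instead of a gc pass walking an unbounded list.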
On Sat, Mar 20, 2010 at 06:26:00PM +0200, Timo Teräs wrote: > > So we should go ahead and: > 1. modify the flow cache to be more generic (have virtual put and get > for each object; and remove the atomic_t pointer) > 2. modify the flow cache to have slow and fast resolvers so we can > cope with the current sleeping requirement I don't think we need either of these. To support the sleep requirement, just return -EAGAIN from the resolver when the template can't be resolved. Then the caller of flow_cache_lookup can sleep as it does now. It simply has to repeat the flow cache lookup afterwards. > 3. cache bundles instead of policies for outgoing stuff > 4. kill find_bundle and just instantiate new ones if we get a cache > miss > 5. put all bundles on a global hlist (since the only place that walks > through them is gc, and a stale bundle can be dst_free'd right > away); use genids for the policy to flush old bundles > 6. dst_free and unlink the bundle immediately if it's found to be stale Sounds good. Cheers,
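The -EAGAIN scheme can be sketched like this (a userspace approximation with invented names; the real flow_cache_lookup and resolver signatures are different): the cache-side resolver never sleeps, and the caller does the sleeping work outside the cache before simply repeating the lookup.

```c
#include <assert.h>
#include <errno.h>
#include <stddef.h>

/* Hypothetical resolver: returns 0 and fills *obj on success, or
 * -EAGAIN when the template could not be resolved without sleeping. */
typedef int (*resolver_t)(void **obj);

static int flow_cache_lookup_sketch(resolver_t resolve, void **obj)
{
	/* bh would be disabled here in the real code, so the
	 * resolver must not sleep in this path. */
	return resolve(obj);
}

/* Caller side: on -EAGAIN, do the sleeping template resolution
 * outside the cache, then just repeat the lookup. */
static int xfrm_lookup_like(resolver_t resolve, void **obj)
{
	int err = flow_cache_lookup_sketch(resolve, obj);
	if (err == -EAGAIN) {
		/* ... sleep, resolve templates ... */
		err = flow_cache_lookup_sketch(resolve, obj);
	}
	return err;
}

/* Demo resolver for the sketch: fails once, then succeeds. */
static int demo_calls;
static int demo_resolve(void **obj)
{
	if (demo_calls++ == 0)
		return -EAGAIN;
	*obj = &demo_calls;
	return 0;
}
```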
Herbert Xu wrote: > On Sat, Mar 20, 2010 at 06:26:00PM +0200, Timo Teräs wrote: >> So we should go ahead and: >> 1. modify the flow cache to be more generic (have virtual put and get >> for each object; and remove the atomic_t pointer) >> 2. modify the flow cache to have slow and fast resolvers so we can >> cope with the current sleeping requirement > > I don't think we need either of these. To support the sleep > requirement, just return -EAGAIN from the resolver when the > template can't be resolved. Then the caller of flow_cache_lookup > can sleep as it does now. It simply has to repeat the flow > cache lookup afterwards. Ok, we can do that to skip 2. But I think 1 would still be useful. It'd probably be good to actually have a flow_cache_ops pointer in each entry instead of the atomic_t pointer. The reasoning: - we can then have type-based checks that the reference count is valid (e.g. a policy's refcount must not go to zero, that's a bug, and we can call dst_release which warns if the refcount goes negative); imho it's a hack to call atomic_dec instead of the real type's xxx_put - the flow cache needs to somehow know if the entry is stale so it'll try to refresh it atomically; e.g. if there's no check for 'stale', the lookup returns a stale xfrm_dst. we'd then need a new api to update the stale entry, or flush it out and repeat the lookup. the virtual get could check for it being stale (if so, release the entry) and then return null for the generic code to call the resolver atomically - for paranoia we can actually check the type of the object in the cache via the ops (if needed) >> 3. cache bundles instead of policies for outgoing stuff >> 4. kill find_bundle and just instantiate new ones if we get a cache >> miss >> 5. put all bundles on a global hlist (since the only place that walks >> through them is gc, and a stale bundle can be dst_free'd right >> away); use genids for the policy to flush old bundles >> 6. dst_free and unlink the bundle immediately if it's found to be stale > > Sounds good. Okay.
Sounds like a plan. I'll work on this next week. I'll also try to make it a series of patches instead of the big hunk I sent initially. - Timo
Timo Teräs wrote: > Herbert Xu wrote: >> On Sat, Mar 20, 2010 at 06:26:00PM +0200, Timo Teräs wrote: >>> So we should go ahead and: >>> 1. modify the flow cache to be more generic (have virtual put and get >>> for each object; and remove the atomic_t pointer) >>> 2. modify the flow cache to have slow and fast resolvers so we can >>> cope with the current sleeping requirement >> >> I don't think we need either of these. To support the sleep >> requirement, just return -EAGAIN from the resolver when the >> template can't be resolved. Then the caller of flow_cache_lookup >> can sleep as it does now. It simply has to repeat the flow >> cache lookup afterwards. > > Ok, we can do that to skip 2. But I think 1 would still be useful. > It'd probably be good to actually have a flow_cache_ops pointer in > each entry instead of the atomic_t pointer. > > The reasoning: > - we can then have type-based checks that the reference count > is valid (e.g. a policy's refcount must not go to zero, that's a bug, > and we can call dst_release which warns if the refcount goes > negative); imho it's a hack to call atomic_dec instead of the > real type's xxx_put > - the flow cache needs to somehow know if the entry is stale so > it'll try to refresh it atomically; e.g. if there's no > check for 'stale', the lookup returns a stale xfrm_dst. we'd > then need a new api to update the stale entry, or flush it out > and repeat the lookup. the virtual get could check for it being > stale (if so, release the entry) and then return null for the > generic code to call the resolver atomically > - for paranoia we can actually check the type of the object in > the cache via the ops (if needed) - could cache a bundle OR a policy for outgoing stuff. it's useful to cache the policy in case we need to sleep, or if it's a policy forbidding traffic. in those cases there's no bundle to cache at all. alternatively we can make dummy bundles that are marked invalid and are just used to keep a reference to the policy.
Oh, this also implies that the resolver function should be changed to get the old stale object so it can re-use it to get the policy object instead of searching it all over again. - Timo
From: Herbert Xu <herbert@gondor.apana.org.au> Date: Sat, 20 Mar 2010 23:17:51 +0800 > Now Dave, my impression is that we picked the per-cpu design > because it was the best data structure we had back in 2002, > right? Basically. It was envisioned that flows at that level of detail would be spread between different cpus, and that individual flows wouldn't propagate onto multiple cpus much, if at all. And if they did, no big deal, we have an entry in the cache of those cpus. Do we know of cases where this happens often? In any event, RCU would certainly fit the bill just as easily and I have no qualms against going in that direction. Timo mentioned the socket overrides; we handle them at the top level right before we look into the flow cache, and I think it should stay that way: we shouldn't bother tossing those into the flow cache at all. Just my humble opinion on this :-)
From: Herbert Xu <herbert@gondor.apana.org.au> Date: Sat, 20 Mar 2010 23:17:51 +0800 > Actually I just realised that the other way we can fix this is > to make xfrm_dst objects per-cpu just like IPv4 routes. That > is, when you fail to find an xfrm_dst object in the per-cpu > cache, you don't bother calling xfrm_find_bundle but just make > a new bundle. How are ipv4 routing cache entries per-cpu? That would screw up route metrics for TCP sockets quite a lot if they were.
On Sun, Mar 21, 2010 at 06:28:46PM -0700, David Miller wrote: > From: Herbert Xu <herbert@gondor.apana.org.au> > Date: Sat, 20 Mar 2010 23:17:51 +0800 > > > Actually I just realised that the other way we can fix this is > > to make xfrm_dst objects per-cpu just like IPv4 routes. That > > is, when you fail to find an xfrm_dst object in the per-cpu > > cache, you dont' bother calling xfrm_find_bundle but just make > > a new bundle. > > How are ipv4 routing cache entries per-cpu? That would screw up route > metrics for TCP sockets quite a lot if they were. You're right of course, s/just like IPv4 routes// :)
From: Herbert Xu <herbert@gondor.apana.org.au> Date: Mon, 22 Mar 2010 09:32:57 +0800 > On Sun, Mar 21, 2010 at 06:28:46PM -0700, David Miller wrote: >> From: Herbert Xu <herbert@gondor.apana.org.au> >> Date: Sat, 20 Mar 2010 23:17:51 +0800 >> >> > Actually I just realised that the other way we can fix this is >> > to make xfrm_dst objects per-cpu just like IPv4 routes. That >> > is, when you fail to find an xfrm_dst object in the per-cpu >> > cache, you don't bother calling xfrm_find_bundle but just make >> > a new bundle. >> >> How are ipv4 routing cache entries per-cpu? That would screw up route >> metrics for TCP sockets quite a lot if they were. > > You're right of course, s/just like IPv4 routes// :) And as a consequence, making the xfrm_dst's be per-cpu would mess with route metrics for TCP. If we do something like that, then there is simply no reason any longer to have such fine-grained routing metrics if the one thing that would use it heavily (ipsec) stops doing so completely. At that point we can go to a host cache for metrics just like BSD, and pull all of the metrics out of struct dst (an enormous win as it makes all routes significantly smaller). I'm willing to consider this seriously, to be honest.
On Sun, Mar 21, 2010 at 06:36:56PM -0700, David Miller wrote: > > And as a consequence, making the xfrm_dst's be per-cpu would mess with > route metrics for TCP. Actually xfrm_dst currently relies on IPv4 rt objects to maintain the metrics. So as long as IPv4 routes are still global, the metrics won't be affected as far as I can see. Did I miss something? > I'm willing to consider this seriously, to be honest. I would consider that too. But right now I'm just looking for the bare minimum to solve the problem at hand :) Cheers,
From: Herbert Xu <herbert@gondor.apana.org.au> Date: Mon, 22 Mar 2010 09:40:10 +0800 > On Sun, Mar 21, 2010 at 06:36:56PM -0700, David Miller wrote: >> >> And as a consequence, making the xfrm_dst's be per-cpu would mess with >> route metrics for TCP. > > Actually xfrm_dst currently relies on IPv4 rt objects to maintain > the metrics. So as long as IPv4 routes are still global, the > metrics won't be affected as far as I can see. > > Did I miss something? Good point, I was misunderstanding how things work now and how that would change with your proposal. Having multiple xfrm_dsts exist for an IPSEC route seems fine to me.
On Sun, Mar 21, 2010 at 10:31:23AM +0200, Timo Teräs wrote: > >> Ok, we can do that to skip 2. But I think 1 would still be useful. >> It'd probably be good to actually have a flow_cache_ops pointer in >> each entry instead of the atomic_t pointer. >> >> The reasoning: >> - we can then have type-based checks that the reference count >> is valid (e.g. a policy's refcount must not go to zero, that's a bug, >> and we can call dst_release which warns if the refcount goes >> negative); imho it's a hack to call atomic_dec instead of the >> real type's xxx_put >> - the flow cache needs to somehow know if the entry is stale so >> it'll try to refresh it atomically; e.g. if there's no >> check for 'stale', the lookup returns a stale xfrm_dst. we'd >> then need a new api to update the stale entry, or flush it out >> and repeat the lookup. the virtual get could check for it being >> stale (if so, release the entry) and then return null for the >> generic code to call the resolver atomically >> - for paranoia we can actually check the type of the object in >> the cache via the ops (if needed) The reason I'd prefer to keep the current scheme is to avoid an additional indirect function call on each packet. The way it would work is (we need flow_cache_lookup to return the fle instead of the object):

fle = flow_cache_lookup
xdst = fle->object
if (xdst is stale) {
	flow_cache_mark_obsolete(fle)
	fle = flow_cache_lookup
	xdst = fle->object
	if (xdst is stale)
		return error
}

Where flow_cache_mark_obsolete would set a flag in the fle that's checked by flow_cache_lookup. To prevent the very rare case where we mark an entry obsolete incorrectly, the resolver function should double-check that the existing entry is indeed obsolete before making a new one. This way we push the overhead over to the slow path where the bundle is stale. You were saying that our bundles are going stale very frequently; that would sound like a bug that we should look into.
The whole caching scheme is pointless if the bundle is going stale every other packet. > - could cache a bundle OR a policy for outgoing stuff. it's useful > to cache the policy in case we need to sleep, or if it's a > policy forbidding traffic. in those cases there's no bundle > to cache at all. alternatively we can make dummy bundles that > are marked invalid and are just used to keep a reference to > the policy. My instinct is to go with dummy bundles. That way, given the direction, we know exactly what object type it is. Having mixed object types is just too much of a pain. > Oh, this also implies that the resolver function should be > changed to get the old stale object so it can re-use it to > get the policy object instead of searching it all over again. That should be easy to implement. Just prefill the obj argument to the resolver with either NULL or the stale object. For the bundle resolver, it should also remove the stale bundle from the policy bundle list and drop its reference. Cheers,
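The "prefill the resolver with the old object" idea can be sketched as follows (userspace approximation; `bundle_sketch`, `policy_lookup_slow`, and friends are hypothetical names, not the kernel API). The resolver receives the stale entry, if any, and steals its policy reference instead of repeating the expensive policy search:

```c
#include <assert.h>
#include <stddef.h>

struct policy_sketch {
	int refcnt;
};

struct bundle_sketch {
	struct policy_sketch *pol;
	int stale;
	int dummy;	/* dummy bundle: only carries the policy reference */
};

static struct policy_sketch slow_policy = { .refcnt = 1 };

/* Stand-in for the expensive O(n) policy lookup we want to avoid. */
static struct policy_sketch *policy_lookup_slow(void)
{
	return &slow_policy;
}

/* Resolver gets the old (possibly stale) entry prefilled: if there
 * is one, steal its policy reference instead of searching again. */
static void bundle_resolve(struct bundle_sketch *out, struct bundle_sketch *old)
{
	struct policy_sketch *pol;

	if (old && old->pol) {
		pol = old->pol;		/* reuse the existing reference */
		old->pol = NULL;
	} else {
		pol = policy_lookup_slow();
		pol->refcnt++;
	}
	out->pol = pol;
	out->stale = 0;
	out->dummy = 0;
}
```

A "dummy" bundle here is just an entry marked invalid whose only job is to hold the policy reference, matching the single-object-type preference above.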
On Sun, Mar 21, 2010 at 08:12:58PM -0700, David Miller wrote: > > Good point, I was misunderstanding how things work now and how > that would change with your proposal. > > Having multiple xfrm_dsts exist for an IPSEC route seems fine > to me. Thanks for the confirmation. Timo, let's roll along with the per-cpu xfrm_dst approach. Cheers,
Herbert Xu wrote: > On Sun, Mar 21, 2010 at 10:31:23AM +0200, Timo Teräs wrote: >>> Ok, we can do that to skip 2. But I think 1 would still be useful. >>> It'd probably be good to actually have a flow_cache_ops pointer in >>> each entry instead of the atomic_t pointer. >>> >>> The reasoning: >>> - we can then have type-based checks that the reference count >>> is valid (e.g. a policy's refcount must not go to zero, that's a bug, >>> and we can call dst_release which warns if the refcount goes >>> negative); imho it's a hack to call atomic_dec instead of the >>> real type's xxx_put >>> - the flow cache needs to somehow know if the entry is stale so >>> it'll try to refresh it atomically; e.g. if there's no >>> check for 'stale', the lookup returns a stale xfrm_dst. we'd >>> then need a new api to update the stale entry, or flush it out >>> and repeat the lookup. the virtual get could check for it being >>> stale (if so, release the entry) and then return null for the >>> generic code to call the resolver atomically >>> - for paranoia we can actually check the type of the object in >>> the cache via the ops (if needed) > > The reason I'd prefer to keep the current scheme is to avoid > an additional indirect function call on each packet. > > The way it would work is (we need flow_cache_lookup to return > the fle instead of the object): > > fle = flow_cache_lookup > xdst = fle->object > if (xdst is stale) { > 	flow_cache_mark_obsolete(fle) > 	fle = flow_cache_lookup > 	xdst = fle->object > 	if (xdst is stale) > 		return error > } > > Where flow_cache_mark_obsolete would set a flag in the fle that's > checked by flow_cache_lookup. To prevent the very rare case > where we mark an entry obsolete incorrectly, the resolver function > should double-check that the existing entry is indeed obsolete > before making a new one. > > This way we push the overhead over to the slow path where the > bundle is stale. Well, yes. The fast path would be slightly faster.
However, I still find the indirect-call-based thingy more elegant. We would also get more common code, as flow_cache_lookup could then figure out from the virtual call whether the entry needs refreshing or not. And doing just atomic_dec instead of the type-based thingy feels slightly kludgy. I don't think the speed difference between a direct and an indirect call is that significant. Also, the fle would just need a "struct flow_cache_ops *", and wrappers that use container_of to figure out the real address of the cached struct. This would allow a truly type-agnostic cache. So we'd just need the 'ops' pointer instead of the current object pointer and atomic_t pointer, saving fle size. But yes, it does impose the small speed penalty of an indirect call. I prefer the 'ops' thingy, but have no strong feelings either way. Do you feel strongly about going with the current scheme here? > You were saying that our bundles are going stale very frequently, > that would sound like a bug that we should look into. The whole > caching scheme is pointless if the bundle is going stale every > other packet. I mean frequently as in 'minutes', not as in 'milliseconds'. The bundles go stale only when the policy (mostly by user action) or the ip route (pmtu / minutes) changes. So no biggie here. >> - could cache a bundle OR a policy for outgoing stuff. it's useful >> to cache the policy in case we need to sleep, or if it's a >> policy forbidding traffic. in those cases there's no bundle >> to cache at all. alternatively we can make dummy bundles that >> are marked invalid and are just used to keep a reference to >> the policy. > > My instinct is to go with dummy bundles. That way, given the > direction, we know exactly what object type it is. Having mixed > object types is just too much of a pain. Sounds good. >> Oh, this also implies that the resolver function should be >> changed to get the old stale object so it can re-use it to >> get the policy object instead of searching it all over again.
> > That should be easy to implement. Just prefill the obj argument > to the resolver with either NULL or the stale object. > > For the bundle resolver, it should also remove the stale bundle > from the policy bundle list and drop its reference. Yup. Cheers, Timo
Herbert Xu wrote: > On Sun, Mar 21, 2010 at 08:12:58PM -0700, David Miller wrote: >> Good point, I was misunderstanding how things work now and how >> that would change with your proposal. >> >> Having multiple xfrm_dsts exist for an IPSEC route seems fine >> to me. > > Thanks for the confirmation. > > Timo, let's roll along with the per-cpu xfrm_dst approach. Okay. - Timo
Timo Teräs wrote: > Herbert Xu wrote: >> On Sun, Mar 21, 2010 at 10:31:23AM +0200, Timo Teräs wrote: >>>> Ok, we can do that to skip 2. But I think 1 would still be useful. >>>> It'd probably be good to actually have a flow_cache_ops pointer in >>>> each entry instead of the atomic_t pointer. >>>> >>>> The reasoning: >>>> - we can then have type-based checks that the reference count >>>> is valid (e.g. a policy's refcount must not go to zero, that's a bug, >>>> and we can call dst_release which warns if the refcount goes >>>> negative); imho it's a hack to call atomic_dec instead of the >>>> real type's xxx_put >>>> - the flow cache needs to somehow know if the entry is stale so >>>> it'll try to refresh it atomically; e.g. if there's no >>>> check for 'stale', the lookup returns a stale xfrm_dst. we'd >>>> then need a new api to update the stale entry, or flush it out >>>> and repeat the lookup. the virtual get could check for it being >>>> stale (if so, release the entry) and then return null for the >>>> generic code to call the resolver atomically >>>> - for paranoia we can actually check the type of the object in >>>> the cache via the ops (if needed) >> >> The reason I'd prefer to keep the current scheme is to avoid >> an additional indirect function call on each packet. >> >> The way it would work is (we need flow_cache_lookup to return >> the fle instead of the object): >> >> fle = flow_cache_lookup >> xdst = fle->object >> if (xdst is stale) { >> 	flow_cache_mark_obsolete(fle) >> 	fle = flow_cache_lookup >> 	xdst = fle->object >> 	if (xdst is stale) >> 		return error >> } I've been thinking more about doing this, and I think this approach is fundamentally racy. The underlying fle->object can be changed underneath us since bh is not disabled. This means the refcounting needs to be done at the fle level, and additional measures are needed to ensure that fle->object is as expected.
Additionally, the second flow_cache_lookup can happen on another cpu, and could return a stale object that indeed would need refreshing instead of generating an error. Options: 1. return the fle and fle->object separately, refcount both 2. call flow_cache_lookup with bh disabled 3. use flow_cache_entry_ops and virtualize put/get Since 2. was previously said to leak generic code, it does not sound good. On 1 vs 3, I'd still choose 3 since: - adding one more refcount means more atomic calls, which have the same (or more) overhead as an indirect call - two refcounts sound more complicated than one - exposing the fle and touching it without bh disabled requires a whole lot more extra care; doing 3 is simpler So, should I go ahead and do the virtualization of the getters and putters? - Timo
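Option 3 above can be sketched in plain C (a userspace approximation of the idea in this thread; the names and layouts are illustrative, not the final kernel code). Each cached object embeds the generic entry, the entry carries only an ops pointer, and container_of recovers the real type inside the virtual get/put. The get doubles as the staleness check: it returns NULL so the generic code re-runs the resolver.

```c
#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct flow_cache_entry_sketch;

struct flow_cache_ops_sketch {
	/* get: return the entry with its refcount bumped if still
	 * valid, or NULL so the generic code calls the resolver. */
	struct flow_cache_entry_sketch *(*get)(struct flow_cache_entry_sketch *fce);
	void (*put)(struct flow_cache_entry_sketch *fce);
};

/* The generic entry holds only the ops pointer: type-agnostic. */
struct flow_cache_entry_sketch {
	const struct flow_cache_ops_sketch *ops;
};

/* A cached bundle embeds the generic entry; container_of recovers it. */
struct cached_bundle {
	struct flow_cache_entry_sketch fce;
	int refcnt;
	int stale;
};

static struct flow_cache_entry_sketch *bundle_get(struct flow_cache_entry_sketch *fce)
{
	struct cached_bundle *b = container_of(fce, struct cached_bundle, fce);

	if (b->stale)
		return NULL;	/* generic code will run the resolver */
	b->refcnt++;
	return fce;
}

static void bundle_put(struct flow_cache_entry_sketch *fce)
{
	struct cached_bundle *b = container_of(fce, struct cached_bundle, fce);

	assert(b->refcnt > 0);
	b->refcnt--;
}

static const struct flow_cache_ops_sketch bundle_ops = {
	.get = bundle_get,
	.put = bundle_put,
};
```

The per-packet cost is the single indirect `->get` call, which matches the "only the getter is called per-packet" observation below.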
On Tue, Mar 23, 2010 at 09:28:33AM +0200, Timo Teräs wrote: > > So, should I go ahead and do the virtualization of the > getters and putters? If we're going to have indirect calls per packet then let's at least minimise them. I think one should be enough, i.e., just add a ->stale() function pointer to be used in flow_cache_lookup. Cheers,
Herbert Xu wrote: > On Tue, Mar 23, 2010 at 09:28:33AM +0200, Timo Teräs wrote: >> So, should I go ahead and do the virtualization of the >> getters and putters? > > If we're going to have indirect calls per packet then let's at > least minimise them. I think one should be enough, i.e., just > add a ->stale() function pointer to be used in flow_cache_lookup. Normally, only the getter is called per-packet. So I'm thinking of having a get() that checks whether the entry is stale and bumps up the refcount. The resolver can just swap the old entry and call the appropriate _put/_get, so we can avoid virtual calls there. Thinking more about the flushing of the flow cache: it's basically only needed before the policies are garbage collected. This is strictly needed so we can do the atomic_dec() without turning the policy's refcount to zero and causing a leak. I'm now wondering whether it'd be worth doing a virtual _put(). This way we would not need flushing at all in the policy garbage collector (we could kill flow_cache_flush). We could also call dst_release immediately in the flow cache _put, which would release the memory faster. This way we would not need any periodic xfrm_dst checker at all, as the flow cache is regenerated regularly. The code paths that would require calling the virtual put are: - randomization of the flow cache (every 10 mins currently) from flow_cache_lookup() with bh disabled - the flow cache growing too full, from flow_cache_lookup() with bh disabled - the cpu notifier Do you think the virtual _put doing more work would be too slow? In that case the plain atomic_dec sounds ok, but we'd then need to periodically walk the global bundle list for garbage collection. Cheers, Timo
On Tue, Mar 23, 2010 at 11:19:05AM +0200, Timo Teräs wrote: > > Normally, only the getter is called per-packet. So I'm thinking > to have get() that would check for entry being stale or not, > and bump up the refcount. OK if it's just one call per packet it sounds good to me. Thanks,
diff --git a/include/net/flow.h b/include/net/flow.h index 809970b..814a9d2 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -8,6 +8,9 @@ #define _NET_FLOW_H #include <linux/in6.h> +#include <linux/notifier.h> +#include <linux/timer.h> +#include <linux/slab.h> #include <asm/atomic.h> struct flowi { @@ -86,13 +89,37 @@ struct flowi { struct net; struct sock; -typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family, - u8 dir, void **objp, atomic_t **obj_refp); -extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, - u8 dir, flow_resolve_t resolver); -extern void flow_cache_flush(void); -extern atomic_t flow_cache_genid; +struct flow_cache_percpu; +struct flow_cache_entry; + +struct flow_cache { + u32 hash_shift; + u32 order; + struct flow_cache_percpu * percpu; + struct notifier_block hotcpu_notifier; + int low_watermark; + int high_watermark; + struct timer_list rnd_timer; + struct kmem_cache * flow_cachep; +}; + +struct flow_cache_entry { + struct flow_cache_entry *next; + struct flowi key; + u16 family; + u8 dir; +}; + +extern struct flow_cache_entry *flow_cache_lookup( + struct flow_cache *cache, struct flowi *key, + u16 family, u8 dir); +extern void flow_cache_entry_put(struct flow_cache_entry *fce); + +void flow_cache_flush(struct flow_cache *fc, + void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce)); +extern int flow_cache_init(struct flow_cache *cache, size_t entry_size); +extern void flow_cache_fini(struct flow_cache *cache); static inline int flow_cache_uli_match(struct flowi *fl1, struct flowi *fl2) { diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 74f119a..1b223c9 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -42,6 +42,10 @@ struct netns_xfrm { struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX * 2]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; + atomic_t policy_genid; + struct hlist_head 
policy_gc_list; + struct work_struct policy_gc_work; + struct flow_cache flow_cache; struct dst_ops xfrm4_dst_ops; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index d74e080..f469b9b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -488,6 +488,7 @@ struct xfrm_policy { struct xfrm_lifetime_cfg lft; struct xfrm_lifetime_cur curlft; struct dst_entry *bundles; + atomic_t bundles_genid; struct xfrm_policy_walk_entry walk; u8 type; u8 action; diff --git a/net/core/flow.c b/net/core/flow.c index 9601587..e3782c2 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -25,114 +25,85 @@ #include <asm/atomic.h> #include <linux/security.h> -struct flow_cache_entry { - struct flow_cache_entry *next; - u16 family; - u8 dir; - u32 genid; - struct flowi key; - void *object; - atomic_t *object_ref; -}; - -atomic_t flow_cache_genid = ATOMIC_INIT(0); - -static u32 flow_hash_shift; -#define flow_hash_size (1 << flow_hash_shift) -static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; - -#define flow_table(cpu) (per_cpu(flow_tables, cpu)) - -static struct kmem_cache *flow_cachep __read_mostly; -static int flow_lwm, flow_hwm; - -struct flow_percpu_info { - int hash_rnd_recalc; - u32 hash_rnd; - int count; +struct flow_cache_percpu { + struct flow_cache_entry ** hash_table; + int hash_count; + u32 hash_rnd; + int hash_rnd_recalc; + struct tasklet_struct flush_tasklet; }; -static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 }; - -#define flow_hash_rnd_recalc(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) -#define flow_hash_rnd(cpu) \ - (per_cpu(flow_hash_info, cpu).hash_rnd) -#define flow_count(cpu) \ - (per_cpu(flow_hash_info, cpu).count) - -static struct timer_list flow_hash_rnd_timer; - -#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) struct flow_flush_info { - atomic_t cpuleft; - struct completion completion; + void (*flush)(struct flow_cache *fc, struct flow_cache_entry 
*fce); + struct flow_cache * cache; + atomic_t cpuleft; + struct completion completion; }; -static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL }; -#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) +#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) +#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) static void flow_cache_new_hashrnd(unsigned long arg) { + struct flow_cache *fc = (struct flow_cache *) arg; int i; for_each_possible_cpu(i) - flow_hash_rnd_recalc(i) = 1; + per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1; - flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&flow_hash_rnd_timer); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); } -static void flow_entry_kill(int cpu, struct flow_cache_entry *fle) -{ - if (fle->object) - atomic_dec(fle->object_ref); - kmem_cache_free(flow_cachep, fle); - flow_count(cpu)--; -} - -static void __flow_cache_shrink(int cpu, int shrink_to) +static void __flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + int shrink_to) { struct flow_cache_entry *fle, **flp; int i; - for (i = 0; i < flow_hash_size; i++) { + for (i = 0; i < flow_cache_hash_size(fc); i++) { int k = 0; - flp = &flow_table(cpu)[i]; + flp = &fcp->hash_table[i]; while ((fle = *flp) != NULL && k < shrink_to) { k++; flp = &fle->next; } while ((fle = *flp) != NULL) { *flp = fle->next; - flow_entry_kill(cpu, fle); + + kmem_cache_free(fc->flow_cachep, fle); + fcp->hash_count--; } } } -static void flow_cache_shrink(int cpu) +static void flow_cache_shrink(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - int shrink_to = flow_lwm / flow_hash_size; + int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); - __flow_cache_shrink(cpu, shrink_to); + __flow_cache_shrink(fc, fcp, shrink_to); } -static void flow_new_hash_rnd(int cpu) +static void flow_new_hash_rnd(struct flow_cache *fc, + struct flow_cache_percpu *fcp) { - 
get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); - flow_hash_rnd_recalc(cpu) = 0; - - __flow_cache_shrink(cpu, 0); + get_random_bytes(&fcp->hash_rnd, sizeof(u32)); + fcp->hash_rnd_recalc = 0; + __flow_cache_shrink(fc, fcp, 0); } -static u32 flow_hash_code(struct flowi *key, int cpu) +static u32 flow_hash_code(struct flow_cache *fc, + struct flow_cache_percpu *fcp, + struct flowi *key) { u32 *k = (u32 *) key; - return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & - (flow_hash_size - 1)); + return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) + & (flow_cache_hash_size(fc) - 1)); } #if (BITS_PER_LONG == 64) @@ -165,128 +136,100 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) return 0; } -void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver) +struct flow_cache_entry *flow_cache_lookup(struct flow_cache *fc, + struct flowi *key, + u16 family, u8 dir) { struct flow_cache_entry *fle, **head; + struct flow_cache_percpu *fcp; unsigned int hash; - int cpu; local_bh_disable(); - cpu = smp_processor_id(); + fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); fle = NULL; /* Packet really early in init? Making flow_cache_init a * pre-smp initcall would solve this. 
--RR */ - if (!flow_table(cpu)) + if (!fcp->hash_table) goto nocache; - if (flow_hash_rnd_recalc(cpu)) - flow_new_hash_rnd(cpu); - hash = flow_hash_code(key, cpu); + if (fcp->hash_rnd_recalc) + flow_new_hash_rnd(fc, fcp); + + hash = flow_hash_code(fc, fcp, key); - head = &flow_table(cpu)[hash]; + head = &fcp->hash_table[hash]; for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && flow_key_compare(key, &fle->key) == 0) { - if (fle->genid == atomic_read(&flow_cache_genid)) { - void *ret = fle->object; - - if (ret) - atomic_inc(fle->object_ref); - local_bh_enable(); - - return ret; - } - break; - } - } - - if (!fle) { - if (flow_count(cpu) > flow_hwm) - flow_cache_shrink(cpu); - - fle = kmem_cache_alloc(flow_cachep, GFP_ATOMIC); - if (fle) { - fle->next = *head; - *head = fle; - fle->family = family; - fle->dir = dir; - memcpy(&fle->key, key, sizeof(*key)); - fle->object = NULL; - flow_count(cpu)++; + return fle; } } -nocache: - { - int err; - void *obj; - atomic_t *obj_ref; - - err = resolver(net, key, family, dir, &obj, &obj_ref); + if (fcp->hash_count > fc->high_watermark) + flow_cache_shrink(fc, fcp); - if (fle && !err) { - fle->genid = atomic_read(&flow_cache_genid); + fle = kmem_cache_zalloc(fc->flow_cachep, GFP_ATOMIC); + if (!fle) + goto nocache; - if (fle->object) - atomic_dec(fle->object_ref); + fle->next = *head; + *head = fle; + fle->family = family; + fle->dir = dir; + memcpy(&fle->key, key, sizeof(*key)); + fcp->hash_count++; + return fle; - fle->object = obj; - fle->object_ref = obj_ref; - if (obj) - atomic_inc(fle->object_ref); - } - local_bh_enable(); +nocache: + local_bh_enable(); + return NULL; +} - if (err) - obj = ERR_PTR(err); - return obj; - } +void flow_cache_entry_put(struct flow_cache_entry *fce) +{ + local_bh_enable(); } static void flow_cache_flush_tasklet(unsigned long data) { - struct flow_flush_info *info = (void *)data; + struct flow_flush_info *info = (void *) data; + struct flow_cache *fc = (void *) 
info->cache; + struct flow_cache_percpu *fcp; int i; - int cpu; - cpu = smp_processor_id(); - for (i = 0; i < flow_hash_size; i++) { - struct flow_cache_entry *fle; + if (info->flush == NULL) + goto done; - fle = flow_table(cpu)[i]; - for (; fle; fle = fle->next) { - unsigned genid = atomic_read(&flow_cache_genid); - - if (!fle->object || fle->genid == genid) - continue; + fcp = per_cpu_ptr(fc->percpu, smp_processor_id()); + for (i = 0; i < flow_cache_hash_size(fc); i++) { + struct flow_cache_entry *fle; - fle->object = NULL; - atomic_dec(fle->object_ref); - } + fle = fcp->hash_table[i]; + for (; fle; fle = fle->next) + info->flush(fc, fle); } +done: if (atomic_dec_and_test(&info->cpuleft)) complete(&info->completion); } -static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__)); static void flow_cache_flush_per_cpu(void *data) { struct flow_flush_info *info = data; - int cpu; struct tasklet_struct *tasklet; + int cpu; cpu = smp_processor_id(); - - tasklet = flow_flush_tasklet(cpu); - tasklet->data = (unsigned long)info; + tasklet = &per_cpu_ptr(info->cache->percpu, cpu)->flush_tasklet; + tasklet->data = (unsigned long) data; tasklet_schedule(tasklet); } -void flow_cache_flush(void) +void flow_cache_flush(struct flow_cache *fc, + void (*flush)(struct flow_cache *fc, struct flow_cache_entry *fce)) { struct flow_flush_info info; static DEFINE_MUTEX(flow_flush_sem); @@ -294,6 +237,8 @@ void flow_cache_flush(void) /* Don't want cpus going down or up during this. 
*/ get_online_cpus(); mutex_lock(&flow_flush_sem); + info.cache = fc; + info.flush = flush; atomic_set(&info.cpuleft, num_online_cpus()); init_completion(&info.completion); @@ -307,62 +252,99 @@ void flow_cache_flush(void) put_online_cpus(); } -static void __init flow_cache_cpu_prepare(int cpu) +static void __init flow_cache_cpu_prepare(struct flow_cache *fc, + struct flow_cache_percpu *fcp) +{ + fcp->hash_table = (struct flow_cache_entry **) + __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); + fcp->hash_rnd_recalc = 1; + fcp->hash_count = 0; + + tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); +} + +static int __cpuinit flow_cache_cpu(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); + int cpu = (unsigned long) hcpu; + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + flow_cache_cpu_prepare(fc, fcp); + if (!fcp->hash_table) + return NOTIFY_BAD; + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + if (fcp->hash_table) { + __flow_cache_shrink(fc, fcp, 0); + free_pages((unsigned long) fcp->hash_table, fc->order); + fcp->hash_table = NULL; + } + break; + } + return NOTIFY_OK; +} + +int flow_cache_init(struct flow_cache *fc, size_t entry_size) { - struct tasklet_struct *tasklet; unsigned long order; + int i, r; + + BUG_ON(entry_size < sizeof(struct flow_cache_entry)); + fc->flow_cachep = kmem_cache_create("flow_cache", + entry_size, + 0, SLAB_PANIC, + NULL); + fc->hash_shift = 10; + fc->low_watermark = 2 * flow_cache_hash_size(fc); + fc->high_watermark = 4 * flow_cache_hash_size(fc); + fc->percpu = alloc_percpu(struct flow_cache_percpu); for (order = 0; (PAGE_SIZE << order) < - (sizeof(struct flow_cache_entry *)*flow_hash_size); + (sizeof(struct flow_cache_entry *) * flow_cache_hash_size(fc)); order++) /* NOTHING 
*/; + fc->order = order; - flow_table(cpu) = (struct flow_cache_entry **) - __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); - if (!flow_table(cpu)) - panic("NET: failed to allocate flow cache order %lu\n", order); + setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, (unsigned long) fc); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); - flow_hash_rnd_recalc(cpu) = 1; - flow_count(cpu) = 0; + for_each_online_cpu(i) { + r = flow_cache_cpu(&fc->hotcpu_notifier, + CPU_UP_PREPARE, (void*) i); + if (r != NOTIFY_OK) + panic("NET: failed to allocate flow cache order %lu\n", order); + } - tasklet = flow_flush_tasklet(cpu); - tasklet_init(tasklet, flow_cache_flush_tasklet, 0); -} + fc->hotcpu_notifier = (struct notifier_block){ + .notifier_call = flow_cache_cpu, + }; + register_hotcpu_notifier(&fc->hotcpu_notifier); -static int flow_cache_cpu(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - __flow_cache_shrink((unsigned long)hcpu, 0); - return NOTIFY_OK; + return 0; } -static int __init flow_cache_init(void) +void flow_cache_fini(struct flow_cache *fc) { int i; - flow_cachep = kmem_cache_create("flow_cache", - sizeof(struct flow_cache_entry), - 0, SLAB_PANIC, - NULL); - flow_hash_shift = 10; - flow_lwm = 2 * flow_hash_size; - flow_hwm = 4 * flow_hash_size; - - setup_timer(&flow_hash_rnd_timer, flow_cache_new_hashrnd, 0); - flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&flow_hash_rnd_timer); + del_timer(&fc->rnd_timer); + unregister_hotcpu_notifier(&fc->hotcpu_notifier); for_each_possible_cpu(i) - flow_cache_cpu_prepare(i); + flow_cache_cpu(&fc->hotcpu_notifier, CPU_DEAD, (void*) i); - hotcpu_notifier(flow_cache_cpu, 0); - return 0; + free_percpu(fc->percpu); + kmem_cache_destroy(fc->flow_cachep); } -module_init(flow_cache_init); - -EXPORT_SYMBOL(flow_cache_genid); EXPORT_SYMBOL(flow_cache_lookup); diff --git 
a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 3516e6f..588ba76 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -151,8 +151,9 @@ void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_XFRM { + struct net *net = sock_net(sk); struct rt6_info *rt = (struct rt6_info *)dst; - rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid); + rt->rt6i_flow_cache_genid = atomic_read(&net->xfrm.policy_genid); } #endif } @@ -166,8 +167,9 @@ struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) #ifdef CONFIG_XFRM if (dst) { + struct net *net = sock_net(sk); struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) { + if (rt->rt6i_flow_cache_genid != atomic_read(&net->xfrm.policy_genid)) { __sk_dst_reset(sk); dst = NULL; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 843e066..228b813 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -44,7 +44,6 @@ static struct xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO]; static struct kmem_cache *xfrm_dst_cache __read_mostly; -static HLIST_HEAD(xfrm_policy_gc_list); static DEFINE_SPINLOCK(xfrm_policy_gc_lock); static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); @@ -53,6 +52,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); +static int stale_bundle(struct dst_entry *dst); static inline int __xfrm4_selector_match(struct xfrm_selector *sel, struct flowi *fl) @@ -216,6 +216,35 @@ expired: xfrm_pol_put(xp); } +struct xfrm_flow_cache_entry { + struct flow_cache_entry fce; + struct xfrm_policy *policy; + struct xfrm_dst *dst; + u32 policy_genid, bundles_genid; +}; +#define XFRM_CACHE_NO_POLICY ((struct xfrm_policy *) -1) + +void xfrm_flow_cache_entry_validate(struct flow_cache *fc, + struct flow_cache_entry *fce) +{ + struct net 
*net = container_of(fc, struct net, xfrm.flow_cache); + struct xfrm_flow_cache_entry *xfc = + container_of(fce, struct xfrm_flow_cache_entry, fce); + + if (xfc->policy_genid != atomic_read(&net->xfrm.policy_genid)) + goto invalid; + if (xfc->policy == NULL || xfc->policy == XFRM_CACHE_NO_POLICY) + return; + if (xfc->policy->walk.dead) + goto invalid; + if (xfc->bundles_genid != atomic_read(&xfc->policy->bundles_genid)) + goto invalid_dst; + return; +invalid: + xfc->policy = NULL; +invalid_dst: + xfc->dst = NULL; +} /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. @@ -269,27 +298,26 @@ static void xfrm_policy_gc_kill(struct xfrm_policy *policy) if (del_timer(&policy->timer)) atomic_dec(&policy->refcnt); - if (atomic_read(&policy->refcnt) > 1) - flow_cache_flush(); - xfrm_pol_put(policy); } static void xfrm_policy_gc_task(struct work_struct *work) { + struct net *net = container_of(work, struct net, xfrm.policy_gc_work); struct xfrm_policy *policy; struct hlist_node *entry, *tmp; struct hlist_head gc_list; spin_lock_bh(&xfrm_policy_gc_lock); - gc_list.first = xfrm_policy_gc_list.first; - INIT_HLIST_HEAD(&xfrm_policy_gc_list); + gc_list.first = net->xfrm.policy_gc_list.first; + INIT_HLIST_HEAD(&net->xfrm.policy_gc_list); spin_unlock_bh(&xfrm_policy_gc_lock); + flow_cache_flush(&net->xfrm.flow_cache, xfrm_flow_cache_entry_validate); + hlist_for_each_entry_safe(policy, entry, tmp, &gc_list, bydst) xfrm_policy_gc_kill(policy); } -static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task); /* Rule must be locked. Release descentant resources, announce * entry dead. The rule must be unlinked from lists to the moment. 
@@ -297,6 +325,7 @@ static DECLARE_WORK(xfrm_policy_gc_work, xfrm_policy_gc_task); static void xfrm_policy_kill(struct xfrm_policy *policy) { + struct net *net = xp_net(policy); int dead; write_lock_bh(&policy->lock); @@ -310,10 +339,10 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) } spin_lock_bh(&xfrm_policy_gc_lock); - hlist_add_head(&policy->bydst, &xfrm_policy_gc_list); + hlist_add_head(&policy->bydst, &net->xfrm.policy_gc_list); spin_unlock_bh(&xfrm_policy_gc_lock); - schedule_work(&xfrm_policy_gc_work); + schedule_work(&net->xfrm.policy_gc_work); } static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024; @@ -588,7 +617,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); net->xfrm.policy_count[dir]++; - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.policy_genid); if (delpol) __xfrm_policy_unlink(delpol, dir); policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir); @@ -621,11 +650,13 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) gc_list = dst; policy->bundles = NULL; + atomic_inc(&policy->bundles_genid); } write_unlock(&policy->lock); } read_unlock_bh(&xfrm_policy_lock); + flow_cache_flush(&net->xfrm.flow_cache, NULL); while (gc_list) { struct dst_entry *dst = gc_list; @@ -672,7 +703,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type, write_unlock_bh(&xfrm_policy_lock); if (ret && delete) { - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.policy_genid); xfrm_policy_kill(ret); } return ret; @@ -714,7 +745,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type, write_unlock_bh(&xfrm_policy_lock); if (ret && delete) { - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.policy_genid); xfrm_policy_kill(ret); } return ret; @@ -835,7 +866,7 @@ int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info) } if (!cnt) err = 
-ESRCH; - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.policy_genid); out: write_unlock_bh(&xfrm_policy_lock); return err; @@ -989,32 +1020,18 @@ fail: return ret; } -static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family, - u8 dir, void **objp, atomic_t **obj_refp) +static struct xfrm_policy *xfrm_policy_lookup( + struct net *net, struct flowi *fl, + u16 family, u8 dir) { +#ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; - int err = 0; -#ifdef CONFIG_XFRM_SUB_POLICY pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir); - if (IS_ERR(pol)) { - err = PTR_ERR(pol); - pol = NULL; - } - if (pol || err) - goto end; + if (pol != NULL) + return pol; #endif - pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); - if (IS_ERR(pol)) { - err = PTR_ERR(pol); - pol = NULL; - } -#ifdef CONFIG_XFRM_SUB_POLICY -end: -#endif - if ((*objp = (void *) pol) != NULL) - *obj_refp = &pol->refcnt; - return err; + return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir); } static inline int policy_to_flow_dir(int dir) @@ -1100,12 +1117,14 @@ static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { + struct net *net = xp_net(pol); + write_lock_bh(&xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); write_unlock_bh(&xfrm_policy_lock); if (pol) { if (dir < XFRM_POLICY_MAX) - atomic_inc(&flow_cache_genid); + atomic_inc(&net->xfrm.policy_genid); xfrm_policy_kill(pol); return 0; } @@ -1545,13 +1564,34 @@ xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl) #endif } -static int stale_bundle(struct dst_entry *dst); - /* Main function: finds/creates a bundle for given flow. * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. 
*/ + +static void xfrm_flow_cache_update(struct net *net, struct flowi *key, + u16 family, u8 dir, + struct xfrm_policy *pol, + struct xfrm_dst *dst) +{ + struct flow_cache_entry *fce; + struct xfrm_flow_cache_entry *xf; + + fce = flow_cache_lookup(&net->xfrm.flow_cache, + key, family, dir); + if (fce == NULL) + return; + + xf = container_of(fce, struct xfrm_flow_cache_entry, fce); + xf->policy_genid = atomic_read(&net->xfrm.policy_genid); + xf->policy = pol; + if (dst != NULL) + xf->bundles_genid = atomic_read(&pol->bundles_genid); + xf->dst = dst; + flow_cache_entry_put(fce); +} + int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, struct sock *sk, int flags) { @@ -1570,8 +1610,10 @@ int __xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); restart: - genid = atomic_read(&flow_cache_genid); + family = dst_orig->ops->family; + genid = atomic_read(&net->xfrm.policy_genid); policy = NULL; + dst = NULL; for (pi = 0; pi < ARRAY_SIZE(pols); pi++) pols[pi] = NULL; npols = 0; @@ -1588,24 +1630,51 @@ restart: } if (!policy) { + struct flow_cache_entry *fce; + struct xfrm_flow_cache_entry *xf; + /* To accelerate a bit... 
*/ if ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT]) goto nopol; - policy = flow_cache_lookup(net, fl, dst_orig->ops->family, - dir, xfrm_policy_lookup); - err = PTR_ERR(policy); - if (IS_ERR(policy)) { - XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); - goto dropdst; + fce = flow_cache_lookup(&net->xfrm.flow_cache, + fl, family, dir); + if (fce == NULL) + goto no_cache; + + xf = container_of(fce, struct xfrm_flow_cache_entry, fce); + xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce); + if (xf->policy != NULL) { + policy = xf->policy; + if (policy != XFRM_CACHE_NO_POLICY) + xfrm_pol_hold(policy); + if (xf->dst != NULL) + dst = dst_clone((struct dst_entry *) xf->dst); + } + flow_cache_entry_put(fce); + if (policy == XFRM_CACHE_NO_POLICY) + goto nopol; + if (dst && !xfrm_bundle_ok(policy, (struct xfrm_dst *) dst, fl, family, 0)) { + dst_release(dst); + dst = NULL; } } +no_cache: + if (!policy) { + policy = xfrm_policy_lookup(net, fl, family, dir); + if (!policy) { + xfrm_flow_cache_update( + net, fl, family, dir, + XFRM_CACHE_NO_POLICY, NULL); + goto nopol; + } + } + if (IS_ERR(policy)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); + goto dropdst; + } - if (!policy) - goto nopol; - - family = dst_orig->ops->family; pols[0] = policy; npols ++; xfrm_nr += pols[0]->xfrm_nr; @@ -1616,6 +1685,9 @@ restart: policy->curlft.use_time = get_seconds(); + if (dst) + goto dst_found; + switch (policy->action) { default: case XFRM_POLICY_BLOCK: @@ -1626,18 +1698,11 @@ restart: case XFRM_POLICY_ALLOW: #ifndef CONFIG_XFRM_SUB_POLICY - if (policy->xfrm_nr == 0) { - /* Flow passes not transformed. */ - xfrm_pol_put(policy); - return 0; - } + if (policy->xfrm_nr == 0) + goto no_transform; #endif - /* Try to find matching bundle. - * - * LATER: help from flow cache. It is optional, this - * is required only for output policy. - */ + /* Try to find matching bundle the hard way. 
*/ dst = xfrm_find_bundle(fl, policy, family); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLECHECKERROR); @@ -1677,12 +1742,8 @@ restart: * they are searched. See above not-transformed bypass * is surrounded by non-sub policy configuration, too. */ - if (xfrm_nr == 0) { - /* Flow passes not transformed. */ - xfrm_pols_put(pols, npols); - return 0; - } - + if (xfrm_nr == 0) + goto no_transform; #endif nx = xfrm_tmpl_resolve(pols, npols, fl, xfrm, family); @@ -1713,7 +1774,7 @@ restart: goto error; } if (nx == -EAGAIN || - genid != atomic_read(&flow_cache_genid)) { + genid != atomic_read(&net->xfrm.policy_genid)) { xfrm_pols_put(pols, npols); goto restart; } @@ -1724,11 +1785,8 @@ restart: goto error; } } - if (nx == 0) { - /* Flow passes not transformed. */ - xfrm_pols_put(pols, npols); - return 0; - } + if (nx == 0) + goto no_transform; dst = xfrm_bundle_create(policy, xfrm, nx, fl, dst_orig); err = PTR_ERR(dst); @@ -1777,6 +1835,9 @@ restart: dst_hold(dst); write_unlock_bh(&policy->lock); } + xfrm_flow_cache_update(net, fl, family, dir, + policy, (struct xfrm_dst *) dst); +dst_found: *dst_p = dst; dst_release(dst_orig); xfrm_pols_put(pols, npols); @@ -1794,7 +1855,12 @@ nopol: if (flags & XFRM_LOOKUP_ICMP) goto dropdst; return 0; +no_transform: + /* Flow passes not transformed. 
*/ + xfrm_pols_put(pols, npols); + return 0; } + EXPORT_SYMBOL(__xfrm_lookup); int xfrm_lookup(struct net *net, struct dst_entry **dst_p, struct flowi *fl, @@ -1952,10 +2018,35 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } } - if (!pol) - pol = flow_cache_lookup(net, &fl, family, fl_dir, - xfrm_policy_lookup); - + if (!pol) { + struct flow_cache_entry *fce; + struct xfrm_flow_cache_entry *xf; + + fce = flow_cache_lookup(&net->xfrm.flow_cache, + &fl, family, dir); + if (fce != NULL) { + xf = container_of(fce, struct xfrm_flow_cache_entry, fce); + xfrm_flow_cache_entry_validate(&net->xfrm.flow_cache, fce); + if (xf->policy != NULL) { + pol = xf->policy; + if (pol != XFRM_CACHE_NO_POLICY) + xfrm_pol_hold(pol); + else + pol = NULL; + } else { + pol = xfrm_policy_lookup(net, &fl, family, dir); + if (!IS_ERR(pol)) { + if (pol) + xf->policy = pol; + else + xf->policy = XFRM_CACHE_NO_POLICY; + } + xf->dst = NULL; + xf->policy_genid = atomic_read(&net->xfrm.policy_genid); + } + flow_cache_entry_put(fce); + } + } if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; @@ -2153,6 +2244,7 @@ static void prune_one_bundle(struct xfrm_policy *pol, int (*func)(struct dst_ent dstp = &dst->next; } } + atomic_inc(&pol->bundles_genid); write_unlock(&pol->lock); } @@ -2180,6 +2272,7 @@ static void xfrm_prune_bundles(struct net *net, int (*func)(struct dst_entry *)) } read_unlock_bh(&xfrm_policy_lock); + flow_cache_flush(&net->xfrm.flow_cache, NULL); while (gc_list) { struct dst_entry *dst = gc_list; gc_list = dst->next; @@ -2498,6 +2591,9 @@ static int __net_init xfrm_policy_init(struct net *net) INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); + INIT_HLIST_HEAD(&net->xfrm.policy_gc_list); + INIT_WORK(&net->xfrm.policy_gc_work, xfrm_policy_gc_task); + flow_cache_init(&net->xfrm.flow_cache, sizeof(struct xfrm_flow_cache_entry)); if (net_eq(net, &init_net)) 
register_netdevice_notifier(&xfrm_dev_notifier); return 0; @@ -2531,7 +2627,7 @@ static void xfrm_policy_fini(struct net *net) audit_info.sessionid = -1; audit_info.secid = 0; xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, &audit_info); - flush_work(&xfrm_policy_gc_work); + flush_work(&net->xfrm.policy_gc_work); WARN_ON(!list_empty(&net->xfrm.policy_all)); @@ -2549,6 +2645,8 @@ static void xfrm_policy_fini(struct net *net) sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); + + flow_cache_fini(&net->xfrm.flow_cache); } static int __net_init xfrm_net_init(struct net *net) @@ -2756,8 +2854,9 @@ static int migrate_tmpl_match(struct xfrm_migrate *m, struct xfrm_tmpl *t) static int xfrm_policy_migrate(struct xfrm_policy *pol, struct xfrm_migrate *m, int num_migrate) { + struct net *net = xp_net(pol); struct xfrm_migrate *mp; - struct dst_entry *dst; + struct dst_entry *gc_list = NULL, *tail; int i, j, n = 0; write_lock_bh(&pol->lock); @@ -2782,15 +2881,25 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, sizeof(pol->xfrm_vec[i].saddr)); pol->xfrm_vec[i].encap_family = mp->new_family; /* flush bundles */ - while ((dst = pol->bundles) != NULL) { - pol->bundles = dst->next; - dst_free(dst); - } + tail = pol->bundles; + while (tail->next) + tail = tail->next; + tail->next = gc_list; + gc_list = pol->bundles; + pol->bundles = NULL; + atomic_inc(&pol->bundles_genid); } } - write_unlock_bh(&pol->lock); + flow_cache_flush(&net->xfrm.flow_cache, NULL); + while (gc_list) { + struct dst_entry *dst = gc_list; + + gc_list = dst->next; + dst_free(dst); + } + if (!n) return -ENODATA;
Instead of doing an O(n) xfrm_find_bundle() call per-packet, cache the
previous lookup results in the flow cache. The flow cache is updated to
be per-netns and more generic.

The flow cache no longer holds a reference (which was not really used
in the first place, as it depended on garbage collection); this is now
explicit. Cache validity is maintained as follows:

- On policy insert, the whole cache is invalidated by incrementing the
  generation id. No synchronization is required, as the genid checks
  make sure no stale objects are dereferenced.
- On policy removal from the lists, the object is marked dead, which
  invalidates the cached policy pointer.
- Policy object deletion requires explicit synchronization to remove
  stale pointers before the policy objects can actually be freed.
  xfrm_policy_gc_task() synchronizes the cache.
- Bundle creation and expiry are reflected in the xfrm_bundle_ok()
  check performed before any bundle from the cache is used.
- Bundle deletion is done by incrementing policy->bundles_genid and
  synchronizing with the other cpus so that no stale bundle pointers
  are left. After this the bundle objects can be safely deleted.

Basic testing done on a 2.6.32 based kernel. This gives a speedup of
several orders of magnitude on the transmit path.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/net/flow.h               |   39 ++++-
 include/net/netns/xfrm.h         |    4 +
 include/net/xfrm.h               |    1 +
 net/core/flow.c                  |  342 ++++++++++++++++++--------------------
 net/ipv6/inet6_connection_sock.c |    6 +-
 net/xfrm/xfrm_policy.c           |  271 +++++++++++++++++++++---------
 6 files changed, 394 insertions(+), 269 deletions(-)