
[1/4] flow: virtualize flow cache entry methods

Message ID 1270126340-30181-2-git-send-email-timo.teras@iki.fi
State Changes Requested, archived
Delegated to: David Miller

Commit Message

Timo Teras April 1, 2010, 12:52 p.m. UTC
This allows the cached object to be validated before it is returned,
and allows the object to be destroyed properly if the last reference
was held in the flow cache. This is also a preparation for caching
bundles in the flow cache.

In return for virtualizing the methods, we save on:
- not having to regenerate the whole flow cache on policy removal:
  each flow matching a killed policy gets refreshed lazily, as the
  getter function notices the stale entry.
- not having to call flow_cache_flush from the policy gc, since the
  flow cache now properly deletes the object, even when it holds the
  last reference.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
---
 include/net/flow.h     |   18 ++++++--
 include/net/xfrm.h     |    2 +
 net/core/flow.c        |  118 ++++++++++++++++++++++++-----------------------
 net/xfrm/xfrm_policy.c |  113 ++++++++++++++++++++++++++++++---------------
 4 files changed, 151 insertions(+), 100 deletions(-)

Comments

Eric Dumazet April 1, 2010, 1:05 p.m. UTC | #1
On Thursday, April 1, 2010 at 15:52 +0300, Timo Teras wrote:
> This allows the cached object to be validated before it is returned,
> and allows the object to be destroyed properly if the last reference
> was held in the flow cache. This is also a preparation for caching
> bundles in the flow cache.
> 
> In return for virtualizing the methods, we save on:
> - not having to regenerate the whole flow cache on policy removal:
>   each flow matching a killed policy gets refreshed lazily, as the
>   getter function notices the stale entry.
> - not having to call flow_cache_flush from the policy gc, since the
>   flow cache now properly deletes the object, even when it holds the
>   last reference.
> 
> Signed-off-by: Timo Teras <timo.teras@iki.fi>
> ---

...

> @@ -481,6 +482,7 @@ struct xfrm_policy {
>  	atomic_t		refcnt;
>  	struct timer_list	timer;
>  
> +	struct flow_cache_entry_ops *fc_ops;
>  	u32			priority;
>  	u32			index;
>  	struct xfrm_mark	mark;
...

> +static struct flow_cache_entry_ops xfrm_policy_fc_ops __read_mostly = {
> +	.get = xfrm_policy_get_fce,
> +	.check = xfrm_policy_check_fce,
> +	.delete = xfrm_policy_delete_fce,
> +};
>  

Any particular reason these flow_cache_entry_ops are not const?


Timo Teras April 1, 2010, 1:07 p.m. UTC | #2
Eric Dumazet wrote:
> On Thursday, April 1, 2010 at 15:52 +0300, Timo Teras wrote:
>> @@ -481,6 +482,7 @@ struct xfrm_policy {
>>  	atomic_t		refcnt;
>>  	struct timer_list	timer;
>>  
>> +	struct flow_cache_entry_ops *fc_ops;
>>  	u32			priority;
>>  	u32			index;
>>  	struct xfrm_mark	mark;
> ...
> 
>> +static struct flow_cache_entry_ops xfrm_policy_fc_ops __read_mostly = {
>> +	.get = xfrm_policy_get_fce,
>> +	.check = xfrm_policy_check_fce,
>> +	.delete = xfrm_policy_delete_fce,
>> +};
>>  
> 
> Any particular reason these flow_cache_entry_ops are not const?

Umm... No. I'll constify them. Thanks.
Herbert Xu April 3, 2010, 3:38 a.m. UTC | #3
On Thu, Apr 01, 2010 at 03:52:17PM +0300, Timo Teras wrote:
> This allows the cached object to be validated before it is returned,
> and allows the object to be destroyed properly if the last reference
> was held in the flow cache. This is also a preparation for caching
> bundles in the flow cache.
> 
> In return for virtualizing the methods, we save on:
> - not having to regenerate the whole flow cache on policy removal:
>   each flow matching a killed policy gets refreshed lazily, as the
>   getter function notices the stale entry.
> - not having to call flow_cache_flush from the policy gc, since the
>   flow cache now properly deletes the object, even when it holds the
>   last reference.
> 
> Signed-off-by: Timo Teras <timo.teras@iki.fi>

With respect to removing the cache flush upon policy removal,
what takes care of the timely purging of the corresponding cache
entries if no new traffic comes through?

The concern is that if they're not purged in the absence of new
traffic, then we may hold references on all sorts of objects,
leading to consequences such as the inability to unregister net
devices.
  
>  struct flow_cache_entry {
> -	struct flow_cache_entry	*next;
> -	u16			family;
> -	u8			dir;
> -	u32			genid;
> -	struct flowi		key;
> -	void			*object;
> -	atomic_t		*object_ref;
> +	struct flow_cache_entry *	next;

Please follow the existing coding style.

Cheers,
Herbert Xu April 3, 2010, 8:36 a.m. UTC | #4
On Sat, Apr 03, 2010 at 11:38:57AM +0800, Herbert Xu wrote:
> 
> With respect to removing the cache flush upon policy removal,
> what takes care of the timely purging of the corresponding cache
> entries if no new traffic comes through?

In fact this change would seem to render ineffective the existing
bundle pruning mechanism used when devices are unregistered.

xfrm_prune_bundles walks through active policy lists to find
the bundles to purge.  However, if a policy has been deleted
while a bundle referencing it is still in the cache, that bundle
will not be pruned.

Cheers,
Timo Teras April 3, 2010, 1:50 p.m. UTC | #5
Herbert Xu wrote:
> With respect to removing the cache flush upon policy removal,
> what takes care of the timely purging of the corresponding cache
> entries if no new traffic comes through?
> 
> The concern is that if they're not purged in the absence of new
> traffic, then we may hold references on all sorts of objects,
> leading to consequences such as the inability to unregister net
> devices.

The flow cache is randomized every ten minutes. Thus all flow
cache entries get recreated regularly.
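
For reference, the reseeding in net/core/flow.c works roughly like this
(paraphrased, not a verbatim quote): a timer marks every per-cpu cache
for rehash every ten minutes, and the next lookup on each cpu reseeds
the hash and shrinks its chains to zero, dropping all entries.

#define FLOW_HASH_RND_PERIOD	(10 * 60 * HZ)

static void flow_cache_new_hashrnd(unsigned long arg)
{
	struct flow_cache *fc = (void *) arg;
	int i;

	/* Mark each per-cpu cache; the next lookup on that cpu rehashes. */
	for_each_possible_cpu(i)
		per_cpu_ptr(fc->percpu, i)->hash_rnd_recalc = 1;

	fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD;
	add_timer(&fc->rnd_timer);
}

static void flow_new_hash_rnd(struct flow_cache *fc,
			      struct flow_cache_percpu *fcp)
{
	get_random_bytes(&fcp->hash_rnd, sizeof(u32));
	fcp->hash_rnd_recalc = 0;
	/* Shrinking to zero entries drops the whole per-cpu cache. */
	__flow_cache_shrink(fc, fcp, 0);
}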

> On Sat, Apr 03, 2010 at 11:38:57AM +0800, Herbert Xu wrote:
>> With respect to removing the cache flush upon policy removal,
>> what takes care of the timely purging of the corresponding cache
>> entries if no new traffic comes through?
> 
> In fact this change would seem to render the existing bundle
> pruning mechanism when devices are unregistered ineffective.
> 
> xfrm_prune_bundles walks through active policy lists to find
> the bundles to purge.  However, if a policy has been deleted
> while a bundle referencing it is still in the cache, that bundle
> will not be pruned.

When a policy is killed, policy->genid is incremented, which makes
the xfrm_bundle_ok check fail and the bundle get pruned immediately
on the next flush.
Herbert Xu April 3, 2010, 2:17 p.m. UTC | #6
On Sat, Apr 03, 2010 at 04:50:08PM +0300, Timo Teräs wrote:
>
> The flow cache is randomized every ten minutes. Thus all flow
> cache entries get recreated regularly.

Having rmmod <netdrv> block for up to ten minutes is hardly
ideal.

Besides, in future we may want to get rid of the regular reseeding
just like we did for IPv4 routes.

Cheers,
Timo Teras April 3, 2010, 2:26 p.m. UTC | #7
Herbert Xu wrote:
> On Sat, Apr 03, 2010 at 04:50:08PM +0300, Timo Teräs wrote:
>> The flow cache is randomized every ten minutes. Thus all flow
>> cache entries get recreated regularly.
> 
> Having rmmod <netdrv> block for up to ten minutes is hardly
> ideal.

Why would this block? The device down hook calls a flow cache
flush. On flush, all bundles referencing non-up devices get pruned
immediately (via the stale_bundle check).

> Besides, in future we may want to get rid of the regular reseeding
> just like we did for IPv4 routes.

Right. It certainly sounds good. And needs a separate change
then.

If this is done, we will still need some sort of periodic gc for
the flow cache, just like ipv4 routes have. The gc would scan just
a few of the flow cache hash chains on each tick (a rough sketch
below).
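
A rough sketch of what such a gc tick could look like, building on the
flow_entry_valid()/flow_entry_kill() helpers added by this patch (the
gc_pos and gc_timer fields are hypothetical, and per-cpu scheduling of
the tick is hand-waved here):

static void flow_cache_gc_tick(unsigned long arg)
{
	struct flow_cache *fc = (void *) arg;
	struct flow_cache_percpu *fcp;
	struct flow_cache_entry *fle, **flp;
	int i;

	local_bh_disable();
	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
	/* Scan only a few hash chains per tick, like the ipv4 route gc. */
	for (i = 0; i < 16; i++) {
		flp = &fcp->hash_table[fcp->gc_pos];
		while ((fle = *flp) != NULL) {
			if (flow_entry_valid(fle)) {
				flp = &fle->next;
			} else {
				*flp = fle->next;
				flow_entry_kill(fc, fcp, fle);
			}
		}
		fcp->gc_pos = (fcp->gc_pos + 1) &
			      (flow_cache_hash_size(fc) - 1);
	}
	local_bh_enable();

	mod_timer(&fc->gc_timer, jiffies + HZ);
}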
Herbert Xu April 3, 2010, 3:53 p.m. UTC | #8
On Sat, Apr 03, 2010 at 05:26:00PM +0300, Timo Teräs wrote:
>
> Why would this block? The device down hook calls flow cache
> flush. On flush all bundles with non-up devices get pruned
> immediately (via stale_bundle check).

Perhaps I missed something in your patch, but the flush that
we currently perform is limited to the bundles from hashed policies.
So if a policy has just recently been removed, then its bundles
won't be flushed.

Cheers,
Timo Teras April 3, 2010, 8:19 p.m. UTC | #9
Herbert Xu wrote:
> On Sat, Apr 03, 2010 at 05:26:00PM +0300, Timo Teräs wrote:
>> Why would this block? The device down hook calls flow cache
>> flush. On flush all bundles with non-up devices get pruned
>> immediately (via stale_bundle check).
> 
> Perhaps I missed something in your patch, but the flush that
> we currently perform is limited to the bundles from hashed policies.
> So if a policy has just recently been removed, then its bundles
> won't be flushed.

If a policy is removed, policy->genid is incremented, invalidating
its bundles. Those bundles get freed when:
 - the specific flow gets hit again
 - the cache is flushed due to a GC call or an interface going down
 - the flow cache is randomized

If someone is then removing a net driver, we still execute
flush on the 'device down' hook, and all stale bundles
get flushed.

But yes, this means that the xfrm_policy struct can now be kept
allocated for up to ten extra minutes. But it is only memory being
held, not any extra refs, and it is still reclaimable by the GC.

If this feels troublesome, we could add an asynchronous flush
request to be called on policy removal (a rough sketch below), or
even stick to the synchronous one.
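
For example, the asynchronous variant could be as simple as deferring
the existing flow_cache_flush() to a workqueue (hypothetical sketch,
not part of this series):

static void flow_cache_flush_task(struct work_struct *work)
{
	flow_cache_flush();
}

static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);

/* Callable from policy removal without blocking the caller. */
static void flow_cache_flush_deferred(void)
{
	schedule_work(&flow_cache_flush_work);
}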
Herbert Xu April 4, 2010, 2:06 a.m. UTC | #10
On Sat, Apr 03, 2010 at 11:19:04PM +0300, Timo Teräs wrote:
>
> If someone is then removing a net driver, we still execute
> flush on the 'device down' hook, and all stale bundles
> get flushed.

Not if the bundle belongs to a policy recently deleted.

> But yes, this means that xfrm_policy struct can now be held
> allocated up to ten extra minutes. But it's only memory that
> it's holding, not any extra refs. And it's still reclaimable
> by the GC.

You also keep the bundle xdst's alive along with it, and they can
hold netdev references, preventing modules from being unloaded.

> If this feels troublesome, we could add asynchronous flush
> request that would be called on policy removal. Or even stick
> to the synchronous one.

How about changing xfrm_flush_bundles to flush bundles from the
cache instead of from xfrm_policy?

Cheers,
Timo Teras April 4, 2010, 5:50 a.m. UTC | #11
Herbert Xu wrote:
> On Sat, Apr 03, 2010 at 11:19:04PM +0300, Timo Teräs wrote:
>> If someone is then removing a net driver, we still execute
>> flush on the 'device down' hook, and all stale bundles
>> get flushed.
> 
> Not if the bundle belongs to a policy recently deleted.
> 
>> But yes, this means that xfrm_policy struct can now be held
>> allocated up to ten extra minutes. But it's only memory that
>> it's holding, not any extra refs. And it's still reclaimable
>> by the GC.
> 
> You also hold down the bundle xdst's along with it, which can
> hold netdev references preventing modules from being unloaded.
> 
>> If this feels troublesome, we could add asynchronous flush
>> request that would be called on policy removal. Or even stick
>> to the synchronous one.
> 
> How about change xfrm_flush_bundles to flush bundles from the
> cache instead of xfrm_policy?

For the common case:

1. Policy deleted; policy->walk.dead set, policy->genid incremented
2. NETDEV_DOWN hook called, calls flow_cache_flush()
3. flow_cache_flush() enumerates all policy and bundle refs
   in its cache
4. for each bundle, xfrm_bundle_check_fce() is called, which
   calls stale_bundle()
5. all bundles using the stale policy fail that check, because
     xdst->policy_genid != xdst->pols[0]->genid
   (checked in xfrm_bundle_ok)
6. the flow cache calls the entry's ->delete, which is dst_free for bundles
7. flow_cache_flush() returns

So flow_cache_flush() really does free the bundles held in the cache
on a flush (a sketch of the bundle ops follows).
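
To make steps 4-6 concrete, the bundle ops added in 2/4 are roughly of
this shape (reconstructed from the steps above, not the actual 2/4
hunks; the fc_ops member of struct xfrm_dst is an assumption):

static int xfrm_bundle_check_fce(struct flow_cache_entry_ops **ops)
{
	struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops);

	/* stale_bundle() ends up in xfrm_bundle_ok(), which fails once
	 * xdst->policy_genid != xdst->pols[0]->genid. */
	return !stale_bundle(&xdst->u.dst);
}

static void xfrm_bundle_delete_fce(struct flow_cache_entry_ops **ops)
{
	struct xfrm_dst *xdst = container_of(ops, struct xfrm_dst, fc_ops);

	dst_free(&xdst->u.dst);
}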

But now that I look at my code again, your statement is true for
per-socket bundles: they would not get deleted in this case.
I'll change NETDEV_DOWN to call garbage collection instead of a
flow cache flush, which will then also free the per-socket bundles.
Herbert Xu April 4, 2010, 5:58 a.m. UTC | #12
On Sun, Apr 04, 2010 at 08:50:41AM +0300, Timo Teräs wrote:
>
> For the common case:
>
> 1. Policy deleted; policy->walk.dead set, policy->genid incremented
> 2. NETDEV_DOWN hook called, calls flow_cache_flush()
> 3. flow_cache_flush enumerates all policy and bundle refs
>   in its cache
> 4. for each bundle xfrm_bundle_check_fce() is called, which
>   calls stale_bundle()
> 5. all bundles using stale policy, fail that check because
>     xdst->policy_genid != xdst->pols[0]->genid
>   (checked in xfrm_bundle_ok)
> 6. flow cache calls entry's ->delete which is dst_free for bundles
> 7. flow_cache_flush() returns

Ah, you're doing it in 2/4.  Can we please have each patch be
a self-contained unit? It should be possible to apply 1/4 and
have a resulting kernel that works properly without having to
apply the rest of your patches.

Thanks,
Timo Teras April 4, 2010, 6:07 a.m. UTC | #13
Herbert Xu wrote:
> On Sun, Apr 04, 2010 at 08:50:41AM +0300, Timo Teräs wrote:
>> For the common case:
>>
>> 1. Policy deleted; policy->walk.dead set, policy->genid incremented
>> 2. NETDEV_DOWN hook called, calls flow_cache_flush()
>> 3. flow_cache_flush enumerates all policy and bundle refs
>>   in its cache
>> 4. for each bundle xfrm_bundle_check_fce() is called, which
>>   calls stale_bundle()
>> 5. all bundles using stale policy, fail that check because
>>     xdst->policy_genid != xdst->pols[0]->genid
>>   (checked in xfrm_bundle_ok)
>> 6. flow cache calls entry's ->delete which is dst_free for bundles
>> 7. flow_cache_flush() returns
> 
> Ah, you're doing it in 2/4.  Can we please have each patch be
> a self-contained unit? It should be possible to apply 1/4 and
> have a resulting kernel that works properly without having to
> apply the rest of your patches.

With 1/4 only, bundle deletion is not touched. In that case the
policy GC explicitly deletes the bundles. The bundles get deleted
immediately, and only the struct xfrm_policy might stay allocated
longer.

The code flow would be:
 1. xfrm_policy_kill() queues the policy to the GC
 2. xfrm_policy_gc_kill() called from xfrm_policy_gc_task()
    frees all bundles in that policy
 3. xfrm_policy_gc_kill() releases its reference
 4. ... time passes (flush, randomization, or flow hit occurs)
 5. flow cache releases its final reference, calls
    xfrm_policy_destroy() which only frees the xfrm_policy memory
    (the refcount helper is sketched below)
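
Step 5 relies on the standard refcount helper, roughly as in
include/net/xfrm.h:

static inline void xfrm_pol_put(struct xfrm_policy *policy)
{
	if (atomic_dec_and_test(&policy->refcnt))
		xfrm_policy_destroy(policy);
}

Since the flow cache's ->delete op is xfrm_policy_delete_fce(), which
ends in xfrm_pol_put(), it is now safe for the cache to be the one
dropping that final reference.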
Herbert Xu April 4, 2010, 6:19 a.m. UTC | #14
On Sun, Apr 04, 2010 at 09:07:12AM +0300, Timo Teräs wrote:
>
> With 1/4 only, the bundle deletion is not touched. In that case
> the policy GC deletes explicitly the bundles. The bundles get
> deleted immediately, and only the struct xfrm_policy might get
> held up allocated longer.
>
> The code flow would be:
> 1. xfrm_policy_kill() queues to GC
> 2. xfrm_policy_gc_kill() called from xfrm_policy_gc_task()
>    frees all bundles in that policy
> 3. xfrm_policy_gc_kill() releases its reference
> 4. ... time passes (flush, randomization, or flow hit occurs)
> 5. flow cache releases its final reference, calls
>    xfrm_policy_destroy() which only frees the xfrm_policy memory

With 1/4 only, you've removed the flow cache flush when policies
are deleted.  However, you don't add the flow cache flush to the
NETDEV_DOWN case until 2/4.  So with only 1/4, bundles (and hence
netdev objects) may be held for up to 10 minutes.

Cheers,
Timo Teras April 4, 2010, 6:28 a.m. UTC | #15
Herbert Xu wrote:
> On Sun, Apr 04, 2010 at 09:07:12AM +0300, Timo Teräs wrote:
>> With 1/4 only, the bundle deletion is not touched. In that case
>> the policy GC deletes explicitly the bundles. The bundles get
>> deleted immediately, and only the struct xfrm_policy might get
>> held up allocated longer.
>>
>> The code flow would be:
>> 1. xfrm_policy_kill() queues to GC
>> 2. xfrm_policy_gc_kill() called from xfrm_policy_gc_task()
>>    frees all bundles in that policy
>>    3. xfrm_policy_gc_kill() releases its reference
>> 4. ... time passes (flush, randomization, or flow hit occurs)
>> 5. flow cache releases its final reference, calls
>>    xfrm_policy_destroy() which only frees the xfrm_policy memory
> 
> With 1/4 only, you've removed the flow cache flush when policies
> are deleted.  However, you don't add the flow cache flush to the
> NETDEV_DOWN case until 2/4.  So with only 1/4, bundles (and hence
> netdev objects) may be held for up to 10 minutes.

No. Removing the flow cache flush does not prevent bundle deletion.
In the current code the flow cache flush happens *after* the bundles
have been deleted from the policy. Freeing bundles and flushing the
cache are two completely separate things in the current code. Only
in 2/4 does bundle deletion become dependent on the flow cache flush.

Please read xfrm_policy_kill() and xfrm_policy_gc_kill() in the
current code, and again after applying 1/4. The diff of 1/4 is not
that informative here, since the bundle deletion code is not shown
(it is not touched).

The only reason the current code does a flow cache flush on policy
removal is to make sure that the flow cache's atomic_dec is not the
one to drop the last reference; if that happened, xfrm_policy_destroy()
would never get called.
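
Condensed from the hunks in the patch below: the old cache can only do
a bare refcount decrement, which never runs a destructor, while the
virtualized delete op ends in xfrm_pol_put() and can therefore drop the
last reference safely.

	/* Old flow_entry_kill(): a bare decrement -- it must never be
	 * the last reference, hence the flush on policy removal. */
	if (fle->object)
		atomic_dec(fle->object_ref);

	/* New flow_entry_kill(): invoke the object's own delete method
	 * (xfrm_policy_delete_fce() -> xfrm_pol_put() for policies), so
	 * the cache may hold and drop the last reference itself. */
	if (fle->ops)
		(*fle->ops)->delete(fle->ops);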
Herbert Xu April 4, 2010, 8:35 a.m. UTC | #16
On Sun, Apr 04, 2010 at 09:28:43AM +0300, Timo Teräs wrote:
>
> No. The flow cache flush removal does not prevent bundle deletion.
> The flow cache flush is in current code *after* deleting the bundles
> from the policy. Freeing bundles and flushing cache are completely
> two separate things in current code. Only in 2/4 the bundle deletion
> becomes dependent on flow cache flush.

Ah yes, I confused myself.  The problem I thought would occur
doesn't, because 1/4 doesn't put the bundles in the flow cache
yet.

Thanks,
Herbert Xu April 4, 2010, 10:42 a.m. UTC | #17
On Thu, Apr 01, 2010 at 03:52:17PM +0300, Timo Teras wrote:
>
> -extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
> -			       u8 dir, flow_resolve_t resolver);
> +struct flow_cache_entry_ops {
> +	struct flow_cache_entry_ops ** (*get)(struct flow_cache_entry_ops **);
> +	int (*check)(struct flow_cache_entry_ops **);
> +	void (*delete)(struct flow_cache_entry_ops **);
> +};
> +
> +typedef struct flow_cache_entry_ops **(*flow_resolve_t)(
> +		struct net *net, struct flowi *key, u16 family,
> +		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx);

OK this bit really bugs me.

When I first looked at it, my reaction was: why on earth are we
returning an ops pointer? Only after some digging around did I see
that this ops pointer is in fact embedded in xfrm_policy.

How about embedding flow_cache_entry in xfrm_policy instead? Returning
flow_cache_entry * would make a lot more sense than a nested pointer
to flow_cache_entry_ops.

Cheers,
Timo Teras April 4, 2010, 10:50 a.m. UTC | #18
Herbert Xu wrote:
> On Thu, Apr 01, 2010 at 03:52:17PM +0300, Timo Teras wrote:
>> -extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
>> -			       u8 dir, flow_resolve_t resolver);
>> +struct flow_cache_entry_ops {
>> +	struct flow_cache_entry_ops ** (*get)(struct flow_cache_entry_ops **);
>> +	int (*check)(struct flow_cache_entry_ops **);
>> +	void (*delete)(struct flow_cache_entry_ops **);
>> +};
>> +
>> +typedef struct flow_cache_entry_ops **(*flow_resolve_t)(
>> +		struct net *net, struct flowi *key, u16 family,
>> +		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx);
> 
> OK this bit really bugs me.
> 
> When I first looked at it, my reaction was why on earth are we
> returning an ops pointer? Only after some digging around do I see
> the fact that this ops pointer is in fact embedded in xfrm_policy.
> 
> How about embedding flow_cache_entry in xfrm_policy instead? Returning
> flow_cache_entry * would make a lot more sense than a nested pointer
> to flow_cache_entry_ops.

Because flow_cache_entry is per-cpu, and multiple entries (due to
different flows matching the same policy, or the same flow having
multiple per-cpu entries) can point to the same policy. If we cached
"dummy" objects even for policies, then this would be the better
approach.

This would actually make sense, since it would be useful to cache
all policies involved in the check path (main + sub policy refs).
In that case we might want to make the ops 'per flow cache instance'
instead of 'per cache entry'.
Herbert Xu April 4, 2010, 11 a.m. UTC | #19
On Sun, Apr 04, 2010 at 01:50:16PM +0300, Timo Teräs wrote:
>
> Because flow_cache_entry is per-cpu, and multiple entries (due to
> different flows matching same policies, or same flow having multiple
> per-cpu entries) can point to same policy. If we cached "dummy" objects
> for even policies, then this would be better approach.

Oh yes of course.

But what we could do is embed most of flow_cache_entry into
xfrm_policy (and xdst in your later patches) along with the
ops pointer.

Like this:

struct flow_cache_object {
	u16			family;
	u8			dir;
	u32			genid;
	struct flowi		key;
	struct flow_cache_ops **ops;
};

struct flow_cache_entry {
	struct flow_cache_entry	*next;
	struct flow_cache_object *obj;
};

struct xfrm_policy {
	struct flow_cache_object flo;
	...
};

What do you think?

Cheers,
Timo Teras April 4, 2010, 11:06 a.m. UTC | #20
Herbert Xu wrote:
> On Sun, Apr 04, 2010 at 01:50:16PM +0300, Timo Teräs wrote:
>> Because flow_cache_entry is per-cpu, and multiple entries (due to
>> different flows matching same policies, or same flow having multiple
>> per-cpu entries) can point to same policy. If we cached "dummy" objects
>> for even policies, then this would be better approach.
> 
> Oh yes of course.
> 
> But what we could do is embed most of flow_cache_entry into
> xfrm_policy (and xdst in your latter patches) along with the
> ops pointer.
> 
> Like this:
> 
> struct flow_cache_object {
> 	u16			family;
> 	u8			dir;
> 	u32			genid;
> 	struct flowi		key;
> 	struct flow_cache_ops **ops;
> };
> 
> struct flow_cache_entry {
> 	struct flow_cache_entry	*next;
> 	struct flow_cache_object *obj;
> };
> 
> struct xfrm_policy {
> 	struct flow_cache_object flo;
> 	...
> };
> 
> What do you think?

It would still not work for policies. For every policy X we
can get N+1 different matches with separate struct flowi contents.
It is not possible to put a single struct flowi, or any other of
the flow details, into xfrm_policy. It is an N-to-1 mapping, not
a 1-to-1 mapping.

Herbert Xu April 4, 2010, 11:26 a.m. UTC | #21
On Sun, Apr 04, 2010 at 02:06:55PM +0300, Timo Teräs wrote:
>
> It would still not work for policies. For every policy X we
> can get N+1 different matches with separate struct flowi contents.
> It's not possible to put single struct flowi or any other of
> the flow details in to xfrm_policy. It's a N-to-1 mapping. Not
> a 1-to-1 mapping.

Fine, move key into flow_cache_entry but the rest should still
work, no?

struct flow_cache_object {
	u16			family;
	u8			dir;
	u32			genid;
	struct flow_cache_ops  *ops;
};

struct flow_cache_entry {
	struct flow_cache_entry	*next;
	struct flowi key;
	struct flow_cache_object *obj;
};

struct xfrm_policy {
	struct flow_cache_object flo;
	...
};

Cheers,
Herbert Xu April 4, 2010, 11:31 a.m. UTC | #22
On Sun, Apr 04, 2010 at 07:26:36PM +0800, Herbert Xu wrote:
> 
> Fine, move key into flow_cache_entry but the rest should still
> work, no?
> 
> struct flow_cache_object {
> 	u16			family;
> 	u8			dir;
> 	u32			genid;
> 	struct flow_cache_ops  *ops;
> };
> 
> struct flow_cache_entry {
> 	struct flow_cache_entry	*next;
> 	struct flowi key;
> 	struct flow_cache_object *obj;
> };
> 
> struct xfrm_policy {
> 	struct flow_cache_object flo;
> 	...
> };

OK, this doesn't work either, as we still have NULL objects for
now.  But I still think that even if the ops pointer is the only
member in flow_cache_object, it looks better than returning the
nested ops pointer directly from flow_cache_lookup.

Cheers,
Timo Teras April 4, 2010, 12:09 p.m. UTC | #23
Herbert Xu wrote:
> On Sun, Apr 04, 2010 at 07:26:36PM +0800, Herbert Xu wrote:
>> Fine, move key into flow_cache_entry but the rest should still
>> work, no?
>
> OK this doesn't work either as we still have NULL objects for
> now.  But I still think even if the ops pointer is the only
> member in flow_cache_object, it looks better than returning the
> nested ops pointer directly from flow_cache_lookup.

Yes, it'll look better. I'll wrap the pointer in a struct.

Ok, so far the changes are:
 - constify the ops
 - indentation fixes for the flow.c structs with pointer members
 - wrap the ops* in a struct* to avoid ops** (sketched below)
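
One way the reworked interface could look after those changes (a
sketch only; the refreshed patches may differ, and the get/check/delete
methods would be adjusted to take the wrapped object):

struct flow_cache_object;

struct flow_cache_ops {
	struct flow_cache_object *(*get)(struct flow_cache_object *);
	int (*check)(struct flow_cache_object *);
	void (*delete)(struct flow_cache_object *);
};

/* Embedded in xfrm_policy (and later in xfrm_dst); flow_cache_lookup()
 * then returns a flow_cache_object * instead of a nested ops pointer. */
struct flow_cache_object {
	const struct flow_cache_ops *ops;
};

static const struct flow_cache_ops xfrm_policy_fc_ops = {
	.get	= xfrm_policy_get_fce,
	.check	= xfrm_policy_check_fce,
	.delete	= xfrm_policy_delete_fce,
};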

Will fix and resend refreshed patches tomorrow.

Thanks.

Patch

diff --git a/include/net/flow.h b/include/net/flow.h
index 809970b..65d68a6 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -86,11 +86,21 @@  struct flowi {
 
 struct net;
 struct sock;
-typedef int (*flow_resolve_t)(struct net *net, struct flowi *key, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp);
 
-extern void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family,
-			       u8 dir, flow_resolve_t resolver);
+struct flow_cache_entry_ops {
+	struct flow_cache_entry_ops ** (*get)(struct flow_cache_entry_ops **);
+	int (*check)(struct flow_cache_entry_ops **);
+	void (*delete)(struct flow_cache_entry_ops **);
+};
+
+typedef struct flow_cache_entry_ops **(*flow_resolve_t)(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx);
+
+extern struct flow_cache_entry_ops **flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family,
+		u8 dir, flow_resolve_t resolver, void *ctx);
+
 extern void flow_cache_flush(void);
 extern atomic_t flow_cache_genid;
 
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d74e080..cb8934b 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -19,6 +19,7 @@ 
 #include <net/route.h>
 #include <net/ipv6.h>
 #include <net/ip6_fib.h>
+#include <net/flow.h>
 
 #include <linux/interrupt.h>
 
@@ -481,6 +482,7 @@  struct xfrm_policy {
 	atomic_t		refcnt;
 	struct timer_list	timer;
 
+	struct flow_cache_entry_ops *fc_ops;
 	u32			priority;
 	u32			index;
 	struct xfrm_mark	mark;
diff --git a/net/core/flow.c b/net/core/flow.c
index 1d27ca6..f938137 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -26,13 +26,12 @@ 
 #include <linux/security.h>
 
 struct flow_cache_entry {
-	struct flow_cache_entry	*next;
-	u16			family;
-	u8			dir;
-	u32			genid;
-	struct flowi		key;
-	void			*object;
-	atomic_t		*object_ref;
+	struct flow_cache_entry *	next;
+	u16				family;
+	u8				dir;
+	u32				genid;
+	struct flowi			key;
+	struct flow_cache_entry_ops **	ops;
 };
 
 struct flow_cache_percpu {
@@ -78,12 +77,21 @@  static void flow_cache_new_hashrnd(unsigned long arg)
 	add_timer(&fc->rnd_timer);
 }
 
+static int flow_entry_valid(struct flow_cache_entry *fle)
+{
+	if (atomic_read(&flow_cache_genid) != fle->genid)
+		return 0;
+	if (fle->ops && !(*fle->ops)->check(fle->ops))
+		return 0;
+	return 1;
+}
+
 static void flow_entry_kill(struct flow_cache *fc,
 			    struct flow_cache_percpu *fcp,
 			    struct flow_cache_entry *fle)
 {
-	if (fle->object)
-		atomic_dec(fle->object_ref);
+	if (fle->ops)
+		(*fle->ops)->delete(fle->ops);
 	kmem_cache_free(flow_cachep, fle);
 	fcp->hash_count--;
 }
@@ -96,16 +104,18 @@  static void __flow_cache_shrink(struct flow_cache *fc,
 	int i;
 
 	for (i = 0; i < flow_cache_hash_size(fc); i++) {
-		int k = 0;
+		int saved = 0;
 
 		flp = &fcp->hash_table[i];
-		while ((fle = *flp) != NULL && k < shrink_to) {
-			k++;
-			flp = &fle->next;
-		}
 		while ((fle = *flp) != NULL) {
-			*flp = fle->next;
-			flow_entry_kill(fc, fcp, fle);
+			if (saved < shrink_to &&
+			    flow_entry_valid(fle)) {
+				saved++;
+				flp = &fle->next;
+			} else {
+				*flp = fle->next;
+				flow_entry_kill(fc, fcp, fle);
+			}
 		}
 	}
 }
@@ -166,18 +176,21 @@  static int flow_key_compare(struct flowi *key1, struct flowi *key2)
 	return 0;
 }
 
-void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
-			flow_resolve_t resolver)
+struct flow_cache_entry_ops **flow_cache_lookup(
+		struct net *net, struct flowi *key, u16 family, u8 dir,
+		flow_resolve_t resolver, void *ctx)
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
 	struct flow_cache_entry *fle, **head;
+	struct flow_cache_entry_ops **ops;
 	unsigned int hash;
 
 	local_bh_disable();
 	fcp = per_cpu_ptr(fc->percpu, smp_processor_id());
 
 	fle = NULL;
+	ops = NULL;
 	/* Packet really early in init?  Making flow_cache_init a
 	 * pre-smp initcall would solve this.  --RR */
 	if (!fcp->hash_table)
@@ -185,24 +198,14 @@  void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 
 	if (fcp->hash_rnd_recalc)
 		flow_new_hash_rnd(fc, fcp);
-	hash = flow_hash_code(fc, fcp, key);
 
+	hash = flow_hash_code(fc, fcp, key);
 	head = &fcp->hash_table[hash];
 	for (fle = *head; fle; fle = fle->next) {
 		if (fle->family == family &&
 		    fle->dir == dir &&
-		    flow_key_compare(key, &fle->key) == 0) {
-			if (fle->genid == atomic_read(&flow_cache_genid)) {
-				void *ret = fle->object;
-
-				if (ret)
-					atomic_inc(fle->object_ref);
-				local_bh_enable();
-
-				return ret;
-			}
+		    flow_key_compare(key, &fle->key) == 0)
 			break;
-		}
 	}
 
 	if (!fle) {
@@ -215,37 +218,37 @@  void *flow_cache_lookup(struct net *net, struct flowi *key, u16 family, u8 dir,
 			*head = fle;
 			fle->family = family;
 			fle->dir = dir;
+			fle->ops = NULL;
 			memcpy(&fle->key, key, sizeof(*key));
-			fle->object = NULL;
+			fle->ops = NULL;
 			fcp->hash_count++;
 		}
+	} else if (fle->genid == atomic_read(&flow_cache_genid)) {
+		ops = fle->ops;
+		if (!ops)
+			goto ret_ops;
+		ops = (*ops)->get(ops);
+		if (ops)
+			goto ret_ops;
 	}
 
 nocache:
-	{
-		int err;
-		void *obj;
-		atomic_t *obj_ref;
-
-		err = resolver(net, key, family, dir, &obj, &obj_ref);
-
-		if (fle && !err) {
-			fle->genid = atomic_read(&flow_cache_genid);
-
-			if (fle->object)
-				atomic_dec(fle->object_ref);
-
-			fle->object = obj;
-			fle->object_ref = obj_ref;
-			if (obj)
-				atomic_inc(fle->object_ref);
+	ops = resolver(net, key, family, dir, fle ? fle->ops : NULL, ctx);
+	if (fle) {
+		fle->genid = atomic_read(&flow_cache_genid);
+		if (IS_ERR(ops)) {
+			fle->genid--;
+			fle->ops = NULL;
+		} else {
+			fle->ops = ops;
 		}
-		local_bh_enable();
-
-		if (err)
-			obj = ERR_PTR(err);
-		return obj;
+	} else {
+		if (ops && !IS_ERR(ops))
+			(*ops)->delete(ops);
 	}
+ret_ops:
+	local_bh_enable();
+	return ops;
 }
 
 static void flow_cache_flush_tasklet(unsigned long data)
@@ -261,13 +264,12 @@  static void flow_cache_flush_tasklet(unsigned long data)
 
 		fle = fcp->hash_table[i];
 		for (; fle; fle = fle->next) {
-			unsigned genid = atomic_read(&flow_cache_genid);
-
-			if (!fle->object || fle->genid == genid)
+			if (flow_entry_valid(fle))
 				continue;
 
-			fle->object = NULL;
-			atomic_dec(fle->object_ref);
+			if (fle->ops)
+				(*fle->ops)->delete(fle->ops);
+			fle->ops = NULL;
 		}
 	}
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 82789cf..20f5b01 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -216,6 +216,36 @@  expired:
 	xfrm_pol_put(xp);
 }
 
+static struct flow_cache_entry_ops **xfrm_policy_get_fce(
+		struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_policy *pol = container_of(ops, struct xfrm_policy, fc_ops);
+
+	if (unlikely(pol->walk.dead))
+		ops = NULL;
+	else
+		xfrm_pol_hold(pol);
+
+	return ops;
+}
+
+static int xfrm_policy_check_fce(struct flow_cache_entry_ops **ops)
+{
+	struct xfrm_policy *pol = container_of(ops, struct xfrm_policy, fc_ops);
+
+	return !pol->walk.dead;
+}
+
+static void xfrm_policy_delete_fce(struct flow_cache_entry_ops **ops)
+{
+	xfrm_pol_put(container_of(ops, struct xfrm_policy, fc_ops));
+}
+
+static struct flow_cache_entry_ops xfrm_policy_fc_ops __read_mostly = {
+	.get = xfrm_policy_get_fce,
+	.check = xfrm_policy_check_fce,
+	.delete = xfrm_policy_delete_fce,
+};
 
 /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2
  * SPD calls.
@@ -236,6 +266,7 @@  struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
 		atomic_set(&policy->refcnt, 1);
 		setup_timer(&policy->timer, xfrm_policy_timer,
 				(unsigned long)policy);
+		policy->fc_ops = &xfrm_policy_fc_ops;
 	}
 	return policy;
 }
@@ -269,9 +300,6 @@  static void xfrm_policy_gc_kill(struct xfrm_policy *policy)
 	if (del_timer(&policy->timer))
 		atomic_dec(&policy->refcnt);
 
-	if (atomic_read(&policy->refcnt) > 1)
-		flow_cache_flush();
-
 	xfrm_pol_put(policy);
 }
 
@@ -661,10 +689,8 @@  struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
@@ -703,10 +729,8 @@  struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
 	}
 	write_unlock_bh(&xfrm_policy_lock);
 
-	if (ret && delete) {
-		atomic_inc(&flow_cache_genid);
+	if (ret && delete)
 		xfrm_policy_kill(ret);
-	}
 	return ret;
 }
 EXPORT_SYMBOL(xfrm_policy_byid);
@@ -822,7 +846,6 @@  int xfrm_policy_flush(struct net *net, u8 type, struct xfrm_audit *audit_info)
 	}
 	if (!cnt)
 		err = -ESRCH;
-	atomic_inc(&flow_cache_genid);
 out:
 	write_unlock_bh(&xfrm_policy_lock);
 	return err;
@@ -976,32 +999,35 @@  fail:
 	return ret;
 }
 
-static int xfrm_policy_lookup(struct net *net, struct flowi *fl, u16 family,
-			      u8 dir, void **objp, atomic_t **obj_refp)
+static struct flow_cache_entry_ops **xfrm_policy_lookup(
+		struct net *net, struct flowi *fl, u16 family,
+		u8 dir, struct flow_cache_entry_ops **old_ops, void *ctx)
 {
 	struct xfrm_policy *pol;
-	int err = 0;
+
+	if (old_ops)
+		xfrm_pol_put(container_of(old_ops, struct xfrm_policy, fc_ops));
 
 #ifdef CONFIG_XFRM_SUB_POLICY
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-	if (pol || err)
-		goto end;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
 #endif
 	pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
-	if (IS_ERR(pol)) {
-		err = PTR_ERR(pol);
-		pol = NULL;
-	}
-#ifdef CONFIG_XFRM_SUB_POLICY
-end:
-#endif
-	if ((*objp = (void *) pol) != NULL)
-		*obj_refp = &pol->refcnt;
-	return err;
+	if (IS_ERR(pol))
+		return ERR_CAST(pol);
+	if (pol)
+		goto found;
+	return NULL;
+
+found:
+	/* Resolver returns two references:
+	 * one for cache and one for caller of flow_cache_lookup() */
+	xfrm_pol_hold(pol);
+
+	return &pol->fc_ops;
 }
 
 static inline int policy_to_flow_dir(int dir)
@@ -1091,8 +1117,6 @@  int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 	pol = __xfrm_policy_unlink(pol, dir);
 	write_unlock_bh(&xfrm_policy_lock);
 	if (pol) {
-		if (dir < XFRM_POLICY_MAX)
-			atomic_inc(&flow_cache_genid);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
@@ -1578,18 +1602,24 @@  restart:
 	}
 
 	if (!policy) {
+		struct flow_cache_entry_ops **ops;
+
 		/* To accelerate a bit...  */
 		if ((dst_orig->flags & DST_NOXFRM) ||
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
-		policy = flow_cache_lookup(net, fl, dst_orig->ops->family,
-					   dir, xfrm_policy_lookup);
-		err = PTR_ERR(policy);
-		if (IS_ERR(policy)) {
+		ops = flow_cache_lookup(net, fl, dst_orig->ops->family,
+					dir, xfrm_policy_lookup, NULL);
+		err = PTR_ERR(ops);
+		if (IS_ERR(ops)) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
 			goto dropdst;
 		}
+		if (ops)
+			policy = container_of(ops, struct xfrm_policy, fc_ops);
+		else
+			policy = NULL;
 	}
 
 	if (!policy)
@@ -1939,9 +1969,16 @@  int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
 		}
 	}
 
-	if (!pol)
-		pol = flow_cache_lookup(net, &fl, family, fl_dir,
-					xfrm_policy_lookup);
+	if (!pol) {
+		struct flow_cache_entry_ops **ops;
+
+		ops = flow_cache_lookup(net, &fl, family, fl_dir,
+					xfrm_policy_lookup, NULL);
+		if (IS_ERR(ops))
+			pol = ERR_CAST(ops);
+		else if (ops)
+			pol = container_of(ops, struct xfrm_policy, fc_ops);
+	}
 
 	if (IS_ERR(pol)) {
 		XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);