diff mbox

net: make ip_rt_acct a normal percpu var

Message ID 200811172050.31308.rusty@rustcorp.com.au
State Rejected, archived
Delegated to: David Miller
Headers show

Commit Message

Rusty Russell Nov. 17, 2008, 10:20 a.m. UTC
There's no reason for this to be dynamically allocated: we panic if
allocation fails anyway.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>


Comments

Eric Dumazet Nov. 17, 2008, 10:36 p.m. UTC | #1
Rusty Russell a écrit :
> There's no reason for this to be dynamically allocated: we panic if
> allocation fails anyway.
> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> 

Well, since we dont have "per_cpu bss", this also grow the size of vmlinux file, with
a block of 4096 nul bytes.

Same thing for rt_hash_lock_init() : We dynamically allocate the rt_hash_locks,
while its size is usually quite small, and we could avoid the pointer.

Maybe with current machines/boot loaders, we dont care anymore, I dont know...

> diff -r 643c2cdfdb62 include/net/route.h
> --- a/include/net/route.h	Mon Nov 17 19:57:42 2008 +1030
> +++ b/include/net/route.h	Mon Nov 17 20:23:37 2008 +1030
> @@ -34,6 +34,7 @@
>  #include <linux/ip.h>
>  #include <linux/cache.h>
>  #include <linux/security.h>
> +#include <linux/percpu.h>
>  
>  #ifndef __KERNEL__
>  #warning This file is not supposed to be used outside of kernel.
> @@ -105,7 +106,7 @@
>          unsigned int out_hlist_search;
>  };
>  
> -extern struct ip_rt_acct *ip_rt_acct;
> +DECLARE_PER_CPU(struct ip_rt_acct[256], ip_rt_acct);
>  
>  struct in_device;
>  extern int		ip_rt_init(void);
> diff -r 643c2cdfdb62 net/ipv4/ip_input.c
> --- a/net/ipv4/ip_input.c	Mon Nov 17 19:57:42 2008 +1030
> +++ b/net/ipv4/ip_input.c	Mon Nov 17 20:23:37 2008 +1030
> @@ -339,7 +339,7 @@
>  
>  #ifdef CONFIG_NET_CLS_ROUTE
>  	if (unlikely(skb->dst->tclassid)) {
> -		struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
> +		struct ip_rt_acct *st = __get_cpu_var(ip_rt_acct);
>  		u32 idx = skb->dst->tclassid;
>  		st[idx&0xFF].o_packets++;
>  		st[idx&0xFF].o_bytes+=skb->len;
> diff -r 643c2cdfdb62 net/ipv4/route.c
> --- a/net/ipv4/route.c	Mon Nov 17 19:57:42 2008 +1030
> +++ b/net/ipv4/route.c	Mon Nov 17 20:23:37 2008 +1030
> @@ -541,7 +541,7 @@
>  			unsigned int j;
>  			u32 *src;
>  
> -			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
> +			src = ((u32 *)per_cpu(ip_rt_acct, i)) + offset;
>  			for (j = 0; j < length/4; j++)
>  				dst[j] += src[j];
>  		}
> @@ -3237,7 +3237,7 @@
>  
>  
>  #ifdef CONFIG_NET_CLS_ROUTE
> -struct ip_rt_acct *ip_rt_acct __read_mostly;
> +DEFINE_PER_CPU(struct ip_rt_acct[256], ip_rt_acct);
>  #endif /* CONFIG_NET_CLS_ROUTE */
>  
>  static __initdata unsigned long rhash_entries;
> @@ -3253,12 +3253,6 @@
>  int __init ip_rt_init(void)
>  {
>  	int rc = 0;
> -
> -#ifdef CONFIG_NET_CLS_ROUTE
> -	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
> -	if (!ip_rt_acct)
> -		panic("IP: failed to allocate ip_rt_acct\n");
> -#endif
>  
>  	ipv4_dst_ops.kmem_cachep =
>  		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
>  
> ���{.n�+�������+%�����ݶ��w��{.n�+���z�^����ܨ}���Ơz�&j:+v�������w����2�ޙ���&�)ߡ�a������z��z�ޗ�+��ݢj��w��f
> 
> 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rusty Russell Nov. 18, 2008, 3:38 p.m. UTC | #2
On Tuesday 18 November 2008 09:06:14 Eric Dumazet wrote:
> Rusty Russell a écrit :
> > There's no reason for this to be dynamically allocated: we panic if
> > allocation fails anyway.
> >
> > Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
>
> Well, since we dont have "per_cpu bss", this also grow the size of vmlinux
> file, with a block of 4096 nul bytes.
...
> Maybe with current machines/boot loaders, we dont care anymore, I dont
> know...

Good q.  If people care about on-disk size, maybe we should look at bzImage 
size?

Anyway, here 'tis:
Before:
  text    data     bss     dec     hex filename
4116153  433292  475136 5024581  4cab45 vmlinux
-rw-r--r-- 1 rusty rusty 2251984 2008-11-19 02:03 arch/x86/boot/bzImage

After:
   text    data     bss     dec     hex filename
4116089  437388  475136 5028613  4cbb05 vmlinux
-rw-r--r-- 1 rusty rusty 2252016 2008-11-19 02:05 arch/x86/boot/bzImage

Cheers,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Nov. 19, 2008, 10:20 p.m. UTC | #3
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Wed, 19 Nov 2008 02:08:10 +1030

> On Tuesday 18 November 2008 09:06:14 Eric Dumazet wrote:
> > Rusty Russell a écrit :
> > > There's no reason for this to be dynamically allocated: we panic if
> > > allocation fails anyway.
> > >
> > > Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> >
> > Well, since we dont have "per_cpu bss", this also grow the size of vmlinux
> > file, with a block of 4096 nul bytes.
> ...
> > Maybe with current machines/boot loaders, we dont care anymore, I dont
> > know...
> 
> Good q.  If people care about on-disk size, maybe we should look at bzImage 
> size?

It's the size of the final uncompressed loaded image that matters for
some bootloader limits.

I really don't like anything that increases the size like this, to be
honest.

Do you really need this to forward some work you are doing?  If not
can we just let sleeping dogs lie on this one? :)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rusty Russell Nov. 19, 2008, 11:13 p.m. UTC | #4
On Thursday 20 November 2008 08:50:23 David Miller wrote:
> Do you really need this to forward some work you are doing?  If not
> can we just let sleeping dogs lie on this one? :)

Yes, I have patches to convert the dynamic percpu data to use the same 
mechanism as static percpu data.  Unfortunately we don't have a mechanism for 
enlarging the percpu region (which is why this wasn't done earlier), so we use 
a heuristic to figure out how much extra percpu region to allocate at boot.

And 4k makes this one of the Big Pigs in dynamic per-cpu allocations.

(SNMP mibs are even worse, but that's a separate debate...)

I can try to implement a bss-like DEFINE_PER_CPU_ZERO(), but it seems silly to 
talk about tight boot loader size restrictions for SMP kernels.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Nov. 19, 2008, 11:17 p.m. UTC | #5
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 20 Nov 2008 09:43:21 +1030

> On Thursday 20 November 2008 08:50:23 David Miller wrote:
> > Do you really need this to forward some work you are doing?  If not
> > can we just let sleeping dogs lie on this one? :)
> 
> Yes, I have patches to convert the dynamic percpu data to use the same 
> mechanism as static percpu data.  Unfortunately we don't have a mechanism for 
> enlarging the percpu region (which is why this wasn't done earlier), so we use 
> a heuristic to figure out how much extra percpu region to allocate at boot.
> 
> And 4k makes this one of the Big Pigs in dynamic per-cpu allocations.
> 
> (SNMP mibs are even worse, but that's a separate debate...)

We make a big fuss (rightly) about a few hundred bytes and this sucker
is FOUR KILOBYTES.

Really for the time being I'd rather see this converted to a
num_possible_cpus() sized normal kzalloc() and direct indexing.  I
don't want the networking to bloat up the main kernel image by so
much.

> I can try to implement a bss-like DEFINE_PER_CPU_ZERO(), but it seems silly to 
> talk about tight boot loader size restrictions for SMP kernels.

SMP these days is candy.  And sparc is what has the bootloader restrictions
btw :)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Nov. 19, 2008, 11:23 p.m. UTC | #6
Rusty Russell a écrit :
> On Thursday 20 November 2008 08:50:23 David Miller wrote:
>> Do you really need this to forward some work you are doing?  If not
>> can we just let sleeping dogs lie on this one? :)
> 
> Yes, I have patches to convert the dynamic percpu data to use the same 
> mechanism as static percpu data.  Unfortunately we don't have a mechanism for 
> enlarging the percpu region (which is why this wasn't done earlier), so we use 
> a heuristic to figure out how much extra percpu region to allocate at boot.
> 
> And 4k makes this one of the Big Pigs in dynamic per-cpu allocations.
> 
> (SNMP mibs are even worse, but that's a separate debate...)
> 
> I can try to implement a bss-like DEFINE_PER_CPU_ZERO(), but it seems silly to 
> talk about tight boot loader size restrictions for SMP kernels.
> 

Then, if we really want to run 4096 cpus on a machine, we dont want to allocate
16 MBytes of memory for these ip_rt_acct counters, or even more for SNMP mibs.

Maybe its time to design a new mechanism, to avoid the basic "one variable" shared
by all cpus, and avoid the overkill "one separate variable for each cpu", and loop
4096 times to do the sum of this variable...

Something that would allocate a maximum of eight blocs.

Then atomic ops would be necessary for updates of SNMP counters (only if NR_CPUS > 8)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rusty Russell Nov. 20, 2008, 12:28 a.m. UTC | #7
On Thursday 20 November 2008 09:53:48 Eric Dumazet wrote:
> Rusty Russell a écrit :
> > On Thursday 20 November 2008 08:50:23 David Miller wrote:
> >> Do you really need this to forward some work you are doing?  If not
> >> can we just let sleeping dogs lie on this one? :)
> >
> > Yes, I have patches to convert the dynamic percpu data to use the same
> > mechanism as static percpu data.  Unfortunately we don't have a mechanism
> > for enlarging the percpu region (which is why this wasn't done earlier),
> > so we use a heuristic to figure out how much extra percpu region to
> > allocate at boot.
> >
> > And 4k makes this one of the Big Pigs in dynamic per-cpu allocations.
> >
> > (SNMP mibs are even worse, but that's a separate debate...)
> >
> > I can try to implement a bss-like DEFINE_PER_CPU_ZERO(), but it seems
> > silly to talk about tight boot loader size restrictions for SMP kernels.
>
> Then, if we really want to run 4096 cpus on a machine, we dont want to
> allocate 16 MBytes of memory for these ip_rt_acct counters, or even more
> for SNMP mibs.
>
> Maybe its time to design a new mechanism, to avoid the basic "one variable"
> shared by all cpus, and avoid the overkill "one separate variable for each
> cpu", and loop 4096 times to do the sum of this variable...

Per-node vars; no doubt we'll get there.  It might be worth having YA percpu 
counters implementation which does exactly this.  After the dynamic percpu 
changes and some local_* ops changes to allow use with dynamic percpu vars, it 
should be straightforward.

I don't think it's urgent: my concern is not with people who have 4096 cpus 
(but I do care about people with 2 cpus and CONFIG_NR_CPUS=4096).

Cheers,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rusty Russell Nov. 20, 2008, 4:22 a.m. UTC | #8
On Thursday 20 November 2008 09:47:54 David Miller wrote:
> We make a big fuss (rightly) about a few hundred bytes and this sucker
> is FOUR KILOBYTES.

Sure, but this is just trading 4k runtime for 4k static, ie. it's just 
unhiding it.  That's very different from adding random bloat.

> Really for the time being I'd rather see this converted to a
> num_possible_cpus() sized normal kzalloc() and direct indexing.  I
> don't want the networking to bloat up the main kernel image by so
> much.

Nooooooo! :)

First, num_possible_cpus() is the wrong answer (if there are holes in 
cpu_possible_map).  Second, there's little point having percpu infrastructure 
which people avoid.  Third, that memory not going to be numa-aware.  Fourth, 
the dynamic percpu version is fewer instructions (with dynamic percpu 
patches).

I've spent an hour trying to implement DEFINE_PER_CPU_ZERO(), but AFAICT it 
can't be done (gas rejects "b" as a section attribute) :(

So I've dropped the patch, but I wonder if this 4k of stats should be here at 
all.

Thanks,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff -r 643c2cdfdb62 include/net/route.h
--- a/include/net/route.h	Mon Nov 17 19:57:42 2008 +1030
+++ b/include/net/route.h	Mon Nov 17 20:23:37 2008 +1030
@@ -34,6 +34,7 @@ 
 #include <linux/ip.h>
 #include <linux/cache.h>
 #include <linux/security.h>
+#include <linux/percpu.h>
 
 #ifndef __KERNEL__
 #warning This file is not supposed to be used outside of kernel.
@@ -105,7 +106,7 @@ 
         unsigned int out_hlist_search;
 };
 
-extern struct ip_rt_acct *ip_rt_acct;
+DECLARE_PER_CPU(struct ip_rt_acct[256], ip_rt_acct);
 
 struct in_device;
 extern int		ip_rt_init(void);
diff -r 643c2cdfdb62 net/ipv4/ip_input.c
--- a/net/ipv4/ip_input.c	Mon Nov 17 19:57:42 2008 +1030
+++ b/net/ipv4/ip_input.c	Mon Nov 17 20:23:37 2008 +1030
@@ -339,7 +339,7 @@ 
 
 #ifdef CONFIG_NET_CLS_ROUTE
 	if (unlikely(skb->dst->tclassid)) {
-		struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
+		struct ip_rt_acct *st = __get_cpu_var(ip_rt_acct);
 		u32 idx = skb->dst->tclassid;
 		st[idx&0xFF].o_packets++;
 		st[idx&0xFF].o_bytes+=skb->len;
diff -r 643c2cdfdb62 net/ipv4/route.c
--- a/net/ipv4/route.c	Mon Nov 17 19:57:42 2008 +1030
+++ b/net/ipv4/route.c	Mon Nov 17 20:23:37 2008 +1030
@@ -541,7 +541,7 @@ 
 			unsigned int j;
 			u32 *src;
 
-			src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
+			src = ((u32 *)per_cpu(ip_rt_acct, i)) + offset;
 			for (j = 0; j < length/4; j++)
 				dst[j] += src[j];
 		}
@@ -3237,7 +3237,7 @@ 
 
 
 #ifdef CONFIG_NET_CLS_ROUTE
-struct ip_rt_acct *ip_rt_acct __read_mostly;
+DEFINE_PER_CPU(struct ip_rt_acct[256], ip_rt_acct);
 #endif /* CONFIG_NET_CLS_ROUTE */
 
 static __initdata unsigned long rhash_entries;
@@ -3253,12 +3253,6 @@ 
 int __init ip_rt_init(void)
 {
 	int rc = 0;
-
-#ifdef CONFIG_NET_CLS_ROUTE
-	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
-	if (!ip_rt_acct)
-		panic("IP: failed to allocate ip_rt_acct\n");
-#endif
 
 	ipv4_dst_ops.kmem_cachep =
 		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,