
[for 2.6.32 (untested)] netns: Add quota for number of NET_NS instances.

Message ID 201111201622.FDJ51567.VLFHQFMFOOSOtJ@I-love.SAKURA.ne.jp
State Not Applicable, archived
Delegated to: David Miller

Commit Message

Tetsuo Handa Nov. 20, 2011, 7:22 a.m. UTC
In order to solve the problems below, can we add a sysctl variable for
restricting the number of NET_NS instances?
--------------------------------------------------
[PATCH for 2.6.32 (untested)] netns: Add quota for number of NET_NS instances.

CONFIG_NET_NS support in 2.6.32 has a problem that leads to the OOM killer
when clone(CLONE_NEWNET) is called in rapid succession.
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/720095
But disabling CONFIG_NET_NS broke lxc containers.
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/790863

This patch introduces a /proc/sys/net/core/netns_max interface that limits
the maximum number of network namespace instances.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
 include/net/sock.h         |    4 ++++
 net/core/net_namespace.c   |    9 +++++++++
 net/core/sysctl_net_core.c |   10 ++++++++++
 3 files changed, 23 insertions(+)
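
For reference, a hedged usage sketch (not part of the patch): a privileged
program could raise the proposed limit by writing to the new sysctl file.
The value 64 below is an arbitrary example.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/core/netns_max", "w");

	if (!f) {
		perror("fopen /proc/sys/net/core/netns_max");
		return 1;
	}
	fprintf(f, "64\n");	/* allow up to 64 network namespaces */
	return fclose(f) == 0 ? 0 : 1;
}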


Comments

Eric W. Biederman Nov. 20, 2011, 11:13 p.m. UTC | #1
Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> writes:

> In order to solve the problems below, can we add a sysctl variable for
> restricting the number of NET_NS instances?

I don't have any particular problems with the patch, but I don't think it
will result in a working system that is easy to keep working.  Tuning
static limits can be fickle.

Simply throttling the number of processes, as anything reasonable will do,
should keep the problem in check.  The practical issue is that we have a
huge buildup of network namespaces that don't get cleaned up.

My inclination is that the practical fix in this case is for network
namespace allocation to take a look at the cleanup_list: see whether
there is ongoing cleanup activity, and wait until at least one network
namespace has been cleaned up, perhaps by creating a work struct and
waiting for it to cycle through the netns workqueue.

That should throttle network namespace creation to the same speed as
network namespace deletion and prevent the problem of too many
dead network namespaces building up and taking resources.
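
A minimal sketch of that idea, against 2.6.32's net/core/net_namespace.c
(where __put_net() queues cleanup_net() work items on netns_wq, created with
create_singlethread_workqueue(), so items run in order).  The helper name
net_wait_for_one_cleanup() is invented for illustration; this is not Eric's
actual code.

/* Hedged sketch only, untested.  Queue a no-op work item behind any
 * pending cleanup_net() work and wait for it, so at least the cleanups
 * queued ahead of us have finished before another netns is created.
 * Assumes <linux/workqueue.h>, as already included by this file. */
static void net_cleanup_barrier(struct work_struct *work)
{
	/* intentionally empty */
}

static void net_wait_for_one_cleanup(void)
{
	struct work_struct barrier;

	INIT_WORK(&barrier, net_cleanup_barrier);
	queue_work(netns_wq, &barrier);
	flush_work(&barrier);	/* returns once earlier cleanups have run */
}

Calling such a helper from the namespace-creation path would pace creation to
the speed of destruction without introducing a fixed limit.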

Eric


> --------------------------------------------------
> [PATCH for 2.6.32 (untested)] netns: Add quota for number of NET_NS instances.
>
> CONFIG_NET_NS support in 2.6.32 has a problem that leads to the OOM killer
> when clone(CLONE_NEWNET) is called in rapid succession.
> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/720095
> But disabling CONFIG_NET_NS broke lxc containers.
> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/790863
>
> This patch introduces a /proc/sys/net/core/netns_max interface that limits
> the maximum number of network namespace instances.
>
> Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
> ---
>  include/net/sock.h         |    4 ++++
>  net/core/net_namespace.c   |    9 +++++++++
>  net/core/sysctl_net_core.c |   10 ++++++++++
>  3 files changed, 23 insertions(+)
>
> --- linux-2.6.32.48.orig/include/net/sock.h
> +++ linux-2.6.32.48/include/net/sock.h
> @@ -1598,4 +1598,8 @@ extern int sysctl_optmem_max;
>  extern __u32 sysctl_wmem_default;
>  extern __u32 sysctl_rmem_default;
>  
> +#ifdef CONFIG_NET_NS
> +extern int max_netns_count;
> +#endif
> +
>  #endif	/* _SOCK_H */
> --- linux-2.6.32.48.orig/net/core/net_namespace.c
> +++ linux-2.6.32.48/net/core/net_namespace.c
> @@ -81,12 +81,18 @@ static struct net_generic *net_alloc_gen
>  #ifdef CONFIG_NET_NS
>  static struct kmem_cache *net_cachep;
>  static struct workqueue_struct *netns_wq;
> +static atomic_t used_netns_count = ATOMIC_INIT(0);
> +int max_netns_count;
>  
>  static struct net *net_alloc(void)
>  {
>  	struct net *net = NULL;
>  	struct net_generic *ng;
>  
> +	atomic_inc(&used_netns_count);
> +	if (atomic_read(&used_netns_count) > max_netns_count)
> +		goto out;
> +
>  	ng = net_alloc_generic();
>  	if (!ng)
>  		goto out;
> @@ -96,7 +102,9 @@ static struct net *net_alloc(void)
>  		goto out_free;
>  
>  	rcu_assign_pointer(net->gen, ng);
> +	return net;
>  out:
> +	atomic_dec(&used_netns_count);
>  	return net;
>  
>  out_free:
> @@ -115,6 +123,7 @@ static void net_free(struct net *net)
>  #endif
>  	kfree(net->gen);
>  	kmem_cache_free(net_cachep, net);
> +	atomic_dec(&used_netns_count);
>  }
>  
>  static struct net *net_create(void)
> --- linux-2.6.32.48.orig/net/core/sysctl_net_core.c
> +++ linux-2.6.32.48/net/core/sysctl_net_core.c
> @@ -89,6 +89,16 @@ static struct ctl_table net_core_table[]
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec
>  	},
> +#ifdef CONFIG_NET_NS
> +	{
> +		.ctl_name       = CTL_UNNUMBERED,
> +		.procname       = "netns_max",
> +		.data           = &max_netns_count,
> +		.maxlen         = sizeof(int),
> +		.mode           = 0644,
> +		.proc_handler   = proc_dointvec,
> +	},
> +#endif
>  #endif /* CONFIG_NET */
>  	{
>  		.ctl_name	= NET_CORE_BUDGET,
Tetsuo Handa Nov. 21, 2011, 1:57 a.m. UTC | #2
Eric W. Biederman wrote:
> Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> writes:
> 
> > In order to solve the problems below, can we add a sysctl variable for
> > restricting the number of NET_NS instances?
> 
> I don't have any particular problems with the patch, but I don't think it
> will result in a working system that is easy to keep working.  Tuning
> static limits can be fickle.

What I worry about is that, although clone() is an operation that is allowed
to sleep, waiting too long might be annoying for users, especially when the
user cannot easily send Ctrl-C or SIGKILL. (I think an ftp client is an example.)

> My inclination is that the practical fix in this case is for network
> namespace allocation to take a look at the cleanup_list: see whether
> there is ongoing cleanup activity, and wait until at least one network
> namespace has been cleaned up, perhaps by creating a work struct and
> waiting for it to cycle through the netns workqueue.

Are you suggesting that we should wait only when "the number of NET_NS
instances exceeds the quota" and "there is a dead NET_NS instance"?
In other words, should clone() fail immediately if "the number of NET_NS
instances exceeds the quota" but "cleanup_list is empty"?

If you are suggesting that we should always wait until "the number of NET_NS
instances becomes smaller than the quota", clone() might sleep for too long
when the user cannot easily send signals.
Eric W. Biederman Nov. 21, 2011, 2:45 a.m. UTC | #3
Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> writes:

> Eric W. Biederman wrote:
>> Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> writes:
>> 
>> > In order to solve the problems below, can we add a sysctl variable for
>> > restricting the number of NET_NS instances?
>> 
>> I don't have any particular problems with the patch, but I don't think it
>> will result in a working system that is easy to keep working.  Tuning
>> static limits can be fickle.
>
> What I worry about is that, although clone() is an operation that is allowed
> to sleep, waiting too long might be annoying for users, especially when the
> user cannot easily send Ctrl-C or SIGKILL. (I think an ftp client is an
> example.)

An ftp client can always close the connection.  We already have to
contend for the net_mutex when both creating and destroying network
namespaces, so I would be surprised if it is actually a problem.

But the reality is that under high connection load, if we actually want
to use network namespaces, we have to wait for previous network
namespaces to be cleaned up.  So I am not particularly worried,
especially since most of the cleanup-speed issues when there is a
backlog have been fixed in more recent kernels.

>> My inclination is that the practical fix in this case is for network
>> namespace allocation to take a look at the cleanup_list: see whether
>> there is ongoing cleanup activity, and wait until at least one network
>> namespace has been cleaned up, perhaps by creating a work struct and
>> waiting for it to cycle through the netns workqueue.
>
> Are you suggesting that we should wait only when "the number of NET_NS
> instances exceeds the quota" and "there is a dead NET_NS instance"?
> In other words, should clone() fail immediately if "the number of NET_NS
> instances exceeds the quota" but "cleanup_list is empty"?
>
> If you are suggesting that we should always wait until "the number of NET_NS
> instances becomes smaller than the quota", clone() might sleep for too long
> when the user cannot easily send signals.

I am suggesting that if a netns instance is being cleaned up, we should
wait for one netns instance to be cleaned up.  A single netns instance
does not take long to clean up (in general), but a lot of netns
instances do take a while.

By waiting for one netns instance to be cleaned up, we should be able
to guarantee that we don't develop a substantial backlog of network
namespaces waiting to be cleaned up.  And that was the problem.

I don't expect we need to do anything if there are no network namespaces
being cleaned up.
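
Sketching, with the same caveats, how that might be wired into 2.6.32's
net_alloc(): dead_netns_count is a counter invented for this illustration
(it would be incremented in __put_net() and decremented when cleanup_net()
finishes), and net_wait_for_one_cleanup() is the barrier helper sketched
earlier in the thread.

/* Hypothetical hook, untested: throttle creation only while teardown is
 * in flight; never refuse a new namespace outright. */
static atomic_t dead_netns_count = ATOMIC_INIT(0);

static struct net *net_alloc(void)
{
	struct net *net = NULL;
	struct net_generic *ng;

	if (atomic_read(&dead_netns_count) > 0)
		net_wait_for_one_cleanup();

	ng = net_alloc_generic();
	if (!ng)
		goto out;
	net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
	if (!net)
		goto out_free;
	rcu_assign_pointer(net->gen, ng);
out:
	return net;
out_free:
	kfree(ng);
	goto out;
}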

There is of course Debian's solution, which was to simply tweak vsftpd
to not use network namespaces on 2.6.32 and only enable the feature
on later kernels.  But you seem to want to do something a little
more substantial than that.

Eric


Patch

--- linux-2.6.32.48.orig/include/net/sock.h
+++ linux-2.6.32.48/include/net/sock.h
@@ -1598,4 +1598,8 @@  extern int sysctl_optmem_max;
 extern __u32 sysctl_wmem_default;
 extern __u32 sysctl_rmem_default;
 
+#ifdef CONFIG_NET_NS
+extern int max_netns_count;
+#endif
+
 #endif	/* _SOCK_H */
--- linux-2.6.32.48.orig/net/core/net_namespace.c
+++ linux-2.6.32.48/net/core/net_namespace.c
@@ -81,12 +81,18 @@  static struct net_generic *net_alloc_gen
 #ifdef CONFIG_NET_NS
 static struct kmem_cache *net_cachep;
 static struct workqueue_struct *netns_wq;
+static atomic_t used_netns_count = ATOMIC_INIT(0);
+int max_netns_count;
 
 static struct net *net_alloc(void)
 {
 	struct net *net = NULL;
 	struct net_generic *ng;
 
+	atomic_inc(&used_netns_count);
+	if (atomic_read(&used_netns_count) > max_netns_count)
+		goto out;
+
 	ng = net_alloc_generic();
 	if (!ng)
 		goto out;
@@ -96,7 +102,9 @@  static struct net *net_alloc(void)
 		goto out_free;
 
 	rcu_assign_pointer(net->gen, ng);
+	return net;
 out:
+	atomic_dec(&used_netns_count);
 	return net;
 
 out_free:
@@ -115,6 +123,7 @@  static void net_free(struct net *net)
 #endif
 	kfree(net->gen);
 	kmem_cache_free(net_cachep, net);
+	atomic_dec(&used_netns_count);
 }
 
 static struct net *net_create(void)
--- linux-2.6.32.48.orig/net/core/sysctl_net_core.c
+++ linux-2.6.32.48/net/core/sysctl_net_core.c
@@ -89,6 +89,16 @@  static struct ctl_table net_core_table[]
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+#ifdef CONFIG_NET_NS
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "netns_max",
+		.data           = &max_netns_count,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+#endif
 #endif /* CONFIG_NET */
 	{
 		.ctl_name	= NET_CORE_BUDGET,