diff mbox series

[RFC,04/27] containers: Allow a process to be forked into a container

Message ID 155024686966.21651.5963892339360034863.stgit@warthog.procyon.org.uk
State New
Headers show
Series Containers and using authenticated filesystems | expand

Commit Message

David Howells Feb. 15, 2019, 4:07 p.m. UTC
Allow a single process to be forked directly into a container using a new
syscall, thereby 'booting' the container:

	pid_t pid = fork_into_container(int container_fd);

This process will be the 'init' process of the container.

Further attempts to fork into the container will be rejected.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 arch/x86/entry/syscalls/syscall_32.tbl |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 arch/x86/ia32/sys_ia32.c               |    2 -
 include/linux/cred.h                   |    3 +
 include/linux/nsproxy.h                |    7 ++
 include/linux/sched/task.h             |    3 +
 include/linux/syscalls.h               |    1 
 kernel/cred.c                          |   45 +++++++++++++
 kernel/fork.c                          |  110 ++++++++++++++++++++++++++------
 kernel/nsproxy.c                       |   11 +++
 kernel/sys_ni.c                        |    1 
 11 files changed, 157 insertions(+), 28 deletions(-)

Comments

Stephen Smalley Feb. 15, 2019, 5:39 p.m. UTC | #1
On 2/15/19 11:07 AM, David Howells wrote:
> Allow a single process to be forked directly into a container using a new
> syscall, thereby 'booting' the container:
> 
> 	pid_t pid = fork_into_container(int container_fd);
> 
> This process will be the 'init' process of the container.
> 
> Further attempts to fork into the container will be rejected.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
> 
>   arch/x86/entry/syscalls/syscall_32.tbl |    1
>   arch/x86/entry/syscalls/syscall_64.tbl |    1
>   arch/x86/ia32/sys_ia32.c               |    2 -
>   include/linux/cred.h                   |    3 +
>   include/linux/nsproxy.h                |    7 ++
>   include/linux/sched/task.h             |    3 +
>   include/linux/syscalls.h               |    1
>   kernel/cred.c                          |   45 +++++++++++++
>   kernel/fork.c                          |  110 ++++++++++++++++++++++++++------
>   kernel/nsproxy.c                       |   11 +++
>   kernel/sys_ni.c                        |    1
>   11 files changed, 157 insertions(+), 28 deletions(-)
> 
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3564814a5d21..8666693510f9 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -408,3 +408,4 @@
>   394	i386	mount_notify		sys_mount_notify		__ia32_sys_mount_notify
>   395	i386	sb_notify		sys_sb_notify			__ia32_sys_sb_notify
>   396	i386	container_create	sys_container_create		__ia32_sys_container_create
> +397	i386	fork_into_container	sys_fork_into_container		__ia32_sys_fork_into_container
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index aa6cccbe5271..d40d4790fcb2 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -353,6 +353,7 @@
>   342	common	mount_notify		__x64_sys_mount_notify
>   343	common	sb_notify		__x64_sys_sb_notify
>   344	common	container_create	__x64_sys_container_create
> +345	common	fork_into_container	__x64_sys_fork_into_container
>   
>   #
>   # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
> index a43212036257..080d9e21b697 100644
> --- a/arch/x86/ia32/sys_ia32.c
> +++ b/arch/x86/ia32/sys_ia32.c
> @@ -238,5 +238,5 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
>   		       unsigned long, tls_val, int __user *, child_tidptr)
>   {
>   	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
> -			tls_val);
> +			tls_val, NULL);
>   }
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index 4907c9df86b3..357e743d5d4a 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -23,6 +23,7 @@
>   
>   struct cred;
>   struct inode;
> +struct container;
>   
>   /*
>    * COW Supplementary groups list
> @@ -155,7 +156,7 @@ struct cred {
>   
>   extern void __put_cred(struct cred *);
>   extern void exit_creds(struct task_struct *);
> -extern int copy_creds(struct task_struct *, unsigned long);
> +extern int copy_creds(struct task_struct *, unsigned long, struct container *);
>   extern const struct cred *get_task_cred(struct task_struct *);
>   extern struct cred *cred_alloc_blank(void);
>   extern struct cred *prepare_creds(void);
> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
> index 2ae1b1a4d84d..81838ae24a92 100644
> --- a/include/linux/nsproxy.h
> +++ b/include/linux/nsproxy.h
> @@ -11,6 +11,7 @@ struct ipc_namespace;
>   struct pid_namespace;
>   struct cgroup_namespace;
>   struct fs_struct;
> +struct container;
>   
>   /*
>    * A structure to contain pointers to all per-process
> @@ -63,9 +64,13 @@ extern struct nsproxy init_nsproxy;
>    *         * /
>    *     task_unlock(task);
>    *
> + *  4. Container namespaces are set at container creation and cannot be
> + *     changed.
> + *
>    */
>   
> -int copy_namespaces(unsigned long flags, struct task_struct *tsk);
> +int copy_namespaces(unsigned long flags, struct task_struct *tsk,
> +		    struct container *dest_container);
>   void exit_task_namespaces(struct task_struct *tsk);
>   void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
>   void free_nsproxy(struct nsproxy *ns);
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index 44c6f15800ff..bdff71b0fb66 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -73,7 +73,8 @@ extern void do_group_exit(int);
>   extern void exit_files(struct task_struct *);
>   extern void exit_itimers(struct signal_struct *);
>   
> -extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
> +extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *,
> +		     int __user *, unsigned long, struct container *);
>   extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
>   struct task_struct *fork_idle(int);
>   extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index dac42098c2dd..15e5cc704df3 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -946,6 +946,7 @@ asmlinkage long sys_sb_notify(int dfd, const char __user *path,
>   asmlinkage long sys_container_create(const char __user *name, unsigned int flags,
>   				     unsigned long spare3, unsigned long spare4,
>   				     unsigned long spare5);
> +asmlinkage long sys_fork_into_container(int containerfd);
>   
>   /*
>    * Architecture-specific system calls
> diff --git a/kernel/cred.c b/kernel/cred.c
> index 21f4a97085b4..f0ee5cec533d 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -313,6 +313,43 @@ struct cred *prepare_exec_creds(void)
>   	return new;
>   }
>   
> +/*
> + * Handle forking a process into a container.
> + */
> +static struct cred *copy_container_creds(struct container *dest_container)
> +{
> +	struct cred *new;
> +
> +	validate_process_creds();
> +
> +	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
> +	if (!new)
> +		return NULL;
> +
> +	kdebug("prepare_creds() alloc %p", new);
> +
> +	memcpy(new, dest_container->cred, sizeof(struct cred));
> +
> +	atomic_set(&new->usage, 1);
> +	set_cred_subscribers(new, 0);
> +	get_group_info(new->group_info);
> +	get_uid(new->user);
> +	get_user_ns(new->user_ns);
> +
> +#ifdef CONFIG_SECURITY
> +	new->security = NULL;
> +#endif
> +
> +	if (security_prepare_creds(new, dest_container->cred, GFP_KERNEL) < 0)
> +		goto error;
> +	validate_creds(new);
> +	return new;
> +
> +error:
> +	abort_creds(new);
> +	return NULL;
> +}
> +
>   /*
>    * Copy credentials for the new process created by fork()
>    *
> @@ -322,7 +359,8 @@ struct cred *prepare_exec_creds(void)
>    * The new process gets the current process's subjective credentials as its
>    * objective and subjective credentials
>    */
> -int copy_creds(struct task_struct *p, unsigned long clone_flags)
> +int copy_creds(struct task_struct *p, unsigned long clone_flags,
> +	       struct container *dest_container)
>   {
>   	struct cred *new;
>   	int ret;
> @@ -343,7 +381,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
>   		return 0;
>   	}
>   
> -	new = prepare_creds();
> +	if (dest_container)
> +		new = copy_container_creds(dest_container);

Shouldn't there be a check between the current process' credentials and 
the destination container's credentials before allowing this to occur?

> +	else
> +		new = prepare_creds();
>   	if (!new)
>   		return -ENOMEM;
>   
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 009cf7e63894..71401deb4434 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1385,9 +1385,33 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
>   	return retval;
>   }
>   
> -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
> +static int copy_fs(unsigned long clone_flags, struct task_struct *tsk,
> +		   struct container *dest_container)
>   {
>   	struct fs_struct *fs = current->fs;
> +
> +#ifdef CONFIG_CONTAINERS
> +	if (dest_container) {
> +		fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
> +		if (!fs)
> +			return -ENOMEM;
> +
> +		fs->users = 1;
> +		fs->in_exec = 0;
> +		spin_lock_init(&fs->lock);
> +		seqcount_init(&fs->seq);
> +		fs->umask = 0022;
> +
> +		spin_lock(&dest_container->lock);
> +		fs->pwd = fs->root = dest_container->root;
> +		path_get(&fs->root);
> +		path_get(&fs->pwd);
> +		spin_unlock(&dest_container->lock);
> +		tsk->fs = fs;
> +		return 0;
> +	}
> +#endif
> +
>   	if (clone_flags & CLONE_FS) {
>   		/* tsk->fs is already what we want */
>   		spin_lock(&fs->lock);
> @@ -1679,7 +1703,8 @@ static __latent_entropy struct task_struct *copy_process(
>   					struct pid *pid,
>   					int trace,
>   					unsigned long tls,
> -					int node)
> +					int node,
> +					struct container *dest_container)
>   {
>   	int retval;
>   	struct task_struct *p;
> @@ -1783,7 +1808,7 @@ static __latent_entropy struct task_struct *copy_process(
>   	}
>   	current->flags &= ~PF_NPROC_EXCEEDED;
>   
> -	retval = copy_creds(p, clone_flags);
> +	retval = copy_creds(p, clone_flags, dest_container);
>   	if (retval < 0)
>   		goto bad_fork_free;
>   
> @@ -1905,7 +1930,7 @@ static __latent_entropy struct task_struct *copy_process(
>   	retval = copy_files(clone_flags, p);
>   	if (retval)
>   		goto bad_fork_cleanup_semundo;
> -	retval = copy_fs(clone_flags, p);
> +	retval = copy_fs(clone_flags, p, dest_container);
>   	if (retval)
>   		goto bad_fork_cleanup_files;
>   	retval = copy_sighand(clone_flags, p);
> @@ -1917,15 +1942,15 @@ static __latent_entropy struct task_struct *copy_process(
>   	retval = copy_mm(clone_flags, p);
>   	if (retval)
>   		goto bad_fork_cleanup_signal;
> -	retval = copy_namespaces(clone_flags, p);
> +	retval = copy_container(clone_flags, p, dest_container);
>   	if (retval)
>   		goto bad_fork_cleanup_mm;
> -	retval = copy_container(clone_flags, p, NULL);
> +	retval = copy_namespaces(clone_flags, p, dest_container);
>   	if (retval)
> -		goto bad_fork_cleanup_namespaces;
> +		goto bad_fork_cleanup_container;
>   	retval = copy_io(clone_flags, p);
>   	if (retval)
> -		goto bad_fork_cleanup_container;
> +		goto bad_fork_cleanup_namespaces;
>   	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
>   	if (retval)
>   		goto bad_fork_cleanup_io;
> @@ -2124,10 +2149,10 @@ static __latent_entropy struct task_struct *copy_process(
>   bad_fork_cleanup_io:
>   	if (p->io_context)
>   		exit_io_context(p);
> -bad_fork_cleanup_container:
> -	exit_container(p);
>   bad_fork_cleanup_namespaces:
>   	exit_task_namespaces(p);
> +bad_fork_cleanup_container:
> +	exit_container(p);
>   bad_fork_cleanup_mm:
>   	if (p->mm)
>   		mmput(p->mm);
> @@ -2183,7 +2208,7 @@ struct task_struct *fork_idle(int cpu)
>   {
>   	struct task_struct *task;
>   	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
> -			    cpu_to_node(cpu));
> +			    cpu_to_node(cpu), NULL);
>   	if (!IS_ERR(task)) {
>   		init_idle_pids(task);
>   		init_idle(task, cpu);
> @@ -2195,15 +2220,16 @@ struct task_struct *fork_idle(int cpu)
>   /*
>    *  Ok, this is the main fork-routine.
>    *
> - * It copies the process, and if successful kick-starts
> - * it and waits for it to finish using the VM if required.
> + * It copies the process into the specified container, and if successful
> + * kick-starts it and waits for it to finish using the VM if required.
>    */
>   long _do_fork(unsigned long clone_flags,
>   	      unsigned long stack_start,
>   	      unsigned long stack_size,
>   	      int __user *parent_tidptr,
>   	      int __user *child_tidptr,
> -	      unsigned long tls)
> +	      unsigned long tls,
> +	      struct container *dest_container)
>   {
>   	struct completion vfork;
>   	struct pid *pid;
> @@ -2229,8 +2255,32 @@ long _do_fork(unsigned long clone_flags,
>   			trace = 0;
>   	}
>   
> +	if (dest_container) {
> +		/* A process spawned into a container doesn't share anything
> +		 * with the parent other than namespaces.
> +		 */
> +		if (clone_flags & (CLONE_CHILD_CLEARTID |
> +				   CLONE_CHILD_SETTID |
> +				   CLONE_FILES |
> +				   CLONE_FS |
> +				   CLONE_IO |
> +				   CLONE_PARENT |
> +				   CLONE_PARENT_SETTID |
> +				   CLONE_PTRACE |
> +				   CLONE_SETTLS |
> +				   CLONE_SIGHAND |
> +				   CLONE_SYSVSEM |
> +				   CLONE_THREAD))
> +			return -EINVAL;
> +
> +		/* However, we do have to let kernel threads borrow a VM. */
> +		if ((clone_flags & CLONE_VM) && current->mm)
> +			return -EINVAL;
> +	}
> +	
>   	p = copy_process(clone_flags, stack_start, stack_size,
> -			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
> +			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE,
> +			 dest_container);
>   	add_latent_entropy();
>   
>   	if (IS_ERR(p))
> @@ -2279,7 +2329,7 @@ long do_fork(unsigned long clone_flags,
>   	      int __user *child_tidptr)
>   {
>   	return _do_fork(clone_flags, stack_start, stack_size,
> -			parent_tidptr, child_tidptr, 0);
> +			parent_tidptr, child_tidptr, 0, NULL);
>   }
>   #endif
>   
> @@ -2289,14 +2339,14 @@ long do_fork(unsigned long clone_flags,
>   pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
>   {
>   	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
> -		(unsigned long)arg, NULL, NULL, 0);
> +			(unsigned long)arg, NULL, NULL, 0, NULL);
>   }
>   
>   #ifdef __ARCH_WANT_SYS_FORK
>   SYSCALL_DEFINE0(fork)
>   {
>   #ifdef CONFIG_MMU
> -	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
> +	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0, NULL);
>   #else
>   	/* can not support in nommu mode */
>   	return -EINVAL;
> @@ -2308,7 +2358,26 @@ SYSCALL_DEFINE0(fork)
>   SYSCALL_DEFINE0(vfork)
>   {
>   	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
> -			0, NULL, NULL, 0);
> +			0, NULL, NULL, 0, NULL);
> +}
> +#endif
> +
> +#ifdef CONFIG_CONTAINERS
> +SYSCALL_DEFINE1(fork_into_container, int, containerfd)
> +{
> +	struct fd f = fdget(containerfd);
> +	int ret;
> +
> +	if (!f.file)
> +		return -EBADF;
> +	ret = -EINVAL;
> +	if (is_container_file(f.file)) {
> +		struct container *dest_container = f.file->private_data;
> +
> +		ret = _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0, dest_container);
> +	}
> +	fdput(f);
> +	return ret;
>   }
>   #endif
>   
> @@ -2336,7 +2405,8 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
>   		 unsigned long, tls)
>   #endif
>   {
> -	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
> +	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls,
> +			NULL);
>   }
>   #endif
>   
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 4bb5184b3a80..4031075300a4 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -136,12 +136,19 @@ struct nsproxy *create_new_namespaces(unsigned long flags,
>    * called from clone.  This now handles copy for nsproxy and all
>    * namespaces therein.
>    */
> -int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> +int copy_namespaces(unsigned long flags, struct task_struct *tsk,
> +		    struct container *dest_container)
>   {
>   	struct nsproxy *old_ns = tsk->nsproxy;
>   	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
>   	struct nsproxy *new_ns;
>   
> +	if (dest_container) {
> +		get_nsproxy(dest_container->ns);
> +		tsk->nsproxy = dest_container->ns;
> +		return 0;
> +	}
> +
>   	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
>   			      CLONE_NEWPID | CLONE_NEWNET |
>   			      CLONE_NEWCGROUP)))) {
> @@ -163,7 +170,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>   		(CLONE_NEWIPC | CLONE_SYSVSEM))
>   		return -EINVAL;
>   
> -	new_ns = create_new_namespaces(flags, tsk->nsproxy, user_ns, tsk->fs);
> +	new_ns = create_new_namespaces(flags, old_ns, user_ns, tsk->fs);
>   	if (IS_ERR(new_ns))
>   		return  PTR_ERR(new_ns);
>   
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index f0455cbb91cf..a23ad529d548 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -144,6 +144,7 @@ COND_SYSCALL(container_create);
>   /* kernel/exit.c */
>   
>   /* kernel/fork.c */
> +COND_SYSCALL(fork_into_container);
>   
>   /* kernel/futex.c */
>   COND_SYSCALL(futex);
>
Eric W. Biederman Feb. 19, 2019, 4:39 p.m. UTC | #2
David Howells <dhowells@redhat.com> writes:

> Allow a single process to be forked directly into a container using a new
> syscall, thereby 'booting' the container:
>
> 	pid_t pid = fork_into_container(int container_fd);
>
> This process will be the 'init' process of the container.
>
> Further attempts to fork into the container will be rejected.

So you are breaking nsenter, and its like.

There are no technical reasons to disallow this, and many good practical
reasons to allow this.

Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>

> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
>
>  arch/x86/entry/syscalls/syscall_32.tbl |    1 
>  arch/x86/entry/syscalls/syscall_64.tbl |    1 
>  arch/x86/ia32/sys_ia32.c               |    2 -
>  include/linux/cred.h                   |    3 +
>  include/linux/nsproxy.h                |    7 ++
>  include/linux/sched/task.h             |    3 +
>  include/linux/syscalls.h               |    1 
>  kernel/cred.c                          |   45 +++++++++++++
>  kernel/fork.c                          |  110 ++++++++++++++++++++++++++------
>  kernel/nsproxy.c                       |   11 +++
>  kernel/sys_ni.c                        |    1 
>  11 files changed, 157 insertions(+), 28 deletions(-)
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 3564814a5d21..8666693510f9 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -408,3 +408,4 @@
>  394	i386	mount_notify		sys_mount_notify		__ia32_sys_mount_notify
>  395	i386	sb_notify		sys_sb_notify			__ia32_sys_sb_notify
>  396	i386	container_create	sys_container_create		__ia32_sys_container_create
> +397	i386	fork_into_container	sys_fork_into_container		__ia32_sys_fork_into_container
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index aa6cccbe5271..d40d4790fcb2 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -353,6 +353,7 @@
>  342	common	mount_notify		__x64_sys_mount_notify
>  343	common	sb_notify		__x64_sys_sb_notify
>  344	common	container_create	__x64_sys_container_create
> +345	common	fork_into_container	__x64_sys_fork_into_container
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
> index a43212036257..080d9e21b697 100644
> --- a/arch/x86/ia32/sys_ia32.c
> +++ b/arch/x86/ia32/sys_ia32.c
> @@ -238,5 +238,5 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
>  		       unsigned long, tls_val, int __user *, child_tidptr)
>  {
>  	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
> -			tls_val);
> +			tls_val, NULL);
>  }
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index 4907c9df86b3..357e743d5d4a 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -23,6 +23,7 @@
>  
>  struct cred;
>  struct inode;
> +struct container;
>  
>  /*
>   * COW Supplementary groups list
> @@ -155,7 +156,7 @@ struct cred {
>  
>  extern void __put_cred(struct cred *);
>  extern void exit_creds(struct task_struct *);
> -extern int copy_creds(struct task_struct *, unsigned long);
> +extern int copy_creds(struct task_struct *, unsigned long, struct container *);
>  extern const struct cred *get_task_cred(struct task_struct *);
>  extern struct cred *cred_alloc_blank(void);
>  extern struct cred *prepare_creds(void);
> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
> index 2ae1b1a4d84d..81838ae24a92 100644
> --- a/include/linux/nsproxy.h
> +++ b/include/linux/nsproxy.h
> @@ -11,6 +11,7 @@ struct ipc_namespace;
>  struct pid_namespace;
>  struct cgroup_namespace;
>  struct fs_struct;
> +struct container;
>  
>  /*
>   * A structure to contain pointers to all per-process
> @@ -63,9 +64,13 @@ extern struct nsproxy init_nsproxy;
>   *         * /
>   *     task_unlock(task);
>   *
> + *  4. Container namespaces are set at container creation and cannot be
> + *     changed.
> + *
>   */
>  
> -int copy_namespaces(unsigned long flags, struct task_struct *tsk);
> +int copy_namespaces(unsigned long flags, struct task_struct *tsk,
> +		    struct container *dest_container);
>  void exit_task_namespaces(struct task_struct *tsk);
>  void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
>  void free_nsproxy(struct nsproxy *ns);
> diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
> index 44c6f15800ff..bdff71b0fb66 100644
> --- a/include/linux/sched/task.h
> +++ b/include/linux/sched/task.h
> @@ -73,7 +73,8 @@ extern void do_group_exit(int);
>  extern void exit_files(struct task_struct *);
>  extern void exit_itimers(struct signal_struct *);
>  
> -extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
> +extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *,
> +		     int __user *, unsigned long, struct container *);
>  extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
>  struct task_struct *fork_idle(int);
>  extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index dac42098c2dd..15e5cc704df3 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -946,6 +946,7 @@ asmlinkage long sys_sb_notify(int dfd, const char __user *path,
>  asmlinkage long sys_container_create(const char __user *name, unsigned int flags,
>  				     unsigned long spare3, unsigned long spare4,
>  				     unsigned long spare5);
> +asmlinkage long sys_fork_into_container(int containerfd);
>  
>  /*
>   * Architecture-specific system calls
> diff --git a/kernel/cred.c b/kernel/cred.c
> index 21f4a97085b4..f0ee5cec533d 100644
> --- a/kernel/cred.c
> +++ b/kernel/cred.c
> @@ -313,6 +313,43 @@ struct cred *prepare_exec_creds(void)
>  	return new;
>  }
>  
> +/*
> + * Handle forking a process into a container.
> + */
> +static struct cred *copy_container_creds(struct container *dest_container)
> +{
> +	struct cred *new;
> +
> +	validate_process_creds();
> +
> +	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
> +	if (!new)
> +		return NULL;
> +
> +	kdebug("prepare_creds() alloc %p", new);
> +
> +	memcpy(new, dest_container->cred, sizeof(struct cred));
> +
> +	atomic_set(&new->usage, 1);
> +	set_cred_subscribers(new, 0);
> +	get_group_info(new->group_info);
> +	get_uid(new->user);
> +	get_user_ns(new->user_ns);
> +
> +#ifdef CONFIG_SECURITY
> +	new->security = NULL;
> +#endif
> +
> +	if (security_prepare_creds(new, dest_container->cred, GFP_KERNEL) < 0)
> +		goto error;
> +	validate_creds(new);
> +	return new;
> +
> +error:
> +	abort_creds(new);
> +	return NULL;
> +}
> +
>  /*
>   * Copy credentials for the new process created by fork()
>   *
> @@ -322,7 +359,8 @@ struct cred *prepare_exec_creds(void)
>   * The new process gets the current process's subjective credentials as its
>   * objective and subjective credentials
>   */
> -int copy_creds(struct task_struct *p, unsigned long clone_flags)
> +int copy_creds(struct task_struct *p, unsigned long clone_flags,
> +	       struct container *dest_container)
>  {
>  	struct cred *new;
>  	int ret;
> @@ -343,7 +381,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
>  		return 0;
>  	}
>  
> -	new = prepare_creds();
> +	if (dest_container)
> +		new = copy_container_creds(dest_container);
> +	else
> +		new = prepare_creds();
>  	if (!new)
>  		return -ENOMEM;
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 009cf7e63894..71401deb4434 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1385,9 +1385,33 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
>  	return retval;
>  }
>  
> -static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
> +static int copy_fs(unsigned long clone_flags, struct task_struct *tsk,
> +		   struct container *dest_container)
>  {
>  	struct fs_struct *fs = current->fs;
> +
> +#ifdef CONFIG_CONTAINERS
> +	if (dest_container) {
> +		fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
> +		if (!fs)
> +			return -ENOMEM;
> +
> +		fs->users = 1;
> +		fs->in_exec = 0;
> +		spin_lock_init(&fs->lock);
> +		seqcount_init(&fs->seq);
> +		fs->umask = 0022;
> +
> +		spin_lock(&dest_container->lock);
> +		fs->pwd = fs->root = dest_container->root;
> +		path_get(&fs->root);
> +		path_get(&fs->pwd);
> +		spin_unlock(&dest_container->lock);
> +		tsk->fs = fs;
> +		return 0;
> +	}
> +#endif
> +
>  	if (clone_flags & CLONE_FS) {
>  		/* tsk->fs is already what we want */
>  		spin_lock(&fs->lock);
> @@ -1679,7 +1703,8 @@ static __latent_entropy struct task_struct *copy_process(
>  					struct pid *pid,
>  					int trace,
>  					unsigned long tls,
> -					int node)
> +					int node,
> +					struct container *dest_container)
>  {
>  	int retval;
>  	struct task_struct *p;
> @@ -1783,7 +1808,7 @@ static __latent_entropy struct task_struct *copy_process(
>  	}
>  	current->flags &= ~PF_NPROC_EXCEEDED;
>  
> -	retval = copy_creds(p, clone_flags);
> +	retval = copy_creds(p, clone_flags, dest_container);
>  	if (retval < 0)
>  		goto bad_fork_free;
>  
> @@ -1905,7 +1930,7 @@ static __latent_entropy struct task_struct *copy_process(
>  	retval = copy_files(clone_flags, p);
>  	if (retval)
>  		goto bad_fork_cleanup_semundo;
> -	retval = copy_fs(clone_flags, p);
> +	retval = copy_fs(clone_flags, p, dest_container);
>  	if (retval)
>  		goto bad_fork_cleanup_files;
>  	retval = copy_sighand(clone_flags, p);
> @@ -1917,15 +1942,15 @@ static __latent_entropy struct task_struct *copy_process(
>  	retval = copy_mm(clone_flags, p);
>  	if (retval)
>  		goto bad_fork_cleanup_signal;
> -	retval = copy_namespaces(clone_flags, p);
> +	retval = copy_container(clone_flags, p, dest_container);
>  	if (retval)
>  		goto bad_fork_cleanup_mm;
> -	retval = copy_container(clone_flags, p, NULL);
> +	retval = copy_namespaces(clone_flags, p, dest_container);
>  	if (retval)
> -		goto bad_fork_cleanup_namespaces;
> +		goto bad_fork_cleanup_container;
>  	retval = copy_io(clone_flags, p);
>  	if (retval)
> -		goto bad_fork_cleanup_container;
> +		goto bad_fork_cleanup_namespaces;
>  	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
>  	if (retval)
>  		goto bad_fork_cleanup_io;
> @@ -2124,10 +2149,10 @@ static __latent_entropy struct task_struct *copy_process(
>  bad_fork_cleanup_io:
>  	if (p->io_context)
>  		exit_io_context(p);
> -bad_fork_cleanup_container:
> -	exit_container(p);
>  bad_fork_cleanup_namespaces:
>  	exit_task_namespaces(p);
> +bad_fork_cleanup_container:
> +	exit_container(p);
>  bad_fork_cleanup_mm:
>  	if (p->mm)
>  		mmput(p->mm);
> @@ -2183,7 +2208,7 @@ struct task_struct *fork_idle(int cpu)
>  {
>  	struct task_struct *task;
>  	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
> -			    cpu_to_node(cpu));
> +			    cpu_to_node(cpu), NULL);
>  	if (!IS_ERR(task)) {
>  		init_idle_pids(task);
>  		init_idle(task, cpu);
> @@ -2195,15 +2220,16 @@ struct task_struct *fork_idle(int cpu)
>  /*
>   *  Ok, this is the main fork-routine.
>   *
> - * It copies the process, and if successful kick-starts
> - * it and waits for it to finish using the VM if required.
> + * It copies the process into the specified container, and if successful
> + * kick-starts it and waits for it to finish using the VM if required.
>   */
>  long _do_fork(unsigned long clone_flags,
>  	      unsigned long stack_start,
>  	      unsigned long stack_size,
>  	      int __user *parent_tidptr,
>  	      int __user *child_tidptr,
> -	      unsigned long tls)
> +	      unsigned long tls,
> +	      struct container *dest_container)
>  {
>  	struct completion vfork;
>  	struct pid *pid;
> @@ -2229,8 +2255,32 @@ long _do_fork(unsigned long clone_flags,
>  			trace = 0;
>  	}
>  
> +	if (dest_container) {
> +		/* A process spawned into a container doesn't share anything
> +		 * with the parent other than namespaces.
> +		 */
> +		if (clone_flags & (CLONE_CHILD_CLEARTID |
> +				   CLONE_CHILD_SETTID |
> +				   CLONE_FILES |
> +				   CLONE_FS |
> +				   CLONE_IO |
> +				   CLONE_PARENT |
> +				   CLONE_PARENT_SETTID |
> +				   CLONE_PTRACE |
> +				   CLONE_SETTLS |
> +				   CLONE_SIGHAND |
> +				   CLONE_SYSVSEM |
> +				   CLONE_THREAD))
> +			return -EINVAL;
> +
> +		/* However, we do have to let kernel threads borrow a VM. */
> +		if ((clone_flags & CLONE_VM) && current->mm)
> +			return -EINVAL;
> +	}
> +	
>  	p = copy_process(clone_flags, stack_start, stack_size,
> -			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
> +			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE,
> +			 dest_container);
>  	add_latent_entropy();
>  
>  	if (IS_ERR(p))
> @@ -2279,7 +2329,7 @@ long do_fork(unsigned long clone_flags,
>  	      int __user *child_tidptr)
>  {
>  	return _do_fork(clone_flags, stack_start, stack_size,
> -			parent_tidptr, child_tidptr, 0);
> +			parent_tidptr, child_tidptr, 0, NULL);
>  }
>  #endif
>  
> @@ -2289,14 +2339,14 @@ long do_fork(unsigned long clone_flags,
>  pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
>  {
>  	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
> -		(unsigned long)arg, NULL, NULL, 0);
> +			(unsigned long)arg, NULL, NULL, 0, NULL);
>  }
>  
>  #ifdef __ARCH_WANT_SYS_FORK
>  SYSCALL_DEFINE0(fork)
>  {
>  #ifdef CONFIG_MMU
> -	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
> +	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0, NULL);
>  #else
>  	/* can not support in nommu mode */
>  	return -EINVAL;
> @@ -2308,7 +2358,26 @@ SYSCALL_DEFINE0(fork)
>  SYSCALL_DEFINE0(vfork)
>  {
>  	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
> -			0, NULL, NULL, 0);
> +			0, NULL, NULL, 0, NULL);
> +}
> +#endif
> +
> +#ifdef CONFIG_CONTAINERS
> +SYSCALL_DEFINE1(fork_into_container, int, containerfd)
> +{
> +	struct fd f = fdget(containerfd);
> +	int ret;
> +
> +	if (!f.file)
> +		return -EBADF;
> +	ret = -EINVAL;
> +	if (is_container_file(f.file)) {
> +		struct container *dest_container = f.file->private_data;
> +
> +		ret = _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0, dest_container);
> +	}
> +	fdput(f);
> +	return ret;
>  }
>  #endif
>  
> @@ -2336,7 +2405,8 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
>  		 unsigned long, tls)
>  #endif
>  {
> -	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
> +	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls,
> +			NULL);
>  }
>  #endif
>  
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 4bb5184b3a80..4031075300a4 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -136,12 +136,19 @@ struct nsproxy *create_new_namespaces(unsigned long flags,
>   * called from clone.  This now handles copy for nsproxy and all
>   * namespaces therein.
>   */
> -int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> +int copy_namespaces(unsigned long flags, struct task_struct *tsk,
> +		    struct container *dest_container)
>  {
>  	struct nsproxy *old_ns = tsk->nsproxy;
>  	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
>  	struct nsproxy *new_ns;
>  
> +	if (dest_container) {
> +		get_nsproxy(dest_container->ns);
> +		tsk->nsproxy = dest_container->ns;
> +		return 0;
> +	}
> +
>  	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
>  			      CLONE_NEWPID | CLONE_NEWNET |
>  			      CLONE_NEWCGROUP)))) {
> @@ -163,7 +170,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>  		(CLONE_NEWIPC | CLONE_SYSVSEM)) 
>  		return -EINVAL;
>  
> -	new_ns = create_new_namespaces(flags, tsk->nsproxy, user_ns, tsk->fs);
> +	new_ns = create_new_namespaces(flags, old_ns, user_ns, tsk->fs);
>  	if (IS_ERR(new_ns))
>  		return  PTR_ERR(new_ns);
>  
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index f0455cbb91cf..a23ad529d548 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -144,6 +144,7 @@ COND_SYSCALL(container_create);
>  /* kernel/exit.c */
>  
>  /* kernel/fork.c */
> +COND_SYSCALL(fork_into_container);
>  
>  /* kernel/futex.c */
>  COND_SYSCALL(futex);
David Howells Feb. 19, 2019, 11:16 p.m. UTC | #3
Eric W. Biederman <ebiederm@xmission.com> wrote:

> > Further attempts to fork into the container will be rejected.
>
> There are no technical reasons to disallow this, and many good practical
> reasons to allow this.

Fair enough; that can be done.  Could even emulate /sbin/request-key upcalling
that way, with the manager spawning the daemon into the container with it.

> So you are breaking nsenter, and its like.

It shouldn't stop nsenter() from working.

David
diff mbox series

Patch

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3564814a5d21..8666693510f9 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -408,3 +408,4 @@ 
 394	i386	mount_notify		sys_mount_notify		__ia32_sys_mount_notify
 395	i386	sb_notify		sys_sb_notify			__ia32_sys_sb_notify
 396	i386	container_create	sys_container_create		__ia32_sys_container_create
+397	i386	fork_into_container	sys_fork_into_container		__ia32_sys_fork_into_container
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index aa6cccbe5271..d40d4790fcb2 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -353,6 +353,7 @@ 
 342	common	mount_notify		__x64_sys_mount_notify
 343	common	sb_notify		__x64_sys_sb_notify
 344	common	container_create	__x64_sys_container_create
+345	common	fork_into_container	__x64_sys_fork_into_container
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index a43212036257..080d9e21b697 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -238,5 +238,5 @@  COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
 		       unsigned long, tls_val, int __user *, child_tidptr)
 {
 	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
-			tls_val);
+			tls_val, NULL);
 }
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4907c9df86b3..357e743d5d4a 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -23,6 +23,7 @@ 
 
 struct cred;
 struct inode;
+struct container;
 
 /*
  * COW Supplementary groups list
@@ -155,7 +156,7 @@  struct cred {
 
 extern void __put_cred(struct cred *);
 extern void exit_creds(struct task_struct *);
-extern int copy_creds(struct task_struct *, unsigned long);
+extern int copy_creds(struct task_struct *, unsigned long, struct container *);
 extern const struct cred *get_task_cred(struct task_struct *);
 extern struct cred *cred_alloc_blank(void);
 extern struct cred *prepare_creds(void);
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 2ae1b1a4d84d..81838ae24a92 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -11,6 +11,7 @@  struct ipc_namespace;
 struct pid_namespace;
 struct cgroup_namespace;
 struct fs_struct;
+struct container;
 
 /*
  * A structure to contain pointers to all per-process
@@ -63,9 +64,13 @@  extern struct nsproxy init_nsproxy;
  *         * /
  *     task_unlock(task);
  *
+ *  4. Container namespaces are set at container creation and cannot be
+ *     changed.
+ *
  */
 
-int copy_namespaces(unsigned long flags, struct task_struct *tsk);
+int copy_namespaces(unsigned long flags, struct task_struct *tsk,
+		    struct container *dest_container);
 void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
 void free_nsproxy(struct nsproxy *ns);
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 44c6f15800ff..bdff71b0fb66 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -73,7 +73,8 @@  extern void do_group_exit(int);
 extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
-extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
+extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *,
+		     int __user *, unsigned long, struct container *);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index dac42098c2dd..15e5cc704df3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -946,6 +946,7 @@  asmlinkage long sys_sb_notify(int dfd, const char __user *path,
 asmlinkage long sys_container_create(const char __user *name, unsigned int flags,
 				     unsigned long spare3, unsigned long spare4,
 				     unsigned long spare5);
+asmlinkage long sys_fork_into_container(int containerfd);
 
 /*
  * Architecture-specific system calls
diff --git a/kernel/cred.c b/kernel/cred.c
index 21f4a97085b4..f0ee5cec533d 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -313,6 +313,43 @@  struct cred *prepare_exec_creds(void)
 	return new;
 }
 
+/*
+ * Copy a container's creds for a task forked into it; NULL on failure.
+ */
+static struct cred *copy_container_creds(struct container *dest_container)
+{
+	struct cred *new;
+
+	validate_process_creds();
+
+	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	kdebug("prepare_creds() alloc %p", new); /* NOTE(review): message names prepare_creds(), not this function */
+
+	memcpy(new, dest_container->cred, sizeof(struct cred)); /* template is the container's cred, not the caller's */
+
+	atomic_set(&new->usage, 1); /* the copy starts out with a single reference */
+	set_cred_subscribers(new, 0);
+	get_group_info(new->group_info); /* pin the objects whose pointers the memcpy duplicated */
+	get_uid(new->user);
+	get_user_ns(new->user_ns); /* NOTE(review): any keyring pointers copied above get no extra ref here - confirm */
+
+#ifdef CONFIG_SECURITY
+	new->security = NULL; /* clear the copied pointer before security_prepare_creds() */
+#endif
+
+	if (security_prepare_creds(new, dest_container->cred, GFP_KERNEL) < 0)
+		goto error;
+	validate_creds(new);
+	return new;
+
+error:
+	abort_creds(new); /* discard the partially set-up copy */
+	return NULL;
+}
+
 /*
  * Copy credentials for the new process created by fork()
  *
@@ -322,7 +359,8 @@  struct cred *prepare_exec_creds(void)
  * The new process gets the current process's subjective credentials as its
  * objective and subjective credentials
  */
-int copy_creds(struct task_struct *p, unsigned long clone_flags)
+int copy_creds(struct task_struct *p, unsigned long clone_flags,
+	       struct container *dest_container)
 {
 	struct cred *new;
 	int ret;
@@ -343,7 +381,10 @@  int copy_creds(struct task_struct *p, unsigned long clone_flags)
 		return 0;
 	}
 
-	new = prepare_creds();
+	if (dest_container)
+		new = copy_container_creds(dest_container);
+	else
+		new = prepare_creds();
 	if (!new)
 		return -ENOMEM;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 009cf7e63894..71401deb4434 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1385,9 +1385,33 @@  static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	return retval;
 }
 
-static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_fs(unsigned long clone_flags, struct task_struct *tsk,
+		   struct container *dest_container)
 {
 	struct fs_struct *fs = current->fs;
+
+#ifdef CONFIG_CONTAINERS
+	if (dest_container) {
+		fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+		if (!fs)
+			return -ENOMEM;
+
+		fs->users = 1;
+		fs->in_exec = 0;
+		spin_lock_init(&fs->lock);
+		seqcount_init(&fs->seq);
+		fs->umask = 0022;
+
+		spin_lock(&dest_container->lock);
+		fs->pwd = fs->root = dest_container->root;
+		path_get(&fs->root);
+		path_get(&fs->pwd);
+		spin_unlock(&dest_container->lock);
+		tsk->fs = fs;
+		return 0;
+	}
+#endif
+
 	if (clone_flags & CLONE_FS) {
 		/* tsk->fs is already what we want */
 		spin_lock(&fs->lock);
@@ -1679,7 +1703,8 @@  static __latent_entropy struct task_struct *copy_process(
 					struct pid *pid,
 					int trace,
 					unsigned long tls,
-					int node)
+					int node,
+					struct container *dest_container)
 {
 	int retval;
 	struct task_struct *p;
@@ -1783,7 +1808,7 @@  static __latent_entropy struct task_struct *copy_process(
 	}
 	current->flags &= ~PF_NPROC_EXCEEDED;
 
-	retval = copy_creds(p, clone_flags);
+	retval = copy_creds(p, clone_flags, dest_container);
 	if (retval < 0)
 		goto bad_fork_free;
 
@@ -1905,7 +1930,7 @@  static __latent_entropy struct task_struct *copy_process(
 	retval = copy_files(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_semundo;
-	retval = copy_fs(clone_flags, p);
+	retval = copy_fs(clone_flags, p, dest_container);
 	if (retval)
 		goto bad_fork_cleanup_files;
 	retval = copy_sighand(clone_flags, p);
@@ -1917,15 +1942,15 @@  static __latent_entropy struct task_struct *copy_process(
 	retval = copy_mm(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_signal;
-	retval = copy_namespaces(clone_flags, p);
+	retval = copy_container(clone_flags, p, dest_container);
 	if (retval)
 		goto bad_fork_cleanup_mm;
-	retval = copy_container(clone_flags, p, NULL);
+	retval = copy_namespaces(clone_flags, p, dest_container);
 	if (retval)
-		goto bad_fork_cleanup_namespaces;
+		goto bad_fork_cleanup_container;
 	retval = copy_io(clone_flags, p);
 	if (retval)
-		goto bad_fork_cleanup_container;
+		goto bad_fork_cleanup_namespaces;
 	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
 	if (retval)
 		goto bad_fork_cleanup_io;
@@ -2124,10 +2149,10 @@  static __latent_entropy struct task_struct *copy_process(
 bad_fork_cleanup_io:
 	if (p->io_context)
 		exit_io_context(p);
-bad_fork_cleanup_container:
-	exit_container(p);
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
+bad_fork_cleanup_container:
+	exit_container(p);
 bad_fork_cleanup_mm:
 	if (p->mm)
 		mmput(p->mm);
@@ -2183,7 +2208,7 @@  struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
 	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
-			    cpu_to_node(cpu));
+			    cpu_to_node(cpu), NULL);
 	if (!IS_ERR(task)) {
 		init_idle_pids(task);
 		init_idle(task, cpu);
@@ -2195,15 +2220,16 @@  struct task_struct *fork_idle(int cpu)
 /*
  *  Ok, this is the main fork-routine.
  *
- * It copies the process, and if successful kick-starts
- * it and waits for it to finish using the VM if required.
+ * It copies the process into the specified container, and if successful
+ * kick-starts it and waits for it to finish using the VM if required.
  */
 long _do_fork(unsigned long clone_flags,
 	      unsigned long stack_start,
 	      unsigned long stack_size,
 	      int __user *parent_tidptr,
 	      int __user *child_tidptr,
-	      unsigned long tls)
+	      unsigned long tls,
+	      struct container *dest_container)
 {
 	struct completion vfork;
 	struct pid *pid;
@@ -2229,8 +2255,32 @@  long _do_fork(unsigned long clone_flags,
 			trace = 0;
 	}
 
+	if (dest_container) {
+		/* A process spawned into a container doesn't share anything
+		 * with the parent other than namespaces.
+		 */
+		if (clone_flags & (CLONE_CHILD_CLEARTID |
+				   CLONE_CHILD_SETTID |
+				   CLONE_FILES |
+				   CLONE_FS |
+				   CLONE_IO |
+				   CLONE_PARENT |
+				   CLONE_PARENT_SETTID |
+				   CLONE_PTRACE |
+				   CLONE_SETTLS |
+				   CLONE_SIGHAND |
+				   CLONE_SYSVSEM |
+				   CLONE_THREAD))
+			return -EINVAL;
+
+		/* However, we do have to let kernel threads borrow a VM. */
+		if ((clone_flags & CLONE_VM) && current->mm)
+			return -EINVAL;
+	}
+	
 	p = copy_process(clone_flags, stack_start, stack_size,
-			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE,
+			 dest_container);
 	add_latent_entropy();
 
 	if (IS_ERR(p))
@@ -2279,7 +2329,7 @@  long do_fork(unsigned long clone_flags,
 	      int __user *child_tidptr)
 {
 	return _do_fork(clone_flags, stack_start, stack_size,
-			parent_tidptr, child_tidptr, 0);
+			parent_tidptr, child_tidptr, 0, NULL);
 }
 #endif
 
@@ -2289,14 +2339,14 @@  long do_fork(unsigned long clone_flags,
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
 	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-		(unsigned long)arg, NULL, NULL, 0);
+			(unsigned long)arg, NULL, NULL, 0, NULL);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0, NULL);
 #else
 	/* can not support in nommu mode */
 	return -EINVAL;
@@ -2308,7 +2358,26 @@  SYSCALL_DEFINE0(fork)
 SYSCALL_DEFINE0(vfork)
 {
 	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-			0, NULL, NULL, 0);
+			0, NULL, NULL, 0, NULL);
+}
+#endif
+
+#ifdef CONFIG_CONTAINERS
+SYSCALL_DEFINE1(fork_into_container, int, containerfd) /* fork the caller into the container referred to by containerfd */
+{
+	struct fd f = fdget(containerfd);
+	int ret; /* NOTE(review): _do_fork() is declared to return long */
+
+	if (!f.file)
+		return -EBADF; /* fdget() found no file for containerfd */
+	ret = -EINVAL; /* result if the file is not a container file */
+	if (is_container_file(f.file)) {
+		struct container *dest_container = f.file->private_data;
+
+		ret = _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0, dest_container); /* same arguments as fork(), plus the destination container */
+	}
+	fdput(f);
+	return ret;
 }
 #endif /* CONFIG_CONTAINERS */
 
@@ -2336,7 +2405,8 @@  SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		 unsigned long, tls)
 #endif
 {
-	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls,
+			NULL);
 }
 #endif
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 4bb5184b3a80..4031075300a4 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -136,12 +136,19 @@  struct nsproxy *create_new_namespaces(unsigned long flags,
  * called from clone.  This now handles copy for nsproxy and all
  * namespaces therein.
  */
-int copy_namespaces(unsigned long flags, struct task_struct *tsk)
+int copy_namespaces(unsigned long flags, struct task_struct *tsk,
+		    struct container *dest_container)
 {
 	struct nsproxy *old_ns = tsk->nsproxy;
 	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
 	struct nsproxy *new_ns;
 
+	if (dest_container) {
+		get_nsproxy(dest_container->ns);
+		tsk->nsproxy = dest_container->ns;
+		return 0;
+	}
+
 	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
 			      CLONE_NEWPID | CLONE_NEWNET |
 			      CLONE_NEWCGROUP)))) {
@@ -163,7 +170,7 @@  int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 		(CLONE_NEWIPC | CLONE_SYSVSEM)) 
 		return -EINVAL;
 
-	new_ns = create_new_namespaces(flags, tsk->nsproxy, user_ns, tsk->fs);
+	new_ns = create_new_namespaces(flags, old_ns, user_ns, tsk->fs);
 	if (IS_ERR(new_ns))
 		return  PTR_ERR(new_ns);
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index f0455cbb91cf..a23ad529d548 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -144,6 +144,7 @@  COND_SYSCALL(container_create);
 /* kernel/exit.c */
 
 /* kernel/fork.c */
+COND_SYSCALL(fork_into_container);
 
 /* kernel/futex.c */
 COND_SYSCALL(futex);