@@ -477,6 +477,7 @@ quiet_ni_syscall:
PTREGSCALL stub32_clone, sys32_clone, %rdx
PTREGSCALL stub32_vfork, sys_vfork, %rdi
PTREGSCALL stub32_iopl, sys_iopl, %rsi
+ PTREGSCALL stub32_eclone, sys_eclone, %r8
ENTRY(ia32_ptregs_common)
popq %r11
@@ -842,4 +843,5 @@ ia32_sys_call_table:
.quad compat_sys_rt_tgsigqueueinfo /* 335 */
.quad sys_perf_event_open
.quad compat_sys_recvmmsg
+ .quad stub32_eclone
ia32_syscall_end:
@@ -27,6 +27,8 @@ long sys_execve(char __user *, char __user * __user *,
char __user * __user *, struct pt_regs *);
long sys_clone(unsigned long, unsigned long, void __user *,
void __user *, struct pt_regs *);
+long sys_eclone(unsigned flags_low, struct clone_args __user *uca,
+ int args_size, pid_t __user *pids, struct pt_regs *regs);
/* kernel/ldt.c */
asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
@@ -343,10 +343,11 @@
#define __NR_rt_tgsigqueueinfo 335
#define __NR_perf_event_open 336
#define __NR_recvmmsg 337
+#define __NR_eclone 338
#ifdef __KERNEL__
-#define NR_syscalls 338
+#define NR_syscalls 339
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
@@ -663,6 +663,8 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
#define __NR_recvmmsg 299
__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_eclone 300
+__SYSCALL(__NR_eclone, stub_eclone)
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
@@ -758,6 +758,19 @@ ptregs_##name: \
addl $4,%esp; \
ret
+/*
+ * PTREGSCALL4(name): emit the stub ptregs_<name>, which calls sys_<name>.
+ *
+ * The stub takes the address 4 bytes above %esp (presumably the saved
+ * register frame just past the return address -- confirm against the
+ * entry code), then:
+ *  - pushes that address and the frame's PT_ESI slot onto the stack,
+ *  - loads the PT_EBX, PT_ECX and PT_EDX slots into %eax, %edx and %ecx
+ *    respectively,
+ *  - calls sys_<name> and drops the two pushed words (8 bytes).
+ *
+ * %eax is loaded last because it is the base register for every PT_*
+ * access before it.
+ */
+#define PTREGSCALL4(name) \
+ ALIGN; \
+ptregs_##name: \
+ leal 4(%esp),%eax; \
+ pushl %eax; \
+ pushl PT_ESI(%eax); \
+ movl PT_EDX(%eax),%ecx; \
+ movl PT_ECX(%eax),%edx; \
+ movl PT_EBX(%eax),%eax; \
+ call sys_##name; \
+ addl $8,%esp; \
+ ret
+
PTREGSCALL1(iopl)
PTREGSCALL0(fork)
PTREGSCALL0(vfork)
@@ -767,6 +780,7 @@ PTREGSCALL0(sigreturn)
PTREGSCALL0(rt_sigreturn)
PTREGSCALL2(vm86)
PTREGSCALL1(vm86old)
+PTREGSCALL4(eclone)
/* Clone is an oddball. The 4th arg is in %edi */
ALIGN;
@@ -698,6 +698,7 @@ END(\label)
PTREGSCALL stub_vfork, sys_vfork, %rdi
PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
PTREGSCALL stub_iopl, sys_iopl, %rsi
+ PTREGSCALL stub_eclone, sys_eclone, %r8
ENTRY(ptregscall_common)
DEFAULT_FRAME 1 8 /* offset 8: return address */
@@ -259,6 +259,45 @@ sys_clone(unsigned long clone_flags, unsigned long newsp,
return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}
+/*
+ * sys_eclone() - clone variant that takes its extended arguments from a
+ * user-space struct clone_args.
+ *
+ * @flags_low: low word of the clone flags
+ * @uca:       user pointer to the struct clone_args to read
+ * @args_size: size of *@uca as seen by the caller
+ * @pids:      user array holding the kca.nr_pids requested pids
+ * @regs:      register state of the calling task
+ *
+ * Returns a negative errno if the arguments are rejected, otherwise
+ * whatever do_fork_with_pids() returns.
+ */
+long
+sys_eclone(unsigned flags_low, struct clone_args __user *uca,
+	   int args_size, pid_t __user *pids, struct pt_regs *regs)
+{
+	int rc;
+	struct clone_args kca;
+	unsigned long flags;
+	int __user *parent_tidp;
+	int __user *child_tidp;
+	unsigned long stack;	/* a plain address, so no __user here */
+	unsigned long stack_size;
+
+	rc = fetch_clone_args_from_user(uca, args_size, &kca);
+	if (rc)
+		return rc;
+
+	/*
+	 * TODO: Convert 'clone-flags' to 64-bits on all architectures.
+	 * TODO: When ->clone_flags_high is non-zero, copy it in to the
+	 *	 higher word(s) of 'flags':
+	 *
+	 *		flags = (kca.clone_flags_high << 32) | flags_low;
+	 */
+	flags = flags_low;
+
+	/*
+	 * The tid pointers travel as u64 in clone_args; go through
+	 * unsigned long and keep the __user address space on the result.
+	 */
+	parent_tidp = (int __user *)(unsigned long)kca.parent_tid_ptr;
+	child_tidp = (int __user *)(unsigned long)kca.child_tid_ptr;
+
+	/*
+	 * child_stack is used as the stack pointer here, so a non-zero
+	 * child_stack_size is invalid.
+	 */
+	stack_size = (unsigned long)kca.child_stack_size;
+	if (stack_size)
+		return -EINVAL;
+
+	/* No child stack given: use the caller's current stack pointer. */
+	stack = (unsigned long)kca.child_stack;
+	if (!stack)
+		stack = regs->sp;
+
+	return do_fork_with_pids(flags, stack, regs, stack_size, parent_tidp,
+				 child_tidp, kca.nr_pids, pids);
+}
+
/*
* This gets run with %si containing the
* function to call, and %di containing
@@ -700,4 +739,3 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
unsigned long range_end = mm->brk + 0x02000000;
return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
-
@@ -337,3 +337,4 @@ ENTRY(sys_call_table)
.long sys_rt_tgsigqueueinfo /* 335 */
.long sys_perf_event_open
.long sys_recvmmsg
+ .long ptregs_eclone
@@ -2129,6 +2129,8 @@ extern int disallow_signal(int);
extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+extern int fetch_clone_args_from_user(struct clone_args __user *, int,
+ struct clone_args *);
extern long do_fork_with_pids(unsigned long, unsigned long, struct pt_regs *,
unsigned long, int __user *, int __user *,
unsigned int, pid_t __user *);
@@ -204,6 +204,22 @@ struct ustat {
char f_fpack[6];
};
+/*
+ * Extended clone arguments, copied in from user space by
+ * fetch_clone_args_from_user(). Every field has a fixed width.
+ */
+struct clone_args {
+ /* Must be 0 for now; fetch_clone_args_from_user() rejects other values. */
+ u64 clone_flags_high;
+ /*
+ * Architectures can use child_stack for either the stack pointer or
+ * the base of the stack. If child_stack is used as the stack pointer,
+ * child_stack_size must be 0. Otherwise child_stack_size must be
+ * set to the size of the allocated stack.
+ */
+ u64 child_stack;
+ u64 child_stack_size;
+ /* User addresses carried as u64; sys_eclone() turns them into int pointers. */
+ u64 parent_tid_ptr;
+ u64 child_tid_ptr;
+ /* Passed to do_fork_with_pids() as the number of entries in the pid array. */
+ u32 nr_pids;
+ /* Must be 0; fetch_clone_args_from_user() rejects other values. */
+ u32 reserved0;
+};
+
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
#endif /* _LINUX_TYPES_H */
@@ -1350,6 +1350,114 @@ struct task_struct * __cpuinit fork_idle(int cpu)
}
/*
+ * If the user specified any target pids in @upids, copy them from user
+ * space and return a pointer to a kernel copy of the list. The caller
+ * must kfree() the list when done with it.
+ *
+ * If the user did not specify any target pids (@unum_pids is 0), return
+ * NULL (the caller should treat this like a normal clone).
+ *
+ * On any error, return the error code encoded with ERR_PTR():
+ * -EINVAL for a bad count, -ENOMEM if the list cannot be allocated and
+ * -EFAULT if the user array cannot be read.
+ */
+static pid_t *copy_target_pids(int unum_pids, pid_t __user *upids)
+{
+	int j;
+	int size;
+	int knum_pids;		/* # of pids needed in kernel */
+	pid_t *target_pids;
+
+	if (!unum_pids)
+		return NULL;
+
+	/*
+	 * The count originates from an unsigned, user-supplied field, so a
+	 * huge value arrives here as a negative int. Reject it together
+	 * with counts deeper than the caller's pid-namespace nesting;
+	 * otherwise the offset and size computed below would be bogus and
+	 * the copy could land outside target_pids[].
+	 */
+	knum_pids = task_pid(current)->level + 1;
+	if (unum_pids < 0 || unum_pids > knum_pids)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * To keep alloc_pid() simple, allocate an extra pid_t in target_pids[]
+	 * and set it to 0. This last entry in target_pids[] corresponds to the
+	 * (yet-to-be-created) descendant pid-namespace if CLONE_NEWPID was
+	 * specified. If CLONE_NEWPID was not specified, this last entry will
+	 * simply be ignored.
+	 */
+	target_pids = kzalloc((knum_pids + 1) * sizeof(pid_t), GFP_KERNEL);
+	if (!target_pids)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * A process running in a level 2 pid namespace has three pid namespaces
+	 * and hence three pid numbers. If this process is checkpointed,
+	 * information about these three namespaces are saved. We refer to these
+	 * namespaces as 'known namespaces'.
+	 *
+	 * If this checkpointed process is however restarted in a level 3 pid
+	 * namespace, the restarted process has an extra ancestor pid namespace
+	 * (i.e 'unknown namespace') and 'knum_pids' exceeds 'unum_pids'.
+	 *
+	 * During restart, the process requests specific pids for its 'known
+	 * namespaces' and lets kernel assign pids to its 'unknown namespaces'.
+	 *
+	 * Since the requested-pids correspond to 'known namespaces' and since
+	 * 'known-namespaces' are younger than (i.e descendants of) 'unknown-
+	 * namespaces', copy requested pids to the back-end of target_pids[]
+	 * (i.e before the last entry for CLONE_NEWPID mentioned above).
+	 * Any entries in target_pids[] not corresponding to a requested pid
+	 * will be set to zero and kernel assigns a pid in those namespaces.
+	 *
+	 * NOTE: The order of pids in target_pids[] is oldest pid namespace
+	 *	 to youngest (target_pids[0] corresponds to init_pid_ns), i.e.
+	 *	 the order is:
+	 *
+	 *		- pids for 'unknown-namespaces' (if any)
+	 *		- pids for 'known-namespaces' (requested pids)
+	 *		- 0 in the last entry (for CLONE_NEWPID).
+	 */
+	j = knum_pids - unum_pids;
+	size = unum_pids * sizeof(pid_t);
+
+	if (copy_from_user(&target_pids[j], upids, size)) {
+		kfree(target_pids);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return target_pids;
+}
+
+/*
+ * Copy the clone_args structure at @uca into @kca and sanity-check it.
+ *
+ * Returns 0 on success, -EINVAL if @args_size is not exactly the size of
+ * struct clone_args or if a field that must be zero is set, and -EFAULT
+ * if the structure cannot be copied from user space.
+ */
+int
+fetch_clone_args_from_user(struct clone_args __user *uca, int args_size,
+			   struct clone_args *kca)
+{
+	/*
+	 * TODO: A size mismatch may simply mean that one side knows a newer,
+	 *	 extended layout of clone_args. Until the check learns to
+	 *	 cope with that, only an exact match is accepted.
+	 */
+	if (args_size != sizeof(*kca))
+		return -EINVAL;
+
+	if (copy_from_user(kca, uca, args_size))
+		return -EFAULT;
+
+	/*
+	 * Currently unused fields have to be 0 so that they can be given a
+	 * meaning later without breaking existing callers.
+	 */
+	if (kca->reserved0 == 0 && kca->clone_flags_high == 0)
+		return 0;
+
+	return -EINVAL;
+}
+
+/*
* Ok, this is the main fork-routine.
*
* It copies the process, and if successful kick-starts
@@ -1367,7 +1475,7 @@ long do_fork_with_pids(unsigned long clone_flags,
struct task_struct *p;
int trace = 0;
long nr;
- pid_t *target_pids = NULL;
+ pid_t *target_pids;
/*
* Do some preliminary argument and permissions checking before we
@@ -1401,6 +1509,16 @@ long do_fork_with_pids(unsigned long clone_flags,
}
}
+ target_pids = copy_target_pids(num_pids, upids);
+ if (target_pids) {
+ if (IS_ERR(target_pids))
+ return PTR_ERR(target_pids);
+
+ nr = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out_free;
+ }
+
/*
* When called from kernel_thread, don't do user tracing stuff.
*/
@@ -1462,6 +1580,10 @@ long do_fork_with_pids(unsigned long clone_flags,
} else {
nr = PTR_ERR(p);
}
+
+out_free:
+ kfree(target_pids);
+
return nr;
}