@@ -116,13 +116,17 @@ void new_thread_handler(void)
* callback returns only if the kernel thread execs a process
*/
fn(arg);
+#ifndef CONFIG_MMU
+ arch_switch_to(current);
+#endif
userspace(&current->thread.regs.regs);
}
/* Called magically, see new_thread_handler above */
static void fork_handler(void)
{
- schedule_tail(current->thread.prev_sched);
+ if (current->thread.prev_sched)
+ schedule_tail(current->thread.prev_sched);
/*
* XXX: if interrupt_end() calls schedule, this call to
@@ -133,6 +137,33 @@ static void fork_handler(void)
current->thread.prev_sched = NULL;
+#ifndef CONFIG_MMU
+ /*
+ * child of vfork(2) comes here.
+ * clone(2) also enters here but doesn't need to advance the %rsp.
+ *
+ * This fork can only come from libc's vfork, which
+ * does this:
+ * popq %%rdx;
+ * call *%rax; // zpoline => __kernel_vsyscall
+ * pushq %%rdx;
+ * %rcx stores the return address which is stored
+ * at pt_regs[HOST_IP] at the moment. As child returns
+ * via userspace() with a jmp instruction (while parent
+ * does via ret instruction in __kernel_vsyscall), we
+ * need to pop (advance) the pushed address by "call"
+ * though, so this is what this next line does.
+ *
+ * As a result of vfork return in child, stack contents
+ * is overwritten by child (by pushq in vfork), which
+ * makes the parent puzzled after child returns.
+ *
+	 * Thus the contents should be restored before vfork/parent
+	 * returns; this is done in do_syscall_64().
+ */
+ if (current->thread.regs.regs.gp[HOST_ORIG_AX] == __NR_vfork)
+ current->thread.regs.regs.gp[REGS_SP_INDEX] += 8;
+#endif
userspace(&current->thread.regs.regs);
}
@@ -25,7 +25,10 @@
void os_alarm_process(int pid)
{
+/* !CONFIG_MMU doesn't send alarm signal to other processes */
+#ifdef CONFIG_MMU
kill(pid, SIGALRM);
+#endif
}
void os_kill_process(int pid, int reap_child)
@@ -42,11 +45,14 @@ void os_kill_process(int pid, int reap_child)
void os_kill_ptraced_process(int pid, int reap_child)
{
+/* !CONFIG_MMU doesn't have ptraced process */
+#ifdef CONFIG_MMU
kill(pid, SIGKILL);
ptrace(PTRACE_KILL, pid);
ptrace(PTRACE_CONT, pid);
if (reap_child)
CATCH_EINTR(waitpid(pid, NULL, __WALL));
+#endif
}
/* Don't use the glibc version, which caches the result in TLS. It misses some
@@ -144,6 +144,7 @@ void wait_stub_done(int pid)
extern unsigned long current_stub_stack(void);
+#ifdef CONFIG_MMU
static void get_skas_faultinfo(int pid, struct faultinfo *fi)
{
int err;
@@ -176,6 +177,7 @@ static void handle_trap(int pid, struct uml_pt_regs *regs)
handle_syscall(regs);
}
+#endif
extern char __syscall_stub_start[];
@@ -389,6 +391,7 @@ int start_userspace(unsigned long stub_stack)
}
int unscheduled_userspace_iterations;
+#ifdef CONFIG_MMU
extern unsigned long tt_extra_sched_jiffies;
void userspace(struct uml_pt_regs *regs)
@@ -550,6 +553,7 @@ void userspace(struct uml_pt_regs *regs)
}
}
}
+#endif /* CONFIG_MMU */
void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
{
@@ -38,6 +38,18 @@ static __always_inline void cpu_relax(void)
#define task_pt_regs(t) (&(t)->thread.regs)
+#ifndef CONFIG_MMU
+#define task_top_of_stack(task) \
+({ \
+ unsigned long __ptr = (unsigned long)task->stack; \
+ __ptr += THREAD_SIZE; \
+ __ptr; \
+})
+
+extern long current_top_of_stack;
+extern long current_ptregs;
+#endif
+
#include <asm/processor-generic.h>
#endif
@@ -1,14 +1,43 @@
// SPDX-License-Identifier: GPL-2.0
+//#define DEBUG 1
#include <linux/kernel.h>
#include <linux/ptrace.h>
#include <kern_util.h>
#include <sysdep/syscalls.h>
#include <os.h>
+/*
+ * save/restore the return address stored in the stack, as the child overwrites
+ * the contents after returning to userspace (i.e., by push %rdx).
+ *
+ * see the detail in fork_handler().
+ */
+static void *vfork_save_stack(void)
+{
+ unsigned char *stack_copy;
+
+ stack_copy = kzalloc(8, GFP_KERNEL);
+ if (!stack_copy)
+ return NULL;
+
+ memcpy(stack_copy,
+ (void *)current->thread.regs.regs.gp[HOST_SP], 8);
+
+ return stack_copy;
+}
+
+static void vfork_restore_stack(void *stack_copy)
+{
+ WARN_ON_ONCE(!stack_copy);
+ memcpy((void *)current->thread.regs.regs.gp[HOST_SP],
+ stack_copy, 8);
+}
+
__visible void do_syscall_64(struct pt_regs *regs)
{
int syscall;
+ unsigned char *stack_copy = NULL;
syscall = PT_SYSCALL_NR(regs->regs.gp);
UPT_SYSCALL_NR(&regs->regs) = syscall;
@@ -17,6 +46,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
syscall, (unsigned long)current,
(unsigned long)sys_call_table[syscall]);
+ if (syscall == __NR_vfork)
+ stack_copy = vfork_save_stack();
+
if (likely(syscall < NR_syscalls)) {
PT_REGS_SET_SYSCALL_RETURN(regs,
EXECUTE_SYSCALL(syscall, regs));
@@ -34,4 +66,8 @@ __visible void do_syscall_64(struct pt_regs *regs)
/* execve succeeded */
if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0)
userspace(&current->thread.regs.regs);
+
+	/* only the parent of vfork restores the contents of the stack */
+ if (syscall == __NR_vfork && regs->regs.gp[HOST_AX] > 0)
+ vfork_restore_stack(stack_copy);
}
@@ -85,3 +85,24 @@ ENTRY(__kernel_vsyscall)
ret
END(__kernel_vsyscall)
+
+// void userspace(struct uml_pt_regs *regs)
+ENTRY(userspace)
+ /* align the stack for x86_64 ABI */
+ and $-0x10, %rsp
+ /* Handle any immediate reschedules or signals */
+ call interrupt_end
+
+ movq current_ptregs, %rsp
+
+ POP_REGS
+
+ addq $8, %rsp /* skip orig_ax */
+ popq %r11 /* pt_regs->ip */
+ addq $8, %rsp /* skip cs */
+ addq $8, %rsp /* skip flags */
+ popq %rsp
+
+ jmp *%r11
+
+END(userspace)
@@ -51,6 +51,18 @@ void arch_switch_to(struct task_struct *to)
* Nothing needs to be done on x86_64.
* The FS_BASE/GS_BASE registers are saved in the ptrace register set.
*/
+#ifndef CONFIG_MMU
+ current_top_of_stack = task_top_of_stack(to);
+ current_ptregs = (long)task_pt_regs(to);
+
+ if ((to->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] == 0) ||
+ (to->mm == NULL))
+ return;
+
+ /* this changes the FS on every context switch */
+ arch_prctl(to, ARCH_SET_FS,
+ (void __user *) to->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)]);
+#endif
}
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,