diff mbox

[RFC,v2,03/18] x86/asm/head: standardize the bottom of the stack for idle tasks

Message ID 9268772b31cc7bc4dc40c617e3baf45e07322145.1461875890.git.jpoimboe@redhat.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Josh Poimboeuf April 28, 2016, 8:44 p.m. UTC
Thanks to all the recent x86 entry code refactoring, most tasks' kernel
stacks start at the same offset right above their saved pt_regs,
regardless of which syscall was used to enter the kernel.  That creates
a nice convention which makes it straightforward to identify the
"bottom" of the stack, which can be useful for stack walking code which
needs to verify the stack is sane.

However there are still a few types of tasks which don't yet follow that
convention:

1) CPU idle tasks, aka the "swapper" tasks

2) freshly forked TIF_FORK tasks which don't have a stack at all

Make the idle tasks conform to the new stack bottom convention by
starting their stack at a sizeof(pt_regs) offset from the end of the
stack page.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 arch/x86/kernel/head_64.S | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

Comments

Brian Gerst April 29, 2016, 6:46 p.m. UTC | #1
On Thu, Apr 28, 2016 at 4:44 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> Thanks to all the recent x86 entry code refactoring, most tasks' kernel
> stacks start at the same offset right above their saved pt_regs,
> regardless of which syscall was used to enter the kernel.  That creates
> a nice convention which makes it straightforward to identify the
> "bottom" of the stack, which can be useful for stack walking code which
> needs to verify the stack is sane.
>
> However there are still a few types of tasks which don't yet follow that
> convention:
>
> 1) CPU idle tasks, aka the "swapper" tasks
>
> 2) freshly forked TIF_FORK tasks which don't have a stack at all
>
> Make the idle tasks conform to the new stack bottom convention by
> starting their stack at a sizeof(pt_regs) offset from the end of the
> stack page.
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
> ---
>  arch/x86/kernel/head_64.S | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index 6dbd2c0..0b12311 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -296,8 +296,9 @@ ENTRY(start_cpu)
>          *      REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
>          *              address given in m16:64.
>          */
> -       movq    initial_code(%rip),%rax
> -       pushq   $0              # fake return address to stop unwinder
> +       call    1f              # put return address on stack for unwinder
> +1:     xorq    %rbp, %rbp      # clear frame pointer
> +       movq    initial_code(%rip), %rax
>         pushq   $__KERNEL_CS    # set correct cs
>         pushq   %rax            # target address in negative space
>         lretq

This chunk looks like it should be a separate patch.

--
Brian Gerst
Andy Lutomirski April 29, 2016, 7:39 p.m. UTC | #2
On Thu, Apr 28, 2016 at 1:44 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> Thanks to all the recent x86 entry code refactoring, most tasks' kernel
> stacks start at the same offset right above their saved pt_regs,
> regardless of which syscall was used to enter the kernel.  That creates
> a nice convention which makes it straightforward to identify the
> "bottom" of the stack, which can be useful for stack walking code which
> needs to verify the stack is sane.
>
> However there are still a few types of tasks which don't yet follow that
> convention:
>
> 1) CPU idle tasks, aka the "swapper" tasks
>
> 2) freshly forked TIF_FORK tasks which don't have a stack at all
>
> Make the idle tasks conform to the new stack bottom convention by
> starting their stack at a sizeof(pt_regs) offset from the end of the
> stack page.
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
> ---
>  arch/x86/kernel/head_64.S | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> index 6dbd2c0..0b12311 100644
> --- a/arch/x86/kernel/head_64.S
> +++ b/arch/x86/kernel/head_64.S
> @@ -296,8 +296,9 @@ ENTRY(start_cpu)
>          *      REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
>          *              address given in m16:64.
>          */
> -       movq    initial_code(%rip),%rax
> -       pushq   $0              # fake return address to stop unwinder
> +       call    1f              # put return address on stack for unwinder
> +1:     xorq    %rbp, %rbp      # clear frame pointer
> +       movq    initial_code(%rip), %rax
>         pushq   $__KERNEL_CS    # set correct cs
>         pushq   %rax            # target address in negative space
>         lretq
> @@ -325,7 +326,7 @@ ENDPROC(start_cpu0)
>         GLOBAL(initial_gs)
>         .quad   INIT_PER_CPU_VAR(irq_stack_union)
>         GLOBAL(initial_stack)
> -       .quad  init_thread_union+THREAD_SIZE-8
> +       .quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS

As long as you're doing this, could you also set orig_ax to -1?  I
remember running into some oddities resulting from orig_ax containing
garbage at some point.

--Andy
Josh Poimboeuf April 29, 2016, 8:28 p.m. UTC | #3
On Fri, Apr 29, 2016 at 02:46:10PM -0400, Brian Gerst wrote:
> On Thu, Apr 28, 2016 at 4:44 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > Thanks to all the recent x86 entry code refactoring, most tasks' kernel
> > stacks start at the same offset right above their saved pt_regs,
> > regardless of which syscall was used to enter the kernel.  That creates
> > a nice convention which makes it straightforward to identify the
> > "bottom" of the stack, which can be useful for stack walking code which
> > needs to verify the stack is sane.
> >
> > However there are still a few types of tasks which don't yet follow that
> > convention:
> >
> > 1) CPU idle tasks, aka the "swapper" tasks
> >
> > 2) freshly forked TIF_FORK tasks which don't have a stack at all
> >
> > Make the idle tasks conform to the new stack bottom convention by
> > starting their stack at a sizeof(pt_regs) offset from the end of the
> > stack page.
> >
> > Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
> > ---
> >  arch/x86/kernel/head_64.S | 7 ++++---
> >  1 file changed, 4 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> > index 6dbd2c0..0b12311 100644
> > --- a/arch/x86/kernel/head_64.S
> > +++ b/arch/x86/kernel/head_64.S
> > @@ -296,8 +296,9 @@ ENTRY(start_cpu)
> >          *      REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
> >          *              address given in m16:64.
> >          */
> > -       movq    initial_code(%rip),%rax
> > -       pushq   $0              # fake return address to stop unwinder
> > +       call    1f              # put return address on stack for unwinder
> > +1:     xorq    %rbp, %rbp      # clear frame pointer
> > +       movq    initial_code(%rip), %rax
> >         pushq   $__KERNEL_CS    # set correct cs
> >         pushq   %rax            # target address in negative space
> >         lretq
> 
> This chunk looks like it should be a separate patch.

Agreed, thanks.
Josh Poimboeuf April 29, 2016, 8:50 p.m. UTC | #4
On Fri, Apr 29, 2016 at 12:39:16PM -0700, Andy Lutomirski wrote:
> On Thu, Apr 28, 2016 at 1:44 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > Thanks to all the recent x86 entry code refactoring, most tasks' kernel
> > stacks start at the same offset right above their saved pt_regs,
> > regardless of which syscall was used to enter the kernel.  That creates
> > a nice convention which makes it straightforward to identify the
> > "bottom" of the stack, which can be useful for stack walking code which
> > needs to verify the stack is sane.
> >
> > However there are still a few types of tasks which don't yet follow that
> > convention:
> >
> > 1) CPU idle tasks, aka the "swapper" tasks
> >
> > 2) freshly forked TIF_FORK tasks which don't have a stack at all
> >
> > Make the idle tasks conform to the new stack bottom convention by
> > starting their stack at a sizeof(pt_regs) offset from the end of the
> > stack page.
> >
> > Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
> > ---
> >  arch/x86/kernel/head_64.S | 7 ++++---
> >  1 file changed, 4 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> > index 6dbd2c0..0b12311 100644
> > --- a/arch/x86/kernel/head_64.S
> > +++ b/arch/x86/kernel/head_64.S
> > @@ -296,8 +296,9 @@ ENTRY(start_cpu)
> >          *      REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
> >          *              address given in m16:64.
> >          */
> > -       movq    initial_code(%rip),%rax
> > -       pushq   $0              # fake return address to stop unwinder
> > +       call    1f              # put return address on stack for unwinder
> > +1:     xorq    %rbp, %rbp      # clear frame pointer
> > +       movq    initial_code(%rip), %rax
> >         pushq   $__KERNEL_CS    # set correct cs
> >         pushq   %rax            # target address in negative space
> >         lretq
> > @@ -325,7 +326,7 @@ ENDPROC(start_cpu0)
> >         GLOBAL(initial_gs)
> >         .quad   INIT_PER_CPU_VAR(irq_stack_union)
> >         GLOBAL(initial_stack)
> > -       .quad  init_thread_union+THREAD_SIZE-8
> > +       .quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
> 
> As long as you're doing this, could you also set orig_ax to -1?  I
> remember running into some oddities resulting from orig_ax containing
> garbage at some point.

I assume you mean to initialize the orig_rax value in the pt_regs at the
bottom of the stack of the idle task?

How could that cause a problem?  Since the idle task never returns from
a system call, I'd assume that memory never gets accessed?
Andy Lutomirski April 29, 2016, 9:38 p.m. UTC | #5
On Fri, Apr 29, 2016 at 1:50 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> On Fri, Apr 29, 2016 at 12:39:16PM -0700, Andy Lutomirski wrote:
>> On Thu, Apr 28, 2016 at 1:44 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
>> > Thanks to all the recent x86 entry code refactoring, most tasks' kernel
>> > stacks start at the same offset right above their saved pt_regs,
>> > regardless of which syscall was used to enter the kernel.  That creates
>> > a nice convention which makes it straightforward to identify the
>> > "bottom" of the stack, which can be useful for stack walking code which
>> > needs to verify the stack is sane.
>> >
>> > However there are still a few types of tasks which don't yet follow that
>> > convention:
>> >
>> > 1) CPU idle tasks, aka the "swapper" tasks
>> >
>> > 2) freshly forked TIF_FORK tasks which don't have a stack at all
>> >
>> > Make the idle tasks conform to the new stack bottom convention by
>> > starting their stack at a sizeof(pt_regs) offset from the end of the
>> > stack page.
>> >
>> > Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
>> > ---
>> >  arch/x86/kernel/head_64.S | 7 ++++---
>> >  1 file changed, 4 insertions(+), 3 deletions(-)
>> >
>> > diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
>> > index 6dbd2c0..0b12311 100644
>> > --- a/arch/x86/kernel/head_64.S
>> > +++ b/arch/x86/kernel/head_64.S
>> > @@ -296,8 +296,9 @@ ENTRY(start_cpu)
>> >          *      REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
>> >          *              address given in m16:64.
>> >          */
>> > -       movq    initial_code(%rip),%rax
>> > -       pushq   $0              # fake return address to stop unwinder
>> > +       call    1f              # put return address on stack for unwinder
>> > +1:     xorq    %rbp, %rbp      # clear frame pointer
>> > +       movq    initial_code(%rip), %rax
>> >         pushq   $__KERNEL_CS    # set correct cs
>> >         pushq   %rax            # target address in negative space
>> >         lretq
>> > @@ -325,7 +326,7 @@ ENDPROC(start_cpu0)
>> >         GLOBAL(initial_gs)
>> >         .quad   INIT_PER_CPU_VAR(irq_stack_union)
>> >         GLOBAL(initial_stack)
>> > -       .quad  init_thread_union+THREAD_SIZE-8
>> > +       .quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
>>
>> As long as you're doing this, could you also set orig_ax to -1?  I
>> remember running into some oddities resulting from orig_ax containing
>> garbage at some point.
>
> I assume you mean to initialize the orig_rax value in the pt_regs at the
> bottom of the stack of the idle task?
>
> How could that cause a problem?  Since the idle task never returns from
> a system call, I'd assume that memory never gets accessed?
>

Look at collect_syscall in lib/syscall.c

> --
> Josh
Josh Poimboeuf April 29, 2016, 11:27 p.m. UTC | #6
On Fri, Apr 29, 2016 at 02:38:02PM -0700, Andy Lutomirski wrote:
> On Fri, Apr 29, 2016 at 1:50 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > On Fri, Apr 29, 2016 at 12:39:16PM -0700, Andy Lutomirski wrote:
> >> On Thu, Apr 28, 2016 at 1:44 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> >> > Thanks to all the recent x86 entry code refactoring, most tasks' kernel
> >> > stacks start at the same offset right above their saved pt_regs,
> >> > regardless of which syscall was used to enter the kernel.  That creates
> >> > a nice convention which makes it straightforward to identify the
> >> > "bottom" of the stack, which can be useful for stack walking code which
> >> > needs to verify the stack is sane.
> >> >
> >> > However there are still a few types of tasks which don't yet follow that
> >> > convention:
> >> >
> >> > 1) CPU idle tasks, aka the "swapper" tasks
> >> >
> >> > 2) freshly forked TIF_FORK tasks which don't have a stack at all
> >> >
> >> > Make the idle tasks conform to the new stack bottom convention by
> >> > starting their stack at a sizeof(pt_regs) offset from the end of the
> >> > stack page.
> >> >
> >> > Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
> >> > ---
> >> >  arch/x86/kernel/head_64.S | 7 ++++---
> >> >  1 file changed, 4 insertions(+), 3 deletions(-)
> >> >
> >> > diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> >> > index 6dbd2c0..0b12311 100644
> >> > --- a/arch/x86/kernel/head_64.S
> >> > +++ b/arch/x86/kernel/head_64.S
> >> > @@ -296,8 +296,9 @@ ENTRY(start_cpu)
> >> >          *      REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
> >> >          *              address given in m16:64.
> >> >          */
> >> > -       movq    initial_code(%rip),%rax
> >> > -       pushq   $0              # fake return address to stop unwinder
> >> > +       call    1f              # put return address on stack for unwinder
> >> > +1:     xorq    %rbp, %rbp      # clear frame pointer
> >> > +       movq    initial_code(%rip), %rax
> >> >         pushq   $__KERNEL_CS    # set correct cs
> >> >         pushq   %rax            # target address in negative space
> >> >         lretq
> >> > @@ -325,7 +326,7 @@ ENDPROC(start_cpu0)
> >> >         GLOBAL(initial_gs)
> >> >         .quad   INIT_PER_CPU_VAR(irq_stack_union)
> >> >         GLOBAL(initial_stack)
> >> > -       .quad  init_thread_union+THREAD_SIZE-8
> >> > +       .quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
> >>
> >> As long as you're doing this, could you also set orig_ax to -1?  I
> >> remember running into some oddities resulting from orig_ax containing
> >> garbage at some point.
> >
> > I assume you mean to initialize the orig_rax value in the pt_regs at the
> > bottom of the stack of the idle task?
> >
> > How could that cause a problem?  Since the idle task never returns from
> > a system call, I'd assume that memory never gets accessed?
> >
> 
> Look at collect_syscall in lib/syscall.c

I don't see how collect_syscall() can be called for the per-cpu idle
"swapper" tasks (which is what the above code affects).  They don't have
pids or /proc entries so you can't do /proc/<pid>/syscall on them.
Andy Lutomirski April 30, 2016, 12:10 a.m. UTC | #7
On Apr 29, 2016 4:27 PM, "Josh Poimboeuf" <jpoimboe@redhat.com> wrote:
>
> On Fri, Apr 29, 2016 at 02:38:02PM -0700, Andy Lutomirski wrote:
> > On Fri, Apr 29, 2016 at 1:50 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > > On Fri, Apr 29, 2016 at 12:39:16PM -0700, Andy Lutomirski wrote:
> > >> On Thu, Apr 28, 2016 at 1:44 PM, Josh Poimboeuf <jpoimboe@redhat.com> wrote:
> > >> > Thanks to all the recent x86 entry code refactoring, most tasks' kernel
> > >> > stacks start at the same offset right above their saved pt_regs,
> > >> > regardless of which syscall was used to enter the kernel.  That creates
> > >> > a nice convention which makes it straightforward to identify the
> > >> > "bottom" of the stack, which can be useful for stack walking code which
> > >> > needs to verify the stack is sane.
> > >> >
> > >> > However there are still a few types of tasks which don't yet follow that
> > >> > convention:
> > >> >
> > >> > 1) CPU idle tasks, aka the "swapper" tasks
> > >> >
> > >> > 2) freshly forked TIF_FORK tasks which don't have a stack at all
> > >> >
> > >> > Make the idle tasks conform to the new stack bottom convention by
> > >> > starting their stack at a sizeof(pt_regs) offset from the end of the
> > >> > stack page.
> > >> >
> > >> > Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
> > >> > ---
> > >> >  arch/x86/kernel/head_64.S | 7 ++++---
> > >> >  1 file changed, 4 insertions(+), 3 deletions(-)
> > >> >
> > >> > diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
> > >> > index 6dbd2c0..0b12311 100644
> > >> > --- a/arch/x86/kernel/head_64.S
> > >> > +++ b/arch/x86/kernel/head_64.S
> > >> > @@ -296,8 +296,9 @@ ENTRY(start_cpu)
> > >> >          *      REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
> > >> >          *              address given in m16:64.
> > >> >          */
> > >> > -       movq    initial_code(%rip),%rax
> > >> > -       pushq   $0              # fake return address to stop unwinder
> > >> > +       call    1f              # put return address on stack for unwinder
> > >> > +1:     xorq    %rbp, %rbp      # clear frame pointer
> > >> > +       movq    initial_code(%rip), %rax
> > >> >         pushq   $__KERNEL_CS    # set correct cs
> > >> >         pushq   %rax            # target address in negative space
> > >> >         lretq
> > >> > @@ -325,7 +326,7 @@ ENDPROC(start_cpu0)
> > >> >         GLOBAL(initial_gs)
> > >> >         .quad   INIT_PER_CPU_VAR(irq_stack_union)
> > >> >         GLOBAL(initial_stack)
> > >> > -       .quad  init_thread_union+THREAD_SIZE-8
> > >> > +       .quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
> > >>
> > >> As long as you're doing this, could you also set orig_ax to -1?  I
> > >> remember running into some oddities resulting from orig_ax containing
> > >> garbage at some point.
> > >
> > > I assume you mean to initialize the orig_rax value in the pt_regs at the
> > > bottom of the stack of the idle task?
> > >
> > > How could that cause a problem?  Since the idle task never returns from
> > > a system call, I'd assume that memory never gets accessed?
> > >
> >
> > Look at collect_syscall in lib/syscall.c
>
> I don't see how collect_syscall() can be called for the per-cpu idle
> "swapper" tasks (which is what the above code affects).  They don't have
> pids or /proc entries so you can't do /proc/<pid>/syscall on them.

If so, then never mind.

--Andy

>
> --
> Josh
diff mbox

Patch

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6dbd2c0..0b12311 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -296,8 +296,9 @@  ENTRY(start_cpu)
 	 *	REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
 	 *		address given in m16:64.
 	 */
-	movq	initial_code(%rip),%rax
-	pushq	$0		# fake return address to stop unwinder
+	call	1f		# put return address on stack for unwinder
+1:	xorq	%rbp, %rbp	# clear frame pointer
+	movq	initial_code(%rip), %rax
 	pushq	$__KERNEL_CS	# set correct cs
 	pushq	%rax		# target address in negative space
 	lretq
@@ -325,7 +326,7 @@  ENDPROC(start_cpu0)
 	GLOBAL(initial_gs)
 	.quad	INIT_PER_CPU_VAR(irq_stack_union)
 	GLOBAL(initial_stack)
-	.quad  init_thread_union+THREAD_SIZE-8
+	.quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
 	__FINITDATA
 
 bad_address: