diff mbox

[00/10] target/i386/tcg: fixes for seg_helper.c

Message ID 20240710062920.73063-1-pbonzini@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini July 10, 2024, 6:29 a.m. UTC
This includes bugfixes:
- allowing IRET from user mode to user mode with SMAP (do not use implicit
  kernel accesses, which break if the stack is in userspace)

- use DPL-level accesses for interrupts and call gates

- various fixes for task switching

And two related cleanups: computing MMU index once for far calls and returns
(including task switches), and using X86Access for TSS access.

Tested with a really ugly patch to kvm-unit-tests, included after signature.

Paolo Bonzini (7):
  target/i386/tcg: Allow IRET from user mode to user mode with SMAP
  target/i386/tcg: use PUSHL/PUSHW for error code
  target/i386/tcg: Compute MMU index once
  target/i386/tcg: Use DPL-level accesses for interrupts and call gates
  target/i386/tcg: check for correct busy state before switching to a
    new task
  target/i386/tcg: use X86Access for TSS access
  target/i386/tcg: save current task state before loading new one

Richard Henderson (3):
  target/i386/tcg: Remove SEG_ADDL
  target/i386/tcg: Reorg push/pop within seg_helper.c
  target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl

 target/i386/cpu.h            |  11 +-
 target/i386/cpu.c            |  27 +-
 target/i386/tcg/seg_helper.c | 606 +++++++++++++++++++----------------
 3 files changed, 354 insertions(+), 290 deletions(-)

Comments

Robert Henry July 10, 2024, 9 p.m. UTC | #1
I have only skimmed the diffs.  Your knowledge of the deep semantics,
gained by close differential reading of Intel and AMD docs, is truly
amazing.  Many thanks for pushing this through!

I have 2 nits, perhaps stylistic only.

For code like "sp -= 2" or "sp += 2" followed or preceded by a write through
the stack pointer of a uint16_t variable 'x', would it be better/more robust
to rewrite it as "sp -= sizeof(x)"?

There are a lot of masks constructed using -1.  I think it would be clearer
to use 0xffffffff (for 32-bit masks) as that reminds the reader that this
is a bit mask.  But it seems that using -1 is how the original code was
written.

On Tue, Jul 9, 2024 at 11:29 PM Paolo Bonzini <pbonzini@redhat.com> wrote:

> This includes bugfixes:
> - allowing IRET from user mode to user mode with SMAP (do not use implicit
>   kernel accesses, which break if the stack is in userspace)
>
> - use DPL-level accesses for interrupts and call gates
>
> - various fixes for task switching
>
> And two related cleanups: computing MMU index once for far calls and
> returns
> (including task switches), and using X86Access for TSS access.
>
> Tested with a really ugly patch to kvm-unit-tests, included after
> signature.
>
> Paolo Bonzini (7):
>   target/i386/tcg: Allow IRET from user mode to user mode with SMAP
>   target/i386/tcg: use PUSHL/PUSHW for error code
>   target/i386/tcg: Compute MMU index once
>   target/i386/tcg: Use DPL-level accesses for interrupts and call gates
>   target/i386/tcg: check for correct busy state before switching to a
>     new task
>   target/i386/tcg: use X86Access for TSS access
>   target/i386/tcg: save current task state before loading new one
>
> Richard Henderson (3):
>   target/i386/tcg: Remove SEG_ADDL
>   target/i386/tcg: Reorg push/pop within seg_helper.c
>   target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl
>
>  target/i386/cpu.h            |  11 +-
>  target/i386/cpu.c            |  27 +-
>  target/i386/tcg/seg_helper.c | 606 +++++++++++++++++++----------------
>  3 files changed, 354 insertions(+), 290 deletions(-)
>
> --
> 2.45.2
>
> diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c
> index c3ec0ad7..0bf40c6d 100644
> --- a/lib/x86/usermode.c
> +++ b/lib/x86/usermode.c
> @@ -5,13 +5,15 @@
>  #include "x86/desc.h"
>  #include "x86/isr.h"
>  #include "alloc.h"
> +#include "alloc_page.h"
>  #include "setjmp.h"
>  #include "usermode.h"
>
>  #include "libcflat.h"
>  #include <stdint.h>
>
> -#define USERMODE_STACK_SIZE    0x2000
> +#define USERMODE_STACK_ORDER   1 /* 8k */
> +#define USERMODE_STACK_SIZE    (1 << (12 + USERMODE_STACK_ORDER))
>  #define RET_TO_KERNEL_IRQ      0x20
>
>  static jmp_buf jmpbuf;
> @@ -37,9 +39,14 @@ uint64_t run_in_user(usermode_func func, unsigned int
> fault_vector,
>  {
>         extern char ret_to_kernel;
>         volatile uint64_t rax = 0;
> -       static unsigned char user_stack[USERMODE_STACK_SIZE];
> +       static unsigned char *user_stack;
>         handler old_ex;
>
> +       if (!user_stack) {
> +               user_stack = alloc_pages(USERMODE_STACK_ORDER);
> +               printf("%p\n", user_stack);
> +       }
> +
>         *raised_vector = 0;
>         set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3);
>         old_ex = handle_exception(fault_vector,
> @@ -51,6 +58,8 @@ uint64_t run_in_user(usermode_func func, unsigned int
> fault_vector,
>                 return 0;
>         }
>
> +       memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8);
> +
>         asm volatile (
>                         /* Prepare kernel SP for exception handlers */
>                         "mov %%rsp, %[rsp0]\n\t"
> @@ -63,12 +72,13 @@ uint64_t run_in_user(usermode_func func, unsigned int
> fault_vector,
>                         "pushq %[user_stack_top]\n\t"
>                         "pushfq\n\t"
>                         "pushq %[user_cs]\n\t"
> -                       "lea user_mode(%%rip), %%rax\n\t"
> +                       "lea user_mode+0x800000(%%rip), %%rax\n\t" //
> smap.flat places usermode addresses at 8MB-16MB
>                         "pushq %%rax\n\t"
>                         "iretq\n"
>
>                         "user_mode:\n\t"
>                         /* Back up volatile registers before invoking func
> */
> +                       "pop %%rax\n\t"
>                         "push %%rcx\n\t"
>                         "push %%rdx\n\t"
>                         "push %%rdi\n\t"
> @@ -78,11 +88,12 @@ uint64_t run_in_user(usermode_func func, unsigned int
> fault_vector,
>                         "push %%r10\n\t"
>                         "push %%r11\n\t"
>                         /* Call user mode function */
> +                       "add $0x800000,%%rbp\n\t"
>                         "mov %[arg1], %%rdi\n\t"
>                         "mov %[arg2], %%rsi\n\t"
>                         "mov %[arg3], %%rdx\n\t"
>                         "mov %[arg4], %%rcx\n\t"
> -                       "call *%[func]\n\t"
> +                       "call *%%rax\n\t"
>                         /* Restore registers */
>                         "pop %%r11\n\t"
>                         "pop %%r10\n\t"
> @@ -112,12 +123,11 @@ uint64_t run_in_user(usermode_func func, unsigned
> int fault_vector,
>                         [arg2]"m"(arg2),
>                         [arg3]"m"(arg3),
>                         [arg4]"m"(arg4),
> -                       [func]"m"(func),
>                         [user_ds]"i"(USER_DS),
>                         [user_cs]"i"(USER_CS),
>                         [kernel_ds]"rm"(KERNEL_DS),
>                         [user_stack_top]"r"(user_stack +
> -                                       sizeof(user_stack)),
> +                                       USERMODE_STACK_SIZE - 8),
>                         [kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ));
>
>         handle_exception(fault_vector, old_ex);
> diff --git a/x86/smap.c b/x86/smap.c
> index 9a823a55..65119442 100644
> --- a/x86/smap.c
> +++ b/x86/smap.c
> @@ -2,6 +2,7 @@
>  #include <alloc_page.h>
>  #include "x86/desc.h"
>  #include "x86/processor.h"
> +#include "x86/usermode.h"
>  #include "x86/vm.h"
>
>  volatile int pf_count = 0;
> @@ -89,6 +90,31 @@ static void check_smap_nowp(void)
>         write_cr3(read_cr3());
>  }
>
> +#ifdef __x86_64__
> +static void iret(void)
> +{
> +       asm volatile(
> +           "mov %%rsp, %%rcx;"
> +           "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;"
> +           "pushf;"
> +           "movl %%cs, %%ebx; pushq %%rbx; "
> +           "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:"
> +
> +               : : : "ebx", "ecx", "cc"); /* RPL=0 */
> +}
> +
> +static void test_user_iret(void)
> +{
> +       bool raised_vector;
> +       uintptr_t user_iret = (uintptr_t)iret + USER_BASE;
> +
> +       run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0,
> +                   &raised_vector);
> +
> +       report(!raised_vector, "No #PF on CPL=3 DPL=3 iret");
> +}
> +#endif
> +
>  int main(int ac, char **av)
>  {
>         unsigned long i;
> @@ -196,7 +222,9 @@ int main(int ac, char **av)
>
>         check_smap_nowp();
>
> -       // TODO: implicit kernel access from ring 3 (e.g. int)
> +#ifdef __x86_64__
> +       test_user_iret();
> +#endif
>
>         return report_summary();
>  }
>
>
>
>
Paolo Bonzini July 10, 2024, 9:08 p.m. UTC | #2
Il mer 10 lug 2024, 23:01 Robert Henry <rrh.henry@gmail.com> ha scritto:

> I have only skimmed the diffs.  Your knowledge of the deep semantics,
> gained by close differential reading of intel and amd docs, is truly
> amazing.  Many thanks for pushing this through!
>

Thanks for bringing this to our attention too; apart from the practical bug,
hopefully it will help future readers to have a more precise implementation.

I tried to acknowledge your contribution in the commit messages.

I have 2 nits, perhaps stylistic only.
>
> For code like "sp -= 2" or "sp += 2" followed or preceded by a write to
> the stack pointer of a uint16_t variable 'x',  would it be better/more
> robust to rewrite as: "sp -= sizeof(x)"  ?
>

I think that's intentional because the value subtracted is related to the
"stw" or "stl" in the store (likewise for incrementing after a load) more
than to the size of x.

There are a lot of masks constructed using -1.  I think it would be clearer
> to use 0xffffffff (for 32-bit masks) as that reminds the reader that this
> is a bit mask.  But it seems that using -1 is how the original code was
> written.
>

-1 is used for 64-bit masks only. They get unwieldy quickly. :)

Paolo


> On Tue, Jul 9, 2024 at 11:29 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>> This includes bugfixes:
>> - allowing IRET from user mode to user mode with SMAP (do not use implicit
>>   kernel accesses, which break if the stack is in userspace)
>>
>> - use DPL-level accesses for interrupts and call gates
>>
>> - various fixes for task switching
>>
>> And two related cleanups: computing MMU index once for far calls and
>> returns
>> (including task switches), and using X86Access for TSS access.
>>
>> Tested with a really ugly patch to kvm-unit-tests, included after
>> signature.
>>
>> Paolo Bonzini (7):
>>   target/i386/tcg: Allow IRET from user mode to user mode with SMAP
>>   target/i386/tcg: use PUSHL/PUSHW for error code
>>   target/i386/tcg: Compute MMU index once
>>   target/i386/tcg: Use DPL-level accesses for interrupts and call gates
>>   target/i386/tcg: check for correct busy state before switching to a
>>     new task
>>   target/i386/tcg: use X86Access for TSS access
>>   target/i386/tcg: save current task state before loading new one
>>
>> Richard Henderson (3):
>>   target/i386/tcg: Remove SEG_ADDL
>>   target/i386/tcg: Reorg push/pop within seg_helper.c
>>   target/i386/tcg: Introduce x86_mmu_index_{kernel_,}pl
>>
>>  target/i386/cpu.h            |  11 +-
>>  target/i386/cpu.c            |  27 +-
>>  target/i386/tcg/seg_helper.c | 606 +++++++++++++++++++----------------
>>  3 files changed, 354 insertions(+), 290 deletions(-)
>>
>> --
>> 2.45.2
>>
>> diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c
>> index c3ec0ad7..0bf40c6d 100644
>> --- a/lib/x86/usermode.c
>> +++ b/lib/x86/usermode.c
>> @@ -5,13 +5,15 @@
>>  #include "x86/desc.h"
>>  #include "x86/isr.h"
>>  #include "alloc.h"
>> +#include "alloc_page.h"
>>  #include "setjmp.h"
>>  #include "usermode.h"
>>
>>  #include "libcflat.h"
>>  #include <stdint.h>
>>
>> -#define USERMODE_STACK_SIZE    0x2000
>> +#define USERMODE_STACK_ORDER   1 /* 8k */
>> +#define USERMODE_STACK_SIZE    (1 << (12 + USERMODE_STACK_ORDER))
>>  #define RET_TO_KERNEL_IRQ      0x20
>>
>>  static jmp_buf jmpbuf;
>> @@ -37,9 +39,14 @@ uint64_t run_in_user(usermode_func func, unsigned int
>> fault_vector,
>>  {
>>         extern char ret_to_kernel;
>>         volatile uint64_t rax = 0;
>> -       static unsigned char user_stack[USERMODE_STACK_SIZE];
>> +       static unsigned char *user_stack;
>>         handler old_ex;
>>
>> +       if (!user_stack) {
>> +               user_stack = alloc_pages(USERMODE_STACK_ORDER);
>> +               printf("%p\n", user_stack);
>> +       }
>> +
>>         *raised_vector = 0;
>>         set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3);
>>         old_ex = handle_exception(fault_vector,
>> @@ -51,6 +58,8 @@ uint64_t run_in_user(usermode_func func, unsigned int
>> fault_vector,
>>                 return 0;
>>         }
>>
>> +       memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8);
>> +
>>         asm volatile (
>>                         /* Prepare kernel SP for exception handlers */
>>                         "mov %%rsp, %[rsp0]\n\t"
>> @@ -63,12 +72,13 @@ uint64_t run_in_user(usermode_func func, unsigned int
>> fault_vector,
>>                         "pushq %[user_stack_top]\n\t"
>>                         "pushfq\n\t"
>>                         "pushq %[user_cs]\n\t"
>> -                       "lea user_mode(%%rip), %%rax\n\t"
>> +                       "lea user_mode+0x800000(%%rip), %%rax\n\t" //
>> smap.flat places usermode addresses at 8MB-16MB
>>                         "pushq %%rax\n\t"
>>                         "iretq\n"
>>
>>                         "user_mode:\n\t"
>>                         /* Back up volatile registers before invoking
>> func */
>> +                       "pop %%rax\n\t"
>>                         "push %%rcx\n\t"
>>                         "push %%rdx\n\t"
>>                         "push %%rdi\n\t"
>> @@ -78,11 +88,12 @@ uint64_t run_in_user(usermode_func func, unsigned int
>> fault_vector,
>>                         "push %%r10\n\t"
>>                         "push %%r11\n\t"
>>                         /* Call user mode function */
>> +                       "add $0x800000,%%rbp\n\t"
>>                         "mov %[arg1], %%rdi\n\t"
>>                         "mov %[arg2], %%rsi\n\t"
>>                         "mov %[arg3], %%rdx\n\t"
>>                         "mov %[arg4], %%rcx\n\t"
>> -                       "call *%[func]\n\t"
>> +                       "call *%%rax\n\t"
>>                         /* Restore registers */
>>                         "pop %%r11\n\t"
>>                         "pop %%r10\n\t"
>> @@ -112,12 +123,11 @@ uint64_t run_in_user(usermode_func func, unsigned
>> int fault_vector,
>>                         [arg2]"m"(arg2),
>>                         [arg3]"m"(arg3),
>>                         [arg4]"m"(arg4),
>> -                       [func]"m"(func),
>>                         [user_ds]"i"(USER_DS),
>>                         [user_cs]"i"(USER_CS),
>>                         [kernel_ds]"rm"(KERNEL_DS),
>>                         [user_stack_top]"r"(user_stack +
>> -                                       sizeof(user_stack)),
>> +                                       USERMODE_STACK_SIZE - 8),
>>                         [kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ));
>>
>>         handle_exception(fault_vector, old_ex);
>> diff --git a/x86/smap.c b/x86/smap.c
>> index 9a823a55..65119442 100644
>> --- a/x86/smap.c
>> +++ b/x86/smap.c
>> @@ -2,6 +2,7 @@
>>  #include <alloc_page.h>
>>  #include "x86/desc.h"
>>  #include "x86/processor.h"
>> +#include "x86/usermode.h"
>>  #include "x86/vm.h"
>>
>>  volatile int pf_count = 0;
>> @@ -89,6 +90,31 @@ static void check_smap_nowp(void)
>>         write_cr3(read_cr3());
>>  }
>>
>> +#ifdef __x86_64__
>> +static void iret(void)
>> +{
>> +       asm volatile(
>> +           "mov %%rsp, %%rcx;"
>> +           "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;"
>> +           "pushf;"
>> +           "movl %%cs, %%ebx; pushq %%rbx; "
>> +           "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:"
>> +
>> +               : : : "ebx", "ecx", "cc"); /* RPL=0 */
>> +}
>> +
>> +static void test_user_iret(void)
>> +{
>> +       bool raised_vector;
>> +       uintptr_t user_iret = (uintptr_t)iret + USER_BASE;
>> +
>> +       run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0,
>> +                   &raised_vector);
>> +
>> +       report(!raised_vector, "No #PF on CPL=3 DPL=3 iret");
>> +}
>> +#endif
>> +
>>  int main(int ac, char **av)
>>  {
>>         unsigned long i;
>> @@ -196,7 +222,9 @@ int main(int ac, char **av)
>>
>>         check_smap_nowp();
>>
>> -       // TODO: implicit kernel access from ring 3 (e.g. int)
>> +#ifdef __x86_64__
>> +       test_user_iret();
>> +#endif
>>
>>         return report_summary();
>>  }
>>
>>
>>
>>
diff mbox

Patch

diff --git a/lib/x86/usermode.c b/lib/x86/usermode.c
index c3ec0ad7..0bf40c6d 100644
--- a/lib/x86/usermode.c
+++ b/lib/x86/usermode.c
@@ -5,13 +5,15 @@ 
 #include "x86/desc.h"
 #include "x86/isr.h"
 #include "alloc.h"
+#include "alloc_page.h"
 #include "setjmp.h"
 #include "usermode.h"
 
 #include "libcflat.h"
 #include <stdint.h>
 
-#define USERMODE_STACK_SIZE	0x2000
+#define USERMODE_STACK_ORDER	1 /* 8k */
+#define USERMODE_STACK_SIZE	(1 << (12 + USERMODE_STACK_ORDER))
 #define RET_TO_KERNEL_IRQ	0x20
 
 static jmp_buf jmpbuf;
@@ -37,9 +39,14 @@  uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 {
 	extern char ret_to_kernel;
 	volatile uint64_t rax = 0;
-	static unsigned char user_stack[USERMODE_STACK_SIZE];
+	static unsigned char *user_stack;
 	handler old_ex;
 
+	if (!user_stack) {
+		user_stack = alloc_pages(USERMODE_STACK_ORDER);
+		printf("%p\n", user_stack);
+	}
+
 	*raised_vector = 0;
 	set_idt_entry(RET_TO_KERNEL_IRQ, &ret_to_kernel, 3);
 	old_ex = handle_exception(fault_vector,
@@ -51,6 +58,8 @@  uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 		return 0;
 	}
 
+	memcpy(user_stack + USERMODE_STACK_SIZE - 8, &func, 8);
+
 	asm volatile (
 			/* Prepare kernel SP for exception handlers */
 			"mov %%rsp, %[rsp0]\n\t"
@@ -63,12 +72,13 @@  uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 			"pushq %[user_stack_top]\n\t"
 			"pushfq\n\t"
 			"pushq %[user_cs]\n\t"
-			"lea user_mode(%%rip), %%rax\n\t"
+			"lea user_mode+0x800000(%%rip), %%rax\n\t" // smap.flat places usermode addresses at 8MB-16MB
 			"pushq %%rax\n\t"
 			"iretq\n"
 
 			"user_mode:\n\t"
 			/* Back up volatile registers before invoking func */
+			"pop %%rax\n\t"
 			"push %%rcx\n\t"
 			"push %%rdx\n\t"
 			"push %%rdi\n\t"
@@ -78,11 +88,12 @@  uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 			"push %%r10\n\t"
 			"push %%r11\n\t"
 			/* Call user mode function */
+			"add $0x800000,%%rbp\n\t"
 			"mov %[arg1], %%rdi\n\t"
 			"mov %[arg2], %%rsi\n\t"
 			"mov %[arg3], %%rdx\n\t"
 			"mov %[arg4], %%rcx\n\t"
-			"call *%[func]\n\t"
+			"call *%%rax\n\t"
 			/* Restore registers */
 			"pop %%r11\n\t"
 			"pop %%r10\n\t"
@@ -112,12 +123,11 @@  uint64_t run_in_user(usermode_func func, unsigned int fault_vector,
 			[arg2]"m"(arg2),
 			[arg3]"m"(arg3),
 			[arg4]"m"(arg4),
-			[func]"m"(func),
 			[user_ds]"i"(USER_DS),
 			[user_cs]"i"(USER_CS),
 			[kernel_ds]"rm"(KERNEL_DS),
 			[user_stack_top]"r"(user_stack +
-					sizeof(user_stack)),
+					USERMODE_STACK_SIZE - 8),
 			[kernel_entry_vector]"i"(RET_TO_KERNEL_IRQ));
 
 	handle_exception(fault_vector, old_ex);
diff --git a/x86/smap.c b/x86/smap.c
index 9a823a55..65119442 100644
--- a/x86/smap.c
+++ b/x86/smap.c
@@ -2,6 +2,7 @@ 
 #include <alloc_page.h>
 #include "x86/desc.h"
 #include "x86/processor.h"
+#include "x86/usermode.h"
 #include "x86/vm.h"
 
 volatile int pf_count = 0;
@@ -89,6 +90,31 @@  static void check_smap_nowp(void)
 	write_cr3(read_cr3());
 }
 
+#ifdef __x86_64__
+static void iret(void)
+{
+	asm volatile(
+	    "mov %%rsp, %%rcx;"
+	    "movl %%ss, %%ebx; pushq %%rbx; pushq %%rcx;"
+	    "pushf;"
+	    "movl %%cs, %%ebx; pushq %%rbx; "
+	    "lea 1f(%%rip), %%rbx; pushq %%rbx; iretq; 1:"
+
+		: : : "ebx", "ecx", "cc"); /* RPL=0 */
+}
+
+static void test_user_iret(void)
+{
+	bool raised_vector;
+	uintptr_t user_iret = (uintptr_t)iret + USER_BASE;
+
+	run_in_user((usermode_func)user_iret, PF_VECTOR, 0, 0, 0, 0,
+		    &raised_vector);
+
+	report(!raised_vector, "No #PF on CPL=3 DPL=3 iret");
+}
+#endif
+
 int main(int ac, char **av)
 {
 	unsigned long i;
@@ -196,7 +222,9 @@  int main(int ac, char **av)
 
 	check_smap_nowp();
 
-	// TODO: implicit kernel access from ring 3 (e.g. int)
+#ifdef __x86_64__
+	test_user_iret();
+#endif
 
 	return report_summary();
 }