
[1/1] i386/tcg: Allow IRET from user mode to user mode for dotnet runtime

Message ID 20240611162021.269457-2-robhenry@microsoft.com
State New
Series i386/tcg fix for IRET as used in dotnet runtime

Commit Message

Robert Henry June 11, 2024, 4:20 p.m. UTC
This fixes a bug wherein i386/tcg assumed that an interrupt return using
the IRET instruction always returned from kernel mode to either kernel
mode or user mode. This assumption is violated when IRET is used as a
clever way to restore thread state, as for example in the dotnet runtime,
where IRET returns from user mode to user mode.

This bug manifested itself as a page fault in the guest Linux kernel.

This bug appears to have been in QEMU since the beginning.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/249
Signed-off-by: Robert R. Henry <robhenry@microsoft.com>
---
 target/i386/tcg/seg_helper.c | 78 ++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 31 deletions(-)
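
For readers unfamiliar with the pattern: a user-mode thread can swap its
instruction pointer, stack pointer and flags atomically by building an IRET
frame on its own stack and executing iretq, never leaving CPL3. A minimal,
hypothetical illustration in C (not code from the dotnet runtime or from
this patch; the function name and frame handling are assumptions):

#include <stdint.h>

/* Restore rip, rsp and rflags in one shot at CPL3: push an IRET frame
 * (SS, RSP, RFLAGS, CS, RIP) and execute iretq.  CS and SS are reused
 * from the current context, so the "return" is user mode to user mode. */
static void restore_user_context(uint64_t rip, uint64_t rsp, uint64_t rflags)
{
    uint64_t cs, ss;

    __asm__ volatile("mov %%cs, %0" : "=r"(cs));
    __asm__ volatile("mov %%ss, %0" : "=r"(ss));
    __asm__ volatile(
        "pushq %0\n\t"      /* SS */
        "pushq %1\n\t"      /* RSP */
        "pushq %2\n\t"      /* RFLAGS */
        "pushq %3\n\t"      /* CS */
        "pushq %4\n\t"      /* RIP */
        "iretq"
        : : "r"(ss), "r"(rsp), "r"(rflags), "r"(cs), "r"(rip)
        : "memory");
    __builtin_unreachable();
}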

Comments

Richard Henderson June 15, 2024, 11:25 p.m. UTC | #1
On 6/11/24 09:20, Robert R. Henry wrote:
> This fixes a bug wherein i386/tcg assumed that an interrupt return using
> the IRET instruction always returned from kernel mode to either kernel
> mode or user mode. This assumption is violated when IRET is used as a
> clever way to restore thread state, as for example in the dotnet runtime,
> where IRET returns from user mode to user mode.
> 
> This bug manifested itself as a page fault in the guest Linux kernel.
> 
> This bug appears to have been in QEMU since the beginning.
> 
> Resolves: https://gitlab.com/qemu-project/qemu/-/issues/249
> Signed-off-by: Robert R. Henry <robhenry@microsoft.com>
> ---
>   target/i386/tcg/seg_helper.c | 78 ++++++++++++++++++++++--------------
>   1 file changed, 47 insertions(+), 31 deletions(-)
> 
> diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
> index 715db1f232..815d26e61d 100644
> --- a/target/i386/tcg/seg_helper.c
> +++ b/target/i386/tcg/seg_helper.c
> @@ -843,20 +843,35 @@ static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
>   
>   #ifdef TARGET_X86_64
>   
> -#define PUSHQ_RA(sp, val, ra)                   \
> -    {                                           \
> -        sp -= 8;                                \
> -        cpu_stq_kernel_ra(env, sp, (val), ra);  \
> -    }
> -
> -#define POPQ_RA(sp, val, ra)                    \
> -    {                                           \
> -        val = cpu_ldq_kernel_ra(env, sp, ra);   \
> -        sp += 8;                                \
> -    }
> +#define PUSHQ_RA(sp, val, ra, cpl, dpl) \
> +  FUNC_PUSHQ_RA(env, &sp, val, ra, cpl, dpl)
> +
> +static inline void FUNC_PUSHQ_RA(
> +    CPUX86State *env, target_ulong *sp,
> +    target_ulong val, target_ulong ra, int cpl, int dpl) {
> +  *sp -= 8;
> +  if (dpl == 0) {
> +    cpu_stq_kernel_ra(env, *sp, val, ra);
> +  } else {
> +    cpu_stq_data_ra(env, *sp, val, ra);
> +  }
> +}

This doesn't seem quite right.

I would be much happier if we were to resolve the proper mmu index earlier, once, rather 
than within each call to cpu_{ld,st}*_{kernel,data}_ra.  With the mmu index in hand, use 
cpu_{ld,st}*_mmuidx_ra instead.

I believe you will want to factor out a subroutine of x86_cpu_mmu_index which
passes in the pl, rather than reading cpl from env->hflags.  This will also
allow cpu_mmu_index_kernel, which is written to assume pl=0, to be eliminated
or simplified.


r~
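
For illustration, a minimal sketch of the factoring described above; the
helper name x86_mmu_index_pl and the exact MMU index names are assumptions,
not code from this series:

/* Resolve an MMU index for an access made at privilege level pl.  The
 * privilege level is a parameter instead of being read from env->hflags,
 * so the same logic serves both the current CPL and a gate's DPL. */
static int x86_mmu_index_pl(CPUX86State *env, unsigned pl)
{
    if (pl == 3) {
        return MMU_USER_IDX;
    }
    /* Supervisor access: use the no-SMAP index when SMAP is disabled
     * or EFLAGS.AC is set, otherwise enforce SMAP. */
    return (!(env->hflags & HF_SMAP_MASK) || (env->eflags & AC_MASK))
           ? MMU_KNOSMAP_IDX : MMU_KSMAP_IDX;
}

static int x86_cpu_mmu_index(CPUState *cs, bool ifetch)
{
    CPUX86State *env = cpu_env(cs);

    return x86_mmu_index_pl(env, env->hflags & HF_CPL_MASK);
}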
Robert Henry June 16, 2024, 10:44 p.m. UTC | #2
I do not think I will have the time or focus to work on improving this
patch this summer, as I will retire in 2 weeks and need to make a clean
break to focus on other things (health, for one) for a while.

If anyone wants to put into place Richard's ideas, I will not be offended!

I do not see any of the discussion from this email thread reflected on the
bug report at https://gitlab.com/qemu-project/qemu/-/issues/249.

Robert Henry

Paolo Bonzini June 17, 2024, 8:33 a.m. UTC | #3
On Mon, Jun 17, 2024 at 12:45 AM Robert Henry <rrh.henry@gmail.com> wrote:
> I do not think I will have the time or focus to work on improving this patch this summer, as I will retire in 2 weeks and need to make a clean break to focus on other things (health, for one) for a while.
> If anyone wants to put into place Richard's ideas, I will not be offended!

Great, I'll do the work and make sure your analysis and contribution
to the patch are recognized.

> I do not see any of the discussion from this email thread reflected on the bug report at https://gitlab.com/qemu-project/qemu/-/issues/249.

Yeah, that happens - the discussion in the bug report often focuses
more on what the bug is than how to fix it. I had looked at the patch
and came to roughly the same conclusion as Richard, though he beat me
to answering.

More precisely:

- the main issue with your patch is that it only affects IRETQ and I
think (going from memory) RETFQ. All IRET and RETF operations should
use the CPL for the access.

- a secondary issue with the patch is that you can use the *_data
variant for both CPL0 and CPL3 accesses.

- it's a good idea to also solve the related issue, that interrupts
should use a data access for the DPL of the interrupt/call gate. That
one cannot use *_data, so there are two alternatives: using an if like
you did, or using the *_mmuidx variant with the MMU index computed in
advance.

Yet another related issue (going back to *really* legacy stuff) is
that call gates need to use *_data when reading the parameters from
the stack, so that it's possible to use call gates from CPL3 to CPL0
with CR4.SMAP=1.

Paolo
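
As a rough sketch of the *_mmuidx route mentioned above (the helper names and
the place where the index is computed are assumptions, not the eventual patch):

/* The caller resolves the MMU index once, e.g. from the current CPL for
 * IRET/RETF or from the gate's DPL for interrupt delivery, and passes it
 * down to every stack access. */
static void pushq_mmuidx(CPUX86State *env, target_ulong *sp,
                         target_ulong val, int mmu_index, uintptr_t ra)
{
    *sp -= 8;
    cpu_stq_mmuidx_ra(env, *sp, val, mmu_index, ra);
}

static target_ulong popq_mmuidx(CPUX86State *env, target_ulong *sp,
                                int mmu_index, uintptr_t ra)
{
    target_ulong val = cpu_ldq_mmuidx_ra(env, *sp, mmu_index, ra);
    *sp += 8;
    return val;
}

/* e.g. in helper_ret_protected():
 *     cpl = env->hflags & HF_CPL_MASK;
 *     mmu_index = x86_mmu_index_pl(env, cpl);   // hypothetical helper
 *     new_eip = popq_mmuidx(env, &sp, mmu_index, retaddr);
 */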

Patch

diff --git a/target/i386/tcg/seg_helper.c b/target/i386/tcg/seg_helper.c
index 715db1f232..815d26e61d 100644
--- a/target/i386/tcg/seg_helper.c
+++ b/target/i386/tcg/seg_helper.c
@@ -843,20 +843,35 @@  static void do_interrupt_protected(CPUX86State *env, int intno, int is_int,
 
 #ifdef TARGET_X86_64
 
-#define PUSHQ_RA(sp, val, ra)                   \
-    {                                           \
-        sp -= 8;                                \
-        cpu_stq_kernel_ra(env, sp, (val), ra);  \
-    }
-
-#define POPQ_RA(sp, val, ra)                    \
-    {                                           \
-        val = cpu_ldq_kernel_ra(env, sp, ra);   \
-        sp += 8;                                \
-    }
+#define PUSHQ_RA(sp, val, ra, cpl, dpl) \
+  FUNC_PUSHQ_RA(env, &sp, val, ra, cpl, dpl)
+
+static inline void FUNC_PUSHQ_RA(
+    CPUX86State *env, target_ulong *sp,
+    target_ulong val, target_ulong ra, int cpl, int dpl) {
+  *sp -= 8;
+  if (dpl == 0) {
+    cpu_stq_kernel_ra(env, *sp, val, ra);
+  } else {
+    cpu_stq_data_ra(env, *sp, val, ra);
+  } 
+}
 
-#define PUSHQ(sp, val) PUSHQ_RA(sp, val, 0)
-#define POPQ(sp, val) POPQ_RA(sp, val, 0)
+#define POPQ_RA(sp, val, ra, cpl, dpl) \
+  val = FUNC_POPQ_RA(env, &sp, ra, cpl, dpl)
+
+static inline target_ulong FUNC_POPQ_RA(
+    CPUX86State *env, target_ulong *sp,
+    target_ulong ra, int cpl, int dpl) {
+  target_ulong val;
+  if (cpl == 0) {  /* TODO perhaps both arms reduce to cpu_ldq_data_ra? */
+    val = cpu_ldq_kernel_ra(env, *sp, ra);
+  } else {
+    val = cpu_ldq_data_ra(env, *sp, ra);
+  }
+  *sp += 8;
+  return val;
+}
 
 static inline target_ulong get_rsp_from_tss(CPUX86State *env, int level)
 {
@@ -901,6 +916,7 @@  static void do_interrupt64(CPUX86State *env, int intno, int is_int,
     uint32_t e1, e2, e3, ss, eflags;
     target_ulong old_eip, esp, offset;
     bool set_rf;
+    const target_ulong retaddr = 0;
 
     has_error_code = 0;
     if (!is_int && !is_hw) {
@@ -989,13 +1005,13 @@  static void do_interrupt64(CPUX86State *env, int intno, int is_int,
         eflags |= RF_MASK;
     }
 
-    PUSHQ(esp, env->segs[R_SS].selector);
-    PUSHQ(esp, env->regs[R_ESP]);
-    PUSHQ(esp, eflags);
-    PUSHQ(esp, env->segs[R_CS].selector);
-    PUSHQ(esp, old_eip);
+    PUSHQ_RA(esp, env->segs[R_SS].selector, retaddr, cpl, dpl);
+    PUSHQ_RA(esp, env->regs[R_ESP],         retaddr, cpl, dpl);
+    PUSHQ_RA(esp, eflags,                   retaddr, cpl, dpl);
+    PUSHQ_RA(esp, env->segs[R_CS].selector, retaddr, cpl, dpl);
+    PUSHQ_RA(esp, old_eip,                  retaddr, cpl, dpl);
     if (has_error_code) {
-        PUSHQ(esp, error_code);
+        PUSHQ_RA(esp, error_code, retaddr, cpl, dpl);
     }
 
     /* interrupt gate clear IF mask */
@@ -1621,8 +1637,8 @@  void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
 
             /* 64 bit case */
             rsp = env->regs[R_ESP];
-            PUSHQ_RA(rsp, env->segs[R_CS].selector, GETPC());
-            PUSHQ_RA(rsp, next_eip, GETPC());
+            PUSHQ_RA(rsp, env->segs[R_CS].selector, GETPC(), cpl, dpl);
+            PUSHQ_RA(rsp, next_eip, GETPC(), cpl, dpl);
             /* from this point, not restartable */
             env->regs[R_ESP] = rsp;
             cpu_x86_load_seg_cache(env, R_CS, (new_cs & 0xfffc) | cpl,
@@ -1792,8 +1808,8 @@  void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
 #ifdef TARGET_X86_64
             if (shift == 2) {
                 /* XXX: verify if new stack address is canonical */
-                PUSHQ_RA(sp, env->segs[R_SS].selector, GETPC());
-                PUSHQ_RA(sp, env->regs[R_ESP], GETPC());
+                PUSHQ_RA(sp, env->segs[R_SS].selector, GETPC(), cpl, dpl);
+                PUSHQ_RA(sp, env->regs[R_ESP], GETPC(), cpl, dpl);
                 /* parameters aren't supported for 64-bit call gates */
             } else
 #endif
@@ -1828,8 +1844,8 @@  void helper_lcall_protected(CPUX86State *env, int new_cs, target_ulong new_eip,
 
 #ifdef TARGET_X86_64
         if (shift == 2) {
-            PUSHQ_RA(sp, env->segs[R_CS].selector, GETPC());
-            PUSHQ_RA(sp, next_eip, GETPC());
+            PUSHQ_RA(sp, env->segs[R_CS].selector, GETPC(), cpl, dpl);
+            PUSHQ_RA(sp, next_eip, GETPC(), cpl, dpl);
         } else
 #endif
         if (shift == 1) {
@@ -1944,6 +1960,7 @@  static inline void helper_ret_protected(CPUX86State *env, int shift,
     int cpl, dpl, rpl, eflags_mask, iopl;
     target_ulong ssp, sp, new_eip, new_esp, sp_mask;
 
+    cpl = env->hflags & HF_CPL_MASK;
 #ifdef TARGET_X86_64
     if (shift == 2) {
         sp_mask = -1;
@@ -1957,11 +1974,11 @@  static inline void helper_ret_protected(CPUX86State *env, int shift,
     new_eflags = 0; /* avoid warning */
 #ifdef TARGET_X86_64
     if (shift == 2) {
-        POPQ_RA(sp, new_eip, retaddr);
-        POPQ_RA(sp, new_cs, retaddr);
+        POPQ_RA(sp, new_eip, retaddr, cpl, dpl);
+        POPQ_RA(sp, new_cs, retaddr, cpl, dpl);
         new_cs &= 0xffff;
         if (is_iret) {
-            POPQ_RA(sp, new_eflags, retaddr);
+            POPQ_RA(sp, new_eflags, retaddr, cpl, dpl);
         }
     } else
 #endif
@@ -1999,7 +2016,6 @@  static inline void helper_ret_protected(CPUX86State *env, int shift,
         !(e2 & DESC_CS_MASK)) {
         raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, retaddr);
     }
-    cpl = env->hflags & HF_CPL_MASK;
     rpl = new_cs & 3;
     if (rpl < cpl) {
         raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 0xfffc, retaddr);
@@ -2030,8 +2046,8 @@  static inline void helper_ret_protected(CPUX86State *env, int shift,
         /* return to different privilege level */
 #ifdef TARGET_X86_64
         if (shift == 2) {
-            POPQ_RA(sp, new_esp, retaddr);
-            POPQ_RA(sp, new_ss, retaddr);
+            POPQ_RA(sp, new_esp, retaddr, cpl, dpl);
+            POPQ_RA(sp, new_ss, retaddr, cpl, dpl);
             new_ss &= 0xffff;
         } else
 #endif