Message ID | 20221005053234.29312-2-bgray@linux.ibm.com (mailing list archive) |
---|---|
State | Superseded |
Series | Out-of-line static calls for powerpc64 ELF V2 |
Hi,

Le 05/10/2022 à 07:32, Benjamin Gray a écrit :
> Adds a generic text patching mechanism for patches of size int or long
> bytes.
>
> The patch_instruction function is reimplemented in terms of this
> more generic function. This generic implementation allows patching of
> arbitrary long data, such as pointers on 64-bit.
>
> On 32-bit patch_int is marked noinline to prevent a mis-optimisation.
> Without noinline, inside patch_branch the compiler may inline all the
> way to do_patch_memory, preventing the compiler from inlining
> do_patch_memory into patch_int. This would needlessly force patch_int
> to be a branch to do_patch_memory.

I'm on business trip this week so I can't test it on hardware, but the
generated code looks horrid and sub-optimal, with a stack frame and so
many registers saved into it. That's mpc885_ads_defconfig built with
GCC 12, without modules without stackprotector with 4k pages.

00000168 <__patch_memory.constprop.0>:
 168:	90 83 00 00 	stw     r4,0(r3)
 16c:	7c 00 18 6c 	dcbst   0,r3
 170:	7c 00 04 ac 	hwsync
 174:	7c 00 2f ac 	icbi    0,r5
 178:	7c 00 04 ac 	hwsync
 17c:	4c 00 01 2c 	isync
 180:	38 60 00 00 	li      r3,0
 184:	4e 80 00 20 	blr
 188:	38 60 ff ff 	li      r3,-1
 18c:	4e 80 00 20 	blr

00000190 <raw_patch_instruction>:
 190:	90 83 00 00 	stw     r4,0(r3)
 194:	7c 00 18 6c 	dcbst   0,r3
 198:	7c 00 04 ac 	hwsync
 19c:	7c 00 1f ac 	icbi    0,r3
 1a0:	7c 00 04 ac 	hwsync
 1a4:	4c 00 01 2c 	isync
 1a8:	38 60 00 00 	li      r3,0
 1ac:	4e 80 00 20 	blr
 1b0:	38 60 ff ff 	li      r3,-1
 1b4:	4e 80 00 20 	blr

000001b8 <patch_uint>:
 1b8:	7c 65 1b 78 	mr      r5,r3
 1bc:	48 00 00 a4 	b       260 <patch_uint+0xa8>
 1c0:	94 21 ff e0 	stwu    r1,-32(r1)
 1c4:	7c 08 02 a6 	mflr    r0
 1c8:	90 01 00 24 	stw     r0,36(r1)
 1cc:	93 81 00 10 	stw     r28,16(r1)
 1d0:	93 a1 00 14 	stw     r29,20(r1)
 1d4:	93 c1 00 18 	stw     r30,24(r1)
 1d8:	93 e1 00 1c 	stw     r31,28(r1)
 1dc:	7f 80 00 a6 	mfmsr   r28
 1e0:	7c 51 13 a6 	mtspr   81,r2
 1e4:	3d 20 00 00 	lis     r9,0
			1e6: R_PPC_ADDR16_HA	.data
 1e8:	81 49 00 00 	lwz     r10,0(r9)
			1ea: R_PPC_ADDR16_LO	.data
 1ec:	3d 20 00 00 	lis     r9,0
			1ee: R_PPC_ADDR16_HA	init_mm+0x24
 1f0:	83 ea 00 04 	lwz     r31,4(r10)
 1f4:	80 e9 00 00 	lwz     r7,0(r9)
			1f6: R_PPC_ADDR16_LO	init_mm+0x24
 1f8:	57 e8 65 3a 	rlwinm  r8,r31,12,20,29
 1fc:	7f a7 40 2e 	lwzx    r29,r7,r8
 200:	7c 69 1b 78 	mr      r9,r3
 204:	3d 29 40 00 	addis   r9,r9,16384
 208:	57 fe b5 3a 	rlwinm  r30,r31,22,20,29
 20c:	55 29 00 26 	clrrwi  r9,r9,12
 210:	61 29 01 25 	ori     r9,r9,293
 214:	57 bd 00 26 	clrrwi  r29,r29,12
 218:	3f de c0 00 	addis   r30,r30,-16384
 21c:	7d 3d f1 2e 	stwx    r9,r29,r30
 220:	53 e3 00 26 	rlwimi  r3,r31,0,0,19
 224:	4b ff ff 45 	bl      168 <__patch_memory.constprop.0>
 228:	39 20 00 00 	li      r9,0
 22c:	7d 3d f1 2e 	stwx    r9,r29,r30
 230:	57 ff 00 26 	clrrwi  r31,r31,12
 234:	7c 00 fa 64 	tlbie   r31,r0
 238:	7c 00 04 ac 	hwsync
 23c:	7f 80 01 24 	mtmsr   r28
 240:	80 01 00 24 	lwz     r0,36(r1)
 244:	83 81 00 10 	lwz     r28,16(r1)
 248:	83 a1 00 14 	lwz     r29,20(r1)
 24c:	83 c1 00 18 	lwz     r30,24(r1)
 250:	83 e1 00 1c 	lwz     r31,28(r1)
 254:	7c 08 03 a6 	mtlr    r0
 258:	38 21 00 20 	addi    r1,r1,32
 25c:	4e 80 00 20 	blr
 260:	4b ff ff 08 	b       168 <__patch_memory.constprop.0>

Christophe
On Wed, 2022-10-05 at 17:55 +0000, Christophe Leroy wrote:
> I'm on business trip this week so I can't test it on hardware, but the
> generated code looks horrid and sub-optimal, with a stack frame and so
> many registers saved into it. That's mpc885_ads_defconfig built with
> GCC 12, without modules without stackprotector with 4k pages.

Yeah, that definitely wasn't supposed to happen. I was looking at the
32 and 64 bit machine code closely while actively writing it, but I
must have refactored it badly when cleaning up for submission. It was
supposed to automatically be inlined, leaving it identical to the
original on 32-bit.

Given inlining is desirable here, and 64-bit inlines anyway, I think I
should just mark __patch_memory with __always_inline.
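A sketch of the change proposed here, as a diff against the __patch_memory introduced by this patch (an illustration; this exact hunk was not posted in the thread):

-static int __patch_memory(void *patch_addr, unsigned long val, void *exec_addr,
-			  bool is_dword)
+static __always_inline int __patch_memory(void *patch_addr, unsigned long val,
+					  void *exec_addr, bool is_dword)

With the attribute, GCC no longer applies its inlining cost heuristics at all, so raw_patch_instruction and the patch_* wrappers would each keep a flat store-and-flush body rather than calling an out-of-line __patch_memory.constprop copy.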
Le 06/10/2022 à 05:36, Benjamin Gray a écrit :
> On Wed, 2022-10-05 at 17:55 +0000, Christophe Leroy wrote:
>> I'm on business trip this week so I can't test it on hardware, but the
>> generated code looks horrid and sub-optimal, with a stack frame and so
>> many registers saved into it. That's mpc885_ads_defconfig built with
>> GCC 12, without modules without stackprotector with 4k pages.
>
> Yeah, that definitely wasn't supposed to happen. I was looking at the
> 32 and 64 bit machine code closely while actively writing it, but I
> must have refactored it badly when cleaning up for submission. It was
> supposed to automatically be inlined, leaving it identical to the
> original on 32-bit.
>
> Given inlining is desirable here, and 64-bit inlines anyway, I think I
> should just mark __patch_memory with __always_inline.

FWIW, I get a far better result with :

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index ba00c550d9d2..447b8de6e427 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -21,7 +21,7 @@ static int __patch_memory(void *patch_addr, unsigned long val, void *exec_addr,
 	/* Prefixed instruction may cross cacheline if cacheline smaller than 64 bytes */
 	BUILD_BUG_ON(IS_ENABLED(CONFIG_PPC64) && L1_CACHE_BYTES < 64);
 
-	if (unlikely(is_dword))
+	if (IS_ENABLED(CONFIG_PPC64) && unlikely(is_dword))
 		__put_kernel_nofault(patch_addr, &val, u64, failed);
 	else
 		__put_kernel_nofault(patch_addr, &val, u32, failed);
On Thu, 2022-10-06 at 09:19 +0000, Christophe Leroy wrote:
> Le 06/10/2022 à 05:36, Benjamin Gray a écrit :
>> On Wed, 2022-10-05 at 17:55 +0000, Christophe Leroy wrote:
>>> I'm on business trip this week so I can't test it on hardware, but
>>> the generated code looks horrid and sub-optimal, with a stack frame
>>> and so many registers saved into it. That's mpc885_ads_defconfig
>>> built with GCC 12, without modules without stackprotector with 4k
>>> pages.
>>
>> Yeah, that definitely wasn't supposed to happen. I was looking at the
>> 32 and 64 bit machine code closely while actively writing it, but I
>> must have refactored it badly when cleaning up for submission. It was
>> supposed to automatically be inlined, leaving it identical to the
>> original on 32-bit.
>>
>> Given inlining is desirable here, and 64-bit inlines anyway, I think I
>> should just mark __patch_memory with __always_inline.
>
> FWIW, I get a far better result with :
>
> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> index ba00c550d9d2..447b8de6e427 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -21,7 +21,7 @@ static int __patch_memory(void *patch_addr, unsigned long val, void *exec_addr,
>  	/* Prefixed instruction may cross cacheline if cacheline smaller than 64 bytes */
>  	BUILD_BUG_ON(IS_ENABLED(CONFIG_PPC64) && L1_CACHE_BYTES < 64);
>  
> -	if (unlikely(is_dword))
> +	if (IS_ENABLED(CONFIG_PPC64) && unlikely(is_dword))
>  		__put_kernel_nofault(patch_addr, &val, u64, failed);
>  	else
>  		__put_kernel_nofault(patch_addr, &val, u32, failed);

That's very interesting, as that's what I had originally. I removed the
IS_ENABLED(CONFIG_PPC64) part of the if condition as part of submission
cleanup refactoring because it should be redundant after constant
propagation. The weird thing is that GCC constant-propagates either
way; it's just changing its mind about inlining.

I can add it back, but the optimisation here seems very fragile. It
feels more reliable just to specify it explicitly.
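Taken together, the two fixes discussed above would presumably combine as follows. This is a sketch only — the thread itself never posts this exact version — merging Benjamin's __always_inline idea with Christophe's explicit IS_ENABLED() test, on top of the function as it appears in the patch below:

static __always_inline int __patch_memory(void *patch_addr, unsigned long val,
					  void *exec_addr, bool is_dword)
{
	/* Prefixed instruction may cross cacheline if cacheline smaller than 64 bytes */
	BUILD_BUG_ON(IS_ENABLED(CONFIG_PPC64) && L1_CACHE_BYTES < 64);

	/*
	 * On 32-bit the is_dword path is dead code; testing CONFIG_PPC64
	 * explicitly folds it away before the inliner costs the function,
	 * and __always_inline removes the inlining heuristic entirely.
	 */
	if (IS_ENABLED(CONFIG_PPC64) && unlikely(is_dword))
		__put_kernel_nofault(patch_addr, &val, u64, failed);
	else
		__put_kernel_nofault(patch_addr, &val, u32, failed);

	/* Assume data is inside a single cacheline */
	dcbst(patch_addr);
	mb(); /* sync */
	/* Flush on the EA that may be executed in case of a non-coherent icache */
	icbi(exec_addr);
	mb(); /* sync */
	isync();

	return 0;

failed:
	return -EPERM;
}

Either change alone also recovers the good code generation per the thread; the explicit IS_ENABLED() test has the side benefit of documenting in source that the u64 store is 64-bit-only.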
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 3f881548fb61..170bfa848c7c 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -72,7 +72,36 @@ static inline int create_branch(ppc_inst_t *instr, const u32 *addr,
 int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
 		       unsigned long target, int flags);
 int patch_branch(u32 *addr, unsigned long target, int flags);
+
+/* patch_uint and patch_ulong must only be called on addresses where the patch
+ * does not cross a cacheline, otherwise it may not be flushed properly and
+ * mixes of new and stale data may be observed.
+ *
+ * patch_instruction and other instruction patchers automatically satisfy this
+ * requirement due to instruction alignment requirements.
+ */
+
+int patch_uint(void *addr, unsigned int val);
+
+#ifdef CONFIG_PPC64
+
+int patch_ulong(void *addr, unsigned long val);
 int patch_instruction(u32 *addr, ppc_inst_t instr);
+
+#else
+
+static inline int patch_ulong(void *addr, unsigned long val)
+{
+	return patch_uint(addr, val);
+}
+
+static inline int patch_instruction(u32 *addr, ppc_inst_t instr)
+{
+	return patch_uint(addr, ppc_inst_val(instr));
+}
+
+#endif
+
 int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
 
 static inline unsigned long patch_site_addr(s32 *site)
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 125c55e3e148..ecdd2e523d9a 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -15,20 +15,24 @@
 #include <asm/code-patching.h>
 #include <asm/inst.h>
 
-static int __patch_instruction(u32 *exec_addr, ppc_inst_t instr, u32 *patch_addr)
+static int __patch_memory(void *patch_addr, unsigned long val, void *exec_addr,
+			  bool is_dword)
 {
-	if (!ppc_inst_prefixed(instr)) {
-		u32 val = ppc_inst_val(instr);
-
-		__put_kernel_nofault(patch_addr, &val, u32, failed);
-	} else {
-		u64 val = ppc_inst_as_ulong(instr);
+	/* Prefixed instruction may cross cacheline if cacheline smaller than 64 bytes */
+	BUILD_BUG_ON(IS_ENABLED(CONFIG_PPC64) && L1_CACHE_BYTES < 64);
 
+	if (unlikely(is_dword))
 		__put_kernel_nofault(patch_addr, &val, u64, failed);
-	}
+	else
+		__put_kernel_nofault(patch_addr, &val, u32, failed);
 
-	asm ("dcbst 0, %0; sync; icbi 0,%1; sync; isync" :: "r" (patch_addr),
-							    "r" (exec_addr));
+	/* Assume data is inside a single cacheline */
+	dcbst(patch_addr);
+	mb(); /* sync */
+	/* Flush on the EA that may be executed in case of a non-coherent icache */
+	icbi(exec_addr);
+	mb(); /* sync */
+	isync();
 
 	return 0;
 
@@ -38,7 +42,10 @@ static int __patch_instruction(u32 *exec_addr, ppc_inst_t instr, u32 *patch_addr
 
 int raw_patch_instruction(u32 *addr, ppc_inst_t instr)
 {
-	return __patch_instruction(addr, instr, addr);
+	if (ppc_inst_prefixed(instr))
+		return __patch_memory(addr, ppc_inst_as_ulong(instr), addr, true);
+	else
+		return __patch_memory(addr, ppc_inst_val(instr), addr, false);
 }
 
 static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
@@ -149,7 +156,7 @@ static void unmap_patch_area(unsigned long addr)
 	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 }
 
-static int __do_patch_instruction(u32 *addr, ppc_inst_t instr)
+static int __do_patch_memory(void *addr, unsigned long val, bool is_dword)
 {
 	int err;
 	u32 *patch_addr;
@@ -166,7 +173,7 @@ static int __do_patch_instruction(u32 *addr, ppc_inst_t instr)
 	if (radix_enabled())
 		asm volatile("ptesync": : :"memory");
 
-	err = __patch_instruction(addr, instr, patch_addr);
+	err = __patch_memory(patch_addr, val, addr, is_dword);
 
 	pte_clear(&init_mm, text_poke_addr, pte);
 	flush_tlb_kernel_range(text_poke_addr, text_poke_addr + PAGE_SIZE);
@@ -174,7 +181,7 @@ static int __do_patch_instruction(u32 *addr, ppc_inst_t instr)
 	return err;
 }
 
-int patch_instruction(u32 *addr, ppc_inst_t instr)
+static int do_patch_memory(void *addr, unsigned long val, bool is_dword)
 {
 	int err;
 	unsigned long flags;
@@ -186,15 +193,47 @@ int patch_instruction(u32 *addr, ppc_inst_t instr)
 	 */
 	if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) ||
 	    !static_branch_likely(&poking_init_done))
-		return raw_patch_instruction(addr, instr);
+		return __patch_memory(addr, val, addr, is_dword);
 
 	local_irq_save(flags);
-	err = __do_patch_instruction(addr, instr);
+	err = __do_patch_memory(addr, val, is_dword);
 	local_irq_restore(flags);
 
 	return err;
 }
-NOKPROBE_SYMBOL(patch_instruction);
+
+#ifdef CONFIG_PPC64
+
+int patch_uint(void *addr, unsigned int val)
+{
+	return do_patch_memory(addr, val, false);
+}
+NOKPROBE_SYMBOL(patch_uint)
+
+int patch_ulong(void *addr, unsigned long val)
+{
+	return do_patch_memory(addr, val, true);
+}
+NOKPROBE_SYMBOL(patch_ulong)
+
+int patch_instruction(u32 *addr, ppc_inst_t instr)
+{
+	if (ppc_inst_prefixed(instr))
+		return patch_ulong(addr, ppc_inst_as_ulong(instr));
+	else
+		return patch_uint(addr, ppc_inst_val(instr));
+}
+NOKPROBE_SYMBOL(patch_instruction)
+
+#else
+
+noinline int patch_uint(void *addr, unsigned int val)
+{
+	return do_patch_memory(addr, val, false);
+}
+NOKPROBE_SYMBOL(patch_uint)
+
+#endif
 
 int patch_branch(u32 *addr, unsigned long target, int flags)
 {
Adds a generic text patching mechanism for patches of size int or long
bytes.

The patch_instruction function is reimplemented in terms of this more
generic function. This generic implementation allows patching of
arbitrary long data, such as pointers on 64-bit.

On 32-bit patch_uint is marked noinline to prevent a mis-optimisation.
Without noinline, inside patch_branch the compiler may inline all the
way to do_patch_memory, preventing the compiler from inlining
do_patch_memory into patch_uint. This would needlessly force patch_uint
to be a branch to do_patch_memory.

Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>
---
 arch/powerpc/include/asm/code-patching.h | 29 ++++++++++
 arch/powerpc/lib/code-patching.c         | 73 ++++++++++++++++++------
 2 files changed, 85 insertions(+), 17 deletions(-)
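To illustrate the intended use of the new interface (a hypothetical caller, not part of this patch — 'func_slot' and 'set_call_target' are invented names): the static-call support this series builds toward could update a 64-bit call-target slot as plain data:

/* Hypothetical example. A static unsigned long is naturally aligned, so
 * the patched value cannot cross a cacheline, satisfying the requirement
 * documented in the header comment above.
 */
static unsigned long func_slot __aligned(sizeof(unsigned long));

static int set_call_target(void *target)
{
	/* patch_ulong() remaps the page writable (under STRICT_KERNEL_RWX)
	 * and performs the dcbst/icbi flush sequence itself. */
	return patch_ulong(&func_slot, (unsigned long)target);
}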