Message ID | 20230601101257.530867-13-rppt@kernel.org
---|---
State | New
Series | mm: jit/text allocator
On Thu, Jun 01, 2023 at 01:12:56PM +0300, Mike Rapoport wrote:

> +static void __init_or_module do_text_poke(void *addr, const void *opcode, size_t len)
> +{
> +	if (system_state < SYSTEM_RUNNING) {
> +		text_poke_early(addr, opcode, len);
> +	} else {
> +		mutex_lock(&text_mutex);
> +		text_poke(addr, opcode, len);
> +		mutex_unlock(&text_mutex);
> +	}
> +}

So I don't much like do_text_poke(); why?

> diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
> index aa99536b824c..d50595f2c1a6 100644
> --- a/arch/x86/kernel/ftrace.c
> +++ b/arch/x86/kernel/ftrace.c
> @@ -118,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code,
>  		return ret;
>  
>  	/* replace the text with the new text */
> -	if (ftrace_poke_late)
> +	if (ftrace_poke_late) {
>  		text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
> -	else
> -		text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
> +	} else {
> +		mutex_lock(&text_mutex);
> +		text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE);
> +		mutex_unlock(&text_mutex);
> +	}
>  	return 0;
>  }

And in the above case it's actively wrong for losing the _queue() thing.
On Thu, Jun 01, 2023 at 12:30:50PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 01, 2023 at 01:12:56PM +0300, Mike Rapoport wrote:
> 
> > +static void __init_or_module do_text_poke(void *addr, const void *opcode, size_t len)
> > +{
> > +	if (system_state < SYSTEM_RUNNING) {
> > +		text_poke_early(addr, opcode, len);
> > +	} else {
> > +		mutex_lock(&text_mutex);
> > +		text_poke(addr, opcode, len);
> > +		mutex_unlock(&text_mutex);
> > +	}
> > +}
> 
> So I don't much like do_text_poke(); why?

I believe the idea was to keep memcpy for early boot before the kernel
image is protected without going and adding if (is_module_text_address())
all over the place.

I think this can be used instead without updating all the call sites of
text_poke_early():

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 91057de8e6bc..f994e63e9903 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1458,7 +1458,7 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
 		 * code cannot be running and speculative code-fetches are
 		 * prevented. Just change the code.
 		 */
-		memcpy(addr, opcode, len);
+		text_poke_copy(addr, opcode, len);
 	} else {
 		local_irq_save(flags);
 		memcpy(addr, opcode, len);

> > diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
> > index aa99536b824c..d50595f2c1a6 100644
> > --- a/arch/x86/kernel/ftrace.c
> > +++ b/arch/x86/kernel/ftrace.c
> > @@ -118,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code,
> >  		return ret;
> >  
> >  	/* replace the text with the new text */
> > -	if (ftrace_poke_late)
> > +	if (ftrace_poke_late) {
> >  		text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
> > -	else
> > -		text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
> > +	} else {
> > +		mutex_lock(&text_mutex);
> > +		text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE);
> > +		mutex_unlock(&text_mutex);
> > +	}
> >  	return 0;
> >  }
> 
> And in the above case it's actively wrong for losing the _queue()
> thing.
On Thu, 2023-06-01 at 13:12 +0300, Mike Rapoport wrote:
>  /*
>   * Are we looking at a near JMP with a 1 or 4-byte displacement.
> @@ -331,7 +344,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
>  
>  		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
>  
> -		text_poke_early(instr, insn_buff, insn_buff_sz);
> +		do_text_poke(instr, insn_buff, insn_buff_sz);
>  
>  next:
>  		optimize_nops(instr, a->instrlen);
> @@ -564,7 +577,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
>  		optimize_nops(bytes, len);
>  		DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
>  		DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
> -		text_poke_early(addr, bytes, len);
> +		do_text_poke(addr, bytes, len);
>  	}
>  	}
>  }
> @@ -638,7 +651,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end)
>  		if (len == insn.length) {
>  			DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr);
>  			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
> -			text_poke_early(addr, bytes, len);
> +			do_text_poke(addr, bytes, len);
>  		}
>  	}
>  }
> @@ -674,7 +687,7 @@ static void poison_endbr(void *addr, bool warn)
>  	 */
>  	DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
>  	DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
> -	text_poke_early(addr, &poison, 4);
> +	do_text_poke(addr, &poison, 4);
>  }
>  
>  /*
> @@ -869,7 +882,7 @@ static int cfi_disable_callers(s32 *start, s32 *end)
>  		if (!hash) /* nocfi callers */
>  			continue;
>  
> -		text_poke_early(addr, jmp, 2);
> +		do_text_poke(addr, jmp, 2);
>  	}
>  
>  	return 0;
> @@ -892,7 +905,7 @@ static int cfi_enable_callers(s32 *start, s32 *end)
>  		if (!hash) /* nocfi callers */
>  			continue;
>  
> -		text_poke_early(addr, mov, 2);
> +		do_text_poke(addr, mov, 2);
>  	}
>  
>  	return 0;
> @@ -913,7 +926,7 @@ static int cfi_rand_preamble(s32 *start, s32 *end)
>  			return -EINVAL;
>  
>  		hash = cfi_rehash(hash);
> -		text_poke_early(addr + 1, &hash, 4);
> +		do_text_poke(addr + 1, &hash, 4);
>  	}
>  
>  	return 0;
> @@ -932,9 +945,9 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end)
>  			  addr, addr, 5, addr))
>  			return -EINVAL;
>  
> -		text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
> +		do_text_poke(addr, fineibt_preamble_start, fineibt_preamble_size);
>  		WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678);
> -		text_poke_early(addr + fineibt_preamble_hash, &hash, 4);
> +		do_text_poke(addr + fineibt_preamble_hash, &hash, 4);
>  	}

It is just a local flush, but I wonder how much text_poke()ing is too
much. A lot of them are even inside loops. Can't it do the batch version
at least?

The other thing, and maybe this is in the paranoia category, but it's
probably at least worth noting. Before, the modules were not made
executable until all of the code was finalized. Now they are made
executable in an intermediate state and then patched later. It might
weaken the CFI stuff, but also it just kind of seems a bit unbounded
for dealing with executable code.

Preparing the modules in a separate RW mapping, and then text_poke()ing
the whole thing in when you are done would resolve both of these.
On Thu, Jun 01, 2023 at 12:30:50PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 01, 2023 at 01:12:56PM +0300, Mike Rapoport wrote:
> 
> > +static void __init_or_module do_text_poke(void *addr, const void *opcode, size_t len)
> > +{
> > +	if (system_state < SYSTEM_RUNNING) {
> > +		text_poke_early(addr, opcode, len);
> > +	} else {
> > +		mutex_lock(&text_mutex);
> > +		text_poke(addr, opcode, len);
> > +		mutex_unlock(&text_mutex);
> > +	}
> > +}
> 
> So I don't much like do_text_poke(); why?

Could you share why? I think the implementation sucks but conceptually
it's the right idea - create a new temporary mapping to avoid the need
for RWX mappings.
On Thu, Jun 01, 2023 at 04:54:27PM +0000, Edgecombe, Rick P wrote:
> It is just a local flush, but I wonder how much text_poke()ing is too
> much. A lot of them are even inside loops. Can't it do the batch version
> at least?
> 
> The other thing, and maybe this is in the paranoia category, but it's
> probably at least worth noting. Before, the modules were not made
> executable until all of the code was finalized. Now they are made
> executable in an intermediate state and then patched later. It might
> weaken the CFI stuff, but also it just kind of seems a bit unbounded
> for dealing with executable code.

I believe bpf starts out by initializing new executable memory with
illegal opcodes, maybe we should steal that and make it standard.

> Preparing the modules in a separate RW mapping, and then text_poke()ing
> the whole thing in when you are done would resolve both of these.

text_poke() _does_ create a separate RW mapping.

The thing that sucks about text_poke() is that it always does a full TLB
flush, and AFAICT that's not remotely needed. What it really wants to be
doing is conceptually just

	kmap_local()
	memcpy()
	kunmap_local()
	flush_icache();

...except that kmap_local() won't actually create a new mapping on
non-highmem architectures, so text_poke() open codes it.
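Written out with the real highmem API, the conceptual sequence above might look like the minimal sketch below. The helper name is made up, it assumes the patched range stays within one page, and - exactly as noted above - on !CONFIG_HIGHMEM architectures kmap_local_page() simply returns the direct-map address, so this does not actually give you a separate writable alias there; that is why text_poke() open codes its own temporary mapping.

#include <linux/highmem.h>	/* kmap_local_page(), kunmap_local() */
#include <linux/string.h>
#include <asm/cacheflush.h>	/* flush_icache_range() (a no-op on x86) */

/* hypothetical helper: poke @len bytes within a single page */
static void poke_via_local_map(struct page *page, unsigned long offset,
			       const void *opcode, size_t len)
{
	void *vaddr = kmap_local_page(page);	/* temporary, CPU-local mapping */

	memcpy(vaddr + offset, opcode, len);	/* write through the alias */
	kunmap_local(vaddr);			/* tear the alias down */

	/* make the new instructions visible to the instruction fetcher */
	flush_icache_range((unsigned long)page_address(page) + offset,
			   (unsigned long)page_address(page) + offset + len);
}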
On Thu, 2023-06-01 at 14:00 -0400, Kent Overstreet wrote:
> On Thu, Jun 01, 2023 at 04:54:27PM +0000, Edgecombe, Rick P wrote:
> > It is just a local flush, but I wonder how much text_poke()ing is too
> > much. A lot of them are even inside loops. Can't it do the batch version
> > at least?
> > 
> > The other thing, and maybe this is in the paranoia category, but it's
> > probably at least worth noting. Before, the modules were not made
> > executable until all of the code was finalized. Now they are made
> > executable in an intermediate state and then patched later. It might
> > weaken the CFI stuff, but also it just kind of seems a bit unbounded
> > for dealing with executable code.
> 
> I believe bpf starts out by initializing new executable memory with
> illegal opcodes, maybe we should steal that and make it standard.

I was thinking of modules which have a ton of alternatives, errata
fixes, etc applied to them after the initial sections are written to
the to-be-executable mapping. I thought this had zeroed pages to start,
which seems ok.

> 
> > Preparing the modules in a separate RW mapping, and then text_poke()ing
> > the whole thing in when you are done would resolve both of these.
> 
> text_poke() _does_ create a separate RW mapping.

Sorry, I meant a separate RW allocation.

> 
> The thing that sucks about text_poke() is that it always does a full TLB
> flush, and AFAICT that's not remotely needed. What it really wants to be
> doing is conceptually just
> 
> 	kmap_local()
> 	memcpy()
> 	kunmap_local()
> 	flush_icache();
> 
> ...except that kmap_local() won't actually create a new mapping on
> non-highmem architectures, so text_poke() open codes it.

Text poke creates only a local CPU RW mapping. It's more secure because
other threads can't write to it.

It also only needs to flush the local core when it's done since it's
not using a shared MM.

It used to use the fixmap, which is similar to what you are describing,
I think.
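For context, the fixmap-based variant mentioned above - the pre-v5.1 text_poke(), before the temporary-mm rework - looked roughly like the sketch below. This is paraphrased from memory and condensed to the single-page case; FIX_TEXT_POKE0/1 no longer exist in current kernels. It shows the two properties discussed here: the write goes through a dedicated RW alias, and only the local TLB is flushed afterwards.

/* roughly the old, fixmap-based text_poke(); historical sketch only */
static void old_style_text_poke(void *addr, const void *opcode, size_t len)
{
	struct page *page = core_kernel_text((unsigned long)addr) ?
				virt_to_page(addr) : vmalloc_to_page(addr);
	unsigned long flags;
	char *vaddr;

	local_irq_save(flags);
	set_fixmap(FIX_TEXT_POKE0, page_to_phys(page));	/* point the RW alias at the page */
	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
	memcpy(vaddr + offset_in_page(addr), opcode, len);
	clear_fixmap(FIX_TEXT_POKE0);
	local_flush_tlb();	/* local flush only, no IPIs */
	sync_core();
	local_irq_restore(flags);
}

The fixmap slot lives in the shared kernel page tables, so the alias was briefly visible to every CPU; the temporary-mm version that replaced it keeps the alias visible to the poking CPU only, which is the "more secure" part referred to above.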
On Thu, Jun 01, 2023 at 06:13:44PM +0000, Edgecombe, Rick P wrote:
> > text_poke() _does_ create a separate RW mapping.
> 
> Sorry, I meant a separate RW allocation.

Ah yes, that makes sense

> > 
> > The thing that sucks about text_poke() is that it always does a full TLB
> > flush, and AFAICT that's not remotely needed. What it really wants to be
> > doing is conceptually just
> > 
> > 	kmap_local()
> > 	memcpy()
> > 	kunmap_local()
> > 	flush_icache();
> > 
> > ...except that kmap_local() won't actually create a new mapping on
> > non-highmem architectures, so text_poke() open codes it.
> 
> Text poke creates only a local CPU RW mapping. It's more secure because
> other threads can't write to it.

*nod*, same as kmap_local

> It also only needs to flush the local core when it's done since it's
> not using a shared MM.

Ahh! Thanks for that; perhaps the comment in text_poke() about IPIs
could be a bit clearer.

What is it (if anything) you don't like about text_poke() then? It looks
like it's doing broadly similar things to kmap_local(), so should be
in the same ballpark from a performance POV?
On Thu, 2023-06-01 at 14:38 -0400, Kent Overstreet wrote:
> On Thu, Jun 01, 2023 at 06:13:44PM +0000, Edgecombe, Rick P wrote:
> > > text_poke() _does_ create a separate RW mapping.
> > 
> > Sorry, I meant a separate RW allocation.
> 
> Ah yes, that makes sense
> 
> > > 
> > > The thing that sucks about text_poke() is that it always does a full TLB
> > > flush, and AFAICT that's not remotely needed. What it really wants to be
> > > doing is conceptually just
> > > 
> > > 	kmap_local()
> > > 	memcpy()
> > > 	kunmap_local()
> > > 	flush_icache();
> > > 
> > > ...except that kmap_local() won't actually create a new mapping on
> > > non-highmem architectures, so text_poke() open codes it.
> > 
> > Text poke creates only a local CPU RW mapping. It's more secure because
> > other threads can't write to it.
> 
> *nod*, same as kmap_local

It's only used and flushed locally, but it is accessible to all CPUs,
right?

> 
> > It also only needs to flush the local core when it's done since it's
> > not using a shared MM.
> 
> Ahh! Thanks for that; perhaps the comment in text_poke() about IPIs
> could be a bit clearer.
> 
> What is it (if anything) you don't like about text_poke() then? It looks
> like it's doing broadly similar things to kmap_local(), so should be
> in the same ballpark from a performance POV?

The way text_poke() is used here, it is creating a new writable alias
and flushing it for *each* write to the module (like for each write of
an individual relocation, etc). I was just thinking it might warrant
some batching or something.
On Thu, Jun 1, 2023 at 3:15 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> From: Song Liu <song@kernel.org>
>
> Replace direct memory writes to memory allocated for code with text poking
> to allow allocation of executable memory as ROX.
>
> The only exception is arch_prepare_bpf_trampoline() that cannot jit
> directly into module memory yet, so it uses set_memory calls to
> unprotect the memory before writing to it and to protect memory in the
> end.
>
> Signed-off-by: Song Liu <song@kernel.org>
> Co-developed-by: Mike Rapoport (IBM) <rppt@kernel.org>
> Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
> ---
>  arch/x86/kernel/alternative.c | 43 +++++++++++++++++++++------------
>  arch/x86/kernel/ftrace.c      | 41 +++++++++++++++++++++------------
>  arch/x86/kernel/module.c      | 24 +++++--------------
>  arch/x86/kernel/static_call.c | 10 ++++----
>  arch/x86/kernel/unwind_orc.c  | 13 +++++++----
>  arch/x86/net/bpf_jit_comp.c   | 22 +++++++++++++-----

We need the following in this patch (or before this patch). Otherwise,
the system will crash at the VIRTUAL_BUG_ON() in vmalloc_to_page().

Thanks,
Song

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index bf954d2721c1..4efa8a795ebc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1084,7 +1084,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 		return NULL;
 	}
 
-	*rw_header = kvmalloc(size, GFP_KERNEL);
+	*rw_header = kvzalloc(size, GFP_KERNEL);
 	if (!*rw_header) {
 		bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
 		bpf_prog_pack_free(ro_header);
@@ -1092,8 +1092,6 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
 		return NULL;
 	}
 
-	/* Fill space with illegal/arch-dep instructions. */
-	bpf_fill_ill_insns(*rw_header, size);
 	(*rw_header)->size = size;
 	hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
> On Jun 1, 2023, at 1:50 PM, Edgecombe, Rick P <rick.p.edgecombe@intel.com> wrote:
> 
> On Thu, 2023-06-01 at 14:38 -0400, Kent Overstreet wrote:
>> On Thu, Jun 01, 2023 at 06:13:44PM +0000, Edgecombe, Rick P wrote:
>>>> text_poke() _does_ create a separate RW mapping.
>>> 
>>> Sorry, I meant a separate RW allocation.
>> 
>> Ah yes, that makes sense
>> 
>>>> 
>>>> The thing that sucks about text_poke() is that it always does a full TLB
>>>> flush, and AFAICT that's not remotely needed. What it really wants to be
>>>> doing is conceptually just
>>>> 
>>>> 	kmap_local()
>>>> 	memcpy()
>>>> 	kunmap_local()
>>>> 	flush_icache();
>>>> 
>>>> ...except that kmap_local() won't actually create a new mapping on
>>>> non-highmem architectures, so text_poke() open codes it.
>>> 
>>> Text poke creates only a local CPU RW mapping. It's more secure because
>>> other threads can't write to it.
>> 
>> *nod*, same as kmap_local
> 
> It's only used and flushed locally, but it is accessible to all CPUs,
> right?
> 
>> 
>>> It also only needs to flush the local core when it's done since it's
>>> not using a shared MM.
>> 
>> Ahh! Thanks for that; perhaps the comment in text_poke() about IPIs
>> could be a bit clearer.
>> 
>> What is it (if anything) you don't like about text_poke() then? It looks
>> like it's doing broadly similar things to kmap_local(), so should be
>> in the same ballpark from a performance POV?
> 
> The way text_poke() is used here, it is creating a new writable alias
> and flushing it for *each* write to the module (like for each write of
> an individual relocation, etc). I was just thinking it might warrant
> some batching or something.

I am not advocating to do so, but if you want to have many efficient
writes, perhaps you can just disable CR0.WP. Just saying that if you
are about to write all over the memory, text_poke() does not provide
too much security for the poking thread.
On Thu, Jun 1, 2023 at 4:07 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> On Thu, Jun 01, 2023 at 12:30:50PM +0200, Peter Zijlstra wrote:
> > On Thu, Jun 01, 2023 at 01:12:56PM +0300, Mike Rapoport wrote:
> >
> > > +static void __init_or_module do_text_poke(void *addr, const void *opcode, size_t len)
> > > +{
> > > +	if (system_state < SYSTEM_RUNNING) {
> > > +		text_poke_early(addr, opcode, len);
> > > +	} else {
> > > +		mutex_lock(&text_mutex);
> > > +		text_poke(addr, opcode, len);
> > > +		mutex_unlock(&text_mutex);
> > > +	}
> > > +}
> >
> > So I don't much like do_text_poke(); why?
>
> I believe the idea was to keep memcpy for early boot before the kernel
> image is protected without going and adding if (is_module_text_address())
> all over the place.
>
> I think this can be used instead without updating all the call sites of
> text_poke_early():
>
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 91057de8e6bc..f994e63e9903 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -1458,7 +1458,7 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
>  		 * code cannot be running and speculative code-fetches are
>  		 * prevented. Just change the code.
>  		 */
> -		memcpy(addr, opcode, len);
> +		text_poke_copy(addr, opcode, len);
>  	} else {
>  		local_irq_save(flags);
>  		memcpy(addr, opcode, len);
>

This alone doesn't work, as text_poke_early() is called before addr is
added to the list of module texts. So we still use memcpy() here.

Thanks,
Song
On Thu, Jun 01, 2023 at 08:50:39PM +0000, Edgecombe, Rick P wrote:
> > Ahh! Thanks for that; perhaps the comment in text_poke() about IPIs
> > could be a bit clearer.
> > 
> > What is it (if anything) you don't like about text_poke() then? It looks
> > like it's doing broadly similar things to kmap_local(), so should be
> > in the same ballpark from a performance POV?
> 
> The way text_poke() is used here, it is creating a new writable alias
> and flushing it for *each* write to the module (like for each write of
> an individual relocation, etc). I was just thinking it might warrant
> some batching or something.

Ah, I see. A kmap_local type interface might get us that kind of
batching, if it supported mapping compound pages - currently kmap_local
still only maps single pages, but with folios getting plumbed around I
assume someone will make it handle compound pages eventually.
On Thu, 1 Jun 2023 16:54:36 -0700
Nadav Amit <nadav.amit@gmail.com> wrote:

> > The way text_poke() is used here, it is creating a new writable alias
> > and flushing it for *each* write to the module (like for each write of
> > an individual relocation, etc). I was just thinking it might warrant
> > some batching or something.

Batching does exist, which is what the text_poke_queue() thing does.

-- Steve
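For reference, the queue-then-finish pattern referred to here looks roughly like the sketch below. The wrapper and the struct are made up; text_poke_queue()/text_poke_finish() are the real x86 interfaces, and they drive the batched INT3-breakpoint machinery that ftrace uses on live code, which the following reply argues is heavier than module loading actually needs. Each queued write is assumed to be a well-formed instruction, as in the ftrace hunk earlier in the thread.

#include <linux/memory.h>		/* text_mutex */
#include <asm/text-patching.h>		/* text_poke_queue(), text_poke_finish() */

/* hypothetical: one queued instruction rewrite */
struct queued_poke {
	void		*ip;
	const void	*new_insn;
	size_t		len;
};

static void poke_all_batched(struct queued_poke *q, int nr)
{
	int i;

	mutex_lock(&text_mutex);
	for (i = 0; i < nr; i++)
		/* queue each write instead of text_poke()ing it individually */
		text_poke_queue(q[i].ip, q[i].new_insn, q[i].len, NULL);
	text_poke_finish();		/* one batched flush/sync at the end */
	mutex_unlock(&text_mutex);
}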
On Sun, Jun 04, 2023 at 10:52:44PM -0400, Steven Rostedt wrote:
> On Thu, 1 Jun 2023 16:54:36 -0700
> Nadav Amit <nadav.amit@gmail.com> wrote:
> 
> > > The way text_poke() is used here, it is creating a new writable alias
> > > and flushing it for *each* write to the module (like for each write of
> > > an individual relocation, etc). I was just thinking it might warrant
> > > some batching or something.

> > I am not advocating to do so, but if you want to have many efficient
> > writes, perhaps you can just disable CR0.WP. Just saying that if you
> > are about to write all over the memory, text_poke() does not provide
> > too much security for the poking thread.

Heh, this is definitely an easier hack to implement :)

> Batching does exist, which is what the text_poke_queue() thing does.

For module loading text_poke_queue() will still be much slower than a bunch
of memset()s for no good reason because we don't need all the complexity of
text_poke_bp_batch() for module initialization because we are sure we are
not patching live code.

What we'd need here is a new batching mode that will create a writable
alias mapping at the beginning of apply_relocate_*() and module_finalize(),
then it will use memcpy() to that writable alias and will tear the mapping
down in the end.

Another option is to teach alternatives to update a writable copy rather
than do in place changes like Song suggested. My feeling is that it will be
a more intrusive change though.

> -- Steve
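One possible shape for that batching mode, sketched with existing primitives: the struct and function names below are made up, and vmap() of the module's vmalloc-backed pages is just one way to build the writable alias. Unlike text_poke()'s temporary mm, such an alias is visible to all CPUs for as long as it exists, which is part of the trade-off discussed above.

#include <linux/vmalloc.h>	/* vmap(), vunmap(), vmalloc_to_page() */
#include <linux/mm.h>
#include <linux/string.h>

/* hypothetical batching context for one apply_relocate_add()/module_finalize() run */
struct text_batch {
	void	*alias;		/* temporary shared RW alias of the ROX range */
	void	*dst;		/* start of the original (ROX) range */
};

static int text_batch_begin(struct text_batch *b, void *addr,
			    struct page **pages, unsigned int nr)
{
	unsigned long off = offset_in_page(addr);
	unsigned int i;

	/* module memory comes from vmalloc space, so vmalloc_to_page() works */
	for (i = 0; i < nr; i++)
		pages[i] = vmalloc_to_page((char *)addr - off + i * PAGE_SIZE);

	b->alias = vmap(pages, nr, VM_MAP, PAGE_KERNEL);	/* one RW alias */
	if (!b->alias)
		return -ENOMEM;
	b->dst = addr;
	return 0;
}

static void text_batch_write(struct text_batch *b, void *loc,
			     const void *src, size_t len)
{
	/* plain memcpy() through the alias instead of a text_poke() per write */
	memcpy(b->alias + offset_in_page(b->dst) + (loc - b->dst), src, len);
}

static void text_batch_end(struct text_batch *b)
{
	/* no IPIs needed here: the code is not live yet and the alias was never executable */
	vunmap(b->alias);
}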
On Mon, 2023-06-05 at 11:11 +0300, Mike Rapoport wrote:
> On Sun, Jun 04, 2023 at 10:52:44PM -0400, Steven Rostedt wrote:
> > On Thu, 1 Jun 2023 16:54:36 -0700
> > Nadav Amit <nadav.amit@gmail.com> wrote:
> > 
> > > > The way text_poke() is used here, it is creating a new writable alias
> > > > and flushing it for *each* write to the module (like for each write of
> > > > an individual relocation, etc). I was just thinking it might warrant
> > > > some batching or something.
> 
> > > I am not advocating to do so, but if you want to have many efficient
> > > writes, perhaps you can just disable CR0.WP. Just saying that if you
> > > are about to write all over the memory, text_poke() does not provide
> > > too much security for the poking thread.
> 
> Heh, this is definitely an easier hack to implement :)

I don't know the details, but previously there was some strong dislike
of CR0.WP toggling. And now there is also the problem of CET. Setting
CR0.WP=0 will #GP if CR4.CET is 1 (as it currently is for kernel IBT).
I guess you might get away with toggling them both in some controlled
situation, but it might be a lot easier to hack up than to be made
fully acceptable. It does sound much more efficient though.

> 
> > Batching does exist, which is what the text_poke_queue() thing does.
> 
> For module loading text_poke_queue() will still be much slower than a bunch
> of memset()s for no good reason because we don't need all the complexity of
> text_poke_bp_batch() for module initialization because we are sure we are
> not patching live code.
> 
> What we'd need here is a new batching mode that will create a writable
> alias mapping at the beginning of apply_relocate_*() and module_finalize(),
> then it will use memcpy() to that writable alias and will tear the mapping
> down in the end.

It's probably only a tiny bit faster than keeping a separate writable
allocation and text_poking it in at the end.

> 
> Another option is to teach alternatives to update a writable copy rather
> than do in place changes like Song suggested. My feeling is that it will be
> a more intrusive change though.

You mean keeping a separate RW allocation and then text_poking() the
whole thing in when you are done? That is what I was trying to say at
the beginning of this thread. The other benefit is you don't make the
intermediate loading states of the module executable.

I tried this technique previously [0], and I thought it was not too
bad. In most of the callers it looks similar to what you have in
do_text_poke(). Sometimes less, sometimes more. It might need
enlightening of some of the stuff currently using text_poke() during
module loading, like jump labels. So that bit is more intrusive, yeah.
But it sounds so much cleaner and well controlled. Did you have a
particular trouble spot in mind?

[0] https://lore.kernel.org/lkml/20201120202426.18009-5-rick.p.edgecombe@intel.com/
On Mon, Jun 05, 2023 at 04:10:21PM +0000, Edgecombe, Rick P wrote:
> On Mon, 2023-06-05 at 11:11 +0300, Mike Rapoport wrote:
> > On Sun, Jun 04, 2023 at 10:52:44PM -0400, Steven Rostedt wrote:
> > > On Thu, 1 Jun 2023 16:54:36 -0700
> > > Nadav Amit <nadav.amit@gmail.com> wrote:
> > > 
> > > > > The way text_poke() is used here, it is creating a new writable alias
> > > > > and flushing it for *each* write to the module (like for each write of
> > > > > an individual relocation, etc). I was just thinking it might warrant
> > > > > some batching or something.
> > 
> > > > I am not advocating to do so, but if you want to have many efficient
> > > > writes, perhaps you can just disable CR0.WP. Just saying that if you
> > > > are about to write all over the memory, text_poke() does not provide
> > > > too much security for the poking thread.
> > 
> > Heh, this is definitely an easier hack to implement :)
> 
> I don't know the details, but previously there was some strong dislike
> of CR0.WP toggling. And now there is also the problem of CET. Setting
> CR0.WP=0 will #GP if CR4.CET is 1 (as it currently is for kernel IBT).
> I guess you might get away with toggling them both in some controlled
> situation, but it might be a lot easier to hack up than to be made
> fully acceptable. It does sound much more efficient though.

I don't think we'd really want that, especially looking at

	WARN_ONCE(bits_missing, "CR0 WP bit went missing!?\n");

at native_write_cr0().

> > > Batching does exist, which is what the text_poke_queue() thing does.
> > 
> > For module loading text_poke_queue() will still be much slower than a bunch
> > of memset()s for no good reason because we don't need all the complexity of
> > text_poke_bp_batch() for module initialization because we are sure we are
> > not patching live code.
> > 
> > What we'd need here is a new batching mode that will create a writable
> > alias mapping at the beginning of apply_relocate_*() and module_finalize(),
> > then it will use memcpy() to that writable alias and will tear the mapping
> > down in the end.
> 
> It's probably only a tiny bit faster than keeping a separate writable
> allocation and text_poking it in at the end.

Right, but it still will be faster than text_poking every relocation.

> > Another option is to teach alternatives to update a writable copy rather
> > than do in place changes like Song suggested. My feeling is that it will be
> > a more intrusive change though.
> 
> You mean keeping a separate RW allocation and then text_poking() the
> whole thing in when you are done? That is what I was trying to say at
> the beginning of this thread. The other benefit is you don't make the
> intermediate loading states of the module executable.
> 
> I tried this technique previously [0], and I thought it was not too
> bad. In most of the callers it looks similar to what you have in
> do_text_poke(). Sometimes less, sometimes more. It might need
> enlightening of some of the stuff currently using text_poke() during
> module loading, like jump labels. So that bit is more intrusive, yeah.
> But it sounds so much cleaner and well controlled. Did you have a
> particular trouble spot in mind?

Nothing in particular, except the intrusive part. Besides the changes in
modules.c, we'd need to teach alternatives to deal with a writable copy.

> [0]
> https://lore.kernel.org/lkml/20201120202426.18009-5-rick.p.edgecombe@intel.com/
On Mon, 2023-06-05 at 23:42 +0300, Mike Rapoport wrote:
> > I tried this technique previously [0], and I thought it was not too
> > bad. In most of the callers it looks similar to what you have in
> > do_text_poke(). Sometimes less, sometimes more. It might need
> > enlightening of some of the stuff currently using text_poke() during
> > module loading, like jump labels. So that bit is more intrusive, yeah.
> > But it sounds so much cleaner and well controlled. Did you have a
> > particular trouble spot in mind?
> 
> Nothing in particular, except the intrusive part. Besides the changes in
> modules.c, we'd need to teach alternatives to deal with a writable copy.

I didn't think the alternatives piece looked too bad on the caller side
(if that's what you meant):

https://lore.kernel.org/lkml/20201120202426.18009-7-rick.p.edgecombe@intel.com/

The ugly part was in the (poorly named) module_adjust_writable_addr():

+static inline void *module_adjust_writable_addr(void *addr)
+{
+	unsigned long laddr = (unsigned long)addr;
+	struct module *mod;
+
+	mutex_lock(&module_mutex);
+	mod = __module_address(laddr);
+	if (!mod) {
+		mutex_unlock(&module_mutex);
+		return addr;
+	}
+	mutex_unlock(&module_mutex);
+	/* The module shouldn't be going away if someone is trying to write to it */
+
+	return (void *)perm_writable_addr(module_get_allocation(mod, laddr), laddr);
+}
+

It took module_mutex and looked up the module in order to find the
writable buffer from just the executable address. Basically all the
loading code external to modules had to go through that interface. But
now I'm wondering what I was thinking; it seems this could just be an
RCU read lock. That doesn't seem too bad...
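The RCU variant suggested above might look like the sketch below. It keeps the rest of the (hypothetical) module_get_allocation()/perm_writable_addr() interface from the linked RFC as-is, and it assumes that __module_address() is safe to call under rcu_read_lock() because the module list is RCU-protected, and that the module cannot be freed while its text is still being written - both of which would need to be confirmed before relying on them.

#include <linux/module.h>	/* __module_address(), struct module */
#include <linux/rcupdate.h>	/* rcu_read_lock()/rcu_read_unlock() */

static inline void *module_adjust_writable_addr(void *addr)
{
	unsigned long laddr = (unsigned long)addr;
	struct module *mod;

	rcu_read_lock();
	mod = __module_address(laddr);
	rcu_read_unlock();
	if (!mod)
		return addr;

	/* the module shouldn't be going away while its text is being written */
	return (void *)perm_writable_addr(module_get_allocation(mod, laddr), laddr);
}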
> On Jun 5, 2023, at 9:10 AM, Edgecombe, Rick P <rick.p.edgecombe@intel.com> wrote:
> 
> On Mon, 2023-06-05 at 11:11 +0300, Mike Rapoport wrote:
>> On Sun, Jun 04, 2023 at 10:52:44PM -0400, Steven Rostedt wrote:
>>> On Thu, 1 Jun 2023 16:54:36 -0700
>>> Nadav Amit <nadav.amit@gmail.com> wrote:
>>> 
>>>>> The way text_poke() is used here, it is creating a new writable alias
>>>>> and flushing it for *each* write to the module (like for each write of
>>>>> an individual relocation, etc). I was just thinking it might warrant
>>>>> some batching or something.
>> 
>>>> I am not advocating to do so, but if you want to have many efficient
>>>> writes, perhaps you can just disable CR0.WP. Just saying that if you
>>>> are about to write all over the memory, text_poke() does not provide
>>>> too much security for the poking thread.
>> 
>> Heh, this is definitely an easier hack to implement :)
> 
> I don't know the details, but previously there was some strong dislike
> of CR0.WP toggling. And now there is also the problem of CET. Setting
> CR0.WP=0 will #GP if CR4.CET is 1 (as it currently is for kernel IBT).
> I guess you might get away with toggling them both in some controlled
> situation, but it might be a lot easier to hack up than to be made
> fully acceptable. It does sound much more efficient though.

Thanks for highlighting this issue. I understand the limitations of
CR0.WP. There is also always the concern that without CET or another
control-flow integrity mechanism, someone would abuse (using ROP/JOP)
functions that clear CR0.WP…
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f615e0cb6d93..91057de8e6bc 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -18,6 +18,7 @@ #include <linux/mmu_context.h> #include <linux/bsearch.h> #include <linux/sync_core.h> +#include <linux/set_memory.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> @@ -76,6 +77,19 @@ do { \ } \ } while (0) +void text_poke_early(void *addr, const void *opcode, size_t len); + +static void __init_or_module do_text_poke(void *addr, const void *opcode, size_t len) +{ + if (system_state < SYSTEM_RUNNING) { + text_poke_early(addr, opcode, len); + } else { + mutex_lock(&text_mutex); + text_poke(addr, opcode, len); + mutex_unlock(&text_mutex); + } +} + static const unsigned char x86nops[] = { BYTES_NOP1, @@ -108,7 +122,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, x86_nops[noplen], noplen); + do_text_poke(insns, x86_nops[noplen], noplen); insns += noplen; len -= noplen; } @@ -120,7 +134,6 @@ extern s32 __cfi_sites[], __cfi_sites_end[]; extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -void text_poke_early(void *addr, const void *opcode, size_t len); /* * Are we looking at a near JMP with a 1 or 4-byte displacement. @@ -331,7 +344,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(instr, insn_buff, insn_buff_sz); + do_text_poke(instr, insn_buff, insn_buff_sz); next: optimize_nops(instr, a->instrlen); @@ -564,7 +577,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) optimize_nops(bytes, len); DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(addr, bytes, len); + do_text_poke(addr, bytes, len); } } } @@ -638,7 +651,7 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) if (len == insn.length) { DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(addr, bytes, len); + do_text_poke(addr, bytes, len); } } } @@ -674,7 +687,7 @@ static void poison_endbr(void *addr, bool warn) */ DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + do_text_poke(addr, &poison, 4); } /* @@ -869,7 +882,7 @@ static int cfi_disable_callers(s32 *start, s32 *end) if (!hash) /* nocfi callers */ continue; - text_poke_early(addr, jmp, 2); + do_text_poke(addr, jmp, 2); } return 0; @@ -892,7 +905,7 @@ static int cfi_enable_callers(s32 *start, s32 *end) if (!hash) /* nocfi callers */ continue; - text_poke_early(addr, mov, 2); + do_text_poke(addr, mov, 2); } return 0; @@ -913,7 +926,7 @@ static int cfi_rand_preamble(s32 *start, s32 *end) return -EINVAL; hash = cfi_rehash(hash); - text_poke_early(addr + 1, &hash, 4); + do_text_poke(addr + 1, &hash, 4); } return 0; @@ -932,9 +945,9 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) addr, addr, 5, addr)) return -EINVAL; - text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + do_text_poke(addr, fineibt_preamble_start, fineibt_preamble_size); WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); - 
text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + do_text_poke(addr + fineibt_preamble_hash, &hash, 4); } return 0; @@ -953,7 +966,7 @@ static int cfi_rand_callers(s32 *start, s32 *end) hash = decode_caller_hash(addr); if (hash) { hash = -cfi_rehash(hash); - text_poke_early(addr + 2, &hash, 4); + do_text_poke(addr + 2, &hash, 4); } } @@ -971,9 +984,9 @@ static int cfi_rewrite_callers(s32 *start, s32 *end) addr -= fineibt_caller_size; hash = decode_caller_hash(addr); if (hash) { - text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + do_text_poke(addr, fineibt_caller_start, fineibt_caller_size); WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); - text_poke_early(addr + fineibt_caller_hash, &hash, 4); + do_text_poke(addr + fineibt_caller_hash, &hash, 4); } /* rely on apply_retpolines() */ } @@ -1243,7 +1256,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, /* Pad the rest with nops */ add_nops(insn_buff + used, p->len - used); - text_poke_early(p->instr, insn_buff, p->len); + do_text_poke(p->instr, insn_buff, p->len); } } extern struct paravirt_patch_site __start_parainstructions[], diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index aa99536b824c..d50595f2c1a6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -118,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, return ret; /* replace the text with the new text */ - if (ftrace_poke_late) + if (ftrace_poke_late) { text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - else - text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); + } else { + mutex_lock(&text_mutex); + text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); + mutex_unlock(&text_mutex); + } return 0; } @@ -319,7 +322,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; - int ret; + void *ret; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { start_offset = (unsigned long)ftrace_regs_caller; @@ -350,15 +353,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); - if (WARN_ON(ret < 0)) + ret = text_poke_copy(trampoline, (void *)start_offset, size); + if (WARN_ON(!ret)) goto fail; ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else - memcpy(ip, retq, sizeof(retq)); + text_poke_copy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -366,8 +369,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); - if (ret < 0) + if (!text_poke_copy(ip, x86_nops[2], 2)) goto fail; } @@ -380,7 +382,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) */ ptr = (unsigned long *)(trampoline + size + RET_SIZE); - *ptr = (unsigned long)ops; + text_poke_copy(ptr, &ops, sizeof(unsigned long)); op_offset -= start_offset; memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); @@ -396,7 +398,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int 
*tramp_size) op_ptr.offset = offset; /* put in the new offset to the ftrace_ops */ - memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); /* put in the call to the function */ mutex_lock(&text_mutex); @@ -406,9 +408,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) * the depth accounting before the call already. */ dest = ftrace_ops_get_func(ops); - memcpy(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), - CALL_INSN_SIZE); + text_poke_copy_locked(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE, false); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ @@ -658,4 +660,15 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, } #endif +void ftrace_swap_func(void *a, void *b, int n) +{ + unsigned long t; + + WARN_ON_ONCE(n != sizeof(t)); + + t = *((unsigned long *)a); + text_poke_copy(a, b, sizeof(t)); + text_poke_copy(b, &t, sizeof(t)); +} + #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 94a00dc103cd..444bc76574b9 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -83,7 +83,6 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, unsigned int symindex, unsigned int relsec, struct module *me, - void *(*write)(void *dest, const void *src, size_t len), bool apply) { unsigned int i; @@ -151,14 +150,14 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(loc, &val, size); + text_poke(loc, &val, size); } else { if (memcmp(loc, &val, size)) { pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(loc, &zero, size); + text_poke(loc, &zero, size); } } return 0; @@ -179,22 +178,11 @@ static int write_relocate_add(Elf64_Shdr *sechdrs, bool apply) { int ret; - bool early = me->state == MODULE_STATE_UNFORMED; - void *(*write)(void *, const void *, size_t) = memcpy; - - if (!early) { - write = text_poke; - mutex_lock(&text_mutex); - } - - ret = __write_relocate_add(sechdrs, strtab, symindex, relsec, me, - write, apply); - - if (!early) { - text_poke_sync(); - mutex_unlock(&text_mutex); - } + mutex_lock(&text_mutex); + ret = __write_relocate_add(sechdrs, strtab, symindex, relsec, me, apply); + text_poke_sync(); + mutex_unlock(&text_mutex); return ret; } diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index b70670a98597..90aacef21dfa 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -51,7 +51,7 @@ asm (".global __static_call_return\n\t" ".size __static_call_return, . 
- __static_call_return \n\t"); static void __ref __static_call_transform(void *insn, enum insn_type type, - void *func, bool modinit) + void *func) { const void *emulate = NULL; int size = CALL_INSN_SIZE; @@ -105,7 +105,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, if (memcmp(insn, code, size) == 0) return; - if (system_state == SYSTEM_BOOTING || modinit) + if (system_state == SYSTEM_BOOTING) return text_poke_early(insn, code, size); text_poke_bp(insn, code, size, emulate); @@ -160,12 +160,12 @@ void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) if (tramp) { __static_call_validate(tramp, true, true); - __static_call_transform(tramp, __sc_insn(!func, true), func, false); + __static_call_transform(tramp, __sc_insn(!func, true), func); } if (IS_ENABLED(CONFIG_HAVE_STATIC_CALL_INLINE) && site) { __static_call_validate(site, tail, false); - __static_call_transform(site, __sc_insn(!func, tail), func, false); + __static_call_transform(site, __sc_insn(!func, tail), func); } mutex_unlock(&text_mutex); @@ -193,7 +193,7 @@ bool __static_call_fixup(void *tramp, u8 op, void *dest) mutex_lock(&text_mutex); if (op == RET_INSN_OPCODE || dest == &__x86_return_thunk) - __static_call_transform(tramp, RET, NULL, true); + __static_call_transform(tramp, RET, NULL); mutex_unlock(&text_mutex); return true; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 3ac50b7298d1..264188ec50c9 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -7,6 +7,7 @@ #include <asm/unwind.h> #include <asm/orc_types.h> #include <asm/orc_lookup.h> +#include <asm/text-patching.h> #define orc_warn(fmt, ...) \ printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__) @@ -222,18 +223,22 @@ static void orc_sort_swap(void *_a, void *_b, int size) struct orc_entry orc_tmp; int *a = _a, *b = _b, tmp; int delta = _b - _a; + int val; /* Swap the .orc_unwind_ip entries: */ tmp = *a; - *a = *b + delta; - *b = tmp - delta; + val = *b + delta; + text_poke_copy(a, &val, sizeof(val)); + val = tmp - delta; + text_poke_copy(b, &val, sizeof(val)); /* Swap the corresponding .orc_unwind entries: */ orc_a = cur_orc_table + (a - cur_orc_ip_table); orc_b = cur_orc_table + (b - cur_orc_ip_table); orc_tmp = *orc_a; - *orc_a = *orc_b; - *orc_b = orc_tmp; + + text_poke_copy(orc_a, orc_b, sizeof(*orc_b)); + text_poke_copy(orc_b, &orc_tmp, sizeof(orc_tmp)); } static int orc_sort_cmp(const void *_a, const void *_b) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 1056bbf55b17..bae267f0a257 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -226,7 +226,7 @@ static u8 simple_alu_opcodes[] = { static void jit_fill_hole(void *area, unsigned int size) { /* Fill whole space with INT3 instructions */ - memset(area, 0xcc, size); + text_poke_set(area, 0xcc, size); } int bpf_arch_text_invalidate(void *dst, size_t len) @@ -2202,6 +2202,9 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i orig_call += X86_PATCH_SIZE; } + set_memory_nx((unsigned long)image & PAGE_MASK, 1); + set_memory_rw((unsigned long)image & PAGE_MASK, 1); + prog = image; EMIT_ENDBR(); @@ -2238,20 +2241,24 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); if (emit_rsb_call(&prog, __bpf_tramp_enter, prog)) { ret = -EINVAL; - goto cleanup; + goto reprotect_memory; } } if (fentry->nr_links) if 
(invoke_bpf(m, &prog, fentry, regs_off, run_ctx_off, - flags & BPF_TRAMP_F_RET_FENTRY_RET)) - return -EINVAL; + flags & BPF_TRAMP_F_RET_FENTRY_RET)) { + ret = -EINVAL; + goto reprotect_memory; + } if (fmod_ret->nr_links) { branches = kcalloc(fmod_ret->nr_links, sizeof(u8 *), GFP_KERNEL); - if (!branches) - return -ENOMEM; + if (!branches) { + ret = -ENOMEM; + goto reprotect_memory; + } if (invoke_bpf_mod_ret(m, &prog, fmod_ret, regs_off, run_ctx_off, branches)) { @@ -2336,6 +2343,9 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i cleanup: kfree(branches); +reprotect_memory: + set_memory_rox((unsigned long)image & PAGE_MASK, 1); + return ret; }