Message ID | 20240807124103.85644-2-mpe@ellerman.id.au (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | [1/4] mm: Add optional close() to struct vm_special_mapping | expand |
On 07.08.24 14:41, Michael Ellerman wrote: > Add a close() callback to the VDSO special mapping to handle unmapping > of the VDSO. That will make it possible to remove the arch_unmap() hook > entirely in a subsequent patch. > > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> > --- Reviewed-by: David Hildenbrand <david@redhat.com>
On Wed, Aug 7, 2024 at 5:41 AM Michael Ellerman <mpe@ellerman.id.au> wrote: > > Add a close() callback to the VDSO special mapping to handle unmapping > of the VDSO. That will make it possible to remove the arch_unmap() hook > entirely in a subsequent patch. > > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> > --- > arch/powerpc/include/asm/mmu_context.h | 4 ---- > arch/powerpc/kernel/vdso.c | 17 +++++++++++++++++ > 2 files changed, 17 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h > index 37bffa0f7918..9b8c1555744e 100644 > --- a/arch/powerpc/include/asm/mmu_context.h > +++ b/arch/powerpc/include/asm/mmu_context.h > @@ -263,10 +263,6 @@ extern void arch_exit_mmap(struct mm_struct *mm); > static inline void arch_unmap(struct mm_struct *mm, > unsigned long start, unsigned long end) > { > - unsigned long vdso_base = (unsigned long)mm->context.vdso; > - > - if (start <= vdso_base && vdso_base < end) > - mm->context.vdso = NULL; > } > > #ifdef CONFIG_PPC_MEM_KEYS > diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c > index 7a2ff9010f17..220a76cae7c1 100644 > --- a/arch/powerpc/kernel/vdso.c > +++ b/arch/powerpc/kernel/vdso.c > @@ -81,6 +81,21 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str > return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); > } > > +static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struct *vma) > +{ > + struct mm_struct *mm = vma->vm_mm; > + > + /* > + * close() is called for munmap() but also for mremap(). In the mremap() > + * case the vdso pointer has already been updated by the mremap() hook > + * above, so it must not be set to NULL here. > + */ > + if (vma->vm_start != (unsigned long)mm->context.vdso) > + return; > + > + mm->context.vdso = NULL; > +} > + > static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, > struct vm_area_struct *vma, struct vm_fault *vmf); > > @@ -92,11 +107,13 @@ static struct vm_special_mapping vvar_spec __ro_after_init = { > static struct vm_special_mapping vdso32_spec __ro_after_init = { > .name = "[vdso]", > .mremap = vdso32_mremap, > + .close = vdso_close, IIUC, only CHECKPOINT_RESTORE requires this, and CHECKPOINT_RESTORE is in init/Kconfig, with default N Can we add #ifdef CONFIG_CHECKPOINT_RESTORE here ? Thanks Best regards, -Jeff > }; > > static struct vm_special_mapping vdso64_spec __ro_after_init = { > .name = "[vdso]", > .mremap = vdso64_mremap, > + .close = vdso_close, > }; > > #ifdef CONFIG_TIME_NS > -- > 2.45.2 >
* Jeff Xu <jeffxu@google.com> [240807 11:44]: > On Wed, Aug 7, 2024 at 5:41 AM Michael Ellerman <mpe@ellerman.id.au> wrote: > > > > Add a close() callback to the VDSO special mapping to handle unmapping > > of the VDSO. That will make it possible to remove the arch_unmap() hook > > entirely in a subsequent patch. > > > > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> > > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> > > --- > > arch/powerpc/include/asm/mmu_context.h | 4 ---- > > arch/powerpc/kernel/vdso.c | 17 +++++++++++++++++ > > 2 files changed, 17 insertions(+), 4 deletions(-) > > > > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h > > index 37bffa0f7918..9b8c1555744e 100644 > > --- a/arch/powerpc/include/asm/mmu_context.h > > +++ b/arch/powerpc/include/asm/mmu_context.h > > @@ -263,10 +263,6 @@ extern void arch_exit_mmap(struct mm_struct *mm); > > static inline void arch_unmap(struct mm_struct *mm, > > unsigned long start, unsigned long end) > > { > > - unsigned long vdso_base = (unsigned long)mm->context.vdso; > > - > > - if (start <= vdso_base && vdso_base < end) > > - mm->context.vdso = NULL; > > } > > > > #ifdef CONFIG_PPC_MEM_KEYS > > diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c > > index 7a2ff9010f17..220a76cae7c1 100644 > > --- a/arch/powerpc/kernel/vdso.c > > +++ b/arch/powerpc/kernel/vdso.c > > @@ -81,6 +81,21 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str > > return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); > > } > > > > +static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struct *vma) > > +{ > > + struct mm_struct *mm = vma->vm_mm; > > + > > + /* > > + * close() is called for munmap() but also for mremap(). In the mremap() > > + * case the vdso pointer has already been updated by the mremap() hook > > + * above, so it must not be set to NULL here. > > + */ > > + if (vma->vm_start != (unsigned long)mm->context.vdso) > > + return; > > + > > + mm->context.vdso = NULL; > > +} > > + > > static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, > > struct vm_area_struct *vma, struct vm_fault *vmf); > > > > @@ -92,11 +107,13 @@ static struct vm_special_mapping vvar_spec __ro_after_init = { > > static struct vm_special_mapping vdso32_spec __ro_after_init = { > > .name = "[vdso]", > > .mremap = vdso32_mremap, > > + .close = vdso_close, > IIUC, only CHECKPOINT_RESTORE requires this, and > CHECKPOINT_RESTORE is in init/Kconfig, with default N > > Can we add #ifdef CONFIG_CHECKPOINT_RESTORE here ? > No, these can be unmapped and it needs to be cleaned up. Valgrind is one application that is known to unmap the vdso and runs into issues on platforms that do not handle the removal correctly. Thanks, Liam
On Wed, Aug 7, 2024 at 8:56 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > * Jeff Xu <jeffxu@google.com> [240807 11:44]: > > On Wed, Aug 7, 2024 at 5:41 AM Michael Ellerman <mpe@ellerman.id.au> wrote: > > > > > > Add a close() callback to the VDSO special mapping to handle unmapping > > > of the VDSO. That will make it possible to remove the arch_unmap() hook > > > entirely in a subsequent patch. > > > > > > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> > > > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> > > > --- > > > arch/powerpc/include/asm/mmu_context.h | 4 ---- > > > arch/powerpc/kernel/vdso.c | 17 +++++++++++++++++ > > > 2 files changed, 17 insertions(+), 4 deletions(-) > > > > > > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h > > > index 37bffa0f7918..9b8c1555744e 100644 > > > --- a/arch/powerpc/include/asm/mmu_context.h > > > +++ b/arch/powerpc/include/asm/mmu_context.h > > > @@ -263,10 +263,6 @@ extern void arch_exit_mmap(struct mm_struct *mm); > > > static inline void arch_unmap(struct mm_struct *mm, > > > unsigned long start, unsigned long end) > > > { > > > - unsigned long vdso_base = (unsigned long)mm->context.vdso; > > > - > > > - if (start <= vdso_base && vdso_base < end) > > > - mm->context.vdso = NULL; > > > } > > > > > > #ifdef CONFIG_PPC_MEM_KEYS > > > diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c > > > index 7a2ff9010f17..220a76cae7c1 100644 > > > --- a/arch/powerpc/kernel/vdso.c > > > +++ b/arch/powerpc/kernel/vdso.c > > > @@ -81,6 +81,21 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str > > > return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); > > > } > > > > > > +static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struct *vma) > > > +{ > > > + struct mm_struct *mm = vma->vm_mm; > > > + > > > + /* > > > + * close() is called for munmap() but also for mremap(). In the mremap() > > > + * case the vdso pointer has already been updated by the mremap() hook > > > + * above, so it must not be set to NULL here. > > > + */ > > > + if (vma->vm_start != (unsigned long)mm->context.vdso) > > > + return; > > > + > > > + mm->context.vdso = NULL; > > > +} > > > + > > > static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, > > > struct vm_area_struct *vma, struct vm_fault *vmf); > > > > > > @@ -92,11 +107,13 @@ static struct vm_special_mapping vvar_spec __ro_after_init = { > > > static struct vm_special_mapping vdso32_spec __ro_after_init = { > > > .name = "[vdso]", > > > .mremap = vdso32_mremap, > > > + .close = vdso_close, > > IIUC, only CHECKPOINT_RESTORE requires this, and > > CHECKPOINT_RESTORE is in init/Kconfig, with default N > > > > Can we add #ifdef CONFIG_CHECKPOINT_RESTORE here ? > > > > No, these can be unmapped and it needs to be cleaned up. Valgrind is > one application that is known to unmap the vdso and runs into issues on > platforms that do not handle the removal correctly. > Maybe Valgrind needs that exactly for checkpoint restore ? [1] "CRIU fails to restore applications that have unmapped the vDSO segment. One such application is Valgrind." Usually when the kernel accepts new functionality, the patch needs to state the user case. The only user case I found for .mremap and .close so far is the CRIU case. [1] https://github.com/checkpoint-restore/criu/issues/488 Thanks -Jeff > Thanks, > Liam
* Jeff Xu <jeffxu@google.com> [240807 12:37]: > On Wed, Aug 7, 2024 at 8:56 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > * Jeff Xu <jeffxu@google.com> [240807 11:44]: > > > On Wed, Aug 7, 2024 at 5:41 AM Michael Ellerman <mpe@ellerman.id.au> wrote: > > > > > > > > Add a close() callback to the VDSO special mapping to handle unmapping > > > > of the VDSO. That will make it possible to remove the arch_unmap() hook > > > > entirely in a subsequent patch. > > > > > > > > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> > > > > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> > > > > --- > > > > arch/powerpc/include/asm/mmu_context.h | 4 ---- > > > > arch/powerpc/kernel/vdso.c | 17 +++++++++++++++++ > > > > 2 files changed, 17 insertions(+), 4 deletions(-) > > > > > > > > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h > > > > index 37bffa0f7918..9b8c1555744e 100644 > > > > --- a/arch/powerpc/include/asm/mmu_context.h > > > > +++ b/arch/powerpc/include/asm/mmu_context.h > > > > @@ -263,10 +263,6 @@ extern void arch_exit_mmap(struct mm_struct *mm); > > > > static inline void arch_unmap(struct mm_struct *mm, > > > > unsigned long start, unsigned long end) > > > > { > > > > - unsigned long vdso_base = (unsigned long)mm->context.vdso; > > > > - > > > > - if (start <= vdso_base && vdso_base < end) > > > > - mm->context.vdso = NULL; > > > > } > > > > > > > > #ifdef CONFIG_PPC_MEM_KEYS > > > > diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c > > > > index 7a2ff9010f17..220a76cae7c1 100644 > > > > --- a/arch/powerpc/kernel/vdso.c > > > > +++ b/arch/powerpc/kernel/vdso.c > > > > @@ -81,6 +81,21 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str > > > > return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); > > > > } > > > > > > > > +static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struct *vma) > > > > +{ > > > > + struct mm_struct *mm = vma->vm_mm; > > > > + > > > > + /* > > > > + * close() is called for munmap() but also for mremap(). In the mremap() > > > > + * case the vdso pointer has already been updated by the mremap() hook > > > > + * above, so it must not be set to NULL here. > > > > + */ > > > > + if (vma->vm_start != (unsigned long)mm->context.vdso) > > > > + return; > > > > + > > > > + mm->context.vdso = NULL; > > > > +} > > > > + > > > > static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, > > > > struct vm_area_struct *vma, struct vm_fault *vmf); > > > > > > > > @@ -92,11 +107,13 @@ static struct vm_special_mapping vvar_spec __ro_after_init = { > > > > static struct vm_special_mapping vdso32_spec __ro_after_init = { > > > > .name = "[vdso]", > > > > .mremap = vdso32_mremap, > > > > + .close = vdso_close, > > > IIUC, only CHECKPOINT_RESTORE requires this, and > > > CHECKPOINT_RESTORE is in init/Kconfig, with default N > > > > > > Can we add #ifdef CONFIG_CHECKPOINT_RESTORE here ? > > > > > > > No, these can be unmapped and it needs to be cleaned up. Valgrind is > > one application that is known to unmap the vdso and runs into issues on > > platforms that do not handle the removal correctly. > > > Maybe Valgrind needs that exactly for checkpoint restore ? [1] Maybe, and maybe there are 100 other applications unmapping the vdso for some other reason? > > "CRIU fails to restore applications that have unmapped the vDSO > segment. One such > application is Valgrind." > > Usually when the kernel accepts new functionality, the patch needs to > state the user case. This isn't new functionality, the arch_unmap() existed before and this is moving the functionality. We can't just disable it because we assume we know it's only used once. I had planned to do this sort of patch set in a follow up to my patch set [1], so I was deep into looking at this before the mseal() regression - which I expected to happen and have been advocating to avoid extra walks... This would be fixed by my patch set by reducing the walk count. > The only user case I found for .mremap and .close so far is the CRIU case. > In fact, this is broken on other archs for valgrind since they unmap the vdso and then crash [2]. There has been a fix in the works for a while, which I stepped in during the patch set [1], which can be seen here [3]. In the discussion, the issue with Valgrind is raised and a generic solution to solve for more than just ppc is discussed. The solution avoids crashing if vdso is unmapped and that seems like the sane direction of this work. We also have users unmapping vdsos here [4] too. Why would we leave a dangling pointer in the mm struct based on a compile flag? [1] https://lore.kernel.org/linux-mm/20240717200709.1552558-18-Liam.Howlett@oracle.com/ [2] https://lore.kernel.org/lkml/87imd5h5kb.fsf@mpe.ellerman.id.au/ [3] https://lore.kernel.org/all/20210611180242.711399-17-dima@arista.com/ [4] https://github.com/insanitybit/void-ship
On Wed, Aug 7, 2024 at 10:11 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > * Jeff Xu <jeffxu@google.com> [240807 12:37]: > > On Wed, Aug 7, 2024 at 8:56 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > > > * Jeff Xu <jeffxu@google.com> [240807 11:44]: > > > > On Wed, Aug 7, 2024 at 5:41 AM Michael Ellerman <mpe@ellerman.id.au> wrote: > > > > > > > > > > Add a close() callback to the VDSO special mapping to handle unmapping > > > > > of the VDSO. That will make it possible to remove the arch_unmap() hook > > > > > entirely in a subsequent patch. > > > > > > > > > > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> > > > > > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> > > > > > --- > > > > > arch/powerpc/include/asm/mmu_context.h | 4 ---- > > > > > arch/powerpc/kernel/vdso.c | 17 +++++++++++++++++ > > > > > 2 files changed, 17 insertions(+), 4 deletions(-) > > > > > > > > > > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h > > > > > index 37bffa0f7918..9b8c1555744e 100644 > > > > > --- a/arch/powerpc/include/asm/mmu_context.h > > > > > +++ b/arch/powerpc/include/asm/mmu_context.h > > > > > @@ -263,10 +263,6 @@ extern void arch_exit_mmap(struct mm_struct *mm); > > > > > static inline void arch_unmap(struct mm_struct *mm, > > > > > unsigned long start, unsigned long end) > > > > > { > > > > > - unsigned long vdso_base = (unsigned long)mm->context.vdso; > > > > > - > > > > > - if (start <= vdso_base && vdso_base < end) > > > > > - mm->context.vdso = NULL; > > > > > } > > > > > > > > > > #ifdef CONFIG_PPC_MEM_KEYS > > > > > diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c > > > > > index 7a2ff9010f17..220a76cae7c1 100644 > > > > > --- a/arch/powerpc/kernel/vdso.c > > > > > +++ b/arch/powerpc/kernel/vdso.c > > > > > @@ -81,6 +81,21 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str > > > > > return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); > > > > > } > > > > > > > > > > +static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struct *vma) > > > > > +{ > > > > > + struct mm_struct *mm = vma->vm_mm; > > > > > + > > > > > + /* > > > > > + * close() is called for munmap() but also for mremap(). In the mremap() > > > > > + * case the vdso pointer has already been updated by the mremap() hook > > > > > + * above, so it must not be set to NULL here. > > > > > + */ > > > > > + if (vma->vm_start != (unsigned long)mm->context.vdso) > > > > > + return; > > > > > + > > > > > + mm->context.vdso = NULL; > > > > > +} > > > > > + > > > > > static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, > > > > > struct vm_area_struct *vma, struct vm_fault *vmf); > > > > > > > > > > @@ -92,11 +107,13 @@ static struct vm_special_mapping vvar_spec __ro_after_init = { > > > > > static struct vm_special_mapping vdso32_spec __ro_after_init = { > > > > > .name = "[vdso]", > > > > > .mremap = vdso32_mremap, > > > > > + .close = vdso_close, > > > > IIUC, only CHECKPOINT_RESTORE requires this, and > > > > CHECKPOINT_RESTORE is in init/Kconfig, with default N > > > > > > > > Can we add #ifdef CONFIG_CHECKPOINT_RESTORE here ? > > > > > > > > > > No, these can be unmapped and it needs to be cleaned up. Valgrind is > > > one application that is known to unmap the vdso and runs into issues on > > > platforms that do not handle the removal correctly. > > > > > Maybe Valgrind needs that exactly for checkpoint restore ? [1] > > Maybe, and maybe there are 100 other applications unmapping the vdso for > some other reason? > When PPC added arch_munamp, it was for CRIU, wasn't it ? That was the original intention. Maybe there are more apps that have started unmapping vdso, it might be interesting to know those use cases before widely opening this without kernel config. > > > > "CRIU fails to restore applications that have unmapped the vDSO > > segment. One such > > application is Valgrind." > > > > Usually when the kernel accepts new functionality, the patch needs to > > state the user case. > > This isn't new functionality, the arch_unmap() existed before and this > is moving the functionality. We can't just disable it because we assume > we know it's only used once. > > I had planned to do this sort of patch set in a follow up to my patch > set [1], so I was deep into looking at this before the mseal() > regression - which I expected to happen and have been advocating to > avoid extra walks... This would be fixed by my patch set by reducing the > walk count. > I would rather leave mseal() in-loop check discussion to the other email thread :-) > > The only user case I found for .mremap and .close so far is the CRIU case. > > > > In fact, this is broken on other archs for valgrind since they unmap the > vdso and then crash [2]. There has been a fix in the works for a while, > which I stepped in during the patch set [1], which can be seen here > [3]. In the discussion, the issue with Valgrind is raised and a generic > solution to solve for more than just ppc is discussed. The solution > avoids crashing if vdso is unmapped and that seems like the sane > direction of this work. > > We also have users unmapping vdsos here [4] too. This is a strange code. If the use case about clock_gettime is legit, then kerne can provide an option to not unload vdso during elf loading. > > Why would we leave a dangling pointer in the mm struct based on a > compile flag? > > [1] https://lore.kernel.org/linux-mm/20240717200709.1552558-18-Liam.Howlett@oracle.com/ > [2] https://lore.kernel.org/lkml/87imd5h5kb.fsf@mpe.ellerman.id.au/ > [3] https://lore.kernel.org/all/20210611180242.711399-17-dima@arista.com/ > [4] https://github.com/insanitybit/void-ship >
* Jeff Xu <jeffxu@google.com> [240807 16:12]: > On Wed, Aug 7, 2024 at 10:11 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > * Jeff Xu <jeffxu@google.com> [240807 12:37]: > > > On Wed, Aug 7, 2024 at 8:56 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > > > > > * Jeff Xu <jeffxu@google.com> [240807 11:44]: > > > > > On Wed, Aug 7, 2024 at 5:41 AM Michael Ellerman <mpe@ellerman.id.au> wrote: > > > > > > > > > > > > Add a close() callback to the VDSO special mapping to handle unmapping > > > > > > of the VDSO. That will make it possible to remove the arch_unmap() hook > > > > > > entirely in a subsequent patch. > > > > > > > > > > > > Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> > > > > > > Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> > > > > > > --- > > > > > > arch/powerpc/include/asm/mmu_context.h | 4 ---- > > > > > > arch/powerpc/kernel/vdso.c | 17 +++++++++++++++++ > > > > > > 2 files changed, 17 insertions(+), 4 deletions(-) > > > > > > > > > > > > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h > > > > > > index 37bffa0f7918..9b8c1555744e 100644 > > > > > > --- a/arch/powerpc/include/asm/mmu_context.h > > > > > > +++ b/arch/powerpc/include/asm/mmu_context.h > > > > > > @@ -263,10 +263,6 @@ extern void arch_exit_mmap(struct mm_struct *mm); > > > > > > static inline void arch_unmap(struct mm_struct *mm, > > > > > > unsigned long start, unsigned long end) > > > > > > { > > > > > > - unsigned long vdso_base = (unsigned long)mm->context.vdso; > > > > > > - > > > > > > - if (start <= vdso_base && vdso_base < end) > > > > > > - mm->context.vdso = NULL; > > > > > > } > > > > > > > > > > > > #ifdef CONFIG_PPC_MEM_KEYS > > > > > > diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c > > > > > > index 7a2ff9010f17..220a76cae7c1 100644 > > > > > > --- a/arch/powerpc/kernel/vdso.c > > > > > > +++ b/arch/powerpc/kernel/vdso.c > > > > > > @@ -81,6 +81,21 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str > > > > > > return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); > > > > > > } > > > > > > > > > > > > +static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struct *vma) > > > > > > +{ > > > > > > + struct mm_struct *mm = vma->vm_mm; > > > > > > + > > > > > > + /* > > > > > > + * close() is called for munmap() but also for mremap(). In the mremap() > > > > > > + * case the vdso pointer has already been updated by the mremap() hook > > > > > > + * above, so it must not be set to NULL here. > > > > > > + */ > > > > > > + if (vma->vm_start != (unsigned long)mm->context.vdso) > > > > > > + return; > > > > > > + > > > > > > + mm->context.vdso = NULL; > > > > > > +} > > > > > > + > > > > > > static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, > > > > > > struct vm_area_struct *vma, struct vm_fault *vmf); > > > > > > > > > > > > @@ -92,11 +107,13 @@ static struct vm_special_mapping vvar_spec __ro_after_init = { > > > > > > static struct vm_special_mapping vdso32_spec __ro_after_init = { > > > > > > .name = "[vdso]", > > > > > > .mremap = vdso32_mremap, > > > > > > + .close = vdso_close, > > > > > IIUC, only CHECKPOINT_RESTORE requires this, and > > > > > CHECKPOINT_RESTORE is in init/Kconfig, with default N > > > > > > > > > > Can we add #ifdef CONFIG_CHECKPOINT_RESTORE here ? > > > > > > > > > > > > > No, these can be unmapped and it needs to be cleaned up. Valgrind is > > > > one application that is known to unmap the vdso and runs into issues on > > > > platforms that do not handle the removal correctly. > > > > > > > Maybe Valgrind needs that exactly for checkpoint restore ? [1] > > > > Maybe, and maybe there are 100 other applications unmapping the vdso for > > some other reason? > > > When PPC added arch_munamp, it was for CRIU, wasn't it ? That was the original > intention. > > Maybe there are more apps that have started unmapping vdso, it might > be interesting > to know those use cases before widely opening this without kernel config. > > > > > > > "CRIU fails to restore applications that have unmapped the vDSO > > > segment. One such > > > application is Valgrind." > > > > > > Usually when the kernel accepts new functionality, the patch needs to > > > state the user case. > > ... > > > The only user case I found for .mremap and .close so far is the CRIU case. > > > > > > > In fact, this is broken on other archs for valgrind since they unmap the > > vdso and then crash [2]. There has been a fix in the works for a while, > > which I stepped in during the patch set [1], which can be seen here > > [3]. In the discussion, the issue with Valgrind is raised and a generic > > solution to solve for more than just ppc is discussed. The solution > > avoids crashing if vdso is unmapped and that seems like the sane > > direction of this work. > > > > We also have users unmapping vdsos here [4] too. > This is a strange code. If the use case about clock_gettime is legit, then > kerne can provide an option to not unload vdso during elf loading. Yes, I would not want to do this - but there are people doing strange (to put it politely) things that we did not intend. > > > > > Why would we leave a dangling pointer in the mm struct based on a > > compile flag? Okay, I'm going to try one more time here. You are suggesting to have a conf flag to leave the vdso pointer unchanged when it is unmapped. Having the close behind the conf option will not prevent it from being unmapped or mapped over, so what you are suggesting is have a configuration option that leaves a pointer, mm->context.vdso, to be unsafe if it is unmapped if you disable checkpoint restore. This is also not what userspace sees today, so you are changing user visible behaviour based on a configuration option because you think, but are not sure, that checkpoint restore is the only user. Or did I miss something? Thanks, Liam > > > > [1] https://lore.kernel.org/linux-mm/20240717200709.1552558-18-Liam.Howlett@oracle.com/ > > [2] https://lore.kernel.org/lkml/87imd5h5kb.fsf@mpe.ellerman.id.au/ > > [3] https://lore.kernel.org/all/20210611180242.711399-17-dima@arista.com/ > > [4] https://github.com/insanitybit/void-ship > >
On Wed, 7 Aug 2024 at 16:20, Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > Okay, I'm going to try one more time here. You are suggesting to have a > conf flag to leave the vdso pointer unchanged when it is unmapped. > Having the close behind the conf option will not prevent it from being > unmapped or mapped over, so what you are suggesting is have a > configuration option that leaves a pointer, mm->context.vdso, to be > unsafe if it is unmapped if you disable checkpoint restore. We definitely do not want that kind of complexity. It makes the kernel just more complicated to have to deal with both cases. That said, I don't love how special powerpc is here. What we could do is to is - stop calling these things "special mappings", and just admit that it's for different vdso mappings and nothing else (for some odd reason arm and nios2 calls it a "kuser helper" rather than vdso, but it's the exact same thing) - don't do this whole indirect function pointer thing with mremap and close at all, and just do this all unapologetically and for all architectures in the generic VM layer together with "if (vma->vm_start == mm->context.vdso)" etc. that would get rid of the conceptual complexity of having different architectures doing different things (and the unnecessary overhead of having an indirect function pointer that just points to one single thing). But I think the current "clean up the existing mess" is probably the less invasive one over "make the existing mess be explicitly about vdso and avoid unnecessary per-architecture differences". If people want to, we can do the unification (and stop pretending the "special mappings" could be something else) later. Linus
On Wed, Aug 7, 2024 at 8:21 PM Linus Torvalds <torvalds@linux-foundation.org> wrote: > > On Wed, 7 Aug 2024 at 16:20, Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > Okay, I'm going to try one more time here. You are suggesting to have a > > conf flag to leave the vdso pointer unchanged when it is unmapped. > > Having the close behind the conf option will not prevent it from being > > unmapped or mapped over, so what you are suggesting is have a > > configuration option that leaves a pointer, mm->context.vdso, to be > > unsafe if it is unmapped if you disable checkpoint restore. > This is a new point that I didn't realize before, if we are going to handle unmap vdso safely, yes, this is a bugfix that should be applied everywhere for all arch, without CHECKPOINT_RESTORE config. Do we need to worry about mmap(fixed) ? which can have the same effect as mremap. > We definitely do not want that kind of complexity. It makes the kernel > just more complicated to have to deal with both cases. > > That said, I don't love how special powerpc is here. > > What we could do is to is > > - stop calling these things "special mappings", and just admit that > it's for different vdso mappings and nothing else (for some odd reason > arm and nios2 calls it a "kuser helper" rather than vdso, but it's the > exact same thing) > > - don't do this whole indirect function pointer thing with mremap and > close at all, and just do this all unapologetically and for all > architectures in the generic VM layer together with "if (vma->vm_start > == mm->context.vdso)" etc. > > that would get rid of the conceptual complexity of having different > architectures doing different things (and the unnecessary overhead of > having an indirect function pointer that just points to one single > thing). > > But I think the current "clean up the existing mess" is probably the > less invasive one over "make the existing mess be explicitly about > vdso and avoid unnecessary per-architecture differences". > > If people want to, we can do the unification (and stop pretending the > "special mappings" could be something else) later. > > Linus
On Wed, Aug 7, 2024 at 8:21 PM Linus Torvalds <torvalds@linux-foundation.org> wrote: > > On Wed, 7 Aug 2024 at 16:20, Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > Okay, I'm going to try one more time here. You are suggesting to have a > > conf flag to leave the vdso pointer unchanged when it is unmapped. > > Having the close behind the conf option will not prevent it from being > > unmapped or mapped over, so what you are suggesting is have a > > configuration option that leaves a pointer, mm->context.vdso, to be > > unsafe if it is unmapped if you disable checkpoint restore. > > We definitely do not want that kind of complexity. It makes the kernel > just more complicated to have to deal with both cases. > > That said, I don't love how special powerpc is here. > > What we could do is to is > > - stop calling these things "special mappings", and just admit that > it's for different vdso mappings and nothing else (for some odd reason > arm and nios2 calls it a "kuser helper" rather than vdso, but it's the > exact same thing) > > - don't do this whole indirect function pointer thing with mremap and > close at all, and just do this all unapologetically and for all > architectures in the generic VM layer together with "if (vma->vm_start > == mm->context.vdso)" etc. > > that would get rid of the conceptual complexity of having different > architectures doing different things (and the unnecessary overhead of > having an indirect function pointer that just points to one single > thing). > > But I think the current "clean up the existing mess" is probably the > less invasive one over "make the existing mess be explicitly about > vdso and avoid unnecessary per-architecture differences". > > If people want to, we can do the unification (and stop pretending the > "special mappings" could be something else) later. OK. Now this is clear to me (at last). Treating vdso mapping, (maybe all the special mappings) as normal mapping and handling mmap/mremap/munmap properly from userspace will indeed make things clear conceptually. I'm OK with doing this later since it is a big change. Thanks -Jeff > > Linus
* Linus Torvalds <torvalds@linux-foundation.org> [240807 23:21]: > On Wed, 7 Aug 2024 at 16:20, Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > ... > > That said, I don't love how special powerpc is here. I think more (all?) archs should be doing something like ppc when the vdso is removed. If someone removes the vdso, then the speed up provided should just go away and the function calls shouldn't try to use the quick look up and crash. I view this as another 'caching of a vma pointer' issue that isn't cleaned up when the vma goes away. > > What we could do is to is > > - stop calling these things "special mappings", and just admit that > it's for different vdso mappings and nothing else (for some odd reason > arm and nios2 calls it a "kuser helper" rather than vdso, but it's the > exact same thing) But isn't it a special mapping? We don't allow for merging of the vma, the mlock handling has some odd behaviour with this vma, and there is the comment in mm/internal.h's mlock_vma_folio about ignoring these special vmas in a race. There is also some other 'special mapping' of vvars too? I haven't looked deeply into this yet as my investigation was preempted by vacation. > > - don't do this whole indirect function pointer thing with mremap and > close at all, and just do this all unapologetically and for all > architectures in the generic VM layer together with "if (vma->vm_start > == mm->context.vdso)" etc. > > that would get rid of the conceptual complexity of having different > architectures doing different things (and the unnecessary overhead of > having an indirect function pointer that just points to one single > thing). > > But I think the current "clean up the existing mess" is probably the > less invasive one over "make the existing mess be explicitly about > vdso and avoid unnecessary per-architecture differences". Okay, sure. > > If people want to, we can do the unification (and stop pretending the > "special mappings" could be something else) later. > I was planning to use the regular vma vm_ops to jump into the 'special unmap code' and then do all the checks there. IOW, keep the vma flagged as VM_SPECIAL and call the special_mapping_whatever() function as a regular vmops for, say, ->remove_vma() or ->mremap(). Keeping the flag means all the race avoidance/locking/merging works the same as it does today. What I am trying to avoid is another arch_get_unmapped_area() scenario where a bug exists for a decade in some versions of the cloned code. Thanks, Liam
* Jeff Xu <jeffxu@google.com> [240807 23:37]: > On Wed, Aug 7, 2024 at 8:21 PM Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > > > On Wed, 7 Aug 2024 at 16:20, Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > > > Okay, I'm going to try one more time here. You are suggesting to have a > > > conf flag to leave the vdso pointer unchanged when it is unmapped. > > > Having the close behind the conf option will not prevent it from being > > > unmapped or mapped over, so what you are suggesting is have a > > > configuration option that leaves a pointer, mm->context.vdso, to be > > > unsafe if it is unmapped if you disable checkpoint restore. > > > This is a new point that I didn't realize before, if we are going to handle > unmap vdso safely, yes, this is a bugfix that should be applied everywhere > for all arch, without CHECKPOINT_RESTORE config. > > Do we need to worry about mmap(fixed) ? which can have the same effect > as mremap. Yes, but it should be handled by vm_ops->close() when MAP_FIXED unmaps the vdso. Note that you cannot MAP_FIXED over half of the vma as the vm_ops->may_split() is special_mapping_split(), which just returns -EINVAL. Thanks, Liam
On Thu, Aug 8, 2024 at 11:08 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > * Jeff Xu <jeffxu@google.com> [240807 23:37]: > > On Wed, Aug 7, 2024 at 8:21 PM Linus Torvalds > > <torvalds@linux-foundation.org> wrote: > > > > > > On Wed, 7 Aug 2024 at 16:20, Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > > > > > Okay, I'm going to try one more time here. You are suggesting to have a > > > > conf flag to leave the vdso pointer unchanged when it is unmapped. > > > > Having the close behind the conf option will not prevent it from being > > > > unmapped or mapped over, so what you are suggesting is have a > > > > configuration option that leaves a pointer, mm->context.vdso, to be > > > > unsafe if it is unmapped if you disable checkpoint restore. > > > > > This is a new point that I didn't realize before, if we are going to handle > > unmap vdso safely, yes, this is a bugfix that should be applied everywhere > > for all arch, without CHECKPOINT_RESTORE config. > > > > Do we need to worry about mmap(fixed) ? which can have the same effect > > as mremap. > > Yes, but it should be handled by vm_ops->close() when MAP_FIXED unmaps > the vdso. Note that you cannot MAP_FIXED over half of the vma as the > vm_ops->may_split() is special_mapping_split(), which just returns > -EINVAL. > The may_split() failure logic is specific to vm_special_mapping, right ? Do we still need to keep vm_special_mapping struct , if we are going to treat special vma as normal vma ? > Thanks, > Liam
* Jeff Xu <jeffxu@google.com> [240808 14:37]: > On Thu, Aug 8, 2024 at 11:08 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > * Jeff Xu <jeffxu@google.com> [240807 23:37]: > > > On Wed, Aug 7, 2024 at 8:21 PM Linus Torvalds > > > <torvalds@linux-foundation.org> wrote: > > > > > > > > On Wed, 7 Aug 2024 at 16:20, Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > > > > > > > Okay, I'm going to try one more time here. You are suggesting to have a > > > > > conf flag to leave the vdso pointer unchanged when it is unmapped. > > > > > Having the close behind the conf option will not prevent it from being > > > > > unmapped or mapped over, so what you are suggesting is have a > > > > > configuration option that leaves a pointer, mm->context.vdso, to be > > > > > unsafe if it is unmapped if you disable checkpoint restore. > > > > > > > This is a new point that I didn't realize before, if we are going to handle > > > unmap vdso safely, yes, this is a bugfix that should be applied everywhere > > > for all arch, without CHECKPOINT_RESTORE config. > > > > > > Do we need to worry about mmap(fixed) ? which can have the same effect > > > as mremap. > > > > Yes, but it should be handled by vm_ops->close() when MAP_FIXED unmaps > > the vdso. Note that you cannot MAP_FIXED over half of the vma as the > > vm_ops->may_split() is special_mapping_split(), which just returns > > -EINVAL. > > > The may_split() failure logic is specific to vm_special_mapping, right ? Not really, it's just what exists for these vmas vm_ops struct. It's called on every vma for every split in __split_vma(). > > Do we still need to keep vm_special_mapping struct , if we are going to > treat special vma as normal vma ? No, just set the vm_ops may_split to something that returns -EINVAL.
On Thu, Aug 8, 2024 at 11:46 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > * Jeff Xu <jeffxu@google.com> [240808 14:37]: > > On Thu, Aug 8, 2024 at 11:08 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > > > * Jeff Xu <jeffxu@google.com> [240807 23:37]: > > > > On Wed, Aug 7, 2024 at 8:21 PM Linus Torvalds > > > > <torvalds@linux-foundation.org> wrote: > > > > > > > > > > On Wed, 7 Aug 2024 at 16:20, Liam R. Howlett <Liam.Howlett@oracle.com> wrote: > > > > > > > > > > > > Okay, I'm going to try one more time here. You are suggesting to have a > > > > > > conf flag to leave the vdso pointer unchanged when it is unmapped. > > > > > > Having the close behind the conf option will not prevent it from being > > > > > > unmapped or mapped over, so what you are suggesting is have a > > > > > > configuration option that leaves a pointer, mm->context.vdso, to be > > > > > > unsafe if it is unmapped if you disable checkpoint restore. > > > > > > > > > This is a new point that I didn't realize before, if we are going to handle > > > > unmap vdso safely, yes, this is a bugfix that should be applied everywhere > > > > for all arch, without CHECKPOINT_RESTORE config. > > > > > > > > Do we need to worry about mmap(fixed) ? which can have the same effect > > > > as mremap. > > > > > > Yes, but it should be handled by vm_ops->close() when MAP_FIXED unmaps > > > the vdso. Note that you cannot MAP_FIXED over half of the vma as the > > > vm_ops->may_split() is special_mapping_split(), which just returns > > > -EINVAL. > > > > > The may_split() failure logic is specific to vm_special_mapping, right ? > > Not really, it's just what exists for these vmas vm_ops struct. It's > called on every vma for every split in __split_vma(). > > > > > Do we still need to keep vm_special_mapping struct , if we are going to > > treat special vma as normal vma ? > > No, just set the vm_ops may_split to something that returns -EINVAL. > OK, that makes sense. Thanks
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 37bffa0f7918..9b8c1555744e 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -263,10 +263,6 @@ extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { - unsigned long vdso_base = (unsigned long)mm->context.vdso; - - if (start <= vdso_base && vdso_base < end) - mm->context.vdso = NULL; } #ifdef CONFIG_PPC_MEM_KEYS diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 7a2ff9010f17..220a76cae7c1 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -81,6 +81,21 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start); } +static void vdso_close(const struct vm_special_mapping *sm, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + + /* + * close() is called for munmap() but also for mremap(). In the mremap() + * case the vdso pointer has already been updated by the mremap() hook + * above, so it must not be set to NULL here. + */ + if (vma->vm_start != (unsigned long)mm->context.vdso) + return; + + mm->context.vdso = NULL; +} + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf); @@ -92,11 +107,13 @@ static struct vm_special_mapping vvar_spec __ro_after_init = { static struct vm_special_mapping vdso32_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso32_mremap, + .close = vdso_close, }; static struct vm_special_mapping vdso64_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso64_mremap, + .close = vdso_close, }; #ifdef CONFIG_TIME_NS
Add a close() callback to the VDSO special mapping to handle unmapping of the VDSO. That will make it possible to remove the arch_unmap() hook entirely in a subsequent patch. Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> --- arch/powerpc/include/asm/mmu_context.h | 4 ---- arch/powerpc/kernel/vdso.c | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-)