Message ID | 20200607075949.665-2-alex@ghiti.fr (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
Series | vmalloc kernel mapping and relocatable kernel | expand |
Context | Check | Description |
---|---|---|
snowpatch_ozlabs/apply_patch | success | Successfully applied on branch powerpc/merge (ec7b8eb9bc7a519047485c95f7292b48f5b73fe6) |
snowpatch_ozlabs/checkpatch | warning | total: 0 errors, 0 warnings, 1 checks, 267 lines checked |
snowpatch_ozlabs/needsstable | success | Patch has no Fixes tags |
On Sun, Jun 7, 2020 at 1:01 AM Alexandre Ghiti <alex@ghiti.fr> wrote: > > This is a preparatory patch for relocatable kernel. > > The kernel used to be linked at PAGE_OFFSET address and used to be loaded > physically at the beginning of the main memory. Therefore, we could use > the linear mapping for the kernel mapping. > > But the relocated kernel base address will be different from PAGE_OFFSET > and since in the linear mapping, two different virtual addresses cannot > point to the same physical address, the kernel mapping needs to lie outside > the linear mapping. > > In addition, because modules and BPF must be close to the kernel (inside > +-2GB window), the kernel is placed at the end of the vmalloc zone minus > 2GB, which leaves room for modules and BPF. The kernel could not be > placed at the beginning of the vmalloc zone since other vmalloc > allocations from the kernel could get all the +-2GB window around the > kernel which would prevent new modules and BPF programs to be loaded. > > Signed-off-by: Alexandre Ghiti <alex@ghiti.fr> > Reviewed-by: Zong Li <zong.li@sifive.com> > --- > arch/riscv/boot/loader.lds.S | 3 +- > arch/riscv/include/asm/page.h | 10 +++++- > arch/riscv/include/asm/pgtable.h | 38 ++++++++++++++------- > arch/riscv/kernel/head.S | 3 +- > arch/riscv/kernel/module.c | 4 +-- > arch/riscv/kernel/vmlinux.lds.S | 3 +- > arch/riscv/mm/init.c | 58 +++++++++++++++++++++++++------- > arch/riscv/mm/physaddr.c | 2 +- > 8 files changed, 88 insertions(+), 33 deletions(-) > > diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S > index 47a5003c2e28..62d94696a19c 100644 > --- a/arch/riscv/boot/loader.lds.S > +++ b/arch/riscv/boot/loader.lds.S > @@ -1,13 +1,14 @@ > /* SPDX-License-Identifier: GPL-2.0 */ > > #include <asm/page.h> > +#include <asm/pgtable.h> > > OUTPUT_ARCH(riscv) > ENTRY(_start) > > SECTIONS > { > - . = PAGE_OFFSET; > + . 
= KERNEL_LINK_ADDR; > > .payload : { > *(.payload) > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > index 2d50f76efe48..48bb09b6a9b7 100644 > --- a/arch/riscv/include/asm/page.h > +++ b/arch/riscv/include/asm/page.h > @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; > > #ifdef CONFIG_MMU > extern unsigned long va_pa_offset; > +extern unsigned long va_kernel_pa_offset; > extern unsigned long pfn_base; > #define ARCH_PFN_OFFSET (pfn_base) > #else > #define va_pa_offset 0 > +#define va_kernel_pa_offset 0 > #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) > #endif /* CONFIG_MMU */ > > extern unsigned long max_low_pfn; > extern unsigned long min_low_pfn; > +extern unsigned long kernel_virt_addr; > > #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) > -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) > +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) > +#define kernel_mapping_va_to_pa(x) \ > + ((unsigned long)(x) - va_kernel_pa_offset) > +#define __va_to_pa_nodebug(x) \ > + (((x) >= PAGE_OFFSET) ? \ > + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) > > #ifdef CONFIG_DEBUG_VIRTUAL > extern phys_addr_t __virt_to_phys(unsigned long x); > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h > index 35b60035b6b0..94ef3b49dfb6 100644 > --- a/arch/riscv/include/asm/pgtable.h > +++ b/arch/riscv/include/asm/pgtable.h > @@ -11,23 +11,29 @@ > > #include <asm/pgtable-bits.h> > > -#ifndef __ASSEMBLY__ > - > -/* Page Upper Directory not used in RISC-V */ > -#include <asm-generic/pgtable-nopud.h> > -#include <asm/page.h> > -#include <asm/tlbflush.h> > -#include <linux/mm_types.h> > - > -#ifdef CONFIG_MMU > +#ifndef CONFIG_MMU > +#define KERNEL_VIRT_ADDR PAGE_OFFSET > +#define KERNEL_LINK_ADDR PAGE_OFFSET > +#else > +/* > + * Leave 2GB for modules and BPF that must lie within a 2GB range around > + * the kernel. 
> + */ > +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) > +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR > > #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) > #define VMALLOC_END (PAGE_OFFSET - 1) > #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) > > #define BPF_JIT_REGION_SIZE (SZ_128M) > -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) > -#define BPF_JIT_REGION_END (VMALLOC_END) > +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) > +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) > + As these mappings have changed a few times in recent months including this one, I think it would be better to have virtual memory layout documentation in RISC-V similar to other architectures. If you can include the page table layout for 3/4 level page tables in the same document, that would be really helpful. > +#ifdef CONFIG_64BIT > +#define VMALLOC_MODULE_START BPF_JIT_REGION_END > +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) + SZ_2G) > +#endif > > /* > * Roughly size the vmemmap space to be large enough to fit enough > @@ -57,9 +63,16 @@ > #define FIXADDR_SIZE PGDIR_SIZE > #endif > #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) > - > #endif > > +#ifndef __ASSEMBLY__ > + > +/* Page Upper Directory not used in RISC-V */ > +#include <asm-generic/pgtable-nopud.h> > +#include <asm/page.h> > +#include <asm/tlbflush.h> > +#include <linux/mm_types.h> > + > #ifdef CONFIG_64BIT > #include <asm/pgtable-64.h> > #else > @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page *page, int numpages, int enabl > > #define kern_addr_valid(addr) (1) /* FIXME */ > > +extern char _start[]; > extern void *dtb_early_va; > void setup_bootmem(void); > void paging_init(void); > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S > index 98a406474e7d..8f5bb7731327 100644 > --- a/arch/riscv/kernel/head.S > +++ b/arch/riscv/kernel/head.S > @@ -49,7 +49,8 @@ ENTRY(_start) > #ifdef CONFIG_MMU > relocate: > /* 
Relocate return address */ > - li a1, PAGE_OFFSET > + la a1, kernel_virt_addr > + REG_L a1, 0(a1) > la a2, _start > sub a1, a1, a2 > add ra, ra, a1 > diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c > index 8bbe5dbe1341..1a8fbe05accf 100644 > --- a/arch/riscv/kernel/module.c > +++ b/arch/riscv/kernel/module.c > @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, > } > > #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) > -#define VMALLOC_MODULE_START \ > - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) > void *module_alloc(unsigned long size) > { > return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, > - VMALLOC_END, GFP_KERNEL, > + VMALLOC_MODULE_END, GFP_KERNEL, > PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > __builtin_return_address(0)); > } > diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S > index 0339b6bbe11a..a9abde62909f 100644 > --- a/arch/riscv/kernel/vmlinux.lds.S > +++ b/arch/riscv/kernel/vmlinux.lds.S > @@ -4,7 +4,8 @@ > * Copyright (C) 2017 SiFive > */ > > -#define LOAD_OFFSET PAGE_OFFSET > +#include <asm/pgtable.h> > +#define LOAD_OFFSET KERNEL_LINK_ADDR > #include <asm/vmlinux.lds.h> > #include <asm/page.h> > #include <asm/cache.h> > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > index 736de6c8739f..71da78914645 100644 > --- a/arch/riscv/mm/init.c > +++ b/arch/riscv/mm/init.c > @@ -22,6 +22,9 @@ > > #include "../kernel/head.h" > > +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR; > +EXPORT_SYMBOL(kernel_virt_addr); > + > unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] > __page_aligned_bss; > EXPORT_SYMBOL(empty_zero_page); > @@ -178,8 +181,12 @@ void __init setup_bootmem(void) > } > > #ifdef CONFIG_MMU > +/* Offset between linear mapping virtual address and kernel load address */ > unsigned long va_pa_offset; > EXPORT_SYMBOL(va_pa_offset); > +/* Offset between kernel mapping virtual address and kernel load address */ > +unsigned 
long va_kernel_pa_offset; > +EXPORT_SYMBOL(va_kernel_pa_offset); > unsigned long pfn_base; > EXPORT_SYMBOL(pfn_base); > > @@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va) > if (mmu_enabled) > return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); > > - pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; > + pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT; > BUG_ON(pmd_num >= NUM_EARLY_PMDS); > return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; > } > @@ -372,14 +379,30 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) > #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." > #endif > > +static uintptr_t load_pa, load_sz; > + > +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) > +{ > + uintptr_t va, end_va; > + > + end_va = kernel_virt_addr + load_sz; > + for (va = kernel_virt_addr; va < end_va; va += map_size) > + create_pgd_mapping(pgdir, va, > + load_pa + (va - kernel_virt_addr), > + map_size, PAGE_KERNEL_EXEC); > +} > + > asmlinkage void __init setup_vm(uintptr_t dtb_pa) > { > uintptr_t va, end_va; > - uintptr_t load_pa = (uintptr_t)(&_start); > - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; > uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE); > > + load_pa = (uintptr_t)(&_start); > + load_sz = (uintptr_t)(&_end) - load_pa; > + > va_pa_offset = PAGE_OFFSET - load_pa; > + va_kernel_pa_offset = kernel_virt_addr - load_pa; > + > pfn_base = PFN_DOWN(load_pa); > > /* > @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > create_pmd_mapping(fixmap_pmd, FIXADDR_START, > (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); > /* Setup trampoline PGD and PMD */ > - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, > + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, > (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); > - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, > + create_pmd_mapping(trampoline_pmd, 
kernel_virt_addr, > load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); > #else > /* Setup trampoline PGD */ > - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, > + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, > load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); > #endif > > /* > - * Setup early PGD covering entire kernel which will allows > + * Setup early PGD covering entire kernel which will allow > * us to reach paging_init(). We map all memory banks later > * in setup_vm_final() below. > */ > - end_va = PAGE_OFFSET + load_sz; > - for (va = PAGE_OFFSET; va < end_va; va += map_size) > - create_pgd_mapping(early_pg_dir, va, > - load_pa + (va - PAGE_OFFSET), > - map_size, PAGE_KERNEL_EXEC); > + create_kernel_page_table(early_pg_dir, map_size); > > /* Create fixed mapping for early FDT parsing */ > end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; > @@ -441,6 +460,7 @@ static void __init setup_vm_final(void) > uintptr_t va, map_size; > phys_addr_t pa, start, end; > struct memblock_region *reg; > + static struct vm_struct vm_kernel = { 0 }; > > /* Set mmu_enabled flag */ > mmu_enabled = true; > @@ -467,10 +487,22 @@ static void __init setup_vm_final(void) > for (pa = start; pa < end; pa += map_size) { > va = (uintptr_t)__va(pa); > create_pgd_mapping(swapper_pg_dir, va, pa, > - map_size, PAGE_KERNEL_EXEC); > + map_size, PAGE_KERNEL); > } > } > > + /* Map the kernel */ > + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); > + > + /* Reserve the vmalloc area occupied by the kernel */ > + vm_kernel.addr = (void *)kernel_virt_addr; > + vm_kernel.phys_addr = load_pa; > + vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1); > + vm_kernel.flags = VM_MAP | VM_NO_GUARD; > + vm_kernel.caller = __builtin_return_address(0); > + > + vm_area_add_early(&vm_kernel); > + > /* Clear fixmap PTE and PMD mappings */ > clear_fixmap(FIX_PTE); > clear_fixmap(FIX_PMD); > diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c > index e8e4dcd39fed..35703d5ef5fd 100644 > --- 
a/arch/riscv/mm/physaddr.c > +++ b/arch/riscv/mm/physaddr.c > @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); > > phys_addr_t __phys_addr_symbol(unsigned long x) > { > - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; > + unsigned long kernel_start = (unsigned long)kernel_virt_addr; > unsigned long kernel_end = (unsigned long)_end; > > /* > -- > 2.20.1 > >
Hi Atish, Le 6/11/20 à 5:34 PM, Atish Patra a écrit : > On Sun, Jun 7, 2020 at 1:01 AM Alexandre Ghiti <alex@ghiti.fr> wrote: >> This is a preparatory patch for relocatable kernel. >> >> The kernel used to be linked at PAGE_OFFSET address and used to be loaded >> physically at the beginning of the main memory. Therefore, we could use >> the linear mapping for the kernel mapping. >> >> But the relocated kernel base address will be different from PAGE_OFFSET >> and since in the linear mapping, two different virtual addresses cannot >> point to the same physical address, the kernel mapping needs to lie outside >> the linear mapping. >> >> In addition, because modules and BPF must be close to the kernel (inside >> +-2GB window), the kernel is placed at the end of the vmalloc zone minus >> 2GB, which leaves room for modules and BPF. The kernel could not be >> placed at the beginning of the vmalloc zone since other vmalloc >> allocations from the kernel could get all the +-2GB window around the >> kernel which would prevent new modules and BPF programs to be loaded. >> >> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr> >> Reviewed-by: Zong Li <zong.li@sifive.com> >> --- >> arch/riscv/boot/loader.lds.S | 3 +- >> arch/riscv/include/asm/page.h | 10 +++++- >> arch/riscv/include/asm/pgtable.h | 38 ++++++++++++++------- >> arch/riscv/kernel/head.S | 3 +- >> arch/riscv/kernel/module.c | 4 +-- >> arch/riscv/kernel/vmlinux.lds.S | 3 +- >> arch/riscv/mm/init.c | 58 +++++++++++++++++++++++++------- >> arch/riscv/mm/physaddr.c | 2 +- >> 8 files changed, 88 insertions(+), 33 deletions(-) >> >> diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S >> index 47a5003c2e28..62d94696a19c 100644 >> --- a/arch/riscv/boot/loader.lds.S >> +++ b/arch/riscv/boot/loader.lds.S >> @@ -1,13 +1,14 @@ >> /* SPDX-License-Identifier: GPL-2.0 */ >> >> #include <asm/page.h> >> +#include <asm/pgtable.h> >> >> OUTPUT_ARCH(riscv) >> ENTRY(_start) >> >> SECTIONS >> { >> - . 
= PAGE_OFFSET; >> + . = KERNEL_LINK_ADDR; >> >> .payload : { >> *(.payload) >> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h >> index 2d50f76efe48..48bb09b6a9b7 100644 >> --- a/arch/riscv/include/asm/page.h >> +++ b/arch/riscv/include/asm/page.h >> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; >> >> #ifdef CONFIG_MMU >> extern unsigned long va_pa_offset; >> +extern unsigned long va_kernel_pa_offset; >> extern unsigned long pfn_base; >> #define ARCH_PFN_OFFSET (pfn_base) >> #else >> #define va_pa_offset 0 >> +#define va_kernel_pa_offset 0 >> #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) >> #endif /* CONFIG_MMU */ >> >> extern unsigned long max_low_pfn; >> extern unsigned long min_low_pfn; >> +extern unsigned long kernel_virt_addr; >> >> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) >> -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) >> +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) >> +#define kernel_mapping_va_to_pa(x) \ >> + ((unsigned long)(x) - va_kernel_pa_offset) >> +#define __va_to_pa_nodebug(x) \ >> + (((x) >= PAGE_OFFSET) ? 
\ >> + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) >> >> #ifdef CONFIG_DEBUG_VIRTUAL >> extern phys_addr_t __virt_to_phys(unsigned long x); >> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h >> index 35b60035b6b0..94ef3b49dfb6 100644 >> --- a/arch/riscv/include/asm/pgtable.h >> +++ b/arch/riscv/include/asm/pgtable.h >> @@ -11,23 +11,29 @@ >> >> #include <asm/pgtable-bits.h> >> >> -#ifndef __ASSEMBLY__ >> - >> -/* Page Upper Directory not used in RISC-V */ >> -#include <asm-generic/pgtable-nopud.h> >> -#include <asm/page.h> >> -#include <asm/tlbflush.h> >> -#include <linux/mm_types.h> >> - >> -#ifdef CONFIG_MMU >> +#ifndef CONFIG_MMU >> +#define KERNEL_VIRT_ADDR PAGE_OFFSET >> +#define KERNEL_LINK_ADDR PAGE_OFFSET >> +#else >> +/* >> + * Leave 2GB for modules and BPF that must lie within a 2GB range around >> + * the kernel. >> + */ >> +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) >> +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR >> >> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) >> #define VMALLOC_END (PAGE_OFFSET - 1) >> #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) >> >> #define BPF_JIT_REGION_SIZE (SZ_128M) >> -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) >> -#define BPF_JIT_REGION_END (VMALLOC_END) >> +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) >> +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) >> + > As these mappings have changed a few times in recent months including > this one, I think it would be > better to have virtual memory layout documentation in RISC-V similar > to other architectures. > > If you can include the page table layout for 3/4 level page tables in > the same document, that would be really helpful. > Yes, I'll do that in a separate commit. 
Thanks, Alex >> +#ifdef CONFIG_64BIT >> +#define VMALLOC_MODULE_START BPF_JIT_REGION_END >> +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) + SZ_2G) >> +#endif >> >> /* >> * Roughly size the vmemmap space to be large enough to fit enough >> @@ -57,9 +63,16 @@ >> #define FIXADDR_SIZE PGDIR_SIZE >> #endif >> #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) >> - >> #endif >> >> +#ifndef __ASSEMBLY__ >> + >> +/* Page Upper Directory not used in RISC-V */ >> +#include <asm-generic/pgtable-nopud.h> >> +#include <asm/page.h> >> +#include <asm/tlbflush.h> >> +#include <linux/mm_types.h> >> + >> #ifdef CONFIG_64BIT >> #include <asm/pgtable-64.h> >> #else >> @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page *page, int numpages, int enabl >> >> #define kern_addr_valid(addr) (1) /* FIXME */ >> >> +extern char _start[]; >> extern void *dtb_early_va; >> void setup_bootmem(void); >> void paging_init(void); >> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S >> index 98a406474e7d..8f5bb7731327 100644 >> --- a/arch/riscv/kernel/head.S >> +++ b/arch/riscv/kernel/head.S >> @@ -49,7 +49,8 @@ ENTRY(_start) >> #ifdef CONFIG_MMU >> relocate: >> /* Relocate return address */ >> - li a1, PAGE_OFFSET >> + la a1, kernel_virt_addr >> + REG_L a1, 0(a1) >> la a2, _start >> sub a1, a1, a2 >> add ra, ra, a1 >> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c >> index 8bbe5dbe1341..1a8fbe05accf 100644 >> --- a/arch/riscv/kernel/module.c >> +++ b/arch/riscv/kernel/module.c >> @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, >> } >> >> #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) >> -#define VMALLOC_MODULE_START \ >> - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) >> void *module_alloc(unsigned long size) >> { >> return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, >> - VMALLOC_END, GFP_KERNEL, >> + VMALLOC_MODULE_END, GFP_KERNEL, >> PAGE_KERNEL_EXEC, 0, 
NUMA_NO_NODE, >> __builtin_return_address(0)); >> } >> diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S >> index 0339b6bbe11a..a9abde62909f 100644 >> --- a/arch/riscv/kernel/vmlinux.lds.S >> +++ b/arch/riscv/kernel/vmlinux.lds.S >> @@ -4,7 +4,8 @@ >> * Copyright (C) 2017 SiFive >> */ >> >> -#define LOAD_OFFSET PAGE_OFFSET >> +#include <asm/pgtable.h> >> +#define LOAD_OFFSET KERNEL_LINK_ADDR >> #include <asm/vmlinux.lds.h> >> #include <asm/page.h> >> #include <asm/cache.h> >> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c >> index 736de6c8739f..71da78914645 100644 >> --- a/arch/riscv/mm/init.c >> +++ b/arch/riscv/mm/init.c >> @@ -22,6 +22,9 @@ >> >> #include "../kernel/head.h" >> >> +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR; >> +EXPORT_SYMBOL(kernel_virt_addr); >> + >> unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] >> __page_aligned_bss; >> EXPORT_SYMBOL(empty_zero_page); >> @@ -178,8 +181,12 @@ void __init setup_bootmem(void) >> } >> >> #ifdef CONFIG_MMU >> +/* Offset between linear mapping virtual address and kernel load address */ >> unsigned long va_pa_offset; >> EXPORT_SYMBOL(va_pa_offset); >> +/* Offset between kernel mapping virtual address and kernel load address */ >> +unsigned long va_kernel_pa_offset; >> +EXPORT_SYMBOL(va_kernel_pa_offset); >> unsigned long pfn_base; >> EXPORT_SYMBOL(pfn_base); >> >> @@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va) >> if (mmu_enabled) >> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); >> >> - pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; >> + pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT; >> BUG_ON(pmd_num >= NUM_EARLY_PMDS); >> return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; >> } >> @@ -372,14 +379,30 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) >> #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." 
>> #endif >> >> +static uintptr_t load_pa, load_sz; >> + >> +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) >> +{ >> + uintptr_t va, end_va; >> + >> + end_va = kernel_virt_addr + load_sz; >> + for (va = kernel_virt_addr; va < end_va; va += map_size) >> + create_pgd_mapping(pgdir, va, >> + load_pa + (va - kernel_virt_addr), >> + map_size, PAGE_KERNEL_EXEC); >> +} >> + >> asmlinkage void __init setup_vm(uintptr_t dtb_pa) >> { >> uintptr_t va, end_va; >> - uintptr_t load_pa = (uintptr_t)(&_start); >> - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; >> uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE); >> >> + load_pa = (uintptr_t)(&_start); >> + load_sz = (uintptr_t)(&_end) - load_pa; >> + >> va_pa_offset = PAGE_OFFSET - load_pa; >> + va_kernel_pa_offset = kernel_virt_addr - load_pa; >> + >> pfn_base = PFN_DOWN(load_pa); >> >> /* >> @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) >> create_pmd_mapping(fixmap_pmd, FIXADDR_START, >> (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); >> /* Setup trampoline PGD and PMD */ >> - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, >> + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, >> (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); >> - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, >> + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, >> load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); >> #else >> /* Setup trampoline PGD */ >> - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, >> + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, >> load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); >> #endif >> >> /* >> - * Setup early PGD covering entire kernel which will allows >> + * Setup early PGD covering entire kernel which will allow >> * us to reach paging_init(). We map all memory banks later >> * in setup_vm_final() below. 
>> */ >> - end_va = PAGE_OFFSET + load_sz; >> - for (va = PAGE_OFFSET; va < end_va; va += map_size) >> - create_pgd_mapping(early_pg_dir, va, >> - load_pa + (va - PAGE_OFFSET), >> - map_size, PAGE_KERNEL_EXEC); >> + create_kernel_page_table(early_pg_dir, map_size); >> >> /* Create fixed mapping for early FDT parsing */ >> end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; >> @@ -441,6 +460,7 @@ static void __init setup_vm_final(void) >> uintptr_t va, map_size; >> phys_addr_t pa, start, end; >> struct memblock_region *reg; >> + static struct vm_struct vm_kernel = { 0 }; >> >> /* Set mmu_enabled flag */ >> mmu_enabled = true; >> @@ -467,10 +487,22 @@ static void __init setup_vm_final(void) >> for (pa = start; pa < end; pa += map_size) { >> va = (uintptr_t)__va(pa); >> create_pgd_mapping(swapper_pg_dir, va, pa, >> - map_size, PAGE_KERNEL_EXEC); >> + map_size, PAGE_KERNEL); >> } >> } >> >> + /* Map the kernel */ >> + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); >> + >> + /* Reserve the vmalloc area occupied by the kernel */ >> + vm_kernel.addr = (void *)kernel_virt_addr; >> + vm_kernel.phys_addr = load_pa; >> + vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1); >> + vm_kernel.flags = VM_MAP | VM_NO_GUARD; >> + vm_kernel.caller = __builtin_return_address(0); >> + >> + vm_area_add_early(&vm_kernel); >> + >> /* Clear fixmap PTE and PMD mappings */ >> clear_fixmap(FIX_PTE); >> clear_fixmap(FIX_PMD); >> diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c >> index e8e4dcd39fed..35703d5ef5fd 100644 >> --- a/arch/riscv/mm/physaddr.c >> +++ b/arch/riscv/mm/physaddr.c >> @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); >> >> phys_addr_t __phys_addr_symbol(unsigned long x) >> { >> - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; >> + unsigned long kernel_start = (unsigned long)kernel_virt_addr; >> unsigned long kernel_end = (unsigned long)_end; >> >> /* >> -- >> 2.20.1 >> >> >
On Sun, 07 Jun 2020 00:59:46 PDT (-0700), alex@ghiti.fr wrote: > This is a preparatory patch for relocatable kernel. > > The kernel used to be linked at PAGE_OFFSET address and used to be loaded > physically at the beginning of the main memory. Therefore, we could use > the linear mapping for the kernel mapping. > > But the relocated kernel base address will be different from PAGE_OFFSET > and since in the linear mapping, two different virtual addresses cannot > point to the same physical address, the kernel mapping needs to lie outside > the linear mapping. I know it's been a while, but I keep opening this up to review it and just can't get over how ugly it is to put the kernel's linear map in the vmalloc region. I guess I don't understand why this is necessary at all. Specifically: why can't we just relocate the kernel within the linear map? That would let the bootloader put the kernel wherever it wants, modulo the physical memory size we support. We'd need to handle the regions that are coupled to the kernel's execution address, but we could just put them in an explicit memory region, which is what we should probably be doing anyway. > In addition, because modules and BPF must be close to the kernel (inside > +-2GB window), the kernel is placed at the end of the vmalloc zone minus > 2GB, which leaves room for modules and BPF. The kernel could not be > placed at the beginning of the vmalloc zone since other vmalloc > allocations from the kernel could get all the +-2GB window around the > kernel which would prevent new modules and BPF programs to be loaded. Well, that's not enough to make sure this doesn't happen -- it's just enough to make sure it doesn't happen very quickly. That's the same boat we're already in, though, so it's not like it's worse.
> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr> > Reviewed-by: Zong Li <zong.li@sifive.com> > --- > arch/riscv/boot/loader.lds.S | 3 +- > arch/riscv/include/asm/page.h | 10 +++++- > arch/riscv/include/asm/pgtable.h | 38 ++++++++++++++------- > arch/riscv/kernel/head.S | 3 +- > arch/riscv/kernel/module.c | 4 +-- > arch/riscv/kernel/vmlinux.lds.S | 3 +- > arch/riscv/mm/init.c | 58 +++++++++++++++++++++++++------- > arch/riscv/mm/physaddr.c | 2 +- > 8 files changed, 88 insertions(+), 33 deletions(-) > > diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S > index 47a5003c2e28..62d94696a19c 100644 > --- a/arch/riscv/boot/loader.lds.S > +++ b/arch/riscv/boot/loader.lds.S > @@ -1,13 +1,14 @@ > /* SPDX-License-Identifier: GPL-2.0 */ > > #include <asm/page.h> > +#include <asm/pgtable.h> > > OUTPUT_ARCH(riscv) > ENTRY(_start) > > SECTIONS > { > - . = PAGE_OFFSET; > + . = KERNEL_LINK_ADDR; > > .payload : { > *(.payload) > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > index 2d50f76efe48..48bb09b6a9b7 100644 > --- a/arch/riscv/include/asm/page.h > +++ b/arch/riscv/include/asm/page.h > @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; > > #ifdef CONFIG_MMU > extern unsigned long va_pa_offset; > +extern unsigned long va_kernel_pa_offset; > extern unsigned long pfn_base; > #define ARCH_PFN_OFFSET (pfn_base) > #else > #define va_pa_offset 0 > +#define va_kernel_pa_offset 0 > #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) > #endif /* CONFIG_MMU */ > > extern unsigned long max_low_pfn; > extern unsigned long min_low_pfn; > +extern unsigned long kernel_virt_addr; > > #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) > -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) > +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) > +#define kernel_mapping_va_to_pa(x) \ > + ((unsigned long)(x) - va_kernel_pa_offset) > +#define __va_to_pa_nodebug(x) \ > + (((x) >= 
PAGE_OFFSET) ? \ > + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) > > #ifdef CONFIG_DEBUG_VIRTUAL > extern phys_addr_t __virt_to_phys(unsigned long x); > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h > index 35b60035b6b0..94ef3b49dfb6 100644 > --- a/arch/riscv/include/asm/pgtable.h > +++ b/arch/riscv/include/asm/pgtable.h > @@ -11,23 +11,29 @@ > > #include <asm/pgtable-bits.h> > > -#ifndef __ASSEMBLY__ > - > -/* Page Upper Directory not used in RISC-V */ > -#include <asm-generic/pgtable-nopud.h> > -#include <asm/page.h> > -#include <asm/tlbflush.h> > -#include <linux/mm_types.h> > - > -#ifdef CONFIG_MMU > +#ifndef CONFIG_MMU > +#define KERNEL_VIRT_ADDR PAGE_OFFSET > +#define KERNEL_LINK_ADDR PAGE_OFFSET > +#else > +/* > + * Leave 2GB for modules and BPF that must lie within a 2GB range around > + * the kernel. > + */ > +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) > +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR At a bare minimum this is going to make a mess of the 32-bit port, as non-relocatable kernels are now going to get linked at 1GiB which is where user code is supposed to live. That's an easy fix, though, as the 32-bit stuff doesn't need any module address restrictions. 
> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) > #define VMALLOC_END (PAGE_OFFSET - 1) > #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) > > #define BPF_JIT_REGION_SIZE (SZ_128M) > -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) > -#define BPF_JIT_REGION_END (VMALLOC_END) > +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) > +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) > + > +#ifdef CONFIG_64BIT > +#define VMALLOC_MODULE_START BPF_JIT_REGION_END > +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) + SZ_2G) > +#endif > > /* > * Roughly size the vmemmap space to be large enough to fit enough > @@ -57,9 +63,16 @@ > #define FIXADDR_SIZE PGDIR_SIZE > #endif > #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) > - > #endif > > +#ifndef __ASSEMBLY__ > + > +/* Page Upper Directory not used in RISC-V */ > +#include <asm-generic/pgtable-nopud.h> > +#include <asm/page.h> > +#include <asm/tlbflush.h> > +#include <linux/mm_types.h> > + > #ifdef CONFIG_64BIT > #include <asm/pgtable-64.h> > #else > @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page *page, int numpages, int enabl > > #define kern_addr_valid(addr) (1) /* FIXME */ > > +extern char _start[]; > extern void *dtb_early_va; > void setup_bootmem(void); > void paging_init(void); > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S > index 98a406474e7d..8f5bb7731327 100644 > --- a/arch/riscv/kernel/head.S > +++ b/arch/riscv/kernel/head.S > @@ -49,7 +49,8 @@ ENTRY(_start) > #ifdef CONFIG_MMU > relocate: > /* Relocate return address */ > - li a1, PAGE_OFFSET > + la a1, kernel_virt_addr > + REG_L a1, 0(a1) > la a2, _start > sub a1, a1, a2 > add ra, ra, a1 > diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c > index 8bbe5dbe1341..1a8fbe05accf 100644 > --- a/arch/riscv/kernel/module.c > +++ b/arch/riscv/kernel/module.c > @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, > 
} > > #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) > -#define VMALLOC_MODULE_START \ > - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) > void *module_alloc(unsigned long size) > { > return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, > - VMALLOC_END, GFP_KERNEL, > + VMALLOC_MODULE_END, GFP_KERNEL, > PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > __builtin_return_address(0)); > } > diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S > index 0339b6bbe11a..a9abde62909f 100644 > --- a/arch/riscv/kernel/vmlinux.lds.S > +++ b/arch/riscv/kernel/vmlinux.lds.S > @@ -4,7 +4,8 @@ > * Copyright (C) 2017 SiFive > */ > > -#define LOAD_OFFSET PAGE_OFFSET > +#include <asm/pgtable.h> > +#define LOAD_OFFSET KERNEL_LINK_ADDR > #include <asm/vmlinux.lds.h> > #include <asm/page.h> > #include <asm/cache.h> > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > index 736de6c8739f..71da78914645 100644 > --- a/arch/riscv/mm/init.c > +++ b/arch/riscv/mm/init.c > @@ -22,6 +22,9 @@ > > #include "../kernel/head.h" > > +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR; > +EXPORT_SYMBOL(kernel_virt_addr); > + > unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] > __page_aligned_bss; > EXPORT_SYMBOL(empty_zero_page); > @@ -178,8 +181,12 @@ void __init setup_bootmem(void) > } > > #ifdef CONFIG_MMU > +/* Offset between linear mapping virtual address and kernel load address */ > unsigned long va_pa_offset; > EXPORT_SYMBOL(va_pa_offset); > +/* Offset between kernel mapping virtual address and kernel load address */ > +unsigned long va_kernel_pa_offset; > +EXPORT_SYMBOL(va_kernel_pa_offset); > unsigned long pfn_base; > EXPORT_SYMBOL(pfn_base); > > @@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va) > if (mmu_enabled) > return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); > > - pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; > + pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT; > BUG_ON(pmd_num >= NUM_EARLY_PMDS); > return 
(uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; > } > @@ -372,14 +379,30 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) > #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." > #endif > > +static uintptr_t load_pa, load_sz; > + > +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) > +{ > + uintptr_t va, end_va; > + > + end_va = kernel_virt_addr + load_sz; > + for (va = kernel_virt_addr; va < end_va; va += map_size) > + create_pgd_mapping(pgdir, va, > + load_pa + (va - kernel_virt_addr), > + map_size, PAGE_KERNEL_EXEC); > +} > + > asmlinkage void __init setup_vm(uintptr_t dtb_pa) > { > uintptr_t va, end_va; > - uintptr_t load_pa = (uintptr_t)(&_start); > - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; > uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE); > > + load_pa = (uintptr_t)(&_start); > + load_sz = (uintptr_t)(&_end) - load_pa; > + > va_pa_offset = PAGE_OFFSET - load_pa; > + va_kernel_pa_offset = kernel_virt_addr - load_pa; > + > pfn_base = PFN_DOWN(load_pa); > > /* > @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > create_pmd_mapping(fixmap_pmd, FIXADDR_START, > (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); > /* Setup trampoline PGD and PMD */ > - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, > + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, > (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); > - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, > + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, > load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); > #else > /* Setup trampoline PGD */ > - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, > + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, > load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); > #endif > > /* > - * Setup early PGD covering entire kernel which will allows > + * Setup early PGD covering entire kernel which will allow > * us to reach 
paging_init(). We map all memory banks later > * in setup_vm_final() below. > */ > - end_va = PAGE_OFFSET + load_sz; > - for (va = PAGE_OFFSET; va < end_va; va += map_size) > - create_pgd_mapping(early_pg_dir, va, > - load_pa + (va - PAGE_OFFSET), > - map_size, PAGE_KERNEL_EXEC); > + create_kernel_page_table(early_pg_dir, map_size); > > /* Create fixed mapping for early FDT parsing */ > end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; > @@ -441,6 +460,7 @@ static void __init setup_vm_final(void) > uintptr_t va, map_size; > phys_addr_t pa, start, end; > struct memblock_region *reg; > + static struct vm_struct vm_kernel = { 0 }; > > /* Set mmu_enabled flag */ > mmu_enabled = true; > @@ -467,10 +487,22 @@ static void __init setup_vm_final(void) > for (pa = start; pa < end; pa += map_size) { > va = (uintptr_t)__va(pa); > create_pgd_mapping(swapper_pg_dir, va, pa, > - map_size, PAGE_KERNEL_EXEC); > + map_size, PAGE_KERNEL); > } > } > > + /* Map the kernel */ > + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); > + > + /* Reserve the vmalloc area occupied by the kernel */ > + vm_kernel.addr = (void *)kernel_virt_addr; > + vm_kernel.phys_addr = load_pa; > + vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1); > + vm_kernel.flags = VM_MAP | VM_NO_GUARD; > + vm_kernel.caller = __builtin_return_address(0); > + > + vm_area_add_early(&vm_kernel); > + > /* Clear fixmap PTE and PMD mappings */ > clear_fixmap(FIX_PTE); > clear_fixmap(FIX_PMD); > diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c > index e8e4dcd39fed..35703d5ef5fd 100644 > --- a/arch/riscv/mm/physaddr.c > +++ b/arch/riscv/mm/physaddr.c > @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); > > phys_addr_t __phys_addr_symbol(unsigned long x) > { > - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; > + unsigned long kernel_start = (unsigned long)kernel_virt_addr; > unsigned long kernel_end = (unsigned long)_end; > > /*
On Thu, Jul 9, 2020 at 1:05 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > On Sun, 07 Jun 2020 00:59:46 PDT (-0700), alex@ghiti.fr wrote: > > This is a preparatory patch for relocatable kernel. > > > > The kernel used to be linked at PAGE_OFFSET address and used to be loaded > > physically at the beginning of the main memory. Therefore, we could use > > the linear mapping for the kernel mapping. > > > > But the relocated kernel base address will be different from PAGE_OFFSET > > and since in the linear mapping, two different virtual addresses cannot > > point to the same physical address, the kernel mapping needs to lie outside > > the linear mapping. > > I know it's been a while, but I keep opening this up to review it and just > can't get over how ugly it is to put the kernel's linear map in the vmalloc > region. > > I guess I don't understand why this is necessary at all. Specifically: why > can't we just relocate the kernel within the linear map? That would let the > bootloader put the kernel wherever it wants, modulo the physical memory size we > support. We'd need to handle the regions that are coupled to the kernel's > execution address, but we could just put them in an explicit memory region > which is what we should probably be doing anyway. The original implementation of relocation doesn't move the kernel's linear map to the vmalloc region, and I also posted a KASLR RFC patch [1] based on that. In the original, we relocate the kernel within the linear map region: we first calculate a random value to use as the offset, then we move the kernel image to the new target address, which is obtained by adding this offset to its VA and PA. It's enough for randomizing the kernel, but it seems to me if we want to decouple the kernel's linear mapping, the physical mapping of RAM and virtual mapping of RAM, it might be good to move the kernel's mapping out from the linear region. Even so, it is still an intrusive change. 
As far as I know, only arm64 does something like that. [1] https://patchwork.kernel.org/project/linux-riscv/list/?series=260615 > > > In addition, because modules and BPF must be close to the kernel (inside > > +-2GB window), the kernel is placed at the end of the vmalloc zone minus > > 2GB, which leaves room for modules and BPF. The kernel could not be > > placed at the beginning of the vmalloc zone since other vmalloc > > allocations from the kernel could get all the +-2GB window around the > > kernel which would prevent new modules and BPF programs to be loaded. > > Well, that's not enough to make sure this doesn't happen -- it's just enough to > make sure it doesn't happen very quickily. That's the same boat we're already > in, though, so it's not like it's worse. > > > Signed-off-by: Alexandre Ghiti <alex@ghiti.fr> > > Reviewed-by: Zong Li <zong.li@sifive.com> > > --- > > arch/riscv/boot/loader.lds.S | 3 +- > > arch/riscv/include/asm/page.h | 10 +++++- > > arch/riscv/include/asm/pgtable.h | 38 ++++++++++++++------- > > arch/riscv/kernel/head.S | 3 +- > > arch/riscv/kernel/module.c | 4 +-- > > arch/riscv/kernel/vmlinux.lds.S | 3 +- > > arch/riscv/mm/init.c | 58 +++++++++++++++++++++++++------- > > arch/riscv/mm/physaddr.c | 2 +- > > 8 files changed, 88 insertions(+), 33 deletions(-) > > > > diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S > > index 47a5003c2e28..62d94696a19c 100644 > > --- a/arch/riscv/boot/loader.lds.S > > +++ b/arch/riscv/boot/loader.lds.S > > @@ -1,13 +1,14 @@ > > /* SPDX-License-Identifier: GPL-2.0 */ > > > > #include <asm/page.h> > > +#include <asm/pgtable.h> > > > > OUTPUT_ARCH(riscv) > > ENTRY(_start) > > > > SECTIONS > > { > > - . = PAGE_OFFSET; > > + . 
= KERNEL_LINK_ADDR; > > > > .payload : { > > *(.payload) > > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > > index 2d50f76efe48..48bb09b6a9b7 100644 > > --- a/arch/riscv/include/asm/page.h > > +++ b/arch/riscv/include/asm/page.h > > @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; > > > > #ifdef CONFIG_MMU > > extern unsigned long va_pa_offset; > > +extern unsigned long va_kernel_pa_offset; > > extern unsigned long pfn_base; > > #define ARCH_PFN_OFFSET (pfn_base) > > #else > > #define va_pa_offset 0 > > +#define va_kernel_pa_offset 0 > > #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) > > #endif /* CONFIG_MMU */ > > > > extern unsigned long max_low_pfn; > > extern unsigned long min_low_pfn; > > +extern unsigned long kernel_virt_addr; > > > > #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) > > -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) > > +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) > > +#define kernel_mapping_va_to_pa(x) \ > > + ((unsigned long)(x) - va_kernel_pa_offset) > > +#define __va_to_pa_nodebug(x) \ > > + (((x) >= PAGE_OFFSET) ? 
\ > > + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) > > > > #ifdef CONFIG_DEBUG_VIRTUAL > > extern phys_addr_t __virt_to_phys(unsigned long x); > > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h > > index 35b60035b6b0..94ef3b49dfb6 100644 > > --- a/arch/riscv/include/asm/pgtable.h > > +++ b/arch/riscv/include/asm/pgtable.h > > @@ -11,23 +11,29 @@ > > > > #include <asm/pgtable-bits.h> > > > > -#ifndef __ASSEMBLY__ > > - > > -/* Page Upper Directory not used in RISC-V */ > > -#include <asm-generic/pgtable-nopud.h> > > -#include <asm/page.h> > > -#include <asm/tlbflush.h> > > -#include <linux/mm_types.h> > > - > > -#ifdef CONFIG_MMU > > +#ifndef CONFIG_MMU > > +#define KERNEL_VIRT_ADDR PAGE_OFFSET > > +#define KERNEL_LINK_ADDR PAGE_OFFSET > > +#else > > +/* > > + * Leave 2GB for modules and BPF that must lie within a 2GB range around > > + * the kernel. > > + */ > > +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) > > +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR > > At a bare minimum this is going to make a mess of the 32-bit port, as > non-relocatable kernels are now going to get linked at 1GiB which is where user > code is supposed to live. That's an easy fix, though, as the 32-bit stuff > doesn't need any module address restrictions. 
> > > #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) > > #define VMALLOC_END (PAGE_OFFSET - 1) > > #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) > > > > #define BPF_JIT_REGION_SIZE (SZ_128M) > > -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) > > -#define BPF_JIT_REGION_END (VMALLOC_END) > > +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) > > +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) > > + > > +#ifdef CONFIG_64BIT > > +#define VMALLOC_MODULE_START BPF_JIT_REGION_END > > +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) + SZ_2G) > > +#endif > > > > /* > > * Roughly size the vmemmap space to be large enough to fit enough > > @@ -57,9 +63,16 @@ > > #define FIXADDR_SIZE PGDIR_SIZE > > #endif > > #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) > > - > > #endif > > > > +#ifndef __ASSEMBLY__ > > + > > +/* Page Upper Directory not used in RISC-V */ > > +#include <asm-generic/pgtable-nopud.h> > > +#include <asm/page.h> > > +#include <asm/tlbflush.h> > > +#include <linux/mm_types.h> > > + > > #ifdef CONFIG_64BIT > > #include <asm/pgtable-64.h> > > #else > > @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page *page, int numpages, int enabl > > > > #define kern_addr_valid(addr) (1) /* FIXME */ > > > > +extern char _start[]; > > extern void *dtb_early_va; > > void setup_bootmem(void); > > void paging_init(void); > > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S > > index 98a406474e7d..8f5bb7731327 100644 > > --- a/arch/riscv/kernel/head.S > > +++ b/arch/riscv/kernel/head.S > > @@ -49,7 +49,8 @@ ENTRY(_start) > > #ifdef CONFIG_MMU > > relocate: > > /* Relocate return address */ > > - li a1, PAGE_OFFSET > > + la a1, kernel_virt_addr > > + REG_L a1, 0(a1) > > la a2, _start > > sub a1, a1, a2 > > add ra, ra, a1 > > diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c > > index 8bbe5dbe1341..1a8fbe05accf 100644 > > --- a/arch/riscv/kernel/module.c > 
> +++ b/arch/riscv/kernel/module.c > > @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, > > } > > > > #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) > > -#define VMALLOC_MODULE_START \ > > - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) > > void *module_alloc(unsigned long size) > > { > > return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, > > - VMALLOC_END, GFP_KERNEL, > > + VMALLOC_MODULE_END, GFP_KERNEL, > > PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, > > __builtin_return_address(0)); > > } > > diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S > > index 0339b6bbe11a..a9abde62909f 100644 > > --- a/arch/riscv/kernel/vmlinux.lds.S > > +++ b/arch/riscv/kernel/vmlinux.lds.S > > @@ -4,7 +4,8 @@ > > * Copyright (C) 2017 SiFive > > */ > > > > -#define LOAD_OFFSET PAGE_OFFSET > > +#include <asm/pgtable.h> > > +#define LOAD_OFFSET KERNEL_LINK_ADDR > > #include <asm/vmlinux.lds.h> > > #include <asm/page.h> > > #include <asm/cache.h> > > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > > index 736de6c8739f..71da78914645 100644 > > --- a/arch/riscv/mm/init.c > > +++ b/arch/riscv/mm/init.c > > @@ -22,6 +22,9 @@ > > > > #include "../kernel/head.h" > > > > +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR; > > +EXPORT_SYMBOL(kernel_virt_addr); > > + > > unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] > > __page_aligned_bss; > > EXPORT_SYMBOL(empty_zero_page); > > @@ -178,8 +181,12 @@ void __init setup_bootmem(void) > > } > > > > #ifdef CONFIG_MMU > > +/* Offset between linear mapping virtual address and kernel load address */ > > unsigned long va_pa_offset; > > EXPORT_SYMBOL(va_pa_offset); > > +/* Offset between kernel mapping virtual address and kernel load address */ > > +unsigned long va_kernel_pa_offset; > > +EXPORT_SYMBOL(va_kernel_pa_offset); > > unsigned long pfn_base; > > EXPORT_SYMBOL(pfn_base); > > > > @@ -271,7 +278,7 @@ static phys_addr_t __init 
alloc_pmd(uintptr_t va) > > if (mmu_enabled) > > return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); > > > > - pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; > > + pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT; > > BUG_ON(pmd_num >= NUM_EARLY_PMDS); > > return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; > > } > > @@ -372,14 +379,30 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) > > #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." > > #endif > > > > +static uintptr_t load_pa, load_sz; > > + > > +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) > > +{ > > + uintptr_t va, end_va; > > + > > + end_va = kernel_virt_addr + load_sz; > > + for (va = kernel_virt_addr; va < end_va; va += map_size) > > + create_pgd_mapping(pgdir, va, > > + load_pa + (va - kernel_virt_addr), > > + map_size, PAGE_KERNEL_EXEC); > > +} > > + > > asmlinkage void __init setup_vm(uintptr_t dtb_pa) > > { > > uintptr_t va, end_va; > > - uintptr_t load_pa = (uintptr_t)(&_start); > > - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; > > uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE); > > > > + load_pa = (uintptr_t)(&_start); > > + load_sz = (uintptr_t)(&_end) - load_pa; > > + > > va_pa_offset = PAGE_OFFSET - load_pa; > > + va_kernel_pa_offset = kernel_virt_addr - load_pa; > > + > > pfn_base = PFN_DOWN(load_pa); > > > > /* > > @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) > > create_pmd_mapping(fixmap_pmd, FIXADDR_START, > > (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); > > /* Setup trampoline PGD and PMD */ > > - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, > > + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, > > (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); > > - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, > > + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, > > load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); > > #else > > 
/* Setup trampoline PGD */ > > - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, > > + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, > > load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); > > #endif > > > > /* > > - * Setup early PGD covering entire kernel which will allows > > + * Setup early PGD covering entire kernel which will allow > > * us to reach paging_init(). We map all memory banks later > > * in setup_vm_final() below. > > */ > > - end_va = PAGE_OFFSET + load_sz; > > - for (va = PAGE_OFFSET; va < end_va; va += map_size) > > - create_pgd_mapping(early_pg_dir, va, > > - load_pa + (va - PAGE_OFFSET), > > - map_size, PAGE_KERNEL_EXEC); > > + create_kernel_page_table(early_pg_dir, map_size); > > > > /* Create fixed mapping for early FDT parsing */ > > end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; > > @@ -441,6 +460,7 @@ static void __init setup_vm_final(void) > > uintptr_t va, map_size; > > phys_addr_t pa, start, end; > > struct memblock_region *reg; > > + static struct vm_struct vm_kernel = { 0 }; > > > > /* Set mmu_enabled flag */ > > mmu_enabled = true; > > @@ -467,10 +487,22 @@ static void __init setup_vm_final(void) > > for (pa = start; pa < end; pa += map_size) { > > va = (uintptr_t)__va(pa); > > create_pgd_mapping(swapper_pg_dir, va, pa, > > - map_size, PAGE_KERNEL_EXEC); > > + map_size, PAGE_KERNEL); > > } > > } > > > > + /* Map the kernel */ > > + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); > > + > > + /* Reserve the vmalloc area occupied by the kernel */ > > + vm_kernel.addr = (void *)kernel_virt_addr; > > + vm_kernel.phys_addr = load_pa; > > + vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1); > > + vm_kernel.flags = VM_MAP | VM_NO_GUARD; > > + vm_kernel.caller = __builtin_return_address(0); > > + > > + vm_area_add_early(&vm_kernel); > > + > > /* Clear fixmap PTE and PMD mappings */ > > clear_fixmap(FIX_PTE); > > clear_fixmap(FIX_PMD); > > diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c > > index 
e8e4dcd39fed..35703d5ef5fd 100644 > > --- a/arch/riscv/mm/physaddr.c > > +++ b/arch/riscv/mm/physaddr.c > > @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); > > > > phys_addr_t __phys_addr_symbol(unsigned long x) > > { > > - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; > > + unsigned long kernel_start = (unsigned long)kernel_virt_addr; > > unsigned long kernel_end = (unsigned long)_end; > > > > /*
Hi Palmer, Le 7/9/20 à 1:05 AM, Palmer Dabbelt a écrit : > On Sun, 07 Jun 2020 00:59:46 PDT (-0700), alex@ghiti.fr wrote: >> This is a preparatory patch for relocatable kernel. >> >> The kernel used to be linked at PAGE_OFFSET address and used to be loaded >> physically at the beginning of the main memory. Therefore, we could use >> the linear mapping for the kernel mapping. >> >> But the relocated kernel base address will be different from PAGE_OFFSET >> and since in the linear mapping, two different virtual addresses cannot >> point to the same physical address, the kernel mapping needs to lie >> outside >> the linear mapping. > > I know it's been a while, but I keep opening this up to review it and just > can't get over how ugly it is to put the kernel's linear map in the vmalloc > region. > > I guess I don't understand why this is necessary at all. Specifically: why > can't we just relocate the kernel within the linear map? That would let > the > bootloader put the kernel wherever it wants, modulo the physical memory > size we > support. We'd need to handle the regions that are coupled to the kernel's > execution address, but we could just put them in an explicit memory region > which is what we should probably be doing anyway. Virtual relocation in the linear mapping requires moving the kernel physically too. Zong implemented this physical move in his KASLR RFC patchset, which is cumbersome since finding an available physical spot is harder than just selecting a virtual range in the vmalloc range. In addition, having the kernel mapping in the linear mapping prevents the use of hugepages for the linear mapping, resulting in performance loss (at least for the GB that encompasses the kernel). Why do you find this "ugly"? The vmalloc region is just a bunch of available virtual addresses for whatever purpose we want, and as noted by Zong, arm64 uses the same scheme. 
> >> In addition, because modules and BPF must be close to the kernel (inside >> +-2GB window), the kernel is placed at the end of the vmalloc zone minus >> 2GB, which leaves room for modules and BPF. The kernel could not be >> placed at the beginning of the vmalloc zone since other vmalloc >> allocations from the kernel could get all the +-2GB window around the >> kernel which would prevent new modules and BPF programs to be loaded. > > Well, that's not enough to make sure this doesn't happen -- it's just > enough to > make sure it doesn't happen very quickily. That's the same boat we're > already > in, though, so it's not like it's worse. Indeed, that's not worse, I haven't found a way to reserve vmalloc area without actually allocating it. > >> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr> >> Reviewed-by: Zong Li <zong.li@sifive.com> >> --- >> arch/riscv/boot/loader.lds.S | 3 +- >> arch/riscv/include/asm/page.h | 10 +++++- >> arch/riscv/include/asm/pgtable.h | 38 ++++++++++++++------- >> arch/riscv/kernel/head.S | 3 +- >> arch/riscv/kernel/module.c | 4 +-- >> arch/riscv/kernel/vmlinux.lds.S | 3 +- >> arch/riscv/mm/init.c | 58 +++++++++++++++++++++++++------- >> arch/riscv/mm/physaddr.c | 2 +- >> 8 files changed, 88 insertions(+), 33 deletions(-) >> >> diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S >> index 47a5003c2e28..62d94696a19c 100644 >> --- a/arch/riscv/boot/loader.lds.S >> +++ b/arch/riscv/boot/loader.lds.S >> @@ -1,13 +1,14 @@ >> /* SPDX-License-Identifier: GPL-2.0 */ >> >> #include <asm/page.h> >> +#include <asm/pgtable.h> >> >> OUTPUT_ARCH(riscv) >> ENTRY(_start) >> >> SECTIONS >> { >> - . = PAGE_OFFSET; >> + . 
= KERNEL_LINK_ADDR; >> >> .payload : { >> *(.payload) >> diff --git a/arch/riscv/include/asm/page.h >> b/arch/riscv/include/asm/page.h >> index 2d50f76efe48..48bb09b6a9b7 100644 >> --- a/arch/riscv/include/asm/page.h >> +++ b/arch/riscv/include/asm/page.h >> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; >> >> #ifdef CONFIG_MMU >> extern unsigned long va_pa_offset; >> +extern unsigned long va_kernel_pa_offset; >> extern unsigned long pfn_base; >> #define ARCH_PFN_OFFSET (pfn_base) >> #else >> #define va_pa_offset 0 >> +#define va_kernel_pa_offset 0 >> #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) >> #endif /* CONFIG_MMU */ >> >> extern unsigned long max_low_pfn; >> extern unsigned long min_low_pfn; >> +extern unsigned long kernel_virt_addr; >> >> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + >> va_pa_offset)) >> -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) >> +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - >> va_pa_offset) >> +#define kernel_mapping_va_to_pa(x) \ >> + ((unsigned long)(x) - va_kernel_pa_offset) >> +#define __va_to_pa_nodebug(x) \ >> + (((x) >= PAGE_OFFSET) ? 
\ >> + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) >> >> #ifdef CONFIG_DEBUG_VIRTUAL >> extern phys_addr_t __virt_to_phys(unsigned long x); >> diff --git a/arch/riscv/include/asm/pgtable.h >> b/arch/riscv/include/asm/pgtable.h >> index 35b60035b6b0..94ef3b49dfb6 100644 >> --- a/arch/riscv/include/asm/pgtable.h >> +++ b/arch/riscv/include/asm/pgtable.h >> @@ -11,23 +11,29 @@ >> >> #include <asm/pgtable-bits.h> >> >> -#ifndef __ASSEMBLY__ >> - >> -/* Page Upper Directory not used in RISC-V */ >> -#include <asm-generic/pgtable-nopud.h> >> -#include <asm/page.h> >> -#include <asm/tlbflush.h> >> -#include <linux/mm_types.h> >> - >> -#ifdef CONFIG_MMU >> +#ifndef CONFIG_MMU >> +#define KERNEL_VIRT_ADDR PAGE_OFFSET >> +#define KERNEL_LINK_ADDR PAGE_OFFSET >> +#else >> +/* >> + * Leave 2GB for modules and BPF that must lie within a 2GB range around >> + * the kernel. >> + */ >> +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) >> +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR > > At a bare minimum this is going to make a mess of the 32-bit port, as > non-relocatable kernels are now going to get linked at 1GiB which is > where user > code is supposed to live. That's an easy fix, though, as the 32-bit stuff > doesn't need any module address restrictions. Indeed, I will take a look at that. 
> >> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) >> #define VMALLOC_END (PAGE_OFFSET - 1) >> #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) >> >> #define BPF_JIT_REGION_SIZE (SZ_128M) >> -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) >> -#define BPF_JIT_REGION_END (VMALLOC_END) >> +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) >> +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + >> BPF_JIT_REGION_SIZE) >> + >> +#ifdef CONFIG_64BIT >> +#define VMALLOC_MODULE_START BPF_JIT_REGION_END >> +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) + >> SZ_2G) >> +#endif >> >> /* >> * Roughly size the vmemmap space to be large enough to fit enough >> @@ -57,9 +63,16 @@ >> #define FIXADDR_SIZE PGDIR_SIZE >> #endif >> #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) >> - >> #endif >> >> +#ifndef __ASSEMBLY__ >> + >> +/* Page Upper Directory not used in RISC-V */ >> +#include <asm-generic/pgtable-nopud.h> >> +#include <asm/page.h> >> +#include <asm/tlbflush.h> >> +#include <linux/mm_types.h> >> + >> #ifdef CONFIG_64BIT >> #include <asm/pgtable-64.h> >> #else >> @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page >> *page, int numpages, int enabl >> >> #define kern_addr_valid(addr) (1) /* FIXME */ >> >> +extern char _start[]; >> extern void *dtb_early_va; >> void setup_bootmem(void); >> void paging_init(void); >> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S >> index 98a406474e7d..8f5bb7731327 100644 >> --- a/arch/riscv/kernel/head.S >> +++ b/arch/riscv/kernel/head.S >> @@ -49,7 +49,8 @@ ENTRY(_start) >> #ifdef CONFIG_MMU >> relocate: >> /* Relocate return address */ >> - li a1, PAGE_OFFSET >> + la a1, kernel_virt_addr >> + REG_L a1, 0(a1) >> la a2, _start >> sub a1, a1, a2 >> add ra, ra, a1 >> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c >> index 8bbe5dbe1341..1a8fbe05accf 100644 >> --- a/arch/riscv/kernel/module.c >> +++ b/arch/riscv/kernel/module.c >> @@ -392,12 
+392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const >> char *strtab, >> } >> >> #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) >> -#define VMALLOC_MODULE_START \ >> - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) >> void *module_alloc(unsigned long size) >> { >> return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, >> - VMALLOC_END, GFP_KERNEL, >> + VMALLOC_MODULE_END, GFP_KERNEL, >> PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, >> __builtin_return_address(0)); >> } >> diff --git a/arch/riscv/kernel/vmlinux.lds.S >> b/arch/riscv/kernel/vmlinux.lds.S >> index 0339b6bbe11a..a9abde62909f 100644 >> --- a/arch/riscv/kernel/vmlinux.lds.S >> +++ b/arch/riscv/kernel/vmlinux.lds.S >> @@ -4,7 +4,8 @@ >> * Copyright (C) 2017 SiFive >> */ >> >> -#define LOAD_OFFSET PAGE_OFFSET >> +#include <asm/pgtable.h> >> +#define LOAD_OFFSET KERNEL_LINK_ADDR >> #include <asm/vmlinux.lds.h> >> #include <asm/page.h> >> #include <asm/cache.h> >> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c >> index 736de6c8739f..71da78914645 100644 >> --- a/arch/riscv/mm/init.c >> +++ b/arch/riscv/mm/init.c >> @@ -22,6 +22,9 @@ >> >> #include "../kernel/head.h" >> >> +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR; >> +EXPORT_SYMBOL(kernel_virt_addr); >> + >> unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] >> __page_aligned_bss; >> EXPORT_SYMBOL(empty_zero_page); >> @@ -178,8 +181,12 @@ void __init setup_bootmem(void) >> } >> >> #ifdef CONFIG_MMU >> +/* Offset between linear mapping virtual address and kernel load >> address */ >> unsigned long va_pa_offset; >> EXPORT_SYMBOL(va_pa_offset); >> +/* Offset between kernel mapping virtual address and kernel load >> address */ >> +unsigned long va_kernel_pa_offset; >> +EXPORT_SYMBOL(va_kernel_pa_offset); >> unsigned long pfn_base; >> EXPORT_SYMBOL(pfn_base); >> >> @@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va) >> if (mmu_enabled) >> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); >> >> - 
pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; >> + pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT; >> BUG_ON(pmd_num >= NUM_EARLY_PMDS); >> return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; >> } >> @@ -372,14 +379,30 @@ static uintptr_t __init >> best_map_size(phys_addr_t base, phys_addr_t size) >> #error "setup_vm() is called from head.S before relocate so it should >> not use absolute addressing." >> #endif >> >> +static uintptr_t load_pa, load_sz; >> + >> +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t >> map_size) >> +{ >> + uintptr_t va, end_va; >> + >> + end_va = kernel_virt_addr + load_sz; >> + for (va = kernel_virt_addr; va < end_va; va += map_size) >> + create_pgd_mapping(pgdir, va, >> + load_pa + (va - kernel_virt_addr), >> + map_size, PAGE_KERNEL_EXEC); >> +} >> + >> asmlinkage void __init setup_vm(uintptr_t dtb_pa) >> { >> uintptr_t va, end_va; >> - uintptr_t load_pa = (uintptr_t)(&_start); >> - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; >> uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE); >> >> + load_pa = (uintptr_t)(&_start); >> + load_sz = (uintptr_t)(&_end) - load_pa; >> + >> va_pa_offset = PAGE_OFFSET - load_pa; >> + va_kernel_pa_offset = kernel_virt_addr - load_pa; >> + >> pfn_base = PFN_DOWN(load_pa); >> >> /* >> @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) >> create_pmd_mapping(fixmap_pmd, FIXADDR_START, >> (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); >> /* Setup trampoline PGD and PMD */ >> - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, >> + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, >> (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); >> - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, >> + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, >> load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); >> #else >> /* Setup trampoline PGD */ >> - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, >> + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, >> 
load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); >> #endif >> >> /* >> - * Setup early PGD covering entire kernel which will allows >> + * Setup early PGD covering entire kernel which will allow >> * us to reach paging_init(). We map all memory banks later >> * in setup_vm_final() below. >> */ >> - end_va = PAGE_OFFSET + load_sz; >> - for (va = PAGE_OFFSET; va < end_va; va += map_size) >> - create_pgd_mapping(early_pg_dir, va, >> - load_pa + (va - PAGE_OFFSET), >> - map_size, PAGE_KERNEL_EXEC); >> + create_kernel_page_table(early_pg_dir, map_size); >> >> /* Create fixed mapping for early FDT parsing */ >> end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; >> @@ -441,6 +460,7 @@ static void __init setup_vm_final(void) >> uintptr_t va, map_size; >> phys_addr_t pa, start, end; >> struct memblock_region *reg; >> + static struct vm_struct vm_kernel = { 0 }; >> >> /* Set mmu_enabled flag */ >> mmu_enabled = true; >> @@ -467,10 +487,22 @@ static void __init setup_vm_final(void) >> for (pa = start; pa < end; pa += map_size) { >> va = (uintptr_t)__va(pa); >> create_pgd_mapping(swapper_pg_dir, va, pa, >> - map_size, PAGE_KERNEL_EXEC); >> + map_size, PAGE_KERNEL); >> } >> } >> >> + /* Map the kernel */ >> + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); >> + >> + /* Reserve the vmalloc area occupied by the kernel */ >> + vm_kernel.addr = (void *)kernel_virt_addr; >> + vm_kernel.phys_addr = load_pa; >> + vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1); >> + vm_kernel.flags = VM_MAP | VM_NO_GUARD; >> + vm_kernel.caller = __builtin_return_address(0); >> + >> + vm_area_add_early(&vm_kernel); >> + >> /* Clear fixmap PTE and PMD mappings */ >> clear_fixmap(FIX_PTE); >> clear_fixmap(FIX_PMD); >> diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c >> index e8e4dcd39fed..35703d5ef5fd 100644 >> --- a/arch/riscv/mm/physaddr.c >> +++ b/arch/riscv/mm/physaddr.c >> @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); >> >> phys_addr_t __phys_addr_symbol(unsigned long x) >> { 
>> - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; >> + unsigned long kernel_start = (unsigned long)kernel_virt_addr; >> unsigned long kernel_end = (unsigned long)_end; >> >> /* Alex
Let's try to make progress here: I add linux-mm in CC to get feedback on this patch as it blocks sv48 support too. Alex Le 7/9/20 à 7:11 AM, Alex Ghiti a écrit : > Hi Palmer, > > Le 7/9/20 à 1:05 AM, Palmer Dabbelt a écrit : >> On Sun, 07 Jun 2020 00:59:46 PDT (-0700), alex@ghiti.fr wrote: >>> This is a preparatory patch for relocatable kernel. >>> >>> The kernel used to be linked at PAGE_OFFSET address and used to be >>> loaded >>> physically at the beginning of the main memory. Therefore, we could use >>> the linear mapping for the kernel mapping. >>> >>> But the relocated kernel base address will be different from PAGE_OFFSET >>> and since in the linear mapping, two different virtual addresses cannot >>> point to the same physical address, the kernel mapping needs to lie >>> outside >>> the linear mapping. >> >> I know it's been a while, but I keep opening this up to review it and >> just >> can't get over how ugly it is to put the kernel's linear map in the >> vmalloc >> region. >> >> I guess I don't understand why this is necessary at all. >> Specifically: why >> can't we just relocate the kernel within the linear map? That would >> let the >> bootloader put the kernel wherever it wants, modulo the physical >> memory size we >> support. We'd need to handle the regions that are coupled to the >> kernel's >> execution address, but we could just put them in an explicit memory >> region >> which is what we should probably be doing anyway. > > Virtual relocation in the linear mapping requires to move the kernel > physically too. Zong implemented this physical move in its KASLR RFC > patchset, which is cumbersome since finding an available physical spot > is harder than just selecting a virtual range in the vmalloc range. > > In addition, having the kernel mapping in the linear mapping prevents > the use of hugepage for the linear mapping resulting in performance loss > (at least for the GB that encompasses the kernel). > > Why do you find this "ugly" ? 
The vmalloc region is just a bunch of > available virtual addresses to whatever purpose we want, and as noted by > Zong, arm64 uses the same scheme. > >> >>> In addition, because modules and BPF must be close to the kernel (inside >>> +-2GB window), the kernel is placed at the end of the vmalloc zone minus >>> 2GB, which leaves room for modules and BPF. The kernel could not be >>> placed at the beginning of the vmalloc zone since other vmalloc >>> allocations from the kernel could get all the +-2GB window around the >>> kernel which would prevent new modules and BPF programs to be loaded. >> >> Well, that's not enough to make sure this doesn't happen -- it's just >> enough to >> make sure it doesn't happen very quickily. That's the same boat we're >> already >> in, though, so it's not like it's worse. > > Indeed, that's not worse, I haven't found a way to reserve vmalloc area > without actually allocating it. > >> >>> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr> >>> Reviewed-by: Zong Li <zong.li@sifive.com> >>> --- >>> arch/riscv/boot/loader.lds.S | 3 +- >>> arch/riscv/include/asm/page.h | 10 +++++- >>> arch/riscv/include/asm/pgtable.h | 38 ++++++++++++++------- >>> arch/riscv/kernel/head.S | 3 +- >>> arch/riscv/kernel/module.c | 4 +-- >>> arch/riscv/kernel/vmlinux.lds.S | 3 +- >>> arch/riscv/mm/init.c | 58 +++++++++++++++++++++++++------- >>> arch/riscv/mm/physaddr.c | 2 +- >>> 8 files changed, 88 insertions(+), 33 deletions(-) >>> >>> diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S >>> index 47a5003c2e28..62d94696a19c 100644 >>> --- a/arch/riscv/boot/loader.lds.S >>> +++ b/arch/riscv/boot/loader.lds.S >>> @@ -1,13 +1,14 @@ >>> /* SPDX-License-Identifier: GPL-2.0 */ >>> >>> #include <asm/page.h> >>> +#include <asm/pgtable.h> >>> >>> OUTPUT_ARCH(riscv) >>> ENTRY(_start) >>> >>> SECTIONS >>> { >>> - . = PAGE_OFFSET; >>> + . 
= KERNEL_LINK_ADDR; >>> >>> .payload : { >>> *(.payload) >>> diff --git a/arch/riscv/include/asm/page.h >>> b/arch/riscv/include/asm/page.h >>> index 2d50f76efe48..48bb09b6a9b7 100644 >>> --- a/arch/riscv/include/asm/page.h >>> +++ b/arch/riscv/include/asm/page.h >>> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; >>> >>> #ifdef CONFIG_MMU >>> extern unsigned long va_pa_offset; >>> +extern unsigned long va_kernel_pa_offset; >>> extern unsigned long pfn_base; >>> #define ARCH_PFN_OFFSET (pfn_base) >>> #else >>> #define va_pa_offset 0 >>> +#define va_kernel_pa_offset 0 >>> #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) >>> #endif /* CONFIG_MMU */ >>> >>> extern unsigned long max_low_pfn; >>> extern unsigned long min_low_pfn; >>> +extern unsigned long kernel_virt_addr; >>> >>> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + >>> va_pa_offset)) >>> -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) >>> +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - >>> va_pa_offset) >>> +#define kernel_mapping_va_to_pa(x) \ >>> + ((unsigned long)(x) - va_kernel_pa_offset) >>> +#define __va_to_pa_nodebug(x) \ >>> + (((x) >= PAGE_OFFSET) ? 
\ >>> + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) >>> >>> #ifdef CONFIG_DEBUG_VIRTUAL >>> extern phys_addr_t __virt_to_phys(unsigned long x); >>> diff --git a/arch/riscv/include/asm/pgtable.h >>> b/arch/riscv/include/asm/pgtable.h >>> index 35b60035b6b0..94ef3b49dfb6 100644 >>> --- a/arch/riscv/include/asm/pgtable.h >>> +++ b/arch/riscv/include/asm/pgtable.h >>> @@ -11,23 +11,29 @@ >>> >>> #include <asm/pgtable-bits.h> >>> >>> -#ifndef __ASSEMBLY__ >>> - >>> -/* Page Upper Directory not used in RISC-V */ >>> -#include <asm-generic/pgtable-nopud.h> >>> -#include <asm/page.h> >>> -#include <asm/tlbflush.h> >>> -#include <linux/mm_types.h> >>> - >>> -#ifdef CONFIG_MMU >>> +#ifndef CONFIG_MMU >>> +#define KERNEL_VIRT_ADDR PAGE_OFFSET >>> +#define KERNEL_LINK_ADDR PAGE_OFFSET >>> +#else >>> +/* >>> + * Leave 2GB for modules and BPF that must lie within a 2GB range >>> around >>> + * the kernel. >>> + */ >>> +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) >>> +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR >> >> At a bare minimum this is going to make a mess of the 32-bit port, as >> non-relocatable kernels are now going to get linked at 1GiB which is >> where user >> code is supposed to live. That's an easy fix, though, as the 32-bit >> stuff >> doesn't need any module address restrictions. > > Indeed, I will take a look at that. 
> >> >>> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) >>> #define VMALLOC_END (PAGE_OFFSET - 1) >>> #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) >>> >>> #define BPF_JIT_REGION_SIZE (SZ_128M) >>> -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) >>> -#define BPF_JIT_REGION_END (VMALLOC_END) >>> +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) >>> +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + >>> BPF_JIT_REGION_SIZE) >>> + >>> +#ifdef CONFIG_64BIT >>> +#define VMALLOC_MODULE_START BPF_JIT_REGION_END >>> +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) >>> + SZ_2G) >>> +#endif >>> >>> /* >>> * Roughly size the vmemmap space to be large enough to fit enough >>> @@ -57,9 +63,16 @@ >>> #define FIXADDR_SIZE PGDIR_SIZE >>> #endif >>> #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) >>> - >>> #endif >>> >>> +#ifndef __ASSEMBLY__ >>> + >>> +/* Page Upper Directory not used in RISC-V */ >>> +#include <asm-generic/pgtable-nopud.h> >>> +#include <asm/page.h> >>> +#include <asm/tlbflush.h> >>> +#include <linux/mm_types.h> >>> + >>> #ifdef CONFIG_64BIT >>> #include <asm/pgtable-64.h> >>> #else >>> @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page >>> *page, int numpages, int enabl >>> >>> #define kern_addr_valid(addr) (1) /* FIXME */ >>> >>> +extern char _start[]; >>> extern void *dtb_early_va; >>> void setup_bootmem(void); >>> void paging_init(void); >>> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S >>> index 98a406474e7d..8f5bb7731327 100644 >>> --- a/arch/riscv/kernel/head.S >>> +++ b/arch/riscv/kernel/head.S >>> @@ -49,7 +49,8 @@ ENTRY(_start) >>> #ifdef CONFIG_MMU >>> relocate: >>> /* Relocate return address */ >>> - li a1, PAGE_OFFSET >>> + la a1, kernel_virt_addr >>> + REG_L a1, 0(a1) >>> la a2, _start >>> sub a1, a1, a2 >>> add ra, ra, a1 >>> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c >>> index 8bbe5dbe1341..1a8fbe05accf 100644 >>> --- 
a/arch/riscv/kernel/module.c >>> +++ b/arch/riscv/kernel/module.c >>> @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const >>> char *strtab, >>> } >>> >>> #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) >>> -#define VMALLOC_MODULE_START \ >>> - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) >>> void *module_alloc(unsigned long size) >>> { >>> return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, >>> - VMALLOC_END, GFP_KERNEL, >>> + VMALLOC_MODULE_END, GFP_KERNEL, >>> PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, >>> __builtin_return_address(0)); >>> } >>> diff --git a/arch/riscv/kernel/vmlinux.lds.S >>> b/arch/riscv/kernel/vmlinux.lds.S >>> index 0339b6bbe11a..a9abde62909f 100644 >>> --- a/arch/riscv/kernel/vmlinux.lds.S >>> +++ b/arch/riscv/kernel/vmlinux.lds.S >>> @@ -4,7 +4,8 @@ >>> * Copyright (C) 2017 SiFive >>> */ >>> >>> -#define LOAD_OFFSET PAGE_OFFSET >>> +#include <asm/pgtable.h> >>> +#define LOAD_OFFSET KERNEL_LINK_ADDR >>> #include <asm/vmlinux.lds.h> >>> #include <asm/page.h> >>> #include <asm/cache.h> >>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c >>> index 736de6c8739f..71da78914645 100644 >>> --- a/arch/riscv/mm/init.c >>> +++ b/arch/riscv/mm/init.c >>> @@ -22,6 +22,9 @@ >>> >>> #include "../kernel/head.h" >>> >>> +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR; >>> +EXPORT_SYMBOL(kernel_virt_addr); >>> + >>> unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] >>> __page_aligned_bss; >>> EXPORT_SYMBOL(empty_zero_page); >>> @@ -178,8 +181,12 @@ void __init setup_bootmem(void) >>> } >>> >>> #ifdef CONFIG_MMU >>> +/* Offset between linear mapping virtual address and kernel load >>> address */ >>> unsigned long va_pa_offset; >>> EXPORT_SYMBOL(va_pa_offset); >>> +/* Offset between kernel mapping virtual address and kernel load >>> address */ >>> +unsigned long va_kernel_pa_offset; >>> +EXPORT_SYMBOL(va_kernel_pa_offset); >>> unsigned long pfn_base; >>> EXPORT_SYMBOL(pfn_base); >>> >>> @@ -271,7 
+278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va) >>> if (mmu_enabled) >>> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); >>> >>> - pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; >>> + pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT; >>> BUG_ON(pmd_num >= NUM_EARLY_PMDS); >>> return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; >>> } >>> @@ -372,14 +379,30 @@ static uintptr_t __init >>> best_map_size(phys_addr_t base, phys_addr_t size) >>> #error "setup_vm() is called from head.S before relocate so it >>> should not use absolute addressing." >>> #endif >>> >>> +static uintptr_t load_pa, load_sz; >>> + >>> +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t >>> map_size) >>> +{ >>> + uintptr_t va, end_va; >>> + >>> + end_va = kernel_virt_addr + load_sz; >>> + for (va = kernel_virt_addr; va < end_va; va += map_size) >>> + create_pgd_mapping(pgdir, va, >>> + load_pa + (va - kernel_virt_addr), >>> + map_size, PAGE_KERNEL_EXEC); >>> +} >>> + >>> asmlinkage void __init setup_vm(uintptr_t dtb_pa) >>> { >>> uintptr_t va, end_va; >>> - uintptr_t load_pa = (uintptr_t)(&_start); >>> - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; >>> uintptr_t map_size = best_map_size(load_pa, >>> MAX_EARLY_MAPPING_SIZE); >>> >>> + load_pa = (uintptr_t)(&_start); >>> + load_sz = (uintptr_t)(&_end) - load_pa; >>> + >>> va_pa_offset = PAGE_OFFSET - load_pa; >>> + va_kernel_pa_offset = kernel_virt_addr - load_pa; >>> + >>> pfn_base = PFN_DOWN(load_pa); >>> >>> /* >>> @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) >>> create_pmd_mapping(fixmap_pmd, FIXADDR_START, >>> (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); >>> /* Setup trampoline PGD and PMD */ >>> - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, >>> + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, >>> (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); >>> - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, >>> + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, >>> 
load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); >>> #else >>> /* Setup trampoline PGD */ >>> - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, >>> + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, >>> load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); >>> #endif >>> >>> /* >>> - * Setup early PGD covering entire kernel which will allows >>> + * Setup early PGD covering entire kernel which will allow >>> * us to reach paging_init(). We map all memory banks later >>> * in setup_vm_final() below. >>> */ >>> - end_va = PAGE_OFFSET + load_sz; >>> - for (va = PAGE_OFFSET; va < end_va; va += map_size) >>> - create_pgd_mapping(early_pg_dir, va, >>> - load_pa + (va - PAGE_OFFSET), >>> - map_size, PAGE_KERNEL_EXEC); >>> + create_kernel_page_table(early_pg_dir, map_size); >>> >>> /* Create fixed mapping for early FDT parsing */ >>> end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; >>> @@ -441,6 +460,7 @@ static void __init setup_vm_final(void) >>> uintptr_t va, map_size; >>> phys_addr_t pa, start, end; >>> struct memblock_region *reg; >>> + static struct vm_struct vm_kernel = { 0 }; >>> >>> /* Set mmu_enabled flag */ >>> mmu_enabled = true; >>> @@ -467,10 +487,22 @@ static void __init setup_vm_final(void) >>> for (pa = start; pa < end; pa += map_size) { >>> va = (uintptr_t)__va(pa); >>> create_pgd_mapping(swapper_pg_dir, va, pa, >>> - map_size, PAGE_KERNEL_EXEC); >>> + map_size, PAGE_KERNEL); >>> } >>> } >>> >>> + /* Map the kernel */ >>> + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); >>> + >>> + /* Reserve the vmalloc area occupied by the kernel */ >>> + vm_kernel.addr = (void *)kernel_virt_addr; >>> + vm_kernel.phys_addr = load_pa; >>> + vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1); >>> + vm_kernel.flags = VM_MAP | VM_NO_GUARD; >>> + vm_kernel.caller = __builtin_return_address(0); >>> + >>> + vm_area_add_early(&vm_kernel); >>> + >>> /* Clear fixmap PTE and PMD mappings */ >>> clear_fixmap(FIX_PTE); >>> clear_fixmap(FIX_PMD); >>> diff --git 
a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c >>> index e8e4dcd39fed..35703d5ef5fd 100644 >>> --- a/arch/riscv/mm/physaddr.c >>> +++ b/arch/riscv/mm/physaddr.c >>> @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); >>> >>> phys_addr_t __phys_addr_symbol(unsigned long x) >>> { >>> - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; >>> + unsigned long kernel_start = (unsigned long)kernel_virt_addr; >>> unsigned long kernel_end = (unsigned long)_end; >>> >>> /* > > Alex
On Tue, 21 Jul 2020 11:36:10 PDT (-0700), alex@ghiti.fr wrote: > Let's try to make progress here: I add linux-mm in CC to get feedback on > this patch as it blocks sv48 support too. Sorry for being slow here. I haven't replied because I hadn't really fleshed out the design yet, but just so everyone's on the same page my problems with this are: * We waste vmalloc space on 32-bit systems, where there isn't a lot of it. * On 64-bit systems the VA space around the kernel is precious because it's the only place we can place text (modules, BPF, whatever). If we start putting the kernel in the vmalloc space then we either have to pre-allocate a bunch of space around it (essentially making it a fixed mapping anyway) or it becomes likely that we won't be able to find space for modules as they're loaded into running systems. * Relying on a relocatable kernel for sv48 support introduces a fairly large performance hit. Roughly, my proposal would be to: * Leave the 32-bit memory map alone. On 32-bit systems we can load modules anywhere and we only have one VA width, so we're not really solving any problems with these changes. * Statically allocate a 2GiB portion of the VA space for all our text, as its own region. We'd link/relocate the kernel here instead of around PAGE_OFFSET, which would decouple the kernel from the physical memory layout of the system. This would have the side effect of sorting out a bunch of bootloader headaches that we currently have. * Sort out how to maintain a linear map as the canonical hole moves around between the VA widths without adding a bunch of overhead to virt2phys and friends. This is probably going to be the trickiest part, but I think if we just change the page table code to essentially lie about VAs when an sv39 system runs an sv48+sv39 kernel we could make it work -- there'd be some logical complexity involved, but it would remain fast.
This doesn't solve the problem of virtually relocatable kernels, but it does let us decouple that from the sv48 stuff. It also lets us stop relying on a fixed physical address the kernel is loaded into, which is another thing I don't like. I know this may be a more complicated approach, but there aren't any sv48 systems around right now so I just don't see the rush to support them, particularly when there's a cost to what already exists (for those who haven't been watching, so far all the sv48 patch sets have imposed a significant performance penalty on all systems). > > Alex > > Le 7/9/20 à 7:11 AM, Alex Ghiti a écrit : >> Hi Palmer, >> >> Le 7/9/20 à 1:05 AM, Palmer Dabbelt a écrit : >>> On Sun, 07 Jun 2020 00:59:46 PDT (-0700), alex@ghiti.fr wrote: >>>> This is a preparatory patch for relocatable kernel. >>>> >>>> The kernel used to be linked at PAGE_OFFSET address and used to be >>>> loaded >>>> physically at the beginning of the main memory. Therefore, we could use >>>> the linear mapping for the kernel mapping. >>>> >>>> But the relocated kernel base address will be different from PAGE_OFFSET >>>> and since in the linear mapping, two different virtual addresses cannot >>>> point to the same physical address, the kernel mapping needs to lie >>>> outside >>>> the linear mapping. >>> >>> I know it's been a while, but I keep opening this up to review it and >>> just >>> can't get over how ugly it is to put the kernel's linear map in the >>> vmalloc >>> region. >>> >>> I guess I don't understand why this is necessary at all. >>> Specifically: why >>> can't we just relocate the kernel within the linear map? That would >>> let the >>> bootloader put the kernel wherever it wants, modulo the physical >>> memory size we >>> support. We'd need to handle the regions that are coupled to the >>> kernel's >>> execution address, but we could just put them in an explicit memory >>> region >>> which is what we should probably be doing anyway. 
>> >> Virtual relocation in the linear mapping requires to move the kernel >> physically too. Zong implemented this physical move in its KASLR RFC >> patchset, which is cumbersome since finding an available physical spot >> is harder than just selecting a virtual range in the vmalloc range. >> >> In addition, having the kernel mapping in the linear mapping prevents >> the use of hugepage for the linear mapping resulting in performance loss >> (at least for the GB that encompasses the kernel). >> >> Why do you find this "ugly" ? The vmalloc region is just a bunch of >> available virtual addresses to whatever purpose we want, and as noted by >> Zong, arm64 uses the same scheme. >> >>> >>>> In addition, because modules and BPF must be close to the kernel (inside >>>> +-2GB window), the kernel is placed at the end of the vmalloc zone minus >>>> 2GB, which leaves room for modules and BPF. The kernel could not be >>>> placed at the beginning of the vmalloc zone since other vmalloc >>>> allocations from the kernel could get all the +-2GB window around the >>>> kernel which would prevent new modules and BPF programs to be loaded. >>> >>> Well, that's not enough to make sure this doesn't happen -- it's just >>> enough to >>> make sure it doesn't happen very quickily. That's the same boat we're >>> already >>> in, though, so it's not like it's worse. >> >> Indeed, that's not worse, I haven't found a way to reserve vmalloc area >> without actually allocating it. 
>> >>> >>>> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr> >>>> Reviewed-by: Zong Li <zong.li@sifive.com> >>>> --- >>>> arch/riscv/boot/loader.lds.S | 3 +- >>>> arch/riscv/include/asm/page.h | 10 +++++- >>>> arch/riscv/include/asm/pgtable.h | 38 ++++++++++++++------- >>>> arch/riscv/kernel/head.S | 3 +- >>>> arch/riscv/kernel/module.c | 4 +-- >>>> arch/riscv/kernel/vmlinux.lds.S | 3 +- >>>> arch/riscv/mm/init.c | 58 +++++++++++++++++++++++++------- >>>> arch/riscv/mm/physaddr.c | 2 +- >>>> 8 files changed, 88 insertions(+), 33 deletions(-) >>>> >>>> diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S >>>> index 47a5003c2e28..62d94696a19c 100644 >>>> --- a/arch/riscv/boot/loader.lds.S >>>> +++ b/arch/riscv/boot/loader.lds.S >>>> @@ -1,13 +1,14 @@ >>>> /* SPDX-License-Identifier: GPL-2.0 */ >>>> >>>> #include <asm/page.h> >>>> +#include <asm/pgtable.h> >>>> >>>> OUTPUT_ARCH(riscv) >>>> ENTRY(_start) >>>> >>>> SECTIONS >>>> { >>>> - . = PAGE_OFFSET; >>>> + . = KERNEL_LINK_ADDR; >>>> >>>> .payload : { >>>> *(.payload) >>>> diff --git a/arch/riscv/include/asm/page.h >>>> b/arch/riscv/include/asm/page.h >>>> index 2d50f76efe48..48bb09b6a9b7 100644 >>>> --- a/arch/riscv/include/asm/page.h >>>> +++ b/arch/riscv/include/asm/page.h >>>> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; >>>> >>>> #ifdef CONFIG_MMU >>>> extern unsigned long va_pa_offset; >>>> +extern unsigned long va_kernel_pa_offset; >>>> extern unsigned long pfn_base; >>>> #define ARCH_PFN_OFFSET (pfn_base) >>>> #else >>>> #define va_pa_offset 0 >>>> +#define va_kernel_pa_offset 0 >>>> #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) >>>> #endif /* CONFIG_MMU */ >>>> >>>> extern unsigned long max_low_pfn; >>>> extern unsigned long min_low_pfn; >>>> +extern unsigned long kernel_virt_addr; >>>> >>>> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + >>>> va_pa_offset)) >>>> -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) >>>> +#define 
linear_mapping_va_to_pa(x) ((unsigned long)(x) - >>>> va_pa_offset) >>>> +#define kernel_mapping_va_to_pa(x) \ >>>> + ((unsigned long)(x) - va_kernel_pa_offset) >>>> +#define __va_to_pa_nodebug(x) \ >>>> + (((x) >= PAGE_OFFSET) ? \ >>>> + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) >>>> >>>> #ifdef CONFIG_DEBUG_VIRTUAL >>>> extern phys_addr_t __virt_to_phys(unsigned long x); >>>> diff --git a/arch/riscv/include/asm/pgtable.h >>>> b/arch/riscv/include/asm/pgtable.h >>>> index 35b60035b6b0..94ef3b49dfb6 100644 >>>> --- a/arch/riscv/include/asm/pgtable.h >>>> +++ b/arch/riscv/include/asm/pgtable.h >>>> @@ -11,23 +11,29 @@ >>>> >>>> #include <asm/pgtable-bits.h> >>>> >>>> -#ifndef __ASSEMBLY__ >>>> - >>>> -/* Page Upper Directory not used in RISC-V */ >>>> -#include <asm-generic/pgtable-nopud.h> >>>> -#include <asm/page.h> >>>> -#include <asm/tlbflush.h> >>>> -#include <linux/mm_types.h> >>>> - >>>> -#ifdef CONFIG_MMU >>>> +#ifndef CONFIG_MMU >>>> +#define KERNEL_VIRT_ADDR PAGE_OFFSET >>>> +#define KERNEL_LINK_ADDR PAGE_OFFSET >>>> +#else >>>> +/* >>>> + * Leave 2GB for modules and BPF that must lie within a 2GB range >>>> around >>>> + * the kernel. >>>> + */ >>>> +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) >>>> +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR >>> >>> At a bare minimum this is going to make a mess of the 32-bit port, as >>> non-relocatable kernels are now going to get linked at 1GiB which is >>> where user >>> code is supposed to live. That's an easy fix, though, as the 32-bit >>> stuff >>> doesn't need any module address restrictions. >> >> Indeed, I will take a look at that. 
>> >>> >>>> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) >>>> #define VMALLOC_END (PAGE_OFFSET - 1) >>>> #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) >>>> >>>> #define BPF_JIT_REGION_SIZE (SZ_128M) >>>> -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) >>>> -#define BPF_JIT_REGION_END (VMALLOC_END) >>>> +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) >>>> +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + >>>> BPF_JIT_REGION_SIZE) >>>> + >>>> +#ifdef CONFIG_64BIT >>>> +#define VMALLOC_MODULE_START BPF_JIT_REGION_END >>>> +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) >>>> + SZ_2G) >>>> +#endif >>>> >>>> /* >>>> * Roughly size the vmemmap space to be large enough to fit enough >>>> @@ -57,9 +63,16 @@ >>>> #define FIXADDR_SIZE PGDIR_SIZE >>>> #endif >>>> #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) >>>> - >>>> #endif >>>> >>>> +#ifndef __ASSEMBLY__ >>>> + >>>> +/* Page Upper Directory not used in RISC-V */ >>>> +#include <asm-generic/pgtable-nopud.h> >>>> +#include <asm/page.h> >>>> +#include <asm/tlbflush.h> >>>> +#include <linux/mm_types.h> >>>> + >>>> #ifdef CONFIG_64BIT >>>> #include <asm/pgtable-64.h> >>>> #else >>>> @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page >>>> *page, int numpages, int enabl >>>> >>>> #define kern_addr_valid(addr) (1) /* FIXME */ >>>> >>>> +extern char _start[]; >>>> extern void *dtb_early_va; >>>> void setup_bootmem(void); >>>> void paging_init(void); >>>> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S >>>> index 98a406474e7d..8f5bb7731327 100644 >>>> --- a/arch/riscv/kernel/head.S >>>> +++ b/arch/riscv/kernel/head.S >>>> @@ -49,7 +49,8 @@ ENTRY(_start) >>>> #ifdef CONFIG_MMU >>>> relocate: >>>> /* Relocate return address */ >>>> - li a1, PAGE_OFFSET >>>> + la a1, kernel_virt_addr >>>> + REG_L a1, 0(a1) >>>> la a2, _start >>>> sub a1, a1, a2 >>>> add ra, ra, a1 >>>> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c >>>> 
index 8bbe5dbe1341..1a8fbe05accf 100644 >>>> --- a/arch/riscv/kernel/module.c >>>> +++ b/arch/riscv/kernel/module.c >>>> @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const >>>> char *strtab, >>>> } >>>> >>>> #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) >>>> -#define VMALLOC_MODULE_START \ >>>> - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) >>>> void *module_alloc(unsigned long size) >>>> { >>>> return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, >>>> - VMALLOC_END, GFP_KERNEL, >>>> + VMALLOC_MODULE_END, GFP_KERNEL, >>>> PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, >>>> __builtin_return_address(0)); >>>> } >>>> diff --git a/arch/riscv/kernel/vmlinux.lds.S >>>> b/arch/riscv/kernel/vmlinux.lds.S >>>> index 0339b6bbe11a..a9abde62909f 100644 >>>> --- a/arch/riscv/kernel/vmlinux.lds.S >>>> +++ b/arch/riscv/kernel/vmlinux.lds.S >>>> @@ -4,7 +4,8 @@ >>>> * Copyright (C) 2017 SiFive >>>> */ >>>> >>>> -#define LOAD_OFFSET PAGE_OFFSET >>>> +#include <asm/pgtable.h> >>>> +#define LOAD_OFFSET KERNEL_LINK_ADDR >>>> #include <asm/vmlinux.lds.h> >>>> #include <asm/page.h> >>>> #include <asm/cache.h> >>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c >>>> index 736de6c8739f..71da78914645 100644 >>>> --- a/arch/riscv/mm/init.c >>>> +++ b/arch/riscv/mm/init.c >>>> @@ -22,6 +22,9 @@ >>>> >>>> #include "../kernel/head.h" >>>> >>>> +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR; >>>> +EXPORT_SYMBOL(kernel_virt_addr); >>>> + >>>> unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] >>>> __page_aligned_bss; >>>> EXPORT_SYMBOL(empty_zero_page); >>>> @@ -178,8 +181,12 @@ void __init setup_bootmem(void) >>>> } >>>> >>>> #ifdef CONFIG_MMU >>>> +/* Offset between linear mapping virtual address and kernel load >>>> address */ >>>> unsigned long va_pa_offset; >>>> EXPORT_SYMBOL(va_pa_offset); >>>> +/* Offset between kernel mapping virtual address and kernel load >>>> address */ >>>> +unsigned long va_kernel_pa_offset; >>>> 
+EXPORT_SYMBOL(va_kernel_pa_offset); >>>> unsigned long pfn_base; >>>> EXPORT_SYMBOL(pfn_base); >>>> >>>> @@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va) >>>> if (mmu_enabled) >>>> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); >>>> >>>> - pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; >>>> + pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT; >>>> BUG_ON(pmd_num >= NUM_EARLY_PMDS); >>>> return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; >>>> } >>>> @@ -372,14 +379,30 @@ static uintptr_t __init >>>> best_map_size(phys_addr_t base, phys_addr_t size) >>>> #error "setup_vm() is called from head.S before relocate so it >>>> should not use absolute addressing." >>>> #endif >>>> >>>> +static uintptr_t load_pa, load_sz; >>>> + >>>> +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t >>>> map_size) >>>> +{ >>>> + uintptr_t va, end_va; >>>> + >>>> + end_va = kernel_virt_addr + load_sz; >>>> + for (va = kernel_virt_addr; va < end_va; va += map_size) >>>> + create_pgd_mapping(pgdir, va, >>>> + load_pa + (va - kernel_virt_addr), >>>> + map_size, PAGE_KERNEL_EXEC); >>>> +} >>>> + >>>> asmlinkage void __init setup_vm(uintptr_t dtb_pa) >>>> { >>>> uintptr_t va, end_va; >>>> - uintptr_t load_pa = (uintptr_t)(&_start); >>>> - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; >>>> uintptr_t map_size = best_map_size(load_pa, >>>> MAX_EARLY_MAPPING_SIZE); >>>> >>>> + load_pa = (uintptr_t)(&_start); >>>> + load_sz = (uintptr_t)(&_end) - load_pa; >>>> + >>>> va_pa_offset = PAGE_OFFSET - load_pa; >>>> + va_kernel_pa_offset = kernel_virt_addr - load_pa; >>>> + >>>> pfn_base = PFN_DOWN(load_pa); >>>> >>>> /* >>>> @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) >>>> create_pmd_mapping(fixmap_pmd, FIXADDR_START, >>>> (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); >>>> /* Setup trampoline PGD and PMD */ >>>> - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, >>>> + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, >>>> 
(uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); >>>> - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, >>>> + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, >>>> load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); >>>> #else >>>> /* Setup trampoline PGD */ >>>> - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, >>>> + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, >>>> load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); >>>> #endif >>>> >>>> /* >>>> - * Setup early PGD covering entire kernel which will allows >>>> + * Setup early PGD covering entire kernel which will allow >>>> * us to reach paging_init(). We map all memory banks later >>>> * in setup_vm_final() below. >>>> */ >>>> - end_va = PAGE_OFFSET + load_sz; >>>> - for (va = PAGE_OFFSET; va < end_va; va += map_size) >>>> - create_pgd_mapping(early_pg_dir, va, >>>> - load_pa + (va - PAGE_OFFSET), >>>> - map_size, PAGE_KERNEL_EXEC); >>>> + create_kernel_page_table(early_pg_dir, map_size); >>>> >>>> /* Create fixed mapping for early FDT parsing */ >>>> end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; >>>> @@ -441,6 +460,7 @@ static void __init setup_vm_final(void) >>>> uintptr_t va, map_size; >>>> phys_addr_t pa, start, end; >>>> struct memblock_region *reg; >>>> + static struct vm_struct vm_kernel = { 0 }; >>>> >>>> /* Set mmu_enabled flag */ >>>> mmu_enabled = true; >>>> @@ -467,10 +487,22 @@ static void __init setup_vm_final(void) >>>> for (pa = start; pa < end; pa += map_size) { >>>> va = (uintptr_t)__va(pa); >>>> create_pgd_mapping(swapper_pg_dir, va, pa, >>>> - map_size, PAGE_KERNEL_EXEC); >>>> + map_size, PAGE_KERNEL); >>>> } >>>> } >>>> >>>> + /* Map the kernel */ >>>> + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); >>>> + >>>> + /* Reserve the vmalloc area occupied by the kernel */ >>>> + vm_kernel.addr = (void *)kernel_virt_addr; >>>> + vm_kernel.phys_addr = load_pa; >>>> + vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1); >>>> + vm_kernel.flags = VM_MAP | VM_NO_GUARD; >>>> + 
vm_kernel.caller = __builtin_return_address(0); >>>> + >>>> + vm_area_add_early(&vm_kernel); >>>> + >>>> /* Clear fixmap PTE and PMD mappings */ >>>> clear_fixmap(FIX_PTE); >>>> clear_fixmap(FIX_PMD); >>>> diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c >>>> index e8e4dcd39fed..35703d5ef5fd 100644 >>>> --- a/arch/riscv/mm/physaddr.c >>>> +++ b/arch/riscv/mm/physaddr.c >>>> @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); >>>> >>>> phys_addr_t __phys_addr_symbol(unsigned long x) >>>> { >>>> - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; >>>> + unsigned long kernel_start = (unsigned long)kernel_virt_addr; >>>> unsigned long kernel_end = (unsigned long)_end; >>>> >>>> /* >> >> Alex
On Tue, 2020-07-21 at 14:36 -0400, Alex Ghiti wrote: > > > I guess I don't understand why this is necessary at all. > > > Specifically: why > > > can't we just relocate the kernel within the linear map? That would > > > let the > > > bootloader put the kernel wherever it wants, modulo the physical > > > memory size we > > > support. We'd need to handle the regions that are coupled to the > > > kernel's > > > execution address, but we could just put them in an explicit memory > > > region > > > which is what we should probably be doing anyway. > > > > Virtual relocation in the linear mapping requires to move the kernel > > physically too. Zong implemented this physical move in its KASLR RFC > > patchset, which is cumbersome since finding an available physical spot > > is harder than just selecting a virtual range in the vmalloc range. > > > > In addition, having the kernel mapping in the linear mapping prevents > > the use of hugepage for the linear mapping resulting in performance loss > > (at least for the GB that encompasses the kernel). > > > > Why do you find this "ugly" ? The vmalloc region is just a bunch of > > available virtual addresses to whatever purpose we want, and as noted by > > Zong, arm64 uses the same scheme. I don't get it :-) At least on powerpc we move the kernel in the linear mapping and it works fine with huge pages, what is your problem there ? You rely on punching small-page size holes in there ? At least in the old days, there were a number of assumptions that the kernel text/data/bss resides in the linear mapping. If you change that you need to ensure that it's still physically contiguous and you'll have to tweak __va and __pa, which might induce extra overhead. Cheers, Ben.
On Tue, 2020-07-21 at 12:05 -0700, Palmer Dabbelt wrote: > > * We waste vmalloc space on 32-bit systems, where there isn't a lot of it. > * On 64-bit systems the VA space around the kernel is precious because it's the > only place we can place text (modules, BPF, whatever). Why ? Branch distance limits ? You can't use trampolines ? > If we start putting > the kernel in the vmalloc space then we either have to pre-allocate a bunch > of space around it (essentially making it a fixed mapping anyway) or it > becomes likely that we won't be able to find space for modules as they're > loaded into running systems. I dislike the kernel being in the vmalloc space (see my other email) but I don't understand the specific issue with modules. > * Relying on a relocatable kernel for sv48 support introduces a fairly large > performance hit. Out of curiosity why would relocatable kernels introduce a significant hit ? Where about do you see the overhead coming from ? > Roughly, my proposal would be to: > > * Leave the 32-bit memory map alone. On 32-bit systems we can load modules > anywhere and we only have one VA width, so we're not really solving any > problems with these changes. > * Staticly allocate a 2GiB portion of the VA space for all our text, as its own > region. We'd link/relocate the kernel here instead of around PAGE_OFFSET, > which would decouple the kernel from the physical memory layout of the system. > This would have the side effect of sorting out a bunch of bootloader headaches > that we currently have. > * Sort out how to maintain a linear map as the canonical hole moves around > between the VA widths without adding a bunch of overhead to the virt2phys and > friends. This is probably going to be the trickiest part, but I think if we > just change the page table code to essentially lie about VAs when an sv39 > system runs an sv48+sv39 kernel we could make it work -- there'd be some > logical complexity involved, but it would remain fast. 
> > This doesn't solve the problem of virtually relocatable kernels, but it does > let us decouple that from the sv48 stuff. It also lets us stop relying on a > fixed physical address the kernel is loaded into, which is another thing I > don't like. > > I know this may be a more complicated approach, but there aren't any sv48 > systems around right now so I just don't see the rush to support them, > particularly when there's a cost to what already exists (for those who haven't > been watching, so far all the sv48 patch sets have imposed a significant > performance penalty on all systems).
On Tue, 21 Jul 2020 16:11:02 PDT (-0700), benh@kernel.crashing.org wrote: > On Tue, 2020-07-21 at 14:36 -0400, Alex Ghiti wrote: >> > > I guess I don't understand why this is necessary at all. >> > > Specifically: why >> > > can't we just relocate the kernel within the linear map? That would >> > > let the >> > > bootloader put the kernel wherever it wants, modulo the physical >> > > memory size we >> > > support. We'd need to handle the regions that are coupled to the >> > > kernel's >> > > execution address, but we could just put them in an explicit memory >> > > region >> > > which is what we should probably be doing anyway. >> > >> > Virtual relocation in the linear mapping requires to move the kernel >> > physically too. Zong implemented this physical move in its KASLR RFC >> > patchset, which is cumbersome since finding an available physical spot >> > is harder than just selecting a virtual range in the vmalloc range. >> > >> > In addition, having the kernel mapping in the linear mapping prevents >> > the use of hugepage for the linear mapping resulting in performance loss >> > (at least for the GB that encompasses the kernel). >> > >> > Why do you find this "ugly" ? The vmalloc region is just a bunch of >> > available virtual addresses to whatever purpose we want, and as noted by >> > Zong, arm64 uses the same scheme. > > I don't get it :-) > > At least on powerpc we move the kernel in the linear mapping and it > works fine with huge pages, what is your problem there ? You rely on > punching small-page size holes in there ? That was my original suggestion, and I'm not actually sure it's invalid. It would mean that both the kernel's physical and virtual addresses are set by the bootloader, which may or may not be workable if we want to have an sv48+sv39 kernel. 
My initial approach to sv48+sv39 kernels would be to just throw away the sv39 memory on sv48 kernels, which would preserve the linear map but mean that there is no single physical address that's accessible for both. That would require some coordination between the bootloader and the kernel as to where it should be loaded, but maybe there's a better way to design the linear map. Right now we have a bunch of unwritten rules about where things need to be loaded, which is a recipe for disaster. We could copy the kernel around, but I'm not sure I really like that idea. We do zero the BSS right now, so it's not like we entirely rely on the bootloader to set up the kernel image, but with the hart race boot scheme we have right now we'd at least need to leave a stub sitting around. Maybe we just throw away SBI v0.1, though, that's why we called it all legacy in the first place. My bigger worry is that anything that involves running the kernel at arbitrary virtual addresses means we need a PIC kernel, which means every global symbol needs an indirection. That's probably not so bad for shared libraries, but the kernel has a lot of global symbols. PLT references probably aren't so scary, as we have an incoherent instruction cache so the virtual function predictor isn't that hard to build, but making all global data accesses GOT-relative seems like a disaster for performance. This fixed-VA thing really just exists so we don't have to be full-on PIC. In theory I think we could just get away with pretending that medany is PIC, which I believe works as long as the data and text offset stays constant, you don't have any symbols between 2GiB and -2GiB (as those may stay fixed, even in medany), and you deal with GP accordingly (which should work itself out in the current startup code). We rely on this for some of the early boot code (and will soon for kexec), but that's a very controlled code base and we've already had some issues.
I'd be much more comfortable adding an explicit semi-PIC code model, as I tend to miss something when doing these sorts of things and then we could at least add it to the GCC test runs and guarantee it actually works. Not really sure I want to deal with that, though. It would, however, be the only way to get random virtual addresses during kernel execution. > At least in the old days, there were a number of assumptions that > the kernel text/data/bss resides in the linear mapping. Ya, it terrified me as well. Alex says arm64 puts the kernel in the vmalloc region, so assuming that's the case it must be possible. I didn't get that from reading the arm64 port (I guess it's no secret that pretty much all I do is copy their code) > If you change that you need to ensure that it's still physically > contiguous and you'll have to tweak __va and __pa, which might induce > extra overhead. I'm operating under the assumption that we don't want to add an additional load to virt2phys conversions. arm64 bends over backwards to avoid the load, and I'm assuming they have a reason for doing so. Of course, if we're PIC then maybe performance just doesn't matter, but I'm not sure I want to just give up. Distros will probably build the sv48+sv39 kernels as soon as they show up, even if there's no sv48 hardware for a while.
On Tue, 21 Jul 2020 16:12:58 PDT (-0700), benh@kernel.crashing.org wrote: > On Tue, 2020-07-21 at 12:05 -0700, Palmer Dabbelt wrote: >> >> * We waste vmalloc space on 32-bit systems, where there isn't a lot of it. >> * On 64-bit systems the VA space around the kernel is precious because it's the >> only place we can place text (modules, BPF, whatever). > > Why ? Branch distance limits ? You can't use trampolines ? Nothing fundamental, it's just that we don't have a large code model in the C compiler. As a result all the global symbols are resolved as 32-bit PC-relative accesses. We could fix this with a fast large code model, but then the kernel would need to relax global symbol references in modules and we don't even do that for the simple code models we have now. FWIW, some of the proposed large code models are essentially just split-PLT/GOT and therefore don't require relaxation, but at that point we're essentially PIC until we have more than 2GiB of kernel text -- and even then, we keep all the performance issues. >> If we start putting >> the kernel in the vmalloc space then we either have to pre-allocate a bunch >> of space around it (essentially making it a fixed mapping anyway) or it >> becomes likely that we won't be able to find space for modules as they're >> loaded into running systems. > > I dislike the kernel being in the vmalloc space (see my other email) > but I don't understand the specific issue with modules. Essentially what's above, the modules smell the same as the rest of the kernel's code and therefore have a similar set of restrictions. If we build PIC modules and have the PLT entries do GOT loads (as do our shared libraries) then we could break this restriction, but that comes with some performance implications. Like I said in the other email, I'm less worried about the instruction side of things so maybe that's the right way to go. >> * Relying on a relocatable kernel for sv48 support introduces a fairly large >> performance hit.
> > Out of curiosity why would relocatable kernels introduce a significant > hit ? Where about do you see the overhead coming from ? Our PIC codegen, probably better addressed by my other email and above. > >> Roughly, my proposal would be to: >> >> * Leave the 32-bit memory map alone. On 32-bit systems we can load modules >> anywhere and we only have one VA width, so we're not really solving any >> problems with these changes. >> * Staticly allocate a 2GiB portion of the VA space for all our text, as its own >> region. We'd link/relocate the kernel here instead of around PAGE_OFFSET, >> which would decouple the kernel from the physical memory layout of the system. >> This would have the side effect of sorting out a bunch of bootloader headaches >> that we currently have. >> * Sort out how to maintain a linear map as the canonical hole moves around >> between the VA widths without adding a bunch of overhead to the virt2phys and >> friends. This is probably going to be the trickiest part, but I think if we >> just change the page table code to essentially lie about VAs when an sv39 >> system runs an sv48+sv39 kernel we could make it work -- there'd be some >> logical complexity involved, but it would remain fast. >> >> This doesn't solve the problem of virtually relocatable kernels, but it does >> let us decouple that from the sv48 stuff. It also lets us stop relying on a >> fixed physical address the kernel is loaded into, which is another thing I >> don't like. >> >> I know this may be a more complicated approach, but there aren't any sv48 >> systems around right now so I just don't see the rush to support them, >> particularly when there's a cost to what already exists (for those who haven't >> been watching, so far all the sv48 patch sets have imposed a significant >> performance penalty on all systems).
On Tue, 2020-07-21 at 16:48 -0700, Palmer Dabbelt wrote: > > Why ? Branch distance limits ? You can't use trampolines ? > > Nothing fundamental, it's just that we don't have a large code model in the C > compiler. As a result all the global symbols are resolved as 32-bit > PC-relative accesses. We could fix this with a fast large code model, but then > the kernel would need to relax global symbol references in modules and we don't > even do that for the simple code models we have now. FWIW, some of the > proposed large code models are essentially just split-PLT/GOT and therefor > don't require relaxation, but at that point we're essentially PIC until we > have more that 2GiB of kernel text -- and even then, we keep all the > performance issues. My memory might be out of date but I *think* we do it on powerpc without going to a large code model, but just having the in-kernel linker insert trampolines. Cheers, Ben.
Benjamin Herrenschmidt <benh@kernel.crashing.org> writes: > On Tue, 2020-07-21 at 16:48 -0700, Palmer Dabbelt wrote: >> > Why ? Branch distance limits ? You can't use trampolines ? >> >> Nothing fundamental, it's just that we don't have a large code model in the C >> compiler. As a result all the global symbols are resolved as 32-bit >> PC-relative accesses. We could fix this with a fast large code model, but then >> the kernel would need to relax global symbol references in modules and we don't >> even do that for the simple code models we have now. FWIW, some of the >> proposed large code models are essentially just split-PLT/GOT and therefor >> don't require relaxation, but at that point we're essentially PIC until we >> have more that 2GiB of kernel text -- and even then, we keep all the >> performance issues. > > My memory might be out of date but I *think* we do it on powerpc > without going to a large code model, but just having the in-kernel > linker insert trampolines. We build modules with the large code model, and always have AFAIK: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/Makefile?commit=4fa640dc52302b5e62b01b05c755b055549633ae#n129 # -mcmodel=medium breaks modules because it uses 32bit offsets from # the TOC pointer to create pointers where possible. Pointers into the # percpu data area are created by this method. # # The kernel module loader relocates the percpu data section from the # original location (starting with 0xd...) to somewhere in the base # kernel percpu data space (starting with 0xc...). We need a full # 64bit relocation for this to work, hence -mcmodel=large. KBUILD_CFLAGS_MODULE += -mcmodel=large We also insert trampolines for branches, but IIUC that's a separate issue. cheers
On Tue, 21 Jul 2020 21:50:42 PDT (-0700), mpe@ellerman.id.au wrote: > Benjamin Herrenschmidt <benh@kernel.crashing.org> writes: >> On Tue, 2020-07-21 at 16:48 -0700, Palmer Dabbelt wrote: >>> > Why ? Branch distance limits ? You can't use trampolines ? >>> >>> Nothing fundamental, it's just that we don't have a large code model in the C >>> compiler. As a result all the global symbols are resolved as 32-bit >>> PC-relative accesses. We could fix this with a fast large code model, but then >>> the kernel would need to relax global symbol references in modules and we don't >>> even do that for the simple code models we have now. FWIW, some of the >>> proposed large code models are essentially just split-PLT/GOT and therefor >>> don't require relaxation, but at that point we're essentially PIC until we >>> have more that 2GiB of kernel text -- and even then, we keep all the >>> performance issues. >> >> My memory might be out of date but I *think* we do it on powerpc >> without going to a large code model, but just having the in-kernel >> linker insert trampolines. > > We build modules with the large code model, and always have AFAIK: > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/Makefile?commit=4fa640dc52302b5e62b01b05c755b055549633ae#n129 > > # -mcmodel=medium breaks modules because it uses 32bit offsets from > # the TOC pointer to create pointers where possible. Pointers into the > # percpu data area are created by this method. > # > # The kernel module loader relocates the percpu data section from the > # original location (starting with 0xd...) to somewhere in the base > # kernel percpu data space (starting with 0xc...). We need a full > # 64bit relocation for this to work, hence -mcmodel=large. > KBUILD_CFLAGS_MODULE += -mcmodel=large Well, a fast large code model would solve a lot of problems :). Unfortunately we just don't have enough people working on this stuff to do that. 
It's a somewhat tricky thing to do on RISC-V as there aren't any quick sequences for long addresses, but I don't think we're that much worse off than everyone else. At some point I had a bunch of designs written up, but they probably went along with my SiFive computer. I think we ended up deciding that the best bet would be to distribute constant tables throughout the text such that they're accessible via the 32-bit PC-relative loads at any point -- essentially the multi-GOT stuff that MIPS used for big objects. Doing that well is a lot of work and doing it poorly is just as slow as PIC, so we never got around to it. > We also insert trampolines for branches, but IIUC that's a separate > issue. "PowerPC branch trampolines" points me here https://sourceware.org/binutils/docs-2.20/ld/PowerPC-ELF32.html . That sounds like what we're doing already in the medium code models: we have short and medium control transfer sequences, and linker relaxation optimizes them when possible. Since we rely on linker relaxation pretty heavily we just don't bother with the smaller code model: it'd be a 12-bit address space for data and a 21-bit address space for text (with 13-bit maximum function size). Instead of building out such a small code model we just spent time improving the linker.
On Tue, Jul 21, 2020 at 9:06 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > On Tue, 21 Jul 2020 11:36:10 PDT (-0700), alex@ghiti.fr wrote: > > Let's try to make progress here: I add linux-mm in CC to get feedback on > > this patch as it blocks sv48 support too. > > Sorry for being slow here. I haven't replied because I hadn't really fleshed > out the design yet, but just so everyone's on the same page my problems with > this are: > > * We waste vmalloc space on 32-bit systems, where there isn't a lot of it. There is actually ongoing work to make 32-bit Arm kernels move vmlinux into the vmalloc space, as part of the move to avoid highmem. Overall, a 32-bit system would waste about 0.1% of its virtual address space by having the kernel be located in both the linear map and the vmalloc area. It's not zero, but not that bad either. With the typical split of 3072 MB user, 768MB linear and 256MB vmalloc, it's also around 1.5% of the available vmalloc area (assuming a 4MB vmlinux in a typical 32-bit kernel), but the boundaries can be changed arbitrarily if needed. The eventual goal is to have a split of 3840MB for either user or linear map plus 256MB for vmalloc, including the kernel. Switching between linear and user has a noticeable runtime overhead, but it relaxes both the limits for user memory and lowmem, and it provides somewhat stronger address space isolation. Another potential idea would be to completely randomize the physical addresses underneath the kernel by using a random permutation of the pages in the kernel image. This adds even more overhead (virt_to_phys may need to call vmalloc_to_page or similar) and may cause problems with DMA into kernel .data across page boundaries. > * Sort out how to maintain a linear map as the canonical hole moves around > between the VA widths without adding a bunch of overhead to the virt2phys and > friends.
This is probably going to be the trickiest part, but I think if we > just change the page table code to essentially lie about VAs when an sv39 > system runs an sv48+sv39 kernel we could make it work -- there'd be some > logical complexity involved, but it would remain fast. I assume you can't use the trick that x86 has where all kernel addresses are at the top of the 64-bit address space and user addresses are at the bottom, regardless of the size of the page tables? Arnd
On Wed, 22 Jul 2020 02:43:50 PDT (-0700), Arnd Bergmann wrote: > On Tue, Jul 21, 2020 at 9:06 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: >> >> On Tue, 21 Jul 2020 11:36:10 PDT (-0700), alex@ghiti.fr wrote: >> > Let's try to make progress here: I add linux-mm in CC to get feedback on >> > this patch as it blocks sv48 support too. >> >> Sorry for being slow here. I haven't replied because I hadn't really fleshed >> out the design yet, but just so everyone's on the same page my problems with >> this are: >> >> * We waste vmalloc space on 32-bit systems, where there isn't a lot of it. > > There is actually an ongoing work to make 32-bit Arm kernels move > vmlinux into the vmalloc space, as part of the move to avoid highmem. > > Overall, a 32-bit system would waste about 0.1% of its virtual address space > by having the kernel be located in both the linear map and the vmalloc area. > It's not zero, but not that bad either. With the typical split of 3072 MB user, > 768MB linear and 256MB vmalloc, it's also around 1.5% of the available > vmalloc area (assuming a 4MB vmlinux in a typical 32-bit kernel), but the > boundaries can be changed arbitrarily if needed. OK, I guess maybe it's not so bad. Our 32-bit defconfig is 10MiB, but I wouldn't really put much weight behind that number as it's just a 64-bit defconfig built for 32-bit. We don't have any 32-bit hardware anyway, so if this becomes an issue later I guess we can just deal with it then. > The eventual goal is to have a split of 3840MB for either user or linear map > plus and 256MB for vmalloc, including the kernel. Switching between linear > and user has a noticeable runtime overhead, but it relaxes both the limits > for user memory and lowmem, and it provides a somewhat stronger > address space isolation. Ya, I think we decided not to do that, at least for now. I guess the right answer there will depend on what 32-bit systems look like, and since we don't have any I'm inclined to just stick to the fast option. 
> Another potential idea would be to completely randomize the physical > addresses underneath the kernel by using a random permutation of the > pages in the kernel image. This adds even more overhead (virt_to_phys > may need to call vmalloc_to_page or similar) and may cause problems > with DMA into kernel .data across page boundaries, > >> * Sort out how to maintain a linear map as the canonical hole moves around >> between the VA widths without adding a bunch of overhead to the virt2phys and >> friends. This is probably going to be the trickiest part, but I think if we >> just change the page table code to essentially lie about VAs when an sv39 >> system runs an sv48+sv39 kernel we could make it work -- there'd be some >> logical complexity involved, but it would remain fast. > > I assume you can't use the trick that x86 has where all kernel addresses > are at the top of the 64-bit address space and user addresses are at the > bottom, regardless of the size of the page tables? They have the load in their mapping functions, as far as I can tell that's required to do this sort of thing. We do as well to handle some of the implicit boot stuff for now, but I was assuming that we'd want to get rid of that for performance reasons. That said, maybe it just doesn't matter?
On Wed, Jul 22, 2020 at 9:52 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > On Wed, 22 Jul 2020 02:43:50 PDT (-0700), Arnd Bergmann wrote: > > On Tue, Jul 21, 2020 at 9:06 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > The eventual goal is to have a split of 3840MB for either user or linear map > > plus and 256MB for vmalloc, including the kernel. Switching between linear > > and user has a noticeable runtime overhead, but it relaxes both the limits > > for user memory and lowmem, and it provides a somewhat stronger > > address space isolation. > > Ya, I think we decided not to do that, at least for now. I guess the right > answer there will depend on what 32-bit systems look like, and since we don't > have any I'm inclined to just stick to the fast option. Makes sense. Actually on 32-bit Arm we see fewer large-memory configurations in new machines than we had in the past before 64-bit machines were widely available at low cost, so I expect not to see a lot of new hardware with more than 1GB of DDR3 (two 256Mbit x16 chips) for cost reasons, and rv32 is likely going to be similar, so you may never really see a need for highmem or the above hack to increase the size of the linear mapping. I just noticed that rv32 allows 2GB of lowmem rather than just the usual 768MB or 1GB, at the expense of addressable user memory. This seems like an unusual choice, but I also don't see any reason to change this or make it more flexible unless actual users appear. Arnd
On Wed, Jul 22, 2020 at 1:23 PM Arnd Bergmann <arnd@arndb.de> wrote: > > On Wed, Jul 22, 2020 at 9:52 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > On Wed, 22 Jul 2020 02:43:50 PDT (-0700), Arnd Bergmann wrote: > > > On Tue, Jul 21, 2020 at 9:06 PM Palmer Dabbelt <palmer@dabbelt.com> wrote: > > > The eventual goal is to have a split of 3840MB for either user or linear map > > > plus and 256MB for vmalloc, including the kernel. Switching between linear > > > and user has a noticeable runtime overhead, but it relaxes both the limits > > > for user memory and lowmem, and it provides a somewhat stronger > > > address space isolation. > > > > Ya, I think we decided not to do that, at least for now. I guess the right > > answer there will depend on what 32-bit systems look like, and since we don't > > have any I'm inclined to just stick to the fast option. > > Makes sense. Actually on 32-bit Arm we see fewer large-memory > configurations in new machines than we had in the past before 64-bit > machines were widely available at low cost, so I expect not to see a > lot new hardware with more than 1GB of DDR3 (two 256Mbit x16 chips) > for cost reasons, and rv32 is likely going to be similar, so you may never > really see a need for highmem or the above hack to increase the > size of the linear mapping. > > I just noticed that rv32 allows 2GB of lowmem rather than just the usual > 768MB or 1GB, at the expense of addressable user memory. This seems > like an unusual choice, but I also don't see any reason to change this > or make it more flexible unless actual users appear. > I am a bit confused here. As per my understanding, RV32 supports 1GB of lowmem only as the page offset is set to 0xC0000000. The config option MAXPHYSMEM_2GB is misleading as RV32 actually allows 1GB of physical memory only. Any memory blocks beyond DRAM + 1GB are removed in setup_bootmem. IMHO, The current config should clarify that. 
Moreover, we should add 2G split under a separate configuration if we want to support that. > Arnd > > _______________________________________________ > linux-riscv mailing list > linux-riscv@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-riscv
Hi Benjamin, Le 7/21/20 à 7:11 PM, Benjamin Herrenschmidt a écrit : > On Tue, 2020-07-21 at 14:36 -0400, Alex Ghiti wrote: >>>> I guess I don't understand why this is necessary at all. >>>> Specifically: why >>>> can't we just relocate the kernel within the linear map? That would >>>> let the >>>> bootloader put the kernel wherever it wants, modulo the physical >>>> memory size we >>>> support. We'd need to handle the regions that are coupled to the >>>> kernel's >>>> execution address, but we could just put them in an explicit memory >>>> region >>>> which is what we should probably be doing anyway. >>> >>> Virtual relocation in the linear mapping requires to move the kernel >>> physically too. Zong implemented this physical move in its KASLR RFC >>> patchset, which is cumbersome since finding an available physical spot >>> is harder than just selecting a virtual range in the vmalloc range. >>> >>> In addition, having the kernel mapping in the linear mapping prevents >>> the use of hugepage for the linear mapping resulting in performance loss >>> (at least for the GB that encompasses the kernel). >>> >>> Why do you find this "ugly" ? The vmalloc region is just a bunch of >>> available virtual addresses to whatever purpose we want, and as noted by >>> Zong, arm64 uses the same scheme. > > I don't get it :-) > > At least on powerpc we move the kernel in the linear mapping and it > works fine with huge pages, what is your problem there ? You rely on > punching small-page size holes in there ? > ARCH_HAS_STRICT_KERNEL_RWX prevents the use of a hugepage for the kernel mapping in the direct mapping as it sets different permissions to different part of the kernel (data, text..etc). > At least in the old days, there were a number of assumptions that > the kernel text/data/bss resides in the linear mapping. > > If you change that you need to ensure that it's still physically > contiguous and you'll have to tweak __va and __pa, which might induce > extra overhead. 
> Yes that's done in this patch and indeed there is an overhead to those functions. > Cheers, > Ben. > > Thanks, Alex
Hi Palmer, Le 7/21/20 à 3:05 PM, Palmer Dabbelt a écrit : > On Tue, 21 Jul 2020 11:36:10 PDT (-0700), alex@ghiti.fr wrote: >> Let's try to make progress here: I add linux-mm in CC to get feedback on >> this patch as it blocks sv48 support too. > > Sorry for being slow here. I haven't replied because I hadn't really > fleshed No problem :) > out the design yet, but just so everyone's on the same page my problems > with > this are: > > * We waste vmalloc space on 32-bit systems, where there isn't a lot of it. > * On 64-bit systems the VA space around the kernel is precious because > it's the > only place we can place text (modules, BPF, whatever). If we start > putting > the kernel in the vmalloc space then we either have to pre-allocate a > bunch > of space around it (essentially making it a fixed mapping anyway) or it > becomes likely that we won't be able to find space for modules as they're > loaded into running systems. Let's note that we already have this issue for BPF and modules right now. But keeping the kernel at the end of the vmalloc region largely mitigates this problem: if we exhaust the vmalloc region in 64bit and then start allocating here, I think the whole system will have other problems. > * Relying on a relocatable kernel for sv48 support introduces a fairly > large > performance hit. I understand the performance penalty but I struggle to see it as "fairly large": can we benchmark this somehow ? > > Roughly, my proposal would be to: > > * Leave the 32-bit memory map alone. On 32-bit systems we can load modules > anywhere and we only have one VA width, so we're not really solving any > problems with these changes. Ok that's possible although a lot of ifdefs will get involved :) > * Staticly allocate a 2GiB portion of the VA space for all our text, as > its own > region. We'd link/relocate the kernel here instead of around > PAGE_OFFSET, > which would decouple the kernel from the physical memory layout of the > system.
> This would have the side effect of sorting out a bunch of bootloader > headaches > that we currently have. This amounts to doing the same as this patch but instead of using the vmalloc region, we'd use our own right ? I believe we'd then lose the vmalloc facilities to allocate modules around this zone. > * Sort out how to maintain a linear map as the canonical hole moves around > between the VA widths without adding a bunch of overhead to the > virt2phys and > friends. This is probably going to be the trickiest part, but I think > if we > just change the page table code to essentially lie about VAs when an sv39 > system runs an sv48+sv39 kernel we could make it work -- there'd be some > logical complexity involved, but it would remain fast. I have to think about that. > > This doesn't solve the problem of virtually relocatable kernels, but it > does > let us decouple that from the sv48 stuff. It also lets us stop relying > on a > fixed physical address the kernel is loaded into, which is another thing I > don't like. > Agreed on this one. > I know this may be a more complicated approach, but there aren't any sv48 > systems around right now so I just don't see the rush to support them, > particularly when there's a cost to what already exists (for those who > haven't > been watching, so far all the sv48 patch sets have imposed a significant > performance penalty on all systems). > Alex >> >> Alex >> >> Le 7/9/20 à 7:11 AM, Alex Ghiti a écrit : >>> Hi Palmer, >>> >>> Le 7/9/20 à 1:05 AM, Palmer Dabbelt a écrit : >>>> On Sun, 07 Jun 2020 00:59:46 PDT (-0700), alex@ghiti.fr wrote: >>>>> This is a preparatory patch for relocatable kernel. >>>>> >>>>> The kernel used to be linked at PAGE_OFFSET address and used to be >>>>> loaded >>>>> physically at the beginning of the main memory. Therefore, we could >>>>> use >>>>> the linear mapping for the kernel mapping. 
>>>>> >>>>> But the relocated kernel base address will be different from >>>>> PAGE_OFFSET >>>>> and since in the linear mapping, two different virtual addresses >>>>> cannot >>>>> point to the same physical address, the kernel mapping needs to lie >>>>> outside >>>>> the linear mapping. >>>> >>>> I know it's been a while, but I keep opening this up to review it and >>>> just >>>> can't get over how ugly it is to put the kernel's linear map in the >>>> vmalloc >>>> region. >>>> >>>> I guess I don't understand why this is necessary at all. >>>> Specifically: why >>>> can't we just relocate the kernel within the linear map? That would >>>> let the >>>> bootloader put the kernel wherever it wants, modulo the physical >>>> memory size we >>>> support. We'd need to handle the regions that are coupled to the >>>> kernel's >>>> execution address, but we could just put them in an explicit memory >>>> region >>>> which is what we should probably be doing anyway. >>> >>> Virtual relocation in the linear mapping requires to move the kernel >>> physically too. Zong implemented this physical move in its KASLR RFC >>> patchset, which is cumbersome since finding an available physical spot >>> is harder than just selecting a virtual range in the vmalloc range. >>> >>> In addition, having the kernel mapping in the linear mapping prevents >>> the use of hugepage for the linear mapping resulting in performance loss >>> (at least for the GB that encompasses the kernel). >>> >>> Why do you find this "ugly" ? The vmalloc region is just a bunch of >>> available virtual addresses to whatever purpose we want, and as noted by >>> Zong, arm64 uses the same scheme. >>> >>>> >>>>> In addition, because modules and BPF must be close to the kernel >>>>> (inside >>>>> +-2GB window), the kernel is placed at the end of the vmalloc zone >>>>> minus >>>>> 2GB, which leaves room for modules and BPF. 
The kernel could not be >>>>> placed at the beginning of the vmalloc zone since other vmalloc >>>>> allocations from the kernel could get all the +-2GB window around the >>>>> kernel which would prevent new modules and BPF programs to be loaded. >>>> >>>> Well, that's not enough to make sure this doesn't happen -- it's just >>>> enough to >>>> make sure it doesn't happen very quickily. That's the same boat we're >>>> already >>>> in, though, so it's not like it's worse. >>> >>> Indeed, that's not worse, I haven't found a way to reserve vmalloc area >>> without actually allocating it. >>> >>>> >>>>> Signed-off-by: Alexandre Ghiti <alex@ghiti.fr> >>>>> Reviewed-by: Zong Li <zong.li@sifive.com> >>>>> --- >>>>> arch/riscv/boot/loader.lds.S | 3 +- >>>>> arch/riscv/include/asm/page.h | 10 +++++- >>>>> arch/riscv/include/asm/pgtable.h | 38 ++++++++++++++------- >>>>> arch/riscv/kernel/head.S | 3 +- >>>>> arch/riscv/kernel/module.c | 4 +-- >>>>> arch/riscv/kernel/vmlinux.lds.S | 3 +- >>>>> arch/riscv/mm/init.c | 58 >>>>> +++++++++++++++++++++++++------- >>>>> arch/riscv/mm/physaddr.c | 2 +- >>>>> 8 files changed, 88 insertions(+), 33 deletions(-) >>>>> >>>>> diff --git a/arch/riscv/boot/loader.lds.S >>>>> b/arch/riscv/boot/loader.lds.S >>>>> index 47a5003c2e28..62d94696a19c 100644 >>>>> --- a/arch/riscv/boot/loader.lds.S >>>>> +++ b/arch/riscv/boot/loader.lds.S >>>>> @@ -1,13 +1,14 @@ >>>>> /* SPDX-License-Identifier: GPL-2.0 */ >>>>> >>>>> #include <asm/page.h> >>>>> +#include <asm/pgtable.h> >>>>> >>>>> OUTPUT_ARCH(riscv) >>>>> ENTRY(_start) >>>>> >>>>> SECTIONS >>>>> { >>>>> - . = PAGE_OFFSET; >>>>> + . 
= KERNEL_LINK_ADDR; >>>>> >>>>> .payload : { >>>>> *(.payload) >>>>> diff --git a/arch/riscv/include/asm/page.h >>>>> b/arch/riscv/include/asm/page.h >>>>> index 2d50f76efe48..48bb09b6a9b7 100644 >>>>> --- a/arch/riscv/include/asm/page.h >>>>> +++ b/arch/riscv/include/asm/page.h >>>>> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; >>>>> >>>>> #ifdef CONFIG_MMU >>>>> extern unsigned long va_pa_offset; >>>>> +extern unsigned long va_kernel_pa_offset; >>>>> extern unsigned long pfn_base; >>>>> #define ARCH_PFN_OFFSET (pfn_base) >>>>> #else >>>>> #define va_pa_offset 0 >>>>> +#define va_kernel_pa_offset 0 >>>>> #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) >>>>> #endif /* CONFIG_MMU */ >>>>> >>>>> extern unsigned long max_low_pfn; >>>>> extern unsigned long min_low_pfn; >>>>> +extern unsigned long kernel_virt_addr; >>>>> >>>>> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + >>>>> va_pa_offset)) >>>>> -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) >>>>> +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - >>>>> va_pa_offset) >>>>> +#define kernel_mapping_va_to_pa(x) \ >>>>> + ((unsigned long)(x) - va_kernel_pa_offset) >>>>> +#define __va_to_pa_nodebug(x) \ >>>>> + (((x) >= PAGE_OFFSET) ? 
\ >>>>> + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) >>>>> >>>>> #ifdef CONFIG_DEBUG_VIRTUAL >>>>> extern phys_addr_t __virt_to_phys(unsigned long x); >>>>> diff --git a/arch/riscv/include/asm/pgtable.h >>>>> b/arch/riscv/include/asm/pgtable.h >>>>> index 35b60035b6b0..94ef3b49dfb6 100644 >>>>> --- a/arch/riscv/include/asm/pgtable.h >>>>> +++ b/arch/riscv/include/asm/pgtable.h >>>>> @@ -11,23 +11,29 @@ >>>>> >>>>> #include <asm/pgtable-bits.h> >>>>> >>>>> -#ifndef __ASSEMBLY__ >>>>> - >>>>> -/* Page Upper Directory not used in RISC-V */ >>>>> -#include <asm-generic/pgtable-nopud.h> >>>>> -#include <asm/page.h> >>>>> -#include <asm/tlbflush.h> >>>>> -#include <linux/mm_types.h> >>>>> - >>>>> -#ifdef CONFIG_MMU >>>>> +#ifndef CONFIG_MMU >>>>> +#define KERNEL_VIRT_ADDR PAGE_OFFSET >>>>> +#define KERNEL_LINK_ADDR PAGE_OFFSET >>>>> +#else >>>>> +/* >>>>> + * Leave 2GB for modules and BPF that must lie within a 2GB range >>>>> around >>>>> + * the kernel. >>>>> + */ >>>>> +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) >>>>> +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR >>>> >>>> At a bare minimum this is going to make a mess of the 32-bit port, as >>>> non-relocatable kernels are now going to get linked at 1GiB which is >>>> where user >>>> code is supposed to live. That's an easy fix, though, as the 32-bit >>>> stuff >>>> doesn't need any module address restrictions. >>> >>> Indeed, I will take a look at that. 
>>> >>>> >>>>> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) >>>>> #define VMALLOC_END (PAGE_OFFSET - 1) >>>>> #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) >>>>> >>>>> #define BPF_JIT_REGION_SIZE (SZ_128M) >>>>> -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) >>>>> -#define BPF_JIT_REGION_END (VMALLOC_END) >>>>> +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) >>>>> +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + >>>>> BPF_JIT_REGION_SIZE) >>>>> + >>>>> +#ifdef CONFIG_64BIT >>>>> +#define VMALLOC_MODULE_START BPF_JIT_REGION_END >>>>> +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) >>>>> + SZ_2G) >>>>> +#endif >>>>> >>>>> /* >>>>> * Roughly size the vmemmap space to be large enough to fit enough >>>>> @@ -57,9 +63,16 @@ >>>>> #define FIXADDR_SIZE PGDIR_SIZE >>>>> #endif >>>>> #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) >>>>> - >>>>> #endif >>>>> >>>>> +#ifndef __ASSEMBLY__ >>>>> + >>>>> +/* Page Upper Directory not used in RISC-V */ >>>>> +#include <asm-generic/pgtable-nopud.h> >>>>> +#include <asm/page.h> >>>>> +#include <asm/tlbflush.h> >>>>> +#include <linux/mm_types.h> >>>>> + >>>>> #ifdef CONFIG_64BIT >>>>> #include <asm/pgtable-64.h> >>>>> #else >>>>> @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page >>>>> *page, int numpages, int enabl >>>>> >>>>> #define kern_addr_valid(addr) (1) /* FIXME */ >>>>> >>>>> +extern char _start[]; >>>>> extern void *dtb_early_va; >>>>> void setup_bootmem(void); >>>>> void paging_init(void); >>>>> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S >>>>> index 98a406474e7d..8f5bb7731327 100644 >>>>> --- a/arch/riscv/kernel/head.S >>>>> +++ b/arch/riscv/kernel/head.S >>>>> @@ -49,7 +49,8 @@ ENTRY(_start) >>>>> #ifdef CONFIG_MMU >>>>> relocate: >>>>> /* Relocate return address */ >>>>> - li a1, PAGE_OFFSET >>>>> + la a1, kernel_virt_addr >>>>> + REG_L a1, 0(a1) >>>>> la a2, _start >>>>> sub a1, a1, a2 >>>>> add ra, ra, a1 >>>>> diff --git 
a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c >>>>> index 8bbe5dbe1341..1a8fbe05accf 100644 >>>>> --- a/arch/riscv/kernel/module.c >>>>> +++ b/arch/riscv/kernel/module.c >>>>> @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const >>>>> char *strtab, >>>>> } >>>>> >>>>> #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) >>>>> -#define VMALLOC_MODULE_START \ >>>>> - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) >>>>> void *module_alloc(unsigned long size) >>>>> { >>>>> return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, >>>>> - VMALLOC_END, GFP_KERNEL, >>>>> + VMALLOC_MODULE_END, GFP_KERNEL, >>>>> PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, >>>>> __builtin_return_address(0)); >>>>> } >>>>> diff --git a/arch/riscv/kernel/vmlinux.lds.S >>>>> b/arch/riscv/kernel/vmlinux.lds.S >>>>> index 0339b6bbe11a..a9abde62909f 100644 >>>>> --- a/arch/riscv/kernel/vmlinux.lds.S >>>>> +++ b/arch/riscv/kernel/vmlinux.lds.S >>>>> @@ -4,7 +4,8 @@ >>>>> * Copyright (C) 2017 SiFive >>>>> */ >>>>> >>>>> -#define LOAD_OFFSET PAGE_OFFSET >>>>> +#include <asm/pgtable.h> >>>>> +#define LOAD_OFFSET KERNEL_LINK_ADDR >>>>> #include <asm/vmlinux.lds.h> >>>>> #include <asm/page.h> >>>>> #include <asm/cache.h> >>>>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c >>>>> index 736de6c8739f..71da78914645 100644 >>>>> --- a/arch/riscv/mm/init.c >>>>> +++ b/arch/riscv/mm/init.c >>>>> @@ -22,6 +22,9 @@ >>>>> >>>>> #include "../kernel/head.h" >>>>> >>>>> +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR; >>>>> +EXPORT_SYMBOL(kernel_virt_addr); >>>>> + >>>>> unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] >>>>> __page_aligned_bss; >>>>> EXPORT_SYMBOL(empty_zero_page); >>>>> @@ -178,8 +181,12 @@ void __init setup_bootmem(void) >>>>> } >>>>> >>>>> #ifdef CONFIG_MMU >>>>> +/* Offset between linear mapping virtual address and kernel load >>>>> address */ >>>>> unsigned long va_pa_offset; >>>>> EXPORT_SYMBOL(va_pa_offset); >>>>> +/* Offset between 
kernel mapping virtual address and kernel load >>>>> address */ >>>>> +unsigned long va_kernel_pa_offset; >>>>> +EXPORT_SYMBOL(va_kernel_pa_offset); >>>>> unsigned long pfn_base; >>>>> EXPORT_SYMBOL(pfn_base); >>>>> >>>>> @@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va) >>>>> if (mmu_enabled) >>>>> return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); >>>>> >>>>> - pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; >>>>> + pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT; >>>>> BUG_ON(pmd_num >= NUM_EARLY_PMDS); >>>>> return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; >>>>> } >>>>> @@ -372,14 +379,30 @@ static uintptr_t __init >>>>> best_map_size(phys_addr_t base, phys_addr_t size) >>>>> #error "setup_vm() is called from head.S before relocate so it >>>>> should not use absolute addressing." >>>>> #endif >>>>> >>>>> +static uintptr_t load_pa, load_sz; >>>>> + >>>>> +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t >>>>> map_size) >>>>> +{ >>>>> + uintptr_t va, end_va; >>>>> + >>>>> + end_va = kernel_virt_addr + load_sz; >>>>> + for (va = kernel_virt_addr; va < end_va; va += map_size) >>>>> + create_pgd_mapping(pgdir, va, >>>>> + load_pa + (va - kernel_virt_addr), >>>>> + map_size, PAGE_KERNEL_EXEC); >>>>> +} >>>>> + >>>>> asmlinkage void __init setup_vm(uintptr_t dtb_pa) >>>>> { >>>>> uintptr_t va, end_va; >>>>> - uintptr_t load_pa = (uintptr_t)(&_start); >>>>> - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; >>>>> uintptr_t map_size = best_map_size(load_pa, >>>>> MAX_EARLY_MAPPING_SIZE); >>>>> >>>>> + load_pa = (uintptr_t)(&_start); >>>>> + load_sz = (uintptr_t)(&_end) - load_pa; >>>>> + >>>>> va_pa_offset = PAGE_OFFSET - load_pa; >>>>> + va_kernel_pa_offset = kernel_virt_addr - load_pa; >>>>> + >>>>> pfn_base = PFN_DOWN(load_pa); >>>>> >>>>> /* >>>>> @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t >>>>> dtb_pa) >>>>> create_pmd_mapping(fixmap_pmd, FIXADDR_START, >>>>> (uintptr_t)fixmap_pte, PMD_SIZE, 
PAGE_TABLE); >>>>> /* Setup trampoline PGD and PMD */ >>>>> - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, >>>>> + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, >>>>> (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); >>>>> - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, >>>>> + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, >>>>> load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); >>>>> #else >>>>> /* Setup trampoline PGD */ >>>>> - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, >>>>> + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, >>>>> load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); >>>>> #endif >>>>> >>>>> /* >>>>> - * Setup early PGD covering entire kernel which will allows >>>>> + * Setup early PGD covering entire kernel which will allow >>>>> * us to reach paging_init(). We map all memory banks later >>>>> * in setup_vm_final() below. >>>>> */ >>>>> - end_va = PAGE_OFFSET + load_sz; >>>>> - for (va = PAGE_OFFSET; va < end_va; va += map_size) >>>>> - create_pgd_mapping(early_pg_dir, va, >>>>> - load_pa + (va - PAGE_OFFSET), >>>>> - map_size, PAGE_KERNEL_EXEC); >>>>> + create_kernel_page_table(early_pg_dir, map_size); >>>>> >>>>> /* Create fixed mapping for early FDT parsing */ >>>>> end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; >>>>> @@ -441,6 +460,7 @@ static void __init setup_vm_final(void) >>>>> uintptr_t va, map_size; >>>>> phys_addr_t pa, start, end; >>>>> struct memblock_region *reg; >>>>> + static struct vm_struct vm_kernel = { 0 }; >>>>> >>>>> /* Set mmu_enabled flag */ >>>>> mmu_enabled = true; >>>>> @@ -467,10 +487,22 @@ static void __init setup_vm_final(void) >>>>> for (pa = start; pa < end; pa += map_size) { >>>>> va = (uintptr_t)__va(pa); >>>>> create_pgd_mapping(swapper_pg_dir, va, pa, >>>>> - map_size, PAGE_KERNEL_EXEC); >>>>> + map_size, PAGE_KERNEL); >>>>> } >>>>> } >>>>> >>>>> + /* Map the kernel */ >>>>> + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); >>>>> + >>>>> + /* Reserve the vmalloc area occupied by the 
kernel */ >>>>> + vm_kernel.addr = (void *)kernel_virt_addr; >>>>> + vm_kernel.phys_addr = load_pa; >>>>> + vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1); >>>>> + vm_kernel.flags = VM_MAP | VM_NO_GUARD; >>>>> + vm_kernel.caller = __builtin_return_address(0); >>>>> + >>>>> + vm_area_add_early(&vm_kernel); >>>>> + >>>>> /* Clear fixmap PTE and PMD mappings */ >>>>> clear_fixmap(FIX_PTE); >>>>> clear_fixmap(FIX_PMD); >>>>> diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c >>>>> index e8e4dcd39fed..35703d5ef5fd 100644 >>>>> --- a/arch/riscv/mm/physaddr.c >>>>> +++ b/arch/riscv/mm/physaddr.c >>>>> @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); >>>>> >>>>> phys_addr_t __phys_addr_symbol(unsigned long x) >>>>> { >>>>> - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; >>>>> + unsigned long kernel_start = (unsigned long)kernel_virt_addr; >>>>> unsigned long kernel_end = (unsigned long)_end; >>>>> >>>>> /* >>> >>> Alex
Le 7/21/20 à 7:36 PM, Palmer Dabbelt a écrit : > On Tue, 21 Jul 2020 16:11:02 PDT (-0700), benh@kernel.crashing.org wrote: >> On Tue, 2020-07-21 at 14:36 -0400, Alex Ghiti wrote: >>> > > I guess I don't understand why this is necessary at all. >>> > > Specifically: why >>> > > can't we just relocate the kernel within the linear map? That would >>> > > let the >>> > > bootloader put the kernel wherever it wants, modulo the physical >>> > > memory size we >>> > > support. We'd need to handle the regions that are coupled to the >>> > > kernel's >>> > > execution address, but we could just put them in an explicit memory >>> > > region >>> > > which is what we should probably be doing anyway. >>> > >>> > Virtual relocation in the linear mapping requires to move the kernel >>> > physically too. Zong implemented this physical move in its KASLR RFC >>> > patchset, which is cumbersome since finding an available physical spot >>> > is harder than just selecting a virtual range in the vmalloc range. >>> > >>> > In addition, having the kernel mapping in the linear mapping prevents >>> > the use of hugepage for the linear mapping resulting in performance >>> loss >>> > (at least for the GB that encompasses the kernel). >>> > >>> > Why do you find this "ugly" ? The vmalloc region is just a bunch of >>> > available virtual addresses to whatever purpose we want, and as >>> noted by >>> > Zong, arm64 uses the same scheme. >> >> I don't get it :-) >> >> At least on powerpc we move the kernel in the linear mapping and it >> works fine with huge pages, what is your problem there ? You rely on >> punching small-page size holes in there ? > > That was my original suggestion, and I'm not actually sure it's > invalid. It > would mean that both the kernel's physical and virtual addresses are set > by the > bootloader, which may or may not be workable if we want to have an > sv48+sv39 > kernel. 
My initial approach to sv48+sv39 kernels would be to just throw > away > the sv39 memory on sv48 kernels, which would preserve the linear map but > mean > that there is no single physical address that's accessible for both. That > would require some coordination between the bootloader and the kernel as to > where it should be loaded, but maybe there's a better way to design the > linear > map. Right now we have a bunch of unwritten rules about where things > need to > be loaded, which is a recipe for disaster. > > We could copy the kernel around, but I'm not sure I really like that > idea. We > do zero the BSS right now, so it's not like we entirely rely on the > bootloader > to set up the kernel image, but with the hart race boot scheme we have > right > now we'd at least need to leave a stub sitting around. Maybe we just throw > away SBI v0.1, though, that's why we called it all legacy in the first > place. > > My bigger worry is that anything that involves running the kernel at > arbitrary > virtual addresses means we need a PIC kernel, which means every global > symbol > needs an indirection. That's probably not so bad for shared libraries, > but the > kernel has a lot of global symbols. PLT references probably aren't so > scary, > as we have an incoherent instruction cache so the virtual function > predictor > isn't that hard to build, but making all global data accesses GOT-relative > seems like a disaster for performance. This fixed-VA thing really just > exists > so we don't have to be full-on PIC. > > In theory I think we could just get away with pretending that medany is > PIC, > which I believe works as long as the data and text offset stays > constant, > you don't have any symbols between 2GiB and -2GiB (as those may stay fixed, > even in medany), and you deal with GP accordingly (which should work > itself out > in the current startup code). 
We rely on this for some of the early > boot code > (and will soon for kexec), but that's a very controlled code base and we've > already had some issues. I'd be much more comfortable adding an explicit > semi-PIC code model, as I tend to miss something when doing these sorts of > things and then we could at least add it to the GCC test runs and > guarantee it > actually works. Not really sure I want to deal with that, though. It > would, > however, be the only way to get random virtual addresses during kernel > execution. > >> At least in the old days, there were a number of assumptions that >> the kernel text/data/bss resides in the linear mapping. > > Ya, it terrified me as well. Alex says arm64 puts the kernel in the > vmalloc > region, so assuming that's the case it must be possible. I didn't get that > from reading the arm64 port (I guess it's no secret that pretty much all > I do > is copy their code) See https://elixir.bootlin.com/linux/latest/source/arch/arm64/mm/mmu.c#L615. > >> If you change that you need to ensure that it's still physically >> contiguous and you'll have to tweak __va and __pa, which might induce >> extra overhead. > > I'm operating under the assumption that we don't want to add an > additional load > to virt2phys conversions. arm64 bends over backwards to avoid the load, > and > I'm assuming they have a reason for doing so. Of course, if we're PIC then > maybe performance just doesn't matter, but I'm not sure I want to just > give up. > Distros will probably build the sv48+sv39 kernels as soon as they show > up, even > if there's no sv48 hardware for a while.
On Thu, 2020-07-23 at 01:21 -0400, Alex Ghiti wrote: > > works fine with huge pages, what is your problem there ? You rely on > > punching small-page size holes in there ? > > > > ARCH_HAS_STRICT_KERNEL_RWX prevents the use of a hugepage for the kernel > mapping in the direct mapping as it sets different permissions to > different parts of the kernel (data, text, etc.). Ah ok, that can be solved in a couple of ways... One is to use the linker script to ensure those sections are linked HUGE_PAGE_SIZE apart and moved appropriately by early boot code. One is to selectively degrade just those huge pages. I'm not familiar with the RISC-V MMU (I should probably go have a look) but if it's a classic radix tree with huge pages at PUD/PMD level, then you could just degrade the one(s) that cross those boundaries. Cheers, Ben.
On Wed, Jul 22, 2020 at 11:06 PM Atish Patra <atishp@atishpatra.org> wrote: > > On Wed, Jul 22, 2020 at 1:23 PM Arnd Bergmann <arnd@arndb.de> wrote: > > > > I just noticed that rv32 allows 2GB of lowmem rather than just the usual > > 768MB or 1GB, at the expense of addressable user memory. This seems > > like an unusual choice, but I also don't see any reason to change this > > or make it more flexible unless actual users appear. > > > > I am a bit confused here. As per my understanding, RV32 supports 1GB > of lowmem only > as the page offset is set to 0xC0000000. The config option > MAXPHYSMEM_2GB is misleading > as RV32 actually allows 1GB of physical memory only. Ok, in that case I was apparently misled by the Kconfig option name. I just tried building a kernel to see what the boundaries actually are, as this is not the only confusing bit. Here is what I see: 0x9dc00000 TASK_SIZE/FIXADDR_START /* code comment says 0x9fc00000 */ 0x9e000000 FIXADDR_TOP/PCI_IO_START 0x9f000000 PCI_IO_END/VMEMMAP_START 0xa0000000 VMEMMAP_END/VMALLOC_START 0xc0000000 VMALLOC_END/PAGE_OFFSET Having exactly 1GB of linear map does make a lot of sense. Having PCI I/O, vmemmap and fixmap come out of the user range means you get slightly different behavior in user space if there are any changes to that set, but that is probably fine as well, if you want the flexibility to go to a 2GB linear map and expect user space to deal with that as well. There is one common trick from arm32 however that you might want to consider: if vmalloc was moved above the linear map rather than below, the size of the vmalloc area can dynamically depend on the amount of RAM that is actually present rather than be set to a fixed value. On arm32, there is around 240MB of vmalloc space if the linear map is fully populated with RAM, but it can grow to use all of the available address space if less RAM was detected at boot time (up to 3GB depending on CONFIG_VMSPLIT). 
> Any memory blocks beyond > DRAM + 1GB are removed in setup_bootmem. IMHO, The current config > should clarify that. > > Moreover, we should add 2G split under a separate configuration if we > want to support that. Right. It's probably not needed immediately, but can't hurt either. Arnd
On Fri, Jul 24, 2020 at 12:34 AM Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote: > On Thu, 2020-07-23 at 01:21 -0400, Alex Ghiti wrote: > > > works fine with huge pages, what is your problem there ? You rely on > > > punching small-page size holes in there ? > > > > > > > ARCH_HAS_STRICT_KERNEL_RWX prevents the use of a hugepage for the kernel > > mapping in the direct mapping as it sets different permissions to > > different parts of the kernel (data, text, etc.). > > Ah ok, that can be solved in a couple of ways... > > One is to use the linker script to ensure those sections are linked > HUGE_PAGE_SIZE apart and moved appropriately by early boot code. One > is to selectively degrade just those huge pages. > > I'm not familiar with the RISC-V MMU (I should probably go have a look) > but if it's a classic radix tree with huge pages at PUD/PMD level, then > you could just degrade the one(s) that cross those boundaries. That would work, but if the system can otherwise use 1GB-sized pages, that might mean degrading the first gigabyte into a mix of 2MB pages and 4KB pages. If the kernel is in vmalloc space and vmap is able to use 2MB pages for contiguous chunks of the mapping, you get somewhat better TLB usage. However, this also means that a writable mapping exists in the linear mapping for any executable part of the kernel (.text in both vmlinux and modules). Do we have that on other architectures as well, or is this something that ought to be prevented with STRICT_KERNEL_RWX/STRICT_MODULE_RWX? Arnd
diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <asm/page.h> +#include <asm/pgtable.h> OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . = KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 2d50f76efe48..48bb09b6a9b7 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +extern unsigned long va_kernel_pa_offset; extern unsigned long pfn_base; #define ARCH_PFN_OFFSET (pfn_base) #else #define va_pa_offset 0 +#define va_kernel_pa_offset 0 #define ARCH_PFN_OFFSET (PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; +extern unsigned long kernel_virt_addr; #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#define kernel_mapping_va_to_pa(x) \ + ((unsigned long)(x) - va_kernel_pa_offset) +#define __va_to_pa_nodebug(x) \ + (((x) >= PAGE_OFFSET) ? 
\ + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..94ef3b49dfb6 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -11,23 +11,29 @@ #include <asm/pgtable-bits.h> -#ifndef __ASSEMBLY__ - -/* Page Upper Directory not used in RISC-V */ -#include <asm-generic/pgtable-nopud.h> -#include <asm/page.h> -#include <asm/tlbflush.h> -#include <linux/mm_types.h> - -#ifdef CONFIG_MMU +#ifndef CONFIG_MMU +#define KERNEL_VIRT_ADDR PAGE_OFFSET +#define KERNEL_LINK_ADDR PAGE_OFFSET +#else +/* + * Leave 2GB for modules and BPF that must lie within a 2GB range around + * the kernel. + */ +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE (SZ_128M) -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) -#define BPF_JIT_REGION_END (VMALLOC_END) +#define BPF_JIT_REGION_START PFN_ALIGN((unsigned long)&_end) +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) + +#ifdef CONFIG_64BIT +#define VMALLOC_MODULE_START BPF_JIT_REGION_END +#define VMALLOC_MODULE_END (((unsigned long)&_start & PAGE_MASK) + SZ_2G) +#endif /* * Roughly size the vmemmap space to be large enough to fit enough @@ -57,9 +63,16 @@ #define FIXADDR_SIZE PGDIR_SIZE #endif #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) - #endif +#ifndef __ASSEMBLY__ + +/* Page Upper Directory not used in RISC-V */ +#include <asm-generic/pgtable-nopud.h> +#include <asm/page.h> +#include <asm/tlbflush.h> +#include <linux/mm_types.h> + #ifdef CONFIG_64BIT #include <asm/pgtable-64.h> #else @@ -483,6 +496,7 @@ static inline void __kernel_map_pages(struct page *page, int numpages, int enabl 
#define kern_addr_valid(addr) (1) /* FIXME */ +extern char _start[]; extern void *dtb_early_va; void setup_bootmem(void); void paging_init(void); diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 98a406474e7d..8f5bb7731327 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -49,7 +49,8 @@ ENTRY(_start) #ifdef CONFIG_MMU relocate: /* Relocate return address */ - li a1, PAGE_OFFSET + la a1, kernel_virt_addr + REG_L a1, 0(a1) la a2, _start sub a1, a1, a2 add ra, ra, a1 diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c index 8bbe5dbe1341..1a8fbe05accf 100644 --- a/arch/riscv/kernel/module.c +++ b/arch/riscv/kernel/module.c @@ -392,12 +392,10 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, } #if defined(CONFIG_MMU) && defined(CONFIG_64BIT) -#define VMALLOC_MODULE_START \ - max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START) void *module_alloc(unsigned long size) { return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START, - VMALLOC_END, GFP_KERNEL, + VMALLOC_MODULE_END, GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S index 0339b6bbe11a..a9abde62909f 100644 --- a/arch/riscv/kernel/vmlinux.lds.S +++ b/arch/riscv/kernel/vmlinux.lds.S @@ -4,7 +4,8 @@ * Copyright (C) 2017 SiFive */ -#define LOAD_OFFSET PAGE_OFFSET +#include <asm/pgtable.h> +#define LOAD_OFFSET KERNEL_LINK_ADDR #include <asm/vmlinux.lds.h> #include <asm/page.h> #include <asm/cache.h> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 736de6c8739f..71da78914645 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -22,6 +22,9 @@ #include "../kernel/head.h" +unsigned long kernel_virt_addr = KERNEL_VIRT_ADDR; +EXPORT_SYMBOL(kernel_virt_addr); + unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -178,8 +181,12 @@ void __init 
setup_bootmem(void) } #ifdef CONFIG_MMU +/* Offset between linear mapping virtual address and kernel load address */ unsigned long va_pa_offset; EXPORT_SYMBOL(va_pa_offset); +/* Offset between kernel mapping virtual address and kernel load address */ +unsigned long va_kernel_pa_offset; +EXPORT_SYMBOL(va_kernel_pa_offset); unsigned long pfn_base; EXPORT_SYMBOL(pfn_base); @@ -271,7 +278,7 @@ static phys_addr_t __init alloc_pmd(uintptr_t va) if (mmu_enabled) return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); - pmd_num = (va - PAGE_OFFSET) >> PGDIR_SHIFT; + pmd_num = (va - kernel_virt_addr) >> PGDIR_SHIFT; BUG_ON(pmd_num >= NUM_EARLY_PMDS); return (uintptr_t)&early_pmd[pmd_num * PTRS_PER_PMD]; } @@ -372,14 +379,30 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) #error "setup_vm() is called from head.S before relocate so it should not use absolute addressing." #endif +static uintptr_t load_pa, load_sz; + +static void __init create_kernel_page_table(pgd_t *pgdir, uintptr_t map_size) +{ + uintptr_t va, end_va; + + end_va = kernel_virt_addr + load_sz; + for (va = kernel_virt_addr; va < end_va; va += map_size) + create_pgd_mapping(pgdir, va, + load_pa + (va - kernel_virt_addr), + map_size, PAGE_KERNEL_EXEC); +} + asmlinkage void __init setup_vm(uintptr_t dtb_pa) { uintptr_t va, end_va; - uintptr_t load_pa = (uintptr_t)(&_start); - uintptr_t load_sz = (uintptr_t)(&_end) - load_pa; uintptr_t map_size = best_map_size(load_pa, MAX_EARLY_MAPPING_SIZE); + load_pa = (uintptr_t)(&_start); + load_sz = (uintptr_t)(&_end) - load_pa; + va_pa_offset = PAGE_OFFSET - load_pa; + va_kernel_pa_offset = kernel_virt_addr - load_pa; + pfn_base = PFN_DOWN(load_pa); /* @@ -402,26 +425,22 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) create_pmd_mapping(fixmap_pmd, FIXADDR_START, (uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE); /* Setup trampoline PGD and PMD */ - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, + create_pgd_mapping(trampoline_pg_dir, 
kernel_virt_addr, (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE); - create_pmd_mapping(trampoline_pmd, PAGE_OFFSET, + create_pmd_mapping(trampoline_pmd, kernel_virt_addr, load_pa, PMD_SIZE, PAGE_KERNEL_EXEC); #else /* Setup trampoline PGD */ - create_pgd_mapping(trampoline_pg_dir, PAGE_OFFSET, + create_pgd_mapping(trampoline_pg_dir, kernel_virt_addr, load_pa, PGDIR_SIZE, PAGE_KERNEL_EXEC); #endif /* - * Setup early PGD covering entire kernel which will allows + * Setup early PGD covering entire kernel which will allow * us to reach paging_init(). We map all memory banks later * in setup_vm_final() below. */ - end_va = PAGE_OFFSET + load_sz; - for (va = PAGE_OFFSET; va < end_va; va += map_size) - create_pgd_mapping(early_pg_dir, va, - load_pa + (va - PAGE_OFFSET), - map_size, PAGE_KERNEL_EXEC); + create_kernel_page_table(early_pg_dir, map_size); /* Create fixed mapping for early FDT parsing */ end_va = __fix_to_virt(FIX_FDT) + FIX_FDT_SIZE; @@ -441,6 +460,7 @@ static void __init setup_vm_final(void) uintptr_t va, map_size; phys_addr_t pa, start, end; struct memblock_region *reg; + static struct vm_struct vm_kernel = { 0 }; /* Set mmu_enabled flag */ mmu_enabled = true; @@ -467,10 +487,22 @@ static void __init setup_vm_final(void) for (pa = start; pa < end; pa += map_size) { va = (uintptr_t)__va(pa); create_pgd_mapping(swapper_pg_dir, va, pa, - map_size, PAGE_KERNEL_EXEC); + map_size, PAGE_KERNEL); } } + /* Map the kernel */ + create_kernel_page_table(swapper_pg_dir, PMD_SIZE); + + /* Reserve the vmalloc area occupied by the kernel */ + vm_kernel.addr = (void *)kernel_virt_addr; + vm_kernel.phys_addr = load_pa; + vm_kernel.size = (load_sz + PMD_SIZE - 1) & ~(PMD_SIZE - 1); + vm_kernel.flags = VM_MAP | VM_NO_GUARD; + vm_kernel.caller = __builtin_return_address(0); + + vm_area_add_early(&vm_kernel); + /* Clear fixmap PTE and PMD mappings */ clear_fixmap(FIX_PTE); clear_fixmap(FIX_PMD); diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c index 
e8e4dcd39fed..35703d5ef5fd 100644 --- a/arch/riscv/mm/physaddr.c +++ b/arch/riscv/mm/physaddr.c @@ -23,7 +23,7 @@ EXPORT_SYMBOL(__virt_to_phys); phys_addr_t __phys_addr_symbol(unsigned long x) { - unsigned long kernel_start = (unsigned long)PAGE_OFFSET; + unsigned long kernel_start = (unsigned long)kernel_virt_addr; unsigned long kernel_end = (unsigned long)_end; /*