Message ID | 20230125-b4-powerpc-rtas-queue-v2-11-9aa6bd058063@linux.ibm.com
---|---
State | Superseded
Series | RTAS maintenance
Nathan Lynch via B4 Submission Endpoint
<devnull+nathanl.linux.ibm.com@kernel.org> writes:
> diff --git a/arch/powerpc/include/asm/rtas-work-area.h b/arch/powerpc/include/asm/rtas-work-area.h
> new file mode 100644
> index 000000000000..76ccb039cc37
> --- /dev/null
> +++ b/arch/powerpc/include/asm/rtas-work-area.h
> @@ -0,0 +1,45 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef POWERPC_RTAS_WORK_AREA_H
> +#define POWERPC_RTAS_WORK_AREA_H

The usual style would be _ASM_POWERPC_RTAS_WORK_AREA_H.

> diff --git a/arch/powerpc/kernel/rtas-work-area.c b/arch/powerpc/kernel/rtas-work-area.c
> new file mode 100644
> index 000000000000..75950e13a0fe
> --- /dev/null
> +++ b/arch/powerpc/kernel/rtas-work-area.c
> @@ -0,0 +1,208 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +#define pr_fmt(fmt) "rtas-work-area: " fmt
> +
> +#include <linux/genalloc.h>
> +#include <linux/log2.h>
> +#include <linux/kernel.h>
> +#include <linux/memblock.h>
> +#include <linux/mempool.h>
> +#include <linux/minmax.h>
> +#include <linux/mutex.h>
> +#include <linux/numa.h>
> +#include <linux/sizes.h>
> +#include <linux/wait.h>
> +
> +#include <asm/machdep.h>
> +#include <asm/rtas-work-area.h>
> +
> +enum {
> +	/*
> +	 * Ensure the pool is page-aligned.
> +	 */
> +	RTAS_WORK_AREA_ARENA_ALIGN = PAGE_SIZE,
> +
> +	RTAS_WORK_AREA_ARENA_SZ = SZ_256K,
> +	/*
> +	 * The smallest known work area size is for ibm,get-vpd's
> +	 * location code argument, which is limited to 79 characters
> +	 * plus 1 nul terminator.
> +	 *
> +	 * PAPR+ 7.3.20 ibm,get-vpd RTAS Call
> +	 * PAPR+ 12.3.2.4 Converged Location Code Rules - Length Restrictions
> +	 */
> +	RTAS_WORK_AREA_MIN_ALLOC_SZ = roundup_pow_of_two(80),
> +	/*
> +	 * Don't let a single allocation claim the whole arena.
> +	 */
> +	RTAS_WORK_AREA_MAX_ALLOC_SZ = RTAS_WORK_AREA_ARENA_SZ / 2,
> +};
> +
> +static struct rtas_work_area_allocator_state {
> +	struct gen_pool *gen_pool;
> +	char *arena;
> +	struct mutex mutex; /* serializes allocations */
> +	struct wait_queue_head wqh;
> +	mempool_t descriptor_pool;
> +	bool available;
> +} rwa_state_ = {
> +	.mutex = __MUTEX_INITIALIZER(rwa_state_.mutex),
> +	.wqh = __WAIT_QUEUE_HEAD_INITIALIZER(rwa_state_.wqh),
> +};
> +static struct rtas_work_area_allocator_state *rwa_state = &rwa_state_;

I assumed the pointer was so you could swap this out at runtime or
something, but I don't think you do.

Any reason not to drop the pointer and just use rwa_state.foo accessors?
That would also allow the struct to be anonymous.

Or if you have the pointer you can at least make it NULL prior to init
and avoid the need for "available".

> +/*
> + * A single work area buffer and descriptor to serve requests early in
> + * boot before the allocator is fully initialized.
> + */
> +static bool early_work_area_in_use __initdata;
> +static char early_work_area_buf[SZ_4K] __initdata;

That should be page aligned I think?

> +static struct rtas_work_area early_work_area __initdata = {
> +	.buf = early_work_area_buf,
> +	.size = sizeof(early_work_area_buf),
> +};
> +
> +
> +static struct rtas_work_area * __init rtas_work_area_alloc_early(size_t size)
> +{
> +	WARN_ON(size > early_work_area.size);
> +	WARN_ON(early_work_area_in_use);
> +	early_work_area_in_use = true;
> +	memset(early_work_area.buf, 0, early_work_area.size);
> +	return &early_work_area;
> +}
> +
> +static void __init rtas_work_area_free_early(struct rtas_work_area *work_area)
> +{
> +	WARN_ON(work_area != &early_work_area);
> +	WARN_ON(!early_work_area_in_use);
> +	early_work_area_in_use = false;
> +}
> +
> +struct rtas_work_area * __ref rtas_work_area_alloc(size_t size)
> +{
> +	struct rtas_work_area *area;
> +	unsigned long addr;
> +
> +	might_sleep();
> +
> +	WARN_ON(size > RTAS_WORK_AREA_MAX_ALLOC_SZ);
> +	size = min_t(size_t, size, RTAS_WORK_AREA_MAX_ALLOC_SZ);

This seems unsafe.

If you return a buffer smaller than the caller asks for they're likely
to read/write past the end of it and corrupt memory.

AFAIK genalloc doesn't have guard pages or anything fancy to save us
from that - but maybe I'm wrong, I've never used it.

There's only three callers in the end, seems like we should just return
NULL if the size is too large and have callers check the return value.

> +	if (!rwa_state->available) {
> +		area = rtas_work_area_alloc_early(size);
> +		goto out;
> +	}
> +
> +	/*
> +	 * To ensure FCFS behavior and prevent a high rate of smaller
> +	 * requests from starving larger ones, use the mutex to queue
> +	 * allocations.
> +	 */
> +	mutex_lock(&rwa_state->mutex);
> +	wait_event(rwa_state->wqh,
> +		   (addr = gen_pool_alloc(rwa_state->gen_pool, size)) != 0);
> +	mutex_unlock(&rwa_state->mutex);
> +
> +	area = mempool_alloc(&rwa_state->descriptor_pool, GFP_KERNEL);
> +	*area = (typeof(*area)){
> +		.size = size,
> +		.buf = (char *)addr,
> +	};

That is an odd way to write that :)

> +out:
> +	pr_devel("%ps -> %s() -> buf=%p size=%zu\n",
> +		 (void *)_RET_IP_, __func__, area->buf, area->size);

Can we drop those? They need a recompile to enable, so if someone needs
debugging they can just rewrite them - or use some sort of tracing
instead.

> +	return area;
> +}
> +
> +void __ref rtas_work_area_free(struct rtas_work_area *area)
> +{
> +	pr_devel("%ps -> %s() -> buf=%p size=%zu\n",
> +		 (void *)_RET_IP_, __func__, area->buf, area->size);

Ditto.

> +	if (!rwa_state->available) {
> +		rtas_work_area_free_early(area);
> +		return;
> +	}
> +
> +	gen_pool_free(rwa_state->gen_pool, (unsigned long)area->buf, area->size);
> +	mempool_free(area, &rwa_state->descriptor_pool);
> +	wake_up(&rwa_state->wqh);
> +}
> +
> +/*
> + * Initialization of the work area allocator happens in two parts. To
> + * reliably reserve an arena that satisfies RTAS addressing
> + * requirements, we must perform a memblock allocation early,
> + * immediately after RTAS instantiation. Then we have to wait until
> + * the slab allocator is up before setting up the descriptor mempool
> + * and adding the arena to a gen_pool.
> + */
> +static __init int rtas_work_area_allocator_init(void)
> +{
> +	const unsigned int order = ilog2(RTAS_WORK_AREA_MIN_ALLOC_SZ);
> +	const phys_addr_t pa_start = __pa(rwa_state->arena);
> +	const phys_addr_t pa_end = pa_start + RTAS_WORK_AREA_ARENA_SZ - 1;
> +	struct gen_pool *pool;
> +	const int nid = NUMA_NO_NODE;
> +	int err;
> +
> +	err = -ENOMEM;
> +	if (!rwa_state->arena)
> +		goto err_out;
> +
> +	pool = gen_pool_create(order, nid);
> +	if (!pool)
> +		goto err_out;
> +	/*
> +	 * All RTAS functions that consume work areas are OK with
> +	 * natural alignment, when they have alignment requirements at
> +	 * all.
> +	 */
> +	gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);
> +
> +	err = gen_pool_add(pool, (unsigned long)rwa_state->arena,
> +			   RTAS_WORK_AREA_ARENA_SZ, nid);
> +	if (err)
> +		goto err_destroy;
> +
> +	err = mempool_init_kmalloc_pool(&rwa_state->descriptor_pool, 1,
> +					sizeof(struct rtas_work_area));
> +	if (err)
> +		goto err_destroy;
> +
> +	rwa_state->gen_pool = pool;
> +	rwa_state->available = true;
> +
> +	pr_debug("arena [%pa-%pa] (%uK), min/max alloc sizes %u/%u\n",
> +		 &pa_start, &pa_end,
> +		 RTAS_WORK_AREA_ARENA_SZ / SZ_1K,
> +		 RTAS_WORK_AREA_MIN_ALLOC_SZ,
> +		 RTAS_WORK_AREA_MAX_ALLOC_SZ);
> +
> +	return 0;
> +
> +err_destroy:
> +	gen_pool_destroy(pool);
> +err_out:
> +	return err;
> +}
> +machine_arch_initcall(pseries, rtas_work_area_allocator_init);

Should it live in platforms/pseries then?

> +/**
> + * rtas_work_area_reserve_arena() - reserve memory suitable for RTAS work areas.
> + */
> +int __init rtas_work_area_reserve_arena(const phys_addr_t limit)
> +{
> +	const phys_addr_t align = RTAS_WORK_AREA_ARENA_ALIGN;
> +	const phys_addr_t size = RTAS_WORK_AREA_ARENA_SZ;
> +	const phys_addr_t min = MEMBLOCK_LOW_LIMIT;
> +	const int nid = NUMA_NO_NODE;

This should probably also be restricted to pseries?

cheers
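For concreteness, the shape mpe is suggesting would be roughly the
following sketch (same members and initializers as the patch, with only
the pointer indirection removed; call sites then read rwa_state.available,
rwa_state.gen_pool, and so on):

/*
 * Sketch only: anonymous struct type, accessed directly rather than
 * through a module-level pointer. The initializers are unchanged from
 * the patch apart from the variable name.
 */
static struct {
	struct gen_pool *gen_pool;
	char *arena;
	struct mutex mutex; /* serializes allocations */
	struct wait_queue_head wqh;
	mempool_t descriptor_pool;
	bool available;
} rwa_state = {
	.mutex = __MUTEX_INITIALIZER(rwa_state.mutex),
	.wqh = __WAIT_QUEUE_HEAD_INITIALIZER(rwa_state.wqh),
};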
Michael Ellerman <mpe@ellerman.id.au> writes:
> Nathan Lynch via B4 Submission Endpoint
> <devnull+nathanl.linux.ibm.com@kernel.org> writes:
>> diff --git a/arch/powerpc/include/asm/rtas-work-area.h b/arch/powerpc/include/asm/rtas-work-area.h
>> new file mode 100644
>> index 000000000000..76ccb039cc37
>> --- /dev/null
>> +++ b/arch/powerpc/include/asm/rtas-work-area.h
>> @@ -0,0 +1,45 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +#ifndef POWERPC_RTAS_WORK_AREA_H
>> +#define POWERPC_RTAS_WORK_AREA_H
>
> The usual style would be _ASM_POWERPC_RTAS_WORK_AREA_H.

OK. (will change in all new headers)

>> +static struct rtas_work_area_allocator_state {
>> +	struct gen_pool *gen_pool;
>> +	char *arena;
>> +	struct mutex mutex; /* serializes allocations */
>> +	struct wait_queue_head wqh;
>> +	mempool_t descriptor_pool;
>> +	bool available;
>> +} rwa_state_ = {
>> +	.mutex = __MUTEX_INITIALIZER(rwa_state_.mutex),
>> +	.wqh = __WAIT_QUEUE_HEAD_INITIALIZER(rwa_state_.wqh),
>> +};
>> +static struct rtas_work_area_allocator_state *rwa_state = &rwa_state_;
>
> I assumed the pointer was so you could swap this out at runtime or
> something, but I don't think you do.
>
> Any reason not to drop the pointer and just use rwa_state.foo accessors?
> That would also allow the struct to be anonymous.
>
> Or if you have the pointer you can at least make it NULL prior to init
> and avoid the need for "available".

I think it's there because earlier versions of this that I never posted
had unit tests. I'll either resurrect those or reduce the indirection.

>> +/*
>> + * A single work area buffer and descriptor to serve requests early in
>> + * boot before the allocator is fully initialized.
>> + */
>> +static bool early_work_area_in_use __initdata;
>> +static char early_work_area_buf[SZ_4K] __initdata;
>
> That should be page aligned I think?

Yes. It happens to be safe in this version because
ibm,get-system-parameter, which has no alignment requirement, is the
only function used early enough to use the buffer. But that's too
fragile.

>> +static struct rtas_work_area early_work_area __initdata = {
>> +	.buf = early_work_area_buf,
>> +	.size = sizeof(early_work_area_buf),
>> +};
>> +
>> +
>> +static struct rtas_work_area * __init rtas_work_area_alloc_early(size_t size)
>> +{
>> +	WARN_ON(size > early_work_area.size);
>> +	WARN_ON(early_work_area_in_use);
>> +	early_work_area_in_use = true;
>> +	memset(early_work_area.buf, 0, early_work_area.size);
>> +	return &early_work_area;
>> +}
>> +
>> +static void __init rtas_work_area_free_early(struct rtas_work_area *work_area)
>> +{
>> +	WARN_ON(work_area != &early_work_area);
>> +	WARN_ON(!early_work_area_in_use);
>> +	early_work_area_in_use = false;
>> +}
>> +
>> +struct rtas_work_area * __ref rtas_work_area_alloc(size_t size)
>> +{
>> +	struct rtas_work_area *area;
>> +	unsigned long addr;
>> +
>> +	might_sleep();
>> +
>> +	WARN_ON(size > RTAS_WORK_AREA_MAX_ALLOC_SZ);
>> +	size = min_t(size_t, size, RTAS_WORK_AREA_MAX_ALLOC_SZ);
>
> This seems unsafe.
>
> If you return a buffer smaller than the caller asks for they're likely
> to read/write past the end of it and corrupt memory.

OK, let's figure out another way to handle this.

> AFAIK genalloc doesn't have guard pages or anything fancy to save us
> from that - but maybe I'm wrong, I've never used it.

Yeah we would have to build our own thing on top of it. And I don't
think it could be something that traps on access, it would have to be a
check in rtas_work_area_free(), after the fact.

> There's only three callers in the end, seems like we should just return
> NULL if the size is too large and have callers check the return value.

There are more conversions to do, and a property I hope to maintain is
that requests can't fail. Existing users of rtas_data_buf don't have
error paths for failure to acquire the buffer.

I believe the allocation size passed to rtas_work_area_alloc() can be
known at build time in all cases. Maybe we could prevent inappropriate
requests from being built with a compile-time assertion (untested):

/* rtas-work-area.h */

static inline struct rtas_work_area *rtas_work_area_alloc(size_t sz)
{
	static_assert(sz < RTAS_WORK_AREA_MAX_ALLOC_SZ);
	return __rtas_work_area_alloc(sz);
}

I think this would be OK? If I can't make it work I'll fall back to
returning NULL as you suggest, but it will make for more churn (and
risk) in the conversions.

>> +	if (!rwa_state->available) {
>> +		area = rtas_work_area_alloc_early(size);
>> +		goto out;
>> +	}
>> +
>> +	/*
>> +	 * To ensure FCFS behavior and prevent a high rate of smaller
>> +	 * requests from starving larger ones, use the mutex to queue
>> +	 * allocations.
>> +	 */
>> +	mutex_lock(&rwa_state->mutex);
>> +	wait_event(rwa_state->wqh,
>> +		   (addr = gen_pool_alloc(rwa_state->gen_pool, size)) != 0);
>> +	mutex_unlock(&rwa_state->mutex);
>> +
>> +	area = mempool_alloc(&rwa_state->descriptor_pool, GFP_KERNEL);
>> +	*area = (typeof(*area)){
>> +		.size = size,
>> +		.buf = (char *)addr,
>> +	};
>
> That is an odd way to write that :)

yeah I'll change it.

>
>> +out:
>> +	pr_devel("%ps -> %s() -> buf=%p size=%zu\n",
>> +		 (void *)_RET_IP_, __func__, area->buf, area->size);
>
> Can we drop those? They need a recompile to enable, so if someone needs
> debugging they can just rewrite them - or use some sort of tracing
> instead.

Sure.

>> +machine_arch_initcall(pseries, rtas_work_area_allocator_init);
>
> Should it live in platforms/pseries then?

Yeah it probably ought to. I am pretty sure the "work area" construct
is PAPR-specific, and I haven't found any evidence that it's used on
non-pseries.

>> +/**
>> + * rtas_work_area_reserve_arena() - reserve memory suitable for RTAS work areas.
>> + */
>> +int __init rtas_work_area_reserve_arena(const phys_addr_t limit)
>> +{
>> +	const phys_addr_t align = RTAS_WORK_AREA_ARENA_ALIGN;
>> +	const phys_addr_t size = RTAS_WORK_AREA_ARENA_SZ;
>> +	const phys_addr_t min = MEMBLOCK_LOW_LIMIT;
>> +	const int nid = NUMA_NO_NODE;
>
> This should probably also be restricted to pseries?

Yes.
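One wrinkle with the inline-function version of the compile-time check:
in C, static_assert() requires an integer constant expression, and a
function parameter never qualifies, even when every caller passes a
constant, so the snippet as written would not compile. A macro wrapper
is one plausible way to realize the same idea; a sketch (untested here,
with __rtas_work_area_alloc() assumed to be the out-of-line allocator
from the snippet above):

/* rtas-work-area.h -- sketch only */

struct rtas_work_area *__rtas_work_area_alloc(size_t size);

/*
 * A macro sees the caller's argument expression itself, so
 * static_assert() can evaluate it at compile time; non-constant or
 * out-of-range sizes fail the build instead of failing at runtime.
 */
#define rtas_work_area_alloc(size_)	({				\
	static_assert(__builtin_constant_p(size_));			\
	static_assert((size_) > 0);					\
	static_assert((size_) <= RTAS_WORK_AREA_MAX_ALLOC_SZ);		\
	__rtas_work_area_alloc(size_);					\
})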
Nathan Lynch <nathanl@linux.ibm.com> writes:
> Michael Ellerman <mpe@ellerman.id.au> writes:
>> Nathan Lynch via B4 Submission Endpoint
>> <devnull+nathanl.linux.ibm.com@kernel.org> writes:
...
>>> +struct rtas_work_area * __ref rtas_work_area_alloc(size_t size)
>>> +{
>>> +	struct rtas_work_area *area;
>>> +	unsigned long addr;
>>> +
>>> +	might_sleep();
>>> +
>>> +	WARN_ON(size > RTAS_WORK_AREA_MAX_ALLOC_SZ);
>>> +	size = min_t(size_t, size, RTAS_WORK_AREA_MAX_ALLOC_SZ);
>>
>> This seems unsafe.
>>
>> If you return a buffer smaller than the caller asks for they're likely
>> to read/write past the end of it and corrupt memory.
>
> OK, let's figure out another way to handle this.
>
>> AFAIK genalloc doesn't have guard pages or anything fancy to save us
>> from that - but maybe I'm wrong, I've never used it.
>
> Yeah we would have to build our own thing on top of it. And I don't
> think it could be something that traps on access, it would have to be a
> check in rtas_work_area_free(), after the fact.

I *think* we could use the MMU. We'd just have to allocate whole pages,
and then vmap() them (create a mapping in vmalloc space), and then give
the vmalloc space address back to the caller. They'd then operate on
that address, meaning any overflow would trap.

You already have rtas_work_area_phys() for passing the phys address to
RTAS.

But that would be a lot more complicated than your suggestion below.

>> There's only three callers in the end, seems like we should just return
>> NULL if the size is too large and have callers check the return value.
>
> There are more conversions to do, and a property I hope to maintain is
> that requests can't fail. Existing users of rtas_data_buf don't have
> error paths for failure to acquire the buffer.
>
> I believe the allocation size passed to rtas_work_area_alloc() can be
> known at build time in all cases. Maybe we could prevent inappropriate
> requests from being built with a compile-time assertion (untested):
>
> /* rtas-work-area.h */
>
> static inline struct rtas_work_area *rtas_work_area_alloc(size_t sz)
> {
> 	static_assert(sz < RTAS_WORK_AREA_MAX_ALLOC_SZ);
> 	return __rtas_work_area_alloc(sz);
> }
>
> I think this would be OK? If I can't make it work I'll fall back to
> returning NULL as you suggest, but it will make for more churn (and
> risk) in the conversions.

Yeah if the sizes are always known at compile time that is a much better
solution.

cheers
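For reference, the vmap() idea above might look roughly like the sketch
below, assuming the backing pages still come out of the physically
contiguous, RTAS-addressable arena (the function name is invented). One
complication it glosses over: rtas_work_area_phys() could no longer use
__pa() on the alias, and would need vmalloc_to_pfn() or a remembered
arena address instead.

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/*
 * Sketch: hand the caller a vmalloc-space alias of the arena pages
 * backing a work area. The vmalloc area inserts a guard page after
 * each mapping, so running off the end of the buffer faults instead
 * of silently corrupting the rest of the arena. Whole pages only.
 */
static void *work_area_guarded_alias(void *arena_buf, size_t size)
{
	unsigned int npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
	struct page **pages;
	unsigned int i;
	void *alias;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	for (i = 0; i < npages; i++)
		pages[i] = virt_to_page(arena_buf + i * PAGE_SIZE);

	alias = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
	kfree(pages);	/* only needed while building the mapping */
	return alias;	/* tear down with vunmap() in the free path */
}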
diff --git a/arch/powerpc/include/asm/rtas-work-area.h b/arch/powerpc/include/asm/rtas-work-area.h
new file mode 100644
index 000000000000..76ccb039cc37
--- /dev/null
+++ b/arch/powerpc/include/asm/rtas-work-area.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef POWERPC_RTAS_WORK_AREA_H
+#define POWERPC_RTAS_WORK_AREA_H
+
+#include <linux/types.h>
+
+#include <asm/page.h>
+
+/**
+ * struct rtas_work_area - RTAS work area descriptor.
+ *
+ * Descriptor for a "work area" in PAPR terminology that satisfies
+ * RTAS addressing requirements.
+ */
+struct rtas_work_area {
+	/* private: Use the APIs provided below. */
+	char *buf;
+	size_t size;
+};
+
+struct rtas_work_area *rtas_work_area_alloc(size_t size);
+void rtas_work_area_free(struct rtas_work_area *area);
+
+static inline char *rtas_work_area_raw_buf(const struct rtas_work_area *area)
+{
+	return area->buf;
+}
+
+static inline size_t rtas_work_area_size(const struct rtas_work_area *area)
+{
+	return area->size;
+}
+
+static inline phys_addr_t rtas_work_area_phys(const struct rtas_work_area *area)
+{
+	return __pa(area->buf);
+}
+
+/*
+ * Early setup for the work area allocator. Call from
+ * rtas_initialize() only.
+ */
+int rtas_work_area_reserve_arena(phys_addr_t);
+
+#endif /* POWERPC_RTAS_WORK_AREA_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 9b6146056e48..69e652e319a4 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -90,7 +90,8 @@ obj-$(CONFIG_PPC_BOOK3S_IDLE)	+= idle_book3s.o
 procfs-y			:= proc_powerpc.o
 obj-$(CONFIG_PROC_FS)		+= $(procfs-y)
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)	:= rtas_pci.o
-obj-$(CONFIG_PPC_RTAS)		+= rtas_entry.o rtas.o rtas-rtc.o $(rtaspci-y-y)
+obj-$(CONFIG_PPC_RTAS)		+= rtas_entry.o rtas.o rtas-rtc.o $(rtaspci-y-y) \
+				   rtas-work-area.o
 obj-$(CONFIG_PPC_RTAS_DAEMON)	+= rtasd.o
 obj-$(CONFIG_RTAS_FLASH)	+= rtas_flash.o
 obj-$(CONFIG_RTAS_PROC)	+= rtas-proc.o
diff --git a/arch/powerpc/kernel/rtas-work-area.c b/arch/powerpc/kernel/rtas-work-area.c
new file mode 100644
index 000000000000..75950e13a0fe
--- /dev/null
+++ b/arch/powerpc/kernel/rtas-work-area.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "rtas-work-area: " fmt
+
+#include <linux/genalloc.h>
+#include <linux/log2.h>
+#include <linux/kernel.h>
+#include <linux/memblock.h>
+#include <linux/mempool.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
+#include <linux/numa.h>
+#include <linux/sizes.h>
+#include <linux/wait.h>
+
+#include <asm/machdep.h>
+#include <asm/rtas-work-area.h>
+
+enum {
+	/*
+	 * Ensure the pool is page-aligned.
+	 */
+	RTAS_WORK_AREA_ARENA_ALIGN = PAGE_SIZE,
+
+	RTAS_WORK_AREA_ARENA_SZ = SZ_256K,
+	/*
+	 * The smallest known work area size is for ibm,get-vpd's
+	 * location code argument, which is limited to 79 characters
+	 * plus 1 nul terminator.
+	 *
+	 * PAPR+ 7.3.20 ibm,get-vpd RTAS Call
+	 * PAPR+ 12.3.2.4 Converged Location Code Rules - Length Restrictions
+	 */
+	RTAS_WORK_AREA_MIN_ALLOC_SZ = roundup_pow_of_two(80),
+	/*
+	 * Don't let a single allocation claim the whole arena.
+	 */
+	RTAS_WORK_AREA_MAX_ALLOC_SZ = RTAS_WORK_AREA_ARENA_SZ / 2,
+};
+
+static struct rtas_work_area_allocator_state {
+	struct gen_pool *gen_pool;
+	char *arena;
+	struct mutex mutex; /* serializes allocations */
+	struct wait_queue_head wqh;
+	mempool_t descriptor_pool;
+	bool available;
+} rwa_state_ = {
+	.mutex = __MUTEX_INITIALIZER(rwa_state_.mutex),
+	.wqh = __WAIT_QUEUE_HEAD_INITIALIZER(rwa_state_.wqh),
+};
+static struct rtas_work_area_allocator_state *rwa_state = &rwa_state_;
+
+/*
+ * A single work area buffer and descriptor to serve requests early in
+ * boot before the allocator is fully initialized.
+ */
+static bool early_work_area_in_use __initdata;
+static char early_work_area_buf[SZ_4K] __initdata;
+static struct rtas_work_area early_work_area __initdata = {
+	.buf = early_work_area_buf,
+	.size = sizeof(early_work_area_buf),
+};
+
+
+static struct rtas_work_area * __init rtas_work_area_alloc_early(size_t size)
+{
+	WARN_ON(size > early_work_area.size);
+	WARN_ON(early_work_area_in_use);
+	early_work_area_in_use = true;
+	memset(early_work_area.buf, 0, early_work_area.size);
+	return &early_work_area;
+}
+
+static void __init rtas_work_area_free_early(struct rtas_work_area *work_area)
+{
+	WARN_ON(work_area != &early_work_area);
+	WARN_ON(!early_work_area_in_use);
+	early_work_area_in_use = false;
+}
+
+struct rtas_work_area * __ref rtas_work_area_alloc(size_t size)
+{
+	struct rtas_work_area *area;
+	unsigned long addr;
+
+	might_sleep();
+
+	WARN_ON(size > RTAS_WORK_AREA_MAX_ALLOC_SZ);
+	size = min_t(size_t, size, RTAS_WORK_AREA_MAX_ALLOC_SZ);
+
+	if (!rwa_state->available) {
+		area = rtas_work_area_alloc_early(size);
+		goto out;
+	}
+
+	/*
+	 * To ensure FCFS behavior and prevent a high rate of smaller
+	 * requests from starving larger ones, use the mutex to queue
+	 * allocations.
+	 */
+	mutex_lock(&rwa_state->mutex);
+	wait_event(rwa_state->wqh,
+		   (addr = gen_pool_alloc(rwa_state->gen_pool, size)) != 0);
+	mutex_unlock(&rwa_state->mutex);
+
+	area = mempool_alloc(&rwa_state->descriptor_pool, GFP_KERNEL);
+	*area = (typeof(*area)){
+		.size = size,
+		.buf = (char *)addr,
+	};
+out:
+	pr_devel("%ps -> %s() -> buf=%p size=%zu\n",
+		 (void *)_RET_IP_, __func__, area->buf, area->size);

+	return area;
+}
+
+void __ref rtas_work_area_free(struct rtas_work_area *area)
+{
+	pr_devel("%ps -> %s() -> buf=%p size=%zu\n",
+		 (void *)_RET_IP_, __func__, area->buf, area->size);
+
+	if (!rwa_state->available) {
+		rtas_work_area_free_early(area);
+		return;
+	}
+
+	gen_pool_free(rwa_state->gen_pool, (unsigned long)area->buf, area->size);
+	mempool_free(area, &rwa_state->descriptor_pool);
+	wake_up(&rwa_state->wqh);
+}
+
+/*
+ * Initialization of the work area allocator happens in two parts. To
+ * reliably reserve an arena that satisfies RTAS addressing
+ * requirements, we must perform a memblock allocation early,
+ * immediately after RTAS instantiation. Then we have to wait until
+ * the slab allocator is up before setting up the descriptor mempool
+ * and adding the arena to a gen_pool.
+ */
+static __init int rtas_work_area_allocator_init(void)
+{
+	const unsigned int order = ilog2(RTAS_WORK_AREA_MIN_ALLOC_SZ);
+	const phys_addr_t pa_start = __pa(rwa_state->arena);
+	const phys_addr_t pa_end = pa_start + RTAS_WORK_AREA_ARENA_SZ - 1;
+	struct gen_pool *pool;
+	const int nid = NUMA_NO_NODE;
+	int err;
+
+	err = -ENOMEM;
+	if (!rwa_state->arena)
+		goto err_out;
+
+	pool = gen_pool_create(order, nid);
+	if (!pool)
+		goto err_out;
+	/*
+	 * All RTAS functions that consume work areas are OK with
+	 * natural alignment, when they have alignment requirements at
+	 * all.
+	 */
+	gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);
+
+	err = gen_pool_add(pool, (unsigned long)rwa_state->arena,
+			   RTAS_WORK_AREA_ARENA_SZ, nid);
+	if (err)
+		goto err_destroy;
+
+	err = mempool_init_kmalloc_pool(&rwa_state->descriptor_pool, 1,
+					sizeof(struct rtas_work_area));
+	if (err)
+		goto err_destroy;
+
+	rwa_state->gen_pool = pool;
+	rwa_state->available = true;
+
+	pr_debug("arena [%pa-%pa] (%uK), min/max alloc sizes %u/%u\n",
+		 &pa_start, &pa_end,
+		 RTAS_WORK_AREA_ARENA_SZ / SZ_1K,
+		 RTAS_WORK_AREA_MIN_ALLOC_SZ,
+		 RTAS_WORK_AREA_MAX_ALLOC_SZ);
+
+	return 0;
+
+err_destroy:
+	gen_pool_destroy(pool);
+err_out:
+	return err;
+}
+machine_arch_initcall(pseries, rtas_work_area_allocator_init);
+
+/**
+ * rtas_work_area_reserve_arena() - reserve memory suitable for RTAS work areas.
+ */
+int __init rtas_work_area_reserve_arena(const phys_addr_t limit)
+{
+	const phys_addr_t align = RTAS_WORK_AREA_ARENA_ALIGN;
+	const phys_addr_t size = RTAS_WORK_AREA_ARENA_SZ;
+	const phys_addr_t min = MEMBLOCK_LOW_LIMIT;
+	const int nid = NUMA_NO_NODE;
+
+	rwa_state->arena = memblock_alloc_try_nid(size, align, min, limit, nid);
+	if (!rwa_state->arena)
+		return -ENOMEM;
+
+	return 0;
+}
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 3290f25b9b34..41c430dc40c2 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -36,6 +36,7 @@
 #include <asm/machdep.h>
 #include <asm/mmu.h>
 #include <asm/page.h>
+#include <asm/rtas-work-area.h>
 #include <asm/rtas.h>
 #include <asm/time.h>
 #include <asm/trace.h>
@@ -1938,6 +1939,8 @@ void __init rtas_initialize(void)
 #endif
 	ibm_open_errinjct_token = rtas_token("ibm,open-errinjct");
 	ibm_errinjct_token = rtas_token("ibm,errinjct");
+
+	rtas_work_area_reserve_arena(rtas_region);
 }
 
 int __init early_init_dt_scan_rtas(unsigned long node,
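For a sense of how the API reads at a call site, here is a hypothetical
consumer (the token and RTAS argument layout are invented for
illustration; the real conversions come later in the series):

#include <linux/printk.h>
#include <linux/sizes.h>

#include <asm/rtas.h>
#include <asm/rtas-work-area.h>

/*
 * Hypothetical caller, for illustration only: have RTAS fill the work
 * area, then read the result back through the raw buffer. Allocation
 * cannot fail by design, so there is no NULL check.
 */
static int example_rtas_fill(int token)
{
	struct rtas_work_area *work_area;
	u32 addr, len;
	int fwrc;

	work_area = rtas_work_area_alloc(SZ_4K);

	/* RTAS addresses work areas physically, below the RTAS limit. */
	addr = rtas_work_area_phys(work_area);
	len = rtas_work_area_size(work_area);

	fwrc = rtas_call(token, 2, 1, NULL, addr, len);
	if (fwrc == 0)
		pr_info("first byte: %#x\n",
			rtas_work_area_raw_buf(work_area)[0] & 0xff);

	rtas_work_area_free(work_area);

	return fwrc;
}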