Message ID | 20201113153333.290505-1-clombard@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
Series | ocxl: Mmio invalidation support | expand |
Context | Check | Description |
---|---|---|
snowpatch_ozlabs/apply_patch | success | Successfully applied on branch powerpc/merge (80ecbe16c827714ce3741ed1f1d34488b903e717) |
snowpatch_ozlabs/build-ppc64le | success | Build succeeded |
snowpatch_ozlabs/build-ppc64be | success | Build succeeded |
snowpatch_ozlabs/build-ppc64e | success | Build succeeded |
snowpatch_ozlabs/build-pmac32 | success | Build succeeded |
snowpatch_ozlabs/checkpatch | warning | total: 1 errors, 6 warnings, 28 checks, 479 lines checked |
snowpatch_ozlabs/needsstable | success | Patch has no Fixes tags |
On 13/11/2020 16:33, Christophe Lombard wrote: > OpenCAPI 4.0/5.0 with TLBI/SLBI Snooping, is not used due to performance > problems caused by the PAU having to process all incoming TLBI/SLBI > commands which will cause them to back up on the PowerBus. > > When the Address Translation Mode requires TLB and SLB Invalidate > operations to be initiated using MMIO registers, a set of registers like > the following is used: > • XTS MMIO ATSD0 LPARID register > • XTS MMIO ATSD0 AVA register > • XTS MMIO ATSD0 launch register, write access initiates a shoot down > • XTS MMIO ATSD0 status register > > The MMIO based mechanism also blocks the NPU/PAU from snooping TLBIE > commands from the PowerBus. > > The Shootdown commands (ATSD) will be generated using MMIO registers > in the NPU/PAU and sent to the device. > > Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com> > --- > arch/powerpc/include/asm/pnv-ocxl.h | 2 + > arch/powerpc/platforms/powernv/ocxl.c | 19 +++ > drivers/misc/ocxl/link.c | 180 ++++++++++++++++++++++---- > drivers/misc/ocxl/ocxl_internal.h | 46 ++++++- > drivers/misc/ocxl/trace.h | 125 ++++++++++++++++++ > 5 files changed, 348 insertions(+), 24 deletions(-) > > diff --git a/arch/powerpc/include/asm/pnv-ocxl.h b/arch/powerpc/include/asm/pnv-ocxl.h > index d37ededca3ee..4a23abcc347b 100644 > --- a/arch/powerpc/include/asm/pnv-ocxl.h > +++ b/arch/powerpc/include/asm/pnv-ocxl.h > @@ -28,4 +28,6 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, void **p > void pnv_ocxl_spa_release(void *platform_data); > int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle); > > +extern int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid, > + uint64_t lpcr); "extern" is useless > #endif /* _ASM_PNV_OCXL_H */ > diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c > index ecdad219d704..100546ea635f 100644 > --- a/arch/powerpc/platforms/powernv/ocxl.c > +++ 
b/arch/powerpc/platforms/powernv/ocxl.c > @@ -483,3 +483,22 @@ int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle) > return rc; > } > EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache); > + > +int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid, > + uint64_t lpcr) > +{ > + struct pci_controller *hose = pci_bus_to_host(dev->bus); > + struct pnv_phb *phb = hose->private_data; > + u32 bdfn; > + int rc; > + > + bdfn = (dev->bus->number << 8) | dev->devfn; I was told a bit too late that pci_dev_id() exists, so we should probably use from now on. > + rc = opal_npu_map_lpar(phb->opal_id, bdfn, lparid, lpcr); > + if (rc) { > + dev_err(&dev->dev, "Error mapping device to LPAR: %d\n", rc); > + return -EINVAL; > + } > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(pnv_ocxl_map_lpar); > diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c > index fd73d3bc0eb6..9b5b77d40734 100644 > --- a/drivers/misc/ocxl/link.c > +++ b/drivers/misc/ocxl/link.c Overall, there are many changes in that file and it would help the review if it could be broken up in a set of smaller patches. > @@ -4,6 +4,8 @@ > #include <linux/mutex.h> > #include <linux/mm_types.h> > #include <linux/mmu_context.h> > +#include <linux/mm.h> > +#include <linux/mmu_notifier.h> > #include <asm/copro.h> > #include <asm/pnv-ocxl.h> > #include <asm/xive.h> > @@ -33,6 +35,31 @@ > > #define SPA_PE_VALID 0x80000000 > > +struct spa; > + > +/* > + * A opencapi link can be used be by several PCI functions. We have > + * one link per device slot. 
> + * > + * A linked list of opencapi links should suffice, as there's a > + * limited number of opencapi slots on a system and lookup is only > + * done when the device is probed > + */ > +struct ocxl_link { > + struct list_head list; > + struct kref ref; > + int domain; > + int bus; > + int dev; > + u64 mmio_atsd; /* ATSD physical address */ > + void __iomem *base; /* ATSD register virtual address */ > + spinlock_t atsd_lock; // to serialize shootdowns > + atomic_t irq_available; > + struct spa *spa; > + void *platform_data; > +}; > +static struct list_head links_list = LIST_HEAD_INIT(links_list); > +static DEFINE_MUTEX(links_list_lock); > > struct pe_data { > struct mm_struct *mm; > @@ -41,6 +68,8 @@ struct pe_data { > /* opaque pointer to be passed to the above callback */ > void *xsl_err_data; > struct rcu_head rcu; > + struct ocxl_link *link; > + struct mmu_notifier mmu_notifier; > }; > > struct spa { > @@ -69,27 +98,6 @@ struct spa { > } xsl_fault; > }; > > -/* > - * A opencapi link can be used be by several PCI functions. We have > - * one link per device slot. 
> - * > - * A linked list of opencapi links should suffice, as there's a > - * limited number of opencapi slots on a system and lookup is only > - * done when the device is probed > - */ > -struct ocxl_link { > - struct list_head list; > - struct kref ref; > - int domain; > - int bus; > - int dev; > - atomic_t irq_available; > - struct spa *spa; > - void *platform_data; > -}; > -static struct list_head links_list = LIST_HEAD_INIT(links_list); > -static DEFINE_MUTEX(links_list_lock); > - > enum xsl_response { > CONTINUE, > ADDRESS_ERROR, > @@ -126,6 +134,8 @@ static void ack_irq(struct spa *spa, enum xsl_response r) > } > } > > +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops; > + > static void xsl_fault_handler_bh(struct work_struct *fault_work) > { > vm_fault_t flt = 0; > @@ -376,6 +386,7 @@ static void free_spa(struct ocxl_link *link) > > static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link) > { > + struct pci_controller *hose = pci_bus_to_host(dev->bus); > struct ocxl_link *link; > int rc; > > @@ -403,6 +414,22 @@ static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_l > if (rc) > goto err_xsl_irq; > > + /* Since OpenCAPI 5.0, Address Translation Mode requires TLB > + * and SLB Invalidate operations to be initiated using MMIO > + * registers > + */ > + if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", > + 0, &link->mmio_atsd)) { > + dev_info(&dev->dev, "No available ATSD found\n"); > + } > + if (link->mmio_atsd) { > + link->base = ioremap(link->mmio_atsd, 24); > + if (!link->base) > + dev_warn(&dev->dev, "ioremap failed - mmio_atsd: %#llx\n", link->mmio_atsd); > + else > + pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0); > + } To stay coherent with how we've done things like interrupts, the device tree parsing and ioremap() for the shootdown page should be done in a platform-specific file (powernv) Also, could we get a more descriptive name than link->base? 
> + > *out_link = link; > return 0; > > @@ -464,12 +491,101 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle) > { > struct ocxl_link *link = (struct ocxl_link *) link_handle; > > + if (link->base) { > + iounmap(link->base); > + link->base = NULL; > + } What's allocated in alloc_link(), i.e. the shootdown page, should be released in release_xsl() and not in ocxl_link_release() (it's too early in case of several functions). > mutex_lock(&links_list_lock); > kref_put(&link->ref, release_xsl); > mutex_unlock(&links_list_lock); > } > EXPORT_SYMBOL_GPL(ocxl_link_release); > > +static void tlb_invalidate(struct ocxl_link *link, > + unsigned long pid, > + unsigned long addr) > +{ > + unsigned long timeout = jiffies + (HZ * OCXL_ATSD_TIMEOUT); > + uint64_t val; > + int pend; > + > + if (!link->base) > + return; > + > + spin_lock(&link->atsd_lock); Looks like atsd_lock is not initialized. > + if (addr) { > + /* load Abbreviated Virtual Address register with > + * the necessary value > + */ > + val = SETFIELD(XTS_ATSD_AVA_AVA, 0ull, addr >> (63-51)); > + out_be64(link->base + XTS_ATSD_AVA, val); > + eieio(); That eieio() needs a comment. > + trace_ocxl_mmu_notifier_mmio_atsd_ava(val, pid); > + } > + > + /* Write access initiates a shoot down to initiate the > + * TLB Invalidate command > + */ > + val = XTS_ATSD_LNCH_R; All that code should be in a platform-specific file. 
> + if (addr) { > + val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b00); > + val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b00); > + } else { > + val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b10); > + val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b01); > + val |= XTS_ATSD_LNCH_OCAPI_SINGLETON; > + } > + val |= XTS_ATSD_LNCH_PRS; > + val = SETFIELD(XTS_ATSD_LNCH_AP, val, 0b101); > + val = SETFIELD(XTS_ATSD_LNCH_PID, val, pid); > + out_be64(link->base + XTS_ATSD_LNCH, val); > + trace_ocxl_mmu_notifier_mmio_atsd_lnch(val, addr, pid); > + > + /* Poll the ATSD status register to determine when the > + * TLB Invalidate has been completed. > + */ > + val = in_be64(link->base + XTS_ATSD_STAT); > + pend = val >> 63; > + trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid); > + > + while (pend) { > + if (time_after_eq(jiffies, timeout)) { > + pr_err("%s - Timeout while reading XTS MMIO ATSD status register (val=%#llx, pidr=0x%lx)\n", > + __func__, val, pid); > + spin_unlock(&link->atsd_lock); > + return; > + } > + cpu_relax(); > + val = in_be64(link->base + XTS_ATSD_STAT); > + pend = val >> 63; > + } > + spin_unlock(&link->atsd_lock); > + trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid); > +} > + > +static void invalidate_range_end(struct mmu_notifier *mn, > + const struct mmu_notifier_range *range) > +{ > + struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier); > + struct ocxl_link *link = pe_data->link; > + struct mm_struct *mm = mn->mm; > + unsigned long addr, pid, page_size = PAGE_SIZE; > + > + pid = mm->context.id; > + trace_ocxl_mmu_notifier_range(range->start, range->end, pid); > + > + for (addr = range->start; addr < range->end; addr += page_size) > + tlb_invalidate(link, pid, addr); > +} > + > +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = { > + /* invalidate_range_end() is called when all pages in the > + * range have been unmapped and the pages have been freed by > + * the VM > + */ > + .invalidate_range_end = invalidate_range_end, > +}; > + See 
comments in mmu_notifier.h. Looks like we shouldn't be using invalidate_range_end(): it's too late. Did we test abrupt death of the host process? Do we need to also use .release()? > static u64 calculate_cfg_state(bool kernel) > { > u64 state; > @@ -517,7 +633,7 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, > goto unlock; > } > > - pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL); > + pe_data = kzalloc(sizeof(*pe_data), GFP_KERNEL); > if (!pe_data) { > rc = -ENOMEM; > goto unlock; > @@ -526,9 +642,13 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, > pe_data->mm = mm; > pe_data->xsl_err_cb = xsl_err_cb; > pe_data->xsl_err_data = xsl_err_data; > + pe_data->link = link; > + pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops; > > memset(pe, 0, sizeof(struct ocxl_process_element)); > pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0)); > + pe->pasid = cpu_to_be32(pasid << (31 - 19)); > + pe->bdf = cpu_to_be32(1 << (31 - 15)); Why do we hard-code bdf 1? > pe->lpid = cpu_to_be32(mfspr(SPRN_LPID)); > pe->pid = cpu_to_be32(pidr); > pe->tid = cpu_to_be32(tidr); > @@ -540,8 +660,17 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, > * by the nest MMU. If we have a kernel context, TLBIs are > * already global. > */ > - if (mm) > + if (mm) { > mm_context_add_copro(mm); > + if (link->base) { > + /* Use MMIO registers for the TLB and SLB > + * Invalidate operations. > + */ > + trace_init_mmu_notifier(pasid, mm->context.id); > + mmu_notifier_register(&pe_data->mmu_notifier, mm); > + } > + } > + > /* > * Barrier is to make sure PE is visible in the SPA before it > * is used by the device. 
It also helps with the global TLBI > @@ -672,6 +801,11 @@ int ocxl_link_remove_pe(void *link_handle, int pasid) > WARN(1, "Couldn't find pe data when removing PE\n"); > } else { > if (pe_data->mm) { > + if (link->base) { > + trace_release_mmu_notifier(pasid, pe_data->mm->context.id); > + mmu_notifier_unregister(&pe_data->mmu_notifier, pe_data->mm); > + tlb_invalidate(link, pe_data->mm->context.id, 0ull); > + } > mm_context_remove_copro(pe_data->mm); > mmdrop(pe_data->mm); > } > diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h > index 0bad0a123af6..35d8be3cd270 100644 > --- a/drivers/misc/ocxl/ocxl_internal.h > +++ b/drivers/misc/ocxl/ocxl_internal.h > @@ -8,6 +8,48 @@ > #include <linux/list.h> > #include <misc/ocxl.h> > > +/* Find left shift from first set bit in mask */ > +#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1) > + > +/* Set field fname of oval to fval > + * NOTE: oval isn't modified, the combined result is returned > + */ > +#define SETFIELD(m, v, val) \ > + (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m))) > + > +#define OCXL_ATSD_TIMEOUT 1 > + > +/* 5.9.3.3 TLB Management Instructions - PowerISA tags workbook */ > +#define XTS_ATSD_LNCH 0x00 > +#define XTS_ATSD_LNCH_R PPC_BIT(0) /* Radix Invalidate */ > +#define XTS_ATSD_LNCH_RIC PPC_BITMASK(1,2) /* Radix Invalidation Control > + * 0b00 Just invalidate TLB. > + * 0b01 Invalidate just Page Walk Cache. > + * 0b10 Invalidate TLB, Page Walk Cache, and any > + * caching of Partition and Process Table Entries. > + */ > +#define XTS_ATSD_LNCH_LP PPC_BITMASK(3, 10) /* Number and Page Size of translations to be invalidated (HPT only ?) */ > +#define XTS_ATSD_LNCH_IS PPC_BITMASK(11, 12) /* Invalidation Criteria > + * 0b00 Invalidate just the target VA. > + * 0b01 Invalidate matching PID. 
> + */ > +#define XTS_ATSD_LNCH_PRS PPC_BIT(13) /* 0b1: Process Scope, 0b0: Partition Scope */ > +#define XTS_ATSD_LNCH_B PPC_BIT(14) /* Invalidation Flag */ > +#define XTS_ATSD_LNCH_AP PPC_BITMASK(15, 17) /* Actual Page Size to be invalidated > + * 000 4KB > + * 101 64KB > + * 001 2MB > + * 010 1GB > + */ > +#define XTS_ATSD_LNCH_L PPC_BIT(18) /* Defines the large page select (L=0b0 for 4KB pages, L=0b1 for large pages) */ > +#define XTS_ATSD_LNCH_PID PPC_BITMASK(19, 38) /* Process ID */ > +#define XTS_ATSD_LNCH_F PPC_BIT(39) /* NoFlush – Assumed to be 0b0 */ > +#define XTS_ATSD_LNCH_OCAPI_SLBI PPC_BIT(40) > +#define XTS_ATSD_LNCH_OCAPI_SINGLETON PPC_BIT(41) > +#define XTS_ATSD_AVA 0x08 > +#define XTS_ATSD_AVA_AVA PPC_BITMASK(0, 51) /* au lieu de 35*/ > +#define XTS_ATSD_STAT 0x10 > + > #define MAX_IRQ_PER_LINK 2000 > #define MAX_IRQ_PER_CONTEXT MAX_IRQ_PER_LINK > > @@ -84,7 +126,9 @@ struct ocxl_context { > > struct ocxl_process_element { > __be64 config_state; > - __be32 reserved1[11]; > + __be32 pasid; > + __be32 bdf; > + __be32 reserved1[9]; > __be32 lpid; > __be32 tid; > __be32 pid; > diff --git a/drivers/misc/ocxl/trace.h b/drivers/misc/ocxl/trace.h > index 17e21cb2addd..6171069d071a 100644 > --- a/drivers/misc/ocxl/trace.h > +++ b/drivers/misc/ocxl/trace.h I haven't looked at this yet. It should probably be in a separate patch. 
Fred > @@ -8,6 +8,131 @@ > > #include <linux/tracepoint.h> > > + > +TRACE_EVENT(ocxl_mmu_notifier_range, > + TP_PROTO(unsigned long start, unsigned long end, unsigned long pidr), > + TP_ARGS(start, end, pidr), > + > + TP_STRUCT__entry( > + __field(unsigned long, start) > + __field(unsigned long, end) > + __field(unsigned long, pidr) > + ), > + > + TP_fast_assign( > + __entry->start = start; > + __entry->end = end; > + __entry->pidr = pidr; > + ), > + > + TP_printk("start=0x%lx end=0x%lx pidr=0x%lx", > + __entry->start, > + __entry->end, > + __entry->pidr > + ) > +); > + > +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_ava, > + TP_PROTO(u64 val, unsigned long pidr), > + TP_ARGS(val, pidr), > + > + TP_STRUCT__entry( > + __field(u64, val) > + __field(unsigned long, pidr) > + ), > + > + TP_fast_assign( > + __entry->val = val; > + __entry->pidr = pidr; > + ), > + > + TP_printk("ATSD AVA: 0x%llx pidr=0x%lx", > + __entry->val, __entry->pidr > + ) > +); > + > +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_lnch, > + TP_PROTO(u64 val, unsigned long addr, unsigned long pidr), > + TP_ARGS(val, addr, pidr), > + > + TP_STRUCT__entry( > + __field(u64, val) > + __field(unsigned long, addr) > + __field(unsigned long, pidr) > + ), > + > + TP_fast_assign( > + __entry->val = val; > + __entry->addr = addr; > + __entry->pidr = pidr; > + ), > + > + TP_printk("ATSD LNCH: 0x%llx addr=0x%lx pidr=0x%lx", > + __entry->val, __entry->addr, __entry->pidr > + ) > +); > + > +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_stat, > + TP_PROTO(u64 val, unsigned long addr, unsigned long pidr), > + TP_ARGS(val, addr, pidr), > + > + TP_STRUCT__entry( > + __field(u64, val) > + __field(unsigned long, addr) > + __field(unsigned long, pidr) > + ), > + > + TP_fast_assign( > + __entry->val = val; > + __entry->addr = addr; > + __entry->pidr = pidr; > + ), > + > + TP_printk("ATSD STAT: 0x%llx addr=0x%lx pidr=0x%lx", > + __entry->val, __entry->addr, __entry->pidr > + ) > +); > + > +TRACE_EVENT(init_mmu_notifier, > + TP_PROTO(int 
pasid, unsigned long pidr), > + TP_ARGS(pasid, pidr), > + > + TP_STRUCT__entry( > + __field(int, pasid) > + __field(unsigned long, pidr) > + ), > + > + TP_fast_assign( > + __entry->pasid = pasid; > + __entry->pidr = pidr; > + ), > + > + TP_printk("pasid=%d, pidr=0x%lx", > + __entry->pasid, > + __entry->pidr > + ) > +); > + > +TRACE_EVENT(release_mmu_notifier, > + TP_PROTO(int pasid, unsigned long pidr), > + TP_ARGS(pasid, pidr), > + > + TP_STRUCT__entry( > + __field(int, pasid) > + __field(unsigned long, pidr) > + ), > + > + TP_fast_assign( > + __entry->pasid = pasid; > + __entry->pidr = pidr; > + ), > + > + TP_printk("pasid=%d, pidr=0x%lx", > + __entry->pasid, > + __entry->pidr > + ) > +); > + > DECLARE_EVENT_CLASS(ocxl_context, > TP_PROTO(pid_t pid, void *spa, int pasid, u32 pidr, u32 tidr), > TP_ARGS(pid, spa, pasid, pidr, tidr), >
On 14/11/20 2:33 am, Christophe Lombard wrote: > OpenCAPI 4.0/5.0 with TLBI/SLBI Snooping, is not used due to performance > problems caused by the PAU having to process all incoming TLBI/SLBI > commands which will cause them to back up on the PowerBus. > > When the Address Translation Mode requires TLB and SLB Invalidate > operations to be initiated using MMIO registers, a set of registers like > the following is used: > • XTS MMIO ATSD0 LPARID register > • XTS MMIO ATSD0 AVA register > • XTS MMIO ATSD0 launch register, write access initiates a shoot down > • XTS MMIO ATSD0 status register > > The MMIO based mechanism also blocks the NPU/PAU from snooping TLBIE > commands from the PowerBus. > > The Shootdown commands (ATSD) will be generated using MMIO registers > in the NPU/PAU and sent to the device. > > Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com> snowpatch has reported some minor checkpatch issues: https://openpower.xyz/job/snowpatch/job/snowpatch-linux-checkpatch/16267//artifact/linux/checkpatch.log
diff --git a/arch/powerpc/include/asm/pnv-ocxl.h b/arch/powerpc/include/asm/pnv-ocxl.h index d37ededca3ee..4a23abcc347b 100644 --- a/arch/powerpc/include/asm/pnv-ocxl.h +++ b/arch/powerpc/include/asm/pnv-ocxl.h @@ -28,4 +28,6 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, void **p void pnv_ocxl_spa_release(void *platform_data); int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle); +extern int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid, + uint64_t lpcr); #endif /* _ASM_PNV_OCXL_H */ diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c index ecdad219d704..100546ea635f 100644 --- a/arch/powerpc/platforms/powernv/ocxl.c +++ b/arch/powerpc/platforms/powernv/ocxl.c @@ -483,3 +483,22 @@ int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle) return rc; } EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache); + +int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid, + uint64_t lpcr) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + u32 bdfn; + int rc; + + bdfn = (dev->bus->number << 8) | dev->devfn; + rc = opal_npu_map_lpar(phb->opal_id, bdfn, lparid, lpcr); + if (rc) { + dev_err(&dev->dev, "Error mapping device to LPAR: %d\n", rc); + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(pnv_ocxl_map_lpar); diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index fd73d3bc0eb6..9b5b77d40734 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -4,6 +4,8 @@ #include <linux/mutex.h> #include <linux/mm_types.h> #include <linux/mmu_context.h> +#include <linux/mm.h> +#include <linux/mmu_notifier.h> #include <asm/copro.h> #include <asm/pnv-ocxl.h> #include <asm/xive.h> @@ -33,6 +35,31 @@ #define SPA_PE_VALID 0x80000000 +struct spa; + +/* + * A opencapi link can be used be by several PCI functions. We have + * one link per device slot. 
+ * + * A linked list of opencapi links should suffice, as there's a + * limited number of opencapi slots on a system and lookup is only + * done when the device is probed + */ +struct ocxl_link { + struct list_head list; + struct kref ref; + int domain; + int bus; + int dev; + u64 mmio_atsd; /* ATSD physical address */ + void __iomem *base; /* ATSD register virtual address */ + spinlock_t atsd_lock; // to serialize shootdowns + atomic_t irq_available; + struct spa *spa; + void *platform_data; +}; +static struct list_head links_list = LIST_HEAD_INIT(links_list); +static DEFINE_MUTEX(links_list_lock); struct pe_data { struct mm_struct *mm; @@ -41,6 +68,8 @@ struct pe_data { /* opaque pointer to be passed to the above callback */ void *xsl_err_data; struct rcu_head rcu; + struct ocxl_link *link; + struct mmu_notifier mmu_notifier; }; struct spa { @@ -69,27 +98,6 @@ struct spa { } xsl_fault; }; -/* - * A opencapi link can be used be by several PCI functions. We have - * one link per device slot. 
- * - * A linked list of opencapi links should suffice, as there's a - * limited number of opencapi slots on a system and lookup is only - * done when the device is probed - */ -struct ocxl_link { - struct list_head list; - struct kref ref; - int domain; - int bus; - int dev; - atomic_t irq_available; - struct spa *spa; - void *platform_data; -}; -static struct list_head links_list = LIST_HEAD_INIT(links_list); -static DEFINE_MUTEX(links_list_lock); - enum xsl_response { CONTINUE, ADDRESS_ERROR, @@ -126,6 +134,8 @@ static void ack_irq(struct spa *spa, enum xsl_response r) } } +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops; + static void xsl_fault_handler_bh(struct work_struct *fault_work) { vm_fault_t flt = 0; @@ -376,6 +386,7 @@ static void free_spa(struct ocxl_link *link) static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link) { + struct pci_controller *hose = pci_bus_to_host(dev->bus); struct ocxl_link *link; int rc; @@ -403,6 +414,22 @@ static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_l if (rc) goto err_xsl_irq; + /* Since OpenCAPI 5.0, Address Translation Mode requires TLB + * and SLB Invalidate operations to be initiated using MMIO + * registers + */ + if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", + 0, &link->mmio_atsd)) { + dev_info(&dev->dev, "No available ATSD found\n"); + } + if (link->mmio_atsd) { + link->base = ioremap(link->mmio_atsd, 24); + if (!link->base) + dev_warn(&dev->dev, "ioremap failed - mmio_atsd: %#llx\n", link->mmio_atsd); + else + pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0); + } + *out_link = link; return 0; @@ -464,12 +491,101 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle) { struct ocxl_link *link = (struct ocxl_link *) link_handle; + if (link->base) { + iounmap(link->base); + link->base = NULL; + } mutex_lock(&links_list_lock); kref_put(&link->ref, release_xsl); mutex_unlock(&links_list_lock); } EXPORT_SYMBOL_GPL(ocxl_link_release); 
+static void tlb_invalidate(struct ocxl_link *link, + unsigned long pid, + unsigned long addr) +{ + unsigned long timeout = jiffies + (HZ * OCXL_ATSD_TIMEOUT); + uint64_t val; + int pend; + + if (!link->base) + return; + + spin_lock(&link->atsd_lock); + if (addr) { + /* load Abbreviated Virtual Address register with + * the necessary value + */ + val = SETFIELD(XTS_ATSD_AVA_AVA, 0ull, addr >> (63-51)); + out_be64(link->base + XTS_ATSD_AVA, val); + eieio(); + trace_ocxl_mmu_notifier_mmio_atsd_ava(val, pid); + } + + /* Write access initiates a shoot down to initiate the + * TLB Invalidate command + */ + val = XTS_ATSD_LNCH_R; + if (addr) { + val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b00); + val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b00); + } else { + val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b10); + val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b01); + val |= XTS_ATSD_LNCH_OCAPI_SINGLETON; + } + val |= XTS_ATSD_LNCH_PRS; + val = SETFIELD(XTS_ATSD_LNCH_AP, val, 0b101); + val = SETFIELD(XTS_ATSD_LNCH_PID, val, pid); + out_be64(link->base + XTS_ATSD_LNCH, val); + trace_ocxl_mmu_notifier_mmio_atsd_lnch(val, addr, pid); + + /* Poll the ATSD status register to determine when the + * TLB Invalidate has been completed. 
+ */ + val = in_be64(link->base + XTS_ATSD_STAT); + pend = val >> 63; + trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid); + + while (pend) { + if (time_after_eq(jiffies, timeout)) { + pr_err("%s - Timeout while reading XTS MMIO ATSD status register (val=%#llx, pidr=0x%lx)\n", + __func__, val, pid); + spin_unlock(&link->atsd_lock); + return; + } + cpu_relax(); + val = in_be64(link->base + XTS_ATSD_STAT); + pend = val >> 63; + } + spin_unlock(&link->atsd_lock); + trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid); +} + +static void invalidate_range_end(struct mmu_notifier *mn, + const struct mmu_notifier_range *range) +{ + struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier); + struct ocxl_link *link = pe_data->link; + struct mm_struct *mm = mn->mm; + unsigned long addr, pid, page_size = PAGE_SIZE; + + pid = mm->context.id; + trace_ocxl_mmu_notifier_range(range->start, range->end, pid); + + for (addr = range->start; addr < range->end; addr += page_size) + tlb_invalidate(link, pid, addr); +} + +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = { + /* invalidate_range_end() is called when all pages in the + * range have been unmapped and the pages have been freed by + * the VM + */ + .invalidate_range_end = invalidate_range_end, +}; + static u64 calculate_cfg_state(bool kernel) { u64 state; @@ -517,7 +633,7 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, goto unlock; } - pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL); + pe_data = kzalloc(sizeof(*pe_data), GFP_KERNEL); if (!pe_data) { rc = -ENOMEM; goto unlock; @@ -526,9 +642,13 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, pe_data->mm = mm; pe_data->xsl_err_cb = xsl_err_cb; pe_data->xsl_err_data = xsl_err_data; + pe_data->link = link; + pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops; memset(pe, 0, sizeof(struct ocxl_process_element)); pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0)); + pe->pasid = 
cpu_to_be32(pasid << (31 - 19)); + pe->bdf = cpu_to_be32(1 << (31 - 15)); pe->lpid = cpu_to_be32(mfspr(SPRN_LPID)); pe->pid = cpu_to_be32(pidr); pe->tid = cpu_to_be32(tidr); @@ -540,8 +660,17 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr, * by the nest MMU. If we have a kernel context, TLBIs are * already global. */ - if (mm) + if (mm) { mm_context_add_copro(mm); + if (link->base) { + /* Use MMIO registers for the TLB and SLB + * Invalidate operations. + */ + trace_init_mmu_notifier(pasid, mm->context.id); + mmu_notifier_register(&pe_data->mmu_notifier, mm); + } + } + /* * Barrier is to make sure PE is visible in the SPA before it * is used by the device. It also helps with the global TLBI @@ -672,6 +801,11 @@ int ocxl_link_remove_pe(void *link_handle, int pasid) WARN(1, "Couldn't find pe data when removing PE\n"); } else { if (pe_data->mm) { + if (link->base) { + trace_release_mmu_notifier(pasid, pe_data->mm->context.id); + mmu_notifier_unregister(&pe_data->mmu_notifier, pe_data->mm); + tlb_invalidate(link, pe_data->mm->context.id, 0ull); + } mm_context_remove_copro(pe_data->mm); mmdrop(pe_data->mm); } diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h index 0bad0a123af6..35d8be3cd270 100644 --- a/drivers/misc/ocxl/ocxl_internal.h +++ b/drivers/misc/ocxl/ocxl_internal.h @@ -8,6 +8,48 @@ #include <linux/list.h> #include <misc/ocxl.h> +/* Find left shift from first set bit in mask */ +#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1) + +/* Set field fname of oval to fval + * NOTE: oval isn't modified, the combined result is returned + */ +#define SETFIELD(m, v, val) \ + (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m))) + +#define OCXL_ATSD_TIMEOUT 1 + +/* 5.9.3.3 TLB Management Instructions - PowerISA tags workbook */ +#define XTS_ATSD_LNCH 0x00 +#define XTS_ATSD_LNCH_R PPC_BIT(0) /* Radix Invalidate */ +#define XTS_ATSD_LNCH_RIC PPC_BITMASK(1,2) /* Radix Invalidation Control + * 0b00 Just 
invalidate TLB. + * 0b01 Invalidate just Page Walk Cache. + * 0b10 Invalidate TLB, Page Walk Cache, and any + * caching of Partition and Process Table Entries. + */ +#define XTS_ATSD_LNCH_LP PPC_BITMASK(3, 10) /* Number and Page Size of translations to be invalidated (HPT only ?) */ +#define XTS_ATSD_LNCH_IS PPC_BITMASK(11, 12) /* Invalidation Criteria + * 0b00 Invalidate just the target VA. + * 0b01 Invalidate matching PID. + */ +#define XTS_ATSD_LNCH_PRS PPC_BIT(13) /* 0b1: Process Scope, 0b0: Partition Scope */ +#define XTS_ATSD_LNCH_B PPC_BIT(14) /* Invalidation Flag */ +#define XTS_ATSD_LNCH_AP PPC_BITMASK(15, 17) /* Actual Page Size to be invalidated + * 000 4KB + * 101 64KB + * 001 2MB + * 010 1GB + */ +#define XTS_ATSD_LNCH_L PPC_BIT(18) /* Defines the large page select (L=0b0 for 4KB pages, L=0b1 for large pages) */ +#define XTS_ATSD_LNCH_PID PPC_BITMASK(19, 38) /* Process ID */ +#define XTS_ATSD_LNCH_F PPC_BIT(39) /* NoFlush – Assumed to be 0b0 */ +#define XTS_ATSD_LNCH_OCAPI_SLBI PPC_BIT(40) +#define XTS_ATSD_LNCH_OCAPI_SINGLETON PPC_BIT(41) +#define XTS_ATSD_AVA 0x08 +#define XTS_ATSD_AVA_AVA PPC_BITMASK(0, 51) /* au lieu de 35*/ +#define XTS_ATSD_STAT 0x10 + #define MAX_IRQ_PER_LINK 2000 #define MAX_IRQ_PER_CONTEXT MAX_IRQ_PER_LINK @@ -84,7 +126,9 @@ struct ocxl_context { struct ocxl_process_element { __be64 config_state; - __be32 reserved1[11]; + __be32 pasid; + __be32 bdf; + __be32 reserved1[9]; __be32 lpid; __be32 tid; __be32 pid; diff --git a/drivers/misc/ocxl/trace.h b/drivers/misc/ocxl/trace.h index 17e21cb2addd..6171069d071a 100644 --- a/drivers/misc/ocxl/trace.h +++ b/drivers/misc/ocxl/trace.h @@ -8,6 +8,131 @@ #include <linux/tracepoint.h> + +TRACE_EVENT(ocxl_mmu_notifier_range, + TP_PROTO(unsigned long start, unsigned long end, unsigned long pidr), + TP_ARGS(start, end, pidr), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->start = start; + 
__entry->end = end; + __entry->pidr = pidr; + ), + + TP_printk("start=0x%lx end=0x%lx pidr=0x%lx", + __entry->start, + __entry->end, + __entry->pidr + ) +); + +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_ava, + TP_PROTO(u64 val, unsigned long pidr), + TP_ARGS(val, pidr), + + TP_STRUCT__entry( + __field(u64, val) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->val = val; + __entry->pidr = pidr; + ), + + TP_printk("ATSD AVA: 0x%llx pidr=0x%lx", + __entry->val, __entry->pidr + ) +); + +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_lnch, + TP_PROTO(u64 val, unsigned long addr, unsigned long pidr), + TP_ARGS(val, addr, pidr), + + TP_STRUCT__entry( + __field(u64, val) + __field(unsigned long, addr) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->val = val; + __entry->addr = addr; + __entry->pidr = pidr; + ), + + TP_printk("ATSD LNCH: 0x%llx addr=0x%lx pidr=0x%lx", + __entry->val, __entry->addr, __entry->pidr + ) +); + +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_stat, + TP_PROTO(u64 val, unsigned long addr, unsigned long pidr), + TP_ARGS(val, addr, pidr), + + TP_STRUCT__entry( + __field(u64, val) + __field(unsigned long, addr) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->val = val; + __entry->addr = addr; + __entry->pidr = pidr; + ), + + TP_printk("ATSD STAT: 0x%llx addr=0x%lx pidr=0x%lx", + __entry->val, __entry->addr, __entry->pidr + ) +); + +TRACE_EVENT(init_mmu_notifier, + TP_PROTO(int pasid, unsigned long pidr), + TP_ARGS(pasid, pidr), + + TP_STRUCT__entry( + __field(int, pasid) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->pasid = pasid; + __entry->pidr = pidr; + ), + + TP_printk("pasid=%d, pidr=0x%lx", + __entry->pasid, + __entry->pidr + ) +); + +TRACE_EVENT(release_mmu_notifier, + TP_PROTO(int pasid, unsigned long pidr), + TP_ARGS(pasid, pidr), + + TP_STRUCT__entry( + __field(int, pasid) + __field(unsigned long, pidr) + ), + + TP_fast_assign( + __entry->pasid = pasid; + __entry->pidr = pidr; + ), 
+ + TP_printk("pasid=%d, pidr=0x%lx", + __entry->pasid, + __entry->pidr + ) +); + DECLARE_EVENT_CLASS(ocxl_context, TP_PROTO(pid_t pid, void *spa, int pasid, u32 pidr, u32 tidr), TP_ARGS(pid, spa, pasid, pidr, tidr),
OpenCAPI 4.0/5.0 with TLBI/SLBI snooping is not used because of performance problems: the PAU would have to process all incoming TLBI/SLBI commands, which would cause them to back up on the PowerBus. When the Address Translation Mode requires TLB and SLB Invalidate operations to be initiated using MMIO registers, a set of registers like the following is used: • XTS MMIO ATSD0 LPARID register • XTS MMIO ATSD0 AVA register • XTS MMIO ATSD0 launch register, write access initiates a shoot down • XTS MMIO ATSD0 status register The MMIO-based mechanism also blocks the NPU/PAU from snooping TLBIE commands from the PowerBus. The shootdown commands (ATSD) are generated using MMIO registers in the NPU/PAU and sent to the device. Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com> --- arch/powerpc/include/asm/pnv-ocxl.h | 2 + arch/powerpc/platforms/powernv/ocxl.c | 19 +++ drivers/misc/ocxl/link.c | 180 ++++++++++++++++++++++---- drivers/misc/ocxl/ocxl_internal.h | 46 ++++++- drivers/misc/ocxl/trace.h | 125 ++++++++++++++++++ 5 files changed, 348 insertions(+), 24 deletions(-)