
[V2,4/5] ocxl: Add mmu notifier

Message ID 20201120173241.59229-5-clombard@linux.vnet.ibm.com
State Changes Requested
Series ocxl: Mmio invalidation support

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (9d1aa2f025c6cc516125c42c70f6a9ce087c49ea)
snowpatch_ozlabs/checkpatch warning total: 0 errors, 0 warnings, 1 checks, 121 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Christophe Lombard Nov. 20, 2020, 5:32 p.m. UTC
Add an invalidate_range MMU notifier, used when required (i.e. when
ATSD access to the MMIO registers is available), to initiate TLB
invalidation commands.
For the time being, the ATSD0 set of registers is used by default.

The PASID and BDF values have to be configured in the Process Element
Entry (PEE), which must be set up to match the BDF/PASID of the AFU.

Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
---
 drivers/misc/ocxl/link.c | 58 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

Comments

Frederic Barrat Nov. 23, 2020, 10:40 a.m. UTC | #1
On 20/11/2020 18:32, Christophe Lombard wrote:
> Add an invalidate_range MMU notifier, used when required (i.e. when
> ATSD access to the MMIO registers is available), to initiate TLB
> invalidation commands.
> For the time being, the ATSD0 set of registers is used by default.
> 
> The PASID and BDF values have to be configured in the Process Element
> Entry (PEE), which must be set up to match the BDF/PASID of the AFU.
> 
> Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
> ---
>   drivers/misc/ocxl/link.c | 58 +++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 57 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
> index 20444db8a2bb..100bdfe9ec37 100644
> --- a/drivers/misc/ocxl/link.c
> +++ b/drivers/misc/ocxl/link.c
> @@ -2,8 +2,10 @@
>   // Copyright 2017 IBM Corp.
>   #include <linux/sched/mm.h>
>   #include <linux/mutex.h>
> +#include <linux/mm.h>
>   #include <linux/mm_types.h>
>   #include <linux/mmu_context.h>
> +#include <linux/mmu_notifier.h>
>   #include <asm/copro.h>
>   #include <asm/pnv-ocxl.h>
>   #include <asm/xive.h>
> @@ -33,6 +35,7 @@
> 
>   #define SPA_PE_VALID		0x80000000
> 
> +struct ocxl_link;
> 
>   struct pe_data {
>   	struct mm_struct *mm;
> @@ -41,6 +44,8 @@ struct pe_data {
>   	/* opaque pointer to be passed to the above callback */
>   	void *xsl_err_data;
>   	struct rcu_head rcu;
> +	struct ocxl_link *link;
> +	struct mmu_notifier mmu_notifier;
>   };
> 
>   struct spa {
> @@ -83,6 +88,8 @@ struct ocxl_link {
>   	int domain;
>   	int bus;
>   	int dev;
> +	void __iomem *arva;     /* ATSD register virtual address */
> +	spinlock_t atsd_lock;   /* to serialize shootdowns */
>   	atomic_t irq_available;
>   	struct spa *spa;
>   	void *platform_data;
> @@ -403,6 +410,11 @@ static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_l
>   	if (rc)
>   		goto err_xsl_irq;
> 
> +	rc = pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0,
> +					  &link->arva);
> +	if (!rc)
> +		spin_lock_init(&link->atsd_lock);
> +


We could use a comment to say that if arva is NULL, then we don't need
MMIO shootdowns and we rely on hardware snooping.

Also, we could always initialize the spin lock; it doesn't hurt and makes
the code more readable.

   Fred
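
A minimal sketch of the alloc_link() change being suggested here (the
comment wording is illustrative, not taken from the patch; the return
code of pnv_ocxl_map_lpar() is deliberately ignored in this sketch,
since a NULL arva already encodes the failure):

	/*
	 * The ATSD registers are optional. If arva stays NULL (the
	 * mapping failed), MMIO shootdowns are not needed and we rely
	 * on hardware snooping of the TLB invalidations instead.
	 */
	pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0, &link->arva);

	/* Always initialize the lock; harmless even when arva is NULL. */
	spin_lock_init(&link->atsd_lock);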


Christoph Hellwig Nov. 24, 2020, 9:17 a.m. UTC | #2
You probably want to add Jason for an audit of new notifier uses.

Jason Gunthorpe Nov. 24, 2020, 1:45 p.m. UTC | #3
On Tue, Nov 24, 2020 at 09:17:38AM +0000, Christoph Hellwig wrote:

> > @@ -470,6 +487,26 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle)
> >  }
> >  EXPORT_SYMBOL_GPL(ocxl_link_release);
> >  
> > +static void invalidate_range(struct mmu_notifier *mn,
> > +			     struct mm_struct *mm,
> > +			     unsigned long start, unsigned long end)
> > +{
> > +	struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
> > +	struct ocxl_link *link = pe_data->link;
> > +	unsigned long addr, pid, page_size = PAGE_SIZE;

The page_size variable seems unnecessary
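
A minimal version of the loop without the local variable (sketch):

	for (addr = start; addr < end; addr += PAGE_SIZE)
		pnv_ocxl_tlb_invalidate(&link->arva, pid, addr);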

> > +
> > +	pid = mm->context.id;
> > +
> > +	spin_lock(&link->atsd_lock);
> > +	for (addr = start; addr < end; addr += page_size)
> > +		pnv_ocxl_tlb_invalidate(&link->arva, pid, addr);
> > +	spin_unlock(&link->atsd_lock);
> > +}
> > +
> > +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
> > +	.invalidate_range = invalidate_range,
> > +};
> > +
> >  static u64 calculate_cfg_state(bool kernel)
> >  {
> >  	u64 state;
> > @@ -526,6 +563,8 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
> >  	pe_data->mm = mm;
> >  	pe_data->xsl_err_cb = xsl_err_cb;
> >  	pe_data->xsl_err_data = xsl_err_data;
> > +	pe_data->link = link;
> > +	pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;
> >  
> >  	memset(pe, 0, sizeof(struct ocxl_process_element));
> >  	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
> > @@ -542,8 +581,16 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
> >  	 * by the nest MMU. If we have a kernel context, TLBIs are
> >  	 * already global.
> >  	 */
> > -	if (mm)
> > +	if (mm) {
> >  		mm_context_add_copro(mm);
> > +		if (link->arva) {
> > +			/* Use MMIO registers for the TLB Invalidate
> > +			 * operations.
> > +			 */
> > +			mmu_notifier_register(&pe_data->mmu_notifier, mm);

Every other place doing stuff like this is de-duplicating the
notifier. If you have multiple clients, this will do multiple redundant
invalidations?

The notifier get/put API is designed to solve that problem: you'd get
a single notifier for the mm and then add the impacted arvas to some
list at the notifier.

Jason
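
A rough sketch of the get/put pattern described above, assuming a single
shared notifier per mm that carries a list of ATSD targets (struct
ocxl_mn, its field names, and the list handling are illustrative
assumptions, not part of the patch):

	struct ocxl_mn {
		struct mmu_notifier mn;
		spinlock_t lock;		/* protects targets */
		struct list_head targets;	/* arvas attached to this mm */
	};

	/* Called by mmu_notifier_get() when no notifier exists for this mm */
	static struct mmu_notifier *ocxl_alloc_notifier(struct mm_struct *mm)
	{
		struct ocxl_mn *omn = kzalloc(sizeof(*omn), GFP_KERNEL);

		if (!omn)
			return ERR_PTR(-ENOMEM);
		spin_lock_init(&omn->lock);
		INIT_LIST_HEAD(&omn->targets);
		return &omn->mn;
	}

	static void ocxl_free_notifier(struct mmu_notifier *mn)
	{
		kfree(container_of(mn, struct ocxl_mn, mn));
	}

	static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
		/* would walk omn->targets, one shootdown per arva */
		.invalidate_range = invalidate_range,
		.alloc_notifier   = ocxl_alloc_notifier,
		.free_notifier    = ocxl_free_notifier,
	};

	/*
	 * In ocxl_link_add_pe(): returns the existing notifier for this
	 * mm if there is one, otherwise allocates one via ->alloc_notifier().
	 */
	mn = mmu_notifier_get(&ocxl_mmu_notifier_ops, mm);

On the remove side, mmu_notifier_put() would replace
mmu_notifier_unregister(), dropping the shared notifier only when the
last PE using that mm goes away.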
Christophe Lombard Nov. 24, 2020, 4:48 p.m. UTC | #4
On 24/11/2020 at 14:45, Jason Gunthorpe wrote:
> On Tue, Nov 24, 2020 at 09:17:38AM +0000, Christoph Hellwig wrote:
>
>>> @@ -470,6 +487,26 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle)
>>>   }
>>>   EXPORT_SYMBOL_GPL(ocxl_link_release);
>>>   
>>> +static void invalidate_range(struct mmu_notifier *mn,
>>> +			     struct mm_struct *mm,
>>> +			     unsigned long start, unsigned long end)
>>> +{
>>> +	struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
>>> +	struct ocxl_link *link = pe_data->link;
>>> +	unsigned long addr, pid, page_size = PAGE_SIZE;
> The page_size variable seems unnecessary
>
>>> +
>>> +	pid = mm->context.id;
>>> +
>>> +	spin_lock(&link->atsd_lock);
>>> +	for (addr = start; addr < end; addr += page_size)
>>> +		pnv_ocxl_tlb_invalidate(&link->arva, pid, addr);
>>> +	spin_unlock(&link->atsd_lock);
>>> +}
>>> +
>>> +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
>>> +	.invalidate_range = invalidate_range,
>>> +};
>>> +
>>>   static u64 calculate_cfg_state(bool kernel)
>>>   {
>>>   	u64 state;
>>> @@ -526,6 +563,8 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
>>>   	pe_data->mm = mm;
>>>   	pe_data->xsl_err_cb = xsl_err_cb;
>>>   	pe_data->xsl_err_data = xsl_err_data;
>>> +	pe_data->link = link;
>>> +	pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;
>>>   
>>>   	memset(pe, 0, sizeof(struct ocxl_process_element));
>>>   	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
>>> @@ -542,8 +581,16 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
>>>   	 * by the nest MMU. If we have a kernel context, TLBIs are
>>>   	 * already global.
>>>   	 */
>>> -	if (mm)
>>> +	if (mm) {
>>>   		mm_context_add_copro(mm);
>>> +		if (link->arva) {
>>> +			/* Use MMIO registers for the TLB Invalidate
>>> +			 * operations.
>>> +			 */
>>> +			mmu_notifier_register(&pe_data->mmu_notifier, mm);
> Every other place doing stuff like this is de-duplicating the
> notifier. If you have multiple clients this will do multiple redundant
> invalidations?

We could have multiple clients, although that is not something we see
often; we have only one attach per process. But when there are several,
we must still issue an invalidation for each of them.

>
> The notifier get/put API is designed to solve that problem, you'd get
> a single notifier for the mm and then add the impacted arva's to some
> list at the notifier.

Thanks for the information.
>
> Jason

Patch

diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
index 20444db8a2bb..100bdfe9ec37 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -2,8 +2,10 @@ 
 // Copyright 2017 IBM Corp.
 #include <linux/sched/mm.h>
 #include <linux/mutex.h>
+#include <linux/mm.h>
 #include <linux/mm_types.h>
 #include <linux/mmu_context.h>
+#include <linux/mmu_notifier.h>
 #include <asm/copro.h>
 #include <asm/pnv-ocxl.h>
 #include <asm/xive.h>
@@ -33,6 +35,7 @@ 
 
 #define SPA_PE_VALID		0x80000000
 
+struct ocxl_link;
 
 struct pe_data {
 	struct mm_struct *mm;
@@ -41,6 +44,8 @@  struct pe_data {
 	/* opaque pointer to be passed to the above callback */
 	void *xsl_err_data;
 	struct rcu_head rcu;
+	struct ocxl_link *link;
+	struct mmu_notifier mmu_notifier;
 };
 
 struct spa {
@@ -83,6 +88,8 @@  struct ocxl_link {
 	int domain;
 	int bus;
 	int dev;
+	void __iomem *arva;     /* ATSD register virtual address */
+	spinlock_t atsd_lock;   /* to serialize shootdowns */
 	atomic_t irq_available;
 	struct spa *spa;
 	void *platform_data;
@@ -403,6 +410,11 @@  static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_l
 	if (rc)
 		goto err_xsl_irq;
 
+	rc = pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0,
+					  &link->arva);
+	if (!rc)
+		spin_lock_init(&link->atsd_lock);
+
 	*out_link = link;
 	return 0;
 
@@ -454,6 +466,11 @@  static void release_xsl(struct kref *ref)
 {
 	struct ocxl_link *link = container_of(ref, struct ocxl_link, ref);
 
+	if (link->arva) {
+		pnv_ocxl_unmap_lpar(&link->arva);
+		link->arva = NULL;
+	}
+
 	list_del(&link->list);
 	/* call platform code before releasing data */
 	pnv_ocxl_spa_release(link->platform_data);
@@ -470,6 +487,26 @@  void ocxl_link_release(struct pci_dev *dev, void *link_handle)
 }
 EXPORT_SYMBOL_GPL(ocxl_link_release);
 
+static void invalidate_range(struct mmu_notifier *mn,
+			     struct mm_struct *mm,
+			     unsigned long start, unsigned long end)
+{
+	struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
+	struct ocxl_link *link = pe_data->link;
+	unsigned long addr, pid, page_size = PAGE_SIZE;
+
+	pid = mm->context.id;
+
+	spin_lock(&link->atsd_lock);
+	for (addr = start; addr < end; addr += page_size)
+		pnv_ocxl_tlb_invalidate(&link->arva, pid, addr);
+	spin_unlock(&link->atsd_lock);
+}
+
+static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
+	.invalidate_range = invalidate_range,
+};
+
 static u64 calculate_cfg_state(bool kernel)
 {
 	u64 state;
@@ -526,6 +563,8 @@  int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
 	pe_data->mm = mm;
 	pe_data->xsl_err_cb = xsl_err_cb;
 	pe_data->xsl_err_data = xsl_err_data;
+	pe_data->link = link;
+	pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;
 
 	memset(pe, 0, sizeof(struct ocxl_process_element));
 	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
@@ -542,8 +581,16 @@  int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
 	 * by the nest MMU. If we have a kernel context, TLBIs are
 	 * already global.
 	 */
-	if (mm)
+	if (mm) {
 		mm_context_add_copro(mm);
+		if (link->arva) {
+			/* Use MMIO registers for the TLB Invalidate
+			 * operations.
+			 */
+			mmu_notifier_register(&pe_data->mmu_notifier, mm);
+		}
+	}
+
 	/*
 	 * Barrier is to make sure PE is visible in the SPA before it
 	 * is used by the device. It also helps with the global TLBI
@@ -674,6 +721,15 @@  int ocxl_link_remove_pe(void *link_handle, int pasid)
 		WARN(1, "Couldn't find pe data when removing PE\n");
 	} else {
 		if (pe_data->mm) {
+			if (link->arva) {
+				mmu_notifier_unregister(&pe_data->mmu_notifier,
+							pe_data->mm);
+				spin_lock(&link->atsd_lock);
+				pnv_ocxl_tlb_invalidate(&link->arva,
+							pe_data->mm->context.id,
+							0ull);
+				spin_unlock(&link->atsd_lock);
+			}
 			mm_context_remove_copro(pe_data->mm);
 			mmdrop(pe_data->mm);
 		}