diff mbox series

[4/4,Focal/linux-azure] PCI: hv: Fix interrupt mapping for multi-MSI

Message ID 20220713125649.24063-5-tim.gardner@canonical.com
State New
Headers show
Series Azure: Support multi-MSI | expand

Commit Message

Tim Gardner July 13, 2022, 12:56 p.m. UTC
From: Jeffrey Hugo <quic_jhugo@quicinc.com>

BugLink: https://bugs.launchpad.net/bugs/1981577

According to Dexuan, the hypervisor folks beleive that multi-msi
allocations are not correct.  compose_msi_msg() will allocate multi-msi
one by one.  However, multi-msi is a block of related MSIs, with alignment
requirements.  In order for the hypervisor to allocate properly aligned
and consecutive entries in the IOMMU Interrupt Remapping Table, there
should be a single mapping request that requests all of the multi-msi
vectors in one shot.

Dexuan suggests detecting the multi-msi case and composing a single
request related to the first MSI.  Then for the other MSIs in the same
block, use the cached information.  This appears to be viable, so do it.

Suggested-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Reviewed-by: Dexuan Cui <decui@microsoft.com>
Tested-by: Michael Kelley <mikelley@microsoft.com>
Link: https://lore.kernel.org/r/1652282599-21643-1-git-send-email-quic_jhugo@quicinc.com
Signed-off-by: Wei Liu <wei.liu@kernel.org>
(backported from commit a2bad844a67b1c7740bda63e87453baf63c3a7f7)
[rtg - msi_desc attributes are in a different place in the structure ]
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
---
 drivers/pci/controller/pci-hyperv.c | 67 ++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 10 deletions(-)

Comments

Bartlomiej Zolnierkiewicz July 14, 2022, 9:08 a.m. UTC | #1
Hi Tim,

On Wed, Jul 13, 2022 at 2:57 PM Tim Gardner <tim.gardner@canonical.com> wrote:
>
> From: Jeffrey Hugo <quic_jhugo@quicinc.com>
>
> BugLink: https://bugs.launchpad.net/bugs/1981577
>
> According to Dexuan, the hypervisor folks beleive that multi-msi
> allocations are not correct.  compose_msi_msg() will allocate multi-msi
> one by one.  However, multi-msi is a block of related MSIs, with alignment
> requirements.  In order for the hypervisor to allocate properly aligned
> and consecutive entries in the IOMMU Interrupt Remapping Table, there
> should be a single mapping request that requests all of the multi-msi
> vectors in one shot.
>
> Dexuan suggests detecting the multi-msi case and composing a single
> request related to the first MSI.  Then for the other MSIs in the same
> block, use the cached information.  This appears to be viable, so do it.
>
> Suggested-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
> Reviewed-by: Dexuan Cui <decui@microsoft.com>
> Tested-by: Michael Kelley <mikelley@microsoft.com>
> Link: https://lore.kernel.org/r/1652282599-21643-1-git-send-email-quic_jhugo@quicinc.com
> Signed-off-by: Wei Liu <wei.liu@kernel.org>
> (backported from commit a2bad844a67b1c7740bda63e87453baf63c3a7f7)
> [rtg - msi_desc attributes are in a different place in the structure ]
> Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
> ---
>  drivers/pci/controller/pci-hyperv.c | 67 ++++++++++++++++++++++++-----
>  1 file changed, 57 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
> index 04165db6d385..6ce774de3525 100644
> --- a/drivers/pci/controller/pci-hyperv.c
> +++ b/drivers/pci/controller/pci-hyperv.c
> @@ -1398,6 +1398,10 @@ static void hv_int_desc_free(struct hv_pci_dev *hpdev,
>                 u8 buffer[sizeof(struct pci_delete_interrupt)];
>         } ctxt;
>
> +       if (!int_desc->vector_count) {
> +               kfree(int_desc);
> +               return;
> +       }
>         memset(&ctxt, 0, sizeof(ctxt));
>         int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
>         int_pkt->message_type.type =
> @@ -1491,8 +1495,15 @@ static void hv_irq_unmask(struct irq_data *data)
>         memset(params, 0, sizeof(*params));
>         params->partition_id = HV_PARTITION_ID_SELF;
>         params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
> +#ifdef CONFIG_X86
>         params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
>         params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
> +#elif CONFIG_ARM64
> +       params->int_entry.msi_entry.address = int_desc->address & 0xffffffff;
> +       params->int_entry.msi_entry.data = int_desc->data;
> +#else
> +#error Unsupported architecture
> +#endif

Shouldn't the above change be included in the patch #1/4 instead?

[ Even better if we could backport to focal:linux-azure commit
d06957d7a6929e6a4aa959cb59d66f0c095fc974 ("PCI: hv: Avoid the retarget
interrupt hypercall in irq_unmask() on ARM64") like it is done for
jammy:linux-azure. I assume that it was not backported because it is
more risky for focal as It introduces behavior change for ARM64.. ]

Best regards,
Bartlomiej

>         params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
>                            (hbus->hdev->dev_instance.b[4] << 16) |
>                            (hbus->hdev->dev_instance.b[7] << 8) |
> @@ -1595,12 +1606,12 @@ static void hv_pci_compose_compl(void *context, struct pci_response *resp,
>
>  static u32 hv_compose_msi_req_v1(
>         struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
> -       u32 slot, u8 vector)
> +       u32 slot, u8 vector, u8 vector_count)
>  {
>         int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
>         int_pkt->wslot.slot = slot;
>         int_pkt->int_desc.vector = vector;
> -       int_pkt->int_desc.vector_count = 1;
> +       int_pkt->int_desc.vector_count = vector_count;
>         int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
>
>         /*
> @@ -1623,14 +1634,14 @@ static int hv_compose_msi_req_get_cpu(struct cpumask *affinity)
>
>  static u32 hv_compose_msi_req_v2(
>         struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
> -       u32 slot, u8 vector)
> +       u32 slot, u8 vector, u8 vector_count)
>  {
>         int cpu;
>
>         int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
>         int_pkt->wslot.slot = slot;
>         int_pkt->int_desc.vector = vector;
> -       int_pkt->int_desc.vector_count = 1;
> +       int_pkt->int_desc.vector_count = vector_count;
>         int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
>         cpu = hv_compose_msi_req_get_cpu(affinity);
>         int_pkt->int_desc.processor_array[0] =
> @@ -1642,7 +1653,7 @@ static u32 hv_compose_msi_req_v2(
>
>  static u32 hv_compose_msi_req_v3(
>         struct pci_create_interrupt3 *int_pkt, struct cpumask *affinity,
> -       u32 slot, u32 vector)
> +       u32 slot, u32 vector, u8 vector_count)
>  {
>         int cpu;
>
> @@ -1650,7 +1661,7 @@ static u32 hv_compose_msi_req_v3(
>         int_pkt->wslot.slot = slot;
>         int_pkt->int_desc.vector = vector;
>         int_pkt->int_desc.reserved = 0;
> -       int_pkt->int_desc.vector_count = 1;
> +       int_pkt->int_desc.vector_count = vector_count;
>         int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
>         cpu = hv_compose_msi_req_get_cpu(affinity);
>         int_pkt->int_desc.processor_array[0] =
> @@ -1681,6 +1692,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>         struct cpumask *dest;
>         struct compose_comp_ctxt comp;
>         struct tran_int_desc *int_desc;
> +       struct msi_desc *msi_desc;
> +       u8 vector, vector_count;
>         struct {
>                 struct pci_packet pci_pkt;
>                 union {
> @@ -1702,7 +1715,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>                 return;
>         }
>
> -       pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
> +       msi_desc  = irq_data_get_msi_desc(data);
> +       pdev = msi_desc_to_pci_dev(msi_desc);
>         dest = irq_data_get_effective_affinity_mask(data);
>         pbus = pdev->bus;
>         hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
> @@ -1715,6 +1729,36 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>         if (!int_desc)
>                 goto drop_reference;
>
> +       if (!msi_desc->msi_attrib.is_msix && msi_desc->nvec_used > 1) {
> +               /*
> +                * If this is not the first MSI of Multi MSI, we already have
> +                * a mapping.  Can exit early.
> +                */
> +               if (msi_desc->irq != data->irq) {
> +                       data->chip_data = int_desc;
> +                       int_desc->address = msi_desc->msg.address_lo |
> +                                           (u64)msi_desc->msg.address_hi << 32;
> +                       int_desc->data = msi_desc->msg.data +
> +                                        (data->irq - msi_desc->irq);
> +                       msg->address_hi = msi_desc->msg.address_hi;
> +                       msg->address_lo = msi_desc->msg.address_lo;
> +                       msg->data = int_desc->data;
> +                       put_pcichild(hpdev);
> +                       return;
> +               }
> +               /*
> +                * The vector we select here is a dummy value.  The correct
> +                * value gets sent to the hypervisor in unmask().  This needs
> +                * to be aligned with the count, and also not zero.  Multi-msi
> +                * is powers of 2 up to 32, so 32 will always work here.
> +                */
> +               vector = 32;
> +               vector_count = msi_desc->nvec_used;
> +       } else {
> +               vector = hv_msi_get_int_vector(data);
> +               vector_count = 1;
> +       }
> +
>         memset(&ctxt, 0, sizeof(ctxt));
>         init_completion(&comp.comp_pkt.host_event);
>         ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
> @@ -1725,7 +1769,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>                 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
>                                         dest,
>                                         hpdev->desc.win_slot.slot,
> -                                       hv_msi_get_int_vector(data));
> +                                       vector,
> +                                       vector_count);
>                 break;
>
>         case PCI_PROTOCOL_VERSION_1_2:
> @@ -1733,14 +1778,16 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
>                 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
>                                         dest,
>                                         hpdev->desc.win_slot.slot,
> -                                       hv_msi_get_int_vector(data));
> +                                       vector,
> +                                       vector_count);
>                 break;
>
>         case PCI_PROTOCOL_VERSION_1_4:
>                 size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
>                                         dest,
>                                         hpdev->desc.win_slot.slot,
> -                                       hv_msi_get_int_vector(data));
> +                                       vector,
> +                                       vector_count);
>                 break;
>
>         default:
> --
> 2.37.0
Tim Gardner July 18, 2022, 5:36 p.m. UTC | #2
On 7/14/22 03:08, Bartlomiej Zolnierkiewicz wrote:
> Hi Tim,
> 
> On Wed, Jul 13, 2022 at 2:57 PM Tim Gardner <tim.gardner@canonical.com> wrote:
>>
>> From: Jeffrey Hugo <quic_jhugo@quicinc.com>
>>
>> BugLink: https://bugs.launchpad.net/bugs/1981577
>>
>> According to Dexuan, the hypervisor folks beleive that multi-msi
>> allocations are not correct.  compose_msi_msg() will allocate multi-msi
>> one by one.  However, multi-msi is a block of related MSIs, with alignment
>> requirements.  In order for the hypervisor to allocate properly aligned
>> and consecutive entries in the IOMMU Interrupt Remapping Table, there
>> should be a single mapping request that requests all of the multi-msi
>> vectors in one shot.
>>
>> Dexuan suggests detecting the multi-msi case and composing a single
>> request related to the first MSI.  Then for the other MSIs in the same
>> block, use the cached information.  This appears to be viable, so do it.
>>
>> Suggested-by: Dexuan Cui <decui@microsoft.com>
>> Signed-off-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
>> Reviewed-by: Dexuan Cui <decui@microsoft.com>
>> Tested-by: Michael Kelley <mikelley@microsoft.com>
>> Link: https://lore.kernel.org/r/1652282599-21643-1-git-send-email-quic_jhugo@quicinc.com
>> Signed-off-by: Wei Liu <wei.liu@kernel.org>
>> (backported from commit a2bad844a67b1c7740bda63e87453baf63c3a7f7)
>> [rtg - msi_desc attributes are in a different place in the structure ]
>> Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
>> ---
>>   drivers/pci/controller/pci-hyperv.c | 67 ++++++++++++++++++++++++-----
>>   1 file changed, 57 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
>> index 04165db6d385..6ce774de3525 100644
>> --- a/drivers/pci/controller/pci-hyperv.c
>> +++ b/drivers/pci/controller/pci-hyperv.c
>> @@ -1398,6 +1398,10 @@ static void hv_int_desc_free(struct hv_pci_dev *hpdev,
>>                  u8 buffer[sizeof(struct pci_delete_interrupt)];
>>          } ctxt;
>>
>> +       if (!int_desc->vector_count) {
>> +               kfree(int_desc);
>> +               return;
>> +       }
>>          memset(&ctxt, 0, sizeof(ctxt));
>>          int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
>>          int_pkt->message_type.type =
>> @@ -1491,8 +1495,15 @@ static void hv_irq_unmask(struct irq_data *data)
>>          memset(params, 0, sizeof(*params));
>>          params->partition_id = HV_PARTITION_ID_SELF;
>>          params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
>> +#ifdef CONFIG_X86
>>          params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
>>          params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
>> +#elif CONFIG_ARM64
>> +       params->int_entry.msi_entry.address = int_desc->address & 0xffffffff;
>> +       params->int_entry.msi_entry.data = int_desc->data;
>> +#else
>> +#error Unsupported architecture
>> +#endif
> 
> Shouldn't the above change be included in the patch #1/4 instead?
> 

That block really should have gone into patch 2 in order to enable 
bisecting to work. I'll do that when I apply the patches, but otherwise 
won't change the code.

> [ Even better if we could backport to focal:linux-azure commit
> d06957d7a6929e6a4aa959cb59d66f0c095fc974 ("PCI: hv: Avoid the retarget
> interrupt hypercall in irq_unmask() on ARM64") like it is done for
> jammy:linux-azure. I assume that it was not backported because it is
> more risky for focal as It introduces behavior change for ARM64.. ]
> 

This is at an awkward point in the architecture code consolidation. I 
tried adding d06957d7a6929e6a4aa959cb59d66f0c095fc974 ("PCI: hv: Avoid 
the retarget interrupt hypercall in irq_unmask() on ARM64") but it 
caused more back port issues then it was worth.

This is the code that Microsoft tested, so I'm content to leave it as is.

rtg
diff mbox series

Patch

diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 04165db6d385..6ce774de3525 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -1398,6 +1398,10 @@  static void hv_int_desc_free(struct hv_pci_dev *hpdev,
 		u8 buffer[sizeof(struct pci_delete_interrupt)];
 	} ctxt;
 
+	if (!int_desc->vector_count) {
+		kfree(int_desc);
+		return;
+	}
 	memset(&ctxt, 0, sizeof(ctxt));
 	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
 	int_pkt->message_type.type =
@@ -1491,8 +1495,15 @@  static void hv_irq_unmask(struct irq_data *data)
 	memset(params, 0, sizeof(*params));
 	params->partition_id = HV_PARTITION_ID_SELF;
 	params->int_entry.source = HV_INTERRUPT_SOURCE_MSI;
+#ifdef CONFIG_X86
 	params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff;
 	params->int_entry.msi_entry.data.as_uint32 = int_desc->data;
+#elif CONFIG_ARM64
+	params->int_entry.msi_entry.address = int_desc->address & 0xffffffff;
+	params->int_entry.msi_entry.data = int_desc->data;
+#else
+#error Unsupported architecture
+#endif
 	params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
 			   (hbus->hdev->dev_instance.b[4] << 16) |
 			   (hbus->hdev->dev_instance.b[7] << 8) |
@@ -1595,12 +1606,12 @@  static void hv_pci_compose_compl(void *context, struct pci_response *resp,
 
 static u32 hv_compose_msi_req_v1(
 	struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
-	u32 slot, u8 vector)
+	u32 slot, u8 vector, u8 vector_count)
 {
 	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
 	int_pkt->wslot.slot = slot;
 	int_pkt->int_desc.vector = vector;
-	int_pkt->int_desc.vector_count = 1;
+	int_pkt->int_desc.vector_count = vector_count;
 	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
 
 	/*
@@ -1623,14 +1634,14 @@  static int hv_compose_msi_req_get_cpu(struct cpumask *affinity)
 
 static u32 hv_compose_msi_req_v2(
 	struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
-	u32 slot, u8 vector)
+	u32 slot, u8 vector, u8 vector_count)
 {
 	int cpu;
 
 	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
 	int_pkt->wslot.slot = slot;
 	int_pkt->int_desc.vector = vector;
-	int_pkt->int_desc.vector_count = 1;
+	int_pkt->int_desc.vector_count = vector_count;
 	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
 	cpu = hv_compose_msi_req_get_cpu(affinity);
 	int_pkt->int_desc.processor_array[0] =
@@ -1642,7 +1653,7 @@  static u32 hv_compose_msi_req_v2(
 
 static u32 hv_compose_msi_req_v3(
 	struct pci_create_interrupt3 *int_pkt, struct cpumask *affinity,
-	u32 slot, u32 vector)
+	u32 slot, u32 vector, u8 vector_count)
 {
 	int cpu;
 
@@ -1650,7 +1661,7 @@  static u32 hv_compose_msi_req_v3(
 	int_pkt->wslot.slot = slot;
 	int_pkt->int_desc.vector = vector;
 	int_pkt->int_desc.reserved = 0;
-	int_pkt->int_desc.vector_count = 1;
+	int_pkt->int_desc.vector_count = vector_count;
 	int_pkt->int_desc.delivery_mode = DELIVERY_MODE;
 	cpu = hv_compose_msi_req_get_cpu(affinity);
 	int_pkt->int_desc.processor_array[0] =
@@ -1681,6 +1692,8 @@  static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 	struct cpumask *dest;
 	struct compose_comp_ctxt comp;
 	struct tran_int_desc *int_desc;
+	struct msi_desc *msi_desc;
+	u8 vector, vector_count;
 	struct {
 		struct pci_packet pci_pkt;
 		union {
@@ -1702,7 +1715,8 @@  static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 		return;
 	}
 
-	pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
+	msi_desc  = irq_data_get_msi_desc(data);
+	pdev = msi_desc_to_pci_dev(msi_desc);
 	dest = irq_data_get_effective_affinity_mask(data);
 	pbus = pdev->bus;
 	hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
@@ -1715,6 +1729,36 @@  static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 	if (!int_desc)
 		goto drop_reference;
 
+	if (!msi_desc->msi_attrib.is_msix && msi_desc->nvec_used > 1) {
+		/*
+		 * If this is not the first MSI of Multi MSI, we already have
+		 * a mapping.  Can exit early.
+		 */
+		if (msi_desc->irq != data->irq) {
+			data->chip_data = int_desc;
+			int_desc->address = msi_desc->msg.address_lo |
+					    (u64)msi_desc->msg.address_hi << 32;
+			int_desc->data = msi_desc->msg.data +
+					 (data->irq - msi_desc->irq);
+			msg->address_hi = msi_desc->msg.address_hi;
+			msg->address_lo = msi_desc->msg.address_lo;
+			msg->data = int_desc->data;
+			put_pcichild(hpdev);
+			return;
+		}
+		/*
+		 * The vector we select here is a dummy value.  The correct
+		 * value gets sent to the hypervisor in unmask().  This needs
+		 * to be aligned with the count, and also not zero.  Multi-msi
+		 * is powers of 2 up to 32, so 32 will always work here.
+		 */
+		vector = 32;
+		vector_count = msi_desc->nvec_used;
+	} else {
+		vector = hv_msi_get_int_vector(data);
+		vector_count = 1;
+	}
+
 	memset(&ctxt, 0, sizeof(ctxt));
 	init_completion(&comp.comp_pkt.host_event);
 	ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
@@ -1725,7 +1769,8 @@  static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 		size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
 					dest,
 					hpdev->desc.win_slot.slot,
-					hv_msi_get_int_vector(data));
+					vector,
+					vector_count);
 		break;
 
 	case PCI_PROTOCOL_VERSION_1_2:
@@ -1733,14 +1778,16 @@  static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
 		size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
 					dest,
 					hpdev->desc.win_slot.slot,
-					hv_msi_get_int_vector(data));
+					vector,
+					vector_count);
 		break;
 
 	case PCI_PROTOCOL_VERSION_1_4:
 		size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3,
 					dest,
 					hpdev->desc.win_slot.slot,
-					hv_msi_get_int_vector(data));
+					vector,
+					vector_count);
 		break;
 
 	default: