diff mbox series

[v2] PCI: Fix crash during pci_dev hot-unplug on pseries KVM guest

Message ID 20240715080726.2496198-1-amachhiw@linux.ibm.com
State New
Headers show
Series [v2] PCI: Fix crash during pci_dev hot-unplug on pseries KVM guest | expand

Commit Message

Amit Machhiwal July 15, 2024, 8:07 a.m. UTC
With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
of a PCI device attached to a PCI-bridge causes following kernel Oops on
a pseries KVM guest:

 RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
 Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
 BUG: Unable to handle kernel data access on read at 0x10ec00000048
 Faulting instruction address: 0xc0000000012d8728
 Oops: Kernel access of bad area, sig: 11 [#1]
 LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
<snip>
 NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
 LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
 Call Trace:
 [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
 [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
 [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
 [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
 [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
 [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
 [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
 [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
 [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
 [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
 [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
 [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
<snip>

A git bisect pointed this regression to be introduced via [1] that added
a mechanism to create device tree nodes for parent PCI bridges when a
PCI device is hot-plugged.

The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
device-tree node associated with the pci_dev that was earlier
hot-plugged and was attached under a pci-bridge. The PCI dev header
`dev->hdr_type` being 0, results a conditional check done with
`pci_is_bridge()` into false. Consequently, a call to
`of_pci_make_dev_node()` to create a device node is never made. When at
a later point in time, in the device node removal path, a memcpy is
attempted in `__of_changeset_entry_invert()`; since the device node was
never created, results in an Oops due to kernel read access to a bad
address.

To fix this issue, the patch updates `of_changeset_create_node()` to
allocate a new node only when the device node doesn't exist and init it
in case it does already. Also, introduce `of_pci_free_node()` to be
called to only revert and destroy the changeset device node that was
created via a call to `of_changeset_create_node()`.

[1] commit 407d1a51921e ("PCI: Create device tree node for bridge")

Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
---
Changes since v1:
    * Included Lizhi's suggested changes on V1
    * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
      part a bit in `of_pci_make_dev_node`
	drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
	  611 | void of_pci_free_node(struct device_node *np)
	      |      ^~~~~~~~~~~~~~~~               
	drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
	drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
	  696 | out_destroy_cset:       
	      | ^~~~~~~~~~~~~~~~  
    * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/

 drivers/of/dynamic.c  | 16 ++++++++++++----
 drivers/of/unittest.c |  2 +-
 drivers/pci/bus.c     |  3 +--
 drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
 drivers/pci/pci.h     |  2 ++
 include/linux/of.h    |  1 +
 6 files changed, 43 insertions(+), 20 deletions(-)


base-commit: 43db1e03c086ed20cc75808d3f45e780ec4ca26e

Comments

Lizhi Hou July 15, 2024, 4:20 p.m. UTC | #1
On 7/15/24 01:07, Amit Machhiwal wrote:
> With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
> of a PCI device attached to a PCI-bridge causes following kernel Oops on
> a pseries KVM guest:
>
>   RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
>   Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
>   BUG: Unable to handle kernel data access on read at 0x10ec00000048
>   Faulting instruction address: 0xc0000000012d8728
>   Oops: Kernel access of bad area, sig: 11 [#1]
>   LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> <snip>
>   NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
>   LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
>   Call Trace:
>   [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
>   [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
>   [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
>   [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
>   [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
>   [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
>   [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
>   [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
>   [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
>   [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
>   [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
>   [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
> <snip>
>
> A git bisect pointed this regression to be introduced via [1] that added
> a mechanism to create device tree nodes for parent PCI bridges when a
> PCI device is hot-plugged.
>
> The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
> device-tree node associated with the pci_dev that was earlier
> hot-plugged and was attached under a pci-bridge. The PCI dev header
> `dev->hdr_type` being 0, results a conditional check done with
> `pci_is_bridge()` into false. Consequently, a call to
> `of_pci_make_dev_node()` to create a device node is never made. When at
> a later point in time, in the device node removal path, a memcpy is
> attempted in `__of_changeset_entry_invert()`; since the device node was
> never created, results in an Oops due to kernel read access to a bad
> address.
>
> To fix this issue, the patch updates `of_changeset_create_node()` to
> allocate a new node only when the device node doesn't exist and init it
> in case it does already. Also, introduce `of_pci_free_node()` to be
> called to only revert and destroy the changeset device node that was
> created via a call to `of_changeset_create_node()`.
>
> [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
>
> Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
> Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
> ---
> Changes since v1:
>      * Included Lizhi's suggested changes on V1
>      * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
>        part a bit in `of_pci_make_dev_node`
> 	drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
> 	  611 | void of_pci_free_node(struct device_node *np)
> 	      |      ^~~~~~~~~~~~~~~~
> 	drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
> 	drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
> 	  696 | out_destroy_cset:
> 	      | ^~~~~~~~~~~~~~~~
>      * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
>
>   drivers/of/dynamic.c  | 16 ++++++++++++----
>   drivers/of/unittest.c |  2 +-
>   drivers/pci/bus.c     |  3 +--
>   drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
>   drivers/pci/pci.h     |  2 ++
>   include/linux/of.h    |  1 +
>   6 files changed, 43 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> index dda6092e6d3a..9bba5e82a384 100644
> --- a/drivers/of/dynamic.c
> +++ b/drivers/of/dynamic.c
> @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
>    * a given changeset.
>    *
>    * @ocs: Pointer to changeset
> + * @np: Pointer to device node. If null, allocate a new node. If not, init an
> + *	existing one.
>    * @parent: Pointer to parent device node
>    * @full_name: Node full name
>    *
>    * Return: Pointer to the created device node or NULL in case of an error.
>    */
>   struct device_node *of_changeset_create_node(struct of_changeset *ocs,
> +					     struct device_node *np,
>   					     struct device_node *parent,
>   					     const char *full_name)
>   {
> -	struct device_node *np;
>   	int ret;
>   
> -	np = __of_node_dup(NULL, full_name);
> -	if (!np)
> -		return NULL;
> +	if (!np) {
> +		np = __of_node_dup(NULL, full_name);
> +		if (!np)
> +			return NULL;
> +	} else {
> +		of_node_set_flag(np, OF_DYNAMIC);
> +		of_node_set_flag(np, OF_DETACHED);
> +	}
> +
>   	np->parent = parent;
>   
>   	ret = of_changeset_attach_node(ocs, np);
> diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
> index 445ad13dab98..b1bcc9ed40a6 100644
> --- a/drivers/of/unittest.c
> +++ b/drivers/of/unittest.c
> @@ -871,7 +871,7 @@ static void __init of_unittest_changeset(void)
>   	unittest(!of_changeset_add_property(&chgset, parent, ppadd), "fail add prop prop-add\n");
>   	unittest(!of_changeset_update_property(&chgset, parent, ppupdate), "fail update prop\n");
>   	unittest(!of_changeset_remove_property(&chgset, parent, ppremove), "fail remove prop\n");
> -	n22 = of_changeset_create_node(&chgset, n2, "n22");
> +	n22 = of_changeset_create_node(&chgset, NULL,  n2, "n22");
>   	unittest(n22, "fail create n22\n");
>   	unittest(!of_changeset_add_prop_string(&chgset, n22, "prop-str", "abcd"),
>   		 "fail add prop prop-str");
> diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
> index 826b5016a101..d7ca20cb146a 100644
> --- a/drivers/pci/bus.c
> +++ b/drivers/pci/bus.c
> @@ -342,8 +342,7 @@ void pci_bus_add_device(struct pci_dev *dev)
>   	 */
>   	pcibios_bus_add_device(dev);
>   	pci_fixup_device(pci_fixup_final, dev);
> -	if (pci_is_bridge(dev))
> -		of_pci_make_dev_node(dev);
> +	of_pci_make_dev_node(dev);

Please undo this change. It should only create the device node for 
bridges and the pci endpoints listed in quirks for now.


Thanks,

Lizhi

>   	pci_create_sysfs_dev_files(dev);
>   	pci_proc_attach_device(dev);
>   	pci_bridge_d3_update(dev);
> diff --git a/drivers/pci/of.c b/drivers/pci/of.c
> index 51e3dd0ea5ab..883bf15211a5 100644
> --- a/drivers/pci/of.c
> +++ b/drivers/pci/of.c
> @@ -608,18 +608,28 @@ int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge)
>   
>   #ifdef CONFIG_PCI_DYNAMIC_OF_NODES
>   
> +void of_pci_free_node(struct device_node *np)
> +{
> +	struct of_changeset *cset;
> +
> +	cset = (struct of_changeset *)(np + 1);
> +
> +	np->data = NULL;
> +	of_changeset_revert(cset);
> +	of_changeset_destroy(cset);
> +	of_node_put(np);
> +}
> +
>   void of_pci_remove_node(struct pci_dev *pdev)
>   {
>   	struct device_node *np;
>   
>   	np = pci_device_to_OF_node(pdev);
> -	if (!np || !of_node_check_flag(np, OF_DYNAMIC))
> +	if (!np || np->data != of_pci_free_node)
>   		return;
>   	pdev->dev.of_node = NULL;
>   
> -	of_changeset_revert(np->data);
> -	of_changeset_destroy(np->data);
> -	of_node_put(np);
> +	of_pci_free_node(np);
>   }
>   
>   void of_pci_make_dev_node(struct pci_dev *pdev)
> @@ -655,14 +665,18 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
>   	if (!name)
>   		return;
>   
> -	cset = kmalloc(sizeof(*cset), GFP_KERNEL);
> -	if (!cset)
> +	np = kzalloc(sizeof(*np) + sizeof(*cset), GFP_KERNEL);
> +	if (!np)
>   		goto out_free_name;
> +	np->full_name = name;
> +	of_node_init(np);
> +
> +	cset = (struct of_changeset *)(np + 1);
>   	of_changeset_init(cset);
>   
> -	np = of_changeset_create_node(cset, ppnode, name);
> +	np = of_changeset_create_node(cset, np, ppnode, NULL);
>   	if (!np)
> -		goto out_destroy_cset;
> +		goto out_free_node;
>   
>   	ret = of_pci_add_properties(pdev, cset, np);
>   	if (ret)
> @@ -670,19 +684,18 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
>   
>   	ret = of_changeset_apply(cset);
>   	if (ret)
> -		goto out_free_node;
> +		goto out_destroy_cset;
>   
> -	np->data = cset;
> +	np->data = of_pci_free_node;
>   	pdev->dev.of_node = np;
> -	kfree(name);
>   
>   	return;
>   
> -out_free_node:
> -	of_node_put(np);
>   out_destroy_cset:
>   	of_changeset_destroy(cset);
>   	kfree(cset);
> +out_free_node:
> +	of_node_put(np);
>   out_free_name:
>   	kfree(name);
>   }
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index fd44565c4756..7b1a455306b8 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -702,11 +702,13 @@ struct of_changeset;
>   
>   #ifdef CONFIG_PCI_DYNAMIC_OF_NODES
>   void of_pci_make_dev_node(struct pci_dev *pdev);
> +void of_pci_free_node(struct device_node *np);
>   void of_pci_remove_node(struct pci_dev *pdev);
>   int of_pci_add_properties(struct pci_dev *pdev, struct of_changeset *ocs,
>   			  struct device_node *np);
>   #else
>   static inline void of_pci_make_dev_node(struct pci_dev *pdev) { }
> +static inline void of_pci_free_node(struct device_node *np) { }
>   static inline void of_pci_remove_node(struct pci_dev *pdev) { }
>   #endif
>   
> diff --git a/include/linux/of.h b/include/linux/of.h
> index a0bedd038a05..f774459d0d84 100644
> --- a/include/linux/of.h
> +++ b/include/linux/of.h
> @@ -1631,6 +1631,7 @@ static inline int of_changeset_update_property(struct of_changeset *ocs,
>   }
>   
>   struct device_node *of_changeset_create_node(struct of_changeset *ocs,
> +					     struct device_node *np,
>   					     struct device_node *parent,
>   					     const char *full_name);
>   int of_changeset_add_prop_string(struct of_changeset *ocs,
>
> base-commit: 43db1e03c086ed20cc75808d3f45e780ec4ca26e
Bjorn Helgaas July 15, 2024, 5:23 p.m. UTC | #2
On Mon, Jul 15, 2024 at 09:20:01AM -0700, Lizhi Hou wrote:
> On 7/15/24 01:07, Amit Machhiwal wrote:
> > With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
> > of a PCI device attached to a PCI-bridge causes following kernel Oops on
> > a pseries KVM guest:
> > 
> >   RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
> >   Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
> >   BUG: Unable to handle kernel data access on read at 0x10ec00000048
> >   Faulting instruction address: 0xc0000000012d8728
> >   Oops: Kernel access of bad area, sig: 11 [#1]
> >   LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> > <snip>
> >   NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
> >   LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
> >   Call Trace:
> >   [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
> >   [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
> >   [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
> >   [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
> >   [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
> >   [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
> >   [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
> >   [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
> >   [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
> >   [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
> >   [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
> >   [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
> > <snip>
> > 
> > A git bisect pointed this regression to be introduced via [1] that added
> > a mechanism to create device tree nodes for parent PCI bridges when a
> > PCI device is hot-plugged.
> > 
> > The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
> > device-tree node associated with the pci_dev that was earlier
> > hot-plugged and was attached under a pci-bridge. The PCI dev header
> > `dev->hdr_type` being 0, results a conditional check done with
> > `pci_is_bridge()` into false. Consequently, a call to
> > `of_pci_make_dev_node()` to create a device node is never made. When at
> > a later point in time, in the device node removal path, a memcpy is
> > attempted in `__of_changeset_entry_invert()`; since the device node was
> > never created, results in an Oops due to kernel read access to a bad
> > address.
> > 
> > To fix this issue, the patch updates `of_changeset_create_node()` to
> > allocate a new node only when the device node doesn't exist and init it
> > in case it does already. Also, introduce `of_pci_free_node()` to be
> > called to only revert and destroy the changeset device node that was
> > created via a call to `of_changeset_create_node()`.
> > 
> > [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
> > 
> > Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
> > Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
> > Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> > Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
> > ---
> > Changes since v1:
> >      * Included Lizhi's suggested changes on V1
> >      * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
> >        part a bit in `of_pci_make_dev_node`
> > 	drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
> > 	  611 | void of_pci_free_node(struct device_node *np)
> > 	      |      ^~~~~~~~~~~~~~~~
> > 	drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
> > 	drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
> > 	  696 | out_destroy_cset:
> > 	      | ^~~~~~~~~~~~~~~~
> >      * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
> > 
> >   drivers/of/dynamic.c  | 16 ++++++++++++----
> >   drivers/of/unittest.c |  2 +-
> >   drivers/pci/bus.c     |  3 +--
> >   drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
> >   drivers/pci/pci.h     |  2 ++
> >   include/linux/of.h    |  1 +
> >   6 files changed, 43 insertions(+), 20 deletions(-)
> > 
> > diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> > index dda6092e6d3a..9bba5e82a384 100644
> > --- a/drivers/of/dynamic.c
> > +++ b/drivers/of/dynamic.c
> > @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
> >    * a given changeset.
> >    *
> >    * @ocs: Pointer to changeset
> > + * @np: Pointer to device node. If null, allocate a new node. If not, init an
> > + *	existing one.
> >    * @parent: Pointer to parent device node
> >    * @full_name: Node full name
> >    *
> >    * Return: Pointer to the created device node or NULL in case of an error.
> >    */
> >   struct device_node *of_changeset_create_node(struct of_changeset *ocs,
> > +					     struct device_node *np,
> >   					     struct device_node *parent,
> >   					     const char *full_name)
> >   {
> > -	struct device_node *np;
> >   	int ret;
> > -	np = __of_node_dup(NULL, full_name);
> > -	if (!np)
> > -		return NULL;
> > +	if (!np) {
> > +		np = __of_node_dup(NULL, full_name);
> > +		if (!np)
> > +			return NULL;
> > +	} else {
> > +		of_node_set_flag(np, OF_DYNAMIC);
> > +		of_node_set_flag(np, OF_DETACHED);
> > +	}
> > +
> >   	np->parent = parent;
> >   	ret = of_changeset_attach_node(ocs, np);
> > diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
> > index 445ad13dab98..b1bcc9ed40a6 100644
> > --- a/drivers/of/unittest.c
> > +++ b/drivers/of/unittest.c
> > @@ -871,7 +871,7 @@ static void __init of_unittest_changeset(void)
> >   	unittest(!of_changeset_add_property(&chgset, parent, ppadd), "fail add prop prop-add\n");
> >   	unittest(!of_changeset_update_property(&chgset, parent, ppupdate), "fail update prop\n");
> >   	unittest(!of_changeset_remove_property(&chgset, parent, ppremove), "fail remove prop\n");
> > -	n22 = of_changeset_create_node(&chgset, n2, "n22");
> > +	n22 = of_changeset_create_node(&chgset, NULL,  n2, "n22");
> >   	unittest(n22, "fail create n22\n");
> >   	unittest(!of_changeset_add_prop_string(&chgset, n22, "prop-str", "abcd"),
> >   		 "fail add prop prop-str");
> > diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
> > index 826b5016a101..d7ca20cb146a 100644
> > --- a/drivers/pci/bus.c
> > +++ b/drivers/pci/bus.c
> > @@ -342,8 +342,7 @@ void pci_bus_add_device(struct pci_dev *dev)
> >   	 */
> >   	pcibios_bus_add_device(dev);
> >   	pci_fixup_device(pci_fixup_final, dev);
> > -	if (pci_is_bridge(dev))
> > -		of_pci_make_dev_node(dev);
> > +	of_pci_make_dev_node(dev);
> 
> Please undo this change. It should only create the device node for bridges
> and the pci endpoints listed in quirks for now.

IIUC, you want Amit to post a v3 of this patch that omits this
specific pci_bus_add_device() change?

> >   	pci_create_sysfs_dev_files(dev);
> >   	pci_proc_attach_device(dev);
> >   	pci_bridge_d3_update(dev);
> > diff --git a/drivers/pci/of.c b/drivers/pci/of.c
> > index 51e3dd0ea5ab..883bf15211a5 100644
> > --- a/drivers/pci/of.c
> > +++ b/drivers/pci/of.c
> > @@ -608,18 +608,28 @@ int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge)
> >   #ifdef CONFIG_PCI_DYNAMIC_OF_NODES
> > +void of_pci_free_node(struct device_node *np)
> > +{
> > +	struct of_changeset *cset;
> > +
> > +	cset = (struct of_changeset *)(np + 1);
> > +
> > +	np->data = NULL;
> > +	of_changeset_revert(cset);
> > +	of_changeset_destroy(cset);
> > +	of_node_put(np);
> > +}
> > +
> >   void of_pci_remove_node(struct pci_dev *pdev)
> >   {
> >   	struct device_node *np;
> >   	np = pci_device_to_OF_node(pdev);
> > -	if (!np || !of_node_check_flag(np, OF_DYNAMIC))
> > +	if (!np || np->data != of_pci_free_node)
> >   		return;
> >   	pdev->dev.of_node = NULL;
> > -	of_changeset_revert(np->data);
> > -	of_changeset_destroy(np->data);
> > -	of_node_put(np);
> > +	of_pci_free_node(np);
> >   }
> >   void of_pci_make_dev_node(struct pci_dev *pdev)
> > @@ -655,14 +665,18 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
> >   	if (!name)
> >   		return;
> > -	cset = kmalloc(sizeof(*cset), GFP_KERNEL);
> > -	if (!cset)
> > +	np = kzalloc(sizeof(*np) + sizeof(*cset), GFP_KERNEL);
> > +	if (!np)
> >   		goto out_free_name;
> > +	np->full_name = name;
> > +	of_node_init(np);
> > +
> > +	cset = (struct of_changeset *)(np + 1);
> >   	of_changeset_init(cset);
> > -	np = of_changeset_create_node(cset, ppnode, name);
> > +	np = of_changeset_create_node(cset, np, ppnode, NULL);
> >   	if (!np)
> > -		goto out_destroy_cset;
> > +		goto out_free_node;
> >   	ret = of_pci_add_properties(pdev, cset, np);
> >   	if (ret)
> > @@ -670,19 +684,18 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
> >   	ret = of_changeset_apply(cset);
> >   	if (ret)
> > -		goto out_free_node;
> > +		goto out_destroy_cset;
> > -	np->data = cset;
> > +	np->data = of_pci_free_node;
> >   	pdev->dev.of_node = np;
> > -	kfree(name);
> >   	return;
> > -out_free_node:
> > -	of_node_put(np);
> >   out_destroy_cset:
> >   	of_changeset_destroy(cset);
> >   	kfree(cset);
> > +out_free_node:
> > +	of_node_put(np);
> >   out_free_name:
> >   	kfree(name);
> >   }
> > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> > index fd44565c4756..7b1a455306b8 100644
> > --- a/drivers/pci/pci.h
> > +++ b/drivers/pci/pci.h
> > @@ -702,11 +702,13 @@ struct of_changeset;
> >   #ifdef CONFIG_PCI_DYNAMIC_OF_NODES
> >   void of_pci_make_dev_node(struct pci_dev *pdev);
> > +void of_pci_free_node(struct device_node *np);
> >   void of_pci_remove_node(struct pci_dev *pdev);
> >   int of_pci_add_properties(struct pci_dev *pdev, struct of_changeset *ocs,
> >   			  struct device_node *np);
> >   #else
> >   static inline void of_pci_make_dev_node(struct pci_dev *pdev) { }
> > +static inline void of_pci_free_node(struct device_node *np) { }
> >   static inline void of_pci_remove_node(struct pci_dev *pdev) { }
> >   #endif
> > diff --git a/include/linux/of.h b/include/linux/of.h
> > index a0bedd038a05..f774459d0d84 100644
> > --- a/include/linux/of.h
> > +++ b/include/linux/of.h
> > @@ -1631,6 +1631,7 @@ static inline int of_changeset_update_property(struct of_changeset *ocs,
> >   }
> >   struct device_node *of_changeset_create_node(struct of_changeset *ocs,
> > +					     struct device_node *np,
> >   					     struct device_node *parent,
> >   					     const char *full_name);
> >   int of_changeset_add_prop_string(struct of_changeset *ocs,
> > 
> > base-commit: 43db1e03c086ed20cc75808d3f45e780ec4ca26e
Rob Herring (Arm) July 15, 2024, 6:55 p.m. UTC | #3
On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
>
> With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
> of a PCI device attached to a PCI-bridge causes following kernel Oops on
> a pseries KVM guest:
>
>  RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
>  Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
>  BUG: Unable to handle kernel data access on read at 0x10ec00000048
>  Faulting instruction address: 0xc0000000012d8728
>  Oops: Kernel access of bad area, sig: 11 [#1]
>  LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> <snip>
>  NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
>  LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
>  Call Trace:
>  [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
>  [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
>  [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
>  [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
>  [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
>  [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
>  [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
>  [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
>  [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
>  [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
>  [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
>  [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
> <snip>
>
> A git bisect pointed this regression to be introduced via [1] that added
> a mechanism to create device tree nodes for parent PCI bridges when a
> PCI device is hot-plugged.
>
> The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
> device-tree node associated with the pci_dev that was earlier
> hot-plugged and was attached under a pci-bridge. The PCI dev header
> `dev->hdr_type` being 0, results a conditional check done with
> `pci_is_bridge()` into false. Consequently, a call to
> `of_pci_make_dev_node()` to create a device node is never made. When at
> a later point in time, in the device node removal path, a memcpy is
> attempted in `__of_changeset_entry_invert()`; since the device node was
> never created, results in an Oops due to kernel read access to a bad
> address.
>
> To fix this issue, the patch updates `of_changeset_create_node()` to
> allocate a new node only when the device node doesn't exist and init it
> in case it does already. Also, introduce `of_pci_free_node()` to be
> called to only revert and destroy the changeset device node that was
> created via a call to `of_changeset_create_node()`.
>
> [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
>
> Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
> Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
> ---
> Changes since v1:
>     * Included Lizhi's suggested changes on V1
>     * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
>       part a bit in `of_pci_make_dev_node`
>         drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
>           611 | void of_pci_free_node(struct device_node *np)
>               |      ^~~~~~~~~~~~~~~~
>         drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
>         drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
>           696 | out_destroy_cset:
>               | ^~~~~~~~~~~~~~~~
>     * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
>
>  drivers/of/dynamic.c  | 16 ++++++++++++----
>  drivers/of/unittest.c |  2 +-
>  drivers/pci/bus.c     |  3 +--
>  drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
>  drivers/pci/pci.h     |  2 ++
>  include/linux/of.h    |  1 +
>  6 files changed, 43 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> index dda6092e6d3a..9bba5e82a384 100644
> --- a/drivers/of/dynamic.c
> +++ b/drivers/of/dynamic.c
> @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
>   * a given changeset.
>   *
>   * @ocs: Pointer to changeset
> + * @np: Pointer to device node. If null, allocate a new node. If not, init an
> + *     existing one.
>   * @parent: Pointer to parent device node
>   * @full_name: Node full name
>   *
>   * Return: Pointer to the created device node or NULL in case of an error.
>   */
>  struct device_node *of_changeset_create_node(struct of_changeset *ocs,
> +                                            struct device_node *np,
>                                              struct device_node *parent,
>                                              const char *full_name)
>  {
> -       struct device_node *np;
>         int ret;
>
> -       np = __of_node_dup(NULL, full_name);
> -       if (!np)
> -               return NULL;
> +       if (!np) {
> +               np = __of_node_dup(NULL, full_name);
> +               if (!np)
> +                       return NULL;
> +       } else {
> +               of_node_set_flag(np, OF_DYNAMIC);
> +               of_node_set_flag(np, OF_DETACHED);

Are we going to rename the function to
of_changeset_create_or_maybe_modify_node()? No. The functions here are
very clear in that they allocate new objects and don't reuse what's
passed in.

Rob
Lizhi Hou July 15, 2024, 8:35 p.m. UTC | #4
On 7/15/24 10:23, Bjorn Helgaas wrote:
> On Mon, Jul 15, 2024 at 09:20:01AM -0700, Lizhi Hou wrote:
>> On 7/15/24 01:07, Amit Machhiwal wrote:
>>> With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
>>> of a PCI device attached to a PCI-bridge causes following kernel Oops on
>>> a pseries KVM guest:
>>>
>>>    RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
>>>    Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
>>>    BUG: Unable to handle kernel data access on read at 0x10ec00000048
>>>    Faulting instruction address: 0xc0000000012d8728
>>>    Oops: Kernel access of bad area, sig: 11 [#1]
>>>    LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
>>> <snip>
>>>    NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
>>>    LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
>>>    Call Trace:
>>>    [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
>>>    [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
>>>    [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
>>>    [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
>>>    [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
>>>    [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
>>>    [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
>>>    [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
>>>    [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
>>>    [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
>>>    [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
>>>    [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
>>> <snip>
>>>
>>> A git bisect pointed this regression to be introduced via [1] that added
>>> a mechanism to create device tree nodes for parent PCI bridges when a
>>> PCI device is hot-plugged.
>>>
>>> The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
>>> device-tree node associated with the pci_dev that was earlier
>>> hot-plugged and was attached under a pci-bridge. The PCI dev header
>>> `dev->hdr_type` being 0, results a conditional check done with
>>> `pci_is_bridge()` into false. Consequently, a call to
>>> `of_pci_make_dev_node()` to create a device node is never made. When at
>>> a later point in time, in the device node removal path, a memcpy is
>>> attempted in `__of_changeset_entry_invert()`; since the device node was
>>> never created, results in an Oops due to kernel read access to a bad
>>> address.
>>>
>>> To fix this issue, the patch updates `of_changeset_create_node()` to
>>> allocate a new node only when the device node doesn't exist and init it
>>> in case it does already. Also, introduce `of_pci_free_node()` to be
>>> called to only revert and destroy the changeset device node that was
>>> created via a call to `of_changeset_create_node()`.
>>>
>>> [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
>>>
>>> Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
>>> Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>> Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
>>> ---
>>> Changes since v1:
>>>       * Included Lizhi's suggested changes on V1
>>>       * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
>>>         part a bit in `of_pci_make_dev_node`
>>> 	drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
>>> 	  611 | void of_pci_free_node(struct device_node *np)
>>> 	      |      ^~~~~~~~~~~~~~~~
>>> 	drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
>>> 	drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
>>> 	  696 | out_destroy_cset:
>>> 	      | ^~~~~~~~~~~~~~~~
>>>       * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
>>>
>>>    drivers/of/dynamic.c  | 16 ++++++++++++----
>>>    drivers/of/unittest.c |  2 +-
>>>    drivers/pci/bus.c     |  3 +--
>>>    drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
>>>    drivers/pci/pci.h     |  2 ++
>>>    include/linux/of.h    |  1 +
>>>    6 files changed, 43 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>>> index dda6092e6d3a..9bba5e82a384 100644
>>> --- a/drivers/of/dynamic.c
>>> +++ b/drivers/of/dynamic.c
>>> @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
>>>     * a given changeset.
>>>     *
>>>     * @ocs: Pointer to changeset
>>> + * @np: Pointer to device node. If null, allocate a new node. If not, init an
>>> + *	existing one.
>>>     * @parent: Pointer to parent device node
>>>     * @full_name: Node full name
>>>     *
>>>     * Return: Pointer to the created device node or NULL in case of an error.
>>>     */
>>>    struct device_node *of_changeset_create_node(struct of_changeset *ocs,
>>> +					     struct device_node *np,
>>>    					     struct device_node *parent,
>>>    					     const char *full_name)
>>>    {
>>> -	struct device_node *np;
>>>    	int ret;
>>> -	np = __of_node_dup(NULL, full_name);
>>> -	if (!np)
>>> -		return NULL;
>>> +	if (!np) {
>>> +		np = __of_node_dup(NULL, full_name);
>>> +		if (!np)
>>> +			return NULL;
>>> +	} else {
>>> +		of_node_set_flag(np, OF_DYNAMIC);
>>> +		of_node_set_flag(np, OF_DETACHED);
>>> +	}
>>> +
>>>    	np->parent = parent;
>>>    	ret = of_changeset_attach_node(ocs, np);
>>> diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
>>> index 445ad13dab98..b1bcc9ed40a6 100644
>>> --- a/drivers/of/unittest.c
>>> +++ b/drivers/of/unittest.c
>>> @@ -871,7 +871,7 @@ static void __init of_unittest_changeset(void)
>>>    	unittest(!of_changeset_add_property(&chgset, parent, ppadd), "fail add prop prop-add\n");
>>>    	unittest(!of_changeset_update_property(&chgset, parent, ppupdate), "fail update prop\n");
>>>    	unittest(!of_changeset_remove_property(&chgset, parent, ppremove), "fail remove prop\n");
>>> -	n22 = of_changeset_create_node(&chgset, n2, "n22");
>>> +	n22 = of_changeset_create_node(&chgset, NULL,  n2, "n22");
>>>    	unittest(n22, "fail create n22\n");
>>>    	unittest(!of_changeset_add_prop_string(&chgset, n22, "prop-str", "abcd"),
>>>    		 "fail add prop prop-str");
>>> diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
>>> index 826b5016a101..d7ca20cb146a 100644
>>> --- a/drivers/pci/bus.c
>>> +++ b/drivers/pci/bus.c
>>> @@ -342,8 +342,7 @@ void pci_bus_add_device(struct pci_dev *dev)
>>>    	 */
>>>    	pcibios_bus_add_device(dev);
>>>    	pci_fixup_device(pci_fixup_final, dev);
>>> -	if (pci_is_bridge(dev))
>>> -		of_pci_make_dev_node(dev);
>>> +	of_pci_make_dev_node(dev);
>> Please undo this change. It should only create the device node for bridges
>> and the pci endpoints listed in quirks for now.
> IIUC, you want Amit to post a v3 of this patch that omits this
> specific pci_bus_add_device() change?

Yes.

Lizhi

>
>>>    	pci_create_sysfs_dev_files(dev);
>>>    	pci_proc_attach_device(dev);
>>>    	pci_bridge_d3_update(dev);
>>> diff --git a/drivers/pci/of.c b/drivers/pci/of.c
>>> index 51e3dd0ea5ab..883bf15211a5 100644
>>> --- a/drivers/pci/of.c
>>> +++ b/drivers/pci/of.c
>>> @@ -608,18 +608,28 @@ int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge)
>>>    #ifdef CONFIG_PCI_DYNAMIC_OF_NODES
>>> +void of_pci_free_node(struct device_node *np)
>>> +{
>>> +	struct of_changeset *cset;
>>> +
>>> +	cset = (struct of_changeset *)(np + 1);
>>> +
>>> +	np->data = NULL;
>>> +	of_changeset_revert(cset);
>>> +	of_changeset_destroy(cset);
>>> +	of_node_put(np);
>>> +}
>>> +
>>>    void of_pci_remove_node(struct pci_dev *pdev)
>>>    {
>>>    	struct device_node *np;
>>>    	np = pci_device_to_OF_node(pdev);
>>> -	if (!np || !of_node_check_flag(np, OF_DYNAMIC))
>>> +	if (!np || np->data != of_pci_free_node)
>>>    		return;
>>>    	pdev->dev.of_node = NULL;
>>> -	of_changeset_revert(np->data);
>>> -	of_changeset_destroy(np->data);
>>> -	of_node_put(np);
>>> +	of_pci_free_node(np);
>>>    }
>>>    void of_pci_make_dev_node(struct pci_dev *pdev)
>>> @@ -655,14 +665,18 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
>>>    	if (!name)
>>>    		return;
>>> -	cset = kmalloc(sizeof(*cset), GFP_KERNEL);
>>> -	if (!cset)
>>> +	np = kzalloc(sizeof(*np) + sizeof(*cset), GFP_KERNEL);
>>> +	if (!np)
>>>    		goto out_free_name;
>>> +	np->full_name = name;
>>> +	of_node_init(np);
>>> +
>>> +	cset = (struct of_changeset *)(np + 1);
>>>    	of_changeset_init(cset);
>>> -	np = of_changeset_create_node(cset, ppnode, name);
>>> +	np = of_changeset_create_node(cset, np, ppnode, NULL);
>>>    	if (!np)
>>> -		goto out_destroy_cset;
>>> +		goto out_free_node;
>>>    	ret = of_pci_add_properties(pdev, cset, np);
>>>    	if (ret)
>>> @@ -670,19 +684,18 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
>>>    	ret = of_changeset_apply(cset);
>>>    	if (ret)
>>> -		goto out_free_node;
>>> +		goto out_destroy_cset;
>>> -	np->data = cset;
>>> +	np->data = of_pci_free_node;
>>>    	pdev->dev.of_node = np;
>>> -	kfree(name);
>>>    	return;
>>> -out_free_node:
>>> -	of_node_put(np);
>>>    out_destroy_cset:
>>>    	of_changeset_destroy(cset);
>>>    	kfree(cset);
>>> +out_free_node:
>>> +	of_node_put(np);
>>>    out_free_name:
>>>    	kfree(name);
>>>    }
>>> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
>>> index fd44565c4756..7b1a455306b8 100644
>>> --- a/drivers/pci/pci.h
>>> +++ b/drivers/pci/pci.h
>>> @@ -702,11 +702,13 @@ struct of_changeset;
>>>    #ifdef CONFIG_PCI_DYNAMIC_OF_NODES
>>>    void of_pci_make_dev_node(struct pci_dev *pdev);
>>> +void of_pci_free_node(struct device_node *np);
>>>    void of_pci_remove_node(struct pci_dev *pdev);
>>>    int of_pci_add_properties(struct pci_dev *pdev, struct of_changeset *ocs,
>>>    			  struct device_node *np);
>>>    #else
>>>    static inline void of_pci_make_dev_node(struct pci_dev *pdev) { }
>>> +static inline void of_pci_free_node(struct device_node *np) { }
>>>    static inline void of_pci_remove_node(struct pci_dev *pdev) { }
>>>    #endif
>>> diff --git a/include/linux/of.h b/include/linux/of.h
>>> index a0bedd038a05..f774459d0d84 100644
>>> --- a/include/linux/of.h
>>> +++ b/include/linux/of.h
>>> @@ -1631,6 +1631,7 @@ static inline int of_changeset_update_property(struct of_changeset *ocs,
>>>    }
>>>    struct device_node *of_changeset_create_node(struct of_changeset *ocs,
>>> +					     struct device_node *np,
>>>    					     struct device_node *parent,
>>>    					     const char *full_name);
>>>    int of_changeset_add_prop_string(struct of_changeset *ocs,
>>>
>>> base-commit: 43db1e03c086ed20cc75808d3f45e780ec4ca26e
Lizhi Hou July 15, 2024, 8:52 p.m. UTC | #5
On 7/15/24 11:55, Rob Herring wrote:
> On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
>> With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
>> of a PCI device attached to a PCI-bridge causes following kernel Oops on
>> a pseries KVM guest:
>>
>>   RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
>>   Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
>>   BUG: Unable to handle kernel data access on read at 0x10ec00000048
>>   Faulting instruction address: 0xc0000000012d8728
>>   Oops: Kernel access of bad area, sig: 11 [#1]
>>   LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
>> <snip>
>>   NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
>>   LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
>>   Call Trace:
>>   [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
>>   [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
>>   [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
>>   [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
>>   [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
>>   [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
>>   [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
>>   [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
>>   [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
>>   [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
>>   [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
>>   [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
>> <snip>
>>
>> A git bisect pointed this regression to be introduced via [1] that added
>> a mechanism to create device tree nodes for parent PCI bridges when a
>> PCI device is hot-plugged.
>>
>> The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
>> device-tree node associated with the pci_dev that was earlier
>> hot-plugged and was attached under a pci-bridge. The PCI dev header
>> `dev->hdr_type` being 0, results a conditional check done with
>> `pci_is_bridge()` into false. Consequently, a call to
>> `of_pci_make_dev_node()` to create a device node is never made. When at
>> a later point in time, in the device node removal path, a memcpy is
>> attempted in `__of_changeset_entry_invert()`; since the device node was
>> never created, results in an Oops due to kernel read access to a bad
>> address.
>>
>> To fix this issue, the patch updates `of_changeset_create_node()` to
>> allocate a new node only when the device node doesn't exist and init it
>> in case it does already. Also, introduce `of_pci_free_node()` to be
>> called to only revert and destroy the changeset device node that was
>> created via a call to `of_changeset_create_node()`.
>>
>> [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
>>
>> Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
>> Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>> Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
>> ---
>> Changes since v1:
>>      * Included Lizhi's suggested changes on V1
>>      * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
>>        part a bit in `of_pci_make_dev_node`
>>          drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
>>            611 | void of_pci_free_node(struct device_node *np)
>>                |      ^~~~~~~~~~~~~~~~
>>          drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
>>          drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
>>            696 | out_destroy_cset:
>>                | ^~~~~~~~~~~~~~~~
>>      * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
>>
>>   drivers/of/dynamic.c  | 16 ++++++++++++----
>>   drivers/of/unittest.c |  2 +-
>>   drivers/pci/bus.c     |  3 +--
>>   drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
>>   drivers/pci/pci.h     |  2 ++
>>   include/linux/of.h    |  1 +
>>   6 files changed, 43 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>> index dda6092e6d3a..9bba5e82a384 100644
>> --- a/drivers/of/dynamic.c
>> +++ b/drivers/of/dynamic.c
>> @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
>>    * a given changeset.
>>    *
>>    * @ocs: Pointer to changeset
>> + * @np: Pointer to device node. If null, allocate a new node. If not, init an
>> + *     existing one.
>>    * @parent: Pointer to parent device node
>>    * @full_name: Node full name
>>    *
>>    * Return: Pointer to the created device node or NULL in case of an error.
>>    */
>>   struct device_node *of_changeset_create_node(struct of_changeset *ocs,
>> +                                            struct device_node *np,
>>                                               struct device_node *parent,
>>                                               const char *full_name)
>>   {
>> -       struct device_node *np;
>>          int ret;
>>
>> -       np = __of_node_dup(NULL, full_name);
>> -       if (!np)
>> -               return NULL;
>> +       if (!np) {
>> +               np = __of_node_dup(NULL, full_name);
>> +               if (!np)
>> +                       return NULL;
>> +       } else {
>> +               of_node_set_flag(np, OF_DYNAMIC);
>> +               of_node_set_flag(np, OF_DETACHED);
> Are we going to rename the function to
> of_changeset_create_or_maybe_modify_node()? No. The functions here are
> very clear in that they allocate new objects and don't reuse what's
> passed in.

Ok. How about keeping of_changeset_create_node unchanged.

Instead, call kzalloc(), of_node_init() and of_changeset_attach_node()

in of_pci_make_dev_node() directly.

A similar example is dlpar_parse_cc_node().


Does this sound better?


Thanks,

Lizhi

>
> Rob
Rob Herring (Arm) July 23, 2024, 4:21 p.m. UTC | #6
On Mon, Jul 15, 2024 at 01:52:30PM -0700, Lizhi Hou wrote:
> 
> On 7/15/24 11:55, Rob Herring wrote:
> > On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
> > > With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
> > > of a PCI device attached to a PCI-bridge causes following kernel Oops on
> > > a pseries KVM guest:
> > > 
> > >   RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
> > >   Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
> > >   BUG: Unable to handle kernel data access on read at 0x10ec00000048
> > >   Faulting instruction address: 0xc0000000012d8728
> > >   Oops: Kernel access of bad area, sig: 11 [#1]
> > >   LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> > > <snip>
> > >   NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
> > >   LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
> > >   Call Trace:
> > >   [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
> > >   [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
> > >   [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
> > >   [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
> > >   [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
> > >   [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
> > >   [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
> > >   [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
> > >   [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
> > >   [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
> > >   [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
> > >   [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
> > > <snip>
> > > 
> > > A git bisect pointed this regression to be introduced via [1] that added
> > > a mechanism to create device tree nodes for parent PCI bridges when a
> > > PCI device is hot-plugged.
> > > 
> > > The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
> > > device-tree node associated with the pci_dev that was earlier
> > > hot-plugged and was attached under a pci-bridge. The PCI dev header
> > > `dev->hdr_type` being 0, results a conditional check done with
> > > `pci_is_bridge()` into false. Consequently, a call to
> > > `of_pci_make_dev_node()` to create a device node is never made. When at
> > > a later point in time, in the device node removal path, a memcpy is
> > > attempted in `__of_changeset_entry_invert()`; since the device node was
> > > never created, results in an Oops due to kernel read access to a bad
> > > address.
> > > 
> > > To fix this issue, the patch updates `of_changeset_create_node()` to
> > > allocate a new node only when the device node doesn't exist and init it
> > > in case it does already. Also, introduce `of_pci_free_node()` to be
> > > called to only revert and destroy the changeset device node that was
> > > created via a call to `of_changeset_create_node()`.
> > > 
> > > [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
> > > 
> > > Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
> > > Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
> > > Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> > > Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
> > > ---
> > > Changes since v1:
> > >      * Included Lizhi's suggested changes on V1
> > >      * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
> > >        part a bit in `of_pci_make_dev_node`
> > >          drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
> > >            611 | void of_pci_free_node(struct device_node *np)
> > >                |      ^~~~~~~~~~~~~~~~
> > >          drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
> > >          drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
> > >            696 | out_destroy_cset:
> > >                | ^~~~~~~~~~~~~~~~
> > >      * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
> > > 
> > >   drivers/of/dynamic.c  | 16 ++++++++++++----
> > >   drivers/of/unittest.c |  2 +-
> > >   drivers/pci/bus.c     |  3 +--
> > >   drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
> > >   drivers/pci/pci.h     |  2 ++
> > >   include/linux/of.h    |  1 +
> > >   6 files changed, 43 insertions(+), 20 deletions(-)
> > > 
> > > diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> > > index dda6092e6d3a..9bba5e82a384 100644
> > > --- a/drivers/of/dynamic.c
> > > +++ b/drivers/of/dynamic.c
> > > @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
> > >    * a given changeset.
> > >    *
> > >    * @ocs: Pointer to changeset
> > > + * @np: Pointer to device node. If null, allocate a new node. If not, init an
> > > + *     existing one.
> > >    * @parent: Pointer to parent device node
> > >    * @full_name: Node full name
> > >    *
> > >    * Return: Pointer to the created device node or NULL in case of an error.
> > >    */
> > >   struct device_node *of_changeset_create_node(struct of_changeset *ocs,
> > > +                                            struct device_node *np,
> > >                                               struct device_node *parent,
> > >                                               const char *full_name)
> > >   {
> > > -       struct device_node *np;
> > >          int ret;
> > > 
> > > -       np = __of_node_dup(NULL, full_name);
> > > -       if (!np)
> > > -               return NULL;
> > > +       if (!np) {
> > > +               np = __of_node_dup(NULL, full_name);
> > > +               if (!np)
> > > +                       return NULL;
> > > +       } else {
> > > +               of_node_set_flag(np, OF_DYNAMIC);
> > > +               of_node_set_flag(np, OF_DETACHED);
> > Are we going to rename the function to
> > of_changeset_create_or_maybe_modify_node()? No. The functions here are
> > very clear in that they allocate new objects and don't reuse what's
> > passed in.
> 
> Ok. How about keeping of_changeset_create_node unchanged.
> 
> Instead, call kzalloc(), of_node_init() and of_changeset_attach_node()
> 
> in of_pci_make_dev_node() directly.
> 
> A similar example is dlpar_parse_cc_node().
> 
> 
> Does this sound better?

No, because really that code should be re-written using of_changeset
API.

My suggestion is add a data pointer to struct of_changeset and then set 
that to something to know the data ptr is a changeset and is your 
changeset.

Rob
Lizhi Hou July 23, 2024, 6:21 p.m. UTC | #7
On 7/23/24 09:21, Rob Herring wrote:
> On Mon, Jul 15, 2024 at 01:52:30PM -0700, Lizhi Hou wrote:
>> On 7/15/24 11:55, Rob Herring wrote:
>>> On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
>>>> With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
>>>> of a PCI device attached to a PCI-bridge causes following kernel Oops on
>>>> a pseries KVM guest:
>>>>
>>>>    RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
>>>>    Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
>>>>    BUG: Unable to handle kernel data access on read at 0x10ec00000048
>>>>    Faulting instruction address: 0xc0000000012d8728
>>>>    Oops: Kernel access of bad area, sig: 11 [#1]
>>>>    LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
>>>> <snip>
>>>>    NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
>>>>    LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
>>>>    Call Trace:
>>>>    [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
>>>>    [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
>>>>    [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
>>>>    [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
>>>>    [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
>>>>    [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
>>>>    [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
>>>>    [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
>>>>    [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
>>>>    [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
>>>>    [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
>>>>    [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
>>>> <snip>
>>>>
>>>> A git bisect pointed this regression to be introduced via [1] that added
>>>> a mechanism to create device tree nodes for parent PCI bridges when a
>>>> PCI device is hot-plugged.
>>>>
>>>> The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
>>>> device-tree node associated with the pci_dev that was earlier
>>>> hot-plugged and was attached under a pci-bridge. The PCI dev header
>>>> `dev->hdr_type` being 0, results a conditional check done with
>>>> `pci_is_bridge()` into false. Consequently, a call to
>>>> `of_pci_make_dev_node()` to create a device node is never made. When at
>>>> a later point in time, in the device node removal path, a memcpy is
>>>> attempted in `__of_changeset_entry_invert()`; since the device node was
>>>> never created, results in an Oops due to kernel read access to a bad
>>>> address.
>>>>
>>>> To fix this issue, the patch updates `of_changeset_create_node()` to
>>>> allocate a new node only when the device node doesn't exist and init it
>>>> in case it does already. Also, introduce `of_pci_free_node()` to be
>>>> called to only revert and destroy the changeset device node that was
>>>> created via a call to `of_changeset_create_node()`.
>>>>
>>>> [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
>>>>
>>>> Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
>>>> Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
>>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>>> Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
>>>> ---
>>>> Changes since v1:
>>>>       * Included Lizhi's suggested changes on V1
>>>>       * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
>>>>         part a bit in `of_pci_make_dev_node`
>>>>           drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
>>>>             611 | void of_pci_free_node(struct device_node *np)
>>>>                 |      ^~~~~~~~~~~~~~~~
>>>>           drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
>>>>           drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
>>>>             696 | out_destroy_cset:
>>>>                 | ^~~~~~~~~~~~~~~~
>>>>       * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
>>>>
>>>>    drivers/of/dynamic.c  | 16 ++++++++++++----
>>>>    drivers/of/unittest.c |  2 +-
>>>>    drivers/pci/bus.c     |  3 +--
>>>>    drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
>>>>    drivers/pci/pci.h     |  2 ++
>>>>    include/linux/of.h    |  1 +
>>>>    6 files changed, 43 insertions(+), 20 deletions(-)
>>>>
>>>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>>>> index dda6092e6d3a..9bba5e82a384 100644
>>>> --- a/drivers/of/dynamic.c
>>>> +++ b/drivers/of/dynamic.c
>>>> @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
>>>>     * a given changeset.
>>>>     *
>>>>     * @ocs: Pointer to changeset
>>>> + * @np: Pointer to device node. If null, allocate a new node. If not, init an
>>>> + *     existing one.
>>>>     * @parent: Pointer to parent device node
>>>>     * @full_name: Node full name
>>>>     *
>>>>     * Return: Pointer to the created device node or NULL in case of an error.
>>>>     */
>>>>    struct device_node *of_changeset_create_node(struct of_changeset *ocs,
>>>> +                                            struct device_node *np,
>>>>                                                struct device_node *parent,
>>>>                                                const char *full_name)
>>>>    {
>>>> -       struct device_node *np;
>>>>           int ret;
>>>>
>>>> -       np = __of_node_dup(NULL, full_name);
>>>> -       if (!np)
>>>> -               return NULL;
>>>> +       if (!np) {
>>>> +               np = __of_node_dup(NULL, full_name);
>>>> +               if (!np)
>>>> +                       return NULL;
>>>> +       } else {
>>>> +               of_node_set_flag(np, OF_DYNAMIC);
>>>> +               of_node_set_flag(np, OF_DETACHED);
>>> Are we going to rename the function to
>>> of_changeset_create_or_maybe_modify_node()? No. The functions here are
>>> very clear in that they allocate new objects and don't reuse what's
>>> passed in.
>> Ok. How about keeping of_changeset_create_node unchanged.
>>
>> Instead, call kzalloc(), of_node_init() and of_changeset_attach_node()
>>
>> in of_pci_make_dev_node() directly.
>>
>> A similar example is dlpar_parse_cc_node().
>>
>>
>> Does this sound better?
> No, because really that code should be re-written using of_changeset
> API.
>
> My suggestion is add a data pointer to struct of_changeset and then set
> that to something to know the data ptr is a changeset and is your
> changeset.

I do not fully understand the point. I think the issue is that we do not 
know if a given of_node is created by of_pci_make_dev_node(), correct?

of_node->data can point to anything. And we do not know if it points a 
cset or not.  Do you mean to add a flag (e.g. OF_PCI_DYNAMIC) to 
indicate of_node->data points to cset?

I probably misunderstood. Could you explain more?


Thanks,

Lizhi

>
> Rob
Rob Herring (Arm) July 23, 2024, 7:54 p.m. UTC | #8
On Tue, Jul 23, 2024 at 12:21 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
>
>
> On 7/23/24 09:21, Rob Herring wrote:
> > On Mon, Jul 15, 2024 at 01:52:30PM -0700, Lizhi Hou wrote:
> >> On 7/15/24 11:55, Rob Herring wrote:
> >>> On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
> >>>> With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
> >>>> of a PCI device attached to a PCI-bridge causes following kernel Oops on
> >>>> a pseries KVM guest:
> >>>>
> >>>>    RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
> >>>>    Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
> >>>>    BUG: Unable to handle kernel data access on read at 0x10ec00000048
> >>>>    Faulting instruction address: 0xc0000000012d8728
> >>>>    Oops: Kernel access of bad area, sig: 11 [#1]
> >>>>    LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> >>>> <snip>
> >>>>    NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
> >>>>    LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
> >>>>    Call Trace:
> >>>>    [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
> >>>>    [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
> >>>>    [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
> >>>>    [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
> >>>>    [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
> >>>>    [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
> >>>>    [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
> >>>>    [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
> >>>>    [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
> >>>>    [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
> >>>>    [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
> >>>>    [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
> >>>> <snip>
> >>>>
> >>>> A git bisect pointed this regression to be introduced via [1] that added
> >>>> a mechanism to create device tree nodes for parent PCI bridges when a
> >>>> PCI device is hot-plugged.
> >>>>
> >>>> The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
> >>>> device-tree node associated with the pci_dev that was earlier
> >>>> hot-plugged and was attached under a pci-bridge. The PCI dev header
> >>>> `dev->hdr_type` being 0, results a conditional check done with
> >>>> `pci_is_bridge()` into false. Consequently, a call to
> >>>> `of_pci_make_dev_node()` to create a device node is never made. When at
> >>>> a later point in time, in the device node removal path, a memcpy is
> >>>> attempted in `__of_changeset_entry_invert()`; since the device node was
> >>>> never created, results in an Oops due to kernel read access to a bad
> >>>> address.
> >>>>
> >>>> To fix this issue, the patch updates `of_changeset_create_node()` to
> >>>> allocate a new node only when the device node doesn't exist and init it
> >>>> in case it does already. Also, introduce `of_pci_free_node()` to be
> >>>> called to only revert and destroy the changeset device node that was
> >>>> created via a call to `of_changeset_create_node()`.
> >>>>
> >>>> [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
> >>>>
> >>>> Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
> >>>> Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
> >>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> >>>> Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
> >>>> ---
> >>>> Changes since v1:
> >>>>       * Included Lizhi's suggested changes on V1
> >>>>       * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
> >>>>         part a bit in `of_pci_make_dev_node`
> >>>>           drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
> >>>>             611 | void of_pci_free_node(struct device_node *np)
> >>>>                 |      ^~~~~~~~~~~~~~~~
> >>>>           drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
> >>>>           drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
> >>>>             696 | out_destroy_cset:
> >>>>                 | ^~~~~~~~~~~~~~~~
> >>>>       * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
> >>>>
> >>>>    drivers/of/dynamic.c  | 16 ++++++++++++----
> >>>>    drivers/of/unittest.c |  2 +-
> >>>>    drivers/pci/bus.c     |  3 +--
> >>>>    drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
> >>>>    drivers/pci/pci.h     |  2 ++
> >>>>    include/linux/of.h    |  1 +
> >>>>    6 files changed, 43 insertions(+), 20 deletions(-)
> >>>>
> >>>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> >>>> index dda6092e6d3a..9bba5e82a384 100644
> >>>> --- a/drivers/of/dynamic.c
> >>>> +++ b/drivers/of/dynamic.c
> >>>> @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
> >>>>     * a given changeset.
> >>>>     *
> >>>>     * @ocs: Pointer to changeset
> >>>> + * @np: Pointer to device node. If null, allocate a new node. If not, init an
> >>>> + *     existing one.
> >>>>     * @parent: Pointer to parent device node
> >>>>     * @full_name: Node full name
> >>>>     *
> >>>>     * Return: Pointer to the created device node or NULL in case of an error.
> >>>>     */
> >>>>    struct device_node *of_changeset_create_node(struct of_changeset *ocs,
> >>>> +                                            struct device_node *np,
> >>>>                                                struct device_node *parent,
> >>>>                                                const char *full_name)
> >>>>    {
> >>>> -       struct device_node *np;
> >>>>           int ret;
> >>>>
> >>>> -       np = __of_node_dup(NULL, full_name);
> >>>> -       if (!np)
> >>>> -               return NULL;
> >>>> +       if (!np) {
> >>>> +               np = __of_node_dup(NULL, full_name);
> >>>> +               if (!np)
> >>>> +                       return NULL;
> >>>> +       } else {
> >>>> +               of_node_set_flag(np, OF_DYNAMIC);
> >>>> +               of_node_set_flag(np, OF_DETACHED);
> >>> Are we going to rename the function to
> >>> of_changeset_create_or_maybe_modify_node()? No. The functions here are
> >>> very clear in that they allocate new objects and don't reuse what's
> >>> passed in.
> >> Ok. How about keeping of_changeset_create_node unchanged.
> >>
> >> Instead, call kzalloc(), of_node_init() and of_changeset_attach_node()
> >>
> >> in of_pci_make_dev_node() directly.
> >>
> >> A similar example is dlpar_parse_cc_node().
> >>
> >>
> >> Does this sound better?
> > No, because really that code should be re-written using of_changeset
> > API.
> >
> > My suggestion is add a data pointer to struct of_changeset and then set
> > that to something to know the data ptr is a changeset and is your
> > changeset.
>
> I do not fully understand the point. I think the issue is that we do not
> know if a given of_node is created by of_pci_make_dev_node(), correct?

Yes.

> of_node->data can point to anything. And we do not know if it points a
> cset or not.

Right. But instead of checking "of_node->data == of_pci_free_node",
you would just be checking "*(of_node->data) == of_pci_free_node"
(omitting a NULL check and cast for simplicity). I suppose in theory
that could have a false match, but that could happen in this patch
already.

> Do you mean to add a flag (e.g. OF_PCI_DYNAMIC) to
> indicate of_node->data points to cset?

That would be another option, but OF_PCI_DYNAMIC would not be a good
name because that would be a flag bit for every single caller needing
similar functionality. Name it just what it indicates: of_node->data
points to cset

If we have that flag, then possibly the DT core can handle more
clean-up itself like calling detach and freeing the changeset.
Ideally, the flags should be internal to the DT code.

Rob
Lizhi Hou July 23, 2024, 9:08 p.m. UTC | #9
On 7/23/24 12:54, Rob Herring wrote:
> On Tue, Jul 23, 2024 at 12:21 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
>>
>> On 7/23/24 09:21, Rob Herring wrote:
>>> On Mon, Jul 15, 2024 at 01:52:30PM -0700, Lizhi Hou wrote:
>>>> On 7/15/24 11:55, Rob Herring wrote:
>>>>> On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
>>>>>> With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
>>>>>> of a PCI device attached to a PCI-bridge causes following kernel Oops on
>>>>>> a pseries KVM guest:
>>>>>>
>>>>>>     RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
>>>>>>     Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
>>>>>>     BUG: Unable to handle kernel data access on read at 0x10ec00000048
>>>>>>     Faulting instruction address: 0xc0000000012d8728
>>>>>>     Oops: Kernel access of bad area, sig: 11 [#1]
>>>>>>     LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
>>>>>> <snip>
>>>>>>     NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
>>>>>>     LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
>>>>>>     Call Trace:
>>>>>>     [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
>>>>>>     [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
>>>>>>     [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
>>>>>>     [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
>>>>>>     [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
>>>>>>     [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
>>>>>>     [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
>>>>>>     [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
>>>>>>     [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
>>>>>>     [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
>>>>>>     [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
>>>>>>     [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
>>>>>> <snip>
>>>>>>
>>>>>> A git bisect pointed this regression to be introduced via [1] that added
>>>>>> a mechanism to create device tree nodes for parent PCI bridges when a
>>>>>> PCI device is hot-plugged.
>>>>>>
>>>>>> The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
>>>>>> device-tree node associated with the pci_dev that was earlier
>>>>>> hot-plugged and was attached under a pci-bridge. The PCI dev header
>>>>>> `dev->hdr_type` being 0, results a conditional check done with
>>>>>> `pci_is_bridge()` into false. Consequently, a call to
>>>>>> `of_pci_make_dev_node()` to create a device node is never made. When at
>>>>>> a later point in time, in the device node removal path, a memcpy is
>>>>>> attempted in `__of_changeset_entry_invert()`; since the device node was
>>>>>> never created, results in an Oops due to kernel read access to a bad
>>>>>> address.
>>>>>>
>>>>>> To fix this issue, the patch updates `of_changeset_create_node()` to
>>>>>> allocate a new node only when the device node doesn't exist and init it
>>>>>> in case it does already. Also, introduce `of_pci_free_node()` to be
>>>>>> called to only revert and destroy the changeset device node that was
>>>>>> created via a call to `of_changeset_create_node()`.
>>>>>>
>>>>>> [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
>>>>>>
>>>>>> Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
>>>>>> Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
>>>>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>>>>> Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
>>>>>> ---
>>>>>> Changes since v1:
>>>>>>        * Included Lizhi's suggested changes on V1
>>>>>>        * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
>>>>>>          part a bit in `of_pci_make_dev_node`
>>>>>>            drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
>>>>>>              611 | void of_pci_free_node(struct device_node *np)
>>>>>>                  |      ^~~~~~~~~~~~~~~~
>>>>>>            drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
>>>>>>            drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
>>>>>>              696 | out_destroy_cset:
>>>>>>                  | ^~~~~~~~~~~~~~~~
>>>>>>        * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
>>>>>>
>>>>>>     drivers/of/dynamic.c  | 16 ++++++++++++----
>>>>>>     drivers/of/unittest.c |  2 +-
>>>>>>     drivers/pci/bus.c     |  3 +--
>>>>>>     drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
>>>>>>     drivers/pci/pci.h     |  2 ++
>>>>>>     include/linux/of.h    |  1 +
>>>>>>     6 files changed, 43 insertions(+), 20 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>>>>>> index dda6092e6d3a..9bba5e82a384 100644
>>>>>> --- a/drivers/of/dynamic.c
>>>>>> +++ b/drivers/of/dynamic.c
>>>>>> @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
>>>>>>      * a given changeset.
>>>>>>      *
>>>>>>      * @ocs: Pointer to changeset
>>>>>> + * @np: Pointer to device node. If null, allocate a new node. If not, init an
>>>>>> + *     existing one.
>>>>>>      * @parent: Pointer to parent device node
>>>>>>      * @full_name: Node full name
>>>>>>      *
>>>>>>      * Return: Pointer to the created device node or NULL in case of an error.
>>>>>>      */
>>>>>>     struct device_node *of_changeset_create_node(struct of_changeset *ocs,
>>>>>> +                                            struct device_node *np,
>>>>>>                                                 struct device_node *parent,
>>>>>>                                                 const char *full_name)
>>>>>>     {
>>>>>> -       struct device_node *np;
>>>>>>            int ret;
>>>>>>
>>>>>> -       np = __of_node_dup(NULL, full_name);
>>>>>> -       if (!np)
>>>>>> -               return NULL;
>>>>>> +       if (!np) {
>>>>>> +               np = __of_node_dup(NULL, full_name);
>>>>>> +               if (!np)
>>>>>> +                       return NULL;
>>>>>> +       } else {
>>>>>> +               of_node_set_flag(np, OF_DYNAMIC);
>>>>>> +               of_node_set_flag(np, OF_DETACHED);
>>>>> Are we going to rename the function to
>>>>> of_changeset_create_or_maybe_modify_node()? No. The functions here are
>>>>> very clear in that they allocate new objects and don't reuse what's
>>>>> passed in.
>>>> Ok. How about keeping of_changeset_create_node unchanged.
>>>>
>>>> Instead, call kzalloc(), of_node_init() and of_changeset_attach_node()
>>>>
>>>> in of_pci_make_dev_node() directly.
>>>>
>>>> A similar example is dlpar_parse_cc_node().
>>>>
>>>>
>>>> Does this sound better?
>>> No, because really that code should be re-written using of_changeset
>>> API.
>>>
>>> My suggestion is add a data pointer to struct of_changeset and then set
>>> that to something to know the data ptr is a changeset and is your
>>> changeset.
>> I do not fully understand the point. I think the issue is that we do not
>> know if a given of_node is created by of_pci_make_dev_node(), correct?
> Yes.
>
>> of_node->data can point to anything. And we do not know if it points a
>> cset or not.
> Right. But instead of checking "of_node->data == of_pci_free_node",
> you would just be checking "*(of_node->data) == of_pci_free_node"

if of_node->data is a char* pointer, it would be panic. So I used 
of_node->data == of_pci_free_node.

> (omitting a NULL check and cast for simplicity). I suppose in theory
> that could have a false match, but that could happen in this patch
> already.

I think if any other kernel code  put of_pci_free_node to of_node->data, 
it can be fixed over there.

>
>> Do you mean to add a flag (e.g. OF_PCI_DYNAMIC) to
>> indicate of_node->data points to cset?
> That would be another option, but OF_PCI_DYNAMIC would not be a good
> name because that would be a flag bit for every single caller needing
> similar functionality. Name it just what it indicates: of_node->data
> points to cset
>
> If we have that flag, then possibly the DT core can handle more
> clean-up itself like calling detach and freeing the changeset.
> Ideally, the flags should be internal to the DT code.

Sure. If you prefer this option, I will propose another fix.


Thanks,

Lizhi

>
> Rob
Amit Machhiwal July 25, 2024, 5:45 p.m. UTC | #10
Hi Lizhi, Rob,

Sorry for responding late. I got busy with some other things.

On 2024/07/23 02:08 PM, Lizhi Hou wrote:
> 
> On 7/23/24 12:54, Rob Herring wrote:
> > On Tue, Jul 23, 2024 at 12:21 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
> > > 
> > > On 7/23/24 09:21, Rob Herring wrote:
> > > > On Mon, Jul 15, 2024 at 01:52:30PM -0700, Lizhi Hou wrote:
> > > > > On 7/15/24 11:55, Rob Herring wrote:
> > > > > > On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
> > > > > > > With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
> > > > > > > of a PCI device attached to a PCI-bridge causes following kernel Oops on
> > > > > > > a pseries KVM guest:
> > > > > > > 
> > > > > > >     RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
> > > > > > >     Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
> > > > > > >     BUG: Unable to handle kernel data access on read at 0x10ec00000048
> > > > > > >     Faulting instruction address: 0xc0000000012d8728
> > > > > > >     Oops: Kernel access of bad area, sig: 11 [#1]
> > > > > > >     LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> > > > > > > <snip>
> > > > > > >     NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
> > > > > > >     LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
> > > > > > >     Call Trace:
> > > > > > >     [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
> > > > > > >     [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
> > > > > > >     [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
> > > > > > >     [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
> > > > > > >     [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
> > > > > > >     [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
> > > > > > >     [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
> > > > > > >     [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
> > > > > > >     [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
> > > > > > >     [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
> > > > > > >     [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
> > > > > > >     [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
> > > > > > > <snip>
> > > > > > > 
> > > > > > > A git bisect pointed this regression to be introduced via [1] that added
> > > > > > > a mechanism to create device tree nodes for parent PCI bridges when a
> > > > > > > PCI device is hot-plugged.
> > > > > > > 
> > > > > > > The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
> > > > > > > device-tree node associated with the pci_dev that was earlier
> > > > > > > hot-plugged and was attached under a pci-bridge. The PCI dev header
> > > > > > > `dev->hdr_type` being 0, results a conditional check done with
> > > > > > > `pci_is_bridge()` into false. Consequently, a call to
> > > > > > > `of_pci_make_dev_node()` to create a device node is never made. When at
> > > > > > > a later point in time, in the device node removal path, a memcpy is
> > > > > > > attempted in `__of_changeset_entry_invert()`; since the device node was
> > > > > > > never created, results in an Oops due to kernel read access to a bad
> > > > > > > address.
> > > > > > > 
> > > > > > > To fix this issue, the patch updates `of_changeset_create_node()` to
> > > > > > > allocate a new node only when the device node doesn't exist and init it
> > > > > > > in case it does already. Also, introduce `of_pci_free_node()` to be
> > > > > > > called to only revert and destroy the changeset device node that was
> > > > > > > created via a call to `of_changeset_create_node()`.
> > > > > > > 
> > > > > > > [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
> > > > > > > 
> > > > > > > Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
> > > > > > > Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
> > > > > > > Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> > > > > > > Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
> > > > > > > ---
> > > > > > > Changes since v1:
> > > > > > >        * Included Lizhi's suggested changes on V1
> > > > > > >        * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
> > > > > > >          part a bit in `of_pci_make_dev_node`
> > > > > > >            drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
> > > > > > >              611 | void of_pci_free_node(struct device_node *np)
> > > > > > >                  |      ^~~~~~~~~~~~~~~~
> > > > > > >            drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
> > > > > > >            drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
> > > > > > >              696 | out_destroy_cset:
> > > > > > >                  | ^~~~~~~~~~~~~~~~
> > > > > > >        * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
> > > > > > > 
> > > > > > >     drivers/of/dynamic.c  | 16 ++++++++++++----
> > > > > > >     drivers/of/unittest.c |  2 +-
> > > > > > >     drivers/pci/bus.c     |  3 +--
> > > > > > >     drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
> > > > > > >     drivers/pci/pci.h     |  2 ++
> > > > > > >     include/linux/of.h    |  1 +
> > > > > > >     6 files changed, 43 insertions(+), 20 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> > > > > > > index dda6092e6d3a..9bba5e82a384 100644
> > > > > > > --- a/drivers/of/dynamic.c
> > > > > > > +++ b/drivers/of/dynamic.c
> > > > > > > @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
> > > > > > >      * a given changeset.
> > > > > > >      *
> > > > > > >      * @ocs: Pointer to changeset
> > > > > > > + * @np: Pointer to device node. If null, allocate a new node. If not, init an
> > > > > > > + *     existing one.
> > > > > > >      * @parent: Pointer to parent device node
> > > > > > >      * @full_name: Node full name
> > > > > > >      *
> > > > > > >      * Return: Pointer to the created device node or NULL in case of an error.
> > > > > > >      */
> > > > > > >     struct device_node *of_changeset_create_node(struct of_changeset *ocs,
> > > > > > > +                                            struct device_node *np,
> > > > > > >                                                 struct device_node *parent,
> > > > > > >                                                 const char *full_name)
> > > > > > >     {
> > > > > > > -       struct device_node *np;
> > > > > > >            int ret;
> > > > > > > 
> > > > > > > -       np = __of_node_dup(NULL, full_name);
> > > > > > > -       if (!np)
> > > > > > > -               return NULL;
> > > > > > > +       if (!np) {
> > > > > > > +               np = __of_node_dup(NULL, full_name);
> > > > > > > +               if (!np)
> > > > > > > +                       return NULL;
> > > > > > > +       } else {
> > > > > > > +               of_node_set_flag(np, OF_DYNAMIC);
> > > > > > > +               of_node_set_flag(np, OF_DETACHED);
> > > > > > Are we going to rename the function to
> > > > > > of_changeset_create_or_maybe_modify_node()? No. The functions here are
> > > > > > very clear in that they allocate new objects and don't reuse what's
> > > > > > passed in.
> > > > > Ok. How about keeping of_changeset_create_node unchanged.
> > > > > 
> > > > > Instead, call kzalloc(), of_node_init() and of_changeset_attach_node()
> > > > > 
> > > > > in of_pci_make_dev_node() directly.
> > > > > 
> > > > > A similar example is dlpar_parse_cc_node().
> > > > > 
> > > > > 
> > > > > Does this sound better?
> > > > No, because really that code should be re-written using of_changeset
> > > > API.
> > > > 
> > > > My suggestion is add a data pointer to struct of_changeset and then set
> > > > that to something to know the data ptr is a changeset and is your
> > > > changeset.
> > > I do not fully understand the point. I think the issue is that we do not
> > > know if a given of_node is created by of_pci_make_dev_node(), correct?
> > Yes.
> > 
> > > of_node->data can point to anything. And we do not know if it points a
> > > cset or not.
> > Right. But instead of checking "of_node->data == of_pci_free_node",
> > you would just be checking "*(of_node->data) == of_pci_free_node"
> 
> if of_node->data is a char* pointer, it would be panic. So I used
> of_node->data == of_pci_free_node.
> 
> > (omitting a NULL check and cast for simplicity). I suppose in theory
> > that could have a false match, but that could happen in this patch
> > already.
> 
> I think if any other kernel code  put of_pci_free_node to of_node->data, it
> can be fixed over there.
> 
> > 
> > > Do you mean to add a flag (e.g. OF_PCI_DYNAMIC) to
> > > indicate of_node->data points to cset?
> > That would be another option, but OF_PCI_DYNAMIC would not be a good
> > name because that would be a flag bit for every single caller needing
> > similar functionality. Name it just what it indicates: of_node->data
> > points to cset
> > 
> > If we have that flag, then possibly the DT core can handle more
> > clean-up itself like calling detach and freeing the changeset.
> > Ideally, the flags should be internal to the DT code.
> 
> Sure. If you prefer this option, I will propose another fix.
> 

The crash in question is a critical issue that we would want to have a fix for
soon. And while this is still being figured out, is it okay to go with the fix I
proposed in the V1 of this patch?

Thanks,
Amit

> 
> Thanks,
> 
> Lizhi
> 
> > 
> > Rob
Bjorn Helgaas July 25, 2024, 8:55 p.m. UTC | #11
On Thu, Jul 25, 2024 at 11:15:39PM +0530, Amit Machhiwal wrote:
> ...
> The crash in question is a critical issue that we would want to have
> a fix for soon. And while this is still being figured out, is it
> okay to go with the fix I proposed in the V1 of this patch?

v6.10 has been released already, and it will be a couple months before
the v6.11 release.

It looks like the regression is 407d1a51921e, which appeared in v6.6,
almost a year ago, so it's fairly old.

What target are you thinking about for the V1 patch?  I guess if we
add it as a v6.11 post-merge window fix, it might get backported to
stable kernels before v6.11?  But if the plan is to merge the V1 patch
and then polish it again before v6.11 releases, I'm not sure it's
worth the churn.

Bjorn
Lizhi Hou July 25, 2024, 11:06 p.m. UTC | #12
Hi Amit,


I try to follow the option which add a OF flag. If Rob is ok with this, 
I would suggest to use it instead of V1 patch

diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index dda6092e6d3a..a401ed0463d9 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -382,6 +382,11 @@ void of_node_release(struct kobject *kobj)
                                __func__, node);
         }

+       if (of_node_check_flag(node, OF_CREATED_WITH_CSET)) {
+               of_changeset_revert(node->data);
+               of_changeset_destroy(node->data);
+       }
+
         if (node->child)
                 pr_err("ERROR: %s() unexpected children for %pOF/%s\n",
                         __func__, node->parent, node->full_name);
@@ -507,6 +512,7 @@ struct device_node *of_changeset_create_node(struct 
of_changeset *ocs,
         np = __of_node_dup(NULL, full_name);
         if (!np)
                 return NULL;
+       of_node_set_flag(np, OF_CREATED_WITH_CSET);
         np->parent = parent;

         ret = of_changeset_attach_node(ocs, np);
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index 445ad13dab98..191ab771d9e8 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -895,8 +895,6 @@ static void __init of_unittest_changeset(void)
                  "'%pOF' not added\n", n22);
         of_node_put(np);

-       unittest(!of_changeset_revert(&chgset), "revert failed\n");
-
unittest(!of_find_node_by_path("/testcase-data/changeset/n2/n21"),
                  "'%pOF' still present after revert\n", n21);

@@ -908,8 +906,6 @@ static void __init of_unittest_changeset(void)
         if (!ret)
                 unittest(strcmp(propstr, "hello") == 0, "original value 
not in updated property after revert");

-       of_changeset_destroy(&chgset);
-
         of_node_put(n1);
         of_node_put(n2);
         of_node_put(n21);
diff --git a/drivers/pci/of.c b/drivers/pci/of.c
index 51e3dd0ea5ab..836aaba752bb 100644
--- a/drivers/pci/of.c
+++ b/drivers/pci/of.c
@@ -613,12 +613,10 @@ void of_pci_remove_node(struct pci_dev *pdev)
         struct device_node *np;

         np = pci_device_to_OF_node(pdev);
-       if (!np || !of_node_check_flag(np, OF_DYNAMIC))
+       if (!np || !of_node_check_flag(np, OF_CREATED_WITH_CSET))
                 return;
         pdev->dev.of_node = NULL;

-       of_changeset_revert(np->data);
-       of_changeset_destroy(np->data);
         of_node_put(np);
  }

diff --git a/include/linux/of.h b/include/linux/of.h
index a0bedd038a05..a46317f6626e 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -153,6 +153,7 @@ extern struct device_node *of_stdout;
  #define OF_POPULATED_BUS       4 /* platform bus created for children */
  #define OF_OVERLAY             5 /* allocated for an overlay */
  #define OF_OVERLAY_FREE_CSET   6 /* in overlay cset being freed */
+#define OF_CREATED_WITH_CSET    7 /* created by of_changeset_create_node */

  #define OF_BAD_ADDR    ((u64)-1)

Thanks,

Lizhi

On 7/25/24 10:45, Amit Machhiwal wrote:
> Hi Lizhi, Rob,
>
> Sorry for responding late. I got busy with some other things.
>
> On 2024/07/23 02:08 PM, Lizhi Hou wrote:
>> On 7/23/24 12:54, Rob Herring wrote:
>>> On Tue, Jul 23, 2024 at 12:21 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
>>>> On 7/23/24 09:21, Rob Herring wrote:
>>>>> On Mon, Jul 15, 2024 at 01:52:30PM -0700, Lizhi Hou wrote:
>>>>>> On 7/15/24 11:55, Rob Herring wrote:
>>>>>>> On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
>>>>>>>> With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
>>>>>>>> of a PCI device attached to a PCI-bridge causes following kernel Oops on
>>>>>>>> a pseries KVM guest:
>>>>>>>>
>>>>>>>>      RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
>>>>>>>>      Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
>>>>>>>>      BUG: Unable to handle kernel data access on read at 0x10ec00000048
>>>>>>>>      Faulting instruction address: 0xc0000000012d8728
>>>>>>>>      Oops: Kernel access of bad area, sig: 11 [#1]
>>>>>>>>      LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
>>>>>>>> <snip>
>>>>>>>>      NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
>>>>>>>>      LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
>>>>>>>>      Call Trace:
>>>>>>>>      [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
>>>>>>>>      [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
>>>>>>>>      [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
>>>>>>>>      [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
>>>>>>>>      [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
>>>>>>>>      [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
>>>>>>>>      [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
>>>>>>>>      [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
>>>>>>>>      [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
>>>>>>>>      [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
>>>>>>>>      [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
>>>>>>>>      [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
>>>>>>>> <snip>
>>>>>>>>
>>>>>>>> A git bisect pointed this regression to be introduced via [1] that added
>>>>>>>> a mechanism to create device tree nodes for parent PCI bridges when a
>>>>>>>> PCI device is hot-plugged.
>>>>>>>>
>>>>>>>> The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
>>>>>>>> device-tree node associated with the pci_dev that was earlier
>>>>>>>> hot-plugged and was attached under a pci-bridge. The PCI dev header
>>>>>>>> `dev->hdr_type` being 0, results a conditional check done with
>>>>>>>> `pci_is_bridge()` into false. Consequently, a call to
>>>>>>>> `of_pci_make_dev_node()` to create a device node is never made. When at
>>>>>>>> a later point in time, in the device node removal path, a memcpy is
>>>>>>>> attempted in `__of_changeset_entry_invert()`; since the device node was
>>>>>>>> never created, results in an Oops due to kernel read access to a bad
>>>>>>>> address.
>>>>>>>>
>>>>>>>> To fix this issue, the patch updates `of_changeset_create_node()` to
>>>>>>>> allocate a new node only when the device node doesn't exist and init it
>>>>>>>> in case it does already. Also, introduce `of_pci_free_node()` to be
>>>>>>>> called to only revert and destroy the changeset device node that was
>>>>>>>> created via a call to `of_changeset_create_node()`.
>>>>>>>>
>>>>>>>> [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
>>>>>>>>
>>>>>>>> Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
>>>>>>>> Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
>>>>>>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>>>>>>> Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
>>>>>>>> ---
>>>>>>>> Changes since v1:
>>>>>>>>         * Included Lizhi's suggested changes on V1
>>>>>>>>         * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
>>>>>>>>           part a bit in `of_pci_make_dev_node`
>>>>>>>>             drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
>>>>>>>>               611 | void of_pci_free_node(struct device_node *np)
>>>>>>>>                   |      ^~~~~~~~~~~~~~~~
>>>>>>>>             drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
>>>>>>>>             drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
>>>>>>>>               696 | out_destroy_cset:
>>>>>>>>                   | ^~~~~~~~~~~~~~~~
>>>>>>>>         * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
>>>>>>>>
>>>>>>>>      drivers/of/dynamic.c  | 16 ++++++++++++----
>>>>>>>>      drivers/of/unittest.c |  2 +-
>>>>>>>>      drivers/pci/bus.c     |  3 +--
>>>>>>>>      drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
>>>>>>>>      drivers/pci/pci.h     |  2 ++
>>>>>>>>      include/linux/of.h    |  1 +
>>>>>>>>      6 files changed, 43 insertions(+), 20 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>>>>>>>> index dda6092e6d3a..9bba5e82a384 100644
>>>>>>>> --- a/drivers/of/dynamic.c
>>>>>>>> +++ b/drivers/of/dynamic.c
>>>>>>>> @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
>>>>>>>>       * a given changeset.
>>>>>>>>       *
>>>>>>>>       * @ocs: Pointer to changeset
>>>>>>>> + * @np: Pointer to device node. If null, allocate a new node. If not, init an
>>>>>>>> + *     existing one.
>>>>>>>>       * @parent: Pointer to parent device node
>>>>>>>>       * @full_name: Node full name
>>>>>>>>       *
>>>>>>>>       * Return: Pointer to the created device node or NULL in case of an error.
>>>>>>>>       */
>>>>>>>>      struct device_node *of_changeset_create_node(struct of_changeset *ocs,
>>>>>>>> +                                            struct device_node *np,
>>>>>>>>                                                  struct device_node *parent,
>>>>>>>>                                                  const char *full_name)
>>>>>>>>      {
>>>>>>>> -       struct device_node *np;
>>>>>>>>             int ret;
>>>>>>>>
>>>>>>>> -       np = __of_node_dup(NULL, full_name);
>>>>>>>> -       if (!np)
>>>>>>>> -               return NULL;
>>>>>>>> +       if (!np) {
>>>>>>>> +               np = __of_node_dup(NULL, full_name);
>>>>>>>> +               if (!np)
>>>>>>>> +                       return NULL;
>>>>>>>> +       } else {
>>>>>>>> +               of_node_set_flag(np, OF_DYNAMIC);
>>>>>>>> +               of_node_set_flag(np, OF_DETACHED);
>>>>>>> Are we going to rename the function to
>>>>>>> of_changeset_create_or_maybe_modify_node()? No. The functions here are
>>>>>>> very clear in that they allocate new objects and don't reuse what's
>>>>>>> passed in.
>>>>>> Ok. How about keeping of_changeset_create_node unchanged.
>>>>>>
>>>>>> Instead, call kzalloc(), of_node_init() and of_changeset_attach_node()
>>>>>>
>>>>>> in of_pci_make_dev_node() directly.
>>>>>>
>>>>>> A similar example is dlpar_parse_cc_node().
>>>>>>
>>>>>>
>>>>>> Does this sound better?
>>>>> No, because really that code should be re-written using of_changeset
>>>>> API.
>>>>>
>>>>> My suggestion is add a data pointer to struct of_changeset and then set
>>>>> that to something to know the data ptr is a changeset and is your
>>>>> changeset.
>>>> I do not fully understand the point. I think the issue is that we do not
>>>> know if a given of_node is created by of_pci_make_dev_node(), correct?
>>> Yes.
>>>
>>>> of_node->data can point to anything. And we do not know if it points a
>>>> cset or not.
>>> Right. But instead of checking "of_node->data == of_pci_free_node",
>>> you would just be checking "*(of_node->data) == of_pci_free_node"
>> if of_node->data is a char* pointer, it would be panic. So I used
>> of_node->data == of_pci_free_node.
>>
>>> (omitting a NULL check and cast for simplicity). I suppose in theory
>>> that could have a false match, but that could happen in this patch
>>> already.
>> I think if any other kernel code  put of_pci_free_node to of_node->data, it
>> can be fixed over there.
>>
>>>> Do you mean to add a flag (e.g. OF_PCI_DYNAMIC) to
>>>> indicate of_node->data points to cset?
>>> That would be another option, but OF_PCI_DYNAMIC would not be a good
>>> name because that would be a flag bit for every single caller needing
>>> similar functionality. Name it just what it indicates: of_node->data
>>> points to cset
>>>
>>> If we have that flag, then possibly the DT core can handle more
>>> clean-up itself like calling detach and freeing the changeset.
>>> Ideally, the flags should be internal to the DT code.
>> Sure. If you prefer this option, I will propose another fix.
>>
> The crash in question is a critical issue that we would want to have a fix for
> soon. And while this is still being figured out, is it okay to go with the fix I
> proposed in the V1 of this patch?
>
> Thanks,
> Amit
>
>> Thanks,
>>
>> Lizhi
>>
>>> Rob
Amit Machhiwal July 26, 2024, 5:49 a.m. UTC | #13
Hi Bjorn,

On 2024/07/25 03:55 PM, Bjorn Helgaas wrote:
> On Thu, Jul 25, 2024 at 11:15:39PM +0530, Amit Machhiwal wrote:
> > ...
> > The crash in question is a critical issue that we would want to have
> > a fix for soon. And while this is still being figured out, is it
> > okay to go with the fix I proposed in the V1 of this patch?
> 
> v6.10 has been released already, and it will be a couple months before
> the v6.11 release.
> 
> It looks like the regression is 407d1a51921e, which appeared in v6.6,
> almost a year ago, so it's fairly old.
> 
> What target are you thinking about for the V1 patch?  I guess if we
> add it as a v6.11 post-merge window fix, it might get backported to
> stable kernels before v6.11?  

Yes, I think we can go ahead with taking V1 patch for v6.11 post-merge window to
fix the current bug and ask Ubuntu to pick it while Lizhi's proposed patch goes
under test and review.

Thanks,
Amit
Rob Herring (Arm) July 26, 2024, 11:37 a.m. UTC | #14
+ Ubuntu kernel list, again

On Thu, Jul 25, 2024 at 11:15:39PM +0530, Amit Machhiwal wrote:
> Hi Lizhi, Rob,
> 
> Sorry for responding late. I got busy with some other things.
> 
> On 2024/07/23 02:08 PM, Lizhi Hou wrote:
> > 
> > On 7/23/24 12:54, Rob Herring wrote:
> > > On Tue, Jul 23, 2024 at 12:21 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
> > > > 
> > > > On 7/23/24 09:21, Rob Herring wrote:
> > > > > On Mon, Jul 15, 2024 at 01:52:30PM -0700, Lizhi Hou wrote:
> > > > > > On 7/15/24 11:55, Rob Herring wrote:
> > > > > > > On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
> > > > > > > > With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
> > > > > > > > of a PCI device attached to a PCI-bridge causes following kernel Oops on
> > > > > > > > a pseries KVM guest:
> > > > > > > > 
> > > > > > > >     RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
> > > > > > > >     Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
> > > > > > > >     BUG: Unable to handle kernel data access on read at 0x10ec00000048
> > > > > > > >     Faulting instruction address: 0xc0000000012d8728
> > > > > > > >     Oops: Kernel access of bad area, sig: 11 [#1]
> > > > > > > >     LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> > > > > > > > <snip>
> > > > > > > >     NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
> > > > > > > >     LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
> > > > > > > >     Call Trace:
> > > > > > > >     [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
> > > > > > > >     [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
> > > > > > > >     [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
> > > > > > > >     [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
> > > > > > > >     [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
> > > > > > > >     [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
> > > > > > > >     [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
> > > > > > > >     [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
> > > > > > > >     [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
> > > > > > > >     [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
> > > > > > > >     [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
> > > > > > > >     [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
> > > > > > > > <snip>
> > > > > > > > 
> > > > > > > > A git bisect pointed this regression to be introduced via [1] that added
> > > > > > > > a mechanism to create device tree nodes for parent PCI bridges when a
> > > > > > > > PCI device is hot-plugged.
> > > > > > > > 
> > > > > > > > The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
> > > > > > > > device-tree node associated with the pci_dev that was earlier
> > > > > > > > hot-plugged and was attached under a pci-bridge. The PCI dev header
> > > > > > > > `dev->hdr_type` being 0, results a conditional check done with
> > > > > > > > `pci_is_bridge()` into false. Consequently, a call to
> > > > > > > > `of_pci_make_dev_node()` to create a device node is never made. When at
> > > > > > > > a later point in time, in the device node removal path, a memcpy is
> > > > > > > > attempted in `__of_changeset_entry_invert()`; since the device node was
> > > > > > > > never created, results in an Oops due to kernel read access to a bad
> > > > > > > > address.
> > > > > > > > 
> > > > > > > > To fix this issue, the patch updates `of_changeset_create_node()` to
> > > > > > > > allocate a new node only when the device node doesn't exist and init it
> > > > > > > > in case it does already. Also, introduce `of_pci_free_node()` to be
> > > > > > > > called to only revert and destroy the changeset device node that was
> > > > > > > > created via a call to `of_changeset_create_node()`.
> > > > > > > > 
> > > > > > > > [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
> > > > > > > > 
> > > > > > > > Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
> > > > > > > > Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
> > > > > > > > Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> > > > > > > > Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
> > > > > > > > ---
> > > > > > > > Changes since v1:
> > > > > > > >        * Included Lizhi's suggested changes on V1
> > > > > > > >        * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
> > > > > > > >          part a bit in `of_pci_make_dev_node`
> > > > > > > >            drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
> > > > > > > >              611 | void of_pci_free_node(struct device_node *np)
> > > > > > > >                  |      ^~~~~~~~~~~~~~~~
> > > > > > > >            drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
> > > > > > > >            drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
> > > > > > > >              696 | out_destroy_cset:
> > > > > > > >                  | ^~~~~~~~~~~~~~~~
> > > > > > > >        * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
> > > > > > > > 
> > > > > > > >     drivers/of/dynamic.c  | 16 ++++++++++++----
> > > > > > > >     drivers/of/unittest.c |  2 +-
> > > > > > > >     drivers/pci/bus.c     |  3 +--
> > > > > > > >     drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
> > > > > > > >     drivers/pci/pci.h     |  2 ++
> > > > > > > >     include/linux/of.h    |  1 +
> > > > > > > >     6 files changed, 43 insertions(+), 20 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> > > > > > > > index dda6092e6d3a..9bba5e82a384 100644
> > > > > > > > --- a/drivers/of/dynamic.c
> > > > > > > > +++ b/drivers/of/dynamic.c
> > > > > > > > @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
> > > > > > > >      * a given changeset.
> > > > > > > >      *
> > > > > > > >      * @ocs: Pointer to changeset
> > > > > > > > + * @np: Pointer to device node. If null, allocate a new node. If not, init an
> > > > > > > > + *     existing one.
> > > > > > > >      * @parent: Pointer to parent device node
> > > > > > > >      * @full_name: Node full name
> > > > > > > >      *
> > > > > > > >      * Return: Pointer to the created device node or NULL in case of an error.
> > > > > > > >      */
> > > > > > > >     struct device_node *of_changeset_create_node(struct of_changeset *ocs,
> > > > > > > > +                                            struct device_node *np,
> > > > > > > >                                                 struct device_node *parent,
> > > > > > > >                                                 const char *full_name)
> > > > > > > >     {
> > > > > > > > -       struct device_node *np;
> > > > > > > >            int ret;
> > > > > > > > 
> > > > > > > > -       np = __of_node_dup(NULL, full_name);
> > > > > > > > -       if (!np)
> > > > > > > > -               return NULL;
> > > > > > > > +       if (!np) {
> > > > > > > > +               np = __of_node_dup(NULL, full_name);
> > > > > > > > +               if (!np)
> > > > > > > > +                       return NULL;
> > > > > > > > +       } else {
> > > > > > > > +               of_node_set_flag(np, OF_DYNAMIC);
> > > > > > > > +               of_node_set_flag(np, OF_DETACHED);
> > > > > > > Are we going to rename the function to
> > > > > > > of_changeset_create_or_maybe_modify_node()? No. The functions here are
> > > > > > > very clear in that they allocate new objects and don't reuse what's
> > > > > > > passed in.
> > > > > > Ok. How about keeping of_changeset_create_node unchanged.
> > > > > > 
> > > > > > Instead, call kzalloc(), of_node_init() and of_changeset_attach_node()
> > > > > > 
> > > > > > in of_pci_make_dev_node() directly.
> > > > > > 
> > > > > > A similar example is dlpar_parse_cc_node().
> > > > > > 
> > > > > > 
> > > > > > Does this sound better?
> > > > > No, because really that code should be re-written using of_changeset
> > > > > API.
> > > > > 
> > > > > My suggestion is add a data pointer to struct of_changeset and then set
> > > > > that to something to know the data ptr is a changeset and is your
> > > > > changeset.
> > > > I do not fully understand the point. I think the issue is that we do not
> > > > know if a given of_node is created by of_pci_make_dev_node(), correct?
> > > Yes.
> > > 
> > > > of_node->data can point to anything. And we do not know if it points a
> > > > cset or not.
> > > Right. But instead of checking "of_node->data == of_pci_free_node",
> > > you would just be checking "*(of_node->data) == of_pci_free_node"
> > 
> > if of_node->data is a char* pointer, it would be panic. So I used
> > of_node->data == of_pci_free_node.
> > 
> > > (omitting a NULL check and cast for simplicity). I suppose in theory
> > > that could have a false match, but that could happen in this patch
> > > already.
> > 
> > I think if any other kernel code  put of_pci_free_node to of_node->data, it
> > can be fixed over there.
> > 
> > > 
> > > > Do you mean to add a flag (e.g. OF_PCI_DYNAMIC) to
> > > > indicate of_node->data points to cset?
> > > That would be another option, but OF_PCI_DYNAMIC would not be a good
> > > name because that would be a flag bit for every single caller needing
> > > similar functionality. Name it just what it indicates: of_node->data
> > > points to cset
> > > 
> > > If we have that flag, then possibly the DT core can handle more
> > > clean-up itself like calling detach and freeing the changeset.
> > > Ideally, the flags should be internal to the DT code.
> > 
> > Sure. If you prefer this option, I will propose another fix.
> > 
> 
> The crash in question is a critical issue that we would want to have a fix for
> soon. And while this is still being figured out, is it okay to go with the fix I
> proposed in the V1 of this patch?

No, sorry but this is not critical. This config option should be off. 
Fix the distro. They turned it on. If hiding it behind CONFIG_EXPERT or 
something would work, that would be fine for upstream. But I suspect 
Ubuntu turns that on because they turn on everything...

Rob
Michael Ellerman July 26, 2024, 12:49 p.m. UTC | #15
Amit Machhiwal <amachhiw@linux.ibm.com> writes:
> Hi Bjorn,
>
> On 2024/07/25 03:55 PM, Bjorn Helgaas wrote:
>> On Thu, Jul 25, 2024 at 11:15:39PM +0530, Amit Machhiwal wrote:
>> > ...
>> > The crash in question is a critical issue that we would want to have
>> > a fix for soon. And while this is still being figured out, is it
>> > okay to go with the fix I proposed in the V1 of this patch?
>> 
>> v6.10 has been released already, and it will be a couple months before
>> the v6.11 release.
>> 
>> It looks like the regression is 407d1a51921e, which appeared in v6.6,
>> almost a year ago, so it's fairly old.
>> 
>> What target are you thinking about for the V1 patch?  I guess if we
>> add it as a v6.11 post-merge window fix, it might get backported to
>> stable kernels before v6.11?  
>
> Yes, I think we can go ahead with taking V1 patch for v6.11 post-merge window to
> fix the current bug and ask Ubuntu to pick it while Lizhi's proposed patch goes
> under test and review.

Lizhi's proposed patch (v3?) looks pretty small and straight forward.
It should be possible to get it tested and reviewed and merge it as a
fix during the v6.11-rc series.

Or if the CONFIG option is completely broken as Rob suggests then it
should just be forced off in Kconfig.

cheers
Rob Herring (Arm) July 26, 2024, 5:52 p.m. UTC | #16
On Thu, Jul 25, 2024 at 6:06 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
>
> Hi Amit,
>
>
> I try to follow the option which add a OF flag. If Rob is ok with this,
> I would suggest to use it instead of V1 patch
>
> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> index dda6092e6d3a..a401ed0463d9 100644
> --- a/drivers/of/dynamic.c
> +++ b/drivers/of/dynamic.c
> @@ -382,6 +382,11 @@ void of_node_release(struct kobject *kobj)
>                                 __func__, node);
>          }
>
> +       if (of_node_check_flag(node, OF_CREATED_WITH_CSET)) {
> +               of_changeset_revert(node->data);
> +               of_changeset_destroy(node->data);
> +       }

What happens if multiple nodes are created in the changeset?

> +
>          if (node->child)
>                  pr_err("ERROR: %s() unexpected children for %pOF/%s\n",
>                          __func__, node->parent, node->full_name);
> @@ -507,6 +512,7 @@ struct device_node *of_changeset_create_node(struct
> of_changeset *ocs,
>          np = __of_node_dup(NULL, full_name);
>          if (!np)
>                  return NULL;
> +       of_node_set_flag(np, OF_CREATED_WITH_CSET);

This should be set where the data ptr is set.

Rob
Lizhi Hou July 26, 2024, 6:45 p.m. UTC | #17
On 7/26/24 10:52, Rob Herring wrote:
> On Thu, Jul 25, 2024 at 6:06 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
>> Hi Amit,
>>
>>
>> I try to follow the option which add a OF flag. If Rob is ok with this,
>> I would suggest to use it instead of V1 patch
>>
>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>> index dda6092e6d3a..a401ed0463d9 100644
>> --- a/drivers/of/dynamic.c
>> +++ b/drivers/of/dynamic.c
>> @@ -382,6 +382,11 @@ void of_node_release(struct kobject *kobj)
>>                                  __func__, node);
>>           }
>>
>> +       if (of_node_check_flag(node, OF_CREATED_WITH_CSET)) {
>> +               of_changeset_revert(node->data);
>> +               of_changeset_destroy(node->data);
>> +       }
> What happens if multiple nodes are created in the changeset?
Ok. multiple nodes will not work.
>
>> +
>>           if (node->child)
>>                   pr_err("ERROR: %s() unexpected children for %pOF/%s\n",
>>                           __func__, node->parent, node->full_name);
>> @@ -507,6 +512,7 @@ struct device_node *of_changeset_create_node(struct
>> of_changeset *ocs,
>>           np = __of_node_dup(NULL, full_name);
>>           if (!np)
>>                   return NULL;
>> +       of_node_set_flag(np, OF_CREATED_WITH_CSET);
> This should be set where the data ptr is set.

Ok. It sounds the fix could be simplified to 3 lines change.


diff --git a/drivers/pci/of.c b/drivers/pci/of.c
index 51e3dd0ea5ab..0b3ba1e1b18c 100644
--- a/drivers/pci/of.c
+++ b/drivers/pci/of.c
@@ -613,7 +613,7 @@ void of_pci_remove_node(struct pci_dev *pdev)
         struct device_node *np;

         np = pci_device_to_OF_node(pdev);
-       if (!np || !of_node_check_flag(np, OF_DYNAMIC))
+       if (!np || !of_node_check_flag(np, OF_CREATED_WITH_CSET))
                 return;
         pdev->dev.of_node = NULL;

@@ -672,6 +672,7 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
         if (ret)
                 goto out_free_node;

+       of_node_set_flag(np, OF_CREATED_WITH_CSET);
         np->data = cset;
         pdev->dev.of_node = np;
         kfree(name);
diff --git a/include/linux/of.h b/include/linux/of.h
index a0bedd038a05..a46317f6626e 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -153,6 +153,7 @@ extern struct device_node *of_stdout;
  #define OF_POPULATED_BUS       4 /* platform bus created for children */
  #define OF_OVERLAY             5 /* allocated for an overlay */
  #define OF_OVERLAY_FREE_CSET   6 /* in overlay cset being freed */
+#define OF_CREATED_WITH_CSET    7 /* created by of_changeset_create_node */

  #define OF_BAD_ADDR    ((u64)-1)


Lizhi

>
> Rob
Amit Machhiwal July 29, 2024, 11:13 a.m. UTC | #18
Hi Lizhi,

On 2024/07/26 11:45 AM, Lizhi Hou wrote:
> 
> On 7/26/24 10:52, Rob Herring wrote:
> > On Thu, Jul 25, 2024 at 6:06 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
> > > Hi Amit,
> > > 
> > > 
> > > I try to follow the option which add a OF flag. If Rob is ok with this,
> > > I would suggest to use it instead of V1 patch
> > > 
> > > diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> > > index dda6092e6d3a..a401ed0463d9 100644
> > > --- a/drivers/of/dynamic.c
> > > +++ b/drivers/of/dynamic.c
> > > @@ -382,6 +382,11 @@ void of_node_release(struct kobject *kobj)
> > >                                  __func__, node);
> > >           }
> > > 
> > > +       if (of_node_check_flag(node, OF_CREATED_WITH_CSET)) {
> > > +               of_changeset_revert(node->data);
> > > +               of_changeset_destroy(node->data);
> > > +       }
> > What happens if multiple nodes are created in the changeset?
> Ok. multiple nodes will not work.
> > 
> > > +
> > >           if (node->child)
> > >                   pr_err("ERROR: %s() unexpected children for %pOF/%s\n",
> > >                           __func__, node->parent, node->full_name);
> > > @@ -507,6 +512,7 @@ struct device_node *of_changeset_create_node(struct
> > > of_changeset *ocs,
> > >           np = __of_node_dup(NULL, full_name);
> > >           if (!np)
> > >                   return NULL;
> > > +       of_node_set_flag(np, OF_CREATED_WITH_CSET);
> > This should be set where the data ptr is set.
> 
> Ok. It sounds the fix could be simplified to 3 lines change.

Thanks for the patch. The hot-plug and hot-unplug of PCI device seem to work
fine as expected. I see this patch would attempt to remove only the nodes which
were created in `of_pci_make_dev_node()` with the help of the newly introduced
flag, which looks good to me.

Also, since a call to `of_pci_make_dev_node()` from `pci_bus_add_device()`, that
creates devices nodes only for bridge devices, is conditional on
`pci_is_bridge()`, it only makes sense to retain the logical symmetry and call
`of_pci_remove_node()` conditionally on `pci_is_bridge()` as well in
`pci_stop_dev()`. Hence, I would like to propose the below change along with the
above patch:

diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 910387e5bdbf..c6394bf562cd 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -23,7 +23,8 @@ static void pci_stop_dev(struct pci_dev *dev)
                device_release_driver(&dev->dev);
                pci_proc_detach_device(dev);
                pci_remove_sysfs_dev_files(dev);
-               of_pci_remove_node(dev);
+               if (pci_is_bridge(dev))
+                       of_pci_remove_node(dev);
 
                pci_dev_assign_added(dev, false);
        }

Please let me know of your thoughts on this and based on that I can spin the v3
of this patch.

In addition to this, can this patch be taken as part of 6.11 as a bug fix?

Thanks,
Amit

> 
> 
> diff --git a/drivers/pci/of.c b/drivers/pci/of.c
> index 51e3dd0ea5ab..0b3ba1e1b18c 100644
> --- a/drivers/pci/of.c
> +++ b/drivers/pci/of.c
> @@ -613,7 +613,7 @@ void of_pci_remove_node(struct pci_dev *pdev)
>         struct device_node *np;
> 
>         np = pci_device_to_OF_node(pdev);
> -       if (!np || !of_node_check_flag(np, OF_DYNAMIC))
> +       if (!np || !of_node_check_flag(np, OF_CREATED_WITH_CSET))
>                 return;
>         pdev->dev.of_node = NULL;
> 
> @@ -672,6 +672,7 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
>         if (ret)
>                 goto out_free_node;
> 
> +       of_node_set_flag(np, OF_CREATED_WITH_CSET);
>         np->data = cset;
>         pdev->dev.of_node = np;
>         kfree(name);
> diff --git a/include/linux/of.h b/include/linux/of.h
> index a0bedd038a05..a46317f6626e 100644
> --- a/include/linux/of.h
> +++ b/include/linux/of.h
> @@ -153,6 +153,7 @@ extern struct device_node *of_stdout;
>  #define OF_POPULATED_BUS       4 /* platform bus created for children */
>  #define OF_OVERLAY             5 /* allocated for an overlay */
>  #define OF_OVERLAY_FREE_CSET   6 /* in overlay cset being freed */
> +#define OF_CREATED_WITH_CSET    7 /* created by of_changeset_create_node */
> 
>  #define OF_BAD_ADDR    ((u64)-1)
> 
> 
> Lizhi
> 
> > 
> > Rob
Stefan Bader July 29, 2024, 1:27 p.m. UTC | #19
On 26.07.24 13:37, Rob Herring wrote:
> + Ubuntu kernel list, again
> 
> On Thu, Jul 25, 2024 at 11:15:39PM +0530, Amit Machhiwal wrote:
>> Hi Lizhi, Rob,
>>
>> Sorry for responding late. I got busy with some other things.
>>
>> On 2024/07/23 02:08 PM, Lizhi Hou wrote:
>>>
>>> On 7/23/24 12:54, Rob Herring wrote:
>>>> On Tue, Jul 23, 2024 at 12:21 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
>>>>>
>>>>> On 7/23/24 09:21, Rob Herring wrote:
>>>>>> On Mon, Jul 15, 2024 at 01:52:30PM -0700, Lizhi Hou wrote:
>>>>>>> On 7/15/24 11:55, Rob Herring wrote:
>>>>>>>> On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
>>>>>>>>> With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
>>>>>>>>> of a PCI device attached to a PCI-bridge causes following kernel Oops on
>>>>>>>>> a pseries KVM guest:
>>>>>>>>>
>>>>>>>>>      RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
>>>>>>>>>      Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
>>>>>>>>>      BUG: Unable to handle kernel data access on read at 0x10ec00000048
>>>>>>>>>      Faulting instruction address: 0xc0000000012d8728
>>>>>>>>>      Oops: Kernel access of bad area, sig: 11 [#1]
>>>>>>>>>      LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
>>>>>>>>> <snip>
>>>>>>>>>      NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
>>>>>>>>>      LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
>>>>>>>>>      Call Trace:
>>>>>>>>>      [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
>>>>>>>>>      [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
>>>>>>>>>      [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
>>>>>>>>>      [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
>>>>>>>>>      [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
>>>>>>>>>      [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
>>>>>>>>>      [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
>>>>>>>>>      [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
>>>>>>>>>      [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
>>>>>>>>>      [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
>>>>>>>>>      [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
>>>>>>>>>      [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
>>>>>>>>> <snip>
>>>>>>>>>
>>>>>>>>> A git bisect pointed this regression to be introduced via [1] that added
>>>>>>>>> a mechanism to create device tree nodes for parent PCI bridges when a
>>>>>>>>> PCI device is hot-plugged.
>>>>>>>>>
>>>>>>>>> The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
>>>>>>>>> device-tree node associated with the pci_dev that was earlier
>>>>>>>>> hot-plugged and was attached under a pci-bridge. The PCI dev header
>>>>>>>>> `dev->hdr_type` being 0, results a conditional check done with
>>>>>>>>> `pci_is_bridge()` into false. Consequently, a call to
>>>>>>>>> `of_pci_make_dev_node()` to create a device node is never made. When at
>>>>>>>>> a later point in time, in the device node removal path, a memcpy is
>>>>>>>>> attempted in `__of_changeset_entry_invert()`; since the device node was
>>>>>>>>> never created, results in an Oops due to kernel read access to a bad
>>>>>>>>> address.
>>>>>>>>>
>>>>>>>>> To fix this issue, the patch updates `of_changeset_create_node()` to
>>>>>>>>> allocate a new node only when the device node doesn't exist and init it
>>>>>>>>> in case it does already. Also, introduce `of_pci_free_node()` to be
>>>>>>>>> called to only revert and destroy the changeset device node that was
>>>>>>>>> created via a call to `of_changeset_create_node()`.
>>>>>>>>>
>>>>>>>>> [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
>>>>>>>>>
>>>>>>>>> Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
>>>>>>>>> Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
>>>>>>>>> Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
>>>>>>>>> Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
>>>>>>>>> ---
>>>>>>>>> Changes since v1:
>>>>>>>>>         * Included Lizhi's suggested changes on V1
>>>>>>>>>         * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
>>>>>>>>>           part a bit in `of_pci_make_dev_node`
>>>>>>>>>             drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
>>>>>>>>>               611 | void of_pci_free_node(struct device_node *np)
>>>>>>>>>                   |      ^~~~~~~~~~~~~~~~
>>>>>>>>>             drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
>>>>>>>>>             drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
>>>>>>>>>               696 | out_destroy_cset:
>>>>>>>>>                   | ^~~~~~~~~~~~~~~~
>>>>>>>>>         * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
>>>>>>>>>
>>>>>>>>>      drivers/of/dynamic.c  | 16 ++++++++++++----
>>>>>>>>>      drivers/of/unittest.c |  2 +-
>>>>>>>>>      drivers/pci/bus.c     |  3 +--
>>>>>>>>>      drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
>>>>>>>>>      drivers/pci/pci.h     |  2 ++
>>>>>>>>>      include/linux/of.h    |  1 +
>>>>>>>>>      6 files changed, 43 insertions(+), 20 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>>>>>>>>> index dda6092e6d3a..9bba5e82a384 100644
>>>>>>>>> --- a/drivers/of/dynamic.c
>>>>>>>>> +++ b/drivers/of/dynamic.c
>>>>>>>>> @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
>>>>>>>>>       * a given changeset.
>>>>>>>>>       *
>>>>>>>>>       * @ocs: Pointer to changeset
>>>>>>>>> + * @np: Pointer to device node. If null, allocate a new node. If not, init an
>>>>>>>>> + *     existing one.
>>>>>>>>>       * @parent: Pointer to parent device node
>>>>>>>>>       * @full_name: Node full name
>>>>>>>>>       *
>>>>>>>>>       * Return: Pointer to the created device node or NULL in case of an error.
>>>>>>>>>       */
>>>>>>>>>      struct device_node *of_changeset_create_node(struct of_changeset *ocs,
>>>>>>>>> +                                            struct device_node *np,
>>>>>>>>>                                                  struct device_node *parent,
>>>>>>>>>                                                  const char *full_name)
>>>>>>>>>      {
>>>>>>>>> -       struct device_node *np;
>>>>>>>>>             int ret;
>>>>>>>>>
>>>>>>>>> -       np = __of_node_dup(NULL, full_name);
>>>>>>>>> -       if (!np)
>>>>>>>>> -               return NULL;
>>>>>>>>> +       if (!np) {
>>>>>>>>> +               np = __of_node_dup(NULL, full_name);
>>>>>>>>> +               if (!np)
>>>>>>>>> +                       return NULL;
>>>>>>>>> +       } else {
>>>>>>>>> +               of_node_set_flag(np, OF_DYNAMIC);
>>>>>>>>> +               of_node_set_flag(np, OF_DETACHED);
>>>>>>>> Are we going to rename the function to
>>>>>>>> of_changeset_create_or_maybe_modify_node()? No. The functions here are
>>>>>>>> very clear in that they allocate new objects and don't reuse what's
>>>>>>>> passed in.
>>>>>>> Ok. How about keeping of_changeset_create_node unchanged.
>>>>>>>
>>>>>>> Instead, call kzalloc(), of_node_init() and of_changeset_attach_node()
>>>>>>>
>>>>>>> in of_pci_make_dev_node() directly.
>>>>>>>
>>>>>>> A similar example is dlpar_parse_cc_node().
>>>>>>>
>>>>>>>
>>>>>>> Does this sound better?
>>>>>> No, because really that code should be re-written using of_changeset
>>>>>> API.
>>>>>>
>>>>>> My suggestion is add a data pointer to struct of_changeset and then set
>>>>>> that to something to know the data ptr is a changeset and is your
>>>>>> changeset.
>>>>> I do not fully understand the point. I think the issue is that we do not
>>>>> know if a given of_node is created by of_pci_make_dev_node(), correct?
>>>> Yes.
>>>>
>>>>> of_node->data can point to anything. And we do not know if it points a
>>>>> cset or not.
>>>> Right. But instead of checking "of_node->data == of_pci_free_node",
>>>> you would just be checking "*(of_node->data) == of_pci_free_node"
>>>
>>> if of_node->data is a char* pointer, it would be panic. So I used
>>> of_node->data == of_pci_free_node.
>>>
>>>> (omitting a NULL check and cast for simplicity). I suppose in theory
>>>> that could have a false match, but that could happen in this patch
>>>> already.
>>>
>>> I think if any other kernel code  put of_pci_free_node to of_node->data, it
>>> can be fixed over there.
>>>
>>>>
>>>>> Do you mean to add a flag (e.g. OF_PCI_DYNAMIC) to
>>>>> indicate of_node->data points to cset?
>>>> That would be another option, but OF_PCI_DYNAMIC would not be a good
>>>> name because that would be a flag bit for every single caller needing
>>>> similar functionality. Name it just what it indicates: of_node->data
>>>> points to cset
>>>>
>>>> If we have that flag, then possibly the DT core can handle more
>>>> clean-up itself like calling detach and freeing the changeset.
>>>> Ideally, the flags should be internal to the DT code.
>>>
>>> Sure. If you prefer this option, I will propose another fix.
>>>
>>
>> The crash in question is a critical issue that we would want to have a fix for
>> soon. And while this is still being figured out, is it okay to go with the fix I
>> proposed in the V1 of this patch?
> 
> No, sorry but this is not critical. This config option should be off.
> Fix the distro. They turned it on. If hiding it behind CONFIG_EXPERT or
> something would work, that would be fine for upstream. But I suspect
> Ubuntu turns that on because they turn on everything...

Likely that was the case. Noted and we will turn it off in one of the 
next updates.
Thanks,

Stefan
> 
> Rob
>
Lizhi Hou July 29, 2024, 4:47 p.m. UTC | #20
Hi Amit

On 7/29/24 04:13, Amit Machhiwal wrote:
> Hi Lizhi,
>
> On 2024/07/26 11:45 AM, Lizhi Hou wrote:
>> On 7/26/24 10:52, Rob Herring wrote:
>>> On Thu, Jul 25, 2024 at 6:06 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
>>>> Hi Amit,
>>>>
>>>>
>>>> I try to follow the option which add a OF flag. If Rob is ok with this,
>>>> I would suggest to use it instead of V1 patch
>>>>
>>>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>>>> index dda6092e6d3a..a401ed0463d9 100644
>>>> --- a/drivers/of/dynamic.c
>>>> +++ b/drivers/of/dynamic.c
>>>> @@ -382,6 +382,11 @@ void of_node_release(struct kobject *kobj)
>>>>                                   __func__, node);
>>>>            }
>>>>
>>>> +       if (of_node_check_flag(node, OF_CREATED_WITH_CSET)) {
>>>> +               of_changeset_revert(node->data);
>>>> +               of_changeset_destroy(node->data);
>>>> +       }
>>> What happens if multiple nodes are created in the changeset?
>> Ok. multiple nodes will not work.
>>>> +
>>>>            if (node->child)
>>>>                    pr_err("ERROR: %s() unexpected children for %pOF/%s\n",
>>>>                            __func__, node->parent, node->full_name);
>>>> @@ -507,6 +512,7 @@ struct device_node *of_changeset_create_node(struct
>>>> of_changeset *ocs,
>>>>            np = __of_node_dup(NULL, full_name);
>>>>            if (!np)
>>>>                    return NULL;
>>>> +       of_node_set_flag(np, OF_CREATED_WITH_CSET);
>>> This should be set where the data ptr is set.
>> Ok. It sounds the fix could be simplified to 3 lines change.
> Thanks for the patch. The hot-plug and hot-unplug of PCI device seem to work
> fine as expected. I see this patch would attempt to remove only the nodes which
> were created in `of_pci_make_dev_node()` with the help of the newly introduced
> flag, which looks good to me.
>
> Also, since a call to `of_pci_make_dev_node()` from `pci_bus_add_device()`, that
> creates devices nodes only for bridge devices, is conditional on
> `pci_is_bridge()`, it only makes sense to retain the logical symmetry and call
> `of_pci_remove_node()` conditionally on `pci_is_bridge()` as well in
> `pci_stop_dev()`. Hence, I would like to propose the below change along with the
> above patch:
>
> diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
> index 910387e5bdbf..c6394bf562cd 100644
> --- a/drivers/pci/remove.c
> +++ b/drivers/pci/remove.c
> @@ -23,7 +23,8 @@ static void pci_stop_dev(struct pci_dev *dev)
>                  device_release_driver(&dev->dev);
>                  pci_proc_detach_device(dev);
>                  pci_remove_sysfs_dev_files(dev);
> -               of_pci_remove_node(dev);
> +               if (pci_is_bridge(dev))
> +                       of_pci_remove_node(dev);
>   
>                  pci_dev_assign_added(dev, false);
>          }
>
> Please let me know of your thoughts on this and based on that I can spin the v3
> of this patch.

As I mentioned, there are endpoints in pci quirks (pci/quirks.c) will 
also create nodes by of_pci_make_dev_node(). So please remove above two 
lines.


Thanks,

Lizhi

>
> In addition to this, can this patch be taken as part of 6.11 as a bug fix?
>
> Thanks,
> Amit
>
>>
>> diff --git a/drivers/pci/of.c b/drivers/pci/of.c
>> index 51e3dd0ea5ab..0b3ba1e1b18c 100644
>> --- a/drivers/pci/of.c
>> +++ b/drivers/pci/of.c
>> @@ -613,7 +613,7 @@ void of_pci_remove_node(struct pci_dev *pdev)
>>          struct device_node *np;
>>
>>          np = pci_device_to_OF_node(pdev);
>> -       if (!np || !of_node_check_flag(np, OF_DYNAMIC))
>> +       if (!np || !of_node_check_flag(np, OF_CREATED_WITH_CSET))
>>                  return;
>>          pdev->dev.of_node = NULL;
>>
>> @@ -672,6 +672,7 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
>>          if (ret)
>>                  goto out_free_node;
>>
>> +       of_node_set_flag(np, OF_CREATED_WITH_CSET);
>>          np->data = cset;
>>          pdev->dev.of_node = np;
>>          kfree(name);
>> diff --git a/include/linux/of.h b/include/linux/of.h
>> index a0bedd038a05..a46317f6626e 100644
>> --- a/include/linux/of.h
>> +++ b/include/linux/of.h
>> @@ -153,6 +153,7 @@ extern struct device_node *of_stdout;
>>   #define OF_POPULATED_BUS       4 /* platform bus created for children */
>>   #define OF_OVERLAY             5 /* allocated for an overlay */
>>   #define OF_OVERLAY_FREE_CSET   6 /* in overlay cset being freed */
>> +#define OF_CREATED_WITH_CSET    7 /* created by of_changeset_create_node */
>>
>>   #define OF_BAD_ADDR    ((u64)-1)
>>
>>
>> Lizhi
>>
>>> Rob
Amit Machhiwal July 29, 2024, 4:55 p.m. UTC | #21
Hi Lizhi,

On 2024/07/29 09:47 AM, Lizhi Hou wrote:
> Hi Amit
> 
> On 7/29/24 04:13, Amit Machhiwal wrote:
> > Hi Lizhi,
> > 
> > On 2024/07/26 11:45 AM, Lizhi Hou wrote:
> > > On 7/26/24 10:52, Rob Herring wrote:
> > > > On Thu, Jul 25, 2024 at 6:06 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
> > > > > Hi Amit,
> > > > > 
> > > > > 
> > > > > I try to follow the option which add a OF flag. If Rob is ok with this,
> > > > > I would suggest to use it instead of V1 patch
> > > > > 
> > > > > diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> > > > > index dda6092e6d3a..a401ed0463d9 100644
> > > > > --- a/drivers/of/dynamic.c
> > > > > +++ b/drivers/of/dynamic.c
> > > > > @@ -382,6 +382,11 @@ void of_node_release(struct kobject *kobj)
> > > > >                                   __func__, node);
> > > > >            }
> > > > > 
> > > > > +       if (of_node_check_flag(node, OF_CREATED_WITH_CSET)) {
> > > > > +               of_changeset_revert(node->data);
> > > > > +               of_changeset_destroy(node->data);
> > > > > +       }
> > > > What happens if multiple nodes are created in the changeset?
> > > Ok. multiple nodes will not work.
> > > > > +
> > > > >            if (node->child)
> > > > >                    pr_err("ERROR: %s() unexpected children for %pOF/%s\n",
> > > > >                            __func__, node->parent, node->full_name);
> > > > > @@ -507,6 +512,7 @@ struct device_node *of_changeset_create_node(struct
> > > > > of_changeset *ocs,
> > > > >            np = __of_node_dup(NULL, full_name);
> > > > >            if (!np)
> > > > >                    return NULL;
> > > > > +       of_node_set_flag(np, OF_CREATED_WITH_CSET);
> > > > This should be set where the data ptr is set.
> > > Ok. It sounds the fix could be simplified to 3 lines change.
> > Thanks for the patch. The hot-plug and hot-unplug of PCI device seem to work
> > fine as expected. I see this patch would attempt to remove only the nodes which
> > were created in `of_pci_make_dev_node()` with the help of the newly introduced
> > flag, which looks good to me.
> > 
> > Also, since a call to `of_pci_make_dev_node()` from `pci_bus_add_device()`, that
> > creates devices nodes only for bridge devices, is conditional on
> > `pci_is_bridge()`, it only makes sense to retain the logical symmetry and call
> > `of_pci_remove_node()` conditionally on `pci_is_bridge()` as well in
> > `pci_stop_dev()`. Hence, I would like to propose the below change along with the
> > above patch:
> > 
> > diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
> > index 910387e5bdbf..c6394bf562cd 100644
> > --- a/drivers/pci/remove.c
> > +++ b/drivers/pci/remove.c
> > @@ -23,7 +23,8 @@ static void pci_stop_dev(struct pci_dev *dev)
> >                  device_release_driver(&dev->dev);
> >                  pci_proc_detach_device(dev);
> >                  pci_remove_sysfs_dev_files(dev);
> > -               of_pci_remove_node(dev);
> > +               if (pci_is_bridge(dev))
> > +                       of_pci_remove_node(dev);
> >                  pci_dev_assign_added(dev, false);
> >          }
> > 
> > Please let me know of your thoughts on this and based on that I can spin the v3
> > of this patch.
> 
> As I mentioned, there are endpoints in pci quirks (pci/quirks.c) will also
> create nodes by of_pci_make_dev_node(). So please remove above two lines.

Sorry if I'm misinterpreting something here but as I mentioned,
`of_pci_make_dev_node()` is called only for bridge devices with check performed
via `pci_is_bridge()`, could you please elaborate more on why the same check
can't be put while removing the node via `of_pci_remove_node()`?

Thanks,
Amit

> 
> Thanks,
> 
> Lizhi
> 
> > 
> > In addition to this, can this patch be taken as part of 6.11 as a bug fix?
> > 
> > Thanks,
> > Amit
> > 
> > > 
> > > diff --git a/drivers/pci/of.c b/drivers/pci/of.c
> > > index 51e3dd0ea5ab..0b3ba1e1b18c 100644
> > > --- a/drivers/pci/of.c
> > > +++ b/drivers/pci/of.c
> > > @@ -613,7 +613,7 @@ void of_pci_remove_node(struct pci_dev *pdev)
> > >          struct device_node *np;
> > > 
> > >          np = pci_device_to_OF_node(pdev);
> > > -       if (!np || !of_node_check_flag(np, OF_DYNAMIC))
> > > +       if (!np || !of_node_check_flag(np, OF_CREATED_WITH_CSET))
> > >                  return;
> > >          pdev->dev.of_node = NULL;
> > > 
> > > @@ -672,6 +672,7 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
> > >          if (ret)
> > >                  goto out_free_node;
> > > 
> > > +       of_node_set_flag(np, OF_CREATED_WITH_CSET);
> > >          np->data = cset;
> > >          pdev->dev.of_node = np;
> > >          kfree(name);
> > > diff --git a/include/linux/of.h b/include/linux/of.h
> > > index a0bedd038a05..a46317f6626e 100644
> > > --- a/include/linux/of.h
> > > +++ b/include/linux/of.h
> > > @@ -153,6 +153,7 @@ extern struct device_node *of_stdout;
> > >   #define OF_POPULATED_BUS       4 /* platform bus created for children */
> > >   #define OF_OVERLAY             5 /* allocated for an overlay */
> > >   #define OF_OVERLAY_FREE_CSET   6 /* in overlay cset being freed */
> > > +#define OF_CREATED_WITH_CSET    7 /* created by of_changeset_create_node */
> > > 
> > >   #define OF_BAD_ADDR    ((u64)-1)
> > > 
> > > 
> > > Lizhi
> > > 
> > > > Rob
Lizhi Hou July 29, 2024, 6:19 p.m. UTC | #22
On 7/29/24 09:55, Amit Machhiwal wrote:
> Hi Lizhi,
>
> On 2024/07/29 09:47 AM, Lizhi Hou wrote:
>> Hi Amit
>>
>> On 7/29/24 04:13, Amit Machhiwal wrote:
>>> Hi Lizhi,
>>>
>>> On 2024/07/26 11:45 AM, Lizhi Hou wrote:
>>>> On 7/26/24 10:52, Rob Herring wrote:
>>>>> On Thu, Jul 25, 2024 at 6:06 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
>>>>>> Hi Amit,
>>>>>>
>>>>>>
>>>>>> I try to follow the option which add a OF flag. If Rob is ok with this,
>>>>>> I would suggest to use it instead of V1 patch
>>>>>>
>>>>>> diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
>>>>>> index dda6092e6d3a..a401ed0463d9 100644
>>>>>> --- a/drivers/of/dynamic.c
>>>>>> +++ b/drivers/of/dynamic.c
>>>>>> @@ -382,6 +382,11 @@ void of_node_release(struct kobject *kobj)
>>>>>>                                    __func__, node);
>>>>>>             }
>>>>>>
>>>>>> +       if (of_node_check_flag(node, OF_CREATED_WITH_CSET)) {
>>>>>> +               of_changeset_revert(node->data);
>>>>>> +               of_changeset_destroy(node->data);
>>>>>> +       }
>>>>> What happens if multiple nodes are created in the changeset?
>>>> Ok. multiple nodes will not work.
>>>>>> +
>>>>>>             if (node->child)
>>>>>>                     pr_err("ERROR: %s() unexpected children for %pOF/%s\n",
>>>>>>                             __func__, node->parent, node->full_name);
>>>>>> @@ -507,6 +512,7 @@ struct device_node *of_changeset_create_node(struct
>>>>>> of_changeset *ocs,
>>>>>>             np = __of_node_dup(NULL, full_name);
>>>>>>             if (!np)
>>>>>>                     return NULL;
>>>>>> +       of_node_set_flag(np, OF_CREATED_WITH_CSET);
>>>>> This should be set where the data ptr is set.
>>>> Ok. It sounds the fix could be simplified to 3 lines change.
>>> Thanks for the patch. The hot-plug and hot-unplug of PCI device seem to work
>>> fine as expected. I see this patch would attempt to remove only the nodes which
>>> were created in `of_pci_make_dev_node()` with the help of the newly introduced
>>> flag, which looks good to me.
>>>
>>> Also, since a call to `of_pci_make_dev_node()` from `pci_bus_add_device()`, that
>>> creates devices nodes only for bridge devices, is conditional on
>>> `pci_is_bridge()`, it only makes sense to retain the logical symmetry and call
>>> `of_pci_remove_node()` conditionally on `pci_is_bridge()` as well in
>>> `pci_stop_dev()`. Hence, I would like to propose the below change along with the
>>> above patch:
>>>
>>> diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
>>> index 910387e5bdbf..c6394bf562cd 100644
>>> --- a/drivers/pci/remove.c
>>> +++ b/drivers/pci/remove.c
>>> @@ -23,7 +23,8 @@ static void pci_stop_dev(struct pci_dev *dev)
>>>                   device_release_driver(&dev->dev);
>>>                   pci_proc_detach_device(dev);
>>>                   pci_remove_sysfs_dev_files(dev);
>>> -               of_pci_remove_node(dev);
>>> +               if (pci_is_bridge(dev))
>>> +                       of_pci_remove_node(dev);
>>>                   pci_dev_assign_added(dev, false);
>>>           }
>>>
>>> Please let me know of your thoughts on this and based on that I can spin the v3
>>> of this patch.
>> As I mentioned, there are endpoints in pci quirks (pci/quirks.c) will also
>> create nodes by of_pci_make_dev_node(). So please remove above two lines.
> Sorry if I'm misinterpreting something here but as I mentioned,
> `of_pci_make_dev_node()` is called only for bridge devices with check performed
> via `pci_is_bridge()`, could you please elaborate more on why the same check
> can't be put while removing the node via `of_pci_remove_node()`?

For devices added in quirks, of_pci_make_dev_node() will be called 
through pci_fixup_device().


Lizhi

>
> Thanks,
> Amit
>
>> Thanks,
>>
>> Lizhi
>>
>>> In addition to this, can this patch be taken as part of 6.11 as a bug fix?
>>>
>>> Thanks,
>>> Amit
>>>
>>>> diff --git a/drivers/pci/of.c b/drivers/pci/of.c
>>>> index 51e3dd0ea5ab..0b3ba1e1b18c 100644
>>>> --- a/drivers/pci/of.c
>>>> +++ b/drivers/pci/of.c
>>>> @@ -613,7 +613,7 @@ void of_pci_remove_node(struct pci_dev *pdev)
>>>>           struct device_node *np;
>>>>
>>>>           np = pci_device_to_OF_node(pdev);
>>>> -       if (!np || !of_node_check_flag(np, OF_DYNAMIC))
>>>> +       if (!np || !of_node_check_flag(np, OF_CREATED_WITH_CSET))
>>>>                   return;
>>>>           pdev->dev.of_node = NULL;
>>>>
>>>> @@ -672,6 +672,7 @@ void of_pci_make_dev_node(struct pci_dev *pdev)
>>>>           if (ret)
>>>>                   goto out_free_node;
>>>>
>>>> +       of_node_set_flag(np, OF_CREATED_WITH_CSET);
>>>>           np->data = cset;
>>>>           pdev->dev.of_node = np;
>>>>           kfree(name);
>>>> diff --git a/include/linux/of.h b/include/linux/of.h
>>>> index a0bedd038a05..a46317f6626e 100644
>>>> --- a/include/linux/of.h
>>>> +++ b/include/linux/of.h
>>>> @@ -153,6 +153,7 @@ extern struct device_node *of_stdout;
>>>>    #define OF_POPULATED_BUS       4 /* platform bus created for children */
>>>>    #define OF_OVERLAY             5 /* allocated for an overlay */
>>>>    #define OF_OVERLAY_FREE_CSET   6 /* in overlay cset being freed */
>>>> +#define OF_CREATED_WITH_CSET    7 /* created by of_changeset_create_node */
>>>>
>>>>    #define OF_BAD_ADDR    ((u64)-1)
>>>>
>>>>
>>>> Lizhi
>>>>
>>>>> Rob
Amit Machhiwal Aug. 2, 2024, 4:55 p.m. UTC | #23
Hi Stefan,

On 2024/07/29 03:27 PM, Stefan Bader wrote:
> On 26.07.24 13:37, Rob Herring wrote:
> > + Ubuntu kernel list, again
> > 
> > On Thu, Jul 25, 2024 at 11:15:39PM +0530, Amit Machhiwal wrote:
> > > Hi Lizhi, Rob,
> > > 
> > > Sorry for responding late. I got busy with some other things.
> > > 
> > > On 2024/07/23 02:08 PM, Lizhi Hou wrote:
> > > > 
> > > > On 7/23/24 12:54, Rob Herring wrote:
> > > > > On Tue, Jul 23, 2024 at 12:21 PM Lizhi Hou <lizhi.hou@amd.com> wrote:
> > > > > > 
> > > > > > On 7/23/24 09:21, Rob Herring wrote:
> > > > > > > On Mon, Jul 15, 2024 at 01:52:30PM -0700, Lizhi Hou wrote:
> > > > > > > > On 7/15/24 11:55, Rob Herring wrote:
> > > > > > > > > On Mon, Jul 15, 2024 at 2:08 AM Amit Machhiwal <amachhiw@linux.ibm.com> wrote:
> > > > > > > > > > With CONFIG_PCI_DYNAMIC_OF_NODES [1], a hot-plug and hot-unplug sequence
> > > > > > > > > > of a PCI device attached to a PCI-bridge causes following kernel Oops on
> > > > > > > > > > a pseries KVM guest:
> > > > > > > > > > 
> > > > > > > > > >      RTAS: event: 2, Type: Hotplug Event (229), Severity: 1
> > > > > > > > > >      Kernel attempted to read user page (10ec00000048) - exploit attempt? (uid: 0)
> > > > > > > > > >      BUG: Unable to handle kernel data access on read at 0x10ec00000048
> > > > > > > > > >      Faulting instruction address: 0xc0000000012d8728
> > > > > > > > > >      Oops: Kernel access of bad area, sig: 11 [#1]
> > > > > > > > > >      LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> > > > > > > > > > <snip>
> > > > > > > > > >      NIP [c0000000012d8728] __of_changeset_entry_invert+0x10/0x1ac
> > > > > > > > > >      LR [c0000000012da7f0] __of_changeset_revert_entries+0x98/0x180
> > > > > > > > > >      Call Trace:
> > > > > > > > > >      [c00000000bcc3970] [c0000000012daa60] of_changeset_revert+0x58/0xd8
> > > > > > > > > >      [c00000000bcc39c0] [c000000000d0ed78] of_pci_remove_node+0x74/0xb0
> > > > > > > > > >      [c00000000bcc39f0] [c000000000cdcfe0] pci_stop_bus_device+0xf4/0x138
> > > > > > > > > >      [c00000000bcc3a30] [c000000000cdd140] pci_stop_and_remove_bus_device_locked+0x34/0x64
> > > > > > > > > >      [c00000000bcc3a60] [c000000000cf3780] remove_store+0xf0/0x108
> > > > > > > > > >      [c00000000bcc3ab0] [c000000000e89e04] dev_attr_store+0x34/0x78
> > > > > > > > > >      [c00000000bcc3ad0] [c0000000007f8dd4] sysfs_kf_write+0x70/0xa4
> > > > > > > > > >      [c00000000bcc3af0] [c0000000007f7248] kernfs_fop_write_iter+0x1d0/0x2e0
> > > > > > > > > >      [c00000000bcc3b40] [c0000000006c9b08] vfs_write+0x27c/0x558
> > > > > > > > > >      [c00000000bcc3bf0] [c0000000006ca168] ksys_write+0x90/0x170
> > > > > > > > > >      [c00000000bcc3c40] [c000000000033248] system_call_exception+0xf8/0x290
> > > > > > > > > >      [c00000000bcc3e50] [c00000000000d05c] system_call_vectored_common+0x15c/0x2ec
> > > > > > > > > > <snip>
> > > > > > > > > > 
> > > > > > > > > > A git bisect pointed this regression to be introduced via [1] that added
> > > > > > > > > > a mechanism to create device tree nodes for parent PCI bridges when a
> > > > > > > > > > PCI device is hot-plugged.
> > > > > > > > > > 
> > > > > > > > > > The Oops is caused when `pci_stop_dev()` tries to remove a non-existing
> > > > > > > > > > device-tree node associated with the pci_dev that was earlier
> > > > > > > > > > hot-plugged and was attached under a pci-bridge. The PCI dev header
> > > > > > > > > > `dev->hdr_type` being 0, results a conditional check done with
> > > > > > > > > > `pci_is_bridge()` into false. Consequently, a call to
> > > > > > > > > > `of_pci_make_dev_node()` to create a device node is never made. When at
> > > > > > > > > > a later point in time, in the device node removal path, a memcpy is
> > > > > > > > > > attempted in `__of_changeset_entry_invert()`; since the device node was
> > > > > > > > > > never created, results in an Oops due to kernel read access to a bad
> > > > > > > > > > address.
> > > > > > > > > > 
> > > > > > > > > > To fix this issue, the patch updates `of_changeset_create_node()` to
> > > > > > > > > > allocate a new node only when the device node doesn't exist and init it
> > > > > > > > > > in case it does already. Also, introduce `of_pci_free_node()` to be
> > > > > > > > > > called to only revert and destroy the changeset device node that was
> > > > > > > > > > created via a call to `of_changeset_create_node()`.
> > > > > > > > > > 
> > > > > > > > > > [1] commit 407d1a51921e ("PCI: Create device tree node for bridge")
> > > > > > > > > > 
> > > > > > > > > > Fixes: 407d1a51921e ("PCI: Create device tree node for bridge")
> > > > > > > > > > Reported-by: Kowshik Jois B S <kowsjois@linux.ibm.com>
> > > > > > > > > > Signed-off-by: Lizhi Hou <lizhi.hou@amd.com>
> > > > > > > > > > Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
> > > > > > > > > > ---
> > > > > > > > > > Changes since v1:
> > > > > > > > > >         * Included Lizhi's suggested changes on V1
> > > > > > > > > >         * Fixed below two warnings from Lizhi's changes and rearranged the cleanup
> > > > > > > > > >           part a bit in `of_pci_make_dev_node`
> > > > > > > > > >             drivers/pci/of.c:611:6: warning: no previous prototype for ‘of_pci_free_node’ [-Wmissing-prototypes]
> > > > > > > > > >               611 | void of_pci_free_node(struct device_node *np)
> > > > > > > > > >                   |      ^~~~~~~~~~~~~~~~
> > > > > > > > > >             drivers/pci/of.c: In function ‘of_pci_make_dev_node’:
> > > > > > > > > >             drivers/pci/of.c:696:1: warning: label ‘out_destroy_cset’ defined but not used [-Wunused-label]
> > > > > > > > > >               696 | out_destroy_cset:
> > > > > > > > > >                   | ^~~~~~~~~~~~~~~~
> > > > > > > > > >         * V1: https://lore.kernel.org/all/20240703141634.2974589-1-amachhiw@linux.ibm.com/
> > > > > > > > > > 
> > > > > > > > > >      drivers/of/dynamic.c  | 16 ++++++++++++----
> > > > > > > > > >      drivers/of/unittest.c |  2 +-
> > > > > > > > > >      drivers/pci/bus.c     |  3 +--
> > > > > > > > > >      drivers/pci/of.c      | 39 ++++++++++++++++++++++++++-------------
> > > > > > > > > >      drivers/pci/pci.h     |  2 ++
> > > > > > > > > >      include/linux/of.h    |  1 +
> > > > > > > > > >      6 files changed, 43 insertions(+), 20 deletions(-)
> > > > > > > > > > 
> > > > > > > > > > diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
> > > > > > > > > > index dda6092e6d3a..9bba5e82a384 100644
> > > > > > > > > > --- a/drivers/of/dynamic.c
> > > > > > > > > > +++ b/drivers/of/dynamic.c
> > > > > > > > > > @@ -492,21 +492,29 @@ struct device_node *__of_node_dup(const struct device_node *np,
> > > > > > > > > >       * a given changeset.
> > > > > > > > > >       *
> > > > > > > > > >       * @ocs: Pointer to changeset
> > > > > > > > > > + * @np: Pointer to device node. If null, allocate a new node. If not, init an
> > > > > > > > > > + *     existing one.
> > > > > > > > > >       * @parent: Pointer to parent device node
> > > > > > > > > >       * @full_name: Node full name
> > > > > > > > > >       *
> > > > > > > > > >       * Return: Pointer to the created device node or NULL in case of an error.
> > > > > > > > > >       */
> > > > > > > > > >      struct device_node *of_changeset_create_node(struct of_changeset *ocs,
> > > > > > > > > > +                                            struct device_node *np,
> > > > > > > > > >                                                  struct device_node *parent,
> > > > > > > > > >                                                  const char *full_name)
> > > > > > > > > >      {
> > > > > > > > > > -       struct device_node *np;
> > > > > > > > > >             int ret;
> > > > > > > > > > 
> > > > > > > > > > -       np = __of_node_dup(NULL, full_name);
> > > > > > > > > > -       if (!np)
> > > > > > > > > > -               return NULL;
> > > > > > > > > > +       if (!np) {
> > > > > > > > > > +               np = __of_node_dup(NULL, full_name);
> > > > > > > > > > +               if (!np)
> > > > > > > > > > +                       return NULL;
> > > > > > > > > > +       } else {
> > > > > > > > > > +               of_node_set_flag(np, OF_DYNAMIC);
> > > > > > > > > > +               of_node_set_flag(np, OF_DETACHED);
> > > > > > > > > Are we going to rename the function to
> > > > > > > > > of_changeset_create_or_maybe_modify_node()? No. The functions here are
> > > > > > > > > very clear in that they allocate new objects and don't reuse what's
> > > > > > > > > passed in.
> > > > > > > > Ok. How about keeping of_changeset_create_node unchanged.
> > > > > > > > 
> > > > > > > > Instead, call kzalloc(), of_node_init() and of_changeset_attach_node()
> > > > > > > > 
> > > > > > > > in of_pci_make_dev_node() directly.
> > > > > > > > 
> > > > > > > > A similar example is dlpar_parse_cc_node().
> > > > > > > > 
> > > > > > > > 
> > > > > > > > Does this sound better?
> > > > > > > No, because really that code should be re-written using of_changeset
> > > > > > > API.
> > > > > > > 
> > > > > > > My suggestion is add a data pointer to struct of_changeset and then set
> > > > > > > that to something to know the data ptr is a changeset and is your
> > > > > > > changeset.
> > > > > > I do not fully understand the point. I think the issue is that we do not
> > > > > > know if a given of_node is created by of_pci_make_dev_node(), correct?
> > > > > Yes.
> > > > > 
> > > > > > of_node->data can point to anything. And we do not know if it points a
> > > > > > cset or not.
> > > > > Right. But instead of checking "of_node->data == of_pci_free_node",
> > > > > you would just be checking "*(of_node->data) == of_pci_free_node"
> > > > 
> > > > if of_node->data is a char* pointer, it would be panic. So I used
> > > > of_node->data == of_pci_free_node.
> > > > 
> > > > > (omitting a NULL check and cast for simplicity). I suppose in theory
> > > > > that could have a false match, but that could happen in this patch
> > > > > already.
> > > > 
> > > > I think if any other kernel code  put of_pci_free_node to of_node->data, it
> > > > can be fixed over there.
> > > > 
> > > > > 
> > > > > > Do you mean to add a flag (e.g. OF_PCI_DYNAMIC) to
> > > > > > indicate of_node->data points to cset?
> > > > > That would be another option, but OF_PCI_DYNAMIC would not be a good
> > > > > name because that would be a flag bit for every single caller needing
> > > > > similar functionality. Name it just what it indicates: of_node->data
> > > > > points to cset
> > > > > 
> > > > > If we have that flag, then possibly the DT core can handle more
> > > > > clean-up itself like calling detach and freeing the changeset.
> > > > > Ideally, the flags should be internal to the DT code.
> > > > 
> > > > Sure. If you prefer this option, I will propose another fix.
> > > > 
> > > 
> > > The crash in question is a critical issue that we would want to have a fix for
> > > soon. And while this is still being figured out, is it okay to go with the fix I
> > > proposed in the V1 of this patch?
> > 
> > No, sorry but this is not critical. This config option should be off.
> > Fix the distro. They turned it on. If hiding it behind CONFIG_EXPERT or
> > something would work, that would be fine for upstream. But I suspect
> > Ubuntu turns that on because they turn on everything...
> 
> Likely that was the case. Noted and we will turn it off in one of the next
> updates.

We have mirrored the internal bugzilla here:

https://bugs.launchpad.net/ubuntu/+bug/2075721

Thanks,
Amit

> Thanks,
> 
> Stefan
> > 
> > Rob
> > 
> 
> -- 
> - Stefan
>
diff mbox series

Patch

diff --git a/drivers/of/dynamic.c b/drivers/of/dynamic.c
index dda6092e6d3a..9bba5e82a384 100644
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -492,21 +492,29 @@  struct device_node *__of_node_dup(const struct device_node *np,
  * a given changeset.
  *
  * @ocs: Pointer to changeset
+ * @np: Pointer to device node. If null, allocate a new node. If not, init an
+ *	existing one.
  * @parent: Pointer to parent device node
  * @full_name: Node full name
  *
  * Return: Pointer to the created device node or NULL in case of an error.
  */
 struct device_node *of_changeset_create_node(struct of_changeset *ocs,
+					     struct device_node *np,
 					     struct device_node *parent,
 					     const char *full_name)
 {
-	struct device_node *np;
 	int ret;
 
-	np = __of_node_dup(NULL, full_name);
-	if (!np)
-		return NULL;
+	if (!np) {
+		np = __of_node_dup(NULL, full_name);
+		if (!np)
+			return NULL;
+	} else {
+		of_node_set_flag(np, OF_DYNAMIC);
+		of_node_set_flag(np, OF_DETACHED);
+	}
+
 	np->parent = parent;
 
 	ret = of_changeset_attach_node(ocs, np);
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index 445ad13dab98..b1bcc9ed40a6 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -871,7 +871,7 @@  static void __init of_unittest_changeset(void)
 	unittest(!of_changeset_add_property(&chgset, parent, ppadd), "fail add prop prop-add\n");
 	unittest(!of_changeset_update_property(&chgset, parent, ppupdate), "fail update prop\n");
 	unittest(!of_changeset_remove_property(&chgset, parent, ppremove), "fail remove prop\n");
-	n22 = of_changeset_create_node(&chgset, n2, "n22");
+	n22 = of_changeset_create_node(&chgset, NULL,  n2, "n22");
 	unittest(n22, "fail create n22\n");
 	unittest(!of_changeset_add_prop_string(&chgset, n22, "prop-str", "abcd"),
 		 "fail add prop prop-str");
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 826b5016a101..d7ca20cb146a 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -342,8 +342,7 @@  void pci_bus_add_device(struct pci_dev *dev)
 	 */
 	pcibios_bus_add_device(dev);
 	pci_fixup_device(pci_fixup_final, dev);
-	if (pci_is_bridge(dev))
-		of_pci_make_dev_node(dev);
+	of_pci_make_dev_node(dev);
 	pci_create_sysfs_dev_files(dev);
 	pci_proc_attach_device(dev);
 	pci_bridge_d3_update(dev);
diff --git a/drivers/pci/of.c b/drivers/pci/of.c
index 51e3dd0ea5ab..883bf15211a5 100644
--- a/drivers/pci/of.c
+++ b/drivers/pci/of.c
@@ -608,18 +608,28 @@  int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge)
 
 #ifdef CONFIG_PCI_DYNAMIC_OF_NODES
 
+void of_pci_free_node(struct device_node *np)
+{
+	struct of_changeset *cset;
+
+	cset = (struct of_changeset *)(np + 1);
+
+	np->data = NULL;
+	of_changeset_revert(cset);
+	of_changeset_destroy(cset);
+	of_node_put(np);
+}
+
 void of_pci_remove_node(struct pci_dev *pdev)
 {
 	struct device_node *np;
 
 	np = pci_device_to_OF_node(pdev);
-	if (!np || !of_node_check_flag(np, OF_DYNAMIC))
+	if (!np || np->data != of_pci_free_node)
 		return;
 	pdev->dev.of_node = NULL;
 
-	of_changeset_revert(np->data);
-	of_changeset_destroy(np->data);
-	of_node_put(np);
+	of_pci_free_node(np);
 }
 
 void of_pci_make_dev_node(struct pci_dev *pdev)
@@ -655,14 +665,18 @@  void of_pci_make_dev_node(struct pci_dev *pdev)
 	if (!name)
 		return;
 
-	cset = kmalloc(sizeof(*cset), GFP_KERNEL);
-	if (!cset)
+	np = kzalloc(sizeof(*np) + sizeof(*cset), GFP_KERNEL);
+	if (!np)
 		goto out_free_name;
+	np->full_name = name;
+	of_node_init(np);
+
+	cset = (struct of_changeset *)(np + 1);
 	of_changeset_init(cset);
 
-	np = of_changeset_create_node(cset, ppnode, name);
+	np = of_changeset_create_node(cset, np, ppnode, NULL);
 	if (!np)
-		goto out_destroy_cset;
+		goto out_free_node;
 
 	ret = of_pci_add_properties(pdev, cset, np);
 	if (ret)
@@ -670,19 +684,18 @@  void of_pci_make_dev_node(struct pci_dev *pdev)
 
 	ret = of_changeset_apply(cset);
 	if (ret)
-		goto out_free_node;
+		goto out_destroy_cset;
 
-	np->data = cset;
+	np->data = of_pci_free_node;
 	pdev->dev.of_node = np;
-	kfree(name);
 
 	return;
 
-out_free_node:
-	of_node_put(np);
 out_destroy_cset:
 	of_changeset_destroy(cset);
 	kfree(cset);
+out_free_node:
+	of_node_put(np);
 out_free_name:
 	kfree(name);
 }
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index fd44565c4756..7b1a455306b8 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -702,11 +702,13 @@  struct of_changeset;
 
 #ifdef CONFIG_PCI_DYNAMIC_OF_NODES
 void of_pci_make_dev_node(struct pci_dev *pdev);
+void of_pci_free_node(struct device_node *np);
 void of_pci_remove_node(struct pci_dev *pdev);
 int of_pci_add_properties(struct pci_dev *pdev, struct of_changeset *ocs,
 			  struct device_node *np);
 #else
 static inline void of_pci_make_dev_node(struct pci_dev *pdev) { }
+static inline void of_pci_free_node(struct device_node *np) { }
 static inline void of_pci_remove_node(struct pci_dev *pdev) { }
 #endif
 
diff --git a/include/linux/of.h b/include/linux/of.h
index a0bedd038a05..f774459d0d84 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -1631,6 +1631,7 @@  static inline int of_changeset_update_property(struct of_changeset *ocs,
 }
 
 struct device_node *of_changeset_create_node(struct of_changeset *ocs,
+					     struct device_node *np,
 					     struct device_node *parent,
 					     const char *full_name);
 int of_changeset_add_prop_string(struct of_changeset *ocs,