diff mbox series

[RFC,1/2] powerpc/pseries: papr-vpd char driver for VPD retrieval

Message ID 20230822-papr-sys_rtas-vs-lockdown-v1-1-932623cf3c7b@linux.ibm.com (mailing list archive)
State RFC
Headers show
Series powerpc/pseries: new character devices for RTAS functions | expand

Commit Message

Nathan Lynch via B4 Relay Aug. 22, 2023, 9:33 p.m. UTC
From: Nathan Lynch <nathanl@linux.ibm.com>

PowerVM LPARs may retrieve Vital Product Data (VPD) for system
components using the ibm,get-vpd RTAS function.

We can expose this to user space with a /dev/papr-vpd character
device, where the programming model is:

  struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
  int devfd = open("/dev/papr-vpd", O_WRONLY);
  int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
  size_t size = lseek(vpdfd, 0, SEEK_END);
  char *buf = malloc(size);
  pread(devfd, buf, size, 0);

When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
the file contains the result of a complete ibm,get-vpd sequence. The
file contents are immutable from the POV of user space. To get a new
view of VPD, clients must create a new handle.

This design choice insulates user space from most of the complexities
that ibm,get-vpd brings:

* ibm,get-vpd must be called more than once to obtain complete
  results.
* Only one ibm,get-vpd call sequence should be in progress at a time;
  concurrent sequences will disrupt each other. Callers must have a
  protocol for serializing their use of the function.
* A call sequence in progress may receive a "VPD changed, try again"
  status, requiring the client to start over. (The driver does not yet
  handle this, but it should be easy to add.)

The memory required for the VPD buffers seems acceptable, around 20KB
for all VPD on one of my systems. And the value of the
/rtas/ibm,vpd-size DT property (the estimated maximum size of VPD) is
consistently 300KB across various systems I've checked.

I've implemented support for this new ABI in the rtas_get_vpd()
function in librtas, which the vpdupdate command currently uses to
populate its VPD database. I've verified that an unmodified vpdupdate
binary generates an identical database when using a librtas.so that
prefers the new ABI.

Likely remaining work:

* Handle RTAS call status -4 (VPD changed) during ibm,get-vpd call
  sequence.
* Prevent ibm,get-vpd calls via rtas(2) from disrupting ibm,get-vpd
  call sequences in this driver.
* (Maybe) implement a poll method for delivering notifications of
  potential changes to VPD, e.g. after a partition migration.

Questions, points for discussion:

* Am I allocating the ioctl numbers correctly?
* The only way to discover the size of a VPD buffer is
  lseek(SEEK_END). fstat() doesn't work for anonymous fds like
  this. Is this OK, or should the buffer length be discoverable some
  other way?

Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |   2 +
 arch/powerpc/include/uapi/asm/papr-vpd.h           |  29 ++
 arch/powerpc/platforms/pseries/Makefile            |   1 +
 arch/powerpc/platforms/pseries/papr-vpd.c          | 353 +++++++++++++++++++++
 4 files changed, 385 insertions(+)

Comments

Michal Suchanek Aug. 30, 2023, 7:29 a.m. UTC | #1
Hello,

thanks for working on this.

On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
> From: Nathan Lynch <nathanl@linux.ibm.com>
> 
> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
> components using the ibm,get-vpd RTAS function.
> 
> We can expose this to user space with a /dev/papr-vpd character
> device, where the programming model is:
> 
>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
>   int devfd = open("/dev/papr-vpd", O_WRONLY);
>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
>   size_t size = lseek(vpdfd, 0, SEEK_END);
>   char *buf = malloc(size);
>   pread(devfd, buf, size, 0);
> 
> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
> the file contains the result of a complete ibm,get-vpd sequence. The

Could this be somewhat less obfuscated?

What the caller wants is the result of "ibm,get-vpd", which is a
well-known string identifier of the rtas call.

Yet this identifier is never passed in. Instead we have this new
PAPR_VPD_CREATE_HANDLE. This is a completely new identifier, specific to
this call only as is the /dev/papr-vpd device name, another new
identifier.

Maybe the interface could provide a way to specify the service name?

> file contents are immutable from the POV of user space. To get a new
> view of VPD, clients must create a new handle.

Which is basically the same as creating a file descriptor with open().

Maybe creating a directory in sysfs or procfs with filenames
corresponding to rtas services would do the same job without extra
obfuscation?

> This design choice insulates user space from most of the complexities
> that ibm,get-vpd brings:
> 
> * ibm,get-vpd must be called more than once to obtain complete
>   results.
> * Only one ibm,get-vpd call sequence should be in progress at a time;
>   concurrent sequences will disrupt each other. Callers must have a
>   protocol for serializing their use of the function.
> * A call sequence in progress may receive a "VPD changed, try again"
>   status, requiring the client to start over. (The driver does not yet
>   handle this, but it should be easy to add.)

That certainly reduces the complexity of the user interface making it
much saner.

> The memory required for the VPD buffers seems acceptable, around 20KB
> for all VPD on one of my systems. And the value of the
> /rtas/ibm,vpd-size DT property (the estimated maximum size of VPD) is
> consistently 300KB across various systems I've checked.
> 
> I've implemented support for this new ABI in the rtas_get_vpd()
> function in librtas, which the vpdupdate command currently uses to
> populate its VPD database. I've verified that an unmodified vpdupdate
> binary generates an identical database when using a librtas.so that
> prefers the new ABI.
> 
> Likely remaining work:
> 
> * Handle RTAS call status -4 (VPD changed) during ibm,get-vpd call
>   sequence.
> * Prevent ibm,get-vpd calls via rtas(2) from disrupting ibm,get-vpd
>   call sequences in this driver.
> * (Maybe) implement a poll method for delivering notifications of
>   potential changes to VPD, e.g. after a partition migration.

That sounds like something for netlink. If that is desired maybe it
should be used in the first place?

> Questions, points for discussion:
> 
> * Am I allocating the ioctl numbers correctly?
> * The only way to discover the size of a VPD buffer is
>   lseek(SEEK_END). fstat() doesn't work for anonymous fds like
>   this. Is this OK, or should the buffer length be discoverable some
>   other way?

So long as users have /rtas/ibm,vpd-size as the top bound of the data
they can receive I don't think it's critical to know the current VPD
size.

Thanks

Michal
Michael Ellerman Aug. 31, 2023, 5:34 a.m. UTC | #2
Michal Suchánek <msuchanek@suse.de> writes:
> Hello,
>
> thanks for working on this.
>
> On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
>> From: Nathan Lynch <nathanl@linux.ibm.com>
>> 
>> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
>> components using the ibm,get-vpd RTAS function.
>> 
>> We can expose this to user space with a /dev/papr-vpd character
>> device, where the programming model is:
>> 
>>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
>>   int devfd = open("/dev/papr-vpd", O_WRONLY);
>>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
>>   size_t size = lseek(vpdfd, 0, SEEK_END);
>>   char *buf = malloc(size);
>>   pread(devfd, buf, size, 0);
>> 
>> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
>> the file contains the result of a complete ibm,get-vpd sequence. The
>
> Could this be somewhat less obfuscated?
>
> What the caller wants is the result of "ibm,get-vpd", which is a
> well-known string identifier of the rtas call.

Not really. What the caller wants is *the VPD*. Currently that's done
by calling the RTAS "ibm,get-vpd" function, but that could change in
future. There's RTAS calls that have been replaced with a "version 2" in
the past, that could happen here too. Or the RTAS call could be replaced
by a hypercall (though unlikely).

But hopefully if the underlying mechanism changed the kernel would be
able to hide that detail behind this new API, and users would not need
to change at all.

> Yet this identifier is never passed in. Instead we have this new
> PAPR_VPD_CREATE_HANDLE. This is a completely new identifier, specific to
> this call only as is the /dev/papr-vpd device name, another new
> identifier.
>
> Maybe the interface could provide a way to specify the service name?
>
>> file contents are immutable from the POV of user space. To get a new
>> view of VPD, clients must create a new handle.
>
> Which is basically the same as creating a file descriptor with open().

Sort of. But much cleaner becuase you don't need to create a file in the
filesystem and tell userspace how to find it.

This pattern of creating file descriptors from existing file descriptors
to model a hiearachy of objects is well established in eg. the KVM and
DRM APIs.

> Maybe creating a directory in sysfs or procfs with filenames
> corresponding to rtas services would do the same job without extra
> obfuscation?

It's not obfuscation, it's abstraction. The kernel talks to firmware to
do things, and provides an API to user space. Not all the details of how
the firmware works are relevant to user space, including the exact
firmware calls required to implement a certain feature.

>> This design choice insulates user space from most of the complexities
>> that ibm,get-vpd brings:
>> 
>> * ibm,get-vpd must be called more than once to obtain complete
>>   results.
>> * Only one ibm,get-vpd call sequence should be in progress at a time;
>>   concurrent sequences will disrupt each other. Callers must have a
>>   protocol for serializing their use of the function.
>> * A call sequence in progress may receive a "VPD changed, try again"
>>   status, requiring the client to start over. (The driver does not yet
>>   handle this, but it should be easy to add.)
>
> That certainly reduces the complexity of the user interface making it
> much saner.

Yes :)

>> The memory required for the VPD buffers seems acceptable, around 20KB
>> for all VPD on one of my systems. And the value of the
>> /rtas/ibm,vpd-size DT property (the estimated maximum size of VPD) is
>> consistently 300KB across various systems I've checked.
>> 
>> I've implemented support for this new ABI in the rtas_get_vpd()
>> function in librtas, which the vpdupdate command currently uses to
>> populate its VPD database. I've verified that an unmodified vpdupdate
>> binary generates an identical database when using a librtas.so that
>> prefers the new ABI.
>> 
>> Likely remaining work:
>> 
>> * Handle RTAS call status -4 (VPD changed) during ibm,get-vpd call
>>   sequence.
>> * Prevent ibm,get-vpd calls via rtas(2) from disrupting ibm,get-vpd
>>   call sequences in this driver.
>> * (Maybe) implement a poll method for delivering notifications of
>>   potential changes to VPD, e.g. after a partition migration.
>
> That sounds like something for netlink. If that is desired maybe it
> should be used in the first place?

I don't see why that is related to netlink. It's entirely normal for
file descriptor based APIs to implement poll.

netlink adds a lot of complexity for zero gain IMO.

cheers
Michal Suchanek Aug. 31, 2023, 10:38 a.m. UTC | #3
On Thu, Aug 31, 2023 at 03:34:37PM +1000, Michael Ellerman wrote:
> Michal Suchánek <msuchanek@suse.de> writes:
> > Hello,
> >
> > thanks for working on this.
> >
> > On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
> >> From: Nathan Lynch <nathanl@linux.ibm.com>
> >> 
> >> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
> >> components using the ibm,get-vpd RTAS function.
> >> 
> >> We can expose this to user space with a /dev/papr-vpd character
> >> device, where the programming model is:
> >> 
> >>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
> >>   int devfd = open("/dev/papr-vpd", O_WRONLY);
> >>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
> >>   size_t size = lseek(vpdfd, 0, SEEK_END);
> >>   char *buf = malloc(size);
> >>   pread(devfd, buf, size, 0);
> >> 
> >> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
> >> the file contains the result of a complete ibm,get-vpd sequence. The
> >
> > Could this be somewhat less obfuscated?
> >
> > What the caller wants is the result of "ibm,get-vpd", which is a
> > well-known string identifier of the rtas call.
> 
> Not really. What the caller wants is *the VPD*. Currently that's done
> by calling the RTAS "ibm,get-vpd" function, but that could change in
> future. There's RTAS calls that have been replaced with a "version 2" in
> the past, that could happen here too. Or the RTAS call could be replaced
> by a hypercall (though unlikely).
> 
> But hopefully if the underlying mechanism changed the kernel would be
> able to hide that detail behind this new API, and users would not need
> to change at all.
> 
> > Yet this identifier is never passed in. Instead we have this new
> > PAPR_VPD_CREATE_HANDLE. This is a completely new identifier, specific to
> > this call only as is the /dev/papr-vpd device name, another new
> > identifier.
> >
> > Maybe the interface could provide a way to specify the service name?
> >
> >> file contents are immutable from the POV of user space. To get a new
> >> view of VPD, clients must create a new handle.
> >
> > Which is basically the same as creating a file descriptor with open().
> 
> Sort of. But much cleaner becuase you don't need to create a file in the
> filesystem and tell userspace how to find it.

You very much do. There is the /dev/papr-vpd and PAPR_VPD_CREATE_HANDLE
which userspace has to know about, the PAPR_VPD_CREATE_HANDLE is not
even possible to find at all.

Thanks

Michal
Michal Suchanek Aug. 31, 2023, 11:35 a.m. UTC | #4
On Thu, Aug 31, 2023 at 03:34:37PM +1000, Michael Ellerman wrote:
> Michal Suchánek <msuchanek@suse.de> writes:
> > Hello,
> >
> > thanks for working on this.
> >
> > On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
> >> From: Nathan Lynch <nathanl@linux.ibm.com>
> >> 
> >> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
> >> components using the ibm,get-vpd RTAS function.
> >> 
> >> We can expose this to user space with a /dev/papr-vpd character
> >> device, where the programming model is:
> >> 
> >>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
> >>   int devfd = open("/dev/papr-vpd", O_WRONLY);
> >>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
> >>   size_t size = lseek(vpdfd, 0, SEEK_END);
> >>   char *buf = malloc(size);
> >>   pread(devfd, buf, size, 0);
> >> 
> >> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
> >> the file contains the result of a complete ibm,get-vpd sequence. The
> >
> > Could this be somewhat less obfuscated?
> >
> > What the caller wants is the result of "ibm,get-vpd", which is a
> > well-known string identifier of the rtas call.
> 
> Not really. What the caller wants is *the VPD*. Currently that's done
> by calling the RTAS "ibm,get-vpd" function, but that could change in
> future. There's RTAS calls that have been replaced with a "version 2" in
> the past, that could happen here too. Or the RTAS call could be replaced
> by a hypercall (though unlikely).
> 
> But hopefully if the underlying mechanism changed the kernel would be
> able to hide that detail behind this new API, and users would not need
> to change at all.

Still the kernel could use the name that is well-known today even if it
uses different implementation internally in the future.

> 
> > Yet this identifier is never passed in. Instead we have this new
> > PAPR_VPD_CREATE_HANDLE. This is a completely new identifier, specific to
> > this call only as is the /dev/papr-vpd device name, another new
> > identifier.
> >
> > Maybe the interface could provide a way to specify the service name?
> >
> >> file contents are immutable from the POV of user space. To get a new
> >> view of VPD, clients must create a new handle.
> >
> > Which is basically the same as creating a file descriptor with open().
> 
> Sort of. But much cleaner becuase you don't need to create a file in the
> filesystem and tell userspace how to find it.
> 
> This pattern of creating file descriptors from existing file descriptors
> to model a hiearachy of objects is well established in eg. the KVM and
> DRM APIs.

> 
> >> The memory required for the VPD buffers seems acceptable, around 20KB
> >> for all VPD on one of my systems. And the value of the
> >> /rtas/ibm,vpd-size DT property (the estimated maximum size of VPD) is
> >> consistently 300KB across various systems I've checked.
> >> 
> >> I've implemented support for this new ABI in the rtas_get_vpd()
> >> function in librtas, which the vpdupdate command currently uses to
> >> populate its VPD database. I've verified that an unmodified vpdupdate
> >> binary generates an identical database when using a librtas.so that
> >> prefers the new ABI.
> >> 
> >> Likely remaining work:
> >> 
> >> * Handle RTAS call status -4 (VPD changed) during ibm,get-vpd call
> >>   sequence.
> >> * Prevent ibm,get-vpd calls via rtas(2) from disrupting ibm,get-vpd
> >>   call sequences in this driver.
> >> * (Maybe) implement a poll method for delivering notifications of
> >>   potential changes to VPD, e.g. after a partition migration.
> >
> > That sounds like something for netlink. If that is desired maybe it
> > should be used in the first place?
> 
> I don't see why that is related to netlink. It's entirely normal for
> file descriptor based APIs to implement poll.
> 
> netlink adds a lot of complexity for zero gain IMO.

It kind of solves the problem with shoehorning something that's not
really a file into file descriptors. You don't have to when not using
them. It also solves how to access multiple services without creating a
large number of files and large number of obscure constants.

Thanks

Michal
Michael Ellerman Aug. 31, 2023, 11:37 a.m. UTC | #5
Michal Suchánek <msuchanek@suse.de> writes:
> On Thu, Aug 31, 2023 at 03:34:37PM +1000, Michael Ellerman wrote:
>> Michal Suchánek <msuchanek@suse.de> writes:
>> > On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
>> >> From: Nathan Lynch <nathanl@linux.ibm.com>
>> >> 
>> >> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
>> >> components using the ibm,get-vpd RTAS function.
>> >> 
>> >> We can expose this to user space with a /dev/papr-vpd character
>> >> device, where the programming model is:
>> >> 
>> >>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
>> >>   int devfd = open("/dev/papr-vpd", O_WRONLY);
>> >>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
>> >>   size_t size = lseek(vpdfd, 0, SEEK_END);
>> >>   char *buf = malloc(size);
>> >>   pread(devfd, buf, size, 0);
>> >> 
>> >> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
>> >> the file contains the result of a complete ibm,get-vpd sequence. The
>> >
>> > Could this be somewhat less obfuscated?
>> >
>> > What the caller wants is the result of "ibm,get-vpd", which is a
>> > well-known string identifier of the rtas call.
>> 
>> Not really. What the caller wants is *the VPD*. Currently that's done
>> by calling the RTAS "ibm,get-vpd" function, but that could change in
>> future. There's RTAS calls that have been replaced with a "version 2" in
>> the past, that could happen here too. Or the RTAS call could be replaced
>> by a hypercall (though unlikely).
>> 
>> But hopefully if the underlying mechanism changed the kernel would be
>> able to hide that detail behind this new API, and users would not need
>> to change at all.
>> 
>> > Yet this identifier is never passed in. Instead we have this new
>> > PAPR_VPD_CREATE_HANDLE. This is a completely new identifier, specific to
>> > this call only as is the /dev/papr-vpd device name, another new
>> > identifier.
>> >
>> > Maybe the interface could provide a way to specify the service name?
>> >
>> >> file contents are immutable from the POV of user space. To get a new
>> >> view of VPD, clients must create a new handle.
>> >
>> > Which is basically the same as creating a file descriptor with open().
>> 
>> Sort of. But much cleaner becuase you don't need to create a file in the
>> filesystem and tell userspace how to find it.
>
> You very much do. There is the /dev/papr-vpd and PAPR_VPD_CREATE_HANDLE
> which userspace has to know about, the PAPR_VPD_CREATE_HANDLE is not
> even possible to find at all.

Well yeah you need the device itself :)

And yes the ioctl is defined in a header, not in the filesystem, but
that's entirely normal for an ioctl based API.

cheers
Michal Suchanek Aug. 31, 2023, 11:44 a.m. UTC | #6
On Thu, Aug 31, 2023 at 09:37:12PM +1000, Michael Ellerman wrote:
> Michal Suchánek <msuchanek@suse.de> writes:
> > On Thu, Aug 31, 2023 at 03:34:37PM +1000, Michael Ellerman wrote:
> >> Michal Suchánek <msuchanek@suse.de> writes:
> >> > On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
> >> >> From: Nathan Lynch <nathanl@linux.ibm.com>
> >> >> 
> >> >> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
> >> >> components using the ibm,get-vpd RTAS function.
> >> >> 
> >> >> We can expose this to user space with a /dev/papr-vpd character
> >> >> device, where the programming model is:
> >> >> 
> >> >>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
> >> >>   int devfd = open("/dev/papr-vpd", O_WRONLY);
> >> >>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
> >> >>   size_t size = lseek(vpdfd, 0, SEEK_END);
> >> >>   char *buf = malloc(size);
> >> >>   pread(devfd, buf, size, 0);
> >> >> 
> >> >> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
> >> >> the file contains the result of a complete ibm,get-vpd sequence. The
> >> >
> >> > Could this be somewhat less obfuscated?
> >> >
> >> > What the caller wants is the result of "ibm,get-vpd", which is a
> >> > well-known string identifier of the rtas call.
> >> 
> >> Not really. What the caller wants is *the VPD*. Currently that's done
> >> by calling the RTAS "ibm,get-vpd" function, but that could change in
> >> future. There's RTAS calls that have been replaced with a "version 2" in
> >> the past, that could happen here too. Or the RTAS call could be replaced
> >> by a hypercall (though unlikely).
> >> 
> >> But hopefully if the underlying mechanism changed the kernel would be
> >> able to hide that detail behind this new API, and users would not need
> >> to change at all.
> >> 
> >> > Yet this identifier is never passed in. Instead we have this new
> >> > PAPR_VPD_CREATE_HANDLE. This is a completely new identifier, specific to
> >> > this call only as is the /dev/papr-vpd device name, another new
> >> > identifier.
> >> >
> >> > Maybe the interface could provide a way to specify the service name?
> >> >
> >> >> file contents are immutable from the POV of user space. To get a new
> >> >> view of VPD, clients must create a new handle.
> >> >
> >> > Which is basically the same as creating a file descriptor with open().
> >> 
> >> Sort of. But much cleaner becuase you don't need to create a file in the
> >> filesystem and tell userspace how to find it.
> >
> > You very much do. There is the /dev/papr-vpd and PAPR_VPD_CREATE_HANDLE
> > which userspace has to know about, the PAPR_VPD_CREATE_HANDLE is not
> > even possible to find at all.
> 
> Well yeah you need the device itself :)

And as named it's specific to this call, and new devices will be needed
for any additional rtas called implemented.

> 
> And yes the ioctl is defined in a header, not in the filesystem, but
> that's entirely normal for an ioctl based API.

Of course, because the ioctl API has no safe way of passing a string
identifier for the function. Then it needs to create these obscure IDs.

Other APIs that don't have this problem exist.

Thanks

Michal
Nathan Lynch Aug. 31, 2023, 3:52 p.m. UTC | #7
Michal Suchánek <msuchanek@suse.de> writes:
>
> thanks for working on this.

Thanks for reviewing!


> On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
>> From: Nathan Lynch <nathanl@linux.ibm.com>
>> 
>> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
>> components using the ibm,get-vpd RTAS function.
>> 
>> We can expose this to user space with a /dev/papr-vpd character
>> device, where the programming model is:
>> 
>>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
>>   int devfd = open("/dev/papr-vpd", O_WRONLY);
>>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
>>   size_t size = lseek(vpdfd, 0, SEEK_END);
>>   char *buf = malloc(size);
>>   pread(devfd, buf, size, 0);
>> 
>> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
>> the file contains the result of a complete ibm,get-vpd sequence. The
>
> Could this be somewhat less obfuscated?
>
> What the caller wants is the result of "ibm,get-vpd", which is a
> well-known string identifier of the rtas call.
>
> Yet this identifier is never passed in. Instead we have this new
> PAPR_VPD_CREATE_HANDLE. This is a completely new identifier, specific to
> this call only as is the /dev/papr-vpd device name, another new
> identifier.
>
> Maybe the interface could provide a way to specify the service name?
>
>> file contents are immutable from the POV of user space. To get a new
>> view of VPD, clients must create a new handle.
>
> Which is basically the same as creating a file descriptor with open().
>
> Maybe creating a directory in sysfs or procfs with filenames
> corresponding to rtas services would do the same job without extra
> obfuscation?

Michael has already said most of what I would have on these points. I'll
add that my experience with user space software that interacts closely
with RTAS leads me to believe that the kernel can and should expose
these platform features in higher-level ways that address fundamental
needs while encapsulating complexities such as caller serialization and
the mechanism (RTAS vs hcall vs DT). In this case, the fact that the
ibm,get-vpd RTAS function is the PAPR-specified interface for the OS to
retrieve VPD is much less essential to vpdupdate/libvpd than the format
of the VPD returned.


>> * The only way to discover the size of a VPD buffer is
>>   lseek(SEEK_END). fstat() doesn't work for anonymous fds like
>>   this. Is this OK, or should the buffer length be discoverable some
>>   other way?
>
> So long as users have /rtas/ibm,vpd-size as the top bound of the data
> they can receive I don't think it's critical to know the current VPD
> size.

OK, well I want to be transparent that the spec language says it's the
"estimated" max size, so I would hesitate to use it to size buffers
without some fallback for when it's wrong.
Nathan Lynch Aug. 31, 2023, 5:59 p.m. UTC | #8
Michal Suchánek <msuchanek@suse.de> writes:
> On Thu, Aug 31, 2023 at 09:37:12PM +1000, Michael Ellerman wrote:
>> Michal Suchánek <msuchanek@suse.de> writes:
>> > On Thu, Aug 31, 2023 at 03:34:37PM +1000, Michael Ellerman wrote:
>> >> Michal Suchánek <msuchanek@suse.de> writes:
>> >> > On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
>> >> >> From: Nathan Lynch <nathanl@linux.ibm.com>
>> >> >> 
>> >> >> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
>> >> >> components using the ibm,get-vpd RTAS function.
>> >> >> 
>> >> >> We can expose this to user space with a /dev/papr-vpd character
>> >> >> device, where the programming model is:
>> >> >> 
>> >> >>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
>> >> >>   int devfd = open("/dev/papr-vpd", O_WRONLY);
>> >> >>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
>> >> >>   size_t size = lseek(vpdfd, 0, SEEK_END);
>> >> >>   char *buf = malloc(size);
>> >> >>   pread(devfd, buf, size, 0);
>> >> >> 
>> >> >> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
>> >> >> the file contains the result of a complete ibm,get-vpd sequence. The
>> >> >
>> >> > Could this be somewhat less obfuscated?
>> >> >
>> >> > What the caller wants is the result of "ibm,get-vpd", which is a
>> >> > well-known string identifier of the rtas call.
>> >> 
>> >> Not really. What the caller wants is *the VPD*. Currently that's done
>> >> by calling the RTAS "ibm,get-vpd" function, but that could change in
>> >> future. There's RTAS calls that have been replaced with a "version 2" in
>> >> the past, that could happen here too. Or the RTAS call could be replaced
>> >> by a hypercall (though unlikely).
>> >> 
>> >> But hopefully if the underlying mechanism changed the kernel would be
>> >> able to hide that detail behind this new API, and users would not need
>> >> to change at all.
>> >> 
>> >> > Yet this identifier is never passed in. Instead we have this new
>> >> > PAPR_VPD_CREATE_HANDLE. This is a completely new identifier, specific to
>> >> > this call only as is the /dev/papr-vpd device name, another new
>> >> > identifier.
>> >> >
>> >> > Maybe the interface could provide a way to specify the service name?
>> >> >
>> >> >> file contents are immutable from the POV of user space. To get a new
>> >> >> view of VPD, clients must create a new handle.
>> >> >
>> >> > Which is basically the same as creating a file descriptor with open().
>> >> 
>> >> Sort of. But much cleaner becuase you don't need to create a file in the
>> >> filesystem and tell userspace how to find it.
>> >
>> > You very much do. There is the /dev/papr-vpd and PAPR_VPD_CREATE_HANDLE
>> > which userspace has to know about, the PAPR_VPD_CREATE_HANDLE is not
>> > even possible to find at all.
>> 
>> Well yeah you need the device itself :)
>
> And as named it's specific to this call, and new devices will be needed
> for any additional rtas called implemented.
>
>> 
>> And yes the ioctl is defined in a header, not in the filesystem, but
>> that's entirely normal for an ioctl based API.
>
> Of course, because the ioctl API has no safe way of passing a string
> identifier for the function. Then it needs to create these obscure IDs.
>
> Other APIs that don't have this problem exist.

Looking at the cover letter for the series, I wonder if my framing and
word choice is confusing? Instead of "new character devices for RTAS
functions", what I would really like to convey is "new character devices
for platform features that are currently only accessible through the
rtas() syscall". But that's too long :-)

You (Michal) seem to favor a kernel-user ABI where user space is allowed
to invoke arbitrary RTAS functions by name. But we already have that in
the form of the rtas() syscall. (User space looks up function tokens by
name in the DT.) The point of the series is that we need to move away
from that. It's too low-level and user space has to use /dev/mem when
invoking any of the less-simple RTAS functions.
Michal Suchanek Sept. 4, 2023, 7:20 a.m. UTC | #9
Hello,

On Thu, Aug 31, 2023 at 12:59:25PM -0500, Nathan Lynch wrote:
> Michal Suchánek <msuchanek@suse.de> writes:
> > On Thu, Aug 31, 2023 at 09:37:12PM +1000, Michael Ellerman wrote:
> >> Michal Suchánek <msuchanek@suse.de> writes:
> >> > On Thu, Aug 31, 2023 at 03:34:37PM +1000, Michael Ellerman wrote:
> >> >> Michal Suchánek <msuchanek@suse.de> writes:
> >> >> > On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
> >> >> >> From: Nathan Lynch <nathanl@linux.ibm.com>
> >> >> >> 
> >> >> >> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
> >> >> >> components using the ibm,get-vpd RTAS function.
> >> >> >> 
> >> >> >> We can expose this to user space with a /dev/papr-vpd character
> >> >> >> device, where the programming model is:
> >> >> >> 
> >> >> >>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
> >> >> >>   int devfd = open("/dev/papr-vpd", O_WRONLY);
> >> >> >>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
> >> >> >>   size_t size = lseek(vpdfd, 0, SEEK_END);
> >> >> >>   char *buf = malloc(size);
> >> >> >>   pread(devfd, buf, size, 0);
> >> >> >> 
> >> >> >> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
> >> >> >> the file contains the result of a complete ibm,get-vpd sequence. The
> >> >> >
> >> >> > Could this be somewhat less obfuscated?
> >> >> >
> >> >> > What the caller wants is the result of "ibm,get-vpd", which is a
> >> >> > well-known string identifier of the rtas call.
> >> >> 
> >> >> Not really. What the caller wants is *the VPD*. Currently that's done
> >> >> by calling the RTAS "ibm,get-vpd" function, but that could change in
> >> >> future. There's RTAS calls that have been replaced with a "version 2" in
> >> >> the past, that could happen here too. Or the RTAS call could be replaced
> >> >> by a hypercall (though unlikely).
> >> >> 
> >> >> But hopefully if the underlying mechanism changed the kernel would be
> >> >> able to hide that detail behind this new API, and users would not need
> >> >> to change at all.
> >> >> 
> >> >> > Yet this identifier is never passed in. Instead we have this new
> >> >> > PAPR_VPD_CREATE_HANDLE. This is a completely new identifier, specific to
> >> >> > this call only as is the /dev/papr-vpd device name, another new
> >> >> > identifier.
> >> >> >
> >> >> > Maybe the interface could provide a way to specify the service name?
> >> >> >
> >> >> >> file contents are immutable from the POV of user space. To get a new
> >> >> >> view of VPD, clients must create a new handle.
> >> >> >
> >> >> > Which is basically the same as creating a file descriptor with open().
> >> >> 
> >> >> Sort of. But much cleaner becuase you don't need to create a file in the
> >> >> filesystem and tell userspace how to find it.
> >> >
> >> > You very much do. There is the /dev/papr-vpd and PAPR_VPD_CREATE_HANDLE
> >> > which userspace has to know about, the PAPR_VPD_CREATE_HANDLE is not
> >> > even possible to find at all.
> >> 
> >> Well yeah you need the device itself :)
> >
> > And as named it's specific to this call, and new devices will be needed
> > for any additional rtas called implemented.
> >
> >> 
> >> And yes the ioctl is defined in a header, not in the filesystem, but
> >> that's entirely normal for an ioctl based API.
> >
> > Of course, because the ioctl API has no safe way of passing a string
> > identifier for the function. Then it needs to create these obscure IDs.
> >
> > Other APIs that don't have this problem exist.
> 
> Looking at the cover letter for the series, I wonder if my framing and
> word choice is confusing? Instead of "new character devices for RTAS
> functions", what I would really like to convey is "new character devices
> for platform features that are currently only accessible through the
> rtas() syscall". But that's too long :-)

and does that really change anything?

> You (Michal) seem to favor a kernel-user ABI where user space is allowed
> to invoke arbitrary RTAS functions by name. But we already have that in
> the form of the rtas() syscall. (User space looks up function tokens by
> name in the DT.) The point of the series is that we need to move away
> from that. It's too low-level and user space has to use /dev/mem when
> invoking any of the less-simple RTAS functions.

We don't have that, directly accessing /dev/mem does not really work.
And that's what needs fixing in my view.

The rtas calls are all mechanically the same, the function implemented
here should be able to call any of them if there was a way to specify
the call.

Given that there is desire to have access to multiple calls I don't
think it makes sense to allocate a separate device with different name
for each.

The ioclt interface is not nice for this. however.

Given that different calls have different parameters having one ioctl
for all of them would be quite messy.

Still even as is the ioctl takes a string argument which is problematic
on its own.

Given that there is an argument to the call it cannot be reasonably
implemented as sysfs file, either.

That's probably why the crypto API ended up with using netlink. The
situation is very similar there: there are algorithms identified by
strings, there are parameters that vary by the algorithm, the operation
requested, and other parameters.

Unlike ioctl netlink has helpers for packing multiple fields into a
message and getting them out, on both kernel and user side. With that no
string pointers need to be passed around between kernel space and user
space, only one message buffer. Passing the buffer length and field
length is part of the protocol, and cannot be missed. When an argument
is not passed it is clearly not there, in the ioctl case it becomes
garbage.

Adding one call through netlink may require more effort than ioctl.
However, adding additional calls will be easy, especially for simpla
calls that don't take arguments.

Even if ioctls are used adding all rtas ioctls on one device at least
reduces redundancy of allocated identifiers.

Thanks

Michal
Michal Suchanek Sept. 4, 2023, 7:48 a.m. UTC | #10
On Thu, Aug 31, 2023 at 03:34:37PM +1000, Michael Ellerman wrote:
> Michal Suchánek <msuchanek@suse.de> writes:
> > Hello,
> >
> > thanks for working on this.
> >
> > On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
> >> From: Nathan Lynch <nathanl@linux.ibm.com>
> >> 
> >> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
> >> components using the ibm,get-vpd RTAS function.
> >> 
> >> We can expose this to user space with a /dev/papr-vpd character
> >> device, where the programming model is:
> >> 
> >>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
> >>   int devfd = open("/dev/papr-vpd", O_WRONLY);
> >>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
> >>   size_t size = lseek(vpdfd, 0, SEEK_END);
> >>   char *buf = malloc(size);
> >>   pread(devfd, buf, size, 0);
> >> 
> >> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
> >> the file contains the result of a complete ibm,get-vpd sequence. The
> >
> > Could this be somewhat less obfuscated?
> >
> > What the caller wants is the result of "ibm,get-vpd", which is a
> > well-known string identifier of the rtas call.
> 
> Not really. What the caller wants is *the VPD*. Currently that's done
> by calling the RTAS "ibm,get-vpd" function, but that could change in
> future. There's RTAS calls that have been replaced with a "version 2" in
> the past, that could happen here too. Or the RTAS call could be replaced
> by a hypercall (though unlikely).
> 
> But hopefully if the underlying mechanism changed the kernel would be
> able to hide that detail behind this new API, and users would not need
> to change at all.

With the device named rtas-vpd it's clearly tied to rtas.

If 'version 2' of the call happens it's more likely than not going to
have new data format because limit of current format was reached. Then
emulating that old call with the new one would be counterproductive or
impossible.

Even if the same data is available through different call there is no
problem. If the user really used the well-known "ibm,get-vpd" identifier
documented in the specification then the kernel can translate it
internally to whatever new method for obtaining the data exists. The
current revisions of the specification are not going to go away, and the
identifier is still well-known and documented, even if newer revisions
of the platform use different way to provide the data to the kernel.

Sure, with the current syscall interface it would not work because the
user translates this well-known identifier into a function token, and
passes that to the kernel. With that if the "ibm,get-vpd" is gone the
kernel cannot provide the data anymore.

That was done to make it possible to call functions that were not yet
known when the kernel was written. This is no longer allowed, and the
kernel has functionality for translating function names to tokens for
the functions it does know about. Then it can do the translation for
userspace as well, and when the call is implemented differently in the
future abstract that detail away.

> > Yet this identifier is never passed in. Instead we have this new
> > PAPR_VPD_CREATE_HANDLE. This is a completely new identifier, specific to
> > this call only as is the /dev/papr-vpd device name, another new
> > identifier.
> >
> > Maybe the interface could provide a way to specify the service name?
> >
> >> file contents are immutable from the POV of user space. To get a new
> >> view of VPD, clients must create a new handle.
> >
> > Which is basically the same as creating a file descriptor with open().
> 
> Sort of. But much cleaner becuase you don't need to create a file in the
> filesystem and tell userspace how to find it.

Instead, you create a device in the filesystem, and assign an IOCTL, and
need to tell the userspace how to find both.

> 
> This pattern of creating file descriptors from existing file descriptors
> to model a hiearachy of objects is well established in eg. the KVM and
> DRM APIs.

Yet there is no object hierarchy to speak of here. There is one device
with one ioctl on it. The device name is tied to this specific call so a
different call will need both a new device and new IOCTL.

> 
> > Maybe creating a directory in sysfs or procfs with filenames
> > corresponding to rtas services would do the same job without extra
> > obfuscation?
> 
> It's not obfuscation, it's abstraction. The kernel talks to firmware to
> do things, and provides an API to user space. Not all the details of how
> the firmware works are relevant to user space, including the exact
> firmware calls required to implement a certain feature.

Well, it's not static data, it's a call. There might be different ways
to go around passing the arguments in and getting the results out.

Hiding the buffer management and chunking of the resulting data is an
abstraction, great.

Hiding the translation of well-known function name to the
system-specific token is an abstraction, great.

Translating the rtas error codes to well-known error codes specified in
errno.h is an abstraction, great.

Replacing the well-known function name defined in the specification with
Linux-specific device name and IOCLT number is not an abstraction. It's
more abstract than the system-specific function token but it's less
abstract than the well-known name defined by the specification.

Thanks

Michal
Michael Ellerman Sept. 5, 2023, 2:42 a.m. UTC | #11
Michal Suchánek <msuchanek@suse.de> writes:
> On Thu, Aug 31, 2023 at 12:59:25PM -0500, Nathan Lynch wrote:
...
>> You (Michal) seem to favor a kernel-user ABI where user space is allowed
>> to invoke arbitrary RTAS functions by name. But we already have that in
>> the form of the rtas() syscall. (User space looks up function tokens by
>> name in the DT.) The point of the series is that we need to move away
>> from that. It's too low-level and user space has to use /dev/mem when
>> invoking any of the less-simple RTAS functions.
>
> We don't have that, directly accessing /dev/mem does not really work.
> And that's what needs fixing in my view.
>
> The rtas calls are all mechanically the same, the function implemented
> here should be able to call any of them if there was a way to specify
> the call.
>
> Given that there is desire to have access to multiple calls I don't
> think it makes sense to allocate a separate device with different name
> for each.

I think it does make sense.

We explicitly don't want a general "call any RTAS function" API.

We want tightly scoped APIs that do one thing, or a family of related
things, but not anything & everything.

Having different devices for each of those APIs means permissions can be
granted separately on those devices. So a user/group can be given access
to the "papr-vpd" device, but not some other unrelated device that also
happens to expose an RTAS service (eg. error injection).

cheers
Michal Suchanek Sept. 5, 2023, 8:24 a.m. UTC | #12
On Tue, Sep 05, 2023 at 12:42:11PM +1000, Michael Ellerman wrote:
> Michal Suchánek <msuchanek@suse.de> writes:
> > On Thu, Aug 31, 2023 at 12:59:25PM -0500, Nathan Lynch wrote:
> ...
> >> You (Michal) seem to favor a kernel-user ABI where user space is allowed
> >> to invoke arbitrary RTAS functions by name. But we already have that in
> >> the form of the rtas() syscall. (User space looks up function tokens by
> >> name in the DT.) The point of the series is that we need to move away
> >> from that. It's too low-level and user space has to use /dev/mem when
> >> invoking any of the less-simple RTAS functions.
> >
> > We don't have that, directly accessing /dev/mem does not really work.
> > And that's what needs fixing in my view.
> >
> > The rtas calls are all mechanically the same, the function implemented
> > here should be able to call any of them if there was a way to specify
> > the call.
> >
> > Given that there is desire to have access to multiple calls I don't
> > think it makes sense to allocate a separate device with different name
> > for each.
> 
> I think it does make sense.
> 
> We explicitly don't want a general "call any RTAS function" API.
> 
> We want tightly scoped APIs that do one thing, or a family of related
> things, but not anything & everything.
> 
> Having different devices for each of those APIs means permissions can be
> granted separately on those devices. So a user/group can be given access
> to the "papr-vpd" device, but not some other unrelated device that also
> happens to expose an RTAS service (eg. error injection).

Yes, it does work as a kludge for setting permissions for individual
calls.

Thanks

Michal
Michal Suchanek Sept. 6, 2023, 9:19 a.m. UTC | #13
On Tue, Aug 22, 2023 at 04:33:39PM -0500, Nathan Lynch via B4 Relay wrote:
> From: Nathan Lynch <nathanl@linux.ibm.com>
> 
> PowerVM LPARs may retrieve Vital Product Data (VPD) for system
> components using the ibm,get-vpd RTAS function.
> 
> We can expose this to user space with a /dev/papr-vpd character
> device, where the programming model is:
> 
>   struct papr_location_code plc = { .str = "", }; /* obtain all VPD */
>   int devfd = open("/dev/papr-vpd", O_WRONLY);
>   int vpdfd = ioctl(devfd, PAPR_VPD_CREATE_HANDLE, &plc);
>   size_t size = lseek(vpdfd, 0, SEEK_END);
>   char *buf = malloc(size);
>   pread(devfd, buf, size, 0);
> 
> When a file descriptor is obtained from ioctl(PAPR_VPD_CREATE_HANDLE),
> the file contains the result of a complete ibm,get-vpd sequence. The
> file contents are immutable from the POV of user space. To get a new
> view of VPD, clients must create a new handle.
> 
> This design choice insulates user space from most of the complexities
> that ibm,get-vpd brings:
> 
> * ibm,get-vpd must be called more than once to obtain complete
>   results.
> * Only one ibm,get-vpd call sequence should be in progress at a time;
>   concurrent sequences will disrupt each other. Callers must have a
>   protocol for serializing their use of the function.
> * A call sequence in progress may receive a "VPD changed, try again"
>   status, requiring the client to start over. (The driver does not yet
>   handle this, but it should be easy to add.)
> 
> The memory required for the VPD buffers seems acceptable, around 20KB
> for all VPD on one of my systems. And the value of the
> /rtas/ibm,vpd-size DT property (the estimated maximum size of VPD) is
> consistently 300KB across various systems I've checked.
> 
> I've implemented support for this new ABI in the rtas_get_vpd()
> function in librtas, which the vpdupdate command currently uses to
> populate its VPD database. I've verified that an unmodified vpdupdate
> binary generates an identical database when using a librtas.so that
> prefers the new ABI.
> 
> Likely remaining work:
> 
> * Handle RTAS call status -4 (VPD changed) during ibm,get-vpd call
>   sequence.
> * Prevent ibm,get-vpd calls via rtas(2) from disrupting ibm,get-vpd
>   call sequences in this driver.
> * (Maybe) implement a poll method for delivering notifications of
>   potential changes to VPD, e.g. after a partition migration.
> 
> Questions, points for discussion:
> 
> * Am I allocating the ioctl numbers correctly?
> * The only way to discover the size of a VPD buffer is
>   lseek(SEEK_END). fstat() doesn't work for anonymous fds like
>   this. Is this OK, or should the buffer length be discoverable some
>   other way?
> 
> Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
> ---
>  Documentation/userspace-api/ioctl/ioctl-number.rst |   2 +
>  arch/powerpc/include/uapi/asm/papr-vpd.h           |  29 ++
>  arch/powerpc/platforms/pseries/Makefile            |   1 +
>  arch/powerpc/platforms/pseries/papr-vpd.c          | 353 +++++++++++++++++++++
>  4 files changed, 385 insertions(+)
> 
> diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
> index 4ea5b837399a..a950545bf7cd 100644
> --- a/Documentation/userspace-api/ioctl/ioctl-number.rst
> +++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
> @@ -349,6 +349,8 @@ Code  Seq#    Include File                                           Comments
>                                                                       <mailto:vgo@ratio.de>
>  0xB1  00-1F                                                          PPPoX
>                                                                       <mailto:mostrows@styx.uwaterloo.ca>
> +0xB2  00     arch/powerpc/include/uapi/asm/papr-vpd.h                powerpc/pseries VPD API
> +                                                                     <mailto:linuxppc-dev>
>  0xB3  00     linux/mmc/ioctl.h
>  0xB4  00-0F  linux/gpio.h                                            <mailto:linux-gpio@vger.kernel.org>
>  0xB5  00-0F  uapi/linux/rpmsg.h                                      <mailto:linux-remoteproc@vger.kernel.org>
> diff --git a/arch/powerpc/include/uapi/asm/papr-vpd.h b/arch/powerpc/include/uapi/asm/papr-vpd.h
> new file mode 100644
> index 000000000000..aa33217ad5de
> --- /dev/null
> +++ b/arch/powerpc/include/uapi/asm/papr-vpd.h
> @@ -0,0 +1,29 @@
> +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
> +#ifndef _UAPI_PAPR_VPD_H_
> +#define _UAPI_PAPR_VPD_H_
> +
> +#include <linux/types.h>
> +#include <asm/ioctl.h>
> +
> +struct papr_location_code {
> +	/*
> +	 * PAPR+ 12.3.2.4 Converged Location Code Rules - Length
> +	 * Restrictions. 79 characters plus nul.
> +	 */
> +	char str[80];
> +};
> +
> +#define PAPR_VPD_IOCTL_BASE 0xb2
> +
> +#define PAPR_VPD_IO(nr)         _IO(PAPR_VPD_IOCTL_BASE, nr)
> +#define PAPR_VPD_IOR(nr, type)  _IOR(PAPR_VPD_IOCTL_BASE, nr, type)
> +#define PAPR_VPD_IOW(nr, type)  _IOW(PAPR_VPD_IOCTL_BASE, nr, type)
> +#define PAPR_VPD_IOWR(nr, type) _IOWR(PAPR_VPD_IOCTL_BASE, nr, type)
> +
> +/*
> + * ioctl for /dev/papr-vpd. Returns a VPD handle fd corresponding to
> + * the location code.
> + */
> +#define PAPR_VPD_CREATE_HANDLE PAPR_VPD_IOW(0, struct papr_location_code)
> +
> +#endif /* _UAPI_PAPR_VPD_H_ */
> diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
> index 53c3b91af2f7..cbcaa102e2b4 100644
> --- a/arch/powerpc/platforms/pseries/Makefile
> +++ b/arch/powerpc/platforms/pseries/Makefile
> @@ -4,6 +4,7 @@ ccflags-$(CONFIG_PPC_PSERIES_DEBUG)	+= -DDEBUG
>  
>  obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
>  			   of_helpers.o rtas-work-area.o papr-sysparm.o \
> +			   papr-vpd.o \
>  			   setup.o iommu.o event_sources.o ras.o \
>  			   firmware.o power.o dlpar.o mobility.o rng.o \
>  			   pci.o pci_dlpar.o eeh_pseries.o msi.o \
> diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c
> new file mode 100644
> index 000000000000..6fc9ee0c48ae
> --- /dev/null
> +++ b/arch/powerpc/platforms/pseries/papr-vpd.c
> @@ -0,0 +1,353 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +#define pr_fmt(fmt) "papr-vpd: " fmt
> +
> +#include <linux/anon_inodes.h>
> +#include <linux/build_bug.h>
> +#include <linux/file.h>
> +#include <linux/fs.h>
> +#include <linux/init.h>
> +#include <linux/kernel.h>
> +#include <linux/kernel.h>
> +#include <linux/miscdevice.h>
> +#include <linux/slab.h>
> +#include <linux/string.h>

Should be linux/strin_helpers to get string_is_terminated explicitly and not
randomly throuugh another header.

Thanks

Michal

> +#include <asm/machdep.h>
> +#include <asm/papr-vpd.h>
> +#include <asm/rtas-work-area.h>
> +#include <asm/rtas.h>
> +
> +/*
> + * Internal VPD "blob" APIs: for accumulating successive ibm,get-vpd results
> + * into a buffer to be attached to a file descriptor.
> + */
> +struct vpd_blob {
> +	const char *data;
> +	size_t len;
> +};
> +
> +static struct vpd_blob *vpd_blob_new(void)
> +{
> +	return kzalloc(sizeof(struct vpd_blob), GFP_KERNEL_ACCOUNT);
> +}
> +
> +static void vpd_blob_free(struct vpd_blob *blob)
> +{
> +	if (blob) {
> +		kvfree(blob->data);
> +		kfree(blob);
> +	}
> +}
> +
> +static int vpd_blob_accumulate(struct vpd_blob *blob, const char *data, size_t len)
> +{
> +	const size_t new_len = blob->len + len;
> +	const size_t old_len = blob->len;
> +	const char *old_ptr = blob->data;
> +	char *new_ptr;
> +
> +	new_ptr = old_ptr ?
> +		kvrealloc(old_ptr, old_len, new_len, GFP_KERNEL_ACCOUNT) :
> +		kvmalloc(len, GFP_KERNEL_ACCOUNT);
> +
> +	if (!new_ptr)
> +		return -ENOMEM;
> +
> +	memcpy(&new_ptr[old_len], data, len);
> +	blob->data = new_ptr;
> +	blob->len = new_len;
> +	return 0;
> +}
> +
> +/**
> + * struct rtas_ibm_get_vpd_params - Parameters (in and out) for ibm,get-vpd.
> + *
> + * @loc_code: In: Location code buffer. Must be RTAS-addressable.
> + * @work_area: In: Work area buffer for results.
> + * @sequence: In: Sequence number. Out: Next sequence number.
> + * @written: Out: Bytes written by ibm,get-vpd to @work_area.
> + * @status: Out: RTAS call status.
> + */
> +struct rtas_ibm_get_vpd_params {
> +	const struct papr_location_code *loc_code;
> +	struct rtas_work_area *work_area;
> +	u32 sequence;
> +	u32 written;
> +	s32 status;
> +};
> +
> +static int rtas_ibm_get_vpd(struct rtas_ibm_get_vpd_params *params)
> +{
> +	const struct papr_location_code *loc_code = params->loc_code;
> +	struct rtas_work_area *work_area = params->work_area;
> +	u32 rets[2];
> +	s32 fwrc;
> +	int ret;
> +
> +	do {
> +		fwrc = rtas_call(rtas_function_token(RTAS_FN_IBM_GET_VPD), 4, 3,
> +				 rets,
> +				 __pa(loc_code),
> +				 rtas_work_area_phys(work_area),
> +				 rtas_work_area_size(work_area),
> +				 params->sequence);
> +	} while (rtas_busy_delay(fwrc));
> +
> +	switch (fwrc) {
> +	case -1:
> +		ret = -EIO;
> +		break;
> +	case -3:
> +		ret = -EINVAL;
> +		break;
> +	case -4:
> +		ret = -EAGAIN;
> +		break;
> +	case 1:
> +		params->sequence = rets[0];
> +		fallthrough;
> +	case 0:
> +		params->written = rets[1];
> +		/*
> +		 * Kernel or firmware bug, do not continue.
> +		 */
> +		if (WARN(params->written > rtas_work_area_size(work_area),
> +			 "possible write beyond end of work area"))
> +			ret = -EFAULT;
> +		else
> +			ret = 0;
> +		break;
> +	default:
> +		ret = -EIO;
> +		pr_err_ratelimited("unexpected ibm,get-vpd status %d\n", fwrc);
> +		break;
> +	}
> +
> +	params->status = fwrc;
> +	return ret;
> +}
> +
> +struct vpd_sequence_state {
> +	struct mutex *mutex; /* always &vpd_sequence_mutex */
> +	struct pin_cookie cookie;
> +	int error;
> +	struct rtas_ibm_get_vpd_params params;
> +};
> +
> +static void vpd_sequence_begin(struct vpd_sequence_state *state,
> +			       const struct papr_location_code *loc_code)
> +{
> +	static DEFINE_MUTEX(vpd_sequence_mutex);
> +	/*
> +	 * Use a static data structure for the location code passed to
> +	 * RTAS to ensure it's in the RMA and avoid a separate work
> +	 * area allocation.
> +	 */
> +	static struct papr_location_code static_loc_code;
> +
> +	mutex_lock(&vpd_sequence_mutex);
> +
> +	static_loc_code = *loc_code;
> +	*state = (struct vpd_sequence_state) {
> +		.mutex = &vpd_sequence_mutex,
> +		.cookie = lockdep_pin_lock(&vpd_sequence_mutex),
> +		.params = {
> +			.work_area = rtas_work_area_alloc(SZ_4K),
> +			.loc_code = &static_loc_code,
> +			.sequence = 1,
> +		},
> +	};
> +}
> +
> +static bool vpd_sequence_done(const struct vpd_sequence_state *state)
> +{
> +	bool done;
> +
> +	if (state->error)
> +		return true;
> +
> +	switch (state->params.status) {
> +	case 0:
> +		if (state->params.written == 0)
> +			done = false; /* Initial state. */
> +		else
> +			done = true; /* All data consumed. */
> +		break;
> +	case 1:
> +		done = false; /* More data available. */
> +		break;
> +	default:
> +		done = true; /* Error encountered. */
> +		break;
> +	}
> +
> +	return done;
> +}
> +
> +static bool vpd_sequence_advance(struct vpd_sequence_state *state)
> +{
> +	if (vpd_sequence_done(state))
> +		return false;
> +
> +	state->error = rtas_ibm_get_vpd(&state->params);
> +
> +	return state->error == 0;
> +}
> +
> +static size_t vpd_sequence_get_buffer(const struct vpd_sequence_state *state, const char **buf)
> +{
> +	*buf = rtas_work_area_raw_buf(state->params.work_area);
> +	return state->params.written;
> +}
> +
> +static void vpd_sequence_set_err(struct vpd_sequence_state *state, int err)
> +{
> +	state->error = err;
> +}
> +
> +static void vpd_sequence_end(struct vpd_sequence_state *state)
> +{
> +	rtas_work_area_free(state->params.work_area);
> +	lockdep_unpin_lock(state->mutex, state->cookie);
> +	mutex_unlock(state->mutex);
> +}
> +
> +static struct vpd_blob *papr_vpd_retrieve(const struct papr_location_code *loc_code)
> +{
> +	struct vpd_sequence_state state;
> +	struct vpd_blob *blob;
> +
> +	blob = vpd_blob_new();
> +	if (!blob)
> +		return ERR_PTR(-ENOMEM);
> +
> +	vpd_sequence_begin(&state, loc_code);
> +
> +	while (vpd_sequence_advance(&state)) {
> +		const char *buf;
> +		const size_t len = vpd_sequence_get_buffer(&state, &buf);
> +
> +		vpd_sequence_set_err(&state, vpd_blob_accumulate(blob, buf, len));
> +	}
> +
> +	vpd_sequence_end(&state);
> +
> +	if (!state.error)
> +		return blob;
> +
> +	vpd_blob_free(blob);
> +
> +	return ERR_PTR(state.error);
> +}
> +
> +static ssize_t papr_vpd_handle_read(struct file *file, char __user *buf, size_t size, loff_t *off)
> +{
> +	struct vpd_blob *blob = file->private_data;
> +
> +	/* Blobs should always have a valid data pointer and nonzero size. */
> +	if (WARN_ON_ONCE(!blob->data))
> +		return -EIO;
> +	if (WARN_ON_ONCE(blob->len == 0))
> +		return -EIO;
> +	return simple_read_from_buffer(buf, size, off, blob->data, blob->len);
> +}
> +
> +static int papr_vpd_handle_release(struct inode *inode, struct file *file)
> +{
> +	struct vpd_blob *blob = file->private_data;
> +
> +	vpd_blob_free(blob);
> +
> +	return 0;
> +}
> +
> +static loff_t papr_vpd_handle_seek(struct file *file, loff_t off, int whence)
> +{
> +	struct vpd_blob *blob = file->private_data;
> +
> +	return fixed_size_llseek(file, off, whence, blob->len);
> +}
> +
> +
> +static const struct file_operations papr_vpd_handle_ops = {
> +	.read = papr_vpd_handle_read,
> +	.llseek = papr_vpd_handle_seek,
> +	.release = papr_vpd_handle_release,
> +};
> +
> +static long papr_vpd_ioctl_create_handle(struct papr_location_code __user *ulc)
> +{
> +	struct papr_location_code klc;
> +	struct vpd_blob *blob;
> +	struct file *file;
> +	long err;
> +	int fd;
> +
> +	if (copy_from_user(&klc, ulc, sizeof(klc)))
> +		return -EFAULT;
> +
> +	if (!string_is_terminated(klc.str, ARRAY_SIZE(klc.str)))
> +		return -EINVAL;
> +
> +	blob = papr_vpd_retrieve(&klc);
> +	if (IS_ERR(blob))
> +		return PTR_ERR(blob);
> +
> +	fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
> +	if (fd < 0) {
> +		err = fd;
> +		goto free_blob;
> +	}
> +
> +	file = anon_inode_getfile("[papr-vpd]", &papr_vpd_handle_ops,
> +				  blob, O_RDONLY);
> +	if (IS_ERR(file)) {
> +		err = PTR_ERR(file);
> +		goto put_fd;
> +	}
> +
> +	file->f_mode |= FMODE_LSEEK | FMODE_PREAD;
> +	fd_install(fd, file);
> +	return fd;
> +put_fd:
> +	put_unused_fd(fd);
> +free_blob:
> +	vpd_blob_free(blob);
> +	return err;
> +}
> +
> +/* handler for /dev/papr-vpd */
> +static long papr_vpd_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
> +{
> +	void __user *argp = (__force void __user *)arg;
> +	long ret;
> +
> +	switch (ioctl) {
> +	case PAPR_VPD_CREATE_HANDLE:
> +		ret = papr_vpd_ioctl_create_handle(argp);
> +		break;
> +	default:
> +		ret = -ENOIOCTLCMD;
> +		break;
> +	}
> +	return ret;
> +}
> +
> +static const struct file_operations papr_vpd_ops = {
> +	.unlocked_ioctl = papr_vpd_dev_ioctl,
> +};
> +
> +static struct miscdevice papr_vpd_dev = {
> +	.minor = MISC_DYNAMIC_MINOR,
> +	.name = "papr-vpd",
> +	.fops = &papr_vpd_ops,
> +};
> +
> +static __init int papr_vpd_init(void)
> +{
> +	if (!rtas_function_implemented(RTAS_FN_IBM_GET_VPD))
> +		return -ENODEV;
> +
> +	return misc_register(&papr_vpd_dev);
> +}
> +machine_device_initcall(pseries, papr_vpd_init);
> 
> -- 
> 2.41.0
>
diff mbox series

Patch

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index 4ea5b837399a..a950545bf7cd 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -349,6 +349,8 @@  Code  Seq#    Include File                                           Comments
                                                                      <mailto:vgo@ratio.de>
 0xB1  00-1F                                                          PPPoX
                                                                      <mailto:mostrows@styx.uwaterloo.ca>
+0xB2  00     arch/powerpc/include/uapi/asm/papr-vpd.h                powerpc/pseries VPD API
+                                                                     <mailto:linuxppc-dev>
 0xB3  00     linux/mmc/ioctl.h
 0xB4  00-0F  linux/gpio.h                                            <mailto:linux-gpio@vger.kernel.org>
 0xB5  00-0F  uapi/linux/rpmsg.h                                      <mailto:linux-remoteproc@vger.kernel.org>
diff --git a/arch/powerpc/include/uapi/asm/papr-vpd.h b/arch/powerpc/include/uapi/asm/papr-vpd.h
new file mode 100644
index 000000000000..aa33217ad5de
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/papr-vpd.h
@@ -0,0 +1,29 @@ 
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+#ifndef _UAPI_PAPR_VPD_H_
+#define _UAPI_PAPR_VPD_H_
+
+#include <linux/types.h>
+#include <asm/ioctl.h>
+
+struct papr_location_code {
+	/*
+	 * PAPR+ 12.3.2.4 Converged Location Code Rules - Length
+	 * Restrictions. 79 characters plus nul.
+	 */
+	char str[80];
+};
+
+#define PAPR_VPD_IOCTL_BASE 0xb2
+
+#define PAPR_VPD_IO(nr)         _IO(PAPR_VPD_IOCTL_BASE, nr)
+#define PAPR_VPD_IOR(nr, type)  _IOR(PAPR_VPD_IOCTL_BASE, nr, type)
+#define PAPR_VPD_IOW(nr, type)  _IOW(PAPR_VPD_IOCTL_BASE, nr, type)
+#define PAPR_VPD_IOWR(nr, type) _IOWR(PAPR_VPD_IOCTL_BASE, nr, type)
+
+/*
+ * ioctl for /dev/papr-vpd. Returns a VPD handle fd corresponding to
+ * the location code.
+ */
+#define PAPR_VPD_CREATE_HANDLE PAPR_VPD_IOW(0, struct papr_location_code)
+
+#endif /* _UAPI_PAPR_VPD_H_ */
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 53c3b91af2f7..cbcaa102e2b4 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -4,6 +4,7 @@  ccflags-$(CONFIG_PPC_PSERIES_DEBUG)	+= -DDEBUG
 
 obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
 			   of_helpers.o rtas-work-area.o papr-sysparm.o \
+			   papr-vpd.o \
 			   setup.o iommu.o event_sources.o ras.o \
 			   firmware.o power.o dlpar.o mobility.o rng.o \
 			   pci.o pci_dlpar.o eeh_pseries.o msi.o \
diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c
new file mode 100644
index 000000000000..6fc9ee0c48ae
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr-vpd.c
@@ -0,0 +1,353 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "papr-vpd: " fmt
+
+#include <linux/anon_inodes.h>
+#include <linux/build_bug.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kernel.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <asm/machdep.h>
+#include <asm/papr-vpd.h>
+#include <asm/rtas-work-area.h>
+#include <asm/rtas.h>
+
+/*
+ * Internal VPD "blob" APIs: for accumulating successive ibm,get-vpd results
+ * into a buffer to be attached to a file descriptor.
+ */
+struct vpd_blob {
+	const char *data;
+	size_t len;
+};
+
+static struct vpd_blob *vpd_blob_new(void)
+{
+	return kzalloc(sizeof(struct vpd_blob), GFP_KERNEL_ACCOUNT);
+}
+
+static void vpd_blob_free(struct vpd_blob *blob)
+{
+	if (blob) {
+		kvfree(blob->data);
+		kfree(blob);
+	}
+}
+
+static int vpd_blob_accumulate(struct vpd_blob *blob, const char *data, size_t len)
+{
+	const size_t new_len = blob->len + len;
+	const size_t old_len = blob->len;
+	const char *old_ptr = blob->data;
+	char *new_ptr;
+
+	new_ptr = old_ptr ?
+		kvrealloc(old_ptr, old_len, new_len, GFP_KERNEL_ACCOUNT) :
+		kvmalloc(len, GFP_KERNEL_ACCOUNT);
+
+	if (!new_ptr)
+		return -ENOMEM;
+
+	memcpy(&new_ptr[old_len], data, len);
+	blob->data = new_ptr;
+	blob->len = new_len;
+	return 0;
+}
+
+/**
+ * struct rtas_ibm_get_vpd_params - Parameters (in and out) for ibm,get-vpd.
+ *
+ * @loc_code: In: Location code buffer. Must be RTAS-addressable.
+ * @work_area: In: Work area buffer for results.
+ * @sequence: In: Sequence number. Out: Next sequence number.
+ * @written: Out: Bytes written by ibm,get-vpd to @work_area.
+ * @status: Out: RTAS call status.
+ */
+struct rtas_ibm_get_vpd_params {
+	const struct papr_location_code *loc_code;
+	struct rtas_work_area *work_area;
+	u32 sequence;
+	u32 written;
+	s32 status;
+};
+
+static int rtas_ibm_get_vpd(struct rtas_ibm_get_vpd_params *params)
+{
+	const struct papr_location_code *loc_code = params->loc_code;
+	struct rtas_work_area *work_area = params->work_area;
+	u32 rets[2];
+	s32 fwrc;
+	int ret;
+
+	do {
+		fwrc = rtas_call(rtas_function_token(RTAS_FN_IBM_GET_VPD), 4, 3,
+				 rets,
+				 __pa(loc_code),
+				 rtas_work_area_phys(work_area),
+				 rtas_work_area_size(work_area),
+				 params->sequence);
+	} while (rtas_busy_delay(fwrc));
+
+	switch (fwrc) {
+	case -1:
+		ret = -EIO;
+		break;
+	case -3:
+		ret = -EINVAL;
+		break;
+	case -4:
+		ret = -EAGAIN;
+		break;
+	case 1:
+		params->sequence = rets[0];
+		fallthrough;
+	case 0:
+		params->written = rets[1];
+		/*
+		 * Kernel or firmware bug, do not continue.
+		 */
+		if (WARN(params->written > rtas_work_area_size(work_area),
+			 "possible write beyond end of work area"))
+			ret = -EFAULT;
+		else
+			ret = 0;
+		break;
+	default:
+		ret = -EIO;
+		pr_err_ratelimited("unexpected ibm,get-vpd status %d\n", fwrc);
+		break;
+	}
+
+	params->status = fwrc;
+	return ret;
+}
+
+struct vpd_sequence_state {
+	struct mutex *mutex; /* always &vpd_sequence_mutex */
+	struct pin_cookie cookie;
+	int error;
+	struct rtas_ibm_get_vpd_params params;
+};
+
+static void vpd_sequence_begin(struct vpd_sequence_state *state,
+			       const struct papr_location_code *loc_code)
+{
+	static DEFINE_MUTEX(vpd_sequence_mutex);
+	/*
+	 * Use a static data structure for the location code passed to
+	 * RTAS to ensure it's in the RMA and avoid a separate work
+	 * area allocation.
+	 */
+	static struct papr_location_code static_loc_code;
+
+	mutex_lock(&vpd_sequence_mutex);
+
+	static_loc_code = *loc_code;
+	*state = (struct vpd_sequence_state) {
+		.mutex = &vpd_sequence_mutex,
+		.cookie = lockdep_pin_lock(&vpd_sequence_mutex),
+		.params = {
+			.work_area = rtas_work_area_alloc(SZ_4K),
+			.loc_code = &static_loc_code,
+			.sequence = 1,
+		},
+	};
+}
+
+static bool vpd_sequence_done(const struct vpd_sequence_state *state)
+{
+	bool done;
+
+	if (state->error)
+		return true;
+
+	switch (state->params.status) {
+	case 0:
+		if (state->params.written == 0)
+			done = false; /* Initial state. */
+		else
+			done = true; /* All data consumed. */
+		break;
+	case 1:
+		done = false; /* More data available. */
+		break;
+	default:
+		done = true; /* Error encountered. */
+		break;
+	}
+
+	return done;
+}
+
+static bool vpd_sequence_advance(struct vpd_sequence_state *state)
+{
+	if (vpd_sequence_done(state))
+		return false;
+
+	state->error = rtas_ibm_get_vpd(&state->params);
+
+	return state->error == 0;
+}
+
+static size_t vpd_sequence_get_buffer(const struct vpd_sequence_state *state, const char **buf)
+{
+	*buf = rtas_work_area_raw_buf(state->params.work_area);
+	return state->params.written;
+}
+
+static void vpd_sequence_set_err(struct vpd_sequence_state *state, int err)
+{
+	state->error = err;
+}
+
+static void vpd_sequence_end(struct vpd_sequence_state *state)
+{
+	rtas_work_area_free(state->params.work_area);
+	lockdep_unpin_lock(state->mutex, state->cookie);
+	mutex_unlock(state->mutex);
+}
+
+static struct vpd_blob *papr_vpd_retrieve(const struct papr_location_code *loc_code)
+{
+	struct vpd_sequence_state state;
+	struct vpd_blob *blob;
+
+	blob = vpd_blob_new();
+	if (!blob)
+		return ERR_PTR(-ENOMEM);
+
+	vpd_sequence_begin(&state, loc_code);
+
+	while (vpd_sequence_advance(&state)) {
+		const char *buf;
+		const size_t len = vpd_sequence_get_buffer(&state, &buf);
+
+		vpd_sequence_set_err(&state, vpd_blob_accumulate(blob, buf, len));
+	}
+
+	vpd_sequence_end(&state);
+
+	if (!state.error)
+		return blob;
+
+	vpd_blob_free(blob);
+
+	return ERR_PTR(state.error);
+}
+
+static ssize_t papr_vpd_handle_read(struct file *file, char __user *buf, size_t size, loff_t *off)
+{
+	struct vpd_blob *blob = file->private_data;
+
+	/* Blobs should always have a valid data pointer and nonzero size. */
+	if (WARN_ON_ONCE(!blob->data))
+		return -EIO;
+	if (WARN_ON_ONCE(blob->len == 0))
+		return -EIO;
+	return simple_read_from_buffer(buf, size, off, blob->data, blob->len);
+}
+
+static int papr_vpd_handle_release(struct inode *inode, struct file *file)
+{
+	struct vpd_blob *blob = file->private_data;
+
+	vpd_blob_free(blob);
+
+	return 0;
+}
+
+static loff_t papr_vpd_handle_seek(struct file *file, loff_t off, int whence)
+{
+	struct vpd_blob *blob = file->private_data;
+
+	return fixed_size_llseek(file, off, whence, blob->len);
+}
+
+
+static const struct file_operations papr_vpd_handle_ops = {
+	.read = papr_vpd_handle_read,
+	.llseek = papr_vpd_handle_seek,
+	.release = papr_vpd_handle_release,
+};
+
+static long papr_vpd_ioctl_create_handle(struct papr_location_code __user *ulc)
+{
+	struct papr_location_code klc;
+	struct vpd_blob *blob;
+	struct file *file;
+	long err;
+	int fd;
+
+	if (copy_from_user(&klc, ulc, sizeof(klc)))
+		return -EFAULT;
+
+	if (!string_is_terminated(klc.str, ARRAY_SIZE(klc.str)))
+		return -EINVAL;
+
+	blob = papr_vpd_retrieve(&klc);
+	if (IS_ERR(blob))
+		return PTR_ERR(blob);
+
+	fd = get_unused_fd_flags(O_RDONLY | O_CLOEXEC);
+	if (fd < 0) {
+		err = fd;
+		goto free_blob;
+	}
+
+	file = anon_inode_getfile("[papr-vpd]", &papr_vpd_handle_ops,
+				  blob, O_RDONLY);
+	if (IS_ERR(file)) {
+		err = PTR_ERR(file);
+		goto put_fd;
+	}
+
+	file->f_mode |= FMODE_LSEEK | FMODE_PREAD;
+	fd_install(fd, file);
+	return fd;
+put_fd:
+	put_unused_fd(fd);
+free_blob:
+	vpd_blob_free(blob);
+	return err;
+}
+
+/* handler for /dev/papr-vpd */
+static long papr_vpd_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
+{
+	void __user *argp = (__force void __user *)arg;
+	long ret;
+
+	switch (ioctl) {
+	case PAPR_VPD_CREATE_HANDLE:
+		ret = papr_vpd_ioctl_create_handle(argp);
+		break;
+	default:
+		ret = -ENOIOCTLCMD;
+		break;
+	}
+	return ret;
+}
+
+static const struct file_operations papr_vpd_ops = {
+	.unlocked_ioctl = papr_vpd_dev_ioctl,
+};
+
+static struct miscdevice papr_vpd_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "papr-vpd",
+	.fops = &papr_vpd_ops,
+};
+
+static __init int papr_vpd_init(void)
+{
+	if (!rtas_function_implemented(RTAS_FN_IBM_GET_VPD))
+		return -ENODEV;
+
+	return misc_register(&papr_vpd_dev);
+}
+machine_device_initcall(pseries, papr_vpd_init);