diff mbox series

[net-next,v2,1/2] fs/crashdd: add API to collect hardware dump in second kernel

Message ID 296ffbd47fd4f30238689e636bd2480683224227.1521888444.git.rahul.lakkireddy@chelsio.com
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series kernel: add support to collect hardware logs in crash recovery kernel | expand

Commit Message

Rahul Lakkireddy March 24, 2018, 10:56 a.m. UTC
Add a new module crashdd that exports the /sys/kernel/crashdd/
directory in second kernel, containing collected hardware/firmware
dumps.

The sequence of actions done by device drivers to append their device
specific hardware/firmware logs to /sys/kernel/crashdd/ directory are
as follows:

1. During probe (before hardware is initialized), device drivers
register to the crashdd module (via crashdd_add_dump()), with
callback function, along with buffer size and log name needed for
firmware/hardware log collection.

2. Crashdd creates a driver's directory under
/sys/kernel/crashdd/<driver>. Then, it allocates the buffer with
requested size and invokes the device driver's registered callback
function.

3. Device driver collects all hardware/firmware logs into the buffer
and returns control back to crashdd.

4. Crashdd exposes the buffer as a binary file via
/sys/kernel/crashdd/<driver>/<dump_file>.

Suggested-by: Eric Biederman <ebiederm@xmission.com>.
Suggested-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
v2:
- Added ABI Documentation for crashdd.
- Directly use octal permission instead of macro.

Changes since rfc v2:
- Moved exporting crashdd from procfs to sysfs.  Suggested by
  Stephen Hemminger <stephen@networkplumber.org>
- Moved code from fs/proc/crashdd.c to fs/crashdd/ directory.
- Replaced all proc API with sysfs API and updated comments.
- Calling driver callback before creating the binary file under
  crashdd sysfs.
- Changed binary dump file permission from S_IRUSR to S_IRUGO.
- Changed module name from CRASH_DRIVER_DUMP to CRASH_DEVICE_DUMP.

rfc v2:
- Collecting logs in 2nd kernel instead of during kernel panic.
  Suggested by Eric Biederman <ebiederm@xmission.com>.
- Patch added in this series.

 Documentation/ABI/testing/sysfs-kernel-crashdd |  34 ++++
 fs/Kconfig                                     |   1 +
 fs/Makefile                                    |   1 +
 fs/crashdd/Kconfig                             |  10 ++
 fs/crashdd/Makefile                            |   3 +
 fs/crashdd/crashdd.c                           | 233 +++++++++++++++++++++++++
 fs/crashdd/crashdd_internal.h                  |  24 +++
 include/linux/crashdd.h                        |  24 +++
 8 files changed, 330 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-crashdd
 create mode 100644 fs/crashdd/Kconfig
 create mode 100644 fs/crashdd/Makefile
 create mode 100644 fs/crashdd/crashdd.c
 create mode 100644 fs/crashdd/crashdd_internal.h
 create mode 100644 include/linux/crashdd.h

Comments

kernel test robot March 25, 2018, 12:43 p.m. UTC | #1
Hi Rahul,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Rahul-Lakkireddy/fs-crashdd-add-API-to-collect-hardware-dump-in-second-kernel/20180325-191308
config: i386-randconfig-s0-03251817 (attached as .config)
compiler: gcc-6 (Debian 6.4.0-9) 6.4.0 20171026
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   In file included from fs//crashdd/crashdd.c:8:0:
   fs//crashdd/crashdd_internal.h:13:23: error: field 'bin_attr' has incomplete type
     struct bin_attribute bin_attr; /* Binary dump file's attributes */
                          ^~~~~~~~
   fs//crashdd/crashdd.c: In function 'crashdd_read':
   fs//crashdd/crashdd.c:19:43: error: dereferencing pointer to incomplete type 'struct bin_attribute'
     struct crashdd_dump_node *dump = bin_attr->private;
                                              ^~
   fs//crashdd/crashdd.c: In function 'crashdd_mkdir':
   fs//crashdd/crashdd.c:27:9: error: implicit declaration of function 'kobject_create_and_add' [-Werror=implicit-function-declaration]
     return kobject_create_and_add(name, crashdd_kobj);
            ^~~~~~~~~~~~~~~~~~~~~~
   fs//crashdd/crashdd.c:27:9: warning: return makes pointer from integer without a cast [-Wint-conversion]
     return kobject_create_and_add(name, crashdd_kobj);
            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   fs//crashdd/crashdd.c: In function 'crashdd_add_file':
   fs//crashdd/crashdd.c:39:9: error: implicit declaration of function 'sysfs_create_bin_file' [-Werror=implicit-function-declaration]
     return sysfs_create_bin_file(kobj, &dump->bin_attr);
            ^~~~~~~~~~~~~~~~~~~~~
   fs//crashdd/crashdd.c: In function 'crashdd_rmdir':
   fs//crashdd/crashdd.c:44:2: error: implicit declaration of function 'kobject_put' [-Werror=implicit-function-declaration]
     kobject_put(kobj);
     ^~~~~~~~~~~
   In file included from include/linux/kernel.h:10:0,
                    from include/linux/list.h:9,
                    from include/linux/preempt.h:11,
                    from include/linux/spinlock.h:51,
                    from include/linux/vmalloc.h:5,
                    from fs//crashdd/crashdd.c:4:
   fs//crashdd/crashdd.c: In function 'crashdd_get_driver':
   fs//crashdd/crashdd.c:101:25: error: dereferencing pointer to incomplete type 'struct kobject'
      if (!strcmp(node->kobj->name, name)) {
                            ^
   include/linux/compiler.h:58:30: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                 ^~~~
>> fs//crashdd/crashdd.c:101:3: note: in expansion of macro 'if'
      if (!strcmp(node->kobj->name, name)) {
      ^~
   fs//crashdd/crashdd.c: In function 'crashdd_init':
   fs//crashdd/crashdd.c:227:51: error: 'kernel_kobj' undeclared (first use in this function)
     crashdd_kobj = kobject_create_and_add("crashdd", kernel_kobj);
                                                      ^~~~~~~~~~~
   fs//crashdd/crashdd.c:227:51: note: each undeclared identifier is reported only once for each function it appears in
   fs//crashdd/crashdd.c: In function 'crashdd_add_file':
   fs//crashdd/crashdd.c:40:1: warning: control reaches end of non-void function [-Wreturn-type]
    }
    ^
   cc1: some warnings being treated as errors

vim +/if +101 fs//crashdd/crashdd.c

     3	
   > 4	#include <linux/vmalloc.h>
     5	#include <linux/crash_dump.h>
     6	#include <linux/crashdd.h>
     7	
     8	#include "crashdd_internal.h"
     9	
    10	static LIST_HEAD(crashdd_list);
    11	static DEFINE_MUTEX(crashdd_mutex);
    12	
    13	static struct kobject *crashdd_kobj;
    14	
    15	static ssize_t crashdd_read(struct file *filp, struct kobject *kobj,
    16				    struct bin_attribute *bin_attr,
    17				    char *buf, loff_t fpos, size_t count)
    18	{
    19		struct crashdd_dump_node *dump = bin_attr->private;
    20	
    21		memcpy(buf, dump->buf + fpos, count);
    22		return count;
    23	}
    24	
    25	static struct kobject *crashdd_mkdir(const char *name)
    26	{
    27		return kobject_create_and_add(name, crashdd_kobj);
    28	}
    29	
    30	static int crashdd_add_file(struct kobject *kobj, const char *name,
    31				    struct crashdd_dump_node *dump)
    32	{
    33		dump->bin_attr.attr.name = name;
    34		dump->bin_attr.attr.mode = 0444;
    35		dump->bin_attr.size = dump->size;
    36		dump->bin_attr.read = crashdd_read;
    37		dump->bin_attr.private = dump;
    38	
    39		return sysfs_create_bin_file(kobj, &dump->bin_attr);
    40	}
    41	
    42	static void crashdd_rmdir(struct kobject *kobj)
    43	{
    44		kobject_put(kobj);
    45	}
    46	
    47	/**
    48	 * crashdd_init_driver - create a sysfs driver entry.
    49	 * @name: Name of the directory.
    50	 *
    51	 * Creates a directory under /sys/kernel/crashdd/ with @name.  Allocates
    52	 * and saves the sysfs entry.  The sysfs entry is added to the global
    53	 * list and then returned to the caller. On failure, returns NULL.
    54	 */
    55	static struct crashdd_driver_node *crashdd_init_driver(const char *name)
    56	{
    57		struct crashdd_driver_node *node;
    58	
    59		node = vzalloc(sizeof(*node));
    60		if (!node)
    61			return NULL;
    62	
    63		/* Create a driver's directory under /sys/kernel/crashdd/ */
    64		node->kobj = crashdd_mkdir(name);
    65		if (!node->kobj) {
    66			vfree(node);
    67			return NULL;
    68		}
    69	
    70		atomic_set(&node->refcnt, 1);
    71	
    72		/* Initialize the list of dumps that go under this driver's
    73		 * directory.
    74		 */
    75		INIT_LIST_HEAD(&node->dump_list);
    76	
    77		/* Add the driver's entry to global list */
    78		mutex_lock(&crashdd_mutex);
    79		list_add_tail(&node->list, &crashdd_list);
    80		mutex_unlock(&crashdd_mutex);
    81	
    82		return node;
    83	}
    84	
    85	/**
    86	 * crashdd_get_driver - get an exisiting sysfs driver entry.
    87	 * @name: Name of the directory.
    88	 *
    89	 * Searches and fetches a sysfs entry having @name.  If @name is
    90	 * found, then the reference count is incremented and the entry
    91	 * is returned.  If @name is not found, NULL is returned.
    92	 */
    93	static struct crashdd_driver_node *crashdd_get_driver(const char *name)
    94	{
    95		struct crashdd_driver_node *node;
    96		int found = 0;
    97	
    98		/* Search for an existing driver sysfs entry having @name */
    99		mutex_lock(&crashdd_mutex);
   100		list_for_each_entry(node, &crashdd_list, list) {
 > 101			if (!strcmp(node->kobj->name, name)) {
   102				atomic_inc(&node->refcnt);
   103				found = 1;
   104				break;
   105			}
   106		}
   107		mutex_unlock(&crashdd_mutex);
   108	
   109		if (found)
   110			return node;
   111	
   112		/* No driver with @name found */
   113		return NULL;
   114	}
   115	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Jiri Pirko March 30, 2018, 10:39 a.m. UTC | #2
Sat, Mar 24, 2018 at 11:56:33AM CET, rahul.lakkireddy@chelsio.com wrote:
>Add a new module crashdd that exports the /sys/kernel/crashdd/
>directory in second kernel, containing collected hardware/firmware
>dumps.
>
>The sequence of actions done by device drivers to append their device
>specific hardware/firmware logs to /sys/kernel/crashdd/ directory are
>as follows:
>
>1. During probe (before hardware is initialized), device drivers
>register to the crashdd module (via crashdd_add_dump()), with
>callback function, along with buffer size and log name needed for
>firmware/hardware log collection.
>
>2. Crashdd creates a driver's directory under
>/sys/kernel/crashdd/<driver>. Then, it allocates the buffer with

This smells. I need to identify the exact ASIC instance that produced
the dump. To identify by driver name does not help me if I have multiple
instances of the same driver. This looks wrong to me. This looks like
a job for devlink where you have 1 devlink instance per 1 ASIC instance.

Please see:
http://patchwork.ozlabs.org/project/netdev/list/?series=36524

I bevieve that the solution in the patchset could be used for
your usecase too.


>requested size and invokes the device driver's registered callback
>function.
>
>3. Device driver collects all hardware/firmware logs into the buffer
>and returns control back to crashdd.
>
>4. Crashdd exposes the buffer as a binary file via
>/sys/kernel/crashdd/<driver>/<dump_file>.
>
Rahul Lakkireddy March 30, 2018, 10:51 a.m. UTC | #3
On Friday, March 03/30/18, 2018 at 16:09:07 +0530, Jiri Pirko wrote:
> Sat, Mar 24, 2018 at 11:56:33AM CET, rahul.lakkireddy@chelsio.com wrote:
> >Add a new module crashdd that exports the /sys/kernel/crashdd/
> >directory in second kernel, containing collected hardware/firmware
> >dumps.
> >
> >The sequence of actions done by device drivers to append their device
> >specific hardware/firmware logs to /sys/kernel/crashdd/ directory are
> >as follows:
> >
> >1. During probe (before hardware is initialized), device drivers
> >register to the crashdd module (via crashdd_add_dump()), with
> >callback function, along with buffer size and log name needed for
> >firmware/hardware log collection.
> >
> >2. Crashdd creates a driver's directory under
> >/sys/kernel/crashdd/<driver>. Then, it allocates the buffer with
> 
> This smells. I need to identify the exact ASIC instance that produced
> the dump. To identify by driver name does not help me if I have multiple
> instances of the same driver. This looks wrong to me. This looks like
> a job for devlink where you have 1 devlink instance per 1 ASIC instance.
> 
> Please see:
> http://patchwork.ozlabs.org/project/netdev/list/?series=36524
> 
> I bevieve that the solution in the patchset could be used for
> your usecase too.
> 
> 

The sysfs approach proposed here had been dropped in favour exporting
the dumps as ELF notes in /proc/vmcore.

Will be posting the new patches soon.

> >requested size and invokes the device driver's registered callback
> >function.
> >
> >3. Device driver collects all hardware/firmware logs into the buffer
> >and returns control back to crashdd.
> >
> >4. Crashdd exposes the buffer as a binary file via
> >/sys/kernel/crashdd/<driver>/<dump_file>.
> >
Andrew Lunn March 30, 2018, 3:11 p.m. UTC | #4
> Please see:
> http://patchwork.ozlabs.org/project/netdev/list/?series=36524
> 
> I bevieve that the solution in the patchset could be used for
> your usecase too.

Hi Jiri

https://lkml.org/lkml/2018/3/20/436

How well does this API work for a 2Gbyte snapshot?

    Andrew
Eric W. Biederman March 30, 2018, 6:42 p.m. UTC | #5
Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> writes:

> On Friday, March 03/30/18, 2018 at 16:09:07 +0530, Jiri Pirko wrote:
>> Sat, Mar 24, 2018 at 11:56:33AM CET, rahul.lakkireddy@chelsio.com wrote:
>> >Add a new module crashdd that exports the /sys/kernel/crashdd/
>> >directory in second kernel, containing collected hardware/firmware
>> >dumps.
>> >
>> >The sequence of actions done by device drivers to append their device
>> >specific hardware/firmware logs to /sys/kernel/crashdd/ directory are
>> >as follows:
>> >
>> >1. During probe (before hardware is initialized), device drivers
>> >register to the crashdd module (via crashdd_add_dump()), with
>> >callback function, along with buffer size and log name needed for
>> >firmware/hardware log collection.
>> >
>> >2. Crashdd creates a driver's directory under
>> >/sys/kernel/crashdd/<driver>. Then, it allocates the buffer with
>> 
>> This smells. I need to identify the exact ASIC instance that produced
>> the dump. To identify by driver name does not help me if I have multiple
>> instances of the same driver. This looks wrong to me. This looks like
>> a job for devlink where you have 1 devlink instance per 1 ASIC instance.
>> 
>> Please see:
>> http://patchwork.ozlabs.org/project/netdev/list/?series=36524
>> 
>> I bevieve that the solution in the patchset could be used for
>> your usecase too.
>> 
>> 
>
> The sysfs approach proposed here had been dropped in favour exporting
> the dumps as ELF notes in /proc/vmcore.
>
> Will be posting the new patches soon.

The concern was actually how you identify which device that came from.
Where you read the identifier changes but sysfs or /proc/vmcore the
change remains valid.

Eric
Jiri Pirko April 2, 2018, 9:11 a.m. UTC | #6
Fri, Mar 30, 2018 at 08:42:00PM CEST, ebiederm@xmission.com wrote:
>Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> writes:
>
>> On Friday, March 03/30/18, 2018 at 16:09:07 +0530, Jiri Pirko wrote:
>>> Sat, Mar 24, 2018 at 11:56:33AM CET, rahul.lakkireddy@chelsio.com wrote:
>>> >Add a new module crashdd that exports the /sys/kernel/crashdd/
>>> >directory in second kernel, containing collected hardware/firmware
>>> >dumps.
>>> >
>>> >The sequence of actions done by device drivers to append their device
>>> >specific hardware/firmware logs to /sys/kernel/crashdd/ directory are
>>> >as follows:
>>> >
>>> >1. During probe (before hardware is initialized), device drivers
>>> >register to the crashdd module (via crashdd_add_dump()), with
>>> >callback function, along with buffer size and log name needed for
>>> >firmware/hardware log collection.
>>> >
>>> >2. Crashdd creates a driver's directory under
>>> >/sys/kernel/crashdd/<driver>. Then, it allocates the buffer with
>>> 
>>> This smells. I need to identify the exact ASIC instance that produced
>>> the dump. To identify by driver name does not help me if I have multiple
>>> instances of the same driver. This looks wrong to me. This looks like
>>> a job for devlink where you have 1 devlink instance per 1 ASIC instance.
>>> 
>>> Please see:
>>> http://patchwork.ozlabs.org/project/netdev/list/?series=36524
>>> 
>>> I bevieve that the solution in the patchset could be used for
>>> your usecase too.
>>> 
>>> 
>>
>> The sysfs approach proposed here had been dropped in favour exporting
>> the dumps as ELF notes in /proc/vmcore.
>>
>> Will be posting the new patches soon.
>
>The concern was actually how you identify which device that came from.
>Where you read the identifier changes but sysfs or /proc/vmcore the
>change remains valid.

Yeah. I still don't see how you link the dump and the device. Rahul, did
you look at the patchset I pointed out?
Thanks!
Jiri Pirko April 2, 2018, 9:12 a.m. UTC | #7
Fri, Mar 30, 2018 at 05:11:29PM CEST, andrew@lunn.ch wrote:
>> Please see:
>> http://patchwork.ozlabs.org/project/netdev/list/?series=36524
>> 
>> I bevieve that the solution in the patchset could be used for
>> your usecase too.
>
>Hi Jiri
>
>https://lkml.org/lkml/2018/3/20/436
>
>How well does this API work for a 2Gbyte snapshot?

Ccing Alex who did the tests.

>
>    Andrew
Andrew Lunn April 2, 2018, 12:21 p.m. UTC | #8
> >> The sysfs approach proposed here had been dropped in favour exporting
> >> the dumps as ELF notes in /proc/vmcore.
> >>
> >> Will be posting the new patches soon.
> >
> >The concern was actually how you identify which device that came from.
> >Where you read the identifier changes but sysfs or /proc/vmcore the
> >change remains valid.
> 
> Yeah. I still don't see how you link the dump and the device.

Hi Jiri

You can see in the third version the core code accept a free form
name. The driver builds a name using the driver name and the adaptor
name.

What i think would be good is to try to have one API to the driver
that can be used for both crash dumps and devlink snapshots. These are
used at different times, but have basically the same purpose, get
state from the device.

      Andrew
Rahul Lakkireddy April 2, 2018, 12:30 p.m. UTC | #9
On Monday, April 04/02/18, 2018 at 14:41:43 +0530, Jiri Pirko wrote:
> Fri, Mar 30, 2018 at 08:42:00PM CEST, ebiederm@xmission.com wrote:
> >Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> writes:
> >
> >> On Friday, March 03/30/18, 2018 at 16:09:07 +0530, Jiri Pirko wrote:
> >>> Sat, Mar 24, 2018 at 11:56:33AM CET, rahul.lakkireddy@chelsio.com wrote:
> >>> >Add a new module crashdd that exports the /sys/kernel/crashdd/
> >>> >directory in second kernel, containing collected hardware/firmware
> >>> >dumps.
> >>> >
> >>> >The sequence of actions done by device drivers to append their device
> >>> >specific hardware/firmware logs to /sys/kernel/crashdd/ directory are
> >>> >as follows:
> >>> >
> >>> >1. During probe (before hardware is initialized), device drivers
> >>> >register to the crashdd module (via crashdd_add_dump()), with
> >>> >callback function, along with buffer size and log name needed for
> >>> >firmware/hardware log collection.
> >>> >
> >>> >2. Crashdd creates a driver's directory under
> >>> >/sys/kernel/crashdd/<driver>. Then, it allocates the buffer with
> >>> 
> >>> This smells. I need to identify the exact ASIC instance that produced
> >>> the dump. To identify by driver name does not help me if I have multiple
> >>> instances of the same driver. This looks wrong to me. This looks like
> >>> a job for devlink where you have 1 devlink instance per 1 ASIC instance.
> >>> 
> >>> Please see:
> >>> http://patchwork.ozlabs.org/project/netdev/list/?series=36524
> >>> 
> >>> I bevieve that the solution in the patchset could be used for
> >>> your usecase too.
> >>> 
> >>> 
> >>
> >> The sysfs approach proposed here had been dropped in favour exporting
> >> the dumps as ELF notes in /proc/vmcore.
> >>
> >> Will be posting the new patches soon.
> >
> >The concern was actually how you identify which device that came from.
> >Where you read the identifier changes but sysfs or /proc/vmcore the
> >change remains valid.
> 
> Yeah. I still don't see how you link the dump and the device.

In our case, the dump and the device are being identified by the
driver’s name followed by its corresponding pci bus id.  I’ve posted an
example in my v3 series:

https://www.spinics.net/lists/netdev/msg493781.html

Here’s an extract from the link above:

# readelf -n /proc/vmcore

Displaying notes found at file offset 0x00001000 with length 0x04003288:
Owner                 Data size     Description
VMCOREDD_cxgb4_0000:02:00.4 0x02000fd8      Unknown note type:(0x00000700)
VMCOREDD_cxgb4_0000:04:00.4 0x02000fd8      Unknown note type:(0x00000700)
CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
VMCOREINFO           0x0000074f     Unknown note type: (0x00000000)

Here, for my two devices, the dump’s names are
VMCOREDD_cxgb4_0000:02:00.4 and VMCOREDD_cxgb4_0000:04:00.4.

It’s really up to the callers to write their own unique name for the
dump.  The name is appended to “VMCOREDD_” string.

> Rahul, did you look at the patchset I pointed out?

For devlink, I think the dump name would be identified by
bus_type/device_name; i.e. “pci/0000:02:00.4” for my example.
Is my understanding correct?

Thanks,
Rahul
Alex Vesker April 3, 2018, 5:43 a.m. UTC | #10
On 4/2/2018 12:12 PM, Jiri Pirko wrote:
> Fri, Mar 30, 2018 at 05:11:29PM CEST, andrew@lunn.ch wrote:
>>> Please see:
>>> http://patchwork.ozlabs.org/project/netdev/list/?series=36524
>>>
>>> I bevieve that the solution in the patchset could be used for
>>> your usecase too.
>> Hi Jiri
>>
>> https://lkml.org/lkml/2018/3/20/436
>>
>> How well does this API work for a 2Gbyte snapshot?
> Ccing Alex who did the tests.

I didn't check the performance for such a large snapshot.
 From my measurement it takes 0.09s for 1 MB of data this means
about ~3m.
This can be tuned and improved since this is a socket application.

>>     Andrew
Jiri Pirko April 3, 2018, 7:04 a.m. UTC | #11
Mon, Apr 02, 2018 at 02:30:45PM CEST, rahul.lakkireddy@chelsio.com wrote:
>On Monday, April 04/02/18, 2018 at 14:41:43 +0530, Jiri Pirko wrote:
>> Fri, Mar 30, 2018 at 08:42:00PM CEST, ebiederm@xmission.com wrote:
>> >Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> writes:
>> >
>> >> On Friday, March 03/30/18, 2018 at 16:09:07 +0530, Jiri Pirko wrote:
>> >>> Sat, Mar 24, 2018 at 11:56:33AM CET, rahul.lakkireddy@chelsio.com wrote:
>> >>> >Add a new module crashdd that exports the /sys/kernel/crashdd/
>> >>> >directory in second kernel, containing collected hardware/firmware
>> >>> >dumps.
>> >>> >
>> >>> >The sequence of actions done by device drivers to append their device
>> >>> >specific hardware/firmware logs to /sys/kernel/crashdd/ directory are
>> >>> >as follows:
>> >>> >
>> >>> >1. During probe (before hardware is initialized), device drivers
>> >>> >register to the crashdd module (via crashdd_add_dump()), with
>> >>> >callback function, along with buffer size and log name needed for
>> >>> >firmware/hardware log collection.
>> >>> >
>> >>> >2. Crashdd creates a driver's directory under
>> >>> >/sys/kernel/crashdd/<driver>. Then, it allocates the buffer with
>> >>> 
>> >>> This smells. I need to identify the exact ASIC instance that produced
>> >>> the dump. To identify by driver name does not help me if I have multiple
>> >>> instances of the same driver. This looks wrong to me. This looks like
>> >>> a job for devlink where you have 1 devlink instance per 1 ASIC instance.
>> >>> 
>> >>> Please see:
>> >>> http://patchwork.ozlabs.org/project/netdev/list/?series=36524
>> >>> 
>> >>> I bevieve that the solution in the patchset could be used for
>> >>> your usecase too.
>> >>> 
>> >>> 
>> >>
>> >> The sysfs approach proposed here had been dropped in favour exporting
>> >> the dumps as ELF notes in /proc/vmcore.
>> >>
>> >> Will be posting the new patches soon.
>> >
>> >The concern was actually how you identify which device that came from.
>> >Where you read the identifier changes but sysfs or /proc/vmcore the
>> >change remains valid.
>> 
>> Yeah. I still don't see how you link the dump and the device.
>
>In our case, the dump and the device are being identified by the
>driver’s name followed by its corresponding pci bus id.  I’ve posted an
>example in my v3 series:
>
>https://www.spinics.net/lists/netdev/msg493781.html
>
>Here’s an extract from the link above:
>
># readelf -n /proc/vmcore
>
>Displaying notes found at file offset 0x00001000 with length 0x04003288:
>Owner                 Data size     Description
>VMCOREDD_cxgb4_0000:02:00.4 0x02000fd8      Unknown note type:(0x00000700)
>VMCOREDD_cxgb4_0000:04:00.4 0x02000fd8      Unknown note type:(0x00000700)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>CORE                 0x00000150     NT_PRSTATUS (prstatus structure)
>VMCOREINFO           0x0000074f     Unknown note type: (0x00000000)
>
>Here, for my two devices, the dump’s names are
>VMCOREDD_cxgb4_0000:02:00.4 and VMCOREDD_cxgb4_0000:04:00.4.
>
>It’s really up to the callers to write their own unique name for the
>dump.  The name is appended to “VMCOREDD_” string.
>
>> Rahul, did you look at the patchset I pointed out?
>
>For devlink, I think the dump name would be identified by
>bus_type/device_name; i.e. “pci/0000:02:00.4” for my example.
>Is my understanding correct?

Yes.


>
>Thanks,
>Rahul
Andrew Lunn April 3, 2018, 12:35 p.m. UTC | #12
On Tue, Apr 03, 2018 at 08:43:27AM +0300, Alex Vesker wrote:
> 
> 
> On 4/2/2018 12:12 PM, Jiri Pirko wrote:
> >Fri, Mar 30, 2018 at 05:11:29PM CEST, andrew@lunn.ch wrote:
> >>>Please see:
> >>>http://patchwork.ozlabs.org/project/netdev/list/?series=36524
> >>>
> >>>I bevieve that the solution in the patchset could be used for
> >>>your usecase too.
> >>Hi Jiri
> >>
> >>https://lkml.org/lkml/2018/3/20/436
> >>
> >>How well does this API work for a 2Gbyte snapshot?
> >Ccing Alex who did the tests.
> 
> I didn't check the performance for such a large snapshot.
> From my measurement it takes 0.09s for 1 MB of data this means
> about ~3m.

I was not really thinking about performance. More about how well does
the system work when you ask the kernel for 2GB of RAM to put a
snapshot into? And given your current design, you need another 2GB
buffer for the driver to use before calling this new API.

So i'm asking, how well does this API scale?

I think you need to remove the need for a second buffer in the
driver. Either the driver allocates the buffer and hands it over, or
your core code allocates the buffer and gives it to the driver to
fill. Maybe look at what makes most sense for the crash dump code?

      Andrew
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-kernel-crashdd b/Documentation/ABI/testing/sysfs-kernel-crashdd
new file mode 100644
index 000000000000..1f6b9e27bf2e
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-crashdd
@@ -0,0 +1,34 @@ 
+What:		/sys/kernel/crashdd
+Date:		March 2018
+KernelVersion:	4.16
+Contact:	Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
+		Ganesh Goudar <ganeshgr@chelsio.com>
+Description:
+		Snapshot of underlying hardware/firmware state (like register
+		dump, firmware logs, adapter memory, etc.), at the time of
+		kernel panic will be very helpful while	debugging the culprit
+		device driver. Thus, Crash Device Dump (crashdd) module exports
+		API to allow device drivers to collect the device specific
+		snapshot of their hardware or firmware in crash recovery kernel.
+		The collected dumps will be exported under /sys/kernel/crashdd/
+		directory in crash recovery kernel.
+
+		The sequence of actions done by device drivers to append their
+		device specific hardware/firmware logs to /sys/kernel/crashdd/
+		directory are as follows:
+
+		1. During probe (before hardware is initialized), device drivers
+		register to the crashdd module (via crashdd_add_dump()), with
+		callback function, along with buffer size and log name needed for
+		firmware/hardware log collection.
+
+		2. Crashdd creates a driver's directory under
+		/sys/kernel/crashdd/<driver>. Then, it allocates the buffer
+		with requested size and invokes the device driver's registered
+		callback function.
+
+		3. Device driver collects all hardware/firmware logs into the buffer
+		and returns control back to crashdd.
+
+		4. Crashdd exposes the buffer as a file via
+		/sys/kernel/crashdd/<driver>/<dump_file>.
diff --git a/fs/Kconfig b/fs/Kconfig
index bc821a86d965..aae1c55a7dad 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -208,6 +208,7 @@  config ARCH_HAS_GIGANTIC_PAGE
 
 source "fs/configfs/Kconfig"
 source "fs/efivarfs/Kconfig"
+source "fs/crashdd/Kconfig"
 
 endmenu
 
diff --git a/fs/Makefile b/fs/Makefile
index add789ea270a..ff398a44f611 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -128,3 +128,4 @@  obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
+obj-$(CONFIG_CRASH_DEVICE_DUMP)	+= crashdd/
diff --git a/fs/crashdd/Kconfig b/fs/crashdd/Kconfig
new file mode 100644
index 000000000000..5db9c7c98c17
--- /dev/null
+++ b/fs/crashdd/Kconfig
@@ -0,0 +1,10 @@ 
+config CRASH_DEVICE_DUMP
+	bool "Crash Kernel Device Hardware/Firmware Logs"
+	depends on SYSFS && CRASH_DUMP
+	default y
+	---help---
+	  Device drivers can collect the device specific snapshot of
+	  their hardware or firmware before they are initialized in
+	  crash recovery kernel. If you say Y here a tree of device
+	  specific dumps will be made available under /sys/kernel/crashdd/
+	  directory.
diff --git a/fs/crashdd/Makefile b/fs/crashdd/Makefile
new file mode 100644
index 000000000000..8dbf946c0ea4
--- /dev/null
+++ b/fs/crashdd/Makefile
@@ -0,0 +1,3 @@ 
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y := crashdd.o
diff --git a/fs/crashdd/crashdd.c b/fs/crashdd/crashdd.c
new file mode 100644
index 000000000000..996ba926578b
--- /dev/null
+++ b/fs/crashdd/crashdd.c
@@ -0,0 +1,233 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Chelsio Communications, Inc. All rights reserved. */
+
+#include <linux/vmalloc.h>
+#include <linux/crash_dump.h>
+#include <linux/crashdd.h>
+
+#include "crashdd_internal.h"
+
+static LIST_HEAD(crashdd_list);
+static DEFINE_MUTEX(crashdd_mutex);
+
+static struct kobject *crashdd_kobj;
+
+static ssize_t crashdd_read(struct file *filp, struct kobject *kobj,
+			    struct bin_attribute *bin_attr,
+			    char *buf, loff_t fpos, size_t count)
+{
+	struct crashdd_dump_node *dump = bin_attr->private;
+
+	memcpy(buf, dump->buf + fpos, count);
+	return count;
+}
+
+static struct kobject *crashdd_mkdir(const char *name)
+{
+	return kobject_create_and_add(name, crashdd_kobj);
+}
+
+static int crashdd_add_file(struct kobject *kobj, const char *name,
+			    struct crashdd_dump_node *dump)
+{
+	dump->bin_attr.attr.name = name;
+	dump->bin_attr.attr.mode = 0444;
+	dump->bin_attr.size = dump->size;
+	dump->bin_attr.read = crashdd_read;
+	dump->bin_attr.private = dump;
+
+	return sysfs_create_bin_file(kobj, &dump->bin_attr);
+}
+
+static void crashdd_rmdir(struct kobject *kobj)
+{
+	kobject_put(kobj);
+}
+
+/**
+ * crashdd_init_driver - create a sysfs driver entry.
+ * @name: Name of the directory.
+ *
+ * Creates a directory under /sys/kernel/crashdd/ with @name.  Allocates
+ * and saves the sysfs entry.  The sysfs entry is added to the global
+ * list and then returned to the caller. On failure, returns NULL.
+ */
+static struct crashdd_driver_node *crashdd_init_driver(const char *name)
+{
+	struct crashdd_driver_node *node;
+
+	node = vzalloc(sizeof(*node));
+	if (!node)
+		return NULL;
+
+	/* Create a driver's directory under /sys/kernel/crashdd/ */
+	node->kobj = crashdd_mkdir(name);
+	if (!node->kobj) {
+		vfree(node);
+		return NULL;
+	}
+
+	atomic_set(&node->refcnt, 1);
+
+	/* Initialize the list of dumps that go under this driver's
+	 * directory.
+	 */
+	INIT_LIST_HEAD(&node->dump_list);
+
+	/* Add the driver's entry to global list */
+	mutex_lock(&crashdd_mutex);
+	list_add_tail(&node->list, &crashdd_list);
+	mutex_unlock(&crashdd_mutex);
+
+	return node;
+}
+
+/**
+ * crashdd_get_driver - get an exisiting sysfs driver entry.
+ * @name: Name of the directory.
+ *
+ * Searches and fetches a sysfs entry having @name.  If @name is
+ * found, then the reference count is incremented and the entry
+ * is returned.  If @name is not found, NULL is returned.
+ */
+static struct crashdd_driver_node *crashdd_get_driver(const char *name)
+{
+	struct crashdd_driver_node *node;
+	int found = 0;
+
+	/* Search for an existing driver sysfs entry having @name */
+	mutex_lock(&crashdd_mutex);
+	list_for_each_entry(node, &crashdd_list, list) {
+		if (!strcmp(node->kobj->name, name)) {
+			atomic_inc(&node->refcnt);
+			found = 1;
+			break;
+		}
+	}
+	mutex_unlock(&crashdd_mutex);
+
+	if (found)
+		return node;
+
+	/* No driver with @name found */
+	return NULL;
+}
+
+/**
+ * crashdd_put_driver - put an exisiting sysfs driver entry.
+ * @node: driver sysfs entry.
+ *
+ * Decrement @node reference count.  If there are no dumps left under it,
+ * delete the sysfs directory and remove it from the global list.
+ */
+static void crashdd_put_driver(struct crashdd_driver_node *node)
+{
+	mutex_lock(&crashdd_mutex);
+	if (atomic_dec_and_test(&node->refcnt)) {
+		/* Delete @node driver entry if it has no dumps under it */
+		crashdd_rmdir(node->kobj);
+		list_del(&node->list);
+	}
+	mutex_unlock(&crashdd_mutex);
+}
+
+/**
+ * crashdd_add_dump - Allocate a directory under /sys/kernel/crashdd/ and
+ * add the dump to it.
+ * @driver_name: directory name under which the dump should be added.
+ * @data: dump info.
+ *
+ * Search for /sys/kernel/crashdd/<@driver_name>/ directory.  If not found,
+ * allocate a new directory under /sys/kernel/crashdd/ with @driver_name.
+ * Allocate the dump file's context and invoke the calling driver's dump
+ * collect routine.  Once collection is done, add the dump under
+ * /sys/kernel/crashdd/<@driver_name>/ directory.
+ */
+int crashdd_add_dump(const char *driver_name, struct crashdd_data *data)
+{
+	struct crashdd_driver_node *node;
+	struct crashdd_dump_node *dump;
+	void *buf = NULL;
+	int ret;
+
+	if (!driver_name || !strlen(driver_name) ||
+	    !data || !strlen(data->name) ||
+	    !data->crashdd_callback || !data->size)
+		return -EINVAL;
+
+	/* Get a driver sysfs entry with specified name. */
+	node = crashdd_get_driver(driver_name);
+	if (!node) {
+		/* No driver sysfs entry found with specified name.
+		 * So create a new one
+		 */
+		node = crashdd_init_driver(driver_name);
+		if (!node)
+			return -ENOMEM;
+	}
+
+	dump = vzalloc(sizeof(*dump));
+	if (!dump) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* Allocate buffer for driver's to write their dumps */
+	buf = vzalloc(data->size);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* Invoke the driver's dump collection routing */
+	ret = data->crashdd_callback(data, buf);
+	if (ret)
+		goto out_err;
+
+	dump->buf = buf;
+	dump->size = data->size;
+
+	/* Add a binary file under /sys/kernel/crashdd/@driver_name/ */
+	ret = crashdd_add_file(node->kobj, data->name, dump);
+	if (ret)
+		goto out_err;
+
+	/* Add the dump to driver sysfs list */
+	mutex_lock(&crashdd_mutex);
+	list_add_tail(&dump->list, &node->dump_list);
+	atomic_inc(&node->refcnt);
+	mutex_unlock(&crashdd_mutex);
+
+	/* Return back the driver sysfs reference */
+	crashdd_put_driver(node);
+	return 0;
+
+out_err:
+	if (buf)
+		vfree(buf);
+
+	if (dump)
+		vfree(dump);
+
+	crashdd_put_driver(node);
+	return ret;
+}
+EXPORT_SYMBOL(crashdd_add_dump);
+
+/* Init function for crash device dump module. */
+static int __init crashdd_init(void)
+{
+	/*
+	 * Only export this directory in 2nd kernel.
+	 */
+	if (!is_kdump_kernel())
+		return 0;
+
+	/* Create /sys/kernel/crashdd/ directory */
+	crashdd_kobj = kobject_create_and_add("crashdd", kernel_kobj);
+	if (!crashdd_kobj)
+		return -ENOMEM;
+
+	return 0;
+}
+fs_initcall(crashdd_init);
diff --git a/fs/crashdd/crashdd_internal.h b/fs/crashdd/crashdd_internal.h
new file mode 100644
index 000000000000..9162d1a4264b
--- /dev/null
+++ b/fs/crashdd/crashdd_internal.h
@@ -0,0 +1,24 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Chelsio Communications, Inc. All rights reserved. */
+
+#ifndef CRASH_DEVICE_DUMP_INTERNAL_H
+#define CRASH_DEVICE_DUMP_INTERNAL_H
+
+/* Binary dump file's context internal to crashdd */
+struct crashdd_dump_node {
+	/* Pointer to list of dumps under the driver sysfs entry */
+	struct list_head list;
+	void *buf;                     /* Buffer containing device's dump */
+	unsigned long size;            /* Size of the buffer */
+	struct bin_attribute bin_attr; /* Binary dump file's attributes */
+};
+
+/* Driver sysfs entry internal to crashdd */
+struct crashdd_driver_node {
+	/* Pointer to global list of driver sysfs entries */
+	struct list_head list;
+	struct list_head dump_list; /* List of dumps under this driver */
+	atomic_t refcnt;            /* Number of dumps under this directory */
+	struct kobject *kobj;       /* Pointer to driver sysfs kobject */
+};
+#endif /* CRASH_DEVICE_DUMP_INTERNAL_H */
diff --git a/include/linux/crashdd.h b/include/linux/crashdd.h
new file mode 100644
index 000000000000..edaba8424019
--- /dev/null
+++ b/include/linux/crashdd.h
@@ -0,0 +1,24 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Chelsio Communications, Inc. All rights reserved. */
+
+#ifndef CRASH_DEVICE_DUMP_H
+#define CRASH_DEVICE_DUMP_H
+
+/* Max dump name length */
+#define CRASHDD_NAME_LENGTH 32
+
+/* Device Dump information to be filled by drivers */
+struct crashdd_data {
+	char name[CRASHDD_NAME_LENGTH]; /* Unique name of the dump */
+	unsigned long size;             /* Size of the dump */
+	/* Driver's registered callback to be invoked to collect dump */
+	int (*crashdd_callback)(struct crashdd_data *data, void *buf);
+};
+
+#ifdef CONFIG_CRASH_DEVICE_DUMP
+int crashdd_add_dump(const char *driver_name, struct crashdd_data *data);
+#else
+#define crashdd_add_dump(x, y) 0
+#endif /* CONFIG_CRASH_DEVICE_DUMP */
+
+#endif /* CRASH_DEVICE_DUMP_H */