Message ID | 156630280239.8896.11769233860624935762.stgit@hbathini.in.ibm.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | Add FADump support on PowerNV platform | expand |
Context | Check | Description |
---|---|---|
snowpatch_ozlabs/apply_patch | success | Successfully applied on branch next (c9633332103e55bc73d80d07ead28b95a22a85a3) |
snowpatch_ozlabs/checkpatch | warning | total: 0 errors, 2 warnings, 4 checks, 309 lines checked |
Hari Bathini <hbathini@linux.ibm.com> writes: > diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h > index 7107cf2..fc408b0 100644 > --- a/arch/powerpc/kernel/fadump-common.h > +++ b/arch/powerpc/kernel/fadump-common.h > @@ -98,7 +98,11 @@ struct fw_dump { > /* cmd line option during boot */ > unsigned long reserve_bootvar; > > + unsigned long cpu_state_destination_addr; AFAICS that is only used in two places, and both of them have to call __va() on it, so why don't we store the virtual address to start with? > diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c > index f75b861..9a32a7f 100644 > --- a/arch/powerpc/platforms/powernv/opal-fadump.c > +++ b/arch/powerpc/platforms/powernv/opal-fadump.c > @@ -282,15 +283,122 @@ static void opal_fadump_cleanup(struct fw_dump *fadump_conf) > pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret); > } > > +static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs, > + u32 reg_type, u32 reg_num, > + u64 reg_val) > +{ > + if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) { > + if (reg_num < 32) > + regs->gpr[reg_num] = reg_val; > + return; > + } > + > + switch (reg_num) { > + case SPRN_CTR: > + regs->ctr = reg_val; > + break; > + case SPRN_LR: > + regs->link = reg_val; > + break; > + case SPRN_XER: > + regs->xer = reg_val; > + break; > + case SPRN_DAR: > + regs->dar = reg_val; > + break; > + case SPRN_DSISR: > + regs->dsisr = reg_val; > + break; > + case HDAT_FADUMP_REG_ID_NIP: > + regs->nip = reg_val; > + break; > + case HDAT_FADUMP_REG_ID_MSR: > + regs->msr = reg_val; > + break; > + case HDAT_FADUMP_REG_ID_CCR: > + regs->ccr = reg_val; > + break; > + } > +} > + > +static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt, > + unsigned int reg_entry_size, > + struct pt_regs *regs) > +{ > + int i; > + struct hdat_fadump_reg_entry *reg_entry; Where's my christmas tree :) > + > + memset(regs, 0, sizeof(struct pt_regs)); > + > + for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { > + reg_entry = (struct hdat_fadump_reg_entry *)bufp; > + opal_fadump_set_regval_regnum(regs, > + be32_to_cpu(reg_entry->reg_type), > + be32_to_cpu(reg_entry->reg_num), > + be64_to_cpu(reg_entry->reg_val)); > + } > +} > + > +static inline bool __init is_thread_core_inactive(u8 core_state) > +{ > + bool is_inactive = false; > + > + if (core_state == HDAT_FADUMP_CORE_INACTIVE) > + is_inactive = true; > + > + return is_inactive; return core_state == HDAT_FADUMP_CORE_INACTIVE; ?? In fact there's only one caller, so just drop the inline entirely. > +} > + > /* > * Convert CPU state data saved at the time of crash into ELF notes. > + * > + * Each register entry is of 16 bytes, A numerical identifier along with > + * a GPR/SPR flag in the first 8 bytes and the register value in the next > + * 8 bytes. For more details refer to F/W documentation. > */ > static int __init opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf) > { > u32 num_cpus, *note_buf; > struct fadump_crash_info_header *fdh = NULL; > + struct hdat_fadump_thread_hdr *thdr; > + unsigned long addr; > + u32 thread_pir; > + char *bufp; > + struct pt_regs regs; > + unsigned int size_of_each_thread; > + unsigned int regs_offset, regs_cnt, reg_esize; > + int i; unsigned int size_of_each_thread, regs_offset, regs_cnt, reg_esize; struct fadump_crash_info_header *fdh = NULL; u32 num_cpus, thread_pir, *note_buf; struct hdat_fadump_thread_hdr *thdr; struct pt_regs regs; unsigned long addr; char *bufp; int i; Ah much better :) Though the number of variables might be an indication that this function could be split into smaller parts. > @@ -473,6 +627,26 @@ int __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, ulong node) > return 1; > } > > + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); > + if ((ret != OPAL_SUCCESS) || !addr) { > + pr_err("Failed to get CPU metadata (%lld)\n", ret); > + return 1; > + } > + > + addr = be64_to_cpu(addr); > + pr_debug("CPU metadata addr: %llx\n", addr); > + > + opal_cpu_metadata = __va(addr); > + r_opal_cpu_metadata = (void *)addr; Another r_ variable I don't understand. > diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h > index 19cac1f..ce4c522 100644 > --- a/arch/powerpc/platforms/powernv/opal-fadump.h > +++ b/arch/powerpc/platforms/powernv/opal-fadump.h > @@ -30,4 +30,43 @@ struct opal_fadump_mem_struct { > struct opal_mpipl_region rgn[OPAL_FADUMP_MAX_MEM_REGS]; > } __attribute__((packed)); > > +/* > + * CPU state data is provided by f/w. Below are the definitions > + * provided in HDAT spec. Refer to latest HDAT specification for > + * any update to this format. > + */ How is this meant to work? If HDAT ever changes the format they will break all existing kernels in the field. > +#define HDAT_FADUMP_CPU_DATA_VERSION 1 > + > +#define HDAT_FADUMP_CORE_INACTIVE (0x0F) > + > +/* HDAT thread header for register entries */ > +struct hdat_fadump_thread_hdr { > + __be32 pir; > + /* 0x00 - 0x0F - The corresponding stop state of the core */ > + u8 core_state; > + u8 reserved[3]; > + > + __be32 offset; /* Offset to Register Entries array */ > + __be32 ecnt; /* Number of entries */ > + __be32 esize; /* Alloc size of each array entry in bytes */ > + __be32 eactsz; /* Actual size of each array entry in bytes */ > +} __attribute__((packed)); > + > +/* Register types populated by f/w */ > +#define HDAT_FADUMP_REG_TYPE_GPR 0x01 > +#define HDAT_FADUMP_REG_TYPE_SPR 0x02 > + > +/* ID numbers used by f/w while populating certain registers */ > +#define HDAT_FADUMP_REG_ID_NIP 0x7D0 > +#define HDAT_FADUMP_REG_ID_MSR 0x7D1 > +#define HDAT_FADUMP_REG_ID_CCR 0x7D2 > + > +/* HDAT register entry. */ > +struct hdat_fadump_reg_entry { > + __be32 reg_type; > + __be32 reg_num; > + __be64 reg_val; > +} __attribute__((packed)); cheers
On 04/09/19 5:50 PM, Michael Ellerman wrote: > Hari Bathini <hbathini@linux.ibm.com> writes: > [...] >> +/* >> + * CPU state data is provided by f/w. Below are the definitions >> + * provided in HDAT spec. Refer to latest HDAT specification for >> + * any update to this format. >> + */ > > How is this meant to work? If HDAT ever changes the format they will > break all existing kernels in the field. > >> +#define HDAT_FADUMP_CPU_DATA_VERSION 1 Changes are not expected here. But this is just to cover for such scenario, if that ever happens. Also, I think it is a bit far-fetched to error out if versions mismatch. Warning and proceeding sounds worthier because the changes are usually backward compatible, if and when there are any. Will update accordingly... - Hari
On Mon, Sep 9, 2019 at 11:23 PM Hari Bathini <hbathini@linux.ibm.com> wrote: > > On 04/09/19 5:50 PM, Michael Ellerman wrote: > > Hari Bathini <hbathini@linux.ibm.com> writes: > > > > [...] > > >> +/* > >> + * CPU state data is provided by f/w. Below are the definitions > >> + * provided in HDAT spec. Refer to latest HDAT specification for > >> + * any update to this format. > >> + */ > > > > How is this meant to work? If HDAT ever changes the format they will > > break all existing kernels in the field. > > > >> +#define HDAT_FADUMP_CPU_DATA_VERSION 1 > > Changes are not expected here. But this is just to cover for such scenario, > if that ever happens. The HDAT spec doesn't define the SPR numbers for NIA, MSR and the CR. As far as I can tell the values you've assumed here are chip-specific, non-architected SPR numbers that come from an array buried somewhere in the SBE codebase. I don't believe you for a second when you say that this will never change. > Also, I think it is a bit far-fetched to error out if versions mismatch. > Warning and proceeding sounds worthier because the changes are usually > backward compatible, if and when there are any. Will update accordingly... Literally the only reason I didn't drop the CPU DATA parts of the OPAL MPIPL series was because I assumed the kernel would do the sensible thing and reject or ignore the structure if it did not know how to parse the data. Oliver
On 09/09/19 9:03 PM, Oliver O'Halloran wrote: > On Mon, Sep 9, 2019 at 11:23 PM Hari Bathini <hbathini@linux.ibm.com> wrote: >> >> On 04/09/19 5:50 PM, Michael Ellerman wrote: >>> Hari Bathini <hbathini@linux.ibm.com> writes: >>> >> >> [...] >> >>>> +/* >>>> + * CPU state data is provided by f/w. Below are the definitions >>>> + * provided in HDAT spec. Refer to latest HDAT specification for >>>> + * any update to this format. >>>> + */ >>> >>> How is this meant to work? If HDAT ever changes the format they will >>> break all existing kernels in the field. >>> >>>> +#define HDAT_FADUMP_CPU_DATA_VERSION 1 >> >> Changes are not expected here. But this is just to cover for such scenario, >> if that ever happens. > > The HDAT spec doesn't define the SPR numbers for NIA, MSR and the CR. > As far as I can tell the values you've assumed here are chip-specific, > non-architected SPR numbers that come from an array buried somewhere > in the SBE codebase. I don't believe you for a second when you say > that this will never change. At least, the understanding is that this numbers not change across processor generations. If something changes, it is supposed to be handled in SBE. Also, I am told this numbers would be listed in the HDAT Spec. Not sure if that happened yet though. Vasant, you have anything to add? >> Also, I think it is a bit far-fetched to error out if versions mismatch. >> Warning and proceeding sounds worthier because the changes are usually >> backward compatible, if and when there are any. Will update accordingly... > > Literally the only reason I didn't drop the CPU DATA parts of the OPAL > MPIPL series was because I assumed the kernel would do the sensible > thing and reject or ignore the structure if it did not know how to > parse the data. I think, the changes if any, would have to be backward compatible for the sake of sanity. Even if they are not, we are better off exporting the /proc/vmcore with a warning and some crazy CPU register data (if parsing goes alright) than no dump at all? - Hari
Hari Bathini <hbathini@linux.ibm.com> writes: > On 09/09/19 9:03 PM, Oliver O'Halloran wrote: >> On Mon, Sep 9, 2019 at 11:23 PM Hari Bathini <hbathini@linux.ibm.com> wrote: >>> On 04/09/19 5:50 PM, Michael Ellerman wrote: >>>> Hari Bathini <hbathini@linux.ibm.com> writes: >>> [...] >>> >>>>> +/* >>>>> + * CPU state data is provided by f/w. Below are the definitions >>>>> + * provided in HDAT spec. Refer to latest HDAT specification for >>>>> + * any update to this format. >>>>> + */ >>>> >>>> How is this meant to work? If HDAT ever changes the format they will >>>> break all existing kernels in the field. >>>> >>>>> +#define HDAT_FADUMP_CPU_DATA_VERSION 1 >>> >>> Changes are not expected here. But this is just to cover for such scenario, >>> if that ever happens. >> >> The HDAT spec doesn't define the SPR numbers for NIA, MSR and the CR. >> As far as I can tell the values you've assumed here are chip-specific, >> non-architected SPR numbers that come from an array buried somewhere >> in the SBE codebase. I don't believe you for a second when you say >> that this will never change. > > At least, the understanding is that this numbers not change across processor > generations. If something changes, it is supposed to be handled in SBE. Also, > I am told this numbers would be listed in the HDAT Spec. Not sure if that > happened yet though. Vasant, you have anything to add? That doesn't help much because the HDAT spec is not public. The point is with the code written the way it is, these values *must not* change, or else all existing kernels will be broken, which is not acceptable. >>> Also, I think it is a bit far-fetched to error out if versions mismatch. >>> Warning and proceeding sounds worthier because the changes are usually >>> backward compatible, if and when there are any. Will update accordingly... >> >> Literally the only reason I didn't drop the CPU DATA parts of the OPAL >> MPIPL series was because I assumed the kernel would do the sensible >> thing and reject or ignore the structure if it did not know how to >> parse the data. > > I think, the changes if any, would have to be backward compatible for the sake > of sanity. People need to understand that this is an ABI between firmware and in-the-field distribution kernels which are only updated at customer discretion, or possibly never. Any changes *must be* backward compatible. Looking at the header struct: +struct hdat_fadump_thread_hdr { + __be32 pir; + /* 0x00 - 0x0F - The corresponding stop state of the core */ + u8 core_state; + u8 reserved[3]; You have those 3 reserved bytes, so a future revision could repurpose one of those as a flag to indicate a new format. And/or the hdr could be made bigger and new kernels could be taught to look for new things in the space after the hdr but before the reg entries. So I think there is a reasonable mechanism for extending the format in future, but my point is people must understand that this is an ABI and changes must be made accordingly. > Even if they are not, we are better off exporting the /proc/vmcore > with a warning and some crazy CPU register data (if parsing goes alright) than > no dump at all? If it's just a case of reg entries that we don't recognise then yes I think it would be OK to just skip them and continue exporting. But if there's any deeper misunderstanding of the format then we should bail out. I notice now that you don't do anything in opal_fadump_set_regval_regnum() if you are passed a register we don't understand, so that probably needs fixing. cheers
On 10/09/19 7:35 PM, Michael Ellerman wrote: > Hari Bathini <hbathini@linux.ibm.com> writes: >> On 09/09/19 9:03 PM, Oliver O'Halloran wrote: >>> On Mon, Sep 9, 2019 at 11:23 PM Hari Bathini <hbathini@linux.ibm.com> wrote: >>>> On 04/09/19 5:50 PM, Michael Ellerman wrote: >>>>> Hari Bathini <hbathini@linux.ibm.com> writes: >>>> [...] >>>> >>>>>> +/* >>>>>> + * CPU state data is provided by f/w. Below are the definitions >>>>>> + * provided in HDAT spec. Refer to latest HDAT specification for >>>>>> + * any update to this format. >>>>>> + */ >>>>> >>>>> How is this meant to work? If HDAT ever changes the format they will >>>>> break all existing kernels in the field. >>>>> >>>>>> +#define HDAT_FADUMP_CPU_DATA_VERSION 1 >>>> >>>> Changes are not expected here. But this is just to cover for such scenario, >>>> if that ever happens. >>> >>> The HDAT spec doesn't define the SPR numbers for NIA, MSR and the CR. >>> As far as I can tell the values you've assumed here are chip-specific, >>> non-architected SPR numbers that come from an array buried somewhere >>> in the SBE codebase. I don't believe you for a second when you say >>> that this will never change. >> >> At least, the understanding is that this numbers not change across processor >> generations. If something changes, it is supposed to be handled in SBE. Also, >> I am told this numbers would be listed in the HDAT Spec. Not sure if that >> happened yet though. Vasant, you have anything to add? > > That doesn't help much because the HDAT spec is not public. > > The point is with the code written the way it is, these values *must > not* change, or else all existing kernels will be broken, which is not > acceptable. Yeah. It is absurd to error out just by looking at version number... > >>>> Also, I think it is a bit far-fetched to error out if versions mismatch. >>>> Warning and proceeding sounds worthier because the changes are usually >>>> backward compatible, if and when there are any. Will update accordingly... >>> >>> Literally the only reason I didn't drop the CPU DATA parts of the OPAL >>> MPIPL series was because I assumed the kernel would do the sensible >>> thing and reject or ignore the structure if it did not know how to >>> parse the data. >> >> I think, the changes if any, would have to be backward compatible for the sake >> of sanity. > > People need to understand that this is an ABI between firmware and > in-the-field distribution kernels which are only updated at customer > discretion, or possibly never. > > Any changes *must be* backward compatible. > > Looking at the header struct: > > +struct hdat_fadump_thread_hdr { > + __be32 pir; > + /* 0x00 - 0x0F - The corresponding stop state of the core */ > + u8 core_state; > + u8 reserved[3]; > > You have those 3 reserved bytes, so a future revision could repurpose > one of those as a flag to indicate a new format. And/or the hdr could be > made bigger and new kernels could be taught to look for new things in > the space after the hdr but before the reg entries. > > So I think there is a reasonable mechanism for extending the format in > future, but my point is people must understand that this is an ABI and > changes must be made accordingly. True. The folks who make the changes to this format should be aware that breaking kernel ABI is not going to be pretty and I think they are :) > >> Even if they are not, we are better off exporting the /proc/vmcore >> with a warning and some crazy CPU register data (if parsing goes alright) than >> no dump at all? > > If it's just a case of reg entries that we don't recognise then yes I > think it would be OK to just skip them and continue exporting. But if > there's any deeper misunderstanding of the format then we should bail > out. Sure. Will try and fix that by first trying to do a sanity check on the fields that are needed for parsing the data and proceed with a warning if nothing weird is detected and fallback to just appending crashing cpu as done in patch 16/31, if anything weird is observed. That should hopefully take care of all cases in the best possible way.. > > I notice now that you don't do anything in opal_fadump_set_regval_regnum() > if you are passed a register we don't understand, so that probably needs > fixing. f/w provides about 100 odd registers in the CPU state data. Most of them pt_regs doesn't care about. So, opal_fadump_set_regval_regnum is happy as long as it find the registers listed in it. Unless, pt_regs changes, we can stick to this and ignore the rest of them? - Hari
diff --git a/arch/powerpc/kernel/fadump-common.h b/arch/powerpc/kernel/fadump-common.h index 7107cf2..fc408b0 100644 --- a/arch/powerpc/kernel/fadump-common.h +++ b/arch/powerpc/kernel/fadump-common.h @@ -98,7 +98,11 @@ struct fw_dump { /* cmd line option during boot */ unsigned long reserve_bootvar; + unsigned long cpu_state_destination_addr; + unsigned long cpu_state_data_version; + unsigned long cpu_state_entry_size; unsigned long cpu_state_data_size; + unsigned long hpte_region_size; unsigned long boot_mem_dest_addr; diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index f75b861..9a32a7f 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -23,6 +23,7 @@ #include "opal-fadump.h" static const struct opal_fadump_mem_struct *opal_fdm_active; +static const struct opal_mpipl_fadump *opal_cpu_metadata; static struct opal_fadump_mem_struct *opal_fdm; static int opal_fadump_unregister(struct fw_dump *fadump_conf); @@ -282,15 +283,122 @@ static void opal_fadump_cleanup(struct fw_dump *fadump_conf) pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret); } +static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs, + u32 reg_type, u32 reg_num, + u64 reg_val) +{ + if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) { + if (reg_num < 32) + regs->gpr[reg_num] = reg_val; + return; + } + + switch (reg_num) { + case SPRN_CTR: + regs->ctr = reg_val; + break; + case SPRN_LR: + regs->link = reg_val; + break; + case SPRN_XER: + regs->xer = reg_val; + break; + case SPRN_DAR: + regs->dar = reg_val; + break; + case SPRN_DSISR: + regs->dsisr = reg_val; + break; + case HDAT_FADUMP_REG_ID_NIP: + regs->nip = reg_val; + break; + case HDAT_FADUMP_REG_ID_MSR: + regs->msr = reg_val; + break; + case HDAT_FADUMP_REG_ID_CCR: + regs->ccr = reg_val; + break; + } +} + +static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt, + unsigned int reg_entry_size, + struct pt_regs *regs) +{ + int i; + struct hdat_fadump_reg_entry *reg_entry; + + memset(regs, 0, sizeof(struct pt_regs)); + + for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { + reg_entry = (struct hdat_fadump_reg_entry *)bufp; + opal_fadump_set_regval_regnum(regs, + be32_to_cpu(reg_entry->reg_type), + be32_to_cpu(reg_entry->reg_num), + be64_to_cpu(reg_entry->reg_val)); + } +} + +static inline bool __init is_thread_core_inactive(u8 core_state) +{ + bool is_inactive = false; + + if (core_state == HDAT_FADUMP_CORE_INACTIVE) + is_inactive = true; + + return is_inactive; +} + /* * Convert CPU state data saved at the time of crash into ELF notes. + * + * Each register entry is of 16 bytes, A numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. */ static int __init opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf) { u32 num_cpus, *note_buf; struct fadump_crash_info_header *fdh = NULL; + struct hdat_fadump_thread_hdr *thdr; + unsigned long addr; + u32 thread_pir; + char *bufp; + struct pt_regs regs; + unsigned int size_of_each_thread; + unsigned int regs_offset, regs_cnt, reg_esize; + int i; + + fadump_conf->cpu_state_entry_size = + be32_to_cpu(opal_cpu_metadata->cpu_data_size); + fadump_conf->cpu_state_destination_addr = + be64_to_cpu(opal_cpu_metadata->region[0].dest); + fadump_conf->cpu_state_data_size = + be64_to_cpu(opal_cpu_metadata->region[0].size); + + if ((fadump_conf->cpu_state_destination_addr == 0) || + (fadump_conf->cpu_state_entry_size == 0)) { + pr_err("CPU state data not available for processing!\n"); + return -ENODEV; + } + + size_of_each_thread = fadump_conf->cpu_state_entry_size; + num_cpus = (fadump_conf->cpu_state_data_size / size_of_each_thread); + + addr = fadump_conf->cpu_state_destination_addr; + bufp = __va(addr); + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); - num_cpus = 1; /* Allocate buffer to hold cpu crash notes. */ fadump_conf->cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); fadump_conf->cpu_notes_buf_size = @@ -309,10 +417,53 @@ static int __init opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf) if (fadump_conf->fadumphdr_addr) fdh = __va(fadump_conf->fadumphdr_addr); - if (fdh && (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN)) { - note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs)); - final_note(note_buf); + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + for (i = 0; i < num_cpus; i++, bufp += size_of_each_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + + thread_pir = be32_to_cpu(thdr->pir); + pr_debug("%04d) PIR: 0x%x, core state: 0x%02x\n", + (i + 1), thread_pir, thdr->core_state); + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (is_thread_core_inactive(thdr->core_state)) + continue; + + /* + * If this is kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh && (fdh->crashing_cpu == thread_pir)) { + note_buf = fadump_regs_to_elf_notes(note_buf, + &fdh->regs); + pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + fdh->crashing_cpu, fdh->regs.gpr[1], + fdh->regs.nip); + continue; + } + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, ®s); + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + thread_pir, regs.gpr[1], regs.nip); + } + final_note(note_buf); + + if (fdh) { pr_debug("Updating elfcore header (%llx) with cpu notes\n", fdh->elfcorehdr_addr); fadump_update_elfcore_header(fadump_conf, @@ -327,7 +478,8 @@ static int __init opal_fadump_process(struct fw_dump *fadump_conf) struct fadump_crash_info_header *fdh; int rc = 0; - if (!opal_fdm_active || !fadump_conf->fadumphdr_addr) + if (!opal_fdm_active || !opal_cpu_metadata || + !fadump_conf->fadumphdr_addr) return -EINVAL; /* Validate the fadump crash info header */ @@ -337,13 +489,6 @@ static int __init opal_fadump_process(struct fw_dump *fadump_conf) return -EINVAL; } - /* - * TODO: To build cpu notes, find a way to map PIR to logical id. - * Also, we may need different method for pseries and powernv. - * The currently booted kernel could have a different PIR to - * logical id mapping. So, try saving info of previous kernel's - * paca to get the right PIR to logical id mapping. - */ rc = opal_fadump_build_cpu_notes(fadump_conf); if (rc) return rc; @@ -397,6 +542,14 @@ static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, { int rc; + /* + * Unlike on pSeries platform, logical CPU number is not provided + * with architected register state data. So, store the crashing + * CPU's PIR instead to plug the appropriate register data for + * crashing CPU in the vmcore file. + */ + fdh->crashing_cpu = (u32)mfspr(SPRN_PIR); + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg); if (rc == OPAL_UNSUPPORTED) { pr_emerg("Reboot type %d not supported.\n", @@ -449,6 +602,7 @@ int __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, ulong node) u64 addr = 0; s64 ret; const struct opal_fadump_mem_struct *r_opal_fdm_active; + const struct opal_mpipl_fadump *r_opal_cpu_metadata; ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr); if ((ret != OPAL_SUCCESS) || !addr) { @@ -473,6 +627,26 @@ int __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, ulong node) return 1; } + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get CPU metadata (%lld)\n", ret); + return 1; + } + + addr = be64_to_cpu(addr); + pr_debug("CPU metadata addr: %llx\n", addr); + + opal_cpu_metadata = __va(addr); + r_opal_cpu_metadata = (void *)addr; + fadump_conf->cpu_state_data_version = + be32_to_cpu(r_opal_cpu_metadata->cpu_data_version); + if (fadump_conf->cpu_state_data_version != + HDAT_FADUMP_CPU_DATA_VERSION) { + pr_err("CPU data format version (%lu) mismatch!\n", + fadump_conf->cpu_state_data_version); + return 1; + } + pr_info("Firmware-assisted dump is active.\n"); fadump_conf->dump_active = 1; opal_fadump_get_config(fadump_conf, r_opal_fdm_active); diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h index 19cac1f..ce4c522 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.h +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -30,4 +30,43 @@ struct opal_fadump_mem_struct { struct opal_mpipl_region rgn[OPAL_FADUMP_MAX_MEM_REGS]; } __attribute__((packed)); +/* + * CPU state data is provided by f/w. Below are the definitions + * provided in HDAT spec. Refer to latest HDAT specification for + * any update to this format. + */ + +#define HDAT_FADUMP_CPU_DATA_VERSION 1 + +#define HDAT_FADUMP_CORE_INACTIVE (0x0F) + +/* HDAT thread header for register entries */ +struct hdat_fadump_thread_hdr { + __be32 pir; + /* 0x00 - 0x0F - The corresponding stop state of the core */ + u8 core_state; + u8 reserved[3]; + + __be32 offset; /* Offset to Register Entries array */ + __be32 ecnt; /* Number of entries */ + __be32 esize; /* Alloc size of each array entry in bytes */ + __be32 eactsz; /* Actual size of each array entry in bytes */ +} __attribute__((packed)); + +/* Register types populated by f/w */ +#define HDAT_FADUMP_REG_TYPE_GPR 0x01 +#define HDAT_FADUMP_REG_TYPE_SPR 0x02 + +/* ID numbers used by f/w while populating certain registers */ +#define HDAT_FADUMP_REG_ID_NIP 0x7D0 +#define HDAT_FADUMP_REG_ID_MSR 0x7D1 +#define HDAT_FADUMP_REG_ID_CCR 0x7D2 + +/* HDAT register entry. */ +struct hdat_fadump_reg_entry { + __be32 reg_type; + __be32 reg_num; + __be64 reg_val; +} __attribute__((packed)); + #endif /* __PPC64_OPAL_FA_DUMP_H__ */