diff mbox series

[v3,2/2] pseries/mce: Refactor the pseries mce handling code

Message ID 20211124095500.98656-2-ganeshgr@linux.ibm.com (mailing list archive)
State Rejected, archived
Headers show
Series [v3,1/2] powerpc/mce: Avoid using irq_work_queue() in realmode | expand

Checks

Context Check Description
snowpatch_ozlabs/github-powerpc_selftests success Successfully ran 8 jobs.
snowpatch_ozlabs/github-powerpc_ppctests success Successfully ran 8 jobs.
snowpatch_ozlabs/github-powerpc_kernel_qemu success Successfully ran 24 jobs.
snowpatch_ozlabs/github-powerpc_sparse success Successfully ran 4 jobs.
snowpatch_ozlabs/github-powerpc_clang success Successfully ran 7 jobs.

Commit Message

Ganesh Goudar Nov. 24, 2021, 9:55 a.m. UTC
Now that we are no longer switching on the mmu in realmode
mce handler, Revert the commit 4ff753feab02("powerpc/pseries:
Avoid using addr_to_pfn in real mode") partially, which
introduced functions mce_handle_err_virtmode/realmode() to
separate mce handler code which needed translation to enabled.

Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
---
 arch/powerpc/platforms/pseries/ras.c | 122 +++++++++++----------------
 1 file changed, 49 insertions(+), 73 deletions(-)

Comments

Nicholas Piggin Nov. 24, 2021, 1:10 p.m. UTC | #1
Excerpts from Ganesh Goudar's message of November 24, 2021 7:55 pm:
> Now that we are no longer switching on the mmu in realmode
> mce handler, Revert the commit 4ff753feab02("powerpc/pseries:
> Avoid using addr_to_pfn in real mode") partially, which
> introduced functions mce_handle_err_virtmode/realmode() to
> separate mce handler code which needed translation to enabled.
> 
> Signed-off-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/ras.c | 122 +++++++++++----------------
>  1 file changed, 49 insertions(+), 73 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
> index 8613f9cc5798..62e1519b8355 100644
> --- a/arch/powerpc/platforms/pseries/ras.c
> +++ b/arch/powerpc/platforms/pseries/ras.c
> @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
>  	return 0; /* need to perform reset */
>  }
>  
> -static int mce_handle_err_realmode(int disposition, u8 error_type)
> -{
> -#ifdef CONFIG_PPC_BOOK3S_64
> -	if (disposition == RTAS_DISP_NOT_RECOVERED) {
> -		switch (error_type) {
> -		case	MC_ERROR_TYPE_ERAT:
> -			flush_erat();
> -			disposition = RTAS_DISP_FULLY_RECOVERED;
> -			break;
> -		case	MC_ERROR_TYPE_SLB:
> -			/*
> -			 * Store the old slb content in paca before flushing.
> -			 * Print this when we go to virtual mode.
> -			 * There are chances that we may hit MCE again if there
> -			 * is a parity error on the SLB entry we trying to read
> -			 * for saving. Hence limit the slb saving to single
> -			 * level of recursion.
> -			 */
> -			if (local_paca->in_mce == 1)
> -				slb_save_contents(local_paca->mce_faulty_slbs);
> -			flush_and_reload_slb();
> -			disposition = RTAS_DISP_FULLY_RECOVERED;
> -			break;
> -		default:
> -			break;
> -		}
> -	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
> -		/* Platform corrected itself but could be degraded */
> -		pr_err("MCE: limited recovery, system may be degraded\n");
> -		disposition = RTAS_DISP_FULLY_RECOVERED;
> -	}
> -#endif
> -	return disposition;
> -}
> -
> -static int mce_handle_err_virtmode(struct pt_regs *regs,
> -				   struct rtas_error_log *errp,
> -				   struct pseries_mc_errorlog *mce_log,
> -				   int disposition)
> +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>  {
>  	struct mce_error_info mce_err = { 0 };
> +	unsigned long eaddr = 0, paddr = 0;
> +	struct pseries_errorlog *pseries_log;
> +	struct pseries_mc_errorlog *mce_log;
> +	int disposition = rtas_error_disposition(errp);
>  	int initiator = rtas_error_initiator(errp);
>  	int severity = rtas_error_severity(errp);
> -	unsigned long eaddr = 0, paddr = 0;
>  	u8 error_type, err_sub_type;
>  
> -	if (!mce_log)
> -		goto out;
> -
> -	error_type = mce_log->error_type;
> -	err_sub_type = rtas_mc_error_sub_type(mce_log);
> -
>  	if (initiator == RTAS_INITIATOR_UNKNOWN)
>  		mce_err.initiator = MCE_INITIATOR_UNKNOWN;
>  	else if (initiator == RTAS_INITIATOR_CPU)
> @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>  		mce_err.severity = MCE_SEV_SEVERE;
>  	else if (severity == RTAS_SEVERITY_ERROR)
>  		mce_err.severity = MCE_SEV_SEVERE;
> +	else if (severity == RTAS_SEVERITY_FATAL)
> +		mce_err.severity = MCE_SEV_FATAL;
>  	else
>  		mce_err.severity = MCE_SEV_FATAL;
>  

What's this hunk for?

> @@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>  	mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
>  	mce_err.error_class = MCE_ECLASS_UNKNOWN;
>  
> -	switch (error_type) {
> +	if (!rtas_error_extended(errp))
> +		goto out;
> +
> +	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
> +	if (!pseries_log)
> +		goto out;
> +
> +	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
> +	error_type = mce_log->error_type;
> +	err_sub_type = rtas_mc_error_sub_type(mce_log);
> +
> +	switch (mce_log->error_type) {
>  	case MC_ERROR_TYPE_UE:
>  		mce_err.error_type = MCE_ERROR_TYPE_UE;
>  		mce_common_process_ue(regs, &mce_err);
> @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>  		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
>  		break;
>  	case MC_ERROR_TYPE_I_CACHE:
> -		mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
> +		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
>  		break;

And this one. Doesn't look right.

>  	case MC_ERROR_TYPE_UNKNOWN:
>  	default:
>  		mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
>  		break;
>  	}
> +
> +#ifdef CONFIG_PPC_BOOK3S_64
> +	if (disposition == RTAS_DISP_NOT_RECOVERED) {
> +		switch (error_type) {
> +		case	MC_ERROR_TYPE_SLB:
> +		case	MC_ERROR_TYPE_ERAT:
> +			/*
> +			 * Store the old slb content in paca before flushing.
> +			 * Print this when we go to virtual mode.
> +			 * There are chances that we may hit MCE again if there
> +			 * is a parity error on the SLB entry we trying to read
> +			 * for saving. Hence limit the slb saving to single
> +			 * level of recursion.
> +			 */
> +			if (local_paca->in_mce == 1)
> +				slb_save_contents(local_paca->mce_faulty_slbs);
> +			flush_and_reload_slb();
> +			disposition = RTAS_DISP_FULLY_RECOVERED;
> +			break;
> +		default:
> +			break;
> +		}
> +	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
> +		/* Platform corrected itself but could be degraded */
> +		pr_err("MCE: limited recovery, system may be degraded\n");
> +		disposition = RTAS_DISP_FULLY_RECOVERED;
> +	}

I would prefer if you just keep the mce_handle_err_realmode function 
(can rename it if you want). It's actually changed a bit since the
patch being reverted so we don't want to undo that.

Thanks,
Nick
Ganesh Goudar Jan. 17, 2022, 8:11 a.m. UTC | #2
On 11/24/21 18:40, Nicholas Piggin wrote:

> Excerpts from Ganesh Goudar's message of November 24, 2021 7:55 pm:
>> Now that we are no longer switching on the mmu in realmode
>> mce handler, Revert the commit 4ff753feab02("powerpc/pseries:
>> Avoid using addr_to_pfn in real mode") partially, which
>> introduced functions mce_handle_err_virtmode/realmode() to
>> separate mce handler code which needed translation to enabled.
>>
>> Signed-off-by: Ganesh Goudar<ganeshgr@linux.ibm.com>
>> ---
>>   arch/powerpc/platforms/pseries/ras.c | 122 +++++++++++----------------
>>   1 file changed, 49 insertions(+), 73 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
>> index 8613f9cc5798..62e1519b8355 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -511,58 +511,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
>>   	return 0; /* need to perform reset */
>>   }
>>   
>> -static int mce_handle_err_realmode(int disposition, u8 error_type)
>> -{
>> -#ifdef CONFIG_PPC_BOOK3S_64
>> -	if (disposition == RTAS_DISP_NOT_RECOVERED) {
>> -		switch (error_type) {
>> -		case	MC_ERROR_TYPE_ERAT:
>> -			flush_erat();
>> -			disposition = RTAS_DISP_FULLY_RECOVERED;
>> -			break;
>> -		case	MC_ERROR_TYPE_SLB:
>> -			/*
>> -			 * Store the old slb content in paca before flushing.
>> -			 * Print this when we go to virtual mode.
>> -			 * There are chances that we may hit MCE again if there
>> -			 * is a parity error on the SLB entry we trying to read
>> -			 * for saving. Hence limit the slb saving to single
>> -			 * level of recursion.
>> -			 */
>> -			if (local_paca->in_mce == 1)
>> -				slb_save_contents(local_paca->mce_faulty_slbs);
>> -			flush_and_reload_slb();
>> -			disposition = RTAS_DISP_FULLY_RECOVERED;
>> -			break;
>> -		default:
>> -			break;
>> -		}
>> -	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
>> -		/* Platform corrected itself but could be degraded */
>> -		pr_err("MCE: limited recovery, system may be degraded\n");
>> -		disposition = RTAS_DISP_FULLY_RECOVERED;
>> -	}
>> -#endif
>> -	return disposition;
>> -}
>> -
>> -static int mce_handle_err_virtmode(struct pt_regs *regs,
>> -				   struct rtas_error_log *errp,
>> -				   struct pseries_mc_errorlog *mce_log,
>> -				   int disposition)
>> +static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>>   {
>>   	struct mce_error_info mce_err = { 0 };
>> +	unsigned long eaddr = 0, paddr = 0;
>> +	struct pseries_errorlog *pseries_log;
>> +	struct pseries_mc_errorlog *mce_log;
>> +	int disposition = rtas_error_disposition(errp);
>>   	int initiator = rtas_error_initiator(errp);
>>   	int severity = rtas_error_severity(errp);
>> -	unsigned long eaddr = 0, paddr = 0;
>>   	u8 error_type, err_sub_type;
>>   
>> -	if (!mce_log)
>> -		goto out;
>> -
>> -	error_type = mce_log->error_type;
>> -	err_sub_type = rtas_mc_error_sub_type(mce_log);
>> -
>>   	if (initiator == RTAS_INITIATOR_UNKNOWN)
>>   		mce_err.initiator = MCE_INITIATOR_UNKNOWN;
>>   	else if (initiator == RTAS_INITIATOR_CPU)
>> @@ -588,6 +547,8 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>>   		mce_err.severity = MCE_SEV_SEVERE;
>>   	else if (severity == RTAS_SEVERITY_ERROR)
>>   		mce_err.severity = MCE_SEV_SEVERE;
>> +	else if (severity == RTAS_SEVERITY_FATAL)
>> +		mce_err.severity = MCE_SEV_FATAL;
>>   	else
>>   		mce_err.severity = MCE_SEV_FATAL;
>>   
> What's this hunk for?
>
>> @@ -599,7 +560,18 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>>   	mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
>>   	mce_err.error_class = MCE_ECLASS_UNKNOWN;
>>   
>> -	switch (error_type) {
>> +	if (!rtas_error_extended(errp))
>> +		goto out;
>> +
>> +	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
>> +	if (!pseries_log)
>> +		goto out;
>> +
>> +	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
>> +	error_type = mce_log->error_type;
>> +	err_sub_type = rtas_mc_error_sub_type(mce_log);
>> +
>> +	switch (mce_log->error_type) {
>>   	case MC_ERROR_TYPE_UE:
>>   		mce_err.error_type = MCE_ERROR_TYPE_UE;
>>   		mce_common_process_ue(regs, &mce_err);
>> @@ -692,41 +664,45 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>>   		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
>>   		break;
>>   	case MC_ERROR_TYPE_I_CACHE:
>> -		mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
>> +		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
>>   		break;
> And this one. Doesn't look right.
>
>>   	case MC_ERROR_TYPE_UNKNOWN:
>>   	default:
>>   		mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
>>   		break;
>>   	}
>> +
>> +#ifdef CONFIG_PPC_BOOK3S_64
>> +	if (disposition == RTAS_DISP_NOT_RECOVERED) {
>> +		switch (error_type) {
>> +		case	MC_ERROR_TYPE_SLB:
>> +		case	MC_ERROR_TYPE_ERAT:
>> +			/*
>> +			 * Store the old slb content in paca before flushing.
>> +			 * Print this when we go to virtual mode.
>> +			 * There are chances that we may hit MCE again if there
>> +			 * is a parity error on the SLB entry we trying to read
>> +			 * for saving. Hence limit the slb saving to single
>> +			 * level of recursion.
>> +			 */
>> +			if (local_paca->in_mce == 1)
>> +				slb_save_contents(local_paca->mce_faulty_slbs);
>> +			flush_and_reload_slb();
>> +			disposition = RTAS_DISP_FULLY_RECOVERED;
>> +			break;
>> +		default:
>> +			break;
>> +		}
>> +	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
>> +		/* Platform corrected itself but could be degraded */
>> +		pr_err("MCE: limited recovery, system may be degraded\n");
>> +		disposition = RTAS_DISP_FULLY_RECOVERED;
>> +	}
> I would prefer if you just keep the mce_handle_err_realmode function
> (can rename it if you want). It's actually changed a bit since the
> patch being reverted so we don't want to undo that.

Ok, I will leave it as is for now, we can change it later.

>
> Thanks,
> Nick
diff mbox series

Patch

diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 8613f9cc5798..62e1519b8355 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -511,58 +511,17 @@  int pSeries_system_reset_exception(struct pt_regs *regs)
 	return 0; /* need to perform reset */
 }
 
-static int mce_handle_err_realmode(int disposition, u8 error_type)
-{
-#ifdef CONFIG_PPC_BOOK3S_64
-	if (disposition == RTAS_DISP_NOT_RECOVERED) {
-		switch (error_type) {
-		case	MC_ERROR_TYPE_ERAT:
-			flush_erat();
-			disposition = RTAS_DISP_FULLY_RECOVERED;
-			break;
-		case	MC_ERROR_TYPE_SLB:
-			/*
-			 * Store the old slb content in paca before flushing.
-			 * Print this when we go to virtual mode.
-			 * There are chances that we may hit MCE again if there
-			 * is a parity error on the SLB entry we trying to read
-			 * for saving. Hence limit the slb saving to single
-			 * level of recursion.
-			 */
-			if (local_paca->in_mce == 1)
-				slb_save_contents(local_paca->mce_faulty_slbs);
-			flush_and_reload_slb();
-			disposition = RTAS_DISP_FULLY_RECOVERED;
-			break;
-		default:
-			break;
-		}
-	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
-		/* Platform corrected itself but could be degraded */
-		pr_err("MCE: limited recovery, system may be degraded\n");
-		disposition = RTAS_DISP_FULLY_RECOVERED;
-	}
-#endif
-	return disposition;
-}
-
-static int mce_handle_err_virtmode(struct pt_regs *regs,
-				   struct rtas_error_log *errp,
-				   struct pseries_mc_errorlog *mce_log,
-				   int disposition)
+static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
 {
 	struct mce_error_info mce_err = { 0 };
+	unsigned long eaddr = 0, paddr = 0;
+	struct pseries_errorlog *pseries_log;
+	struct pseries_mc_errorlog *mce_log;
+	int disposition = rtas_error_disposition(errp);
 	int initiator = rtas_error_initiator(errp);
 	int severity = rtas_error_severity(errp);
-	unsigned long eaddr = 0, paddr = 0;
 	u8 error_type, err_sub_type;
 
-	if (!mce_log)
-		goto out;
-
-	error_type = mce_log->error_type;
-	err_sub_type = rtas_mc_error_sub_type(mce_log);
-
 	if (initiator == RTAS_INITIATOR_UNKNOWN)
 		mce_err.initiator = MCE_INITIATOR_UNKNOWN;
 	else if (initiator == RTAS_INITIATOR_CPU)
@@ -588,6 +547,8 @@  static int mce_handle_err_virtmode(struct pt_regs *regs,
 		mce_err.severity = MCE_SEV_SEVERE;
 	else if (severity == RTAS_SEVERITY_ERROR)
 		mce_err.severity = MCE_SEV_SEVERE;
+	else if (severity == RTAS_SEVERITY_FATAL)
+		mce_err.severity = MCE_SEV_FATAL;
 	else
 		mce_err.severity = MCE_SEV_FATAL;
 
@@ -599,7 +560,18 @@  static int mce_handle_err_virtmode(struct pt_regs *regs,
 	mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
 	mce_err.error_class = MCE_ECLASS_UNKNOWN;
 
-	switch (error_type) {
+	if (!rtas_error_extended(errp))
+		goto out;
+
+	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
+	if (!pseries_log)
+		goto out;
+
+	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
+	error_type = mce_log->error_type;
+	err_sub_type = rtas_mc_error_sub_type(mce_log);
+
+	switch (mce_log->error_type) {
 	case MC_ERROR_TYPE_UE:
 		mce_err.error_type = MCE_ERROR_TYPE_UE;
 		mce_common_process_ue(regs, &mce_err);
@@ -692,41 +664,45 @@  static int mce_handle_err_virtmode(struct pt_regs *regs,
 		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
 		break;
 	case MC_ERROR_TYPE_I_CACHE:
-		mce_err.error_type = MCE_ERROR_TYPE_ICACHE;
+		mce_err.error_type = MCE_ERROR_TYPE_DCACHE;
 		break;
 	case MC_ERROR_TYPE_UNKNOWN:
 	default:
 		mce_err.error_type = MCE_ERROR_TYPE_UNKNOWN;
 		break;
 	}
+
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (disposition == RTAS_DISP_NOT_RECOVERED) {
+		switch (error_type) {
+		case	MC_ERROR_TYPE_SLB:
+		case	MC_ERROR_TYPE_ERAT:
+			/*
+			 * Store the old slb content in paca before flushing.
+			 * Print this when we go to virtual mode.
+			 * There are chances that we may hit MCE again if there
+			 * is a parity error on the SLB entry we trying to read
+			 * for saving. Hence limit the slb saving to single
+			 * level of recursion.
+			 */
+			if (local_paca->in_mce == 1)
+				slb_save_contents(local_paca->mce_faulty_slbs);
+			flush_and_reload_slb();
+			disposition = RTAS_DISP_FULLY_RECOVERED;
+			break;
+		default:
+			break;
+		}
+	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
+		/* Platform corrected itself but could be degraded */
+		pr_err("MCE: limited recovery, system may be degraded\n");
+		disposition = RTAS_DISP_FULLY_RECOVERED;
+	}
+#endif
 out:
 	save_mce_event(regs, disposition == RTAS_DISP_FULLY_RECOVERED,
-		       &mce_err, regs->nip, eaddr, paddr);
-	return disposition;
-}
+			&mce_err, regs->nip, eaddr, paddr);
 
-static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
-{
-	struct pseries_errorlog *pseries_log;
-	struct pseries_mc_errorlog *mce_log = NULL;
-	int disposition = rtas_error_disposition(errp);
-	unsigned long msr;
-	u8 error_type;
-
-	if (!rtas_error_extended(errp))
-		goto out;
-
-	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
-	if (!pseries_log)
-		goto out;
-
-	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;
-	error_type = mce_log->error_type;
-
-	disposition = mce_handle_err_realmode(disposition, error_type);
-out:
-	disposition = mce_handle_err_virtmode(regs, errp, mce_log,
-					      disposition);
 	return disposition;
 }