From patchwork Tue Jan 16 04:45:39 2018
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Benjamin Herrenschmidt <benh@kernel.crashing.org>
X-Patchwork-Id: 861236
Return-Path: <skiboot-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@bilbo.ozlabs.org
Received: from lists.ozlabs.org (lists.ozlabs.org [103.22.144.68])
	(using TLSv1.2 with cipher ADH-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by ozlabs.org (Postfix) with ESMTPS id 3zLHkT2nLVz9s7s
	for <incoming@patchwork.ozlabs.org>;
	Tue, 16 Jan 2018 15:46:57 +1100 (AEDT)
Received: from lists.ozlabs.org (lists.ozlabs.org [IPv6:2401:3900:2:1::3])
	by lists.ozlabs.org (Postfix) with ESMTP id 3zLHkT0wZTzF0dq
	for <incoming@patchwork.ozlabs.org>;
	Tue, 16 Jan 2018 15:46:57 +1100 (AEDT)
X-Original-To: skiboot@lists.ozlabs.org
Delivered-To: skiboot@lists.ozlabs.org
Authentication-Results: ozlabs.org; spf=permerror (mailfrom)
	smtp.mailfrom=kernel.crashing.org (client-ip=63.228.1.57;
	helo=gate.crashing.org; envelope-from=benh@kernel.crashing.org;
	receiver=<UNKNOWN>)
Received: from gate.crashing.org (gate.crashing.org [63.228.1.57])
	(using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits))
	(No client certificate requested)
	by lists.ozlabs.org (Postfix) with ESMTPS id 3zLHkM2JLWzF0df
	for <skiboot@lists.ozlabs.org>; Tue, 16 Jan 2018 15:46:51 +1100 (AEDT)
Received: from pasglop.au.ibm.com (localhost.localdomain [127.0.0.1])
	by gate.crashing.org (8.14.1/8.14.1) with ESMTP id w0G4kMFJ009584;
	Mon, 15 Jan 2018 22:46:23 -0600
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
To: skiboot@lists.ozlabs.org
Date: Tue, 16 Jan 2018 15:45:39 +1100
Message-Id: <20180116044540.10707-1-benh@kernel.crashing.org>
X-Mailer: git-send-email 2.14.3
Subject: [Skiboot] [RFC PATCH 1/2] hmi: Don't re-read HMER multiple times
X-BeenThere: skiboot@lists.ozlabs.org
X-Mailman-Version: 2.1.24
Precedence: list
List-Id: Mailing list for skiboot development <skiboot.lists.ozlabs.org>
List-Unsubscribe: <https://lists.ozlabs.org/options/skiboot>,
	<mailto:skiboot-request@lists.ozlabs.org?subject=unsubscribe>
List-Archive: <http://lists.ozlabs.org/pipermail/skiboot/>
List-Post: <mailto:skiboot@lists.ozlabs.org>
List-Help: <mailto:skiboot-request@lists.ozlabs.org?subject=help>
List-Subscribe: <https://lists.ozlabs.org/listinfo/skiboot>,
	<mailto:skiboot-request@lists.ozlabs.org?subject=subscribe>
MIME-Version: 1.0
Errors-To: skiboot-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org
Sender: "Skiboot"
	<skiboot-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org>

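Pass the HMER value handed to handle_hmi_exception() down to the
pre-recovery cleanup and debug-print helpers instead of having each
of them re-read the SPR. We want to make sure all reporting and
actions are based upon the same snapshot of HMER, in case bits get
added by HW while we are in OPAL.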

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 core/hmi.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)
diff --git a/core/hmi.c b/core/hmi.c
index eb4faa38..5642bd0b 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -719,16 +719,13 @@ static int get_split_core_mode(void)
  *	- SPR_TFMR_TB_RESIDUE_ERR
  *	- SPR_TFMR_HDEC_PARITY_ERROR
  */
-static void pre_recovery_cleanup_p8(void)
+static void pre_recovery_cleanup_p8(uint64_t hmer)
 {
-	uint64_t hmer;
 	uint64_t tfmr;
 	uint32_t sibling_thread_mask;
 	int split_core_mode, subcore_id, thread_id, threads_per_core;
 	int i;
 
-	hmer = mfspr(SPR_HMER);
-
 	/* exit if it is not Time facility error. */
 	if (!(hmer & SPR_HMER_TFAC_ERROR))
 		return;
@@ -826,15 +823,12 @@ static void pre_recovery_cleanup_p8(void)
  *	- SPR_TFMR_TB_RESIDUE_ERR
  *	- SPR_TFMR_HDEC_PARITY_ERROR
  */
-static void pre_recovery_cleanup_p9(void)
+static void pre_recovery_cleanup_p9(uint64_t hmer)
 {
-	uint64_t hmer;
 	uint64_t tfmr;
 	int threads_per_core = cpu_thread_count;
 	int i;
 
-	hmer = mfspr(SPR_HMER);
-
 	/* exit if it is not Time facility error. */
 	if (!(hmer & SPR_HMER_TFAC_ERROR))
 		return;
@@ -912,12 +906,12 @@ static void pre_recovery_cleanup_p9(void)
 	wait_for_cleanup_complete();
 }
 
-static void pre_recovery_cleanup(void)
+static void pre_recovery_cleanup(uint64_t hmer)
 {
 	if (proc_gen == proc_gen_p9)
-		return pre_recovery_cleanup_p9();
+		return pre_recovery_cleanup_p9(hmer);
 	else
-		return pre_recovery_cleanup_p8();
+		return pre_recovery_cleanup_p8(hmer);
 }
 
 static void hmi_exit(void)
@@ -926,9 +920,8 @@ static void hmi_exit(void)
 	*(this_cpu()->core_hmi_state_ptr) &= ~(this_cpu()->thread_mask);
 }
 
-static void hmi_print_debug(const uint8_t *msg)
+static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
 {
-	uint64_t hmer = mfspr(SPR_HMER);
 	const char *loc;
 	uint32_t core_id, thread_index;
 
@@ -959,7 +952,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	 * In case of split core, some of the Timer facility errors need
 	 * cleanup to be done before we proceed with the error recovery.
 	 */
-	pre_recovery_cleanup();
+	pre_recovery_cleanup(hmer);
 
 	lock(&hmi_lock);
 	/*
@@ -978,7 +971,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
 			queue_hmi_event(hmi_evt, recover);
 		}
-		hmi_print_debug("Processor recovery Done.");
+		hmi_print_debug("Processor recovery Done.", hmer);
 	}
 	if (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED) {
 		hmer &= ~SPR_HMER_PROC_RECV_ERROR_MASKED;
@@ -987,7 +980,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_MASKED;
 			queue_hmi_event(hmi_evt, recover);
 		}
-		hmi_print_debug("Processor recovery Done (masked).");
+		hmi_print_debug("Processor recovery Done (masked).", hmer);
 	}
 	if (hmer & SPR_HMER_PROC_RECV_AGAIN) {
 		hmer &= ~SPR_HMER_PROC_RECV_AGAIN;
@@ -997,13 +990,13 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 			queue_hmi_event(hmi_evt, recover);
 		}
 		hmi_print_debug("Processor recovery occurred again before"
-				"bit2 was cleared\n");
+				"bit2 was cleared\n", hmer);
 	}
 	/* Assert if we see malfunction alert, we can not continue. */
 	if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
 		hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
 
-		hmi_print_debug("Malfunction Alert");
+		hmi_print_debug("Malfunction Alert", hmer);
 		if (hmi_evt)
 			decode_malfunction(hmi_evt);
 	}
@@ -1012,7 +1005,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	if (hmer & SPR_HMER_HYP_RESOURCE_ERR) {
 		hmer &= ~SPR_HMER_HYP_RESOURCE_ERR;
 
-		hmi_print_debug("Hypervisor resource error");
+		hmi_print_debug("Hypervisor resource error", hmer);
 		recover = 0;
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_FATAL;
@@ -1028,7 +1021,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	if (hmer & SPR_HMER_TFAC_ERROR) {
 		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
 
-		hmi_print_debug("Timer Facility Error");
+		hmi_print_debug("Timer Facility Error", hmer);
 
 		hmer &= ~SPR_HMER_TFAC_ERROR;
 		recover = chiptod_recover_tb_errors();
@@ -1043,7 +1036,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
 		hmer &= ~SPR_HMER_TFMR_PARITY_ERROR;
 
-		hmi_print_debug("TFMR parity Error");
+		hmi_print_debug("TFMR parity Error", hmer);
 		recover = chiptod_recover_tb_errors();
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_FATAL;

From patchwork Tue Jan 16 04:45:40 2018
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Benjamin Herrenschmidt <benh@kernel.crashing.org>
X-Patchwork-Id: 861238
Return-Path: <skiboot-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@bilbo.ozlabs.org
Received: from lists.ozlabs.org (lists.ozlabs.org [IPv6:2401:3900:2:1::3])
	(using TLSv1.2 with cipher ADH-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by ozlabs.org (Postfix) with ESMTPS id 3zLHkl35ymz9s7s
	for <incoming@patchwork.ozlabs.org>;
	Tue, 16 Jan 2018 15:47:11 +1100 (AEDT)
Received: from lists.ozlabs.org (lists.ozlabs.org [IPv6:2401:3900:2:1::3])
	by lists.ozlabs.org (Postfix) with ESMTP id 3zLHkl230szF0dq
	for <incoming@patchwork.ozlabs.org>;
	Tue, 16 Jan 2018 15:47:11 +1100 (AEDT)
X-Original-To: skiboot@lists.ozlabs.org
Delivered-To: skiboot@lists.ozlabs.org
Authentication-Results: ozlabs.org; spf=permerror (mailfrom)
	smtp.mailfrom=kernel.crashing.org (client-ip=63.228.1.57;
	helo=gate.crashing.org; envelope-from=benh@kernel.crashing.org;
	receiver=<UNKNOWN>)
Received: from gate.crashing.org (gate.crashing.org [63.228.1.57])
	(using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits))
	(No client certificate requested)
	by lists.ozlabs.org (Postfix) with ESMTPS id 3zLHkZ0XtZzF0fY
	for <skiboot@lists.ozlabs.org>; Tue, 16 Jan 2018 15:47:01 +1100 (AEDT)
Received: from pasglop.au.ibm.com (localhost.localdomain [127.0.0.1])
	by gate.crashing.org (8.14.1/8.14.1) with ESMTP id w0G4kMFK009584;
	Mon, 15 Jan 2018 22:46:26 -0600
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
To: skiboot@lists.ozlabs.org
Date: Tue, 16 Jan 2018 15:45:40 +1100
Message-Id: <20180116044540.10707-2-benh@kernel.crashing.org>
X-Mailer: git-send-email 2.14.3
In-Reply-To: <20180116044540.10707-1-benh@kernel.crashing.org>
References: <20180116044540.10707-1-benh@kernel.crashing.org>
Subject: [Skiboot] [RFC PATCH 2/2] hmi: Remove races in clearing HMER
X-BeenThere: skiboot@lists.ozlabs.org
X-Mailman-Version: 2.1.24
Precedence: list
List-Id: Mailing list for skiboot development <skiboot.lists.ozlabs.org>
List-Unsubscribe: <https://lists.ozlabs.org/options/skiboot>,
	<mailto:skiboot-request@lists.ozlabs.org?subject=unsubscribe>
List-Archive: <http://lists.ozlabs.org/pipermail/skiboot/>
List-Post: <mailto:skiboot@lists.ozlabs.org>
List-Help: <mailto:skiboot-request@lists.ozlabs.org?subject=help>
List-Subscribe: <https://lists.ozlabs.org/listinfo/skiboot>,
	<mailto:skiboot-request@lists.ozlabs.org?subject=subscribe>
MIME-Version: 1.0
Errors-To: skiboot-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org
Sender: "Skiboot"
	<skiboot-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org>

Writing to HMER acts as an "AND": the register ends up holding its
current contents ANDed with the value written. The current code writes
back the value we originally read with the bits we handled cleared.
This is racy: if a new bit gets set in HW after the original read,
we'll end up clearing it without handling it.

Instead, write an all 1's mask with only the bits we handled cleared.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
---
 core/hmi.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/core/hmi.c b/core/hmi.c
index 5642bd0b..9fc4927d 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -946,7 +946,7 @@ static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
 int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 {
 	int recover = 1;
-	uint64_t tfmr;
+	uint64_t tfmr, handled = 0;
 
 	/*
 	 * In case of split core, some of the Timer facility errors need
@@ -965,7 +965,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	if (hmi_evt)
 		hmi_evt->hmer = hmer;
 	if (hmer & SPR_HMER_PROC_RECV_DONE) {
-		hmer &= ~SPR_HMER_PROC_RECV_DONE;
+		handled |= SPR_HMER_PROC_RECV_DONE;
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
 			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
@@ -974,7 +974,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 		hmi_print_debug("Processor recovery Done.", hmer);
 	}
 	if (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED) {
-		hmer &= ~SPR_HMER_PROC_RECV_ERROR_MASKED;
+		handled |= SPR_HMER_PROC_RECV_ERROR_MASKED;
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
 			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_MASKED;
@@ -983,7 +983,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 		hmi_print_debug("Processor recovery Done (masked).", hmer);
 	}
 	if (hmer & SPR_HMER_PROC_RECV_AGAIN) {
-		hmer &= ~SPR_HMER_PROC_RECV_AGAIN;
+		handled |= SPR_HMER_PROC_RECV_AGAIN;
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
 			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE_AGAIN;
@@ -994,7 +994,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	}
 	/* Assert if we see malfunction alert, we can not continue. */
 	if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
-		hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
+		handled |= SPR_HMER_MALFUNCTION_ALERT;
 
 		hmi_print_debug("Malfunction Alert", hmer);
 		if (hmi_evt)
@@ -1003,7 +1003,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 
 	/* Assert if we see Hypervisor resource error, we can not continue. */
 	if (hmer & SPR_HMER_HYP_RESOURCE_ERR) {
-		hmer &= ~SPR_HMER_HYP_RESOURCE_ERR;
+		handled |= SPR_HMER_HYP_RESOURCE_ERR;
 
 		hmi_print_debug("Hypervisor resource error", hmer);
 		recover = 0;
@@ -1020,10 +1020,10 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	 */
 	if (hmer & SPR_HMER_TFAC_ERROR) {
 		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
+		handled |= SPR_HMER_TFAC_ERROR;
 
 		hmi_print_debug("Timer Facility Error", hmer);
 
-		hmer &= ~SPR_HMER_TFAC_ERROR;
 		recover = chiptod_recover_tb_errors();
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
@@ -1034,7 +1034,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	}
 	if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
 		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
-		hmer &= ~SPR_HMER_TFMR_PARITY_ERROR;
+		handled |= SPR_HMER_TFMR_PARITY_ERROR;
 
 		hmi_print_debug("TFMR parity Error", hmer);
 		recover = chiptod_recover_tb_errors();
@@ -1051,9 +1051,11 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	/*
 	 * HMER bits are sticky, once set to 1 they remain set to 1 until
 	 * they are set to 0. Reset the error source bit to 0, otherwise
-	 * we keep getting HMI interrupt again and again.
+	 * we keep getting HMI interrupt again and again. Writing to HMER
+	 * acts as an AND, so we write mask of all 1's except for the bits
+	 * we want to clear.
 	 */
-	mtspr(SPR_HMER, hmer);
+	mtspr(SPR_HMER, ~handled);
 	hmi_exit();
 	/* Set the TB state looking at TFMR register before we head out. */
 	this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);