From patchwork Tue Jun 23 04:25:59 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Benjamin Herrenschmidt X-Patchwork-Id: 487503 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.ozlabs.org (lists.ozlabs.org [103.22.144.68]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 1FDCA140157 for ; Tue, 23 Jun 2015 14:27:06 +1000 (AEST) Received: from ozlabs.org (lists.ozlabs.org [IPv6:2401:3900:2:1::3]) by lists.ozlabs.org (Postfix) with ESMTP id E99481A0FC8 for ; Tue, 23 Jun 2015 14:27:05 +1000 (AEST) X-Original-To: skiboot@lists.ozlabs.org Delivered-To: skiboot@lists.ozlabs.org Received: from gate.crashing.org (gate.crashing.org [63.228.1.57]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (No client certificate requested) by lists.ozlabs.org (Postfix) with ESMTPS id 278481A0F72 for ; Tue, 23 Jun 2015 14:26:25 +1000 (AEST) Received: from pasglop.ozlabs.ibm.com (localhost.localdomain [127.0.0.1]) by gate.crashing.org (8.14.1/8.13.8) with ESMTP id t5N4Q2mS029011; Mon, 22 Jun 2015 23:26:14 -0500 From: Benjamin Herrenschmidt To: skiboot@lists.ozlabs.org Date: Tue, 23 Jun 2015 14:25:59 +1000 Message-Id: <1435033560-9180-9-git-send-email-benh@kernel.crashing.org> X-Mailer: git-send-email 2.1.4 In-Reply-To: <1435033560-9180-1-git-send-email-benh@kernel.crashing.org> References: <1435033560-9180-1-git-send-email-benh@kernel.crashing.org> Subject: [Skiboot] [PATCH 09/10] centaur: Improve FSI SCOM error handling X-BeenThere: skiboot@lists.ozlabs.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: Mailing list for skiboot development List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Errors-To: skiboot-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org Sender: "Skiboot" Based on HostBoot, recovers from bad XSCOM addresses Signed-off-by: Benjamin Herrenschmidt --- hw/centaur.c | 122 ++++++++++++++++++++++++++++++++++-------------------- include/centaur.h | 1 + 2 files changed, 78 insertions(+), 45 deletions(-) diff --git a/hw/centaur.c b/hw/centaur.c index 24d2836..3df1291 100644 --- a/hw/centaur.c +++ b/hw/centaur.c @@ -59,6 +59,9 @@ #define SCAC_CONFIG_CLR 0x020115d0 #define SCAC_ENABLE_MSK PPC_BIT(0) +#define cent_log(__lev, __c, __fmt, ...) \ + prlog(__lev, "CENTAUR %x: " __fmt, __c->part_id, ##__VA_ARGS__) + static int64_t centaur_fsiscom_complete(struct centaur_chip *centaur) { int64_t rc; @@ -67,17 +70,49 @@ static int64_t centaur_fsiscom_complete(struct centaur_chip *centaur) rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine, centaur->fsi_master_port, FSI_STATUS_REG, &stat); if (rc) { - /* XXX Improve logging */ - prerror("CENTAUR: MFSI read error %lld reading STAT\n", rc); + cent_log(PR_ERR, centaur, "MFSI read error %lld reading STAT\n", rc); return rc; } if ((stat & (FSI_STATUS_ABORT | FSI_STATUS_ERRORS)) == 0) return OPAL_SUCCESS; - prerror("CENTAUR: Remote FSI error, stat=0x%08x\n", stat); + cent_log(PR_ERR, centaur, "Remote FSI SCOM error, status=0x%08x\n", stat); - /* XXX Handle recovery */ + /* All 1's ? Assume it's gone */ + if (stat == 0xffffffffu) { + cent_log(PR_ERR, centaur, "Chip appears to be dead !\n"); + centaur->valid = false; + + /* Here, hostboot grabs a pile of FFDC from the FSI layer, + * we could do that too ... + */ + return OPAL_HARDWARE; + } + /* Here HB prints the GPx registers which I believe are only + * in the host (FSI master). We skip that for now, we don't have + * a good API to them + */ + + /* Recovery sequence from HostBoot fsiscom.C + * if SCOM fails and FSI Master displays "MasterTimeOut" + * then 7,6 + * else if SCOM fails and FSI2PIB Status shows PIB abort + * then just perform unit reset (6) and wait 1 ms + * else (PIB_abort='0' but PIB error is unequal 0) + * then just perform unit reset (6) (wait not needed). + * + * Note: Waiting 1ms inside OPAL is a BIG NO NO !!! We have + * no choice but doing it at the moment but that will have + * to be fixed one way or another, possibly by returning some + * kind of busy status until the delay is expired. + */ + rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_ENG_RESET_REG, 0); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI write error %lld resetting SCOM engine\n", + rc); + } return OPAL_HARDWARE; } @@ -90,8 +125,7 @@ static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_a rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_RD); if (rc) { - /* XXX Improve logging */ - prerror("CENTAUR: MFSI write error %lld writing CMD\n", rc); + cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc); return rc; } @@ -102,15 +136,13 @@ static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_a rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine, centaur->fsi_master_port, FSI_DATA0_REG, &data0); if (rc) { - /* XXX Improve logging */ - prerror("CENTAUR: MFSI read error %lld reading DATA0\n", rc); + cent_log(PR_ERR, centaur, "MFSI read error %lld reading DATA0\n", rc); return rc; } rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine, centaur->fsi_master_port, FSI_DATA1_REG, &data1); if (rc) { - /* XXX Improve logging */ - prerror("CENTAUR: MFSI read error %lld readking DATA1\n", rc); + cent_log(PR_ERR, centaur, "MFSI read error %lld readking DATA1\n", rc); return rc; } @@ -119,6 +151,33 @@ static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_a return OPAL_SUCCESS; } +static int64_t centaur_fsiscom_write(struct centaur_chip *centaur, uint32_t pcb_addr, + uint64_t val) +{ + int64_t rc; + + rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_DATA0_REG, hi32(val)); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA0\n", rc); + return rc; + } + rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_DATA1_REG, lo32(val)); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA1\n", rc); + return rc; + } + rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, + centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_WR); + if (rc) { + cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc); + return rc; + } + + return centaur_fsiscom_complete(centaur); +} + struct centaur_chip *get_centaur(uint32_t part_id) { uint32_t hchip_id, mchan; @@ -156,36 +215,6 @@ struct centaur_chip *get_centaur(uint32_t part_id) return centaur; } -static int64_t centaur_fsiscom_write(struct centaur_chip *centaur, uint32_t pcb_addr, - uint64_t val) -{ - int64_t rc; - - rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, - centaur->fsi_master_port, FSI_DATA0_REG, hi32(val)); - if (rc) { - /* XXX Improve logging */ - prerror("CENTAUR: MFSI write error %lld writing DATA0\n", rc); - return rc; - } - rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, - centaur->fsi_master_port, FSI_DATA1_REG, lo32(val)); - if (rc) { - /* XXX Improve logging */ - prerror("CENTAUR: MFSI write error %lld writing DATA1\n", rc); - return rc; - } - rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine, - centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_WR); - if (rc) { - /* XXX Improve logging */ - prerror("CENTAUR: MFSI write error %lld writing CMD\n", rc); - return rc; - } - - return centaur_fsiscom_complete(centaur); -} - int64_t centaur_xscom_read(uint32_t id, uint64_t pcb_addr, uint64_t *val) { struct centaur_chip *centaur = get_centaur(id); @@ -223,8 +252,9 @@ static bool centaur_check_id(struct centaur_chip *centaur) rc = centaur_fsiscom_read(centaur, 0xf000f, &val); if (rc) { - prerror("CENTAUR: FSISCOM error %lld reading ID register\n", - rc); + cent_log(PR_ERR, centaur, + " FSISCOM error %lld reading ID register\n", + rc); return false; } @@ -233,7 +263,8 @@ static bool centaur_check_id(struct centaur_chip *centaur) /* Identify chip */ if ((val & 0xff) != 0xe9) { - prerror("CENTAUR: CFAM ID 0x%02x is not a Centaur !\n", + cent_log(PR_ERR, centaur, + " CFAM ID 0x%02x is not a Centaur !\n", (unsigned int)(val & 0xff)); return false; } @@ -287,6 +318,7 @@ static bool centaur_add(uint32_t part_id, uint32_t mchip, uint32_t meng, prerror("CENTAUR: Duplicate centaur !\n"); return false; } + centaur->part_id = part_id; centaur->fsi_master_chip_id = mchip; centaur->fsi_master_port = mport; centaur->fsi_master_engine = meng ? MFSI_cMFSI1 : MFSI_cMFSI0; @@ -295,7 +327,7 @@ static bool centaur_add(uint32_t part_id, uint32_t mchip, uint32_t meng, if (!centaur_check_id(centaur)) return false; - printf("CENTAUR: ChipID 0x%x [DD%x.%x]\n", part_id, + cent_log(PR_INFO, centaur, "Found DD%x.%x chip\n", centaur->ec_level >> 4, centaur->ec_level & 0xf); @@ -343,7 +375,7 @@ int64_t centaur_enable_sensor_cache(uint32_t part_id) lock(¢aur->lock); if (centaur->scache_disable_count == 0) { - prerror("CENTAUR: Cache count going negative !\n"); + cent_log(PR_ERR, centaur, "Cache count going negative !\n"); backtrace(); goto bail; } diff --git a/include/centaur.h b/include/centaur.h index f515da7..6453e13 100644 --- a/include/centaur.h +++ b/include/centaur.h @@ -25,6 +25,7 @@ struct centaur_chip { bool valid; uint8_t ec_level; + uint32_t part_id; uint32_t fsi_master_chip_id; uint32_t fsi_master_port; uint32_t fsi_master_engine;