From patchwork Wed Dec 20 02:16:23 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Benjamin Herrenschmidt X-Patchwork-Id: 851169 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.ozlabs.org (lists.ozlabs.org [103.22.144.68]) (using TLSv1.2 with cipher ADH-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 3z1djh73Hjz9s7m for ; Wed, 20 Dec 2017 13:18:32 +1100 (AEDT) Received: from lists.ozlabs.org (lists.ozlabs.org [IPv6:2401:3900:2:1::3]) by lists.ozlabs.org (Postfix) with ESMTP id 3z1djh465KzF06p for ; Wed, 20 Dec 2017 13:18:32 +1100 (AEDT) X-Original-To: skiboot@lists.ozlabs.org Delivered-To: skiboot@lists.ozlabs.org Authentication-Results: ozlabs.org; spf=permerror (mailfrom) smtp.mailfrom=kernel.crashing.org (client-ip=63.228.1.57; helo=gate.crashing.org; envelope-from=benh@kernel.crashing.org; receiver=) Received: from gate.crashing.org (gate.crashing.org [63.228.1.57]) (using TLSv1 with cipher DHE-RSA-AES256-SHA (256/256 bits)) (No client certificate requested) by lists.ozlabs.org (Postfix) with ESMTPS id 3z1dgm6DBwzF06p for ; Wed, 20 Dec 2017 13:16:52 +1100 (AEDT) Received: from pasglop.ozlabs.ibm.com (localhost.localdomain [127.0.0.1]) by gate.crashing.org (8.14.1/8.14.1) with ESMTP id vBK2GRpB029502; Tue, 19 Dec 2017 20:16:31 -0600 From: Benjamin Herrenschmidt To: skiboot@lists.ozlabs.org Date: Wed, 20 Dec 2017 13:16:23 +1100 Message-Id: <20171220021623.27139-3-benh@kernel.crashing.org> X-Mailer: git-send-email 2.14.3 In-Reply-To: <20171220021623.27139-1-benh@kernel.crashing.org> References: <20171220021623.27139-1-benh@kernel.crashing.org> Subject: [Skiboot] [PATCH v2 3/3] lock: Add additional lock auditing code X-BeenThere: skiboot@lists.ozlabs.org X-Mailman-Version: 2.1.24 Precedence: list List-Id: Mailing list for skiboot development List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Errors-To: skiboot-bounces+incoming=patchwork.ozlabs.org@lists.ozlabs.org Sender: "Skiboot" Keep track of lock owner name and replace lock_depth counter with a per-cpu list of locks held by the cpu. This allows us to print the actual locks held in case we hit the (in)famous message about opal_pollers being run with a lock held. It also allows us to warn (and drop them) if locks are still held when returning to the OS or completing a scheduled job. Signed-off-by: Benjamin Herrenschmidt --- v2. Move the list_del() in unlock() to before we clear the lock value otherwise we can race with a concurrent locker. --- core/cpu.c | 6 ++++++ core/lock.c | 41 ++++++++++++++++++++++++++++++++--------- core/opal.c | 8 +++++++- core/stack.c | 1 + core/timebase.c | 2 +- include/cpu.h | 2 +- include/lock.h | 32 +++++++++++++++++++++++++++----- 7 files changed, 75 insertions(+), 17 deletions(-) diff --git a/core/cpu.c b/core/cpu.c index 7dd7c863..3e110c3c 100644 --- a/core/cpu.c +++ b/core/cpu.c @@ -285,6 +285,11 @@ void cpu_process_jobs(void) if (no_return) free(job); func(data); + if (!list_empty(&cpu->locks_held)) { + prlog(PR_ERR, "OPAL job %s returning with locks held\n", + job->name); + drop_my_locks(true); + } lock(&cpu->job_lock); if (!no_return) { cpu->job_count--; @@ -822,6 +827,7 @@ static void init_cpu_thread(struct cpu_thread *t, init_lock(&t->dctl_lock); init_lock(&t->job_lock); list_head_init(&t->job_queue); + list_head_init(&t->locks_held); t->stack_guard = STACK_CHECK_GUARD_BASE ^ pir; t->state = state; t->pir = pir; diff --git a/core/lock.c b/core/lock.c index b5e3323f..edfe1c7e 100644 --- a/core/lock.c +++ b/core/lock.c @@ -57,8 +57,8 @@ static void unlock_check(struct lock *l) if (l->in_con_path && this_cpu()->con_suspend == 0) lock_error(l, "Unlock con lock with console not suspended", 3); - if (this_cpu()->lock_depth == 0) - lock_error(l, "Releasing lock with 0 depth", 4); + if (list_empty(&this_cpu()->locks_held)) + lock_error(l, "Releasing lock we don't hold depth", 4); } #else @@ -89,7 +89,7 @@ static inline bool __try_lock(struct cpu_thread *cpu, struct lock *l) return false; } -bool try_lock(struct lock *l) +bool try_lock_caller(struct lock *l, const char *owner) { struct cpu_thread *cpu = this_cpu(); @@ -97,22 +97,23 @@ bool try_lock(struct lock *l) return true; if (__try_lock(cpu, l)) { + l->owner = owner; if (l->in_con_path) cpu->con_suspend++; - cpu->lock_depth++; + list_add(&cpu->locks_held, &l->list); return true; } return false; } -void lock(struct lock *l) +void lock_caller(struct lock *l, const char *owner) { if (bust_locks) return; lock_check(l); for (;;) { - if (try_lock(l)) + if (try_lock_caller(l, owner)) break; smt_lowest(); while (l->lock_val) @@ -130,8 +131,9 @@ void unlock(struct lock *l) unlock_check(l); + l->owner = NULL; + list_del(&l->list); lwsync(); - this_cpu()->lock_depth--; l->lock_val = 0; /* WARNING: On fast reboot, we can be reset right at that @@ -144,7 +146,7 @@ void unlock(struct lock *l) } } -bool lock_recursive(struct lock *l) +bool lock_recursive_caller(struct lock *l, const char *caller) { if (bust_locks) return false; @@ -152,7 +154,7 @@ bool lock_recursive(struct lock *l) if (lock_held_by_me(l)) return false; - lock(l); + lock_caller(l, caller); return true; } @@ -160,3 +162,24 @@ void init_locks(void) { bust_locks = false; } + +void dump_locks_list(void) +{ + struct lock *l; + + prlog(PR_ERR, "Locks held:\n"); + list_for_each(&this_cpu()->locks_held, l, list) + prlog(PR_ERR, " %s\n", l->owner); +} + +void drop_my_locks(bool warn) +{ + struct lock *l; + + while((l = list_pop(&this_cpu()->locks_held, struct lock, list)) != NULL) { + if (warn) + prlog(PR_ERR, " %s\n", l->owner); + unlock(l); + } +} + diff --git a/core/opal.c b/core/opal.c index d33d527c..1bca774c 100644 --- a/core/opal.c +++ b/core/opal.c @@ -182,6 +182,11 @@ int64_t opal_exit_check(int64_t retval, struct stack_frame *eframe) printf("CPU UN-ACCOUNTED FIRMWARE ENTRY! PIR=%04lx cpu @%p -> pir=%04x token=%llu retval=%lld\n", mfspr(SPR_PIR), cpu, cpu->pir, token, retval); } else { + if (!list_empty(&cpu->locks_held)) { + prlog(PR_ERR, "OPAL exiting with locks held, token=%llu retval=%lld\n", + token, retval); + drop_my_locks(true); + } sync(); /* release barrier vs quiescing */ cpu->in_opal_call--; } @@ -557,7 +562,7 @@ void opal_run_pollers(void) } this_cpu()->in_poller = true; - if (this_cpu()->lock_depth && pollers_with_lock_warnings < 64) { + if (!list_empty(&this_cpu()->locks_held) && pollers_with_lock_warnings < 64) { /** * @fwts-label OPALPollerWithLock * @fwts-advice opal_run_pollers() was called with a lock @@ -565,6 +570,7 @@ void opal_run_pollers(void) * lucky/careful. */ prlog(PR_ERR, "Running pollers with lock held !\n"); + dump_locks_list(); backtrace(); pollers_with_lock_warnings++; if (pollers_with_lock_warnings == 64) { diff --git a/core/stack.c b/core/stack.c index bb7fa625..af4d37d0 100644 --- a/core/stack.c +++ b/core/stack.c @@ -21,6 +21,7 @@ #include #include #include +#include #define STACK_BUF_ENTRIES 60 static struct bt_entry bt_buf[STACK_BUF_ENTRIES]; diff --git a/core/timebase.c b/core/timebase.c index f2dff44b..777e4ba6 100644 --- a/core/timebase.c +++ b/core/timebase.c @@ -53,7 +53,7 @@ void time_wait(unsigned long duration) { struct cpu_thread *c = this_cpu(); - if (this_cpu()->lock_depth) { + if (!list_empty(&this_cpu()->locks_held)) { time_wait_nopoll(duration); return; } diff --git a/include/cpu.h b/include/cpu.h index e3bc75f4..bec4b030 100644 --- a/include/cpu.h +++ b/include/cpu.h @@ -61,8 +61,8 @@ struct cpu_thread { uint64_t save_r1; void *icp_regs; uint32_t in_opal_call; - uint32_t lock_depth; uint32_t con_suspend; + struct list_head locks_held; bool con_need_flush; bool quiesce_opal_call; bool in_mcount; diff --git a/include/lock.h b/include/lock.h index 7bdfca28..061873ab 100644 --- a/include/lock.h +++ b/include/lock.h @@ -20,6 +20,7 @@ #include #include #include +#include struct lock { /* Lock value has bit 63 as lock bit and the PIR of the owner @@ -32,10 +33,19 @@ struct lock { * in which case taking it will suspend console flushing */ bool in_con_path; + + /* file/line of lock owner */ + const char *owner; + + /* linkage in per-cpu list of owned locks */ + struct list_node list; }; -/* Initializer */ -#define LOCK_UNLOCKED { .lock_val = 0, .in_con_path = 0 } +/* Initializer... not ideal but works for now. If we need different + * values for the fields and/or start getting warnings we'll have to + * play macro tricks + */ +#define LOCK_UNLOCKED { 0 } /* Note vs. libc and locking: * @@ -65,8 +75,14 @@ static inline void init_lock(struct lock *l) *l = (struct lock)LOCK_UNLOCKED; } -extern bool try_lock(struct lock *l); -extern void lock(struct lock *l); +#define LOCK_CALLER __FILE__ ":" stringify(__LINE__) + +#define try_lock(l) try_lock_caller(l, LOCK_CALLER) +#define lock(l) lock_caller(l, LOCK_CALLER) +#define lock_recursive(l) lock_recursive_caller(l, LOCK_CALLER) + +extern bool try_lock_caller(struct lock *l, const char *caller); +extern void lock_caller(struct lock *l, const char *caller); extern void unlock(struct lock *l); extern bool lock_held_by_me(struct lock *l); @@ -77,9 +93,15 @@ extern bool lock_held_by_me(struct lock *l); * returns false if the lock was already held by this cpu. If it returns * true, then the caller shall release it when done. */ -extern bool lock_recursive(struct lock *l); +extern bool lock_recursive_caller(struct lock *l, const char *caller); /* Called after per-cpu data structures are available */ extern void init_locks(void); +/* Dump the list of locks held by this CPU */ +extern void dump_locks_list(void); + +/* Clean all locks held by CPU (and warn if any) */ +extern void drop_my_locks(bool warn); + #endif /* __LOCK_H */