diff mbox

[net] bnx2x: bail out if unable to acquire stats_sema

Message ID 1378223159-8710-1-git-send-email-mschmidt@redhat.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Michal Schmidt Sept. 3, 2013, 3:45 p.m. UTC
If we fail to acquire stats_sema in the specified time limit,
the chip is probably dead. It probably does not matter whether we
try to continue or not, but certainly we should not up() the semaphore
afterwards.

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c | 24 +++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

Comments

Dmitry Kravkov Sept. 3, 2013, 3:51 p.m. UTC | #1
> -----Original Message-----
> From: Michal Schmidt [mailto:mschmidt@redhat.com]
> Sent: Tuesday, September 03, 2013 6:46 PM
> To: davem@davemloft.net
> Cc: netdev@vger.kernel.org; Dmitry Kravkov; Ariel Elior; Eilon Greenstein
> Subject: [PATCH net] bnx2x: bail out if unable to acquire stats_sema
> 
> If we fail to acquire stats_sema in the specified time limit, the chip is
> probably dead. It probably does not matter whether we try to continue or
> not, but certainly we should not up() the semaphore afterwards.
> 
> Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
> ---
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c | 24
> +++++++++++++++++------
>  1 file changed, 18 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
> index 86436c7..d60bb8b 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
> @@ -538,16 +538,20 @@ static void __bnx2x_stats_start(struct bnx2x *bp)
> 
>  static void bnx2x_stats_start(struct bnx2x *bp)  {
> -	if (down_timeout(&bp->stats_sema, HZ/10))
> +	if (down_timeout(&bp->stats_sema, HZ/10)) {
>  		BNX2X_ERR("Unable to acquire stats lock\n");
> +		return;
> +	}
>  	__bnx2x_stats_start(bp);
>  	up(&bp->stats_sema);
>  }
> 
>  static void bnx2x_stats_pmf_start(struct bnx2x *bp)  {
> -	if (down_timeout(&bp->stats_sema, HZ/10))
> +	if (down_timeout(&bp->stats_sema, HZ/10)) {
>  		BNX2X_ERR("Unable to acquire stats lock\n");
> +		return;
> +	}
>  	bnx2x_stats_comp(bp);
>  	__bnx2x_stats_pmf_update(bp);
>  	__bnx2x_stats_start(bp);
> @@ -556,8 +560,10 @@ static void bnx2x_stats_pmf_start(struct bnx2x *bp)
> 
>  static void bnx2x_stats_pmf_update(struct bnx2x *bp)  {
> -	if (down_timeout(&bp->stats_sema, HZ/10))
> +	if (down_timeout(&bp->stats_sema, HZ/10)) {
>  		BNX2X_ERR("Unable to acquire stats lock\n");
> +		return;
> +	}
>  	__bnx2x_stats_pmf_update(bp);
>  	up(&bp->stats_sema);
>  }
> @@ -569,8 +575,10 @@ static void bnx2x_stats_restart(struct bnx2x *bp)
>  	 */
>  	if (IS_VF(bp))
>  		return;
> -	if (down_timeout(&bp->stats_sema, HZ/10))
> +	if (down_timeout(&bp->stats_sema, HZ/10)) {
>  		BNX2X_ERR("Unable to acquire stats lock\n");
> +		return;
> +	}
>  	bnx2x_stats_comp(bp);
>  	__bnx2x_stats_start(bp);
>  	up(&bp->stats_sema);
> @@ -1361,8 +1369,10 @@ static void bnx2x_stats_stop(struct bnx2x *bp)  {
>  	int update = 0;
> 
> -	if (down_timeout(&bp->stats_sema, HZ/10))
> +	if (down_timeout(&bp->stats_sema, HZ/10)) {
>  		BNX2X_ERR("Unable to acquire stats lock\n");
> +		return;
> +	}
> 
>  	bp->stats_started = false;
> 
> @@ -1997,8 +2007,10 @@ void bnx2x_afex_collect_stats(struct bnx2x *bp,
> void *void_afex_stats,  void bnx2x_stats_safe_exec(struct bnx2x *bp,
>  			   void (func_to_exec)(void *cookie),
>  			   void *cookie){
> -	if (down_timeout(&bp->stats_sema, HZ/10))
> +	if (down_timeout(&bp->stats_sema, HZ/10)) {
>  		BNX2X_ERR("Unable to acquire stats lock\n");
> +		return;
> +	}
>  	bnx2x_stats_comp(bp);
>  	func_to_exec(cookie);
>  	__bnx2x_stats_start(bp);
> --
> 1.8.3.1
> 
Acked-by: Dmitry Kravkov <dmitry@broadcom.com>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neal Cardwell Sept. 4, 2013, 5:17 p.m. UTC | #2
On Tue, Sep 3, 2013 at 11:51 AM, Dmitry Kravkov <dmitry@broadcom.com> wrote:
>> -----Original Message-----
>> From: Michal Schmidt [mailto:mschmidt@redhat.com]
>> Sent: Tuesday, September 03, 2013 6:46 PM
>> To: davem@davemloft.net
>> Cc: netdev@vger.kernel.org; Dmitry Kravkov; Ariel Elior; Eilon Greenstein
>> Subject: [PATCH net] bnx2x: bail out if unable to acquire stats_sema
>>
>> If we fail to acquire stats_sema in the specified time limit, the chip is
>> probably dead. It probably does not matter whether we try to continue or
>> not, but certainly we should not up() the semaphore afterwards.
>>
>> Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
>> ---
>>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c | 24
>> +++++++++++++++++------

It seems like this patch has the downside that if the down_timeout()
fails, then bnx2x_stats_handle() ends up updating the stats state
machine's state without really executing the real body of the
action().

In fact it seems like there is a more general pre-existing problem of
this flavor with the bnx2x stats state machine: the
bnx2x_stats_handle() function updates the state machine
bp->stats_state while holding the spin lock, but does not execute the
action() while holding any sort of synchronization, so AFAICT there is
nothing to guarantee that the state machine actions happen in the
order the state machine wants them to happen. For example, if stats
events fire such that we want to execute actions that disable and then
enable stats, we could instead end up executing the actions in the
order that would attempt to enable and then disable them, if we get
unlucky with respect to when interrupts fire, etc.

It seems to me that instead of having all of the callees of
bnx2x_stats_handle() try to down/up the semaphore, instead
bnx2x_stats_handle() should try to down the stats_sema at the top, and
then if successful, it should change the bp->stats_state, call the
action, and up the stats_sema. Would that work?

neal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitry Kravkov Sept. 6, 2013, 6:40 a.m. UTC | #3
On Wed, Sep 4, 2013 at 8:17 PM, Neal Cardwell <ncardwell@google.com> wrote:
> On Tue, Sep 3, 2013 at 11:51 AM, Dmitry Kravkov <dmitry@broadcom.com> wrote:
>>> -----Original Message-----
>>> From: Michal Schmidt [mailto:mschmidt@redhat.com]
>>> Sent: Tuesday, September 03, 2013 6:46 PM
>>> To: davem@davemloft.net
>>> Cc: netdev@vger.kernel.org; Dmitry Kravkov; Ariel Elior; Eilon Greenstein
>>> Subject: [PATCH net] bnx2x: bail out if unable to acquire stats_sema
>>>
>>> If we fail to acquire stats_sema in the specified time limit, the chip is
>>> probably dead. It probably does not matter whether we try to continue or
>>> not, but certainly we should not up() the semaphore afterwards.
>>>
>>> Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
>>> ---
>>>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c | 24
>>> +++++++++++++++++------
>
> It seems like this patch has the downside that if the down_timeout()
> fails, then bnx2x_stats_handle() ends up updating the stats state
> machine's state without really executing the real body of the
> action().
>
> In fact it seems like there is a more general pre-existing problem of
> this flavor with the bnx2x stats state machine: the
> bnx2x_stats_handle() function updates the state machine
> bp->stats_state while holding the spin lock, but does not execute the
> action() while holding any sort of synchronization, so AFAICT there is
> nothing to guarantee that the state machine actions happen in the
> order the state machine wants them to happen. For example, if stats
> events fire such that we want to execute actions that disable and then
> enable stats, we could instead end up executing the actions in the
> order that would attempt to enable and then disable them, if we get
> unlucky with respect to when interrupts fire, etc.
>
> It seems to me that instead of having all of the callees of
> bnx2x_stats_handle() try to down/up the semaphore, instead
> bnx2x_stats_handle() should try to down the stats_sema at the top, and
> then if successful, it should change the bp->stats_state, call the
> action, and up the stats_sema. Would that work?
>
handle() is called from sleepable context (open/close) and timer
context, then it's not possible to use semaphore for pretection

> neal
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neal Cardwell Sept. 7, 2013, 3:53 a.m. UTC | #4
On Fri, Sep 6, 2013 at 2:40 AM, Dmitry Kravkov <dkravkov@gmail.com> wrote:
> On Wed, Sep 4, 2013 at 8:17 PM, Neal Cardwell <ncardwell@google.com> wrote:
>> On Tue, Sep 3, 2013 at 11:51 AM, Dmitry Kravkov <dmitry@broadcom.com> wrote:
>>>> -----Original Message-----
>>>> From: Michal Schmidt [mailto:mschmidt@redhat.com]
>>>> Sent: Tuesday, September 03, 2013 6:46 PM
>>>> To: davem@davemloft.net
>>>> Cc: netdev@vger.kernel.org; Dmitry Kravkov; Ariel Elior; Eilon Greenstein
>>>> Subject: [PATCH net] bnx2x: bail out if unable to acquire stats_sema
>>>>
>>>> If we fail to acquire stats_sema in the specified time limit, the chip is
>>>> probably dead. It probably does not matter whether we try to continue or
>>>> not, but certainly we should not up() the semaphore afterwards.
>>>>
>>>> Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
>>>> ---
>>>>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c | 24
>>>> +++++++++++++++++------
>>
>> It seems like this patch has the downside that if the down_timeout()
>> fails, then bnx2x_stats_handle() ends up updating the stats state
>> machine's state without really executing the real body of the
>> action().
>>
>> In fact it seems like there is a more general pre-existing problem of
>> this flavor with the bnx2x stats state machine: the
>> bnx2x_stats_handle() function updates the state machine
>> bp->stats_state while holding the spin lock, but does not execute the
>> action() while holding any sort of synchronization, so AFAICT there is
>> nothing to guarantee that the state machine actions happen in the
>> order the state machine wants them to happen. For example, if stats
>> events fire such that we want to execute actions that disable and then
>> enable stats, we could instead end up executing the actions in the
>> order that would attempt to enable and then disable them, if we get
>> unlucky with respect to when interrupts fire, etc.
>>
>> It seems to me that instead of having all of the callees of
>> bnx2x_stats_handle() try to down/up the semaphore, instead
>> bnx2x_stats_handle() should try to down the stats_sema at the top, and
>> then if successful, it should change the bp->stats_state, call the
>> action, and up the stats_sema. Would that work?
>>
> handle() is called from sleepable context (open/close) and timer
> context, then it's not possible to use semaphore for pretection

But it seems like all of the callees of bnx2x_stats_handle() are
already using the stats_sema for protection, and the only difference
is that bnx2x_stats_update uses down_trylock and the other callees use
down_timeout. What about making this explicit in bnx2x_stats_handle,
with something like:

if (event == STATS_EVENT_UPDATE) {
  if (down_trylock(&bp->stats_sema)) {
       BNX2X_ERR("stats down_trylock failed\n");
       goto out;
  }
else {
  if (down_timeout(&bp->stats_sema, HZ/10)) {
       BNX2X_ERR("stats down_timeout failed\n");
       goto out;
  }
}
bp->stats_state = ...;
action = ...;
action(bp);
up(&bp->stats_sema);

That would allow us to protect the bnx2x state machine so that the
action() events happen in the correct order, and we don't (e.g.)
accidentally end up doing an enable-then-disable when we wanted
disable-then-enable.

Would that work?

neal


>> neal
>> --
>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
>
> --
> Best regards,
> Dmitry
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitry Kravkov Sept. 7, 2013, 9:34 a.m. UTC | #5
> -----Original Message-----
> From: Neal Cardwell [mailto:ncardwell@google.com]
> Sent: Saturday, September 07, 2013 6:54 AM
> To: Dmitry Kravkov
> Cc: Dmitry Kravkov; Michal Schmidt; davem@davemloft.net;
> netdev@vger.kernel.org; Ariel Elior; Eilon Greenstein; Havard Skinnemoen;
> Eric Dumazet
> Subject: Re: [PATCH net] bnx2x: bail out if unable to acquire stats_sema
> 
> On Fri, Sep 6, 2013 at 2:40 AM, Dmitry Kravkov <dkravkov@gmail.com>
> wrote:
> > On Wed, Sep 4, 2013 at 8:17 PM, Neal Cardwell <ncardwell@google.com>
> wrote:
> >> On Tue, Sep 3, 2013 at 11:51 AM, Dmitry Kravkov
> <dmitry@broadcom.com> wrote:
> >>>> -----Original Message-----
> >>>> From: Michal Schmidt [mailto:mschmidt@redhat.com]
> >>>> Sent: Tuesday, September 03, 2013 6:46 PM
> >>>> To: davem@davemloft.net
> >>>> Cc: netdev@vger.kernel.org; Dmitry Kravkov; Ariel Elior; Eilon
> >>>> Greenstein
> >>>> Subject: [PATCH net] bnx2x: bail out if unable to acquire
> >>>> stats_sema
> >>>>
> >>>> If we fail to acquire stats_sema in the specified time limit, the
> >>>> chip is probably dead. It probably does not matter whether we try
> >>>> to continue or not, but certainly we should not up() the semaphore
> afterwards.
> >>>>
> >>>> Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
> >>>> ---
> >>>>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c | 24
> >>>> +++++++++++++++++------
> >>
> >> It seems like this patch has the downside that if the down_timeout()
> >> fails, then bnx2x_stats_handle() ends up updating the stats state
> >> machine's state without really executing the real body of the
> >> action().
> >>
> >> In fact it seems like there is a more general pre-existing problem of
> >> this flavor with the bnx2x stats state machine: the
> >> bnx2x_stats_handle() function updates the state machine
> >> bp->stats_state while holding the spin lock, but does not execute the
> >> action() while holding any sort of synchronization, so AFAICT there
> >> is nothing to guarantee that the state machine actions happen in the
> >> order the state machine wants them to happen. For example, if stats
> >> events fire such that we want to execute actions that disable and
> >> then enable stats, we could instead end up executing the actions in
> >> the order that would attempt to enable and then disable them, if we
> >> get unlucky with respect to when interrupts fire, etc.
> >>
> >> It seems to me that instead of having all of the callees of
> >> bnx2x_stats_handle() try to down/up the semaphore, instead
> >> bnx2x_stats_handle() should try to down the stats_sema at the top,
> >> and then if successful, it should change the bp->stats_state, call
> >> the action, and up the stats_sema. Would that work?
> >>
> > handle() is called from sleepable context (open/close) and timer
> > context, then it's not possible to use semaphore for pretection
> 
> But it seems like all of the callees of bnx2x_stats_handle() are already using
> the stats_sema for protection, and the only difference is that
> bnx2x_stats_update uses down_trylock and the other callees use
> down_timeout. What about making this explicit in bnx2x_stats_handle, with
> something like:
> 
> if (event == STATS_EVENT_UPDATE) {
>   if (down_trylock(&bp->stats_sema)) {
>        BNX2X_ERR("stats down_trylock failed\n");
>        goto out;
>   }
> else {
>   if (down_timeout(&bp->stats_sema, HZ/10)) {
>        BNX2X_ERR("stats down_timeout failed\n");
>        goto out;
>   }
> }
> bp->stats_state = ...;
> action = ...;
> action(bp);
> up(&bp->stats_sema);
> 
> That would allow us to protect the bnx2x state machine so that the
> action() events happen in the correct order, and we don't (e.g.) accidentally
> end up doing an enable-then-disable when we wanted disable-then-
> enable.
> 
> Would that work?
Looks like there is no a lot of difference  from current implementation - I will try it and report back...

> 
> neal
> 
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
index 86436c7..d60bb8b 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_stats.c
@@ -538,16 +538,20 @@  static void __bnx2x_stats_start(struct bnx2x *bp)
 
 static void bnx2x_stats_start(struct bnx2x *bp)
 {
-	if (down_timeout(&bp->stats_sema, HZ/10))
+	if (down_timeout(&bp->stats_sema, HZ/10)) {
 		BNX2X_ERR("Unable to acquire stats lock\n");
+		return;
+	}
 	__bnx2x_stats_start(bp);
 	up(&bp->stats_sema);
 }
 
 static void bnx2x_stats_pmf_start(struct bnx2x *bp)
 {
-	if (down_timeout(&bp->stats_sema, HZ/10))
+	if (down_timeout(&bp->stats_sema, HZ/10)) {
 		BNX2X_ERR("Unable to acquire stats lock\n");
+		return;
+	}
 	bnx2x_stats_comp(bp);
 	__bnx2x_stats_pmf_update(bp);
 	__bnx2x_stats_start(bp);
@@ -556,8 +560,10 @@  static void bnx2x_stats_pmf_start(struct bnx2x *bp)
 
 static void bnx2x_stats_pmf_update(struct bnx2x *bp)
 {
-	if (down_timeout(&bp->stats_sema, HZ/10))
+	if (down_timeout(&bp->stats_sema, HZ/10)) {
 		BNX2X_ERR("Unable to acquire stats lock\n");
+		return;
+	}
 	__bnx2x_stats_pmf_update(bp);
 	up(&bp->stats_sema);
 }
@@ -569,8 +575,10 @@  static void bnx2x_stats_restart(struct bnx2x *bp)
 	 */
 	if (IS_VF(bp))
 		return;
-	if (down_timeout(&bp->stats_sema, HZ/10))
+	if (down_timeout(&bp->stats_sema, HZ/10)) {
 		BNX2X_ERR("Unable to acquire stats lock\n");
+		return;
+	}
 	bnx2x_stats_comp(bp);
 	__bnx2x_stats_start(bp);
 	up(&bp->stats_sema);
@@ -1361,8 +1369,10 @@  static void bnx2x_stats_stop(struct bnx2x *bp)
 {
 	int update = 0;
 
-	if (down_timeout(&bp->stats_sema, HZ/10))
+	if (down_timeout(&bp->stats_sema, HZ/10)) {
 		BNX2X_ERR("Unable to acquire stats lock\n");
+		return;
+	}
 
 	bp->stats_started = false;
 
@@ -1997,8 +2007,10 @@  void bnx2x_afex_collect_stats(struct bnx2x *bp, void *void_afex_stats,
 void bnx2x_stats_safe_exec(struct bnx2x *bp,
 			   void (func_to_exec)(void *cookie),
 			   void *cookie){
-	if (down_timeout(&bp->stats_sema, HZ/10))
+	if (down_timeout(&bp->stats_sema, HZ/10)) {
 		BNX2X_ERR("Unable to acquire stats lock\n");
+		return;
+	}
 	bnx2x_stats_comp(bp);
 	func_to_exec(cookie);
 	__bnx2x_stats_start(bp);