diff mbox series

[2/2] i2c: smbus: Send alert notifications to all devices if source not found

Message ID 20220110172857.2980523-3-linux@roeck-us.net
State Superseded
Delegated to: Wolfram Sang
Headers show
Series i2c: smbus: Handle stuck alerts | expand

Commit Message

Guenter Roeck Jan. 10, 2022, 5:28 p.m. UTC
If a SMBUs alert is received and the originating device is not found,
the reason may be that the address reported on the SMBus alert address
is corrupted, for example because multiple devices asserted alert and
do not correctly implement SMBus arbitration.

If this happens, call alert handlers on all devices connected to the
given I2C bus, in the hope that this cleans up the situation. Retry
twice before giving up.

This change reliably fixed the problem on a system with multiple devices
on a single bus. Example log where the device on address 0x18 (ADM1021)
and on address 0x4c (ADM7461A) both had the alert line asserted:

smbus_alert 3-000c: SMBALERT# from dev 0x0c, flag 0
smbus_alert 3-000c: no driver alert()!
smbus_alert 3-000c: SMBALERT# from dev 0x0c, flag 0
smbus_alert 3-000c: no driver alert()!
lm90 3-0018: temp1 out of range, please check!
lm90 3-0018: Disabling ALERT#
lm90 3-0029: Everything OK
lm90 3-002a: Everything OK
lm90 3-004c: temp1 out of range, please check!
lm90 3-004c: temp2 out of range, please check!
lm90 3-004c: Disabling ALERT#

Fixes: b5527a7766f0 ("i2c: Add SMBus alert support")
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/i2c/i2c-smbus.c | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

Comments

Wolfram Sang July 28, 2024, 8:04 p.m. UTC | #1
On Mon, Jan 10, 2022 at 09:28:57AM -0800, Guenter Roeck wrote:
> If a SMBUs alert is received and the originating device is not found,
> the reason may be that the address reported on the SMBus alert address
> is corrupted, for example because multiple devices asserted alert and
> do not correctly implement SMBus arbitration.
> 
> If this happens, call alert handlers on all devices connected to the
> given I2C bus, in the hope that this cleans up the situation. Retry
> twice before giving up.

High level question: why the retry? Did you experience address
collisions going away on the second try? My guess is that they would be
mostly persistent, so we could call smbus_do_alert_force() right away?

> 
> This change reliably fixed the problem on a system with multiple devices
> on a single bus. Example log where the device on address 0x18 (ADM1021)
> and on address 0x4c (ADM7461A) both had the alert line asserted:
> 
> smbus_alert 3-000c: SMBALERT# from dev 0x0c, flag 0
> smbus_alert 3-000c: no driver alert()!
> smbus_alert 3-000c: SMBALERT# from dev 0x0c, flag 0
> smbus_alert 3-000c: no driver alert()!
> lm90 3-0018: temp1 out of range, please check!
> lm90 3-0018: Disabling ALERT#
> lm90 3-0029: Everything OK
> lm90 3-002a: Everything OK
> lm90 3-004c: temp1 out of range, please check!
> lm90 3-004c: temp2 out of range, please check!
> lm90 3-004c: Disabling ALERT#
> 
> Fixes: b5527a7766f0 ("i2c: Add SMBus alert support")
> Signed-off-by: Guenter Roeck <linux@roeck-us.net>
> ---
>  drivers/i2c/i2c-smbus.c | 38 ++++++++++++++++++++++++++++++++++++--
>  1 file changed, 36 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c
> index 533c885b99ac..f48cec19db41 100644
> --- a/drivers/i2c/i2c-smbus.c
> +++ b/drivers/i2c/i2c-smbus.c
> @@ -65,6 +65,32 @@ static int smbus_do_alert(struct device *dev, void *addrp)
>  	return ret;
>  }
>  
> +/* Same as above, but call back all drivers with alert handler */
> +
> +static int smbus_do_alert_force(struct device *dev, void *addrp)
> +{
> +	struct i2c_client *client = i2c_verify_client(dev);
> +	struct alert_data *data = addrp;
> +	struct i2c_driver *driver;
> +
> +	if (!client || (client->flags & I2C_CLIENT_TEN))
> +		return 0;
> +
> +	/*
> +	 * Drivers should either disable alerts, or provide at least
> +	 * a minimal handler. Lock so the driver won't change.
> +	 */
> +	device_lock(dev);
> +	if (client->dev.driver) {
> +		driver = to_i2c_driver(client->dev.driver);
> +		if (driver->alert)
> +			driver->alert(client, data->type, data->data);
> +	}
> +	device_unlock(dev);
> +
> +	return 0;
> +}
> +
>  /*
>   * The alert IRQ handler needs to hand work off to a task which can issue
>   * SMBus calls, because those sleeping calls can't be made in IRQ context.
> @@ -74,6 +100,7 @@ static irqreturn_t smbus_alert(int irq, void *d)
>  	struct i2c_smbus_alert *alert = d;
>  	struct i2c_client *ara;
>  	unsigned short prev_addr = 0;	/* Not a valid address */
> +	int retries = 0;
>  
>  	ara = alert->ara;
>  
> @@ -111,8 +138,15 @@ static irqreturn_t smbus_alert(int irq, void *d)
>  		 * Note: This assumes that a driver with alert handler handles
>  		 * the alert properly and clears it if necessary.
>  		 */
> -		if (data.addr == prev_addr && status != -EBUSY)
> -			break;
> +		if (data.addr == prev_addr && status != -EBUSY) {
> +			/* retry once */
> +			if (retries++)
> +				break;
> +			device_for_each_child(&ara->adapter->dev, &data,
> +					      smbus_do_alert_force);
> +		} else {
> +			retries = 0;
> +		}
>  		prev_addr = data.addr;
>  	}
>  
> -- 
> 2.33.0
>
Guenter Roeck July 29, 2024, 12:31 a.m. UTC | #2
On 7/28/24 13:04, Wolfram Sang wrote:
> On Mon, Jan 10, 2022 at 09:28:57AM -0800, Guenter Roeck wrote:
>> If a SMBUs alert is received and the originating device is not found,
>> the reason may be that the address reported on the SMBus alert address
>> is corrupted, for example because multiple devices asserted alert and
>> do not correctly implement SMBus arbitration.
>>
>> If this happens, call alert handlers on all devices connected to the
>> given I2C bus, in the hope that this cleans up the situation. Retry
>> twice before giving up.
> 
> High level question: why the retry? Did you experience address
> collisions going away on the second try? My guess is that they would be
> mostly persistent, so we could call smbus_do_alert_force() right away?
> 

I honestly don't recall. I had some brute force code to trigger alerts
on connected chips. Maybe the idea was to catch situations where another
alert was raised after or during the first cycle.

As for "call smbus_do_alert_force() right away", I am not sure I understand.
Isn't that what the code is doing twice ?

Thanks,
Guenter

>>
>> This change reliably fixed the problem on a system with multiple devices
>> on a single bus. Example log where the device on address 0x18 (ADM1021)
>> and on address 0x4c (ADM7461A) both had the alert line asserted:
Side note: That was ADT7461A, not ADM7461A.


>>
>> smbus_alert 3-000c: SMBALERT# from dev 0x0c, flag 0
>> smbus_alert 3-000c: no driver alert()!
>> smbus_alert 3-000c: SMBALERT# from dev 0x0c, flag 0
>> smbus_alert 3-000c: no driver alert()!
>> lm90 3-0018: temp1 out of range, please check!
>> lm90 3-0018: Disabling ALERT#
>> lm90 3-0029: Everything OK
>> lm90 3-002a: Everything OK
>> lm90 3-004c: temp1 out of range, please check!
>> lm90 3-004c: temp2 out of range, please check!
>> lm90 3-004c: Disabling ALERT#
>>
>> Fixes: b5527a7766f0 ("i2c: Add SMBus alert support")
>> Signed-off-by: Guenter Roeck <linux@roeck-us.net>
>> ---
>>   drivers/i2c/i2c-smbus.c | 38 ++++++++++++++++++++++++++++++++++++--
>>   1 file changed, 36 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c
>> index 533c885b99ac..f48cec19db41 100644
>> --- a/drivers/i2c/i2c-smbus.c
>> +++ b/drivers/i2c/i2c-smbus.c
>> @@ -65,6 +65,32 @@ static int smbus_do_alert(struct device *dev, void *addrp)
>>   	return ret;
>>   }
>>   
>> +/* Same as above, but call back all drivers with alert handler */
>> +
>> +static int smbus_do_alert_force(struct device *dev, void *addrp)
>> +{
>> +	struct i2c_client *client = i2c_verify_client(dev);
>> +	struct alert_data *data = addrp;
>> +	struct i2c_driver *driver;
>> +
>> +	if (!client || (client->flags & I2C_CLIENT_TEN))
>> +		return 0;
>> +
>> +	/*
>> +	 * Drivers should either disable alerts, or provide at least
>> +	 * a minimal handler. Lock so the driver won't change.
>> +	 */
>> +	device_lock(dev);
>> +	if (client->dev.driver) {
>> +		driver = to_i2c_driver(client->dev.driver);
>> +		if (driver->alert)
>> +			driver->alert(client, data->type, data->data);
>> +	}
>> +	device_unlock(dev);
>> +
>> +	return 0;
>> +}
>> +
>>   /*
>>    * The alert IRQ handler needs to hand work off to a task which can issue
>>    * SMBus calls, because those sleeping calls can't be made in IRQ context.
>> @@ -74,6 +100,7 @@ static irqreturn_t smbus_alert(int irq, void *d)
>>   	struct i2c_smbus_alert *alert = d;
>>   	struct i2c_client *ara;
>>   	unsigned short prev_addr = 0;	/* Not a valid address */
>> +	int retries = 0;
>>   
>>   	ara = alert->ara;
>>   
>> @@ -111,8 +138,15 @@ static irqreturn_t smbus_alert(int irq, void *d)
>>   		 * Note: This assumes that a driver with alert handler handles
>>   		 * the alert properly and clears it if necessary.
>>   		 */
>> -		if (data.addr == prev_addr && status != -EBUSY)
>> -			break;
>> +		if (data.addr == prev_addr && status != -EBUSY) {
>> +			/* retry once */
>> +			if (retries++)
>> +				break;
>> +			device_for_each_child(&ara->adapter->dev, &data,
>> +					      smbus_do_alert_force);
>> +		} else {
>> +			retries = 0;
>> +		}
>>   		prev_addr = data.addr;
>>   	}
>>   
>> -- 
>> 2.33.0
>>
Wolfram Sang July 29, 2024, 7:57 a.m. UTC | #3
Hi Guenter,

thanks for the feedback!

> > High level question: why the retry? Did you experience address
> > collisions going away on the second try? My guess is that they would be
> > mostly persistent, so we could call smbus_do_alert_force() right away?
> > 
> 
> I honestly don't recall. I had some brute force code to trigger alerts
> on connected chips. Maybe the idea was to catch situations where another
> alert was raised after or during the first cycle.

Hmm, I'd think that SMBAlert then stays asserted and the whole alert
handling will be started right away a second time? Given that all
hardware works correctly, of course. Your setup showed that arbitration
does not work well with actual hardware. Props for finding this out!

> As for "call smbus_do_alert_force() right away", I am not sure I understand.
> Isn't that what the code is doing twice ?

It calls smbus_do_alert() twice (without '_force'). If that fails, it
calls the _force version. I am wondering now if we can't call the _force
version right after smbus_do_alert() fails once. Meaning we could remove
all the "retries" code from your patch. If there is no clear reason for
the code, not having it is easier to maintain. That's why I ask.

I hope the question is understandable now.

Happy hacking,

   Wolfram
Guenter Roeck July 29, 2024, 2:23 p.m. UTC | #4
On 7/29/24 00:57, Wolfram Sang wrote:
> Hi Guenter,
> 
> thanks for the feedback!
> 
>>> High level question: why the retry? Did you experience address
>>> collisions going away on the second try? My guess is that they would be
>>> mostly persistent, so we could call smbus_do_alert_force() right away?
>>>
>>
>> I honestly don't recall. I had some brute force code to trigger alerts
>> on connected chips. Maybe the idea was to catch situations where another
>> alert was raised after or during the first cycle.
> 
> Hmm, I'd think that SMBAlert then stays asserted and the whole alert
> handling will be started right away a second time? Given that all
> hardware works correctly, of course. Your setup showed that arbitration
> does not work well with actual hardware. Props for finding this out!
> 
>> As for "call smbus_do_alert_force() right away", I am not sure I understand.
>> Isn't that what the code is doing twice ?
> 
> It calls smbus_do_alert() twice (without '_force'). If that fails, it
> calls the _force version. I am wondering now if we can't call the _force
> version right after smbus_do_alert() fails once. Meaning we could remove
> all the "retries" code from your patch. If there is no clear reason for
> the code, not having it is easier to maintain. That's why I ask.
> 
> I hope the question is understandable now.
> 

I looked into the code again. The sequence is (or is supposed to be):

1st loop:
	if (!alert_pending)
		break;
	smbus_do_alert()
	if (failed at same address)
		smbus_do_alert_force()

2nd loop:
	if (!alert_pending)
		break;
	smbus_do_alert()
	if (failed at same address)
		break;

I think what you are suggesting is

1st loop:
	if (!alert_pending)
		break;
	smbus_do_alert()
	if (failed at same address)
		retries++;
2nd loop:
	if (!alert_pending)
		break;
	smbus_do_alert_force()
	if (failed at same address && retries)
		break;

But in reality that would not be much different because the alert status
is checked prior to calling smbus_do_alert() again.

With your suggestion (if I understand it correctly), the code would be
something like

                 /* Notify driver for the device which issued the alert */
                 status = device_for_each_child(&ara->adapter->dev, &data,
                                                retries ? smbus_do_alert_force : smbus_do_alert);
                 /*
                  * If we read the same address more than once, and the alert
                  * was not handled by a driver, it won't do any good to repeat
                  * the loop because it will never terminate.
                  * Bail out in this case.
                  * Note: This assumes that a driver with alert handler handles
                  * the alert properly and clears it if necessary.
                  */
                 if (data.addr == prev_addr && status != -EBUSY) {
                         /* retry once */
                         if (retries++)
                                 break;
                 } else {
                         retries = 0;
                 }

I don't know, I prefer my code. It keeps the exception /retry handling in one
place. Personal preference, maybe. Either case, retries could probably be made
a boolean.

Thanks,
Guenter
Wolfram Sang July 29, 2024, 6:36 p.m. UTC | #5
> I looked into the code again. The sequence is (or is supposed to be):
> 
> 1st loop:
> 	if (!alert_pending)
> 		break;
> 	smbus_do_alert()
> 	if (failed at same address)
> 		smbus_do_alert_force()
> 
> 2nd loop:
> 	if (!alert_pending)
> 		break;
> 	smbus_do_alert()
> 	if (failed at same address)
> 		break;
> 
> I think what you are suggesting is
...

What I am suggesting is more like this:

1st loop:

 	smbus_do_alert()
	//impossible to have same address on first run, so go to 2nd loop

2nd loop:

 	smbus_do_alert()
 	if (failed at same address)
 		smbus_do_alert_force()
		break;

As I understand it, your sequence is missing "my" 1st loop with the
invalid address, so you will end up having 3 loops altogether?

The code I am suggesting is bascially yours without the retries
variable:

	status = device_for_each_child(&ara->adapter->dev, &data,
				       smbus_do_alert);
	if (data.addr == prev_addr && status != -EBUSY) {
		device_for_each_child(&ara->adapter->dev, &data,
				      smbus_do_alert_force);
		break;
	}
	prev_addr = data.addr;

Makes sense or am I missing something?
Guenter Roeck July 29, 2024, 6:44 p.m. UTC | #6
On 7/29/24 11:36, Wolfram Sang wrote:
> 
>> I looked into the code again. The sequence is (or is supposed to be):
>>
>> 1st loop:
>> 	if (!alert_pending)
>> 		break;
>> 	smbus_do_alert()
>> 	if (failed at same address)
>> 		smbus_do_alert_force()
>>
>> 2nd loop:
>> 	if (!alert_pending)
>> 		break;
>> 	smbus_do_alert()
>> 	if (failed at same address)
>> 		break;
>>
>> I think what you are suggesting is
> ...
> 
> What I am suggesting is more like this:
> 
> 1st loop:
> 
>   	smbus_do_alert()
> 	//impossible to have same address on first run, so go to 2nd loop
> 
> 2nd loop:
> 
>   	smbus_do_alert()
>   	if (failed at same address)
>   		smbus_do_alert_force()
> 		break;
> 
> As I understand it, your sequence is missing "my" 1st loop with the
> invalid address, so you will end up having 3 loops altogether?
> 
> The code I am suggesting is bascially yours without the retries
> variable:
> 
> 	status = device_for_each_child(&ara->adapter->dev, &data,
> 				       smbus_do_alert);
> 	if (data.addr == prev_addr && status != -EBUSY) {
> 		device_for_each_child(&ara->adapter->dev, &data,
> 				      smbus_do_alert_force);
> 		break;
> 	}
> 	prev_addr = data.addr;
> 
> Makes sense or am I missing something?
> 

Yes, that should work and is indeed simpler. You are correct, the
additional loop should not be necessary since smbus_do_alert_force()
should already call all connected drivers and hopefully clear
the alert condition on those.

Thanks,
Guenter
Wolfram Sang July 29, 2024, 8:52 p.m. UTC | #7
> > The code I am suggesting is bascially yours without the retries
> > variable:
> > 
> > 	status = device_for_each_child(&ara->adapter->dev, &data,
> > 				       smbus_do_alert);
> > 	if (data.addr == prev_addr && status != -EBUSY) {
> > 		device_for_each_child(&ara->adapter->dev, &data,
> > 				      smbus_do_alert_force);
> > 		break;
> > 	}
> > 	prev_addr = data.addr;
> > 
> > Makes sense or am I missing something?
> > 
> 
> Yes, that should work and is indeed simpler. You are correct, the
> additional loop should not be necessary since smbus_do_alert_force()
> should already call all connected drivers and hopefully clear
> the alert condition on those.

Great that we are aligned now! Do you have time to rework and test the
patch?
Guenter Roeck July 29, 2024, 9:39 p.m. UTC | #8
On 7/29/24 13:52, Wolfram Sang wrote:
> 
>>> The code I am suggesting is bascially yours without the retries
>>> variable:
>>>
>>> 	status = device_for_each_child(&ara->adapter->dev, &data,
>>> 				       smbus_do_alert);
>>> 	if (data.addr == prev_addr && status != -EBUSY) {
>>> 		device_for_each_child(&ara->adapter->dev, &data,
>>> 				      smbus_do_alert_force);
>>> 		break;
>>> 	}
>>> 	prev_addr = data.addr;
>>>
>>> Makes sense or am I missing something?
>>>
>>
>> Yes, that should work and is indeed simpler. You are correct, the
>> additional loop should not be necessary since smbus_do_alert_force()
>> should already call all connected drivers and hopefully clear
>> the alert condition on those.
> 
> Great that we are aligned now! Do you have time to rework and test the
> patch?
> 

I'll give it a try.

Guenter
diff mbox series

Patch

diff --git a/drivers/i2c/i2c-smbus.c b/drivers/i2c/i2c-smbus.c
index 533c885b99ac..f48cec19db41 100644
--- a/drivers/i2c/i2c-smbus.c
+++ b/drivers/i2c/i2c-smbus.c
@@ -65,6 +65,32 @@  static int smbus_do_alert(struct device *dev, void *addrp)
 	return ret;
 }
 
+/* Same as above, but call back all drivers with alert handler */
+
+static int smbus_do_alert_force(struct device *dev, void *addrp)
+{
+	struct i2c_client *client = i2c_verify_client(dev);
+	struct alert_data *data = addrp;
+	struct i2c_driver *driver;
+
+	if (!client || (client->flags & I2C_CLIENT_TEN))
+		return 0;
+
+	/*
+	 * Drivers should either disable alerts, or provide at least
+	 * a minimal handler. Lock so the driver won't change.
+	 */
+	device_lock(dev);
+	if (client->dev.driver) {
+		driver = to_i2c_driver(client->dev.driver);
+		if (driver->alert)
+			driver->alert(client, data->type, data->data);
+	}
+	device_unlock(dev);
+
+	return 0;
+}
+
 /*
  * The alert IRQ handler needs to hand work off to a task which can issue
  * SMBus calls, because those sleeping calls can't be made in IRQ context.
@@ -74,6 +100,7 @@  static irqreturn_t smbus_alert(int irq, void *d)
 	struct i2c_smbus_alert *alert = d;
 	struct i2c_client *ara;
 	unsigned short prev_addr = 0;	/* Not a valid address */
+	int retries = 0;
 
 	ara = alert->ara;
 
@@ -111,8 +138,15 @@  static irqreturn_t smbus_alert(int irq, void *d)
 		 * Note: This assumes that a driver with alert handler handles
 		 * the alert properly and clears it if necessary.
 		 */
-		if (data.addr == prev_addr && status != -EBUSY)
-			break;
+		if (data.addr == prev_addr && status != -EBUSY) {
+			/* retry once */
+			if (retries++)
+				break;
+			device_for_each_child(&ara->adapter->dev, &data,
+					      smbus_do_alert_force);
+		} else {
+			retries = 0;
+		}
 		prev_addr = data.addr;
 	}