diff mbox series

[2/3] thermal/drivers/tegra: Remove get_trend function

Message ID 20220616202537.303655-2-daniel.lezcano@linaro.org
State Not Applicable
Headers show
Series None | expand

Commit Message

Daniel Lezcano June 16, 2022, 8:25 p.m. UTC
The get_trend function does already what the generic framework does.

Remove it.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/tegra/soctherm.c | 32 --------------------------------
 1 file changed, 32 deletions(-)

Comments

Dmitry Osipenko June 18, 2022, 12:44 p.m. UTC | #1
16.06.2022 23:25, Daniel Lezcano пишет:
> The get_trend function does already what the generic framework does.
> 
> Remove it.
> 
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> ---
>  drivers/thermal/tegra/soctherm.c | 32 --------------------------------
>  1 file changed, 32 deletions(-)
> 
> diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c
> index 210325f92559..825eab526619 100644
> --- a/drivers/thermal/tegra/soctherm.c
> +++ b/drivers/thermal/tegra/soctherm.c
> @@ -633,37 +633,6 @@ static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp)
>  	return 0;
>  }
>  
> -static int tegra_thermctl_get_trend(void *data, int trip,
> -				    enum thermal_trend *trend)
> -{
> -	struct tegra_thermctl_zone *zone = data;
> -	struct thermal_zone_device *tz = zone->tz;
> -	int trip_temp, temp, last_temp, ret;
> -
> -	if (!tz)
> -		return -EINVAL;
> -
> -	ret = tz->ops->get_trip_temp(zone->tz, trip, &trip_temp);
> -	if (ret)
> -		return ret;
> -
> -	temp = READ_ONCE(tz->temperature);
> -	last_temp = READ_ONCE(tz->last_temperature);
> -
> -	if (temp > trip_temp) {
> -		if (temp >= last_temp)
> -			*trend = THERMAL_TREND_RAISING;
> -		else
> -			*trend = THERMAL_TREND_STABLE;
> -	} else if (temp < trip_temp) {
> -		*trend = THERMAL_TREND_DROPPING;
> -	} else {
> -		*trend = THERMAL_TREND_STABLE;
> -	}
> -
> -	return 0;
> -}
> -
>  static void thermal_irq_enable(struct tegra_thermctl_zone *zn)
>  {
>  	u32 r;
> @@ -716,7 +685,6 @@ static int tegra_thermctl_set_trips(void *data, int lo, int hi)
>  static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
>  	.get_temp = tegra_thermctl_get_temp,
>  	.set_trip_temp = tegra_thermctl_set_trip_temp,
> -	.get_trend = tegra_thermctl_get_trend,
>  	.set_trips = tegra_thermctl_set_trips,
>  };
>  

The framework doesn't use the trip temperature, is it really the same?
Previously, if temperature was above the trip and was dropping, then it
was THERMAL_TREND_STABLE instead of THERMAL_TREND_DROPPING.
Daniel Lezcano June 20, 2022, 3:17 p.m. UTC | #2
On 18/06/2022 14:44, Dmitry Osipenko wrote:
> 16.06.2022 23:25, Daniel Lezcano пишет:
>> The get_trend function does already what the generic framework does.
>>
>> Remove it.
>>
>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
>> ---

[ ... ]

>>   static void thermal_irq_enable(struct tegra_thermctl_zone *zn)
>>   {
>>   	u32 r;
>> @@ -716,7 +685,6 @@ static int tegra_thermctl_set_trips(void *data, int lo, int hi)
>>   static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
>>   	.get_temp = tegra_thermctl_get_temp,
>>   	.set_trip_temp = tegra_thermctl_set_trip_temp,
>> -	.get_trend = tegra_thermctl_get_trend,
>>   	.set_trips = tegra_thermctl_set_trips,
>>   };
>>   
> 
> The framework doesn't use the trip temperature, is it really the same?
> Previously, if temperature was above the trip and was dropping, then it
> was THERMAL_TREND_STABLE instead of THERMAL_TREND_DROPPING.

Actually, the only difference is the temp > trip and the temperature < 
last_temperature. It results in the STABLE trend and the governor does 
nothing.

With the core trend function for the same inputs, temperature < 
last_temperature results in a DROPPING but as temp > trip the 'throttle' 
boolean is true in the governor, and get_next_state() with DROPPING + 
throttle=true, results in nothing because the action happens when 
throttle=false.

All the combinations result at end at the same action from the governor.
Daniel Lezcano June 28, 2022, 8:41 a.m. UTC | #3
Thierry, Dmitry,

are fine with this patch?

Thanks

   -- Daniel

On 16/06/2022 22:25, Daniel Lezcano wrote:
> The get_trend function does already what the generic framework does.
> 
> Remove it.
> 
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> ---
>   drivers/thermal/tegra/soctherm.c | 32 --------------------------------
>   1 file changed, 32 deletions(-)
> 
> diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c
> index 210325f92559..825eab526619 100644
> --- a/drivers/thermal/tegra/soctherm.c
> +++ b/drivers/thermal/tegra/soctherm.c
> @@ -633,37 +633,6 @@ static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp)
>   	return 0;
>   }
>   
> -static int tegra_thermctl_get_trend(void *data, int trip,
> -				    enum thermal_trend *trend)
> -{
> -	struct tegra_thermctl_zone *zone = data;
> -	struct thermal_zone_device *tz = zone->tz;
> -	int trip_temp, temp, last_temp, ret;
> -
> -	if (!tz)
> -		return -EINVAL;
> -
> -	ret = tz->ops->get_trip_temp(zone->tz, trip, &trip_temp);
> -	if (ret)
> -		return ret;
> -
> -	temp = READ_ONCE(tz->temperature);
> -	last_temp = READ_ONCE(tz->last_temperature);
> -
> -	if (temp > trip_temp) {
> -		if (temp >= last_temp)
> -			*trend = THERMAL_TREND_RAISING;
> -		else
> -			*trend = THERMAL_TREND_STABLE;
> -	} else if (temp < trip_temp) {
> -		*trend = THERMAL_TREND_DROPPING;
> -	} else {
> -		*trend = THERMAL_TREND_STABLE;
> -	}
> -
> -	return 0;
> -}
> -
>   static void thermal_irq_enable(struct tegra_thermctl_zone *zn)
>   {
>   	u32 r;
> @@ -716,7 +685,6 @@ static int tegra_thermctl_set_trips(void *data, int lo, int hi)
>   static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
>   	.get_temp = tegra_thermctl_get_temp,
>   	.set_trip_temp = tegra_thermctl_set_trip_temp,
> -	.get_trend = tegra_thermctl_get_trend,
>   	.set_trips = tegra_thermctl_set_trips,
>   };
>
Dmitry Osipenko June 28, 2022, 11:44 a.m. UTC | #4
On 6/28/22 11:41, Daniel Lezcano wrote:
> 
> Thierry, Dmitry,
> 
> are fine with this patch?

Seems should be good. I couldn't test it using recent the linux-next
because of a lockup in LM90 driver. There were quite a lot of changes in
LM90 recently, adding Guenter.

INFO: task kworker/3:1:44 blocked for more than 61 seconds.
      Not tainted 5.19.0-rc4-next-20220627-00012-g08b697b94b8a #2
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/3:1     state:D stack:    0 pid:   44 ppid:     2
flags:0x00000000
Workqueue: events_freezable_power_ thermal_zone_device_check
Backtrace:
 __schedule from schedule+0x60/0xcc
 r10:c0fead70 r9:c2854c94 r8:df9a1dac r7:c2814b40 r6:00000002 r5:c1883020
 r4:c2814b40
 schedule from schedule_preempt_disabled+0x28/0x38
 r5:c1883020 r4:c2814b40
 schedule_preempt_disabled from __mutex_lock.constprop.0+0x1e0/0x9ac
 r5:c1883020 r4:c2854c90
 __mutex_lock.constprop.0 from __mutex_lock_slowpath+0x1c/0x20
 r10:00000000 r9:c1882ae0 r8:c2854c90 r7:c2854c40 r6:00000001 r5:00000001
 r4:c2854c90
 __mutex_lock_slowpath from mutex_lock+0x60/0x64
 mutex_lock from lm90_read+0x40/0x3d4
 r5:00000001 r4:c2854e08
 lm90_read from hwmon_thermal_get_temp+0x58/0x8c
 r9:c1882ae0 r8:c2814b40 r7:de6aee00 r6:c1db1660 r5:c0af7940 r4:df9a1eb8
 hwmon_thermal_get_temp from of_thermal_get_temp+0x38/0x44
 r5:df9a1eb8 r4:c1db1400
 of_thermal_get_temp from thermal_zone_get_temp+0x58/0x78
 thermal_zone_get_temp from thermal_zone_device_update.part.0+0x4c/0x450
 r7:de6aee00 r6:c1db1400 r5:00000000 r4:c1db1400
 thermal_zone_device_update.part.0 from thermal_zone_device_check+0x58/0x5c
 r10:00000000 r9:c1882ae0 r8:c2814b40 r7:de6aee00 r6:c1db1400 r5:c1db1660
 r4:00000001
 thermal_zone_device_check from process_one_work+0x21c/0x530
 r7:de6aee00 r6:de6ab600 r5:c2802c00 r4:c1db167c
 process_one_work from worker_thread+0x19c/0x5cc
 r10:00000008 r9:c2814b40 r8:c1703d40 r7:de6ab61c r6:c2802c18 r5:de6ab600
 r4:c2802c00
 worker_thread from kthread+0x100/0x120
 r10:00000000 r9:df895e80 r8:c285e3c0 r7:c2802c00 r6:c014cf84 r5:c285e300
 r4:c2814b40
 kthread from ret_from_fork+0x14/0x2c
Exception stack(0xdf9a1fb0 to 0xdf9a1ff8)

> On 16/06/2022 22:25, Daniel Lezcano wrote:
>> The get_trend function does already what the generic framework does.
>>
>> Remove it.
>>
>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
>> ---
>>   drivers/thermal/tegra/soctherm.c | 32 --------------------------------
>>   1 file changed, 32 deletions(-)
>>
>> diff --git a/drivers/thermal/tegra/soctherm.c
>> b/drivers/thermal/tegra/soctherm.c
>> index 210325f92559..825eab526619 100644
>> --- a/drivers/thermal/tegra/soctherm.c
>> +++ b/drivers/thermal/tegra/soctherm.c
>> @@ -633,37 +633,6 @@ static int tegra_thermctl_set_trip_temp(void
>> *data, int trip, int temp)
>>       return 0;
>>   }
>>   -static int tegra_thermctl_get_trend(void *data, int trip,
>> -                    enum thermal_trend *trend)
>> -{
>> -    struct tegra_thermctl_zone *zone = data;
>> -    struct thermal_zone_device *tz = zone->tz;
>> -    int trip_temp, temp, last_temp, ret;
>> -
>> -    if (!tz)
>> -        return -EINVAL;
>> -
>> -    ret = tz->ops->get_trip_temp(zone->tz, trip, &trip_temp);
>> -    if (ret)
>> -        return ret;
>> -
>> -    temp = READ_ONCE(tz->temperature);
>> -    last_temp = READ_ONCE(tz->last_temperature);
>> -
>> -    if (temp > trip_temp) {
>> -        if (temp >= last_temp)
>> -            *trend = THERMAL_TREND_RAISING;
>> -        else
>> -            *trend = THERMAL_TREND_STABLE;
>> -    } else if (temp < trip_temp) {
>> -        *trend = THERMAL_TREND_DROPPING;
>> -    } else {
>> -        *trend = THERMAL_TREND_STABLE;
>> -    }
>> -
>> -    return 0;
>> -}
>> -
>>   static void thermal_irq_enable(struct tegra_thermctl_zone *zn)
>>   {
>>       u32 r;
>> @@ -716,7 +685,6 @@ static int tegra_thermctl_set_trips(void *data,
>> int lo, int hi)
>>   static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
>>       .get_temp = tegra_thermctl_get_temp,
>>       .set_trip_temp = tegra_thermctl_set_trip_temp,
>> -    .get_trend = tegra_thermctl_get_trend,
>>       .set_trips = tegra_thermctl_set_trips,
>>   };
>>   
> 
>
Guenter Roeck June 28, 2022, 3:10 p.m. UTC | #5
On Tue, Jun 28, 2022 at 02:44:31PM +0300, Dmitry Osipenko wrote:
> On 6/28/22 11:41, Daniel Lezcano wrote:
> > 
> > Thierry, Dmitry,
> > 
> > are fine with this patch?
> 
> Seems should be good. I couldn't test it using recent the linux-next
> because of a lockup in LM90 driver. There were quite a lot of changes in
> LM90 recently, adding Guenter.
> 

Weird, I tested those changes to death with real hardware, and I don't
see a code path where the mutex can be left in blocked state unless the
underlying i2c driver locks up for some reason. What is the platform,
and can you point me to the devicetree file ? Also, is there anything
else lm90 or i2c related in the kernel log ?

Thanks,
Guenter

> INFO: task kworker/3:1:44 blocked for more than 61 seconds.
>       Not tainted 5.19.0-rc4-next-20220627-00012-g08b697b94b8a #2
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> task:kworker/3:1     state:D stack:    0 pid:   44 ppid:     2
> flags:0x00000000
> Workqueue: events_freezable_power_ thermal_zone_device_check
> Backtrace:
>  __schedule from schedule+0x60/0xcc
>  r10:c0fead70 r9:c2854c94 r8:df9a1dac r7:c2814b40 r6:00000002 r5:c1883020
>  r4:c2814b40
>  schedule from schedule_preempt_disabled+0x28/0x38
>  r5:c1883020 r4:c2814b40
>  schedule_preempt_disabled from __mutex_lock.constprop.0+0x1e0/0x9ac
>  r5:c1883020 r4:c2854c90
>  __mutex_lock.constprop.0 from __mutex_lock_slowpath+0x1c/0x20
>  r10:00000000 r9:c1882ae0 r8:c2854c90 r7:c2854c40 r6:00000001 r5:00000001
>  r4:c2854c90
>  __mutex_lock_slowpath from mutex_lock+0x60/0x64
>  mutex_lock from lm90_read+0x40/0x3d4
>  r5:00000001 r4:c2854e08
>  lm90_read from hwmon_thermal_get_temp+0x58/0x8c
>  r9:c1882ae0 r8:c2814b40 r7:de6aee00 r6:c1db1660 r5:c0af7940 r4:df9a1eb8
>  hwmon_thermal_get_temp from of_thermal_get_temp+0x38/0x44
>  r5:df9a1eb8 r4:c1db1400
>  of_thermal_get_temp from thermal_zone_get_temp+0x58/0x78
>  thermal_zone_get_temp from thermal_zone_device_update.part.0+0x4c/0x450
>  r7:de6aee00 r6:c1db1400 r5:00000000 r4:c1db1400
>  thermal_zone_device_update.part.0 from thermal_zone_device_check+0x58/0x5c
>  r10:00000000 r9:c1882ae0 r8:c2814b40 r7:de6aee00 r6:c1db1400 r5:c1db1660
>  r4:00000001
>  thermal_zone_device_check from process_one_work+0x21c/0x530
>  r7:de6aee00 r6:de6ab600 r5:c2802c00 r4:c1db167c
>  process_one_work from worker_thread+0x19c/0x5cc
>  r10:00000008 r9:c2814b40 r8:c1703d40 r7:de6ab61c r6:c2802c18 r5:de6ab600
>  r4:c2802c00
>  worker_thread from kthread+0x100/0x120
>  r10:00000000 r9:df895e80 r8:c285e3c0 r7:c2802c00 r6:c014cf84 r5:c285e300
>  r4:c2814b40
>  kthread from ret_from_fork+0x14/0x2c
> Exception stack(0xdf9a1fb0 to 0xdf9a1ff8)
> 
> > On 16/06/2022 22:25, Daniel Lezcano wrote:
> >> The get_trend function does already what the generic framework does.
> >>
> >> Remove it.
> >>
> >> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> >> ---
> >>   drivers/thermal/tegra/soctherm.c | 32 --------------------------------
> >>   1 file changed, 32 deletions(-)
> >>
> >> diff --git a/drivers/thermal/tegra/soctherm.c
> >> b/drivers/thermal/tegra/soctherm.c
> >> index 210325f92559..825eab526619 100644
> >> --- a/drivers/thermal/tegra/soctherm.c
> >> +++ b/drivers/thermal/tegra/soctherm.c
> >> @@ -633,37 +633,6 @@ static int tegra_thermctl_set_trip_temp(void
> >> *data, int trip, int temp)
> >>       return 0;
> >>   }
> >>   -static int tegra_thermctl_get_trend(void *data, int trip,
> >> -                    enum thermal_trend *trend)
> >> -{
> >> -    struct tegra_thermctl_zone *zone = data;
> >> -    struct thermal_zone_device *tz = zone->tz;
> >> -    int trip_temp, temp, last_temp, ret;
> >> -
> >> -    if (!tz)
> >> -        return -EINVAL;
> >> -
> >> -    ret = tz->ops->get_trip_temp(zone->tz, trip, &trip_temp);
> >> -    if (ret)
> >> -        return ret;
> >> -
> >> -    temp = READ_ONCE(tz->temperature);
> >> -    last_temp = READ_ONCE(tz->last_temperature);
> >> -
> >> -    if (temp > trip_temp) {
> >> -        if (temp >= last_temp)
> >> -            *trend = THERMAL_TREND_RAISING;
> >> -        else
> >> -            *trend = THERMAL_TREND_STABLE;
> >> -    } else if (temp < trip_temp) {
> >> -        *trend = THERMAL_TREND_DROPPING;
> >> -    } else {
> >> -        *trend = THERMAL_TREND_STABLE;
> >> -    }
> >> -
> >> -    return 0;
> >> -}
> >> -
> >>   static void thermal_irq_enable(struct tegra_thermctl_zone *zn)
> >>   {
> >>       u32 r;
> >> @@ -716,7 +685,6 @@ static int tegra_thermctl_set_trips(void *data,
> >> int lo, int hi)
> >>   static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
> >>       .get_temp = tegra_thermctl_get_temp,
> >>       .set_trip_temp = tegra_thermctl_set_trip_temp,
> >> -    .get_trend = tegra_thermctl_get_trend,
> >>       .set_trips = tegra_thermctl_set_trips,
> >>   };
> >>   
> > 
> > 
>
Guenter Roeck June 28, 2022, 6:43 p.m. UTC | #6
On Tue, Jun 28, 2022 at 08:10:30AM -0700, Guenter Roeck wrote:
> On Tue, Jun 28, 2022 at 02:44:31PM +0300, Dmitry Osipenko wrote:
> > On 6/28/22 11:41, Daniel Lezcano wrote:
> > > 
> > > Thierry, Dmitry,
> > > 
> > > are fine with this patch?
> > 
> > Seems should be good. I couldn't test it using recent the linux-next
> > because of a lockup in LM90 driver. There were quite a lot of changes in
> > LM90 recently, adding Guenter.
> > 
> 
> Weird, I tested those changes to death with real hardware, and I don't
> see a code path where the mutex can be left in blocked state unless the
> underlying i2c driver locks up for some reason. What is the platform,
> and can you point me to the devicetree file ? Also, is there anything
> else lm90 or i2c related in the kernel log ?
> 

Follow-up question: I see that various Tegra systems use lm90 compatible
chips, and the interrupt line is in general wired up. Can you check if
you get lots of interrupts on that interrupt line ? Also, can you check
what happens if you read hwmon attributes directly ?

Thanks,
Guenter

> Thanks,
> Guenter
> 
> > INFO: task kworker/3:1:44 blocked for more than 61 seconds.
> >       Not tainted 5.19.0-rc4-next-20220627-00012-g08b697b94b8a #2
> > "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > task:kworker/3:1     state:D stack:    0 pid:   44 ppid:     2
> > flags:0x00000000
> > Workqueue: events_freezable_power_ thermal_zone_device_check
> > Backtrace:
> >  __schedule from schedule+0x60/0xcc
> >  r10:c0fead70 r9:c2854c94 r8:df9a1dac r7:c2814b40 r6:00000002 r5:c1883020
> >  r4:c2814b40
> >  schedule from schedule_preempt_disabled+0x28/0x38
> >  r5:c1883020 r4:c2814b40
> >  schedule_preempt_disabled from __mutex_lock.constprop.0+0x1e0/0x9ac
> >  r5:c1883020 r4:c2854c90
> >  __mutex_lock.constprop.0 from __mutex_lock_slowpath+0x1c/0x20
> >  r10:00000000 r9:c1882ae0 r8:c2854c90 r7:c2854c40 r6:00000001 r5:00000001
> >  r4:c2854c90
> >  __mutex_lock_slowpath from mutex_lock+0x60/0x64
> >  mutex_lock from lm90_read+0x40/0x3d4
> >  r5:00000001 r4:c2854e08
> >  lm90_read from hwmon_thermal_get_temp+0x58/0x8c
> >  r9:c1882ae0 r8:c2814b40 r7:de6aee00 r6:c1db1660 r5:c0af7940 r4:df9a1eb8
> >  hwmon_thermal_get_temp from of_thermal_get_temp+0x38/0x44
> >  r5:df9a1eb8 r4:c1db1400
> >  of_thermal_get_temp from thermal_zone_get_temp+0x58/0x78
> >  thermal_zone_get_temp from thermal_zone_device_update.part.0+0x4c/0x450
> >  r7:de6aee00 r6:c1db1400 r5:00000000 r4:c1db1400
> >  thermal_zone_device_update.part.0 from thermal_zone_device_check+0x58/0x5c
> >  r10:00000000 r9:c1882ae0 r8:c2814b40 r7:de6aee00 r6:c1db1400 r5:c1db1660
> >  r4:00000001
> >  thermal_zone_device_check from process_one_work+0x21c/0x530
> >  r7:de6aee00 r6:de6ab600 r5:c2802c00 r4:c1db167c
> >  process_one_work from worker_thread+0x19c/0x5cc
> >  r10:00000008 r9:c2814b40 r8:c1703d40 r7:de6ab61c r6:c2802c18 r5:de6ab600
> >  r4:c2802c00
> >  worker_thread from kthread+0x100/0x120
> >  r10:00000000 r9:df895e80 r8:c285e3c0 r7:c2802c00 r6:c014cf84 r5:c285e300
> >  r4:c2814b40
> >  kthread from ret_from_fork+0x14/0x2c
> > Exception stack(0xdf9a1fb0 to 0xdf9a1ff8)
> > 
> > > On 16/06/2022 22:25, Daniel Lezcano wrote:
> > >> The get_trend function does already what the generic framework does.
> > >>
> > >> Remove it.
> > >>
> > >> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> > >> ---
> > >>   drivers/thermal/tegra/soctherm.c | 32 --------------------------------
> > >>   1 file changed, 32 deletions(-)
> > >>
> > >> diff --git a/drivers/thermal/tegra/soctherm.c
> > >> b/drivers/thermal/tegra/soctherm.c
> > >> index 210325f92559..825eab526619 100644
> > >> --- a/drivers/thermal/tegra/soctherm.c
> > >> +++ b/drivers/thermal/tegra/soctherm.c
> > >> @@ -633,37 +633,6 @@ static int tegra_thermctl_set_trip_temp(void
> > >> *data, int trip, int temp)
> > >>       return 0;
> > >>   }
> > >>   -static int tegra_thermctl_get_trend(void *data, int trip,
> > >> -                    enum thermal_trend *trend)
> > >> -{
> > >> -    struct tegra_thermctl_zone *zone = data;
> > >> -    struct thermal_zone_device *tz = zone->tz;
> > >> -    int trip_temp, temp, last_temp, ret;
> > >> -
> > >> -    if (!tz)
> > >> -        return -EINVAL;
> > >> -
> > >> -    ret = tz->ops->get_trip_temp(zone->tz, trip, &trip_temp);
> > >> -    if (ret)
> > >> -        return ret;
> > >> -
> > >> -    temp = READ_ONCE(tz->temperature);
> > >> -    last_temp = READ_ONCE(tz->last_temperature);
> > >> -
> > >> -    if (temp > trip_temp) {
> > >> -        if (temp >= last_temp)
> > >> -            *trend = THERMAL_TREND_RAISING;
> > >> -        else
> > >> -            *trend = THERMAL_TREND_STABLE;
> > >> -    } else if (temp < trip_temp) {
> > >> -        *trend = THERMAL_TREND_DROPPING;
> > >> -    } else {
> > >> -        *trend = THERMAL_TREND_STABLE;
> > >> -    }
> > >> -
> > >> -    return 0;
> > >> -}
> > >> -
> > >>   static void thermal_irq_enable(struct tegra_thermctl_zone *zn)
> > >>   {
> > >>       u32 r;
> > >> @@ -716,7 +685,6 @@ static int tegra_thermctl_set_trips(void *data,
> > >> int lo, int hi)
> > >>   static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
> > >>       .get_temp = tegra_thermctl_get_temp,
> > >>       .set_trip_temp = tegra_thermctl_set_trip_temp,
> > >> -    .get_trend = tegra_thermctl_get_trend,
> > >>       .set_trips = tegra_thermctl_set_trips,
> > >>   };
> > >>   
> > > 
> > > 
> >
Dmitry Osipenko June 29, 2022, 9:35 a.m. UTC | #7
On 6/28/22 21:43, Guenter Roeck wrote:
> On Tue, Jun 28, 2022 at 08:10:30AM -0700, Guenter Roeck wrote:
>> On Tue, Jun 28, 2022 at 02:44:31PM +0300, Dmitry Osipenko wrote:
>>> On 6/28/22 11:41, Daniel Lezcano wrote:
>>>>
>>>> Thierry, Dmitry,
>>>>
>>>> are fine with this patch?
>>>
>>> Seems should be good. I couldn't test it using recent the linux-next
>>> because of a lockup in LM90 driver. There were quite a lot of changes in
>>> LM90 recently, adding Guenter.
>>>
>>
>> Weird, I tested those changes to death with real hardware, and I don't
>> see a code path where the mutex can be left in blocked state unless the
>> underlying i2c driver locks up for some reason. What is the platform,
>> and can you point me to the devicetree file ? Also, is there anything
>> else lm90 or i2c related in the kernel log ?
>>
> 
> Follow-up question: I see that various Tegra systems use lm90 compatible
> chips, and the interrupt line is in general wired up. Can you check if
> you get lots of interrupts on that interrupt line ? Also, can you check
> what happens if you read hwmon attributes directly ?

The number of interrupt fires is okay. It's a Nexus 7 Tegra30 tablet
device that I'm using for the testing.

Today I enabled the lockdep and it immediately showed where the problem is:

======================================================
WARNING: possible circular locking dependency detected
5.19.0-rc4-next-20220628-00002-g94e5dbbe1c58-dirty #24 Not tainted
------------------------------------------------------
irq/91-lm90/130 is trying to acquire lock:
c27ba380 (&tz->lock){+.+.}-{3:3}, at: thermal_zone_device_update+0x2c/0x64

               but task is already holding lock:
c27b42c8 (&data->update_lock){+.+.}-{3:3}, at: lm90_irq_thread+0x2c/0x68

               which lock already depends on the new lock.


               the existing dependency chain (in reverse order) is:

               -> #1 (&data->update_lock){+.+.}-{3:3}:
       __mutex_lock+0x9c/0x984
       mutex_lock_nested+0x2c/0x34
       lm90_read+0x44/0x3e8
       hwmon_thermal_get_temp+0x58/0x8c
       of_thermal_get_temp+0x38/0x44
       thermal_zone_get_temp+0x5c/0x7c
       thermal_zone_device_update.part.0+0x48/0x5fc
       thermal_zone_device_set_mode+0xa0/0xe4
       thermal_zone_device_enable+0x1c/0x20
       thermal_zone_of_sensor_register+0x18c/0x19c
       devm_thermal_zone_of_sensor_register+0x68/0xa4
       __hwmon_device_register+0x704/0x91c
       hwmon_device_register_with_info+0x6c/0x80
       devm_hwmon_device_register_with_info+0x78/0xb4
       lm90_probe+0x618/0x8c0
       i2c_device_probe+0x170/0x2e0
       really_probe+0xd8/0x300
       __driver_probe_device+0x94/0xf4
       driver_probe_device+0x40/0x118
       __device_attach_driver+0xc8/0x10c
       bus_for_each_drv+0x90/0xdc
       __device_attach+0xbc/0x1d4
       device_initial_probe+0x1c/0x20
       bus_probe_device+0x98/0xa0
       deferred_probe_work_func+0x8c/0xbc
       process_one_work+0x2b8/0x774
       worker_thread+0x17c/0x56c
       kthread+0x108/0x13c
       ret_from_fork+0x14/0x28
       0x0

               -> #0 (&tz->lock){+.+.}-{3:3}:
       __lock_acquire+0x173c/0x3198
       lock_acquire+0x128/0x3f0
       __mutex_lock+0x9c/0x984
       mutex_lock_nested+0x2c/0x34
       thermal_zone_device_update+0x2c/0x64
       hwmon_notify_event+0x128/0x138
       lm90_update_alarms_locked+0x35c/0x3b8
       lm90_irq_thread+0x38/0x68
       irq_thread_fn+0x2c/0x8c
       irq_thread+0x190/0x29c
       kthread+0x108/0x13c
       ret_from_fork+0x14/0x28
       0x0

               other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&data->update_lock);
                               lock(&tz->lock);
                               lock(&data->update_lock);
  lock(&tz->lock);

                *** DEADLOCK ***

1 lock held by irq/91-lm90/130:
 #0: c27b42c8 (&data->update_lock){+.+.}-{3:3}, at:
lm90_irq_thread+0x2c/0x68

               stack backtrace:
CPU: 1 PID: 130 Comm: irq/91-lm90 Not tainted
5.19.0-rc4-next-20220628-00002-g94e5dbbe1c58-dirty #24
Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
Backtrace:
 dump_backtrace from show_stack+0x20/0x24
 r7:c33d1b60 r6:00000080 r5:60000093 r4:c168c6a4
 show_stack from dump_stack_lvl+0x68/0x98
 dump_stack_lvl from dump_stack+0x18/0x1c
 r7:c33d1b60 r6:c20328cc r5:c203c700 r4:c20328cc
 dump_stack from print_circular_bug+0x2ec/0x33c
 print_circular_bug from check_noncircular+0x104/0x168
 r10:c1a14cc8 r9:c33d1240 r8:00000001 r7:00000000 r6:dfc3dcc0 r5:c33d1b60
 r4:c33d1b80
 check_noncircular from __lock_acquire+0x173c/0x3198
 r7:c33d1b80 r6:c202bc98 r5:c33d1b60 r4:c21d92ac
 __lock_acquire from lock_acquire+0x128/0x3f0
 r10:60000013 r9:00000000 r8:00000000 r7:00000000 r6:dfc3dd40 r5:c19ac688
 r4:c19ac688
 lock_acquire from __mutex_lock+0x9c/0x984
 r10:c27ba380 r9:00000000 r8:c21d92ac r7:c33d1240 r6:00000000 r5:00000000
 r4:c27ba348
 __mutex_lock from mutex_lock_nested+0x2c/0x34
 r10:c27b4000 r9:00000000 r8:dfc3de87 r7:00000000 r6:c27ba348 r5:00000000
 r4:c27ba000
 mutex_lock_nested from thermal_zone_device_update+0x2c/0x64
 thermal_zone_device_update from hwmon_notify_event+0x128/0x138
 r7:00000000 r6:00000000 r5:c2d23ea4 r4:c33fd040
 hwmon_notify_event from lm90_update_alarms_locked+0x35c/0x3b8
 r8:c27b4378 r7:c2d23c08 r6:00000020 r5:00000000 r4:c27b4240
 lm90_update_alarms_locked from lm90_irq_thread+0x38/0x68
 r9:c01c2814 r8:00000001 r7:c33d2240 r6:c27b4290 r5:c27b4240 r4:c33fc200
 lm90_irq_thread from irq_thread_fn+0x2c/0x8c
 r7:c33d2240 r6:c27b4000 r5:c33d1240 r4:c33fc200
 irq_thread_fn from irq_thread+0x190/0x29c
 r7:c33d2240 r6:c33fc224 r5:c33d1240 r4:00000000
 irq_thread from kthread+0x108/0x13c
 r10:00000000 r9:df9ddbf4 r8:c31d2200 r7:c33fc200 r6:c01c2710 r5:c33d1240
 r4:c33fc240
 kthread from ret_from_fork+0x14/0x28
Guenter Roeck June 29, 2022, 12:56 p.m. UTC | #8
On 6/29/22 02:35, Dmitry Osipenko wrote:
> On 6/28/22 21:43, Guenter Roeck wrote:
>> On Tue, Jun 28, 2022 at 08:10:30AM -0700, Guenter Roeck wrote:
>>> On Tue, Jun 28, 2022 at 02:44:31PM +0300, Dmitry Osipenko wrote:
>>>> On 6/28/22 11:41, Daniel Lezcano wrote:
>>>>>
>>>>> Thierry, Dmitry,
>>>>>
>>>>> are fine with this patch?
>>>>
>>>> Seems should be good. I couldn't test it using recent the linux-next
>>>> because of a lockup in LM90 driver. There were quite a lot of changes in
>>>> LM90 recently, adding Guenter.
>>>>
>>>
>>> Weird, I tested those changes to death with real hardware, and I don't
>>> see a code path where the mutex can be left in blocked state unless the
>>> underlying i2c driver locks up for some reason. What is the platform,
>>> and can you point me to the devicetree file ? Also, is there anything
>>> else lm90 or i2c related in the kernel log ?
>>>
>>
>> Follow-up question: I see that various Tegra systems use lm90 compatible
>> chips, and the interrupt line is in general wired up. Can you check if
>> you get lots of interrupts on that interrupt line ? Also, can you check
>> what happens if you read hwmon attributes directly ?
> 
> The number of interrupt fires is okay. It's a Nexus 7 Tegra30 tablet
> device that I'm using for the testing.
> 
> Today I enabled the lockdep and it immediately showed where the problem is:
> 

Yes, that is an obvious problem. Thanks a lot for testing!

Guenter

> ======================================================
> WARNING: possible circular locking dependency detected
> 5.19.0-rc4-next-20220628-00002-g94e5dbbe1c58-dirty #24 Not tainted
> ------------------------------------------------------
> irq/91-lm90/130 is trying to acquire lock:
> c27ba380 (&tz->lock){+.+.}-{3:3}, at: thermal_zone_device_update+0x2c/0x64
> 
>                 but task is already holding lock:
> c27b42c8 (&data->update_lock){+.+.}-{3:3}, at: lm90_irq_thread+0x2c/0x68
> 
>                 which lock already depends on the new lock.
> 
> 
>                 the existing dependency chain (in reverse order) is:
> 
>                 -> #1 (&data->update_lock){+.+.}-{3:3}:
>         __mutex_lock+0x9c/0x984
>         mutex_lock_nested+0x2c/0x34
>         lm90_read+0x44/0x3e8
>         hwmon_thermal_get_temp+0x58/0x8c
>         of_thermal_get_temp+0x38/0x44
>         thermal_zone_get_temp+0x5c/0x7c
>         thermal_zone_device_update.part.0+0x48/0x5fc
>         thermal_zone_device_set_mode+0xa0/0xe4
>         thermal_zone_device_enable+0x1c/0x20
>         thermal_zone_of_sensor_register+0x18c/0x19c
>         devm_thermal_zone_of_sensor_register+0x68/0xa4
>         __hwmon_device_register+0x704/0x91c
>         hwmon_device_register_with_info+0x6c/0x80
>         devm_hwmon_device_register_with_info+0x78/0xb4
>         lm90_probe+0x618/0x8c0
>         i2c_device_probe+0x170/0x2e0
>         really_probe+0xd8/0x300
>         __driver_probe_device+0x94/0xf4
>         driver_probe_device+0x40/0x118
>         __device_attach_driver+0xc8/0x10c
>         bus_for_each_drv+0x90/0xdc
>         __device_attach+0xbc/0x1d4
>         device_initial_probe+0x1c/0x20
>         bus_probe_device+0x98/0xa0
>         deferred_probe_work_func+0x8c/0xbc
>         process_one_work+0x2b8/0x774
>         worker_thread+0x17c/0x56c
>         kthread+0x108/0x13c
>         ret_from_fork+0x14/0x28
>         0x0
> 
>                 -> #0 (&tz->lock){+.+.}-{3:3}:
>         __lock_acquire+0x173c/0x3198
>         lock_acquire+0x128/0x3f0
>         __mutex_lock+0x9c/0x984
>         mutex_lock_nested+0x2c/0x34
>         thermal_zone_device_update+0x2c/0x64
>         hwmon_notify_event+0x128/0x138
>         lm90_update_alarms_locked+0x35c/0x3b8
>         lm90_irq_thread+0x38/0x68
>         irq_thread_fn+0x2c/0x8c
>         irq_thread+0x190/0x29c >         kthread+0x108/0x13c
>         ret_from_fork+0x14/0x28
>         0x0
> 
>                 other info that might help us debug this:
> 
>   Possible unsafe locking scenario:
> 
>         CPU0                    CPU1
>         ----                    ----
>    lock(&data->update_lock);
>                                 lock(&tz->lock);
>                                 lock(&data->update_lock);
>    lock(&tz->lock);
> 
>                  *** DEADLOCK ***
> 
> 1 lock held by irq/91-lm90/130:
>   #0: c27b42c8 (&data->update_lock){+.+.}-{3:3}, at:
> lm90_irq_thread+0x2c/0x68
> 
>                 stack backtrace:
> CPU: 1 PID: 130 Comm: irq/91-lm90 Not tainted
> 5.19.0-rc4-next-20220628-00002-g94e5dbbe1c58-dirty #24
> Hardware name: NVIDIA Tegra SoC (Flattened Device Tree)
> Backtrace:
>   dump_backtrace from show_stack+0x20/0x24
>   r7:c33d1b60 r6:00000080 r5:60000093 r4:c168c6a4
>   show_stack from dump_stack_lvl+0x68/0x98
>   dump_stack_lvl from dump_stack+0x18/0x1c
>   r7:c33d1b60 r6:c20328cc r5:c203c700 r4:c20328cc
>   dump_stack from print_circular_bug+0x2ec/0x33c
>   print_circular_bug from check_noncircular+0x104/0x168
>   r10:c1a14cc8 r9:c33d1240 r8:00000001 r7:00000000 r6:dfc3dcc0 r5:c33d1b60
>   r4:c33d1b80
>   check_noncircular from __lock_acquire+0x173c/0x3198
>   r7:c33d1b80 r6:c202bc98 r5:c33d1b60 r4:c21d92ac
>   __lock_acquire from lock_acquire+0x128/0x3f0
>   r10:60000013 r9:00000000 r8:00000000 r7:00000000 r6:dfc3dd40 r5:c19ac688
>   r4:c19ac688
>   lock_acquire from __mutex_lock+0x9c/0x984
>   r10:c27ba380 r9:00000000 r8:c21d92ac r7:c33d1240 r6:00000000 r5:00000000
>   r4:c27ba348
>   __mutex_lock from mutex_lock_nested+0x2c/0x34
>   r10:c27b4000 r9:00000000 r8:dfc3de87 r7:00000000 r6:c27ba348 r5:00000000
>   r4:c27ba000
>   mutex_lock_nested from thermal_zone_device_update+0x2c/0x64
>   thermal_zone_device_update from hwmon_notify_event+0x128/0x138
>   r7:00000000 r6:00000000 r5:c2d23ea4 r4:c33fd040
>   hwmon_notify_event from lm90_update_alarms_locked+0x35c/0x3b8
>   r8:c27b4378 r7:c2d23c08 r6:00000020 r5:00000000 r4:c27b4240
>   lm90_update_alarms_locked from lm90_irq_thread+0x38/0x68
>   r9:c01c2814 r8:00000001 r7:c33d2240 r6:c27b4290 r5:c27b4240 r4:c33fc200
>   lm90_irq_thread from irq_thread_fn+0x2c/0x8c
>   r7:c33d2240 r6:c27b4000 r5:c33d1240 r4:c33fc200
>   irq_thread_fn from irq_thread+0x190/0x29c
>   r7:c33d2240 r6:c33fc224 r5:c33d1240 r4:00000000
>   irq_thread from kthread+0x108/0x13c
>   r10:00000000 r9:df9ddbf4 r8:c31d2200 r7:c33fc200 r6:c01c2710 r5:c33d1240
>   r4:c33fc240
>   kthread from ret_from_fork+0x14/0x28
>
Dmitry Osipenko June 29, 2022, 8:05 p.m. UTC | #9
On 6/16/22 23:25, Daniel Lezcano wrote:
> The get_trend function does already what the generic framework does.
> 
> Remove it.
> 
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> ---
>  drivers/thermal/tegra/soctherm.c | 32 --------------------------------
>  1 file changed, 32 deletions(-)
> 
> diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c
> index 210325f92559..825eab526619 100644
> --- a/drivers/thermal/tegra/soctherm.c
> +++ b/drivers/thermal/tegra/soctherm.c
> @@ -633,37 +633,6 @@ static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp)
>  	return 0;
>  }
>  
> -static int tegra_thermctl_get_trend(void *data, int trip,
> -				    enum thermal_trend *trend)
> -{
> -	struct tegra_thermctl_zone *zone = data;
> -	struct thermal_zone_device *tz = zone->tz;
> -	int trip_temp, temp, last_temp, ret;
> -
> -	if (!tz)
> -		return -EINVAL;
> -
> -	ret = tz->ops->get_trip_temp(zone->tz, trip, &trip_temp);
> -	if (ret)
> -		return ret;
> -
> -	temp = READ_ONCE(tz->temperature);
> -	last_temp = READ_ONCE(tz->last_temperature);
> -
> -	if (temp > trip_temp) {
> -		if (temp >= last_temp)
> -			*trend = THERMAL_TREND_RAISING;
> -		else
> -			*trend = THERMAL_TREND_STABLE;
> -	} else if (temp < trip_temp) {
> -		*trend = THERMAL_TREND_DROPPING;
> -	} else {
> -		*trend = THERMAL_TREND_STABLE;
> -	}
> -
> -	return 0;
> -}
> -
>  static void thermal_irq_enable(struct tegra_thermctl_zone *zn)
>  {
>  	u32 r;
> @@ -716,7 +685,6 @@ static int tegra_thermctl_set_trips(void *data, int lo, int hi)
>  static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
>  	.get_temp = tegra_thermctl_get_temp,
>  	.set_trip_temp = tegra_thermctl_set_trip_temp,
> -	.get_trend = tegra_thermctl_get_trend,
>  	.set_trips = tegra_thermctl_set_trips,
>  };
>  

Guenter fixed the LM90 driver problem. There are other regressions in
the latest -next which complicate testing, but I can't see any problems
from the thermal side.

Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Daniel Lezcano June 30, 2022, 10:17 a.m. UTC | #10
On 29/06/2022 22:05, Dmitry Osipenko wrote:
> On 6/16/22 23:25, Daniel Lezcano wrote:
>> The get_trend function does already what the generic framework does.
>>
>> Remove it.
>>
>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
>> ---

[ ... ]

>>   static void thermal_irq_enable(struct tegra_thermctl_zone *zn)
>>   {
>>   	u32 r;
>> @@ -716,7 +685,6 @@ static int tegra_thermctl_set_trips(void *data, int lo, int hi)
>>   static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
>>   	.get_temp = tegra_thermctl_get_temp,
>>   	.set_trip_temp = tegra_thermctl_set_trip_temp,
>> -	.get_trend = tegra_thermctl_get_trend,
>>   	.set_trips = tegra_thermctl_set_trips,
>>   };
>>   
> 
> Guenter fixed the LM90 driver problem. There are other regressions in
> the latest -next which complicate testing, but I can't see any problems
> from the thermal side.
> 
> Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>

Great! Thanks for taking the time to test
diff mbox series

Patch

diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c
index 210325f92559..825eab526619 100644
--- a/drivers/thermal/tegra/soctherm.c
+++ b/drivers/thermal/tegra/soctherm.c
@@ -633,37 +633,6 @@  static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp)
 	return 0;
 }
 
-static int tegra_thermctl_get_trend(void *data, int trip,
-				    enum thermal_trend *trend)
-{
-	struct tegra_thermctl_zone *zone = data;
-	struct thermal_zone_device *tz = zone->tz;
-	int trip_temp, temp, last_temp, ret;
-
-	if (!tz)
-		return -EINVAL;
-
-	ret = tz->ops->get_trip_temp(zone->tz, trip, &trip_temp);
-	if (ret)
-		return ret;
-
-	temp = READ_ONCE(tz->temperature);
-	last_temp = READ_ONCE(tz->last_temperature);
-
-	if (temp > trip_temp) {
-		if (temp >= last_temp)
-			*trend = THERMAL_TREND_RAISING;
-		else
-			*trend = THERMAL_TREND_STABLE;
-	} else if (temp < trip_temp) {
-		*trend = THERMAL_TREND_DROPPING;
-	} else {
-		*trend = THERMAL_TREND_STABLE;
-	}
-
-	return 0;
-}
-
 static void thermal_irq_enable(struct tegra_thermctl_zone *zn)
 {
 	u32 r;
@@ -716,7 +685,6 @@  static int tegra_thermctl_set_trips(void *data, int lo, int hi)
 static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
 	.get_temp = tegra_thermctl_get_temp,
 	.set_trip_temp = tegra_thermctl_set_trip_temp,
-	.get_trend = tegra_thermctl_get_trend,
 	.set_trips = tegra_thermctl_set_trips,
 };