Message ID | 1518005211-2843-2-git-send-email-gavin.guo@canonical.com |
---|---|
State | New |
Series | [SRU,Xenial] sched/fair: Fix O(nr_cgroups) in load balance path |
On 07.02.2018 13:06, Gavin Guo wrote:
> From: Tejun Heo <tj@kernel.org>
>
> BugLink: https://bugs.launchpad.net/bugs/1747896
>
> Currently, rq->leaf_cfs_rq_list is a traversal ordered list of all
> live cfs_rqs which have ever been active on the CPU; unfortunately,
> this makes update_blocked_averages() O(# total cgroups) which isn't
> scalable at all.
>
> This shows up as a small CPU consumption and scheduling latency
> increase in the load balancing path in systems with CPU controller
> enabled across most cgroups. In an edge case where temporary cgroups
> were leaking, this caused the kernel to consume good several tens of
> percents of CPU cycles running update_blocked_averages(), each run
> taking multiple millisecs.
>
> This patch fixes the issue by taking empty and fully decayed cfs_rqs
> off the rq->leaf_cfs_rq_list.
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> [ Added cfs_rq_is_decayed() ]
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
> Cc: Chris Mason <clm@fb.com>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Mike Galbraith <efault@gmx.de>
> Cc: Paul Turner <pjt@google.com>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Link: http://lkml.kernel.org/r/20170426004350.GB3222@wtj.duckdns.org
> Signed-off-by: Ingo Molnar <mingo@kernel.org>
>
> (backported from commit a9e7f6544b9cebdae54d29f87a7ba2a83c0471b5)
> Signed-off-by: Gavin Guo <gavin.guo@canonical.com>

Acked-by: Stefan Bader <stefan.bader@canonical.com>

> ---

Positive testing.

>  kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 39 insertions(+), 12 deletions(-)
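The core of the change is that update_blocked_averages() may now unlink the cfs_rq it is currently visiting, which is only safe with an iterator that remembers the next element before the loop body runs; hence the plain RCU list walk is replaced by for_each_leaf_cfs_rq_safe() in the patch below. The following stand-alone user-space sketch only illustrates that pattern and the pruning of fully decayed entries; the fake_cfs_rq structure and is_decayed() criterion are invented for the illustration and are not the kernel implementation.

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for a leaf cfs_rq on rq->leaf_cfs_rq_list (hypothetical). */
struct fake_cfs_rq {
	unsigned long load_sum;		/* stand-in for the decayed-average fields */
	struct fake_cfs_rq *next;
};

static struct fake_cfs_rq *leaf_list;

/* Rough analogue of cfs_rq_is_decayed(): nothing left to contribute to load. */
static int is_decayed(const struct fake_cfs_rq *rq)
{
	return rq->load_sum == 0;
}

static void add_rq(unsigned long load_sum)
{
	struct fake_cfs_rq *rq = malloc(sizeof(*rq));

	rq->load_sum = load_sum;
	rq->next = leaf_list;
	leaf_list = rq;
}

int main(void)
{
	struct fake_cfs_rq *rq, *pos, **link;
	unsigned long i;

	for (i = 0; i < 6; i++)
		add_rq(i % 2 ? i : 0);	/* every other entry is "fully decayed" */

	/*
	 * Safe traversal: save the next element ("pos") before the body runs,
	 * so unlinking and freeing the current one cannot break iteration.
	 * This mirrors what for_each_leaf_cfs_rq_safe() provides in the patch.
	 */
	link = &leaf_list;
	for (rq = leaf_list; rq; rq = pos) {
		pos = rq->next;

		if (is_decayed(rq)) {
			*link = pos;	/* drop the decayed entry from the list */
			free(rq);
			continue;
		}
		link = &rq->next;
	}

	/* Only entries that still carry load remain on the list. */
	for (rq = leaf_list; rq; rq = rq->next)
		printf("remaining cfs_rq with load_sum=%lu\n", rq->load_sum);

	return 0;
}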
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c1e78e9c8f05..0c711cb4fbe8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -314,8 +314,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 }
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
+	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
+				 leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -408,8 +409,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
+	for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -4065,9 +4066,9 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
 
 		raw_spin_lock(&cfs_b->lock);
@@ -4078,9 +4079,9 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
 
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
 		if (!cfs_rq->runtime_enabled)
 			continue;
 
@@ -5966,10 +5967,28 @@ static void attach_tasks(struct lb_env *env)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load.weight)
+		return false;
+
+	if (cfs_rq->avg.load_sum)
+		return false;
+
+	if (cfs_rq->avg.util_sum)
+		return false;
+
+	if (cfs_rq->runnable_load_sum)
+		return false;
+
+	return true;
+}
+
 static void update_blocked_averages(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
@@ -5979,13 +5998,21 @@ static void update_blocked_averages(int cpu)
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
 	 */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
+	for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+
 		/* throttled entities do not contribute to load */
 		if (throttled_hierarchy(cfs_rq))
 			continue;
 
 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
 			update_tg_load_avg(cfs_rq, 0);
+
+		/*
+		 * There can be a lot of idle CPU cgroups. Don't let fully
+		 * decayed cfs_rqs linger on the list.
+		 */
+		if (cfs_rq_is_decayed(cfs_rq))
+			list_del_leaf_cfs_rq(cfs_rq);
 	}
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -8386,10 +8413,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *pos;
 
 	rcu_read_lock();
-	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
+	for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
 		print_cfs_rq(m, cpu, cfs_rq);
 	rcu_read_unlock();
 }
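A side note on the !CONFIG_FAIR_GROUP_SCHED variant of the macro above: with group scheduling disabled there is only the root cfs_rq, so the replacement loop initializes pos to NULL and advances with cfs_rq = pos, which makes the body run exactly once. A minimal stand-alone sketch of that loop shape (toy variables, not the kernel types) follows.

#include <stdio.h>

int main(void)
{
	int root = 42;		/* stand-in for rq->cfs, the root cfs_rq */
	int *cfs_rq, *pos;

	/*
	 * Same shape as the !CONFIG_FAIR_GROUP_SCHED macro expansion:
	 * the first pass visits &root, then cfs_rq = pos = NULL ends the loop.
	 */
	for (cfs_rq = &root, pos = NULL; cfs_rq; cfs_rq = pos)
		printf("visited the root cfs_rq once: %d\n", *cfs_rq);

	return 0;
}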