From 9b5e097643d29e2ea43f4e4b64d1e5c3174fc749 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Sat, 17 Aug 2024 19:34:55 +0300 Subject: [PATCH 01/30] R6.4 --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 32cb41c6bcd6..7b5c09c6c83b 100755 --- a/build.sh +++ b/build.sh @@ -15,7 +15,7 @@ export THINLTO_CACHE=~/ltocache/ DEFCONFIG="raphael_defconfig" # Kernel Details -REV="R6.3" +REV="R6.4" EDITION="BLACK" VER="$EDITION"-"$REV" From f67b4d365ee52cc44eec595b4eb1b2a62c3bedac Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 4 Aug 2024 11:29:57 +0300 Subject: [PATCH 02/30] cpufreq: sched/schedutil: Remove LATENCY_MULTIPLIER The current LATENCY_MULTIPLIER which has been around for nearly 20 years causes rate_limit_us to be always in ms range. On M1 mac mini I get 50 and 56us transition latency, but due to the 1000 multiplier we end up setting rate_limit_us to 50 and 56ms, which gets capped into 2ms and was 10ms before e13aa799c2a6 ("cpufreq: Change default transition delay to 2ms") On Intel I5 system transition latency is 20us but due to the multiplier we end up with 20ms that again is capped to 2ms. Given how good modern hardware and how modern workloads require systems to be more responsive to cater for sudden changes in workload (tasks sleeping/wakeup/migrating, uclamp causing a sudden boost or cap) and that 2ms is quarter of the time of 120Hz refresh rate system, drop the old logic in favour of providing 50% headroom. rate_limit_us = 1.5 * latency. I considered not adding any headroom which could mean that we can end up with infinite back-to-back requests. I also considered providing a constant headroom (e.g: 100us) assuming that any h/w or f/w dealing with the request shouldn't require a large headroom when transition_latency is actually high. But for both cases I wasn't sure if h/w or f/w can end up being overwhelmed dealing with the freq requests in a potentially busy system. 
So I opted for providing 50% breathing room. This is expected to impact schedutil only as the other user, dbs_governor, takes the max(2*tick, transition_delay_us) and the former was at least 2ms on 1ms TICK, which is equivalent to the max_delay_us before applying this patch. For systems with TICK of 4ms, this value would have almost always ended up with 8ms sampling rate. For systems that report 0 transition latency, we still default to returning 1ms as transition delay. This helps in eliminating a source of latency for applying requests as mentioned in [1]. For example if we have a 1ms tick, most systems will miss sending an update at tick when updating the util_avg for a task/CPU (rate_limit_us will be 2ms for most systems). [1] https://lore.kernel.org/lkml/20240724212255.mfr2ybiv2j2uqek7@airbuntu/ Signed-off-by: Qais Yousef --- drivers/cpufreq/cpufreq.c | 18 ++++-------------- include/linux/cpufreq.h | 8 -------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 1992db8bda2c..c7bb93877d29 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -537,21 +537,11 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy) return policy->transition_delay_us; latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (latency) { - /* - * For platforms that can change the frequency very fast (< 10 - * us), the above formula gives a decent transition delay. But - * for platforms where transition_latency is in milliseconds, it - * ends up giving unrealistic values. - * - * Cap the default transition delay to 10 ms, which seems to be - * a reasonable amount of time after which we should reevaluate - * the frequency. 
- */ - return min(latency * LATENCY_MULTIPLIER, (unsigned int)10000); - } + if (latency) + /* Give a 50% breathing room between updates */ + return latency + (latency >> 1); - return LATENCY_MULTIPLIER; + return USEC_PER_MSEC; } EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 84188e180a2d..40e1fdee07f4 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -493,14 +493,6 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, #define CPUFREQ_POLICY_POWERSAVE (1) #define CPUFREQ_POLICY_PERFORMANCE (2) -/* - * The polling frequency depends on the capability of the processor. Default - * polling frequency is 1000 times the transition latency of the processor. The - * ondemand governor will work on any processor with transition latency <= 10ms, - * using appropriate sampling rate. - */ -#define LATENCY_MULTIPLIER (1000) - struct cpufreq_governor { char name[CPUFREQ_NAME_LEN]; int (*init)(struct cpufreq_policy *policy); From 8f6e4918d3418f3f0a76a3a8940bca823addf924 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Fri, 23 Aug 2024 18:32:12 +0300 Subject: [PATCH 03/30] Revert "cpufreq: stats: replace the global lock with atomic" This reverts commit 5056373df84ba21294efd178d9092071525c7f87. 
--- drivers/cpufreq/cpufreq_stats.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 1284bc003031..f32a88d57f88 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -14,13 +14,15 @@ #include #include +static DEFINE_SPINLOCK(cpufreq_stats_lock); + struct cpufreq_stats { unsigned int total_trans; - atomic64_t last_time; + unsigned long long last_time; unsigned int max_state; unsigned int state_num; unsigned int last_index; - atomic64_t *time_in_state; + u64 *time_in_state; unsigned int *freq_table; unsigned int *trans_table; }; @@ -28,19 +30,21 @@ struct cpufreq_stats { static void cpufreq_stats_update(struct cpufreq_stats *stats) { unsigned long long cur_time = get_jiffies_64(); - unsigned long long time = cur_time; + unsigned long flags; - time = atomic64_xchg(&stats->last_time, time); - atomic64_add(cur_time - time, &stats->time_in_state[stats->last_index]); + spin_lock_irqsave(&cpufreq_stats_lock, flags); + stats->time_in_state[stats->last_index] += cur_time - stats->last_time; + stats->last_time = cur_time; + spin_unlock_irqrestore(&cpufreq_stats_lock, flags); } static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; - memset(stats->time_in_state, 0, count * sizeof(atomic64_t)); + memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); - atomic64_set(&stats->last_time, get_jiffies_64()); + stats->last_time = get_jiffies_64(); stats->total_trans = 0; } @@ -59,8 +63,7 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) for (i = 0; i < stats->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], (unsigned long long) - jiffies_64_to_clock_t(atomic64_read( - &stats->time_in_state[i]))); + jiffies_64_to_clock_t(stats->time_in_state[i])); } return len; } @@ 
-178,7 +181,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) if (!stats) return; - alloc_size = count * sizeof(int) + count * sizeof(atomic64_t); + alloc_size = count * sizeof(int) + count * sizeof(u64); alloc_size += count * count * sizeof(int); @@ -199,7 +202,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->freq_table[i++] = pos->frequency; stats->state_num = i; - atomic64_set(&stats->last_time, get_jiffies_64()); + stats->last_time = get_jiffies_64(); stats->last_index = freq_table_get_index(stats, policy->cur); policy->stats = stats; From ff33b53a182245508593a15c96435bb729df4437 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Fri, 23 Aug 2024 18:38:58 +0300 Subject: [PATCH 04/30] Revert "cpufreq: record CPUFREQ stat for fast switch path" This reverts commit eb4660ce1f8bf5c18ecc11e0eefd1444dabb160d. # Conflicts: # drivers/cpufreq/cpufreq_stats.c --- drivers/cpufreq/cpufreq.c | 4 +--- drivers/cpufreq/cpufreq_stats.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c7bb93877d29..f319f7b4bd30 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1871,10 +1871,8 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, target_freq = clamp_val(target_freq, policy->min, policy->max); ret = cpufreq_driver->fast_switch(policy, target_freq); - if (ret) { + if (ret) cpufreq_times_record_transition(policy, ret); - cpufreq_stats_record_transition(policy, ret); - } return ret; } diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index f32a88d57f88..5b8677e89063 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -30,12 +30,11 @@ struct cpufreq_stats { static void cpufreq_stats_update(struct cpufreq_stats *stats) { unsigned long long cur_time = get_jiffies_64(); - unsigned long flags; - spin_lock_irqsave(&cpufreq_stats_lock, flags); + 
spin_lock(&cpufreq_stats_lock); stats->time_in_state[stats->last_index] += cur_time - stats->last_time; stats->last_time = cur_time; - spin_unlock_irqrestore(&cpufreq_stats_lock, flags); + spin_unlock(&cpufreq_stats_lock); } static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) @@ -59,6 +58,9 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) ssize_t len = 0; int i; + if (policy->fast_switch_enabled) + return 0; + cpufreq_stats_update(stats); for (i = 0; i < stats->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], @@ -82,6 +84,9 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) ssize_t len = 0; int i, j; + if (policy->fast_switch_enabled) + return 0; + len += scnprintf(buf + len, PAGE_SIZE - len, " From : To\n"); len += scnprintf(buf + len, PAGE_SIZE - len, " : "); for (i = 0; i < stats->state_num; i++) { From cbbed7e99468f3375bee586284354669efdf71d4 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 1 Feb 2019 11:45:44 +0530 Subject: [PATCH 05/30] cpufreq: stats: Declare freq-attr right after their callbacks Freq attribute for "trans_table" is defined right after its callback (without any blank line between them), but the others are defined separately later on. Keep this consistent and define all attributes right after their callbacks. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq_stats.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 5b8677e89063..291883ce664f 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -51,6 +51,7 @@ static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { return sprintf(buf, "%d\n", policy->stats->total_trans); } +cpufreq_freq_attr_ro(total_trans); static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) { @@ -69,6 +70,7 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) } return len; } +cpufreq_freq_attr_ro(time_in_state); static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, size_t count) @@ -77,6 +79,7 @@ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, cpufreq_stats_clear_table(policy->stats); return count; } +cpufreq_freq_attr_wo(reset); static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { @@ -126,10 +129,6 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) } cpufreq_freq_attr_ro(trans_table); -cpufreq_freq_attr_ro(total_trans); -cpufreq_freq_attr_ro(time_in_state); -cpufreq_freq_attr_wo(reset); - static struct attribute *default_attrs[] = { &total_trans.attr, &time_in_state.attr, From fa24b0591778600d83eb09c94d0a006b34076520 Mon Sep 17 00:00:00 2001 From: Eva Huang Date: Fri, 29 Jul 2022 11:43:47 +0800 Subject: [PATCH 06/30] msm: gsi: remove the WARN_ON to avoid panic_on_warn issue on debug build. Refer to QC case 06009586. 
Bug: 201209987 Signed-off-by: Eva Huang Change-Id: Ifb8089a9d288932f18a26606567e48d1dcf5eaf2 --- drivers/platform/msm/gsi/gsi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/platform/msm/gsi/gsi.c b/drivers/platform/msm/gsi/gsi.c index 81b54bd38a2a..7b339e3643a6 100644 --- a/drivers/platform/msm/gsi/gsi.c +++ b/drivers/platform/msm/gsi/gsi.c @@ -581,7 +581,6 @@ static void gsi_process_chan(struct gsi_xfer_compl_evt *evt, if (callback) { if (unlikely(atomic_read(&ch_ctx->poll_mode))) { GSIERR("Calling client callback in polling mode\n"); - WARN_ON(1); } ch_ctx->props.xfer_cb(notify); } From 53ae6a505535c23bf67ebd5aa3eae6fc6c55edeb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 1 Feb 2019 11:45:45 +0530 Subject: [PATCH 07/30] cpufreq: stats: Fix concurrency issues while resetting stats It is possible for cpufreq_stats_clear_table() and cpufreq_stats_record_transition() to get called concurrently and they will try to update same variables simultaneously and may lead to corruption of data. Prevent that with the help of existing spinlock. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq_stats.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 291883ce664f..62ad56262df9 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -31,20 +31,20 @@ static void cpufreq_stats_update(struct cpufreq_stats *stats) { unsigned long long cur_time = get_jiffies_64(); - spin_lock(&cpufreq_stats_lock); stats->time_in_state[stats->last_index] += cur_time - stats->last_time; stats->last_time = cur_time; - spin_unlock(&cpufreq_stats_lock); } static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; + spin_lock(&cpufreq_stats_lock); memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); stats->total_trans = 0; + spin_unlock(&cpufreq_stats_lock); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) @@ -62,7 +62,10 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) if (policy->fast_switch_enabled) return 0; + spin_lock(&cpufreq_stats_lock); cpufreq_stats_update(stats); + spin_unlock(&cpufreq_stats_lock); + for (i = 0; i < stats->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], (unsigned long long) @@ -239,9 +242,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, if (unlikely(old_index == -1 || new_index == -1 || old_index == new_index)) return; + spin_lock(&cpufreq_stats_lock); cpufreq_stats_update(stats); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; stats->total_trans++; + spin_unlock(&cpufreq_stats_lock); } From b828add3e575e5675b1ec4b8ae6504d1578924f4 Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Tue, 9 Apr 2019 16:43:04 +0800 Subject: [PATCH 08/30] cpufreq: stats: Use lock by stat to replace global spin lock 
Stats is updated by each policy, using the lock by stat can reduce the contention. Signed-off-by: Kyle Lin Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 62ad56262df9..9ad15a9236ab 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -14,7 +14,6 @@ #include #include -static DEFINE_SPINLOCK(cpufreq_stats_lock); struct cpufreq_stats { unsigned int total_trans; @@ -23,6 +22,7 @@ struct cpufreq_stats { unsigned int state_num; unsigned int last_index; u64 *time_in_state; + spinlock_t lock; unsigned int *freq_table; unsigned int *trans_table; }; @@ -39,12 +39,12 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; - spin_lock(&cpufreq_stats_lock); + spin_lock(&stats->lock); memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); stats->total_trans = 0; - spin_unlock(&cpufreq_stats_lock); + spin_unlock(&stats->lock); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) @@ -62,9 +62,9 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) if (policy->fast_switch_enabled) return 0; - spin_lock(&cpufreq_stats_lock); + spin_lock(&stats->lock); cpufreq_stats_update(stats); - spin_unlock(&cpufreq_stats_lock); + spin_unlock(&stats->lock); for (i = 0; i < stats->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], @@ -211,6 +211,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->state_num = i; stats->last_time = get_jiffies_64(); stats->last_index = freq_table_get_index(stats, policy->cur); + spin_lock_init(&stats->lock); policy->stats = stats; ret = sysfs_create_group(&policy->kobj, &stats_attr_group); @@ 
-242,11 +243,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, if (unlikely(old_index == -1 || new_index == -1 || old_index == new_index)) return; - spin_lock(&cpufreq_stats_lock); + spin_lock(&stats->lock); cpufreq_stats_update(stats); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; stats->total_trans++; - spin_unlock(&cpufreq_stats_lock); + spin_unlock(&stats->lock); } From c0036f28dcd1b3c5c8ac6bf4153069408d4526e3 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Fri, 23 Aug 2024 18:44:53 +0300 Subject: [PATCH 09/30] Revert "cpufreq: stats: Mark few conditionals with unlikely()" This reverts commit 54a0849d7a6c52280bd28d66fb9a520ec004b561. --- drivers/cpufreq/cpufreq_stats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 9ad15a9236ab..8f95b9fe257f 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -231,7 +231,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, struct cpufreq_stats *stats = policy->stats; int old_index, new_index; - if (unlikely(!stats)) { + if (!stats) { pr_debug("%s: No stats found\n", __func__); return; } @@ -240,7 +240,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, new_index = freq_table_get_index(stats, new_freq); /* We can't do stats->time_in_state[-1]= .. */ - if (unlikely(old_index == -1 || new_index == -1 || old_index == new_index)) + if (old_index == -1 || new_index == -1 || old_index == new_index) return; spin_lock(&stats->lock); From 57a2f4a76463ab8dbc2315d78f6ae11b6a4cf254 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 5 Oct 2020 13:26:01 +0530 Subject: [PATCH 10/30] cpufreq: stats: Defer stats update to cpufreq_stats_record_transition() In order to prepare for lock-less stats update, add support to defer any updates to it until cpufreq_stats_record_transition() is called. 
The stats were updated from two places earlier: - show_time_in_state(): This can be easily deferred, all we need is to calculate the delta duration again in this routine to show the current state's time-in-state. - store_reset(): This is a bit tricky as we need to clear the stats here and avoid races with simultaneous call to cpufreq_stats_record_transition(). Fix that by deferring the reset of the stats (within the code) to the next call to cpufreq_stats_record_transition(), but since we need to keep showing the right stats until that time, we capture the reset time and account for the time since last time reset was called until the time cpufreq_stats_record_transition() update the stats. User space will continue seeing the stats correctly, everything will be 0 after the stats are reset, apart from the time-in-state of the current state, until the time a frequency switch happens. Signed-off-by: Viresh Kumar [ rjw: Minor changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 75 ++++++++++++++++++++++++--------- 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 8f95b9fe257f..e1566dbea59f 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -25,17 +25,22 @@ struct cpufreq_stats { spinlock_t lock; unsigned int *freq_table; unsigned int *trans_table; + + /* Deferred reset */ + unsigned int reset_pending; + unsigned long long reset_time; }; -static void cpufreq_stats_update(struct cpufreq_stats *stats) +static void cpufreq_stats_update(struct cpufreq_stats *stats, + unsigned long long time) { unsigned long long cur_time = get_jiffies_64(); - stats->time_in_state[stats->last_index] += cur_time - stats->last_time; + stats->time_in_state[stats->last_index] += cur_time - time; stats->last_time = cur_time; } -static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) +static void cpufreq_stats_reset_table(struct 
cpufreq_stats *stats) { unsigned int count = stats->max_state; @@ -44,42 +49,67 @@ static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); stats->total_trans = 0; + + /* Adjust for the time elapsed since reset was requested */ + WRITE_ONCE(stats->reset_pending, 0); + cpufreq_stats_update(stats, READ_ONCE(stats->reset_time)); spin_unlock(&stats->lock); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { - return sprintf(buf, "%d\n", policy->stats->total_trans); + struct cpufreq_stats *stats = policy->stats; + + if (READ_ONCE(stats->reset_pending)) + return sprintf(buf, "%d\n", 0); + else + return sprintf(buf, "%d\n", stats->total_trans); } cpufreq_freq_attr_ro(total_trans); static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) { struct cpufreq_stats *stats = policy->stats; + bool pending = READ_ONCE(stats->reset_pending); + unsigned long long time; ssize_t len = 0; int i; if (policy->fast_switch_enabled) return 0; - spin_lock(&stats->lock); - cpufreq_stats_update(stats); - spin_unlock(&stats->lock); - for (i = 0; i < stats->state_num; i++) { + if (pending) { + if (i == stats->last_index) + time = get_jiffies_64() - READ_ONCE(stats->reset_time); + else + time = 0; + } else { + time = stats->time_in_state[i]; + if (i == stats->last_index) + time += get_jiffies_64() - stats->last_time; + } + len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], - (unsigned long long) - jiffies_64_to_clock_t(stats->time_in_state[i])); + jiffies_64_to_clock_t(time)); } return len; } cpufreq_freq_attr_ro(time_in_state); +/* We don't care what is written to the attribute */ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, size_t count) { - /* We don't care what is written to the attribute. 
*/ - cpufreq_stats_clear_table(policy->stats); + struct cpufreq_stats *stats = policy->stats; + + /* + * Defer resetting of stats to cpufreq_stats_record_transition() to + * avoid races. + */ + WRITE_ONCE(stats->reset_time, get_jiffies_64()); + WRITE_ONCE(stats->reset_pending, 1); + return count; } cpufreq_freq_attr_wo(reset); @@ -87,8 +117,9 @@ cpufreq_freq_attr_wo(reset); static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { struct cpufreq_stats *stats = policy->stats; + bool pending = READ_ONCE(stats->reset_pending); ssize_t len = 0; - int i, j; + int i, j, count; if (policy->fast_switch_enabled) return 0; @@ -116,8 +147,13 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) for (j = 0; j < stats->state_num; j++) { if (len >= PAGE_SIZE) break; - len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", - stats->trans_table[i*stats->max_state+j]); + + if (pending) + count = 0; + else + count = stats->trans_table[i * stats->max_state + j]; + + len += scnprintf(buf + len, PAGE_SIZE - len, "%9u ", count); } if (len >= PAGE_SIZE) break; @@ -231,10 +267,11 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, struct cpufreq_stats *stats = policy->stats; int old_index, new_index; - if (!stats) { - pr_debug("%s: No stats found\n", __func__); + if (!stats) return; - } + + if (unlikely(READ_ONCE(stats->reset_pending))) + cpufreq_stats_reset_table(stats); old_index = stats->last_index; new_index = freq_table_get_index(stats, new_freq); @@ -244,7 +281,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, return; spin_lock(&stats->lock); - cpufreq_stats_update(stats); + cpufreq_stats_update(stats, stats->last_time); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; From a4b305d240f5f4976c8bc771eb4572b056826bb8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 5 Oct 2020 13:26:02 +0530 Subject: [PATCH 11/30] cpufreq: stats: Remove locking The locking 
isn't required anymore as stats can get updated only from one place at a time. Remove it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index e1566dbea59f..e4bf444a2773 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -22,7 +22,6 @@ struct cpufreq_stats { unsigned int state_num; unsigned int last_index; u64 *time_in_state; - spinlock_t lock; unsigned int *freq_table; unsigned int *trans_table; @@ -44,7 +43,6 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) { unsigned int count = stats->max_state; - spin_lock(&stats->lock); memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); stats->last_time = get_jiffies_64(); @@ -53,7 +51,6 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) /* Adjust for the time elapsed since reset was requested */ WRITE_ONCE(stats->reset_pending, 0); cpufreq_stats_update(stats, READ_ONCE(stats->reset_time)); - spin_unlock(&stats->lock); } static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) @@ -247,7 +244,6 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->state_num = i; stats->last_time = get_jiffies_64(); stats->last_index = freq_table_get_index(stats, policy->cur); - spin_lock_init(&stats->lock); policy->stats = stats; ret = sysfs_create_group(&policy->kobj, &stats_attr_group); @@ -280,11 +276,9 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, if (old_index == -1 || new_index == -1 || old_index == new_index) return; - spin_lock(&stats->lock); cpufreq_stats_update(stats, stats->last_time); stats->last_index = new_index; stats->trans_table[old_index * stats->max_state + new_index]++; stats->total_trans++; - spin_unlock(&stats->lock); } From 
56541410c4135a5a1755d30392353165a8438b50 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 5 Oct 2020 13:26:03 +0530 Subject: [PATCH 12/30] cpufreq: stats: Mark few conditionals with unlikely() Since this will be part of the scheduler's hotpath in some cases, use unlikely() for few of the obvious conditionals. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index e4bf444a2773..a7af8abb1386 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -263,7 +263,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, struct cpufreq_stats *stats = policy->stats; int old_index, new_index; - if (!stats) + if (unlikely(!stats)) return; if (unlikely(READ_ONCE(stats->reset_pending))) @@ -273,7 +273,7 @@ void cpufreq_stats_record_transition(struct cpufreq_policy *policy, new_index = freq_table_get_index(stats, new_freq); /* We can't do stats->time_in_state[-1]= .. */ - if (old_index == -1 || new_index == -1 || old_index == new_index) + if (unlikely(old_index == -1 || new_index == -1 || old_index == new_index)) return; cpufreq_stats_update(stats, stats->last_time); From 40577bcf39eb0cdfce209f7a8c94a7478d321643 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 5 Oct 2020 13:26:04 +0530 Subject: [PATCH 13/30] cpufreq: stats: Enable stats for fast-switch as well Now that all the blockers are gone for enabling stats in fast-switching case, enable it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki [Kazuki: Port to v5.4] Signed-off-by: Kazuki Hashimoto # Conflicts: # drivers/cpufreq/cpufreq.c --- drivers/cpufreq/cpufreq.c | 6 ++++-- drivers/cpufreq/cpufreq_stats.c | 6 ------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index f319f7b4bd30..47ac18c107bd 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1870,9 +1870,11 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, int ret; target_freq = clamp_val(target_freq, policy->min, policy->max); - ret = cpufreq_driver->fast_switch(policy, target_freq); - if (ret) + ret = cpufreq_driver->fast_switch(policy, target_freq); + if (ret) { cpufreq_times_record_transition(policy, ret); + cpufreq_stats_record_transition(policy, ret); + } return ret; } diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index a7af8abb1386..7195069be26d 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -72,9 +72,6 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) ssize_t len = 0; int i; - if (policy->fast_switch_enabled) - return 0; - for (i = 0; i < stats->state_num; i++) { if (pending) { if (i == stats->last_index) @@ -118,9 +115,6 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) ssize_t len = 0; int i, j, count; - if (policy->fast_switch_enabled) - return 0; - len += scnprintf(buf + len, PAGE_SIZE - len, " From : To\n"); len += scnprintf(buf + len, PAGE_SIZE - len, " : "); for (i = 0; i < stats->state_num; i++) { From e3f2a5cb0120a983fcbe7e84b4666c40769fbf44 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 6 Oct 2020 21:43:43 +0200 Subject: [PATCH 14/30] cpufreq: stats: Add memory barrier to store_reset() There is nothing to prevent the CPU or the compiler from reordering the writes to stats->reset_time and stats->reset_pending in store_reset(), in which case the readers of stats->reset_time may see a stale value. Moreover, on 32-bit arches the write to reset_time cannot be completed in one go, so the readers of it may see a partially updated value in that case. To prevent that from happening, add a write memory barrier between the writes to stats->reset_time and stats->reset_pending in store_reset() and corresponding read memory barrier in the readers of stats->reset_time. Fixes: 40c3bd4cfa6f ("cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()") Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq_stats.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 7195069be26d..a9356fe5712d 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -50,6 +50,11 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) /* Adjust for the time elapsed since reset was requested */ WRITE_ONCE(stats->reset_pending, 0); + /* + * Prevent the reset_time read from being reordered before the + * reset_pending accesses in cpufreq_stats_record_transition(). + */ + smp_rmb(); cpufreq_stats_update(stats, READ_ONCE(stats->reset_time)); } @@ -74,10 +79,16 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) for (i = 0; i < stats->state_num; i++) { if (pending) { - if (i == stats->last_index) + if (i == stats->last_index) { + /* + * Prevent the reset_time read from occurring + * before the reset_pending read above. 
+ */ + smp_rmb(); time = get_jiffies_64() - READ_ONCE(stats->reset_time); - else + } else { time = 0; + } } else { time = stats->time_in_state[i]; if (i == stats->last_index) @@ -102,6 +113,11 @@ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, * avoid races. */ WRITE_ONCE(stats->reset_time, get_jiffies_64()); + /* + * The memory barrier below is to prevent the readers of reset_time from + * seeing a stale or partially updated value. + */ + smp_wmb(); WRITE_ONCE(stats->reset_pending, 1); return count; From 81c810045d4c815ed115fcf2c36686fd596951cb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 12 Oct 2020 10:20:07 +0530 Subject: [PATCH 15/30] cpufreq: stats: Fix string format specifier mismatch Fix following warning: drivers/cpufreq/cpufreq_stats.c:63:10: warning: %d in format string (no. 1) requires 'int' but the argument type is 'unsigned int' Fixes: 40c3bd4cfa6f ("cpufreq: stats: Defer stats update to cpufreq_stats_record_transition()") Reported-by: kernel test robot Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index a9356fe5712d..ed7a715ba11f 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -65,7 +65,7 @@ static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) if (READ_ONCE(stats->reset_pending)) return sprintf(buf, "%d\n", 0); else - return sprintf(buf, "%d\n", stats->total_trans); + return sprintf(buf, "%u\n", stats->total_trans); } cpufreq_freq_attr_ro(total_trans); From 80ad6bacb0cbd722c02fe2b3789d62dffcd50ea5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 17 Nov 2020 17:02:10 +0530 Subject: [PATCH 16/30] cpufreq: stats: Use local_clock() instead of jiffies local_clock() has better precision and accuracy as compared to jiffies, lets use it for time management in cpufreq stats. 
Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index ed7a715ba11f..698cedca02d8 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -12,9 +12,9 @@ #include #include #include +#include #include - struct cpufreq_stats { unsigned int total_trans; unsigned long long last_time; @@ -33,7 +33,7 @@ struct cpufreq_stats { static void cpufreq_stats_update(struct cpufreq_stats *stats, unsigned long long time) { - unsigned long long cur_time = get_jiffies_64(); + unsigned long long cur_time = local_clock(); stats->time_in_state[stats->last_index] += cur_time - time; stats->last_time = cur_time; @@ -45,7 +45,7 @@ static void cpufreq_stats_reset_table(struct cpufreq_stats *stats) memset(stats->time_in_state, 0, count * sizeof(u64)); memset(stats->trans_table, 0, count * count * sizeof(int)); - stats->last_time = get_jiffies_64(); + stats->last_time = local_clock(); stats->total_trans = 0; /* Adjust for the time elapsed since reset was requested */ @@ -85,18 +85,18 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) * before the reset_pending read above. */ smp_rmb(); - time = get_jiffies_64() - READ_ONCE(stats->reset_time); + time = local_clock() - READ_ONCE(stats->reset_time); } else { time = 0; } } else { time = stats->time_in_state[i]; if (i == stats->last_index) - time += get_jiffies_64() - stats->last_time; + time += local_clock() - stats->last_time; } len += sprintf(buf + len, "%u %llu\n", stats->freq_table[i], - jiffies_64_to_clock_t(time)); + nsec_to_clock_t(time)); } return len; } @@ -112,7 +112,7 @@ static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, * Defer resetting of stats to cpufreq_stats_record_transition() to * avoid races. 
*/ - WRITE_ONCE(stats->reset_time, get_jiffies_64()); + WRITE_ONCE(stats->reset_time, local_clock()); /* * The memory barrier below is to prevent the readers of reset_time from * seeing a stale or partially updated value. @@ -252,7 +252,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->freq_table[i++] = pos->frequency; stats->state_num = i; - stats->last_time = get_jiffies_64(); + stats->last_time = local_clock(); stats->last_index = freq_table_get_index(stats, policy->cur); policy->stats = stats; From a0df1092950cc17db4e25ebad7d514e54f173745 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Mon, 31 May 2021 15:16:07 +0800 Subject: [PATCH 17/30] cpufreq: stats: Clean up local variable in cpufreq_stats_create_table() Local variable 'count' will be initialized and 'ret' is also not required, so remove the redundant initialization and get rid of 'ret'. Signed-off-by: Shaokun Zhang Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_stats.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 698cedca02d8..bda35fe56b37 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -214,7 +214,7 @@ void cpufreq_stats_free_table(struct cpufreq_policy *policy) void cpufreq_stats_create_table(struct cpufreq_policy *policy) { - unsigned int i = 0, count = 0, ret = -ENOMEM; + unsigned int i = 0, count; struct cpufreq_stats *stats; unsigned int alloc_size; struct cpufreq_frequency_table *pos; @@ -256,8 +256,7 @@ void cpufreq_stats_create_table(struct cpufreq_policy *policy) stats->last_index = freq_table_get_index(stats, policy->cur); policy->stats = stats; - ret = sysfs_create_group(&policy->kobj, &stats_attr_group); - if (!ret) + if (!sysfs_create_group(&policy->kobj, &stats_attr_group)) return; /* We failed, release resources */ From ef859d5eea151741860ac9e87d830638b943eb9e Mon Sep 17 00:00:00 2001 From: 
Connor O'Brien Date: Fri, 15 Dec 2023 23:55:54 +0200 Subject: [PATCH 18/30] sched/cpufreq: Stop ignoring util updates Eliminate the check for SCHED_CPUFREQ_WALT flag in cpufreq_update_util(), update calling code to stop using that flag, and replace its definition with a placeholder. Test: Trace shows more frequent util updates Bug: 110604715 Change-Id: I3b74e950b984194f08ecbbf91872467a200c9d1d Signed-off-by: Connor O'Brien Conflicts: kernel/sched/core.c kernel/sched/sched.h kernel/sched/walt.c Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> # Conflicts: # kernel/sched/core.c --- include/linux/sched/cpufreq.h | 2 +- kernel/sched/core.c | 7 ++----- kernel/sched/sched.h | 2 -- kernel/sched/walt.c | 2 +- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 215e65da1be5..f794f8c31298 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -12,7 +12,7 @@ #define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_IOWAIT (1U << 2) #define SCHED_CPUFREQ_INTERCLUSTER_MIG (1U << 3) -#define SCHED_CPUFREQ_WALT (1U << 4) +#define SCHED_CPUFREQ_RESERVED (1U << 4) #define SCHED_CPUFREQ_PL (1U << 5) #define SCHED_CPUFREQ_EARLY_DET (1U << 6) #define SCHED_CPUFREQ_FORCE_UPDATE (1U << 7) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bf0a561abed0..fa4811170da3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3138,9 +3138,7 @@ out: if (success && sched_predl) { raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags); if (do_pl_notif(cpu_rq(cpu))) - cpufreq_update_util(cpu_rq(cpu), - SCHED_CPUFREQ_WALT | - SCHED_CPUFREQ_PL); + cpufreq_update_util(cpu_rq(cpu), SCHED_CPUFREQ_PL); raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags); } #endif @@ -4142,9 +4140,8 @@ void scheduler_tick(void) early_notif = early_detection_notify(rq, wallclock); if (early_notif) - flag = SCHED_CPUFREQ_WALT | SCHED_CPUFREQ_EARLY_DET; + cpufreq_update_util(rq, 
SCHED_CPUFREQ_EARLY_DET); - cpufreq_update_util(rq, flag); rq_unlock(rq, &rf); perf_event_task_tick(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55f476b2e201..d2fa46b12a42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2760,8 +2760,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) u64 clock; #ifdef CONFIG_SCHED_WALT - if (!(flags & SCHED_CPUFREQ_WALT)) - return; clock = sched_ktime_clock(); #else clock = rq_clock(rq); diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 2fedd010be1c..3577e39003f8 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -3220,7 +3220,7 @@ void walt_irq_work(struct irq_work *irq_work) cpu_online_mask); num_cpus = cpumask_weight(&cluster_online_cpus); for_each_cpu(cpu, &cluster_online_cpus) { - int flag = SCHED_CPUFREQ_WALT; + int flag = 0; rq = cpu_rq(cpu); From fe902b1eb4a5a111bac1c2c59e59dd3bd3d94f43 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Sun, 25 Aug 2024 15:00:04 +0300 Subject: [PATCH 19/30] Squashed commit of the following: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7004ae7dfb08c25dc5165abc447a70036b00b60e Author: kondors1995 Date: Fri Aug 16 17:06:30 2024 +0300 arm64: dts: sm8150-v2: Update gpu voltages commit 7d2152a68429cc4e44ec5e0d6b3340c7b3bd7130 Author: kondors1995 Date: Thu Aug 15 17:46:10 2024 +0300 sched:fair: fix unused warning not sure why it's there but let's keep it commit 37a33c25346a355a07b4900a864f6326740c3a61 Author: kondors1995 Date: Thu Aug 15 17:37:29 2024 +0300 arm64: dts: sm8150-v2: Fixup @2cc338b commit ab938b00393e1a8e6a11b0f54d5c24d56f68a4de Author: EmanuelCN Date: Sun Aug 11 14:38:41 2024 +0300 drivers: gpu: drm: Do not affine pm qos requests to prime core *This was accidentally re-added when I rebased techpack/display and drm. 
commit 42966bed31891213f5bdb64be9f23aae0acb2c2d Author: kondors1995 Date: Wed Aug 14 10:56:36 2024 +0300 raphael:defconfig: Disable CONFIG_FAIR_GROUP_SCHED commit 4e1c9539727fc898016524700f5cced0cda9f55d Author: EmanuelCN Date: Wed Aug 7 13:57:13 2024 +0300 cpufreq: schedutil: Store the cached ratelimits values In a merge resolution google accidentally removed this, since we have checkouted the scheduler to redbull this part was missing. commit 2dc779651497edbe71c74f4b41eab79b4b0e2368 Author: EmanuelCN Date: Tue Aug 6 20:24:40 2024 +0300 cpufreq: schedutil: Set rate-limits globally Since we are not gonna modify them per cluster anymore its redundant to keep them this way, plus its less expensive. commit 4ac25bd369c6ba572b582ea7099e42dfd7f7137f Author: Xuewen Yan Date: Wed Jul 19 21:05:27 2023 +0800 cpufreq: schedutil: Update next_freq when cpufreq_limits change When cpufreq's policy is 'single', there is a scenario that will cause sg_policy's next_freq to be unable to update. When the CPU's util is always max, the cpufreq will be max, and then if we change the policy's scaling_max_freq to be a lower freq, indeed, the sg_policy's next_freq need change to be the lower freq, however, because the cpu_is_busy, the next_freq would keep the max_freq. For example: The cpu7 is a single CPU: unisoc:/sys/devices/system/cpu/cpufreq/policy7 # while true;do done& [1] 4737 unisoc:/sys/devices/system/cpu/cpufreq/policy7 # taskset -p 80 4737 pid 4737's current affinity mask: ff pid 4737's new affinity mask: 80 unisoc:/sys/devices/system/cpu/cpufreq/policy7 # cat scaling_max_freq 2301000 unisoc:/sys/devices/system/cpu/cpufreq/policy7 # cat scaling_cur_freq 2301000 unisoc:/sys/devices/system/cpu/cpufreq/policy7 # echo 2171000 > scaling_max_freq unisoc:/sys/devices/system/cpu/cpufreq/policy7 # cat scaling_max_freq 2171000 At this time, the sg_policy's next_freq would stay at 2301000, which is wrong. To fix this, add a check for the ->need_freq_update flag. 
[ mingo: Clarified the changelog. ] Co-developed-by: Guohua Yan Signed-off-by: Xuewen Yan Signed-off-by: Guohua Yan Signed-off-by: Ingo Molnar Acked-by: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20230719130527.8074-1-xuewen.yan@unisoc.com commit e88732a5887bfaa8054408514f2af952e119a9c0 Author: kondors1995 Date: Mon Aug 12 20:37:01 2024 +0300 cpufreq: schedutil: remove walt bits commit a19bfda67be888221f9348eff56a53ac3b34756a Author: Dawei Li Date: Sun Aug 4 21:38:34 2024 +0300 sched/fair: Fix initial util_avg calculation Change se->load.weight to se_weight(se) in the calculation for the initial util_avg to avoid unnecessarily inflating the util_avg by 1024 times. The reason is that se->load.weight has the unit/scale as the scaled-up load, while cfs_rq->avg.load_avg has the unit/scale as the true task weight (as mapped directly from the task's nice/priority value). With CONFIG_32BIT, the scaled-up load is equal to the true task weight. With CONFIG_64BIT, the scaled-up load is 1024 times the true task weight. Thus, the current code may inflate the util_avg by 1024 times. The follow-up capping will not allow the util_avg value to go wild. But the calculation should have the correct logic. Signed-off-by: Dawei Li Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Vishal Chourasia Link: https://lore.kernel.org/r/20240315015916.21545-1-daweilics@gmail.com commit b147bc04acfe5b5647ef50ea9af580aaed04894a Author: EmanuelCN Date: Wed Aug 7 14:13:40 2024 +0300 cpufreq: schedutil: Give 25% headroom to prime core If we don't give any headroom to prime core it can result in big performance regression commit 010effdcd39496611bca9f284c32a5e554a4fd27 Author: EmanuelCN Date: Wed Jul 3 00:16:04 2024 +0300 sched: Introduce Per-Cluster DVFS Headroom Typically, UI-demanding tasks are handled by little and big cores, while the prime core is more power-hungry and leads to unnecessary boosts, causing power wastage. 
From my observations, boosting the little cores provides the most significant performance improvement for UI tasks, followed by a modest boost to the big cores. This results in fewer stutters across the entire system. To avoid excessive boosting, the implementation ensures that when the utilization is equal to or higher than the maximum capacity value, util is returned as-is. commit b72fefb30cfc63426415bdbf06d1bb4c87c4d32f Author: Vincent Guittot Date: Sun Jan 14 19:36:00 2024 +0100 sched/fair: Fix frequency selection for non-invariant case Linus reported a ~50% performance regression on single-threaded workloads on his AMD Ryzen system, and bisected it to: 9c0b4bb7f630 ("sched/cpufreq: Rework schedutil governor performance estimation") When frequency invariance is not enabled, get_capacity_ref_freq(policy) is supposed to return the current frequency and the performance margin applied by map_util_perf(), enabling the utilization to go above the maximum compute capacity and to select a higher frequency than the current one. After the changes in 9c0b4bb7f630, the performance margin was applied earlier in the path to take into account utilization clampings and we couldn't get a utilization higher than the maximum compute capacity, and the CPU remained 'stuck' at lower frequencies. To fix this, we must use a frequency above the current frequency to get a chance to select a higher OPP when the current one becomes fully used. Apply the same margin and return a frequency 25% higher than the current one in order to switch to the next OPP before we fully use the CPU at the current one. [ mingo: Clarified the changelog. 
] Fixes: 9c0b4bb7f630 ("sched/cpufreq: Rework schedutil governor performance estimation") Reported-by: Linus Torvalds Bisected-by: Linus Torvalds Reported-by: Wyes Karny Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Wyes Karny Link: https://lore.kernel.org/r/20240114183600.135316-1-vincent.guittot@linaro.org commit dcad748d1a2dcdb781fbcc514dc977d8ddc91be6 Author: Vincent Guittot Date: Wed Nov 22 14:39:04 2023 +0100 sched/cpufreq: Rework iowait boost Use the max value that has already been computed inside sugov_get_util() to cap the iowait boost and remove dependency with uclamp_rq_util_with() which is not used anymore. Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231122133904.446032-3-vincent.guittot@linaro.org commit 31a906e3fcb3ba783d45157b722bcb3a321c1f8f Author: Vincent Guittot Date: Wed Nov 22 14:39:03 2023 +0100 sched/cpufreq: Rework schedutil governor performance estimation The current method to take into account uclamp hints when estimating the target frequency can end in a situation where the selected target frequency is finally higher than uclamp hints, whereas there are no real needs. Such cases mainly happen because we are currently mixing the traditional scheduler utilization signal with the uclamp performance hints. By adding these 2 metrics, we loose an important information when it comes to select the target frequency, and we have to make some assumptions which can't fit all cases. Rework the interface between the scheduler and schedutil governor in order to propagate all information down to the cpufreq governor. effective_cpu_util() interface changes and now returns the actual utilization of the CPU with 2 optional inputs: - The minimum performance for this CPU; typically the capacity to handle the deadline task and the interrupt pressure. But also uclamp_min request when available. 
- The maximum targeting performance for this CPU which reflects the maximum level that we would like to not exceed. By default it will be the CPU capacity but can be reduced because of some performance hints set with uclamp. The value can be lower than actual utilization and/or min performance level. A new sugov_effective_cpu_perf() interface is also available to compute the final performance level that is targeted for the CPU, after applying some cpufreq headroom and taking into account all inputs. With these 2 functions, schedutil is now able to decide when it must go above uclamp hints. It now also has a generic way to get the min performance level. The dependency between energy model and cpufreq governor and its headroom policy doesn't exist anymore. eenv_pd_max_util() asks schedutil for the targeted performance after applying the impact of the waking task. [ mingo: Refined the changelog & C comments. ] Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231122133904.446032-2-vincent.guittot@linaro.org commit 307c28badda88662bfd2cf4afb4168506d1e8e96 Author: Lukasz Luba Date: Thu Dec 8 16:02:56 2022 +0000 cpufreq, sched/util: Optimize operations with single CPU capacity lookup The max CPU capacity is the same for all CPUs sharing frequency domain. There is a way to avoid heavy operations in a loop for each CPU by leveraging this knowledge. Thus, simplify the looping code in the sugov_next_freq_shared() and drop heavy multiplications. Instead, use simple max() to get the highest utilization from these CPUs. This is useful for platforms with many (4 or 6) little CPUs. We avoid heavy 2*PD_CPU_NUM multiplications in that loop, which is called billions of times, since it's not limited by the schedutil time delta filter in sugov_should_update_freq(). When there was no need to change frequency the code bailed out, not updating the sg_policy::last_freq_update_time. 
Then every visit after delta_ns time longer than the sg_policy::freq_update_delay_ns goes through and triggers the next frequency calculation code. Although, if the next frequency, as outcome of that, would be the same as current frequency, we won't update the sg_policy::last_freq_update_time and the story will be repeated (in a very short period, sometimes a few microseconds). The max CPU capacity must be fetched every time we are called, due to difficulties during the policy setup, where we are not able to get the normalized CPU capacity at the right time. The fetched CPU capacity value is than used in sugov_iowait_apply() to calculate the right boost. This required a few changes in the local functions and arguments. The capacity value should hopefully be fetched once when needed and then passed over CPU registers to those functions. Signed-off-by: Lukasz Luba Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20221208160256.859-2-lukasz.luba@arm.com Cc: Peter Zijlstra Cc: Patrick Bellasi Cc: Vincent Guittot Cc: Rafael J. Wysocki Cc: Viresh Kumar commit db9c6be8f578a2decb289f5e2cce1af14819690e Author: Rafael J. Wysocki Date: Thu Mar 28 11:33:21 2019 +0100 cpufreq: schedutil: Simplify iowait boosting There is not reason for the minimum iowait boost value in the schedutil cpufreq governor to depend on the available range of CPU frequencies. In fact, that dependency is generally confusing, because it causes the iowait boost to behave somewhat differently on CPUs with the same maximum frequency and different minimum frequencies, for example. For this reason, replace the min field in struct sugov_cpu with a constant and choose its values to be 1/8 of SCHED_CAPACITY_SCALE (for consistency with the intel_pstate driver's internal governor). [Note that policy->cpuinfo.max_freq will not be a constant any more after a subsequent change, so this change is depended on by it.] 
Link: https://lore.kernel.org/lkml/20190305083202.GU32494@hirez.programming.kicks-ass.net/T/#ee20bdc98b7d89f6110c0d00e5c3ee8c2ced93c3d Suggested-by: Peter Zijlstra Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar commit 7e1f2edaa9a3738cdf7bc23eaec0feb866633e3a Author: Dietmar Eggemann Date: Tue Jun 21 10:04:10 2022 +0100 sched, drivers: Remove max param from effective_cpu_util()/sched_cpu_util() effective_cpu_util() already has a `int cpu' parameter which allows to retrieve the CPU capacity scale factor (or maximum CPU capacity) inside this function via an arch_scale_cpu_capacity(cpu). A lot of code calling effective_cpu_util() (or the shim sched_cpu_util()) needs the maximum CPU capacity, i.e. it will call arch_scale_cpu_capacity() already. But not having to pass it into effective_cpu_util() will make the EAS wake-up code easier, especially when the maximum CPU capacity reduced by the thermal pressure is passed through the EAS wake-up functions. Due to the asymmetric CPU capacity support of arm/arm64 architectures, arch_scale_cpu_capacity(int cpu) is a per-CPU variable read access via per_cpu(cpu_scale, cpu) on such a system. On all other architectures it is a a compile-time constant (SCHED_CAPACITY_SCALE). Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20220621090414.433602-4-vdonnefort@google.com commit caa24c3962bc2a002034548e3c93dce95c9330c4 Author: Rafael J. Wysocki Date: Fri Jun 21 22:06:38 2024 +0300 cpufreq: schedutil: Add util to struct sg_cpu Instead of passing util and max between functions while computing the utilization and capacity, store the former in struct sg_cpu (along with the latter and bw_dl). 
This will allow the current utilization value to be compared with the one obtained previously (which is requisite for some code changes to follow this one), but also it causes the code to look slightly more consistent and cleaner. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar commit 176187b62d8637f44e8fcdee2b539dee8c30491b Author: John Galt Date: Thu Jun 27 21:52:34 2024 -0400 sched/eevdf: 1ms base slice commit c64c439cffc03bff0b5b69455dbd9018ae920a64 Author: John Galt Date: Thu Jun 27 21:49:47 2024 -0400 sched/features: enable SCHED_FEAT_PLACE_DEADLINE_INITIAL commit 190d38b6481aea95d454dcfab5801a578a86f3dc Author: Qais Yousef Date: Tue Apr 18 15:09:35 2023 +0100 sched/uclamp: Fix fits_capacity() check in feec() commit 244226035a1f9b2b6c326e55ae5188fab4f428cb upstream. As reported by Yun Hsiang [1], if a task has its uclamp_min >= 0.8 * 1024, it'll always pick the previous CPU because fits_capacity() will always return false in this case. The new util_fits_cpu() logic should handle this correctly for us beside more corner cases where similar failures could occur, like when using UCLAMP_MAX. We open code uclamp_rq_util_with() except for the clamp() part, util_fits_cpu() needs the 'raw' values to be passed to it. Also introduce uclamp_rq_{set, get}() shorthand accessors to get uclamp value for the rq. Makes the code more readable and ensures the right rules (use READ_ONCE/WRITE_ONCE) are respected transparently. 
[1] https://lists.linaro.org/pipermail/eas-dev/2020-July/001488.html Fixes: 1d42509e475c ("sched/fair: Make EAS wakeup placement consider uclamp restrictions") Reported-by: Yun Hsiang Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220804143609.515789-4-qais.yousef@arm.com (cherry picked from commit 244226035a1f9b2b6c326e55ae5188fab4f428cb) [Fix trivial conflict in kernel/sched/fair.c due to new automatic variables in master vs 5.10] Signed-off-by: Qais Yousef (Google) Signed-off-by: Greg Kroah-Hartman commit a661b80985ae13d3ad1a1be93f482192e4a361b5 Author: Youssef Esmat Date: Thu Jun 20 19:26:56 2024 +0300 sched/eevdf: Change base_slice to 3ms The default base slice of 0.75 msec is causing excessive context switches. Raise it to 3 msecs, which lowers the amount of context switches and the added overhead of the scheduler. BUG=b:308209790 TEST=Manual tested on DUT UPSTREAM-TASK=b:324602237 Change-Id: I7c7a7ecb377933a4032eb11161f5b5873e3e1e4c Signed-off-by: Youssef Esmat Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/5291506 Reviewed-by: Suleiman Souhlal Reviewed-by: Vineeth Pillai Reviewed-by: Steven Rostedt commit 7d18cd0125c2a4347268d2f72b53a9045937d98a Author: Youssef Esmat Date: Thu Jun 20 19:26:07 2024 +0300 sched/eevdf: Add ENFORCE_ELIGIBILITY and default settings Eligibility reduces scheduling latency of high nice task at the cost of runtime of low nice tasks. This hurts the chrome model where UI tasks should not be interrupted by background tasks. This adds a feature to be able to disable eligibility checks. Note that fairness is preserved in the way the deadline is calculated since the deadline is adjusted by the weight of the task and a task. Disabling lag placement is similar to eligibility since lag will adjust the CPU time based on the amount of runtime. 
BUG=b:308209790 TEST=Manual tested on DUT UPSTREAM-TASK=b:324602237 Signed-off-by: Youssef Esmat commit 6ef762984070d49371eb803bf50aeacfebe51613 Author: Aaron Lu Date: Tue Sep 12 14:58:08 2023 +0800 sched/fair: Ratelimit update to tg->load_avg When using sysbench to benchmark Postgres in a single docker instance with sysbench's nr_threads set to nr_cpu, it is observed there are times update_cfs_group() and update_load_avg() shows noticeable overhead on a 2sockets/112core/224cpu Intel Sapphire Rapids(SPR): 13.75% 13.74% [kernel.vmlinux] [k] update_cfs_group 10.63% 10.04% [kernel.vmlinux] [k] update_load_avg Annotate shows the cycles are mostly spent on accessing tg->load_avg with update_load_avg() being the write side and update_cfs_group() being the read side. tg->load_avg is per task group and when different tasks of the same taskgroup running on different CPUs frequently access tg->load_avg, it can be heavily contended. E.g. when running postgres_sysbench on a 2sockets/112cores/224cpus Intel Sappire Rapids, during a 5s window, the wakeup number is 14millions and migration number is 11millions and with each migration, the task's load will transfer from src cfs_rq to target cfs_rq and each change involves an update to tg->load_avg. Since the workload can trigger as many wakeups and migrations, the access(both read and write) to tg->load_avg can be unbound. As a result, the two mentioned functions showed noticeable overhead. With netperf/nr_client=nr_cpu/UDP_RR, the problem is worse: during a 5s window, wakeup number is 21millions and migration number is 14millions; update_cfs_group() costs ~25% and update_load_avg() costs ~16%. Reduce the overhead by limiting updates to tg->load_avg to at most once per ms. The update frequency is a tradeoff between tracking accuracy and overhead. 1ms is chosen because PELT window is roughly 1ms and it delivered good results for the tests that I've done. 
After this change, the cost of accessing tg->load_avg is greatly reduced and performance improved. Detailed test results below. ============================== postgres_sysbench on SPR: 25% base: 42382±19.8% patch: 50174±9.5% (noise) 50% base: 67626±1.3% patch: 67365±3.1% (noise) 75% base: 100216±1.2% patch: 112470±0.1% +12.2% 100% base: 93671±0.4% patch: 113563±0.2% +21.2% ============================== hackbench on ICL: group=1 base: 114912±5.2% patch: 117857±2.5% (noise) group=4 base: 359902±1.6% patch: 361685±2.7% (noise) group=8 base: 461070±0.8% patch: 491713±0.3% +6.6% group=16 base: 309032±5.0% patch: 378337±1.3% +22.4% ============================= hackbench on SPR: group=1 base: 100768±2.9% patch: 103134±2.9% (noise) group=4 base: 413830±12.5% patch: 378660±16.6% (noise) group=8 base: 436124±0.6% patch: 490787±3.2% +12.5% group=16 base: 457730±3.2% patch: 680452±1.3% +48.8% ============================ netperf/udp_rr on ICL 25% base: 114413±0.1% patch: 115111±0.0% +0.6% 50% base: 86803±0.5% patch: 86611±0.0% (noise) 75% base: 35959±5.3% patch: 49801±0.6% +38.5% 100% base: 61951±6.4% patch: 70224±0.8% +13.4% =========================== netperf/udp_rr on SPR 25% base: 104954±1.3% patch: 107312±2.8% (noise) 50% base: 55394±4.6% patch: 54940±7.4% (noise) 75% base: 13779±3.1% patch: 36105±1.1% +162% 100% base: 9703±3.7% patch: 28011±0.2% +189% ============================================== netperf/tcp_stream on ICL (all in noise range) 25% base: 43092±0.1% patch: 42891±0.5% 50% base: 19278±14.9% patch: 22369±7.2% 75% base: 16822±3.0% patch: 17086±2.3% 100% base: 18216±0.6% patch: 18078±2.9% =============================================== netperf/tcp_stream on SPR (all in noise range) 25% base: 34491±0.3% patch: 34886±0.5% 50% base: 19278±14.9% patch: 22369±7.2% 75% base: 16822±3.0% patch: 17086±2.3% 100% base: 18216±0.6% patch: 18078±2.9% Reported-by: Nitin Tekchandani Suggested-by: Vincent Guittot Signed-off-by: Aaron Lu Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Reviewed-by: Mathieu Desnoyers Reviewed-by: David Vernet Tested-by: Mathieu Desnoyers Tested-by: Swapnil Sapkal Link: https://lkml.kernel.org/r/20230912065808.2530-2-aaron.lu@intel.com commit c0c01cb55fa02d91f3a9f1fd456ecc6842990f99 Author: Chengming Zhou Date: Thu Aug 18 20:48:02 2022 +0800 sched/fair: Fix another detach on unattached task corner case commit 7dc603c9028e ("sched/fair: Fix PELT integrity for new tasks") fixed two load tracking problems for new task, including detach on unattached new task problem. There still left another detach on unattached task problem for the task which has been woken up by try_to_wake_up() and waiting for actually being woken up by sched_ttwu_pending(). try_to_wake_up(p) cpu = select_task_rq(p) if (task_cpu(p) != cpu) set_task_cpu(p, cpu) migrate_task_rq_fair() remove_entity_load_avg() --> unattached se->avg.last_update_time = 0; __set_task_cpu() ttwu_queue(p, cpu) ttwu_queue_wakelist() __ttwu_queue_wakelist() task_change_group_fair() detach_task_cfs_rq() detach_entity_cfs_rq() detach_entity_load_avg() --> detach on unattached task set_task_rq() attach_task_cfs_rq() attach_entity_cfs_rq() attach_entity_load_avg() The reason of this problem is similar, we should check in detach_entity_cfs_rq() that se->avg.last_update_time != 0, before do detach_entity_load_avg(). Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20220818124805.601-7-zhouchengming@bytedance.com commit 60cfd4b1c31915161d1c942a4a9067314fba8d79 Author: Sultan Alsawaf Date: Mon May 6 23:43:57 2024 -0700 sched/cass: Honor uclamp even when no CPUs can satisfy the requirement When all CPUs available to a uclamp'd process are thermal throttled, it is possible for them to be throttled below the uclamp minimum requirement. 
In this case, CASS only considers uclamp when it compares relative utilization and nowhere else; i.e., CASS essentially ignores the most important aspect of uclamp. Fix it so that CASS tries to honor uclamp even when no CPUs available to a uclamp'd process are capable of fully meeting the uclamp minimum. Signed-off-by: Sultan Alsawaf commit 82d9f4952ab48ec0b1dd5cd48da1c3619ceb01c9 Author: Sultan Alsawaf Date: Mon May 6 20:14:08 2024 -0700 sched/cass: Fix disproportionate load spreading when CPUs are throttled When CPUs are thermal throttled, CASS tries to spread load such that their resulting P-state is scaled relatively to their _throttled_ maximum capacity, rather than their original capacity. As a result, throttled CPUs are unfairly under-utilized, causing other CPUs to receive the extra burden and thus run at a disproportionately higher P-state relative to the throttled CPUs. This not only hurts performance, but also greatly diminishes energy efficiency since it breaks CASS's basic load balancing principle. To fix this, some convoluted logic is required in order to make CASS aware of a CPU's throttled and non-throttled capacity. The non-throttled capacity is used for the fundamental relative utilization comparison, while the throttled capacity is used in conjunction to ensure a throttled CPU isn't accidentally overloaded as a result. Signed-off-by: Sultan Alsawaf commit 6341ff1997878ad3b6b90a5bf0e20b99aa9e6d89 Author: Patrick Bellasi Date: Tue Dec 18 10:31:30 2018 +0000 ANDROID: sched/fair: EAS: Add uclamp support to find_best_target() Utilization clamping can be used to boost the utilization of small tasks or cap that of big tasks. Thus, one of its possible usages is to bias tasks placement to "promote" small tasks on higher capacity (less energy efficient) CPUs or "constraint" big tasks on small capacity (more energy efficient) CPUs. 
When the Energy Aware Scheduler (EAS) looks for the most energy efficient CPU to run a task on, it currently considers only the effective utilization estimated for a task. Fix this by adding an additional check to skip CPUs whose capacity is smaller than the task clamped utilization. Change-Id: I43fa6fa27e27c1eb5272c6a45d1a6a5b0faae1aa Signed-off-by: Patrick Bellasi Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c0ccb5593e4c2ac72758c5a0f68e274a5698a839 Author: EmanuelCN Date: Mon Apr 8 21:44:58 2024 +0300 sched/eevdf: Use the other placement strategy Place lag is currently broken (4.19 needs more backports to make it work as intended) Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 052afe14ea97a104e3c384f56405c30fb571efe9 Author: Xuewen Yan Date: Mon Apr 22 16:22:38 2024 +0800 sched/eevdf: Prevent vlag from going out of bounds in reweight_eevdf() It was possible to have pick_eevdf() return NULL, which then causes a NULL-deref. This turned out to be due to entity_eligible() returning falsely negative because of an s64 multiplication overflow. Specifically, reweight_eevdf() computes the vlag without considering the limit placed upon vlag as update_entity_lag() does, and then the scaling multiplication (remember that weight is 20bit fixed point) can overflow. This then leads to the new vruntime being weird which then causes the above entity_eligible() to go side-ways and claim nothing is eligible. Thus limit the range of vlag accordingly. All this was quite rare, but fatal when it does happen.
Closes: https://lore.kernel.org/all/ZhuYyrh3mweP_Kd8@nz.home/ Closes: https://lore.kernel.org/all/CA+9S74ih+45M_2TPUY_mPPVDhNvyYfy1J1ftSix+KjiTVxg8nw@mail.gmail.com/ Closes: https://lore.kernel.org/lkml/202401301012.2ed95df0-oliver.sang@intel.com/ Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Reported-by: Sergei Trofimovich Reported-by: Igor Raits Reported-by: Breno Leitao Reported-by: kernel test robot Reported-by: Yujie Liu Signed-off-by: Xuewen Yan Reviewed-and-tested-by: Chen Yu Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20240422082238.5784-1-xuewen.yan@unisoc.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit e224fc314c4b2f28785f30f8c85af7c566d540b3 Author: Tianchen Ding Date: Wed Mar 6 10:21:33 2024 +0800 sched/eevdf: Fix miscalculation in reweight_entity() when se is not curr reweight_eevdf() only keeps V unchanged inside itself. When se != cfs_rq->curr, it would be dequeued from rb tree first. So that V is changed and the result is wrong. Pass the original V to reweight_eevdf() to fix this issue. Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Signed-off-by: Tianchen Ding [peterz: flip if() condition for clarity] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abel Wu Link: https://lkml.kernel.org/r/20240306022133.81008-3-dtcccc@linux.alibaba.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 6f605d6768465bec0f16cead98e1b3b08e548afe Author: Tianchen Ding Date: Wed Mar 6 10:21:32 2024 +0800 sched/eevdf: Always update V if se->on_rq when reweighting reweight_eevdf() needs the latest V to do accurate calculation for new ve and vd. So update V unconditionally when se is runnable. 
Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Suggested-by: Abel Wu Signed-off-by: Tianchen Ding Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abel Wu Tested-by: K Prateek Nayak Tested-by: Chen Yu Link: https://lore.kernel.org/r/20240306022133.81008-2-dtcccc@linux.alibaba.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3416506799b6b2b3eab0688051082ed2fc1d1f5b Author: Wang Jinchao Date: Thu Dec 14 13:20:29 2023 +0800 sched/fair: Remove unused 'next_buddy_marked' local variable in check_preempt_wakeup_fair() This variable became unused in: 5e963f2bd465 ("sched/fair: Commit to EEVDF") Signed-off-by: Wang Jinchao Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/202312141319+0800-wangjinchao@xfusion.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 2cfaa805dc73ae4e6395a8aa69ee79b9580cc7bb Author: Yiwei Lin Date: Fri Nov 17 16:01:06 2023 +0800 sched/fair: Update min_vruntime for reweight_entity() correctly Since reweight_entity() may have chance to change the weight of cfs_rq->curr entity, we should also update_min_vruntime() if this is the case Fixes: eab03c23c2a1 ("sched/eevdf: Fix vruntime adjustment on reweight") Signed-off-by: Yiwei Lin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Abel Wu Link: https://lore.kernel.org/r/20231117080106.12890-1-s921975628@gmail.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 13ffac20f571298c2007b90674694aebe367be89 Author: Abel Wu Date: Wed Nov 15 11:36:46 2023 +0800 sched/eevdf: O(1) fastpath for task selection Since the RB-tree is now sorted by deadline, let's first try the leftmost entity which has the earliest virtual deadline. I've done some benchmarks to see its effectiveness. 
All the benchmarks are done inside a normal cpu cgroup in a clean environment with cpu turbo disabled, on a dual-CPU Intel Xeon(R) Platinum 8260 with 2 NUMA nodes each of which has 24C/48T. hackbench: process/thread + pipe/socket + 1/2/4/8 groups netperf: TCP/UDP + STREAM/RR + 24/48/72/96/192 threads tbench: loopback 24/48/72/96/192 threads schbench: 1/2/4/8 mthreads direct: cfs_rq has only one entity parity: RUN_TO_PARITY fast: O(1) fastpath slow: heap search (%) direct parity fast slow hackbench 92.95 2.02 4.91 0.12 netperf 68.08 6.60 24.18 1.14 tbench 67.55 11.22 20.61 0.62 schbench 69.91 2.65 25.73 1.71 The above results indicate that this fastpath really makes task selection more efficient. Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231115033647.80785-4-wuyun.abel@bytedance.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 010771bc9b9ab17c289743afe1241127ba5263df Author: Abel Wu Date: Wed Nov 15 11:36:45 2023 +0800 sched/eevdf: Sort the rbtree by virtual deadline Sort the task timeline by virtual deadline and keep the min_vruntime in the augmented tree, so we can avoid doubling the worst case cost and make full use of the cached leftmost node to enable O(1) fastpath picking in next patch. Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231115033647.80785-3-wuyun.abel@bytedance.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 1e35ae32e182db3531da5d9055e45c7ccf826c96 Author: Abel Wu Date: Tue Nov 21 21:44:26 2023 +0200 sched/eevdf: Fix vruntime adjustment on reweight vruntime of the (on_rq && !0-lag) entity needs to be adjusted when it gets re-weighted, and the calculations can be simplified based on the fact that re-weight won't change the w-average of all the entities. Please check the proofs in comments. 
But adjusting vruntime can also cause position change in RB-tree hence require re-queue to fix up which might be costly. This might be avoided by deferring adjustment to the time the entity actually leaves tree (dequeue/pick), but that will negatively affect task selection and probably not good enough either. Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231107090510.71322-2-wuyun.abel@bytedance.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 73fa83e38da16aad4009b37fe7ee7a9fb289adb9 Author: Yiwei Lin Date: Fri Oct 20 13:56:17 2023 +0800 sched/fair: Remove unused 'curr' argument from pick_next_entity() The 'curr' argument of pick_next_entity() has become unused after the EEVDF changes. [ mingo: Updated the changelog. ] Signed-off-by: Yiwei Lin Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231020055617.42064-1-s921975628@gmail.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d2d81ece57c3c14431dc2525f551e2d2eadb2834 Author: Peter Zijlstra Date: Tue Oct 17 16:59:47 2023 +0200 sched/eevdf: Fix heap corruption more Because someone is a flaming idiot... and forgot we have current as se->on_rq but not actually in the tree itself, and walking rb_parent() on an entry not in the tree is 'funky' and KASAN complains. 
Fixes: 8dafa9d0eb1a ("sched/eevdf: Fix min_deadline heap integrity") Reported-by: 0599jiangyc@gmail.com Reported-by: Dmitry Safonov <0x7f454c46@gmail.com> Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dmitry Safonov <0x7f454c46@gmail.com> Link: https://bugzilla.kernel.org/show_bug.cgi?id=218020 Link: https://lkml.kernel.org/r/CAJwJo6ZGXO07%3DQvW4fgQfbsDzQPs9xj5sAQ1zp%3DmAyPMNbHYww%40mail.gmail.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b97630c04b5e7869bc0a5900e6e7dda8f7bcf5d3 Author: Benjamin Segall Date: Fri Sep 29 17:09:30 2023 -0700 sched/eevdf: Fix pick_eevdf() The old pick_eevdf() could fail to find the actual earliest eligible deadline when it descended to the right looking for min_deadline, but it turned out that that min_deadline wasn't actually eligible. In that case we need to go back and search through any left branches we skipped looking for the actual best _eligible_ min_deadline. This is more expensive, but still O(log n), and at worst should only involve descending two branches of the rbtree. I've run this through a userspace stress test (thank you tools/lib/rbtree.c), so hopefully this implementation doesn't miss any corner cases. 
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/xm261qego72d.fsf_-_@google.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit df45b4b6a01c84ccb6ac72b0d5d20da7c90572d8 Author: Peter Zijlstra Date: Fri Oct 6 21:24:45 2023 +0200 sched/eevdf: Fix min_deadline heap integrity Marek and Biju reported instances of: "EEVDF scheduling fail, picking leftmost" which Mike correlated with cgroup scheduling and the min_deadline heap getting corrupted; some trace output confirms: > And yeah, min_deadline is hosed somehow: > > validate_cfs_rq: --- / > __print_se: ffff88845cf48080 w: 1024 ve: -58857638 lag: 870381 vd: -55861854 vmd: -66302085 E (11372/tr) > __print_se: ffff88810d165800 w: 25 ve: -80323686 lag: 22336429 vd: -41496434 vmd: -66302085 E (-1//autogroup-31) > __print_se: ffff888108379000 w: 25 ve: 0 lag: -57987257 vd: 114632828 vmd: 114632828 N (-1//autogroup-33) > validate_cfs_rq: min_deadline: -55861854 avg_vruntime: -62278313462 / 1074 = -57987256 Turns out that reweight_entity(), which tries really hard to be fast, does not do the normal dequeue+update+enqueue pattern but *does* scale the deadline. However, it then fails to propagate the updated deadline value up the heap. 
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Reported-by: Marek Szyprowski Reported-by: Biju Das Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marek Szyprowski Tested-by: Biju Das Tested-by: Mike Galbraith Link: https://lkml.kernel.org/r/20231006192445.GE743@noisy.programming.kicks-ass.net Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3a51a2af114512733bfe3b6965a7320af592ac6e Author: Peter Zijlstra Date: Tue Sep 26 14:29:50 2023 +0200 sched/eevdf: Fix avg_vruntime() The expectation is that placing a task at avg_vruntime() makes it eligible. Turns out there is a corner case where this is not the case. Specifically, avg_vruntime() relies on the fact that integer division is a flooring function (eg. it discards the remainder). By this property the value returned is slightly left of the true average. However! when the average is a negative (relative to min_vruntime) the effect is flipped and it becomes a ceil, with the result that the returned value is just right of the average and thus not eligible. Fixes: af4cf40470c2 ("sched/fair: Add cfs_rq::avg_vruntime") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 94630ff1c5dace213fc4d886abdddff610331975 Author: Peter Zijlstra Date: Fri Sep 15 00:48:55 2023 +0200 sched/eevdf: Also update slice on placement Tasks that never consume their full slice would not update their slice value. This means that tasks that are spawned before the sysctl scaling keep their original (UP) slice length. 
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230915124822.847197830@noisy.programming.kicks-ass.net Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit ebdbfa17b89f097f7dbb25126033767d392ec08a Author: Sebastian Andrzej Siewior Date: Wed Sep 20 15:00:24 2023 +0200 sched/debug: Remove the /proc/sys/kernel/sched_child_runs_first sysctl The /proc/sys/kernel/sched_child_runs_first knob is no longer connected since: 5e963f2bd4654 ("sched/fair: Commit to EEVDF") Remove it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230920130025.412071-2-bigeasy@linutronix.de Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit f3c2be38e880f64190b90cfa7a587e224036fa58 Author: Peter Zijlstra Date: Wed Aug 16 15:40:59 2023 +0200 sched/eevdf: Curb wakeup-preemption Mike and others noticed that EEVDF does like to over-schedule quite a bit -- which does hurt performance of a number of benchmarks / workloads. In particular, what seems to cause over-scheduling is that when lag is of the same order (or larger) than the request / slice then placement will not only cause the task to be placed left of current, but also with a smaller deadline than current, which causes immediate preemption. [ notably, lag bounds are relative to HZ ] Mike suggested we stick to picking 'current' for as long as it's eligible to run, giving it uninterrupted runtime until it reaches parity with the pack. Augment Mike's suggestion by only allowing it to exhaust its initial request.
One random data point: echo NO_RUN_TO_PARITY > /debug/sched/features perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000 3,723,554 context-switches ( +- 0.56% ) 9.5136 +- 0.0394 seconds time elapsed ( +- 0.41% ) echo RUN_TO_PARITY > /debug/sched/features perf stat -a -e context-switches --repeat 10 -- perf bench sched messaging -g 20 -t -l 5000 2,556,535 context-switches ( +- 0.51% ) 9.2427 +- 0.0302 seconds time elapsed ( +- 0.33% ) Suggested-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230816134059.GC982867@hirez.programming.kicks-ass.net Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit bb48e3bf106a493dad3be6f2a84f507d7a5d0202 Author: Peter Zijlstra Date: Wed May 31 13:58:49 2023 +0200 sched/fair: Propagate enqueue flags into place_entity() This allows place_entity() to consider ENQUEUE_WAKEUP and ENQUEUE_MIGRATED. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124604.274010996@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 498bfeceabaf3803261f7c934e4303d6f2137ccc Author: Peter Zijlstra Date: Wed May 31 13:58:48 2023 +0200 sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice EEVDF uses this tunable as the base request/slice -- make sure the name reflects this. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124604.205287511@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b194d2bfc5dc8070c78357d0c2b63a0cde2877d6 Author: Peter Zijlstra Date: Tue Nov 21 21:21:06 2023 +0200 sched/fair: Commit to EEVDF EEVDF is a better defined scheduling policy, as a result it has less heuristics/tunables. There is no compelling reason to keep CFS around. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124604.137187212@infradead.org [@Helium-Studio: Also remove sysctl entries that were dropped by this commit] Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> # Conflicts: # kernel/sched/fair.c # kernel/sysctl.c commit 29c0c9b70590c86dbb3504704b4043ffe793d48f Author: Peter Zijlstra Date: Wed May 31 13:58:46 2023 +0200 sched/smp: Use lag to simplify cross-runqueue placement Using lag is both more correct and simpler when moving between runqueues. Notably, min_vruntime() was invented as a cheap approximation of avg_vruntime() for this very purpose (SMP migration). Since we now have the real thing; use it. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124604.068911180@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 241fa4e083655b4f67cae56fa716ca6234506b30 Author: Chengming Zhou Date: Thu Aug 18 20:48:01 2022 +0800 sched/fair: Combine detach into dequeue when migrating task When we are migrating task out of the CPU, we can combine detach and propagation into dequeue_entity() to save the detach_entity_cfs_rq() in migrate_task_rq_fair(). This optimization is like combining DO_ATTACH in the enqueue_entity() when migrating task to the CPU. So we don't have to traverse the CFS tree extra time to do the detach_entity_cfs_rq() -> propagate_entity_cfs_rq(), which wouldn't be called anymore with this patch's change.
detach_task() deactivate_task() dequeue_task_fair() for_each_sched_entity(se) dequeue_entity() update_load_avg() /* (1) */ detach_entity_load_avg() set_task_cpu() migrate_task_rq_fair() detach_entity_cfs_rq() /* (2) */ update_load_avg(); detach_entity_load_avg(); propagate_entity_cfs_rq(); for_each_sched_entity() update_load_avg() This patch saves the detach_entity_cfs_rq() called in (2) by doing the detach_entity_load_avg() for a CPU migrating task inside (1) (the task being the first se in the loop) Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20220818124805.601-6-zhouchengming@bytedance.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d563959f4820312d3d1b462c78ca62a8a5fe3157 Author: Peter Zijlstra Date: Wed May 31 13:58:45 2023 +0200 sched/fair: Commit to lag based placement Removes the FAIR_SLEEPERS code in favour of the new LAG based placement. Specifically, the whole FAIR_SLEEPER thing was a very crude approximation to make up for the lack of lag based placement, specifically the 'service owed' part. This is important for things like 'starve' and 'hackbench'. One side effect of FAIR_SLEEPER is that it caused 'small' unfairness, specifically, by always ignoring up-to 'thresh' sleeptime it would have a 50%/50% time distribution for a 50% sleeper vs a 100% runner, while strictly speaking this should (of course) result in a 33%/67% split (as CFS will also do if the sleep period exceeds 'thresh').
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124604.000198861@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit ac875e302647f7266ce5678d02f06732e4512f2c Author: Peter Zijlstra Date: Wed May 31 13:58:44 2023 +0200 sched/fair: Implement an EEVDF-like scheduling policy Where CFS is currently a WFQ based scheduler with only a single knob, the weight. The addition of a second, latency oriented parameter, makes something like WF2Q or EEVDF based a much better fit. Specifically, EEVDF does EDF like scheduling in the left half of the tree -- those entities that are owed service. Except because this is a virtual time scheduler, the deadlines are in virtual time as well, which is what allows over-subscription. EEVDF has two parameters: - weight, or time-slope: which is mapped to nice just as before - request size, or slice length: which is used to compute the virtual deadline as: vd_i = ve_i + r_i/w_i Basically, by setting a smaller slice, the deadline will be earlier and the task will be more eligible and ran earlier. Tick driven preemption is driven by request/slice completion; while wakeup preemption is driven by the deadline. Because the tree is now effectively an interval tree, and the selection is no longer 'leftmost', over-scheduling is less of a problem. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124603.931005524@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fa237d82950d832b045ca398b0bdd7088422cb37 Author: Peter Zijlstra Date: Wed May 31 13:58:42 2023 +0200 sched/fair: Add lag based placement With the introduction of avg_vruntime, it is possible to approximate lag (the entire purpose of introducing it in fact). Use this to do lag based placement over sleep+wake. 
Specifically, the FAIR_SLEEPERS thing places things too far to the left and messes up the deadline aspect of EEVDF. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124603.794929315@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 6063bb0c8a1acd02f35d9de3a2f73c17fd99c3f2 Author: Peter Zijlstra Date: Wed May 31 13:58:41 2023 +0200 sched/fair: Remove sched_feat(START_DEBIT) With the introduction of avg_vruntime() there is no need to use worse approximations. Take the 0-lag point as starting point for inserting new tasks. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124603.722361178@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 45d55875264f7f7756c814df55a0c505de1f13c0 Author: Peter Zijlstra Date: Wed May 31 13:58:40 2023 +0200 sched/fair: Add cfs_rq::avg_vruntime In order to move to an eligibility based scheduling policy, we need to have a better approximation of the ideal scheduler. Specifically, for a virtual time weighted fair queueing based scheduler the ideal scheduler will be the weighted average of the individual virtual runtimes (math in the comment). As such, compute the weighted average to approximate the ideal scheduler -- note that the approximation is in the individual task behaviour, which isn't strictly conformant. Specifically consider adding a task with a vruntime left of center, in this case the average will move backwards in time -- something the ideal scheduler would of course never do. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124603.654144274@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 482a42257efb9da9db63f03a8af5d5f07d3518f6 Author: Jiang Biao Date: Tue Aug 11 19:32:09 2020 +0800 sched/fair: Simplify the work when reweighting entity The code in reweight_entity() can be simplified. For a sched entity on the rq, the entity accounting can be replaced by cfs_rq instantaneous load updates currently called from within the entity accounting. Even though an entity on the rq can't represent a task in reweight_entity() (a task is always dequeued before calling this function) and so the numa task accounting and the rq->cfs_tasks list management of the entity accounting are never called, the redundant cfs_rq->nr_running decrement/increment will be avoided. Signed-off-by: Jiang Biao Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200811113209.34057-1-benbjiang@tencent.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d21036a9134537a7e56654ea6191797905c78148 Author: Vincent Donnefort Date: Tue Jun 21 10:04:08 2022 +0100 sched/fair: Provide u64 read for 32-bits arch helper Introducing macro helpers u64_u32_{store,load}() to factorize lockless accesses to u64 variables for 32-bits architectures. Users are for now cfs_rq.min_vruntime and sched_avg.last_update_time. To accommodate the latter where the copy lies outside of the structure (cfs_rq.last_update_time_copy instead of sched_avg.last_update_time_copy), use the _copy() version of those helpers. Those new helpers encapsulate smp_rmb() and smp_wmb() synchronization and therefore, have a small penalty for 32-bits machines in set_task_rq_fair() and init_cfs_rq().
Signed-off-by: Vincent Donnefort Signed-off-by: Vincent Donnefort Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20220621090414.433602-2-vdonnefort@google.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fded4adb0d75b4c4c5400278f277a5ff8a782020 Author: Peter Zijlstra Date: Wed Apr 29 17:04:12 2020 +0200 rbtree, sched/fair: Use rb_add_cached() Reduce rbtree boiler plate by using the new helper function. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Davidlohr Bueso Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 759f0c10c642f4917e94b9ad3a29a68cfe02f1a5 Author: Michel Lespinasse Date: Wed Sep 25 16:46:10 2019 -0700 augmented rbtree: rework the RB_DECLARE_CALLBACKS macro definition Change the definition of the RBCOMPUTE function. The propagate callback repeatedly calls RBCOMPUTE as it moves from leaf to root. it wants to stop recomputing once the augmented subtree information doesn't change. This was previously checked using the == operator, but that only works when the augmented subtree information is a scalar field. This commit modifies the RBCOMPUTE function so that it now sets the augmented subtree information instead of returning it, and returns a boolean value indicating if the propagate callback should stop. The motivation for this change is that I want to introduce augmented rbtree uses where the augmented data for the subtree is a struct instead of a scalar. 
Link: http://lkml.kernel.org/r/20190703040156.56953-4-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Davidlohr Bueso Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 9372e034d9366c3cad0772ee1b310652d90bcb70 Author: Michel Lespinasse Date: Wed Sep 25 16:46:07 2019 -0700 augmented rbtree: add new RB_DECLARE_CALLBACKS_MAX macro Add RB_DECLARE_CALLBACKS_MAX, which generates augmented rbtree callbacks for the case where the augmented value is a scalar whose definition follows a max(f(node)) pattern. This actually covers all present uses of RB_DECLARE_CALLBACKS, and saves some (source) code duplication in the various RBCOMPUTE function definitions. [walken@google.com: fix mm/vmalloc.c] Link: http://lkml.kernel.org/r/CANN689FXgK13wDYNh1zKxdipeTuALG4eKvKpsdZqKFJ-rvtGiQ@mail.gmail.com [walken@google.com: re-add check to check_augmented()] Link: http://lkml.kernel.org/r/20190727022027.GA86863@google.com Link: http://lkml.kernel.org/r/20190703040156.56953-3-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Davidlohr Bueso Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a3175d39d501e3512d4df600939201cbda8e9b33 Author: Michel Lespinasse Date: Wed Sep 25 16:46:04 2019 -0700 augmented rbtree: add comments for RB_DECLARE_CALLBACKS macro Patch series "make RB_DECLARE_CALLBACKS more generic", v3. These changes are intended to make the RB_DECLARE_CALLBACKS macro more generic (allowing the augmented subtree information to be a struct instead of a scalar). I have verified the compiled lib/interval_tree.o and mm/mmap.o files to check that they didn't change.
This held as expected for interval_tree.o; mmap.o did have some changes which could be reverted by marking __vma_link_rb as noinline. I did not add such a change to the patchset; I felt it was reasonable enough to leave the inlining decision up to the compiler. This patch (of 3): Add a short comment summarizing the arguments to RB_DECLARE_CALLBACKS. The arguments are also now capitalized. This copies the style of the INTERVAL_TREE_DEFINE macro. No functional changes in this commit, only comments and capitalization. Link: http://lkml.kernel.org/r/20190703040156.56953-2-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Cc: David Howells Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 9600cbd0d286c6cf86b68e88ea0600a219815d37 Author: Peter Zijlstra Date: Wed May 31 13:58:43 2023 +0200 rbtree: Add rb_add_augmented_cached() helper While slightly sub-optimal, updating the augmented data while going down the tree during lookup would be faster -- alas the augment interface does not currently allow for that, provide a generic helper to add a node to an augmented cached tree. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20230531124603.862983648@infradead.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3773003b378a5a3e5e44d0083d05c7437fd8a4b5 Author: Peter Zijlstra Date: Mon Oct 9 10:36:53 2017 +0200 sched/core: Ensure load_balance() respects the active_mask While load_balance() masks the source CPUs against active_mask, it had a hole against the destination CPU. Ensure the destination CPU is also part of the 'domain-mask & active-mask' set. 
Reported-by: Levin, Alexander (Sasha Levin) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds") Signed-off-by: Ingo Molnar commit 1a4a49b4cd14bdcf91e3c9de97aa98c912e11c83 Author: Wanpeng Li Date: Mon Jan 13 08:50:27 2020 +0800 [BACKPORT]sched/nohz: Optimize get_nohz_timer_target() On a machine, CPU 0 is used for housekeeping, the other 39 CPUs in the same socket are in nohz_full mode. We can observe huge time burn in the loop for searching nearest busy housekeeper cpu by ftrace. 2) | get_nohz_timer_target() { 2) 0.240 us | housekeeping_test_cpu(); 2) 0.458 us | housekeeping_test_cpu(); ... 2) 0.292 us | housekeeping_test_cpu(); 2) 0.240 us | housekeeping_test_cpu(); 2) 0.227 us | housekeeping_any_cpu(); 2) + 43.460 us | } This patch optimizes the searching logic by finding a nearest housekeeper CPU in the housekeeping cpumask, it can minimize the worst searching time from ~44us to < 10us in my testing. In addition, the last iterated busy housekeeper can become a random candidate while current CPU is a better fallback if it is a housekeeper. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/1578876627-11938-1-git-send-email-wanpengli@tencent.com Signed-off-by: DennySPB commit f773f3b275ad618158280c9793a1d27639f298f7 Author: Vincent Guittot Date: Fri Dec 14 17:01:56 2018 +0100 sched/fair: Trigger asym_packing during idle load balance Newly idle load balancing is not always triggered when a CPU becomes idle. This prevents the scheduler from getting a chance to migrate the task for asym packing. Enable active migration during idle load balance too. 
Signed-off-by: Vincent Guittot Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: valentin.schneider@arm.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Jesse Chan Signed-off-by: billaids commit 75352e3fb31924ec3af106be73e210d2258fefe8 Author: Peter Zijlstra Date: Mon Oct 9 10:36:53 2017 +0200 sched/core: Ensure load_balance() respects the active_mask While load_balance() masks the source CPUs against active_mask, it had a hole against the destination CPU. Ensure the destination CPU is also part of the 'domain-mask & active-mask' set. Reported-by: Levin, Alexander (Sasha Levin) Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 77d1dfda0e79 ("sched/topology, cpuset: Avoid spurious/wrong domain rebuilds") Signed-off-by: Ingo Molnar commit 74b6dcb03b6b9fa56fbcdf2ffb33e50fa97db6dc Author: Frederic Weisbecker Date: Tue Dec 3 17:01:06 2019 +0100 sched: Use fair:prio_changed() instead of ad-hoc implementation set_user_nice() implements its own version of fair::prio_changed() and therefore misses a specific optimization towards nohz_full CPUs that avoid sending an resched IPI to a reniced task running alone. Use the proper callback instead. Change-Id: I51ba67826dfcec0aa423758281943c01ba267c91 Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Ingo Molnar Link: https://lkml.kernel.org/r/20191203160106.18806-3-frederic@kernel.org Signed-off-by: mydongistiny Signed-off-by: DennySPB commit 1994e58a6bf1726d32a1c58af16b2be6fdfafd80 Author: Vincent Guittot Date: Thu Apr 26 12:19:32 2018 +0200 sched/fair: Fix the update of blocked load when newly idle With commit: 31e77c93e432 ("sched/fair: Update blocked load when newly idle") ... we release the rq->lock when updating blocked load of idle CPUs. This opens a time window during which another CPU can add a task to this CPU's cfs_rq. 
The check for newly added task of idle_balance() is not in the common path. Move the out label to include this check. Reported-by: Heiner Kallweit Tested-by: Geert Uytterhoeven Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 31e77c93e432 ("sched/fair: Update blocked load when newly idle") Link: http://lkml.kernel.org/r/20180426103133.GA6953@linaro.org Signed-off-by: Ingo Molnar commit ddd49e696ad91a561a8c8bf12ef61bc0e113ac41 Author: Josh Don Date: Tue Aug 4 12:34:13 2020 -0700 sched/fair: Ignore cache hotness for SMT migration SMT siblings share caches, so cache hotness should be irrelevant for cross-sibling migration. Signed-off-by: Josh Don Proposed-by: Venkatesh Pallipadi Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200804193413.510651-1-joshdon@google.com commit de8674e8809c07543874ec4a4c91631c481caf4f Author: Peter Oskolkov Date: Wed Sep 30 10:35:32 2020 -0700 sched/fair: Tweak pick_next_entity() Currently, pick_next_entity(...) has the following structure (simplified): [...] if (last_buddy_ok()) result = last_buddy; if (next_buddy_ok()) result = next_buddy; [...] The intended behavior is to prefer next buddy over last buddy; the current code somewhat obfuscates this, and also wastes cycles checking the last buddy when eventually the next buddy is picked up. So this patch refactors two 'ifs' above into [...] if (next_buddy_ok()) result = next_buddy; else if (last_buddy_ok()) result = last_buddy; [...] Signed-off-by: Peter Oskolkov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200930173532.1069092-1-posk@google.com commit 6a20c355412765268bfc7635b626899873f73337 Author: Clement Courbet Date: Wed Mar 3 14:46:53 2021 -0800 sched: Optimize __calc_delta() A significant portion of __calc_delta() time is spent in the loop shifting a u64 by 32 bits. Use `fls` instead of iterating. 
This is ~7x faster on benchmarks. The generic `fls` implementation (`generic_fls`) is still ~4x faster than the loop. Architectures that have a better implementation will make use of it. For example, on x86 we get an additional factor 2 in speed without dedicated implementation. On GCC, the asm versions of `fls` are about the same speed as the builtin. On Clang, the versions that use fls are more than twice as slow as the builtin. This is because the way the `fls` function is written, clang puts the value in memory: https://godbolt.org/z/EfMbYe. This bug is filed at https://bugs.llvm.org/show_bug.cgi?id=49406. ``` name cpu/op BM_Calc<__calc_delta_loop> 9.57ms ±12% BM_Calc<__calc_delta_generic_fls> 2.36ms ±13% BM_Calc<__calc_delta_asm_fls> 2.45ms ±13% BM_Calc<__calc_delta_asm_fls_nomem> 1.66ms ±12% BM_Calc<__calc_delta_asm_fls64> 2.46ms ±13% BM_Calc<__calc_delta_asm_fls64_nomem> 1.34ms ±15% BM_Calc<__calc_delta_builtin> 1.32ms ±11% ``` Signed-off-by: Clement Courbet Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210303224653.2579656-1-joshdon@google.com commit e4793308d8214d546ae2eead6f9fac0426ada0f9 Author: Amit Kucheria Date: Mon Oct 21 17:45:12 2019 +0530 cpufreq: Initialize the governors in core_initcall Initialize the cpufreq governors earlier to allow for earlier performance control during the boot process. Signed-off-by: Amit Kucheria Acked-by: Viresh Kumar Reviewed-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/b98eae9b44eb2f034d7f5d12a161f5f831be1eb7.1571656015.git.amit.kucheria@linaro.org # Conflicts: # drivers/cpufreq/cpufreq_performance.c commit 7f1bc529bc787dcbedfd2899cb9c99326dd57215 Author: Mathieu Poirier Date: Fri Jul 19 15:59:53 2019 +0200 sched/topology: Add partition_sched_domains_locked() Introduce the partition_sched_domains_locked() function by taking the mutex locking code out of the original function. 
That way the work done by partition_sched_domains_locked() can be reused without dropping the mutex lock. No change of functionality is introduced by this patch. Tested-by: Dietmar Eggemann Signed-off-by: Mathieu Poirier Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bristot@redhat.com Cc: claudio@evidence.eu.com Cc: lizefan@huawei.com Cc: longman@redhat.com Cc: luca.abeni@santannapisa.it Cc: rostedt@goodmis.org Cc: tommaso.cucinotta@santannapisa.it Link: https://lkml.kernel.org/r/20190719140000.31694-2-juri.lelli@redhat.com Signed-off-by: Ingo Molnar commit e17f27e0d39d77e4b1011278796a30e8d7b05d6f Author: Valentin Schneider Date: Tue Apr 9 18:35:45 2019 +0100 sched/topology: Skip duplicate group rewrites in build_sched_groups() While staring at build_sched_domains(), I realized that get_group() does several duplicate (thus useless) writes. If you take the Arm Juno r0 (LITTLEs = [0, 3, 4, 5], bigs = [1, 2]), the sched_group build flow would look like this: ('MC[cpu]->sg' means 'per_cpu_ptr(&tl->data->sg, cpu)' with 'tl == MC') build_sched_groups(MC[CPU0]->sd, CPU0) get_group(0) -> MC[CPU0]->sg get_group(3) -> MC[CPU3]->sg get_group(4) -> MC[CPU4]->sg get_group(5) -> MC[CPU5]->sg build_sched_groups(DIE[CPU0]->sd, CPU0) get_group(0) -> DIE[CPU0]->sg get_group(1) -> DIE[CPU1]->sg <=================+ | build_sched_groups(MC[CPU1]->sd, CPU1) | get_group(1) -> MC[CPU1]->sg | get_group(2) -> MC[CPU2]->sg | | build_sched_groups(DIE[CPU1]->sd, CPU1) ^ get_group(1) -> DIE[CPU1]->sg } We've set up these two up here! get_group(3) -> DIE[CPU0]->sg } From this point on, we will only use sched_groups that have been previously visited & initialized. The only new operation will be which group pointer we affect to sd->groups. On the Juno r0 we get 32 get_group() calls, every single one of them writing to a sched_group->cpumask. However, all of the data structures we need are set up after 8 visits (see above). 
Return early from get_group() if we've already visited (and thus initialized) the sched_group we're looking at. Overlapping domains are not affected as they do not use build_sched_groups(). Tested on a Juno and a 2 * (Xeon E5-2690) system. ( FWIW I initially checked the refs for both sg && sg->sgc, but figured if they weren't both 0 or > 1 then something must have gone wrong, so I threw in a WARN_ON(). ) No change in functionality intended. Signed-off-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar commit c62d6453c251607e2fe6f6e204a0711b7edc03df Author: Vincent Guittot Date: Mon Jun 17 17:00:17 2019 +0200 sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity() The 'struct sched_domain *sd' parameter to arch_scale_cpu_capacity() is unused since commit: 765d0af19f5f ("sched/topology: Remove the ::smt_gain field from 'struct sched_domain'") Remove it. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Viresh Kumar Reviewed-by: Valentin Schneider Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: gregkh@linuxfoundation.org Cc: linux@armlinux.org.uk Cc: quentin.perret@arm.com Cc: rafael@kernel.org Link: https://lkml.kernel.org/r/1560783617-5827-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar commit 360635bafb36f39478a5c32b43e3cb64b1df4dad Author: John Galt Date: Mon Apr 22 16:01:47 2024 -0400 sched-pelt: make half life nonconfigurable 16ms only commit baa288445831e3cef3b7f6cd20bc0c52f331acc2 Author: John Galt Date: Mon Apr 22 15:32:50 2024 -0400 sched/fair: decrease migration cost commit 7e1c7b478d201f3909d4c8f1f9f121064457f4e9 Author: Sebastian Andrzej Siewior Date: Tue Aug 1 17:26:48 2023 +0200 sched/rt: Don't try push tasks if there are none. I have a RT task X at a high priority and cyclictest on each CPU with lower priority than X's. 
If X is active and each CPU wakes their own cyclictest thread then it ends in a longer rto_push storm. A random CPU determines via balance_rt() that the CPU on which X is running needs to push tasks. X has the highest priority, cyclictest is next in line so there is nothing that can be done since the task with the higher priority is not touched. tell_cpu_to_push() increments rto_loop_next and schedules rto_push_irq_work_func() on X's CPU. The other CPUs also increment the loop counter and do the same. Once rto_push_irq_work_func() is active it does nothing because it has _no_ pushable tasks on its runqueue. Then checks rto_next_cpu() and decides to queue irq_work on the local CPU because another CPU requested a push by incrementing the counter. I have traces where ~30 CPUs request this ~3 times each before it finally ends. This greatly increases X's runtime while X isn't making much progress. Teach rto_next_cpu() to only return CPUs which also have tasks on their runqueue which can be pushed away. This does not reduce the tell_cpu_to_push() invocations (rto_loop_next counter increments) but reduces the amount of issued rto_push_irq_work_func() if nothing can be done. As the result the overloaded CPU is blocked less often. There are still cases where the "same job" is repeated several times (for instance the current CPU needs to resched but didn't yet because the irq-work is repeated a few times and so the old task remains on the CPU) but the majority of requests end in tell_cpu_to_push() before an IPI is issued. 
Reviewed-by: "Steven Rostedt (Google)" Link: https://lore.kernel.org/r/20230801152648._y603AS_@linutronix.de Signed-off-by: Sebastian Andrzej Siewior commit 93662a6fbca530b6480fc92e9d119c5ccf09da7f Author: Vincent Guittot Date: Wed May 13 15:55:28 2020 +0200 sched/fair: Fix unthrottle_cfs_rq() for leaf_cfs_rq list Although not exactly identical, unthrottle_cfs_rq() and enqueue_task_fair() are quite close and follow the same sequence for enqueuing an entity in the cfs hierarchy. Modify unthrottle_cfs_rq() to use the same pattern as enqueue_task_fair(). This fixes a problem already faced with the latter and adds an optimization in the last for_each_sched_entity loop. Fixes: fe61468b2cb (sched/fair: Fix enqueue_task_fair warning) Reported-by: Tao Zhou Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Ben Segall Link: https://lkml.kernel.org/r/20200513135528.4742-1-vincent.guittot@linaro.org commit e8164ed37c41537ee1db3584095cd533705d5ad5 Author: Josh Don Date: Fri Apr 10 15:52:08 2020 -0700 sched/fair: Remove distribute_running from CFS bandwidth This is mostly a revert of commit: baa9be4ffb55 ("sched/fair: Fix throttle_list starvation with low CFS quota") The primary use of distribute_running was to determine whether to add throttled entities to the head or the tail of the throttled list. Now that we always add to the tail, we can remove this field. The other use of distribute_running is in the slack_timer, so that we don't start a distribution while one is already running. However, even in the event that this race occurs, it is fine to have two distributions running (especially now that distribute grabs the cfs_b->lock to determine remaining quota before assigning). 
Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Tested-by: Phil Auld Link: https://lkml.kernel.org/r/20200410225208.109717-3-joshdon@google.com commit 1eb50996e28558aabcd74d3afb37e044fedac26d Author: bsegall@google.com Date: Thu Jun 6 10:21:01 2019 -0700 sched/fair: Don't push cfs_bandwith slack timers forward When a cfs_rq sleeps and returns its quota, we delay for 5ms before waking any throttled cfs_rqs to coalesce with other cfs_rqs going to sleep, as this has to be done outside of the rq lock we hold. The current code waits for 5ms without any sleeps, instead of waiting for 5ms from the first sleep, which can delay the unthrottle more than we want. Switch this around so that we can't push this forward forever. This requires an extra flag rather than using hrtimer_active, since we need to start a new timer if the current one is in the process of finishing. Signed-off-by: Ben Segall Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Xunlei Pang Acked-by: Phil Auld Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/xm26a7euy6iq.fsf_-_@bsegall-linux.svl.corp.google.com Signed-off-by: Ingo Molnar Signed-off-by: DennySPB commit 2d4490a11d8952e8016647a5bbfa98f6df781dde Author: Paul Turner Date: Fri Apr 10 15:52:07 2020 -0700 sched/fair: Eliminate bandwidth race between throttling and distribution There is a race window in which an entity begins throttling before quota is added to the pool, but does not finish throttling until after we have finished with distribute_cfs_runtime(). This entity is not observed by distribute_cfs_runtime() because it was not on the throttled list at the time that distribution was running. This race manifests as rare period-length stalls for such entities. Rather than heavy-weight the synchronization with the progress of distribution, we can fix this by aborting throttling if bandwidth has become available. 
Otherwise, we immediately add the entity to the throttled list so that it can be observed by a subsequent distribution. Additionally, we can remove the case of adding the throttled entity to the head of the throttled list, and simply always add to the tail. Thanks to 26a8b12747c97, distribute_cfs_runtime() no longer holds onto its own pool of runtime. This means that if we do hit the !assign and distribute_running case, we know that distribution is about to end. Signed-off-by: Paul Turner Signed-off-by: Ben Segall Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200410225208.109717-2-joshdon@google.com commit 4f08342e65cedd1082630c987d9a7bafa7703c70 Author: Huaixin Chang Date: Fri Mar 27 11:26:25 2020 +0800 sched/fair: Fix race between runtime distribution and assignment Currently, there is a potential race between distribute_cfs_runtime() and assign_cfs_rq_runtime(). Race happens when cfs_b->runtime is read, distributes without holding lock and finds out there is not enough runtime to charge against after distribution. Because assign_cfs_rq_runtime() might be called during distribution, and use cfs_b->runtime at the same time. Fibtest is the tool to test this race. Assume all gcfs_rq is throttled and cfs period timer runs, slow threads might run and sleep, returning unused cfs_rq runtime and keeping min_cfs_rq_runtime in their local pool. If all this happens sufficiently quickly, cfs_b->runtime will drop a lot. If runtime distributed is large too, over-use of runtime happens. A runtime over-using by about 70 percent of quota is seen when we test fibtest on a 96-core machine. We run fibtest with 1 fast thread and 95 slow threads in test group, configure 10ms quota for this group and see the CPU usage of fibtest is 17.0%, which is far more than the expected 10%. On a smaller machine with 32 cores, we also run fibtest with 96 threads. 
CPU usage is more than 12%, which is also more than expected 10%. This shows that on similar workloads, this race do affect CPU bandwidth control. Solve this by holding lock inside distribute_cfs_runtime(). Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop") Reviewed-by: Ben Segall Signed-off-by: Huaixin Chang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/lkml/20200325092602.22471-1-changhuaixin@linux.alibaba.com/ commit 455896c7c0c647b57e0b742d00b3b65aa8f02e87 Author: kondors1995 Date: Thu Apr 4 21:48:12 2024 +0300 sched/cass:fixup commit 4b88ec1c7957d055e3a23067908fa0ec47cbd2c5 Author: Sultan Alsawaf Date: Wed Mar 13 21:25:29 2024 -0700 sched/cass: Eliminate redundant calls to smp_processor_id() Calling smp_processor_id() can be expensive depending on how an arch implements it, so avoid calling it more than necessary. Use the raw variant too since this code is always guaranteed to run with preemption disabled. Signed-off-by: Sultan Alsawaf commit f7d512d05d97dea633689bf689fd8a63960248ca Author: Sultan Alsawaf Date: Tue Mar 12 16:55:56 2024 -0700 sched/cass: Only treat sync waker CPU as idle if there's one task running For synchronized wakes, the waker's CPU should only be treated as idle if there aren't any other running tasks on that CPU. This is because, for synchronized wakes, it is assumed that the waker will immediately go to sleep after waking the wakee; therefore, if there aren't any other tasks running on the waker's CPU, it'll go idle and should be treated as such to improve task placement. This optimization only applies when there aren't any other tasks running on the waker's CPU, however. Fix it by ensuring that there's only the waker running on its CPU. 
Signed-off-by: Sultan Alsawaf commit 3264ea2961b6a7a2e5589b199c031aa17361401a Author: Sultan Alsawaf Date: Mon Feb 19 13:13:02 2024 -0800 sched/cass: Fix suboptimal task placement when uclamp is used Uclamp is designed to specify a process' CPU performance requirement scaled as a CPU capacity value. It simply denotes the process' requirement for the CPU's raw performance and thus P-state. CASS currently treats uclamp as a CPU load value however, producing wildly suboptimal CPU placement decisions for tasks which use uclamp. This hurts performance and, even worse, massively hurts energy efficiency, with CASS sometimes yielding power consumption that is a few times higher than EAS. Since uclamp inherently throws a wrench into CASS's goal of keeping relative P-states as low as possible across all CPUs, making it cooperate with CASS requires a multipronged approach. Make the following three changes to fix the uclamp task placement issue: 1. Treat uclamp as a CPU performance value rather than a CPU load value. 2. Clamp a CPU's utilization to the task's uclamp floor in order to keep relative P-states as low as possible across all CPUs. 3. Consider preferring a non-idle CPU for uclamped tasks to avoid pushing up the P-state of more than one CPU when there are multiple concurrent uclamped tasks. This fixes CASS's massive energy efficiency and performance issues when uclamp is used. Signed-off-by: Sultan Alsawaf commit dc098ba6aab229c19099642f561d33562f1ff755 Author: Sultan Alsawaf Date: Sat Jan 6 00:34:48 2024 -0800 sched/cass: Perform runqueue selection for RT tasks too RT tasks aren't placed on CPUs in a load-balanced manner, much less an energy efficient one. On systems which contain many RT tasks and/or IRQ threads, energy efficiency and throughput are diminished significantly by the default RT runqueue selection scheme which targets minimal latency. In practice, performance is actually improved by spreading RT tasks fairly, despite the small latency impact. 
Additionally, energy efficiency is significantly improved since the placement of all tasks benefits from energy-efficient runqueue selection, rather than just CFS tasks. Perform runqueue selection for RT tasks in CASS to significantly improve energy efficiency and overall performance. Signed-off-by: Sultan Alsawaf commit c8bbbe7148731e0024d0681f6cc8738cbe1cee06 Author: kondors1995 Date: Thu Apr 4 12:48:06 2024 +0300 sched/cass:checkout to kerneltoast/android_kernel_google_zuma@63f0b82d3 commit 1d4260eba475bad8a2aba7a974e854baefeb1040 Author: Sultan Alsawaf Date: Wed Feb 28 21:07:44 2024 -0800 sched/cass: Clean up local variable scope in cass_best_cpu() Move `curr` and `idle_state` to within the loop's scope for better readability. Also, leave a comment about `curr->cpu` to make it clear that `curr->cpu` must be initialized within the loop in order for `best->cpu` to be valid. Signed-off-by: Sultan Alsawaf commit cc47ef1ef764b7d261402bfa04a42dc4a418ab0b Author: Sultan Alsawaf Date: Sun Dec 17 19:04:28 2023 -0800 sched/cass: Fix CPU selection when no candidate CPUs are idle When no candidate CPUs are idle, CASS would keep `cidx` unchanged, and thus `best == curr` would always be true. As a result, since the empty candidate slot never changes, the current candidate `curr` always overwrites the best candidate `best`. This causes the last valid CPU to always be selected by CASS when no CPUs are idle (i.e., under heavy load). Fix it by ensuring that the CPU loop in cass_best_cpu() flips the free candidate index after the first candidate CPU is evaluated. Signed-off-by: Sultan Alsawaf commit 2dec239da28c7e1d19ead79686f5114e417e6d65 Author: Patrick Bellasi Date: Mon Nov 5 14:54:00 2018 +0000 UPSTREAM: sched/fair: Add lsub_positive() and use it consistently The following pattern: var -= min_t(typeof(var), var, val); is used multiple times in fair.c. 
The existing sub_positive() already captures that pattern, but it also adds an explicit load-store to properly support lockless observations. In other cases the pattern above is used to update local, and/or not concurrently accessed, variables. Let's add a simpler version of sub_positive(), targeted at local variables updates, which gives the same readability benefits at calling sites, without enforcing {READ,WRITE}_ONCE() barriers. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Quentin Perret Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Link: https://lore.kernel.org/lkml/20181031184527.GA3178@hirez.programming.kicks-ass.net Change-Id: I6a6a3b2ae9e4baa4ab6e906bf2aaed7306303025 Signed-off-by: Jason Edson Signed-off-by: DennySPb Signed-off-by: Tashfin Shakeer Rhythm Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fc9b2f93da21cddfc7d76fa941eb496935e48db4 Author: Rohit Jain Date: Wed May 2 13:52:10 2018 -0700 sched/core: Don't schedule threads on pre-empted vCPUs In paravirt configurations today, spinlocks figure out whether a vCPU is running to determine whether or not spinlock should bother spinning. We can use the same logic to prioritize CPUs when scheduling threads. If a vCPU has been pre-empted, it will incur the extra cost of VMENTER and the time it actually spends to be running on the host CPU. If we had other vCPUs which were actually running on the host CPU and idle we should schedule threads there. Performance numbers: Note: With patch is referred to as Paravirt in the following and without patch is referred to as Base. 
1) When only 1 VM is running: a) Hackbench test on KVM 8 vCPUs, 10,000 loops (lower is better): +-------+-----------------+----------------+ |Number |Paravirt |Base | |of +---------+-------+-------+--------+ |Threads|Average |Std Dev|Average| Std Dev| +-------+---------+-------+-------+--------+ |1 |1.817 |0.076 |1.721 | 0.067 | |2 |3.467 |0.120 |3.468 | 0.074 | |4 |6.266 |0.035 |6.314 | 0.068 | |8 |11.437 |0.105 |11.418 | 0.132 | |16 |21.862 |0.167 |22.161 | 0.129 | |25 |33.341 |0.326 |33.692 | 0.147 | +-------+---------+-------+-------+--------+ 2) When two VMs are running with same CPU affinities: a) tbench test on VM 8 cpus Base: VM1: Throughput 220.59 MB/sec 1 clients 1 procs max_latency=12.872 ms Throughput 448.716 MB/sec 2 clients 2 procs max_latency=7.555 ms Throughput 861.009 MB/sec 4 clients 4 procs max_latency=49.501 ms Throughput 1261.81 MB/sec 7 clients 7 procs max_latency=76.990 ms VM2: Throughput 219.937 MB/sec 1 clients 1 procs max_latency=12.517 ms Throughput 470.99 MB/sec 2 clients 2 procs max_latency=12.419 ms Throughput 841.299 MB/sec 4 clients 4 procs max_latency=37.043 ms Throughput 1240.78 MB/sec 7 clients 7 procs max_latency=77.489 ms Paravirt: VM1: Throughput 222.572 MB/sec 1 clients 1 procs max_latency=7.057 ms Throughput 485.993 MB/sec 2 clients 2 procs max_latency=26.049 ms Throughput 947.095 MB/sec 4 clients 4 procs max_latency=45.338 ms Throughput 1364.26 MB/sec 7 clients 7 procs max_latency=145.124 ms VM2: Throughput 224.128 MB/sec 1 clients 1 procs max_latency=4.564 ms Throughput 501.878 MB/sec 2 clients 2 procs max_latency=11.061 ms Throughput 965.455 MB/sec 4 clients 4 procs max_latency=45.370 ms Throughput 1359.08 MB/sec 7 clients 7 procs max_latency=168.053 ms b) Hackbench with 4 fd 1,000,000 loops +-------+--------------------------------------+----------------------------------------+ |Number |Paravirt |Base | |of +----------+--------+---------+--------+----------+--------+---------+----------+ |Threads|Average1 |Std 
Dev1|Average2 | Std Dev|Average1 |Std Dev1|Average2 | Std Dev 2| +-------+----------+--------+---------+--------+----------+--------+---------+----------+ | 1 | 3.748 | 0.620 | 3.576 | 0.432 | 4.006 | 0.395 | 3.446 | 0.787 | +-------+----------+--------+---------+--------+----------+--------+---------+----------+ Note that this test was run just to show the interference effect over-subscription can have in baseline c) schbench results with 2 message groups on 8 vCPU VMs +-----------+-------+---------------+--------------+------------+ | | | Paravirt | Base | | +-----------+-------+-------+-------+-------+------+------------+ | |Threads| VM1 | VM2 | VM1 | VM2 |%Improvement| +-----------+-------+-------+-------+-------+------+------------+ |50.0000th | 1 | 52 | 53 | 58 | 54 | +6.25% | |75.0000th | 1 | 69 | 61 | 83 | 59 | +8.45% | |90.0000th | 1 | 80 | 80 | 89 | 83 | +6.98% | |95.0000th | 1 | 83 | 83 | 93 | 87 | +7.78% | |*99.0000th | 1 | 92 | 94 | 99 | 97 | +5.10% | |99.5000th | 1 | 95 | 100 | 102 | 103 | +4.88% | |99.9000th | 1 | 107 | 123 | 105 | 203 | +25.32% | +-----------+-------+-------+-------+-------+------+------------+ |50.0000th | 2 | 56 | 62 | 67 | 59 | +6.35% | |75.0000th | 2 | 69 | 75 | 80 | 71 | +4.64% | |90.0000th | 2 | 80 | 82 | 90 | 81 | +5.26% | |95.0000th | 2 | 85 | 87 | 97 | 91 | +8.51% | |*99.0000th | 2 | 98 | 99 | 107 | 109 | +8.79% | |99.5000th | 2 | 107 | 105 | 109 | 116 | +5.78% | |99.9000th | 2 | 9968 | 609 | 875 | 3116 | -165.02% | +-----------+-------+-------+-------+-------+------+------------+ |50.0000th | 4 | 78 | 77 | 78 | 79 | +1.27% | |75.0000th | 4 | 98 | 106 | 100 | 104 | 0.00% | |90.0000th | 4 | 987 | 1001 | 995 | 1015 | +1.09% | |95.0000th | 4 | 4136 | 5368 | 5752 | 5192 | +13.16% | |*99.0000th | 4 | 11632 | 11344 | 11024| 10736| -5.59% | |99.5000th | 4 | 12624 | 13040 | 12720| 12144| -3.22% | |99.9000th | 4 | 13168 | 18912 | 14992| 17824| +2.24% | +-----------+-------+-------+-------+-------+------+------------+ Note: 
Improvement is measured for (VM1+VM2) Signed-off-by: Rohit Jain Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dhaval.giani@oracle.com Cc: matt@codeblueprint.co.uk Cc: steven.sistare@oracle.com Cc: subhra.mazumdar@oracle.com Link: http://lkml.kernel.org/r/1525294330-7759-1-git-send-email-rohit.k.jain@oracle.com Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 495ad164dcd1731db6746346355ecf52a193c720 Author: EmanuelCN Date: Wed Apr 3 18:31:28 2024 +0000 init: Enable SCHED_THERMAL_PRESSURE by default Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 9170eccde2a4c93f9be5b4bc0bed7bce1277f8da Author: Thara Gopinath Date: Fri Feb 21 19:52:13 2020 -0500 sched/fair: Enable tuning of decay period Thermal pressure follows pelt signals which means the decay period for thermal pressure is the default pelt decay period. Depending on SoC characteristics and thermal activity, it might be beneficial to decay thermal pressure slower, but still in-tune with the pelt signals. One way to achieve this is to provide a command line parameter to set a decay shift parameter to an integer between 0 and 10. Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-10-thara.gopinath@linaro.org Signed-off-by: Tashfin Shakeer Rhythm Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0c8c7aaf09aea4f6296130605b75697fa21f2615 Author: Lukasz Luba Date: Thu Jun 10 16:03:22 2021 +0100 UPSTREAM: thermal: cpufreq_cooling: Update also offline CPUs per-cpu thermal_pressure The thermal pressure signal gives information to the scheduler about reduced CPU capacity due to thermal. It is based on a value stored in a per-cpu 'thermal_pressure' variable. The online CPUs will get the new value there, while the offline won't. 
Unfortunately, when the CPU is back online, the value read from per-cpu variable might be wrong (stale data). This might affect the scheduler decisions, since it sees the CPU capacity differently than what is actually available. Fix it by making sure that all online+offline CPUs would get the proper value in their per-cpu variable when thermal framework sets capping. Fixes: f12e4f66ab6a3 ("thermal/cpu-cooling: Update thermal pressure in case of a maximum frequency capping") Signed-off-by: Lukasz Luba Acked-by: Viresh Kumar Link: https://lore.kernel.org/all/20210614191030.22241-1-lukasz.luba@arm.com/ Bug: 199501011 Change-Id: I10cceb48b72ccce1f51cfc0a7ecfa8d8e67d4394 (cherry picked from commit 2ad8ccc17d1e4270cf65a3f2a07a7534aa23e3fb) Signed-off-by: Ram Chandrasekar Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit cdbf22d3d8bc1b7a384677f4af7ea20fd169c538 Author: Thara Gopinath Date: Fri Feb 21 19:52:12 2020 -0500 thermal: cpu-cooling: Update thermal pressure in case of a maximum frequency capping Thermal governors can request for a CPU's maximum supported frequency to be capped in case of an overheat event. This in turn means that the maximum capacity available for tasks to run on the particular CPU is reduced. Delta between the original maximum capacity and capped maximum capacity is known as thermal pressure. Enable cpufreq cooling device to update the thermal pressure in event of a capped maximum frequency. 
Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-9-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d81bc16f14b74721dcea680568e862361f531ca8 Author: Thara Gopinath Date: Fri Feb 21 19:52:10 2020 -0500 sched/fair: Enable periodic update of average thermal pressure Introduce support in scheduler periodic tick and other CFS bookkeeping APIs to trigger the process of computing average thermal pressure for a CPU. Also consider avg_thermal.load_avg in others_have_blocked which allows for decay of pelt signals. Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-7-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0b07e1ccac5d97bf63df30b1a9e36c99a1d59390 Author: Thara Gopinath Date: Fri Feb 21 19:52:09 2020 -0500 arm/topology: Populate arch_scale_thermal_pressure() for ARM platforms Hook up topology_get_thermal_pressure to arch_scale_thermal_pressure thus enabling scheduler to retrieve instantaneous thermal pressure of a CPU. Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-6-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit f4dd9d9f8a6e9eb80615f3cf46b33a823f743e3a Author: Thara Gopinath Date: Fri Feb 21 19:52:08 2020 -0500 arm64/topology: Populate arch_scale_thermal_pressure() for arm64 platforms Hook up topology_get_thermal_pressure to arch_scale_thermal_pressure thus enabling scheduler to retrieve instantaneous thermal pressure of a CPU. 
Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-5-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 8d62e23c1aff88f3ea55d7bb1f8f69295e150189 Author: Thara Gopinath Date: Fri Feb 21 19:52:07 2020 -0500 drivers/base/arch_topology: Add infrastructure to store and update instantaneous thermal pressure Add architecture specific APIs to update and track thermal pressure on a per CPU basis. A per CPU variable thermal_pressure is introduced to keep track of instantaneous per CPU thermal pressure. Thermal pressure is the delta between maximum capacity and capped capacity due to a thermal event. topology_get_thermal_pressure can be hooked into the scheduler specified arch_scale_thermal_pressure to retrieve instantaneous thermal pressure of a CPU. arch_set_thermal_pressure can be used to update the thermal pressure. Considering topology_get_thermal_pressure reads thermal_pressure and arch_set_thermal_pressure writes into thermal_pressure, one can argue for some sort of locking mechanism to avoid a stale value. But considering topology_get_thermal_pressure can be called from a system critical path like scheduler tick function, a locking mechanism is not ideal. This means that it is possible the thermal_pressure value used to calculate average thermal pressure for a CPU can be stale for up to 1 tick period. 
Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-4-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 92727ae00ca5d23464f5d7ec7702bf62ab4ba026 Author: Thara Gopinath Date: Fri Feb 21 19:52:06 2020 -0500 sched/topology: Add callback to read per CPU thermal pressure Introduce the arch_scale_thermal_pressure() callback to retrieve per CPU thermal pressure. Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-3-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 62abba1d3769cd0c8566f1d354035499fb6cb8a0 Author: Thara Gopinath Date: Fri Feb 21 19:52:05 2020 -0500 sched/pelt: Add support to track thermal pressure Extrapolating on the existing framework to track rt/dl utilization using pelt signals, add a similar mechanism to track thermal pressure. The difference here from rt/dl utilization tracking is that, instead of tracking time spent by a CPU running a RT/DL task through util_avg, the average thermal pressure is tracked through load_avg. This is because thermal pressure signal is weighted time "delta" capacity unlike util_avg which is binary. "delta capacity" here means delta between the actual capacity of a CPU and the decreased capacity a CPU due to a thermal event. In order to track average thermal pressure, a new sched_avg variable avg_thermal is introduced. Function update_thermal_load_avg can be called to do the periodic bookkeeping (accumulate, decay and average) of the thermal pressure. 
Reviewed-by: Vincent Guittot Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-2-thara.gopinath@linaro.org Signed-off-by: Divyanshu-Modi Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 82a4f68550bde32a72afb0085c8250c7dd672d21 Author: Lingutla Chandrasekhar Date: Thu Feb 4 15:52:03 2021 +0530 sched: fair: consider all running tasks in cpu for load balance Load_balancer considers only cfs running tasks for finding busiest cpu to do load balancing. But cpu may be busy with other type tasks (ex: RT), then that cpu might not be selected as busy cpu due to weight vs nr_run check failures. And possibly cfs tasks running on that cpu would suffer till other type tasks finish or weight checks pass, while other cpus sit idle and are not able to do load balance. So, consider all running tasks to check cpu busyness. Change-Id: Iddf3f668507e20359f6388fc30ff5897d234c902 Signed-off-by: Lingutla Chandrasekhar Signed-off-by: atndko Signed-off-by: Cyber Knight commit 21028ad73e84230251c5704576d5c97eb4390e31 Author: Lucas Stach Date: Mon Aug 31 13:07:19 2020 +0200 sched/deadline: Fix stale throttling on de-/boosted tasks When a boosted task gets throttled, what normally happens is that it's immediately enqueued again with ENQUEUE_REPLENISH, which replenishes the runtime and clears the dl_throttled flag. There is a special case however: if the throttling happened on sched-out and the task has been deboosted in the meantime, the replenish is skipped as the task will return to its normal scheduling class. This leaves the task with the dl_throttled flag set. Now if the task gets boosted up to the deadline scheduling class again while it is sleeping, it's still in the throttled state. The normal wakeup however will enqueue the task with ENQUEUE_REPLENISH not set, so we don't actually place it on the rq. 
Thus we end up with a task that is runnable, but not actually on the rq and neither an immediate replenishment happens, nor is the replenishment timer set up, so the task is stuck in forever-throttled limbo. Clear the dl_throttled flag before dropping back to the normal scheduling class to fix this issue. Signed-off-by: Lucas Stach Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20200831110719.2126930-1-l.stach@pengutronix.de Signed-off-by: Zlatan Radovanovic Signed-off-by: Cyber Knight commit 0541494a6aa9d50deb09f922e74c12b7d319e211 Author: Vincent Guittot Date: Mon Sep 21 09:24:22 2020 +0200 sched/fair: Reduce minimal imbalance threshold The 25% default imbalance threshold for DIE and NUMA domain is large enough to generate significant unfairness between threads. A typical example is the case of 11 threads running on 2x4 CPUs. The imbalance of 20% between the 2 groups of 4 cores is just low enough to not trigger the load balance between the 2 groups. We will always have the same 6 threads on one group of 4 CPUs and the other 5 threads on the other group of CPUs. With a fair time sharing in each group, we end up with +20% running time for the group of 5 threads. Consider decreasing the imbalance threshold for overloaded case where we use the load to balance task and to ensure fair time sharing. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Acked-by: Hillf Danton Link: https://lkml.kernel.org/r/20200921072424.14813-3-vincent.guittot@linaro.org Signed-off-by: Zlatan Radovanovic Signed-off-by: Cyber Knight commit ae5539ae9bd8eb9319a3a9773f1df1aa3daacbcb Author: Vincent Guittot Date: Mon Sep 21 09:24:24 2020 +0200 sched/fair: Reduce busy load balance interval The busy_factor, which increases load balance interval when a cpu is busy, is set to 32 by default. This value generates some huge LB interval on large system like the THX2 made of 2 node x 28 cores x 4 threads. 
For such system, the interval increases from 112ms to 3584ms at MC level. And from 228ms to 7168ms at NUMA level. Even on smaller system, a lower busy factor has shown improvement on the fair distribution of the running time so let reduce it for all. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Link: https://lkml.kernel.org/r/20200921072424.14813-5-vincent.guittot@linaro.org commit e644e2307fbc7b33c1baae6e8e50517b0a5924c0 Author: Valentin Schneider Date: Wed Dec 11 11:38:50 2019 +0000 BACKPORT: sched/fair: Make task_fits_capacity() consider uclamp restrictions task_fits_capacity() drives CPU selection at wakeup time, and is also used to detect misfit tasks. Right now it does so by comparing task_util_est() with a CPU's capacity, but doesn't take into account uclamp restrictions. There's a few interesting uses that can come out of doing this. For instance, a low uclamp.max value could prevent certain tasks from being flagged as misfit tasks, so they could merrily remain on low-capacity CPUs. Similarly, a high uclamp.min value would steer tasks towards high capacity CPUs at wakeup (and, should that fail, later steered via misfit balancing), so such "boosted" tasks would favor CPUs of higher capacity. Introduce uclamp_task_util() and make task_fits_capacity() use it. 
[QP: fixed missing dependency on fits_capacity() by using the open coded alternative] Bug: 120440300 Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-5-valentin.schneider@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit a7008c07a568278ed2763436404752a98004c7ff) Signed-off-by: Quentin Perret Change-Id: Iabde2eda7252c3bcc273e61260a7a12a7de991b1 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fed39a8a5b86100174489179f80a7514c7a85c4b Author: Patrick Bellasi Date: Thu Sep 27 17:58:58 2018 +0100 BACKPORT: ANDROID: sched/core: Move SchedTune task API into UtilClamp wrappers The main SchedTune API calls related to task tuning attributes are now wrapped by more generic and mainlinish UtilClamp calls. The new APIs are: - uclamp_task(p) <= boosted_task_util(p) - uclamp_boosted(p) <= schedtune_task_boost(p) > 0 - uclamp_latency_sensitive(p) <= schedtune_prefer_idle(p) Let's provide also an implementation of the same API based on the new uclamp.uclamp_latency_sensitive flag. Bug: 120440300 Signed-off-by: Patrick Bellasi [Modified the patch to use uclamp.latency_sensitive instead mainline attributes] Signed-off-by: Qais Yousef Change-Id: Ib1a6902e1c07a82a370e36bf1776d895b7528cbc Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0152bf348f6c517ba445393032bb9049c533aa27 Author: Rick Yiu Date: Mon Jun 15 22:03:08 2020 +0800 ANDROID: sched/tune: Consider stune boost margin when computing energy If CONFIG_SCHED_TUNE is enabled, it does not use boosted cpu util to compute energy, so it could not reflect the real freq when a cpu has boosted tasks on it. 
Addressing it by adding boost margin if type is FREQUENCY_UTIL in schedutil_cpu_util(). Bug: 158637636 Signed-off-by: Rick Yiu Change-Id: I570920cb1e67d07de87006fca058d50e9358b7cd Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a07859716117b26c7794a22ec3a4dec64b618ba0 Author: Patrick Bellasi Date: Thu Apr 4 10:24:43 2019 +0100 ANDROID: sched/tune: Move SchedTune cpu API into UtilClamp wrappers The SchedTune CPU boosting API is currently used from sugov_get_util() to get the boosted utilization and to pass it into schedutil_cpu_util(). When UtilClamp is in use instead we call schedutil_cpu_util() by passing in just the CFS utilization and the clamping is done internally on the aggregated CFS+RT utilization for FREQUENCY_UTIL calls. This asymmetry is not required moreover, schedutil code is polluted by non-mainline SchedTune code. Wrap SchedTune API call related to cpu utilization boosting with a more generic and mainlinish UtilClamp call: - uclamp_rq_util_with(cpu, util, p) <= boosted_cpu_util(cpu) This new API is already used in schedutil_cpu_util() to clamp the aggregated RT+CFS utilization on FREQUENCY_UTIL calls. Move the cpu boosting into uclamp_rq_util_with() so that we remove any SchedTune specific bit from kernel/sched/cpufreq_schedutil.c. Get rid of the no more required boosted_cpu_util(cpu) method and replace it with a stune_util(cpu, util) which signature is better aligned with its uclamp_rq_util_with(cpu, util, p) counterpart. 
Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Qais Yousef Change-Id: I45b0f0f54123fe0a2515fa9f1683842e6b99234f [Removed superfluous __maybe_unused for capacity_orig_of] Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 75d6dd5a51250b259269afa979ee3fdd2f089db2 Author: Quentin Perret Date: Thu Jun 10 15:13:06 2021 +0000 ANDROID: sched/core: Make uclamp changes depend on CAP_SYS_NICE There is currently nothing preventing tasks from changing their per-task clamp values in anyway that they like. The rationale is probably that system administrators are still able to limit those clamps thanks to the cgroup interface. However, this causes pain in a system where both per-task and per-cgroup clamp values are expected to be under the control of core system components (as is the case for Android). To fix this, let's require CAP_SYS_NICE to change per-task clamp values. There are ongoing discussions upstream about more flexible approaches than this using the RLIMIT API -- see [1]. But the upstream discussion has not converged yet, and this is way too late for UAPI changes in android12-5.10 anyway, so let's apply this change which provides the behaviour we want without actually impacting UAPIs. [1] https://lore.kernel.org/lkml/20210623123441.592348-4-qperret@google.com/ Bug: 187186685 Signed-off-by: Quentin Perret Change-Id: I749312a77306460318ac5374cf243d00b78120dd Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b6e71f141280f787c584a8c597d7898dbff28d10 Author: Quentin Perret Date: Thu Aug 5 11:21:53 2021 +0100 BACKPORT: sched/uclamp: Fix UCLAMP_FLAG_IDLE setting The UCLAMP_FLAG_IDLE flag is set on a runqueue when dequeueing the last uclamp active task (that is, when buckets.tasks reaches 0 for all buckets) to maintain the last uclamp.max and prevent blocked util from suddenly becoming visible. 
However, there is an asymmetry in how the flag is set and cleared which can lead to having the flag set whilst there are active tasks on the rq. Specifically, the flag is cleared in the uclamp_rq_inc() path, which is called at enqueue time, but set in uclamp_rq_dec_id() which is called both when dequeueing a task _and_ in the update_uclamp_active() path. As a result, when both uclamp_rq_{inc,dec}_id() are called from update_uclamp_active(), the flag ends up being set but not cleared, hence leaving the runqueue in a broken state. Fix this by clearing the flag in update_uclamp_active() as well. Fixes: e496187da710 ("sched/uclamp: Enforce last task's UCLAMP_MAX") Reported-by: Rick Yiu Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20210805102154.590709-2-qperret@google.com Signed-off-by: RuRuTiaSaMa <1009087450@qq.com> Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c0fda42ce6b4f1af0cd02cdf259aff2aaf8038cc Author: Dietmar Eggemann Date: Fri Nov 13 12:34:54 2020 +0100 UPSTREAM: sched/uclamp: Allow to reset a task uclamp constraint value In case the user wants to stop controlling a uclamp constraint value for a task, use the magic value -1 in sched_util_{min,max} with the appropriate sched_flags (SCHED_FLAG_UTIL_CLAMP_{MIN,MAX}) to indicate the reset. The advantage over the 'additional flag' approach (i.e. introducing SCHED_FLAG_UTIL_CLAMP_RESET) is that no additional flag has to be exported via uapi. This avoids the need to document how this new flag has to be used in conjunction with the existing uclamp related flags. The following subtle issue is fixed as well. When a uclamp constraint value is set on a !user_defined uclamp_se it is currently first reset and then set. Fix this by AND'ing !user_defined with !SCHED_FLAG_UTIL_CLAMP which stands for the 'sched class change' case. 
The related condition 'if (uc_se->user_defined)' moved from __setscheduler_uclamp() into uclamp_reset(). Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yun Hsiang Link: https://lkml.kernel.org/r/20201113113454.25868-1-dietmar.eggemann@arm.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c2295e608322cc2ed46f7bc785e9e092813a630f Author: YueHaibing Date: Tue Sep 22 21:24:10 2020 +0800 UPSTREAM: sched/core: Remove unused inline function uclamp_bucket_base_value() There is no caller in tree, so can remove it. Signed-off-by: YueHaibing Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20200922132410.48440-1-yuehaibing@huawei.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 384e707b762fe96e3257dc45fdd06f102077e6f0 Author: Qinglang Miao Date: Sat Jul 25 16:56:29 2020 +0800 UPSTREAM: sched/uclamp: Remove unnecessary mutex_init() The uclamp_mutex lock is initialized statically via DEFINE_MUTEX(), it is unnecessary to initialize it runtime via mutex_init(). Signed-off-by: Qinglang Miao Signed-off-by: Ingo Molnar Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Dietmar Eggemann Link: https://lore.kernel.org/r/20200725085629.98292-1-miaoqinglang@huawei.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a15a1c8215338f6381551dd7e42fe8103da1b104 Author: Qais Yousef Date: Thu Jul 16 12:03:45 2020 +0100 UPSTREAM: sched/uclamp: Add a new sysctl to control RT default boost value RT tasks by default run at the highest capacity/performance level. When uclamp is selected this default behavior is retained by enforcing the requested uclamp.min (p->uclamp_req[UCLAMP_MIN]) of the RT tasks to be uclamp_none(UCLAMP_MAX), which is SCHED_CAPACITY_SCALE; the maximum value. This is also referred to as 'the default boost value of RT tasks'. 
See commit 1a00d999971c ("sched/uclamp: Set default clamps for RT tasks"). On battery powered devices, it is desired to control this default (currently hardcoded) behavior at runtime to reduce energy consumed by RT tasks. For example, a mobile device manufacturer where big.LITTLE architecture is dominant, the performance of the little cores varies across SoCs, and on high end ones the big cores could be too power hungry. Given the diversity of SoCs, the new knob allows manufactures to tune the best performance/power for RT tasks for the particular hardware they run on. They could opt to further tune the value when the user selects a different power saving mode or when the device is actively charging. The runtime aspect of it further helps in creating a single kernel image that can be run on multiple devices that require different tuning. Keep in mind that a lot of RT tasks in the system are created by the kernel. On Android for instance I can see over 50 RT tasks, only a handful of which created by the Android framework. To control the default behavior globally by system admins and device integrator, introduce the new sysctl_sched_uclamp_util_min_rt_default to change the default boost value of the RT tasks. I anticipate this to be mostly in the form of modifying the init script of a particular device. To avoid polluting the fast path with unnecessary code, the approach taken is to synchronously do the update by traversing all the existing tasks in the system. This could race with a concurrent fork(), which is dealt with by introducing sched_post_fork() function which will ensure the racy fork will get the right update applied. Tested on Juno-r2 in combination with the RT capacity awareness [1]. By default an RT task will go to the highest capacity CPU and run at the maximum frequency, which is particularly energy inefficient on high end mobile devices because the biggest core[s] are 'huge' and power hungry. 
With this patch the RT task can be controlled to run anywhere by default, and doesn't cause the frequency to be maximum all the time. Yet any task that really needs to be boosted can easily escape this default behavior by modifying its requested uclamp.min value (p->uclamp_req[UCLAMP_MIN]) via sched_setattr() syscall. [1] 804d402fb6f6: ("sched/rt: Make RT capacity-aware") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716110347.19553-2-qais.yousef@arm.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 7f15a8da33343d817d828b5ca95d207e4a89495f Author: Qais Yousef Date: Thu Dec 2 11:20:33 2021 +0000 UPSTREAM: sched/uclamp: Fix rq->uclamp_max not set on first enqueue Commit d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq") introduced a bug where uclamp_max of the rq is not reset to match the woken up task's uclamp_max when the rq is idle. The code was relying on rq->uclamp_max initialized to zero, so on first enqueue static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, enum uclamp_id clamp_id) { ... if (uc_se->value > READ_ONCE(uc_rq->value)) WRITE_ONCE(uc_rq->value, uc_se->value); } was actually resetting it. But since commit d81ae8aac85c changed the default to 1024, this no longer works. And since rq->uclamp_flags is also initialized to 0, neither above code path nor uclamp_idle_reset() update the rq->uclamp_max on first wake up from idle. This is only visible from first wake up(s) until the first dequeue to idle after enabling the static key. And it only matters if the uclamp_max of this task is < 1024 since only then its uclamp_max will be effectively ignored. Fix it by properly initializing rq->uclamp_flags = UCLAMP_FLAG_IDLE to ensure uclamp_idle_reset() is called which then will update the rq uclamp_max value as expected. 
Bug: 254441685 Fixes: d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20211202112033.1705279-1-qais.yousef@arm.com (cherry picked from commit 315c4f884800c45cb6bd8c90422fad554a8b9588) Signed-off-by: Lee Jones Change-Id: I621fc463a3e51361516c2479aff6c80213aaf918 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3aa7ab1653f0b9cb224c095c7f9cfa92782749f0 Author: Qais Yousef Date: Thu Jun 17 17:51:55 2021 +0100 UPSTREAM: sched/uclamp: Fix uclamp_tg_restrict() Now cpu.uclamp.min acts as a protection, we need to make sure that the uclamp request of the task is within the allowed range of the cgroup, that is it is clamp()'ed correctly by tg->uclamp[UCLAMP_MIN] and tg->uclamp[UCLAMP_MAX]. As reported by Xuewen [1] we can have some corner cases where there's inversion between uclamp requested by task (p) and the uclamp values of the taskgroup it's attached to (tg). 
Following table demonstrates 2 corner cases: | p | tg | effective -----------+-----+------+----------- CASE 1 -----------+-----+------+----------- uclamp_min | 60% | 0% | 60% -----------+-----+------+----------- uclamp_max | 80% | 50% | 50% -----------+-----+------+----------- CASE 2 -----------+-----+------+----------- uclamp_min | 0% | 30% | 30% -----------+-----+------+----------- uclamp_max | 20% | 50% | 20% -----------+-----+------+----------- With this fix we get: | p | tg | effective -----------+-----+------+----------- CASE 1 -----------+-----+------+----------- uclamp_min | 60% | 0% | 50% -----------+-----+------+----------- uclamp_max | 80% | 50% | 50% -----------+-----+------+----------- CASE 2 -----------+-----+------+----------- uclamp_min | 0% | 30% | 30% -----------+-----+------+----------- uclamp_max | 20% | 50% | 30% -----------+-----+------+----------- Additionally uclamp_update_active_tasks() must now unconditionally update both UCLAMP_MIN/MAX because changing the tg's UCLAMP_MAX for instance could have an impact on the effective UCLAMP_MIN of the tasks. 
| p | tg | effective -----------+-----+------+----------- old -----------+-----+------+----------- uclamp_min | 60% | 0% | 50% -----------+-----+------+----------- uclamp_max | 80% | 50% | 50% -----------+-----+------+----------- *new* -----------+-----+------+----------- uclamp_min | 60% | 0% | *60%* -----------+-----+------+----------- uclamp_max | 80% |*70%* | *70%* -----------+-----+------+----------- [1] https://lore.kernel.org/lkml/CAB8ipk_a6VFNjiEnHRHkUMBKbA+qzPQvhtNjJ_YNzQhqV_o8Zw@mail.gmail.com/ Bug: 254441685 Fixes: 0c18f2ecfcc2 ("sched/uclamp: Fix wrong implementation of cpu.uclamp.min") Reported-by: Xuewen Yan Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210617165155.3774110-1-qais.yousef@arm.com (cherry picked from commit 0213b7083e81f4acd69db32cb72eb4e5f220329a) Signed-off-by: Lee Jones Change-Id: I128d75fea2900ec7bc360b44f18cada76c968578 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 02bc00f504e7174e58fbf90c176d37c91e1e759e Author: Qais Yousef Date: Thu Jul 16 12:03:47 2020 +0100 UPSTREAM: sched/uclamp: Fix a deadlock when enabling uclamp static key The following splat was caught when setting uclamp value of a task: BUG: sleeping function called from invalid context at ./include/linux/percpu-rwsem.h:49 cpus_read_lock+0x68/0x130 static_key_enable+0x1c/0x38 __sched_setscheduler+0x900/0xad8 Fix by ensuring we enable the key outside of the critical section in __sched_setscheduler() Bug: 254441685 Fixes: 46609ce22703 ("sched/uclamp: Protect uclamp fast path code with static key") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716110347.19553-4-qais.yousef@arm.com (cherry picked from commit e65855a52b479f98674998cb23b21ef5a8144b04) Signed-off-by: Lee Jones Change-Id: I9b33882f72b2f5a8bb8a1e077e7785f3462d1cee Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 
557b3cd9f47d60ea7d5bf8be9f6d06e34786a8ec Author: Xuewen Yan Date: Wed Jun 30 22:12:04 2021 +0800 UPSTREAM: sched/uclamp: Ignore max aggregation if rq is idle When a task wakes up on an idle rq, uclamp_rq_util_with() would max aggregate with rq value. But since there is no task enqueued yet, the values are stale based on the last task that was running. When the new task actually wakes up and enqueued, then the rq uclamp values should reflect that of the newly woken up task effective uclamp values. This is a problem particularly for uclamp_max because it default to 1024. If a task p with uclamp_max = 512 wakes up, then max aggregation would ignore the capping that should apply when this task is enqueued, which is wrong. Fix that by ignoring max aggregation if the rq is idle since in that case the effective uclamp value of the rq will be the ones of the task that will wake up. Bug: 254441685 Fixes: 9d20ad7dfc9a ("sched/uclamp: Add uclamp_util_with()") Signed-off-by: Xuewen Yan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider [qias: Changelog] Reviewed-by: Qais Yousef Link: https://lore.kernel.org/r/20210630141204.8197-1-xuewen.yan94@gmail.com (cherry picked from commit 3e1493f46390618ea78607cb30c58fc19e2a5035) Signed-off-by: Lee Jones Change-Id: I6ea180d854d9d8ffa94abdac4800c9cb130f77cf Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 04eeff8cf3fc25bde9471441080f3900ccb09c3a Author: Qais Yousef Date: Mon May 10 15:50:32 2021 +0100 UPSTREAM: sched/uclamp: Fix locking around cpu_util_update_eff() cpu_cgroup_css_online() calls cpu_util_update_eff() without holding the uclamp_mutex or rcu_read_lock() like other call sites, which is a mistake. The uclamp_mutex is required to protect against concurrent reads and writes that could update the cgroup hierarchy. The rcu_read_lock() is required to traverse the cgroup data structures in cpu_util_update_eff(). 
Surround the caller with the required locks and add some asserts to better document the dependency in cpu_util_update_eff(). Bug: 254441685 Fixes: 7226017ad37a ("sched/uclamp: Fix a bug in propagating uclamp value in new cgroups") Reported-by: Quentin Perret Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210510145032.1934078-3-qais.yousef@arm.com (cherry picked from commit 93b73858701fd01de26a4a874eb95f9b7156fd4b) Signed-off-by: Lee Jones Change-Id: I6b11073f23f58ce4c2415cdfc46140a60e3411a2 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b60f7bed3b76a71ec848c5b690f6927bf36e2f0e Author: Qais Yousef Date: Mon May 10 15:50:31 2021 +0100 UPSTREAM: sched/uclamp: Fix wrong implementation of cpu.uclamp.min cpu.uclamp.min is a protection as described in cgroup-v2 Resource Distribution Model Documentation/admin-guide/cgroup-v2.rst which means we try our best to preserve the minimum performance point of tasks in this group. See full description of cpu.uclamp.min in the cgroup-v2.rst. But the current implementation makes it a limit, which is not what was intended. For example: tg->cpu.uclamp.min = 20% p0->uclamp[UCLAMP_MIN] = 0 p1->uclamp[UCLAMP_MIN] = 50% Previous Behavior (limit): p0->effective_uclamp = 0 p1->effective_uclamp = 20% New Behavior (Protection): p0->effective_uclamp = 20% p1->effective_uclamp = 50% Which is inline with how protections should work. With this change the cgroup and per-task behaviors are the same, as expected. Additionally, we remove the confusing relationship between cgroup and !user_defined flag. We don't want for example RT tasks that are boosted by default to max to change their boost value when they attach to a cgroup. If a cgroup wants to limit the max performance point of tasks attached to it, then cpu.uclamp.max must be set accordingly. 
Or if they want to set different boost value based on cgroup, then sysctl_sched_util_clamp_min_rt_default must be used to NOT boost to max and set the right cpu.uclamp.min for each group to let the RT tasks obtain the desired boost value when attached to that group. As it stands the dependency on !user_defined flag adds an extra layer of complexity that is not required now cpu.uclamp.min behaves properly as a protection. The propagation model of effective cpu.uclamp.min in child cgroups as implemented by cpu_util_update_eff() is still correct. The parent protection sets an upper limit of what the child cgroups will effectively get. Bug: 254441685 Fixes: 3eac870a3247 (sched/uclamp: Use TG's clamps to restrict TASK's clamps) Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210510145032.1934078-2-qais.yousef@arm.com (cherry picked from commit 0c18f2ecfcc274a4bcc1d122f79ebd4001c3b445) Signed-off-by: Lee Jones Change-Id: I9f9f7b9e7ef3d19ccb1685f271639c9ed76b580f Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b2135621698d06e6a73b0f4ef15529321cc68e31 Author: Qais Yousef Date: Tue Jun 30 12:21:23 2020 +0100 BACKPORT: sched/uclamp: Protect uclamp fast path code with static key There is a report that when uclamp is enabled, a netperf UDP test regresses compared to a kernel compiled without uclamp. https://lore.kernel.org/lkml/20200529100806.GA3070@suse.de/ While investigating the root cause, there were no sign that the uclamp code is doing anything particularly expensive but could suffer from bad cache behavior under certain circumstances that are yet to be understood. https://lore.kernel.org/lkml/20200616110824.dgkkbyapn3io6wik@e107158-lin/ To reduce the pressure on the fast path anyway, add a static key that is by default will skip executing uclamp logic in the enqueue/dequeue_task() fast path until it's needed. As soon as the user start using util clamp by: 1. 
Changing uclamp value of a task with sched_setattr() 2. Modifying the default sysctl_sched_util_clamp_{min, max} 3. Modifying the default cpu.uclamp.{min, max} value in cgroup We flip the static key now that the user has opted to use util clamp. Effectively re-introducing uclamp logic in the enqueue/dequeue_task() fast path. It stays on from that point forward until the next reboot. This should help minimize the effect of util clamp on workloads that don't need it but still allow distros to ship their kernels with uclamp compiled in by default. SCHED_WARN_ON() in uclamp_rq_dec_id() was removed since now we can end up with unbalanced call to uclamp_rq_dec_id() if we flip the key while a task is running in the rq. Since we know it is harmless we just quietly return if we attempt a uclamp_rq_dec_id() when rq->uclamp[].bucket[].tasks is 0. In schedutil, we introduce a new uclamp_is_enabled() helper which takes the static key into account to ensure RT boosting behavior is retained. The following results demonstrates how this helps on 2 Sockets Xeon E5 2x10-Cores system. 
nouclamp uclamp uclamp-static-key Hmean send-64 162.43 ( 0.00%) 157.84 * -2.82%* 163.39 * 0.59%* Hmean send-128 324.71 ( 0.00%) 314.78 * -3.06%* 326.18 * 0.45%* Hmean send-256 641.55 ( 0.00%) 628.67 * -2.01%* 648.12 * 1.02%* Hmean send-1024 2525.28 ( 0.00%) 2448.26 * -3.05%* 2543.73 * 0.73%* Hmean send-2048 4836.14 ( 0.00%) 4712.08 * -2.57%* 4867.69 * 0.65%* Hmean send-3312 7540.83 ( 0.00%) 7425.45 * -1.53%* 7621.06 * 1.06%* Hmean send-4096 9124.53 ( 0.00%) 8948.82 * -1.93%* 9276.25 * 1.66%* Hmean send-8192 15589.67 ( 0.00%) 15486.35 * -0.66%* 15819.98 * 1.48%* Hmean send-16384 26386.47 ( 0.00%) 25752.25 * -2.40%* 26773.74 * 1.47%* The perf diff between nouclamp and uclamp-static-key when uclamp is disabled in the fast path: 8.73% -1.55% [kernel.kallsyms] [k] try_to_wake_up 0.07% +0.04% [kernel.kallsyms] [k] deactivate_task 0.13% -0.02% [kernel.kallsyms] [k] activate_task The diff between nouclamp and uclamp-static-key when uclamp is enabled in the fast path: 8.73% -0.72% [kernel.kallsyms] [k] try_to_wake_up 0.13% +0.39% [kernel.kallsyms] [k] activate_task 0.07% +0.38% [kernel.kallsyms] [k] deactivate_task Bug: 254441685 Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Reported-by: Mel Gorman Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20200630112123.12076-3-qais.yousef@arm.com (cherry picked from commit 46609ce227039fd192e0ecc7d940bed587fd2c78) Signed-off-by: Lee Jones Change-Id: I80555c22b856fbbd46692f83d501f03b6f393c35 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 322f697f698c34fd19271a1c21e756bbe0752aad Author: Qais Yousef Date: Tue Jun 30 12:21:22 2020 +0100 BACKPORT: sched/uclamp: Fix initialization of struct uclamp_rq struct uclamp_rq was zeroed out entirely in assumption that in the first call to uclamp_rq_inc() they'd be initialized correctly in accordance to default settings. 
But when next patch introduces a static key to skip uclamp_rq_{inc,dec}() until userspace opts in to use uclamp, schedutil will fail to perform any frequency changes because the rq->uclamp[UCLAMP_MAX].value is zeroed at init and stays as such. Which means all rqs are capped to 0 by default. Fix it by making sure we do proper initialization at init without relying on uclamp_rq_inc() doing it later. Bug: 254441685 Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Lukasz Luba Link: https://lkml.kernel.org/r/20200630112123.12076-2-qais.yousef@arm.com (cherry picked from commit d81ae8aac85ca2e307d273f6dc7863a721bf054e) Signed-off-by: Lee Jones Change-Id: I014101dbe53c85f412f87f7f6937b18d2f141800 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 416ea5e54d1471df3352093b1aa0c3b7a4341a8f Author: Quentin Perret Date: Fri Apr 30 15:14:12 2021 +0000 UPSTREAM: sched: Fix out-of-bound access in uclamp Util-clamp places tasks in different buckets based on their clamp values for performance reasons. However, the size of buckets is currently computed using a rounding division, which can lead to an off-by-one error in some configurations. For instance, with 20 buckets, the bucket size will be 1024/20=51. A task with a clamp of 1024 will be mapped to bucket id 1024/51=20. Sadly, correct indexes are in range [0,19], hence leading to an out of bound memory access. Clamp the bucket id to fix the issue. 
Bug: 186415778 Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Suggested-by: Qais Yousef Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20210430151412.160913-1-qperret@google.com (cherry picked from commit 6d2f8909a5fabb73fe2a63918117943986c39b6c) Signed-off-by: Quentin Perret Change-Id: I8097f5ed34abcff36c5ed395643d65727ea969eb Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 41abff075e5f46de03fcb89a58b41a73c3f137a6 Author: Quentin Perret Date: Thu Apr 16 09:59:56 2020 +0100 BACKPORT: sched/core: Fix reset-on-fork from RT with uclamp uclamp_fork() resets the uclamp values to their default when the reset-on-fork flag is set. It also checks whether the task has a RT policy, and sets its uclamp.min to 1024 accordingly. However, during reset-on-fork, the task's policy is lowered to SCHED_NORMAL right after, hence leading to an erroneous uclamp.min setting for the new task if it was forked from RT. Fix this by removing the unnecessary check on rt_task() in uclamp_fork() as this doesn't make sense if the reset-on-fork flag is set. 
[ qperret: BACKPORT because of a conflict with the Android-specific SUGOV_RT_MAX_FREQ sched_feat, which is equally unnecessary in this path ] Bug: 120440300 Fixes: 1a00d999971c ("sched/uclamp: Set default clamps for RT tasks") Reported-by: Chitti Babu Theegala Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Patrick Bellasi Reviewed-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20200416085956.217587-1-qperret@google.com (cherry picked from commit eaf5a92ebde5bca3bb2565616115bd6d579486cd) Signed-off-by: Quentin Perret Change-Id: I9a19ac5474d0508b8437e4a1d859573b4106ed08 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c719bd9543490474cb07d09d04cf7a01780c7a28 Author: Qais Yousef Date: Tue Jan 14 21:09:47 2020 +0000 UPSTREAM: sched/uclamp: Reject negative values in cpu_uclamp_write() The check to ensure that the new written value into cpu.uclamp.{min,max} is within range, [0:100], wasn't working because of the signed comparison 7301 if (req.percent > UCLAMP_PERCENT_SCALE) { 7302 req.ret = -ERANGE; 7303 return req; 7304 } # echo -1 > cpu.uclamp.min # cat cpu.uclamp.min 42949671.96 Cast req.percent into u64 to force the comparison to be unsigned and work as intended in capacity_from_percent(). 
# echo -1 > cpu.uclamp.min sh: write error: Numerical result out of range Bug: 120440300 Fixes: 2480c093130f ("sched/uclamp: Extend CPU's cgroup controller") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200114210947.14083-1-qais.yousef@arm.com (cherry picked from commit b562d140649966d4daedd0483a8fe59ad3bb465a) Signed-off-by: Qais Yousef Change-Id: I17fc2b119dcbffb212e130ed2c37ae3a8d5bbb61 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit bea7af6318b69aed941c04b64fb7aec6760692fa Author: Quentin Perret Date: Tue Jul 30 13:54:00 2019 +0100 BACKPORT: ANDROID: sched/core: Add a latency-sensitive flag to uclamp Add a 'latency_sensitive' flag to uclamp in order to express the need for some tasks to find a CPU where they can wake-up quickly. This is not expected to be used without cgroup support, so add solely a cgroup interface for it. As this flag represents a boolean attribute and not an amount of resources to be shared, it is not clear what the delegation logic should be. As such, it is kept simple: every new cgroup starts with latency_sensitive set to false, regardless of the parent. In essence, this is similar to SchedTune's prefer-idle flag which was used in android-4.19 and prior. Bug: 120440300 Change-Id: I722d8ecabb428bb7b95a5b54bc70a87f182dde2a Signed-off-by: Quentin Perret (cherry picked from commit ad7dd648fc7dbe11f23673a3463af2468a274998) Signed-off-by: Qais Yousef Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit dcf228d2be609f20fb826ec420e1af7074b3265a Author: Li Guanglei Date: Wed Dec 25 15:44:04 2019 +0800 FROMGIT: sched/core: Fix size of rq::uclamp initialization rq::uclamp is an array of struct uclamp_rq, make sure we clear the whole thing. 
Bug: 120440300 Fixes: 69842cba9ace ("sched/uclamp: Add CPU's clamp buckets refcounting") Signed-off-by: Li Guanglei Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Link: https://lkml.kernel.org/r/1577259844-12677-1-git-send-email-guangleix.li@gmail.com (cherry picked from commit dcd6dffb0a75741471297724640733fa4e958d72 https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core) Signed-off-by: Qais Yousef Change-Id: Id36a2b77c45e586535e8fadfb7d66868ca8fe8c7 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 2b98fd7366175ec1903c23885f0357b51bc4c929 Author: Qais Yousef Date: Tue Dec 24 11:54:04 2019 +0000 FROMGIT: sched/uclamp: Fix a bug in propagating uclamp value in new cgroups When a new cgroup is created, the effective uclamp value wasn't updated with a call to cpu_util_update_eff() that looks at the hierarchy and updates to the most restrictive values. Fix it by ensuring to call cpu_util_update_eff() when a new cgroup becomes online. Without this change, the newly created cgroup uses the default root_task_group uclamp values, which is 1024 for both uclamp_{min, max}, which will cause the rq to be clamped to max, hence cause the system to run at max frequency. The problem was observed on Ubuntu server and was reproduced on Debian and Buildroot rootfs. By default, Ubuntu and Debian create a cpu controller cgroup hierarchy and add all tasks to it - which creates enough noise to keep the rq uclamp value at max most of the time. Imitating this behavior makes the problem visible in Buildroot too which otherwise looks fine since it's a minimal userspace. 
Bug: 120440300 Fixes: 0b60ba2dd342 ("sched/uclamp: Propagate parent clamps") Reported-by: Doug Smythies Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Doug Smythies Link: https://lore.kernel.org/lkml/000701d5b965$361b6c60$a2524520$@net/ (cherry picked from commit 7226017ad37a888915628e59a84a2d1e57b40707 https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core) Signed-off-by: Qais Yousef Change-Id: I9636c60e04d58bbfc5041df1305b34a12b5a3f46 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit ca1d456778b535f2b7aa6c473d4d67e4ece5a6d0 Author: Valentin Schneider Date: Wed Dec 11 11:38:49 2019 +0000 FROMGIT: sched/uclamp: Rename uclamp_util_with() into uclamp_rq_util_with() The current helper returns (CPU) rq utilization with uclamp restrictions taken into account. A uclamp task utilization helper would be quite helpful, but this requires some renaming. Prepare the code for the introduction of a uclamp_task_util() by renaming the existing uclamp_util_with() to uclamp_rq_util_with(). 
Bug: 120440300 Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-4-valentin.schneider@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit d2b58a286e89824900d501db0be1d4f6aed474fc https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core) Signed-off-by: Qais Yousef Change-Id: I3e7146b788e079e400167203df5e5dadee2fd232 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3b187fe5f97ec4e11f313001b70ce174b068efb8 Author: Valentin Schneider Date: Wed Dec 11 11:38:48 2019 +0000 FROMGIT: sched/uclamp: Make uclamp util helpers use and return UL values Vincent pointed out recently that the canonical type for utilization values is 'unsigned long'. Internally uclamp uses 'unsigned int' values for cache optimization, but this doesn't have to be exported to its users. Make the uclamp helpers that deal with utilization use and return unsigned long values. 
Bug: 120440300 Tested-By: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Reviewed-by: Vincent Guittot Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-3-valentin.schneider@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 686516b55e98edf18c2a02d36aaaa6f4c0f6c39c https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core) Signed-off-by: Qais Yousef Change-Id: Id3837f12237e5b77eb3a236bd32457dcd7de743e Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 9f7cbbc57004a73d8550358a95c803f49a4067bb Author: Valentin Schneider Date: Wed Dec 11 11:38:47 2019 +0000 FROMGIT: sched/uclamp: Remove uclamp_util() The sole user of uclamp_util(), schedutil_cpu_util(), was made to use uclamp_util_with() instead in commit: af24bde8df20 ("sched/uclamp: Add uclamp support to energy_compute()") From then on, uclamp_util() has remained unused. Being a simple wrapper around uclamp_util_with(), we can get rid of it and win back a few lines. 
Bug: 120440300 Tested-By: Dietmar Eggemann Suggested-by: Dietmar Eggemann Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191211113851.24241-2-valentin.schneider@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 59fe675248ffc37d4167e9ec6920a2f3d5ec67bb https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core) Signed-off-by: Qais Yousef Change-Id: I11dbff80c6c4be9666438800b2527aca8cd24025 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 1e2856646d5f69ab499a1c5cd9f2e3376b7a1ef9 Author: Qais Yousef Date: Wed Oct 9 11:46:11 2019 +0100 BACKPORT: sched/rt: Make RT capacity-aware Capacity Awareness refers to the fact that on heterogeneous systems (like Arm big.LITTLE), the capacity of the CPUs is not uniform, hence when placing tasks we need to be aware of this difference of CPU capacities. In such scenarios we want to ensure that the selected CPU has enough capacity to meet the requirement of the running task. Enough capacity means here that capacity_orig_of(cpu) >= task.requirement. The definition of task.requirement is dependent on the scheduling class. For CFS, utilization is used to select a CPU that has >= capacity value than the cfs_task.util. capacity_orig_of(cpu) >= cfs_task.util DL isn't capacity aware at the moment but can make use of the bandwidth reservation to implement that in a similar manner CFS uses utilization. The following patchset implements that: https://lore.kernel.org/lkml/20190506044836.2914-1-luca.abeni@santannapisa.it/ capacity_orig_of(cpu)/SCHED_CAPACITY >= dl_deadline/dl_runtime For RT we don't have a per task utilization signal and we lack any information in general about what performance requirement the RT task needs. 
But with the introduction of uclamp, RT tasks can now control that by setting uclamp_min to guarantee a minimum performance point. ATM the uclamp value are only used for frequency selection; but on heterogeneous systems this is not enough and we need to ensure that the capacity of the CPU is >= uclamp_min. Which is what implemented here. capacity_orig_of(cpu) >= rt_task.uclamp_min Note that by default uclamp.min is 1024, which means that RT tasks will always be biased towards the big CPUs, which make for a better more predictable behavior for the default case. Must stress that the bias acts as a hint rather than a definite placement strategy. For example, if all big cores are busy executing other RT tasks we can't guarantee that a new RT task will be placed there. On non-heterogeneous systems the original behavior of RT should be retained. Similarly if uclamp is not selected in the config. [ mingo: Minor edits to comments. ] Bug: 120440300 Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191009104611.15363-1-qais.yousef@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 804d402fb6f6487b825aae8cf42fda6426c62867 https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git) [Qais: resolved minor conflict in kernel/sched/cpupri.c] Signed-off-by: Qais Yousef Change-Id: Ifc9da1c47de1aec9b4d87be2614e4c8968366900 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fbcf2f9dcec79826507ced228a9278ae404c2a8a Author: Valentin Schneider Date: Fri Nov 15 10:39:08 2019 +0000 UPSTREAM: sched/uclamp: Fix overzealous type replacement Some uclamp helpers had their return type changed from 'unsigned int' to 'enum uclamp_id' by commit 0413d7f33e60 ("sched/uclamp: Always use 'enum uclamp_id' for clamp_id values") but it happens that some do 
return a value in the [0, SCHED_CAPACITY_SCALE] range, which should really be unsigned int. The affected helpers are uclamp_none(), uclamp_rq_max_value() and uclamp_eff_value(). Fix those up. Note that this doesn't lead to any obj diff using a relatively recent aarch64 compiler (8.3-2019.03). The current code of e.g. uclamp_eff_value() properly returns an 11 bit value (bits_per(1024)) and doesn't seem to do anything funny. I'm still marking this as fixing the above commit to be on the safe side. Bug: 120440300 Signed-off-by: Valentin Schneider Reviewed-by: Qais Yousef Acked-by: Vincent Guittot Cc: Dietmar.Eggemann@arm.com Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: patrick.bellasi@matbug.net Cc: qperret@google.com Cc: surenb@google.com Cc: tj@kernel.org Fixes: 0413d7f33e60 ("sched/uclamp: Always use 'enum uclamp_id' for clamp_id values") Link: https://lkml.kernel.org/r/20191115103908.27610-1-valentin.schneider@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 7763baace1b738d65efa46d68326c9406311c6bf) Signed-off-by: Qais Yousef Change-Id: I924a99c125372a8fca81cb4bc0c82e6a7183fc8a Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit b52d3d696bf2b1aecece38539b2fcf4f7f8445f5 Author: Qais Yousef Date: Thu Nov 14 21:10:52 2019 +0000 UPSTREAM: sched/uclamp: Fix incorrect condition uclamp_update_active() should perform the update when p->uclamp[clamp_id].active is true. But when the logic was inverted in [1], the if condition wasn't inverted correctly too. 
[1] https://lore.kernel.org/lkml/20190902073836.GO2369@hirez.programming.kicks-ass.net/ Bug: 120440300 Reported-by: Suren Baghdasaryan Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Cc: Ben Segall Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Fixes: babbe170e053 ("sched/uclamp: Update CPU's refcount on TG's clamp changes") Link: https://lkml.kernel.org/r/20191114211052.15116-1-qais.yousef@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 6e1ff0773f49c7d38e8b4a9df598def6afb9f415) Signed-off-by: Qais Yousef Change-Id: I51b58a6089290277e08a0aaa72b86f852eec1512 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit bbe475a9c1f66f4da03c5870aff5883fed09eb77 Author: Qais Yousef Date: Tue Nov 5 11:22:12 2019 +0000 UPSTREAM: sched/core: Fix compilation error when cgroup not selected When cgroup is disabled the following compilation error was hit kernel/sched/core.c: In function ‘uclamp_update_active_tasks’: kernel/sched/core.c:1081:23: error: storage size of ‘it’ isn’t known struct css_task_iter it; ^~ kernel/sched/core.c:1084:2: error: implicit declaration of function ‘css_task_iter_start’; did you mean ‘__sg_page_iter_start’? [-Werror=implicit-function-declaration] css_task_iter_start(css, 0, &it); ^~~~~~~~~~~~~~~~~~~ __sg_page_iter_start kernel/sched/core.c:1085:14: error: implicit declaration of function ‘css_task_iter_next’; did you mean ‘__sg_page_iter_next’? [-Werror=implicit-function-declaration] while ((p = css_task_iter_next(&it))) { ^~~~~~~~~~~~~~~~~~ __sg_page_iter_next kernel/sched/core.c:1091:2: error: implicit declaration of function ‘css_task_iter_end’; did you mean ‘get_task_cred’? 
[-Werror=implicit-function-declaration] css_task_iter_end(&it); ^~~~~~~~~~~~~~~~~ get_task_cred kernel/sched/core.c:1081:23: warning: unused variable ‘it’ [-Wunused-variable] struct css_task_iter it; ^~ cc1: some warnings being treated as errors make[2]: *** [kernel/sched/core.o] Error 1 Fix by protecting uclamp_update_active_tasks() with CONFIG_UCLAMP_TASK_GROUP Bug: 120440300 Fixes: babbe170e053 ("sched/uclamp: Update CPU's refcount on TG's clamp changes") Reported-by: Randy Dunlap Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Tested-by: Randy Dunlap Cc: Steven Rostedt Cc: Ingo Molnar Cc: Vincent Guittot Cc: Patrick Bellasi Cc: Mel Gorman Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Ben Segall Link: https://lkml.kernel.org/r/20191105112212.596-1-qais.yousef@arm.com (cherry picked from commit e3b8b6a0d12cccf772113d6b5c1875192186fbd4) Signed-off-by: Qais Yousef Change-Id: Ia4c0f801d68050526f9f117ec9189e448b01345a Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 35f8e35625c692abfc81b698e5112058d37dacad Author: Ingo Molnar Date: Wed Sep 4 09:55:32 2019 +0200 UPSTREAM: sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code Thadeu Lima de Souza Cascardo reported that 'chrt' broke on recent kernels: $ chrt -p $$ chrt: failed to get pid 26306's policy: Argument list too long and he has root-caused the bug to the following commit increasing sched_attr size and breaking sched_read_attr() into returning -EFBIG: a509a7cd7974 ("sched/uclamp: Extend sched_setattr() to support utilization clamping") The other, bigger bug is that the whole sched_getattr() and sched_read_attr() logic of checking non-zero bits in new ABI components is arguably broken, and pretty much any extension of the ABI will spuriously break the ABI. That's way too fragile. 
Instead implement the perf syscall's extensible ABI instead, which we already implement on the sched_setattr() side: - if user-attributes have the same size as kernel attributes then the logic is unchanged. - if user-attributes are larger than the kernel knows about then simply skip the extra bits, but set attr->size to the (smaller) kernel size so that tooling can (in principle) handle older kernel as well. - if user-attributes are smaller than the kernel knows about then just copy whatever user-space can accept. Also clean up the whole logic: - Simplify the code flow - there's no need for 'ret' for example. - Standardize on 'kattr/uattr' and 'ksize/usize' naming to make sure we always know which side we are dealing with. - Why is it called 'read' when what it does is to copy to user? This code is so far away from VFS read() semantics that the naming is actively confusing. Name it sched_attr_copy_to_user() instead, which mirrors other copy_to_user() functionality. - Move the attr->size assignment from the head of sched_getattr() to the sched_attr_copy_to_user() function. Nothing else within the kernel should care about the size of the structure. With these fixes the sched_getattr() syscall now nicely supports an extensible ABI in both a forward and backward compatible fashion, and will also fix the chrt bug. As an added bonus the bogus -EFBIG return is removed as well, which as Thadeu noted should have been -E2BIG to begin with. 
Bug: 120440300 Reported-by: Thadeu Lima de Souza Cascardo Tested-by: Dietmar Eggemann Tested-by: Thadeu Lima de Souza Cascardo Acked-by: Thadeu Lima de Souza Cascardo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: a509a7cd7974 ("sched/uclamp: Extend sched_setattr() to support utilization clamping") Link: https://lkml.kernel.org/r/20190904075532.GA26751@gmail.com Signed-off-by: Ingo Molnar (cherry picked from commit 1251201c0d34fadf69d56efa675c2b7dd0a90eca) Signed-off-by: Qais Yousef Change-Id: I67e653c4f69db0140e9651c125b60e2b8cfd62f1 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 2e5fab725ddfc0db94558a0eb68fbb7584ab1a93 Author: Patrick Bellasi Date: Thu Aug 22 14:28:11 2019 +0100 UPSTREAM: sched/uclamp: Always use 'enum uclamp_id' for clamp_id values The supported clamp indexes are defined in 'enum clamp_id', however, because of the code logic in some of the first utilization clamping series version, sometimes we needed to use 'unsigned int' to represent indices. This is not more required since the final version of the uclamp_* APIs can always use the proper enum uclamp_id type. Fix it with a bulk rename now that we have all the bits merged. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . 
Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-7-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 0413d7f33e60751570fd6c179546bde2f7d82dcb) Signed-off-by: Qais Yousef Change-Id: I0be680b2489fa07244bac63b5c6fe1a79a53bef7 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 27326e00f48ecb7241256b777090282295061a10 Author: Patrick Bellasi Date: Thu Aug 22 14:28:10 2019 +0100 UPSTREAM: sched/uclamp: Update CPU's refcount on TG's clamp changes On updates of task group (TG) clamp values, ensure that these new values are enforced on all RUNNABLE tasks of the task group, i.e. all RUNNABLE tasks are immediately boosted and/or capped as requested. Do that each time we update effective clamps from cpu_util_update_eff(). Use the *cgroup_subsys_state (css) to walk the list of tasks in each affected TG and update their RUNNABLE tasks. Update each task by using the same mechanism used for cpu affinity masks updates, i.e. by taking the rq lock. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . 
Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-6-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit babbe170e053c6ec2343751749995b7b9fd5fd2c) Signed-off-by: Qais Yousef Change-Id: I5e48891bd48c266dd282e1bab8f60533e4e29b48 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit e66ba8289ce3a479f0c6c52d25307c4d2b22f256 Author: Patrick Bellasi Date: Thu Aug 22 14:28:09 2019 +0100 UPSTREAM: sched/uclamp: Use TG's clamps to restrict TASK's clamps When a task specific clamp value is configured via sched_setattr(2), this value is accounted in the corresponding clamp bucket every time the task is {en,de}queued. However, when cgroups are also in use, the task specific clamp values could be restricted by the task_group (TG) clamp values. Update uclamp_cpu_inc() to aggregate task and TG clamp values. Every time a task is enqueued, it's accounted in the clamp bucket tracking the smaller clamp between the task specific value and its TG effective value. This allows to: 1. ensure cgroup clamps are always used to restrict task specific requests, i.e. boosted not more than its TG effective protection and capped at least as its TG effective limit. 2. implement a "nice-like" policy, where tasks are still allowed to request less than what is enforced by their TG effective limits and protections Do this by exploiting the concept of "effective" clamp, which is already used by a TG to track parent enforced restrictions. Apply task group clamp restrictions only to tasks belonging to a child group. While, for tasks in the root group or in an autogroup, system defaults are still enforced. 
Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-5-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 3eac870a324728e5d17118888840dad70bcd37f3) Signed-off-by: Qais Yousef Change-Id: I0215e0a68cc0fa7c441e33052757f8571b7c99b9 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d26df4b554e45013689e091881875e767cc55122 Author: Patrick Bellasi Date: Thu Aug 22 14:28:08 2019 +0100 UPSTREAM: sched/uclamp: Propagate system defaults to the root group The clamp values are not tunable at the level of the root task group. That's for two main reasons: - the root group represents "system resources" which are always entirely available from the cgroup standpoint. - when tuning/restricting "system resources" makes sense, tuning must be done using a system wide API which should also be available when control groups are not. When a system wide restriction is available, cgroups should be aware of its value in order to know exactly how much "system resources" are available for the subgroups. Utilization clamping supports already the concepts of: - system defaults: which define the maximum possible clamp values usable by tasks. - effective clamps: which allows a parent cgroup to constraint (maybe temporarily) its descendants without losing the information related to the values "requested" from them. 
Exploit these two concepts and bind them together in such a way that, whenever system default are tuned, the new values are propagated to (possibly) restrict or relax the "effective" value of nested cgroups. When cgroups are in use, force an update of all the RUNNABLE tasks. Otherwise, keep things simple and do just a lazy update next time each task will be enqueued. Do that since we assume a more strict resource control is required when cgroups are in use. This allows also to keep "effective" clamp values updated in case we need to expose them to user-space. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-4-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 7274a5c1bbec45f06f1fff4b8c8b5855b6cc189d) Signed-off-by: Qais Yousef Change-Id: Ibf7ce5c46b67c79765b56b792ee22ed9595802c3 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 46b50a614c89320b23ab7615fb3cd92083c38bcf Author: Patrick Bellasi Date: Thu Aug 22 14:28:07 2019 +0100 UPSTREAM: sched/uclamp: Propagate parent clamps In order to properly support hierarchical resources control, the cgroup delegation model requires that attribute writes from a child group never fail but still are locally consistent and constrained based on parent's assigned resources. This requires to properly propagate and aggregate parent attributes down to its descendants. Implement this mechanism by adding a new "effective" clamp value for each task group. 
The effective clamp value is defined as the smaller value between the clamp value of a group and the effective clamp value of its parent. This is the actual clamp value enforced on tasks in a task group. Since it's possible for a cpu.uclamp.min value to be bigger than the cpu.uclamp.max value, ensure local consistency by restricting each "protection" (i.e. min utilization) with the corresponding "limit" (i.e. max utilization). Do that at effective clamps propagation to ensure all user-space write never fails while still always tracking the most restrictive values. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-3-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 0b60ba2dd342016e4e717dbaa4ca9af3a43f4434) Signed-off-by: Qais Yousef Change-Id: If1cc136e1fb4a8f4c6ea15dc440b28d833a8d7e7 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 067c0b19313dc9e3730abf9ceb5942420a2c3fdd Author: Patrick Bellasi Date: Thu Aug 22 14:28:06 2019 +0100 BACKPORT: sched/uclamp: Extend CPU's cgroup controller The cgroup CPU bandwidth controller allows to assign a specified (maximum) bandwidth to the tasks of a group. However this bandwidth is defined and enforced only on a temporal base, without considering the actual frequency a CPU is running on. Thus, the amount of computation completed by a task within an allocated bandwidth can be very different depending on the actual frequency the CPU is running that task. 
The amount of computation can be affected also by the specific CPU a task is running on, especially when running on asymmetric capacity systems like Arm's big.LITTLE. With the availability of schedutil, the scheduler is now able to drive frequency selections based on actual task utilization. Moreover, the utilization clamping support provides a mechanism to bias the frequency selection operated by schedutil depending on constraints assigned to the tasks currently RUNNABLE on a CPU. Giving the mechanisms described above, it is now possible to extend the cpu controller to specify the minimum (or maximum) utilization which should be considered for tasks RUNNABLE on a cpu. This makes it possible to better defined the actual computational power assigned to task groups, thus improving the cgroup CPU bandwidth controller which is currently based just on time constraints. Extend the CPU controller with a couple of new attributes uclamp.{min,max} which allow to enforce utilization boosting and capping for all the tasks in a group. Specifically: - uclamp.min: defines the minimum utilization which should be considered i.e. the RUNNABLE tasks of this group will run at least at a minimum frequency which corresponds to the uclamp.min utilization - uclamp.max: defines the maximum utilization which should be considered i.e. the RUNNABLE tasks of this group will run up to a maximum frequency which corresponds to the uclamp.max utilization These attributes: a) are available only for non-root nodes, both on default and legacy hierarchies, while system wide clamps are defined by a generic interface which does not depends on cgroups. This system wide interface enforces constraints on tasks in the root node. b) enforce effective constraints at each level of the hierarchy which are a restriction of the group requests considering its parent's effective constraints. Root group effective constraints are defined by the system wide interface. 
This mechanism allows each (non-root) level of the hierarchy to: - request whatever clamp values it would like to get - effectively get only up to the maximum amount allowed by its parent c) have higher priority than task-specific clamps, defined via sched_setattr(), thus allowing to control and restrict task requests. Add two new attributes to the cpu controller to collect "requested" clamp values. Allow that at each non-root level of the hierarchy. Keep it simple by not caring now about "effective" values computation and propagation along the hierarchy. Update sysctl_sched_uclamp_handler() to use the newly introduced uclamp_mutex so that we serialize system default updates with cgroup relate updates. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Michal Koutny Acked-by: Tejun Heo Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190822132811.31294-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 2480c093130f64ac3a410504fa8b3db1fc4b87ce) Signed-off-by: Qais Yousef Change-Id: I0285c44910bf073b80d7996361e6698bc5aedfae Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 64b042aac4283cf4456f213f3546daee0c731785 Author: Patrick Bellasi Date: Fri Jun 21 09:42:11 2019 +0100 UPSTREAM: sched/uclamp: Add uclamp_util_with() So far uclamp_util() allows to clamp a specified utilization considering the clamp values requested by RUNNABLE tasks in a CPU. For the Energy Aware Scheduler (EAS) it is interesting to test how clamp values will change when a task is becoming RUNNABLE on a given CPU. 
For example, EAS is interested in comparing the energy impact of different scheduling decisions and the clamp values can play a role on that. Add uclamp_util_with() which allows to clamp a given utilization by considering the possible impact on CPU clamp values of a specified task. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-11-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 9d20ad7dfc9a5cc64e33d725902d3863d350a66a) Signed-off-by: Qais Yousef Change-Id: Ida153a3526b87f5674a6e037d4725d99eec7b478 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0fc858cc7a9712c08da29adc187e69279768a6e4 Author: Patrick Bellasi Date: Fri Jun 21 09:42:10 2019 +0100 BACKPORT: sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks Each time a frequency update is required via schedutil, a frequency is selected to (possibly) satisfy the utilization reported by each scheduling class and irqs. However, when utilization clamping is in use, the frequency selection should consider userspace utilization clamping hints. This will allow, for example, to: - boost tasks which are directly affecting the user experience by running them at least at a minimum "requested" frequency - cap low priority tasks not directly affecting the user experience by running them only up to a maximum "allowed" frequency These constraints are meant to support a per-task based tuning of the frequency selection thus supporting a fine grained definition of performance boosting vs energy saving strategies in kernel space. 
Add support to clamp the utilization of RUNNABLE FAIR and RT tasks within the boundaries defined by their aggregated utilization clamp constraints. Do that by considering the max(min_util, max_util) to give boosted tasks the performance they need even when they happen to be co-scheduled with other capped tasks. Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-10-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 982d9cdc22c9f6df5ad790caa229ff74fb1d95e7) Conflicts: kernel/sched/cpufreq_schedutil.c 1. Merged the if condition to include the non-upstream sched_feat(SUGOV_RT_MAX_FREQ) check 2. Change the function signature to pass util_cfs and define util as an automatic variable. Bug: 120440300 Signed-off-by: Qais Yousef Change-Id: Ie222c9ad84776fc2948e30c116eee876df697a17 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c927abaedd42cf29f8e3dc92613e84c4375e4773 Author: Patrick Bellasi Date: Fri Jun 21 09:42:09 2019 +0100 UPSTREAM: sched/uclamp: Set default clamps for RT tasks By default FAIR tasks start without clamps, i.e. neither boosted nor capped, and they run at the best frequency matching their utilization demand. This default behavior does not fit RT tasks which instead are expected to run at the maximum available frequency, if not otherwise required by explicitly capping them. Enforce the correct behavior for RT tasks by setting util_min to max whenever: 1. the task is switched to the RT class and it does not already have a user-defined clamp value assigned. 2. 
an RT task is forked from a parent with RESET_ON_FORK set. NOTE: utilization clamp values are cross scheduling class attributes and thus they are never changed/reset once a value has been explicitly defined from user-space. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-9-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 1a00d999971c78ab024a17b0efc37d78404dd120) Signed-off-by: Qais Yousef Change-Id: I81fcadaea34f557e531fa5ac6aab84fcb0ee37c7 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit bd6028f582359eac294fe7363206330696c84b9c Author: Patrick Bellasi Date: Fri Jun 21 09:42:08 2019 +0100 UPSTREAM: sched/uclamp: Reset uclamp values on RESET_ON_FORK A forked task gets the same clamp values of its parent however, when the RESET_ON_FORK flag is set on parent, e.g. via: sys_sched_setattr() sched_setattr() __sched_setscheduler(attr::SCHED_FLAG_RESET_ON_FORK) the new forked task is expected to start with all attributes reset to default values. Do that for utilization clamp values too by checking the reset request from the existing uclamp_fork() call which already provides the required initialization for other uclamp related bits. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . 
Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-8-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit a87498ace58e23b62a572dc7267579ede4c8495c) Signed-off-by: Qais Yousef Change-Id: If7bda202707aac3a2696a42f8146f607cdd36905 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c713ddf1b1d0c7fac555eacf8579b8ffd5bdd96d Author: Patrick Bellasi Date: Fri Jun 21 09:42:07 2019 +0100 BACKPORT: sched/uclamp: Extend sched_setattr() to support utilization clamping The SCHED_DEADLINE scheduling class provides an advanced and formal model to define tasks requirements that can translate into proper decisions for both task placements and frequencies selections. Other classes have a more simplified model based on the POSIX concept of priorities. Such a simple priority based model however does not allow to exploit most advanced features of the Linux scheduler like, for example, driving frequencies selection via the schedutil cpufreq governor. However, also for non SCHED_DEADLINE tasks, it's still interesting to define tasks properties to support scheduler decisions. Utilization clamping exposes to user-space a new set of per-task attributes the scheduler can use as hints about the expected/required utilization for a task. This allows to implement a "proactive" per-task frequency control policy, a more advanced policy than the current one based just on "passive" measured task utilization. For example, it's possible to boost interactive tasks (e.g. to get better performance) or cap background tasks (e.g. to be more energy/thermal efficient). Introduce a new API to set utilization clamping values for a specified task by extending sched_setattr(), a syscall which already allows to define task specific properties for different scheduling classes. 
A new pair of attributes allows to specify a minimum and maximum utilization the scheduler can consider for a task. Do that by validating the required clamp values before and then applying the required changes using _the_ same pattern already in use for __setscheduler(). This ensures that the task is re-enqueued with the new clamp values. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-7-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit a509a7cd79747074a2c018a45bbbc52d1f4aed44) Signed-off-by: Qais Yousef Change-Id: I420e7ece5628bc639811a79654c35135a65bfd02 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 6306516692f50c3a2d2119142e393f5ef9f48259 Author: Patrick Bellasi Date: Fri Jun 21 09:42:06 2019 +0100 BACKPORT: sched/core: Allow sched_setattr() to use the current policy The sched_setattr() syscall mandates that a policy is always specified. This requires to always know which policy a task will have when attributes are configured and this makes it impossible to add more generic task attributes valid across different scheduling policies. Reading the policy before setting generic tasks attributes is racy since we cannot be sure it is not changed concurrently. Introduce the required support to change generic task attributes without affecting the current task policy. This is done by adding an attribute flag (SCHED_FLAG_KEEP_POLICY) to enforce the usage of the current policy. 
Add support for the SETPARAM_POLICY policy, which is already used by the sched_setparam() POSIX syscall, to the sched_setattr() non-POSIX syscall. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-6-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 1d6362fa0cfc8c7b243fa92924429d826599e691) Signed-off-by: Qais Yousef Change-Id: I41cbe73d7aa30123adbd757fa30e346938651784 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 62d50c5a8556377176c6e3ef1226aa968e714730 Author: Patrick Bellasi Date: Fri Jun 21 09:42:05 2019 +0100 UPSTREAM: sched/uclamp: Add system default clamps Tasks without a user-defined clamp value are considered not clamped and by default their utilization can have any value in the [0..SCHED_CAPACITY_SCALE] range. Tasks with a user-defined clamp value are allowed to request any value in that range, and the required clamp is unconditionally enforced. However, a "System Management Software" could be interested in limiting the range of clamp values allowed for all tasks. Add a privileged interface to define a system default configuration via: /proc/sys/kernel/sched_uclamp_util_{min,max} which works as an unconditional clamp range restriction for all tasks. With the default configuration, the full SCHED_CAPACITY_SCALE range of values is allowed for each clamp index. Otherwise, the task-specific clamp is capped by the corresponding system default value. 
Do that by tracking, for each task, the "effective" clamp value and bucket the task has been refcounted in at enqueue time. This allows to lazy aggregate "requested" and "system default" values at enqueue time and simplifies refcounting updates at dequeue time. The cached bucket ids are used to avoid (relatively) more expensive integer divisions every time a task is enqueued. An active flag is used to report when the "effective" value is valid and thus the task is actually refcounted in the corresponding rq's bucket. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-5-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit e8f14172c6b11e9a86c65532497087f8eb0f91b1) Signed-off-by: Qais Yousef Change-Id: I4f014c5ec9c312aaad606518f6e205fd0cfbcaa2 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 8bd52ebd588ad2c724fb789ee11b6e3179145eba Author: Patrick Bellasi Date: Fri Jun 21 09:42:04 2019 +0100 UPSTREAM: sched/uclamp: Enforce last task's UCLAMP_MAX When a task sleeps it removes its max utilization clamp from its CPU. However, the blocked utilization on that CPU can be higher than the max clamp value enforced while the task was running. This allows undesired CPU frequency increases while a CPU is idle, for example, when another CPU on the same frequency domain triggers a frequency update, since schedutil can now see the full not clamped blocked utilization of the idle CPU. 
Fix this by using: uclamp_rq_dec_id(p, rq, UCLAMP_MAX) uclamp_rq_max_value(rq, UCLAMP_MAX, clamp_value) to detect when a CPU has no more RUNNABLE clamped tasks and to flag this condition. Don't track any minimum utilization clamps since an idle CPU never requires a minimum frequency. The decay of the blocked utilization is good enough to reduce the CPU frequency. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-4-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit e496187da71070687b55ff455e7d8d7d7f0ae0b9) Signed-off-by: Qais Yousef Change-Id: Ie9eab897eb654ec9d4fba5eda20f66a91a712817 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit d3048c160f56d4003be0a9867b975140ea5efc0e Author: Patrick Bellasi Date: Fri Jun 21 09:42:03 2019 +0100 UPSTREAM: sched/uclamp: Add bucket local max tracking Because of bucketization, different task-specific clamp values are tracked in the same bucket. For example, with 20% bucket size and assuming to have: Task1: util_min=25% Task2: util_min=35% both tasks will be refcounted in the [20..39]% bucket and always boosted only up to 20% thus implementing a simple floor aggregation normally used in histograms. In systems with only few and well-defined clamp values, it would be useful to track the exact clamp value required by a task whenever possible. For example, if a system requires only 23% and 47% boost values then it's possible to track the exact boost required by each task using only 3 buckets of ~33% size each. 
Introduce a mechanism to max aggregate the requested clamp values of RUNNABLE tasks in the same bucket. Keep it simple by resetting the bucket value to its base value only when a bucket becomes inactive. Allow a limited and controlled overboosting margin for tasks recounted in the same bucket. In systems where the boost values are not known in advance, it is still possible to control the maximum acceptable overboosting margin by tuning the number of clamp groups. For example, 20 groups ensure a 5% maximum overboost. Remove the rq bucket initialization code since a correct bucket value is now computed when a task is refcounted into a CPU's rq. Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-3-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 60daf9c19410604f08c99e146bc378c8a64f4ccd) Signed-off-by: Qais Yousef Change-Id: I8782971f8867033cee5aaf981c96f9de33a5288c Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 016fa3c1ea0a4a5bed08ff564ae45903ce3dfb9b Author: Patrick Bellasi Date: Fri Jun 21 09:42:02 2019 +0100 BACKPORT: sched/uclamp: Add CPU's clamp buckets refcounting Utilization clamping allows to clamp the CPU's utilization within a [util_min, util_max] range, depending on the set of RUNNABLE tasks on that CPU. Each task references two "clamp buckets" defining its minimum and maximum (util_{min,max}) utilization "clamp values". A CPU's clamp bucket is active if there is at least one RUNNABLE tasks enqueued on that CPU and refcounting that bucket. 
When a task is {en,de}queued {on,from} a rq, the set of active clamp buckets on that CPU can change. If the set of active clamp buckets changes for a CPU a new "aggregated" clamp value is computed for that CPU. This is because each clamp bucket enforces a different utilization clamp value. Clamp values are always MAX aggregated for both util_min and util_max. This ensures that no task can affect the performance of other co-scheduled tasks which are more boosted (i.e. with higher util_min clamp) or less capped (i.e. with higher util_max clamp). A task has: task_struct::uclamp[clamp_id]::bucket_id to track the "bucket index" of the CPU's clamp bucket it refcounts while enqueued, for each clamp index (clamp_id). A runqueue has: rq::uclamp[clamp_id]::bucket[bucket_id].tasks to track how many RUNNABLE tasks on that CPU refcount each clamp bucket (bucket_id) of a clamp index (clamp_id). It also has a: rq::uclamp[clamp_id]::bucket[bucket_id].value to track the clamp value of each clamp bucket (bucket_id) of a clamp index (clamp_id). The rq::uclamp::bucket[clamp_id][] array is scanned every time it's needed to find a new MAX aggregated clamp value for a clamp_id. This operation is required only when it's dequeued the last task of a clamp bucket tracking the current MAX aggregated clamp value. In this case, the CPU is either entering IDLE or going to schedule a less boosted or more clamped task. The expected number of different clamp values configured at build time is small enough to fit the full unordered array into a single cache line, for configurations of up to 7 buckets. Add to struct rq the basic data structures required to refcount the number of RUNNABLE tasks for each clamp bucket. Add also the max aggregation required to update the rq's clamp value at each enqueue/dequeue event. Use a simple linear mapping of clamp values into clamp buckets. Pre-compute and cache bucket_id to avoid integer divisions at enqueue/dequeue time. 
Bug: 120440300 Signed-off-by: Patrick Bellasi Signed-off-by: Peter Zijlstra (Intel) Cc: Alessio Balsini Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Juri Lelli Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J . Wysocki Cc: Steve Muckle Cc: Suren Baghdasaryan Cc: Tejun Heo Cc: Thomas Gleixner Cc: Todd Kjos Cc: Vincent Guittot Cc: Viresh Kumar Link: https://lkml.kernel.org/r/20190621084217.8167-2-patrick.bellasi@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 69842cba9ace84849bb9b8edcdf2cefccd97901c) Signed-off-by: Qais Yousef Change-Id: I2c2c23572fb82e004f815cc9c783881355df6836 Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit ff72a25515d4673ccdfe6a7a8606326725ee057f Author: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> Date: Wed May 22 15:56:14 2024 +0800 cpufreq: schedutil: Checkout to msm-4.19 * From: https://github.com/EmanuelCN/kernel_xiaomi_sm8250/blob/staging/kernel/sched/cpufreq_schedutil.c * Removed sugov_get_util() under CONFIG_SCHED_WALT guard, because we don't use WALT anymore * Preserved 819f63f and c2bdfaf, also introduce new definitions from 4.19 for new schedutil Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit eecd40ea2f0a9b22cd79f288da288e1d4513c086 Author: Viresh Kumar Date: Tue May 22 15:31:30 2018 +0530 cpufreq: Rename cpufreq_can_do_remote_dvfs() This routine checks if the CPU running this code belongs to the policy of the target CPU or if not, can it do remote DVFS for it remotely. But the current name of it implies as if it is only about doing remote updates. Rename it to make it more relevant. Suggested-by: Rafael J. Wysocki Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c201be3d3447e286e7e6924dc06fd61fe923954b Author: kondors1995 Date: Mon Aug 5 10:10:39 2024 +0300 Revert "cpufreq: Avoid leaving stale IRQ work items during CPU offline" This reverts commit c0079a7b3b1a3069ec86d4bb5d870edc9b292f99. commit 69078f1d44b561a0eddceb43873b2ad5a772fe9d Author: Vincent Guittot Date: Wed Feb 6 17:14:22 2019 +0100 sched/fair: Fix O(nr_cgroups) in the load balancing path commit 039ae8bcf7a5f4476f4487e6bf816885fb3fb617 upstream. This re-applies the commit reverted here: commit c40f7d74c741 ("sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f6544b9c") I.e. now that cfs_rq can be safely removed/added in the list, we can re-apply: commit a9e7f6544b9c ("sched/fair: Fix O(nr_cgroups) in load balance path") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sargun@sargun.me Cc: tj@kernel.org Cc: xiexiuqi@huawei.com Cc: xiezhipeng1@huawei.com Link: https://lkml.kernel.org/r/1549469662-13614-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Cc: Vishnu Rangayyan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 178af989ed8209fc342dfd89bdf23b935f7a2ce1 Author: Vincent Guittot Date: Wed Feb 6 17:14:21 2019 +0100 sched/fair: Optimize update_blocked_averages() commit 31bc6aeaab1d1de8959b67edbed5c7a4b3cdbe7c upstream. Removing a cfs_rq from rq->leaf_cfs_rq_list can break the parent/child ordering of the list when it will be added back. In order to remove an empty and fully decayed cfs_rq, we must remove its children too, so they will be added back in the right order next time. With a normal decay of PELT, a parent will be empty and fully decayed if all children are empty and fully decayed too. 
In such a case, we just have to ensure that the whole branch will be added when a new task is enqueued. This is default behavior since : commit f6783319737f ("sched/fair: Fix insertion in rq->leaf_cfs_rq_list") In case of throttling, the PELT of throttled cfs_rq will not be updated whereas the parent will. This breaks the assumption made above unless we remove the children of a cfs_rq that is throttled. Then, they will be added back when unthrottled and a sched_entity will be enqueued. As throttled cfs_rq are now removed from the list, we can remove the associated test in update_blocked_averages(). Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sargun@sargun.me Cc: tj@kernel.org Cc: xiexiuqi@huawei.com Cc: xiezhipeng1@huawei.com Link: https://lkml.kernel.org/r/1549469662-13614-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Cc: Vishnu Rangayyan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 5f6df438eb114f68750cbaf73ff655cf09767aa9 Author: Vincent Guittot Date: Wed Jan 30 06:22:47 2019 +0100 sched/fair: Fix insertion in rq->leaf_cfs_rq_list commit f6783319737f28e4436a69611853a5a098cbe974 upstream. Sargun reported a crash: "I picked up c40f7d74c741a907cfaeb73a7697081881c497d0 sched/fair: Fix infinite loop in update_blocked_averages() by reverting a9e7f6544b9c and put it on top of 4.19.13. In addition to this, I uninlined list_add_leaf_cfs_rq for debugging. This revealed a new bug that we didn't get to because we kept getting crashes from the previous issue. When we are running with cgroups that are rapidly changing, with CFS bandwidth control, and in addition using the cpusets cgroup, we see this crash. Specifically, it seems to occur with cgroups that are throttled and we change the allowed cpuset." 
The algorithm used to order cfs_rq in rq->leaf_cfs_rq_list assumes that it will walk down to root the 1st time a cfs_rq is used and we will finish to add either a cfs_rq without parent or a cfs_rq with a parent that is already on the list. But this is not always true in presence of throttling. Because a cfs_rq can be throttled even if it has never been used but other CPUs of the cgroup have already used all the bandwdith, we are not sure to go down to the root and add all cfs_rq in the list. Ensure that all cfs_rq will be added in the list even if they are throttled. [ mingo: Fix !CGROUPS build. ] Reported-by: Sargun Dhillon Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: tj@kernel.org Fixes: 9c2791f936ef ("Fix hierarchical order in rq->leaf_cfs_rq_list") Link: https://lkml.kernel.org/r/1548825767-10799-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Cc: Janne Huttunen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 43bf835d9112937ae2b7d07b2b0a64b53598623d Author: Peter Zijlstra Date: Wed Jan 30 14:41:04 2019 +0100 sched/fair: Add tmp_alone_branch assertion commit 5d299eabea5a251fbf66e8277704b874bbba92dc upstream. The magic in list_add_leaf_cfs_rq() requires that at the end of enqueue_task_fair(): rq->tmp_alone_branch == &rq->lead_cfs_rq_list If this is violated, list integrity is compromised for list entries and the tmp_alone_branch pointer might dangle. Also, reflow list_add_leaf_cfs_rq() while there. This looses one indentation level and generates a form that's convenient for the next patch. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar Cc: Janne Huttunen Signed-off-by: Greg Kroah-Hartman Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 4a1c8a809ec62e802d9c2b174bcddd90750e6805 Author: Vincent Guittot Date: Fri Jun 12 17:47:03 2020 +0200 sched/pelt: Cleanup PELT divider Factorize in a single place the calculation of the divider to be used to to compute *_avg from *_sum value Suggested-by: Dietmar Eggemann Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200612154703.23555-1-vincent.guittot@linaro.org Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit ed092eafa66750326f15caa1a8e7c2f5f62f5951 Author: Dietmar Eggemann Date: Wed Jun 3 10:03:01 2020 +0200 sched/pelt: Remove redundant cap_scale() definition Besides in PELT cap_scale() is used in the Deadline scheduler class for scale-invariant bandwidth enforcement. Remove the cap_scale() definition in kernel/sched/pelt.c and keep the one in kernel/sched/sched.h. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20200603080304.16548-2-dietmar.eggemann@arm.com Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a48e6b38b3b57cb00853bd405524f2158d454ba9 Author: Chengming Zhou Date: Fri Apr 8 19:53:08 2022 +0800 UPSTREAM: sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq Since commit 23127296889f ("sched/fair: Update scale invariance of PELT") change to use rq_clock_pelt() instead of rq_clock_task(), we should also use rq_clock_pelt() for throttled_clock_task_time and throttled_clock_task accounting to get correct cfs_rq_clock_pelt() of throttled cfs_rq. And rename throttled_clock_task(_time) to be clock_pelt rather than clock_task. 
Bug: 254441685 Fixes: 23127296889f ("sched/fair: Update scale invariance of PELT") Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20220408115309.81603-1-zhouchengming@bytedance.com (cherry picked from commit 64eaf50731ac0a8c76ce2fedd50ef6652aabc5ff) Signed-off-by: Lee Jones Change-Id: I61e971d09f14708b8ee170fd5d5109144bba6e34 Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 98290e08beea376902e4e3d2eaeb3851b0192f74 Author: Vincent Guittot Date: Wed Jan 23 16:26:53 2019 +0100 UPSTREAM: sched/fair: Update scale invariance of PELT The current implementation of load tracking invariance scales the contribution with current frequency and uarch performance (only for utilization) of the CPU. One main result of this formula is that the figures are capped by current capacity of CPU. Another one is that the load_avg is not invariant because not scaled with uarch. The util_avg of a periodic task that runs r time slots every p time slots varies in the range : U * (1-y^r)/(1-y^p) * y^i < Utilization < U * (1-y^r)/(1-y^p) with U is the max util_avg value = SCHED_CAPACITY_SCALE At a lower capacity, the range becomes: U * C * (1-y^r')/(1-y^p) * y^i' < Utilization < U * C * (1-y^r')/(1-y^p) with C reflecting the compute capacity ratio between current capacity and max capacity. so C tries to compensate changes in (1-y^r') but it can't be accurate. Instead of scaling the contribution value of PELT algo, we should scale the running time. The PELT signal aims to track the amount of computation of tasks and/or rq so it seems more correct to scale the running time to reflect the effective amount of computation done since the last update. In order to be fully invariant, we need to apply the same amount of running time and idle time whatever the current capacity. 
Because running at lower capacity implies that the task will run longer, we have to ensure that the same amount of idle time will be applied when system becomes idle and no idle time has been "stolen". But reaching the maximum utilization value (SCHED_CAPACITY_SCALE) means that the task is seen as an always-running task whatever the capacity of the CPU (even at max compute capacity). In this case, we can discard this "stolen" idle times which becomes meaningless. In order to achieve this time scaling, a new clock_pelt is created per rq. The increase of this clock scales with current capacity when something is running on rq and synchronizes with clock_task when rq is idle. With this mechanism, we ensure the same running and idle time whatever the current capacity. This also enables to simplify the pelt algorithm by removing all references of uarch and frequency and applying the same contribution to utilization and loads. Furthermore, the scaling is done only once per update of clock (update_rq_clock_task()) instead of during each update of sched_entities and cfs/rt/dl_rq of the rq like the current implementation. This is interesting when cgroup are involved as shown in the results below: On a hikey (octo Arm64 platform). Performance cpufreq governor and only shallowest c-state to remove variance generated by those power features so we only track the impact of pelt algo. 
each test runs 16 times: ./perf bench sched pipe (higher is better) kernel tip/sched/core + patch ops/seconds ops/seconds diff cgroup root 59652(+/- 0.18%) 59876(+/- 0.24%) +0.38% level1 55608(+/- 0.27%) 55923(+/- 0.24%) +0.57% level2 52115(+/- 0.29%) 52564(+/- 0.22%) +0.86% hackbench -l 1000 (lower is better) kernel tip/sched/core + patch duration(sec) duration(sec) diff cgroup root 4.453(+/- 2.37%) 4.383(+/- 2.88%) -1.57% level1 4.859(+/- 8.50%) 4.830(+/- 7.07%) -0.60% level2 5.063(+/- 9.83%) 4.928(+/- 9.66%) -2.66% Then, the responsiveness of PELT is improved when CPU is not running at max capacity with this new algorithm. I have put below some examples of duration to reach some typical load values according to the capacity of the CPU with current implementation and with this patch. These values has been computed based on the geometric series and the half period value: Util (%) max capacity half capacity(mainline) half capacity(w/ patch) 972 (95%) 138ms not reachable 276ms 486 (47.5%) 30ms 138ms 60ms 256 (25%) 13ms 32ms 26ms On my hikey (octo Arm64 platform) with schedutil governor, the time to reach max OPP when starting from a null utilization, decreases from 223ms with current scale invariance down to 121ms with the new algorithm. 
Bug: 120440300 Change-Id: I0bd4ed2317f2a9a965634e53ce1476417af697a6 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: patrick.bellasi@arm.com Cc: pjt@google.com Cc: pkondeti@codeaurora.org Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Link: https://lkml.kernel.org/r/1548257214-13745-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 23127296889fe84b0762b191b5d041e8ba6f2599) Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0a7060dfca03fe7a078974f343372ad623e7a25d Author: Vincent Guittot Date: Wed Jan 23 16:26:52 2019 +0100 UPSTREAM: sched/fair: Move the rq_of() helper function Move rq_of() helper function so it can be used in pelt.c [ mingo: Improve readability while at it. 
] Bug: 120440300 Change-Id: I2133979476631d68baaffcaa308f4cdab94f22b1 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: dietmar.eggemann@arm.com Cc: patrick.bellasi@arm.com Cc: pjt@google.com Cc: pkondeti@codeaurora.org Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: srinivas.pandruvada@linux.intel.com Cc: thara.gopinath@linaro.org Link: https://lkml.kernel.org/r/1548257214-13745-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit 62478d9911fab9694c195f0ca8e4701de09be98e) Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 4ed482487f9d5ad5d11417da5376c596ffc2956c Author: Dietmar Eggemann Date: Fri Aug 3 15:05:38 2018 +0100 UPSTREAM: sched/fair: Remove setting task's se->runnable_weight during PELT update A CFS (SCHED_OTHER, SCHED_BATCH or SCHED_IDLE policy) task's se->runnable_weight must always be in sync with its se->load.weight. se->runnable_weight is set to se->load.weight when the task is forked (init_entity_runnable_average()) or reniced (reweight_entity()). There are two cases in set_load_weight() which since they currently only set se->load.weight could lead to a situation in which se->load.weight is different to se->runnable_weight for a CFS task: (1) A task switches to SCHED_IDLE. (2) A SCHED_FIFO, SCHED_RR or SCHED_DEADLINE task which has been reniced (during which only its static priority gets set) switches to SCHED_OTHER or SCHED_BATCH. Set se->runnable_weight to se->load.weight in these two cases to prevent this. This eliminates the need to explicitly set it to se->load.weight during PELT updates in the CFS scheduler fastpath. 
Bug: 120440300 Change-Id: I52184a9e1fd53cb42ef3ae546b1fae78b744c9ad Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Cc: Joel Fernandes Cc: Linus Torvalds Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Quentin Perret Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20180803140538.1178-1-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 4a465e3ebbc8004ce4f7f08f6022ee8315a94edf) Signed-off-by: Quentin Perret Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a97016777175a6305b14bbae29f4ce0a3a3930cd Author: Vincent Guittot Date: Fri Dec 14 23:10:06 2018 +0100 sched/pelt: Fix warning and clean up IRQ PELT config Commit 11d4afd4ff667f9b6178ee8c142c36cb78bd84db upstream. Create a config for enabling irq load tracking in the scheduler. irq load tracking is useful only when irq or paravirtual time is accounted but it's only possible with SMP for now. Also use __maybe_unused to remove the compilation warning in update_rq_clock_task() that has been introduced by: 2e62c4743adc ("sched/fair: Remove #ifdefs from scale_rt_capacity()") Suggested-by: Ingo Molnar Reported-by: Dou Liyang Reported-by: Miguel Ojeda Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bp@alien8.de Cc: dou_liyang@163.com Fixes: 2e62c4743adc ("sched/fair: Remove #ifdefs from scale_rt_capacity()") Link: http://lkml.kernel.org/r/1537867062-27285-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 8c3f9eb95d52cb68c7f62e2cfcf99a662008e4f7 Author: Vincent Guittot Date: Thu Jul 19 14:00:06 2018 +0200 sched/fair: Remove #ifdefs from scale_rt_capacity() Reuse cpu_util_irq() that has been defined for schedutil and set irq util to 0 when !CONFIG_IRQ_TIME_ACCOUNTING. 
But the compiler is not able to optimize the sequence (at least with aarch64 GCC 7.2.1): free *= (max - irq); free /= max; when irq is fixed to 0 Add a new inline function scale_irq_capacity() that will scale utilization when irq is accounted. Reuse this funciton in schedutil which applies similar formula. Suggested-by: Ingo Molnar Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: rjw@rjwysocki.net Link: http://lkml.kernel.org/r/1532001606-6689-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit bf59401f1068438e811e2bb93f5b2ac3c88f1b52 Author: Vincent Guittot Date: Fri Aug 31 17:22:55 2018 +0200 sched/pelt: Fix update_blocked_averages() for RT and DL classes update_blocked_averages() is called to periodiccally decay the stalled load of idle CPUs and to sync all loads before running load balance. When cfs rq is idle, it trigs a load balance during pick_next_task_fair() in order to potentially pull tasks and to use this newly idle CPU. This load balance happens whereas prev task from another class has not been put and its utilization updated yet. This may lead to wrongly account running time as idle time for RT or DL classes. Test that no RT or DL task is running when updating their utilization in update_blocked_averages(). We still update RT and DL utilization instead of simply skipping them to make sure that all metrics are synced when used during load balance. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 371bf4273269 ("sched/rt: Add rt_rq utilization tracking") Fixes: 3727e0e16340 ("sched/dl: Add dl_rq utilization tracking") Link: http://lkml.kernel.org/r/1535728975-22799-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit cc819e146c895dd29b3fe93fbd558ccd0a190ed6 Author: Vincent Guittot Date: Thu Jun 28 17:45:12 2018 +0200 sched/core: Use PELT for scale_rt_capacity() The utilization of the CPU by RT, DL and IRQs are now tracked with PELT so we can use these metrics instead of rt_avg to evaluate the remaining capacity available for CFS class. scale_rt_capacity() behavior has been changed and now returns the remaining capacity available for CFS instead of a scaling factor because RT, DL and IRQ provide now absolute utilization value. The same formula as schedutil is used: IRQ util_avg + (1 - IRQ util_avg / max capacity ) * /Sum rq util_avg but the implementation is different because it doesn't return the same value and doesn't benefit of the same optimization. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-10-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 0f6598d6f0def162745d059a67d1148cf42cfaa3 Author: Vincent Guittot Date: Thu Jun 28 17:45:09 2018 +0200 sched/irq: Add IRQ utilization tracking interrupt and steal time are the only remaining activities tracked by rt_avg. Like for sched classes, we can use PELT to track their average utilization of the CPU. But unlike sched class, we don't track when entering/leaving interrupt; Instead, we take into account the time spent under interrupt context when we update rqs' clock (rq_clock_task). This also means that we have to decay the normal context time and account for interrupt time during the update. That's also important to note that because: rq_clock == rq_clock_task + interrupt time and rq_clock_task is used by a sched class to compute its utilization, the util_avg of a sched class only reflects the utilization of the time spent in normal context and not of the whole time of the CPU. The utilization of interrupt gives an more accurate level of utilization of CPU. The CPU utilization is: avg_irq + (1 - avg_irq / max capacity) * /Sum avg_rq Most of the time, avg_irq is small and neglictible so the use of the approximation CPU utilization = /Sum avg_rq was enough. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c616a7241810a508cb961a6e1658748113161221 Author: Vincent Guittot Date: Thu Jun 28 17:45:07 2018 +0200 sched/dl: Add dl_rq utilization tracking Similarly to what happens with RT tasks, CFS tasks can be preempted by DL tasks and the CFS's utilization might no longer describes the real utilization level. Current DL bandwidth reflects the requirements to meet deadline when tasks are enqueued but not the current utilization of the DL sched class. We track DL class utilization to estimate the system utilization. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-5-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit abc5cabb9b98b262baad2250bbafbe54d8f2fc3e Author: Vincent Guittot Date: Thu Jun 28 17:45:05 2018 +0200 sched/rt: Add rt_rq utilization tracking schedutil governor relies on cfs_rq's util_avg to choose the OPP when CFS tasks are running. When the CPU is overloaded by CFS and RT tasks, CFS tasks are preempted by RT tasks and in this case util_avg reflects the remaining capacity but not what CFS want to use. In such case, schedutil can select a lower OPP whereas the CPU is overloaded. In order to have a more accurate view of the utilization of the CPU, we track the utilization of RT tasks. Only util_avg is correctly tracked but not load_avg and runnable_load_avg which are useless for rt_rq. rt_rq uses rq_clock_task and cfs_rq uses cfs_rq_clock_task but they are the same at the root group level, so the PELT windows of the util_sum are aligned. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 7ce46e2c80931a4da93dbe7674998fee851c8f1f Author: Vincent Guittot Date: Thu Jun 28 17:45:04 2018 +0200 sched/pelt: Move PELT related code in a dedicated file We want to track rt_rq's utilization as a part of the estimation of the whole rq's utilization. This is necessary because rt tasks can steal utilization to cfs tasks and make them lighter than they are. As we want to use the same load tracking mecanism for both and prevent useless dependency between cfs and rt code, PELT code is moved in a dedicated file. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Morten.Rasmussen@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: claudio@evidence.eu.com Cc: daniel.lezcano@linaro.org Cc: dietmar.eggemann@arm.com Cc: joel@joelfernandes.org Cc: juri.lelli@redhat.com Cc: luca.abeni@santannapisa.it Cc: patrick.bellasi@arm.com Cc: quentin.perret@arm.com Cc: rjw@rjwysocki.net Cc: valentin.schneider@arm.com Cc: viresh.kumar@linaro.org Link: http://lkml.kernel.org/r/1530200714-4504-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit a7703d111f4372d3eba166af9ebd9092ff3dc66b Author: Vincent Guittot Date: Thu Nov 16 15:21:52 2017 +0100 sched/fair: Update and fix the runnable propagation rule Unlike running, the runnable part can't be directly propagated through the hierarchy when we migrate a task. The main reason is that runnable time can be shared with other sched_entities that stay on the rq and this runnable time will also remain on prev cfs_rq and must not be removed. Instead, we can estimate what should be the new runnable of the prev cfs_rq and check that this estimation stay in a possible range. The prop_runnable_sum is a good estimation when adding runnable_sum but fails most often when we remove it. Instead, we could use the formula below instead: gcfs_rq's runnable_sum = gcfs_rq->avg.load_sum / gcfs_rq->load.weight which assumes that tasks are equally runnable which is not true but easy to compute. Beside these estimates, we have several simple rules that help us to filter out wrong ones: - ge->avg.runnable_sum <= than LOAD_AVG_MAX - ge->avg.runnable_sum >= ge->avg.running_sum (ge->avg.util_sum << LOAD_AVG_MAX) - ge->avg.runnable_sum can't increase when we detach a task The effect of these fixes is better cgroups balancing. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Chris Mason Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Tejun Heo Cc: Thomas Gleixner Cc: Yuyang Du Link: http://lkml.kernel.org/r/1510842112-21028-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 480aacf775e2ec594e0fed31b15678ef967f29fb Author: Peter Zijlstra Date: Thu Aug 24 13:06:35 2017 +0200 sched/fair: Update calc_group_*() comments I had a wee bit of trouble recalling how the calc_group_runnable() stuff worked.. add hopefully better comments. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 5b49acfeddbd52712395f011c4ab6e4695863738 Author: Josef Bacik Date: Thu Aug 3 11:13:39 2017 -0400 sched/fair: Calculate runnable_weight slightly differently Our runnable_weight currently looks like this runnable_weight = shares * runnable_load_avg / load_avg The goal is to scale the runnable weight for the group based on its runnable to load_avg ratio. The problem with this is it biases us towards tasks that never go to sleep. Tasks that go to sleep are going to have their runnable_load_avg decayed pretty hard, which will drastically reduce the runnable weight of groups with interactive tasks. To solve this imbalance we tweak this slightly, so in the ideal case it is still the above, but in the interactive case it is runnable_weight = shares * runnable_weight / load_weight which will make the weight distribution fairer between interactive and non-interactive groups. 
Signed-off-by: Josef Bacik Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: linux-kernel@vger.kernel.org Cc: riel@redhat.com Cc: tj@kernel.org Link: http://lkml.kernel.org/r/1501773219-18774-2-git-send-email-jbacik@fb.com Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 7be10d3b02e103159be137da4bc8fd9ebfda2b81 Author: Peter Zijlstra Date: Fri May 12 14:18:10 2017 +0200 sched/fair: Implement more accurate async detach The problem with the overestimate is that it will subtract too big a value from the load_sum, thereby pushing it down further than it ought to go. Since runnable_load_avg is not subject to a similar 'force', this results in the occasional 'runnable_load > load' situation. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit eebab46a1ff61d392a032456a0274185ec09c78d Author: Peter Zijlstra Date: Fri May 12 14:16:30 2017 +0200 sched/fair: Align PELT windows between cfs_rq and its se The PELT _sum values are a saw-tooth function, dropping on the decay edge and then growing back up again during the window. When these window-edges are not aligned between cfs_rq and se, we can have the situation where, for example, on dequeue, the se decays first. Its _sum values will be small(er), while the cfs_rq _sum values will still be on their way up. Because of this, the subtraction: cfs_rq->avg._sum -= se->avg._sum will result in a positive value. This will then, once the cfs_rq reaches an edge, translate into its _avg value jumping up. This is especially visible with the runnable_load bits, since they get added/subtracted a lot. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit dd849181691e6b895696b331c9a750139a1bffd4 Author: Peter Zijlstra Date: Thu May 11 17:57:24 2017 +0200 sched/fair: Implement synchonous PELT detach on load-balance migrate Vincent wondered why his self migrating task had a roughly 50% dip in load_avg when landing on the new CPU. This is because we uncondionally take the asynchronous detatch_entity route, which can lead to the attach on the new CPU still seeing the old CPU's contribution to tg->load_avg, effectively halving the new CPU's shares. While in general this is something we have to live with, there is the special case of runnable migration where we can do better. Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit fa4e5dee6619a0a3c5368e25147263b8576318e4 Author: Peter Zijlstra Date: Sat May 6 15:59:54 2017 +0200 sched/fair: Propagate an effective runnable_load_avg The load balancer uses runnable_load_avg as load indicator. For !cgroup this is: runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq That is, a direct sum of all runnable tasks on that runqueue. As opposed to load_avg, which is a sum of all tasks on the runqueue, which includes a blocked component. However, in the cgroup case, this comes apart since the group entities are always runnable, even if most of their constituent entities are blocked. Therefore introduce a runnable_weight which for task entities is the same as the regular weight, but for group entities is a fraction of the entity weight and represents the runnable part of the group runqueue. 
Then propagate this load through the PELT hierarchy to arrive at an effective runnable load average -- which we should not confuse with the canonical runnable load average.
Also, two atomic ops on a single cacheline is already more expensive than an uncontended lock. Since we want to add more, convert the thing over to an explicit cacheline with a lock in. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 938acabba842513db34a25ace967f7771998ee29 Author: Vincent Guittot Date: Wed May 17 11:50:45 2017 +0200 sched/fair: Use reweight_entity() for set_user_nice() Now that we directly change load_avg and propagate that change into the sums, sys_nice() and co should do the same, otherwise its possible to confuse load accounting when we migrate near the weight change. Fixes-by: Josef Bacik Signed-off-by: Vincent Guittot [ Added changelog, fixed the call condition. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20170517095045.GA8420@linaro.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit c1d25beea75097359c3d0300330e2bba0106cff0 Author: Peter Zijlstra Date: Sat May 6 16:11:34 2017 +0200 sched/fair: More accurate reweight_entity() When a (group) entity changes it's weight we should instantly change its load_avg and propagate that change into the sums it is part of. Because we use these values to predict future behaviour and are not interested in its historical value. Without this change, the change in load would need to propagate through the average, by which time it could again have changed etc.. always chasing itself. With this change, the cfs_rq load_avg sum will more accurately reflect the current runnable and expected return of blocked load. 
Reported-by: Paul Turner [josef: compile fix !SMP || !FAIR_GROUP] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 3092e27836e4e71089267a27e5a9ab4100627618 Author: Peter Zijlstra Date: Thu Aug 24 17:45:35 2017 +0200 sched/fair: Introduce {en,de}queue_load_avg() Analogous to the existing {en,de}queue_runnable_load_avg() add helpers for {en,de}queue_load_avg(). More users will follow. Includes some code movement to avoid fwd declarations. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 2d9eb6d1ce196555a3571506a0566791f53da0d4 Author: Peter Zijlstra Date: Thu Aug 24 17:38:30 2017 +0200 sched/fair: Rename {en,de}queue_entity_load_avg() Since they're now purely about runnable_load, rename them. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit beda9e28dbb0eb34d787d4cbac2bc8399fa6f1a6 Author: Peter Zijlstra Date: Sat May 6 17:37:03 2017 +0200 sched/fair: Move enqueue migrate handling Move the entity migrate handling from enqueue_entity_load_avg() to update_load_avg(). This has two benefits: - {en,de}queue_entity_load_avg() will become purely about managing runnable_load - we can avoid a double update_tg_load_avg() and reduce pressure on the global tg->shares cacheline The reason we do this is so that we can change update_cfs_shares() to change both weight and (future) runnable_weight. 
For this to work we need to have the cfs_rq averages up-to-date (which means having done the attach), but we need the cfs_rq->avg.runnable_avg to not yet include the se's contribution (since se->on_rq == 0). Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 94f87f43b19ceeecc31c9e7169533ad64efeb085 Author: Peter Zijlstra Date: Sat May 6 17:32:43 2017 +0200 sched/fair: Change update_load_avg() arguments Most call sites of update_load_avg() already have cfs_rq_of(se) available, pass it down instead of recomputing it. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit f1bfd1f0c849264e58ce1415ddea244440a92797 Author: Peter Zijlstra Date: Sat May 6 16:42:08 2017 +0200 sched/fair: Remove se->load.weight from se->avg.load_sum Remove the load from the load_sum for sched_entities, basically turning load_sum into runnable_sum. This prepares for better reweighting of group entities. Since we now have different rules for computing load_avg, split ___update_load_avg() into two parts, ___update_load_sum() and ___update_load_avg(). So for se: ___update_load_sum(.weight = 1) ___update_load_avg(.weight = se->load.weight) and for cfs_rq: ___update_load_sum(.weight = cfs_rq->load.weight) ___update_load_avg(.weight = 1) Since the primary consumable is load_avg, most things will not be affected. Only those few sites that initialize/modify load_sum need attention. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 9600274a37f34758ce3dfda9f5257b7737f298bb Author: Peter Zijlstra Date: Thu May 11 18:16:06 2017 +0200 sched/fair: Cure calc_cfs_shares() vs. reweight_entity() Vincent reported that when running in a cgroup, his root cfs_rq->avg.load_avg dropped to 0 on task idle. This is because reweight_entity() will now immediately propagate the weight change of the group entity to its cfs_rq, and as it happens, our approximation (5) for calc_cfs_shares() results in 0 when the group is idle. Avoid this by using the correct (3) as a lower bound on (5). This way the empty cgroup will slowly decay instead of instantly drop to 0. Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 664ab7a3e73cbb37c84125925f2cbbc91ed7fdca Author: Peter Zijlstra Date: Tue May 9 11:04:07 2017 +0200 sched/fair: Add comment to calc_cfs_shares() Explain the magic equation in calc_cfs_shares() a bit better. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit 8cdb20cedab8458aafc8d4766ecb08a28d6ad076 Author: Peter Zijlstra Date: Sat May 6 16:03:17 2017 +0200 sched/fair: Clean up calc_cfs_shares() For consistency's sake, we should have only a single reading of tg->shares. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> commit e4f0f21e76d364601055d5d1f14eae828fd4383a Author: kondors1995 Date: Sat Aug 17 12:00:36 2024 +0300 Revert "UPSTREAM: sched/uclamp: Add CPU's clamp buckets refcounting" This reverts commit 116cf381f43abb961d1c7276c9be64b03a91cbfc. commit 8a408ac031b6990f1a2d538680ffac88e42e477c Author: kondors1995 Date: Sat Aug 17 12:00:36 2024 +0300 Revert "UPSTREAM: sched/uclamp: Add bucket local max tracking" This reverts commit a684fdc2dec4430a1c96b34360a7eccee2dc16d1. commit 6f2949c1073b97efb28f05d086fdc9e473d421c4 Author: kondors1995 Date: Sat Aug 17 12:00:35 2024 +0300 Revert "UPSTREAM: sched/uclamp: Enforce last task's UCLAMP_MAX" This reverts commit 41ce2236860d4f0e9c7e9f7a3659080c19bbdd66. commit 86d7baea30401efcc1d609c619c1526a952e6419 Author: kondors1995 Date: Sat Aug 17 12:00:35 2024 +0300 Revert "UPSTREAM: sched/uclamp: Add system default clamps" This reverts commit 5bee2de619c41ffb7ba39cb6df3db0b2ed041c08. commit 9d758d18c03983c60e0aba8bfd98ca0084e7d7c4 Author: kondors1995 Date: Sat Aug 17 12:00:33 2024 +0300 Revert "UPSTREAM: sched/core: Allow sched_setattr() to use the current policy" This reverts commit 38b1e6723b4accdd747cc75ab954f337559efad9. commit 164913a4e0ab61a201d7802eb0de81099453971a Author: kondors1995 Date: Sat Aug 17 12:00:20 2024 +0300 Revert "UPSTREAM: sched/uclamp: Extend sched_setattr() to support utilization clamping" This reverts commit 486d86679747400e8b0cf39e2b0d35562f9a9989. commit 98262c0640ec825cc93c5faf076c7d24ae8ae6a7 Author: kondors1995 Date: Sat Aug 17 11:59:57 2024 +0300 Revert "UPSTREAM: sched/uclamp: Reset uclamp values on RESET_ON_FORK" This reverts commit 13caaea80c308cbc2600644ecd2cbca95f5e5095. 
commit 17a9d13f5c3bc718f83c71b7b18b40a6525f22c0 Author: kondors1995 Date: Sat Aug 17 11:59:57 2024 +0300 Revert "UPSTREAM: sched/uclamp: Set default clamps for RT tasks" This reverts commit 1e06d5830af9e0952a8e311786e6b11b0deb492a. commit 65d7db99c966d5c853da01eb8897be026f5674ed Author: kondors1995 Date: Sat Aug 17 11:59:57 2024 +0300 Revert "BACKPORT: sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks" This reverts commit 1f42ab6cc15a3611c4015feca3787d78d07c56d7. commit 318ea0855d8084332eb060ca8948bb970cf08129 Author: kondors1995 Date: Sat Aug 17 11:59:57 2024 +0300 Revert "UPSTREAM: sched/uclamp: Add uclamp_util_with()" This reverts commit e6a7b88cc257c9318d5a6aa2ad1116bcb4be0770. commit f1ec7c15ea378c0fa4089a36d753e99054265c10 Author: kondors1995 Date: Sat Aug 17 11:59:57 2024 +0300 Revert "UPSTREAM: sched/uclamp: Extend CPU's cgroup controller" This reverts commit 6a413999607b529bf0e33ce12ca1df84302e9f14. commit 8814a297260c377a08f12350ab47f302d333043b Author: kondors1995 Date: Sat Aug 17 11:59:56 2024 +0300 Revert "UPSTREAM: sched/uclamp: Propagate parent clamps" This reverts commit 4963c9685e76a78b8cdbb8cc9843970f607e6c32. commit fe9d3b66e407939f83c8b9ee26b9318ec4316609 Author: kondors1995 Date: Sat Aug 17 11:59:56 2024 +0300 Revert "UPSTREAM: sched/uclamp: Propagate system defaults to the root group" This reverts commit a3bbe484012e0e1343e1be875f3abca40d79cd04. commit f6256062570adc1d7e261ffff855f0fa1128d8b1 Author: kondors1995 Date: Sat Aug 17 11:59:56 2024 +0300 Revert "UPSTREAM: sched/uclamp: Use TG's clamps to restrict TASK's clamps" This reverts commit dbb3b287e2dd3c012bce9e90e5f2e5007ef8d5de. commit 21d5ea4fd377bedd6ad5b49a829fa7b3867d9bd6 Author: kondors1995 Date: Sat Aug 17 11:59:56 2024 +0300 Revert "UPSTREAM: sched/uclamp: Update CPU's refcount on TG's clamp changes" This reverts commit b5c75307c4f7017a3a340514917bb28cd7d0acae. 
commit 2d9b5a47c47a33c965f744ce34c8924524558bae Author: kondors1995 Date: Sat Aug 17 11:59:56 2024 +0300 Revert "UPSTREAM: sched/uclamp: Always use 'enum uclamp_id' for clamp_id values" This reverts commit a479e1de9bc60243dcafea50d9a52bf784e1ba0f. commit 962708c53ab41d17d9233543581553226f6da382 Author: kondors1995 Date: Sat Aug 17 11:59:51 2024 +0300 Revert "UPSTREAM: sched/core: Fix uclamp ABI bug, clean up and robustify sched_read_attr() ABI logic and code" This reverts commit fe18af67327ccfb1fe1f49fbf85884e18cf39a58. commit f1bb535dbf77aec377d06a795a52c3e1c3489ab9 Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "UPSTREAM: sched/core: Fix compilation error when cgroup not selected" This reverts commit 4f89a28e12c87ebcea8bc11d44e60c54f1c38021. commit 7e33f0d51debf2dd24a33ea7fbb8ac19edb42b1e Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "sched/uclamp: Reject negative values in cpu_uclamp_write()" This reverts commit a12fbe805e840557b28f97ecce6c4749be7a5daa. commit 546d51a84878a1046177b739ccdfbfe4748818b4 Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "sched/core: Fix size of rq::uclamp initialization" This reverts commit c7be60603f26b5ad2ac973670d2ca99e1586ea48. commit f8456cecf7592fd67d02f5250579d5332a3e168d Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "sched/core: Fix reset-on-fork from RT with uclamp" This reverts commit fbeca2f3560cb9601c015b4b7e2986dfd59e6113. commit 7f497bb9e5d2be6a0182e30b1b9cd6feef3b70ab Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "BACKPORT: sched/uclamp: Remove uclamp_util()" This reverts commit b6f58cb73defb3e7c62c4ef73caa1f6913b9d48c. commit 1573bacd6d6455ddcf9aa62e6ed71080ee8fe73d Author: kondors1995 Date: Sat Aug 17 11:59:19 2024 +0300 Revert "UPSTREAM: sched/uclamp: Make uclamp util helpers use and return UL values" This reverts commit 1fbeb27caf01456e489b3c14dbdbd9d50816edd5. 
commit 33c125b0a8d140d9f6333d1334784b6eaae89009 Author: kondors1995 Date: Sat Aug 17 11:59:18 2024 +0300 Revert "UPSTREAM: sched/uclamp: Fix initialization of struct uclamp_rq" This reverts commit 294a7066186224c38540adb652e852762a5e60cc. commit fd06089869bde9880e4953e1f43f288db2fc5b14 Author: kondors1995 Date: Sat Aug 17 11:59:18 2024 +0300 Revert "BACKPORT: sched/uclamp: Protect uclamp fast path code with static key" This reverts commit b2f4e5d8c3f7684796b442a2cecfecd93f25a675. commit 592e4a8ed2c0a59fac12c39b2a023c3517c7c801 Author: kondors1995 Date: Sat Aug 17 11:59:18 2024 +0300 Revert "UPSTREAM: sched/uclamp: Fix a deadlock when enabling uclamp static key" This reverts commit 3eb05145c9dbdc5cb046ae836b330a3f4a2df427. commit 60dc9ca3b6caf6db7cb3b209527111812b78587b Author: kondors1995 Date: Sat Aug 17 11:59:13 2024 +0300 Revert "BACKPORT: sched/uclamp: Add a new sysctl to control RT default boost value" This reverts commit 1abcbcbea7d4c646092ba9a2418cf9fe83121a6b. commit 72296d4745d8b53ccedd64edfaa455ca6c8fae9c Author: kondors1995 Date: Sat Aug 17 11:58:42 2024 +0300 Revert "ANDROID: sched/core: Add a latency-sensitive flag to uclamp" This reverts commit b5fa516b3aa41f6095b4badca38273f064f434ea. commit 9b478422f4676d419918afb41a71015a02f79a84 Author: kondors1995 Date: Sat Aug 17 11:58:42 2024 +0300 Revert "ANDROID: sched: Introduce uclamp latency and boost wrapper" This reverts commit fe73bc33dfdeb0c59b9ef177af505402a09b4fd7. commit 2de1a0534caaa6fa2255b9eda5d64fb2b9daa209 Author: kondors1995 Date: Sat Aug 17 11:58:42 2024 +0300 Revert "sched/fair: Modify boosted_task_util() to reflect uclamp changes" This reverts commit 8c84f3cd7fa1ef844aca19ceb2ad11e89750af27. commit 523dab791c4ebd52b8b6473d52967edeb7bdbfd8 Author: kondors1995 Date: Sat Aug 17 11:58:36 2024 +0300 Revert "Revert "sched/fair: Revert Google's capacity margin hacks"" This reverts commit b623172b47e4655c5eab885f4eb8517ba2392928. 
commit 43bd6d52219dcca4cdbd64129e4c4b6919549d9a Author: kondors1995 Date: Sat Aug 17 11:58:36 2024 +0300 Revert "Revert "sched: Stub prefer_high_cap"" This reverts commit 8e1140bfebc45a509e28bc4962316487c2410792. commit 64662ef0b3e7c6b11d7516ebc3f83483a1fa71a9 Author: kondors1995 Date: Sat Aug 17 11:58:30 2024 +0300 Revert "Revert "Revert "sched: separate boost signal from placement hint""" This reverts commit 79111a20d915fbc86c41239bbc4b0b4d10b364a5. commit 1bac20806128df71140bd14c52b83dc1117ade38 Author: kondors1995 Date: Sat Aug 17 11:57:50 2024 +0300 Revert "Revert "Revert "sched/fair: check if mid capacity cpu exists""" This reverts commit 7508259cb6aad46cd1d4a0d9cdb2c918411e3b92. commit 42e0ef5e1b4f56d6337c1a6f3504d275cbc9a043 Author: kondors1995 Date: Sat Aug 17 11:56:58 2024 +0300 Revert "Revert "Revert "sched: separate capacity margin for boosted tasks""" This reverts commit 435cf700fa3b7e968b90f0061cb95260f62ece7e. commit eab623c38f37d7cc0973478f55256173ce3d213d Author: kondors1995 Date: Sat Aug 17 11:56:57 2024 +0300 Revert "Revert "Revert "sched: change capacity margin down to 20% for non-boost task""" This reverts commit 0567c2dfa0b34ad7b492f075d5ce3bae4d84977f. commit 81f6c4feb22ee8da71f7adde5513ce1ae8a48346 Author: kondors1995 Date: Sat Aug 17 11:56:57 2024 +0300 Revert "Revert "Revert "sched/fair: use actual cpu capacity to calculate boosted util""" This reverts commit 9468b2ce801ec8799f68742e52aa1ed58d587a21. commit c5bff7085b144149dc1ed774a3312c19a1669103 Author: kondors1995 Date: Sat Aug 17 11:56:57 2024 +0300 Revert "Revert "Revert "sched/fair: do not use boosted margin for prefer_high_cap case""" This reverts commit 25f6b813b40bc4eddf9dee0ea675208e8ff624ef. commit d5cc737eb99b7aeced3b5d1546799ce1cffe0129 Author: kondors1995 Date: Sat Aug 17 11:56:57 2024 +0300 Revert "Revert "Revert "Revert "sched/fair: use actual cpu capacity to calculate boosted util"""" This reverts commit 607d0c1f5ac9dff3b34cc7874d9b07c5d2b71bfb. 
commit 944f85f208f1a520eb9759571d49f75db84b3ffc Author: kondors1995 Date: Sat Aug 17 11:56:50 2024 +0300 Revert "sched/fair: Make boosted and prefer_idle tunables uclamp aware" This reverts commit 8d583957240d990d35b3ae951e09c914b77f7c46. commit ff7fc5b1110dd9b31817b7186467938974ed8017 Author: kondors1995 Date: Sat Aug 17 11:56:43 2024 +0300 Revert "sched/uclamp: Fix incorrect uclamp.latency_sensitive setting" This reverts commit 282a8ac6f588cce1aebf7b01f3281cba98b4bf0d. commit 4e5d3a1c362e4c3616126dfc3422d6207eeae977 Author: kondors1995 Date: Sat Aug 17 11:56:43 2024 +0300 Revert "sched/uclamp: Make uclamp_boosted() return proper boosted value" This reverts commit e1ffe11cccd14b460289c0d3e5f7bb0bb1717ac5. commit b6ae08ea8bbe41f3c4b90b303e2c39a4c7c5f7b3 Author: kondors1995 Date: Sat Aug 17 11:56:10 2024 +0300 Revert "sched/uclamp: Allow to reset a task uclamp constraint value" This reverts commit b6b69eece539ebeed9bd2276a9a1dcedb28a7665. commit 658ddb506825027979ee263aff8d55e0cff3dd98 Author: kondors1995 Date: Sat Aug 17 11:55:14 2024 +0300 Revert "UPSTREAM: sched/uclamp: Fix incorrect condition" This reverts commit ce1a62791ee282cad50a8c37ff6a0083174ceacd. commit 1c5a70035dafad8750eafa8c5ee40b11115be18e Author: kondors1995 Date: Sat Aug 17 11:55:14 2024 +0300 Revert "FROMGIT: sched/uclamp: Fix a bug in propagating uclamp value in new cgroups" This reverts commit 16d73cafeabe4c9726f170839baa325b9d6399fa. commit ed892012da806647a464df46203dad13ca07e4d8 Author: kondors1995 Date: Sat Aug 17 11:55:14 2024 +0300 Revert "sched/uclamp: Fix locking around cpu_util_update_eff()" This reverts commit e954be1dbf9565e866a0a2e191d88916076a8892. commit 505f58a688443900fdaeb4cca4e3ca66b0cf867c Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "sched/uclamp: Remove unnecessary mutex_init()" This reverts commit 31714e404d1478317f97bf068ddf8890e8c6df52. 
commit 8f96c904caa429cb4e413bae75e746a38b9f1214 Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "FROMLIST: sched: Fix out-of-bound access in uclamp" This reverts commit 973affb92e72ba195d187c904e4fe5ab0cd5e478. commit 29df4134d5a23f686000ebfd31ee3edaf2f6bb44 Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "sched/uclamp: Fix wrong implementation of cpu.uclamp.min" This reverts commit 07c3a9af2f16b0bdbda148b62f64441075d1eafb. commit 4efefc8fcc48eaff905fc548099b5c5f181e8c8c Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "sched/uclamp: Fix uclamp_tg_restrict()" This reverts commit 76563866876422268a39e26ade95c39c73933f4b. commit 7d5d5fe448300a61d6718919d3c38277e32285db Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "sched/uclamp: Ignore max aggregation if rq is idle" This reverts commit 08a6710f896689e691316ea2f617bc8524badf4c. commit 488c9711417e6fc0d07401e928e9523bd241915c Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "ANDROID: sched: Make uclamp changes depend on CAP_SYS_NICE" This reverts commit 957d737358746d018b17a024d914b7637301e970. commit 1ac8237c8049014ea11f5505c95fb0c91d5608d1 Author: kondors1995 Date: Sat Aug 17 11:55:13 2024 +0300 Revert "sched: Fix UCLAMP_FLAG_IDLE setting" This reverts commit e9a6a0f1062d90141caf298c4b1fd797056da4d6. commit 0ef1e5ecb196bd556499ba0a7021a53ed87cadc4 Author: kondors1995 Date: Sat Aug 17 11:54:15 2024 +0300 Revert "sched/uclamp: Fix rq->uclamp_max not set on first enqueue" This reverts commit 977a80d7951e9cc356e85378bdbe856a6fab3fb8. commit b69fc8ea5fbcc014e4a8fdafefb468dc348baa38 Author: kondors1995 Date: Sat Aug 17 11:53:53 2024 +0300 Revert "sched/fair: Reduce busy load balance interval" This reverts commit c54355abd4ab0e8d81c3ba5e5d22f9934183aabc. 
commit 45daeebd161dab7ff6550030d0a8bcb302b2c271 Author: kondors1995 Date: Sat Aug 17 11:53:40 2024 +0300 Revert "sched/fair: Reduce minimal imbalance threshold" This reverts commit 47a4df59c1bc5de153cacc79c287a0b4626591e7. commit 8557606f8d15209e2f68ff49485e62fb97a441c3 Author: kondors1995 Date: Sat Aug 17 11:53:32 2024 +0300 Revert "sched/deadline: Fix stale throttling on de-/boosted tasks" This reverts commit 1f2196de96395b2125287bb8df3838d67495b752. commit 705944ee989af6d24932895c541abca24caa1b21 Author: kondors1995 Date: Sat Aug 17 11:53:22 2024 +0300 Revert "sched: fair: consider all running tasks in cpu for load balance" This reverts commit c1e04a4d1360eac58d9e731b93bb4676a08b9b99. commit ecffc3948595a8cb3c246135082ee17ab7d414ca Author: kondors1995 Date: Sat Aug 17 11:53:03 2024 +0300 Revert "sched/fair: Revert Google's capacity margin hacks" This reverts commit e7c82d8c4debb8e558d36eb81507f86e223298fa. commit c064aca8d86c768eb1ebf8536c25e49718cb55b9 Author: kondors1995 Date: Sat Aug 17 11:48:56 2024 +0300 Revert "BACKPORT: sched/fair: Make task_fits_capacity() consider uclamp restrictions" This reverts commit a59a899e6a995cd2ba80a5b041d89353e6afb117. 
# Conflicts: # include/linux/sched/sysctl.h # kernel/sched/fair.c # kernel/sched/sched.h # kernel/sched/tune.c --- Documentation/cgroup-v2.txt | 34 ++++++++++ Documentation/scheduler/sched-tune.txt | 4 +- include/linux/sched/sysctl.h | 8 +-- kernel/sched/core.c | 84 ++++++++++++++++++++++--- kernel/sched/cpupri.c | 25 +++++++- kernel/sched/cpupri.h | 5 +- kernel/sched/fair.c | 72 ++++++--------------- kernel/sched/rt.c | 81 +++++++++++++++++++----- kernel/sched/sched.h | 86 ++++++-------------------- kernel/sched/tune.c | 5 +- kernel/sched/tune.h | 4 +- 11 files changed, 252 insertions(+), 156 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 7e3672812510..6d247618dca2 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -902,6 +902,13 @@ controller implements weight and absolute bandwidth limit models for normal scheduling policy and absolute bandwidth allocation model for realtime scheduling policy. +In all the above models, cycles distribution is defined only on a temporal +base and it does not account for the frequency at which tasks are executed. +The (optional) utilization clamping support allows to hint the schedutil +cpufreq governor about the minimum desired frequency which should always be +provided by a CPU, as well as the maximum desired frequency, which should not +be exceeded by a CPU. + CPU Interface Files ~~~~~~~~~~~~~~~~~~~ @@ -964,6 +971,33 @@ All time durations are in microseconds. Shows pressure stall information for CPU. See Documentation/accounting/psi.txt for details. + cpu.uclamp.min + A read-write single value file which exists on non-root cgroups. + The default is "0", i.e. no utilization boosting. + + The requested minimum utilization (protection) as a percentage + rational number, e.g. 12.34 for 12.34%. + + This interface allows reading and setting minimum utilization clamp + values similar to the sched_setattr(2). 
This minimum utilization + value is used to clamp the task specific minimum utilization clamp. + + The requested minimum utilization (protection) is always capped by + the current value for the maximum utilization (limit), i.e. + `cpu.uclamp.max`. + + cpu.uclamp.max + A read-write single value file which exists on non-root cgroups. + The default is "max". i.e. no utilization capping + + The requested maximum utilization (limit) as a percentage rational + number, e.g. 98.76 for 98.76%. + + This interface allows reading and setting maximum utilization clamp + values similar to the sched_setattr(2). This maximum utilization + value is used to clamp the task specific maximum utilization clamp. + + Memory ------ diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt index 1a103715f7bd..be728705fe25 100644 --- a/Documentation/scheduler/sched-tune.txt +++ b/Documentation/scheduler/sched-tune.txt @@ -233,9 +233,9 @@ Thus, with the sched_cfs_boost enabled we have the following main functions to get the current utilization of a CPU: cpu_util() - boosted_cpu_util() + stune_util() -The new boosted_cpu_util() is similar to the first but returns a boosted +The new stune_util() is similar to the first but returns a boosted utilization signal which is a function of the sched_cfs_boost value. 
This function is used in the CFS scheduler code paths where schedutil needs to diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 2ee605775225..4091e5547a69 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -116,16 +116,16 @@ extern int sched_rt_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_updown_migrate_handler(struct ctl_table *table, + int write, void __user *buffer, + size_t *lenp, loff_t *ppos); + #ifdef CONFIG_UCLAMP_TASK extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif -extern int sched_updown_migrate_handler(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos); - extern int sysctl_numa_balancing(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa4811170da3..d64bc47e0532 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -45,6 +45,7 @@ #include "sched.h" #include "walt.h" +#include "tune.h" #include "../workqueue_internal.h" #include "../smpboot.h" @@ -800,7 +801,7 @@ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; * This knob will not override the system default sched_util_clamp_min defined * above. 
*/ -unsigned int sysctl_sched_uclamp_util_min_rt_default = 0; +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -836,12 +837,7 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1); } -static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) -{ - return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); -} - -static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id) +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -884,7 +880,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, } static inline -enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, +unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; @@ -1450,6 +1446,40 @@ static void uclamp_post_fork(struct task_struct *p) uclamp_update_util_min_rt_default(p); } +#ifdef CONFIG_SMP +unsigned int uclamp_task(struct task_struct *p) +{ + unsigned long util; + + util = task_util_est(p); + util = max(util, uclamp_eff_value(p, UCLAMP_MIN)); + util = min(util, uclamp_eff_value(p, UCLAMP_MAX)); + + return util; +} + +bool uclamp_boosted(struct task_struct *p) +{ + return uclamp_eff_value(p, UCLAMP_MIN) > 0; +} + +bool uclamp_latency_sensitive(struct task_struct *p) +{ +#ifdef CONFIG_UCLAMP_TASK_GROUP + struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id); + struct task_group *tg; + + if (!css) + return false; + tg = container_of(css, struct task_group, css); + + return tg->latency_sensitive; +#else + return false; +#endif +} +#endif /* CONFIG_SMP */ + static void __init init_uclamp_rq(struct rq *rq) { enum uclamp_id clamp_id; 
@@ -1501,6 +1531,41 @@ static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } + +long schedtune_task_margin(struct task_struct *task); + +#ifdef CONFIG_SMP +unsigned int uclamp_task(struct task_struct *p) +{ + unsigned long util = task_util_est(p); +#ifdef CONFIG_SCHED_TUNE + long margin = schedtune_task_margin(p); + + trace_sched_boost_task(p, util, margin); + + util += margin; +#endif + + return util; +} + +bool uclamp_boosted(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_TUNE + return schedtune_task_boost(p) > 0; +#endif + return false; +} + +bool uclamp_latency_sensitive(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_TUNE + return schedtune_prefer_idle(p) != 0; +#endif + return false; +} +#endif /* CONFIG_SMP */ + static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -8273,6 +8338,9 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css) enum uclamp_id clamp_id; unsigned int clamps; + lockdep_assert_held(&uclamp_mutex); + SCHED_WARN_ON(!rcu_read_lock_held()); + css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index f37daebed44e..487e4fdf5055 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -55,6 +55,8 @@ static int convert_prio(int prio) * @cp: The cpupri context * @p: The task * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * @fitness_fn: A pointer to a function to do custom checks whether the CPU + * fits a specific criteria so that we only return those CPUs. * * Note: This function returns the recommended CPUs as calculated during the * current invocation. 
By the time the call returns, the CPUs may have in @@ -66,7 +68,8 @@ static int convert_prio(int prio) * Return: (int)bool - CPUs were found */ int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask) + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)) { int idx = 0; int task_pri = convert_prio(p->prio); @@ -107,6 +110,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, continue; if (lowest_mask) { + int cpu; + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); cpumask_andnot(lowest_mask, lowest_mask, cpu_isolated_mask); @@ -119,7 +124,23 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, * condition, simply act as though we never hit this * priority level and continue on. */ - if (cpumask_any(lowest_mask) >= nr_cpu_ids) + if (cpumask_empty(lowest_mask)) + continue; + + if (!fitness_fn) + return 1; + + /* Ensure the capacity of the CPUs fit the task */ + for_each_cpu(cpu, lowest_mask) { + if (!fitness_fn(p, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + } + + /* + * If no CPU at the current priority can fit the task + * continue looking + */ + if (cpumask_empty(lowest_mask)) continue; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index bab050019071..c08add835730 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -22,8 +22,9 @@ struct cpupri { }; #ifdef CONFIG_SMP -int cpupri_find(struct cpupri *cp, - struct task_struct *p, struct cpumask *lowest_mask); +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask, + bool (*fitness_fn)(struct task_struct *p, int cpu)); void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 797fd81f470c..339ae3761ba6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4113,7 +4113,7 @@ static inline unsigned long _task_util_est(struct task_struct *p) 
return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); } -static inline unsigned long task_util_est(struct task_struct *p) +unsigned long task_util_est(struct task_struct *p) { #ifdef CONFIG_SCHED_WALT if (likely(!walt_disabled && sysctl_sched_use_walt_task_util)) @@ -4132,7 +4132,7 @@ static inline unsigned long uclamp_task_util(struct task_struct *p) #else static inline unsigned long uclamp_task_util(struct task_struct *p) { - return boosted_task_util(p); + return task_util_est(p); } #endif @@ -6875,19 +6875,23 @@ schedtune_margin(unsigned long signal, long boost, long capacity) return margin; } -static inline int -schedtune_cpu_margin(unsigned long util, int cpu) +inline long +schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p) { - int boost = schedtune_cpu_boost(cpu); + int boost = schedtune_cpu_boost_with(cpu, p); + long margin; if (boost == 0) - return 0; + margin = 0; + else + margin = schedtune_margin(util, boost); - return schedtune_margin(util, boost, capacity_orig_of(cpu)); + trace_sched_boost_cpu(cpu, util, margin); + + return margin; } -static inline long -schedtune_task_margin(struct task_struct *task) +long schedtune_task_margin(struct task_struct *task) { int boost = schedtune_task_boost(task); unsigned long util; @@ -6904,50 +6908,14 @@ schedtune_task_margin(struct task_struct *task) #else /* CONFIG_SCHED_TUNE */ -static inline int -schedtune_cpu_margin(unsigned long util, int cpu) -{ - return 0; -} - -static inline int -schedtune_task_margin(struct task_struct *task) +inline long +schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p) { return 0; } #endif /* CONFIG_SCHED_TUNE */ -unsigned long -boosted_cpu_util(int cpu, struct sched_walt_cpu_load *walt_load) -{ - unsigned long util = cpu_util_freq(cpu, walt_load); - long margin = schedtune_cpu_margin(util, cpu); - - trace_sched_boost_cpu(cpu, util, margin); - - return util + margin; -} - -static inline unsigned long -boosted_task_util(struct 
task_struct *task) -{ -#ifdef CONFIG_UCLAMP_TASK_GROUP - unsigned long util = task_util_est(task); - unsigned long util_min = uclamp_eff_value(task, UCLAMP_MIN); - unsigned long util_max = uclamp_eff_value(task, UCLAMP_MAX); - - return clamp(util, util_min, util_max); -#else - unsigned long util = task_util_est(task); - long margin = schedtune_task_margin(task); - - trace_sched_boost_task(task, util, margin); - - return util + margin; -#endif -} - static unsigned long cpu_util_without(int cpu, struct task_struct *p); static unsigned long capacity_spare_without(int cpu, struct task_struct *p) @@ -7465,7 +7433,7 @@ static inline int select_idle_sibling_cstate_aware(struct task_struct *p, int pr continue; /* figure out if the task can fit here at all */ - new_usage = boosted_task_util(p); + new_usage = uclamp_task(p); capacity_orig = capacity_orig_of(i); if (new_usage > capacity_orig) @@ -7627,7 +7595,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, bool prefer_idle, struct find_best_target_env *fbt_env) { - unsigned long min_util = boosted_task_util(p); + unsigned long min_util = uclamp_task(p); unsigned long target_capacity = ULONG_MAX; unsigned long min_wake_util = ULONG_MAX; unsigned long target_max_spare_cap = 0; @@ -7724,10 +7692,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, if (sched_cpu_high_irqload(i)) continue; - /* Skip CPUs which do not fit task requirements */ - if (capacity_of(i) < boosted_task_util(p)) - continue; - /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double @@ -8280,7 +8244,7 @@ static inline struct energy_env *get_eenv(struct task_struct *p, int prev_cpu) * util for group utilization calculations */ eenv->util_delta = task_util_est(p); - eenv->util_delta_boosted = boosted_task_util(p); + eenv->util_delta_boosted = uclamp_task(p); cpumask_and(&cpumask_possible_cpus, p->cpus_ptr, cpu_online_mask); 
eenv->max_cpu_count = cpumask_weight(&cpumask_possible_cpus); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index d6ece8c45020..8cd41e4b39e9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -458,6 +458,45 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se) return rt_se->on_rq; } +#ifdef CONFIG_UCLAMP_TASK +/* + * Verify the fitness of task @p to run on @cpu taking into account the uclamp + * settings. + * + * This check is only important for heterogeneous systems where uclamp_min value + * is higher than the capacity of a @cpu. For non-heterogeneous system this + * function will always return true. + * + * The function will return true if the capacity of the @cpu is >= the + * uclamp_min and false otherwise. + * + * Note that uclamp_min will be clamped to uclamp_max if uclamp_min + * > uclamp_max. + */ +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned int min_cap; + unsigned int max_cap; + unsigned int cpu_cap; + + /* Only heterogeneous systems can benefit from this check */ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return true; + + min_cap = uclamp_eff_value(p, UCLAMP_MIN); + max_cap = uclamp_eff_value(p, UCLAMP_MAX); + + cpu_cap = capacity_orig_of(cpu); + + return cpu_cap >= min(min_cap, max_cap); +} +#else +static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) +{ + return true; +} +#endif + #ifdef CONFIG_RT_GROUP_SCHED static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) @@ -1481,6 +1520,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) { struct task_struct *curr; struct rq *rq; + bool test; /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) @@ -1512,11 +1552,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. 
+ * + * We take into account the capacity of the CPU to ensure it fits the + * requirement of the task - which is only important on heterogeneous + * systems like big.LITTLE. */ - if (energy_aware() || + test = energy_aware() || (curr && unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio))) { + curr->prio <= p->prio)); + + if (test || !rt_task_fits_capacity(p, cpu)) { int target = find_lowest_rq(p); /* @@ -1540,15 +1586,15 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) * let's hope p can move out. */ if (rq->curr->nr_cpus_allowed == 1 || - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + !cpupri_find(&rq->rd->cpupri, rq->curr, NULL, NULL)) return; /* * p is migratable, so let's not schedule it and * see if it is pushed or pulled somewhere else. */ - if (p->nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, NULL)) + if (p->nr_cpus_allowed != 1 && + cpupri_find(&rq->rd->cpupri, p, NULL, NULL)) return; /* @@ -1706,7 +1752,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, p->cpus_ptr) && + rt_task_fits_capacity(p, cpu)) return 1; return 0; } @@ -1850,7 +1897,8 @@ static int find_lowest_rq(struct task_struct *task) if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ - if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask, + rt_task_fits_capacity)) return -1; /* No targets found */ if (energy_aware()) @@ -2368,12 +2416,14 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && - !test_tsk_need_resched(rq->curr) && - p->nr_cpus_allowed > 1 && - (dl_task(rq->curr) || rt_task(rq->curr)) && - (rq->curr->nr_cpus_allowed < 2 || - rq->curr->prio <= p->prio)) + bool need_to_push = 
!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + p->nr_cpus_allowed > 1 && + (dl_task(rq->curr) || rt_task(rq->curr)) && + (rq->curr->nr_cpus_allowed < 2 || + rq->curr->prio <= p->prio); + + if (need_to_push || !rt_task_fits_capacity(p, cpu_of(rq))) push_rt_tasks(rq); } @@ -2446,7 +2496,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) */ if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP - if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) + bool need_to_push = rq->rt.overloaded || + !rt_task_fits_capacity(p, cpu_of(rq)); + + if (p->nr_cpus_allowed > 1 && need_to_push) queue_push_tasks(rq); #endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d2fa46b12a42..9fb0b3530f94 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -427,8 +427,6 @@ struct task_group { struct uclamp_se uclamp[UCLAMP_CNT]; /* Latency-sensitive flag used for a task group */ unsigned int latency_sensitive; - /* Boosted flag for a task group */ - unsigned int boosted; #endif }; @@ -953,15 +951,16 @@ struct rq { unsigned long nr_load_updates; u64 nr_switches; + struct cfs_rq cfs; + struct rt_rq rt; + struct dl_rq dl; + #ifdef CONFIG_UCLAMP_TASK /* Utilization clamp values based on CPU's RUNNABLE tasks */ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; unsigned int uclamp_flags; #define UCLAMP_FLAG_IDLE 0x01 #endif - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -2323,7 +2322,7 @@ cpu_util_freq_walt(int cpu, struct sched_walt_cpu_load *walt_load) static inline unsigned long cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) { - return cpu_util_freq_walt(cpu, walt_load); + return min(cpu_util(cpu), capacity_orig_of(cpu)); } #else @@ -2412,12 +2411,6 @@ static inline unsigned long cpu_util(int cpu) return min(__cpu_util(cpu) + cpu_util_rt(cpu_rq(cpu)), 
capacity_orig_of(cpu)); } -static inline unsigned long -cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) -{ - return min(cpu_util(cpu), capacity_orig_of(cpu)); -} - extern unsigned int capacity_margin_freq; static inline unsigned long @@ -2774,6 +2767,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_SCHED_WALT + +static inline bool +walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p) +{ + return cpu_of(rq) == task_cpu(p) && + (p->on_rq || p->last_sleep_ts >= rq->window_start); +} + +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); @@ -2892,60 +2896,10 @@ static inline bool uclamp_rq_is_idle(struct rq *rq) } #endif /* CONFIG_UCLAMP_TASK */ -#ifdef CONFIG_UCLAMP_TASK_GROUP -static inline bool uclamp_latency_sensitive(struct task_struct *p) -{ - struct cgroup_subsys_state *css = task_css(p, cpuset_cgrp_id); - struct task_group *tg; - - if (!css) - return false; - - if (!strlen(css->cgroup->kn->name)) - return 0; - - tg = container_of(css, struct task_group, css); - - return tg->latency_sensitive; -} - -static inline bool uclamp_boosted(struct task_struct *p) -{ - struct cgroup_subsys_state *css = task_css(p, cpuset_cgrp_id); - struct task_group *tg; - - if (!css) - return false; - - if (!strlen(css->cgroup->kn->name)) - return 0; - - tg = container_of(css, struct task_group, css); - - return tg->boosted; -} -#else -static inline bool uclamp_latency_sensitive(struct task_struct *p) -{ - return false; -} - -static inline bool uclamp_boosted(struct task_struct *p) -{ - return false; -} -#endif /* CONFIG_UCLAMP_TASK_GROUP */ - -#ifdef CONFIG_SCHED_WALT - -static inline bool -walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p) -{ - return cpu_of(rq) == task_cpu(p) && - (p->on_rq || 
p->last_sleep_ts >= rq->window_start); -} - -#endif /* CONFIG_SCHED_WALT */ +unsigned long task_util_est(struct task_struct *p); +unsigned int uclamp_task(struct task_struct *p); +bool uclamp_latency_sensitive(struct task_struct *p); +bool uclamp_boosted(struct task_struct *p); #ifdef arch_scale_freq_capacity #ifndef arch_scale_freq_invariant diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index eb1028697665..5f5c94e2a1df 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -529,10 +529,11 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu) raw_spin_unlock_irqrestore(&bg->lock, irq_flags); } -int schedtune_cpu_boost(int cpu) +int schedtune_cpu_boost_with(int cpu, struct task_struct *p) { struct boost_groups *bg; u64 now; + int task_boost = p ? schedtune_task_boost(p) : -100; bg = &per_cpu(cpu_boost_groups, cpu); now = sched_clock_cpu(cpu); @@ -541,7 +542,7 @@ int schedtune_cpu_boost(int cpu) if (schedtune_boost_timeout(now, bg->boost_ts)) schedtune_cpu_update(cpu, now); - return bg->boost_max; + return max(bg->boost_max, task_boost); } int schedtune_task_boost(struct task_struct *p) diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 9508c151a42b..4ab18eddd8e6 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -12,7 +12,7 @@ struct target_nrg { struct reciprocal_value rdiv; }; -int schedtune_cpu_boost(int cpu); +int schedtune_cpu_boost_with(int cpu, struct task_struct *p); int schedtune_task_boost(struct task_struct *tsk); int schedtune_task_boost_rcu_locked(struct task_struct *tsk); @@ -23,7 +23,7 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #else /* CONFIG_SCHED_TUNE */ -#define schedtune_cpu_boost(cpu) 0 +#define schedtune_cpu_boost_with(cpu, p) 0 #define schedtune_task_boost(tsk) 0 #define schedtune_prefer_idle(tsk) 0 From 4915f9a08afc3a883b4f011267469ff119d2cc69 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 8 May 2023 08:17:44 +0200 Subject: [PATCH 20/30] Revert "softirq: Let ksoftirqd 
do its job" This reverts the following commits: 4cd13c21b207 ("softirq: Let ksoftirqd do its job") 3c53776e29f8 ("Mark HI and TASKLET softirq synchronous") 1342d8080f61 ("softirq: Don't skip softirq execution when softirq thread is parking") in a single change to avoid known bad intermediate states introduced by a patch series reverting them individually. Due to the mentioned commit, when the ksoftirqd threads take charge of softirq processing, the system can experience high latencies. In the past a few workarounds have been implemented for specific side-effects of the initial ksoftirqd enforcement commit: commit 1ff688209e2e ("watchdog: core: make sure the watchdog_worker is not deferred") commit 8d5755b3f77b ("watchdog: softdog: fire watchdog even if softirqs do not get to run") commit 217f69743681 ("net: busy-poll: allow preemption in sk_busy_loop()") commit 3c53776e29f8 ("Mark HI and TASKLET softirq synchronous") But the latency problem still exists in real-life workloads, see the link below. The reverted commit intended to solve a live-lock scenario that can now be addressed with the NAPI threaded mode, introduced with commit 29863d41bb6e ("net: implement threaded-able napi poll loop support"), which is nowadays in a pretty stable status. While a complete solution to put softirq processing under nice resource control would be preferable, that has proven to be a very hard task. In the short term, remove the main pain point, and also simplify a bit the current softirq implementation. Signed-off-by: Paolo Abeni Signed-off-by: Thomas Gleixner Tested-by: Jason Xing Reviewed-by: Jakub Kicinski Reviewed-by: Eric Dumazet Reviewed-by: Sebastian Andrzej Siewior Cc: "Paul E. 
McKenney" Cc: Peter Zijlstra Cc: netdev@vger.kernel.org Link: https://lore.kernel.org/netdev/305d7742212cbe98621b16be782b0562f1012cb6.camel@redhat.com Link: https://lore.kernel.org/r/57e66b364f1b6f09c9bc0316742c3b14f4ce83bd.1683526542.git.pabeni@redhat.com (cherry picked from commit d15121be7485655129101f3960ae6add40204463) Change-Id: If014afbfa3bb56f7c490a22b8334857c8308f901 Signed-off-by: Alexander Winkowski --- kernel/softirq.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index 4896a0eb178e..5f8ebcccbb0a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -77,21 +77,6 @@ static void wakeup_softirqd(void) wake_up_process(tsk); } -/* - * If ksoftirqd is scheduled, we do not want to process pending softirqs - * right now. Let ksoftirqd handle this at its own rate, to get fairness, - * unless we're doing some of the synchronous softirqs. - */ -#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ)) -static bool ksoftirqd_running(unsigned long pending) -{ - struct task_struct *tsk = __this_cpu_read(ksoftirqd); - - if (pending & SOFTIRQ_NOW_MASK) - return false; - return tsk && (tsk->state == TASK_RUNNING); -} - /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -333,7 +318,7 @@ asmlinkage __visible void do_softirq(void) pending = local_softirq_pending(); - if (pending && !ksoftirqd_running(pending)) + if (pending) do_softirq_own_stack(); local_irq_restore(flags); @@ -360,9 +345,6 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (ksoftirqd_running(local_softirq_pending())) - return; - if (!force_irqthreads) { #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK /* From 6be9588eedcbc0df294cc69b78aa0a687ac91225 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Sun, 25 Aug 2024 22:06:20 +0300 Subject: [PATCH 21/30] Revert "sched/core: wake up from idle by sending IPI_WAKEUP" This reverts commit 
f49348dd5a0e22fce053a15b335dc1daf9ecb922. --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d64bc47e0532..bfdd75dcab5a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2821,7 +2821,7 @@ void wake_up_if_idle(int cpu) } else { rq_lock_irqsave(rq, &rf); if (is_idle_task(rq->curr)) - arch_send_wakeup_ipi_mask(cpumask_of(cpu)); + smp_send_reschedule(cpu); /* Else CPU is not idle, do nothing here: */ rq_unlock_irqrestore(rq, &rf); } From 5ed0fda4620f5e9745b8963845f599921c89d262 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Sep 2021 22:16:02 +0200 Subject: [PATCH 22/30] sched: Simplify wake_up_*idle*() Simplify and make wake_up_if_idle() more robust, also don't iterate the whole machine with preempt_disable() in it's caller: wake_up_all_idle_cpus(). This prepares for another wake_up_if_idle() user that needs a full do_idle() cycle. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vasily Gorbik Tested-by: Vasily Gorbik # on s390 Link: https://lkml.kernel.org/r/20210929152428.769328779@infradead.org Change-Id: If9f0dd5d99dd675828656161bfdba299f447f998 (cherry picked from 8850cb663b5cda04d33f9cfbc38889d73d3c8e24) Signed-off-by: Alexander Winkowski --- kernel/sched/core.c | 14 +++++--------- kernel/smp.c | 6 +++--- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfdd75dcab5a..836ac6c3df85 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2816,15 +2816,11 @@ void wake_up_if_idle(int cpu) if (!is_idle_task(rcu_dereference(rq->curr))) goto out; - if (set_nr_if_polling(rq->idle)) { - trace_sched_wake_idle_without_ipi(cpu); - } else { - rq_lock_irqsave(rq, &rf); - if (is_idle_task(rq->curr)) - smp_send_reschedule(cpu); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); - } + rq_lock_irqsave(rq, &rf); + if (is_idle_task(rq->curr)) + resched_curr(rq); + /* Else CPU 
is not idle, do nothing here: */ + rq_unlock_irqrestore(rq, &rf); out: rcu_read_unlock(); diff --git a/kernel/smp.c b/kernel/smp.c index fd749ced516f..fcb8d7285c41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -811,16 +811,16 @@ void wake_up_all_idle_cpus(void) { int cpu; - preempt_disable(); + cpus_read_lock(); for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) + if (cpu == raw_smp_processor_id()) continue; if (s2idle_state == S2IDLE_STATE_ENTER || !cpu_isolated(cpu)) wake_up_if_idle(cpu); } - preempt_enable(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); From 73bad98694733f43495f91376198b9e593241277 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Oct 2021 16:41:05 +0200 Subject: [PATCH 23/30] sched: Improve wake_up_all_idle_cpus() take #2 As reported by syzbot and experienced by Pavel, using cpus_read_lock() in wake_up_all_idle_cpus() generates lock inversion (against mmap_sem and possibly others). Instead, shrink the preempt disable region by iterating all CPUs and checking the online status for each individual CPU while having preemption disabled. 
Fixes: 8850cb663b5c ("sched: Simplify wake_up_*idle*()") Reported-by: syzbot+d5b23b18d2f4feae8a67@syzkaller.appspotmail.com Reported-by: Pavel Machek Reported-by: Qian Cai Signed-off-by: Peter Zijlstra (Intel) Tested-by: Qian Cai Change-Id: I652eb678e8a2e30d71beeebae35a36d4d4c49a8d (cherry picked from 96611c26dc351c33f73b48756a9feacc109e5bab) Signed-off-by: Alexander Winkowski # Conflicts: # kernel/smp.c --- kernel/smp.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index fcb8d7285c41..bac329f2b441 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -811,16 +811,12 @@ void wake_up_all_idle_cpus(void) { int cpu; - cpus_read_lock(); - for_each_online_cpu(cpu) { - if (cpu == raw_smp_processor_id()) - continue; - - if (s2idle_state == S2IDLE_STATE_ENTER || - !cpu_isolated(cpu)) + for_each_possible_cpu(cpu) { + preempt_disable(); + if (cpu != smp_processor_id() && cpu_online(cpu)) wake_up_if_idle(cpu); + preempt_enable(); } - cpus_read_unlock(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); From be5603e38e45e5a4809d5460720a06f1a3b57397 Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Sun, 25 Aug 2024 22:12:59 +0300 Subject: [PATCH 24/30] sched:remove unused value --- kernel/sched/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 836ac6c3df85..ad722395ce14 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4182,7 +4182,6 @@ void scheduler_tick(void) bool early_notif; u32 old_load; struct related_thread_group *grp; - unsigned int flag = 0; unsigned long thermal_pressure; sched_clock_tick(); From ea10cc32c6677019cd1cc0322f90e9d4847ff246 Mon Sep 17 00:00:00 2001 From: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> Date: Wed, 22 May 2024 15:56:14 +0800 Subject: [PATCH 25/30] cpufreq: schedutil: Checkout to msm-4.19 * HEAD: https://github.com/EmanuelCN/kernel_xiaomi_sm8250/commit/2c85e5f15e7f5467949d619e9d445dc734984e10 * Remove WALT and 
SchedTune pieces as we won't use them anymore * Remove SCHED_CPUFREQ_RT_DL in include/linux/sched/cpufreq.h as it's unused now * Add some new scheduler definitions from msm-4.19 for new schedutil * Optimize freq switching for sm8150 (sm8150 use software freq switching) * Minor format and code adjustments (compared to HEAD) * Note: The new schedutil requires UClamp, which will be backported later Signed-off-by: Helium-Studio <67852324+Helium-Studio@users.noreply.github.com> --- include/linux/sched/cpufreq.h | 2 - kernel/sched/cpufreq_schedutil.c | 104 ++++++++++---------- kernel/sched/sched.h | 162 ++++++++++++++++--------------- 3 files changed, 138 insertions(+), 130 deletions(-) diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index f794f8c31298..7a4050b79ec4 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -18,8 +18,6 @@ #define SCHED_CPUFREQ_FORCE_UPDATE (1U << 7) #define SCHED_CPUFREQ_CONTINUE (1U << 8) -#define SCHED_CPUFREQ_RT_DL (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL) - #ifdef CONFIG_CPU_FREQ struct update_util_data { void (*func)(struct update_util_data *data, u64 time, unsigned int flags); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 064e09359a6c..8d95439408eb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -108,6 +108,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) return true; } + /* If the last frequency wasn't set yet then we can still amend it */ + if (sg_policy->work_in_progress) + return true; + /* No need to recalculate next freq for min_rate_limit_us * at least. 
However we might still decide to further rate * limit once frequency change direction is decided, according @@ -120,7 +124,11 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static inline bool use_pelt(void) { +#ifdef CONFIG_SCHED_WALT + return false; +#else return true; +#endif } static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time, @@ -248,9 +256,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return l_freq; } -extern long -schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p); - /* * This function computes an effective utilization for the given CPU, to be * used for frequency selection given the linear relation: f = u * f_max. @@ -357,11 +362,10 @@ unsigned long apply_dvfs_headroom(int cpu, unsigned long util, unsigned long max if (!util || util >= max_cap) return util; - if (cpumask_test_cpu(cpu, cpu_lp_mask)) { + if (cpumask_test_cpu(cpu, cpu_lp_mask)) headroom = util + (util >> 1); - } else { + else headroom = util + (util >> 2); - } return headroom; } @@ -543,8 +547,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long max_cap; unsigned int next_f; + unsigned long boost; bool busy; - unsigned long boost; max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); @@ -601,7 +605,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long boost; + unsigned long boost; s64 delta_ns; /* @@ -761,6 +765,28 @@ static struct attribute *sugov_attributes[] = { NULL }; +static void sugov_tunables_save(struct cpufreq_policy *policy, + struct sugov_tunables *tunables) +{ + int cpu; + struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); + + if (!have_governor_per_policy()) + return; + + if (!cached) { + cached = kzalloc(sizeof(*tunables), GFP_KERNEL); 
+ if (!cached) + return; + + for_each_cpu(cpu, policy->related_cpus) + per_cpu(cached_tunables, cpu) = cached; + } + + cached->up_rate_limit_us = tunables->up_rate_limit_us; + cached->down_rate_limit_us = tunables->down_rate_limit_us; +} + static void sugov_tunables_free(struct kobject *kobj) { struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj); @@ -768,6 +794,19 @@ static void sugov_tunables_free(struct kobject *kobj) kfree(to_sugov_tunables(attr_set)); } +static void sugov_tunables_restore(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); + + if (!cached) + return; + + tunables->up_rate_limit_us = cached->up_rate_limit_us; + tunables->down_rate_limit_us = cached->down_rate_limit_us; +} + static struct kobj_type sugov_tunables_ktype = { .default_attrs = sugov_attributes, .sysfs_ops = &governor_sysfs_ops, @@ -825,7 +864,8 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) } sg_policy->thread = thread; - kthread_bind_mask(thread, policy->related_cpus); + if (!policy->dvfs_possible_from_any_cpu) + kthread_bind_mask(thread, policy->related_cpus); init_irq_work(&sg_policy->irq_work, sugov_irq_work); mutex_init(&sg_policy->work_lock); @@ -858,48 +898,12 @@ static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_polic return tunables; } -static void sugov_tunables_save(struct cpufreq_policy *policy, - struct sugov_tunables *tunables) -{ - int cpu; - struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); - - if (!have_governor_per_policy()) - return; - - if (!cached) { - cached = kzalloc(sizeof(*tunables), GFP_KERNEL); - if (!cached) - return; - - for_each_cpu(cpu, policy->related_cpus) - per_cpu(cached_tunables, cpu) = cached; - } - - cached->up_rate_limit_us = tunables->up_rate_limit_us; - cached->down_rate_limit_us = 
tunables->down_rate_limit_us; -} - - static void sugov_clear_global_tunables(void) { if (!have_governor_per_policy()) global_tunables = NULL; } -static void sugov_tunables_restore(struct cpufreq_policy *policy) -{ - struct sugov_policy *sg_policy = policy->governor_data; - struct sugov_tunables *tunables = sg_policy->tunables; - struct sugov_tunables *cached = per_cpu(cached_tunables, policy->cpu); - - if (!cached) - return; - - tunables->up_rate_limit_us = cached->up_rate_limit_us; - tunables->down_rate_limit_us = cached->down_rate_limit_us; -} - static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; @@ -989,13 +993,15 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_lock(&global_tunables_lock); + /* Save tunables before last owner release it in gov_attr_set_put() */ + if (tunables->attr_set.usage_count == 1) + sugov_tunables_save(policy, tunables); + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); policy->governor_data = NULL; - - if (!count) { - sugov_tunables_save(policy, tunables); + if (!count) sugov_clear_global_tunables(); - } + mutex_unlock(&global_tunables_lock); sugov_kthread_stop(sg_policy); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9fb0b3530f94..2ef572842824 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2332,85 +2332,6 @@ cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) #endif /* CONFIG_SCHED_WALT */ -#ifdef CONFIG_SMP -static inline unsigned long cpu_util_cfs(struct rq *rq) -{ - unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); - - if (sched_feat(UTIL_EST)) { - util = max_t(unsigned long, util, - READ_ONCE(rq->cfs.avg.util_est.enqueued)); - } - - return util; -} -#endif - -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long *min, - unsigned long *max); - -unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, - unsigned long min, - unsigned 
long max); - -static inline unsigned long cpu_bw_dl(struct rq *rq) -{ - return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; -} - -static inline unsigned long cpu_util_dl(struct rq *rq) -{ - return READ_ONCE(rq->avg_dl.util_avg); -} - -static inline unsigned long cpu_util_rt(struct rq *rq) -{ - return READ_ONCE(rq->avg_rt.util_avg); -} -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) -{ - return 0; -} -#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ - -#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -static inline unsigned long cpu_util_irq(struct rq *rq) -{ - return rq->avg_irq.util_avg; -} - -static inline -unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -{ - util *= (max - irq); - util /= max; - - return util; - -} -#else -static inline unsigned long cpu_util_irq(struct rq *rq) -{ - return 0; -} - -static inline -unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -{ - return util; -} -#endif - -static inline unsigned long cpu_util(int cpu) -{ - return min(__cpu_util(cpu) + cpu_util_rt(cpu_rq(cpu)), capacity_orig_of(cpu)); -} - extern unsigned int capacity_margin_freq; static inline unsigned long @@ -2909,6 +2830,89 @@ bool uclamp_boosted(struct task_struct *p); #define arch_scale_freq_invariant() (false) #endif +#ifdef CONFIG_SMP +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ + unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + + if (sched_feat(UTIL_EST)) { + util = max_t(unsigned long, util, + READ_ONCE(rq->cfs.avg.util_est.enqueued)); + } + + return util; +} +#endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long *min, + unsigned long *max); + +unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, + unsigned long min, + unsigned long 
max); + +static inline unsigned long cpu_bw_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return READ_ONCE(rq->avg_dl.util_avg); +} + +static inline unsigned long cpu_util_rt(struct rq *rq) +{ + return READ_ONCE(rq->avg_rt.util_avg); +} +#endif + +#ifdef CONFIG_SMP +#ifndef CONFIG_SCHED_WALT +static inline unsigned long cpu_util(int cpu) +{ + return min(__cpu_util(cpu) + cpu_util_rt(cpu_rq(cpu)), + capacity_orig_of(cpu)); +} +#endif +#endif + +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return rq->avg_irq.util_avg; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + util *= (max - irq); + util /= max; + + return util; + +} +#else +static inline unsigned long cpu_util_irq(struct rq *rq) +{ + return 0; +} + +static inline +unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) +{ + return util; +} +#endif + +static inline unsigned long +cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) +{ + return min(cpu_util(cpu), capacity_orig_of(cpu)); +} + enum sched_boost_policy { SCHED_BOOST_NONE, SCHED_BOOST_ON_BIG, From de9a4993cee2769b3199108ba5be0a6dc8a794b6 Mon Sep 17 00:00:00 2001 From: EmanuelCN Date: Fri, 23 Aug 2024 16:40:59 +0300 Subject: [PATCH 26/30] Revert "cpufreq: schedutil: Ignore CPU load older than WALT window size" This reverts commit b9d1ecec68e9750f819dca02af47caee55d06a58. 
--- kernel/sched/cpufreq_schedutil.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 8d95439408eb..6cb0730e089e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -75,7 +75,6 @@ struct sugov_cpu { }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); -static unsigned int stale_ns; static DEFINE_PER_CPU(struct sugov_tunables *, cached_tunables); /************************ Governor internals ***********************/ @@ -606,21 +605,6 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long boost; - s64 delta_ns; - - /* - * If the CPU utilization was last updated before the previous - * frequency update and the time elapsed between the last update - * of the CPU utilization and the last frequency update is long - * enough, don't take the CPU into account as it probably is - * idle now (and clear iowait_boost for it). 
- */ - delta_ns = time - j_sg_cpu->last_update; - if (delta_ns > stale_ns) { - sugov_iowait_reset(j_sg_cpu, time, false); - continue; - } - boost = sugov_iowait_apply(j_sg_cpu, time, max_cap); sugov_get_util(j_sg_cpu, boost); @@ -952,8 +936,6 @@ static int sugov_init(struct cpufreq_policy *policy) policy->governor_data = sg_policy; sg_policy->tunables = tunables; - stale_ns = sched_ravg_window + (sched_ravg_window >> 3); - sugov_tunables_restore(policy); ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, From d9324db2412fde8a5f86aefa5ab30e1e0e4be1ce Mon Sep 17 00:00:00 2001 From: Sultan Alsawaf Date: Wed, 28 Aug 2024 22:56:59 +0300 Subject: [PATCH 27/30] cpufreq: schedutil: Allow single-CPU frequency to drop without idling Given that a CPU's clock is gated at even the shallowest idle state, waiting until a CPU idles at least once before reducing its frequency is putting the cart before the horse. For long-running workloads with low compute needs, requiring an idle call since the last frequency update to lower the CPU's frequency results in significantly increased energy usage. Given that there is already a mechanism in place to ratelimit frequency changes, this heuristic is wholly unnecessary. Allow single-CPU performance domains to drop their frequency without requiring an idle call in between to improve energy. Right off the bat, this reduces CPU power consumption by 7.5% playing a cat gif in Firefox on a Pixel 8 (270 mW -> 250 mW). And there is no visible loss of performance. 
Signed-off-by: Sultan Alsawaf --- kernel/sched/cpufreq_schedutil.c | 34 -------------------------------- 1 file changed, 34 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6cb0730e089e..eb1df0c899a6 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -67,11 +67,6 @@ struct sugov_cpu { unsigned long util; unsigned long bw_min; - - /* The field below is for single-CPU policies only: */ -#ifdef CONFIG_NO_HZ_COMMON - unsigned long saved_idle_calls; -#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -516,19 +511,6 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, return (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; } -#ifdef CONFIG_NO_HZ_COMMON -static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) -{ - unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); - bool ret = idle_calls == sg_cpu->saved_idle_calls; - - sg_cpu->saved_idle_calls = idle_calls; - return ret; -} -#else -static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ - /* * Make sugov_should_update_freq() ignore the rate limit when DL * has increased the utilization. 
@@ -547,7 +529,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned long max_cap; unsigned int next_f; unsigned long boost; - bool busy; max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); @@ -559,25 +540,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - /* Limits may have changed, don't skip frequency update */ - busy = use_pelt() && !sg_policy->need_freq_update && - sugov_cpu_is_busy(sg_cpu); - boost = sugov_iowait_apply(sg_cpu, time, max_cap); sugov_get_util(sg_cpu, boost); next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); - /* - * Do not reduce the frequency if the CPU has not been idle - * recently, as the reduction is likely to be premature then. - */ - if (busy && next_f < sg_policy->next_freq && - !sg_policy->need_freq_update) { - next_f = sg_policy->next_freq; - - /* Restore cached freq as next_freq has changed */ - sg_policy->cached_raw_freq = sg_policy->prev_cached_raw_freq; - } /* * This code runs under rq->lock for the target CPU, so it won't run From 1845102b9f56aaf2ad2843073e5cfc920556a357 Mon Sep 17 00:00:00 2001 From: Sheenam Monga Date: Wed, 17 Apr 2024 15:39:57 +0530 Subject: [PATCH 28/30] BACKPORT: qcacmn: Fix potential OOB read in util_scan_parse_rnr_ie Currently, while parsing scan RNR Ie data is moved to next neighbor_ap_info_field after parsing the current neighbor_ap_info_field. But in last iteration pointer may try to access invalid data if (uint8_t *)ie + rnr_ie_len + 2) bytes are less than sizeof neighbor_ap_info_field and same is the case with tbtt_length access. Fix is to add a length check of data + next data size to be parsed < (uint8_t *)ie + rnr_ie_len + 2) instead of adding a validation of data length only. 
CRs-Fixed: 3710080 Change-Id: I05e5a9a02f0f4f9bc468db894588e676f0a248c0 --- .../umac/scan/dispatcher/src/wlan_scan_utils_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c b/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c index 86c1c1b81a02..02ab28c5bc97 100644 --- a/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c +++ b/drivers/staging/qca-wifi-host-cmn/umac/scan/dispatcher/src/wlan_scan_utils_api.c @@ -709,7 +709,8 @@ util_scan_parse_rnr_ie(struct scan_cache_entry *scan_entry, rnr_ie_len = ie->ie_len; data = (uint8_t *)ie + sizeof(struct ie_header); - while (data < ((uint8_t *)ie + rnr_ie_len + 2)) { + while ((data + sizeof(struct neighbor_ap_info_field)) <= + ((uint8_t *)ie + rnr_ie_len + 2)) { neighbor_ap_info = (struct neighbor_ap_info_field *)data; tbtt_count = neighbor_ap_info->tbtt_header.tbtt_info_count; tbtt_length = neighbor_ap_info->tbtt_header.tbtt_info_length; @@ -725,7 +726,8 @@ util_scan_parse_rnr_ie(struct scan_cache_entry *scan_entry, break; for (i = 0; i < (tbtt_count + 1) && - data < ((uint8_t *)ie + rnr_ie_len + 2); i++) { + (data + tbtt_length) <= + ((uint8_t *)ie + rnr_ie_len + 2); i++) { if (i < MAX_RNR_BSS) util_scan_update_rnr( &scan_entry->rnr.bss_info[i], From d5d983781ff4d797e036d307b53aa8734d3232c7 Mon Sep 17 00:00:00 2001 From: John Galt Date: Fri, 16 Aug 2024 08:49:11 -0400 Subject: [PATCH 29/30] raphael_defconfig: enable audit + avc stats --- arch/arm64/configs/raphael_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/raphael_defconfig b/arch/arm64/configs/raphael_defconfig index 05c1b41e1b5c..864dcfa430e1 100644 --- a/arch/arm64/configs/raphael_defconfig +++ b/arch/arm64/configs/raphael_defconfig @@ -2,6 +2,7 @@ CONFIG_TOOLS_SUPPORT_RELR=y CONFIG_LOCALVERSION="-SOVIET-STAR-" CONFIG_INLINE_OPTIMIZATION=y # CONFIG_FHANDLE is not set 
+CONFIG_AUDIT=y CONFIG_IRQ_SBALANCE=y CONFIG_SBALANCE_EXCLUDE_CPUS="3,6,7" CONFIG_NO_HZ=y From e5a1fc529ab0d6da058111756a8e80d0eef951ac Mon Sep 17 00:00:00 2001 From: kondors1995 Date: Sun, 1 Sep 2024 18:00:27 +0300 Subject: [PATCH 30/30] R6.5 --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 7b5c09c6c83b..bfe429a1f87f 100755 --- a/build.sh +++ b/build.sh @@ -15,7 +15,7 @@ export THINLTO_CACHE=~/ltocache/ DEFCONFIG="raphael_defconfig" # Kernel Details -REV="R6.4" +REV="R6.5" EDITION="BLACK" VER="$EDITION"-"$REV"